可以对你目前的做法做如下修改:
.strip_chars()
表示删除尾随逗号
.cast(pl.List(pl.Int32))
一次性对所有元素做类型转换(cast)
- 传递一个函数到
.to_struct(fields=)
以"动态地"重命名字段
cols = "blockSizes", "blockStarts"

# One struct-producing expression per column: strip commas from both ends,
# split on ",", cast the pieces to Int32, then turn the list into a struct
# whose fields are named "<column>_<index>".
struct_exprs = [
    pl.col(name)
    .str.strip_chars(",")
    .str.split(",")
    .cast(pl.List(pl.Int32))
    .list.to_struct(
        n_field_strategy="max_width",
        # Bind the column name as a default so each callable keeps its own.
        fields=lambda idx, name=name: f"{name}_{idx}",
    )
    for name in cols
]

# Unnesting promotes every struct field to its own top-level column.
df.with_columns(struct_exprs).unnest(cols)
shape: (4, 5)
┌───────┬──────────────┬──────────────┬───────────────┬───────────────┐
│ chrom ┆ blockSizes_0 ┆ blockSizes_1 ┆ blockStarts_0 ┆ blockStarts_1 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ i32 ┆ i32 ┆ i32 │
╞═══════╪══════════════╪══════════════╪═══════════════╪═══════════════╡
│ 1 ┆ 10 ┆ 29 ┆ 0 ┆ 50 │
│ 1 ┆ 20 ┆ 22 ┆ 0 ┆ 45 │
│ 2 ┆ 30 ┆ 25 ┆ 0 ┆ 60 │
│ X ┆ 40 ┆ 23 ┆ 0 ┆ 70 │
└───────┴──────────────┴──────────────┴───────────────┴───────────────┘
也许创建一个函数会让事情变得更整洁.
def csv_to_struct(col):
    """Return an expression that parses a comma-separated column into a struct.

    The string column named ``col`` has commas stripped from both ends, is
    split on ",", and the pieces are cast to ``Int32``. The resulting list
    is converted to a struct whose fields are named ``"<col>_<index>"``.

    Args:
        col: Name of the string column to parse.

    Returns:
        A Polars expression producing the struct column.
    """

    def field_name(idx):
        # Field names carry the source column so they stay unique after unnest.
        return f"{col}_{idx}"

    int_lists = (
        pl.col(col)
        .str.strip_chars(",")
        .str.split(",")
        .cast(pl.List(pl.Int32))
    )
    return int_lists.list.to_struct(
        n_field_strategy="max_width",
        fields=field_name,
    )
cols = "blockSizes", "blockStarts"
# Build one struct expression per column, then expand the structs into columns.
df.with_columns([csv_to_struct(name) for name in cols]).unnest(cols)
melt / pivot(宽表转长表 / 透视)
另一种方法是使用 .melt() + .pivot():
cols = "blockSizes", "blockStarts"

# Parse each comma-separated string column into a list of integers.
as_int_lists = pl.col(cols).str.strip_chars(",").str.split(",").cast(pl.List(int))

# Suffix each melted column name with a running count taken within its
# (row_nr, name) group, giving names such as "blockSizes_0".
indexed_name = pl.format(
    "{}_{}",
    "name",
    pl.col("row_nr").cum_count().over("row_nr", "name"),
)

(
    df.with_row_count()  # adds the "row_nr" column used as the pivot index
    .with_columns(as_int_lists)
    .explode(cols)
    .melt(["row_nr", "chrom"], variable_name="name")
    .with_columns(indexed_name)
    .pivot("value", ["row_nr", "chrom"], "name")
)
shape: (4, 6)
┌────────┬───────┬──────────────┬──────────────┬───────────────┬───────────────┐
│ row_nr ┆ chrom ┆ blockSizes_0 ┆ blockSizes_1 ┆ blockStarts_0 ┆ blockStarts_1 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞════════╪═══════╪══════════════╪══════════════╪═══════════════╪═══════════════╡
│ 0 ┆ 1 ┆ 10 ┆ 29 ┆ 0 ┆ 50 │
│ 1 ┆ 1 ┆ 20 ┆ 22 ┆ 0 ┆ 45 │
│ 2 ┆ 2 ┆ 30 ┆ 25 ┆ 0 ┆ 60 │
│ 3 ┆ X ┆ 40 ┆ 23 ┆ 0 ┆ 70 │
└────────┴───────┴──────────────┴──────────────┴───────────────┴───────────────┘