I have a large dataset that has already been written to disk as a partitioned Parquet dataset.
How can I read this data directly into Polars and compute some aggregate results on it? I want to avoid converting the Parquet data to pandas (pq_df.to_pandas()), because the data is larger than my machine's memory.
Below is a reproducible example, followed by my best guess at a solution. I would appreciate your input.
import polars as pl # Version 0.20.3
import pyarrow as pa # Version 11.0.0
import pyarrow.parquet as pq
pl_df = pl.DataFrame({
    "Name": ["ABC", "DEF", "GHI", "JKL"],
    "date": ["2024-01-01", "2024-01-10", "2023-01-29", "2023-01-29"],
    "price": [1000, 1500, 1800, 2100],
})
pl_df = pl_df.with_columns(date=pl.col("date").cast(pl.Date))
# Write the Polars data frame to disk as a partitioned parquet dataset
pq.write_to_dataset(
    pl_df.to_arrow(),
    root_path=r"C:\Users\desktop PC\Downloads\test_pl",
    partition_cols=["date"],
    compression="gzip",
    existing_data_behavior="overwrite_or_ignore",
)
# Build a schema object matching the data written to the parquet dataset
pd_df_schema = pa.Schema.from_pandas(pl_df.to_pandas())
# Read the data back from the parquet dataset
pq_df = pq.read_table(
    r"C:\Users\desktop PC\Downloads\test_pl",
    schema=pd_df_schema,
)
# I want to use this parquet object to compute an aggregate result via Polars
# without using the pq_df.to_pandas() method.
df = (
    pl.from_pandas(pq_df.to_pandas())  # <- this is the conversion I want to avoid
    .lazy()
    .group_by("date")
    .agg(
        pl.col("price").sum().alias("grouped_sum"),
        pl.col("price").count().alias("grouped_count"),
    )
    .collect(streaming=True)
)
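
From the docs, I suspect the right route is to scan the dataset lazily instead of loading it whole with pq.read_table. Here is a minimal sketch of what I have in mind, assuming pl.scan_pyarrow_dataset behaves the way I think it does (the path and column names are taken from the example above; whether this actually streams instead of loading everything into memory is exactly what I am unsure about):

import polars as pl
import pyarrow.dataset as ds

# Wrap the partitioned directory in a pyarrow dataset; hive partitioning
# recovers the "date" column from the date=... directory names.
dataset = ds.dataset(
    r"C:\Users\desktop PC\Downloads\test_pl",
    format="parquet",
    partitioning="hive",
)

# Scan lazily from Polars so the aggregation never goes through pandas.
df = (
    pl.scan_pyarrow_dataset(dataset)
    .group_by("date")
    .agg(
        pl.col("price").sum().alias("grouped_sum"),
        pl.col("price").count().alias("grouped_count"),
    )
    .collect(streaming=True)
)

Two things I am not sure about: whether the hive-partition column comes back as a string rather than pl.Date (so it might need a cast before grouping), and whether pl.scan_parquet on a glob like r"C:\Users\desktop PC\Downloads\test_pl\**\*.parquet" would be the more idiomatic choice here. As a smaller step, pl.from_arrow(pq_df) would at least skip the pandas round-trip, but I assume the table would already be fully materialised in memory at that point.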