You can use percentile_approx for all of the required column names (note that we drop the column we are going to GROUP BY on):
from pyspark.sql import functions as F

all_aggregations = []

for col in sparkDF.drop('y').columns:
    all_aggregations.extend([
        F.percentile_approx(col, 0.10).alias(f'{col}_perc_10'),
        F.percentile_approx(col, 0.25).alias(f'{col}_perc_25'),
        F.percentile_approx(col, 0.50).alias(f'{col}_perc_50'),
        F.percentile_approx(col, 0.75).alias(f'{col}_perc_75'),
        F.percentile_approx(col, 0.90).alias(f'{col}_perc_90'),
        F.max(col).alias(f'{col}_max'),
    ])

sparkDF_summary_stats = sparkDF.groupby('y').agg(*all_aggregations)
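If you want to run the snippet end to end, here is a minimal stand-in for sparkDF (the names y, col1 and col2 are placeholders, not from the original data):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Toy data: a binary group column 'y' plus two numeric feature columns
sparkDF = spark.range(100).select(
    (F.rand(seed=1) > 0.5).cast('int').alias('y'),
    F.rand(seed=2).alias('col1'),
    F.rand(seed=3).alias('col2'),
)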
For anyone using an earlier version of PySpark (F.percentile_approx was only added in 3.1), you can use F.expr to compute the percentiles instead (credit to @Ala Tarighati for this answer):
all_aggregations = []

for col in sparkDF.drop('y').columns:
    all_aggregations.extend([
        F.expr(f'percentile({col}, array(0.10))').alias(f'{col}_perc_10'),
        F.expr(f'percentile({col}, array(0.25))').alias(f'{col}_perc_25'),
        F.expr(f'percentile({col}, array(0.50))').alias(f'{col}_perc_50'),
        F.expr(f'percentile({col}, array(0.75))').alias(f'{col}_perc_75'),
        F.expr(f'percentile({col}, array(0.90))').alias(f'{col}_perc_90'),
        F.max(col).alias(f'{col}_max'),
    ])

sparkDF_summary_stats = sparkDF.groupby('y').agg(*all_aggregations)
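Note that percentile(col, array(0.10)) returns a single-element array column. Since Spark's percentile also accepts a plain percentage value, each aggregation could instead be written as, for example:

F.expr(f'percentile({col}, 0.10)').alias(f'{col}_perc_10')  # double instead of array<double>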
Inspecting a random sample of columns from the resulting PySpark DataFrame:
import numpy as np

np.random.seed(42)
random_cols = np.random.choice(sparkDF_summary_stats.columns, 4).tolist()

sparkDF_summary_stats.select(random_cols).show()
+------------------+------------------+-------------------+-------------------+
| col60_max| col100_perc_75| col37_perc_25| col68_perc_50|
+------------------+------------------+-------------------+-------------------+
|0.9888405413036631|0.7153223105291356| 0.3924451074226354|0.23228965409645264|
|0.9546663568790689|0.7837917844853972|0.26496706155544303| 0.4975660833887259|
|0.9969494174116696|0.6553831994634532|0.31725917435686757|0.43747492202372906|
|0.9919627472386433|0.7804711383801549|0.32662190574800876| 0.3862363952990896|
+------------------+------------------+-------------------+-------------------+
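Since the grouped summary has only one row per distinct value of y, it is usually small enough to collect to the driver for further inspection, e.g.:

summary_pdf = sparkDF_summary_stats.toPandas()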