You can try row_number() together with monotonically_increasing_id(): take the row_number modulo 2 to mark the first row of each pair, then use a running sum window over that flag to assign the same group id to every pair of rows.
from pyspark.sql import Window
from pyspark.sql.functions import (array_join, col, collect_list, lit,
                                   monotonically_increasing_id, row_number,
                                   sum as sum_)  # alias avoids shadowing the built-in sum

window = Window.partitionBy(lit(1)).orderBy('Column A')   # single partition, ordered by Column A
window_mid = Window.partitionBy(lit(1)).orderBy('mid')    # order for the running sum

(df.withColumn("rn", row_number().over(window))           # 1,2,3,4,...
   .withColumn("grp", col("rn") % 2)                      # 1,0,1,0,... flags the first row of each pair
   .withColumn("mid", monotonically_increasing_id())
   .withColumn("sum", sum_("grp").over(window_mid))       # running sum: 1,1,2,2,... = pair id
   .groupBy("sum")
   .agg(array_join(collect_list(col("Column A")), ',').alias("column a"),
        array_join(collect_list(col("Column B")), ',').alias("column b"),
        array_join(collect_list(col("Column C")), ',').alias("column c"),
        array_join(collect_list(col("Column D")), ',').alias("column d"))
   .drop("sum")                                           # rn/grp/mid are already gone after the groupBy
   .show())
#+---------------+---------------+---------------+---------------+
#| column a| column b| column c| column d|
#+---------------+---------------+---------------+---------------+
#|Cell A1,Cell A2|Cell B1,Cell B2|Cell C1,Cell C2|Cell D1,Cell D2|
#|Cell A3,Cell A4|Cell B3,Cell B4|Cell C3,Cell C4|Cell D3,Cell D4|
#+---------------+---------------+---------------+---------------+
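
In case it helps to reproduce this end to end, here is a minimal sketch of the input I assume the snippet ran against; the DataFrame name df and the cell values are inferred from the expected output above, and spark is just a plain SparkSession.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical 4-row input, reconstructed from the expected output.
df = spark.createDataFrame(
    [("Cell A1", "Cell B1", "Cell C1", "Cell D1"),
     ("Cell A2", "Cell B2", "Cell C2", "Cell D2"),
     ("Cell A3", "Cell B3", "Cell C3", "Cell D3"),
     ("Cell A4", "Cell B4", "Cell C4", "Cell D4")],
    ["Column A", "Column B", "Column C", "Column D"])

If you ever need chunks of n rows instead of pairs, one alternative (not what the snippet above does) is to compute the group id directly as floor((row_number() - 1) / n), which drops the second window and the running sum entirely:

from pyspark.sql.functions import floor

n = 2  # chunk size; n = 2 reproduces the pairing above
df.withColumn("grp", floor((row_number().over(window) - 1) / n))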