100
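# Try using higher-order array functions: first count how many of the
# values in each row's array of columns are null.
# Then compute the sum of the elements (treating nulls as 0).
# Finally, divide that sum by the number of non-null elements
# to get the row mean, and use it to fill the nulls.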
100
# Sample input: each row may contain nulls in any column.
df.show(10, False)
#+----+----+----+
#|col1|col2|col3|
#+----+----+----+
#|1 |2 |3 |
#|4 |null|6 |
#|7 |8 |null|
#+----+----+----+

# Replace every null cell with the mean of the non-null cells of its row.
# Helper columns added along the way:
#   nulls_count - number of columns that are null in the row
#   arr_vals    - all column values gathered into one array, nulls turned into 0
#   sum_elems   - sum of arr_vals (the zero-filled nulls contribute nothing)
#   mean_val    - sum_elems / (array size - nulls_count), rounded to 1 decimal
df1 = (
    df.withColumn(
        "nulls_count",
        # Build an array of per-column "is null" flags and keep only the true ones.
        size(filter(array(*[isnull(col(name)) for name in df.columns]), lambda x: x)),
    )
    .withColumn(
        "arr_vals",
        array(*[coalesce(col(name), lit(0)) for name in df.columns]),
    )
    .withColumn(
        "sum_elems",
        expr("aggregate(arr_vals,cast(0 as bigint),(acc, x) -> acc + x)"),
    )
    .withColumn(
        "mean_val",
        expr('round(sum_elems/((size(arr_vals))-nulls_count),1)'),
    )
)

# Keep only the original columns, substituting mean_val wherever a value is null.
df1.select(
    [
        when(col(name).isNull(), col("mean_val")).otherwise(col(name)).alias(name)
        for name in df.columns
    ]
).show(10, False)
#+----+----+----+
#|col1|col2|col3|
#+----+----+----+
#|1.0 |2.0 |3.0 |
#|4.0 |5.0 |6.0 |
#|7.0 |8.0 |7.5 |
#+----+----+----+