How to convert a pandas DataFrame to a Spark DataFrame:
# Convert a pandas DataFrame to a Spark DataFrame.
import pandas as pd
from pyspark.sql import SparkSession

# Path to the input file; replace with the real location.
# (The original `<'path to file'>` placeholder was a syntax error.)
filename = "path/to/file.csv"

# `builder` (not `build`) is the SparkSession factory attribute.
spark = SparkSession.builder.appName("pandasToSpark").getOrCreate()

# Assuming file is csv
pandas_df = pd.read_csv(filename)

# The conversion method is `createDataFrame` (lowercase 'c');
# `CreateDataFrame` would raise AttributeError.
spark_df = spark.createDataFrame(pandas_df)
Another example of converting a pandas DataFrame to a Spark DataFrame:
# Create a PySpark DataFrame from a pandas DataFrame and inspect it.
import pandas as pd
from pyspark.sql import SparkSession

# Create PySpark SparkSession
spark = SparkSession.builder \
    .master("local[1]") \
    .appName("SparkByExamples.com") \
    .getOrCreate()

# Sample pandas DataFrame — the original snippet used `pandasDF`
# without ever defining it (NameError). Values match the output shown.
pandasDF = pd.DataFrame({
    "Name": ["Scott", "Jeff", "Thomas", "Ann"],
    "Age": [50, 45, 54, 34],
})

# Create PySpark DataFrame from Pandas
sparkDF = spark.createDataFrame(pandasDF)
sparkDF.printSchema()
sparkDF.show()

# Outputs below schema & DataFrame:
# root
#  |-- Name: string (nullable = true)
#  |-- Age: long (nullable = true)
#
# +------+---+
# |  Name|Age|
# +------+---+
# | Scott| 50|
# |  Jeff| 45|
# |Thomas| 54|
# |   Ann| 34|
# +------+---+
Source: sparkbyexamples.com
How to convert a Spark DataFrame to a pandas DataFrame:
# Convert a Spark DataFrame back to a pandas DataFrame.
# Assumes an existing SparkContext bound to `sc`.
records = [
    ("A", "no"),
    ("B", "yes"),
    ("B", "yes"),
    ("B", "no"),
]

# Build the Spark DataFrame from the in-memory records.
some_df = sc.parallelize(records).toDF(["user_id", "phone_number"])

# Collect the distributed data onto the driver as a pandas DataFrame.
pandas_df = some_df.toPandas()
Source: stackoverflow.com