from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import *
# Create the SparkSession entry point for this script.
spark = SparkSession.builder.appName('CSV-Reader').getOrCreate()

# --- Common DataFrame operations ---

# Reading a CSV file with the DataFrameReader interface.
# Supplying an explicit schema avoids a costly inference pass over the data.
# NOTE(review): column names/types below are placeholders — adjust to the file.
csv_file_schema = StructType([
    StructField('col1', StringType(), True),
    StructField('col2', IntegerType(), True),
])
df = spark.read.csv('csv_file_path', header=True, schema=csv_file_schema)

# Saving a DataFrame as a Parquet file or as a SQL table.
parquet_path = '...'
df.write.format("parquet").save(parquet_path)

parquet_table = '...'  # name of the table
df.write.format("parquet").saveAsTable(parquet_table)
# SQL tables will be covered later.

# Projections and filters:
# in Spark, projections are done with the `select()` method,
# while filters can be conducted using the `filter()` or `where()` method.
sub_df = df.select('col1', 'col2').where(col('col1') == 'some condition')
sub_df.show(5, truncate=False)

# Renaming, adding, and dropping columns.
df.withColumnRenamed('name of current column', 'renamed column name')
# withColumn() requires a Column expression as its second argument,
# not a plain string — use col()/lit() to build one.
df.withColumn('new_column', col('target column'))
df.drop('columns needed to drop')

# For more details, see the book "Learning Spark".
spark.stop()