Define a spark schema

Setup

from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Simple-table').getOrCreate()
Define it programmatically

schema = StructType([StructField("first name", StringType(), False),
                     StructField("last name", StringType(), False),
                     StructField("weight", IntegerType(), False)])
schema
StructType([StructField('first name', StringType(), False), StructField('last name', StringType(), False), StructField('weight', IntegerType(), False)])
The third argument, False, sets each field's nullable flag, i.e. whether the field may contain null (None) values. Here every field is declared non-nullable.
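As a minimal sketch of what that flag does (the row values here are made up, and it assumes the default verifySchema=True), passing None for a non-nullable field makes createDataFrame fail schema verification:

# Illustrative only: None lands in the non-nullable "last name" field
try:
    spark.createDataFrame([['Amy', None, 45]], schema)
except ValueError as e:
    print(e)  # e.g. "field last name: This field is not nullable, but got None"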
data = [['Jake', 'Z', 60], ['Tom', 'X', 50]]
df = spark.createDataFrame(data, schema)
df.show()
+----------+---------+------+
|first name|last name|weight|
+----------+---------+------+
|      Jake|        Z|    60|
|       Tom|        X|    50|
+----------+---------+------+
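You can double-check the resulting schema, including each field's nullable flag, with printSchema; for the schema above it prints a tree like:

df.printSchema()
root
 |-- first name: string (nullable = false)
 |-- last name: string (nullable = false)
 |-- weight: integer (nullable = false)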
Define it using DDL
This method is much simpler.
= "first_name STRING, last_name STRING, weight INT" schema
schema
'first_name STRING, last_name STRING, weight INT'
df = spark.createDataFrame(data, schema)
df.show()
+----------+---------+------+
|first_name|last_name|weight|
+----------+---------+------+
|      Jake|        Z|    60|
|       Tom|        X|    50|
+----------+---------+------+
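One difference worth noting: fields declared through a plain DDL string are nullable by default, unlike the explicit False flags used above. DDL strings can also express complex types; as a small sketch (the column name and values here are made up), an array column looks like this and should print roughly the output shown:

scores_schema = "name STRING, scores ARRAY<INT>"
df2 = spark.createDataFrame([['Jake', [60, 70]]], scores_schema)
df2.show()
+----+--------+
|name|  scores|
+----+--------+
|Jake|[60, 70]|
+----+--------+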
Stop spark session
spark.stop()