Row and column in Spark

Categories: spark

Author: Youfeng Zhou

Published: November 21, 2022

Row

A Row represents a single record: an ordered set of fields that can be created positionally (as below) or with named fields. A list of Rows can be turned into a DataFrame with spark.createDataFrame.

from pyspark.sql import SparkSession
from pyspark.sql import Row, Column
from pyspark.sql.functions import expr

spark = SparkSession.builder.appName('col-row').getOrCreate()
rows = [Row('Brooke', 20), Row('Denny', 31), Row('Jules', 30), Row('TD', 35), Row('Brooke', 25)]
rows
[<Row('Brooke', 20)>,
 <Row('Denny', 31)>,
 <Row('Jules', 30)>,
 <Row('TD', 35)>,
 <Row('Brooke', 25)>]
df = spark.createDataFrame(rows, ['name', 'age'])
df.show()
+------+---+
|  name|age|
+------+---+
|Brooke| 20|
| Denny| 31|
| Jules| 30|
|    TD| 35|
|Brooke| 25|
+------+---+
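
Rows can also be created with named fields, which makes the values addressable by name as well as by position. A minimal sketch (the person variable and its values are just for illustration):

# a Row with named fields (illustrative values)
person = Row(name='Brooke', age=20)
person.name     # 'Brooke'
person['age']   # 20

# the field names double as the schema when building a DataFrame
spark.createDataFrame([person, Row(name='Denny', age=31)]).show()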
                                                                                

Column

A Column is a named expression over a DataFrame. pyspark.sql.functions.expr builds one from a SQL string, and existing columns can also be referenced as attributes such as df.age.

new_col = expr("age + 3")
new_col
Column<'(age + 3)'>
df.withColumn('age after 3 years', new_col).show()
+------+---+-----------------+
|  name|age|age after 3 years|
+------+---+-----------------+
|Brooke| 20|               23|
| Denny| 31|               34|
| Jules| 30|               33|
|    TD| 35|               38|
|Brooke| 25|               28|
+------+---+-----------------+
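
expr is not limited to arithmetic: any SQL expression string yields a Column, so a comparison written the same way can drive a filter. A small sketch against the DataFrame above:

# expr also builds a boolean Column, usable directly in filter
df.filter(expr("age >= 30")).show()   # keeps Denny, Jules and TD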
df.age
Column<'age'>
df.withColumn('age after 5 years', (df.age + 5)).show()
+------+---+-----------------+
|  name|age|age after 5 years|
+------+---+-----------------+
|Brooke| 20|               25|
| Denny| 31|               36|
| Jules| 30|               35|
|    TD| 35|               40|
|Brooke| 25|               30|
+------+---+-----------------+
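
The same arithmetic can be written with pyspark.sql.functions.col instead of the df.age attribute, which is convenient inside select where only the column name is known. A minimal sketch; alias names the derived column:

from pyspark.sql.functions import col

df.select('name', (col('age') + 5).alias('age after 5 years')).show()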
spark.stop()