Demo: Converting Parquet Dataset Into Delta Format

spark-shell \
  --packages \
  --conf \

// Once inside spark-shell, make sure the Spark version is supported
assert(spark.version.matches("2.4.[2-6]"), "Delta Lake supports Spark 2.4.2+")
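// The --packages and --conf values are left out above. A complete invocation
// for Spark 2.4.x could look like the following (the delta-core coordinate and
// the extension setting are assumptions, not part of the original demo):
//
//   spark-shell \
//     --packages io.delta:delta-core_2.11:0.6.1 \
//     --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension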

import org.apache.spark.sql.SparkSession

val deltaLake = "/tmp/delta"

// Create parquet table
val users = s"$deltaLake/users"
import spark.implicits._
val data = Seq(
  (0L, "Agata", "Warsaw", "Poland"),
  (1L, "Jacek", "Warsaw", "Poland"),
  (2L, "Bartosz", "Paris", "France")
).toDF("id", "name", "city", "country")
// partitionBy is a DataFrameWriter method, so write the data out as parquet
data
  .write
  .format("parquet")
  .partitionBy("city", "country")
  .save(users)
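
// Optional sanity check (not in the original demo): read the parquet table back
// to confirm the data and the city/country partitioning
spark.read.format("parquet").load(users).orderBy("id").show(truncate = false)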

// TIP: Use git to version the users directory
//      to track the changes for import

// CONVERT TO DELTA only supports parquet tables
// The table identifier should be of the form parquet.`path`

// Use TableIdentifier to refer to the parquet table
// The path itself would work too
val tableId = s"parquet.`$users`"
val partitionSchema = "city STRING, country STRING"
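
// NOTE: As an alternative to the Scala API used below, the conversion could
// also be expressed in SQL (shown for reference only; assumes Delta's SQL
// support is enabled, e.g. via spark.sql.extensions):
//
//   CONVERT TO DELTA parquet.`/tmp/delta/users` PARTITIONED BY (city STRING, country STRING)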

// Import users table into Delta Lake
// Well, convert the parquet table into delta table
// Use web UI to monitor execution, e.g. http://localhost:4040

import io.delta.tables.DeltaTable
val dt = DeltaTable.convertToDelta(spark, tableId, partitionSchema)

// users table is now in delta format
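
// Optional follow-up (not in the original demo): read the table back in delta
// format and review the CONVERT operation in the table history
// (assumes a Delta Lake version with DeltaTable.history support)
val usersInDelta = spark.read.format("delta").load(users)
usersInDelta.show(truncate = false)

dt.history().show(truncate = false)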