Demo: Generated Columns
Create Delta Table with Generated Column
val dataPath = "/tmp/delta/values"
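// Note: if re-running the demo, remove any files from a previous run first,
// so that creating the table at the same location starts fresh.
// (a sketch assuming dataPath is a local filesystem path)
import scala.reflect.io.Directory
import java.io.File
new Directory(new File(dataPath)).deleteRecursively()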
import io.delta.tables.DeltaTable
import org.apache.spark.sql.types.DataTypes

DeltaTable.create
  .addColumn("id", DataTypes.LongType, nullable = false)
  .addColumn(
    // value is a generated column with a constant generation expression
    DeltaTable.columnBuilder("value")
      .dataType(DataTypes.BooleanType)
      .generatedAlwaysAs("true")
      .build)
  .location(dataPath)
  .execute
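As a quick sanity check, print the schema of the new table (a minimal sketch; printSchema is the standard Spark way to inspect a DataFrame schema):

// id and the generated value column should both be present
DeltaTable.forPath(dataPath).toDF.printSchema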
Review Metadata
import org.apache.spark.sql.delta.DeltaLog

val deltaLog = DeltaLog.forTable(spark, dataPath)
// The generation expression is stored in the column's metadata
println(deltaLog.snapshot.metadata.dataSchema("value").metadata.json)
{"delta.generationExpression":"true"}
Write to Delta Table
import io.delta.implicits._
import org.apache.spark.sql.SaveMode

// The generated column is not specified; Delta computes its value on write
spark.range(5)
  .write
  .mode(SaveMode.Append)
  .delta(dataPath)
Show Table
DeltaTable.forPath(dataPath).toDF.orderBy('id).show
+---+-----+
| id|value|
+---+-----+
| 0| true|
| 1| true|
| 2| true|
| 3| true|
| 4| true|
+---+-----+
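Optionally, review the table history to see the commits so far (a sketch; history is part of the public DeltaTable API, and version and operation are columns of the history schema):

// One row per commit: the table creation followed by the append
DeltaTable.forPath(dataPath).history
  .select("version", "operation")
  .orderBy("version")
  .show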
InvariantViolationException
The following one-row write breaks the CHECK constraint on the generated column, since the value provided is not true.
import org.apache.spark.sql.functions.lit

// false contradicts the generation expression, so the write fails
spark.range(5, 6)
  .withColumn("value", lit(false))
  .write
  .mode(SaveMode.Append)
  .delta(dataPath)
org.apache.spark.sql.delta.schema.InvariantViolationException: CHECK constraint Generated Column (`value` <=> true) violated by row with values:
- value : false
at org.apache.spark.sql.delta.schema.InvariantViolationException$.apply(InvariantViolationException.scala:50)
at org.apache.spark.sql.delta.schema.InvariantViolationException$.apply(InvariantViolationException.scala:60)
at org.apache.spark.sql.delta.schema.InvariantViolationException.apply(InvariantViolationException.scala)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.delta.constraints.DeltaInvariantCheckerExec.$anonfun$doExecute$3(DeltaInvariantCheckerExec.scala:86)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:278)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1473)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:286)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$15(FileFormatWriter.scala:210)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
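Since the failed write never commits, the table still contains only the original five rows. As a quick check (a sketch; count is standard Spark API):

// The aborted transaction left no trace in the table
assert(DeltaTable.forPath(dataPath).toDF.count == 5)

By contrast, explicitly providing a value that satisfies the generation expression should be accepted (a hedged sketch of the same append with a matching value):

import org.apache.spark.sql.functions.lit

// true matches the generation expression, so this append succeeds
spark.range(5, 6)
  .withColumn("value", lit(true))
  .write
  .mode(SaveMode.Append)
  .delta(dataPath)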