I create a Parquet file of transformed data, which I then use to build dims and facts.
It had been working fine. However, the other day I changed the write mode from "overwrite" to "append", and since then the read below fails.
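The write step is roughly this (paraphrased, so treat the DataFrame name as a placeholder; the mode string is the only thing I changed):

# Sketch of the write step; "transformed_df" is a placeholder name.
# This used to be .mode("overwrite") and is now .mode("append").
transformed_df.write.mode("append").parquet("Files/Data/Silver/Transformed.parquet")

And the read that now fails: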
from pyspark.sql.functions import input_file_name, regexp_extract
parquet_file = "Files/Data/Silver/Transformed.parquet"
dftm = spark.read.parquet(parquet_file)
display(dftm)
I get the following error (just a snippet):
Py4JJavaError                             Traceback (most recent call last)
Cell In[8], line 6
      3 parquet_file = "Files/Data/Silver/Transformed.parquet"
      4 dftm = spark.read.parquet(parquet_file)
----> 6 display(dftm)

File ~/cluster-env/trident_env/lib/python3.10/site-packages/notebookutils/visualization/display.py:245, in display(data, summary)
    242     success = False
    243     log4jLogger \
    244         .error(f"display failed with error, language: python, error: {err}, correlationId={correlation_id}")
--> 245     raise err
    246 finally:
    247     duration_ms = ceil((time.time() - start_time) * 1000)

File ~/cluster-env/trident_env/lib/python3.10/site-packages/notebookutils/visualization/display.py:237, in display(data, summary)
    234         max_row_count = custom_options['maxRowCount']
    235         return display_jupyter.display_without_spark(data, summary, max_row_count)
--> 237     return _display_with_spark(data, correlation_id, summary)
    239 except ImportError:
    240     success = False

File ~/cluster-env/trident_env/lib/python3.10/site-packages/notebookutils/visualization/display.py:264, in _display_with_spark(data, correlation_id, summary)
    260 if is_ipython_enabled(runtime.host_nbutils_version):
    261     # pylint: disable=C0415
    262     from IPython.display import publish_display_data
    263 display_result = json.loads(
--> 264     sc._jvm.display.getDisplayResultForIPython(
    265         df._jdf, summary, correlation_id)
    266 )
    267 if configs.enable_wrangler_entry:
    268     display_result[WRANGLER_ENTRY_CONTEXT_KEY] = get_wrangler_display_entry_context(data)

File ~/cluster-env/trident_env/lib/python3.10/site-packages/py4j/java_gateway.py:1322, in JavaMember.__call__(self, *args)
   1316 command = proto.CALL_COMMAND_NAME +\
   1317     self.command_header +\
   1318     args_command +\
   1319     proto.END_COMMAND_PART
   1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
   1323     answer, self.gateway_client, self.target_id, self.name)
   1325 for temp_arg in temp_args:
   1326     if hasattr(temp_arg, "_detach"):

File /opt/spark/python/lib/pyspark.zip/pyspark/errors/exceptions/captured.py:169, in capture_sql_exception.<locals>.deco(*a, **kw)
    167 def deco(*a: Any, **kw: Any) -> Any:
    168     try:
--> 169         return f(*a, **kw)
    170     except Py4JJavaError as e:
    171         converted = convert_exception(e.java_exception)

File ~/cluster-env/trident_env/lib/python3.10/site-packages/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
    324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325 if answer[1] == REFERENCE_TYPE:
--> 326     raise Py4JJavaError(
    327         "An error occurred while calling {0}{1}{2}.\n".
    328         format(target_id, ".", name), value)
    329 else:
    330     raise Py4JError(
    331         "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
    332         format(target_id, ".", name, value))

Py4JJavaError: An error occurred while calling z:com.microsoft.spark.notebook.visualization.display.getDisplayResultForIPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 15.0 failed 4 times, most recent failure: Lost task 3.3 in stage 15.0 (TID 24) (vm-3cd76749 executor 1): org.apache.spark.SparkException: Parquet column cannot be converted in file abfss://986472b4-7316-485c-aa22-128e1cb29544@onelake.dfs.fabric.microsoft.com/ee31b1a4-16bf-4aae-aaab-f130bd4d53c6/Files/Data/Silver/taskmasterTransformed.parquet/part-00003-c289ad84-5db9-4b77-a910-d822f4e6af62-c000.snappy.parquet. Column: [ID], Expected: int, Found: BINARY.
	at ...
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException: column: [ID], physicalType: BINARY, logicalType: int
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1127)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:189)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:328)
	at ...
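Reading the "Expected: int, Found: BINARY" message, it looks like the appended part files stored the ID column with a different type than the files that were already there. A minimal sketch to confirm which part files disagree (assuming a Fabric notebook where mssparkutils is available; the variable names are mine):

# Hypothetical diagnostic: print the stored type of ID in each part file
# under the Parquet folder to spot the mismatched ones.
from notebookutils import mssparkutils

parquet_dir = "Files/Data/Silver/Transformed.parquet"
for f in mssparkutils.fs.ls(parquet_dir):
    if f.name.endswith(".parquet"):
        # Read just this part file so its schema isn't merged with others
        part_schema = spark.read.parquet(f.path).schema
        print(f.name, "->", part_schema["ID"].dataType)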