categorical_features = ['POSITION_CATEGORY']
numeric_features = ['APPLICATION_COUNT']
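For context, the pipeline is built along these lines (a simplified sketch rather than my exact code; the stage names, the label column and the handleInvalid setting are just placeholders):

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from synapse.ml.lightgbm import LightGBMRegressor

# Index and one-hot encode each categorical column; the 'Enc' suffix matches
# the slot name shown in the error below.
indexers = [StringIndexer(inputCol=c, outputCol=c + 'Idx', handleInvalid='keep')
            for c in categorical_features]
encoders = [OneHotEncoder(inputCol=c + 'Idx', outputCol=c + 'Enc')
            for c in categorical_features]

# Assemble the encoded categoricals and the numeric columns into one vector.
assembler = VectorAssembler(
    inputCols=[c + 'Enc' for c in categorical_features] + numeric_features,
    outputCol='features')

lg_pipeline = Pipeline(stages=indexers + encoders + [
    assembler,
    LightGBMRegressor(featuresCol='features', labelCol='label')])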
But when I run this specific line of code:
lg_model = lg_pipeline.fit(train_df)
I get this strange error:
IllegalArgumentException: Invalid slot names detected in features column: POSITION_CATEGORYEnc_KundnAEra: Drift och skOEtsel Special characters " , : \ [ ] { } will cause unexpected behavior in LGBM unless changed. This error can be fixed by renaming the problematic columns prior to vector assembly.
I will append the whole error stack at the end of the post.
As you can see, the column names AND the data have been mixed together!
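My understanding (I may be wrong) is that these "mixed" names are the ML attribute slot names that Spark attaches as metadata to the encoded vector column: each one-hot slot is named <outputCol>_<category value>, so the POSITION_CATEGORY values end up inside the slot names that LightGBM then rejects. Using the hypothetical stage names from the sketch above, that metadata can be inspected like this:

# Fit only the feature-preparation stages and look at the metadata on the
# assembled vector column; the one-hot slot names embed the category values,
# e.g. 'POSITION_CATEGORYEnc_<category value>'.
prep_pipeline = Pipeline(stages=indexers + encoders + [assembler])
prepped = prep_pipeline.fit(train_df).transform(train_df)
print(prepped.schema['features'].metadata['ml_attr']['attrs'])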
If I limit the data in the SELECT statement to LIMIT 8000, it WORKS!
If I select MORE columns, the error occurs with FEWER rows. I assume this is some kind of memory problem?
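The error text says the fix is to rename the problematic columns before vector assembly, so I assume the workaround would be something like stripping the special characters from the categorical values before indexing/encoding (untested sketch):

from pyspark.sql import functions as F

# Replace the characters LightGBM complains about (" , : \ [ ] { }) in the
# categorical values, so the generated one-hot slot names no longer contain them.
for c in categorical_features:
    train_df = train_df.withColumn(
        c, F.regexp_replace(F.col(c), r'[",:\\\[\]{}]', '_'))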
Regards from Norway,
Aslak Jonhaugen
IllegalArgumentException                  Traceback (most recent call last)
Cell In[29], line 1
----> 1 lg_model = lg_pipeline.fit(train_df)

File ~/cluster-env/trident_env/lib/python3.10/site-packages/mlflow/utils/autologging_utils/safety.py:435, in safe_patch.<locals>.safe_patch_function(*args, **kwargs)
    420 if (
    421     active_session_failed
    422     or autologging_is_disabled(autologging_integration)
   (...)
    429     # warning behavior during original function execution, since autologging is being
    430     # skipped
    431     with set_non_mlflow_warnings_behavior_for_current_thread(
    432         disable_warnings=False,
    433         reroute_warnings=False,
    434     ):
--> 435         return original(*args, **kwargs)
    437 # Whether or not the original / underlying function has been called during the
    438 # execution of patched code
    439 original_has_been_called = False

File /opt/spark/python/lib/pyspark.zip/pyspark/ml/base.py:205, in Estimator.fit(self, dataset, params)
    203         return self.copy(params)._fit(dataset)
    204     else:
--> 205         return self._fit(dataset)
    206 else:
    207     raise TypeError(
    208         "Params must be either a param map or a list/tuple of param maps, "
    209         "but got %s." % type(params)
    210     )

File /opt/spark/python/lib/pyspark.zip/pyspark/ml/pipeline.py:134, in Pipeline._fit(self, dataset)
    132         dataset = stage.transform(dataset)
    133     else:  # must be an Estimator
--> 134         model = stage.fit(dataset)
    135     transformers.append(model)
    136     if i < indexOfLastEstimator:

File ~/cluster-env/trident_env/lib/python3.10/site-packages/mlflow/utils/autologging_utils/safety.py:435, in safe_patch.<locals>.safe_patch_function(*args, **kwargs)
    420 if (
    421     active_session_failed
    422     or autologging_is_disabled(autologging_integration)
   (...)
    429     # warning behavior during original function execution, since autologging is being
    430     # skipped
    431     with set_non_mlflow_warnings_behavior_for_current_thread(
    432         disable_warnings=False,
    433         reroute_warnings=False,
    434     ):
--> 435         return original(*args, **kwargs)
    437 # Whether or not the original / underlying function has been called during the
    438 # execution of patched code
    439 original_has_been_called = False

File /opt/spark/python/lib/pyspark.zip/pyspark/ml/base.py:205, in Estimator.fit(self, dataset, params)
    203         return self.copy(params)._fit(dataset)
    204     else:
--> 205         return self._fit(dataset)
    206 else:
    207     raise TypeError(
    208         "Params must be either a param map or a list/tuple of param maps, "
    209         "but got %s." % type(params)
    210     )

File ~/cluster-env/trident_env/lib/python3.10/site-packages/synapse/ml/lightgbm/LightGBMRegressor.py:2105, in LightGBMRegressor._fit(self, dataset)
   2104 def _fit(self, dataset):
-> 2105     java_model = self._fit_java(dataset)
   2106     return self._create_model(java_model)

File /opt/spark/python/lib/pyspark.zip/pyspark/ml/wrapper.py:380, in JavaEstimator._fit_java(self, dataset)
    377 assert self._java_obj is not None
    379 self._transfer_params_to_java()
--> 380 return self._java_obj.fit(dataset._jdf)

File ~/cluster-env/trident_env/lib/python3.10/site-packages/py4j/java_gateway.py:1321, in JavaMember.__call__(self, *args)
   1315 command = proto.CALL_COMMAND_NAME +\
   1316     self.command_header +\
   1317     args_command +\
   1318     proto.END_COMMAND_PART
   1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
   1322     answer, self.gateway_client, self.target_id, self.name)
   1324 for temp_arg in temp_args:
   1325     temp_arg._detach()

File /opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py:196, in capture_sql_exception.<locals>.deco(*a, **kw)
    192 converted = convert_exception(e.java_exception)
    193 if not isinstance(converted, UnknownException):
    194     # Hide where the exception came from that shows a non-Pythonic
    195     # JVM exception message.
--> 196     raise converted from None
    197 else:
    198     raise

IllegalArgumentException: Invalid slot names detected in features column: POSITION_CATEGORYEnc_KundnAEra: Drift och skOEtsel Special characters " , : \ [ ] { } will cause unexpected behavior in LGBM unless changed. This error can be fixed by renaming the problematic columns prior to vector assembly.