Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/run_all_frameworks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,18 @@ jobs:
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
python -m pip install "coverage[toml]"
- name: Check Things
run: |
ls -lah venv
ls -lah venv/bin
ls -lah /home/runner/work/automlbenchmark/automlbenchmark/venv/bin
echo $(pwd)
- name: Run ${{ matrix.framework }} on ${{ matrix.task }}
run: |
source venv/bin/activate
ls /home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage
/home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage --help
coverage --help
coverage run -m runbenchmark ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e
coverage xml
env:
Expand Down
40 changes: 21 additions & 19 deletions amlb/utils/serialization.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import logging

import math
import os
import pickle
import re
from typing import Optional


from .core import Namespace as ns, json_dump, json_load
from .process import profile

Expand Down Expand Up @@ -33,11 +35,10 @@ def _import_data_libraries():
# the serializer to use when there's no specific serializer available.
# mainly intended to serialize simple data structures like lists.
# allowed=['pickle', 'json']
fallback_serializer="json",
# if numpy can use pickle to serialize ndarrays,
numpy_allow_pickle=True,
# OPTION REMOVED: Only JSON is allowed. Pickle is evil.
# fallback_serializer="json",
# format used to serialize pandas dataframes/series between processes.
# allowed=['pickle', 'parquet', 'hdf', 'json']
# allowed=['parquet', 'json']
pandas_serializer="parquet",
# the compression format used when serializing pandas dataframes/series.
# allowed=[None, 'infer', 'bz2', 'gzip']
Expand Down Expand Up @@ -163,8 +164,14 @@ def serialize_data(data, path, config: Optional[ns] = None):
root, ext = os.path.splitext(path)
np, pd, sp = _import_data_libraries()
if np and isinstance(data, np.ndarray):
path = f"{root}.npy"
np.save(path, data, allow_pickle=config.numpy_allow_pickle)
if data.dtype == "object":
# Numpy cannot save object arrays without pickle
path = f"{root}.json"
data = data.squeeze().tolist()
json_dump(data, path, style="compact")
else:
path = f"{root}.npy"
np.save(path, data, allow_pickle=False)
elif sp and isinstance(data, sp.spmatrix):
# use a custom extension to recognize sparse matrices from the file name.
# .npz is automatically appended if missing, and can also potentially be used for numpy arrays.
Expand All @@ -177,9 +184,7 @@ def serialize_data(data, path, config: Optional[ns] = None):
# for example, 'true' and 'false' are converted automatically to booleans, even for column names…
data.rename(str, axis="columns", inplace=True)
ser = config.pandas_serializer
if ser == "pickle":
data.to_pickle(path, compression=config.pandas_compression)
elif ser == "parquet":
if ser == "parquet":
if isinstance(data, pd.Series):
data = pd.DataFrame({__series__: data})
# parquet serialization doesn't support sparse dataframes
Expand All @@ -189,18 +194,15 @@ def serialize_data(data, path, config: Optional[ns] = None):
json_dump(dtypes, f"{path}.dtypes", style="compact")
data = unsparsify(data)
data.to_parquet(path, compression=config.pandas_parquet_compression)
elif ser == "hdf":
data.to_hdf(path, os.path.basename(path), mode="w", format="table")
elif ser == "json":
data.to_json(path, compression=config.pandas_compression)
else: # fallback serializer
if config.fallback_serializer == "json":
path = f"{root}.json"
json_dump(data, path, style="compact")
else:
path = f"{root}.pkl"
with open(path, "wb") as f:
pickle.dump(data, f)
raise ValueError(
f"Invalid pandas serialization {ser} must be 'parquet' or 'json'"
)
else: # fallback serializer
path = f"{root}.json"
json_dump(data, path, style="compact")
return path


Expand All @@ -212,7 +214,7 @@ def deserialize_data(path, config: Optional[ns] = None):
if ext == ".npy":
if np is None:
raise SerializationError(f"Numpy is required to deserialize {path}.")
return np.load(path, allow_pickle=config.numpy_allow_pickle)
return np.load(path)
elif ext == ".npz":
_, ext2 = os.path.splitext(base)
if ext2 == ".spy":
Expand Down
1 change: 0 additions & 1 deletion examples/custom/extensions/GradientBoosting/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ def run(dataset: Dataset, config: TaskConfig):

save_predictions(
dataset=dataset,
output_file=config.output_predictions_file,
probabilities=probabilities,
predictions=predictions,
truth=y_test,
Expand Down
1 change: 0 additions & 1 deletion examples/custom/extensions/Stacking/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ def run(dataset, config):
probabilities = estimator.predict_proba(X_test) if is_classification else None

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
Expand Down
1 change: 0 additions & 1 deletion frameworks/AutoGluon/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ def inference_time_regression(data: Union[str, pd.DataFrame]):
shutil.rmtree(predictor.path, ignore_errors=True)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
probabilities=probabilities,
probabilities_labels=prob_labels,
Expand Down
1 change: 0 additions & 1 deletion frameworks/AutoGluon/exec_ts.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ def run(dataset, config):
get_reusable_executor().shutdown(wait=True)

return result(
output_file=config.output_predictions_file,
predictions=predictions_only,
truth=truth_only,
target_is_encoded=False,
Expand Down
1 change: 0 additions & 1 deletion frameworks/FEDOT/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def run(dataset, config):
save_artifacts(fedot, config)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=dataset.test.y,
probabilities=probabilities,
Expand Down
1 change: 0 additions & 1 deletion frameworks/FEDOT/exec_ts.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ def run(dataset, config):

save_artifacts(fedot, config)
return result(
output_file=config.output_predictions_file,
predictions=all_series_predictions,
truth=truth_only,
target_is_encoded=False,
Expand Down
1 change: 0 additions & 1 deletion frameworks/GAMA/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ def infer(data: Union[str, pd.DataFrame]):
probabilities = gama_automl.predict_proba(X_test)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
probabilities=probabilities,
truth=y_test,
Expand Down
1 change: 0 additions & 1 deletion frameworks/H2OAutoML/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,6 @@ def infer(path: str):
save_artifacts(aml, dataset=dataset, config=config)

return result(
output_file=config.output_predictions_file,
predictions=preds.predictions,
truth=preds.truth,
probabilities=preds.probabilities,
Expand Down
1 change: 0 additions & 1 deletion frameworks/MLPlan/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ def run(dataset, config):
target_encoded = False

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=truth,
probabilities=probabilities,
Expand Down
1 change: 0 additions & 1 deletion frameworks/NaiveAutoML/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ def infer(data: Union[str, pd.DataFrame]):
save_artifacts(automl, config)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
probabilities=probabilities,
truth=dataset.test.y,
Expand Down
1 change: 0 additions & 1 deletion frameworks/RandomForest/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ def infer(data):
log.info("Finished inference time measurements.")

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
Expand Down
2 changes: 0 additions & 2 deletions frameworks/SapientML/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ def run(dataset, config):
)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
Expand All @@ -88,7 +87,6 @@ def run(dataset, config):
)
else:
return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
training_duration=training.duration,
Expand Down
1 change: 0 additions & 1 deletion frameworks/TPOT/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ def infer(data):
save_artifacts(tpot, config)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
Expand Down
1 change: 0 additions & 1 deletion frameworks/TunedRandomForest/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,6 @@ def infer(data):
log.info("Finished inference time measurements.")

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
Expand Down
1 change: 0 additions & 1 deletion frameworks/autosklearn/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,6 @@ def sample_one_test_row(seed: int):
save_artifacts(auto_sklearn, config)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=dataset.test.y if use_pandas else dataset.test.y_enc,
probabilities=probabilities,
Expand Down
1 change: 0 additions & 1 deletion frameworks/flaml/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ def infer(data: Union[str, pd.DataFrame]):
log.info(f"Finished predict in {predict.duration}s.")

return result(
output_file=config.output_predictions_file,
probabilities=probabilities,
predictions=predictions,
truth=y_test,
Expand Down
1 change: 0 additions & 1 deletion frameworks/hyperoptsklearn/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ def default():
probabilities = None

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
Expand Down
1 change: 0 additions & 1 deletion frameworks/lightautoml/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ def infer(data: Union[str, pd.DataFrame]):
save_artifacts(automl, config)

return result(
output_file=config.output_predictions_file,
probabilities_labels=probabilities_labels,
probabilities=probabilities,
predictions=predictions,
Expand Down
1 change: 0 additions & 1 deletion frameworks/mljarsupervised/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ def infer(data: Union[str, pd.DataFrame]):
shutil.rmtree(results_path, ignore_errors=True)

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
Expand Down
1 change: 0 additions & 1 deletion frameworks/oboe/exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ def aml_models():
probabilities = None

return result(
output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
Expand Down
4 changes: 3 additions & 1 deletion frameworks/shared/callee.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ class FrameworkError(Exception):


def result(
output_file=None,
predictions=None,
truth=None,
probabilities=None,
Expand Down Expand Up @@ -94,6 +93,7 @@ def load_data(name, path, **_):
path = os.path.join(config.result_dir, ".".join([name, "data"]))
res[name] = serialize_data(arr, path, config=ser_config)
except BaseException as e:
log.error("Integration script failed with uncaught exception:")
log.exception(e)
res = dict(error_message=str(e), models_count=0)
finally:
Expand All @@ -107,6 +107,8 @@ def load_data(name, path, **_):
)
json_dump(inference_measurements, inference_file, style="compact")
res["others"]["inference_times"] = str(inference_file)

res.setdefault("output_file", config.output_predictions_file)
json_dump(res, config.result_file, style="compact")
Comment thread
PGijsbers marked this conversation as resolved.


Expand Down
20 changes: 13 additions & 7 deletions frameworks/shared/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,19 @@ PIP() {
$pip_exec "$@"
}

#if [[ -x "$(command -v $PY_VENV/bin/activate)" ]]; then
# $PY_ROOT/activate
#fi

#echo "PY=$(command -v PY)"
#echo "PIP=$(command -v PIP)"
echo "PY=$py_exec"
echo "PIP=$pip_exec"

PIP install --no-cache-dir -r $SHARED_DIR/requirements.txt
REQ_FILE="$SHARED_DIR/requirements.txt"

for line in $(grep -vE '^\s*#' "$REQ_FILE" | grep -vE '^\s*$'); do
pkg=$(echo "$line" | sed -E 's/[=><~!].*$//')
# For a line like "numpy==1.12.0", pkg is "numpy" and line is the whole requirement string.

if ! PY -c "import $pkg" &> /dev/null; then
echo "$pkg not found. Installing from requirements.txt..."
PIP install --no-cache-dir "$line"
else
echo "$pkg is already installed by the framework, using that instead."
fi
done

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Avoid word-splitting in the requirements loop
The `for line in $(grep ...)` construct splits its input on whitespace and may mishandle requirements that contain spaces or special characters. Switch to a `while IFS= read -r line` pattern to preserve each line intact and eliminate the extra subshell:

- for line in $(grep -vE '^\s*#' "$REQ_FILE" | grep -vE '^\s*$'); do
+ while IFS= read -r line; do
      pkg=${line%%[=><~!]*}
      if ! PY -c "import $pkg" &> /dev/null; then
          echo "$pkg not found. Installing from requirements.txt..."
          PIP install --no-cache-dir "$line"
      else
          echo "$pkg is already installed by the framework, using that instead."
      fi
- done
+ done < "$REQ_FILE"

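This approach avoids word-splitting issues, reduces forking (`sed` can be replaced with parameter expansion), and is more robust. Note that reading directly from `"$REQ_FILE"` no longer applies the two `grep -v` filters, so comment and blank lines must be skipped inside the loop (or the loop must be fed from the original `grep` pipeline) to keep the current behavior.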

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In frameworks/shared/setup.sh around lines 50 to 62, the loop uses `for line in
$(grep ...)` which causes word-splitting and can mishandle requirements with
spaces or special characters. Replace this loop with a `while IFS= read -r line`
construct to read each line intact without splitting. Also, remove the external
`sed` command by using shell parameter expansion to extract the package name
from each line. This will make the script more robust and efficient.

Loading
Loading