Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dlio_benchmark/data_generator/tf_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,6 @@ def generate(self):
filename = os.path.basename(out_path_spec)
self.storage.create_node(index_folder, exist_ok=True)
tfrecord_idx = f"{index_folder}/{filename}.idx"
if not os.path.isfile(tfrecord_idx):
call([tfrecord2idx_script, out_path_spec, tfrecord_idx])
if not self.storage.isfile(tfrecord_idx):
call([tfrecord2idx_script, out_path_spec, self.storage.get_uri(tfrecord_idx)])
np.random.seed()
3 changes: 3 additions & 0 deletions dlio_benchmark/framework/framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,6 @@ def put_data(self, id, data, offset=None, length=None):
def get_data(self, id, data, offset=None, length=None):
return None

def isfile(self, id):
return False

7 changes: 6 additions & 1 deletion dlio_benchmark/framework/tf_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
DataLoaderType

import tensorflow as tf
import tensorflow_io as tfio
from tensorflow.python.framework import errors

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
Expand Down Expand Up @@ -102,7 +103,7 @@ def is_nativeio_available(self):

@dlp.log
def create_node(self, id, exist_ok=False):
tf.io.gfile.mkdir(id)
tf.io.gfile.makedirs(id)
return True

@dlp.log
Expand Down Expand Up @@ -140,3 +141,7 @@ def get_data(self, id, data, offset=None, length=None):
with tf.io.gfile.GFile(id, "r") as fd:
data = fd.read()
return data

@dlp.log
def isfile(self, id):
return tf.io.gfile.exists(id) and not tf.io.gfile.isdir(id)
4 changes: 4 additions & 0 deletions dlio_benchmark/storage/file_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,5 +99,9 @@ def get_data(self, id, data, offset=None, length=None):
data = fd.read()
return data

@dlp.log
def isfile(self, id):
return os.path.isfile(id)

def get_basename(self, id):
return os.path.basename(id)
4 changes: 4 additions & 0 deletions dlio_benchmark/storage/s3_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,9 @@ def put_data(self, id, data, offset=None, length=None):
def get_data(self, id, data, offset=None, length=None):
return super().get_data(self.get_uri(id), data, offset, length)

@dlp.log
def isfile(self, id):
return super().isfile(self.get_uri(id))

def get_basename(self, id):
return os.path.basename(id)
7 changes: 7 additions & 0 deletions dlio_benchmark/storage/storage_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,10 @@ def get_data(self, id, data, offset=None, length=None):
return self.framework.get_data(id, data, offset, length)
return None

def isfile(self, id):
"""
This method checks if the given path is a file
"""
if self.is_framework_nativeio_available:
return self.framework.isfile(id)
return None
4 changes: 3 additions & 1 deletion dlio_benchmark/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,9 @@ def validate(self):
if self.num_checkpoints_write > 0:
if self.num_checkpoints_read > self.num_checkpoints_write:
raise Exception(f"Number of checkpoints to read {self.num_checkpoints_read} cannot be larger than number of checkpoints to write {self.num_checkpoints_write}")

if self.ksm_present and self.checkpoint_randomize_tensor:
raise Exception(f"checkpoint.ksm is {self.ksm_present} which requires checkpoint.randomize_tensor to be False")

@staticmethod
def reset():
ConfigArguments.__instance = None
Expand Down
6 changes: 4 additions & 2 deletions docs/source/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -443,12 +443,13 @@ checkpoint
| Available options are: default, subset.
* - randomize_tensor
- True
- | randomize the tensors data
- | randomize the tensors data. If it is False, all the checkpoint data will be tensor of ones.
* - ksm
- (omitted)
- | Optional subsection to configure and enable Kernel Samepage Merging (KSM) optimization.
| **Simply adding this ``ksm:`` section (even if empty, e.g., ``ksm: {}``) enables KSM features.**
| See the KSM Configuration table below for optional nested keys to fine-tune KSM behavior.
| See the KSM Configuration table below for optional nested keys to fine-tune KSM behavior.
| To use ksm, one has to set randomize_tensor = False.

**KSM Configuration (Optional keys under `checkpoint.ksm`)**

Expand Down Expand Up @@ -486,6 +487,7 @@ checkpoint
checkpoint:
checkpoint_folder: checkpoints/another_model
# ... other checkpoint settings ...
randomize_tensor: False
ksm:
high_ram_trigger: 25.0
await_time: 150
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ pandas>=1.5.1
psutil>=5.9.8
pydftracer==1.0.11
pytest
tensorflow>=2.11.0
tensorflow>=2.13.1
tensorflow_io>=0.33.0
torch>=2.2.0
torchaudio
torchvision
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
x86_deps = [
f"hydra-core>={HYDRA_VERSION}",
"nvidia-dali-cuda120>=1.34.0",
"tensorflow>=2.11.0",
"tensorflow>=2.13.1",
"tensorflow_io>=0.33.0",
"torch>=2.2.0",
"torchaudio",
"torchvision",
Expand Down
15 changes: 9 additions & 6 deletions tests/dlio_benchmark_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,8 +352,9 @@ def test_checkpoint_ksm_config() -> None:
'++workload.checkpoint.ksm={}',
'++workload.workflow.generate_data=False',
'++workload.workflow.train=False',
'++workload.checkpoint.num_checkpoints_write=0',
'++workload.checkpoint.num_checkpoints_read=0'
'++workload.checkpoint.num_checkpoints_write=1',
'++workload.checkpoint.num_checkpoints_read=1',
'++workload.checkpoint.randomize_tensor=False',
])
ConfigArguments.reset()
# Pass only the workload part of the config
Expand Down Expand Up @@ -384,8 +385,9 @@ def test_checkpoint_ksm_config() -> None:
'++workload.checkpoint.ksm.await_time=100',
'++workload.workflow.generate_data=False',
'++workload.workflow.train=False',
'++workload.checkpoint.num_checkpoints_write=0',
'++workload.checkpoint.num_checkpoints_read=0'
'++workload.checkpoint.num_checkpoints_write=1',
'++workload.checkpoint.num_checkpoints_read=1',
'++workload.checkpoint.randomize_tensor=False'
])
ConfigArguments.reset()
benchmark = DLIOBenchmark(cfg['workload'])
Expand All @@ -412,8 +414,9 @@ def test_checkpoint_ksm_config() -> None:
'++workload.workflow.checkpoint=True',
'++workload.workflow.generate_data=False',
'++workload.workflow.train=False',
'++workload.checkpoint.num_checkpoints_write=0',
'++workload.checkpoint.num_checkpoints_read=0'
'++workload.checkpoint.num_checkpoints_write=1',
'++workload.checkpoint.num_checkpoints_read=1',
'++workload.checkpoint.randomize_tensor=False'
])
ConfigArguments.reset()
benchmark = DLIOBenchmark(cfg['workload'])
Expand Down