Skip to content

Commit 08e5f92

Browse files
authored
Protobuf 7.34 fix for model size > 2 GiB (#1058)
### What does this PR do? Type of change: Bug fix For newer versions of protobuf, when the model size is > 2 GB, the model throws an error when serializing to string, specifically on Windows. ``` Traceback (most recent call last): File "quantize.py", line 643, in <module> main(args) File "quantize.py", line 428, in main quantized_onnx_model = quantize_int4( args.onnx_path, ... use_column_major=args.use_column_major, ) File ".venv\Lib\site-packages\modelopt\onnx\quantization\int4.py", line 1529, in quantize onnx_model = _quantize_awq_lite( onnx_model, ... ) File ".venv\Lib\site-packages\modelopt\onnx\quantization\int4.py", line 1088, in _quantize_awq_lite save_onnx(augmented_model, augmented_onnx_path, use_external_data_format) File ".venv\Lib\site-packages\modelopt\onnx\utils.py", line 646, in save_onnx model_proto = model.SerializeToString() google.protobuf.message.EncodeError: Failed to serialize proto ``` See Nvbugid 5989474 for more context. The suggested fix is to force the model to be saved with external data when serialization fails. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Bug Fixes** * Improved handling of model size checks so failures no longer crash workflows—serialization attempts now log warnings and fall back to external-data storage. * Loading now tolerates size-query errors and enables external-data mode when size is invalid or exceeds limits. * **Improvements** * Resilient opset upgrade path: version-conversion failures log a warning and apply a safe opset import fallback before saving. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Hrishith Thadicherla <hthadicherla@nvidia.com>
1 parent 547d8fe commit 08e5f92

3 files changed

Lines changed: 67 additions & 18 deletions

File tree

modelopt/onnx/quantization/quantize.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,21 @@ def _preprocess_onnx(
176176
)
177177

178178
if original_opset_version < target_opset and original_opset_version != 1:
179-
onnx_model = onnx.version_converter.convert_version(onnx_model, target_opset)
179+
try:
180+
onnx_model = onnx.version_converter.convert_version(onnx_model, target_opset)
181+
except Exception as e:
182+
logger.warning(
183+
"onnx.version_converter failed (%s). Performing lightweight opset update.", e
184+
)
185+
current_opset = {opset.domain: opset.version for opset in onnx_model.opset_import}
186+
new_opset_imports = [onnx.helper.make_opsetid("", target_opset)]
187+
if "com.microsoft" not in current_opset:
188+
new_opset_imports.append(onnx.helper.make_opsetid("com.microsoft", 1))
189+
for domain, version in current_opset.items():
190+
if domain not in ["", "ai.onnx"]:
191+
new_opset_imports.append(onnx.helper.make_opsetid(domain, version))
192+
onnx_model.ClearField("opset_import")
193+
onnx_model.opset_import.extend(new_opset_imports)
180194
onnx_path = os.path.join(output_dir, f"{model_name}_opset{target_opset}.onnx")
181195
save_onnx(onnx_model, onnx_path, use_external_data_format)
182196
logger.info(f"Model is cloned to {onnx_path} with opset_version {target_opset}")

modelopt/onnx/trt_utils.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -322,8 +322,22 @@ def load_onnx_model(
322322

323323
# Load the model and weights
324324
onnx_model = onnx.load(onnx_path, load_external_data=True)
325-
size_threshold = 2 * (1024**3) # 2GB
326-
use_external_data_format = onnx_model.ByteSize() > size_threshold or use_external_data_format
325+
if not use_external_data_format:
326+
try:
327+
model_size = onnx_model.ByteSize()
328+
except Exception as e:
329+
logger.warning(
330+
"Failed to compute model size with ByteSize (%s). Saving tensors as external data.",
331+
e,
332+
)
333+
use_external_data_format = True
334+
else:
335+
if model_size <= 0 or model_size >= onnx.checker.MAXIMUM_PROTOBUF:
336+
use_external_data_format = True
337+
logger.debug(
338+
"Model is too large to save as a single file but 'use_external_data_format'"
339+
" is False. Saving tensors as external data, regardless."
340+
)
327341

328342
# If inputs are dynamic and override shapes are given, set them as static
329343
dynamic_inputs = get_dynamic_graph_inputs(onnx_model)

modelopt/onnx/utils.py

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,19 @@ def _get_unique_name(old_name):
558558

559559
def check_model(model: onnx.ModelProto) -> None:
560560
"""Checks if the given model is valid."""
561-
if model.ByteSize() > (2 * (1024**3)): # 2GB limit
561+
save_as_external_data = False
562+
try:
563+
model_size = model.ByteSize()
564+
except Exception as e:
565+
logger.warning(
566+
"Failed to compute model size with ByteSize (%s). Using external data path.", e
567+
)
568+
save_as_external_data = True
569+
else:
570+
if model_size <= 0 or model_size > (2 * (1024**3)):
571+
save_as_external_data = True
572+
573+
if save_as_external_data:
562574
with tempfile.TemporaryDirectory() as temp_dir:
563575
# ONNX also looks in CWD, so we need to use a unique id
564576
unique_id = str(uuid.uuid4())[:8]
@@ -642,21 +654,18 @@ def get_variable_inputs(node: Node) -> list[Variable]:
642654
def save_onnx(model: onnx.ModelProto, onnx_path: str, save_as_external_data: bool = False):
643655
"""Save an ONNX model to given path. If a model is larger than 2GB, will save with external data."""
644656
size_threshold = 2 * (1024**3) # 2GB
645-
try:
646-
model_proto = model.SerializeToString()
647-
model_size = len(model_proto)
648-
save_as_external_data = save_as_external_data or model_size > size_threshold
649-
logger.debug(
650-
f"Model size: {model_size} bytes, using external data: {save_as_external_data}"
651-
)
652-
653-
except ValueError as e:
654-
if "Message onnx.ModelProto exceeds maximum protobuf size of 2GB" in str(e):
655-
logger.warning("Model exceeds 2GB limit, switching to external data storage")
657+
if not save_as_external_data:
658+
try:
659+
model_proto = model.SerializeToString()
660+
except Exception as e:
661+
logger.warning("Failed to serialize model. Saving tensors as external data. (%s)", e)
656662
save_as_external_data = True
657663
else:
658-
logger.error(f"Failed to serialize model: {e!s}")
659-
raise
664+
model_size = len(model_proto)
665+
save_as_external_data = model_size > size_threshold
666+
logger.debug(
667+
f"Model size: {model_size} bytes, using external data: {save_as_external_data}"
668+
)
660669

661670
# Set ir_version to 10, remove it once ORT supports ir_version 11
662671
model.ir_version = 10
@@ -1162,7 +1171,19 @@ def infer_types_verification(model: onnx.ModelProto) -> onnx.ModelProto:
11621171

11631172
def infer_shapes(model: onnx.ModelProto, **kwargs):
11641173
"""Infers shapes of the onnx graph, handles large models."""
1165-
if model.ByteSize() > (2 * (1024**3)): # 2GB limit
1174+
save_as_external_data = False
1175+
try:
1176+
model_size = model.ByteSize()
1177+
except Exception as e:
1178+
logger.warning(
1179+
"Failed to compute model size with ByteSize (%s). Using external data path.", e
1180+
)
1181+
save_as_external_data = True
1182+
else:
1183+
if model_size <= 0 or model_size > (2 * (1024**3)):
1184+
save_as_external_data = True
1185+
1186+
if save_as_external_data:
11661187
with tempfile.TemporaryDirectory() as temp_dir:
11671188
# ONNX also looks in CWD, so we need to use a unique id
11681189
unique_id = str(uuid.uuid4())[:8]

0 commit comments

Comments
 (0)