Skip to content

Add S.M.A.R.T. support for NVMe drives #1680

@mikkorantalainen

Description

@mikkorantalainen

How about supporting NVMe drives here, too?

munin/plugins/node.d/smart_

Lines 352 to 374 in 0d38de7

for line in stdout.decode().splitlines():
if not line:
# the table is finished
in_table_data = False
elif not in_table_data:
# process header data
if line.startswith('Device Model:') or line.startswith('Device:'):
value = line.split(':', 1)[1].strip()
# ignore the "Version" string
model = ' '.join(token for token in value.split() if token != 'Version')
elif line.startswith('ID# ATTRIBUTE_NAME'):
# Start reading the Attributes block
in_table_data = True
else:
# we can ignore other header lines
pass
else:
# this is a data table row
tokens = line.split()
key = tokens[1].replace('-', '_')
value = tokens[3]
threshold = None if tokens[5] in INVALID_THRESHOLDS_BLACKLIST else tokens[5]
smart_values[key] = {"value": value, "threshold": threshold}

Here is a quick patch I made against the locally installed version 2.0.75-1ubuntu1. It should give a good idea of how this could be implemented without affecting SATA results while also supporting NVMe drives. Some of the reported values are far higher than 100, so I'm not sure how they should be graphed.

--- /etc/munin/plugins/smart_	2024-02-22 06:17:06.000000000 +0200
+++ /etc/munin/plugins/smart_	2026-02-19 18:20:07.803466750 +0200
@@ -126,81 +126,99 @@
     if verbose:
         sys.stderr.write('{}: {}\n'.format(plugin_name, s))
 
 
 def guess_full_path(hard_drive):
     """ try to find the full path for a given hard disk name
 
     None is returned if no device node was found.
     """
     for dev_dir in ('/dev', '/dev/disk/by-id'):
         full_path = os.path.join(dev_dir, hard_drive)
         if os.path.exists(full_path):
             return full_path
     else:
         return None
 
 
 def is_fatal_exitcode(exit_code):
     # The exit code represents a bitmask.
     # Bits 0/1/2 belong to fatal errors (see smartctl's man page). Check if one of these is set.
+    # However, exit code 12 means Samsung NVMe with warranty period exceeded which is still okay for reading smart data
+    if exit_code == 12:
+        return False
     return (exit_code & 0b111) > 0
 
 
 def read_values(hard_drive):
     smart_values = {}
     try:
         verboselog('Reading S.M.A.R.T values')
         os.putenv('LC_ALL', 'C')
         device = guess_full_path(hard_drive)
         command_tokens = [smartctl_bin] + smartctl_args.split()
         if not smartctl_ignore_standby:
             command_tokens.extend(('-n', 'standby'))
         command_tokens.extend(('-A', '-i', device))
         proc = subprocess.Popen(command_tokens, stdout=subprocess.PIPE)
         stdout, stderr = proc.communicate()
-        in_table_data = False
+        in_ata_table_data = False
+        in_nvme_table_data = False
         last_output_line = None
         model = "unknown"
         for line in stdout.decode().splitlines():
             if not line:
                 # the table is finished
-                in_table_data = False
-            elif not in_table_data:
+                in_ata_table_data = False
+                in_nvme_table_data = False
+            elif in_ata_table_data:
+                # this is a ATA data table row
+                tokens = line.split()
+                key = tokens[1].replace('-', '_')
+                value = tokens[3]
+                threshold = None if tokens[5] in INVALID_THRESHOLDS_BLACKLIST else tokens[5]
+                smart_values[key] = {"value": value, "threshold": threshold}
+            elif in_nvme_table_data:
+                # this is a NVMe data table row
+                tokens = line.split(':')
+                key = tokens[0].replace('-', '_').replace(':', '').replace(' ', '_')
+                value = tokens[1].replace(',', '').replace('%', '').replace(' Celsius', '')
+                value = value.split('[', 1)[0].strip()
+                if value.startswith('0x'):
+                    value = str(int(value, 16))
+                threshold = None
+                smart_values[key] = {"value": value, "threshold": threshold}
+            else:
                 # process header data
-                if line.startswith('Device Model:') or line.startswith('Device:'):
+                if line.startswith('Device Model:') or line.startswith('Device:') or line.startswith('Device Number:'):
                     value = line.split(':', 1)[1].strip()
                     # ignore the "Version" string
                     model = ' '.join(token for token in value.split() if token != 'Version')
                 elif line.startswith('ID# ATTRIBUTE_NAME'):
                     # Start reading the Attributes block
-                    in_table_data = True
+                    in_ata_table_data = True
+                elif line.startswith('SMART/Health Information'):
+                    # Start reading the Attributes block
+                    in_nvme_table_data = True
                 else:
                     # we can ignore other header lines
                     pass
-            else:
-                # this is a data table row
-                tokens = line.split()
-                key = tokens[1].replace('-', '_')
-                value = tokens[3]
-                threshold = None if tokens[5] in INVALID_THRESHOLDS_BLACKLIST else tokens[5]
-                smart_values[key] = {"value": value, "threshold": threshold}
             last_output_line = line
         real_exit_code = proc.returncode
         if real_exit_code > 0:
             # Allow to turn off warnings for some bits
             num_exit_status = real_exit_code & ~smartctl_ignore_exitcode_bitmask
         else:
             num_exit_status = 0
         if num_exit_status != 0:
             if is_fatal_exitcode(num_exit_status):
                 verboselog('smartctl cannot access S.M.A.R.T values on drive {}. Command exited '
                            'with code {}'.format(hard_drive, num_exit_status))
                 verboselog(last_output_line)
             else:
                 # the error is not fatal, but we should announce a warning
                 verboselog('smartctl exited with code {}. {} may be FAILING RIGHT NOW!'
                            .format(num_exit_status, hard_drive))
     except Exception as exc:
         verboselog('Cannot access S.M.A.R.T values ({})! Check user rights or proper '
                    'smartmontools installation/arguments.'.format(exc))
         sys.exit(1)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions