check_iperf3/check_iperf3.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
This module provides a Nagios plugin to run and evaluate network performance tests using iperf3.

The script executes an iperf3 test, evaluates the results based on user-defined thresholds, and
outputs a formatted Nagios status message along with performance data. The script supports both
TCP and UDP tests, bidirectional testing, and customizable thresholds for throughput and
retransmissions.

Key functions in this module:
- `run_iperf3`: Executes an iperf3 test and returns the parsed JSON output.
- `determine_result`: Analyzes the iperf3 output and determines the Nagios status code and message.
- `build_perfdata`: Constructs the Nagios performance data string based on iperf3 results.
- `check_iperf3`: Orchestrates the test execution, result evaluation, and Nagios output generation.

Command-line interface (CLI):
- The script expects command-line arguments to configure the iperf3 test. These arguments are parsed
  into an `options` object, which is then passed through the testing and evaluation functions.

Usage:
    Run this module as a standalone script to initiate a network performance test:

    python check_iperf3.py --remote <host> --port <port> --rate-critical <rate_critical> ...

The `options` object is populated with the user-specified parameters, and the `check_iperf3`
function is invoked to execute the test, evaluate the results, and output Nagios-compatible
status and performance data.

Example:
    python check_iperf3.py --remote 192.168.1.1 --port 5201 --rate-critical 1000000000 ...

This would run an iperf3 test against the specified remote host and port, evaluating the throughput
against the provided critical and warning thresholds, and outputting the result in Nagios format.
"""


from optparse import OptionParser, OptionGroup
import subprocess
import json
import sys
from typing import Tuple, Optional, List, Union, NoReturn


def nagexit(exit_code: int, statuslines: Union[str, List[str]], perfdata: Optional[List[str]]=None) -> NoReturn:
    """
    Print the plugin result line and terminate the process.

    The output starts with the textual status that belongs to `exit_code`
    (0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN). If there is a status
    message it follows after ": ". If performance data is given, it is
    appended after a "|" with the individual entries joined by "; ".

    Args:
        exit_code (int): Plugin result code; must be 0, 1, 2 or 3. Any other
            value raises KeyError before anything is printed.
        statuslines (str | List[str]): Status message. A list is joined with
            newlines, one item per line.
        perfdata (List[str], optional): Performance data entries. None or an
            empty list suppresses the "|..." part entirely.

    Returns:
        NoReturn: The function always ends in sys.exit(exit_code).
    """
    label = {0: 'OK', 1: 'WARNING',
             2: 'CRITICAL', 3: 'UNKNOWN'}[exit_code]

    if isinstance(statuslines, str):
        message = statuslines
    else:
        message = "\n".join(statuslines)

    perf_suffix = ""
    if perfdata:
        perf_suffix = "|" + "; ".join(perfdata)

    # Without a message only the bare status label is printed (no ": ").
    headline = f"{label}: {message}" if message else f"{label}"
    print(f"{headline}{perf_suffix}")
    sys.exit(exit_code)


def make_iperf_cmdline(options) -> List[str]:
    """
    Build the argument vector used to launch the iperf3 client.

    The command always requests JSON output, targets `options.remote` on
    `options.port`, and passes a fixed `--connect-timeout` value of 10000.
    Direction, protocol and test length are derived from the remaining
    option attributes.

    Args:
        options: An object providing these attributes:
            - remote (str): Hostname or IP address of the iperf3 server.
            - port (int): Server port; converted to a string for the command line.
            - bidir (bool): Adds `--bidir`; takes precedence over `downstream`.
            - downstream (bool): Adds `--reverse` when `bidir` is not set.
            - udp (bool): Adds `--udp`.
            - bytes (str): If truthy, passed verbatim to `--bytes`.
            - time (int): Passed to `--time` when `bytes` is not set.

    Returns:
        List[str]: The program name followed by its arguments.
    """
    cmdline = ["iperf3", "--json"]
    cmdline.extend(["--client", options.remote])
    cmdline.extend(["--port", str(options.port)])
    cmdline.extend(["--connect-timeout", "10000"])

    # At most one direction switch is emitted; bidirectional wins over reverse.
    if options.bidir:
        direction_flag = "--bidir"
    elif options.downstream:
        direction_flag = "--reverse"
    else:
        direction_flag = None
    if direction_flag is not None:
        cmdline.append(direction_flag)

    if options.udp:
        cmdline.append("--udp")

    # A byte budget replaces the time limit rather than complementing it.
    if options.bytes:
        cmdline.extend(["--bytes", options.bytes])
    else:
        cmdline.extend(["--time", str(options.time)])
    return cmdline


def run_iperf3(options) -> dict:
    """
    Executes an iperf3 test using the given options and returns the parsed JSON result.

    The command line is built with `make_iperf_cmdline(options)` and executed with
    stderr merged into stdout. Every failure mode ends the plugin through `nagexit`
    with the UNKNOWN state instead of letting an exception escape (an uncaught
    exception would terminate the interpreter with exit code 1, which Nagios
    interprets as WARNING).

    Args:
        options: An object containing iperf3 options. Must be compatible with
            `make_iperf_cmdline()` (i.e., have attributes like `remote`, `port`,
            `bidir`, `downstream`, `udp`, `bytes`, and `time`).

    Returns:
        dict: The parsed JSON output from iperf3 if the command succeeds.

    Exits:
        Exits with code 3 (UNKNOWN) if
        - the iperf3 executable cannot be found,
        - the executable cannot be started for another reason (any other OSError),
        - iperf3 returns a non-zero exit status, or
        - the captured output is not valid JSON.
    """
    params = make_iperf_cmdline(options)

    try:
        raw_data = subprocess.check_output(
            params,
            stderr=subprocess.STDOUT,
            universal_newlines=True
        )
    except FileNotFoundError:
        nagexit(3, "cannot find iperf3")
    except OSError as e:
        # e.g. missing execute permission; report the reason instead of a generic text
        nagexit(3, f"cannot execute iperf3: {e}")
    except subprocess.CalledProcessError as e:
        output = e.output or ""
        # If the captured output is JSON carrying an "error" entry (the same key
        # determine_result() looks for), report only that message rather than the
        # whole document.
        try:
            parsed = json.loads(output)
        except ValueError:
            parsed = None
        if isinstance(parsed, dict) and "error" in parsed:
            nagexit(3, str(parsed["error"]))
        nagexit(3, output.strip() or f"iperf3 exited with status {e.returncode}")

    try:
        return json.loads(raw_data)
    except ValueError as e:
        # stderr is merged into stdout above, so stray text can break the JSON document
        nagexit(3, f"cannot parse iperf3 output: {e}")


def determine_result(options, json_data) -> Tuple[int, Optional[List[str]]]:
    """
    Evaluates iperf3 test results against performance thresholds and determines the Nagios status.

    This function checks the iperf3 output (`json_data`) for errors and compares the average
    transfer rate(s) and the retransmission count against the thresholds provided in `options`.
    Thresholds that are unset (None) or 0 are skipped. A rate breaches its threshold when it is
    less than or equal to it; retransmissions breach when they are greater than or equal to it.
    The warning and critical retransmission thresholds are evaluated independently, so
    specifying only one of them is sufficient.

    Which section of the result feeds which rate:
        - upstream:   `end.sum_sent`, unless only `downstream` is set
        - downstream: `end.sum_sent_bidir_reverse` when `bidir` is set, otherwise `end.sum_sent`
          when `downstream` is set
    Retransmissions are always read from `end.sum_sent`.

    If an error is found in the iperf3 output, the function exits with a Nagios UNKNOWN state.

    Args:
        options: An object with threshold attributes used for evaluation.
            Expected attributes include:
            - bidir (bool): Whether the test was bidirectional.
            - downstream (bool): Whether the test ran in reverse mode.
            - rate_up_warn (int): Warning threshold for upstream rate in bits per second.
            - rate_up_crit (int): Critical threshold for upstream rate in bits per second.
            - rate_down_warn (int): Warning threshold for downstream rate in bits per second.
            - rate_down_crit (int): Critical threshold for downstream rate in bits per second.
            - retrans_warn (int): Warning threshold for retransmissions (TCP only).
            - retrans_crit (int): Critical threshold for retransmissions (TCP only).
            - udp (bool): Whether the test was run over UDP (retransmissions are ignored if True).
        json_data (dict): Parsed JSON output from an iperf3 test.

    Returns:
        Tuple[int, Optional[List[str]]]:
            - An integer exit code for Nagios (0 = OK, 1 = WARNING, 2 = CRITICAL).
            - A list of status messages if thresholds were breached, or an empty list if all checks
              passed.

    Exits:
        Exits with code 3 (UNKNOWN) if an "error" key is present in the iperf3 output.
    """
    rc = 0
    statuslines = []

    if "error" in json_data:
        nagexit(3, json_data["error"])

    json_end = json_data['end']

    bps_avg_up = json_end['sum_sent']['bits_per_second'] if options.bidir or not options.downstream else None
    if options.bidir:
        bps_avg_down = json_end['sum_sent_bidir_reverse']['bits_per_second']
    elif options.downstream:
        bps_avg_down = json_end['sum_sent']['bits_per_second']
    else:
        bps_avg_down = None

    if bps_avg_up is not None:
        if options.rate_up_crit and bps_avg_up <= options.rate_up_crit:
            rc = max(rc, 2)
            statuslines.append("upstream rate below critical threshold")
        elif options.rate_up_warn and bps_avg_up <= options.rate_up_warn:
            rc = max(rc, 1)
            statuslines.append("upstream rate below warning threshold")

    if bps_avg_down is not None:
        if options.rate_down_crit and bps_avg_down <= options.rate_down_crit:
            rc = max(rc, 2)
            statuslines.append("downstream rate below critical threshold")
        elif options.rate_down_warn and bps_avg_down <= options.rate_down_warn:
            rc = max(rc, 1)
            statuslines.append("downstream rate below warning threshold")

    # Only touch the retransmit counter when at least one threshold asks for it;
    # each threshold is honoured on its own so a lone -C or -W is not ignored.
    if not options.udp and (options.retrans_crit or options.retrans_warn):
        retransmits = json_end["sum_sent"]["retransmits"]
        if options.retrans_crit and retransmits >= options.retrans_crit:
            rc = max(rc, 2)
            statuslines.append("retransmissions over critical threshold")
        elif options.retrans_warn and retransmits >= options.retrans_warn:
            rc = max(rc, 1)
            statuslines.append("retransmissions over warning threshold")

    return rc, statuslines


def build_single_perfdata(key: str, value, limit_warn=None, limit_crit=None) -> str:
    """
    Format one metric as a performance data entry.

    The entry always starts with the quoted key and the value
    (`'key'=value`). When at least one threshold is given (i.e. is not
    None), two additional `;`-separated fields are appended: the warning
    threshold followed by the critical threshold. A threshold that is None
    or otherwise falsy (such as 0) is rendered as an empty field.

    Args:
        key (str): Name of the metric.
        value: Value of the metric, inserted using its string formatting.
        limit_warn (optional): Warning threshold. Defaults to None.
        limit_crit (optional): Critical threshold. Defaults to None.

    Returns:
        str: The formatted entry, e.g. "'throughput'=500" or
             "'throughput'=500;400;600".
    """
    entry = f"'{key}'={value}"
    limits = (limit_warn, limit_crit)
    if all(limit is None for limit in limits):
        return entry
    threshold_fields = "".join(f";{limit or ''}" for limit in limits)
    return entry + threshold_fields


def bits_per_second(bps: float) -> str:
    """
    Render a bit rate as text with a trailing 'b'.

    Values strictly greater than 1000 are passed through the built-in
    `round()` (yielding an integer) before being rendered; values up to and
    including 1000 are rendered unchanged.

    Args:
        bps (float): The bits-per-second value to format.

    Returns:
        str: The rendered value followed by 'b', e.g. "950.1234b" or "1201b".
    """
    shown = round(bps) if bps > 1000 else bps
    return "{}b".format(shown)


def packets(pkts: int) -> str:
    """
    Render a packet count as text with a trailing 'packets'.

    Args:
        pkts (int): The number of packets.

    Returns:
        str: The count immediately followed by 'packets', e.g. "123packets".
    """
    return "{}packets".format(pkts)


def percent(prcnt: float) -> str:
    """
    Render a number as text with a trailing '%'.

    The value is not scaled or rounded; it is rendered as-is.

    Args:
        prcnt (float): The percentage value to format.

    Returns:
        str: The value immediately followed by '%', e.g. "99.5%".
    """
    return str(prcnt) + "%"


def build_perfdata(options, json_data: dict) -> List[str]:
    """
    Collect the performance data entries for an iperf3 result.

    Entries are produced in this order:
        1. `bps1`, `bps2`, ...: the `sum.bits_per_second` value of every element
           in `json_data["intervals"]`, numbered from 1.
        2. `bps_avg_up` / `bps_avg_down`: average rates with their rate thresholds.
        3. `retrans_sum_up` / `retrans_sum_down`: retransmit counts with the
           retransmission thresholds (skipped when `options.udp` is set).
        4. `local_cpu` / `remote_cpu`: `host_total` and `remote_total` from
           `end.cpu_utilization_percent`.

    The upstream figures come from `end.sum_sent` and are present unless only
    `downstream` is set. The downstream figures come from
    `end.sum_sent_bidir_reverse` in bidirectional mode and from `end.sum_sent`
    in plain downstream mode; otherwise they are omitted.

    Args:
        options: An object containing test configuration and threshold values. Expected attributes:
            - bidir (bool): Whether the test was bidirectional.
            - downstream (bool): Whether to use reverse (download) direction.
            - udp (bool): Whether the test used UDP (retransmissions ignored).
            - rate_up_warn / rate_up_crit (int): Upstream rate thresholds.
            - rate_down_warn / rate_down_crit (int): Downstream rate thresholds.
            - retrans_warn / retrans_crit (int): Retransmission thresholds.
        json_data (dict): Parsed iperf3 output containing measurement results.

    Returns:
        List[str]: Performance data entries as produced by `build_single_perfdata()`.
    """
    entries = []
    for number, interval in enumerate(json_data["intervals"], start=1):
        rate_text = bits_per_second(interval['sum']['bits_per_second'])
        entries.append(build_single_perfdata(f"bps{number}", rate_text))

    summary = json_data['end']
    with_retransmits = not options.udp

    # Select the summary section for each direction (None = direction not measured).
    if options.bidir or not options.downstream:
        up_section = summary['sum_sent']
        rate_up = up_section['bits_per_second']
    else:
        up_section = None
        rate_up = None

    if options.bidir:
        down_section = summary['sum_sent_bidir_reverse']
        rate_down = down_section['bits_per_second']
    elif options.downstream:
        down_section = summary['sum_sent']
        rate_down = down_section['bits_per_second']
    else:
        down_section = None
        rate_down = None

    # Retransmit counters are only looked up for TCP tests.
    lost_up = lost_down = None
    if with_retransmits:
        if up_section is not None:
            lost_up = up_section['retransmits']
        if down_section is not None:
            lost_down = down_section['retransmits']

    if rate_up is not None:
        entries.append(build_single_perfdata(
            "bps_avg_up", bits_per_second(rate_up),
            options.rate_up_warn, options.rate_up_crit))
    if rate_down is not None:
        entries.append(build_single_perfdata(
            "bps_avg_down", bits_per_second(rate_down),
            options.rate_down_warn, options.rate_down_crit))

    if with_retransmits and lost_up is not None:
        entries.append(build_single_perfdata(
            "retrans_sum_up", packets(lost_up),
            options.retrans_warn, options.retrans_crit))
    if with_retransmits and lost_down is not None:
        entries.append(build_single_perfdata(
            "retrans_sum_down", packets(lost_down),
            options.retrans_warn, options.retrans_crit))

    for label, field in (("local_cpu", 'host_total'), ("remote_cpu", 'remote_total')):
        usage = summary['cpu_utilization_percent'][field]
        entries.append(build_single_perfdata(label, percent(usage)))
    return entries

def check_iperf3(options):
    """
    Run the complete check: measure, evaluate, report, exit.

    The steps are performed in this order:
        1. `run_iperf3(options)` executes the test and yields the parsed result.
        2. `determine_result()` turns the result into an exit code and status messages.
        3. `build_perfdata()` produces the performance data entries.
        4. `nagexit()` prints everything and terminates the process.

    Args:
        options: An object containing configuration options for the iperf3 test. It is
            passed unchanged to the helper functions and therefore needs the attributes
            they read:
            - remote (str), port (int): Target iperf3 server.
            - bidir (bool), downstream (bool), udp (bool): Test mode.
            - bytes (str), time (int): Amount of data or duration of the test.
            - rate_up_warn, rate_up_crit, rate_down_warn, rate_down_crit (int):
              Rate thresholds in bits per second.
            - retrans_warn, retrans_crit (int): Retransmission thresholds (TCP only).

    Exits:
        Always exits the program through `nagexit()` (or earlier, if one of the helper
        functions exits on its own) with a Nagios status code.
    """
    results = run_iperf3(options)
    exit_code, messages = determine_result(options, results)
    nagexit(exit_code, messages, build_perfdata(options, results))


if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, fill in the
    # direction-specific rate thresholds and start the check.
    DESC = "%prog is used to run an iperf3 check against a given host."
    parser = OptionParser(description=DESC, version="%prog version 0.2")

    gen_opts = OptionGroup(parser, "Generic options")
    thres_opts = OptionGroup(parser, "Threshold options")
    for group in (gen_opts, thres_opts):
        parser.add_option_group(group)

    # Threshold options all store an integer; only flags, destination,
    # metavar and help text differ. Rows: transfer rate first, then retransmits.
    threshold_specs = (
        (("-w", "--rate-warning"), "rate_warn", "BITS",
         "Defines the transfer rate's warning threshold"),
        (("-c", "--rate-critical"), "rate_crit", "BITS",
         "Defines the transfer rate's critical threshold"),
        (("--rate-up-warning",), "rate_up_warn", "BITS",
         "Defines the upstream rate's warning threshold"),
        (("--rate-up-critical",), "rate_up_crit", "BITS",
         "Defines the upstream rate's critical threshold"),
        (("--rate-down-warning",), "rate_down_warn", "BITS",
         "Defines the downstream rate's warning threshold"),
        (("--rate-down-critical",), "rate_down_crit", "BITS",
         "Defines the downstream rate's critical threshold"),
        (("-W", "--retransmit-warning"), "retrans_warn", "RETRANS",
         "Defines the retransmission warning threshold"),
        (("-C", "--retransmit-critical"), "retrans_crit", "RETRANS",
         "Defines the retransmission critical threshold"),
    )
    for flags, dest, metavar, help_text in threshold_specs:
        thres_opts.add_option(*flags, dest=dest, type="int", metavar=metavar,
                              action="store", help=help_text)

    # Generic options, in the order they appear in the help output.
    generic_specs = (
        (("-r", "--remote"),
         dict(dest="remote", type="string", action="store",
              help="iperf3 server to connect to")),
        (("-p", "--port"),
         dict(dest="port", type="int", action="store", default=5201,
              help="iperf3 server port to connect to [default: %default]")),
        (("-d", "--downstream"),
         dict(dest="downstream", action="store_true", default=False,
              help="measure downstream instead of upstream")),
        (("--bidir",),
         dict(dest="bidir", action="store_true", default=False,
              help="test in both directions (normal and reverse), with both the client and server sending and receiving data simultaneously")),
        (("-u", "--udp"),
         dict(dest="udp", action="store_true", default=False,
              help="use UDP rather than TCP")),
        (("-t", "--time"),
         dict(dest="time", type="int", action="store", default=10,
              help="time in seconds to transmit for [default: %default]")),
        (("-n", "--bytes"),
         dict(dest="bytes", type="string", action="store",
              help="number of bytes to transmit (instead of --time)")),
    )
    for flags, settings in generic_specs:
        gen_opts.add_option(*flags, **settings)

    opts, _positional = parser.parse_args()

    # A target host and a positive duration are mandatory.
    if not (opts.remote and opts.time > 0):
        parser.print_help()
        sys.exit(3)

    # Direction-specific rate thresholds fall back to the generic -w / -c values.
    for direction in ("up", "down"):
        for level in ("warn", "crit"):
            specific = f"rate_{direction}_{level}"
            if not getattr(opts, specific):
                setattr(opts, specific, getattr(opts, f"rate_{level}"))

    check_iperf3(opts)