#!/usr/bin/bash
#-
# Copyright (c) 2025 Red Hat, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Written by Mikolaj Izdebski <mizdebsk@redhat.com>
# Written by Robert Foss <rfoss@redhat.com>
# Abort on any failing command and on expansion of an unset variable.
set -o errexit -o nounset
# globstar: '**' matches across directory levels.
shopt -s globstar
# nullglob: a glob without matches expands to nothing.
shopt -s nullglob

# Name used in help texts and error messages, and the packaged version string.
PROGNAME='rhel-drivers'
VERSION='20251004-1.el10_1'

# Print the name this script was invoked as, without leading directories.
progname() {
    basename "$0"
}
# Print "<PROGNAME>: <message>" on stderr and terminate with exit status 1.
# Arguments: the message words, joined with single spaces.
die() {
    printf '%s: %s\n' "${PROGNAME}" "$*" >&2
    exit 1
}

# Output verbosity flags: "verbose" gates log(), "quiet" silences info().
verbose=0
quiet=0

# Print an informational message on stderr unless quiet mode is active.
# Globals:   quiet (read)
# Arguments: the message words, joined with single spaces.
info() {
    if ! [ "$quiet" -eq 1 ]; then
        printf '%s\n' "$*" >&2
    fi
}

# Print a diagnostic message on stderr, but only in verbose mode.
# Globals:   verbose (read)
# Arguments: the message words, joined with single spaces.
log() {
    if [ "$verbose" -eq 1 ]; then
        printf '%s\n' "$*" >&2
    fi
}

# Print the top-level help (global options and subcommands) on stdout.
# Globals: PROGNAME (read)
usage() {
    local prog=${PROGNAME}
    cat <<USAGE
${prog} - install hardware drivers

Usage:
  ${prog} [GLOBAL OPTIONS] <subcommand> [ARGS...]

Global options:
  --help            Show this help and exit
  --version         Show version and exit
  --verbose         Increase verbosity
  --quiet           Suppress non-error output

Subcommands:
  install           Install drivers
  remove            Remove drivers
  list              List drivers

Run "${prog} <subcommand> --help" for subcommand help.

USAGE
}

# Print the help text of the 'install' subcommand on stdout.
# Globals: PROGNAME (read)
help_install() {
    local prog=${PROGNAME}
    cat <<HELP
${prog} install - install hardware drivers

Usage:
  ${prog} install [OPTIONS] [ARGS...]

Options:
  --auto-detect     Auto-detect drivers to install
  --dry-run         Show what would happen, don't change anything
  --force           Force install (ignore checks)
  --help            Show this help for 'install' and exit

Arguments:
  Zero or more driver identifiers.

HELP
}

# Print the help text of the 'remove' subcommand on stdout.
# Globals: PROGNAME (read)
help_remove() {
    local prog=${PROGNAME}
    cat <<HELP
${prog} remove - remove installed hardware drivers

Usage:
  ${prog} remove [OPTIONS] [ARGS...]

Options:
  --dry-run         Show what would happen, don't change anything
  --all             Remove all installed drivers
  --help            Show this help for 'remove' and exit

Arguments:
  Zero or more driver identifiers.

HELP
}

# Print the help text of the 'list' subcommand on stdout.
# Globals: PROGNAME (read)
help_list() {
    local prog=${PROGNAME}
    cat <<HELP
${prog} list - list available or installed hardware drivers

Usage:
  ${prog} list [OPTIONS]

Options:
  --available       List available drivers (default)
  --installed       List installed drivers
  --help            Show this help for 'list' and exit

Arguments:
  None.

HELP
}

# -------- global option parsing (long options only) --------
# Consume leading global options. Parsing stops at "--" (which is dropped)
# or at the first word that is not a "--" option; that word is the subcommand.
while (($# > 0)); do
    case "$1" in
    --verbose) verbose=1 ;;
    --quiet) quiet=1 ;;
    --help)
        usage
        exit 0
        ;;
    --version)
        echo "${PROGNAME} $VERSION"
        exit 0
        ;;
    --)
        # end of global options
        shift
        break
        ;;
    --*) die "Unknown global option: $1" ;;
    *) break ;; # first non-option = subcommand
    esac
    # Only the flag arms fall through to here; every other arm exits or breaks.
    shift
done

if (($# == 0)); then
    usage
    die "No subcommand specified"
fi

# The remaining positional parameters are the subcommand's own arguments.
subcmd=$1
shift

# -------- functions --------
# Report whether a repository is enabled in a yum/dnf .repo file.
# Only the "enabled" key inside the [<id>] section itself is considered, so
# keys such as "enabled_metadata" and the settings of other sections are
# never mistaken for it.
# Arguments: $1 - repository id (section name without the brackets)
#            $2 - path to the .repo file
# Returns:   0 if the section contains "enabled = 1"; 1 otherwise (section
#            missing, key missing, or any other value)
_repo_is_enabled() {
    awk -v section="[$1]" '
        {
            line = $0
            gsub(/^[ \t]+|[ \t]+$/, "", line)
        }
        # Every section header starts a new scope.
        line ~ /^\[.*\]$/ {
            active = (line == section)
            next
        }
        active && line ~ /^enabled[ \t]*=/ {
            sub(/^enabled[ \t]*=[ \t]*/, "", line)
            found = (line == "1")
            exit
        }
        END { exit(found ? 0 : 1) }
    ' "$2"
}

# Make sure the supplementary and extensions repositories are enabled.
# When subscription-manager is not installed, only a warning is printed.
# Otherwise each repository that is not already enabled in
# /etc/yum.repos.d/redhat.repo is enabled through subscription-manager.
# Globals:   none written
# Returns:   0 on success; dies if the RHEL version cannot be determined or a
#            repository cannot be enabled
verify_repos() {
    local repo_file=/etc/yum.repos.d/redhat.repo
    local release machine supplementary_repo extensions_repo repo

    log "[verify_repos] Checking repository status"

    if ! type -P subscription-manager >/dev/null 2>&1; then
        info "Warning: Subscription Manager is absent."
        info "You may need to enable appropriate repositories yourself."
        return 0
    fi

    # Major RHEL version: VERSION_ID="10.1" -> 10
    release=$(awk -F= '$1 == "VERSION_ID" { gsub(/"/, "", $2); print int($2); exit }' /etc/os-release) ||
        die "Cannot read /etc/os-release"
    [ -n "$release" ] || die "Cannot determine the RHEL major version from /etc/os-release"
    machine=$(arch)

    # Define required repositories
    supplementary_repo="rhel-${release}-for-${machine}-supplementary-rpms"
    extensions_repo="rhel-${release}-for-${machine}-extensions-rpms"

    log "[verify_repos] Required repositories: $supplementary_repo, $extensions_repo"

    for repo in "$supplementary_repo" "$extensions_repo"; do
        if [ -f "$repo_file" ] && _repo_is_enabled "$repo" "$repo_file"; then
            log "[verify_repos] Repository already enabled: $repo"
        else
            info "Enabling repository: $repo"
            subscription-manager repos --enable="$repo" || die "Failed to enable $repo"
        fi
    done

    log "[verify_repos] Repository verification complete"
}

# Detect hardware for which a driver can be installed.
# Outputs:   the driver identifiers reported by the detectors (stdout)
# Returns:   0 if at least one driver was detected, 1 otherwise
autodetect() {
    # Kept local so repeated calls do not leak state into the global scope.
    local drivers_found
    # A detector that finds nothing returns non-zero; that is not an error here.
    drivers_found=$(autodetect_nvidia || true)
    log "[autodetect] Found the following hardware: $drivers_found"
    [ -n "$drivers_found" ] || return 1
    printf '%s\n' "$drivers_found"
}

# Detect an NVIDIA GPU that is listed with the "kernelopen" feature in the
# supported-gpus JSON file.
# Relies on the file-level "shopt -s globstar nullglob" for the '**' scan.
# Globals:   __rhel_drivers_modalias_path (read, optional)  - directory tree
#              scanned for "modalias" files instead of /sys/devices
#            __rhel_drivers_supported_gpus (read, optional) - JSON file used
#              instead of the installed supported-gpus.json
# Outputs:   "nvidia" on stdout when a supported GPU is present
# Returns:   0 if a supported GPU was found, 1 otherwise; dies if the JSON
#            file is missing or cannot be parsed
autodetect_nvidia() {
    local pci_class_display="03"
    local nvidia_vendor="10de"
    # Allow overriding the data file and /sys/devices for testing
    local supported_gpus="${__rhel_drivers_supported_gpus:-/usr/share/rhel-drivers/nvidia/supported-gpus.json}"
    local modalias_path="${__rhel_drivers_modalias_path:-/sys/devices}"
    # PCI modalias with fixed-width hex fields. The field separators ("d",
    # "bc", ...) are themselves hex digits, so variable-width matching would
    # split IDs such as 2d04 in the wrong place.
    local modalias_re='^pci:v([0-9a-f]{8})d([0-9a-f]{8})sv[0-9a-f]{8}sd[0-9a-f]{8}bc([0-9a-f]{2})sc[0-9a-f]{2}i[0-9a-f]{2}$'
    local mapping devid name modalias_file modalias vendor device baseclass
    local -A gpus=()

    if [ ! -e "$supported_gpus" ]; then
        die "Can't find $supported_gpus"
    fi

    # One "<devid><TAB><name>" line per chip; the data is read as text and
    # never evaluated as shell code.
    mapping=$(jq -r '.chips[] | select(.features | index("kernelopen")) | "\(.devid | sub("^0x"; "") | ascii_downcase)\t\(.name)"' "$supported_gpus") ||
        die "Can't parse $supported_gpus"
    while IFS=$'\t' read -r devid name; do
        if [ -n "$devid" ]; then
            gpus[$devid]=$name
        fi
    done <<<"$mapping"

    for modalias_file in "$modalias_path"/**/modalias; do
        modalias=""
        # read reports failure when the final newline is missing but still
        # stores the line; an unreadable file leaves the value empty.
        IFS= read -r modalias <"$modalias_file" || true
        modalias=${modalias,,}
        [[ "$modalias" =~ $modalias_re ]] || continue

        # IDs are 16 bits wide; keep the low four hex digits.
        vendor=${BASH_REMATCH[1]: -4}
        device=${BASH_REMATCH[2]: -4}
        baseclass=${BASH_REMATCH[3]}

        [ "$baseclass" = "$pci_class_display" ] || continue
        [ "$vendor" = "$nvidia_vendor" ] || continue

        if [[ -n "${gpus[$device]+set}" ]]; then
            log "Found GPU: ${gpus[$device]}"
            echo "nvidia"
            return 0
        fi
    done
    return 1
}

# Install the NVIDIA driver together with the CUDA packages.
# Globals:   driver_avail (read) - available drivers, one "nvidia:<version>"
#                                  entry per line
#            dry_run (read)      - non-zero: report the command, do not run it
# Arguments: $1 - "nvidia" (newest available version) or "nvidia:<version>"
#                 (must be listed in driver_avail)
# Returns:   exit status of dnf; dies if the requested version is not available
install_nvidia() {
    local driver=$1
    # Used when neither the argument nor driver_avail yields a version.
    local fallback_version="580.95.05"
    local latest version
    local -a packages

    # Version-aware sort: 580.105.08 is newer than 580.95.05.
    latest=$(tr ' ' '\n' <<<"$driver_avail" | sort -V -r | head -n 1)
    version=$latest
    log "[install_nvidia] Latest driver version: $latest"

    case "$driver" in
    nvidia:*)
        log "[install_nvidia] Requested driver: $driver"
        if ! grep -Fxq -- "$driver" <<<"$driver_avail"; then
            die "No such driver: $driver"
        fi
        version=$driver
        ;;
    esac
    version=${version#nvidia:}
    version=${version:-$fallback_version}

    # An array keeps every package name a separate word; no eval required.
    packages=(
        cublasmp
        cuda-compat
        cuda-toolkit
        cudnn
        dnf-plugin-nvidia
        libnccl-devel
        libnccl-static
        "nvidia-driver-${version}"
        nvidia-driver-cuda
        nvlink5
    )

    log "Installing NVIDIA + CUDA driver: $version"
    if [ "${dry_run:-0}" -ne 0 ]; then
        info "Dry run: dnf install ${packages[*]}"
        return 0
    fi
    dnf install "${packages[@]}"
}

# Remove the NVIDIA driver together with the CUDA packages.
# Globals:   dry_run (read, optional) - non-zero: report the command, do not
#                                       run it
# Arguments: $1 - driver version to remove, with or without the "nvidia:"
#                 prefix
# Returns:   exit status of dnf (0 in dry-run mode)
remove_nvidia() {
    local version=${1#nvidia:}
    # Used when the caller passes an empty version.
    local fallback_version="580.95.05"
    local -a packages

    # An array keeps every package name a separate word; no eval required.
    packages=(
        cublasmp
        cuda-compat
        cuda-toolkit
        cudnn
        dnf-plugin-nvidia
        libnccl-devel
        libnccl-static
        "nvidia-driver-${version:-$fallback_version}"
        nvidia-driver-cuda
        nvlink5
    )

    log "Remove: dnf remove ${packages[*]}"
    if [ "${dry_run:-0}" -ne 0 ]; then
        info "Dry run: dnf remove ${packages[*]}"
        return 0
    fi
    dnf remove "${packages[@]}"
}

# -------- subcommands --------
# 'install' subcommand: install the named drivers or the auto-detected ones.
# Options are parsed and validated before anything is changed, so --help,
# invalid invocations and --dry-run never modify repository configuration.
# Arguments: [--auto-detect] [--dry-run] [--force] [--] [DRIVER...]
# Returns:   status of the last driver installation; dies on invalid usage,
#            when auto-detection finds nothing, or on an unknown driver
cmd_install() {
    # dry_run is read by install_nvidia through dynamic scoping.
    local auto_detect=0 dry_run=0 force=0
    local detected driver
    local -a drivers=()

    # parse install options
    while [ "$#" -gt 0 ]; do
        case "$1" in
        --help)
            help_install
            exit 0
            ;;
        --auto-detect)
            auto_detect=1
            shift
            ;;
        --dry-run)
            dry_run=1
            shift
            ;;
        --force)
            force=1
            shift
            ;;
        --)
            shift
            break
            ;;
        --*) die "Unknown option for 'install': $1" ;;
        *) break ;;
        esac
    done

    log "[install] auto_detect=$auto_detect dry_run=$dry_run force=$force"
    log "[install] drivers: [$*]"

    if [ "$#" -eq 0 ] && [ "$auto_detect" -eq 0 ]; then
        die "Not specified what to install"
    fi
    if [ "$#" -gt 0 ] && [ "$auto_detect" -ne 0 ]; then
        die "Both autodetect and something to install"
    fi

    # Verify and enable required repositories; a dry run must not change
    # the system, so it skips this step.
    if [ "$dry_run" -eq 0 ]; then
        verify_repos
    else
        log "[install] Dry run: skipping repository verification"
    fi

    if [ "$auto_detect" -ne 0 ]; then
        detected=$(autodetect) || die "No compatible hardware found!"
        # Split on whitespace; read returns non-zero at end of input.
        read -r -d '' -a drivers <<<"$detected" || true
    else
        drivers=("$@")
    fi

    log "[install] Installing the following drivers: ${drivers[*]}"
    for driver in "${drivers[@]}"; do
        case "$driver" in
        nvidia*) install_nvidia "$driver" ;;
        *) die "Unknown driver: $driver" ;;
        esac
    done
}

# 'remove' subcommand: remove the named, currently installed drivers.
# Globals:   driver_inst (read) - installed driver as "nvidia:<version>", or
#                                 empty when none is installed
# Arguments: [--dry-run] [--all] [--] [DRIVER...]
# Returns:   status of the last removal; dies on invalid usage, on --all (not
#            implemented), on a driver that is not installed, or on an
#            unknown driver
cmd_remove() {
    # Locals stay visible to remove_nvidia through dynamic scoping without
    # clobbering same-named globals.
    local dry_run=0 all=0
    local version

    while [ "$#" -gt 0 ]; do
        case "$1" in
        --help)
            help_remove
            exit 0
            ;;
        --dry-run)
            dry_run=1
            shift
            ;;
        --all)
            all=1
            shift
            ;;
        --)
            shift
            break
            ;;
        --*) die "Unknown option for 'remove': $1" ;;
        *) break ;;
        esac
    done

    if [ "$#" -eq 0 ] && [ "$all" -eq 0 ]; then
        die "Not specified what to remove"
    fi
    if [ "$#" -gt 0 ] && [ "$all" -ne 0 ]; then
        die "Both all and something to remove"
    fi
    if [ "$all" -ne 0 ]; then
        die "Removal of all drivers is not implemented yet"
    fi

    while [ "$#" -gt 0 ]; do
        case "$1" in
        nvidia:*)
            # Only the exact installed "nvidia:<version>" may be removed.
            if ! grep -Fxq -- "$1" <<<"$driver_inst"; then
                die "Driver not installed: $1"
            fi
            version=${1#nvidia:}
            remove_nvidia "$version"
            ;;
        *) die "Unknown driver: $1" ;;
        esac
        shift
    done
}

# 'list' subcommand: show available and/or installed drivers.
# In the "available" listing each driver is prefixed by two marker columns:
# '*' when it is the installed driver and '>' when matching hardware was
# auto-detected.
# Globals:   driver_avail (read) - available drivers, one per line
#            driver_inst (read)  - installed driver, or empty
# Arguments: [--available] [--installed]
# Outputs:   the listings, written through info()
# Returns:   0 on success; dies on unknown options or extra arguments
cmd_list() {
    local opt_available=0 opt_installed=0
    local autodetected driver mark_installed mark_autodetect marked_avail

    while [ "$#" -gt 0 ]; do
        case "$1" in
        --help)
            help_list
            exit 0
            ;;
        --available)
            opt_available=1
            shift
            ;;
        --installed)
            opt_installed=1
            shift
            ;;
        --)
            shift
            break
            ;;
        --*) die "Unknown option for 'list': $1" ;;
        *) break ;;
        esac
    done

    [ "$#" -eq 0 ] || die "'list' does not take arguments"

    # Verify and enable required repositories, but only once the request is
    # known to be valid, so --help and usage errors change nothing.
    verify_repos

    log "[list] available=$opt_available installed=$opt_installed"

    if [ "$opt_available" -eq 0 ] && [ "$opt_installed" -eq 0 ]; then
        opt_available=1
    fi

    if [ "$opt_available" -eq 1 ]; then
        # Check which drivers have autodetected hardware support
        autodetected=$(autodetect 2>/dev/null || true)
        if [ -n "$autodetected" ]; then
            log "[list] Autodetected hardware: $autodetected"
        fi

        # Mark drivers: * = installed, > = autodetected hardware
        marked_avail=""
        while IFS= read -r driver; do
            [ -n "$driver" ] || continue

            log "[list] Comparing driver='$driver' with driver_inst='$driver_inst'"
            mark_installed=" "
            if [ -n "$driver_inst" ] && [ "$driver" = "$driver_inst" ]; then
                mark_installed="*"
            fi
            # Compare on the driver family, i.e. the part before ':'.
            mark_autodetect=" "
            if grep -qw -- "${driver%%:*}" <<<"$autodetected"; then
                mark_autodetect=">"
            fi

            marked_avail+="${mark_installed}${mark_autodetect} ${driver}"$'\n'
        done <<<"$driver_avail"

        info "Available drivers:
${marked_avail%$'\n'}"
    fi

    if [ "$opt_installed" -eq 1 ]; then
        info "Installed drivers:
$driver_inst"
    fi
}

# Installed driver as "nvidia:<version>", or empty when the nvidia-driver
# package is not installed (rpm -q then exits non-zero).
if ! driver_inst="nvidia:$(rpm -q --qf '%{version}' nvidia-driver 2>/dev/null)"; then
    driver_inst=""
    log "Driver is currently NOT installed"
else
    log "Currently installed driver version: $driver_inst"
fi

# Available drivers, one "nvidia:<version>" per line, newest first.
# NOTE(review): this runs before the subcommand is dispatched, i.e. before
# verify_repos has had a chance to enable any repository.
driver_avail=$(dnf -q repoquery --qf 'nvidia:%{version}\n' nvidia-driver | sort -V -r)
log "Available driver versions:
$driver_avail"

# -------- dispatch --------
# Route to the subcommand handler; "$@" holds the subcommand's own arguments.
case "$subcmd" in
--help | help)
    usage
    exit 0
    ;;
--version | version)
    echo "${PROGNAME} $VERSION"
    exit 0
    ;;
list | ls)
    cmd_list "$@"
    ;;
install | in)
    cmd_install "$@"
    ;;
remove | rm)
    cmd_remove "$@"
    ;;
*)
    # Anything else: show the help text, then fail.
    usage
    die "Unknown subcommand: $subcmd"
    ;;
esac
