Skip to content

util

check_if_valid_host(queuetype, debug, hosttype=HostType.SUBMIT)

Checks if the current host is a submit host.

Uses the qconf program to determine if the current host is a submit host.

For SLURM, the check is performed by running the sinfo program.

Parameters:

Name Type Description Default
queuetype QueueType

What queue type to check.

required
debug bool

Used in development settings. SubmitHosts can be added in the enum.py file for debugging.

required

Raises:

Type Description
CalledProcessError

When there is a problem with the qconf program.

NotASubmitHost

When the current host is not a submit host.

NotImplementedError

When the queuetype is not supported.

Returns:

Type Description
bool

True for success or False for failure

Source code in hpcman/hpcman/queue/util.py
def check_if_valid_host(queuetype: QueueType, debug: bool, hosttype: HostType = HostType.SUBMIT) -> bool:
    """Checks if the current host is a valid host for the given queue type.

    For SGE, runs the `qconf` program (`-ss` by default, `-sh` when `hosttype` is `HostType.ADMIN`)
    and looks for the current hostname in its output. Hostnames listed in `SLURMHosts` are accepted
    as well.

    For SLURM, runs the `sinfo` program and treats a non-zero exit status as "not a submit host".

    Args:
        queuetype: What queue type to check.
        debug: Used in development settings. Loads the `.env` file and reads `SUBMIT_HOST` from the
            environment instead of querying the scheduler.
        hosttype: Which kind of host to look up. Only consulted when `queuetype` is SGE.

    Raises:
        CalledProcessError: When the `qconf` program exits with a non-zero status.
        NotASubmitHost: When the current host is not a submit host.
        SinfoMissingError: When the `sinfo` program cannot be found.
        NotImplementedError: When the queuetype or hosttype is not supported.

    Returns:
        True for success or False for failure (False is only returned for a failed SGE admin host check).
    """
    hostname = platform.node()
    if debug:
        load_dotenv(override=True)
        # NOTE(review): this value is never compared against hostname, so debug mode always
        # falls through to `return True` - confirm that is intended.
        submit_hosts: list[str] = [str(environ.get("SUBMIT_HOST"))]
    else:
        if queuetype is QueueType.SGE:
            opt = "-sh" if hosttype is HostType.ADMIN else "-ss"
            try:
                # check=True is what makes a failing qconf raise CalledProcessError; without it the
                # handler below can never run and a failure looks like an empty host list.
                result = subprocess.run(["qconf", opt], capture_output=True, text=True, check=True)
            except subprocess.CalledProcessError as e:
                print(f"There was a problem checking the {hosttype.value} host: {e.stderr}")
                raise
            submit_hosts = result.stdout.split()
            if hostname not in submit_hosts and hostname not in [el.value for el in SLURMHosts]:
                if hosttype is HostType.SUBMIT:
                    raise NotASubmitHost(f"{hostname} is not a submit host. Try again on shell.cqls.oregonstate.edu.")
                elif hosttype is HostType.ADMIN:
                    print(f"Unable to check SGE queues from {hostname}. Try on shell-hpc.cqls.oregonstate.edu")
                    return False
                else:
                    raise NotImplementedError(f"Cannot find hosts for {hosttype.value}")
        elif queuetype is QueueType.SLURM:
            try:
                # Only the exit status matters here; the captured output is discarded.
                subprocess.run(["sinfo"], text=True, check=True, capture_output=True)
            except subprocess.CalledProcessError as e:
                raise NotASubmitHost(
                    f"{hostname} is not a submit host. Try again on shell-hpc.cqls.oregonstate.edu."
                ) from e
            except FileNotFoundError as e:
                # Chain the original error so the missing executable stays visible in the traceback.
                raise SinfoMissingError(
                    "sinfo does not appear to be in your $PATH. Check your $PATH variable and try again."
                ) from e
        else:
            raise NotImplementedError(f"Cannot find submit host for {queuetype}")
    return True

check_valid_slurm_association(query, assoc_type='users', debug=False)

Compares provided user or account to list of valid Slurm users or accounts

Suggested use is checking for True/False then raising InvalidSlurmUser or InvalidSlurmAccount

Also populates global ASSOCIATIONS variable that can be reused.

Source code in hpcman/hpcman/queue/util.py
def check_valid_slurm_association(query: str, assoc_type: Literal["users", "accounts"] = "users", debug: bool = False) -> bool:
    """Checks a user or account name against the known Slurm users or accounts.

    The global ASSOCIATIONS variable is filled on first use via `get_valid_slurm_associations()`
    and reused afterwards. When the name is unknown, suggestions are printed through
    `give_suggestions`.

    Suggested use is checking for True/False then raising InvalidSlurmUser or InvalidSlurmAccount

    Args:
        query: User or account name to look up.
        assoc_type: Which attribute of ASSOCIATIONS to search, "users" or "accounts".
        debug: Not used by this function.

    Returns:
        True if `query` is a known entry of the selected type, otherwise False.
    """
    global ASSOCIATIONS
    if ASSOCIATIONS is None:
        ASSOCIATIONS = get_valid_slurm_associations()

    known = list(getattr(ASSOCIATIONS, assoc_type))
    if query not in known:
        # Dropping the trailing "s" turns "users"/"accounts" into the singular label.
        give_suggestions(query=query, choices=known, suggestion_type=f"Slurm {assoc_type[:-1]}")
        return False
    return True

check_valid_slurm_node(query)

Compares provided node to list of valid Slurm nodes.

Populates the global NODES variable.

Source code in hpcman/hpcman/queue/util.py
def check_valid_slurm_node(query: str) -> bool:
    """Compares provided node to list of valid Slurm nodes.

    Calls `load_nodes()` first, which populates the global NODES variable if needed. When the
    node is unknown, suggestions are printed through `give_suggestions`.

    Args:
        query: Node name to look up.

    Returns:
        True if `query` is a key of NODES, otherwise False.
    """
    load_nodes()

    # Fall back to an empty mapping so `choices` is always bound, even if NODES is still None.
    choices = NODES if is_nodes_set(NODES) else {}

    if query in choices:
        return True
    give_suggestions(query=query, choices=list(choices), suggestion_type="Slurm node")
    return False

check_valid_slurm_partition(query)

Compares provided partition to list of valid Slurm partitions.

Populates the global PARTITIONS variable.

Source code in hpcman/hpcman/queue/util.py
def check_valid_slurm_partition(query: str) -> bool:
    """Compares provided partition to list of valid Slurm partitions.

    Calls `load_partitions()` first, which populates the global PARTITIONS variable if needed.
    When the partition is unknown, suggestions are printed through `give_suggestions`.

    Args:
        query: Partition name to look up.

    Returns:
        True if `query` is a key of PARTITIONS, otherwise False.
    """
    load_partitions()

    # Fall back to an empty mapping so `choices` is always bound, even if PARTITIONS is still None.
    choices = PARTITIONS if is_partitions_set(PARTITIONS) else {}

    if query in choices:
        return True
    give_suggestions(query=query, choices=list(choices), suggestion_type="Slurm partition")
    return False

get_allowed_accounts_per_partition()

Uses pyslurm to get allowed accounts per partition

Source code in hpcman/hpcman/queue/util.py
def get_allowed_accounts_per_partition() -> dict[str, set[str]]:
    """Uses pyslurm to get allowed accounts per partition

    Calls `load_partitions()` first, which populates the global PARTITIONS variable if needed.

    Returns:
        A dict mapping each partition's name to the set of its allowed accounts.

    Raises:
        SystemExit: With exit code 1 when PARTITIONS is still None after loading.
    """
    load_partitions()
    if is_partitions_set(PARTITIONS):
        return {part.name: set(part.allowed_accounts) for part in PARTITIONS.values()}

    rprint("Unable to load partition data. Exiting.")
    # Raise SystemExit directly; the bare exit() helper is injected by the site module and is
    # not guaranteed to exist in every interpreter configuration.
    raise SystemExit(1)

get_job_info(job_id)

Get job info from Slurm using pyslurm

Source code in hpcman/hpcman/queue/util.py
def get_job_info(job_id: int) -> SlurmJob | None:
    """Looks up a single Slurm job through pyslurm.

    Args:
        job_id: ID of the job to load.

    Returns:
        The loaded job, or None when pyslurm raises an RPCError (the error is printed).
    """
    try:
        loaded: SlurmJob = pyslurm.Job.load(job_id)
    except pyslurm.RPCError as e:
        rprint(f"Unable to get job info for {job_id}: {e}")
        return None
    return loaded

get_valid_slurm_associations()

Loads associations in pyslurm to get a list of valid users and accounts

Source code in hpcman/hpcman/queue/util.py
def get_valid_slurm_associations() -> SlurmAssociations:
    """Builds the collection of valid Slurm users and accounts from pyslurm associations.

    Associations without a user are recorded as accounts. All other associations are folded
    into one entry per user, collecting the user's accounts and default account.

    Returns:
        A SlurmAssociations built from the users and accounts found.

    Raises:
        SlurmDBError: When neither users nor accounts were found.
    """
    users: dict[str, SlurmUser] = {}
    accounts: dict[str, SlurmAccount] = {}

    for assoc in pyslurm.db.Associations.load().values():
        if assoc.user is not None:
            # Reuse the entry from an earlier association of the same user, if there is one.
            member = users.get(assoc.user)
            if member is None:
                member = SlurmUser(name=assoc.user)
            member.accounts.add(assoc.account)
            if assoc.is_default:
                member.default_acct = assoc.account
            users[member.name] = member
            continue

        # No user on the association: record it as an account.
        account = SlurmAccount(assoc.account, assoc.parent_account)
        accounts[account.account] = account

    if not (users or accounts):
        raise SlurmDBError("Unable to load Slurm association data.")
    return SlurmAssociations(users, accounts)

give_suggestions(query, choices, suggestion_type, cutoff_min=50, cutoff_suggest=85)

Provide suggestions from list using rapidfuzz

Source code in hpcman/hpcman/queue/util.py
def give_suggestions(query: str, choices: list[str], suggestion_type: str, cutoff_min: int = 50, cutoff_suggest: int = 85) -> None:
    """Prints a hint for an invalid value, based on fuzzy matches from rapidfuzz.

    Args:
        query: The invalid value that was provided.
        choices: The valid values to match against.
        suggestion_type: Label for the kind of value, used in the printed message.
        cutoff_min: Score cutoff passed to rapidfuzz when extracting matches.
        cutoff_suggest: Score the first match must exceed to be offered as a single "Did you mean".
    """
    import rapidfuzz as rf

    matches = rf.process.extract(query, choices, scorer=rf.fuzz.QRatio, score_cutoff=cutoff_min)

    # A first match scoring above the cutoff is offered on its own.
    if matches and matches[0][1] > cutoff_suggest:
        suggest = matches[0]
        rprint(f"Provided {suggestion_type} '{query}' is invalid. Did you mean '{suggest[0]}'?")
        return

    if len(matches) <= 1:
        # With zero or one weak match, show every valid choice instead.
        suggests = choices
    else:
        suggests = [match[0] for match in matches[:5]]
    rprint(f"Provided {suggestion_type} '{query}' is invalid. Choose from this list and try again: {suggests}")

is_nodes_set(value)

Sets node type for type checker

Source code in hpcman/hpcman/queue/util.py
def is_nodes_set(value: dict[str, Node] | None) -> TypeGuard[dict[str, Node]]:
    """Sets node type for type checker"""
    return value is not None

is_partitions_set(value)

Sets partition type for type checker

Source code in hpcman/hpcman/queue/util.py
def is_partitions_set(value: dict[str, Partition] | None) -> TypeGuard[dict[str, Partition]]:
    """Sets partition type for type checker"""
    return value is not None

load_nodes()

Loads nodes into global NODES variable if needed.

Source code in hpcman/hpcman/queue/util.py
def load_nodes() -> None:
    """Loads nodes into global NODES variable if needed.

    Nothing is reloaded when NODES has already been populated.
    """
    global NODES
    if NODES is None:
        NODES = dict(pyslurm.Nodes.load())

load_partitions()

Loads partitions into global PARTITIONS variable if needed.

Source code in hpcman/hpcman/queue/util.py
def load_partitions() -> None:
    """Fills the global PARTITIONS variable from pyslurm on first use.

    Later calls return immediately once PARTITIONS holds a value.
    """
    global PARTITIONS
    if PARTITIONS is not None:
        return
    PARTITIONS = dict(pyslurm.Partitions.load())