Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
openSUSE:Slowroll:Base:1
golang-github-vpenso-prometheus_slurm_exporter
Adjust-GPU-data-gathering-to-work-with-all-Slur...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File Adjust-GPU-data-gathering-to-work-with-all-Slurm-versions-since-18.08.patch of Package golang-github-vpenso-prometheus_slurm_exporter
From: Egbert Eich <eich@suse.com> Date: Tue Jun 27 13:09:51 2023 +0200 Subject: Adjust GPU data gathering to work with all Slurm versions since 18.08 Patch-mainline: Not yet Git-commit: dad0d76a32c784e658573a0025d423df610ab9e1 References: These changes have been ported from the development branch of https://github.com/vpenso/prometheus-slurm-exporter. Signed-off-by: Egbert Eich <eich@suse.com> Signed-off-by: Egbert Eich <eich@suse.de> --- gpus.go | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++++------------ node.go | 19 ++++---- 2 files changed, 147 insertions(+), 41 deletions(-) diff --git a/gpus.go b/gpus.go index ca3bcaf..7db5ab3 100644 --- a/gpus.go +++ b/gpus.go @@ -1,4 +1,4 @@ -/* Copyright 2020 Joeri Hermans, Victor Penso, Matteo Dessalvi +/* Copyright 2022 Joeri Hermans, Victor Penso, Matteo Dessalvi, Iztok Lebar Bajec This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,17 +16,19 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */ package main import ( - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/common/log" - "io/ioutil" "os/exec" - "strings" + "regexp" "strconv" + "strings" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" ) type GPUsMetrics struct { alloc float64 idle float64 + other float64 total float64 utilization float64 } @@ -35,6 +37,11 @@ func GPUsGetMetrics() *GPUsMetrics { return ParseGPUsMetrics() } +/* TODO: + sinfo has gresUSED since slurm>=19.05.0rc01 https://github.com/SchedMD/slurm/blob/master/NEWS + revert to old process on slurm<19.05.0rc01 + --format=AllocGRES will return gres/gpu=8 + --format=AllocTRES will return billing=16,cpu=16,gres/gpu=8,mem=256G,node=1 func ParseAllocatedGPUs() float64 { var num_gpus = 0.0 @@ -53,21 +60,106 @@ func ParseAllocatedGPUs() float64 { return num_gpus } +*/ -func ParseTotalGPUs() float64 { +func ParseAllocatedGPUs(data []byte) float64 { var num_gpus = 0.0 + // sinfo -a -h --Format="Nodes: ,GresUsed:" --state=allocated + // 3 gpu:2 # slurm>=20.11.8 + // 1 gpu:(null):3(IDX:0-7) # slurm 21.08.5 + // 13 gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3) # slurm 21.08.5 + + sinfo_lines := string(data) + re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`) + if len(sinfo_lines) > 0 { + for _, line := range strings.Split(sinfo_lines, "\n") { + // log.info(line) + if len(line) > 0 && strings.Contains(line, "gpu:") { + nodes := strings.Fields(line)[0] + num_nodes, _ := strconv.ParseFloat(nodes, 64) + node_active_gpus := strings.Fields(line)[1] + num_node_active_gpus := 0.0 + for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") { + if strings.Contains(node_active_gpus_type, "gpu:") { + node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2] + num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64) + num_node_active_gpus += num_node_active_gpus_type + } + } + num_gpus += num_nodes * num_node_active_gpus + } + } + } - args := []string{"-h", "-o \"%n %G\""} - output := string(Execute("sinfo", args)) - if len(output) > 0 { - for _, line := range strings.Split(output, "\n") { - if len(line) > 0 { - line = strings.Trim(line, "\"") - descriptor := strings.Fields(line)[1] - descriptor = strings.TrimPrefix(descriptor, "gpu:") - descriptor = strings.Split(descriptor, "(")[0] - node_gpus, _ := strconv.ParseFloat(descriptor, 64) - num_gpus += node_gpus + return num_gpus +} + +func ParseIdleGPUs(data []byte) float64 { + var num_gpus = 0.0 + // sinfo -a -h --Format="Nodes: ,Gres: ,GresUsed:" --state=idle,allocated + // 3 gpu:4 gpu:2 # slurm 20.11.8 + // 1 gpu:8(S:0-1) gpu:(null):3(IDX:0-7) # slurm 21.08.5 + // 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3) # slurm 21.08.5 + + sinfo_lines := string(data) + re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`) + if len(sinfo_lines) > 0 { + for _, line := range strings.Split(sinfo_lines, "\n") { + // log.info(line) + if len(line) > 0 && strings.Contains(line, "gpu:") { + nodes := strings.Fields(line)[0] + num_nodes, _ := strconv.ParseFloat(nodes, 64) + node_gpus := strings.Fields(line)[1] + num_node_gpus := 0.0 + for _, node_gpus_type := range strings.Split(node_gpus, ",") { + if strings.Contains(node_gpus_type, "gpu:") { + node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2] + num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64) + num_node_gpus += num_node_gpus_type + } + } + num_node_active_gpus := 0.0 + node_active_gpus := strings.Fields(line)[2] + for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") { + if strings.Contains(node_active_gpus_type, "gpu:") { + node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2] + num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64) + num_node_active_gpus += num_node_active_gpus_type + } + } + num_gpus += num_nodes * (num_node_gpus - num_node_active_gpus) + } + } + } + + return num_gpus +} + +func ParseTotalGPUs(data []byte) float64 { + var num_gpus = 0.0 + // sinfo -a -h --Format="Nodes: ,Gres:" + // 3 gpu:4 # slurm 20.11.8 + // 1 gpu:8(S:0-1) # slurm 21.08.5 + // 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) # slurm 21.08.5 + + sinfo_lines := string(data) + re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`) + if len(sinfo_lines) > 0 { + for _, line := range strings.Split(sinfo_lines, "\n") { + // log.Info(line) + if len(line) > 0 && strings.Contains(line, "gpu:") { + nodes := strings.Fields(line)[0] + num_nodes, _ := strconv.ParseFloat(nodes, 64) + node_gpus := strings.Fields(line)[1] + num_node_gpus := 0.0 + for _, node_gpus_type := range strings.Split(node_gpus, ",") { + if strings.Contains(node_gpus_type, "gpu:") { + node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2] + num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64) + num_node_gpus += num_node_gpus_type + } + } + num_gpus += num_nodes * num_node_gpus } } } @@ -77,29 +169,40 @@ func ParseTotalGPUs() float64 { func ParseGPUsMetrics() *GPUsMetrics { var gm GPUsMetrics - total_gpus := ParseTotalGPUs() - allocated_gpus := ParseAllocatedGPUs() + total_gpus := ParseTotalGPUs(TotalGPUsData()) + allocated_gpus := ParseAllocatedGPUs(AllocatedGPUsData()) + idle_gpus := ParseIdleGPUs(IdleGPUsData()) + other_gpus := total_gpus - allocated_gpus - idle_gpus gm.alloc = allocated_gpus - gm.idle = total_gpus - allocated_gpus + gm.idle = idle_gpus + gm.other = other_gpus gm.total = total_gpus gm.utilization = allocated_gpus / total_gpus return &gm } +func AllocatedGPUsData() []byte { + args := []string{"-a", "-h", "--Format=Nodes: ,GresUsed:", "--state=allocated"} + return Execute("sinfo", args) +} + +func IdleGPUsData() []byte { + args := []string{"-a", "-h", "--Format=Nodes: ,Gres: ,GresUsed:", "--state=idle,allocated"} + return Execute("sinfo", args) +} + +func TotalGPUsData() []byte { + args := []string{"-a", "-h", "--Format=Nodes: ,Gres:"} + return Execute("sinfo", args) +} + // Execute the sinfo command and return its output func Execute(command string, arguments []string) []byte { cmd := exec.Command(command, arguments...) - stdout, err := cmd.StdoutPipe() + out, err := cmd.CombinedOutput() if err != nil { log.Fatal(err) } - if err := cmd.Start(); err != nil { - log.Fatal(err) - } - out, _ := ioutil.ReadAll(stdout) - if err := cmd.Wait(); err != nil { - log.Fatal(err) - } return out } @@ -111,9 +214,10 @@ func Execute(command string, arguments []string) []byte { func NewGPUsCollector() *GPUsCollector { return &GPUsCollector{ - alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil), - idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil), - total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil), + alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil), + idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil), + other: prometheus.NewDesc("slurm_gpus_other", "Other GPUs", nil, nil), + total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil), utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil), } } @@ -121,6 +225,7 @@ func NewGPUsCollector() *GPUsCollector { type GPUsCollector struct { alloc *prometheus.Desc idle *prometheus.Desc + other *prometheus.Desc total *prometheus.Desc utilization *prometheus.Desc } @@ -129,6 +234,7 @@ type GPUsCollector struct { func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) { ch <- cc.alloc ch <- cc.idle + ch <- cc.other ch <- cc.total ch <- cc.utilization } @@ -136,6 +242,7 @@ func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) { cm := GPUsGetMetrics() ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc) ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle) + ch <- prometheus.MustNewConstMetric(cc.other, prometheus.GaugeValue, cm.other) ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total) ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm.utilization) } diff --git a/node.go b/node.go index bf2f759..ae7dc90 100644 --- a/node.go +++ b/node.go @@ -27,12 +27,12 @@ import ( // NodeMetrics stores metrics for each node type NodeMetrics struct { - memAlloc uint64 - memTotal uint64 - cpuAlloc uint64 - cpuIdle uint64 - cpuOther uint64 - cpuTotal uint64 + memAlloc uint64 + memTotal uint64 + cpuAlloc uint64 + cpuIdle uint64 + cpuOther uint64 + cpuTotal uint64 nodeStatus string } @@ -60,7 +60,6 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics { memAlloc, _ := strconv.ParseUint(node[1], 10, 64) memTotal, _ := strconv.ParseUint(node[2], 10, 64) - cpuInfo := strings.Split(node[3], "/") cpuAlloc, _ := strconv.ParseUint(cpuInfo[0], 10, 64) cpuIdle, _ := strconv.ParseUint(cpuInfo[1], 10, 64) @@ -82,7 +81,7 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics { // NodeData executes the sinfo command to get data for each node // It returns the output of the sinfo command func NodeData() []byte { - cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong") + cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong:") out, err := cmd.Output() if err != nil { log.Fatal(err) @@ -102,7 +101,7 @@ type NodeCollector struct { // NewNodeCollector creates a Prometheus collector to keep all our stats in // It returns a set of collections for consumption func NewNodeCollector() *NodeCollector { - labels := []string{"node","status"} + labels := []string{"node", "status"} return &NodeCollector{ cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil), @@ -128,7 +127,7 @@ func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) { nodes := NodeGetMetrics() for node := range nodes { ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(nodes[node].cpuAlloc), node, nodes[node].nodeStatus) - ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus) + ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus) ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(nodes[node].cpuOther), node, nodes[node].nodeStatus) ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nodes[node].cpuTotal), node, nodes[node].nodeStatus) ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nodes[node].memAlloc), node, nodes[node].nodeStatus)
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor