Skip to content

Commit

Permalink
Lots of configuration for Slurm
Browse files Browse the repository at this point in the history
  • Loading branch information
kincl committed Nov 7, 2023
1 parent cbe08eb commit 3a5c769
Show file tree
Hide file tree
Showing 10 changed files with 121 additions and 12 deletions.
16 changes: 9 additions & 7 deletions slurm/manifests/configs/slurm.conf
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ ProctrackType=proctrack/linuxproc
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
ReturnToService=2
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
Expand Down Expand Up @@ -78,7 +78,7 @@ KillWait=30
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
SlurmdTimeout=5
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
Expand All @@ -90,6 +90,7 @@ Waittime=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres # needed for supporting dynamic nodes
#SelectType=select/cons_res
#
#
# JOB PRIORITY
Expand Down Expand Up @@ -123,10 +124,11 @@ JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=job_container/none
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
SlurmctldDebug=info
JobAcctGatherType=jobacct_gather/linux
SlurmctldDebug=debug
#SlurmctldLogFile=
SlurmdDebug=info
SlurmctldParameters=cloud_reg_addrs
SlurmdDebug=debug
#SlurmdLogFile=
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
Expand All @@ -144,7 +146,7 @@ SlurmdDebug=info
#SuspendRate=
#SuspendTime=
#
#
MaxNodeCount=100 # dynamic nodes
# COMPUTE NODES
MaxNodeCount=100
#NodeName=compute-[0-10] CPUs=2 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
3 changes: 3 additions & 0 deletions slurm/manifests/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ resources:
- serviceaccount.yaml
- clusterrolebinding.yaml
- claim.yaml

- servicemonitor.yaml
- scaledobject.yaml
- serviceaccount-slurm-metrics.yaml

configMapGenerator:
- name: slurm-conf
Expand Down
2 changes: 1 addition & 1 deletion slurm/manifests/namespace.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ kind: Namespace
metadata:
name: slurm-system
labels:
openshift.io/cluster-monitoring: 'true'
openshift.io/cluster-monitoring: 'true'
50 changes: 50 additions & 0 deletions slurm/manifests/scaledobject.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# KEDA ScaledObject: autoscales the `compute` StatefulSet (Slurm compute pods)
# from the `slurm_queue_pending` Prometheus metric, between 0 and 10 replicas.
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  # annotations:
  #   autoscaling.keda.sh/paused-replicas: "0"
  name: scaledobject
spec:
  # Workload whose replica count KEDA manages.
  scaleTargetRef:
    apiVersion: apps/v1
    kind: StatefulSet
    name: compute
  # Seconds to wait after the last active trigger before scaling back to
  # minReplicaCount.
  cooldownPeriod: 60
  maxReplicaCount: 10
  # 0 allows the compute pool to disappear entirely when nothing is pending.
  minReplicaCount: 0
  # If the metric source fails `failureThreshold` consecutive times, hold the
  # target at `replicas` instead of acting on missing data.
  fallback:
    failureThreshold: 3
    replicas: 1
  # Seconds between trigger evaluations.
  pollingInterval: 30
  # advanced:
  #   scalingModifiers:
  #     activationTarget: "2"
  #   horizontalPodAutoscalerConfig:
  #     behavior:
  #       scaleDown:
  #         stabilizationWindowSeconds: 150
  #         policies:
  #           - type: Percent
  #             value: 100
  #             periodSeconds: 15
  triggers:
    - type: prometheus
      metadata:
        # Port 9092 is the namespace-scoped (tenancy) endpoint of the
        # thanos-querier service, so `namespace` below is required.
        serverAddress: https://thanos-querier.openshift-monitoring.svc.cluster.local:9092
        namespace: slurm-system
        query: slurm_queue_pending
        # Quoted on purpose: KEDA trigger metadata values are strings.
        threshold: '1'
        authModes: bearer
      # Supplies the bearer token used to query Thanos.
      authenticationRef:
        name: prom-triggerauthentication
        kind: TriggerAuthentication
---
# KEDA TriggerAuthentication: provides the bearer token that the Prometheus
# trigger presents to thanos-querier.
apiVersion: keda.sh/v1alpha1
kind: TriggerAuthentication
metadata:
  name: prom-triggerauthentication
spec:
  secretTargetRef:
    - parameter: bearerToken
      # NOTE(review): this looks like an auto-generated ServiceAccount token
      # Secret name (random suffix). It will differ per cluster and change if
      # the Secret is regenerated - confirm, or reference a Secret with a
      # stable, explicitly managed name.
      name: slurm-metrics-token-zkqtg
      key: token
3 changes: 1 addition & 2 deletions slurm/manifests/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ spec:
name: slurmctld
- port: 8080
name: scrape
clusterIP: None
selector:
app: slurm
component: head
component: head
38 changes: 38 additions & 0 deletions slurm/manifests/serviceaccount-slurm-metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# ServiceAccount bound to the `thanos-metrics-reader` Role by the RoleBinding
# defined later in this file.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: slurm-metrics
---
# Namespaced Role granting read access to pods and to pod/node resource
# metrics.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: thanos-metrics-reader
rules:
  # Core API group: read individual pods.
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - get
  # Resource-metrics API: read pod and node metrics.
  # NOTE(review): `nodes` is a cluster-scoped resource; a namespaced Role
  # presumably cannot grant access to it - confirm whether it is needed.
  - apiGroups:
      - metrics.k8s.io
    resources:
      - pods
      - nodes
    verbs:
      - get
      - list
      - watch
---
# Binds the `thanos-metrics-reader` Role to the `slurm-metrics` ServiceAccount
# in the slurm-system namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: slurm-metrics
subjects:
  - kind: ServiceAccount
    name: slurm-metrics
    namespace: slurm-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: thanos-metrics-reader
2 changes: 1 addition & 1 deletion slurm/manifests/serviceaccount.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: slurm
name: slurm
3 changes: 2 additions & 1 deletion slurm/manifests/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
# https://docs.openshift.com/container-platform/4.13/monitoring/managing-metrics.html
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: slurm
spec:
endpoints:
- interval: 30s
scrapeTimeout: 30s
port: scrape
scheme: http
selector:
matchLabels:
app: slurm
component: head
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
15 changes: 15 additions & 0 deletions slurm/manifests/statefulset-compute.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Governing Service for the `compute` StatefulSet (referenced there through
# `serviceName: "compute"`). It exposes the slurmd port of the compute pods.
apiVersion: v1
kind: Service
metadata:
  name: compute
  labels:
    app: slurm
    component: compute
spec:
  # Headless: a StatefulSet's governing Service must be headless for the
  # per-pod DNS records (<pod>.compute.<namespace>.svc.cluster.local) to be
  # published, which the `compute.slurm-system.svc.cluster.local` search
  # domain on the head pod relies on.
  clusterIP: None
  ports:
    - port: 6818
      name: slurmd
  selector:
    app: slurm
    component: compute
---
apiVersion: apps/v1
kind: StatefulSet
Expand All @@ -15,6 +29,7 @@ spec:
serviceName: "compute"
replicas: 1
minReadySeconds: 10
terminationGracePeriodSeconds: 60
template:
metadata:
labels:
Expand Down
1 change: 1 addition & 0 deletions slurm/manifests/statefulset-head.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ spec:
dnsConfig:
searches:
- slurm.slurm-system.svc.cluster.local
- compute.slurm-system.svc.cluster.local
enableServiceLinks: false
terminationGracePeriodSeconds: 10
serviceAccountName: slurm
Expand Down

0 comments on commit 3a5c769

Please sign in to comment.