产品咨询及市场合作 400-860-6560
快速筛选产品
智能制造官网
  • 解决方案 +
    • AI 解决方案
    • HPC解决方案
    • 超融合一体机
    • 存储解决方案
    • 液冷解决方案
    • 服务器定制解决方案
  • 产品中心 +
    • 服务器-智慧计算
    • 集群管理软件
    • 深度学习平台
    • 智能制造业务
  • 技术支持 +
    • GPU 深度学习性能
    • 远程测试申请
    • 服务与保修
    • 技术文档
    • FAQs
  • 关于超集 +
    • 公司介绍
    • 新闻中心
    • 资质荣誉
    • 生态合作
    • 联系我们
资料下载
首页 技术文档 单节点的 slurm 安装
  • 修改注册表屏蔽开机磁盘自检
  • GPU burn 测试GPU
  • 单节点的 slurm 安装
  • 处理器(Processor)和散热器(Heatsink)安装
  • AMAX定制系统一键恢复启动盘制作步骤
  • ubuntu 进try ubuntu模式操作

安装环境:Ubuntu 20.04

1、安装 munge

apt install munge

启动 munge 服务

systemctl enable munge

systemctl start munge

2、安装 slurm 及数据库

apt install mysql-server slurm-wlm slurmdbd -y

3、配置数据库

进入 MySQL 命令行(通过 apt 安装后 MySQL 服务通常已自动启动):

mysql

create user 'amax'@'localhost' identified by 'amax@1234';

(amax 为用户名,amax@1234 为密码)

create database slurm_acct_db;

grant all PRIVILEGES on slurm_acct_db.* TO 'amax'@'localhost' with grant option; quit;

systemctl start mysql.service

systemctl enable mysql.service

4、配置 slurmdbd

vim /etc/slurm-llnl/slurmdbd.conf

ArchiveEvents=yes

ArchiveJobs=yes

ArchiveResvs=yes

ArchiveSuspend=no

ArchiveTXN=no

ArchiveUsage=no

#ArchiveDir="/tmp"

ArchiveSteps=yes

#ArchiveScript=

#JobPurge=12

#StepPurge=1

#

# Authentication info

AuthType=auth/munge

AuthInfo=/var/run/munge/munge.socket.2

#

# slurmDBD info

DbdAddr=localhost

DbdHost=localhost

DbdPort=6819

SlurmUser=amax

#MessageTimeout=300

DebugLevel=4

#DefaultQOS=normal,standby

LogFile=/var/log/slurm-llnl/slurmdbd.log

PidFile=/var/run/slurmdbd.pid

#PluginDir=/usr/lib/slurm

#PrivateData=accounts,users,usage,jobs

#TrackWCKey=yes

#

# Database info

StorageType=accounting_storage/mysql

StorageHost=localhost

StoragePort=3306

StoragePass=amax@1234

StorageUser=amax

StorageLoc=slurm_acct_db

PurgeEventAfter=12month

PurgeJobAfter=12month

PurgeResvAfter=2month

PurgeStepAfter=2month

PurgeSuspendAfter=1month

PurgeTXNAfter=12month

PurgeUsageAfter=12month

MaxQueryTimeRange=60-0

启动 slurmdbd 服务:

systemctl start slurmdbd

systemctl enable slurmdbd

5、配置 slurm

mkdir /var/spool/slurmd

mkdir /var/spool/slurmctld

chmod -R 777 /var/spool/slurmd/

chmod -R 777 /var/spool/slurmctld/

vim /etc/slurm-llnl/slurm.conf

# slurm.conf file generated by configurator.html.

# Put this file on all nodes of your cluster.

# See the slurm.conf man page for more information.

#

ClusterName=cluster

SlurmctldHost=localhost

#SlurmctldHost=

#

#DisableRootJobs=NO

#EnforcePartLimits=NO

#Epilog=

#EpilogSlurmctld=

#FirstJobId=1

#MaxJobId=67043328

#GresTypes=

#GroupUpdateForce=0

#GroupUpdateTime=600

#JobFileAppend=0

#JobRequeue=1

#JobSubmitPlugins=lua

#KillOnBadExit=0

#LaunchType=launch/slurm

#Licenses=foo*4,bar

#MailProg=/bin/mail

#MaxJobCount=10000

#MaxStepCount=40000

#MaxTasksPerNode=512

MpiDefault=none

#MpiParams=ports=#-#

#PluginDir=

#PlugStackConfig=

#PrivateData=jobs

ProctrackType=proctrack/cgroup

#Prolog=

#PrologFlags=

#PrologSlurmctld=

#PropagatePrioProcess=0

#PropagateResourceLimits=

#PropagateResourceLimitsExcept=

#RebootProgram=

ReturnToService=1

SlurmctldPidFile=/var/run/slurmctld.pid

SlurmctldPort=6817

SlurmdPidFile=/var/run/slurmd.pid

SlurmdPort=6818

SlurmdSpoolDir=/var/spool/slurmd

SlurmUser=root

#SlurmdUser=root

#SrunEpilog=

#SrunProlog=

StateSaveLocation=/var/spool/slurmctld

SwitchType=switch/none

#TaskEpilog=

#TaskPlugin=task/affinity

#TaskProlog=

#TopologyPlugin=topology/tree

#TmpFS=/tmp

#TrackWCKey=no

#TreeWidth=

#UnkillableStepProgram=

#UsePAM=0

#

#

# TIMERS

#BatchStartTimeout=10

#CompleteWait=0

#EpilogMsgTime=2000

#GetEnvTimeout=2

#HealthCheckInterval=0

#HealthCheckProgram=

InactiveLimit=0

KillWait=30

#MessageTimeout=10

#ResvOverRun=0

MinJobAge=300

#OverTimeLimit=0

SlurmctldTimeout=120

SlurmdTimeout=300

#UnkillableStepTimeout=60

#VSizeFactor=0

Waittime=0

#

#

# SCHEDULING

#DefMemPerCPU=0

#MaxMemPerCPU=0

#SchedulerTimeSlice=30

TaskPlugin=task/cgroup

SchedulerType=sched/backfill

SelectType=select/cons_res

SelectTypeParameters=CR_CPU

#

#

# JOB PRIORITY

#PriorityFlags=

#PriorityType=priority/basic

#PriorityDecayHalfLife=

#PriorityCalcPeriod=

#PriorityFavorSmall=

#PriorityMaxAge=

#PriorityUsageResetPeriod=

#PriorityWeightAge=

#PriorityWeightFairshare=

#PriorityWeightJobSize=

#PriorityWeightPartition=

#PriorityWeightQOS=

#

#

# LOGGING AND ACCOUNTING

#AccountingStorageEnforce=0

#AccountingStorageHost=

AccountingStoragePass=/var/run/munge/munge.socket.2

#AccountingStoragePort=

AccountingStorageType=accounting_storage/slurmdbd

#AccountingStorageUser=

#AccountingStoreFlags=

JobCompHost=localhost

JobCompLoc=slurm_acct_db

JobCompPass=amax@1234

#JobCompPort=

JobCompType=jobcomp/mysql

JobCompUser=amax

#JobContainerType=job_container/none

JobAcctGatherFrequency=30

JobAcctGatherType=jobacct_gather/none

SlurmctldDebug=info

SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log

SlurmdDebug=info

SlurmdLogFile=/var/log/slurm-llnl/slurmd.log

#SlurmSchedLogFile=

#SlurmSchedLogLevel=

#DebugFlags=

#

#

# POWER SAVE SUPPORT FOR IDLE NODES (optional)

#SuspendProgram=

#ResumeProgram=

#SuspendTimeout=

#ResumeTimeout=

#ResumeRate=

#SuspendExcNodes=

#SuspendExcParts=

#SuspendRate=

#SuspendTime=

#

#

# COMPUTE NODES

NodeName=amax CPUs=2 RealMemory=1941

PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP

(说明:NodeName 一行中的 CPUs、RealMemory 等节点参数可通过 lscpu 或 slurmd -C 查看;本说明不要写入配置文件)

6、配置 gres

vim /etc/slurm-llnl/gres.conf

# This section of this file was automatically generated by cmd. Do not edit manually!

# BEGIN AUTOGENERATED SECTION -- DO NOT REMOVE

# No gres config specified

# END AUTOGENERATED SECTION -- DO NOT REMOVE

NodeName=amax Name=gpu File=/dev/nvidia[0-3]

7、配置 cgroup

vim /etc/slurm-llnl/cgroup.conf

# This section of this file was automatically generated by cmd. Do not edit manually!

# BEGIN AUTOGENERATED SECTION -- DO NOT REMOVE

CgroupMountpoint="/sys/fs/cgroup"

CgroupAutomount=no

TaskAffinity=no

ConstrainCores=yes

ConstrainRAMSpace=no

ConstrainSwapSpace=no

ConstrainDevices=yes

ConstrainKmemSpace=yes

AllowedRamSpace=100.00

AllowedSwapSpace=0.00

MinKmemSpace=30

MaxKmemPercent=100.00

MaxRAMPercent=100.00

MaxSwapPercent=100.00

MinRAMSpace=30

# END AUTOGENERATED SECTION -- DO NOT REMOVE

8、启动服务并设置开机自启:

systemctl start slurmctld

systemctl enable slurmctld

systemctl start slurmd

systemctl enable slurmd

9、查看服务状态:

systemctl status slurmctld

systemctl status slurmd

systemctl status slurmdbd

10、测试命令(例如):

sinfo

srun hostname

  • 400-860-6560
  • QQ咨询
  • 在线留言
  • 主要方案▼
    AI解决方案 HPC解决方案 存储解决方案 服务器定制解决方案
  • 主推产品▼
    服务器-智慧计算 产品中心 集群管理软件 深度学习平台
  • 技术支持▼
    远程测试申请 技术文档 合作伙伴 服务与保修
  • 关于AMAX ▼
    公司介绍 联系我们 资质荣誉 新闻中心
AMAX全球网址
400-860-6560
  • AMAX 官方淘宝
Copyright© 2000 - 2023 版权所有 苏州超集信息科技有限公司 苏 ICP 备 11056665 号-2 隐私条款 使用条款