[araim1@tara-fe1 ~]$ salloc -n1 srun -n1 -N1 --pty --preserve-env $SHELL
salloc: Granted job allocation 102560
[araim1@n1 ~]$ squeue
JOBID PARTITION NAME USER ST TIME NODES QOS NODELIST(REASON)
102560 develop srun araim1 R 0:59 1 normal n1
[araim1@n1 ~]$
[araim1@tara-fe1 ~]$ salloc -n1 srun -n1 -N1 --pty --preserve-env $SHELL
salloc: Granted job allocation 102560
[araim1@n1 ~]$ matlab
...
[araim1@n1 ~]$ R
...
[araim1@n1 ~]$ exit
salloc: Relinquishing job allocation 102560
Note that the usual limitations on memory, time usage, etc. are in effect for interactive jobs.
[araim1@tara-fe1 ~]$ salloc --partition=batch -n1 srun -n1 -N1 --pty --preserve-env $SHELL
[araim1@tara-fe1 ~]$ salloc --time 00:25:00 -n1 srun -n1 -N1 --pty --preserve-env $SHELL
[araim1@tara-fe1 ~]$ salloc --exclusive -n1 srun -n1 -N1 --pty --preserve-env $SHELL
[araim1@tara-fe1 ~]$ squeue
JOBID PARTITION NAME USER ST TIME NODES QOS NODELIST(REASON)
59512 develop test-job araim1 R 0:01 2 normal n[1-2]
[araim1@tara-fe1 ~]$
[araim1@tara-fe1 ~]$ srun --jobid 59512 hostname
n1
n2
[araim1@tara-fe1 ~]$
A more useful application would be to run a "ps" on each node, showing important information about our processes such as status, memory and CPU usage, etc. Note that in the output below, some lines were very long so they have been truncated.
[araim1@maya-usr1 ~]$ srun --jobid 59512 ps u -u araim1
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
araim1 12400 0.0 0.0 40748 1852 ? S 14:05 0:00 /usr/cluster/openmpi/1.3.3-p1/gcc/4/bin/orted ...
araim1 12402 99.2 0.2 381676 50368 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12403 99.2 0.2 360664 51112 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12404 99.2 0.2 315640 49884 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12405 99.1 0.2 381180 49920 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12406 99.2 0.2 381340 50064 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12407 99.2 0.2 315640 49908 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12408 99.2 0.2 315804 50040 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12409 99.1 0.2 315804 50052 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12512 0.0 0.0 63760 920 ? R 14:08 0:00 /bin/ps u -u araim1
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
araim1 31143 0.0 0.0 63868 1084 ? S 14:05 0:00 /bin/bash /var/spool/slurmd/job59512/slurm_script ...
araim1 31145 0.0 0.0 49388 2080 ? S 14:05 0:00 mpirun -np 1 R --no-save
araim1 31146 75.5 1.3 592456 324488 ? RLl 14:05 1:59 /usr/lib64/R/bin/exec/R --no-save
araim1 31153 0.0 0.0 89188 4120 ? Sl 14:05 0:00 srun --nodes=1 --ntasks=1 --kill-on-bad-exit ...
araim1 31157 78.7 0.2 315624 49892 ? RLl 14:05 2:04 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31158 99.2 0.2 315628 49888 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31159 59.7 0.2 315624 49900 ? RLl 14:05 1:34 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31160 99.1 0.2 315792 50040 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31161 86.0 0.2 315628 49892 ? RLl 14:05 2:15 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31162 99.2 0.2 360816 51252 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31163 98.2 0.2 381332 50064 ? RLl 14:05 2:35 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31165 99.2 0.2 381668 50352 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31274 0.0 0.0 63748 912 ? R 14:08 0:00 /bin/ps u -u araim1
#!/bin/bash
#
# diagnostics.bash - report the calling user's processes on the current node.
#
# Prints a banner line naming this host, followed by the output of
# "ps u -u <user>". The report is first assembled in a temporary file and
# then written to stdout with a single cat, as the original script did
# (presumably so one node's report is less likely to be interleaved with
# another's when several copies run at once -- TODO confirm).
#
# Usage: ./diagnostics.bash

# Print the separator banner for the host named in $1.
print_banner() {
  printf '%s\n' "----------------- $1 -------------------"
}

# Build the report in a private temporary file and emit it on stdout.
# Returns non-zero if the temporary file cannot be created.
main() {
  local node user

  # Use a private name rather than overwriting bash's own HOSTNAME variable;
  # fall back to that variable when hostname(1) is unavailable.
  node=$(hostname 2>/dev/null) || node=${HOSTNAME:-unknown}
  # USER is not guaranteed to be set in every environment.
  user=${USER:-$(id -un)}

  # mktemp creates an unpredictable, exclusively owned file instead of a
  # guessable name in the current directory. The variable is deliberately
  # global so the EXIT trap can still see it after main returns.
  report_file=$(mktemp "${TMPDIR:-/tmp}/cmd-${node}.XXXXXX") || {
    printf 'diagnostics: cannot create temporary file\n' >&2
    return 1
  }
  # Remove the file when the script exits, even if a later step fails.
  trap 'rm -f -- "$report_file"' EXIT

  print_banner "$node" >"$report_file"
  ps u -u "$user" >>"$report_file"
  cat -- "$report_file"
}

main "$@"
[araim1@maya-usr1 ~]$ srun --jobid 59512 ./diagnostics.bash
----------------- n2 -------------------
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
araim1 12400 0.0 0.0 40748 1852 ? S 14:05 0:00 /usr/cluster/openmpi/1.3.3-p1/gcc/4/bin/orted ...
araim1 12402 99.2 0.2 381676 50368 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12403 99.2 0.2 360664 51112 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12404 99.2 0.2 315640 49884 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12405 99.1 0.2 381180 49920 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12406 99.2 0.2 381340 50064 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12407 99.2 0.2 315640 49908 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12408 99.2 0.2 315804 50040 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12409 99.1 0.2 315804 50052 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 12512 0.0 0.0 63760 920 ? R 14:08 0:00 /bin/ps u -u araim1
----------------- n1 -------------------
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
araim1 31143 0.0 0.0 63868 1084 ? S 14:05 0:00 /bin/bash /var/spool/slurmd/job59512/slurm_script ...
araim1 31145 0.0 0.0 49388 2080 ? S 14:05 0:00 mpirun -np 1 R --no-save
araim1 31146 75.5 1.3 592456 324488 ? RLl 14:05 1:59 /usr/lib64/R/bin/exec/R --no-save
araim1 31153 0.0 0.0 89188 4120 ? Sl 14:05 0:00 srun --nodes=1 --ntasks=1 --kill-on-bad-exit ...
araim1 31157 78.7 0.2 315624 49892 ? RLl 14:05 2:04 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31158 99.2 0.2 315628 49888 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31159 59.7 0.2 315624 49900 ? RLl 14:05 1:34 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31160 99.1 0.2 315792 50040 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31161 86.0 0.2 315628 49892 ? RLl 14:05 2:15 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31162 99.2 0.2 360816 51252 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31163 98.2 0.2 381332 50064 ? RLl 14:05 2:35 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31165 99.2 0.2 381668 50352 ? RLl 14:05 2:36 /usr/lib64/R/bin/exec/R --slave --no-restore ...
araim1 31274 0.0 0.0 63748 912 ? R 14:08 0:00 /bin/ps u -u araim1
[araim1@maya-usr1 ~]$