#!/bin/bash
#SBATCH --job-name=cnc_dist
#SBATCH --output=slurm.out
#SBATCH --error=slurm.err
#SBATCH --partition=batch
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#
# Launches one master process plus one client per scheduled task over sockets.
#
# Flow:
#   1. Start the master ($CMD) in the background, teeing its output to
#      master.log.
#   2. Wait until the first line of master.log is complete and take its 7th
#      whitespace-separated field as the contact string for the clients.
#   3. Start SLURM_NPROCS clients with srun, each with its own
#      CNC_SOCKET_CLIENT_ID, then wait for all background jobs.
#
# Environment:
#   SLURM_NPROCS     number of clients to start (SLURM_NTASKS is used if unset)
#   CONTACT_TIMEOUT  optional; seconds to wait for the contact string
#                    (default 60)

set -u

# User should set this, and hopefully everything else will work
CMD="./hello 64 8"

# -----------------------------------------

readonly MASTER_LOG="master.log"
CONTACT_TIMEOUT=${CONTACT_TIMEOUT:-60}

# Print a message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the 7th whitespace-separated field of the first line of file $1.
# Returns non-zero while that line is missing, not yet newline-terminated
# (i.e. possibly still being written), or has fewer than 7 fields.
read_contact_string() {
  local first_line
  local -a fields=()
  # `read` fails if it hits EOF before a newline, so a half-written line
  # is never parsed.
  IFS= read -r first_line < "$1" || return 1
  read -r -a fields <<< "$first_line"
  [[ -n "${fields[6]:-}" ]] || return 1
  printf '%s\n' "${fields[6]}"
}

# The client count drives both CNC_SOCKET_HOST and the launch loop, so refuse
# to continue without a usable value.
num_clients=${SLURM_NPROCS:-${SLURM_NTASKS:-}}
[[ "$num_clients" =~ ^[1-9][0-9]*$ ]] \
  || die "SLURM_NPROCS must be a positive integer (got '${num_clients}')"
[[ "$CONTACT_TIMEOUT" =~ ^[0-9]+$ ]] \
  || die "CONTACT_TIMEOUT must be a non-negative integer (got '${CONTACT_TIMEOUT}')"

# Split CMD into words exactly once (no glob expansion) so the same argument
# vector is used for the master and for every client.
read -r -a cmd_args <<< "$CMD"
(( ${#cmd_args[@]} > 0 )) || die "CMD is empty"

export CNC_SOCKET_HOST=$num_clients
export CNC_SOCKETS_START_CLIENTS_IN_ORDER=1
export CNC_SOCKET_HOST_EXECUTABLE=$CMD

# Empty the log before the master starts: a master.log left over from an
# earlier run must never be mistaken for this run's output.
: > "$MASTER_LOG" || die "cannot write $MASTER_LOG"

# Start the master process in the background.
# Split stdout/stderr to the logfile; it is searched below to find the
# connection string for the clients.
"${cmd_args[@]}" 2>&1 | tee "$MASTER_LOG" &
# $! is the PID of the last pipeline stage (tee), which ends once the master
# has closed its output.
master_pid=$!

# Poll for the connection string instead of sleeping a fixed amount of time.
contact_string=""
deadline=$(( SECONDS + CONTACT_TIMEOUT ))
until contact_string=$(read_contact_string "$MASTER_LOG"); do
  if ! kill -0 "$master_pid" 2>/dev/null; then
    # The pipeline is gone; look once more in case the line landed just
    # before it exited.
    contact_string=$(read_contact_string "$MASTER_LOG") && break
    die "master exited before printing a contact string (see $MASTER_LOG)"
  fi
  (( SECONDS < deadline )) \
    || die "no contact string in $MASTER_LOG after ${CONTACT_TIMEOUT}s"
  sleep 1
done

# Now launch the clients using srun.
# These two settings are identical for every client, so export them once.
export CNC_SOCKET_CLIENT=$contact_string
export CNC_CLIENTS=SOCKET

for (( i = 1; i <= num_clients; i++ )); do
  export CNC_SOCKET_CLIENT_ID=$i

  # Notice that we also need to run the clients in the background.
  # The master will wait until we spawned off CNC_SOCKET_HOST of these clients
  # before it can begin any actual work.
  printf "Starting client %03d with SOCKETs: %s\n" "$i" "$CNC_SOCKET_CLIENT"
  srun -N1 -n1 --distribution=cyclic "${cmd_args[@]}" &
done

# Block until the master pipeline and all clients have finished.
wait