If you used our script (see the last reference) to deploy Condor, then you need to modify the local configuration file (condor_config.local) on the master and on each worker node as follows:
Master:
Worker Nodes:
In addition, I attach a submission file for running parallel jobs.
should_transfer_files = Yes
when_to_transfer_output = ON_EXIT_OR_EVICT
universe = parallel
executable = /bin/hostname
+ParallelShutdownPolicy = "WAIT_FOR_ALL"
machine_count = 3
log = hostLog
Output = hostOut.$(Node).$(Process)
Error = hostErr.$(Node).$(Process)
queue
The line +ParallelShutdownPolicy = "WAIT_FOR_ALL" (shown in boldface in the original post) makes Condor wait until every node in the parallel job has completed before it considers the job finished.
References
Master:
UNUSED_CLAIM_TIMEOUT = 0
MPI_CONDOR_RSH_PATH = $(LIBEXEC)
ALTERNATE_STARTER_2 = $(SBIN)/condor_starter
STARTER_2_IS_DC = TRUE
SHADOW_MPI = $(SBIN)/condor_shadow
MPI_CONDOR_RSH_PATH = $(LIBEXEC)
ALTERNATE_STARTER_2 = $(SBIN)/condor_starter
STARTER_2_IS_DC = TRUE
SHADOW_MPI = $(SBIN)/condor_shadow
Worker Nodes:
DedicatedScheduler = "DedicatedScheduler@PASTE_SUBMIT_NODE_NAME_HERE"
STARTD_ATTRS = $(STARTD_ATTRS), DedicatedScheduler
SUSPEND = False
CONTINUE = True
PREEMPT = False
KILL = False
WANT_SUSPEND = False
WANT_VACATE = False
RANK = Scheduler =?= $(DedicatedScheduler)
MPI_CONDOR_RSH_PATH = $(LIBEXEC)
CONDOR_SSHD = /usr/sbin/sshd
CONDOR_SSH_KEYGEN = /usr/bin/ssh-keygen
STARTD_EXPRS = $(STARTD_EXPRS), DedicatedScheduler
STARTD_ATTRS = $(STARTD_ATTRS), DedicatedScheduler
SUSPEND = False
CONTINUE = True
PREEMPT = False
KILL = False
WANT_SUSPEND = False
WANT_VACATE = False
RANK = Scheduler =?= $(DedicatedScheduler)
MPI_CONDOR_RSH_PATH = $(LIBEXEC)
CONDOR_SSHD = /usr/sbin/sshd
CONDOR_SSH_KEYGEN = /usr/bin/ssh-keygen
STARTD_EXPRS = $(STARTD_EXPRS), DedicatedScheduler
# Dedicated Node.
START = TRUE
In addition, I attach a submission file for running parallel jobs.
should_transfer_files = Yes
when_to_transfer_output = ON_EXIT_OR_EVICT
universe = parallel
executable = /bin/hostname
+ParallelShutdownPolicy = "WAIT_FOR_ALL"
machine_count = 3
log = hostLog
Output = hostOut.$(Node).$(Process)
Error = hostErr.$(Node).$(Process)
queue
The line +ParallelShutdownPolicy = "WAIT_FOR_ALL" (shown in boldface in the original post) makes Condor wait until every node in the parallel job has completed before it considers the job finished.
References
No comments:
Post a Comment