Rmpi
From FarmShare
m (mktmp and awk were on the same line. Added a newline) |
|||
Line 14: | Line 14: | ||
#$ -j y | #$ -j y | ||
#$ -S /bin/bash | #$ -S /bin/bash | ||
- | #$ -N | + | #$ -N rmpi64 |
- | #$ -pe orte | + | #$ -pe orte 64 |
tmphosts=`mktemp` | tmphosts=`mktemp` | ||
Line 93: | Line 93: | ||
<br> Here is a very short R program. Save this as Rmpitest.R | <br> Here is a very short R program. Save this as Rmpitest.R | ||
<pre># Tell all slaves to return a message identifying themselves | <pre># Tell all slaves to return a message identifying themselves | ||
- | + | mpi.remote.exec(paste(Sys.info()[c("nodename")],"checking in as",mpi.comm.rank(),"of",mpi.comm.size())) | |
- | mpi.remote.exec(paste(" | + | |
# Tell all slaves to close down, and exit the program | # Tell all slaves to close down, and exit the program | ||
- | |||
mpi.close.Rslaves() | mpi.close.Rslaves() | ||
mpi.quit() | mpi.quit() | ||
Line 103: | Line 101: | ||
== running == | == running == | ||
- | If you look at the first few lines of rmpi.submit you will see -pe orte | + | If you look at the first few lines of rmpi.submit you will see -pe orte 64. This tells grid engine to allocate 64 slots, across any hosts. The driver R script (Rmpitest.R, above) is executed on the master (rank 0) process and the slaves all wait for instructions. |
+ | |||
+ | To see it in action, submit the job: | ||
- | + | <br> | |
<pre>bishopj@corn14:/mnt/glusterfs/bishopj$ qsub rmpi.submit | <pre>bishopj@corn14:/mnt/glusterfs/bishopj$ qsub rmpi.submit | ||
Your job 81714 ("rmpi32") has been submitted | Your job 81714 ("rmpi32") has been submitted | ||
bishopj@corn14:/mnt/glusterfs/bishopj$ cat rmpi32.o81714 | bishopj@corn14:/mnt/glusterfs/bishopj$ cat rmpi32.o81714 | ||
- | Got | + | Got 64 slots |
jobid 81714 | jobid 81714 | ||
- | master (rank 0 , comm 1) of size | + | master (rank 0 , comm 1) of size 64 is running on: barley19 |
- | slave1 (rank 1 , comm 1) of size | + | slave1 (rank 1 , comm 1) of size 64 is running on: barley19 |
- | slave2 (rank 2 , comm 1) of size | + | slave2 (rank 2 , comm 1) of size 64 is running on: barley19 |
- | slave3 (rank 3 , comm 1) of size | + | slave3 (rank 3 , comm 1) of size 64 is running on: barley19 |
... ... ... | ... ... ... | ||
- | + | slave62 (rank 62, comm 1) of size 64 is running on: barley16 | |
- | + | slave63 (rank 63, comm 1) of size 64 is running on: barley16 | |
> | > | ||
> # Tell all slaves to return a message identifying themselves | > # Tell all slaves to return a message identifying themselves | ||
- | > mpi.remote.exec(paste(" | + | > mpi.remote.exec(paste(Sys.info()[c("nodename")],"checking in as",mpi.comm.rank(),"of",mpi.comm.size())) |
$slave1 | $slave1 | ||
- | [1] " | + | [1] "barley19.stanford.edu checking in as 1 of 64" |
$slave2 | $slave2 | ||
- | [1] " | + | [1] "barley19.stanford.edu checking in as 2 of 64" |
$slave3 | $slave3 | ||
- | [1] " | + | [1] "barley19.stanford.edu checking in as 3 of 64" |
$slave4 | $slave4 | ||
- | [1] " | + | [1] "barley13.stanford.edu checking in as 4 of 64" |
$slave5 | $slave5 | ||
- | [1] " | + | [1] "barley13.stanford.edu checking in as 5 of 64" |
$slave6 | $slave6 | ||
- | [1] " | + | [1] "barley13.stanford.edu checking in as 6 of 64" |
$slave7 | $slave7 | ||
- | [1] " | + | [1] "barley13.stanford.edu checking in as 7 of 64" |
$slave8 | $slave8 | ||
- | [1] " | + | [1] "barley17.stanford.edu checking in as 8 of 64" |
$slave9 | $slave9 | ||
- | [1] " | + | [1] "barley17.stanford.edu checking in as 9 of 64" |
$slave10 | $slave10 | ||
- | [1] " | + | [1] "barley17.stanford.edu checking in as 10 of 64" |
$slave11 | $slave11 | ||
- | [1] " | + | [1] "barley17.stanford.edu checking in as 11 of 64" |
$slave12 | $slave12 | ||
- | [1] " | + | [1] "barley12.stanford.edu checking in as 12 of 64" |
$slave13 | $slave13 | ||
- | [1] " | + | [1] "barley12.stanford.edu checking in as 13 of 64" |
$slave14 | $slave14 | ||
- | [1] " | + | [1] "barley12.stanford.edu checking in as 14 of 64" |
$slave15 | $slave15 | ||
- | [1] " | + | [1] "barley12.stanford.edu checking in as 15 of 64" |
$slave16 | $slave16 | ||
- | [1] " | + | [1] "barley06.stanford.edu checking in as 16 of 64" |
$slave17 | $slave17 | ||
- | [1] " | + | [1] "barley06.stanford.edu checking in as 17 of 64" |
$slave18 | $slave18 | ||
- | [1] " | + | [1] "barley06.stanford.edu checking in as 18 of 64" |
$slave19 | $slave19 | ||
- | [1] " | + | [1] "barley06.stanford.edu checking in as 19 of 64" |
$slave20 | $slave20 | ||
- | [1] " | + | [1] "barley10.stanford.edu checking in as 20 of 64" |
$slave21 | $slave21 | ||
- | [1] " | + | [1] "barley10.stanford.edu checking in as 21 of 64" |
$slave22 | $slave22 | ||
- | [1] " | + | [1] "barley10.stanford.edu checking in as 22 of 64" |
$slave23 | $slave23 | ||
- | [1] " | + | [1] "barley10.stanford.edu checking in as 23 of 64" |
$slave24 | $slave24 | ||
- | [1] " | + | [1] "barley08.stanford.edu checking in as 24 of 64" |
$slave25 | $slave25 | ||
- | [1] " | + | [1] "barley08.stanford.edu checking in as 25 of 64" |
$slave26 | $slave26 | ||
- | [1] " | + | [1] "barley08.stanford.edu checking in as 26 of 64" |
$slave27 | $slave27 | ||
- | [1] " | + | [1] "barley08.stanford.edu checking in as 27 of 64" |
$slave28 | $slave28 | ||
- | [1] " | + | [1] "barley07.stanford.edu checking in as 28 of 64" |
$slave29 | $slave29 | ||
- | [1] " | + | [1] "barley07.stanford.edu checking in as 29 of 64" |
$slave30 | $slave30 | ||
- | [1] " | + | [1] "barley07.stanford.edu checking in as 30 of 64" |
$slave31 | $slave31 | ||
- | [1] " | + | [1] "barley07.stanford.edu checking in as 31 of 64" |
+ | |||
+ | $slave32 | ||
+ | [1] "barley04.stanford.edu checking in as 32 of 64" | ||
+ | |||
+ | $slave33 | ||
+ | [1] "barley04.stanford.edu checking in as 33 of 64" | ||
+ | |||
+ | $slave34 | ||
+ | [1] "barley04.stanford.edu checking in as 34 of 64" | ||
+ | |||
+ | $slave35 | ||
+ | [1] "barley04.stanford.edu checking in as 35 of 64" | ||
+ | |||
+ | $slave36 | ||
+ | [1] "barley09.stanford.edu checking in as 36 of 64" | ||
+ | |||
+ | $slave37 | ||
+ | [1] "barley09.stanford.edu checking in as 37 of 64" | ||
+ | |||
+ | $slave38 | ||
+ | [1] "barley09.stanford.edu checking in as 38 of 64" | ||
+ | |||
+ | $slave39 | ||
+ | [1] "barley09.stanford.edu checking in as 39 of 64" | ||
+ | |||
+ | $slave40 | ||
+ | [1] "barley11.stanford.edu checking in as 40 of 64" | ||
+ | |||
+ | $slave41 | ||
+ | [1] "barley11.stanford.edu checking in as 41 of 64" | ||
+ | |||
+ | $slave42 | ||
+ | [1] "barley11.stanford.edu checking in as 42 of 64" | ||
+ | |||
+ | $slave43 | ||
+ | [1] "barley18.stanford.edu checking in as 43 of 64" | ||
+ | |||
+ | $slave44 | ||
+ | [1] "barley18.stanford.edu checking in as 44 of 64" | ||
+ | |||
+ | $slave45 | ||
+ | [1] "barley18.stanford.edu checking in as 45 of 64" | ||
+ | |||
+ | $slave46 | ||
+ | [1] "barley03.stanford.edu checking in as 46 of 64" | ||
+ | |||
+ | $slave47 | ||
+ | [1] "barley03.stanford.edu checking in as 47 of 64" | ||
+ | |||
+ | $slave48 | ||
+ | [1] "barley03.stanford.edu checking in as 48 of 64" | ||
+ | |||
+ | $slave49 | ||
+ | [1] "barley15.stanford.edu checking in as 49 of 64" | ||
+ | |||
+ | $slave50 | ||
+ | [1] "barley15.stanford.edu checking in as 50 of 64" | ||
+ | |||
+ | $slave51 | ||
+ | [1] "barley15.stanford.edu checking in as 51 of 64" | ||
+ | |||
+ | $slave52 | ||
+ | [1] "barley20.stanford.edu checking in as 52 of 64" | ||
+ | |||
+ | $slave53 | ||
+ | [1] "barley20.stanford.edu checking in as 53 of 64" | ||
+ | |||
+ | $slave54 | ||
+ | [1] "barley20.stanford.edu checking in as 54 of 64" | ||
+ | |||
+ | $slave55 | ||
+ | [1] "barley14.stanford.edu checking in as 55 of 64" | ||
+ | |||
+ | $slave56 | ||
+ | [1] "barley14.stanford.edu checking in as 56 of 64" | ||
+ | |||
+ | $slave57 | ||
+ | [1] "barley14.stanford.edu checking in as 57 of 64" | ||
+ | |||
+ | $slave58 | ||
+ | [1] "barley01.stanford.edu checking in as 58 of 64" | ||
+ | |||
+ | $slave59 | ||
+ | [1] "barley01.stanford.edu checking in as 59 of 64" | ||
+ | |||
+ | $slave60 | ||
+ | [1] "barley01.stanford.edu checking in as 60 of 64" | ||
+ | |||
+ | $slave61 | ||
+ | [1] "barley16.stanford.edu checking in as 61 of 64" | ||
+ | |||
+ | $slave62 | ||
+ | [1] "barley16.stanford.edu checking in as 62 of 64" | ||
+ | |||
+ | $slave63 | ||
+ | [1] "barley16.stanford.edu checking in as 63 of 64" | ||
> | > | ||
Line 220: | Line 316: | ||
[1] 1 | [1] 1 | ||
> mpi.quit() | > mpi.quit() | ||
+ | |||
</pre> | </pre> |
Revision as of 19:06, 17 February 2012
Rmpi
Rmpi is R with MPI support. On the barley cluster this means OpenMPI. Rmpi can be installed from CRAN packages if you wish to have the latest version. As a convenience, Rmpi_0.5-9 is installed on the barley cluster in /mnt/glusterfs/apps/R. The setup section following this one describes how to set up Rmpi so that you may submit jobs to the barley cluster.
setup
grid engine submit script (rmpi.submit)
# #$ -cwd #$ -j y #$ -S /bin/bash #$ -N rmpi64 #$ -pe orte 64 tmphosts=`mktemp` awk '{ for (i=0; i < $2; ++i) { print $1} }' $PE_HOSTFILE > $tmphosts echo "Got $NSLOTS slots" echo "jobid $JOB_ID" mpirun -np $NSLOTS -machinefile $tmphosts R --no-save -q < Rmpitest.R
Save the following as .Rprofile. This should be in the current directory from which you submit your Rmpi jobs.
# This R profile can be used when a cluster does not allow spawning or a job # scheduler is required to launch any parallel jobs. Save this file as # .Rprofile in the working directory or root directory. For unix platform, run # mpiexec -n [cpu numbers] R --no-save -q # For windows platform with mpich2, use mpiexec wrapper and specify a working # directory where .Rprofile is inside. # Cannot be used as Rprofile.site because it will not work # Following system libraries are not loaded automatically. So manual loads are # needed. .libPaths(c("/mnt/glusterfs/apps/R", "/usr/lib/R/library")) library(utils) library(stats) library(datasets) library(grDevices) library(graphics) library(methods) if (!invisible(library(Rmpi,logical.return = TRUE))){ warning("Rmpi cannot be loaded") q(save = "no") } options(error=quote(assign(".mpi.err", FALSE, env = .GlobalEnv))) if (mpi.comm.size(0) > 1) invisible(mpi.comm.dup(0,1)) if (mpi.comm.rank(0) >0){ #sys.load.image(".RData",TRUE) options(echo=FALSE) .comm <- 1 mpi.barrier(0) repeat try(eval(mpi.bcast.cmd(rank=0,comm=.comm)),TRUE) #try(eval(mpi.bcast.cmd(rank=0,comm=.comm),env=sys.parent()),TRUE) #mpi.barrier(.comm) if (is.loaded("mpi_comm_disconnect")) mpi.comm.disconnect(.comm) else mpi.comm.free(.comm) mpi.quit() } if (mpi.comm.rank(0)==0) { #options(echo=TRUE) mpi.barrier(0) if(mpi.comm.size(0) > 1) slave.hostinfo(1) } .Last <- function(){ if (is.loaded("mpi_initialize")){ if (mpi.comm.size(1) > 1){ print("Please use mpi.close.Rslaves() to close slaves") mpi.close.Rslaves(comm=1) } } print("Please use mpi.quit() to quit R") mpi.quit() }
Here is a very short R program. Save this as Rmpitest.R
# Tell all slaves to return a message identifying themselves mpi.remote.exec(paste(Sys.info()[c("nodename")],"checking in as",mpi.comm.rank(),"of",mpi.comm.size())) # Tell all slaves to close down, and exit the program mpi.close.Rslaves() mpi.quit()
running
If you look at the first few lines of rmpi.submit you will see -pe orte 64. This tells grid engine to allocate 64 slots, across any hosts. The driver R script (Rmpitest.R, above) is executed on the master (rank 0) process and the slaves all wait for instructions.
To see it in action, submit the job:
bishopj@corn14:/mnt/glusterfs/bishopj$ qsub rmpi.submit Your job 81714 ("rmpi32") has been submitted bishopj@corn14:/mnt/glusterfs/bishopj$ cat rmpi32.o81714 Got 64 slots jobid 81714 master (rank 0 , comm 1) of size 64 is running on: barley19 slave1 (rank 1 , comm 1) of size 64 is running on: barley19 slave2 (rank 2 , comm 1) of size 64 is running on: barley19 slave3 (rank 3 , comm 1) of size 64 is running on: barley19 ... ... ... slave62 (rank 62, comm 1) of size 64 is running on: barley16 slave63 (rank 63, comm 1) of size 64 is running on: barley16 > > # Tell all slaves to return a message identifying themselves > mpi.remote.exec(paste(Sys.info()[c("nodename")],"checking in as",mpi.comm.rank(),"of",mpi.comm.size())) $slave1 [1] "barley19.stanford.edu checking in as 1 of 64" $slave2 [1] "barley19.stanford.edu checking in as 2 of 64" $slave3 [1] "barley19.stanford.edu checking in as 3 of 64" $slave4 [1] "barley13.stanford.edu checking in as 4 of 64" $slave5 [1] "barley13.stanford.edu checking in as 5 of 64" $slave6 [1] "barley13.stanford.edu checking in as 6 of 64" $slave7 [1] "barley13.stanford.edu checking in as 7 of 64" $slave8 [1] "barley17.stanford.edu checking in as 8 of 64" $slave9 [1] "barley17.stanford.edu checking in as 9 of 64" $slave10 [1] "barley17.stanford.edu checking in as 10 of 64" $slave11 [1] "barley17.stanford.edu checking in as 11 of 64" $slave12 [1] "barley12.stanford.edu checking in as 12 of 64" $slave13 [1] "barley12.stanford.edu checking in as 13 of 64" $slave14 [1] "barley12.stanford.edu checking in as 14 of 64" $slave15 [1] "barley12.stanford.edu checking in as 15 of 64" $slave16 [1] "barley06.stanford.edu checking in as 16 of 64" $slave17 [1] "barley06.stanford.edu checking in as 17 of 64" $slave18 [1] "barley06.stanford.edu checking in as 18 of 64" $slave19 [1] "barley06.stanford.edu checking in as 19 of 64" $slave20 [1] "barley10.stanford.edu checking in as 20 of 64" $slave21 [1] "barley10.stanford.edu checking in as 21 of 64" $slave22 
[1] "barley10.stanford.edu checking in as 22 of 64" $slave23 [1] "barley10.stanford.edu checking in as 23 of 64" $slave24 [1] "barley08.stanford.edu checking in as 24 of 64" $slave25 [1] "barley08.stanford.edu checking in as 25 of 64" $slave26 [1] "barley08.stanford.edu checking in as 26 of 64" $slave27 [1] "barley08.stanford.edu checking in as 27 of 64" $slave28 [1] "barley07.stanford.edu checking in as 28 of 64" $slave29 [1] "barley07.stanford.edu checking in as 29 of 64" $slave30 [1] "barley07.stanford.edu checking in as 30 of 64" $slave31 [1] "barley07.stanford.edu checking in as 31 of 64" $slave32 [1] "barley04.stanford.edu checking in as 32 of 64" $slave33 [1] "barley04.stanford.edu checking in as 33 of 64" $slave34 [1] "barley04.stanford.edu checking in as 34 of 64" $slave35 [1] "barley04.stanford.edu checking in as 35 of 64" $slave36 [1] "barley09.stanford.edu checking in as 36 of 64" $slave37 [1] "barley09.stanford.edu checking in as 37 of 64" $slave38 [1] "barley09.stanford.edu checking in as 38 of 64" $slave39 [1] "barley09.stanford.edu checking in as 39 of 64" $slave40 [1] "barley11.stanford.edu checking in as 40 of 64" $slave41 [1] "barley11.stanford.edu checking in as 41 of 64" $slave42 [1] "barley11.stanford.edu checking in as 42 of 64" $slave43 [1] "barley18.stanford.edu checking in as 43 of 64" $slave44 [1] "barley18.stanford.edu checking in as 44 of 64" $slave45 [1] "barley18.stanford.edu checking in as 45 of 64" $slave46 [1] "barley03.stanford.edu checking in as 46 of 64" $slave47 [1] "barley03.stanford.edu checking in as 47 of 64" $slave48 [1] "barley03.stanford.edu checking in as 48 of 64" $slave49 [1] "barley15.stanford.edu checking in as 49 of 64" $slave50 [1] "barley15.stanford.edu checking in as 50 of 64" $slave51 [1] "barley15.stanford.edu checking in as 51 of 64" $slave52 [1] "barley20.stanford.edu checking in as 52 of 64" $slave53 [1] "barley20.stanford.edu checking in as 53 of 64" $slave54 [1] "barley20.stanford.edu checking in as 54 of 
64" $slave55 [1] "barley14.stanford.edu checking in as 55 of 64" $slave56 [1] "barley14.stanford.edu checking in as 56 of 64" $slave57 [1] "barley14.stanford.edu checking in as 57 of 64" $slave58 [1] "barley01.stanford.edu checking in as 58 of 64" $slave59 [1] "barley01.stanford.edu checking in as 59 of 64" $slave60 [1] "barley01.stanford.edu checking in as 60 of 64" $slave61 [1] "barley16.stanford.edu checking in as 61 of 64" $slave62 [1] "barley16.stanford.edu checking in as 62 of 64" $slave63 [1] "barley16.stanford.edu checking in as 63 of 64" > > # Tell all slaves to close down, and exit the program > mpi.close.Rslaves() [1] 1 > mpi.quit()