#!/bin/env python # -*- Mode: Python -*- #!/bin/bash #"exec" "python" "$0" "$@" """ Transformation to invoke ATLAS production job in Panda via AutoPilot. It is exec'd from ProdPilot.py """ import sys, getopt, time, re, os, commands # error codes EC_MissingArg = 10 print "--- start ---" if os.environ.has_key('HOSTNAME'): print "%s at %s " % ( time.ctime(), os.environ['HOSTNAME'] ) else: print commands.getoutput("echo `date` at `hostname`") # python pilot.py -s -d -a -l -w # -p -q -u -m -g -r # -j # where: # is the name of the site that this job is landed,like BNL_ATLAS_1 # is the pathname to the work directory of this job on the site # is the pathname to the directory of the executables # is the pathname to where long term storage is available on the # site # is the URL of the http web server that the pilot job should connect # to # is the URL of the https web server for the local site's DQ2 # siteservice # is the port on which the web server listens on # is a flag meaning this pilot is to get a user analysis job from # dispatcher # controls if this pilot runs in single or multi-task mode, for # multi-task mode, set it to true, all other values is for single mode # location of grid client software # controls if the workdir of this pilot should be removed in the # end or not, forced mode: true or false (should not be used otherwise # since it counteracts the job recovery decision making; default is None) # turns on/off job recovery, true or false (default is true) # # ================= GLEXEC & MYPROXY INTEGRATION ================= # # When this script is run under gLExec, # the execution is done in a standalone mode. # That implies communication between this code and the rest of ProdPilot # disappears, and requires the following modifications: # # - import PilotUtils as utils # - reset the variable jobstat = 0 # - a new option, --glexec=yes, is passed throught the environment # via pilotPars variable # The default value is glexec='no' # When glexec='yes' the value of jobstat is returned # via sys.exit() function. In this way, the final value of jostat # is the return code of this script. # # This changes do not afect (or should not) # the operations when the script is run # in the usual way, as part of the ProdPilot script. # # # Jose Caballero (Brookhaven National Laboratory) # jcaballero (at) bnl.gov # 30 Jul 2008 # # =================================================================== # # The following two lines are in case this script is executed under gLExec import PilotUtils as utils jobstat = 0 ## Environment set by uber-pilot try: pid = os.environ['PandaID'] print "PandaID", pid site = os.environ['PandaSite'] print "Panda site", site except: print "!!FAILED!!6999!!PandaID and PandaSite env vars must be supplied to trans script" jobstat = 1 queuepars = '' if os.environ.has_key('PandaWorkdir'): workdir = os.environ['PandaWorkdir'] print "PandaWorkdir:", workdir queuepars += " -d %s" % workdir if os.environ.has_key('PANDA_URL_SSL'): baseURLSSL = os.environ['PANDA_URL_SSL'] pat = re.compile('^(.*):([0-9]+)/.*$') mat = pat.match(baseURLSSL) if mat: pandaURL = mat.group(1) pandaPort = mat.group(2) else: print "!!FAILED!!6999!!Bad Panda URL %s" % baseURLSSL jobstat = 1 else: print "!!FAILED!!6999!!PANDA_URL_SSL undefined" jobstat = 1 if os.environ.has_key('QueueName'): queue = os.environ['QueueName'] print "trans-atlasprod running on queue", queue else: print "!!FAILED!!6999!!QueueName undefined" jobstat = 1 try: release = os.environ["swRelease"] release = release.replace('Atlas-','') print "Release:", release except: print "!!WARNING!!2999!!No release defined" release = '' ## Command line parameters #print "Script argv:", sys.argv # Get the parameters for me from --pilotpars #for arg in sys.argv[1:]: # if arg.startswith('--pilotpars='): # pilotpars = arg.replace('--pilotpars=','') # break pilotPars = os.environ['pilotPars'] if pilotPars == '': print "!!FAILED!!6999!!No pilot wrapper parameters delivered with --pilotpars option" jobstat = 1 ## Gather environment info for the DB if os.environ.has_key("VO_ATLAS_SW_DIR"): print "VO_ATLAS_SW_DIR area", os.environ["VO_ATLAS_SW_DIR"] print commands.getoutput("/bin/ls -alL $VO_ATLAS_SW_DIR") appdir = "%s/software" % os.environ["VO_ATLAS_SW_DIR"] reldir = "%s/%s" % ( appdir, release.split('\n')[0] ) sitesetup = "source %s/setup.sh;" % reldir elif os.environ.has_key("OSG_APP"): print "OSG_APP area ",os.environ["OSG_APP"] print commands.getoutput("/bin/ls -alL $OSG_APP") appdir = "%s/atlas_app/atlas_rel" % os.environ["OSG_APP"] reldir = "%s/%s" % ( appdir, release.split('\n')[0] ) sitesetup = "%s/setup.sh;" % reldir #sitesetup += 'source %s/atlas_app/atlas_rel/%s/dist/%s/AtlasRelease/*/cmt/setup.sh -tag_add=DC2;'%(os.environ['APP'], release, release) elif os.environ.has_key("APP"): print "APP area $APP:" print commands.getoutput("/bin/ls -alL $APP") appdir = "%s/atlas_app/atlas_rel" % os.environ["APP"] reldir = "%s/%s" % ( appdir, release.split('\n')[0] ) sitesetup = "%s/setup.sh;" % reldir #sitesetup += 'source %s/atlas_app/atlas_rel/%s/dist/%s/AtlasRelease/*/cmt/setup.sh -tag_add=DC2;'%(os.environ['APP'], release, release) else: appdir = '' reldir = '' sitesetup = '' _appdir = appdir releases = '' if appdir != '': # appdir = /usatlas/OSG//atlas_app/atlas_rel print "Looking for releases in", appdir print commands.getoutput("/bin/ls -alL "+appdir) dirlist = commands.getoutput("/bin/ls -1 "+appdir) dirlist = dirlist.split('\n') for f in dirlist: pat = re.compile('^[0-9]+\.[0-9\.]+[0-9a-zA-Z\.\_]*') mat = pat.match(f) if mat: print "Release identified:",f if releases != '': releases += '|' releases += f ## Get the queue-specific parameters from the DB via Panda monitor. They will override settings inferred from the environment. datadir = '' dq2url = '' #parurl = "http://pandamon.usatlas.bnl.gov:25880/server/pandamon/query?tpmes=pilotpars&queue=%s" % queue parurl = "http://pandamon.usatlas.bnl.gov:25880/server/pandamon/query/?tpmes=pilotpars&queue=%s" % queue if releases != '': print "Releases to DB", releases parurl += "&releases=%s" % releases cmd = 'curl --connect-timeout 20 --max-time 120 "%s" -s -S' % parurl queuedata = commands.getoutput(cmd) print "Queue param retrieval command",cmd print "Queue specific parameters",queuedata # Save it for later use fh = open('queuedata.dat','w') fh.write(queuedata) fh.close() queuepars += " -s %s " % queue par = utils.getpar('appdir',queuedata) if par != '': appdir = par par = utils.getpar('datadir',queuedata) if par != '': datadir = par par = utils.getpar('dq2url',queuedata) if par != '': dq2url = par # appdir = /usatlas/OSG/ _releases = release.split('\n') if len(_releases) > 1: print "Encountered multi-release job: %s" % str(_releases) else: print "Encountered single release job: %s" % str(_releases) for _release in _releases: if _release != '' and _release != 'NULL': if reldir == '' and appdir != '': reldir = "%s/%s" % ( appdir, _release ) elif reldir == '': print "!!FAILED!!2500!!Cannot locate ATLAS software" jobstat = 1 elif _appdir != '': reldir = "%s/%s" % ( _appdir, _release ) print "Release directory %s:" % reldir print commands.getoutput("/bin/ls -alL %s"% reldir) if not os.path.exists(reldir + "/cmtsite/setup.sh"): print "!!FAILED!!2500!!Release directory/setup file missing" print "Release setup file %s/cmtsite/setup.sh doesn't exist" % reldir jobstat = 1 queuepars += " -a %s" % appdir if datadir == '': if os.environ.has_key("SCRATCH_DIRECTORY"): datadir = os.environ["SCRATCH_DIRECTORY"] elif os.environ.has_key("OSG_WN_TMP"): datadir = os.environ["OSG_WN_TMP"] else: print "!!WARNING!!2500!!Cannot locate scratch area" print "datadir",datadir queuepars += " -l %s" % datadir if dq2url != "": print "dq2url",dq2url queuepars += " -q %s" % dq2url else: print "dq2url not defined" par = utils.getpar('retry',queuedata) if par != '': queuepars += ' -j %s ' % par ## Process the parameters print "pilotPars:", pilotPars optlist = pilotPars.split() print "optlist:", optlist optstr = "a:l:q:u:m:g:r:j:f:" optitems = optstr.split(':') opts, args = getopt.getopt(optlist, optstr,("script=","libcode=","pilotsrcurl=","transurl=","glexec=")) print "opts:", opts script = '' libcode = '' pilotsrcurl = '' glexec = 'no' optd = {} for o, a in opts: for optitem in optitems: if o == "-%s" % optitem: optd[optitem] = a if o == "--script": script = a if o == "--libcode": libcode = a if o == "--pilotsrcurl": pilotsrcurl = a if o == "--glexec": glexec = 'yes' print "production pilot params: ",optd ## Set environment variables the pilot wants os.environ['PANDA_JSID'] = os.environ['SchedulerID'] os.environ['GTAG'] = os.environ['PilotID'] os.environ["SITEROOT"]="%s/%s" % ( appdir, release.split('\n')[0] ) if os.environ.has_key('logFile'): logtgz = os.environ['logFile'] else: logtgz = '' destinationBlock = os.environ['dispatchBlock'] ## Pull down the production pilot scripts if not jobstat: if pilotsrcurl == '': print "!!FAILED!!6999!!No URL for production pilot scripts specified" jobstat = 1 if script == '': print "!!FAILED!!6999!!Production pilot script not specified on --script option" jobstat = 1 if libcode == '': print "!!WARNING!!6999!!Auxiliary lib code not specified on --libcode option" # is pilotsrcurl a list? pilotsrcurl_list = pilotsrcurl.split(',') print 'pilotsrcurl list:',pilotsrcurl_list scripts = libcode.split() max_trials = 2 for s in scripts: print 'Downloading script:', s verified = False for pilotsrcurl_i in pilotsrcurl_list: print 'Trying src url:', pilotsrcurl_i sfile = os.path.basename(s) scripturl = "%s/%s" % ( pilotsrcurl_i, s ) cmd = "curl --connect-timeout 20 --max-time 120 %s -s -S -o %s" % ( scripturl, sfile ) print cmd trial = 1 while trial <= max_trials: st, out = commands.getstatusoutput(cmd) print "%s: %s" % ( s, out ) if st != 0: print "Error retrieving script with curl (attempt %d/%d)" % (trial, max_trials) else: cmd = "chmod +x %s; /bin/ls -al %s" % ( sfile, sfile ) print cmd stt, out = commands.getstatusoutput(cmd) print "%s: %s" % ( s, out ) if stt == 0: if sfile.find('.tar.gz') > 0: print "Untarring", sfile cmd = "tar xzf %s" % sfile print cmd ec, out = commands.getstatusoutput(cmd) print out if ec != 0: print "tar failed, curl did not return valid archive (attempt %d/%d)" % (trial, max_trials) else: cmd = "chmod +x *.py *.sh" print cmd ec, out = commands.getstatusoutput(cmd) print ec, out verified = True break else: verified = True break else: print "chmod failed (attempt %d/%d)" % (trial, max_trials) trial += 1 if verified: # go to next script print "Script verified:", s break if not verified: print "!!FAILED!!6999!! Failed to download/unpack %s" % (sfile) jobstat = 3001 break print "Listing after code retrieval:" cmd = '/bin/ls -al' out = commands.getoutput(cmd) print out if not jobstat: # If things still OK, run job print "--- Run job script", time.ctime() pars = '' for o in optd: pars += ' -%s "%s" ' % ( o, optd[o] ) pars += " -r 0 -j false -w %s -p %s " % ( pandaURL, pandaPort ) ####print "Running with sitesetup", sitesetup ####cmd = "%s ./%s %s %s" % ( sitesetup, script, pars, queuepars ) #cmd = "./%s %s %s" % ( script, pars, queuepars ) #print "Job command:",cmd #st, out = commands.getstatusoutput(cmd) ##if st != 0: # print "!!FAILED!!2999!!Job script failed" #print out sys.path.append('.') import pilot wntmpdir = './' wnclientdir = '' runpars = [ '-s', queue, '-d', workdir, '-a', appdir, '-l', datadir, '-q', dq2url, '-m', 'false', '-g', wnclientdir, '-f', 'false', '-w', pandaURL, '-p', pandaPort ] jobstat = pilot.runMain(runpars) #execfile(script) print "--- Clean up", time.ctime() print "--- Finished", time.ctime() #sys.exit(0) # the value of jobstat is returned via sys.exit() when this script # is executed under gLExec if glexec == 'yes': sys.exit(jobstat)