diff --git a/current/INSTALL.txt b/current/INSTALL.txt deleted file mode 100644 index e3358f664..000000000 --- a/current/INSTALL.txt +++ /dev/null @@ -1,137 +0,0 @@ -Installation --------------------- - -1. Checkout panda-common and panda-server. - -$ svn co svn+ssh://svn.cern.ch/reps/panda/panda-common/tags/X.Y.Z panda-common -$ svn co svn+ssh://svn.cern.ch/reps/panda/panda-server/tags/A.B.C panda-server - -* For tar-ball installation - -$ cd panda-common -$ python setup.py install --prefix=INSTALLDIR -$ cd ../panda-server -$ python setup.py install --prefix=INSTALLDIR - -where INSTALLDIR is /data/atlpan/testsrv, for example. - -* For RPM installation - -$ cd panda-common -$ python setup.py bdist_rpm -$ sudo rpm -Uvh dist/panda-common-*.noarch.rpm -$ cd ../panda-server -$ python setup.py bdist_rpm -$ sudo rpm -Uvh dist/panda-server-*.noarch.rpm - -INSTALLDIR is set to /data/atlpan/srv automatically for RPMs - - -2. Modify config files - -$ cd INSTALLDIR/etc/panda -$ mv panda_common.cfg.rpmnew panda_common.cfg -$ mv panda_server.cfg.rpmnew panda_server.cfg -$ mv panda_server-httpd.conf.rpmnew panda_server-httpd.conf -$ emacs -nw panda_server.cfg - -fix FIXME - -dq2_dir = /opt/dq2 - --> - -dq2_dir = /data/atlpan/DQ2Clients/DQ2Clients - -$ emacs -nw panda_server-httpd.conf - -SSLCertificateFile InstallDir/etc/panda/server.crt -SSLCertificateKeyFile InstallDir/etc/panda/server.key - --> - -SSLCertificateFile /etc/httpd/conf/ssl.crt/server.crt -SSLCertificateKeyFile /etc/httpd/conf/ssl.key/server.key - -$ cd INSTALLDIR/etc/sysconfig -$ mv panda_server-sysconfig.rpmnew panda_server-sysconfig -$ emacs -nw panda_server-sysconfig - -add - -export X509_USER_PROXY=/data/atlpan/x509up_u25606 - - -3. Add .gacl - -$ cd INSTALLDIR/lib/python*/site-packages/pandaserver/server/ -$ emacs -nw .gacl - - - - - - - - -4. 
Add grid-env.sh if needed - -e.g., -$ cat INSTALLDIR/etc/grid-env.sh -export LD_LIBRARY_PATH=/opt/glite/lib64:/opt/globus/lib:/opt/lcg/lib64:$LD_LIBRARY_PATH -export PYTHONPATH=/opt/glite/lib64/python:/opt/lcg/lib64/python:$PYTHONPATH -export PATH=/opt/edg/bin:/opt/glite/bin:/opt/globus/bin:/opt/lcg/bin:$PATH - -and modify panda_server.cfg - -$ emacs -nw INSTALLDIR/etc/panda/panda_server.cfg - -glite_source = /opt/glite/etc/profile.d/grid-env.sh - --> - -glite_source = INSTALLDIR/etc/grid-env.sh - - -5. Make log and cache dirs, and change owner if RPM is used - -mkdir -p INSTALLDIR/var/log/panda -mkdir -p INSTALLDIR/var/log/panda/wsgisocks -mkdir -p INSTALLDIR/var/cache/pandaserver -chown atlpan:zp INSTALLDIR/var/log/panda -chown atlpan:zp INSTALLDIR/var/log/panda/wsgisocks -chown atlpan:zp INSTALLDIR/var/cache/pandaserver - -6. For voatlas - -cp ~/devsrv/share/httpd-pandasrv /etc/rc.d/init.d/ -/sbin/chkconfig --add httpd-pandasrv -cp ~/devsrv/share/panda_server-httpd.conf.VM /data/atlpan/srv/etc/panda/panda_server-httpd.conf -cp ~/devsrv/share/panda_server.cfg.VM /data/atlpan/srv/etc/panda/panda_server.cfg -cp ~/devsrv/share/x509up_u25606_novoms /data/atlpan/ -chown atlpan:zp /data/atlpan/x509up_u25606_novoms -cp ~/devsrv/share/pandasrv /etc/logrotate.d/ -cp ~/devsrv/share/pandasrv.cron /etc/cron.d/ - - -Start the server --------------------- - -Add the following to crontab. - -0-59/5 * * * * INSTALLDIR/usr/bin/panda_server-add.sh > /dev/null 2>&1 -15 0-21/3 * * * INSTALLDIR/usr/bin/panda_server-copyArchive.sh > /dev/null 2>&1 - -Run the server. - -$ sudo INSTALLDIR/etc/init.d/panda_server-ctl start - -Stop the server. 
- -$ sudo INSTALLDIR/etc/init.d/panda_server-ctl stop - - - - - - diff --git a/current/MANIFEST.in b/current/MANIFEST.in deleted file mode 100644 index 46666ccfe..000000000 --- a/current/MANIFEST.in +++ /dev/null @@ -1,2 +0,0 @@ -include *.txt *.py *.cfg -recursive-include templates *.template diff --git a/current/README.txt b/current/README.txt deleted file mode 100644 index 7df534fd3..000000000 --- a/current/README.txt +++ /dev/null @@ -1,967 +0,0 @@ -Release Note - -* 0.0.18 (7/2/2013) - * tagged for JEDI - * fixed datriHandler for SLC6 - * improved getLFNsInUseForAnal - * fixed getScriptOfflineRunning for athenaMP - * fixed dispatcher so that install jobs can run on sites with status=test - * fixed for ANALY_BNL_SHORT and ANALY_BNL_LONG - * included group analysis jobs in priority massage - * removed priority boost for group analysis jobs - * fixed brokerage to respect preset computingSite even for too many input - jobs in cloud with negative t1weight - -* 0.0.17 (4/27/2013) - * giving a higher prio to install jobs - * split runRebro from copyArchived - * fixed retryInActive to reset file status - * modified dispatcher to send prodSourceLabel for getJob - * changed ATLAS_PANDALOG.USERS_ID_SEQ to ATLAS_PANDAMETA.USERS_ID_SEQ - * added TaskMonitor link to email notifications - * changed getJob() to allow the prod/analy pilot to get installation jobs - * fixed retryJobsInActive - * fixed datasetManager to delete sub from foreign T1 instead of home T1 - * improved getDisInUseForAnal - * added boostUser - * improved fairshare to support per-cloud shares - * changed Setupper to register both DATADISK and PRODDISK as locations for sub - * changed job/task brokerages not to check DBR with DQ2 at CVMFS sites - * changed the brokerage to skip release checks for releases=ANY - * fixed for site.priorityoffset - * fixed T2 cleanup to check if there is active subscription - * fixed brokerage and copyArchive for RU - * changed insertNewJob not to insert metadata when it is 
empty - * fixed killUser to kill jobs gradually - * fixed Setupper to make dis for pin at MCP sites in ND cloud - * fixed Setupper to take cloudconfig.tier1se into account for dis subscriptions - * set a limit on G/U in the brokerage - * sending more info in PD2P logging - * fixed LFC lookup in the brokerage - * changed PD2P to be triggered by the second job - * removed multiCloudFactor from the brokerage for NL - * added a protection to updateJobStatus to prevent holding->transferring - * fixed getUserParameter to insert new row if the user is missing - * fixed Setupper to trigger prestaging when sites with multi-endpoints use TAPE - * put all info to ErrorDiag in the brokerage - * added modificationTime constraint to URL sent to the user by Notifier - * introduced ProcessLimiter - * changed TA to shorten retry interval after refreshing replica info - * skipping file availability check for log datasets in TA - * using cloudconfig.tier1SE to count files at T1 - * setting scope only for ATLAS - * improved the task brokerage to check datasets with fewer replicas first - * set limit on the number of IDs to be sent to the logger for reassign/killJobs - * removed LFC lookup from TA - * changed PD2P to use secondary share - * fixed to use correct DQ2 site ID for pinning at sites with multi-endpoints - * modified to send scopes for output files to the pilot - * added changeJobPriorities - * using DATADISK for MCP T1 input at all T1s except US - * added filespec.scope - * reducing lifetime of dis when corresponding jobs finished and some of them failed - * improved the brokerage to count the number of running jobs per processingType - * using transferringlimit in the brokerage - * fixed the bulk OK file lookup again for unique ddm endpoint sites - * reduced interval of PandaMover reattempts to 15min from 3h - * fixed the bulk OK file lookup in the brokerge for multiple ddm endpoints - * increased the number of PandaMover channels to 15 - * using DATADISK for MCP T1 input 
at CERN - * using a default fairshare defined per cloud if T2 doesn't define share - * added a protection against overwriting of dataset status by datasetMgr - * implemented a nested fairshare management mechanism - * fixed the brokerage message when release is missing for repro - * fixed TA since replicas at T1 non DATADISK prevented T2 replicas from being used - * using DATADISK for MCP T1 input at ND,ES,DE,NL,TW - * added a patch for MWT2 to associate MWT2_DATADISK in TA - * allowed wildcards in cloudconfig.tier1SE - * fixed Merger for standalone ROOT - * fixed Closer to trigger merging for cancelled jobs - * fixed Setupper to pin DBR as well - * added a protection to Setupper for file lost after job submission - * fixed getHighestPrioJobStatPerPG for group queue - * added group queue to all clouds - * added FOR UPDATE when getting jobdefID for users - * removed hard-coded FZK-LCG2_DATATAPE removal in TA - * set activity=Production to TA subscriptions - * fixed weight reduction in TA for no input tasks - * fixed the brokerage to send message to logger for too many transferring's - * fixed wrong error message in TA when open dataset is incomplete - * updated TA to use a special weight reduction when only TAPE is available - * removed selector from fileCallbackListener - * fixed for TMPDISK - * fixed Setupper to scan T2 LFC per LFC host instead of per SE - * fixed Setupper to use correct location when pinning dis at foreign T1 - * fixed sitemapper to allow multiple DQ2 site IDs to use the same token - * added DQ2 registration time to SLS - * fixed vomsrenew.sh to check certificate and proxy lifetime - * fixed file-check in the brokerage for BNL@non-US - * fixed brokerage not to overwrite file's destSE for destSE=local - * introduced mcore queue in PG - * added iscvmfs to SiteSpec - -* 0.0.16 (8/29/2012) - * changed Setupper to make sub when data is available only at T2 - * changed Setupper to make sub when data is missing at T1 - * change TA to pin input and skip 
replicas with ToBeDeleted - * using share=secondary for non T2-close-source PD2P - * added useWebCache() to Client - * fixed getJobStatistics not to read archived via http by default - * fixed Adder2 to skip destSE check for ddm=local - * fixed LFCclient to randomly resolve DNS alias for LFC host - * added makeSlsXml - * patched smtplib.stderr to send debug info to logger - * added 32/64 to getScriptOfflineRunning - * changed JOBSARCHIVED4_MODTIME_IDX hint - * enabled maxtime check for analysis brokerage - * fixed to check T2 files when get reassigned - * removed hints related to JOBSACTIVE4_JOBSTATUS_IDX - * fixed setOK to check map - * fixed resetDefinedJob for recordStatusChange - * fixed updateJobStatus not to reset modificationTime of holding jobs - * fixed file check not to use TAPE replicas when T1 is used as T2 - * disabled release check for CERN-RELEASE - * enabled release check for CERN - * removed EVNT from PD2P - * removed the higher priority to phys-higgs - * added _LONG as a suffix of hospital queue - * fixed queryLastFilesInDataset against missing jobs which are still in fileDB - * added setPriority.py - * fixed updateJobStatus for endTime - * updated the brokerage log to have timestamp - * updated the brokerage to take maxtime into account - * updated file-level callback - * added Job Status Monitor - * added --killUserJobs to killJob.py - * added reliability-based brokerage for analysis jobs - * fixed getDestSE to look into ARCH for sub datasets for failed log files - * fixed rebrokerage when orig replica is set to ToBeDeleted - * temporarily gave a higher priority to phys-higgs for ICHEP2012 - * added code=91 to allow prod role to kill user jobs gracefully - * check LFC every hour for high prio transferring jobs - * fixed datasetManager for T2 cleanup by recognizing T1 PRODDISK correctly - * delete sub from PRODDISK except US cloud - * added protection to ReBroker against redundant comma in excludedSite - * added fatal errors for datri in Adder2 
- * fixed Adder2 for missing src in schedconfig for analysis with destSE - * changed brokerage to make a chunk for each diskCount/memory - * added RbLauncher to run ReBroker in grid env - * added more message to Finisher - * fixed Adder2 for failed jobs to add files to sub - * reduced the number of add.py - * modified getHighestPrioJobStat to calculate per PG - * added --noRunning to killTask - * fixed insertSandboxInfo to use real file size - * added checkSandboxFile - * fixed brokerage for nightlies - * extracting crc from input sandbox in putFile - * added changes for debug mode - * setting prestage sites with PandaMover dynamically - * removed BNL_ATLAS_1 from SiteMapper - * removed FILESTABLE4_DATASET_IDX - * added more info to putFile - * optimized getDisInUseForAnal in TB - * fixed TA to ignore non-DATADISK replicas at T1 - * fixed brokerage for preassigned repro jobs - * fixed dataset update timing check in Notifier - * fixed zero suppression with wildcard in brokerage - * fixed rebro to set the same specialHandling to build since new build may have different specialHandling - * removed old hints - * fixed DataServiceUtils to return an empty map when DQ2Map is set - * using FOR UPDATE in lockJobForReBrokerage - * added more debug INFO to Setupper - * fixed DBProxy not to freeze top datasets for HC when build failed - * fixed anal brokerage to take # of defined jobs into account - * setting RUCIO_ACCOUNT and RUCIO_APPID - * pin dis for foreign T2s in US cloud - * removed special treatment for BNL from Adder - * fixed the brokerage to get hospital queues automatically - * updated brokerage to use coreCount - * fixed Closer not to freeze any HC datasets - * fixed Adder since Register2 gives DatasetExist error when it got deleted - * enabled cap based on priority for CERN - * not reset retried jobs in Watcher - * check attemptNr in retryJob - * added double quotes to all params in getScriptOfflineRunning - * added jobMetrics - * added a protection against 
non-integer PandaID in peekJob - * changed to update only changed attributes in job tables - * fixed runMerge not to be stopped due to a single dataset error - * added debug message for execution time of DQ2(+LFC) registration - * fixed storeJob to reset changed attribute list - * disabled beyond-pledge for HC jobs - * changed to update only changed attributes in filesTable4 - * added nOutputDataFiles and outputFileBytes to job tables - * modified getScriptOfflineRunning to use parallel transfers - * removed shadow lookup in Adder - * disabled sub for computingSite=destinationSE - * added getScriptOfflineRunning - * added retry to Cassandra operations - * changed killing with group prod role not to be case-sensitive - * added getDis/LFNsInUseForAnal - * added getPledgeResourceRatio to TB - * added Cassandra file cache - * added TAG support in EventPicker - * added countGuidsClient - * using SCRIPT_NAME in panda.py - * removed _shadow creation in ReBroker - * fixed queryLastFilesInDataset for the fileTable change - * remove deleting datasets from the Datasets table - * sending error log to the logger when TA cannot find dataset in DQ2 - * sending fsize and checksum to the pilot - * added modificationTime<=CURRENT in getFilesInUseForAnal - * added hint when deleting rows from Datasets - * making larger subs by sorting jobs by site - * instantiating dq2api in each thread - * added hint to use 11g cashing - * removed constraint in TA to consider T1 and T2 equally - * increased the lifetime of the proxy to 96h - * fixed TA to select candidate T2s correctly - * getting shadow info from filesTable - * added vomsrenew.sh - * fixed TA to count the number of files at US T2 - * check attmptNr - * fixed for non-MC/DATA space at split T1 - * fixed TA to check completeness at T2 - * use correct locations for GEN dis when jobs directly go to T2 - * added protection to Adder2 against sites disappearance from schedconfig - * added preferential analysis brokerage based on 
countryGroup - * added more verbose message in Adder - * Mikhail Titov updated datriHandler - * fixed cloudlist to skip None - * added getJobStatisticsPerUserSite - * added 64bit in copyROOT - * avoid priority reduction for merge jobs - * use <= for maxDiskCount in getJob - * fixed rebrokerage for --destSE - * updated rebrokerage to be triggered 3 hours after the site is blacklisted - * set maxAttempt to allow users to disable auto retry - * changed global file map to local in brokerage - * fixed Adder2 to use proper destination for token=TAPE when running at T1 as T2 - * updated killJob to take group prod role into account - * updated brokerage to take priorities into account for prod jobs - * using native DQ2 call in ToA - * modified brokerage to do bulk LFC lookup per site - * fixed brokerage_util to do LFC lookup per 1000 files instead of 100 files - * fixed brokerageErrorDiag for repro + missingRel - * fixed port of pandamon in email notification - * fixed brokerageErrorDiag for useT2 + repro - * set replica pin lifetime before deleting from T2 - * improved brokerage error diag - * cleaned the brokerage for hospital queues - * use 0 when memory=0 in one of online sites with the same siteID - * fixed the brokerage to use RAL-LCG2_H​IME as UK T1 - * touch input sandbox when tried to be overwritten - * permit overwriting of input sandbox - * reject limited proxy - * added priority boost for gangarobot-pft - * fixed getCriteria for aggregated sites - * fixed brokerage for group=any:0% - * fixed brokerage more for type=any:0% - * fixed brokerage to take zero shares into account - * fixed getCriteriaForProdShare for zero shares - * added minPriority to Client.getJobStatisticsPerSite - * using MV in getJobStatisticsWithLabel - * added fairshare to getJob - * fixed retryJob not to change the name of lib.tgz for ptest - * fixed retryJob not to retry buildJob to keep the PandaID order - * fixed TB to give higher prio to buildJob with prodRole - * fixed Merger to use the 
largest SN for merged files - * fixed queryLastFilesInDataset to ignore merged files - * fixed brokerageErrorDiag for non missing release errors - * added tmpwatch.py - * changed hint in getJobs - * fixed updateProdDBUpdateTime for pending jobs - * fixed brokerage to accept test sites for prod_test jobs - * changed getJobs for test pilots to get gangarobot jobs - * setup glite in TaLauncher - * added lock in lockDatasets - * added version check in Merger to avoid duplicating merge jobs - * changed Merger to fail when container name is too long - * use lockJobsForReassign for reassign in copyArchive - * use native DQ2 in copyArchive and datasetMgr - * use python2.5 for copyArchive and prio-mgr - * use native DQ2 in Setupper - * fixed guid generation for user's log - * introduced 2 staged submission for prod jobs - * using T2 in TA - * using materialized view for getJobStatistics family - * updated Merger to put log files of merge jobs to a separate container - * fixed Merger for --transferredDS - * enabled rebrokerage for processingType=ganga - * updated Adder for unique constraint error - * added copyROOT - * updated Adder to immediately go to failed when subscription failures - * disabled prio boost for gangarobot derivatives - * added protection to TA against undefined maxinputsize - * updated TA and brokerage to use T2 datasets in prod - * updated for DQ2 client 0.1.37 - -* 0.0.15 (11/07/2011) - * removed redundant freshness checks in getSN - * changed hint in getSerialNumber - * randomized job order in adder - * decreased the number of adder processes - * added more tight constraint to getJobStatistics family - * reduced prio by 10 for pilot-retry jobs - * increased the factor of the RW limit to 8000 - * updated Merger for --mexec - * modified rebrokerage to send brokerage log - * modified brokerage to send user's countryGroup and nJobs to logger - * added a protection to httpd.conf for interesting panda.py - * not attach attemptNr to lib.tgz for 
rc_test+buildJob - * fixed parentID for retryJob with new PandaID - * randomized the order of site check in analysis brokerage - * added --killOwnProdJobs to killJob.py and killJobsInTask.py - * fixed brokerage to require cache=None for release check - * pinning input datasets - * added limitation of exe/pilotErrorDiags in JD - * fixed short->long mapping in retryJob - * generates new PandaID for pilot-retried job - * using negative errorcode for pilot-retry - * added invalid character check to DDM - * fixed the brokerage for --transferredDS - -* 0.0.14 (10/11/2011) - * fixed TaskAssigner for MCshare=0 - * updated brokerage to consider priorities for analysis jobs - * fixed brokerage for BNL_CVMFS_1 - * modified managed pilots to get prod_test as well - * call addShadow even if DaTRI failed - * fixed the error message of location registration in Setupper - * modified ReBroker for server-side retry - * reverted the brokerage change - * changed brokerage to skip sites with memory=0 for analysis with memory - * increaded MaxClients - * use DQ2 for foreign T2 in US cloud - * use IN2P3-CC and IN2P3-CC_SGE_VL as FR T1 in brokerage - * unset commandToPilot for jobs reassigned by rebrokerage - * added retryJobsInActive - * added --maxJobs and --running to killJobLowPrio.py - * added killJobLowPrio.py - * fixed killJob - * simplified anal_finalizer - * added SiteSpec.lfcregister - * added getAttr - * keep failed analysis jobs in Active until all jobs finished - -* 0.0.13 (8/30/2011) - * fixed Adder2.removeUnmerged to catch DQ2 errors correctly - * using subType in datasetManager - * filling datasets.subtype - * added protection against too large inputFileBytes - * removed CN=Robot: from DN - * added hint to DBProxy.getLockDatasets - * reduced the number of table scan in datasetMgr and runMerge - * fixed brokerage not to count jobs for usermerge or pandamover - * changed brokerage to use ANALY_CERN_XROOTD and not to use ANALY_CERN - * added Forker to add.py - * updated 
dispatcher to send taskID - * using schedconfig.multicloud - * fixed brokerage for test sites - * fixed brokerage not to count jobs for HC - * fixed rebrokerage for CERN TMP - * updated the brokerage to stop assigning prod jobs to sites which have many transferring - * added jobdefID to libDS in ReBrokerage - * disabled short -> long for HC - * fixed SiteMapper to respect online even if another queue is not online - * put attempt number to output file name in Merger - * changed = to == in redundant messages - * job-chaining for ptest+prun - * added initLogger to Notifier - * removed redundant suffix from DN for DaTRI request in EventPicker - * added more message in EventPicker for DaTRI request - * changed Notifier to non-thread - * fixed Notifier to take into account old jobs in Arch - * implemented new PD2P scheme using MoU and close sites - * increased the number of concurrent Mergers - * incrementing Datasets.currentfile only for the first failed job - * fixed Watcher to append attemptNr when sent->activated - * fixed resetDefJob - * limited the number of jobs with the same GEN dis - * fixed EventPicker to take input files into account - * fixed Merger to use .tgz for text merging - * added EventPicker - * added statusmodtime to SiteSpec - * updated Merger for runDir - * updated rebrokerage to take --cloud into account - * added tags into PD2P logging - * updated Merger for mergeScript - * fixed getFilesInUseForAnal to skip NULL dis datasets - * updated analy_brokerage to use memory size - * added cmtconfig to broker logging - * enabled cross-cloud for US in PD2P - * enabled banUser in storeJobs - * enabled role-check in submitJobs - * added WrappedPickle to avoid deserializing insecure objects - * added banUser to storeJob - * added prodSourceLabel check to UserIF - -* 0.0.12 (6/13/2011) - * fixed Merger for --useContElement - * fixed inputFileProject extraction for wildcard-uses - * using basename in Utils methods - * fixed fetchLog to disallow chdir - * 
fixed panda.py to disallow unexpected methods - * added getVomsAttr - * updated getJob to decompose CERN-XYZ to CERN-PROD+processingType - * updated the brokerage to use installedsw.cmtConfig - * use MoU share for T1 PD2P - * added getNumPilots - * added prodSourceLabel=ssc as user's label - * added --prodSourceLabel to killUser - * fixed archiveJob for failed jobs with multiple dis - * fixed Setupper to store GEN dis - * disabled release check in the brokerage for x86_64-slc5-gcc43 - * implemented aggressive cleaning for PRODDISK - * added priority boost for gangarobot - * updated T2 cleanup to use grace_period='00:00:00' - * cleanup copyArchive - * changed analysis brokerage to use nRunning(max in last 24h) - * increased # of active subscriptions to 2 in PD2P - * added nRunning calculator to add.py - * disabled priority reduction for merge jobs - * sending analysis brokerage info to logger - * updated PD2P not to check provenance since group datasets have mc*/data* - * disabled PD2P to CERN-PROD_EOSDATADISK - * added checkMergeGenerationStatus - * enforce LFN-lookup to trigger getting replica map when reassigned - * fixed brokerage for test jobs at test sites - * use release matching for T2s in CERN cloud - * skip release check for CERN and ND - * set correct info to brokerageErrorDiag - * send jobs to waiting when release/cache is missing - * remove '' for |pilotOwners| - * put cloud-boundary back to US - * use SourcesPolicy.ALL_SOURCES for PD2P subscriptions - * improved PD2P logger - * included CERN to trigger PD2P - * fixed typo in PD2P skip message - * fixed zero-division in PD2P - * enabled T1-T1 in PD2P - -* 0.0.11 (4/18/2011) - * fixed getExpressJobs - * use c-t-s for all files in merge jobs - * modified runMerger to kill old process - * disable Initializer when nDBConnection is 0 - * increased max attempt for rebrokerage to 5 - * changed the rebrokerage interval to 24h - * skip init for jobDispatcher,dataService,userIF when nCon=0 - * added parameters in 
email notification - * ignore LOCALGROUPDISK in PD2P - * fixed auto type detection of Merger for THIST - * use IN2P3-CC_VL for too many input or high prio jobs - * gave T1 weight to IN2P3-CC_VL - * added protection to Adder2 against DQ2 failure for jumbo datasets - * updated Adder2 to avoid making DaTRI request for unmerged files - * added protection against generating multiple Mergers for --individualOutDS - * updated brokerage to give T1 weight to NIKHEF for repro jobs - * fixed Merger for lib.tgz - * added automatic merge type detection to Merger - * updated Closer to redirect logging to parent as it doesn't work in nested threads - * changed parameter convention for Merger - * added merge job generation - * set secondary for TA subscription - * use TAIWAN-LCG2_HOTDISK for TW HOTDISK - * disabled PD2P for ESD - * set file.dispDBlock even if they are already available at the site - * send jobDefID and cloud to the pilot - * updated Setupper/Adder2 for T1 used as T2 - * set destDBlockToken to DATADISK - * using home cloud to skip release check in the brokerage - * reassign stuck T2 evgensimul more frequently - * enabled release/cache check for US - * using nRunning(cloud) in brokerage for multi-cloud - * added fileGUID to updateInFilesReturnPandaIDs for file-level callback - * set source to _subs for all clouds - * using DQ2 API directly in Adder - * added nInputDataFiles,inputFileType,inputFileProject,inputFileBytes - * add hacks again to TA and Setupper for split T1 - * added EventLookup to PD2P - * updated SiteMapper for multi-cloud - * removed hacks from TA and Setupper for split T1 - * added forceOpt to runReBrokerage - * fixed PD2P not to make sub when dataset is being deleted - * changed PD2P not to send ESD to EOS - * added a hint to getPandaIDsForProdDB to enforce function index - * added comment_ to SiteSpec - * put hacks back to TA and Setupper for split T1 which uses NIKHEF as src - * set hidden metadata to _dis and _sub - * removed REGEXP from 
Datasets cleanup - * enabled rebrokerage for ganga-rbtest - * fixed ReBroker for EOS - * fixed ReBroker to add _shadow - * use DATADISK for all PD2P subscriptions - * close user datasets in container - * set lifetime for dis and sub datasets - * added --jobsetID to killUser.py - * added protection against missing argument for jobID/jobsetID to killUser.py - * trigger PD2P for EOS when nUsed >= 3 - * updated brokerage to take transferType into account - * update modificationTime when going to Archived4 - * disabled extra replica making in PD2P - * trigger PD2P for EOS when nUsed >= 2 - * added testG4sim16.py and testEvgen16.py - * use diskThr=max(5%,3TB)-diskSize in PD2P - * added killJobsInTask - * set disk threshold in PD2P to 5GB - * updated PD2P so that any analysis job using data makes subscriptions to CERN EOS - * set specialHandling=rebro when reassigned by rebrokerage - * fixed DQ2 ID conversion in PD2P for EOS - * check free disk size in PD2P using DQ2.queryStorageUsage - * use function index in getPandaIDsForProdDB - * reduced the number of rotated logs - * use cernmx.cern.ch - * added getLockDatasets - * added the number of succeeded jobs to the subject of Notification - * added pd2p logging - * added deleteJobs.py - * split arch procedure to another cron - * call taskbuffer.Initializer in forkSetupper.py to acquire Oracle environment handle correctly - * use truncated DN when setting dataset owner - * reassign evgen/simul with active state at T1 more aggressively - * made SQLDumper iterable - * added SQLDumper - * added reassignTask - * use getFullJobStatus in Notifier since some jobs can go to ARCH before notification - * seprate retry for Notifier - * added retry to Notifier when failing to send notifications - * express jobs - * make new dis datasets even if files are already available at T2 - * short/long mapping for ANALY_LYON-T2 - * updated PD2P to use a negative weight based on the number of subs - * ignore hidden datasets in PD2P - * don't use 
modTime index on jobs_ARCH - * set/increment nUsed in PD2P - * use LFN for WN-level matchmaking - * ignore datasets with provenance=GP for PD2P - * don't reuse the same site in a single PD2P cycle - * fixed brokerage to send warning when cache is missing - * removed redundant holding for prod jobs in Watcher - * more fix to SetUpper for rc_test - * not reset holding analysis jobs when stateChangeTime=modTime - * set stateChangeTime when job goes to holding for finished/failed - * job chain for rc_test + gangarobot-rctest - * added archivelogs - * set tobeclosed to sub datasets of failed downstream jobs - * rctest -> rc_test - * reduced time interval to reassign waiting jobs to 30min - * enabled user-triggered rebrokerage - * send currentPriority in dispatcher - * set localpool to specialHandling when beyond-pledge pilot got the job - * fixed makeSub in TA for getAva change - * added random sleep for Finisher in copyArchive - * improved del in copyArchive to avoid redundant deletion - * increased timelimit for copyArchive - * added auto rebrokerage to copyArchive - * report new PandaID to taskBufferErrorDiag when rebrokered - * check processingType in rebrokerage - * added code=8 to killJob for rebrokerage - * first implementation of auto rebrokerage - * added getCachePrefixes - * removed apostrophes from prodUserName - * fixed useNotifier in Closer for completed sub datasets - * changed queryLastFilesInDataset to use MAX(lfn) - * improved the space shortage message in TA - * don't check missing files with LFC when site is already set - * added -9 to killTask - * added forceKill for prod jobs - * changed the brokerage to use CERN-PROD_EOSDATADISK as the dest for CERN-EOS jobs - * added enforce to Activator - * changes for merge/unmerge jobs - * rctest - * deleteStalledJobs - * removed hacks for last_insert_id of InnoDB - * allowOtherCountry - * updated datriHandler to prevent false http-requests - * added a hint to getJobIDsInTimeRange against jobsActive4 - * added 
a hint to getJobIDsInTimeRange against jobsArchived4 - * changed hint in DBProxy.updateTransferStatus - * changing TRF URL from BNL to CERN on the server side - * fixed error message in brokerage for sites with status!=brokeroff - * fixed brokerage for release check when schedconfig.rel != '' - * changed countryGroup=ustlas to us - * ignore gangarobot family in PD2P - * disabled priority decreasing for HC jobs - * use installedSW for base-release matching for analysis - * $GROUPJOBSN - * added getSerialNumberForGroupJob - * use jobsetID in Notifier - * use max memory/inputsize for each site - * set jobsetID for ptest - * changes for output container and short LFN for analysis - -* 0.0.10 (8/2/2010) - * tagged for output container and short LFN for analysis - * added setCloudTaskByUser - * get list of PD2P clouds dynamically - * send transferType to the pilot - * imposed a size limit on uploaded files by users - * fixed the task brokerage to take maxDiskCount into account - * added a protection against empty jobParameters only for new jobs - * fixed PD2P to remove the cloud boundary when counting nSites - * disable brokerage for gangarobot - * ignore HC and group jobs in PD2P - * fixed PD2P to take non-PD2P sites into account when checking comp/incomp - * fixed AtlasRelease for PD2P - * enabled WN brokerage for ANALY_GLASGOW - * updated Adder for --destSE=multiSites - * use Data Brokering for PD2P - * change MWT2_UC_DATADISK to MWT2_DATADISK in PD2P - * delete replicas from T2 when locations != [] - * protection against meta/para=None in peekJob - * kill ITB_INTEGRATION jobs in sent status - * batchID - * ignore dis/sub in PD2P - * dispatchDBlockTokenForOut - * added banUser.py and made --jobID optional in killUser.py - * set activity='Data Consolidation' and acl_alias='secondary' to PD2P subscriptions - * check replica at T1 in PD2P - * added getActiveDatasets - * don't move RAW,HITS,RDO by PD2P - * allow prod proxy to kill anal jobs with 2 or 4 - * added PD2P - * 
regard found=None as an incomplete replica - * invoke listFileReplicasBySites only for incomplete sites in TA - * fixed re-brokerage - * fixed used file check for cancelled jobs - * increased wait interval for reconnection in connection pool - * updated ConBridge to kill child when connection failure - * changed URL of panda mover trf - * added a protection against method execution failure in panda.py - * set dataset status for DaTRI requests - * ignore DaTRI failure for duplicated requests - * use DQ2 for email extraction - * added -9 to killJob.py - * added killUser.py - * added alias to httpd.conf for trf URL - * changed reading order in getPandIDsWithJobID to avoid missing jobs - * set taskBufferErrorDiag when running jobs are killed - * prevent prod proxy from killing analysis jobs - * added priority massager - * added NG words to Notifier - * avoid sending DaTRI requests for failed jobs - * fixed replica registration for --destSE - * set type in datriHandler for analysis system - * testpanda -> panda - * introduced datriHandler - * delete sub datasets from EGEE T2 when callback is received - * set REMOTE_HOST to creationHost - * increased priority boost for activated jobs - * delete cancelled from jobsDefined4 - * added boostPrio.py - * added cvs,svn,grid,librarian to NG words - * True/False for schedconfig.validation - * added admin to NG words for Notifier - * added cancelled state - -* 0.0.9 (4/13/2010) - * increased the subscription limit to 600 in TA - * protection against reassigning analysis jobs - * enabled cache-matching brokerage for all EGEE clouds - * enabled cache-matching brokerage for NL/FR - * added a protection for containers composed of multiple datasets - * added processingType to runBrokerage for HC - * doesn't check release matching for CERN - * cache-matching in the brokerage for DE - * added getHighestPrioJobStat - * changed weight for the task brokerage to use RW instead of fullRW - * fixed getFilesInUseForAnal for --individualOutDS - 
* added getQueuedAnalJobs - * updated brokerage to assign one prod_test job to a site - * disable prod role for non-group activity - * use maxinputsize in the brokerage - * added schedconfig stuff to template - * removed cx_Oracle from FileSpec - * removed MySQLdb from broker_utils - * added maxinputsize - * modified xyzCacheDB to take a list of siteIDs - * suppressed warning messages in dashboard - -* 0.0.8 (2/2/2010) - * tagging for SLC5 migration - * added hostname matching for T3 pilots - * use listFileReplicasBySites in TA - * added checkFilesWithCacheDB - * changed the default cmtconfig to SL4 for analysis in brokerage - * updated the brokerage to allow slc4 jobs on slc5 sites - * added killTask.py - * added addFilesToCacheDB and flushCacheDB - * modified dispatcher to accept service proxy - * added WN-level file matching to getJob - * added MemProxy - * fixed brokerage to skip release/cache matching for ND - * use all source locations for dis - * use long hint for queryDatasetWithMap - * added /Engage/LBNE/Role=pilot to acceptable roles - * added analy_test to getJob for test pilots - * use poffset regardless of accesscontrol - * removed / from FQAN check in allowedgroups - * limit the max number of files in sub dataset - * use fasttrack only for evgen/simul - * added cleanup in updateSiteData - * added chdir to LFC - * added chdir for dq2 and fork - * removed logging updateJob/getJob from dispatcher - * use averaged updateJob/getJob - * ignore test when summing SiteData - * don't update SiteData when logrotate is running - * randomized the order of sites in updateSiteData to avoid concatenation - * fixed checkSitesWithCache - * multi-threads in adder.py - * count number of updateJob/getJob in add.py - * use taskBuffer in add.py for all DB access - * use fasttrack for all tasktypes and prio>=700 - * use taskBuffer for reassignment in copyArchived - * cleanup old PandaSiteIDs for UK - * set the number of treads to 2 in wsgi daemon - * set MaxRequestsPerChild 
- * enabled KeepAlive for proxy sites - * check filename FieldStorage when a param is treated as file - * not delete dis datasets when jobs are reassigned - * check useFastCGI before importing flup - * introduced nDBConForFastCGIWSGI - * fixed Setupper to re-register location at next attempt when previous was failed - * changed logLevel in httpd - * added flag to control verbosity of entry point - * added FastCGI stuff - -* 0.0.7 (11/20/2009) - * removed verbose message from DBProxyPool - * more verbose info to DBProxyPool - * fixed ReBrokerage to require the same distribution pattern of input datasets - * set encoded nJobs to taskID for analysis jobs - * fixed ReBrokerage - * propagate bad state from dashboard - * removed threading in dispatcher and dataservice - * fixed typo in dashboard access - * fixed CloudTaskSpec for serialization - * close non-DQ2 destinationDBlock in Closer - * use infinite loop in ProxyPool.__init__ - * add random sleep to ConBridge.connect - * use TaskBuffer instead of DBProxy in copyArchive - * added querySQLS to DBProxy - * use ping for wakeUp - * degrade message level of child termination in ConBridge - * added ConBridge for database timeout - * re-implemented rebrokerage to allow the case where build finished - -* 0.0.6 (11/13/2009) - * destinationSE=local - * propage failed_transfer from dashboard - * added activity to subscriptions - * added cleanup for Datasets table - * added workaround for x86_64-slc5-gcc43 - * removed TO_DATE for Datasets.modificationdate - * set priority of buildJob back to 2000 - * renamed testpanda.ddm to pandaddm_ - * added /osg/Role=pilot - * added lower limit for TO_DATE against Datasets table - * added protection in JobDispatch against non-proxy pilots - * added ReBroker - * removed UAT stuff - * use long queue in brokerage in addition - * increased max subjobs in UserIF to 5000 - * send log message from brokerage when disk shortage - * use ANALY_LONG_BNL_ATLAS for UAT - * added temporary priority boost 
for UAT - * added YY.MM.DD to destinationDBlock of PandaMover - * skipped release check in brokerage when weight is negative - * removed T1 constaint on high prio jobs in brokerage only for i686-slc5-gcc43-opt - * limit matching of cmtconfig=i686-slc5-gcc43-opt to i686-slc5-gcc43-opt jobs only - * changed brokerage to use only T1 for many input jobs when weight is negative - * removed computingElement matching in getJob for test jobs - * use transtimelo for timeout of analysis transfers - * fixed for site->siteid in installedSW - * added protection to _checkRole() - * use cache version matching for analysis - * added 'user' to NG words in Notifier - * take '_' into account in Closer for new naming convention - * use onlyNames in dq2.listDatasets - * changes for destSE - * changed cmtconfig for slc5 to match to slc4 and slc5 - * set pandamover priorities using original job priorities - * added HOTDISK to Setupper - * added PandaMonURL to email notification - * send email notification to site contact in addition to cloud contact - * use schedconfig.DN for privilege check in addition to cloudconfig - * ptest for analy tests - * use SARA-MATRIX for all T1 sources - * more NG words in address finding - * skip VUID lookup for analysis jobs - * added getSlimmedFileInfoPandaIDs - * added a hint for filesTable_ARCH - * limited modificationTime on filesTable_ARCH queries - * allowed the pilot to set status for failed input files - * make subscription for ptest - * use /atlas for auth of updateFileStatusInDisp - * added updateFileStatusInDisp to flag lost files - * removed double counting of jobs in Notifier - * updated template - * changed LogFormat for SLS - * send prodDBlockToken to the pilot - * modified Adder to take DQUnknownDatasetException into account - * make subscriptions for rc_test - * flagged all missing files in Setupper - * added jobType to Client.getJobStatisticsPerSite - * use stage-priority for prestaging - * updated the brokerage to take input size into 
account - * use cleanUserID in Notifier - * add copysetup to SiteSpec - * fixed getCurrentSiteData for analysis - * use pilotowners for checkRole in dispatcher - * ignore DBRelease when adding shadow - * support getJobStatisticsPerSite(countryGroup=None,workingGroup=None) - * added two more filed to dis datasetname - * calculate priority for each workingGroup - * added finder for email address using phonebook - * reverted the change in Setupper - * register location for _sub even when src=dest - * workingGroup/countryGroup in getJobStatisticsPerSite - * added getPandaClientVer - * fixed MailUtils for multiple recipients - * reuse unknown input files when build failed - * use T1 in brokerage when too many inputs are required - * added a timeout to Client - * set sources of dis for all clouds - * use MCTAPE for subscriptions - * added trustIS to runBrokerage - * added longFormat to listSiteAccess - * added set to updateSiteAccess - * verify workingGroup - * send email update/request for site access - * kill old dq2 processes - * addded updateSiteAccess - * workingGroup - * added MailUtils - * prestaging for MCTAPE - * set processingType for mover - * get proxy for each job in getFullJobStatus - * fixed address-check to trigger xwho - * introduced NG words in email-adder finding - * put size limit in putFile - * set higher priority for installation mover - * skip files used by failed/finished jobs in getFilesInUseForAnal - * removed BNL and old bamboo stuff from Client.py - * added a hint to updateInFilesReturnPandaIDs - * added getFilesInUseForAnal - * set sources for ES - * added a hint to getJobIDsInTimeRangeLog - * removed write spaces from md5sum/checksum in peekJobLog - -* 0.0.5 (5/15/2009) - * subtract N*250M from available space in brokerage - * use tasktype2 for RW recalculation - * allow transferring in updateJob - * use job stat per process group in brokerage - * added prodUserName - * added validation to test - * fixed TA - * use prodUserName for users - * 
added nEvents to JD - * added pilotowners - * added rc_test - * added a hint for Datasets.name - * enabled validatedReleases for all clouds - * set high priority for production role - * added realDatasetsIn - * get empty list of LFNs for empty dataset - * set modificationTime to ARCH tables - * fixed getUserParameter - * added nInputFiles for HC - * added countryGroup for country share - * use a hint for filesTable4.dataset - * fixed lookup for mail addr - * use PandaMover for US - * give higher priorities to /atlas/xyz/Role=production - * set workingGroup when jobs are submitted with prod role - * fixed peekJobLog - * replica location lookup for containers - * fixed broker_util to use proper python - * use jobParamsTable - * fixed python path to use 64bit glite - * fixed for ArchivedDB - * fixed FQAN extraction for GRST_CONN - * dispatchDBlockToken - * converted datetime to str for stateChangeTime - * use 12hr limit in getJobStatisticsForBamboo - * use CERN-PROD_DAQ for prestaging when _DATATAPE is not a location - * ignore token=ATLASDATATAPE when no tape copy - * pandasrv -> pandaserver - * set old=False for listDatasetReplicas - * fixed copyArchived for ArchiveDB - * added _zStr/_nullAttrs in JobSpec - * fixed getJobStatisticsForExtIF() - * fixed for schedID/pilotID - * removed redundant debug message - * fixed for Notification - * input token for mover - * set NULL for creationHost,AtlasRelease,transformation,homepackage - * use sequences directly for PandaID and row_ID - * use SUBCOUNTER_SUBID_SEQ directly - * added a hint to countFilesWithMap - * fixed getNUserJobs - * removed log/cache dirs making - * put alias to filesTable4 in countFilesWithMap - * introduced PANDA_URL_MAP - * suppressed meta in JobSpec - * error handling in Adder - * fixed enddate in Notifier - * use CURRENT_DATE in copyArch - * added nprestage - * added startTime/endTime in updateJob - * validatedreleases and accesscontrol - * 3 -> 1hour for movers (discarded) - * added 'IS NULL' to 
copyArch - * added bulk reading for PandaID to copyArch to avoid redundant lookup - * added a hint to updateOutFilesReturnPandaIDs - * use Null instead of 'NULL' - * don't reset jobParameters when reassigned - * added a hint to all fileTable4+destinationDBlock - * use JOBSARCHIVED4_MODTIME_IDX - * addSiteAccess and listSiteAccess - * hours=1 -> 3 for movers - * retry in peekJob - * reconnection in rollback - * added hint to queryDatasetWithMap - * use bind-variables for all queries - * fixed freezeDS - * fixed a duplicated variable in Closer - * truncate ddmErrorDiag - * hint to freezeDS - * removed deleteFiles in copyArchived - * not update modTime in copyArchived when peekJob failed - * container-aware - * validatedreleases and space check in brokerage - * added deleteJobSimple - * use validatedreleases for FR too - * fixed reassignXYZ - * use archivedFlag for copy/delete - * fine lock for reassignRepro - * threading for reassignRepro - * improved expiration messages - * failed when input dataset is not found in DQ2 - * debug messages in Setupper - * added other error codes in rollback - -* 0.0.4 (2/23/2009) - * GSI authentication for pilots - * tag-based security mechanism for scheduler-pilot-server chain - * fixed test/add.py to use Oracle instead of MySQL - * fixed querySQLS for DELETE - * added panda_server-grid-env.sh - * merged DB proxies to reduce the number of connections - * added lock for worker MPM - * use common write account - -* 0.0.3 (2/16/2009) - * sync to production version - -* 0.0.2 (12/18/2008) - * adjustments for CERN - -* 0.0.1 (12/4/2008) - * first import - - LocalWords: ConBridge diff --git a/current/pandaserver/__init__.py b/current/pandaserver/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/current/pandaserver/brokerage/ErrorCode.py b/current/pandaserver/brokerage/ErrorCode.py deleted file mode 100644 index ea80122e4..000000000 --- a/current/pandaserver/brokerage/ErrorCode.py +++ /dev/null @@ -1,9 +0,0 @@ 
-############## errror code - -# release is not found -EC_Release = 100 - -# voms authentication failure -EC_Voms = 101 - - diff --git a/current/pandaserver/brokerage/LFCclient.py b/current/pandaserver/brokerage/LFCclient.py deleted file mode 100755 index 06a857e8e..000000000 --- a/current/pandaserver/brokerage/LFCclient.py +++ /dev/null @@ -1,152 +0,0 @@ -import re -import os -import sys -import socket -import random - -# error codes -EC_Main = 70 -EC_LFC = 80 - -# import lfc api -try: - import lfc -except: - print "ERROR : could not import lfc" - sys.exit(EC_LFC) - - -# get files from LFC -def _getFilesLFC(files,lfcHost,storages,verbose=False): - # randomly resolve DNS alias - if lfcHost in ['prod-lfc-atlas.cern.ch']: - lfcHost = random.choice(socket.gethostbyname_ex(lfcHost)[2]) - # set LFC HOST - os.environ['LFC_HOST'] = lfcHost - # timeout - os.environ['LFC_CONNTIMEOUT'] = '60' - os.environ['LFC_CONRETRY'] = '2' - os.environ['LFC_CONRETRYINT'] = '6' - # get PFN - iGUID = 0 - nGUID = 1000 - pfnMap = {} - listGUID = [] - for guid in files.keys(): - if verbose: - sys.stdout.write('.') - sys.stdout.flush() - iGUID += 1 - listGUID.append(guid) - if iGUID % nGUID == 0 or iGUID == len(files): - # get replica - ret,resList = lfc.lfc_getreplicas(listGUID,'') - if ret == 0: - for fr in resList: - if fr != None and ((not hasattr(fr,'errcode')) or \ - (hasattr(fr,'errcode') and fr.errcode == 0)): - # get host - match = re.search('^[^:]+://([^:/]+):*\d*/',fr.sfn) - if match==None: - continue - # check host - host = match.group(1) - if storages != [] and (not host in storages): - continue - # append - if not pfnMap.has_key(fr.guid): - pfnMap[fr.guid] = [] - pfnMap[fr.guid].append(fr.sfn) - else: - print "ERROR : %s" % lfc.sstrerror(lfc.cvar.serrno) - sys.exit(EC_LFC) - # reset - listGUID = [] - # collect LFNs - retLFNs = {} - for guid,lfn in files.iteritems(): - if guid in pfnMap.keys(): - retLFNs[lfn] = pfnMap[guid] - # return - return retLFNs - - - 
-#################################################################### -# main -def main(): - import sys - import getopt - # option class - class _options: - def __init__(self): - pass - options = _options() - del _options - # set default values - options.verbose = False - options.lfns = [] - options.guids = [] - options.lfchost = '' - options.storages = [] - options.infile = None - options.outfile = None - # get command-line parameters - try: - opts, args = getopt.getopt(sys.argv[1:],"s:i:g:vl:o:f:") - except: - _usage() - print "ERROR : Invalid options" - sys.exit(EC_Main) - # set options - for o, a in opts: - if o in ("-v",): - options.verbose = True - if o in ("-s",): - options.storages = a.split(',') - if o in ("-i",): - options.lfns = a.split(',') - if o in ("-g",): - options.guids = a.split(',') - if o in ("-l",): - options.lfchost = a - if o in ("-f",): - options.infile = a - if o in ("-o",): - options.outfile = a - # read GUID/LFN - files = {} - if options.infile == None: - for idx in range(len(options.guids)): - guid = options.guids[idx] - lfn = options.lfns[idx] - if guid != 'NULL': - files[guid] = lfn - else: - try: - # read from file - ifile = open(options.infile) - for line in ifile: - items = line.split() - if len(items) == 2: - guid = items[1] - lfn = items[0] - if guid != 'NULL': - files[guid] = lfn - # close and delete - ifile.close() - os.remove(options.infile) - except: - errType,errValue = sys.exc_info()[:2] - print "ERROR: %s:%s" % (errType,errValue) - sys.exit(1) - # get files - retFiles = _getFilesLFC(files,options.lfchost,options.storages,options.verbose) - print "LFCRet : %s " % retFiles - # return - sys.exit(0) - - -if __name__ == "__main__": - main() - diff --git a/current/pandaserver/brokerage/PandaSiteIDs.py b/current/pandaserver/brokerage/PandaSiteIDs.py deleted file mode 100644 index 5819cc4c0..000000000 --- a/current/pandaserver/brokerage/PandaSiteIDs.py +++ /dev/null @@ -1,198 +0,0 @@ -# !!!!!!! This file is OBSOLETE. 
Its content has been absorbed into pilotController.py in the autopilot repository. -# !!!!!!! Questions to Torre Wenaus. -PandaSiteIDs = { - 'AGLT2' : {'nickname':'AGLT2-condor','status':'OK'}, - 'ALBERTA-LCG2' : {'nickname':'ALBERTA-LCG2-lcgce01-atlas-lcgpbs','status':'OK'}, - 'ANALY_AGLT2' : {'nickname':'ANALY_AGLT2-condor','status':'OK'}, - 'ANALY_ALBERTA' : {'nickname':'ALBERTA-LCG2-lcgce01-atlas-lcgpbs','status':'OK'}, - 'ANALY_BEIJING' : {'nickname':'BEIJING-LCG2-lcg002-atlas-lcgpbs','status':'OK'}, - 'ANALY_BNL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, - 'ANALY_BNL_ATLAS_1' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, - 'ANALY_BNL_ATLAS_2' : {'nickname':'BNL_ATLAS_2-condor','status':'OK'}, - #'ANALY_BNL_LOCAL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, - 'ANALY_BNL_test' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, - 'ANALY_BNL_test2' : {'nickname':'ANALY_BNL_ATLAS_1-condor','status':'OK'}, - 'ANALY_BNL_test3' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, - 'ANALY_BRUNEL' : {'nickname':'UKI-LT2-Brunel-dgc-grid-44-atlas-lcgpbs','status':'notOK'}, - 'ANALY_CERN' : {'nickname':'CERN-PROD-ce123-grid_atlas-lcglsf','status':'notOK'}, - 'ANALY_CNAF' : {'nickname':'INFN-CNAF-gridit-ce-001-lcg-lcgpbs','status':'notOK'}, - 'ANALY_CPPM' : {'nickname':'IN2P3-CPPM-marce01-atlas-pbs','status':'OK'}, - 'ANALY_FZK' : {'nickname':'FZK-LCG2-ce-5-fzk-atlasXS-pbspro','status':'OK'}, - 'ANALY_GLASGOW' : {'nickname':'UKI-SCOTGRID-GLASGOW-svr021-q3d-lcgpbs','status':'OK'}, - 'ANALY_GLOW-ATLAS' : {'nickname':'GLOW-ATLAS-condor','status':'OK'}, - 'ANALY_GRIF-IRFU' : {'nickname':'GRIF-IRFU-node07-atlas-lcgpbs','status':'OK'}, - 'ANALY_GRIF-LAL' : {'nickname':'GRIF-LAL-grid10-atlasana-pbs','status':'notOK'}, - 'ANALY_GRIF-LPNHE' : {'nickname':'GRIF-LPNHE-lpnce-atlas-pbs','status':'notOK'}, - 'ANALY_HU_ATLAS_Tier2' : {'nickname':'ANALY_HU_ATLAS_Tier2-lsf','status':'OK'}, - 'ANALY_LANCS' : 
{'nickname':'UKI-NORTHGRID-LANCS-HEP-fal-pygrid-18-atlas-lcgpbs','status':'notOK'}, - 'ANALY_LAPP' : {'nickname':'IN2P3-LAPP-lapp-ce01-atlas-pbs','status':'notOK'}, - 'ANALY_LIV' : {'nickname':'UKI-NORTHGRID-LIV-HEP-hepgrid2-atlas-lcgpbs','status':'notOK'}, - 'ANALY_LONG_BNL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, - 'ANALY_LONG_BNL_ATLAS' : {'nickname':'BNL_ATLAS_2-condor','status':'OK'}, - 'ANALY_LONG_BNL_LOCAL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, - 'ANALY_LONG_LYON' : {'nickname':'IN2P3-CC-T2-cclcgceli05-long-bqs','status':'OK'}, - 'ANALY_LPC' : {'nickname':'IN2P3-LPC-clrlcgce03-atlas-lcgpbs','status':'notOK'}, - 'ANALY_LPSC' : {'nickname':'IN2P3-LPSC-lpsc-ce-atlas-pbs','status':'OK'}, - 'ANALY_LYON' : {'nickname':'IN2P3-CC-T2-cclcgceli05-medium-bqs','status':'OK'}, - 'ANALY_MANC' : {'nickname':'UKI-NORTHGRID-MAN-HEP-ce01-atlas-lcgpbs','status':'OK'}, - 'ANALY_MCGILL' : {'nickname':'MCGILL-LCG2-atlas-ce-atlas-lcgpbs','status':'OK'}, - 'ANALY_MWT2' : {'nickname':'ANALY_MWT2-condor','status':'notOK'}, - 'ANALY_MWT2_SHORT' : {'nickname':'ANALY_MWT2_SHORT-pbs','status':'notOK'}, - 'ANALY_NET2' : {'nickname':'ANALY_NET2-pbs','status':'OK'}, - 'ANALY_OU_OCHEP_SWT2' : {'nickname':'ANALY_OU_OCHEP_SWT2-condor','status':'notOK'}, - 'ANALY_PIC' : {'nickname':'pic-ce07-gshort-lcgpbs','status':'OK'}, - 'ANALY_RAL' : {'nickname':'RAL-LCG2-lcgce01-atlasL-lcgpbs','status':'OK'}, - 'ANALY_ROMANIA02' : {'nickname':'RO-02-NIPNE-tbat01-atlas-lcgpbs','status':'notOK'}, - 'ANALY_ROMANIA07' : {'nickname':'RO-07-NIPNE-tbit01-atlas-lcgpbs','status':'notOK'}, - 'ANALY_SARA' : {'nickname':'SARA-MATRIX-mu6-short-pbs','status':'notOK'}, - 'ANALY_SFU' : {'nickname':'SFU-LCG2-snowpatch-hep-atlas-lcgpbs','status':'notOK'}, - 'ANALY_SHEF' : {'nickname':'UKI-NORTHGRID-SHEF-HEP-lcgce0-atlas-lcgpbs','status':'OK'}, - 'ANALY_SLAC' : {'nickname':'ANALY_SLAC-lsf','status':'OK'}, - 'ANALY_SWT2_CPB' : {'nickname':'ANALY_SWT2_CPB-pbs','status':'OK'}, - 'ANALY_TAIWAN' : 
{'nickname':'Taiwan-LCG2-w-ce01-atlas-lcgpbs','status':'OK'}, - 'ANALY_TEST' : {'nickname':'ANALY_TEST','status':'OK'}, - 'ANALY_TORONTO' : {'nickname':'TORONTO-LCG2-bigmac-lcg-ce2-atlas-pbs','status':'OK'}, - 'ANALY_TOKYO' : {'nickname':'TOKYO-LCG2-lcg-ce01-atlas-lcgpbs','status':'OK'}, - 'ANALY_TRIUMF' : {'nickname':'TRIUMF-LCG2-ce1-atlas-lcgpbs','status':'OK'}, - 'ANALY_UBC' : {'nickname':'UBC-pbs','status':'OK'}, - 'ANALY_UIUC-HEP' : {'nickname':'ANALY_UIUC-HEP-condor','status':'OK'}, - 'ANALY_UTA' : {'nickname':'UTA-DPCC-pbs','status':'OK'}, - 'ANALY_UTA-DPCC' : {'nickname':'UTA-DPCC-test-pbs','status':'notOK'}, - 'ANALY_VICTORIA' : {'nickname':'VICTORIA-LCG2-lcg-ce-general-lcgpbs','status':'OK'}, - 'AUVERGRID' : {'nickname':'AUVERGRID-iut15auvergridce01-atlas-lcgpbs','status':'notOK'}, - 'ASGC' : {'nickname':'Taiwan-LCG2-w-ce01-atlas-lcgpbs','status':'OK'}, - 'ASGC_REPRO' : {'nickname':'ASGC_REPRO','status':'notOK'}, - 'Australia-ATLAS' : {'nickname':'Australia-ATLAS-agh2-atlas-lcgpbs','status':'OK'}, - 'BARNETT_TEST' : {'nickname':'BARNETT_TEST','status':'notOK'}, - 'BEIJING' : {'nickname':'BEIJING-LCG2-lcg002-atlas-lcgpbs','status':'OK'}, - 'BNLPROD' : {'nickname':'BNL_ATLAS_1-condor','status':'notOK'}, - 'BNL_ATLAS_1' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, - 'BNL_ATLAS_2' : {'nickname':'BNL_ATLAS_2-condor','status':'OK'}, - 'BNL_ATLAS_DDM' : {'nickname':'BNL_DDM-condor','status':'notOK'}, - 'BNL_ATLAS_test' : {'nickname':'BNL_ATLAS_2-condor','status':'notOK'}, - 'BU_ATLAS_Tier2' : {'nickname':'BU_ATLAS_Tier2-pbs','status':'OK'}, - 'BU_ATLAS_Tier2o' : {'nickname':'BU_ATLAS_Tier2o-pbs','status':'OK'}, - 'BU_ATLAS_test' : {'nickname':'BU_ATLAS_Tier2-pbs','status':'NOTOK'}, - 'HU_ATLAS_Tier2' : {'nickname':'HU_ATLAS_Tier2-lsf','status':'OK'}, - 'CERN-BUILDS' : {'nickname':'CERN-BUILDS','status':'notOK'}, - 'CERN-RELEASE' : {'nickname':'CERN-RELEASE','status':'notOK'}, - 'CERN-UNVALID' : {'nickname':'CERN-UNVALID','status':'notOK'}, - 'CGG' : 
{'nickname':'CGG-LCG2-ce1-atlas-lcgpbs','status':'notOK'}, - 'CHARMM' : {'nickname':'CHARMM','status':'notOK'}, - 'CNR-ILC-PISA' : {'nickname':'CNR-ILC-PISA-gridce-atlas-lcgpbs','status':'notOK'}, - 'CPPM' : {'nickname':'IN2P3-CPPM-marce01-atlas-pbs','status':'OK'}, - 'CSCS-LCG2' : {'nickname':'CSCS-LCG2-ce01-egee48h-lcgpbs','status':'OK'}, - 'csTCDie' : {'nickname':'csTCDie-gridgate-himem-pbs','status':'OK'}, - 'CYF' : {'nickname':'CYFRONET-LCG2-ce-atlas-pbs','status':'OK'}, - 'DESY-HH' : {'nickname':'DESY-HH-grid-ce3-default-lcgpbs','status':'OK'}, - 'DESY-ZN' : {'nickname':'DESY-ZN-lcg-ce0-atlas-lcgpbs','status':'OK'}, - 'EFDA-JET' : {'nickname':'EFDA-JET-grid002-atlas-lcgpbs','status':'notok'}, - 'FZK-LCG2' : {'nickname':'FZK-LCG2-ce-1-fzk-atlasXL-pbspro','status':'OK'}, - 'FZK_REPRO' : {'nickname':'FZK_REPRO','status':'notOK'}, - 'FZU' : {'nickname':'praguelcg2-golias25-lcgatlas-lcgpbs','status':'OK'}, - 'GLOW' : {'nickname':'GLOW-CMS-cmsgrid02-atlas-condor','status':'notOK'}, - 'GLOW-ATLAS' : {'nickname':'GLOW-ATLAS-condor','status':'OK'}, - 'GoeGrid' : {'nickname':'GoeGrid-ce-goegrid-atlas-lcgpbs','status':'OK'}, - 'GRIF-IRFU' : {'nickname':'GRIF-IRFU-node07-atlas-lcgpbs','status':'OK'}, - 'GRIF-LAL' : {'nickname':'GRIF-LAL-grid10-atlas-pbs','status':'OK'}, - 'GRIF-LPNHE' : {'nickname':'GRIF-LPNHE-lpnce-atlas-pbs','status':'OK'}, - 'HEPHY-UIBK' : {'nickname':'HEPHY-UIBK-hepx4-atlas-lcgpbs','status':'OK'}, - 'IFAE' : {'nickname':'ifae-ifaece01-ifae-lcgpbs','status':'OK'}, - 'IFIC' : {'nickname':'IFIC-LCG2-ce01-atlas-pbs','status':'OK'}, - 'IHEP' : {'nickname':'BEIJING-LCG2-lcg002-atlas-lcgpbs','status':'OK'}, - 'ITEP' : {'nickname':'ITEP-ceglite-atlas-lcgpbs','status':'OK'}, - 'IN2P3-LPSC' : {'nickname':'IN2P3-LPSC-lpsc-ce-atlas-pbs','status':'OK'}, - 'JINR-LCG2' : {'nickname':'JINR-LCG2-lcgce01-atlas-lcgpbs', 'status':'OK'}, - 'LAPP' : {'nickname':'IN2P3-LAPP-lapp-ce01-atlas-pbs','status':'OK'}, - 'LIP-COIMBRA' : 
{'nickname':'LIP-Coimbra-grid006-atlas-lcgpbs','status':'OK'}, - 'LIP-LISBON' : {'nickname':'LIP-Lisbon-ce02-atlasgrid-lcgsge','status':'OK'}, - 'LLR' : {'nickname':'GRIF-LLR-polgrid1-atlas-pbs','status':'notOK'}, - 'LPC' : {'nickname':'IN2P3-LPC-clrlcgce03-atlas-lcgpbs','status':'OK'}, - 'LRZ' : {'nickname':'LRZ-LMU-lcg-lrz-ce-atlas-sge','status':'OK'}, - 'LYON' : {'nickname':'IN2P3-CC-cclcgceli02-long-bqs','status':'OK'}, - 'LYON_REPRO' : {'nickname':'LYON_REPRO','status':'notOK'}, - 'Lyon-T2' : {'nickname':'IN2P3-CC-T2-cclcgceli05-long-bqs','status':'OK'}, - 'LTU_CCT' : {'nickname':'LTU_CCT-pbs','status':'OK'}, - 'MANC' : {'nickname':'UKI-NORTHGRID-MAN-HEP-ce02-atlas-lcgpbs','status':'OK'}, - 'MCGILL-LCG2' : {'nickname':'MCGILL-LCG2-atlas-ce-atlas-pbs','status':'OK'}, - 'MONTREAL' : {'nickname':'Umontreal-LCG2-lcg-ce-atlas-lcgpbs','status':'notOK'}, - 'MPP' : {'nickname':'MPPMU-grid-ce-long-sge','status':'OK'}, - 'MWT2_IU' : {'nickname':'MWT2_IU-pbs','status':'OK'}, - 'MWT2_UC' : {'nickname':'MWT2_UC-pbs','status':'OK'}, - 'NDGF' : {'nickname':'NDGF-condor','status':'OK'}, - 'NIKHEF-ELPROD' : {'nickname':'NIKHEF-ELPROD-gazon-atlas-pbs','status':'OK'}, - 'NIKHEF_REPRO' : {'nickname':'NIKHEF_REPRO','status':'notOK'}, - 'OUHEP_ITB' : {'nickname':'OUHEP_ITB-condor','status':'notOK'}, - 'OU_PAUL_TEST' : {'nickname':'OU_OCHEP_SWT2-condor','status':'notOK'}, - 'OU_OCHEP_SWT2' : {'nickname':'OU_OCHEP_SWT2-condor','status':'OK'}, - 'OU_OSCER_ATLAS' : {'nickname':'OU_OSCER_ATLAS-lsf','status':'OK'}, - 'OU_OSCER_ATLASdeb' : {'nickname':'OU_OSCER_ATLASdeb-lsf','status':'notOK'}, - 'PSNC' : {'nickname':'PSNC-ce-atlas-pbs','status':'OK'}, - 'PIC' : {'nickname':'pic-ce05-glong-lcgpbs','status':'OK'}, - 'PIC_REPRO' : {'nickname':'PIC_REPRO','status':'notOK'}, - 'prague_cesnet_lcg2' : {'nickname':'prague_cesnet_lcg2-skurut17-egee_atlas-lcgpbs','status':'notOK'}, - 'RAL' : {'nickname':'RAL-LCG2-lcgce02-grid1000M-lcgpbs','status':'OK'}, - 'RAL_REPRO' : 
{'nickname':'RAL_REPRO','status':'notOK'}, - 'ru-Moscow-SINP-LCG2' : {'nickname':'ru-Moscow-SINP-LCG2-lcg02-atlas-lcgpbs','status':'OK'}, - 'ru-PNPI' : {'nickname':'ru-PNPI-cluster-atlas-pbs','status':'OK'}, - 'RDIGTEST' : {'nickname':'RDIGTEST','status':'notOK'}, - 'ROMANIA02' : {'nickname':'RO-02-NIPNE-tbat01-atlas-lcgpbs','status':'OK'}, - 'ROMANIA07' : {'nickname':'RO-07-NIPNE-tbit01-atlas-lcgpbs','status':'OK'}, - 'RRC-KI' : {'nickname':'RRC-KI-gate-atlas-lcgpbs','status':'OK'}, - 'RU-Protvino-IHEP' : {'nickname':'RU-Protvino-IHEP-ce0003-atlas-lcgpbs','status':'OK'}, - 'SARA_REPRO' : {'nickname':'SARA_REPRO','status':'notOK'}, - 'SFU-LCG2' : {'nickname':'SFU-LCG2-snowpatch-atlas-lcgpbs','status':'OK'}, - 'SLACXRD' : {'nickname':'SLACXRD-lsf','status':'OK'}, - 'SLAC_PAUL_TEST' : {'nickname':'SLACXRD-lsf','status':'notOK'}, - 'SNS-PISA' : {'nickname':'SNS-PISA-gridce-atlas-lcgpbs','status':'notOK'}, - 'SPACI-CS-IA64' : {'nickname':'SPACI-CS-IA64-square-atlas-lsf','status':'notOK'}, - 'SWT2_CPB' : {'nickname':'SWT2_CPB-pbs','status':'OK'}, - 'Taiwan-IPAS-LCG2' : {'nickname':'Taiwan-IPAS-LCG2-atlasce-atlas-lcgcondor','status':'notOK'}, - 'TEST1' : {'nickname':'TEST1','status':'notOK'}, - 'TEST2' : {'nickname':'TEST2','status':'notOK'}, - 'TEST3' : {'nickname':'TEST3','status':'notOK'}, - 'TEST4' : {'nickname':'TEST4','status':'notOK'}, - 'TESTCHARMM' : {'nickname':'TESTCHARMM','status':'notOK'}, - 'TESTGLIDE' : {'nickname':'TESTGLIDE','status':'notOK'}, - 'TOKYO' : {'nickname':'TOKYO-LCG2-lcg-ce01-atlas-lcgpbs','status':'OK'}, - 'TORONTO-LCG2' : {'nickname':'TORONTO-LCG2-bigmac-lcg-ce2-atlas-pbs','status':'OK'}, - 'TPATHENA' : {'nickname':'TPATHENA','status':'notOK'}, - 'TPPROD' : {'nickname':'TPPROD','status':'notOK'}, - 'TRIUMF' : {'nickname':'TRIUMF-LCG2-ce1-atlas-lcgpbs','status':'OK'}, - 'TRIUMF_DDM' : {'nickname':'TRIUMF_DDM','status':'notOK'}, - 'TRIUMF_REPRO' : {'nickname':'TRIUMF_REPRO','status':'notOK'}, - 'TW-FTT' : 
{'nickname':'TW-FTT-f-ce01-atlas-lcgpbs','status':'OK'}, - 'TWTEST' : {'nickname':'TWTEST','status':'notOK'}, - 'TestPilot' : {'nickname':'TestPilot','status':'notOK'}, - 'UAM-LCG2' : {'nickname':'UAM-LCG2-grid003-atlas-lcgpbs','status':'OK'}, - 'UBC' : {'nickname':'UBC-pbs','status':'OK'}, - 'UBC_PAUL_TEST' : {'nickname':'UBC-pbs','status':'notOK'}, - 'UIUC-HEP' : {'nickname':'UIUC-HEP-condor','status':'OK'}, - 'UCITB_EDGE7' : {'nickname':'UCITB_EDGE7-pbs','status':'OK'}, - 'UC_ATLAS_MWT2' : {'nickname':'UC_ATLAS_MWT2-condor','status':'OK'}, - 'UC_ATLAS_test' : {'nickname':'UC_ATLAS_MWT2-condor','status':'OK'}, - 'UC_Teraport' : {'nickname':'UC_Teraport-pbs','status':'notOK'}, - 'UMESHTEST' : {'nickname':'UMESHTEST','status':'notOK'}, - 'UNI-FREIBURG' : {'nickname':'UNI-FREIBURG-ce-atlas-pbs','status':'OK'}, - 'UTA-DPCC' : {'nickname':'UTA-DPCC-pbs','status':'OK'}, - 'UTA-DPCC-test' : {'nickname':'UTA-DPCC-test-pbs','status':'OK'}, - 'UTA_PAUL_TEST' : {'nickname':'UTA-SWT2-pbs','status':'notOK'}, - 'UTA_SWT2' : {'nickname':'UTA-SWT2-pbs','status':'OK'}, - 'UTD-HEP' : {'nickname':'UTD-HEP-pbs','status':'OK'}, - 'VICTORIA-LCG2' : {'nickname':'VICTORIA-LCG2-lcg-ce-general-lcgpbs','status':'OK'}, - 'Wuppertal' : {'nickname':'wuppertalprod-grid-ce-dg_long-lcgpbs','status':'OK'}, -} - - -# cloud-MoverID mapping -PandaMoverIDs = { - 'US' : 'BNL_ATLAS_DDM', - 'CA' : 'TRIUMF_DDM', - 'FR' : 'TRIUMF_DDM', - 'IT' : 'TRIUMF_DDM', - 'NL' : 'TRIUMF_DDM', - 'DE' : 'TRIUMF_DDM', - 'TW' : 'TRIUMF_DDM', - 'UK' : 'TRIUMF_DDM', - 'ES' : 'TRIUMF_DDM', - } diff --git a/current/pandaserver/brokerage/SiteMapper.py b/current/pandaserver/brokerage/SiteMapper.py deleted file mode 100644 index a0ad2c0a6..000000000 --- a/current/pandaserver/brokerage/SiteMapper.py +++ /dev/null @@ -1,205 +0,0 @@ -import re -import sys - -# logger -from pandalogger.PandaLogger import PandaLogger -_logger = PandaLogger().getLogger('SiteMapper') - -# PandaIDs -from PandaSiteIDs import PandaSiteIDs - -# default 
site -from taskbuffer.SiteSpec import SiteSpec -defSite = SiteSpec() -defSite.sitename = 'BNL_ATLAS_1' -defSite.nickname = 'BNL_ATLAS_1-condor' -defSite.dq2url = 'http://dms02.usatlas.bnl.gov:8000/dq2/' -defSite.ddm = 'PANDA_UNDEFINED' -defSite.type = 'production' -defSite.gatekeeper = 'gridgk01.racf.bnl.gov' -defSite.status = 'online' -defSite.setokens = {} - - -######################################################################## - -class SiteMapper: - - # constructor - def __init__(self,taskBuffer,verbose=False): - _logger.debug('__init__ SiteMapper') - try: - # site list - self.siteSpecList = {} - - # sites not belonging to a cloud - self.defCloudSites = [] - - # cloud specification - self.cloudSpec = {} - - # create CloudSpec list - tmpCloudListDB = taskBuffer.getCloudList() - for tmpName,tmpCloudSpec in tmpCloudListDB.iteritems(): - self.cloudSpec[tmpName] = {} - # copy attributes from CloudSepc - for tmpAttr in tmpCloudSpec._attributes: - self.cloudSpec[tmpName][tmpAttr] = getattr(tmpCloudSpec,tmpAttr) - # append additional attributes - # source : Panda siteID for source - # dest : Panda siteID for dest - # sites : Panda siteIDs in the cloud - self.cloudSpec[tmpName]['source'] = self.cloudSpec[tmpName]['tier1'] - self.cloudSpec[tmpName]['dest'] = self.cloudSpec[tmpName]['tier1'] - self.cloudSpec[tmpName]['sites'] = [] - _logger.debug('Cloud->%s %s' % (tmpName,str(self.cloudSpec[tmpName]))) - # get list of PandaIDs - siteIDsList = taskBuffer.getSiteList() - firstDefault = True - # read full list from DB - siteFullList = taskBuffer.getSiteInfo() - # read DB to produce paramters in siteinfo dynamically - for tmpID,tmpNicknameList in siteIDsList.iteritems(): - for tmpNickname in tmpNicknameList: - # invalid nickname - if not siteFullList.has_key(tmpNickname): - continue - # get full spec - ret = siteFullList[tmpNickname] - # append - if ret == None: - _logger.error('Could not read site info for %s:%s' % (tmpID,tmpNickname)) - elif (firstDefault and tmpID == 
defSite.sitename) or (not self.siteSpecList.has_key(tmpID)) \ - or (self.siteSpecList.has_key(tmpID) and self.siteSpecList[tmpID].status in ['offline','']): - # overwrite default or remove existing offline - if firstDefault and tmpID == defSite.sitename: - del self.siteSpecList[tmpID] - firstDefault = False - elif self.siteSpecList.has_key(tmpID) and self.siteSpecList[tmpID].status in ['offline','']: - del self.siteSpecList[tmpID] - # append - if not self.siteSpecList.has_key(tmpID): - # determine type following a convention - tmpType = 'production' - if tmpID.startswith('ANALY_'): - tmpType = 'analysis' - elif re.search('test',tmpID,re.I) or \ - (PandaSiteIDs.has_key(tmpID) and PandaSiteIDs[tmpID]['status']!='OK'): - tmpType = 'test' - # set type - ret.sitename = tmpID - ret.type = tmpType - # don't use site for production when cloud is undefined - if ret.type == 'production' and ret.cloud == '': - _logger.error('Empty cloud for %s:%s' % (tmpID,tmpNickname)) - else: - self.siteSpecList[tmpID] = ret - else: - # overwrite status - if not ret.status in ['offline','']: - if self.siteSpecList[tmpID].status != 'online': - self.siteSpecList[tmpID].status = ret.status - # use larger maxinputsize and memory - try: - if ret.status in ['online']: - if self.siteSpecList[tmpID].maxinputsize < ret.maxinputsize or \ - ret.maxinputsize == 0: - self.siteSpecList[tmpID].maxinputsize = ret.maxinputsize - if (self.siteSpecList[tmpID].memory != 0 and self.siteSpecList[tmpID].memory < ret.memory) or \ - ret.memory == 0: - self.siteSpecList[tmpID].memory = ret.memory - except: - errtype, errvalue = sys.exc_info()[:2] - _logger.error("%s memory/inputsize failuer : %s %s" % (tmpID,errtype,errvalue)) - # make cloudSpec - for siteSpec in self.siteSpecList.values(): - # choose only prod sites - if siteSpec.type != 'production': - continue - # append prod site in cloud - for tmpCloud in siteSpec.cloudlist: - if self.cloudSpec.has_key(tmpCloud): - if not siteSpec.sitename in 
self.cloudSpec[tmpCloud]['sites']: - # append - self.cloudSpec[tmpCloud]['sites'].append(siteSpec.sitename) - else: - # append to the default cloud - if not siteSpec.sitename in self.defCloudSites: - # append - self.defCloudSites.append(siteSpec.sitename) - # set defCloudSites for backward compatibility - if self.cloudSpec.has_key('US'): - # use US sites - self.defCloudSites = self.cloudSpec['US']['sites'] - else: - # add def site as a protection if defCloudSites is empty - self.defCloudSites.append(defSite.sitename) - # dump sites - if verbose: - _logger.debug('========= dump =========') - for tmpSite,tmpSiteSpec in self.siteSpecList.iteritems(): - _logger.debug('Site->%s' % str(tmpSiteSpec)) - # check - for tmpCloud,tmpVals in self.cloudSpec.iteritems(): - # set T1 - try: - tmpVals['sites'].remove(tmpVals['dest']) - except: - pass - tmpVals['sites'].insert(0,tmpVals['dest']) - # dump - _logger.debug('Cloud:%s has %s' % (tmpCloud,tmpVals['sites'])) - for tmpSite in tmpVals['sites']: - if not self.siteSpecList.has_key(tmpSite): - _logger.debug(" '%s' doesn't exist" % tmpSite) - continue - tmpSiteSpec = self.siteSpecList[tmpSite] - if tmpSiteSpec.status in ['offline']: - _logger.debug(' %s:%s' % (tmpSite,tmpSiteSpec.status)) - _logger.debug('Cloud:XX has %s' % self.defCloudSites) - except: - type, value, traceBack = sys.exc_info() - _logger.error("__init__ SiteMapper : %s %s" % (type,value)) - _logger.debug('__init__ SiteMapper done') - - - # accessor for site - def getSite(self,site): - if self.siteSpecList.has_key(site): - return self.siteSpecList[site] - else: - # return default site - return defSite - - - # check if site exists - def checkSite(self,site): - return self.siteSpecList.has_key(site) - - - # accessor for cloud - def getCloud(self,cloud): - if self.cloudSpec.has_key(cloud): - return self.cloudSpec[cloud] - else: - # return sites in default cloud - ret = { 'source' : 'default', - 'dest' : 'default', - 'sites' : self.defCloudSites, - 'transtimelo' : 2, 
class VomsResolver:
    """Map VO names to the user DNs listed for them in a grid-mapfile.

    The map file is parsed once, at construction time.  A line starting
    with "#----" opens a new VO section; the VO name is the second-to-last
    whitespace-separated token of that line.  Any other line that begins
    with a double-quoted string contributes that string as a DN of the
    current VO.
    """

    # constructor
    def __init__(self,mapFileName='/home/sm/grid-mapfile'):
        """Read the grid-mapfile.

        mapFileName : path of the grid-mapfile to parse.  Defaults to the
                      location that used to be hard-coded.

        Any failure while opening or parsing the file is logged and leaves
        the map with whatever was collected so far (best effort).
        """
        # VO name -> list of DNs registered for that VO
        self.vomsUserMap = {}
        try:
            # read grid-mapfile
            mapFile = open(mapFileName)
            try:
                vo = None
                for line in mapFile:
                    if line.startswith("#----"):
                        # get vo name
                        vo = line.split()[-2]
                        _logger.debug('get VO:%s' % vo)
                        self.vomsUserMap[vo] = []
                    else:
                        # get DN
                        match = re.search('^"([^"]+)"',line)
                        # a DN seen before any VO header cannot be attributed to a VO
                        if match is not None and vo is not None:
                            # append (assigning here would replace the list with a string)
                            self.vomsUserMap[vo].append(match.group(1))
            finally:
                # close grid-mapfile even when parsing fails
                mapFile.close()
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            _logger.error("init : %s %s" % (errType,errValue))


    # check the user is on VO
    def checkUser(self,voms,dn):
        """Return True when dn starts with one of the DNs registered for voms.

        Returns False when the VO is unknown or no registered DN is a
        prefix of dn.
        """
        _logger.debug('checkUser VO:%s DN:%s' % (voms,dn))
        if voms not in self.vomsUserMap:
            _logger.debug(' NG - VO:%s is unsupported' % voms)
            return False
        # look for DN
        for tmpDN in self.vomsUserMap[voms]:
            if dn.startswith(tmpDN):
                # no format argument here: the message has no placeholder
                _logger.debug(' OK')
                return True
        _logger.debug(' NG - DN:%s is not found' % dn)
        return False


    # check voms is supported
    def checkVoms(self,voms):
        """Return True when a section for voms was found in the grid-mapfile."""
        return voms in self.vomsUserMap
a/current/pandaserver/brokerage/broker.py b/current/pandaserver/brokerage/broker.py deleted file mode 100755 index 0cfc98dee..000000000 --- a/current/pandaserver/brokerage/broker.py +++ /dev/null @@ -1,1684 +0,0 @@ -import re -import sys -import time -import types -import fcntl -import random -import datetime -import commands -import ErrorCode -import broker_util -import PandaSiteIDs -from taskbuffer import ProcessGroups -from dataservice import DataServiceUtils -from config import panda_config - -from pandalogger.PandaLogger import PandaLogger -_log = PandaLogger().getLogger('broker') - -# all known sites -_allSites = PandaSiteIDs.PandaSiteIDs.keys() - -# sites for prestaging -#prestageSites = ['BNL_ATLAS_test','BNL_ATLAS_1','BNL_ATLAS_2'] - -# non LRC checking -_disableLRCcheck = [] - -# lock for uuidgen -_lockGetUU = open(panda_config.lockfile_getUU, 'w') - -# short-long mapping -shortLongMap = {'ANALY_BNL_ATLAS_1':'ANALY_LONG_BNL_ATLAS', - 'ANALY_LYON-T2' :'ANALY_LONG_LYON-T2', - 'ANALY_LYON_DCACHE':'ANALY_LONG_LYON_DCACHE', - 'ANALY_BNL_SHORT' :'ANALY_BNL_LONG', - } - -# processingType to skip brokerage -skipBrokerageProTypes = ['prod_test'] - -# comparison function for sort -def _compFunc(jobA,jobB): - # append site if not in list - if not jobA.computingSite in _allSites: - _allSites.append(jobA.computingSite) - if not jobB.computingSite in _allSites: - _allSites.append(jobB.computingSite) - # compare - indexA = _allSites.index(jobA.computingSite) - indexB = _allSites.index(jobB.computingSite) - if indexA > indexB: - return 1 - elif indexA < indexB: - return -1 - else: - return 0 - - -# release checker -def _checkRelease(jobRels,siteRels): - # all on/off - if "True" in siteRels: - return True - if "False" in siteRels: - return False - # loop over all releases - for tmpRel in jobRels.split('\n'): - relVer = re.sub('^Atlas-','',tmpRel) - # not available releases - if not relVer in siteRels: - return False - return True - - -# get list of files which already 
exist at the site -def _getOkFiles(v_ce,v_files,v_guids,allLFNs,allGUIDs,allOkFilesMap,tmpLog=None): - # DQ2 URL - dq2URL = v_ce.dq2url - dq2IDs = v_ce.setokens.values() - try: - dq2IDs.remove('') - except: - pass - dq2IDs.sort() - if dq2IDs == []: - dq2ID = v_ce.ddm - else: - dq2ID = '' - for tmpID in dq2IDs: - dq2ID += '%s,' % tmpID - dq2ID = dq2ID[:-1] - # set LFC and SE name - tmpSE = [] - if not v_ce.lfchost in [None,'']: - dq2URL = 'lfc://'+v_ce.lfchost+':/grid/atlas/' - tmpSE = broker_util.getSEfromSched(v_ce.se) - if tmpLog != None: - tmpLog.debug('getOkFiles for %s with dq2ID:%s,LFC:%s,SE:%s' % (v_ce.sitename,dq2ID,dq2URL,str(tmpSE))) - # use bulk lookup - if allLFNs != []: - # get bulk lookup data - if not allOkFilesMap.has_key(dq2ID): - # get files from LRC - allOkFilesMap[dq2ID] = broker_util.getFilesFromLRC(allLFNs,dq2URL,guids=allGUIDs, - storageName=tmpSE,getPFN=True) - # make return map - retMap = {} - for tmpLFN in v_files: - if allOkFilesMap[dq2ID].has_key(tmpLFN): - retMap[tmpLFN] = allOkFilesMap[dq2ID][tmpLFN] - # return - return retMap - else: - # old style - return broker_util.getFilesFromLRC(v_files,dq2URL,guids=v_guids, - storageName=tmpSE,getPFN=True) - - -# check reprocessing or not -def _isReproJob(tmpJob): - if tmpJob != None: - if tmpJob.processingType in ['reprocessing']: - return True - if tmpJob.transformation in ['csc_cosmics_trf.py','csc_BSreco_trf.py','BStoESDAODDPD_trf.py']: - return True - return False - - -# set 'ready' if files are already there -def _setReadyToFiles(tmpJob,okFiles,siteMapper,tmpLog): - allOK = True - tmpSiteSpec = siteMapper.getSite(tmpJob.computingSite) - tmpSrcSpec = siteMapper.getSite(siteMapper.getCloud(tmpJob.cloud)['source']) - # direct usage of remote SE - if tmpSiteSpec.ddm != tmpSrcSpec.ddm and tmpSrcSpec.ddm in tmpSiteSpec.setokens.values(): - tmpSiteSpec = tmpSrcSpec - tmpLog.debug('%s uses remote SiteSpec of %s for %s' % (tmpJob.PandaID,tmpSrcSpec.sitename,tmpJob.computingSite)) - tmpLog.debug('%s 
%s' % (tmpJob.PandaID,str(tmpSiteSpec.seprodpath))) - prestageSites = getPrestageSites(siteMapper) - for tmpFile in tmpJob.Files: - if tmpFile.type == 'input': - if DataServiceUtils.isCachedFile(tmpFile.dataset,tmpSiteSpec): - # cached file - tmpFile.status = 'cached' - tmpFile.dispatchDBlock = 'NULL' - elif (tmpJob.computingSite.endswith('_REPRO') or tmpJob.computingSite == siteMapper.getCloud(tmpJob.cloud)['source'] \ - or tmpSiteSpec.ddm == tmpSrcSpec.ddm) \ - and (not tmpJob.computingSite in prestageSites): - # EGEE T1. use DQ2 prestage only for on-tape files - if tmpSiteSpec.seprodpath.has_key('ATLASDATATAPE') and tmpSiteSpec.seprodpath.has_key('ATLASMCTAPE') and \ - okFiles.has_key(tmpFile.lfn): - tapeOnly = True - tapeCopy = False - for okPFN in okFiles[tmpFile.lfn]: - if re.search(tmpSiteSpec.seprodpath['ATLASDATATAPE'],okPFN) == None and \ - re.search(tmpSiteSpec.seprodpath['ATLASMCTAPE'],okPFN) == None: - # there is a disk copy - if tmpJob.cloud == 'US': - # check for BNLPANDA - if (tmpSiteSpec.seprodpath.has_key('ATLASMCDISK') and \ - re.search(tmpSiteSpec.seprodpath['ATLASMCDISK'],okPFN) != None) or \ - (tmpSiteSpec.seprodpath.has_key('ATLASDATADISK') and - re.search(tmpSiteSpec.seprodpath['ATLASDATADISK'],okPFN) != None): - tapeOnly = False - else: - tapeOnly = False - else: - # there is a tape copy - tapeCopy = True - # trigger prestage when disk copy doesn't exist or token is TAPE - if tapeOnly or (tapeCopy and tmpFile.dispatchDBlockToken in ['ATLASDATATAPE','ATLASMCTAPE']): - allOK = False - else: - # set ready - tmpFile.status = 'ready' - tmpFile.dispatchDBlock = 'NULL' - else: - # set ready anyway even if LFC is down. i.e. 
okFiles doesn't contain the file - tmpFile.status = 'ready' - tmpFile.dispatchDBlock = 'NULL' - elif (((tmpFile.lfn in okFiles) or (tmpJob.computingSite == tmpJob.destinationSE)) \ - and (not tmpJob.computingSite in prestageSites or \ - (tmpJob.computingSite in prestageSites and not tmpJob.cloud in ['US']))) \ - or tmpFile.status == 'missing': - # don't use TAPE replicas when T1 is used as T2 - if okFiles.has_key(tmpFile.lfn) and \ - tmpSiteSpec.seprodpath.has_key('ATLASDATATAPE') and len(okFiles[tmpFile.lfn]) == 1 and \ - re.search(tmpSiteSpec.seprodpath['ATLASDATATAPE'],okFiles[tmpFile.lfn][0]) != None: - allOK = False - else: - # set ready if the file exists and the site doesn't use prestage - tmpFile.status = 'ready' - tmpFile.dispatchDBlock = 'NULL' - else: - # prestage with PandaMover - allOK = False - # unset disp dataset - if allOK: - tmpJob.dispatchDBlock = 'NULL' - - - -# check number/size of inputs -def _isTooManyInput(nFilesPerJob,inputSizePerJob): - # the number of inputs is larger than 5 or - # size of inputs is larger than 500MB - if nFilesPerJob > 5 or inputSizePerJob > 500*1024*1024: - return True - return False - - -# send analysis brokerage info -def sendAnalyBrokeageInfo(results,prevRelease,diskThreshold,chosenSite,prevCmtConfig, - siteReliability): - # send log messages - messageList = [] - for resultType,resultList in results.iteritems(): - for resultItem in resultList: - if resultType == 'rel': - if prevCmtConfig in ['','NULL',None]: - msgBody = 'action=skip site=%s reason=missingapp - app=%s is missing' % (resultItem,prevRelease) - else: - msgBody = 'action=skip site=%s reason=missingapp - app=%s/%s is missing' % (resultItem,prevRelease,prevCmtConfig) - elif resultType == 'pilot': - msgBody = 'action=skip site=%s reason=nopilot - no pilots for last 3 hours' % resultItem - elif resultType == 'disk': - msgBody = 'action=skip site=%s reason=diskshortage - disk shortage < %sGB' % (resultItem,diskThreshold) - elif resultType == 'memory': - 
msgBody = 'action=skip site=%s reason=ramshortage - RAM shortage' % resultItem - elif resultType == 'maxtime': - msgBody = 'action=skip site=%s reason=maxtime - shorter walltime limit' % resultItem - elif resultType == 'status': - msgBody = 'action=skip site=%s reason=sitestatus - not online' % resultItem - elif resultType == 'reliability': - msgBody = 'action=skip site=%s reason=reliability - insufficient>%s' % (resultItem ,siteReliability) - elif resultType == 'weight': - tmpSite,tmpWeight = resultItem - if tmpSite == chosenSite: - msgBody = 'action=choose site=%s reason=maxweight - max weight=%s' % (tmpSite,tmpWeight) - else: - msgBody = 'action=skip site=%s reason=notmaxweight - weight=%s' % (tmpSite,tmpWeight) - elif resultType == 'prefcountry': - tmpSite,tmpCountry = resultItem - if tmpSite == chosenSite: - msgBody = 'action=prefer country=%s reason=countrygroup - preferential brokerage for beyond-pledge' % tmpCountry - else: - continue - else: - continue - messageList.append(msgBody) - # return - return messageList - - -# send analysis brokerage info to logger -def sendMsgToLogger(message): - _log.debug(message) - - -# send analysis brokerage info to logger with HTTP -def sendMsgToLoggerHTTP(msgList,job): - try: - # logging - iMsg = 0 - # message type - msgType = 'analy_brokerage' - # make header - if not job.jobsetID in [None,'NULL']: - msgHead = "dn='%s' : jobset=%s jobdef=%s" % (job.prodUserName,job.jobsetID,job.jobDefinitionID) - else: - msgHead = "dn='%s' : jobdef=%s" % (job.prodUserName,job.jobDefinitionID) - for msgBody in msgList: - # make message - message = msgHead + ' : ' + msgBody - # dump locally - _log.debug(message) - # get logger - _pandaLogger = PandaLogger() - _pandaLogger.lock() - _pandaLogger.setParams({'Type':msgType}) - logger = _pandaLogger.getHttpLogger(panda_config.loggername) - # add message - logger.info(message) - # release HTTP handler - _pandaLogger.release() - # sleep - iMsg += 1 - if iMsg % 5 == 0: - time.sleep(1) - except: - 
# get T2 candidates when files are missing at T2
def getT2CandList(tmpJob,siteMapper,t2FilesMap):
    """Return the sorted sites that hold every missing input file of a job.

    tmpJob     : job spec or None; cloud and Files are read from it.
    siteMapper : not used by this function (kept for the call signature).
    t2FilesMap : cloud -> dataset -> {'sites': {site: container of LFNs}}.

    Only files with type 'input' and status 'missing' are considered.
    Returns [] when the job is None, the cloud or one of the datasets is
    not in t2FilesMap, the job has no missing input, or no single site has
    all of the missing files.
    """
    if tmpJob is None:
        return []
    # no cloud info
    if tmpJob.cloud not in t2FilesMap:
        return []
    cloudMap = t2FilesMap[tmpJob.cloud]
    # loop over all files
    tmpCandT2s = None
    for tmpFile in tmpJob.Files:
        if tmpFile.type == 'input' and tmpFile.status == 'missing':
            # no dataset info
            if tmpFile.dataset not in cloudMap:
                return []
            siteFiles = cloudMap[tmpFile.dataset]['sites']
            # initial candidates : all sites known for the first dataset
            if tmpCandT2s is None:
                tmpCandT2s = siteFiles
            # keep only candidates which also have this file
            newCandT2s = []
            for tmpCandT2 in tmpCandT2s:
                # site doesn't have the dataset
                if tmpCandT2 not in siteFiles:
                    continue
                # site has the file
                if tmpFile.lfn in siteFiles[tmpCandT2]:
                    if not tmpCandT2 in newCandT2s:
                        newCandT2s.append(tmpCandT2)
            # set new candidates
            tmpCandT2s = newCandT2s
            # nothing can come back once the intersection is empty
            if tmpCandT2s == []:
                break
    # return [] if no missing files
    if tmpCandT2s is None:
        return []
    # return
    tmpCandT2s.sort()
    return tmpCandT2s
# make compact dialog message
def makeCompactDiagMessage(header,results):
    """Build a one-line summary of why sites were rejected by the brokerage.

    header  : '' or None yields the prefix 'No candidate - '; any other
              value yields 'special brokerage for <header> - '.
    results : result type -> list of sites.  Empty lists are ignored.

    Result types are reported in ascending order of their number of sites.
    A type is reported as a site count instead of a site list when header
    is '', None or 'reprocessing' and the type is 'status' or 'cpucore',
    or has the largest number of sites, or has at least 5 sites.

    Returns the message without the trailing '. '.  When nothing is
    reported, the last two characters of the prefix are removed instead.
    """
    # limit
    maxSiteList = 5
    # types for compact format
    compactTypeList = ['status','cpucore']
    # message mapping
    messageMap = {'rel'          : 'missing rel/cache',
                  'pilot'        : 'no pilot',
                  'status'       : 'not online',
                  'disk'         : 'SE full',
                  'memory'       : 'RAM shortage',
                  'transferring' : 'many transferring',
                  'share'        : 'zero share',
                  'maxtime'      : 'short walltime',
                  'cpucore'      : 'CPU core mismatch',
                  'scratch'      : 'small scratch disk'
                  }
    # put header
    if header in ['',None]:
        retStr = 'No candidate - '
    else:
        retStr = 'special brokerage for %s - ' % header
    # count number of sites per type
    numTypeMap = {}
    for resultType,resultList in results.items():
        nSites = len(resultList)
        # ignore empty
        if nSites == 0:
            continue
        # add
        numTypeMap.setdefault(nSites,[]).append(resultType)
    # sort
    numTypeKeys = sorted(numTypeMap.keys())
    # use compact format for largest one
    largeTypes = []
    if len(numTypeKeys) > 0:
        largeTypes = numTypeMap[numTypeKeys[-1]]
    # the compact format is only allowed for these headers
    compactAllowed = header in ['',None,'reprocessing']
    # loop over all types
    for numTypeKey in numTypeKeys:
        for resultType in numTypeMap[numTypeKey]:
            siteList = results[resultType]
            # label; unmapped types use their own name, with the same trailing space
            retStr += '%s at ' % messageMap.get(resultType,resultType)
            # use compact format or not
            if compactAllowed and (resultType in compactTypeList \
                                   or resultType in largeTypes \
                                   or len(siteList) >= maxSiteList):
                if len(siteList) == 1:
                    retStr += '%s site' % len(siteList)
                else:
                    retStr += '%s sites' % len(siteList)
            else:
                retStr += ','.join(['%s' % tmpSite for tmpSite in siteList])
            retStr += '. '
    # drop the trailing separator
    retStr = retStr[:-2]
    # return
    return retStr
tmpLog.debug('replicaMap : %s' % str(replicaMap)) - # no jobs - if len(jobs) == 0: - tmpLog.debug('finished : no jobs') - return - allOkFilesMap = {} - # use ANALY_CERN_XROOTD and not ANALY_CERN for EOS migration - if forAnalysis: - if 'ANALY_CERN_XROOTD' in setScanSiteList and 'ANALY_CERN' in setScanSiteList: - setScanSiteList.remove('ANALY_CERN') - tmpLog.debug('remove ANALY_CERN since ANALY_CERN_XROOTD is also a candidate') - nJob = 20 - iJob = 0 - nFile = 20 - fileList = [] - guidList = [] - okFiles = {} - totalNumInputs = 0 - totalInputSize = 0 - chosen_ce = None - prodDBlock = None - computingSite = None - dispatchDBlock = None - previousCloud = None - prevRelease = None - prevMemory = None - prevCmtConfig = None - prevProType = None - prevSourceLabel= None - prevDiskCount = None - prevHomePkg = None - prevDirectAcc = None - prevCoreCount = None - prevBrokergageSiteList = None - prevManualPreset = None - prevGoToT2Flag = None - prevWorkingGroup = None - prevMaxCpuCount = None - prevBrokerageNote = None - prevPriority = None - - nWNmap = {} - indexJob = 0 - vomsOK = None - - diskThreshold = 200 - diskThresholdPD2P = 1024 * 3 - manyInputsThr = 20 - weightUsedByBrokerage = {} - - prestageSites = getPrestageSites(siteMapper) - - # get statistics - faresharePolicy = {} - newJobStatWithPrio = {} - jobStatBrokerCloudsWithPrio = {} - if len(jobs) > 0 and (jobs[0].processingType.startswith('gangarobot') or \ - jobs[0].processingType.startswith('hammercloud') or \ - jobs[0].processingType in ['pandamover','usermerge']): - # disable redundant counting for HC - jobStatistics = {} - jobStatBroker = {} - jobStatBrokerClouds = {} - nRunningMap = {} - hospitalQueueMap = {} - else: - jobStatistics = taskBuffer.getJobStatistics(forAnal=forAnalysis) - if not forAnalysis: - jobStatBroker = {} - jobStatBrokerClouds = taskBuffer.getJobStatisticsBrokerage() - faresharePolicy = taskBuffer.getFaresharePolicy() - else: - if minPriority == None: - jobStatBroker = 
taskBuffer.getJobStatisticsAnalBrokerage() - else: - jobStatBroker = taskBuffer.getJobStatisticsAnalBrokerage(minPriority=minPriority) - nRunningMap = taskBuffer.getnRunningInSiteData() - hospitalQueueMap = getHospitalQueues(siteMapper) - # sort jobs by siteID. Some jobs may already define computingSite - jobs.sort(_compFunc) - # brokerage for analysis - candidateForAnal = True - relCloudMap = {} - loggerMessages = [] - # get all input files for bulk LFC lookup - allLFNs = [] - allGUIDs = [] - for tmpJob in jobs: - if tmpJob.prodSourceLabel in ('test','managed'): - for tmpFile in tmpJob.Files: - if tmpFile.type == 'input' and not tmpFile.lfn in allLFNs: - allLFNs.append(tmpFile.lfn) - allGUIDs.append(tmpFile.GUID) - # loop over all jobs + terminator(None) - for job in jobs+[None]: - indexJob += 1 - # ignore failed jobs - if job == None: - pass - elif job.jobStatus == 'failed': - continue - # list of sites for special brokerage - specialBrokergageSiteList = [] - # note for brokerage - brokerageNote = '' - # send jobs to T2 when files are missing at T1 - goToT2Flag = False - if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \ - and specialBrokergageSiteList == []: - currentT2CandList = getT2CandList(job,siteMapper,t2FilesMap) - if currentT2CandList != []: - goToT2Flag = True - specialBrokergageSiteList = currentT2CandList - tmpLog.debug('PandaID:%s -> set SiteList=%s to use T2 for missing files at T1' % (job.PandaID,specialBrokergageSiteList)) - brokerageNote = 'useT2' - # hack for split T1 - if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \ - and job.cloud == 'NL' and specialBrokergageSiteList == []: - # loop over all input datasets - tmpCheckedDS = [] - useSplitT1 = None - for tmpFile in job.Files: - if tmpFile.type == 'input' and (not tmpFile.dataset.startswith('ddo')) \ - and (not tmpFile.dataset in tmpCheckedDS): - # init - if useSplitT1 == None: - useSplitT1 = True - # no 
replica map - if not replicaMap.has_key(tmpFile.dataset): - # not set - useSplitT1 = False - break - # check if input datasets are available only at NIKHEF - tmpRepMap = replicaMap[tmpFile.dataset] - splitT1HasDS = False - for tmpSplitT1Key in tmpRepMap.keys(): - if tmpSplitT1Key.startswith('NIKHEF-ELPROD'): - splitT1HasDS = True - break - if splitT1HasDS \ - and not tmpRepMap.has_key('SARA-MATRIX_MCDISK') \ - and not tmpRepMap.has_key('SARA-MATRIX_DATADISK') \ - and not tmpRepMap.has_key('SARA-MATRIX_MCTAPE') \ - and not tmpRepMap.has_key('SARA-MATRIX_DATATAPE'): - pass - else: - # not set - useSplitT1 = False - break - # set - if useSplitT1 == True: - specialBrokergageSiteList = ['NIKHEF-ELPROD'] - tmpLog.debug('PandaID:%s -> set SiteList=%s for split T1' % (job.PandaID,specialBrokergageSiteList)) - brokerageNote = 'useSplitNLT1' - # set computingSite to T1 for high priority jobs - if job != None and job.currentPriority >= 950 and job.computingSite == 'NULL' \ - and job.prodSourceLabel in ('test','managed') and specialBrokergageSiteList == []: - specialBrokergageSiteList = [siteMapper.getCloud(job.cloud)['source']] - # set site list to use T1 and T1_VL - if hospitalQueueMap.has_key(job.cloud): - specialBrokergageSiteList += hospitalQueueMap[job.cloud] - tmpLog.debug('PandaID:%s -> set SiteList=%s for high prio' % (job.PandaID,specialBrokergageSiteList)) - brokerageNote = 'highPrio' - # set computingSite to T1 when too many inputs are required - if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \ - and specialBrokergageSiteList == []: - # counts # of inputs - tmpTotalInput = 0 - for tmpFile in job.Files: - if tmpFile.type == 'input': - tmpTotalInput += 1 - if tmpTotalInput >= manyInputsThr: - specialBrokergageSiteList = [siteMapper.getCloud(job.cloud)['source']] - # set site list to use T1 and T1_VL - if hospitalQueueMap.has_key(job.cloud): - specialBrokergageSiteList += hospitalQueueMap[job.cloud] - 
tmpLog.debug('PandaID:%s -> set SiteList=%s for too many inputs' % (job.PandaID,specialBrokergageSiteList)) - brokerageNote = 'manyInput' - # use limited sites for reprocessing - if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \ - and job.processingType in ['reprocessing'] and specialBrokergageSiteList == []: - for tmpSiteName in siteMapper.getCloud(job.cloud)['sites']: - if siteMapper.checkSite(tmpSiteName): - tmpSiteSpec = siteMapper.getSite(tmpSiteName) - if _checkRelease(job.AtlasRelease,tmpSiteSpec.validatedreleases): - specialBrokergageSiteList.append(tmpSiteName) - tmpLog.debug('PandaID:%s -> set SiteList=%s for processingType=%s' % (job.PandaID,specialBrokergageSiteList,job.processingType)) - brokerageNote = '%s' % job.processingType - # use limited sites for MP jobs - if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \ - and not job.coreCount in [None,'NULL'] and job.coreCount > 1 and specialBrokergageSiteList == []: - for tmpSiteName in siteMapper.getCloud(job.cloud)['sites']: - if siteMapper.checkSite(tmpSiteName): - tmpSiteSpec = siteMapper.getSite(tmpSiteName) - if tmpSiteSpec.coreCount > 1: - specialBrokergageSiteList.append(tmpSiteName) - tmpLog.debug('PandaID:%s -> set SiteList=%s for MP=%scores' % (job.PandaID,specialBrokergageSiteList,job.coreCount)) - brokerageNote = 'MP=%score' % job.coreCount - # manually set site - manualPreset = False - if job != None and job.computingSite != 'NULL' and job.prodSourceLabel in ('test','managed') \ - and specialBrokergageSiteList == []: - specialBrokergageSiteList = [job.computingSite] - manualPreset = True - brokerageNote = 'presetSite' - overwriteSite = False - # new bunch or terminator - if job == None or len(fileList) >= nFile \ - or (dispatchDBlock == None and job.homepackage.startswith('AnalysisTransforms')) \ - or prodDBlock != job.prodDBlock or job.computingSite != computingSite or iJob > nJob \ - or previousCloud != 
job.cloud or prevRelease != job.AtlasRelease \ - or prevCmtConfig != job.cmtConfig \ - or (computingSite in ['RAL_REPRO','INFN-T1_REPRO'] and len(fileList)>=2) \ - or (prevProType in skipBrokerageProTypes and iJob > 0) \ - or prevDirectAcc != job.transferType \ - or prevMemory != job.minRamCount \ - or prevDiskCount != job.maxDiskCount \ - or prevCoreCount != job.coreCount \ - or prevWorkingGroup != job.workingGroup \ - or prevProType != job.processingType \ - or prevMaxCpuCount != job.maxCpuCount \ - or prevBrokergageSiteList != specialBrokergageSiteList: - if indexJob > 1: - tmpLog.debug('new bunch') - tmpLog.debug(' iJob %s' % iJob) - tmpLog.debug(' cloud %s' % previousCloud) - tmpLog.debug(' rel %s' % prevRelease) - tmpLog.debug(' sourceLabel %s' % prevSourceLabel) - tmpLog.debug(' cmtConfig %s' % prevCmtConfig) - tmpLog.debug(' memory %s' % prevMemory) - tmpLog.debug(' priority %s' % prevPriority) - tmpLog.debug(' prodDBlock %s' % prodDBlock) - tmpLog.debug(' computingSite %s' % computingSite) - tmpLog.debug(' processingType %s' % prevProType) - tmpLog.debug(' workingGroup %s' % prevWorkingGroup) - tmpLog.debug(' coreCount %s' % prevCoreCount) - tmpLog.debug(' maxCpuCount %s' % prevMaxCpuCount) - tmpLog.debug(' transferType %s' % prevDirectAcc) - tmpLog.debug(' goToT2 %s' % prevGoToT2Flag) - # brokerage decisions - resultsForAnal = {'rel':[],'pilot':[],'disk':[],'status':[],'weight':[],'memory':[], - 'share':[],'transferring':[],'prefcountry':[],'cpucore':[], - 'reliability':[],'maxtime':[],'scratch':[]} - # determine site - if (iJob == 0 or chosen_ce != 'TOBEDONE') and prevBrokergageSiteList in [None,[]]: - # file scan for pre-assigned jobs - jobsInBunch = jobs[indexJob-iJob-1:indexJob-1] - if jobsInBunch != [] and fileList != [] and (not computingSite in prestageSites) \ - and (jobsInBunch[0].prodSourceLabel in ['managed','software'] or \ - re.search('test',jobsInBunch[0].prodSourceLabel) != None): - # get site spec - tmp_chosen_ce = 
siteMapper.getSite(computingSite) - # get files from LRC - okFiles = _getOkFiles(tmp_chosen_ce,fileList,guidList,allLFNs,allGUIDs,allOkFilesMap,tmpLog) - # loop over all jobs - for tmpJob in jobsInBunch: - # set 'ready' if files are already there - _setReadyToFiles(tmpJob,okFiles,siteMapper,tmpLog) - else: - # load balancing - minSites = {} - nMinSites = 2 - if prevBrokergageSiteList != []: - # special brokerage - scanSiteList = prevBrokergageSiteList - elif setScanSiteList == []: - if siteMapper.checkCloud(previousCloud): - # use cloud sites - scanSiteList = siteMapper.getCloud(previousCloud)['sites'] - else: - # use default sites - scanSiteList = siteMapper.getCloud('default')['sites'] - else: - # use given sites - scanSiteList = setScanSiteList - # add long queue - for tmpShortQueue,tmpLongQueue in shortLongMap.iteritems(): - if tmpShortQueue in scanSiteList: - if not tmpLongQueue in scanSiteList: - scanSiteList.append(tmpLongQueue) - # the number/size of inputs per job - nFilesPerJob = float(totalNumInputs)/float(iJob) - inputSizePerJob = float(totalInputSize)/float(iJob) - # use T1 for jobs with many inputs when weight is negative - if (not forAnalysis) and _isTooManyInput(nFilesPerJob,inputSizePerJob) and \ - siteMapper.getCloud(previousCloud)['weight'] < 0 and prevManualPreset == False: - scanSiteList = [siteMapper.getCloud(previousCloud)['source']] - # set site list to use T1 and T1_VL - if hospitalQueueMap.has_key(previousCloud): - scanSiteList += hospitalQueueMap[previousCloud] - # get availabe sites with cache - useCacheVersion = False - siteListWithCache = [] - if forAnalysis: - if re.search('-\d+\.\d+\.\d+\.\d+',prevRelease) != None: - useCacheVersion = True - siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList,caches=prevRelease,cmtConfig=prevCmtConfig) - tmpLog.debug(' using installSW for cache %s' % prevRelease) - elif re.search('-\d+\.\d+\.\d+$',prevRelease) != None: - useCacheVersion = True - siteListWithCache = 
taskBuffer.checkSitesWithRelease(scanSiteList,releases=prevRelease,cmtConfig=prevCmtConfig) - tmpLog.debug(' using installSW for release %s' % prevRelease) - elif re.search(':rel_\d+$$',prevRelease) != None: - useCacheVersion = True - iteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList, - releases=prevRelease.split(':')[0], - caches=prevRelease.split(':')[1], - cmtConfig=prevCmtConfig) - tmpLog.debug(' using installSW for release:cache %s' % prevRelease) - elif previousCloud in ['DE','NL','FR','CA','ES','IT','TW','UK','US','ND','CERN','RU']: - useCacheVersion = True - # change / to - - convedPrevHomePkg = prevHomePkg.replace('/','-') - if re.search('rel_\d+(\n|$)',prevHomePkg) == None: - # only cache is used for normal jobs - siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList,caches=convedPrevHomePkg, - cmtConfig=prevCmtConfig) - else: - # both AtlasRelease and homepackage are used for nightlies - siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList, - releases=prevRelease, - caches=convedPrevHomePkg, - cmtConfig=prevCmtConfig) - tmpLog.debug(' cache %s' % prevHomePkg) - if useCacheVersion: - tmpLog.debug(' cache/relSites %s' % str(siteListWithCache)) - # release/cmtconfig check - foundRelease = False - # found candidate - foundOneCandidate = False - # randomize the order - if forAnalysis: - random.shuffle(scanSiteList) - # get cnadidates - if True: - # loop over all sites - for site in scanSiteList: - tmpLog.debug('calculate weight for site:%s' % site) - # _allSites may conain NULL after sort() - if site == 'NULL': - continue - # ignore test sites - if (prevManualPreset == False) and (site.endswith('test') or \ - site.endswith('Test') or site.startswith('Test')): - continue - # ignore analysis queues - if (not forAnalysis) and site.startswith('ANALY'): - continue - # get SiteSpec - if siteMapper.checkSite(site): - tmpSiteSpec = siteMapper.getSite(site) - else: - tmpLog.debug(" skip: %s doesn't exist in DB" % site) - 
continue - # check status - if tmpSiteSpec.status in ['offline','brokeroff'] and computingSite in ['NULL',None,'']: - if forAnalysis and tmpSiteSpec.status == 'brokeroff' and tmpSiteSpec.accesscontrol == 'grouplist': - # ignore brokeroff for grouplist site - pass - elif forAnalysis and prevProType in ['hammercloud','gangarobot','gangarobot-squid']: - # ignore site status for HC - pass - else: - tmpLog.debug(' skip: status %s' % tmpSiteSpec.status) - resultsForAnal['status'].append(site) - continue - if tmpSiteSpec.status == 'test' and (not prevProType in ['prod_test','hammercloud','gangarobot','gangarobot-squid']) \ - and not prevSourceLabel in ['test','prod_test']: - tmpLog.debug(' skip: status %s for %s' % (tmpSiteSpec.status,prevProType)) - resultsForAnal['status'].append(site) - continue - tmpLog.debug(' status=%s' % tmpSiteSpec.status) - # check core count - if tmpSiteSpec.coreCount > 1: - # use multi-core queue for MP jobs - if not prevCoreCount in [None,'NULL'] and prevCoreCount > 1: - pass - else: - tmpLog.debug(' skip: MP site (%s core) for job.coreCount=%s' % (tmpSiteSpec.coreCount, - prevCoreCount)) - resultsForAnal['cpucore'].append(site) - continue - else: - # use single core for non-MP jobs - if not prevCoreCount in [None,'NULL'] and prevCoreCount > 1: - tmpLog.debug(' skip: single core site (%s core) for job.coreCount=%s' % (tmpSiteSpec.coreCount, - prevCoreCount)) - resultsForAnal['cpucore'].append(site) - continue - # check memory - if tmpSiteSpec.memory != 0 and not prevMemory in [None,0,'NULL']: - try: - if int(tmpSiteSpec.memory) < int(prevMemory): - tmpLog.debug(' skip: memory shortage %s<%s' % (tmpSiteSpec.memory,prevMemory)) - resultsForAnal['memory'].append(site) - continue - except: - errtype,errvalue = sys.exc_info()[:2] - tmpLog.error("memory check : %s %s" % (errtype,errvalue)) - # check maxcpucount - if tmpSiteSpec.maxtime != 0 and not prevMaxCpuCount in [None,0,'NULL']: - try: - if int(tmpSiteSpec.maxtime) < int(prevMaxCpuCount): - 
tmpLog.debug(' skip: insufficient maxtime %s<%s' % (tmpSiteSpec.maxtime,prevMaxCpuCount)) - resultsForAnal['maxtime'].append(site) - continue - except: - errtype,errvalue = sys.exc_info()[:2] - tmpLog.error("maxtime check : %s %s" % (errtype,errvalue)) - # check max input size - if tmpSiteSpec.maxinputsize != 0 and (not prevDiskCount in [None,0,'NULL']): - try: - if int(tmpSiteSpec.maxinputsize) < int(prevDiskCount): - tmpLog.debug(' skip: not enough disk %s<%s' % (tmpSiteSpec.maxinputsize,prevDiskCount)) - resultsForAnal['scratch'].append(site) - continue - except: - errtype,errvalue = sys.exc_info()[:2] - tmpLog.error("disk check : %s %s" % (errtype,errvalue)) - tmpLog.debug(' maxinput=%s' % tmpSiteSpec.maxinputsize) - # reliability - if forAnalysis and isinstance(siteReliability,types.IntType): - if tmpSiteSpec.reliabilityLevel != None and tmpSiteSpec.reliabilityLevel > siteReliability: - tmpLog.debug(' skip: insufficient reliability %s > %s' % (tmpSiteSpec.reliabilityLevel,siteReliability)) - resultsForAnal['reliability'].append(site) - continue - # change NULL cmtconfig to slc3/4 - if prevCmtConfig in ['NULL','',None]: - if forAnalysis: - tmpCmtConfig = 'i686-slc4-gcc34-opt' - else: - tmpCmtConfig = 'i686-slc3-gcc323-opt' - else: - tmpCmtConfig = prevCmtConfig - # set release - releases = tmpSiteSpec.releases - origReleases = releases - if prevProType in ['reprocessing']: - # use validated releases for reprocessing - releases = tmpSiteSpec.validatedreleases - if not useCacheVersion: - tmpLog.debug(' %s' % str(releases)) - if origReleases == ['ANY']: - # doesn't check releases for catch all - tmpLog.debug(' no release check due to releases=%s' % origReleases) - foundRelease = True - elif forAnalysis and (tmpSiteSpec.cloud in ['ND'] or prevRelease==''): - # doesn't check releases for analysis - tmpLog.debug(' no release check') - pass - elif forAnalysis and useCacheVersion: - # cache matching - if not site in siteListWithCache: - tmpLog.debug(' skip: cache %s/%s 
not found' % (prevRelease.replace('\n',' '),prevCmtConfig)) - if trustIS: - resultsForAnal['rel'].append(site) - continue - elif prevRelease != None and \ - (useCacheVersion and not tmpSiteSpec.cloud in ['ND'] and not site in ['CERN-RELEASE']) and \ - (not prevProType in ['reprocessing']) and \ - (not site in siteListWithCache): - tmpLog.debug(' skip: cache %s/%s not found' % (prevHomePkg.replace('\n',' '),prevCmtConfig)) - # send message to logger - try: - if prevSourceLabel in ['managed','test']: - resultsForAnal['rel'].append(site) - # make message - message = '%s - cache %s/%s not found' % (site,prevHomePkg.replace('\n',' '),prevCmtConfig) - if not message in loggerMessages: - loggerMessages.append(message) - except: - pass - continue - elif prevRelease != None and \ - ((not useCacheVersion and releases != [] and not tmpSiteSpec.cloud in ['ND'] and not site in ['CERN-RELEASE']) or prevProType in ['reprocessing']) and \ - (((not _checkRelease(prevRelease,releases) and prevManualPreset == False) or not site in siteListWithCache) and not tmpSiteSpec.cloud in ['ND'] and not site in ['CERN-RELEASE']): - # release matching - if not useCacheVersion: - tmpLog.debug(' skip: release %s/%s not found' % (prevRelease.replace('\n',' '),prevCmtConfig)) - else: - tmpLog.debug(' skip: repro cache %s/%s not found' % (prevHomePkg.replace('\n',' '),prevCmtConfig)) - resultsForAnal['rel'].append(site) - continue - elif not foundRelease: - # found at least one site has the release - foundRelease = True - # direct access - if prevDirectAcc == 'direct' and not tmpSiteSpec.allowdirectaccess: - tmpLog.debug(' skip: no direct access support') - continue - # get pilot statistics - nPilotsGet = 0 - nPilotsUpdate = 0 - if nWNmap == {}: - nWNmap = taskBuffer.getCurrentSiteData() - if nWNmap.has_key(site): - nPilots = nWNmap[site]['getJob'] + nWNmap[site]['updateJob'] - nPilotsGet = nWNmap[site]['getJob'] - nPilotsUpdate = nWNmap[site]['updateJob'] - else: - nPilots = 0 - tmpLog.debug(' 
original nPilots:%s get:%s update:%s' % (nPilots,nPilotsGet,nPilotsUpdate)) - # limit on (G+1)/(U+1) - limitOnGUmax = 2.0 - limitOnGUmin = 0.5 - guRatio = float(1+nPilotsGet)/float(1+nPilotsUpdate) - if guRatio > limitOnGUmax: - nPilotsGet = limitOnGUmax * float(1+nPilotsUpdate) - 1.0 - elif guRatio < limitOnGUmin: - nPilotsGet = limitOnGUmin * float(1+nPilotsUpdate) - 1.0 - tmpLog.debug(' limited nPilots:%s get:%s update:%s' % (nPilots,nPilotsGet,nPilotsUpdate)) - # if no pilots - if nPilots == 0 and nWNmap != {}: - tmpLog.debug(" skip: %s no pilot" % site) - resultsForAnal['pilot'].append(site) - continue - # if no jobs in jobsActive/jobsDefined - if not jobStatistics.has_key(site): - jobStatistics[site] = {'assigned':0,'activated':0,'running':0,'transferring':0} - # set nRunning - if forAnalysis: - if not nRunningMap.has_key(site): - nRunningMap[site] = 0 - # check space - if specialWeight != {}: - # for PD2P - if sizeMapForCheck.has_key(site): - # threshold for PD2P max(5%,3TB) - thrForThisSite = long(sizeMapForCheck[site]['total'] * 5 / 100) - if thrForThisSite < diskThresholdPD2P: - thrForThisSite = diskThresholdPD2P - remSpace = sizeMapForCheck[site]['total'] - sizeMapForCheck[site]['used'] - tmpLog.debug(' space available=%s remain=%s thr=%s' % (sizeMapForCheck[site]['total'], - remSpace,thrForThisSite)) - if remSpace-datasetSize < thrForThisSite: - tmpLog.debug(' skip: disk shortage %s-%s< %s' % (remSpace,datasetSize,thrForThisSite)) - if getWeight: - weightUsedByBrokerage[site] = "NA : disk shortage" - continue - elif site != siteMapper.getCloud(previousCloud)['source']: - # for T2 - if tmpSiteSpec.space != 0: - nRemJobs = jobStatistics[site]['assigned']+jobStatistics[site]['activated']+jobStatistics[site]['running'] - if not forAnalysis: - # take assigned/activated/running jobs into account for production - remSpace = tmpSiteSpec.space - 0.250*nRemJobs - else: - remSpace = tmpSiteSpec.space - tmpLog.debug(' space available=%s remain=%s' % 
(tmpSiteSpec.space,remSpace)) - if remSpace < diskThreshold: - tmpLog.debug(' skip: disk shortage < %s' % diskThreshold) - resultsForAnal['disk'].append(site) - # keep message to logger - try: - if prevSourceLabel in ['managed','test']: - # make message - message = '%s - disk %s < %s' % (site,remSpace,diskThreshold) - if not message in loggerMessages: - loggerMessages.append(message) - except: - pass - continue - # get the process group - tmpProGroup = ProcessGroups.getProcessGroup(prevProType) - if prevProType in skipBrokerageProTypes: - # use original processingType since prod_test is in the test category and thus is interfered by validations - tmpProGroup = prevProType - # production share - skipDueToShare = False - try: - if not forAnalysis and prevSourceLabel in ['managed'] and faresharePolicy.has_key(site): - for tmpPolicy in faresharePolicy[site]['policyList']: - # ignore priority policy - if tmpPolicy['priority'] != None: - continue - # only zero share - if tmpPolicy['share'] != '0%': - continue - # check group - if tmpPolicy['group'] != None: - if '*' in tmpPolicy['group']: - # wildcard - tmpPatt = '^' + tmpPolicy['group'].replace('*','.*') + '$' - if re.search(tmpPatt,prevWorkingGroup) == None: - continue - else: - # normal definition - if prevWorkingGroup != tmpPolicy['group']: - continue - else: - # catch all except WGs used by other policies - groupInDefList = faresharePolicy[site]['groupList'] - usedByAnother = False - # loop over all groups - for groupInDefItem in groupInDefList: - if '*' in groupInDefItem: - # wildcard - tmpPatt = '^' + groupInDefItem.replace('*','.*') + '$' - if re.search(tmpPatt,prevWorkingGroup) != None: - usedByAnother = True - break - else: - # normal definition - if prevWorkingGroup == groupInDefItem: - usedByAnother = True - break - if usedByAnother: - continue - # check type - if tmpPolicy['type'] != None: - if tmpPolicy['type'] == tmpProGroup: - skipDueToShare = True - break - else: - # catch all except PGs used by other 
policies - typeInDefList = faresharePolicy[site]['typeList'][tmpPolicy['group']] - usedByAnother = False - for typeInDefItem in typeInDefList: - if typeInDefItem == tmpProGroup: - usedByAnother = True - break - if not usedByAnother: - skipDueToShare = True - break - # skip - if skipDueToShare: - tmpLog.debug(" skip: %s zero share" % site) - resultsForAnal['share'].append(site) - continue - except: - errtype,errvalue = sys.exc_info()[:2] - tmpLog.error("share check : %s %s" % (errtype,errvalue)) - # the number of assigned and activated - if not forAnalysis: - if not jobStatBrokerClouds.has_key(previousCloud): - jobStatBrokerClouds[previousCloud] = {} - # use number of jobs in the cloud - jobStatBroker = jobStatBrokerClouds[previousCloud] - if not jobStatBroker.has_key(site): - jobStatBroker[site] = {} - if not jobStatBroker[site].has_key(tmpProGroup): - jobStatBroker[site][tmpProGroup] = {'assigned':0,'activated':0,'running':0,'transferring':0} - # count # of assigned and activated jobs for prod by taking priorities in to account - nRunJobsPerGroup = None - if not forAnalysis and prevSourceLabel in ['managed','test']: - if not jobStatBrokerCloudsWithPrio.has_key(prevPriority): - jobStatBrokerCloudsWithPrio[prevPriority] = taskBuffer.getJobStatisticsBrokerage(prevPriority) - if not jobStatBrokerCloudsWithPrio[prevPriority].has_key(previousCloud): - jobStatBrokerCloudsWithPrio[prevPriority][previousCloud] = {} - if not jobStatBrokerCloudsWithPrio[prevPriority][previousCloud].has_key(site): - jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site] = {} - if not jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site].has_key(tmpProGroup): - jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup] = {'assigned':0,'activated':0,'running':0,'transferring':0} - nAssJobs = jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup]['assigned'] - nActJobs = 
jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup]['activated'] - nRunJobsPerGroup = jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup]['running'] - # add newly assigned jobs - for tmpNewPriority in newJobStatWithPrio.keys(): - if tmpNewPriority < prevPriority: - continue - if not newJobStatWithPrio[tmpNewPriority].has_key(previousCloud): - continue - if not newJobStatWithPrio[tmpNewPriority][previousCloud].has_key(site): - continue - if not newJobStatWithPrio[tmpNewPriority][previousCloud][site].has_key(tmpProGroup): - continue - nAssJobs += newJobStatWithPrio[tmpNewPriority][previousCloud][site][tmpProGroup] - else: - nAssJobs = jobStatBroker[site][tmpProGroup]['assigned'] - if forAnalysis and jobStatBroker[site][tmpProGroup].has_key('defined'): - nAssJobs += jobStatBroker[site][tmpProGroup]['defined'] - nActJobs = jobStatBroker[site][tmpProGroup]['activated'] - # number of jobs per node - if not nWNmap.has_key(site): - nJobsPerNode = 1 - elif jobStatistics[site]['running']==0 or nWNmap[site]['updateJob']==0: - nJobsPerNode = 1 - else: - if nRunJobsPerGroup == None: - nJobsPerNode = float(jobStatistics[site]['running'])/float(nWNmap[site]['updateJob']) - else: - if nRunJobsPerGroup == 0: - nJobsPerNode = 1.0/float(nWNmap[site]['updateJob']) - else: - nJobsPerNode = float(nRunJobsPerGroup)/float(nWNmap[site]['updateJob']) - # limit of the number of transferring jobs - if tmpSiteSpec.transferringlimit == 0: - maxTransferring = 2000 - else: - maxTransferring = tmpSiteSpec.transferringlimit - # get ration of transferring to running - if not forAnalysis and not tmpSiteSpec.cloud in ['ND']: - nTraJobs = 0 - nRunJobs = 0 - for tmpGroupForTra,tmpCountsForTra in jobStatBroker[site].iteritems(): - if tmpCountsForTra.has_key('running'): - nRunJobs += tmpCountsForTra['running'] - if tmpCountsForTra.has_key('transferring'): - nTraJobs += tmpCountsForTra['transferring'] - tmpLog.debug(' running=%s transferring=%s max=%s' % 
(nRunJobs,nTraJobs,maxTransferring)) - if max(maxTransferring,2*nRunJobs) < nTraJobs: - tmpLog.debug(" skip: %s many transferring=%s > max(%s,2*running=%s)" % (site,nTraJobs,maxTransferring,nRunJobs)) - resultsForAnal['transferring'].append(site) - if prevSourceLabel in ['managed','test']: - # make message - message = '%s - too many transferring' % site - if not message in loggerMessages: - loggerMessages.append(message) - continue - # get ratio of running jobs = run(cloud)/run(all) for multi cloud - multiCloudFactor = 1 - if not forAnalysis and not previousCloud in ['NL']: - tmpTotalRunningMulti = 0 - tmpNCloudMulti = 0 - for tmpCloudMulti,tmpCloudValMulti in jobStatBrokerClouds.iteritems(): - if tmpCloudValMulti.has_key(site): - if tmpCloudValMulti[site].has_key(tmpProGroup): - tmpNCloudMulti += 1 - if tmpCloudValMulti[site][tmpProGroup].has_key('running'): - tmpTotalRunningMulti += tmpCloudValMulti[site][tmpProGroup]['running'] - # no running - if tmpTotalRunningMulti == 0: - if tmpNCloudMulti != 0: - multiCloudFactor = tmpNCloudMulti - else: - multiCloudFactor = float(tmpTotalRunningMulti+1)/float(jobStatBroker[site][tmpProGroup]['running']+1) - tmpLog.debug(' totalRun:%s cloudRun:%s multiCloud:%s' % (tmpTotalRunningMulti, - jobStatBroker[site][tmpProGroup]['running'], - multiCloudFactor)) - # country preference - preferredCountryWeight = 1.0 - preferredCountryWeightStr = '' - if forAnalysis: - if preferredCountries != [] and tmpSiteSpec.countryGroup != []: - for tmpCountry in preferredCountries: - if tmpCountry in tmpSiteSpec.countryGroup: - # avoid negative weight or zero-divide - if tmpSiteSpec.availableCPU >= tmpSiteSpec.pledgedCPU and tmpSiteSpec.pledgedCPU > 0: - preferredCountryWeight = float(tmpSiteSpec.availableCPU) / float(tmpSiteSpec.pledgedCPU) - preferredCountryWeightStr = "*(%s/%s)" % (tmpSiteSpec.availableCPU,tmpSiteSpec.pledgedCPU) - resultsForAnal['prefcountry'].append((site,tmpCountry)) - break - tmpLog.debug(' country preference=%s' % 
preferredCountryWeightStr[1:]) - # calculate weight - if specialWeight != {}: - if not pd2pT1: - # weight for T2 PD2P - nSubs = 1 - if specialWeight.has_key(site): - nSubs = specialWeight[site] - tmpLog.debug(' %s nSubs:%s assigned:%s activated:%s running:%s nWNsG:%s nWNsU:%s' % \ - (site,nSubs,nAssJobs,nActJobs,nRunningMap[site],nPilotsGet,nPilotsUpdate)) - winv = float(nSubs) * float(nAssJobs+nActJobs) / float(1+nRunningMap[site]) / (1.0+float(nPilotsGet)/float(1+nPilotsUpdate)) - if getWeight: - weightUsedByBrokerage[site] = "(1+%s/%s)*%s/%s/%s" % (nPilotsGet,1+nPilotsUpdate,1+nRunningMap[site],nAssJobs+nActJobs,nSubs) - else: - # weight for T1 PD2P - tmpLog.debug(' %s MoU:%s' % (site,specialWeight[site])) - winv = 1.0 / float(specialWeight[site]) - if getWeight: - weightUsedByBrokerage[site] = "%s" % specialWeight[site] - else: - if not forAnalysis: - if nRunJobsPerGroup == None: - tmpLog.debug(' %s assigned:%s activated:%s running:%s nPilotsGet:%s nPilotsUpdate:%s multiCloud:%s' % - (site,nAssJobs,nActJobs,jobStatistics[site]['running'],nPilotsGet,nPilotsUpdate,multiCloudFactor)) - else: - tmpLog.debug(' %s assigned:%s activated:%s runningGroup:%s nPilotsGet:%s nPilotsUpdate:%s multiCloud:%s' % - (site,nAssJobs,nActJobs,nRunJobsPerGroup,nPilotsGet,nPilotsUpdate,multiCloudFactor)) - else: - tmpLog.debug(' %s assigned:%s activated:%s running:%s nWNsG:%s nWNsU:%s' % - (site,nAssJobs,nActJobs,nRunningMap[site],nPilotsGet,nPilotsUpdate)) - if forAnalysis: - winv = float(nAssJobs+nActJobs) / float(1+nRunningMap[site]) / (1.0+float(nPilotsGet)/float(1+nPilotsUpdate)) - else: - if nRunJobsPerGroup == None: - winv = float(nAssJobs+nActJobs) / float(1+jobStatistics[site]['running']) / (float(1+nPilotsGet)/float(1+nPilotsUpdate)) - else: - winv = float(nAssJobs+nActJobs) / float(1+nRunJobsPerGroup) / (float(1+nPilotsGet)/float(1+nPilotsUpdate)) - winv *= float(multiCloudFactor) - # send jobs to T1 when they require many or large inputs - if 
_isTooManyInput(nFilesPerJob,inputSizePerJob): - if site == siteMapper.getCloud(previousCloud)['source'] or \ - (site=='NIKHEF-ELPROD' and previousCloud=='NL' and prevProType=='reprocessing') or \ - (hospitalQueueMap.has_key(previousCloud) and site in hospitalQueueMap[previousCloud]): - cloudT1Weight = 2.0 - # use weight in cloudconfig - try: - tmpCloudT1Weight = float(siteMapper.getCloud(previousCloud)['weight']) - if tmpCloudT1Weight != 0.0: - cloudT1Weight = tmpCloudT1Weight - except: - pass - winv /= cloudT1Weight - tmpLog.debug(' special weight for %s : nInputs/Job=%s inputSize/Job=%s weight=%s' % - (site,nFilesPerJob,inputSizePerJob,cloudT1Weight)) - # found at least one candidate - foundOneCandidate = True - tmpLog.debug('Site:%s 1/Weight:%s' % (site,winv)) - if forAnalysis and trustIS and reportLog: - resultsForAnal['weight'].append((site,'(1+%s/%s)*%s/%s%s' % (nPilotsGet,1+nPilotsUpdate,1+nRunningMap[site], - nAssJobs+nActJobs,preferredCountryWeightStr))) - # choose largest nMinSites weights - minSites[site] = winv - if len(minSites) > nMinSites: - maxSite = site - maxWinv = winv - for tmpSite,tmpWinv in minSites.iteritems(): - if tmpWinv > maxWinv: - maxSite = tmpSite - maxWinv = tmpWinv - # delte max one - del minSites[maxSite] - # remove too different weights - if len(minSites) >= 2: - # look for minimum - minSite = minSites.keys()[0] - minWinv = minSites[minSite] - for tmpSite,tmpWinv in minSites.iteritems(): - if tmpWinv < minWinv: - minSite = tmpSite - minWinv = tmpWinv - # look for too different weights - difference = 2 - removeSites = [] - for tmpSite,tmpWinv in minSites.iteritems(): - if tmpWinv > minWinv*difference: - removeSites.append(tmpSite) - # remove - for tmpSite in removeSites: - del minSites[tmpSite] - # set default - if len(minSites) == 0: - # cloud's list - if forAnalysis or siteMapper.checkCloud(previousCloud): - minSites[scanSiteList[0]] = 0 - else: - minSites['BNL_ATLAS_1'] = 0 - # release not found - if forAnalysis and trustIS: - 
candidateForAnal = False - # use only one site for prod_test to skip LFC scan - if prevProType in skipBrokerageProTypes: - if len(minSites) > 1: - minSites = {minSites.keys()[0]:0} - # choose site - tmpLog.debug('Min Sites:%s' % minSites) - if len(fileList) ==0: - # choose min 1/weight - minSite = minSites.keys()[0] - minWinv = minSites[minSite] - for tmpSite,tmpWinv in minSites.iteritems(): - if tmpWinv < minWinv: - minSite = tmpSite - minWinv = tmpWinv - chosenCE = siteMapper.getSite(minSite) - else: - # compare # of files in LRC - maxNfiles = -1 - for site in minSites: - tmp_chosen_ce = siteMapper.getSite(site) - # search LRC - if site in _disableLRCcheck: - tmpOKFiles = {} - else: - # get files from LRC - tmpOKFiles = _getOkFiles(tmp_chosen_ce,fileList,guidList,allLFNs,allGUIDs,allOkFilesMap,tmpLog) - nFiles = len(tmpOKFiles) - tmpLog.debug('site:%s - nFiles:%s/%s %s' % (site,nFiles,len(fileList),str(tmpOKFiles))) - # choose site holding max # of files - if nFiles > maxNfiles: - chosenCE = tmp_chosen_ce - maxNfiles = nFiles - okFiles = tmpOKFiles - # set job spec - tmpLog.debug('indexJob : %s' % indexJob) - tmpLog.debug('nInputs/Job : %s' % nFilesPerJob) - tmpLog.debug('inputSize/Job : %s' % inputSizePerJob) - for tmpJob in jobs[indexJob-iJob-1:indexJob-1]: - # set computingSite - if (not candidateForAnal) and forAnalysis and trustIS: - resultsForAnalStr = 'ERROR : No candidate. ' - if resultsForAnal['rel'] != []: - if prevCmtConfig in ['','NULL',None]: - resultsForAnalStr += 'Release:%s was not found at %s. ' % (prevRelease,str(resultsForAnal['rel'])) - else: - resultsForAnalStr += 'Release:%s/%s was not found at %s. ' % (prevRelease,prevCmtConfig,str(resultsForAnal['rel'])) - if resultsForAnal['pilot'] != []: - resultsForAnalStr += '%s are inactive (no pilots for last 3 hours). ' % str(resultsForAnal['pilot']) - if resultsForAnal['disk'] != []: - resultsForAnalStr += 'Disk shortage < %sGB at %s. 
' % (diskThreshold,str(resultsForAnal['disk'])) - if resultsForAnal['memory'] != []: - resultsForAnalStr += 'Insufficient RAM at %s. ' % str(resultsForAnal['memory']) - if resultsForAnal['maxtime'] != []: - resultsForAnalStr += 'Shorter walltime limit than maxCpuCount:%s at ' % prevMaxCpuCount - for tmpItem in resultsForAnal['maxtime']: - if siteMapper.checkSite(tmpItem): - resultsForAnalStr += '%s:%s,' % (tmpItem,siteMapper.getSite(tmpItem).maxtime) - resultsForAnalStr = resultsForAnalStr[:-1] - resultsForAnalStr += '. ' - if resultsForAnal['status'] != []: - resultsForAnalStr += '%s are not online. ' % str(resultsForAnal['status']) - if resultsForAnal['reliability'] != []: - resultsForAnalStr += 'Insufficient reliability at %s. ' % str(resultsForAnal['reliability']) - resultsForAnalStr = resultsForAnalStr[:-1] - tmpJob.computingSite = resultsForAnalStr - else: - tmpJob.computingSite = chosenCE.sitename - # send log - if forAnalysis and trustIS and reportLog: - # put logging info to ErrorDiag just to give it back to the caller - tmpJob.brokerageErrorDiag = sendAnalyBrokeageInfo(resultsForAnal,prevRelease,diskThreshold, - tmpJob.computingSite,prevCmtConfig, - siteReliability) - tmpLog.debug('PandaID:%s -> site:%s' % (tmpJob.PandaID,tmpJob.computingSite)) - if tmpJob.computingElement == 'NULL': - if tmpJob.prodSourceLabel == 'ddm': - # use nickname for ddm jobs - tmpJob.computingElement = chosenCE.nickname - else: - tmpJob.computingElement = chosenCE.gatekeeper - # fail jobs if no sites have the release - if (not foundRelease or (tmpJob.relocationFlag != 1 and not foundOneCandidate)) and (tmpJob.prodSourceLabel in ['managed','test']): - # reset - if tmpJob.relocationFlag != 1: - tmpJob.computingSite = None - tmpJob.computingElement = None - # go to waiting - tmpJob.jobStatus = 'waiting' - tmpJob.brokerageErrorCode = ErrorCode.EC_Release - if tmpJob.relocationFlag == 1: - try: - if resultsForAnal['pilot'] != []: - tmpJob.brokerageErrorDiag = '%s no pilots' % 
tmpJob.computingSite - elif resultsForAnal['disk'] != []: - tmpJob.brokerageErrorDiag = 'SE full at %s' % tmpJob.computingSite - elif resultsForAnal['memory'] != []: - tmpJob.brokerageErrorDiag = 'RAM shortage at %s' % tmpJob.computingSite - elif resultsForAnal['status'] != []: - tmpJob.brokerageErrorDiag = '%s not online' % tmpJob.computingSite - elif resultsForAnal['share'] != []: - tmpJob.brokerageErrorDiag = '%s zero share' % tmpJob.computingSite - elif resultsForAnal['cpucore'] != []: - tmpJob.brokerageErrorDiag = "CPU core mismatch at %s" % tmpJob.computingSite - elif resultsForAnal['maxtime'] != []: - tmpJob.brokerageErrorDiag = "short walltime at %s" % tmpJob.computingSite - elif resultsForAnal['transferring'] != []: - tmpJob.brokerageErrorDiag = 'too many transferring at %s' % tmpJob.computingSite - elif resultsForAnal['scratch'] != []: - tmpJob.brokerageErrorDiag = 'small scratch disk at %s' % tmpJob.computingSite - elif useCacheVersion: - tmpJob.brokerageErrorDiag = '%s/%s not found at %s' % (tmpJob.homepackage,tmpJob.cmtConfig,tmpJob.computingSite) - else: - tmpJob.brokerageErrorDiag = '%s/%s not found at %s' % (tmpJob.AtlasRelease,tmpJob.cmtConfig,tmpJob.computingSite) - except: - errtype,errvalue = sys.exc_info()[:2] - tmpLog.error("failed to set diag for %s: %s %s" % (tmpJob.PandaID,errtype,errvalue)) - tmpJob.brokerageErrorDiag = 'failed to set diag. see brokerage log in the panda server' - elif not prevBrokergageSiteList in [[],None]: - try: - # make message - tmpJob.brokerageErrorDiag = makeCompactDiagMessage(prevBrokerageNote,resultsForAnal) - except: - errtype,errvalue = sys.exc_info()[:2] - tmpLog.error("failed to set special diag for %s: %s %s" % (tmpJob.PandaID,errtype,errvalue)) - tmpJob.brokerageErrorDiag = 'failed to set diag. 
see brokerage log in the panda server' - elif prevProType in ['reprocessing']: - tmpJob.brokerageErrorDiag = '%s/%s not found at reprocessing sites' % (tmpJob.homepackage,tmpJob.cmtConfig) - elif not useCacheVersion: - tmpJob.brokerageErrorDiag = '%s/%s not found at online sites with enough memory and disk' % \ - (tmpJob.AtlasRelease,tmpJob.cmtConfig) - else: - try: - tmpJob.brokerageErrorDiag = makeCompactDiagMessage('',resultsForAnal) - except: - errtype,errvalue = sys.exc_info()[:2] - tmpLog.error("failed to set compact diag for %s: %s %s" % (tmpJob.PandaID,errtype,errvalue)) - tmpJob.brokerageErrorDiag = 'failed to set diag. see brokerage log in the panda server' - tmpLog.debug('PandaID:%s %s' % (tmpJob.PandaID,tmpJob.brokerageErrorDiag)) - continue - # set ready if files are already there - _setReadyToFiles(tmpJob,okFiles,siteMapper,tmpLog) - # update statistics - tmpProGroup = ProcessGroups.getProcessGroup(tmpJob.processingType) - if tmpJob.processingType in skipBrokerageProTypes: - # use original processingType since prod_test is in the test category and thus is interfered by validations - tmpProGroup = tmpJob.processingType - if not jobStatistics.has_key(tmpJob.computingSite): - jobStatistics[tmpJob.computingSite] = {'assigned':0,'activated':0,'running':0} - if not jobStatBroker.has_key(tmpJob.computingSite): - jobStatBroker[tmpJob.computingSite] = {} - if not jobStatBroker[tmpJob.computingSite].has_key(tmpProGroup): - jobStatBroker[tmpJob.computingSite][tmpProGroup] = {'assigned':0,'activated':0,'running':0} - jobStatistics[tmpJob.computingSite]['assigned'] += 1 - jobStatBroker[tmpJob.computingSite][tmpProGroup]['assigned'] += 1 - # update statistics by taking priorities into account - if not forAnalysis and prevSourceLabel in ['managed','test']: - if not newJobStatWithPrio.has_key(prevPriority): - newJobStatWithPrio[prevPriority] = {} - if not newJobStatWithPrio[prevPriority].has_key(tmpJob.cloud): - newJobStatWithPrio[prevPriority][tmpJob.cloud] = {} - 
if not newJobStatWithPrio[prevPriority][tmpJob.cloud].has_key(tmpJob.computingSite): - newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite] = {} - if not newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite].has_key(tmpProGroup): - newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite][tmpProGroup] = 0 - newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite][tmpProGroup] += 1 - # terminate - if job == None: - break - # reset iJob - iJob = 0 - # reset file list - fileList = [] - guidList = [] - okFiles = {} - totalNumInputs = 0 - totalInputSize = 0 - # create new dispDBlock - if job.prodDBlock != 'NULL': - # get datatype - try: - tmpDataType = job.prodDBlock.split('.')[-2] - except: - # default - tmpDataType = 'GEN' - if len(tmpDataType) > 20: - # avoid too long name - tmpDataType = 'GEN' - dispatchDBlock = "panda.%s.%s.%s.%s_dis%s" % (job.taskID,time.strftime('%m.%d'),tmpDataType, - commands.getoutput('uuidgen'),job.PandaID) - tmpLog.debug('New dispatchDBlock: %s' % dispatchDBlock) - prodDBlock = job.prodDBlock - # already define computingSite - if job.computingSite != 'NULL': - # instantiate KnownSite - chosen_ce = siteMapper.getSite(job.computingSite) - # if site doesn't exist, use ANALY_BNL_ATLAS_1 - if job.homepackage.startswith('AnalysisTransforms'): - if chosen_ce.sitename == 'BNL_ATLAS_1': - chosen_ce = siteMapper.getSite('ANALY_BNL_ATLAS_1') - overwriteSite = True - else: - # default for Analysis jobs - if job.homepackage.startswith('AnalysisTransforms'): - chosen_ce = siteMapper.getSite('ANALY_BNL_ATLAS_1') - overwriteSite = True - else: - # set chosen_ce - chosen_ce = 'TOBEDONE' - # increment iJob - iJob += 1 - # reserve computingSite and cloud - computingSite = job.computingSite - previousCloud = job.cloud - prevRelease = job.AtlasRelease - prevMemory = job.minRamCount - prevCmtConfig = job.cmtConfig - prevProType = job.processingType - prevSourceLabel = job.prodSourceLabel - prevDiskCount = 
job.maxDiskCount - prevHomePkg = job.homepackage - prevDirectAcc = job.transferType - prevCoreCount = job.coreCount - prevMaxCpuCount = job.maxCpuCount - prevBrokergageSiteList = specialBrokergageSiteList - prevManualPreset = manualPreset - prevGoToT2Flag = goToT2Flag - prevWorkingGroup = job.workingGroup - prevBrokerageNote = brokerageNote - # truncate prio to avoid too many lookups - if not job.currentPriority in [None,'NULL']: - prevPriority = (job.currentPriority / 50) * 50 - # assign site - if chosen_ce != 'TOBEDONE': - job.computingSite = chosen_ce.sitename - if job.computingElement == 'NULL': - if job.prodSourceLabel == 'ddm': - # use nickname for ddm jobs - job.computingElement = chosen_ce.nickname - else: - job.computingElement = chosen_ce.gatekeeper - # update statistics - if not jobStatistics.has_key(job.computingSite): - jobStatistics[job.computingSite] = {'assigned':0,'activated':0,'running':0} - jobStatistics[job.computingSite]['assigned'] += 1 - tmpLog.debug('PandaID:%s -> preset site:%s' % (job.PandaID,chosen_ce.sitename)) - # set cloud - if job.cloud in ['NULL',None,'']: - job.cloud = chosen_ce.cloud - # set destinationSE - destSE = job.destinationSE - if siteMapper.checkCloud(job.cloud): - # use cloud dest for non-exsiting sites - if job.prodSourceLabel != 'user' and (not job.destinationSE in siteMapper.siteSpecList.keys()) \ - and job.destinationSE != 'local': - destSE = siteMapper.getCloud(job.cloud)['dest'] - job.destinationSE = destSE - # use CERN-PROD_EOSDATADISK for CERN-EOS jobs - if job.computingSite in ['CERN-EOS']: - overwriteSite = True - if overwriteSite: - # overwrite SE for analysis jobs which set non-existing sites - destSE = job.computingSite - job.destinationSE = destSE - # set dispatchDBlock and destinationSE - first = True - for file in job.Files: - # dispatchDBlock. 
Set dispDB for prestaging jobs too - if file.type == 'input' and file.dispatchDBlock == 'NULL' and \ - ((not file.status in ['ready','missing']) or job.computingSite in prestageSites): - if first: - first = False - job.dispatchDBlock = dispatchDBlock - file.dispatchDBlock = dispatchDBlock - file.status = 'pending' - if not file.lfn in fileList: - fileList.append(file.lfn) - guidList.append(file.GUID) - try: - # get total number/size of inputs except DBRelease - # tgz inputs for evgen may be negligible - if re.search('\.tar\.gz',file.lfn) == None: - totalNumInputs += 1 - totalInputSize += file.fsize - except: - pass - # destinationSE - if file.type in ['output','log'] and destSE != '': - if job.prodSourceLabel == 'user' and job.computingSite == file.destinationSE: - pass - elif destSE == 'local': - pass - else: - file.destinationSE = destSE - # pre-assign GUID to log - if file.type == 'log': - # get lock - fcntl.flock(_lockGetUU.fileno(), fcntl.LOCK_EX) - # generate GUID - file.GUID = commands.getoutput('uuidgen') - # release lock - fcntl.flock(_lockGetUU.fileno(), fcntl.LOCK_UN) - # send log messages - try: - for message in loggerMessages: - # get logger - _pandaLogger = PandaLogger() - _pandaLogger.lock() - _pandaLogger.setParams({'Type':'brokerage'}) - logger = _pandaLogger.getHttpLogger(panda_config.loggername) - # add message - logger.warning(message) - # release HTTP handler - _pandaLogger.release() - time.sleep(1) - except: - pass - # send analysis brokerage info when jobs are submitted - if len(jobs) > 0 and jobs[0] != None and not forAnalysis and not pd2pT1 and specialWeight=={}: - # for analysis job. 
FIXME once ganga is updated to send analy brokerage info - if jobs[0].prodSourceLabel in ['user','panda'] and jobs[0].processingType in ['pathena','prun']: - # send countryGroup - tmpMsgList = [] - tmpNumJobs = len(jobs) - if jobs[0].prodSourceLabel == 'panda': - tmpNumJobs -= 1 - tmpMsg = 'nJobs=%s ' % tmpNumJobs - if jobs[0].countryGroup in ['NULL','',None]: - tmpMsg += 'countryGroup=None' - else: - tmpMsg += 'countryGroup=%s' % jobs[0].countryGroup - tmpMsgList.append(tmpMsg) - # send log - sendMsgToLoggerHTTP(tmpMsgList,jobs[0]) - # finished - tmpLog.debug('finished') - if getWeight: - return weightUsedByBrokerage - except: - type, value, traceBack = sys.exc_info() - tmpLog.error("schedule : %s %s" % (type,value)) - if getWeight: - return {} - diff --git a/current/pandaserver/brokerage/broker_util.py b/current/pandaserver/brokerage/broker_util.py deleted file mode 100755 index ca8564a91..000000000 --- a/current/pandaserver/brokerage/broker_util.py +++ /dev/null @@ -1,399 +0,0 @@ -import re -import urllib -import time -import sys -import types -import commands -import xml.dom.minidom - - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger -_log = PandaLogger().getLogger('broker_util') - -# curl class -class _Curl: - # constructor - def __init__(self,useProxy=False): - # path to curl - self.path = 'curl --user-agent "dqcurl" -m 180' - # verification of the host certificate - self.verifyHost = False - # use proxy - if useProxy and panda_config.httpProxy != '': - self.path = 'env http_proxy=%s %s' % (panda_config.httpProxy,self.path) - - # GET method - def get(self,url,data={}): - # make command - com = '%s --silent --get' % self.path - if not self.verifyHost: - com += ' --insecure' - # data - for key,value in data.iteritems(): - com += ' --data "%s"' % urllib.urlencode({key:value}) - com += ' %s' % url - # execute - _log.debug(com) - ret = commands.getstatusoutput(com) - _log.debug(ret) - return ret - - -# get default storage -def 
_getDefaultStorage(baseURL,sePath=None,seProdPath={}): - _log.debug('_getDefaultStorage (%s %s %s)' % (baseURL,sePath,seProdPath)) - # use se+seprodpath when baseURL='' - if baseURL=='': - # get token - match = re.search('^token:([^:]+):',sePath) - if match == None: - _log.error("could not get token from %s" % sePath) - return "" - token = match.group(1) - # get corresponding path - if not seProdPath.has_key(token): - _log.error("could not find path for % in %s" % (token,seProdPath)) - return "" - # set se+seprodpath - out = sePath+seProdPath[token] - # append / - if not out.endswith('/'): - out += '/' - _log.debug(out) - else: - # check port to set proxy - useProxy = False - if panda_config.httpProxy != '': - pMatch = re.search('http://[^:/]+:*(\d+)/',baseURL) - if pMatch == None: - # default port - useProxy = True - elif pMatch.group(1) == '80': - # standard port - useProxy = True - # instantiate curl - curl = _Curl(useProxy) - # get default storage - url = baseURL + 'storages/default' - status,out = curl.get(url) - _log.debug(out) - if status != 0: - _log.error("could not get default storage from %s:%s" % (baseURL,status)) - return "" - # parse - match = re.search('^[^/]+://[^/]+(/.+)$',out) - if match == None: - _log.error("could not parse string : %s" % out) - return "" - return match.group(1) - - -# get PoolFileCatalog -def _getPoolFileCatalog(lfns,dq2url): - _log.debug('_getPoolFileCatalog') - # check port to set proxy - useProxy = False - if panda_config.httpProxy != '': - pMatch = re.search('http://[^:/]+:*(\d+)/',dq2url) - if pMatch == None: - # default port - useProxy = True - elif pMatch.group(1) == '80': - # standard port - useProxy = True - # instantiate curl - curl = _Curl(useProxy) - # get PoolFileCatalog - iLFN = 0 - outXML ='' - strLFNs = '' - if not dq2url.endswith('_'): - url = dq2url + '/lrc/PoolFileCatalog' - else: - # NDGF LRC - url = dq2url + 'lrc/PoolFileCatalog' - for lfn in lfns: - iLFN += 1 - # make argument - strLFNs += '%s ' % lfn - if 
iLFN % 40 == 0 or iLFN == len(lfns): - # get PoolFileCatalog - strLFNs = strLFNs.rstrip() - data = {'lfns':strLFNs} - # avoid too long argument - strLFNs = '' - # execute - status,out = curl.get(url,data) - _log.debug(status) - # sleep - time.sleep(2) - if status != 0: - _log.error("_getPoolFileCatalog : %s %s %s" % (dq2url,status,out)) - return status - if status != 0 or out.startswith('Error'): - continue - if not out.startswith('<\?xml version="1.0" encoding="UTF-8" standalone="no" \?> - - - -""" - outXML = re.sub(th,'',outXML) - outXML = re.sub("""\s*""",'',outXML) - outXML = re.sub("""\s*""",'',outXML) - outXML = re.sub("""\s*""",'',outXML) - outXML = re.sub("""\s*""",'',outXML) - outXML = re.sub("""\s*""",'',outXML) - outXML = re.sub("""\s*""",'',outXML) - - # return XML - return outXML - - -# get files from MySQL -def _getPFNFromMySQL(lfns,dq2url): - _log.debug('_getPFNFromMySQL') - import MySQLdb - comment = ' /* broker_util._getPFNFromMySQL */' - outStr = '' - # parse connection string - match = re.search('^mysql://([^:]+):([^@]+)@([^/:]+):(\d+)/(.+)$',dq2url) - if match == None: - return outStr - # parameters for DB connection - connStr = "mysql -h %s -u %s -p%s -P %s %s" - dbhost = match.group(3) - dbuser = match.group(1) - dbpswd = match.group(2) - dbport = int(match.group(4)) - dbname = match.group(5) - connStr = "mysql -h %s -u %s -p%s -P %s %s" % (dbhost,dbuser,dbpswd,dbport,dbname) - try: - _log.debug(connStr) - # connect - dbConn = MySQLdb.connect(db=dbname,host=dbhost,port=dbport,user=dbuser,passwd=dbpswd) - # make cursor - dbCur = dbConn.cursor() - # query files - iLFN = 0 - strLFNs = '' - for lfn in lfns: - iLFN += 1 - # make argument - strLFNs += " lfname='%s' OR " % lfn - if iLFN % 40 == 0 or iLFN == len(lfns): - # get PoolFileCatalog - strLFNs = strLFNs[:-3] - # construct SQL - sql = 'SELECT lfname FROM t_lfn WHERE %s' % strLFNs - # reset - strLFNs = '' - # execute - _log.debug(sql) - dbCur.execute(sql+comment) - res = dbCur.fetchall() - 
_log.debug(res) - # append LFNs - if res != None and len(res) != 0: - for resLFN in res: - outStr += '%s ' % resLFN - # close cursor - dbCur.close() - # close connection - dbConn.close() - except: - type, value, traceBack = sys.exc_info() - _log.error("_getPFNFromMySQL : %s %s %s" % (dq2url,type,value)) - return -1 - # return - return outStr - - -# get files from LFC -def _getPFNFromLFC(lfns,dq2url,guids,storageName): - _log.debug('_getPFNFromLFC') - outStr = '' - # check paramter - if guids == [] or storageName == [] or (len(lfns) != len(guids)): - return outStr - # extract LFC host - lfcHost = re.sub('[/:]',' ',dq2url).split()[1] - # loop over all LFNs - iLFN = 0 - nLFN = 1000 - strFiles = '' - outStr = '' - for iLFN in range(len(lfns)): - strFiles += '%s %s\n' % (lfns[iLFN],guids[iLFN]) - # bulk operation - if (iLFN+1) % nLFN == 0 or (iLFN+1) >= len(lfns): - # write to file - inFileName = '%s/lfcin.%s' % (panda_config.logdir,commands.getoutput('uuidgen')) - ifile = open(inFileName,'w') - ifile.write(strFiles) - ifile.close() - # construct commands - strStorage = '' - for storage in storageName: - strStorage += '%s,' % storage - strStorage = strStorage[:-1] - com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) - com+= 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; ' - com+= 'source %s; %s/python -Wignore %s/LFCclient.py -f %s -l %s -s %s' % \ - (panda_config.glite_source,panda_config.native_python32,panda_config.lfcClient_dir, - inFileName,lfcHost,strStorage) - _log.debug(com) - # exeute - status,output = commands.getstatusoutput(com) - _log.debug(status) - if status == 0: - outStr += output - else: - _log.error("_getPFNFromLFC : %s %s %s" % (dq2url,status,output)) - # send message to logger - try: - # make message - message = 'LFC access : %s %s %s' % (dq2url,status,output) - # get logger - _pandaLogger = PandaLogger() - _pandaLogger.lock() - 
_pandaLogger.setParams({'Type':'broker_util'}) - logger = _pandaLogger.getHttpLogger(panda_config.loggername) - # add message - logger.error(message) - # release HTTP handler - _pandaLogger.release() - except: - pass - return status - # reset - strFiles = '' - # return - return outStr - - -# get files from LRC -def getFilesFromLRC(files,url,guids=[],storageName=[],terminateWhenFailed=False,getPFN=False): - _log.debug('getFilesFromLRC "%s" %s' % (url,str(storageName))) - # get PFC - outSTR = '' - if url.startswith('mysql://'): - # from MySQL - outSTR = _getPFNFromMySQL(files,url) - # get PFN - if getPFN: - outPFN = {} - # FIXME - _log.debug('RetPFN:%s ' % str(outPFN)) - return outPFN - elif url.startswith('http://'): - # from HTTP I/F - outSTR = _getPoolFileCatalog(files,url) - # get PFN - if getPFN: - outPFN = {} - try: - if not outSTR in ['',None]: - root = xml.dom.minidom.parseString(outSTR) - fileNodes = root.getElementsByTagName('File') - for file in fileNodes: - # get PFN and LFN nodes - physical = file.getElementsByTagName('physical')[0] - pfnNode = physical.getElementsByTagName('pfn')[0] - logical = file.getElementsByTagName('logical')[0] - lfnNode = logical.getElementsByTagName('lfn')[0] - # convert UTF8 to Raw - pfn = str(pfnNode.getAttribute('name')) - lfn = str(lfnNode.getAttribute('name')) - # assign - if not outPFN.has_key(lfn): - outPFN[lfn] = [] - outPFN[lfn].append(pfn) - except: - type, value, traceBack = sys.exc_info() - _log.error(outSTR) - _log.error("could not parse XML - %s %s" % (type, value)) - _log.debug('RetPFN:%s ' % str(outPFN)) - return outPFN - elif url.startswith('lfc://'): - # from LFC - outSTR = _getPFNFromLFC(files,url,guids,storageName) - # get PFN - if getPFN: - outPFN = {} - try: - if not outSTR in ['',None]: - tmpItems = outSTR.split('LFCRet :') - tmpItems.remove('') - # loop over all returns - for tmpItem in tmpItems: - exec "tmpLFNmap = %s" % tmpItem - for tmpLFN,tmpPFN in tmpLFNmap.iteritems(): - outPFN[tmpLFN] = tmpPFN - 
except: - type, value, traceBack = sys.exc_info() - _log.error(outSTR) - _log.error("could not parse LFC ret - %s %s" % (type, value)) - _log.debug('RetPFN:%s ' % str(outPFN)) - return outPFN - # check return - if not isinstance(outSTR,types.StringType): - if terminateWhenFailed: - return None - # set empty string - outSTR = '' - # collect OK Files - okFiles = [] - for file in files: - if re.search(file,outSTR) != None: - okFiles.append(file) - _log.debug('Ret:%s ' % str(okFiles)) - return okFiles - - -# get # of files from LRC -def getNFilesFromLRC(files,url): - _log.debug('getNFilesFromLRC') - # get okFiles - okFiles = getFilesFromLRC(files,url) - nFiles = len(okFiles) - _log.debug('Ret:%s ' % nFiles) - return nFiles - - -# get list of missing LFNs from LRC -def getMissLFNsFromLRC(files,url,guids=[],storageName=[]): - _log.debug('getMissLFNsFromLRC') - # get OF files - okFiles = getFilesFromLRC(files,url,guids,storageName) - # collect missing files - missFiles = [] - for file in files: - if not file in okFiles: - missFiles.append(file) - _log.debug('Ret:%s ' % str(missFiles)) - return missFiles - - -# extract list of se hosts from schedconfig -def getSEfromSched(seStr): - tmpSE = [] - if seStr != None: - for tmpSrcSiteSE in seStr.split(','): - # extract host - match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE) - if match != None: - tmpSE.append(match.group(1)) - # sort - tmpSE.sort() - # return - return tmpSE - - diff --git a/current/pandaserver/config/__init__.py b/current/pandaserver/config/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/current/pandaserver/config/panda_config.py b/current/pandaserver/config/panda_config.py deleted file mode 100755 index 68034b586..000000000 --- a/current/pandaserver/config/panda_config.py +++ /dev/null @@ -1,33 +0,0 @@ -import re -import sys -import commands -from liveconfigparser.LiveConfigParser import LiveConfigParser - -# get ConfigParser -tmpConf = LiveConfigParser() - -# read 
-tmpConf.read('panda_server.cfg') - -# get server section -tmpDict = tmpConf.server - -# expand all values -tmpSelf = sys.modules[ __name__ ] -for tmpKey,tmpVal in tmpDict.iteritems(): - # convert string to bool/int - if tmpVal == 'True': - tmpVal = True - elif tmpVal == 'False': - tmpVal = False - elif re.match('^\d+$',tmpVal): - tmpVal = int(tmpVal) - # update dict - tmpSelf.__dict__[tmpKey] = tmpVal - -# set hostname -tmpSelf.__dict__['pserverhost'] = commands.getoutput('hostname -f') - -# change the number of database connections for FastCGI/WSGI -if tmpSelf.__dict__['useFastCGI'] or tmpSelf.__dict__['useWSGI']: - tmpSelf.__dict__['nDBConnection'] = tmpSelf.__dict__['nDBConForFastCGIWSGI'] diff --git a/current/pandaserver/dataservice/Activator.py b/current/pandaserver/dataservice/Activator.py deleted file mode 100755 index af3909050..000000000 --- a/current/pandaserver/dataservice/Activator.py +++ /dev/null @@ -1,47 +0,0 @@ -''' -activate job - -''' - -import threading - -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('Activator') - - -class Activator (threading.Thread): - # constructor - def __init__(self,taskBuffer,dataset,enforce=False): - threading.Thread.__init__(self) - self.dataset = dataset - self.taskBuffer = taskBuffer - self.enforce = enforce - - - # main - def run(self): - _logger.debug("start: %s" % self.dataset.name) - if self.dataset.status in ['completed','deleting','deleted'] and not self.enforce: - _logger.debug(" skip: %s" % self.dataset.name) - else: - # update input files - ids = self.taskBuffer.updateInFilesReturnPandaIDs(self.dataset.name,'ready') - _logger.debug("IDs: %s" % ids) - if len(ids) != 0: - # get job - jobs = self.taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False) - # remove None and unknown - acJobs = [] - for job in jobs: - if job == None or job.jobStatus == 'unknown': - continue - acJobs.append(job) - # activate - self.taskBuffer.activateJobs(acJobs) 
- # update dataset in DB - if self.dataset.type == 'dispatch': - self.dataset.status = 'completed' - self.taskBuffer.updateDatasets([self.dataset]) - _logger.debug("end: %s" % self.dataset.name) diff --git a/current/pandaserver/dataservice/Adder.py b/current/pandaserver/dataservice/Adder.py deleted file mode 100755 index 7209704e5..000000000 --- a/current/pandaserver/dataservice/Adder.py +++ /dev/null @@ -1,742 +0,0 @@ -''' -add data to dataset - -''' - -import os -import re -import sys -import time -import fcntl -import commands -import threading -import xml.dom.minidom -import ErrorCode -import brokerage.broker_util -from DDM import ddm -from Closer import Closer - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('Adder') - - -class Adder (threading.Thread): - # constructor - def __init__(self,taskBuffer,jobID,fileCatalog,jobStatus,xmlFile='',ignoreDDMError=True,joinCloser=False, - addOutput=False,pandaDDM=False,siteMapper=None,attemptNr=None): - threading.Thread.__init__(self) - self.job = None - self.jobID = jobID - self.jobStatus = jobStatus - self.taskBuffer = taskBuffer - self.ignoreDDMError = ignoreDDMError - self.joinCloser = joinCloser - self.addOutput = addOutput - self.pandaDDM = pandaDDM - self.lockXML = None - self.datasetMap = {} - self.siteMapper = siteMapper - self.addToTopOnly = False - self.goToTransferring = False - self.subscriptionMap = {} - self.attemptNr = attemptNr - # dump Catalog into file - if xmlFile=='': - if attemptNr == None: - self.xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,jobID,jobStatus, - commands.getoutput('uuidgen')) - else: - self.xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,jobID,jobStatus, - commands.getoutput('uuidgen'),attemptNr) - file = open(self.xmlFile,'w') - file.write(fileCatalog) - file.close() - else: - self.xmlFile = xmlFile - - - # main - def run(self): - try: - _logger.debug("%s new start: %s" % (self.jobID,self.jobStatus)) 
- # lock XML except last trial - if self.addOutput and self.ignoreDDMError: - self.lockXML = open(self.xmlFile) - try: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB) - except: - _logger.debug("%s cannot get lock : %s" % (self.jobID,self.xmlFile)) - self.lockXML.close() - return - # query job - self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False, - fromArchived=False, - fromWaiting=False)[0] - # check if job has finished - if self.job == None: - _logger.debug('%s : not found' % self.jobID) - elif self.job.jobStatus in ['finished','failed','unknown','cancelled']: - _logger.error('%s : invalid state -> %s' % (self.jobID,self.job.jobStatus)) - else: - # add files only to top-level datasets for transferring jobs - if self.job.jobStatus == 'transferring': - self.addToTopOnly = True - _logger.debug("%s adder for transferring" % self.jobID) - # use PandaDDM for ddm jobs - if self.job.prodSourceLabel == 'ddm': - self.pandaDDM = True - # set job status - self.job.jobStatus = self.jobStatus - # add outputs. 
Cannot add self.pandaDDM here since minidom.parse() produces seg-fault - if self.addOutput: - # check if the job should go to trasnferring - tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm - tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se) - destSEwasSet = False - if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(self.job.destinationSE): - # DQ2 ID was set by using --destSE for analysis job to transfer output - destSEwasSet = True - tmpDstDDM = self.job.destinationSE - tmpDstSEs = self.job.destinationSE - else: - tmpDstDDM = self.siteMapper.getSite(self.job.destinationSE).ddm - tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.destinationSE).se) - if re.search('^ANALY_',self.job.computingSite) != None: - # analysis site - pass - elif (re.search('BNL', self.job.computingSite) != None or self.job.computingSite == "TPATHENA"): - # BNL - pass - elif self.job.computingSite == self.job.destinationSE: - # same site ID for computingSite and destinationSE - pass - elif tmpSrcDDM == tmpDstDDM: - # same DQ2ID for src/dest - pass - elif tmpSrcSEs == tmpDstSEs: - # same SEs - pass - elif self.job.computingSite.endswith("_REPRO"): - # reprocessing sites - pass - elif self.addToTopOnly: - # already in transferring - pass - elif self.job.jobStatus == 'failed': - # failed jobs - pass - else: - self.goToTransferring = True - self._updateOutputs() - else: - _logger.debug('%s : not added' % self.jobID) - _logger.debug('%s escape' % self.jobID) - return - _logger.debug('%s updated outputs' % self.jobID) - # ignore DDMError - if self.ignoreDDMError and \ - (re.search('could not add files',self.job.ddmErrorDiag) != None or \ - re.search('could not register subscription',self.job.ddmErrorDiag) != None) and \ - re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None and \ - re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None and \ 
- re.search('DQUnknownDatasetException',self.job.ddmErrorDiag) == None and \ - re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None and \ - re.search('KeyError',self.job.ddmErrorDiag) == None: - _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag)) - _logger.debug('%s escape' % self.jobID) - # unlock XML - try: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) - self.lockXML.close() - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s cannot unlock XML" % self.jobID) - return - # update shadow dataset - if self.job.prodSourceLabel == 'user' and self.jobStatus == 'finished' and self.job.ddmErrorDiag == 'NULL' \ - and not self.goToTransferring: - self._updateShadow() - # ignore DDMError - if self.ignoreDDMError and re.search('could not add files',self.job.ddmErrorDiag) != None \ - and re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None \ - and re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None \ - and re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None \ - and re.search('KeyError',self.job.ddmErrorDiag) == None: - _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag)) - _logger.debug('%s escape' % self.jobID) - # unlock XML - try: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) - self.lockXML.close() - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s cannot unlock XML" % self.jobID) - return - # set file status - if self.job.jobStatus == 'failed': - for file in self.job.Files: - if file.type == 'output' or file.type == 'log': - file.status = 'failed' - else: - # reset errors - self.job.jobDispatcherErrorCode = 0 - self.job.jobDispatcherErrorDiag = 'NULL' - # set job status - hasOutput = False - if self.goToTransferring or self.subscriptionMap != {}: - # set status to transferring - for file in 
self.job.Files: - if file.type == 'output' or file.type == 'log' or \ - self.subscriptionMap.has_key(file.destinationDBlock): - file.status = 'transferring' - hasOutput = True - if hasOutput: - self.job.jobStatus = 'transferring' - # propagate transition to prodDB - self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - # endtime - if self.job.endTime=='NULL': - self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - # set cancelled state - if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed': - self.job.jobStatus = 'cancelled' - # update job - retU = self.taskBuffer.updateJobs([self.job],False) - _logger.debug("%s retU: %s" % (self.jobID,retU)) - # failed - if not retU[0]: - _logger.error('failed to update DB for %s' % self.jobID) - # unlock XML - try: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) - self.lockXML.close() - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s cannot unlock XML" % self.jobID) - return - # setup for closer - destDBList = [] - guidList = [] - for file in self.job.Files: - # ignore inputs - if file.type == 'input': - continue - # start closer for output/log datasets - if not file.destinationDBlock in destDBList: - destDBList.append(file.destinationDBlock) - # collect GUIDs - if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test'] and \ - self.job.processingType in ['pathena','gangarobot-rctest'])) \ - and file.type == 'output': - guidList.append({'lfn':file.lfn, 'guid':file.GUID, 'type':file.type}) - if guidList != []: - retG = self.taskBuffer.setGUIDs(guidList) - if destDBList != []: - # start Closer - cThr = Closer(self.taskBuffer,destDBList,self.job,pandaDDM=self.pandaDDM, - datasetMap=self.datasetMap) - _logger.debug("%s start Closer" % self.jobID) - cThr.start() - if self.joinCloser: - cThr.join() - _logger.debug("%s end Closer" % self.jobID) - 
_logger.debug("%s end" % self.jobID) - try: - # remove Catalog - os.remove(self.xmlFile) - except: - pass - # unlock XML - if self.lockXML != None: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) - self.lockXML.close() - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s except" % self.jobID) - # unlock XML just in case - try: - if self.lockXML != None: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s cannot unlock XML" % self.jobID) - - - # update output files - def _updateOutputs(self): - # get LFN and GUID - _logger.debug("%s %s" % (self.jobID,self.xmlFile)) - # no outputs - if self.job.Files == []: - _logger.debug("%s has no outputs" % self.jobID) - _logger.debug("%s addFiles end" % self.jobID) - return - # get input files - inputLFNs = [] - for file in self.job.Files: - if file.type == 'input': - inputLFNs.append(file.lfn) - # parse XML - lfns = [] - guids = [] - fsizes = [] - md5sums = [] - chksums = [] - try: - root = xml.dom.minidom.parse(self.xmlFile) - files = root.getElementsByTagName('File') - for file in files: - # get GUID - guid = str(file.getAttribute('ID')) - _logger.debug(guid) - # get PFN and LFN nodes - logical = file.getElementsByTagName('logical')[0] - lfnNode = logical.getElementsByTagName('lfn')[0] - # convert UTF8 to Raw - lfn = str(lfnNode.getAttribute('name')) - # get metadata - fsize = None - md5sum = None - adler32 = None - for meta in file.getElementsByTagName('metadata'): - # get fsize - name = str(meta.getAttribute('att_name')) - if name == 'fsize': - fsize = long(meta.getAttribute('att_value')) - elif name == 'md5sum': - md5sum = str(meta.getAttribute('att_value')) - # check - if re.search("^[a-fA-F0-9]{32}$",md5sum) == None: - md5sum = None - elif name == 'adler32': - adler32 = str(meta.getAttribute('att_value')) - # error 
check - if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)): - raise RuntimeError, 'fsize/md5sum/adler32=None' - # append - lfns.append(lfn) - guids.append(guid) - fsizes.append(fsize) - md5sums.append(md5sum) - if adler32 != None: - # use adler32 if available - chksums.append("ad:%s" % adler32) - else: - chksums.append("md5:%s" % md5sum) - except: - # check if file exists - if os.path.exists(self.xmlFile): - type, value, traceBack = sys.exc_info() - _logger.error("%s : %s %s" % (self.jobID,type,value)) - # set failed anyway - self.job.jobStatus = 'failed' - # XML error happens when pilot got killed due to wall-time limit or failures in wrapper - if (self.job.pilotErrorCode in [0,'0','NULL']) and \ - (self.job.transExitCode in [0,'0','NULL']): - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = "Adder._updateOutputs() could not get GUID/LFN/MD5/FSIZE" - return - else: - # XML was deleted - self.job.ddmErrorDiag = "Adder._updateOutputs() could not add files" - self.ignoreDDMError = True - return - # check files - idMap = {} - fileList = [] - subMap = {} - for file in self.job.Files: - if file.type == 'input': - if file.lfn in lfns: - if self.job.prodSourceLabel in ['user','panda']: - # skipped file - file.status = 'skipped' - elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']: - # failed by pilot - file.status = 'failed' - elif file.type == 'output' or file.type == 'log': - # append to fileList - fileList.append(file.lfn) - # add only log file for failed jobs - if self.jobStatus == 'failed' and file.type != 'log': - continue - # add only log file for unmerge jobs - if self.job.prodSourceLabel == 'panda' and self.job.processingType in ['unmerge'] \ - and file.type != 'log': - continue - # look for GUID with LFN - try: - i = lfns.index(file.lfn) - file.GUID = guids[i] - file.fsize = fsizes[i] - file.md5sum = md5sums[i] - file.checksum = chksums[i] - # status - file.status = 'ready' - # fsize - 
fsize = None - if not file.fsize in ['NULL','',0]: - try: - fsize = long(file.fsize) - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s : %s %s" % (self.jobID,type,value)) - # append to map - if not idMap.has_key(file.destinationDBlock): - idMap[file.destinationDBlock] = [] - idMap[file.destinationDBlock].append({'guid' : file.GUID, - 'lfn' : lfns[i], - 'size' : fsize, - 'checksum' : file.checksum}) - # for subscription - if self.job.prodSourceLabel in ['managed','test','software','rc_test','ptest','user'] and \ - re.search('_sub\d+$',file.destinationDBlock) != None and (not self.addToTopOnly) and \ - self.job.destinationSE != 'local': - if self.siteMapper == None: - _logger.error("%s : SiteMapper==None" % self.jobID) - else: - # get dataset spec - if not self.datasetMap.has_key(file.destinationDBlock): - tmpDS = self.taskBuffer.queryDatasetWithMap({'name':file.destinationDBlock}) - self.datasetMap[file.destinationDBlock] = tmpDS - # check if valid dataset - if self.datasetMap[file.destinationDBlock] == None: - _logger.error("%s : cannot find %s in DB" % (self.jobID,file.destinationDBlock)) - else: - if not self.datasetMap[file.destinationDBlock].status in ['defined']: - # not a fresh dataset - _logger.debug("%s : subscription was already made for %s:%s" % \ - (self.jobID,self.datasetMap[file.destinationDBlock].status, - file.destinationDBlock)) - else: - # get DQ2 IDs - tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm - tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se) - if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE): - # DQ2 ID was set by using --destSE for analysis job to transfer output - tmpDstDDM = file.destinationSE - tmpDstSEs = file.destinationSE - else: - tmpDstDDM = self.siteMapper.getSite(file.destinationSE).ddm - tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(file.destinationSE).se) - # 
if src != dest or multi-token - if (tmpSrcDDM != tmpDstDDM and tmpSrcSEs != tmpDstSEs) or \ - (tmpSrcDDM == tmpDstDDM and file.destinationDBlockToken.count(',') != 0): - optSub = {'DATASET_COMPLETE_EVENT' : ['https://%s:%s/server/panda/datasetCompleted' % \ - (panda_config.pserverhost,panda_config.pserverport)]} - # append - if not subMap.has_key(file.destinationDBlock): - subMap[file.destinationDBlock] = [] - # sources - optSource = {} - # set sources for NL/FR/ES to handle T2s in another cloud - if self.job.cloud in ['NL','FR','ES']: - if file.destinationDBlockToken in ['NULL','']: - # use default DQ2 ID as source - optSource[tmpSrcDDM] = {'policy' : 0} - else: - # convert token to DQ2 ID - dq2ID = tmpSrcDDM - # use the first token's location as source for T1D1 - tmpSrcToken = file.destinationDBlockToken.split(',')[0] - if self.siteMapper.getSite(self.job.computingSite).setokens.has_key(tmpSrcToken): - dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens[tmpSrcToken] - optSource[dq2ID] = {'policy' : 0} - # use another location when token is set - if not file.destinationDBlockToken in ['NULL','']: - tmpDQ2IDList = [] - tmpDstTokens = file.destinationDBlockToken.split(',') - # remove the first one because it is already used as a location - if tmpSrcDDM == tmpDstDDM: - tmpDstTokens = tmpDstTokens[1:] - # loop over all tokens - for idxToken,tmpDstToken in enumerate(tmpDstTokens): - dq2ID = tmpDstDDM - if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpDstToken): - dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpDstToken] - # keep the fist destination for multi-hop - if idxToken == 0: - firstDestDDM = dq2ID - else: - # use the fist destination as source for T1D1 - optSource = {} - optSource[firstDestDDM] = {'policy' : 0} - # remove looping subscription - if dq2ID == tmpSrcDDM: - continue - # avoid duplication - if not dq2ID in tmpDQ2IDList: - subMap[file.destinationDBlock].append((dq2ID,optSub,optSource)) - else: - # use 
default DDM - for dq2ID in tmpDstDDM.split(','): - subMap[file.destinationDBlock].append((dq2ID,optSub,optSource)) - except: - # status - file.status = 'failed' - type, value, traceBack = sys.exc_info() - _logger.error("%s : %s %s" % (self.jobID,type,value)) - # cleanup submap - tmpKeys = subMap.keys() - for tmpKey in tmpKeys: - if subMap[tmpKey] == []: - del subMap[tmpKey] - # check consistency between XML and filesTable - for lfn in lfns: - if (not lfn in fileList) and (not lfn in inputLFNs): - _logger.error("%s %s is not found in filesTable" % (self.jobID,lfn)) - self.job.jobStatus = 'failed' - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = "Adder._updateOutputs() XML is inconsistent with filesTable" - return - # return if PandaDDM is used or non-DQ2 - if self.pandaDDM or self.job.destinationSE == 'local': - return - # add data to original dataset - for destinationDBlock in idMap.keys(): - match = re.findall('(.+)_sub\d+$',destinationDBlock) - if len(match): - # add files to top-level datasets - if not self.goToTransferring: - origDBlock = match[0] - idMap[origDBlock] = idMap[destinationDBlock] - # add files to top-level datasets only - if self.addToTopOnly: - del idMap[destinationDBlock] - # print idMap - _logger.debug("%s idMap = %s" % (self.jobID,idMap)) - # add data - _logger.debug("%s addFiles start" % self.jobID) - # number of retry - nTry = 3 - for iTry in range(nTry): - # empty - if idMap == {}: - break - # add data to datasets - time.sleep(1) - _logger.debug((self.jobID, 'registerFilesInDatasets',idMap)) - status,out = ddm.DQ2.main('registerFilesInDatasets',idMap) - isFailed = False - if status != 0 and out.find('DQFileExistsInDatasetException') == -1 \ - and (out.find('The file LFN or GUID is already registered') == -1 or \ - out.find('already registered in vuid') == -1): - isFailed = True - if not isFailed: - _logger.debug('%s %s' % (self.jobID,out)) - # failed - if isFailed: - _logger.error('%s %s' % (self.jobID,out)) - if 
(iTry+1) == nTry or out.find('DQClosedDatasetException') != 0 or \ - out.find('DQFrozenDatasetException') != 0 or \ - out.find('DQUnknownDatasetException') != 0 or \ - out.find('DQFileMetaDataMismatchException') != 0: - self.job.jobStatus = 'failed' - self.job.ddmErrorCode = ErrorCode.EC_Adder - errMsg = "Adder._updateOutputs() could not add files to %s\n" % idMap.keys() - self.job.ddmErrorDiag = errMsg + out.split('\n')[-1] - return - _logger.error("%s Try:%s" % (self.jobID,iTry)) - # sleep - time.sleep(120) - else: - break - # register dataset subscription - subActivity = 'Production' - if not self.job.prodSourceLabel in ['user']: - # make DQ2 subscription for prod jobs - for tmpName,tmpVal in subMap.iteritems(): - for dq2ID,optSub,optSource in tmpVal: - _logger.debug((self.jobID,'registerDatasetSubscription',tmpName,dq2ID,0,0,optSub, - optSource,001000 | 010000,0,None,0,"production",None,subActivity,None,"14 days")) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('registerDatasetSubscription',tmpName,dq2ID,0,0,optSub, - optSource,001000 | 010000,0,None,0,"production",None,subActivity,None,"14 days") - if (status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1) and \ - out.find('DQSubscriptionExistsException') == -1: - time.sleep(60) - else: - break - if status != 0 and (out != 'None' and out.find('DQSubscriptionExistsException') == -1): - _logger.error('%s %s' % (self.jobID,out)) - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = "Adder._updateOutputs() could not register subscription : %s" % tmpName - return - _logger.debug('%s %s' % (self.jobID,out)) - # set dataset status - self.datasetMap[tmpName].status = 'running' - # keep subscriptions - self.subscriptionMap = subMap - else: - # send request to DaTRI - tmpTopDatasets = {} - # collect top-level datasets - for tmpName,tmpVal in subMap.iteritems(): - for 
dq2ID,optSub,optSource in tmpVal: - tmpTopName = re.sub('_sub\d+','',tmpName) - # append - if not tmpTopDatasets.has_key(tmpTopName): - tmpTopDatasets[tmpTopName] = [] - if not dq2ID in tmpTopDatasets[tmpTopName]: - tmpTopDatasets[tmpTopName].append(dq2ID) - # remove redundant CN from DN - tmpDN = self.job.prodUserID - tmpDN = re.sub('/CN=limited proxy','',tmpDN) - tmpDN = re.sub('(/CN=proxy)+$','',tmpDN) - # send request - if tmpTopDatasets != {} and self.jobStatus == 'finished': - try: - from datriHandler import datriHandler - if self.job.lockedby.startswith('Ganga'): - tmpHandler = datriHandler(type='ganga') - else: - tmpHandler = datriHandler(type='pathena') - # loop over all output datasets - for tmpDsName,dq2IDlist in tmpTopDatasets.iteritems(): - for tmpDQ2ID in dq2IDlist: - tmpMsg = "%s %s ds=%s site=%s id=%s" % (self.jobID,'datriHandler.sendRequest', - tmpDsName,tmpDQ2ID,tmpDN) - _logger.debug(tmpMsg) - tmpHandler.setParameters(data_pattern=tmpDsName, - site=tmpDQ2ID, - userid=tmpDN) - # number of retry - nTry = 3 - for iTry in range(nTry): - dhStatus,dhOut = tmpHandler.sendRequest() - # succeeded - if dhStatus == 0 or "such request is exist" in dhOut: - _logger.debug("%s %s %s" % (self.jobID,dhStatus,dhOut)) - break - if iTry+1 < nTry: - # sleep - time.sleep(60) - else: - # final attempt failed - tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,dhStatus,dhOut) - _logger.error(tmpMsg) - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = "DaTRI failed for %s with %s %s" % (tmpDsName,dhStatus,dhOut) - return - # set dataset status - for tmpName,tmpVal in subMap.iteritems(): - self.datasetMap[tmpName].status = 'running' - except: - errType,errValue = sys.exc_info()[:2] - tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,errType,errValue) - _logger.error(tmpMsg) - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = "DaTRI failed with %s %s" % (errType,errValue) - return - # properly finished - 
_logger.debug("%s addFiles end" % self.jobID) - - - # update shadow dataset - def _updateShadow(self): - # return if PandaDDM is used or non-DQ2 - if self.pandaDDM or self.job.destinationSE == 'local': - return - _logger.debug("%s updateShadow" % self.jobID) - # get shadow DS and contents - shadowList = [] - shadowFiles = [] - for file in self.job.Files: - if file.type == 'output' or file.type == 'log': - # get shadow name - shadowDS = re.sub('_sub\d+$','',file.destinationDBlock) + '_shadow' - if not shadowDS in shadowList: - shadowList.append(shadowDS) - elif file.type == 'input': - # remove skipped files - if file.status in ['skipped']: - continue - # ignore lib.tgz - if re.search('lib\.tgz\.*\d*',file.lfn) != None: - continue - # ignore DBRelease - if re.search('DBRelease',file.lfn) != None: - continue - # ignore when noshadow is set - if file.destinationDBlockToken == 'noshadow': - continue - # fsize - fsize = None - if not file.fsize in ['NULL','',0]: - try: - fsize = long(file.fsize) - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s : %s %s" % (self.jobID,type,value)) - # append - if len(str(file.GUID))==36: - shadowFiles.append({'guid' : file.GUID, - 'lfn' : file.lfn, - 'size' : fsize, - 'checksum' : None}) - # create idMap - idMap = {} - for shadowDS in shadowList: - nTry = 3 - findFlag = False - for iTry in range(nTry): - # check if shadow dataset exists - _logger.debug((self.jobID, 'listDatasets',shadowDS,0,True)) - status,out = ddm.DQ2.main('listDatasets',shadowDS,0,True) - if status == 0: - if (out.find(shadowDS) == -1): - _logger.debug("%s shadow %s doesn't exist" % (self.jobID,shadowDS)) - else: - findFlag = True - break - # sleep - time.sleep(120) - # append - if findFlag and shadowFiles != []: - idMap[shadowDS] = shadowFiles - # add data - _logger.debug("%s shadow idMap = %s" % (self.jobID,idMap)) - if idMap == {}: - return - _logger.debug("%s addFilesToShadow start" % self.jobID) - # number of retry - nTry = 3 - for iTry in 
range(nTry): - # add data to datasets - time.sleep(1) - _logger.debug((self.jobID, 'registerFilesInDatasets',idMap)) - status,out = ddm.DQ2.main('registerFilesInDatasets',idMap) - isFailed = False - if status != 0 and out.find('DQFileExistsInDatasetException') == -1 \ - and (out.find('The file LFN or GUID is already registered') == -1 or \ - out.find('already registered in vuid') == -1): - isFailed = True - if not isFailed: - _logger.debug('%s %s' % (self.jobID,out)) - # failed - if isFailed: - _logger.error('%s %s' % (self.jobID,out)) - if (iTry+1) == nTry or out.find('DQClosedDatasetException') != 0 or \ - out.find('DQFrozenDatasetException') != 0 or \ - out.find('DQFileMetaDataMismatchException') != 0: - self.job.jobStatus = 'failed' - self.job.ddmErrorCode = ErrorCode.EC_Adder - errMsg = "Adder._updateOutputs() could not add files to %s\n" % idMap.keys() - self.job.ddmErrorDiag = errMsg + out.split('\n')[-1] - return - _logger.error("%s shadow Try:%s" % (self.jobID,iTry)) - # sleep - time.sleep(120) - else: - break - _logger.debug("%s addFilesToShadow end" % self.jobID) diff --git a/current/pandaserver/dataservice/Adder2.py b/current/pandaserver/dataservice/Adder2.py deleted file mode 100644 index 521526d7b..000000000 --- a/current/pandaserver/dataservice/Adder2.py +++ /dev/null @@ -1,1014 +0,0 @@ -''' -add data to dataset - -''' - -import os -import re -import sys -import time -import fcntl -import datetime -import commands -import threading -import xml.dom.minidom -import ErrorCode -from dq2.clientapi import DQ2 -try: - from dq2.clientapi.cli import Register2 -except: - pass - -import brokerage.broker_util -import Closer - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('Adder') -Closer.initLogger(_logger) - - -class Adder (threading.Thread): - # constructor - def __init__(self,taskBuffer,jobID,fileCatalog,jobStatus,xmlFile='',ignoreDDMError=True,joinCloser=False, - 
addOutput=False,pandaDDM=False,siteMapper=None,attemptNr=None): - threading.Thread.__init__(self) - self.job = None - self.jobID = jobID - self.jobStatus = jobStatus - self.taskBuffer = taskBuffer - self.ignoreDDMError = ignoreDDMError - self.joinCloser = joinCloser - self.addOutput = addOutput - self.pandaDDM = pandaDDM - self.lockXML = None - self.datasetMap = {} - self.siteMapper = siteMapper - self.addToTopOnly = False - self.goToTransferring = False - self.logTransferring = False - self.subscriptionMap = {} - self.dq2api = None - self.attemptNr = attemptNr - # dump Catalog into file - if xmlFile=='': - if attemptNr == None: - self.xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,jobID,jobStatus, - commands.getoutput('uuidgen')) - else: - self.xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,jobID,jobStatus, - commands.getoutput('uuidgen'),attemptNr) - file = open(self.xmlFile,'w') - file.write(fileCatalog) - file.close() - else: - self.xmlFile = xmlFile - # exstract attemptNr - try: - tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1] - if re.search('^\d+$',tmpAttemptNr) != None: - self.attemptNr = int(tmpAttemptNr) - except: - pass - # main - def run(self): - try: - _logger.debug("%s new start: %s attemptNr=%s" % (self.jobID,self.jobStatus,self.attemptNr)) - # instantiate DQ2 - self.dq2api = DQ2.DQ2() - # lock XML except last trial - if self.addOutput and self.ignoreDDMError: - self.lockXML = open(self.xmlFile) - try: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB) - except: - _logger.debug("%s cannot get lock : %s" % (self.jobID,self.xmlFile)) - self.lockXML.close() - return - # query job - self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False, - fromArchived=False, - fromWaiting=False)[0] - # check if job has finished - if self.job == None: - _logger.debug('%s : not found' % self.jobID) - elif self.job.jobStatus in ['finished','failed','unknown','cancelled']: - _logger.error('%s : invalid state -> %s' % 
(self.jobID,self.job.jobStatus)) - elif self.attemptNr != None and self.job.attemptNr != self.attemptNr: - _logger.error('%s : wrong attemptNr -> job=%s <> %s' % (self.jobID,self.job.attemptNr,self.attemptNr)) - else: - # add files only to top-level datasets for transferring jobs - if self.job.jobStatus == 'transferring': - self.addToTopOnly = True - _logger.debug("%s adder for transferring" % self.jobID) - # use PandaDDM for ddm jobs - if self.job.prodSourceLabel == 'ddm': - self.pandaDDM = True - # set job status - self.job.jobStatus = self.jobStatus - # add outputs. Cannot add self.pandaDDM here since minidom.parse() produces seg-fault - if self.addOutput: - # check if the job should go to trasnferring - tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm - tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se) - destSEwasSet = False - brokenSched = False - if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(self.job.destinationSE): - # DQ2 ID was set by using --destSE for analysis job to transfer output - destSEwasSet = True - tmpDstDDM = self.job.destinationSE - tmpDstSEs = self.job.destinationSE - else: - tmpDstDDM = self.siteMapper.getSite(self.job.destinationSE).ddm - tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.destinationSE).se) - # protection against disappearance of dest from schedconfig - if not self.siteMapper.checkSite(self.job.destinationSE) and self.job.destinationSE != 'local': - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = "destinaitonSE %s is unknown in schedconfig" % self.job.destinationSE - self.job.jobStatus = 'failed' - self.jobStatus = 'failed' - _logger.error("%s %s" % (self.jobID,self.job.ddmErrorDiag)) - brokenSched = True - # protection against disappearance of src from schedconfig - if not self.siteMapper.checkSite(self.job.computingSite): - self.job.ddmErrorCode = ErrorCode.EC_Adder - 
self.job.ddmErrorDiag = "computingSite %s is unknown in schedconfig" % self.job.computingSite - self.job.jobStatus = 'failed' - self.jobStatus = 'failed' - _logger.error("%s %s" % (self.jobID,self.job.ddmErrorDiag)) - brokenSched = True - _logger.debug('%s DDM src:%s dst:%s' % (self.jobID,tmpSrcDDM,tmpDstDDM)) - _logger.debug('%s SE src:%s dst:%s' % (self.jobID,tmpSrcSEs,tmpDstSEs)) - if re.search('^ANALY_',self.job.computingSite) != None: - # analysis site - pass - elif self.job.computingSite == self.job.destinationSE: - # same site ID for computingSite and destinationSE - pass - elif tmpSrcDDM == tmpDstDDM: - # same DQ2ID for src/dest - pass - elif tmpSrcSEs == tmpDstSEs: - # same SEs - pass - elif self.addToTopOnly: - # already in transferring - pass - elif self.job.jobStatus == 'failed': - # failed jobs - if self.job.prodSourceLabel in ['managed','test']: - self.logTransferring = True - pass - else: - self.goToTransferring = True - _logger.debug('%s goToTransferring=%s' % (self.jobID,self.goToTransferring)) - _logger.debug('%s logTransferring=%s' % (self.jobID,self.logTransferring)) - if not brokenSched: - self._updateOutputs() - else: - _logger.debug('%s : not added' % self.jobID) - _logger.debug('%s escape' % self.jobID) - return - _logger.debug('%s updated outputs' % self.jobID) - # ignore DDMError - if self.ignoreDDMError and \ - (re.search('could not add files',self.job.ddmErrorDiag) != None or \ - re.search('could not register subscription',self.job.ddmErrorDiag) != None) and \ - re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None and \ - re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None and \ - re.search('DQUnknownDatasetException',self.job.ddmErrorDiag) == None and \ - re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None and \ - re.search('DQDatasetExistsException',self.job.ddmErrorDiag) == None and \ - re.search('Exceeded the maximum number of files',self.job.ddmErrorDiag) == None and \ - 
re.search('KeyError',self.job.ddmErrorDiag) == None and \ - not self.job.ddmErrorCode in [ErrorCode.EC_Subscription]: - _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag)) - _logger.debug('%s escape' % self.jobID) - # unlock XML - try: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) - self.lockXML.close() - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s cannot unlock XML" % self.jobID) - return - # update shadow dataset - """ - if self.job.prodSourceLabel == 'user' and self.jobStatus == 'finished' and \ - (self.job.ddmErrorDiag == 'NULL' or re.search('DaTRI failed',self.job.ddmErrorDiag) != None) and \ - not self.goToTransferring: - self._updateShadow() - # ignore DDMError - if self.ignoreDDMError and re.search('could not add files',self.job.ddmErrorDiag) != None \ - and re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None \ - and re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None \ - and re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None \ - and re.search('Exceeded the maximum number of files',self.job.ddmErrorDiag) == None \ - and re.search('KeyError',self.job.ddmErrorDiag) == None: - _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag)) - _logger.debug('%s escape' % self.jobID) - # unlock XML - try: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) - self.lockXML.close() - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s cannot unlock XML" % self.jobID) - return - """ - # remove unmerged - if self.job.processingType == 'usermerge' and self.job.prodSourceLabel == 'user' and \ - self.jobStatus == 'finished' and self.job.ddmErrorDiag == 'NULL': - retMerge = self._removeUnmerged() - # ignore DDMError - if self.ignoreDDMError and retMerge == None: - _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag)) - 
_logger.debug('%s escape' % self.jobID) - # unlock XML - try: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) - self.lockXML.close() - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s cannot unlock XML" % self.jobID) - return - # set file status - if self.job.jobStatus == 'failed': - for file in self.job.Files: - if file.type == 'output' or file.type == 'log': - file.status = 'failed' - else: - # reset errors - self.job.jobDispatcherErrorCode = 0 - self.job.jobDispatcherErrorDiag = 'NULL' - # set job status - hasOutput = False - if self.goToTransferring or self.subscriptionMap != {}: - # set status to transferring - for file in self.job.Files: - if file.type == 'output' or file.type == 'log' or \ - self.subscriptionMap.has_key(file.destinationDBlock): - file.status = 'transferring' - hasOutput = True - if hasOutput: - self.job.jobStatus = 'transferring' - # propagate transition to prodDB - self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - # endtime - if self.job.endTime=='NULL': - self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - # output size and # of outputs - self.job.nOutputDataFiles = 0 - self.job.outputFileBytes = 0 - for tmpFile in self.job.Files: - if tmpFile.type == 'output': - self.job.nOutputDataFiles += 1 - try: - self.job.outputFileBytes += tmpFile.fsize - except: - pass - # protection - maxOutputFileBytes = 99999999999 - if self.job.outputFileBytes > maxOutputFileBytes: - self.job.outputFileBytes = maxOutputFileBytes - # set cancelled state - if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed': - self.job.jobStatus = 'cancelled' - # update job - retU = self.taskBuffer.updateJobs([self.job],False) - _logger.debug("%s retU: %s" % (self.jobID,retU)) - # failed - if not retU[0]: - _logger.error('failed to update DB for %s' % self.jobID) - # unlock XML - try: - fcntl.flock(self.lockXML.fileno(), 
fcntl.LOCK_UN) - self.lockXML.close() - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s cannot unlock XML" % self.jobID) - return - # setup for closer - destDBList = [] - guidList = [] - for file in self.job.Files: - # ignore inputs - if file.type == 'input': - continue - # start closer for output/log datasets - if not file.destinationDBlock in destDBList: - destDBList.append(file.destinationDBlock) - # collect GUIDs - if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test'] and \ - self.job.processingType in ['pathena','prun','gangarobot-rctest'])) \ - and file.type == 'output': - guidList.append({'lfn':file.lfn,'guid':file.GUID,'type':file.type, - 'checksum':file.checksum,'md5sum':file.md5sum, - 'fsize':file.fsize,'scope':file.scope}) - if guidList != []: - retG = self.taskBuffer.setGUIDs(guidList) - if destDBList != []: - # start Closer - cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,pandaDDM=self.pandaDDM, - datasetMap=self.datasetMap) - _logger.debug("%s start Closer" % self.jobID) - cThr.start() - if self.joinCloser: - cThr.join() - _logger.debug("%s end Closer" % self.jobID) - _logger.debug("%s end" % self.jobID) - try: - # remove Catalog - os.remove(self.xmlFile) - except: - pass - # unlock XML - if self.lockXML != None: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) - self.lockXML.close() - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s except" % self.jobID) - # unlock XML just in case - try: - if self.lockXML != None: - fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) - except: - type, value, traceBack = sys.exc_info() - _logger.debug("%s : %s %s" % (self.jobID,type,value)) - _logger.debug("%s cannot unlock XML" % self.jobID) - - - # update output files - def _updateOutputs(self): - # get LFN and GUID - _logger.debug("%s %s" % 
(self.jobID,self.xmlFile)) - # no outputs - if self.job.Files == []: - _logger.debug("%s has no outputs" % self.jobID) - _logger.debug("%s addFiles end" % self.jobID) - return - # get input files - inputLFNs = [] - for file in self.job.Files: - if file.type == 'input': - inputLFNs.append(file.lfn) - # parse XML - lfns = [] - guids = [] - fsizes = [] - md5sums = [] - chksums = [] - surls = [] - try: - root = xml.dom.minidom.parse(self.xmlFile) - files = root.getElementsByTagName('File') - for file in files: - # get GUID - guid = str(file.getAttribute('ID')) - _logger.debug(guid) - # get PFN and LFN nodes - logical = file.getElementsByTagName('logical')[0] - lfnNode = logical.getElementsByTagName('lfn')[0] - # convert UTF8 to Raw - lfn = str(lfnNode.getAttribute('name')) - # get metadata - fsize = None - md5sum = None - adler32 = None - surl = None - for meta in file.getElementsByTagName('metadata'): - # get fsize - name = str(meta.getAttribute('att_name')) - if name == 'fsize': - fsize = long(meta.getAttribute('att_value')) - elif name == 'md5sum': - md5sum = str(meta.getAttribute('att_value')) - # check - if re.search("^[a-fA-F0-9]{32}$",md5sum) == None: - md5sum = None - elif name == 'adler32': - adler32 = str(meta.getAttribute('att_value')) - elif name == 'surl': - surl = str(meta.getAttribute('att_value')) - # error check - if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None) \ - or (self.useCentralLFC() and surl == None)): - raise RuntimeError, 'fsize/md5sum/adler32/surl=None' - # append - lfns.append(lfn) - guids.append(guid) - fsizes.append(fsize) - md5sums.append(md5sum) - surls.append(surl) - if adler32 != None: - # use adler32 if available - chksums.append("ad:%s" % adler32) - else: - chksums.append("md5:%s" % md5sum) - except: - # check if file exists - if os.path.exists(self.xmlFile): - type, value, traceBack = sys.exc_info() - _logger.error("%s : %s %s" % (self.jobID,type,value)) - # set failed anyway - self.job.jobStatus 
= 'failed' - # XML error happens when pilot got killed due to wall-time limit or failures in wrapper - if (self.job.pilotErrorCode in [0,'0','NULL']) and \ - (self.job.transExitCode in [0,'0','NULL']): - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = "Adder._updateOutputs() could not get GUID/LFN/MD5/FSIZE/SURL" - return - else: - # XML was deleted - self.job.ddmErrorDiag = "Adder._updateOutputs() could not add files" - self.ignoreDDMError = True - return - # check files - idMap = {} - fileList = [] - subMap = {} - for file in self.job.Files: - if file.type == 'input': - if file.lfn in lfns: - if self.job.prodSourceLabel in ['user','panda']: - # skipped file - file.status = 'skipped' - elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']: - # failed by pilot - file.status = 'failed' - elif file.type == 'output' or file.type == 'log': - # append to fileList - fileList.append(file.lfn) - # add only log file for failed jobs - if self.jobStatus == 'failed' and file.type != 'log': - continue - # add only log file for unmerge jobs - if self.job.prodSourceLabel == 'panda' and self.job.processingType in ['unmerge'] \ - and file.type != 'log': - continue - # look for GUID with LFN - try: - i = lfns.index(file.lfn) - file.GUID = guids[i] - file.fsize = fsizes[i] - file.md5sum = md5sums[i] - file.checksum = chksums[i] - surl = surls[i] - # status - file.status = 'ready' - # fsize - fsize = None - if not file.fsize in ['NULL','',0]: - try: - fsize = long(file.fsize) - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s : %s %s" % (self.jobID,type,value)) - # append to map - if not idMap.has_key(file.destinationDBlock): - idMap[file.destinationDBlock] = [] - fileAttrs = {'guid' : file.GUID, - 'lfn' : lfns[i], - 'size' : fsize, - 'checksum' : file.checksum} - # add SURLs if LFC registration is required - if self.useCentralLFC(): - fileAttrs['surl'] = surl - idMap[file.destinationDBlock].append(fileAttrs) - # for 
subscription - if self.job.prodSourceLabel in ['managed','test','software','rc_test','ptest','user'] and \ - re.search('_sub\d+$',file.destinationDBlock) != None and (not self.addToTopOnly) and \ - self.job.destinationSE != 'local': - if self.siteMapper == None: - _logger.error("%s : SiteMapper==None" % self.jobID) - else: - # get dataset spec - if not self.datasetMap.has_key(file.destinationDBlock): - tmpDS = self.taskBuffer.queryDatasetWithMap({'name':file.destinationDBlock}) - self.datasetMap[file.destinationDBlock] = tmpDS - # check if valid dataset - if self.datasetMap[file.destinationDBlock] == None: - _logger.error("%s : cannot find %s in DB" % (self.jobID,file.destinationDBlock)) - else: - if not self.datasetMap[file.destinationDBlock].status in ['defined']: - # not a fresh dataset - _logger.debug("%s : subscription was already made for %s:%s" % \ - (self.jobID,self.datasetMap[file.destinationDBlock].status, - file.destinationDBlock)) - else: - # get DQ2 IDs - tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm - tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se) - if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE): - # DQ2 ID was set by using --destSE for analysis job to transfer output - tmpDstDDM = file.destinationSE - tmpDstSEs = file.destinationSE - else: - tmpDstDDM = self.siteMapper.getSite(file.destinationSE).ddm - tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(file.destinationSE).se) - # if src != dest or multi-token - if (tmpSrcDDM != tmpDstDDM and tmpSrcSEs != tmpDstSEs) or \ - (tmpSrcDDM == tmpDstDDM and file.destinationDBlockToken.count(',') != 0): - optSub = {'DATASET_COMPLETE_EVENT' : ['https://%s:%s/server/panda/datasetCompleted' % \ - (panda_config.pserverhost,panda_config.pserverport)]} - # append - if not subMap.has_key(file.destinationDBlock): - subMap[file.destinationDBlock] = [] - # sources - 
optSource = {} - # set sources - if file.destinationDBlockToken in ['NULL','']: - # use default DQ2 ID as source - optSource[tmpSrcDDM] = {'policy' : 0} - else: - # convert token to DQ2 ID - dq2ID = tmpSrcDDM - # use the first token's location as source for T1D1 - tmpSrcToken = file.destinationDBlockToken.split(',')[0] - if self.siteMapper.getSite(self.job.computingSite).setokens.has_key(tmpSrcToken): - dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens[tmpSrcToken] - optSource[dq2ID] = {'policy' : 0} - # T1 used as T2 - if self.siteMapper.getSite(self.job.computingSite).cloud != self.job.cloud and \ - (not tmpSrcDDM.endswith('PRODDISK')) and \ - (not self.job.prodSourceLabel in ['user','panda']): - # register both DATADISK and PRODDISK as source locations - if self.siteMapper.getSite(self.job.computingSite).setokens.has_key('ATLASPRODDISK'): - dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens['ATLASPRODDISK'] - optSource[dq2ID] = {'policy' : 0} - if not optSource.has_key(tmpSrcDDM): - optSource[tmpSrcDDM] = {'policy' : 0} - # use another location when token is set - if not file.destinationDBlockToken in ['NULL','']: - tmpDQ2IDList = [] - tmpDstTokens = file.destinationDBlockToken.split(',') - # remove the first one because it is already used as a location - if tmpSrcDDM == tmpDstDDM: - tmpDstTokens = tmpDstTokens[1:] - # loop over all tokens - for idxToken,tmpDstToken in enumerate(tmpDstTokens): - dq2ID = tmpDstDDM - if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpDstToken): - dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpDstToken] - # keep the fist destination for multi-hop - if idxToken == 0: - firstDestDDM = dq2ID - else: - # use the fist destination as source for T1D1 - optSource = {} - optSource[firstDestDDM] = {'policy' : 0} - # remove looping subscription - if dq2ID == tmpSrcDDM: - continue - # avoid duplication - if not dq2ID in tmpDQ2IDList: - 
subMap[file.destinationDBlock].append((dq2ID,optSub,optSource)) - else: - # use default DDM - for dq2ID in tmpDstDDM.split(','): - subMap[file.destinationDBlock].append((dq2ID,optSub,optSource)) - except: - # status - file.status = 'failed' - type, value, traceBack = sys.exc_info() - _logger.error("%s : %s %s" % (self.jobID,type,value)) - # cleanup submap - tmpKeys = subMap.keys() - for tmpKey in tmpKeys: - if subMap[tmpKey] == []: - del subMap[tmpKey] - # check consistency between XML and filesTable - for lfn in lfns: - if (not lfn in fileList) and (not lfn in inputLFNs): - _logger.error("%s %s is not found in filesTable" % (self.jobID,lfn)) - self.job.jobStatus = 'failed' - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = "Adder._updateOutputs() XML is inconsistent with filesTable" - return - # return if PandaDDM is used or non-DQ2 - if self.pandaDDM or self.job.destinationSE == 'local': - return - # add data to original dataset - for destinationDBlock in idMap.keys(): - origDBlock = None - match = re.search('^(.+)_sub\d+$',destinationDBlock) - if match != None: - # add files to top-level datasets - origDBlock = match.group(1) - if not self.goToTransferring: - idMap[origDBlock] = idMap[destinationDBlock] - # add files to top-level datasets only - if self.addToTopOnly: - del idMap[destinationDBlock] - # skip sub unless getting transferred - if origDBlock != None: - if not self.goToTransferring and not self.logTransferring \ - and idMap.has_key(destinationDBlock): - del idMap[destinationDBlock] - # print idMap - _logger.debug("%s idMap = %s" % (self.jobID,idMap)) - _logger.debug("%s subMap = %s" % (self.jobID,subMap)) - # add data - _logger.debug("%s addFiles start" % self.jobID) - # count the number of files - regNumFiles = 0 - regFileList = [] - for tmpRegDS,tmpRegList in idMap.iteritems(): - for tmpRegItem in tmpRegList: - if not tmpRegItem['lfn'] in regFileList: - regNumFiles += 1 - regFileList.append(tmpRegItem['lfn']) - # number of retry - 
nTry = 3 - for iTry in range(nTry): - # empty - if idMap == {}: - break - # add data to datasets - time.sleep(1) - isFailed = False - isFatal = False - setErrorDiag = False - out = 'OK' - fatalErrStrs = ['[ORA-00001] unique constraint (ATLAS_DQ2.UQ_01_FILES_GUID) violated'] - regStart = datetime.datetime.utcnow() - try: - if not self.useCentralLFC(): - regMsgStr = "DQ2 registraion for %s files " % regNumFiles - _logger.debug('%s %s %s' % (self.jobID,'registerFilesInDatasets',str(idMap))) - self.dq2api.registerFilesInDatasets(idMap) - else: - regMsgStr = "LFC+DQ2 registraion for %s files " % regNumFiles - _logger.debug('%s %s %s' % (self.jobID,'Register.registerFilesInDatasets',str(idMap))) - registerAPI = Register2.Register(self.siteMapper.getSite(self.job.computingSite).ddm) - out = registerAPI.registerFilesInDatasets(idMap) - except DQ2.DQFileExistsInDatasetException: - # hamless error - errType,errValue = sys.exc_info()[:2] - out = '%s : %s' % (errType,errValue) - except (DQ2.DQClosedDatasetException, - DQ2.DQFrozenDatasetException, - DQ2.DQUnknownDatasetException, - DQ2.DQFileMetaDataMismatchException): - # fatal errors - errType,errValue = sys.exc_info()[:2] - out = '%s : %s' % (errType,errValue) - isFatal = True - except: - # unknown errors - errType,errValue = sys.exc_info()[:2] - out = '%s : %s' % (errType,errValue) - for tmpFatalErrStr in fatalErrStrs: - if tmpFatalErrStr in str(errValue): - self.job.ddmErrorDiag = 'failed to add files : ' + tmpFatalErrStr - setErrorDiag = True - break - isFatal = True - regTime = datetime.datetime.utcnow() - regStart - _logger.debug('%s ' % self.jobID + regMsgStr + \ - 'took %s.%03d sec' % (regTime.seconds,regTime.microseconds/1000)) - # failed - if isFailed or isFatal: - _logger.error('%s %s' % (self.jobID,out)) - if (iTry+1) == nTry or isFatal: - self.job.jobStatus = 'failed' - self.job.ddmErrorCode = ErrorCode.EC_Adder - if not setErrorDiag: - errMsg = "Adder._updateOutputs() could not add files : " - 
self.job.ddmErrorDiag = errMsg + out.split('\n')[-1] - return - _logger.error("%s Try:%s" % (self.jobID,iTry)) - # sleep - time.sleep(120) - else: - _logger.debug('%s %s' % (self.jobID,out)) - break - # register dataset subscription - subActivity = 'Production' - if not self.job.prodSourceLabel in ['user']: - # make DQ2 subscription for prod jobs - for tmpName,tmpVal in subMap.iteritems(): - for dq2ID,optSub,optSource in tmpVal: - _logger.debug("%s %s %s %s" % (self.jobID,'registerDatasetSubscription', - (tmpName,dq2ID), - {'version':0,'archived':0,'callbacks':optSub, - 'sources':optSource,'sources_policy':(001000 | 010000), - 'wait_for_sources':0,'destination':None,'query_more_sources':0, - 'sshare':"production",'group':None,'activity':subActivity, - 'acl_alias':None,'replica_lifetime':"14 days"})) - for iDDMTry in range(3): - out = 'OK' - isFailed = False - try: - self.dq2api.registerDatasetSubscription(tmpName,dq2ID,version=0,archived=0,callbacks=optSub, - sources=optSource,sources_policy=(001000 | 010000), - wait_for_sources=0,destination=None,query_more_sources=0, - sshare="production",group=None,activity=subActivity, - acl_alias=None,replica_lifetime="14 days") - except DQ2.DQSubscriptionExistsException: - # harmless error - errType,errValue = sys.exc_info()[:2] - out = '%s : %s' % (errType,errValue) - except: - # unknown errors - errType,errValue = sys.exc_info()[:2] - out = '%s : %s' % (errType,errValue) - isFailed = True - if 'is not a Tiers of Atlas Destination' in str(errValue) or \ - 'is not in Tiers of Atlas' in str(errValue): - # fatal error - self.job.ddmErrorCode = ErrorCode.EC_Subscription - else: - # retry for temporary errors - time.sleep(60) - else: - break - if isFailed: - _logger.error('%s %s' % (self.jobID,out)) - if self.job.ddmErrorCode == ErrorCode.EC_Subscription: - # fatal error - self.job.ddmErrorDiag = "subscription failure with %s" % out - self.job.jobStatus = 'failed' - else: - # temoprary errors - self.job.ddmErrorCode = 
ErrorCode.EC_Adder - self.job.ddmErrorDiag = "Adder._updateOutputs() could not register subscription : %s" % tmpName - return - _logger.debug('%s %s' % (self.jobID,out)) - # set dataset status - self.datasetMap[tmpName].status = 'running' - # keep subscriptions - self.subscriptionMap = subMap - elif not "--mergeOutput" in self.job.jobParameters: - # send request to DaTRI unless files will be merged - tmpTopDatasets = {} - # collect top-level datasets - for tmpName,tmpVal in subMap.iteritems(): - for dq2ID,optSub,optSource in tmpVal: - tmpTopName = re.sub('_sub\d+','',tmpName) - # append - if not tmpTopDatasets.has_key(tmpTopName): - tmpTopDatasets[tmpTopName] = [] - if not dq2ID in tmpTopDatasets[tmpTopName]: - tmpTopDatasets[tmpTopName].append(dq2ID) - # remove redundant CN from DN - tmpDN = self.job.prodUserID - tmpDN = re.sub('/CN=limited proxy','',tmpDN) - tmpDN = re.sub('(/CN=proxy)+$','',tmpDN) - # send request - if tmpTopDatasets != {} and self.jobStatus == 'finished': - try: - from datriHandler import datriHandler - if self.job.lockedby.startswith('Ganga'): - tmpHandler = datriHandler(type='ganga') - else: - tmpHandler = datriHandler(type='pathena') - # loop over all output datasets - for tmpDsName,dq2IDlist in tmpTopDatasets.iteritems(): - for tmpDQ2ID in dq2IDlist: - tmpMsg = "%s %s ds=%s site=%s id=%s" % (self.jobID,'datriHandler.sendRequest', - tmpDsName,tmpDQ2ID,tmpDN) - _logger.debug(tmpMsg) - tmpHandler.setParameters(data_pattern=tmpDsName, - site=tmpDQ2ID, - userid=tmpDN) - # number of retry - nTry = 3 - for iTry in range(nTry): - dhStatus,dhOut = tmpHandler.sendRequest() - # succeeded - if dhStatus == 0 or "such request is exist" in dhOut: - _logger.debug("%s %s %s" % (self.jobID,dhStatus,dhOut)) - break - # faital errors - if "No input data or input data is incorrect" in dhOut: - tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,dhStatus,dhOut) - _logger.error(tmpMsg) - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = 
"DaTRI failed for %s with %s %s" % (tmpDsName,dhStatus,dhOut) - return - # retry - if iTry+1 < nTry: - # sleep - time.sleep(60) - else: - # final attempt failed - tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,dhStatus,dhOut) - _logger.error(tmpMsg) - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = "DaTRI failed for %s with %s %s" % (tmpDsName,dhStatus,dhOut) - return - # set dataset status - for tmpName,tmpVal in subMap.iteritems(): - self.datasetMap[tmpName].status = 'running' - except: - errType,errValue = sys.exc_info()[:2] - tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,errType,errValue) - _logger.error(tmpMsg) - self.job.ddmErrorCode = ErrorCode.EC_Adder - self.job.ddmErrorDiag = "DaTRI failed with %s %s" % (errType,errValue) - return - # properly finished - _logger.debug("%s addFiles end" % self.jobID) - - - # update shadow dataset - def _updateShadow(self): - # return if PandaDDM is used or non-DQ2 - if self.pandaDDM or self.job.destinationSE == 'local': - return - _logger.debug("%s updateShadow" % self.jobID) - # get shadow DS and contents - shadowList = [] - shadowFiles = [] - for file in self.job.Files: - if file.type == 'output' or file.type == 'log': - # get shadow name - shadowDS = re.sub('_sub\d+$','',file.destinationDBlock) + '_shadow' - if not shadowDS in shadowList: - shadowList.append(shadowDS) - elif file.type == 'input': - # remove skipped files - if file.status in ['skipped']: - continue - # ignore lib.tgz - if re.search('lib\.tgz\.*\d*',file.lfn) != None: - continue - # ignore DBRelease - if re.search('DBRelease',file.lfn) != None: - continue - # ignore when noshadow is set - if file.destinationDBlockToken == 'noshadow': - continue - # fsize - fsize = None - if not file.fsize in ['NULL','',0]: - try: - fsize = long(file.fsize) - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s : %s %s" % (self.jobID,type,value)) - # append - if len(str(file.GUID))==36: - 
shadowFiles.append({'guid' : file.GUID, - 'lfn' : file.lfn, - 'size' : fsize, - 'checksum' : None}) - # create idMap - idMap = {} - for shadowDS in shadowList: - nTry = 3 - findFlag = False - for iTry in range(nTry): - # check if shadow dataset exists - _logger.debug((self.jobID, 'listDatasets',shadowDS,0,True)) - try: - out = self.dq2api.listDatasets(shadowDS,0,True) - if not out.has_key(shadowDS): - _logger.debug("%s shadow %s doesn't exist" % (self.jobID,shadowDS)) - else: - findFlag = True - break - except: - # sleep - time.sleep(120) - # append - if findFlag and shadowFiles != []: - idMap[shadowDS] = shadowFiles - # add data - _logger.debug("%s shadow idMap = %s" % (self.jobID,idMap)) - if idMap == {}: - return - _logger.debug("%s addFilesToShadow start" % self.jobID) - # number of retry - nTry = 3 - for iTry in range(nTry): - # add data to datasets - _logger.debug((self.jobID, 'registerFilesInDatasets',idMap)) - isFailed = False - isFatal = False - out = 'OK' - try: - self.dq2api.registerFilesInDatasets(idMap) - except DQ2.DQFileExistsInDatasetException: - # hamless error - errType,errValue = sys.exc_info()[:2] - out = '%s : %s' % (errType,errValue) - except (DQ2.DQClosedDatasetException, - DQ2.DQFrozenDatasetException, - DQ2.DQUnknownDatasetException, - DQ2.DQFileMetaDataMismatchException): - # fatal errors - errType,errValue = sys.exc_info()[:2] - out = '%s : %s' % (errType,errValue) - isFatal = True - except: - # unknown errors - errType,errValue = sys.exc_info()[:2] - out = '%s : %s' % (errType,errValue) - isFatal = True - # failed - if isFailed or isFatal: - _logger.error('%s %s' % (self.jobID,out)) - if (iTry+1) == nTry or isFatal: - self.job.jobStatus = 'failed' - self.job.ddmErrorCode = ErrorCode.EC_Adder - errMsg = "Adder._updateOutputs() could not add files : " - self.job.ddmErrorDiag = errMsg + out.split('\n')[-1] - return - _logger.error("%s shadow Try:%s" % (self.jobID,iTry)) - # sleep - time.sleep(120) - else: - _logger.debug('%s %s' % 
(self.jobID,out)) - break - _logger.debug("%s addFilesToShadow end" % self.jobID) - - - # use cerntral LFC - def useCentralLFC(self): - tmpSiteSpec = self.siteMapper.getSite(self.job.computingSite) - if not self.addToTopOnly and tmpSiteSpec.lfcregister in ['server']: - return True - return False - - - # remove unmerged files - def _removeUnmerged(self): - _logger.debug("%s removeUnmerged" % self.jobID) - # get input files - inputFileGUIDs = [] - inputFileStr = '' - for file in self.job.Files: - if file.type == 'input': - # remove skipped files - if file.status in ['skipped']: - continue - # ignore lib.tgz - if re.search('lib\.tgz\.*\d*',file.lfn) != None: - continue - # ignore DBRelease - if re.search('DBRelease',file.lfn) != None: - continue - # append - inputFileGUIDs.append(file.GUID) - inputFileStr += '%s,' % file.lfn - # extract parent dataset name - tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters) - # failed - if tmpMatch == None: - _logger.error("%s failed to extract parentDS from params=%s" % (self.jobID,self.job.jobParameters)) - return False - parentDS = tmpMatch.group(1) - # delete - _logger.debug("%s deleteFilesFromDataset %s %s" % (self.jobID,parentDS,inputFileStr[:-1])) - nTry = 3 - for iTry in range(nTry): - # add data to datasets - isFailed = False - isFatal = False - out = 'OK' - try: - self.dq2api.deleteFilesFromDataset(parentDS,inputFileGUIDs) - except (DQ2.DQClosedDatasetException, - DQ2.DQFrozenDatasetException, - DQ2.DQUnknownDatasetException, - DQ2.DQFileMetaDataMismatchException): - # fatal errors - errType,errValue = sys.exc_info()[:2] - out = '%s : %s' % (errType,errValue) - isFatal = True - except: - # unknown errors - errType,errValue = sys.exc_info()[:2] - out = '%s : %s' % (errType,errValue) - isFailed = True - # failed - if isFailed or isFatal: - _logger.error('%s %s' % (self.jobID,out)) - if (iTry+1) == nTry or isFatal: - self.job.jobStatus = 'failed' - self.job.ddmErrorCode = ErrorCode.EC_Adder - errMsg = 
"failed to remove unmerged files : " - self.job.ddmErrorDiag = errMsg + out.split('\n')[-1] - if not isFatal: - # retrun None to retry later - return None - return False - _logger.error("%s removeUnmerged Try:%s" % (self.jobID,iTry)) - # sleep - time.sleep(120) - else: - _logger.debug('%s %s' % (self.jobID,out)) - break - # succeeded - _logger.debug("%s removeUnmerged end" % self.jobID) - return True diff --git a/current/pandaserver/dataservice/AddressFinder.py b/current/pandaserver/dataservice/AddressFinder.py deleted file mode 100644 index c96099bff..000000000 --- a/current/pandaserver/dataservice/AddressFinder.py +++ /dev/null @@ -1,308 +0,0 @@ -import re -import sys -import urllib -import commands - -from config import panda_config -from taskbuffer.OraDBProxy import DBProxy -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('AddressFinder') - -# NG words in email address -_ngWordsInMailAddr = ['support','system','stuff','service','secretariat','club','user'] - - -# insert * -def insertWC(str): - retStr = ".*" - for item in str: - retStr += item - retStr += ".*" - return retStr - - -# clean name -def cleanName(dn): - # extract First Last from DN - dbProxy = DBProxy() - extractedDN = dbProxy.cleanUserID(dn) - # replace -. 
- extractedDN = re.sub('-|\.',' ',extractedDN) - # change to lower - extractedDN = extractedDN.lower() - # remove ATLAS - extractedDN = re.sub('\(*atlas\)*','',extractedDN) - # remove numbers - extractedDN = re.sub('\d*','',extractedDN) - # remove Jr - extractedDN = re.sub(' jr( |$)',' ',extractedDN) - # remove whitespaces - extractedDN = re.sub(' +',' ',extractedDN) - extractedDN = extractedDN.strip() - # return - return extractedDN - - -# get email address using phonebook -def getEmailPhonebook(dn): - _logger.debug('Getting email via phonebook for %s' % dn) - # clean DN - extractedDN = cleanName(dn) - # dump - _logger.debug(extractedDN) - # construct command - for sTry in ['full','full_rev','fullwc','fullwc_rev,', - 'suronly', 'firstonly','suronly_rev','firstonly_rev', - 'email']: - if sTry == 'full': - # try full name - com = '~atlpan/phonebook --firstname "%s" --surname "%s" --all' \ - % (extractedDN.split()[0],extractedDN.split()[-1]) - if sTry == 'full_rev': - # try full name - com = '~atlpan/phonebook --firstname "%s" --surname "%s" --all' \ - % (extractedDN.split()[-1],extractedDN.split()[0]) - elif sTry == 'fullwc': - # try full name with wildcard - com = '~atlpan/phonebook --firstname "*%s*" --surname "*%s*" --all' \ - % (extractedDN.split()[0],extractedDN.split()[-1]) - elif sTry == 'fullwc_rev': - # try full name with wildcard - com = '~atlpan/phonebook --firstname "*%s*" --surname "*%s*" --all' \ - % (extractedDN.split()[-1],extractedDN.split()[0]) - elif sTry == 'suronly': - if len(extractedDN.split()) == 2: - # try surname only - com = '~atlpan/phonebook --surname "%s" --all' \ - % extractedDN.split()[-1] - else: - # try surname with wildcard - com = '~atlpan/phonebook --surname "*%s*" --all' \ - % extractedDN.split()[-1] - elif sTry == 'suronly_rev': - if len(extractedDN.split()) == 2: - # try surname only - com = '~atlpan/phonebook --surname "%s" --all' \ - % extractedDN.split()[0] - else: - # try surname with wildcard - com = '~atlpan/phonebook 
--surname "*%s*" --all' \ - % extractedDN.split()[0] - elif sTry == 'firstonly': - if len(extractedDN.split()) == 2: - # try firstname only - com = '~atlpan/phonebook --firstname "%s" --all' \ - % extractedDN.split()[0] - else: - # try firstname with wildcard - com = '~atlpan/phonebook --firstname "*%s*" --all' \ - % extractedDN.split()[0] - elif sTry == 'firstonly_rev': - if len(extractedDN.split()) == 2: - # try firstname only - com = '~atlpan/phonebook --firstname "%s" --all' \ - % extractedDN.split()[-1] - else: - # try firstname with wildcard - com = '~atlpan/phonebook --firstname "*%s*" --all' \ - % extractedDN.split()[-1] - elif sTry == 'email': - # try email - mailPatt = re.sub(' +','*',extractedDN) - com = '~atlpan/phonebook --email "*%s*" --all' \ - % mailPatt - _logger.debug(com) - # execute - sStat,sOut = commands.getstatusoutput(com) - _logger.debug(sOut) - # failed - if sStat != 0: - _logger.debug('phonebook failed with %s' % sStat) - return [] - # extract email - emails = [] - groups = [] - dnames = [] - for line in sOut.split('\n'): - if line.startswith('E-mail:'): - # append - tmpStr = line.split()[-1] - emails.append(tmpStr) - elif line.startswith('Group:'): - # append - tmpStr = line.split()[-1] - groups.append(tmpStr) - elif line.startswith('Display Name:'): - # append - tmpStr = re.sub('^[^:]+:','',line).strip() - dnames.append(tmpStr) - # check groups - newGroups = [] - newEmails = [] - newDNames = [] - for idx,group in enumerate(groups): - if group.startswith('A') or group in ['UAT','GS','-']: - newGroups.append(group) - newEmails.append(emails[idx]) - newDNames.append(dnames[idx]) - # replace - groups = newGroups - emails = newEmails - dnames = newDNames - # check dname - if len(emails) > 1 and len(emails) == len(dnames): - newGroups = [] - newEmails = [] - newDNames = [] - newGroupsWC = [] - newEmailsWC = [] - newDNamesWC = [] - for idx,dname in enumerate(dnames): - # check fragments - nameItems = extractedDN.split() - nMatch = 0 - nMatchWC 
= 0 - for nameItem in nameItems: - # check w/o wildcard - if re.search(nameItem,dname,re.I) != None: - nMatch += 1 - # check with wildcard - if re.search(insertWC(nameItem),dname,re.I) != None: - nMatchWC += 1 - # append if totally matched or partially matched ignoring middle-name etc - if len(nameItems) == nMatch or (len(nameItems) > 2 and (len(nameItems)-nMatch) < 2): - newGroups.append(groups[idx]) - newEmails.append(emails[idx]) - newDNames.append(dname) - # append if matched with wildcard - if len(nameItems) == nMatchWC or (len(nameItems) > 2 and (len(nameItems)-nMatchWC) < 2): - newGroupsWC.append(groups[idx]) - newEmailsWC.append(emails[idx]) - newDNamesWC.append(dname) - # replace - if len(newGroups)>0: - # use strict matching - groups = newGroups - emails = newEmails - dnames = newDNames - else: - # use loose matching - groups = newGroupsWC - emails = newEmailsWC - dnames = newDNamesWC - _logger.debug('emails=%s' % str(emails)) - # return - if len(emails) == 1: - _logger.debug('Succeeded %s %s' % (groups[0],emails[0])) - return emails - # failed - _logger.error('Failed for %s' % dn) - return [] - - -# get email address using xwho -def getEmailXwho(dn): - # get email from CERN/xwho - _logger.debug('Getting email via xwho for %s' % dn) - for sTry in ['full','firstlastonly']: - try: - # remove middle name - encodedDN = cleanName(dn) - encodedDN = re.sub(' . 
',' ',encodedDN) - # remove _ - encodedDN = encodedDN.replace('_',' ') - # use fist and lastnames only - if sTry == 'firstlastonly': - newEncodedDN = '%s %s' % (encodedDN.split()[0],encodedDN.split()[-1]) - # skip if it was already tried - if encodedDN == newEncodedDN: - continue - encodedDN = newEncodedDN - # URL encode - encodedDN = encodedDN.replace(' ','%20') - url = 'http://consult.cern.ch/xwho?'+encodedDN - if panda_config.httpProxy != '': - proxies = proxies={'http': panda_config.httpProxy} - else: - proxies = proxies={} - opener = urllib.FancyURLopener(proxies) - fd=opener.open(url) - data = fd.read() - if re.search(' not found',data,re.I) == None: - break - except: - type, value, traceBack = sys.exc_info() - _logger.error("xwho failure with %s %s" % (type,value)) - return [] - # parse HTML - emails = [] - headerItem = ["Family Name","First Name","Phone","Dep"] - findTable = False - _logger.debug(data) - for line in data.split('\n'): - # look for table - if not findTable: - # look for header - tmpFlag = True - for item in headerItem: - if re.search(item,line) == None: - tmpFlag = False - break - findTable = tmpFlag - continue - else: - # end of table - if re.search(item,"") != None: - findTable = False - continue - # look for link to individual page - match = re.search('href="(/xwho/people/\d+)"',line) - if match == None: - continue - link = match.group(1) - try: - url = 'http://consult.cern.ch'+link - if panda_config.httpProxy != '': - proxies = proxies={'http': panda_config.httpProxy} - else: - proxies = proxies={} - opener = urllib.FancyURLopener(proxies) - fd=opener.open(url) - data = fd.read() - _logger.debug(data) - except: - type, value, traceBack = sys.exc_info() - _logger.error("xwho failure with %s %s" % (type,value)) - return [] - # get mail adder - match = re.search("mailto:([^@]+@[^>]+)>",data) - if match != None: - adder = match.group(1) - # check NG words - okAddr = True - for ngWord in _ngWordsInMailAddr: - if re.search(ngWord,adder,re.I): - 
_logger.error("%s has NG word:%s" % (adder,ngWord)) - okAddr = False - break - if okAddr and (not adder in emails): - emails.append(adder) - _logger.debug("emails from xwho : '%s'" % emails) - # return - if len(emails) == 1: - _logger.debug('Succeeded : %s %s' % (str(emails),dn)) - return emails - # multiple candidates - if len(emails) > 1: - _logger.error("non unique address : %s for %s" % (str(emails),dn)) - return [] - # failed - _logger.error('Failed to find address for %s' % dn) - return [] - - - - - diff --git a/current/pandaserver/dataservice/Closer.py b/current/pandaserver/dataservice/Closer.py deleted file mode 100755 index 8301945d3..000000000 --- a/current/pandaserver/dataservice/Closer.py +++ /dev/null @@ -1,290 +0,0 @@ -''' -update dataset DB, and then close dataset and start Activator if needed - -''' - -import re -import sys -import time -import urllib -import commands -import threading -from DDM import ddm -import Notifier -import RetryMaker -from Activator import Activator -from pandalogger.PandaLogger import PandaLogger -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec -from taskbuffer.DatasetSpec import DatasetSpec -from brokerage.SiteMapper import SiteMapper -from config import panda_config -import brokerage.broker_util - -# logger -_logger = PandaLogger().getLogger('Closer') - -def initLogger(pLogger): - # redirect logging to parent as it doesn't work in nested threads - global _logger - _logger = pLogger - Notifier.initLogger(_logger) - RetryMaker.initLogger(_logger) - - -class Closer (threading.Thread): - # constructor - def __init__(self,taskBuffer,destinationDBlocks,job,pandaDDM=False,datasetMap={}): - threading.Thread.__init__(self) - self.taskBuffer = taskBuffer - self.destinationDBlocks = destinationDBlocks - self.job = job - self.pandaID = job.PandaID - self.pandaDDM = pandaDDM - self.siteMapper = None - self.datasetMap = datasetMap - - - # main - def run(self): - try: - _logger.debug('%s Start %s' % 
(self.pandaID,self.job.jobStatus)) - flagComplete = True - ddmJobs = [] - topUserDsList = [] - usingMerger = False - disableNotifier = False - firstIndvDS = True - for destinationDBlock in self.destinationDBlocks: - dsList = [] - _logger.debug('%s start %s' % (self.pandaID,destinationDBlock)) - # ignore tid datasets - if re.search('_tid[\d_]+$',destinationDBlock): - _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock)) - continue - # query dataset - if self.datasetMap.has_key(destinationDBlock): - dataset = self.datasetMap[destinationDBlock] - else: - dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock}) - if dataset == None: - _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock)) - flagComplete = False - continue - # skip tobedeleted/tobeclosed - if dataset.status in ['cleanup','tobeclosed','completed']: - _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status)) - continue - dsList.append(dataset) - # sort - dsList.sort() - # count number of completed files - notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock, - 'status':'unknown'}) - if notFinish < 0: - _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish)) - flagComplete = False - continue - # check if completed - _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish)) - if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']: - # close non-DQ2 destinationDBlock immediately - finalStatus = 'closed' - elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \ - and self.job.processingType != 'usermerge': - # merge output files - if firstIndvDS: - # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS - finalStatus = 'tobemerged' - firstIndvDS = False - else: - finalStatus = 'tobeclosed' - # set merging to top dataset - usingMerger = True - # disable Notifier - disableNotifier = True 
- else: - # set status to 'tobeclosed' to trigger DQ2 closing - finalStatus = 'tobeclosed' - if notFinish==0: - _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock)) - # set status - dataset.status = finalStatus - # update dataset in DB - retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ", - criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'}) - if len(retT) > 0 and retT[0]==1: - # close user datasets - if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \ - and (dataset.name.startswith('user') or dataset.name.startswith('group')): - # get top-level user dataset - topUserDsName = re.sub('_sub\d+$','',dataset.name) - # update if it is the first attempt - if topUserDsName != dataset.name and not topUserDsName in topUserDsList: - topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName}) - if topUserDs != None: - # check status - if topUserDs.status in ['completed','cleanup','tobeclosed', - 'tobemerged','merging']: - _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status)) - else: - # set status - if self.job.processingType.startswith('gangarobot') or \ - self.job.processingType.startswith('hammercloud'): - # not trigger freezing for HC datasets so that files can be appended - topUserDs.status = 'completed' - elif not usingMerger: - topUserDs.status = finalStatus - else: - topUserDs.status = 'merging' - # append to avoid repetition - topUserDsList.append(topUserDsName) - # update DB - retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus", - criteriaMap={':crStatus':topUserDs.status}) - if len(retTopT) > 0 and retTopT[0]==1: - _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName)) - else: - _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName)) - # get parent 
dataset for merge job - if self.job.processingType == 'usermerge': - tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters) - if tmpMatch == None: - _logger.error('%s failed to extract parentDS' % self.pandaID) - else: - unmergedDsName = tmpMatch.group(1) - # update if it is the first attempt - if not unmergedDsName in topUserDsList: - unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName}) - if unmergedDs == None: - _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName)) - else: - # check status - if unmergedDs.status in ['completed','cleanup','tobeclosed']: - _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status)) - else: - # set status - unmergedDs.status = finalStatus - # append to avoid repetition - topUserDsList.append(unmergedDsName) - # update DB - retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus", - criteriaMap={':crStatus':unmergedDs.status}) - if len(retTopT) > 0 and retTopT[0]==1: - _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName)) - else: - _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName)) - if self.pandaDDM and self.job.prodSourceLabel=='managed': - # instantiate SiteMapper - if self.siteMapper == None: - self.siteMapper = SiteMapper(self.taskBuffer) - # get file list for PandaDDM - retList = self.taskBuffer.queryFilesWithMap({'destinationDBlock':destinationDBlock}) - lfnsStr = '' - guidStr = '' - for tmpFile in retList: - if tmpFile.type in ['log','output']: - lfnsStr += '%s,' % tmpFile.lfn - guidStr += '%s,' % tmpFile.GUID - if lfnsStr != '': - guidStr = guidStr[:-1] - lfnsStr = lfnsStr[:-1] - # create a DDM job - ddmjob = JobSpec() - ddmjob.jobDefinitionID = int(time.time()) % 10000 - ddmjob.jobName = "%s" % commands.getoutput('uuidgen') - ddmjob.transformation = 
'http://pandaserver.cern.ch:25080/trf/mover/run_dq2_cr' - ddmjob.destinationDBlock = 'testpanda.%s' % ddmjob.jobName - ddmjob.computingSite = "BNL_ATLAS_DDM" - ddmjob.destinationSE = ddmjob.computingSite - ddmjob.currentPriority = 200000 - ddmjob.prodSourceLabel = 'ddm' - ddmjob.transferType = 'sub' - # append log file - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % ddmjob.jobName - fileOL.destinationDBlock = ddmjob.destinationDBlock - fileOL.destinationSE = ddmjob.destinationSE - fileOL.dataset = ddmjob.destinationDBlock - fileOL.type = 'log' - ddmjob.addFile(fileOL) - # make arguments - dstDQ2ID = 'BNLPANDA' - srcDQ2ID = self.siteMapper.getSite(self.job.computingSite).ddm - callBackURL = 'https://%s:%s/server/panda/datasetCompleted?vuid=%s&site=%s' % \ - (panda_config.pserverhost,panda_config.pserverport, - dataset.vuid,dstDQ2ID) - _logger.debug(callBackURL) - # set src/dest - ddmjob.sourceSite = srcDQ2ID - ddmjob.destinationSite = dstDQ2ID - # if src==dst, send callback without ddm job - if dstDQ2ID == srcDQ2ID: - comout = commands.getoutput('curl -k %s' % callBackURL) - _logger.debug(comout) - else: - # run dq2_cr - callBackURL = urllib.quote(callBackURL) - # get destination dir - destDir = brokerage.broker_util._getDefaultStorage(self.siteMapper.getSite(self.job.computingSite).dq2url) - argStr = "-s %s -r %s --guids %s --lfns %s --callBack %s -d %s/%s %s" % \ - (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,callBackURL,destDir, - destinationDBlock,destinationDBlock) - # set job parameters - ddmjob.jobParameters = argStr - _logger.debug('%s pdq2_cr %s' % (self.pandaID,ddmjob.jobParameters)) - ddmJobs.append(ddmjob) - # start Activator - if re.search('_sub\d+$',dataset.name) == None: - if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']: - # don't trigger Activator for merge jobs - pass - else: - if self.job.jobStatus == 'finished': - aThr = Activator(self.taskBuffer,dataset) - aThr.start() - aThr.join() - else: - # unset flag 
since another thread already updated - flagComplete = False - else: - # update dataset in DB - self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ", - criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'}) - # unset flag - flagComplete = False - # end - _logger.debug('%s end %s' % (self.pandaID,destinationDBlock)) - # start DDM jobs - if ddmJobs != []: - self.taskBuffer.storeJobs(ddmJobs,self.job.prodUserID,joinThr=True) - # change pending jobs to failed - if flagComplete and self.job.prodSourceLabel=='user': - #_logger.debug('%s call RetryMaker for %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID)) - #retryMaker = RetryMaker.RetryMaker(self.taskBuffer,self.job) - #retryMaker.run() - _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID)) - self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID) - # start notifier - _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete)) - if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \ - (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')): - # don't send email for merge jobs - if (not disableNotifier) and not self.job.processingType in ['merge','unmerge']: - useNotifier = True - summaryInfo = {} - # check all jobDefIDs in jobsetID - if not self.job.jobsetID in [0,None,'NULL']: - useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID, - self.job.prodUserName) - _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier)) - if useNotifier: - _logger.debug('%s start Notifier' % self.pandaID) - nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo) - nThr.run() - _logger.debug('%s end Notifier' % self.pandaID) - _logger.debug('%s End' % self.pandaID) - except: - 
errType,errValue = sys.exc_info()[:2] - _logger.error("%s %s" % (errType,errValue)) - diff --git a/current/pandaserver/dataservice/DDM.py b/current/pandaserver/dataservice/DDM.py deleted file mode 100755 index 5888a36b3..000000000 --- a/current/pandaserver/dataservice/DDM.py +++ /dev/null @@ -1,344 +0,0 @@ -""" -provide primitive methods for DDM - -""" - -import sys -import types -import commands -from config import panda_config - - -# change cwd -_cwd = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) - -# environment variables -_env = 'PATH=%s:%s:$PATH ' % (panda_config.native_python,panda_config.globus_dir+'/bin') -_env+= 'LD_LIBRARY_PATH=%s ' % (panda_config.globus_dir+'/lib') -_env+= 'DQ2_HOME=%s/opt/dq2 ' % panda_config.dq2_dir -_env+= 'http_proxy=%s ' % panda_config.httpProxy -_env+= 'https_proxy=%s ' % panda_config.httpProxy - -_env+= 'PYTHONPATH=%s/usr/lib/python2.3/site-packages:$PYTHONPATH' \ - % panda_config.dq2_dir - -# method object wrapping DQ2 method -class _DQMethod: - # constructor - def __init__(self,moduleName,methodName): - self.moduleName = moduleName - self.methodName = methodName - - # method emulation - def __call__(self,*args,**kwargs): - # main method has disappeared since 0.3 - args = list(args) - if self.methodName == 'main': - self.methodName = args[0] - args.pop(0) - # build command - com = 'import dq2.clientapi.cli.cliutil; ' - #com += 'import sys; sys.tracebacklimit=0; ' - com += 'dq2api = dq2.clientapi.cli.cliutil.getDQ2(None); ' - if self.moduleName == 'DQ2': - # DQ2 is top-level module - com += 'print dq2api.%s(' % self.methodName - elif self.moduleName == 'DQ2_iter': - # iterator - com += 'iter = dq2api.%s(' % self.methodName - else: - com += 'print dq2api.%s.%s(' % (self.moduleName,self.methodName) - # expand args - for i in range(len(args)): - arg = args[i] - if isinstance(arg,types.StringType): - # check invalid characters - for invCh in ['"',"'",'(',')',';']: - if invCh in 
arg: - return -1,"invalid character %s in %s" % (invCh,arg) - com = "%s'%s'," % (com,arg) - else: - com = '%s%s,' % (com,str(arg)) - for tmpK,tmpV in kwargs.iteritems(): - if isinstance(tmpV,types.StringType): - com += "%s='%s'," % (tmpK,tmpV) - else: - com += "%s=%s," % (tmpK,tmpV) - com = com[:-1] - com += ")" - # loop over iterator - if self.moduleName == 'DQ2_iter': - com += ";exec 'for item in iter:print item'" - # execute - return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com)) - - -# DQ module class -class _DQModule: - # constructor - def __init__(self,moduleName): - self.moduleName = moduleName - - # factory method - def __getattr__(self,methodName): - return _DQMethod(self.moduleName,methodName) - - -# native DQ2 method class -class NativeDQ2Method: - # constructor - def __init__(self): - self.moduleName = None - self.methodName = None - # set module and method name - def setNames(self,moduleName,methodName): - self.moduleName = moduleName - self.methodName = methodName - # method emulation - def __call__(self,*args,**kwargs): - try: - # make dq2api locally since global dq2 object is not thread-safe - import dq2.clientapi.cli.cliutil - dq2api = dq2.clientapi.cli.cliutil.getDQ2(None) - # main method has disappeared since 0.3 - args = list(args) - if self.methodName == 'main': - self.methodName = args[0] - args.pop(0) - # get method object - if self.moduleName in ['DQ2','DQ2_iter']: - methodObj = getattr(dq2api,self.methodName) - else: - methodObj = getattr(getattr(dq2api,self.moduleName),self.methodName) - # execute - retVal = apply(methodObj,args,kwargs) - # loop over for iterator - if self.moduleName == 'DQ2_iter': - strRet = '' - for item in retVal: - strRet += str(item) - else: - strRet = str(retVal) - # return - return 0,strRet - except: - errType,errVale = sys.exc_info()[:2] - return 1,'%s %s' % (errType,errVale) - - - -# native DQ2 module class -class NativeDQ2Module: - # constructor - def __init__(self): - self.moduleName = 
None - # set module name - def setModName(self,moduleName): - self.moduleName = moduleName - # getter - def __getattr__(self,methodName): - # set method name - api = NativeDQ2Method() - api.setNames(self.moduleName,methodName) - return api - - -# factory class -class DDM: - # constructor - def __init__(self): - self.usingNativeDQ2 = False - # switch to use DQ2 in the same session - def useDirectDQ2(self): - self.usingNativeDQ2 = True - # getter - def __getattr__(self,moduleName): - if not self.usingNativeDQ2: - # run dq2 comamnd in another session - return _DQModule(moduleName) - else: - # run dq2 command in the same session - nativeDQ2 = NativeDQ2Module() - nativeDQ2.setModName(moduleName) - return nativeDQ2 - -# instantiate -ddm = DDM() -del DDM - - -# method object wrapping TOA method -class _TOAMethod: - # constructor - def __init__(self,methodName): - self.methodName = methodName - - # method emulation - def __call__(self,*args): - args = list(args) - # build command - com = 'from dq2.info import TiersOfATLAS; ' - com += 'print TiersOfATLAS.%s(' % self.methodName - # expand args - for i in range(len(args)): - arg = args[i] - if isinstance(arg,types.StringType): - com += "'%s'," % arg - else: - com = '%s,' % arg - com = com[:-1] - com += ")" - # execute - return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com)) - - -# native ToA method class -class NativeTOAMethod: - # constructor - def __init__(self): - self.methodName = None - from dq2.info import TiersOfATLAS - self.api = TiersOfATLAS - # set method name - def setName(self,methodName): - self.methodName = methodName - # method emulation - def __call__(self,*args,**kwargs): - try: - methodObj = getattr(self.api,self.methodName) - # execute - retVal = apply(methodObj,args,kwargs) - strRet = str(retVal) - # return - return 0,strRet - except: - errType,errVale = sys.exc_info()[:2] - return 1,'%s %s' % (errType,errVale) - - -# TOA module class -class TOA: - # constructor - def __init__(self): 
- self.usingNativeDQ2 = False - self.nativeTOA = None - # getter - def __getattr__(self,methodName): - if not ddm.usingNativeDQ2: - # run dq2 comamnd in another session - return _TOAMethod(methodName) - else: - # make method object - if self.nativeTOA == None: - self.nativeTOA = NativeTOAMethod() - # run dq2 command in the same session - self.nativeTOA.setName(methodName) - return self.nativeTOA - - - -# instantiate -toa = TOA() -del TOA - - -# method object wrapping Dashboard method -class _DashBoradMethod: - # constructor - def __init__(self,methodName): - self.methodName = methodName - - # method emulation - def __call__(self,*args): - args = list(args) - # build command - com = "import sys;sys.stderr=open('/dev/null','w');" - com += "import datetime;from dashboard.api.data.DataQuery import DataQuery;" - com += "sys.stderr=sys.__stderr__;" - com += "dash=DataQuery('dashb-atlas-data.cern.ch', 80);" - com += "print dash.%s(%s,'%s'," % (self.methodName,args[0],args[1]) - com += "startDate=datetime.datetime.utcnow()-datetime.timedelta(hours=24))" - # execute - return commands.getstatusoutput('%s python -c "%s"' % (_cwd,com)) - - -# TOA module class -class DashBorad: - def __getattr__(self,methodName): - return _DashBoradMethod(methodName) - -# instantiate -dashBorad = DashBorad() -del DashBorad - - -# method object wrapping DQ2Info method -class _DQ2InfoMethod: - # constructor - def __init__(self,methodName): - self.methodName = methodName - - # method emulation - def __call__(self,*args): - args = list(args) - # build command - com = 'from dq2.info.client.infoClient import infoClient; ' - com += 'print infoClient().%s(' % self.methodName - # expand args - for i in range(len(args)): - arg = args[i] - if isinstance(arg,types.StringType): - com += "'%s'," % arg - else: - com = '%s,' % arg - com = com[:-1] - com += ")" - # execute - return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com)) - - -# TOA module class -class DQ2Info: - def 
__getattr__(self,methodName): - return _DQ2InfoMethod(methodName) - - -# instantiate -dq2Info = DQ2Info() -del DQ2Info - - -# method object wrapping dq2 common -class _DQ2CommonMethod: - # constructor - def __init__(self,methodName): - self.methodName = methodName - - # method emulation - def __call__(self,*args): - args = list(args) - # build command - com = 'from dq2.common import %s; ' % self.methodName - com += 'print %s(' % self.methodName - # expand args - for i in range(len(args)): - arg = args[i] - if isinstance(arg,types.StringType): - com += "'%s'," % arg - else: - com = '%s,' % arg - com = com[:-1] - com += ")" - # execute - return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com)) - - -# TOA module class -class DQ2Common: - def __getattr__(self,methodName): - return _DQ2CommonMethod(methodName) - - -# instantiate -dq2Common = DQ2Common() -del DQ2Common diff --git a/current/pandaserver/dataservice/DDMHandler.py b/current/pandaserver/dataservice/DDMHandler.py deleted file mode 100755 index 165738c8e..000000000 --- a/current/pandaserver/dataservice/DDMHandler.py +++ /dev/null @@ -1,48 +0,0 @@ -''' -master hander for DDM - -''' - -import re -import threading - -from Waker import Waker -from Finisher import Finisher -from Activator import Activator - -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('DDMHandler') - - -class DDMHandler (threading.Thread): - # constructor - def __init__(self,taskBuffer,vuid,site=None): - threading.Thread.__init__(self) - self.vuid = vuid - self.taskBuffer = taskBuffer - self.site = site - - - # main - def run(self): - # query dataset - _logger.debug("start: %s %s" % (self.vuid,self.site)) - dataset = self.taskBuffer.queryDatasetWithMap({'vuid':self.vuid}) - if dataset == None: - _logger.error("Not found : %s" % self.vuid) - _logger.debug("end: %s" % self.vuid) - return - _logger.debug("vuid:%s type:%s name:%s" % (self.vuid,dataset.type,dataset.name)) - if 
dataset.type == 'dispatch': - # activate jobs in jobsDefined - Activator(self.taskBuffer,dataset).start() - if dataset.type == 'output': - if dataset.name != None and re.search('^panda\..*_zip$',dataset.name) != None: - # start unmerge jobs - Activator(self.taskBuffer,dataset,enforce=True).start() - else: - # finish transferring jobs - Finisher(self.taskBuffer,dataset,site=self.site).start() - _logger.debug("end: %s" % self.vuid) diff --git a/current/pandaserver/dataservice/DataService.py b/current/pandaserver/dataservice/DataService.py deleted file mode 100755 index 540987e1a..000000000 --- a/current/pandaserver/dataservice/DataService.py +++ /dev/null @@ -1,99 +0,0 @@ -""" -provide web service for DDM - -""" - -import re -import sys -import cPickle as pickle -from config import panda_config -from taskbuffer.WrappedPickle import WrappedPickle -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('DataService') - - -class DataService: - # constructor - def __init__(self): - self.taskBuffer = None - - # set taskbuffer - def init(self,taskBuffer): - self.taskBuffer = taskBuffer - -# Singleton -dataService = DataService() -del DataService - - -''' -web interface - -''' - -from DDMHandler import DDMHandler - - -# callback for dataset verification -def datasetCompleted(req,vuid,site=None): - thr = DDMHandler(dataService.taskBuffer,vuid,site) - thr.start() - thr.join() - return True - - -# get FQANs -def _getFQAN(req): - fqans = [] - for tmpKey,tmpVal in req.subprocess_env.iteritems(): - # compact credentials - if tmpKey.startswith('GRST_CRED_'): - # VOMS attribute - if tmpVal.startswith('VOMS'): - # FQAN - fqan = tmpVal.split()[-1] - # append - fqans.append(fqan) - # old style - elif tmpKey.startswith('GRST_CONN_'): - tmpItems = tmpVal.split(':') - # FQAN - if len(tmpItems)==2 and tmpItems[0]=='fqan': - fqans.append(tmpItems[-1]) - # return - return fqans - - -# set file status -def 
updateFileStatusInDisp(req,dataset,fileStatus): - try: - # get FQAN - fqans = _getFQAN(req) - roleOK = False - # loop over all FQANs - for fqan in fqans: - # check production role - for rolePat in ['/atlas/usatlas/Role=production', - '/atlas/Role=production', - # use /atlas since delegation proxy doesn't inherit roles - '/atlas/']: - if fqan.startswith(rolePat): - roleOK = True - break - if not roleOK: - _logger.error('updateFileStatusInDisp : invalid proxy %s' % fqans) - return "False" - # deserialize fileStatus - fileStatusMap = WrappedPickle.loads(fileStatus) - _logger.debug('updateFileStatusInDisp : start %s - %s' % (dataset,fileStatusMap)) - # update status - dataService.taskBuffer.updateFileStatusInDisp(dataset,fileStatusMap) - _logger.debug('updateFileStatusInDisp : done') - return "True" - except: - type,value,traceBack = sys.exc_info() - _logger.error("updateFileStatusInDisp : %s %s" % (type,value)) - return "False" - diff --git a/current/pandaserver/dataservice/DataServiceUtils.py b/current/pandaserver/dataservice/DataServiceUtils.py deleted file mode 100644 index 0e4093cbb..000000000 --- a/current/pandaserver/dataservice/DataServiceUtils.py +++ /dev/null @@ -1,281 +0,0 @@ -import re -import sys - -# get prefix for DQ2 -def getDQ2Prefix(dq2SiteID): - # prefix of DQ2 ID - tmpDQ2IDPrefix = re.sub('_[A-Z,0-9]+DISK$','',dq2SiteID) - # remove whitespace - tmpDQ2IDPrefix = tmpDQ2IDPrefix.strip() - # patchfor MWT2 - if tmpDQ2IDPrefix == 'MWT2_UC': - tmpDQ2IDPrefix = 'MWT2' - return tmpDQ2IDPrefix - - -# check if the file is cached -def isCachedFile(datasetName,siteSpec): - # using CVMFS - if siteSpec.iscvmfs != True: - return False - # FIXME - if not siteSpec.cloud in ['IT']: - return False - # look for DBR - if not datasetName.startswith('ddo.'): - return False - # look for three digits - if re.search('v\d{6}$',datasetName) == None: - return False - return True - - -# get the list of sites where dataset is available -def 
getSitesWithDataset(tmpDsName,siteMapper,replicaMap,cloudKey,useHomeCloud=False,getDQ2ID=False, - useOnlineSite=False,includeT1=False): - retList = [] - retDQ2Map = {} - # no replica map - if not replicaMap.has_key(tmpDsName): - if getDQ2ID: - return retDQ2Map - return retList - # use valid cloud - if not siteMapper.checkCloud(cloudKey): - if getDQ2ID: - return retDQ2Map - return retList - # check sites in the cloud - for tmpSiteName in siteMapper.getCloud(cloudKey)['sites']: - # skip T1 - if not includeT1: - # T1 - if tmpSiteName == siteMapper.getCloud(cloudKey)['source']: - continue - # hospital queue - if siteMapper.getSite(tmpSiteName).ddm == siteMapper.getSite(siteMapper.getCloud(cloudKey)['source']).ddm: - continue - # use home cloud - if useHomeCloud: - if siteMapper.getSite(tmpSiteName).cloud != cloudKey: - continue - # online - if siteMapper.getSite(tmpSiteName).status != 'online': - continue - # check all associated DQ2 IDs - tmpFoundFlag = False - tmpSiteSpec = siteMapper.getSite(tmpSiteName) - for tmpSiteDQ2ID in [tmpSiteSpec.ddm]+tmpSiteSpec.setokens.values(): - # prefix of DQ2 ID - tmpDQ2IDPrefix = getDQ2Prefix(tmpSiteDQ2ID) - # ignore empty - if tmpDQ2IDPrefix == '': - continue - # loop over all replica DQ2 IDs - for tmpDQ2ID in replicaMap[tmpDsName].keys(): - # use DATADISK or GROUPDISK - if '_SCRATCHDISK' in tmpDQ2ID or \ - '_USERDISK' in tmpDQ2ID or \ - '_PRODDISK' in tmpDQ2ID or \ - '_LOCALGROUPDISK' in tmpDQ2ID or \ - 'TAPE' in tmpDQ2ID or \ - '_DAQ' in tmpDQ2ID or \ - '_TMPDISK' in tmpDQ2ID or \ - '_TZERO' in tmpDQ2ID: - continue - # check DQ2 prefix - if tmpDQ2ID.startswith(tmpDQ2IDPrefix): - tmpFoundFlag = True - if not getDQ2ID: - break - # append map - if not retDQ2Map.has_key(tmpSiteName): - retDQ2Map[tmpSiteName] = [] - if not tmpDQ2ID in retDQ2Map[tmpSiteName]: - retDQ2Map[tmpSiteName].append(tmpDQ2ID) - # append - if tmpFoundFlag: - retList.append(tmpSiteName) - # return map - if getDQ2ID: - return retDQ2Map - # retrun - return retList 
- - -# get the number of files available at the site -def getNumAvailableFilesSite(siteName,siteMapper,replicaMap,badMetaMap,additionalSEs=[], - noCheck=[],fileCounts=None): - try: - # get DQ2 endpoints - tmpSiteSpec = siteMapper.getSite(siteName) - prefixList = [] - for tmpSiteDQ2ID in [tmpSiteSpec.ddm]+tmpSiteSpec.setokens.values(): - # prefix of DQ2 ID - tmpDQ2IDPrefix = getDQ2Prefix(tmpSiteDQ2ID) - # ignore empty - if tmpDQ2IDPrefix != '': - prefixList.append(tmpDQ2IDPrefix) - # loop over datasets - totalNum = 0 - for tmpDsName,tmpSitesData in replicaMap.iteritems(): - # cached files - if isCachedFile(tmpDsName,tmpSiteSpec) and fileCounts != None and \ - fileCounts.has_key(tmpDsName): - # add with no check - totalNum += fileCounts[tmpDsName] - continue - # dataset type - datasetType = getDatasetType(tmpDsName) - # use total num to effectively skip file availability check - if datasetType in noCheck: - columnName = 'total' - else: - columnName = 'found' - # get num of files - maxNumFile = 0 - # for T1 or T2 - if additionalSEs != []: - # check T1 endpoints - for tmpSePat in additionalSEs: - # ignore empty - if tmpSePat == '': - continue - # make regexp pattern - if '*' in tmpSePat: - tmpSePat = tmpSePat.replace('*','.*') - tmpSePat = '^' + tmpSePat +'$' - # loop over all sites - for tmpSE in tmpSitesData.keys(): - # skip bad metadata - if badMetaMap.has_key(tmpDsName) and tmpSE in badMetaMap[tmpDsName]: - continue - # check match - if re.search(tmpSePat,tmpSE) == None: - continue - # get max num of files - tmpN = tmpSitesData[tmpSE][0][columnName] - if tmpN != None and tmpN > maxNumFile: - maxNumFile = tmpN - else: - # check explicit endpoint name - for tmpSiteDQ2ID in [tmpSiteSpec.ddm]+tmpSiteSpec.setokens.values(): - # skip bad metadata - if badMetaMap.has_key(tmpDsName) and tmpSiteDQ2ID in badMetaMap[tmpDsName]: - continue - # ignore empty - if tmpSiteDQ2ID == '': - continue - # get max num of files - if tmpSitesData.has_key(tmpSiteDQ2ID): - tmpN = 
tmpSitesData[tmpSiteDQ2ID][0][columnName] - if tmpN != None and tmpN > maxNumFile: - maxNumFile = tmpN - # check prefix - for tmpDQ2IDPrefix in prefixList: - for tmpDQ2ID,tmpStat in tmpSitesData.iteritems(): - # skip bad metadata - if badMetaMap.has_key(tmpDsName) and tmpDQ2ID in badMetaMap[tmpDsName]: - continue - # ignore NG - if '_SCRATCHDISK' in tmpDQ2ID or \ - '_USERDISK' in tmpDQ2ID or \ - '_PRODDISK' in tmpDQ2ID or \ - '_LOCALGROUPDISK' in tmpDQ2ID or \ - '_DAQ' in tmpDQ2ID or \ - '_TMPDISK' in tmpDQ2ID or \ - '_TZERO' in tmpDQ2ID: - continue - # check prefix - if tmpDQ2ID.startswith(tmpDQ2IDPrefix): - tmpN = tmpSitesData[tmpDQ2ID][0][columnName] - if tmpN != None and tmpN > maxNumFile: - maxNumFile = tmpN - # sum - totalNum += maxNumFile - # return - return True,totalNum - except: - errtype,errvalue = sys.exc_info()[:2] - return False,'%s:%s' % (errtype,errvalue) - - -# get the list of sites where dataset is available -def getEndpointsAtT1(tmpRepMap,siteMapper,cloudName): - retList = [] - # get cloud SEs - tmpCloud = siteMapper.getCloud(cloudName) - cloudSEs = tmpCloud['tier1SE'] - # check T1 endpoints - for tmpSePat in cloudSEs: - # ignore empty - if tmpSePat == '': - continue - # make regexp pattern - if '*' in tmpSePat: - tmpSePat = tmpSePat.replace('*','.*') - tmpSePat = '^' + tmpSePat +'$' - # loop over all sites - for tmpSE in tmpRepMap.keys(): - # check match - if re.search(tmpSePat,tmpSE) == None: - continue - # append - if not tmpSE in retList: - retList.append(tmpSE) - # return - return retList - - -# check DDM response -def isDQ2ok(out): - if out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1 \ - or out == '()': - return False - return True - - -# check if DBR -def isDBR(datasetName): - if datasetName.startswith('ddo'): - return True - return False - - -# get the list of sites in a cloud which cache a dataset -def 
getSitesWithCacheDS(cloudKey,excludedSites,siteMapper,datasetName): - retList = [] - # check sites in the cloud - for tmpSiteName in siteMapper.getCloud(cloudKey)['sites']: - # excluded - if tmpSiteName in excludedSites: - continue - # skip T1 - if tmpSiteName == siteMapper.getCloud(cloudKey)['source']: - continue - # hospital queue - if siteMapper.getSite(tmpSiteName).ddm == siteMapper.getSite(siteMapper.getCloud(cloudKey)['source']).ddm: - continue - # not home cloud - if siteMapper.getSite(tmpSiteName).cloud != cloudKey: - continue - # online - if siteMapper.getSite(tmpSiteName).status != 'online': - continue - # check CVMFS - if isCachedFile(datasetName,siteMapper.getSite(tmpSiteName)): - retList.append(tmpSiteName) - # return - return retList - - -# get dataset type -def getDatasetType(dataset): - datasetType = None - try: - datasetType = dataset.split('.')[4] - except: - pass - return datasetType diff --git a/current/pandaserver/dataservice/DynDataDistributer.py b/current/pandaserver/dataservice/DynDataDistributer.py deleted file mode 100644 index 8a808a54c..000000000 --- a/current/pandaserver/dataservice/DynDataDistributer.py +++ /dev/null @@ -1,1657 +0,0 @@ -''' -find candidate site to distribute input datasets - -''' - -import re -import sys -import time -import math -import types -import random -import datetime - -from dataservice.DDM import ddm -from dataservice.DDM import toa -from taskbuffer.JobSpec import JobSpec -import brokerage.broker - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('DynDataDistributer') - -def initLogger(pLogger): - # redirect logging to parent - global _logger - _logger = pLogger - - -# NG datasets -ngDataTypes = ['RAW','HITS','RDO','ESD','EVNT'] - -# excluded provenance -ngProvenance = [] - -# protection for max number of replicas -protectionMaxNumReplicas = 10 - -# max number of waiting jobs -maxWaitingJobs = 200 - -# max number of waiting jobsets 
-maxWaitingJobsets = 2 - -# clouds with small T1 to make replica at T2 -cloudsWithSmallT1 = ['IT'] - -# files in datasets -g_filesInDsMap = {} - - -class DynDataDistributer: - - # constructor - def __init__(self,jobs,taskBuffer,siteMapper,simul=False,token=None): - self.jobs = jobs - self.taskBuffer = taskBuffer - self.siteMapper = siteMapper - if token == None: - self.token = datetime.datetime.utcnow().isoformat(' ') - else: - self.token = token - # use a fixed list since some clouds don't have active T2s - self.pd2pClouds = ['CA','DE','ES','FR','IT','ND','NL','TW','UK','US'] - self.simul = simul - self.lastMessage = '' - self.cachedSizeMap = {} - self.shareMoUForT2 = None - self.mapTAGandParentGUIDs = {} - self.tagParentInfo = {} - self.parentLfnToTagMap = {} - - - # main - def run(self): - try: - self.putLog("start for %s" % self.jobs[0].PandaID) - # check cloud - if not self.jobs[0].cloud in self.pd2pClouds+['CERN',]: - self.putLog("skip cloud=%s not one of PD2P clouds %s" % (self.jobs[0].cloud,str(self.pd2pClouds))) - self.putLog("end for %s" % self.jobs[0].PandaID) - return - # ignore HC and group production - if self.jobs[0].processingType in ['hammercloud','gangarobot'] or self.jobs[0].processingType.startswith('gangarobot'): - self.putLog("skip due to processingType=%s" % self.jobs[0].processingType) - self.putLog("end for %s" % self.jobs[0].PandaID) - return - # ignore HC and group production - if not self.jobs[0].workingGroup in ['NULL',None,'']: - self.putLog("skip due to workingGroup=%s" % self.jobs[0].workingGroup) - self.putLog("end for %s" % self.jobs[0].PandaID) - return - # get input datasets - inputDatasets = [] - for tmpJob in self.jobs: - if tmpJob.prodSourceLabel == 'user': - for tmpFile in tmpJob.Files: - if tmpFile.type == 'input' and not tmpFile.lfn.endswith('.lib.tgz'): - if not tmpFile.dataset in inputDatasets: - inputDatasets.append(tmpFile.dataset) - # loop over all input datasets - for inputDS in inputDatasets: - # only mc/data 
datasets - moveFlag = False - for projectName in ['mc','data']: - if inputDS.startswith(projectName): - moveFlag = True - if not moveFlag: - self.putLog("skip non official dataset %s" % inputDS) - continue - if re.search('_sub\d+$',inputDS) != None or re.search('_dis\d+$',inputDS) != None: - self.putLog("skip dis/sub dataset %s" % inputDS) - continue - # check type - tmpItems = inputDS.split('.') - if len(tmpItems) < 5: - self.putLog("cannot get type from %s" % inputDS) - continue - if tmpItems[4] in ngDataTypes: - self.putLog("don't move %s : %s" % (tmpItems[4],inputDS)) - continue - # get candidate sites - self.putLog("get candidates for %s" % inputDS) - status,sitesMaps = self.getCandidates(inputDS,useCloseSites=True) - if not status: - self.putLog("failed to get candidates") - continue - # get size of input container - totalInputSize = 0 - if inputDS.endswith('/'): - status,totalInputSize = self.getDatasetSize(inputDS) - if not status: - self.putLog("failed to get size of %s" % inputDS) - continue - # get number of waiting jobs and jobsets - nWaitingJobsAll = self.taskBuffer.getNumWaitingJobsForPD2P(inputDS) - nWaitingJobsets = self.taskBuffer.getNumWaitingJobsetsForPD2P(inputDS) - # loop over all datasets - usedSites = [] - for tmpDS,tmpVal in sitesMaps.iteritems(): - self.putLog("triggered for %s" % tmpDS,sendLog=True) - # increment used counter - if not self.simul: - nUsed = self.taskBuffer.incrementUsedCounterSubscription(tmpDS) - else: - nUsed = 5 - # insert dummy for new dataset which is used to keep track of usage even if subscription is not made - if nUsed == 0: - retAddUserSub = self.taskBuffer.addUserSubscription(tmpDS,['DUMMY']) - if not retAddUserSub: - self.putLog("failed to add dummy subscription to database for %s " % tmpDS,type='error',sendLog=True) - continue - # collect candidates - allCandidates = [] - totalUserSub = 0 - allCompPd2pSites = [] - allOKClouds = [] - totalSecReplicas = 0 - allT1Candidates = [] - totalT1Sub = 0 - cloudCandMap = {} 
- nReplicasInCloud = {} - allCandidatesMoU = [] - nTier1Copies = 0 - for tmpCloud,(candSites,sitesComDS,sitesPd2pDS,nUserSub,t1HasReplica,t1HasPrimary,nSecReplicas,nT1Sub,candForMoU) in tmpVal.iteritems(): - self.putLog("%s sites with comp DS:%s compPD2P:%s candidates:%s nSub:%s T1:%s Pri:%s nSec:%s nT1Sub:%s candMoU:%s" % \ - (tmpCloud,str(sitesComDS),str(sitesPd2pDS),str(candSites),nUserSub,t1HasReplica,t1HasPrimary, - nSecReplicas,nT1Sub,str(candForMoU))) - # add - totalUserSub += nUserSub - totalT1Sub += nT1Sub - allCompPd2pSites += sitesPd2pDS - totalSecReplicas += nSecReplicas - cloudCandMap[tmpCloud] = candSites - nReplicasInCloud[tmpCloud] = len(sitesComDS) + len(sitesPd2pDS) - # cloud is candidate for T1-T1 when T1 doesn't have primary or secondary replicas or old subscriptions - if not t1HasPrimary and nSecReplicas == 0 and nT1Sub == 0: - allT1Candidates.append(tmpCloud) - # the number of T1s with replica - if t1HasPrimary or nSecReplicas > 0: - nTier1Copies += 1 - # add candidates - for tmpCandSite in candSites: - if not tmpCandSite in usedSites: - allCandidates.append(tmpCandSite) - # add candidates for MoU - for tmpCandSite in candForMoU: - if not tmpCandSite in usedSites: - allCandidatesMoU.append(tmpCandSite) - # add clouds - if not tmpCloud in allOKClouds: - allOKClouds.append(tmpCloud) - self.putLog("PD2P sites with comp replicas : %s" % str(allCompPd2pSites)) - self.putLog("PD2P T2 candidates : %s" % str(allCandidates)) - self.putLog("PD2P T2 MoU candidates : %s" % str(allCandidatesMoU)) - self.putLog("PD2P # of T2 subscriptions : %s" % totalUserSub) - self.putLog("PD2P # of T1 secondaries : %s" % totalSecReplicas) - self.putLog("PD2P # of T1 subscriptions : %s" % nT1Sub) - self.putLog("PD2P # of T1 replicas : %s" % nTier1Copies) - self.putLog("PD2P T1 candidates : %s" % str(allT1Candidates)) - self.putLog("PD2P nUsed : %s" % nUsed) - # get dataset size - retDsSize,dsSize = self.getDatasetSize(tmpDS) - if not retDsSize: - self.putLog("failed to 
get dataset size of %s" % tmpDS,type='error',sendLog=True) - continue - self.putLog("PD2P nWaitingJobsets : %s" % nWaitingJobsets) - if totalInputSize != 0: - self.putLog("PD2P nWaitingJobs : %s = %s(all)*%s(dsSize)/%s(contSize)" % \ - (int((float(nWaitingJobsAll * dsSize) / float(totalInputSize))), - nWaitingJobsAll,dsSize,totalInputSize)) - else: - self.putLog("PD2P nWaitingJobs : %s = %s(all)" % \ - (nWaitingJobsAll,nWaitingJobsAll)) - # make T1-T1 - triggeredT1PD2P = False - if nUsed > 0: - # extract integer part. log10(nUsed) and log10(nUsed)+1 are used to avoid round-off error - intLog10nUsed = int(math.log10(nUsed)) - if self.simul or (int(math.log10(nUsed)) > totalSecReplicas and \ - (nUsed == 10**intLog10nUsed or nUsed == 10**(intLog10nUsed+1)) and \ - nT1Sub == 0 and allT1Candidates != []): - self.putLog("making T1-T1",sendLog=True) - # make subscription - retT1Sub,useSmallT1 = self.makeT1Subscription(allT1Candidates,tmpDS,dsSize,nUsed) - self.putLog("done for T1-T1") - triggeredT1PD2P = True - # make a T2 copy when T1 PD2P was triggered - if triggeredT1PD2P: - # TODO - retT2MoU,selectedSite = self.makeT2SubscriptionMoU(allCandidatesMoU,tmpDS,dsSize,'T1MOU',nUsed) - if retT2MoU and selectedSite != None: - # remove from candidate list - if selectedSite in allCandidates: - allCandidates.remove(selectedSite) - if selectedSite in allCandidatesMoU: - allCandidatesMoU.remove(selectedSite) - # increment the number of T2 subscriptions - totalUserSub += 1 - # set the number of T2 PD2P replicas - maxSitesHaveDS = 1 - # additional replicas - if nWaitingJobsets > maxWaitingJobsets: - # the number of waiting jobs for this dataset - if totalInputSize != 0: - # dataset in container - tmpN = float(nWaitingJobsAll * dsSize) / float(totalInputSize) - else: - # dataset - tmpN = float(nWaitingJobsAll) - tmpN = int(math.log10(tmpN/float(maxWaitingJobs))) + nTier1Copies - maxSitesHaveDS = max(maxSitesHaveDS,tmpN) - # protection against too many replications - maxSitesHaveDS = 
min(maxSitesHaveDS,protectionMaxNumReplicas) - self.putLog("PD2P maxSitesHaveDS : %s" % maxSitesHaveDS) - # ignore the first job - if nUsed == 0: - self.putLog("skip the first job", - sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'FIRSTJOB','dataset':tmpDS}) - if not self.simul: - continue - # check number of replicas - if len(allCompPd2pSites) >= maxSitesHaveDS and nUsed != 1: - self.putLog("skip since many T2 PD2P sites (%s>=%s) have the replica" % (len(allCompPd2pSites),maxSitesHaveDS), - sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'TOO_MANY_T2_REPLICAS','dataset':tmpDS}) - if not self.simul: - continue - # check the number of subscriptions - maxNumSubInAllCloud = max(0,maxSitesHaveDS-len(allCompPd2pSites)) - maxNumSubInAllCloud = min(2,maxNumSubInAllCloud) - self.putLog("PD2P maxNumSubInAllCloud : %s" % maxNumSubInAllCloud) - if totalUserSub >= maxNumSubInAllCloud: - self.putLog("skip since enough subscriptions (%s>=%s) were already made for T2 PD2P" % \ - (totalUserSub,maxNumSubInAllCloud), - sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'TOO_MANY_T2_SUBSCRIPTIONS','dataset':tmpDS}) - if not self.simul: - continue - # no candidates - if len(allCandidates) == 0: - self.putLog("skip since no candidates",sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'NO_T2_CANDIDATE','dataset':tmpDS}) - continue - # get inverse weight for brokerage - weightForBrokerage = self.getWeightForBrokerage(allCandidates,tmpDS,nReplicasInCloud) - self.putLog("inverse weight %s" % str(weightForBrokerage)) - # get free disk size - self.putLog("getting free disk size for T2 PD2P") - retFreeSizeMap,freeSizeMap = self.getFreeDiskSize(tmpDS,allCandidates) - if not retFreeSizeMap: - self.putLog("failed to get free disk size",type='error',sendLog=True) - continue - # run brokerage - tmpJob = JobSpec() - tmpJob.AtlasRelease = '' - self.putLog("run brokerage for %s" % tmpDS) - usedWeight = brokerage.broker.schedule([tmpJob],self.taskBuffer,self.siteMapper,True,allCandidates, - 
True,specialWeight=weightForBrokerage,getWeight=True, - sizeMapForCheck=freeSizeMap,datasetSize=dsSize) - selectedSite = tmpJob.computingSite - for tmpWeightSite,tmpWeightStr in usedWeight.iteritems(): - tmpTagsMap = {'site':tmpWeightSite,'weight':tmpWeightStr,'dataset':tmpDS} - if tmpWeightSite == selectedSite: - if nUsed == 1: - tmpActionTag = 'SELECTEDT2_JOB' - elif len(allCompPd2pSites) == 0: - tmpActionTag = 'SELECTEDT2_NOREP' - else: - tmpActionTag = 'SELECTEDT2_WAIT' - tmpTagsMap['nused'] = nUsed - tmpTagsMap['nwaitingjobs'] = nWaitingJobsAll - tmpTagsMap['nwaitingjobsets'] = nWaitingJobsets - tmpTagsMap['nsiteshaveds'] = len(allCompPd2pSites) - else: - tmpActionTag = 'UNSELECTEDT2' - self.putLog("weight %s %s" % (tmpWeightSite,tmpWeightStr),sendLog=True, - actionTag=tmpActionTag,tagsMap=tmpTagsMap) - self.putLog("site for T2 PD2P -> %s" % selectedSite) - # remove from candidate list - if selectedSite in allCandidates: - allCandidates.remove(selectedSite) - if selectedSite in allCandidatesMoU: - allCandidatesMoU.remove(selectedSite) - # make subscription - if not self.simul: - subRet,dq2ID = self.makeSubscription(tmpDS,selectedSite,ddmShare='secondary') - self.putLog("made subscription to %s:%s" % (selectedSite,dq2ID),sendLog=True) - usedSites.append(selectedSite) - # update database - if subRet: - self.taskBuffer.addUserSubscription(tmpDS,[dq2ID]) - # additional T2 copy with MoU share when it is the second submission - if nUsed == 1 or self.simul: - retT2MoU,selectedSite = self.makeT2SubscriptionMoU(allCandidatesMoU,tmpDS,dsSize,'T2MOU',nUsed) - self.putLog("end for %s" % self.jobs[0].PandaID) - except: - errType,errValue = sys.exc_info()[:2] - self.putLog("%s %s" % (errType,errValue),'error') - - - # get candidate sites for subscription - def getCandidates(self,inputDS,checkUsedFile=True,useHidden=False,useCloseSites=False): - # return for failure - failedRet = False,{'':{'':([],[],[],0,False,False,0,0,[])}} - # get replica locations - if 
inputDS.endswith('/'): - # container - status,tmpRepMaps = self.getListDatasetReplicasInContainer(inputDS) - # get used datasets - if status and checkUsedFile: - status,tmpUsedDsList = self.getUsedDatasets(tmpRepMaps) - # remove unused datasets - newRepMaps = {} - for tmpKey,tmpVal in tmpRepMaps.iteritems(): - if tmpKey in tmpUsedDsList: - newRepMaps[tmpKey] = tmpVal - tmpRepMaps = newRepMaps - else: - # normal dataset - status,tmpRepMap = self.getListDatasetReplicas(inputDS) - tmpRepMaps = {inputDS:tmpRepMap} - if not status: - # failed - self.putLog("failed to get replica locations for %s" % inputDS,'error') - return failedRet - # get close sites - closeSitesMap = {} - for tmpDS,tmpRepMap in tmpRepMaps.iteritems(): - # loop over all DQ2 IDs - for tmpDQ2ID in tmpRepMap.keys(): - if not closeSitesMap.has_key(tmpDQ2ID): - status,tmpCloseSiteList = toa.getCloseSites(tmpDQ2ID) - exec "tmpCloseSiteList = %s" % tmpCloseSiteList - closeSitesMap[tmpDQ2ID] = [] - # select only DATADISK - for tmpCloseSite in tmpCloseSiteList: - if tmpCloseSite.endswith('_DATADISK'): - closeSitesMap[tmpDQ2ID].append(tmpCloseSite) - # get all sites - allSiteMap = {} - for tmpSiteName,tmpSiteSpec in self.siteMapper.siteSpecList.iteritems(): - # check cloud - if not tmpSiteSpec.cloud in self.pd2pClouds: - continue - # ignore test sites - if 'test' in tmpSiteName.lower(): - continue - # analysis only - if not tmpSiteName.startswith('ANALY'): - continue - # online - if not tmpSiteSpec.status in ['online']: - self.putLog("skip %s due to status=%s" % (tmpSiteName,tmpSiteSpec.status)) - continue - if not allSiteMap.has_key(tmpSiteSpec.cloud): - allSiteMap[tmpSiteSpec.cloud] = [] - allSiteMap[tmpSiteSpec.cloud].append(tmpSiteSpec) - # NG DQ2 IDs - ngDQ2SuffixList = ['LOCALGROUPDISK'] - # loop over all clouds - returnMap = {} - checkedMetaMap = {} - userSubscriptionsMap = {} - for cloud in self.pd2pClouds: - # DQ2 prefix of T1 - tmpT1SiteID = self.siteMapper.getCloud(cloud)['source'] - tmpT1DQ2ID = 
self.siteMapper.getSite(tmpT1SiteID).ddm - prefixDQ2T1 = re.sub('[^_]+DISK$','',tmpT1DQ2ID) - # loop over all datasets - for tmpDS,tmpRepMap in tmpRepMaps.iteritems(): - candSites = [] - sitesComDS = [] - sitesCompPD2P = [] - # check metadata - if not checkedMetaMap.has_key(tmpDS): - checkedMetaMap[tmpDS] = self.getDatasetMetadata(tmpDS) - retMeta,tmpMetadata = checkedMetaMap[tmpDS] - if not retMeta: - self.putLog("failed to get metadata for %s" % tmpDS,'error') - return failedRet - if tmpMetadata['provenance'] in ngProvenance: - self.putLog("provenance=%s of %s is excluded" % (tmpMetadata['provenance'],tmpDS)) - continue - if tmpMetadata['hidden'] in [True,'True'] and not useHidden: - self.putLog("%s is hidden" % tmpDS) - continue - # check T1 has a replica and get close sites - t1HasReplica = False - t1HasPrimary = False - nSecReplicas = 0 - closeSiteList = [] - candForMoU = [] - for tmpDQ2ID,tmpStatMap in tmpRepMap.iteritems(): - # check NG suffix - ngSuffixFlag = False - for tmpNGSuffix in ngDQ2SuffixList: - if tmpDQ2ID.endswith(tmpNGSuffix): - ngSuffixFlag = True - break - if ngSuffixFlag: - continue - # get close sites - if closeSitesMap.has_key(tmpDQ2ID): - for tmpCloseSiteID in closeSitesMap[tmpDQ2ID]: - if not tmpCloseSiteID in closeSiteList: - closeSiteList.append(tmpCloseSiteID) - # checks for T1 - if tmpDQ2ID.startswith(prefixDQ2T1): - if tmpStatMap[0]['total'] == tmpStatMap[0]['found']: - t1HasReplica = True - # check replica metadata to get archived info - retRepMeta,tmpRepMetadata = self.getReplicaMetadata(tmpDS,tmpDQ2ID) - if not retRepMeta: - self.putLog("failed to get replica metadata for %s:%s" % \ - (tmpDS,tmpDQ2ID),'error') - return failedRet - # check archived field - if isinstance(tmpRepMetadata,types.DictType) and tmpRepMetadata.has_key('archived') and \ - tmpRepMetadata['archived'] == 'primary': - # primary - t1HasPrimary = True - break - elif isinstance(tmpRepMetadata,types.DictType) and tmpRepMetadata.has_key('archived') and \ - 
tmpRepMetadata['archived'] == 'secondary': - # secondary - nSecReplicas += 1 - break - self.putLog("close sites : %s" % str(closeSiteList)) - # get on-going subscriptions - timeRangeSub = 7 - if not userSubscriptionsMap.has_key(tmpDS): - userSubscriptionsMap[tmpDS] = self.taskBuffer.getUserSubscriptions(tmpDS,timeRangeSub) - userSubscriptions = userSubscriptionsMap[tmpDS] - # unused cloud - if not allSiteMap.has_key(cloud): - continue - # count the number of T1 subscriptions - nT1Sub = 0 - for tmpUserSub in userSubscriptions: - if tmpUserSub.startswith(prefixDQ2T1): - nT1Sub += 1 - # check sites - nUserSub = 0 - for tmpSiteSpec in allSiteMap[cloud]: - # check cloud - if tmpSiteSpec.cloud != cloud: - continue - # prefix of DQ2 ID - prefixDQ2 = re.sub('[^_]+DISK$','',tmpSiteSpec.ddm) - # skip T1 - if prefixDQ2 == prefixDQ2T1: - continue - # check if corresponding DQ2 ID is a replica location - hasReplica = False - for tmpDQ2ID,tmpStatMap in tmpRepMap.iteritems(): - # check NG suffix - ngSuffixFlag = False - for tmpNGSuffix in ngDQ2SuffixList: - if tmpDQ2ID.endswith(tmpNGSuffix): - ngSuffixFlag = True - break - if ngSuffixFlag: - continue - if tmpDQ2ID.startswith(prefixDQ2): - if tmpStatMap[0]['total'] == tmpStatMap[0]['found']: - # complete - sitesComDS.append(tmpSiteSpec.sitename) - if tmpSiteSpec.cachedse == 1: - sitesCompPD2P.append(tmpSiteSpec.sitename) - hasReplica = True - break - # site doesn't have a replica - if (not hasReplica) and tmpSiteSpec.cachedse == 1: - candForMoU.append(tmpSiteSpec.sitename) - if not useCloseSites: - candSites.append(tmpSiteSpec.sitename) - else: - # use close sites only - if self.getDQ2ID(tmpSiteSpec.sitename,tmpDS) in closeSiteList: - candSites.append(tmpSiteSpec.sitename) - # the number of subscriptions - for tmpUserSub in userSubscriptions: - if tmpUserSub.startswith(prefixDQ2): - nUserSub += 1 - break - # append - if not returnMap.has_key(tmpDS): - returnMap[tmpDS] = {} - returnMap[tmpDS][cloud] = 
(candSites,sitesComDS,sitesCompPD2P,nUserSub,t1HasReplica,t1HasPrimary, - nSecReplicas,nT1Sub,candForMoU) - # return - return True,returnMap - - - # check DDM response - def isDQ2ok(self,out): - if out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1 \ - or out == '()': - return False - return True - - - # get map of DQ2 IDs - def getDQ2ID(self,sitename,dataset): - # get DQ2 ID - if not self.siteMapper.checkSite(sitename): - self.putLog("cannot find SiteSpec for %s" % sitename) - return '' - dq2ID = self.siteMapper.getSite(sitename).ddm - if True: - # data - matchEOS = re.search('_EOS[^_]+DISK$',dq2ID) - if matchEOS != None: - dq2ID = re.sub('_EOS[^_]+DISK','_EOSDATADISK',dq2ID) - else: - dq2ID = re.sub('_[^_]+DISK','_DATADISK',dq2ID) - else: - # unsupported prefix for subscription - self.putLog('%s has unsupported prefix for subscription' % dataset,'error') - return '' - # patch for MWT2_UC - if dq2ID == 'MWT2_UC_DATADISK': - dq2ID = 'MWT2_DATADISK' - # return - return dq2ID - - - # get list of datasets - def makeSubscription(self,dataset,sitename,givenDQ2ID=None,ddmShare='secondary'): - # return for failuer - retFailed = False,'' - # get DQ2 IDs - if givenDQ2ID == None: - dq2ID = self.getDQ2ID(sitename,dataset) - else: - dq2ID = givenDQ2ID - if dq2ID == '': - self.putLog("cannot find DQ2 ID for %s:%s" % (sitename,dataset)) - return retFailed - # make subscription - optSrcPolicy = 000001 - nTry = 3 - for iDDMTry in range(nTry): - # register subscription - self.putLog('%s/%s registerDatasetSubscription %s %s' % (iDDMTry,nTry,dataset,dq2ID)) - status,out = ddm.DQ2.main('registerDatasetSubscription',dataset,dq2ID,version=0,archived=0, - callbacks={},sources={},sources_policy=optSrcPolicy, - wait_for_sources=0,destination=None,query_more_sources=0, - sshare=ddmShare,group=None,activity='Data Brokering',acl_alias='secondary') - if 
out.find('DQSubscriptionExistsException') != -1: - break - elif out.find('DQLocationExistsException') != -1: - break - elif status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result - if out.find('DQSubscriptionExistsException') != -1: - pass - elif status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response for %s' % dataset,'error') - return retFailed - # update - self.putLog('%s %s' % (status,out)) - return True,dq2ID - - - # get weight for brokerage - def getWeightForBrokerage(self,sitenames,dataset,nReplicasInCloud): - # return for failuer - retFailed = False,{} - retMap = {} - # get the number of subscriptions for last 24 hours - numUserSubs = self.taskBuffer.getNumUserSubscriptions() - # loop over all sites - for sitename in sitenames: - # get DQ2 ID - dq2ID = self.getDQ2ID(sitename,dataset) - if dq2ID == '': - self.putLog("cannot find DQ2 ID for %s:%s" % (sitename,dataset)) - return retFailed - # append - if numUserSubs.has_key(dq2ID): - retMap[sitename] = 1 + numUserSubs[dq2ID] - else: - retMap[sitename] = 1 - # negative weight if a cloud already has replicas - tmpCloud = self.siteMapper.getSite(sitename).cloud - retMap[sitename] *= (1 + nReplicasInCloud[tmpCloud]) - # return - return retMap - - - # get free disk size - def getFreeDiskSize(self,dataset,siteList): - # return for failuer - retFailed = False,{} - # loop over all sites - sizeMap = {} - for sitename in siteList: - # reuse cached value - if self.cachedSizeMap.has_key(sitename): - sizeMap[sitename] = self.cachedSizeMap[sitename] - continue - # get DQ2 IDs - dq2ID = self.getDQ2ID(sitename,dataset) - if dq2ID == '': - self.putLog("cannot find DQ2 ID for %s:%s" % (sitename,dataset)) - return retFailed - for valueItem in ['used','total']: - nTry = 3 - for iDDMTry in range(nTry): - status,out = ddm.DQ2.main('queryStorageUsage','srm',valueItem,dq2ID) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result 
- if status != 0 or out.startswith('Error'): - self.putLog("%s/%s queryStorageUsage key=%s value=%s site=%s" % (iDDMTry,nTry,'srm',valueItem,dq2ID)) - self.putLog(out,'error') - self.putLog('bad DQ2 response for %s:%s' % (dq2ID,valueItem), 'error') - return retFailed - try: - # convert res to map - exec "tmpGigaVal = %s[0]['giga']" % out - if not sizeMap.has_key(sitename): - sizeMap[sitename] = {} - # append - sizeMap[sitename][valueItem] = tmpGigaVal - # cache - self.cachedSizeMap[sitename] = sizeMap[sitename] - except: - self.putLog("%s/%s queryStorageUsage key=%s value=%s site=%s" % (iDDMTry,nTry,'srm',valueItem,dq2ID)) - self.putLog(out,'error') - self.putLog('could not convert HTTP-res to free size map for %s%s' % (dq2ID,valueItem), 'error') - return retFailed - # return - self.putLog('getFreeDiskSize done->%s' % str(sizeMap)) - return True,sizeMap - - - - # get list of replicas for a dataset - def getListDatasetReplicas(self,dataset): - nTry = 3 - for iDDMTry in range(nTry): - self.putLog("%s/%s listDatasetReplicas %s" % (iDDMTry,nTry,dataset)) - status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result - if status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response for %s' % dataset, 'error') - return False,{} - try: - # convert res to map - exec "tmpRepSites = %s" % out - self.putLog('getListDatasetReplicas->%s' % str(tmpRepSites)) - return True,tmpRepSites - except: - self.putLog(out,'error') - self.putLog('could not convert HTTP-res to replica map for %s' % dataset, 'error') - return False,{} - - - # get replicas for a container - def getListDatasetReplicasInContainer(self,container): - # response for failure - resForFailure = False,{} - # get datasets in container - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s listDatasetsInContainer %s' % (iDDMTry,nTry,container)) - status,out = 
ddm.DQ2.main('listDatasetsInContainer',container) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response for %s' % container, 'error') - return resForFailure - datasets = [] - try: - # convert to list - exec "datasets = %s" % out - except: - self.putLog('could not convert HTTP-res to dataset list for %s' % container, 'error') - return resForFailure - # loop over all datasets - allRepMap = {} - for dataset in datasets: - # get replicas - status,tmpRepSites = self.getListDatasetReplicas(dataset) - if not status: - return resForFailure - # append - allRepMap[dataset] = tmpRepSites - # return - self.putLog('getListDatasetReplicasInContainer done') - return True,allRepMap - - - # get dataset metadata - def getDatasetMetadata(self,datasetName): - # response for failure - resForFailure = False,{} - metaDataAttrs = ['provenance','hidden'] - # get datasets in container - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s getMetaDataAttribute %s' % (iDDMTry,nTry,datasetName)) - status,out = ddm.DQ2.main('getMetaDataAttribute',datasetName,metaDataAttrs) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response for %s' % datasetName, 'error') - return resForFailure - metadata = {} - try: - # convert to map - exec "metadata = %s" % out - except: - self.putLog('could not convert HTTP-res to metadata for %s' % datasetName, 'error') - return resForFailure - # check whether all attributes are available - for tmpAttr in metaDataAttrs: - if not metadata.has_key(tmpAttr): - self.putLog('%s is missing in %s' % (tmpAttr,str(metadata)), 'error') - return resForFailure - # return - self.putLog('getDatasetMetadata -> %s' % str(metadata)) - return True,metadata - - - # get replica metadata - def 
getReplicaMetadata(self,datasetName,locationName): - # response for failure - resForFailure = False,{} - # get metadata - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s listMetaDataReplica %s %s' % (iDDMTry,nTry,datasetName,locationName)) - status,out = ddm.DQ2.main('listMetaDataReplica',locationName,datasetName) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response for %s' % datasetName, 'error') - return resForFailure - metadata = {} - try: - # convert to map - exec "metadata = %s" % out - except: - self.putLog('could not convert HTTP-res to replica metadata for %s:%s' % \ - (datasetName,locationName), 'error') - return resForFailure - # return - self.putLog('getReplicaMetadata -> %s' % str(metadata)) - return True,metadata - - - # check subscription info - def checkSubscriptionInfo(self,destDQ2ID,datasetName): - resForFailure = (False,False) - # get datasets in container - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s listSubscriptionInfo %s %s' % (iDDMTry,nTry,destDQ2ID,datasetName)) - status,out = ddm.DQ2.main('listSubscriptionInfo',datasetName,destDQ2ID,0) - if status != 0: - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response for %s' % datasetName, 'error') - return resForFailure - self.putLog(out) - if out == '()': - # no subscription - retVal = False - else: - # already exists - retVal = True - self.putLog('checkSubscriptionInfo -> %s' % retVal) - return True,retVal - - - # get size of dataset - def getDatasetSize(self,datasetName): - self.putLog("get size of %s" % datasetName) - resForFailure = (False,0) - # get size of datasets - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s listFilesInDataset %s' % (iDDMTry,nTry,datasetName)) - status,out = ddm.DQ2.listFilesInDataset(datasetName) - if status != 0: - 
time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response to get size of %s' % datasetName, 'error') - return resForFailure - self.putLog("OK") - # get total size - dsSize = 0 - try: - exec "outList = %s" % out - for guid,vals in outList[0].iteritems(): - dsSize += long(vals['filesize']) - except: - self.putLog('failed to get size from DQ2 response for %s' % datasetName, 'error') - return resForFailure - # GB - dsSize /= (1024*1024*1024) - self.putLog("dataset size = %s" % dsSize) - return True,dsSize - - - # get datasets used by jobs - def getUsedDatasets(self,datasetMap): - resForFailure = (False,[]) - # loop over all datasets - usedDsList = [] - for datasetName in datasetMap.keys(): - # get file list - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s listFilesInDataset %s' % (iDDMTry,nTry,datasetName)) - status,out = ddm.DQ2.listFilesInDataset(datasetName) - if status != 0: - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response to get size of %s' % datasetName, 'error') - return resForFailure - # convert to map - try: - tmpLfnList = [] - exec "outList = %s" % out - for guid,vals in outList[0].iteritems(): - tmpLfnList.append(vals['lfn']) - except: - self.putLog('failed to get file list from DQ2 response for %s' % datasetName, 'error') - return resForFailure - # check if jobs use the dataset - usedFlag = False - for tmpJob in self.jobs: - for tmpFile in tmpJob.Files: - if tmpFile.type == 'input' and tmpFile.lfn in tmpLfnList: - usedFlag = True - break - # escape - if usedFlag: - break - # used - if usedFlag: - usedDsList.append(datasetName) - # return - self.putLog("used datasets = %s" % str(usedDsList)) - return True,usedDsList - - - # get file from dataset - def getFileFromDataset(self,datasetName,guid,randomMode=False,nSamples=1): - resForFailure = (False,None) - # get files in datasets - 
global g_filesInDsMap - if not g_filesInDsMap.has_key(datasetName): - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s listFilesInDataset %s' % (iDDMTry,nTry,datasetName)) - status,out = ddm.DQ2.listFilesInDataset(datasetName) - if status != 0: - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response to get size of %s' % datasetName, 'error') - return resForFailure - # get file - try: - exec "outList = %s" % out - # append - g_filesInDsMap[datasetName] = outList[0] - except: - self.putLog('failed to get file list from DQ2 response for %s' % datasetName, 'error') - return resForFailure - # random mode - if randomMode: - tmpList = g_filesInDsMap[datasetName].keys() - random.shuffle(tmpList) - retList = [] - for iSamples in range(nSamples): - if iSamples < len(tmpList): - guid = tmpList[iSamples] - retMap = g_filesInDsMap[datasetName][guid] - retMap['guid'] = guid - retMap['dataset'] = datasetName - retList.append(retMap) - return True,retList - # return - if g_filesInDsMap[datasetName].has_key(guid): - retMap = g_filesInDsMap[datasetName][guid] - retMap['guid'] = guid - retMap['dataset'] = datasetName - return True,retMap - return resForFailure - - - # make subscriptions to EOS - def makeSubscriptionToEOS(self,datasetName): - self.putLog("start making EOS subscription for %s" % datasetName) - destDQ2IDs = ['CERN-PROD_EOSDATADISK'] - # get dataset replica locations - if datasetName.endswith('/'): - statRep,replicaMaps = self.getListDatasetReplicasInContainer(datasetName) - else: - statRep,replicaMap = self.getListDatasetReplicas(datasetName) - replicaMaps = {datasetName:replicaMap} - if not statRep: - self.putLog("failed to get replica map for EOS",type='error') - return False - # loop over all datasets - for tmpDsName,replicaMap in replicaMaps.iteritems(): - # check if replica is already there - for destDQ2ID in destDQ2IDs: - if replicaMap.has_key(destDQ2ID): - 
self.putLog("skip EOS sub for %s:%s since replica is already there" % (destDQ2ID,tmpDsName)) - else: - statSubEx,subExist = self.checkSubscriptionInfo(destDQ2ID,tmpDsName) - if not statSubEx: - self.putLog("failed to check subscription for %s:%s" % (destDQ2ID,tmpDsName),type='error') - continue - # make subscription - if subExist: - self.putLog("skip EOS sub for %s:%s since subscription is already there" % (destDQ2ID,tmpDsName)) - else: - statMkSub,retMkSub = self.makeSubscription(tmpDsName,'',destDQ2ID) - if statMkSub: - self.putLog("made subscription to %s for %s" % (destDQ2ID,tmpDsName)) - else: - self.putLog("failed to make subscription to %s for %s" % (destDQ2ID,tmpDsName),type='error') - # return - self.putLog("end making EOS subscription for %s" % datasetName) - return True - - - # register new dataset container with datasets - def registerDatasetContainerWithDatasets(self,containerName,files,replicaMap): - # sort by locations - filesMap = {} - for tmpFile in files: - tmpLocations = replicaMap[tmpFile['dataset']] - tmpLocations.sort() - tmpKey = tuple(tmpLocations) - if not filesMap.has_key(tmpKey): - filesMap[tmpKey] = [] - # append file - filesMap[tmpKey].append(tmpFile) - # register new datasets - datasetNames = [] - tmpIndex = 1 - for tmpLocations,tmpFiles in filesMap.iteritems(): - tmpDsName = containerName[:-1] + '_%04d' % tmpIndex - tmpRet = self.registerDatasetWithLocation(tmpDsName,tmpFiles,tmpLocations) - # failed - if not tmpRet: - self.putLog('failed to register %s' % tmpDsName, 'error') - return False - # append dataset - datasetNames.append(tmpDsName) - tmpIndex += 1 - # register container - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s registerContainer %s' % (iDDMTry,nTry,containerName)) - status,out = ddm.DQ2.main('registerContainer',containerName,datasetNames) - if status != 0 and out.find('DQDatasetExistsException') == -1: - time.sleep(60) - else: - break - if out.find('DQDatasetExistsException') != -1: - pass - elif status 
!= 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response to register %s' % containerName, 'error') - return False - # return - return True - - - - # register new dataset with locations - def registerDatasetWithLocation(self,datasetName,files,locations): - resForFailure = False - # get file info - guids = [] - lfns = [] - fsizes = [] - chksums = [] - for tmpFile in files: - guids.append(tmpFile['guid']) - lfns.append(tmpFile['lfn']) - fsizes.append(None) - chksums.append(None) - # register new dataset - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s registerNewDataset %s' % (iDDMTry,nTry,datasetName)) - status,out = ddm.DQ2.main('registerNewDataset',datasetName,lfns,guids,fsizes,chksums, - None,None,None,True) - if status != 0 and out.find('DQDatasetExistsException') == -1: - time.sleep(60) - else: - break - if out.find('DQDatasetExistsException') != -1: - pass - elif status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response to register %s' % datasetName, 'error') - return resForFailure - # freeze dataset - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s freezeDataset %s' % (iDDMTry,nTry,datasetName)) - status,out = ddm.DQ2.main('freezeDataset',datasetName) - if status != 0 and out.find('DQFrozenDatasetException') == -1: - time.sleep(60) - else: - break - if out.find('DQFrozenDatasetException') != -1: - pass - elif status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response to freeze %s' % datasetName, 'error') - return resForFailure - # register locations - for tmpLocation in locations: - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s registerDatasetLocation %s %s' % (iDDMTry,nTry,datasetName,tmpLocation)) - status,out = ddm.DQ2.main('registerDatasetLocation',datasetName,tmpLocation,0,1,None,None,None,"14 days") - if status != 0 and out.find('DQLocationExistsException') == -1: - time.sleep(60) - else: - break - if 
out.find('DQLocationExistsException') != -1: - pass - elif status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response to freeze %s' % datasetName, 'error') - return resForFailure - return True - - - # list datasets by file GUIDs - def listDatasetsByGUIDs(self,guids,dsFilters): - resForFailure = (False,{}) - # get size of datasets - nTry = 3 - for iDDMTry in range(nTry): - self.putLog('%s/%s listDatasetsByGUIDs' % (iDDMTry,nTry)) - status,out = ddm.DQ2.listDatasetsByGUIDs(guids) - if status != 0: - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - self.putLog(out,'error') - self.putLog('bad DQ2 response to list datasets by GUIDs','error') - return resForFailure - self.putLog(out) - # get map - retMap = {} - try: - exec "outMap = %s" % out - for guid in guids: - tmpDsNames = [] - # GUID not found - if not outMap.has_key(guid): - self.putLog('GUID=%s not found' % guid,'error') - return resForFailure - # ignore junk datasets - for tmpDsName in outMap[guid]: - if tmpDsName.startswith('panda') or \ - tmpDsName.startswith('user') or \ - tmpDsName.startswith('group') or \ - re.search('_sub\d+$',tmpDsName) != None or \ - re.search('_dis\d+$',tmpDsName) != None or \ - re.search('_shadow$',tmpDsName) != None: - continue - # check with filters - if dsFilters != []: - flagMatch = False - for tmpFilter in dsFilters: - if re.search(tmpFilter,tmpDsName) != None: - flagMatch = True - break - # not match - if not flagMatch: - continue - # append - tmpDsNames.append(tmpDsName) - # empty - if tmpDsNames == []: - self.putLog('no datasets found for GUID=%s' % guid) - continue - # duplicated - if len(tmpDsNames) != 1: - self.putLog('there are multiple datasets %s for GUID:%s' % (str(tmpDsNames),guid),'error') - return resForFailure - # append - retMap[guid] = tmpDsNames[0] - except: - self.putLog('failed to list datasets by GUIDs','error') - return resForFailure - return True,retMap - - - # conver event/run list to 
datasets - def convertEvtRunToDatasets(self,runEvtList,dsType,streamName,dsFilters,amiTag): - self.putLog('convertEvtRunToDatasets type=%s stream=%s dsPatt=%s amitag=%s' % \ - (dsType,streamName,str(dsFilters),amiTag)) - # check data type - failedRet = False,{},[] - if dsType == 'AOD': - streamRef = 'StreamAOD_ref' - elif dsType == 'ESD': - streamRef = 'StreamESD_ref' - elif dsType == 'RAW': - streamRef = 'StreamRAW_ref' - else: - self.putLog("invalid data type %s for EventRun conversion" % dsType,type='error') - return failedRet - # import event lookup client - from eventLookupClient import eventLookupClient - elssiIF = eventLookupClient() - # loop over all events - runEvtGuidMap = {} - nEventsPerLoop = 500 - iEventsTotal = 0 - while iEventsTotal < len(runEvtList): - tmpRunEvtList = runEvtList[iEventsTotal:iEventsTotal+nEventsPerLoop] - iEventsTotal += nEventsPerLoop - if streamName == '': - guidListELSSI = elssiIF.doLookup(tmpRunEvtList,tokens=streamRef, - amitag=amiTag,extract=True) - else: - guidListELSSI = elssiIF.doLookup(tmpRunEvtList,stream=streamName,tokens=streamRef, - amitag=amiTag,extract=True) - # failed - if guidListELSSI == None or len(guidListELSSI) == 0: - errStr = '' - for tmpLine in elssiIF.output: - errStr += tmpLine - self.putLog(errStr,type='error') - self.putLog("invalid retrun from EventLookup",type='error') - return failedRet - # check attribute - attrNames, attrVals = guidListELSSI - def getAttributeIndex(attr): - for tmpIdx,tmpAttrName in enumerate(attrNames): - if tmpAttrName.strip() == attr: - return tmpIdx - return None - # get index - indexEvt = getAttributeIndex('EventNumber') - indexRun = getAttributeIndex('RunNumber') - indexTag = getAttributeIndex(streamRef) - if indexEvt == None or indexRun == None or indexTag == None: - self.putLog("failed to get attribute index from %s" % str(attrNames),type='error') - return failedRet - # check events - for runNr,evtNr in tmpRunEvtList: - paramStr = 'Run:%s Evt:%s Stream:%s' % 
(runNr,evtNr,streamName) - self.putLog(paramStr) - # collect GUIDs - tmpguids = [] - for attrVal in attrVals: - if runNr == attrVal[indexRun] and evtNr == attrVal[indexEvt]: - tmpGuid = attrVal[indexTag] - # check non existing - if tmpGuid == 'NOATTRIB': - continue - if not tmpGuid in tmpguids: - tmpguids.append(tmpGuid) - # not found - if tmpguids == []: - errStr = "no GUIDs were found in Event Lookup service for %s" % paramStr - self.putLog(errStr,type='error') - return failedRet - # append - runEvtGuidMap[(runNr,evtNr)] = tmpguids - # convert to datasets - allDatasets = [] - allFiles = [] - allLocations = {} - for tmpIdx,tmpguids in runEvtGuidMap.iteritems(): - runNr,evtNr = tmpIdx - tmpDsRet,tmpDsMap = self.listDatasetsByGUIDs(tmpguids,dsFilters) - # failed - if not tmpDsRet: - self.putLog("failed to convert GUIDs to datasets",type='error') - return failedRet - # empty - if tmpDsMap == {}: - self.putLog("there is no dataset for Run:%s Evt:%s" % (runNr,evtNr),type='error') - return failedRet - if len(tmpDsMap) != 1: - self.putLog("there are multiple datasets %s for Run:%s Evt:%s" % (str(tmpDsMap),runNr,evtNr), - type='error') - return failedRet - # append - for tmpGUID,tmpDsName in tmpDsMap.iteritems(): - # collect dataset names - if not tmpDsName in allDatasets: - allDatasets.append(tmpDsName) - # get location - statRep,replicaMap = self.getListDatasetReplicas(tmpDsName) - # failed - if not statRep: - self.putLog("failed to get locations for DS:%s" % tmpDsName,type='error') - return failedRet - # collect locations - tmpLocationList = [] - for tmpLocation in replicaMap.keys(): - if not tmpLocation in tmpLocationList: - tmpLocationList.append(tmpLocation) - allLocations[tmpDsName] = tmpLocationList - # get file info - tmpFileRet,tmpFileInfo = self.getFileFromDataset(tmpDsName,tmpGUID) - # failed - if not tmpFileRet: - self.putLog("failed to get fileinfo for GUID:%s DS:%s" % (tmpGUID,tmpDsName),type='error') - return failedRet - # collect files - 
allFiles.append(tmpFileInfo) - # return - self.putLog('converted to %s, %s, %s' % (str(allDatasets),str(allLocations),str(allFiles))) - return True,allLocations,allFiles - - - # get mapping between TAG and parent GUIDs - def getMapTAGandParentGUIDs(self,dsName,tagQuery,streamRef): - # remove _tidXYZ - dsNameForLookUp = re.sub('_tid\d+(_\d+)*$','',dsName) - # reuse - if self.mapTAGandParentGUIDs.has_key(dsNameForLookUp): - return self.mapTAGandParentGUIDs[dsNameForLookUp] - # set - from countGuidsClient import countGuidsClient - tagIF = countGuidsClient() - tagResults = tagIF.countGuids(dsNameForLookUp,tagQuery,streamRef+',StreamTAG_ref') - if tagResults == None: - errStr = '' - for tmpLine in tagIF.output: - if tmpLine == '\n': - continue - errStr += tmpLine - self.putLog(errStr,type='error') - errStr2 = "invalid return from Event Lookup service. " - if "No collection in the catalog matches the dataset name" in errStr: - errStr2 += "Note that only merged TAG is uploaded to the TAG DB, " - errStr2 += "so you need to use merged TAG datasets (or container) for inDS. 
" - errStr2 += "If this is already the case please contact atlas-event-metadata@cern.ch" - self.putLog(errStr2,type='error') - return None - # empty - if not tagResults[0]: - errStr = "No GUIDs found for %s" % dsName - self.putLog(errStr,type='error') - return None - # collect - retMap = {} - for guidCount,guids in tagResults[1]: - self.putLog('%s %s' % (guidCount,guids)) - parentGUID,tagGUID = guids - # append TAG GUID - if not retMap.has_key(tagGUID): - retMap[tagGUID] = {} - # append parent GUID and the number of selected events - if retMap[tagGUID].has_key(parentGUID): - errStr = "GUIDs=%s is duplicated" % parentGUID - self.putLog(errStr,type='error') - return None - retMap[tagGUID][parentGUID] = long(guidCount) - # keep to avoid redundant lookup - self.mapTAGandParentGUIDs[dsNameForLookUp] = retMap - # return - return retMap - - - # get TAG files and parent DS/files using TAG query - def getTagParentInfoUsingTagQuery(self,tagDsList,tagQuery,streamRef): - # return code for failure - failedRet = False,{},[] - allDatasets = [] - allFiles = [] - allLocations = {} - # set empty if Query is undefined - if tagQuery == False: - tagQuery = '' - # loop over all tags - self.putLog('getting parent dataset names and LFNs from TAG DB using EventSelector.Query="%s"' % tagQuery) - for tagDS in tagDsList: - if tagDS.endswith('/'): - # get elements in container - tmpStat,elementMap = self.getListDatasetReplicasInContainer(tagDS) - else: - tmpStat,elementMap = self.getListDatasetReplicas(tagDS) - # loop over all elemets - for dsName in elementMap.keys(): - self.putLog("DS=%s Query=%s Ref:%s" % (dsName,tagQuery,streamRef)) - guidMap = self.getMapTAGandParentGUIDs(dsName,tagQuery,streamRef) - # failed - if guidMap == None: - self.putLog("failed to get mappping between TAG and parent GUIDs",type='error') - return failedRet - # convert TAG GUIDs to LFNs - tmpTagRet,tmpTagDsMap = self.listDatasetsByGUIDs(guidMap.keys(),[]) - # failed - if not tmpTagRet: - self.putLog("failed to 
convert GUIDs to datasets",type='error') - return failedRet - # empty - if tmpTagDsMap == {}: - self.putLog("there is no dataset for DS=%s Query=%s Ref:%s" % (dsName,tagQuery,streamRef),type='error') - return failedRet - # convert parent GUIDs for each TAG file - for tagGUID in guidMap.keys(): - # not found - if not tmpTagDsMap.has_key(tagGUID): - errStr = 'TAG GUID=%s not found in DQ2' % tagGUID - self.putLog(errStr,type='error') - return failedRet - # get TAG file info - tagElementDS = tmpTagDsMap[tagGUID] - tmpFileRet,tmpTagFileInfo = self.getFileFromDataset(tmpTagDsMap[tagGUID],tagGUID) - # failed - if not tmpFileRet: - self.putLog("failed to get fileinfo for GUID:%s DS:%s" % (tagGUID,tmpTagDsMap[tagGUID]),type='error') - return failedRet - # convert parent GUIDs to DS/LFNs - tmpParentRet,tmpParentDsMap = self.listDatasetsByGUIDs(guidMap[tagGUID].keys(),[]) - # failed - if not tmpParentRet: - self.putLog("failed to convert GUIDs:%s to parent datasets" % str(guidMap[tagGUID].keys()),type='error') - return failedRet - # empty - if tmpParentDsMap == {}: - self.putLog("there is no parent dataset for GUIDs:%s" % str(guidMap[tagGUID].keys()),type='error') - return failedRet - # loop over all parent GUIDs - for parentGUID in guidMap[tagGUID].keys(): - # not found - if not tmpParentDsMap.has_key(parentGUID): - errStr = '%s GUID=%s not found in DQ2' % (re.sub('_ref$','',streamRef),parentGUID) - self.putLog(errStr,type='error') - return failedRet - # get parent file info - tmpParentDS = tmpParentDsMap[parentGUID] - tmpFileRet,tmpParentFileInfo = self.getFileFromDataset(tmpParentDS,parentGUID) - # failed - if not tmpFileRet: - self.putLog("failed to get parent fileinfo for GUID:%s DS:%s" % (parentGUID,tmpParentDS), - type='error') - return failedRet - # collect files - allFiles.append(tmpParentFileInfo) - # get location - if not tmpParentDS in allDatasets: - allDatasets.append(tmpParentDS) - # get location - statRep,replicaMap = self.getListDatasetReplicas(tmpParentDS) - 
# failed - if not statRep: - self.putLog("failed to get locations for DS:%s" % tmpParentDS,type='error') - return failedRet - # collect locations - tmpLocationList = [] - for tmpLocation in replicaMap.keys(): - if not tmpLocation in tmpLocationList: - tmpLocationList.append(tmpLocation) - allLocations[tmpParentDS] = tmpLocationList - # return - self.putLog('converted to %s, %s, %s' % (str(allDatasets),str(allLocations),str(allFiles))) - return True,allLocations,allFiles - - - # put log - def putLog(self,msg,type='debug',sendLog=False,actionTag='',tagsMap={}): - tmpMsg = self.token+' '+msg - if type == 'error': - _logger.error(tmpMsg) - # keep last error message - self.lastMessage = tmpMsg - else: - _logger.debug(tmpMsg) - # send to logger - if sendLog: - tmpMsg = self.token + ' - ' - if actionTag != '': - tmpMsg += 'action=%s ' % actionTag - for tmpTag,tmpTagVal in tagsMap.iteritems(): - tmpMsg += '%s=%s ' % (tmpTag,tmpTagVal) - tmpMsg += '- ' + msg - tmpPandaLogger = PandaLogger() - tmpPandaLogger.lock() - tmpPandaLogger.setParams({'Type':'pd2p'}) - tmpLog = tmpPandaLogger.getHttpLogger(panda_config.loggername) - # add message - if type == 'error': - tmpLog.error(tmpMsg) - else: - tmpLog.info(tmpMsg) - # release HTTP handler - tmpPandaLogger.release() - time.sleep(1) - - - # peek log - def peekLog(self): - return self.lastMessage - - - # make T1 subscription - def makeT1Subscription(self,allCloudCandidates,tmpDS,dsSize, - nUsed=None,nWaitingJobs=None,nWaitingJobsets=None): - useSmallT1 = None - # no candidate - if allCloudCandidates == []: - return True,useSmallT1 - # convert to siteIDs - t1Candidates = [] - t1Weights = {} - siteToCloud = {} - for tmpCloud in allCloudCandidates: - tmpCloudSpec = self.siteMapper.getCloud(tmpCloud) - tmpT1SiteID = tmpCloudSpec['source'] - t1Candidates.append(tmpT1SiteID) - # use MoU share - t1Weights[tmpT1SiteID] = tmpCloudSpec['mcshare'] - # reverse lookup - siteToCloud[tmpT1SiteID] = tmpCloud - # get free disk size - 
self.putLog("getting free disk size for T1 PD2P") - retFreeSizeMap,freeSizeMap = self.getFreeDiskSize(tmpDS,t1Candidates) - if not retFreeSizeMap: - self.putLog("failed to get free disk size",type='error',sendLog=True) - return False,useSmallT1 - # run brokerage - tmpJob = JobSpec() - tmpJob.AtlasRelease = '' - self.putLog("run brokerage for T1-T1 for %s" % tmpDS) - selectedSite = self.chooseSite(t1Weights,freeSizeMap,dsSize) - self.putLog("site for T1 PD2P -> %s" % selectedSite) - # simulation - if self.simul: - return True,useSmallT1 - # no candidate - if selectedSite == None: - self.putLog("no candidate for T1-T1") - return False,useSmallT1 - # make subscription - tmpJob.computingSite = selectedSite - subRet,dq2ID = self.makeSubscription(tmpDS,tmpJob.computingSite) - tmpTagsMap = {'site':tmpJob.computingSite,'dataset':tmpDS} - if nUsed != None: - tmpTagsMap['nused'] = nUsed - if nWaitingJobs != None: - tmpTagsMap['nwaitingjobs'] = nWaitingJobs - if nWaitingJobsets != None: - tmpTagsMap['nwaitingjobsets'] = nWaitingJobsets - self.putLog("made subscription for T1-T1 to %s:%s" % (tmpJob.computingSite,dq2ID),sendLog=True, - actionTag='SELECTEDT1',tagsMap=tmpTagsMap) - # check if small cloud is used - if siteToCloud[tmpJob.computingSite] in cloudsWithSmallT1: - useSmallT1 = siteToCloud[tmpJob.computingSite] - # update database - if subRet: - self.taskBuffer.addUserSubscription(tmpDS,[dq2ID]) - return True,useSmallT1 - else: - return False,useSmallT1 - - - # make T2 subscription with MoU share - def makeT2SubscriptionMoU(self,allCandidates,tmpDS,dsSize,pd2pType, - nUsed=None,nWaitingJobs=None,nWaitingJobsets=None): - # no candidate - if allCandidates == []: - return True,None - # get MoU share - if self.shareMoUForT2 == None: - self.shareMoUForT2 = self.taskBuffer.getMouShareForT2PD2P() - # convert to DQ2 ID - t2Candidates = [] - t2Weights = {} - dq2List = [] - for tmpCandidate in allCandidates: - tmpDQ2ID = self.getDQ2ID(tmpCandidate,tmpDS) - if not tmpDQ2ID in 
dq2List: - # append - dq2List.append(tmpDQ2ID) - # get MoU share - if not self.shareMoUForT2.has_key(tmpDQ2ID): - # site is undefined in t_regions_replication - self.putLog("%s is not in MoU table" % tmpDQ2ID,type='error') - continue - if not self.shareMoUForT2[tmpDQ2ID]['status'] in ['ready']: - # site is not ready - self.putLog("%s is not ready in MoU table" % tmpDQ2ID) - continue - tmpWeight = self.shareMoUForT2[tmpDQ2ID]['weight'] - # skip if the weight is 0 - if tmpWeight == 0: - self.putLog("%s has 0 weight in MoU table" % tmpDQ2ID) - continue - # collect siteIDs and weights for brokerage - t2Candidates.append(tmpCandidate) - t2Weights[tmpCandidate] = tmpWeight - # sort for reproducibility - t2Candidates.sort() - # get free disk size - self.putLog("getting free disk size for T2 %s PD2P" % pd2pType) - retFreeSizeMap,freeSizeMap = self.getFreeDiskSize(tmpDS,t2Candidates) - if not retFreeSizeMap: - self.putLog("failed to get free disk size",type='error',sendLog=True) - return False,None - # run brokerage - tmpJob = JobSpec() - tmpJob.AtlasRelease = '' - self.putLog("run brokerage for T2 with %s for %s" % (pd2pType,tmpDS)) - selectedSite = self.chooseSite(t2Weights,freeSizeMap,dsSize) - self.putLog("site for T2 %s PD2P -> %s" % (pd2pType,selectedSite)) - # simulation - if self.simul: - return True,selectedSite - # no candidate - if selectedSite == None: - self.putLog("no candidate for T2 with %s" % pd2pType) - return False,None - # make subscription - subRet,dq2ID = self.makeSubscription(tmpDS,selectedSite) - tmpTagsMap = {'site':selectedSite,'dataset':tmpDS} - if nUsed != None: - tmpTagsMap['nused'] = nUsed - if nWaitingJobs != None: - tmpTagsMap['nwaitingjobs'] = nWaitingJobs - if nWaitingJobsets != None: - tmpTagsMap['nwaitingjobsets'] = nWaitingJobsets - self.putLog("made subscription for T2 with %s to %s:%s" % (pd2pType,selectedSite,dq2ID),sendLog=True, - actionTag='SELECTEDT2_%s' % pd2pType,tagsMap=tmpTagsMap) - # update database - if subRet: - 
self.taskBuffer.addUserSubscription(tmpDS,[dq2ID]) - return True,selectedSite - else: - return False,None - - - # choose site - def chooseSite(self,canWeights,freeSizeMap,datasetSize): - # loop over all candidates - totalW = 0 - allCandidates = [] - for tmpCan,tmpW in canWeights.iteritems(): - # size check - if freeSizeMap.has_key(tmpCan): - # disk threshold for PD2P max(5%,3TB) - diskThresholdPD2P = 1024 * 3 - thrForThisSite = long(freeSizeMap[tmpCan]['total'] * 5 / 100) - if thrForThisSite < diskThresholdPD2P: - thrForThisSite = diskThresholdPD2P - remSpace = freeSizeMap[tmpCan]['total'] - freeSizeMap[tmpCan]['used'] - if remSpace-datasetSize < thrForThisSite: - self.putLog(' skip: disk shortage %s-%s< %s' % (remSpace,datasetSize,thrForThisSite)) - continue - self.putLog('weight %s %s' % (tmpCan,tmpW)) - # get total weight - totalW += tmpW - # append candidate - allCandidates.append(tmpCan) - # no candidate - if allCandidates == []: - return None - # sort for reproducibility - allCandidates.sort() - # choose site - rNumber = random.random() * totalW - for tmpCan in allCandidates: - rNumber -= canWeights[tmpCan] - if rNumber <= 0: - return tmpCan - return allCandidates[-1] - - diff --git a/current/pandaserver/dataservice/ErrorCode.py b/current/pandaserver/dataservice/ErrorCode.py deleted file mode 100755 index 91faf46e1..000000000 --- a/current/pandaserver/dataservice/ErrorCode.py +++ /dev/null @@ -1,16 +0,0 @@ -############## errror code - -# Setupper -EC_Setupper = 100 - -# Setupper -EC_GUID = 101 - -# Adder -EC_Adder = 200 - -# Subscription failures -EC_Subscription = 201 - -# lost file (=taskbuffer.ErrorCode.EC_LostFile) -EC_LostFile = 110 diff --git a/current/pandaserver/dataservice/EventPicker.py b/current/pandaserver/dataservice/EventPicker.py deleted file mode 100644 index 977be5be5..000000000 --- a/current/pandaserver/dataservice/EventPicker.py +++ /dev/null @@ -1,288 +0,0 @@ -''' -add data to dataset - -''' - -import os -import re -import sys -import 
time -import fcntl -import datetime -import commands -import brokerage.broker -from dataservice import DynDataDistributer -from dataservice.MailUtils import MailUtils -from dataservice.Notifier import Notifier -from taskbuffer.JobSpec import JobSpec -from dataservice.datriHandler import datriHandler - - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('EventPicker') -DynDataDistributer.initLogger(_logger) - - -class EventPicker: - # constructor - def __init__(self,taskBuffer,siteMapper,evpFileName,ignoreError): - self.taskBuffer = taskBuffer - self.siteMapper = siteMapper - self.ignoreError = ignoreError - self.evpFileName = evpFileName - self.token = datetime.datetime.utcnow().isoformat(' ') - self.pd2p = DynDataDistributer.DynDataDistributer([],self.taskBuffer,self.siteMapper, - token=self.token) - self.userDatasetName = '' - self.creationTime = '' - self.params = '' - self.lockedBy = '' - self.evpFile = None - - # main - def run(self): - try: - self.putLog('start %s' % self.evpFileName) - # lock evp file - self.evpFile = open(self.evpFileName) - try: - fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_EX|fcntl.LOCK_NB) - except: - # relase - self.putLog("cannot lock %s" % self.evpFileName) - self.evpFile.close() - return True - # options - runEvtList = [] - eventPickDataType = '' - eventPickStreamName = '' - eventPickDS = [] - eventPickAmiTag = '' - inputFileList = [] - tagDsList = [] - tagQuery = '' - tagStreamRef = '' - # read evp file - for tmpLine in self.evpFile: - tmpMatch = re.search('^([^=]+)=(.+)$',tmpLine) - # check format - if tmpMatch == None: - continue - tmpItems = tmpMatch.groups() - if tmpItems[0] == 'runEvent': - # get run and event number - tmpRunEvt = tmpItems[1].split(',') - if len(tmpRunEvt) == 2: - runEvtList.append(tmpRunEvt) - elif tmpItems[0] == 'eventPickDataType': - # data type - eventPickDataType = tmpItems[1] - elif tmpItems[0] == 'eventPickStreamName': - # 
stream name - eventPickStreamName = tmpItems[1] - elif tmpItems[0] == 'eventPickDS': - # dataset pattern - eventPickDS = tmpItems[1].split(',') - elif tmpItems[0] == 'eventPickAmiTag': - # AMI tag - eventPickAmiTag = tmpItems[1] - elif tmpItems[0] == 'userName': - # user name - self.userDN = tmpItems[1] - self.putLog("user=%s" % self.userDN) - elif tmpItems[0] == 'userDatasetName': - # user dataset name - self.userDatasetName = tmpItems[1] - elif tmpItems[0] == 'lockedBy': - # client name - self.lockedBy = tmpItems[1] - elif tmpItems[0] == 'creationTime': - # creation time - self.creationTime = tmpItems[1] - elif tmpItems[0] == 'params': - # parameters - self.params = tmpItems[1] - elif tmpItems[0] == 'inputFileList': - # input file list - inputFileList = tmpItems[1].split(',') - try: - inputFileList.remove('') - except: - pass - elif tmpItems[0] == 'tagDS': - # TAG dataset - tagDsList = tmpItems[1].split(',') - elif tmpItems[0] == 'tagQuery': - # query for TAG - tagQuery = tmpItems[1] - elif tmpItems[0] == 'tagStreamRef': - # StreamRef for TAG - tagStreamRef = tmpItems[1] - if not tagStreamRef.endswith('_ref'): - tagStreamRef += '_ref' - # convert - if tagDsList == [] or tagQuery == '': - # convert run/event list to dataset/file list - tmpRet,locationMap,allFiles = self.pd2p.convertEvtRunToDatasets(runEvtList, - eventPickDataType, - eventPickStreamName, - eventPickDS, - eventPickAmiTag) - if not tmpRet: - self.endWithError('Failed to convert the run/event list to a dataset/file list') - return False - else: - # get parent dataset/files with TAG - tmpRet,locationMap,allFiles = self.pd2p.getTagParentInfoUsingTagQuery(tagDsList,tagQuery,tagStreamRef) - if not tmpRet: - self.endWithError('Failed to get parent dataset/file list with TAG') - return False - # use only files in the list - if inputFileList != []: - tmpAllFiles = [] - for tmpFile in allFiles: - if tmpFile['lfn'] in inputFileList: - tmpAllFiles.append(tmpFile) - allFiles = tmpAllFiles - # make dataset 
container - tmpRet = self.pd2p.registerDatasetContainerWithDatasets(self.userDatasetName,allFiles,locationMap) - if not tmpRet: - self.endWithError('Failed to make a dataset container %s' % self.userDatasetName) - return False - # get candidates - tmpRet,candidateMaps = self.pd2p.getCandidates(self.userDatasetName,checkUsedFile=False, - useHidden=True) - if not tmpRet: - self.endWithError('Failed to find candidate for destination') - return False - # collect all candidates - allCandidates = [] - for tmpDS,tmpDsVal in candidateMaps.iteritems(): - for tmpCloud,tmpCloudVal in tmpDsVal.iteritems(): - for tmpSiteName in tmpCloudVal[0]: - if not tmpSiteName in allCandidates: - allCandidates.append(tmpSiteName) - if allCandidates == []: - self.endWithError('No candidate for destination') - return False - # get size of dataset container - tmpRet,totalInputSize = self.pd2p.getDatasetSize(self.userDatasetName) - if not tmpRet: - self.endWithError('Failed to get the size of %s' % self.userDatasetName) - return False - # run brokerage - tmpJob = JobSpec() - tmpJob.AtlasRelease = '' - self.putLog("run brokerage for %s" % tmpDS) - brokerage.broker.schedule([tmpJob],self.taskBuffer,self.siteMapper,True,allCandidates, - True,datasetSize=totalInputSize) - if tmpJob.computingSite.startswith('ERROR'): - self.endWithError('brokerage failed with %s' % tmpJob.computingSite) - return False - self.putLog("site -> %s" % tmpJob.computingSite) - # send request to DaTRI - if self.lockedBy.startswith('ganga'): - tmpHandler = datriHandler(type='ganga') - else: - tmpHandler = datriHandler(type='pathena') - # remove redundant CN from DN - tmpDN = self.userDN - tmpDN = re.sub('/CN=limited proxy','',tmpDN) - tmpDN = re.sub('(/CN=proxy)+$','',tmpDN) - tmpMsg = "%s ds=%s site=%s id=%s" % ('datriHandler.sendRequest', - self.userDatasetName, - self.siteMapper.getSite(tmpJob.computingSite).ddm, - tmpDN) - self.putLog(tmpMsg) - tmpHandler.setParameters(data_pattern=self.userDatasetName, - 
site=self.siteMapper.getSite(tmpJob.computingSite).ddm, - userid=tmpDN) - nTry = 3 - for iTry in range(nTry): - dhStatus,dhOut = tmpHandler.sendRequest() - # succeeded - if dhStatus == 0 or "such request is exist" in dhOut: - self.putLog("%s %s" % (dhStatus,dhOut)) - break - if iTry+1 < nTry: - # sleep - time.sleep(60) - else: - # final attempt failed - self.endWithError('Failed to send request to DaTRI : %s %s' % (dhStatus,dhOut)) - return False - # send email notification for success - tmpMsg = 'A transfer request was successfully sent to DaTRI.\n' - tmpMsg += 'You will receive a notification from DaTRI when it completed.' - self.sendEmail(True,tmpMsg) - try: - # unlock and delete evp file - fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN) - self.evpFile.close() - os.remove(self.evpFileName) - except: - pass - # successfully terminated - self.putLog("end %s" % self.evpFileName) - return True - except: - errType,errValue = sys.exc_info()[:2] - self.endWithError('Got exception %s:%s' % (errType,errValue)) - return False - - - # end with error - def endWithError(self,message): - self.putLog(message,'error') - # unlock evp file - try: - fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN) - self.evpFile.close() - if not self.ignoreError: - # remove evp file - os.remove(self.evpFileName) - # send email notification - self.sendEmail(False,message) - except: - pass - self.putLog('end %s' % self.evpFileName) - - - # put log - def putLog(self,msg,type='debug'): - tmpMsg = self.token+' '+msg - if type == 'error': - _logger.error(tmpMsg) - else: - _logger.debug(tmpMsg) - - - # send email notification - def sendEmail(self,isSucceeded,message): - # mail address - toAdder = Notifier(self.taskBuffer,None,[]).getEmail(self.userDN) - if toAdder == '': - self.putLog('cannot find email address for %s' % self.userDN,'error') - return - # subject - mailSubject = "PANDA notification for Event-Picking Request" - # message - mailBody = "Hello,\n\nHere is your request status for event 
picking\n\n" - if isSucceeded: - mailBody += "Status : Passed to DaTRI\n" - else: - mailBody += "Status : Failed\n" - mailBody += "Created : %s\n" % self.creationTime - mailBody += "Ended : %s\n" % datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') - mailBody += "Dataset : %s\n" % self.userDatasetName - mailBody += "\n" - mailBody += "Parameters : %s %s\n" % (self.lockedBy,self.params) - mailBody += "\n" - mailBody += "%s\n" % message - # send - retVal = MailUtils().send(toAdder,mailSubject,mailBody) - # return - return diff --git a/current/pandaserver/dataservice/Finisher.py b/current/pandaserver/dataservice/Finisher.py deleted file mode 100755 index 64d5c30be..000000000 --- a/current/pandaserver/dataservice/Finisher.py +++ /dev/null @@ -1,178 +0,0 @@ -''' -finish transferring jobs - -''' - -import re -import sys -import commands -import threading -from DDM import ddm -from config import panda_config - -from brokerage.SiteMapper import SiteMapper - -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('Finisher') - - -class Finisher (threading.Thread): - # constructor - def __init__(self,taskBuffer,dataset,job=None,site=None): - threading.Thread.__init__(self) - self.dataset = dataset - self.taskBuffer = taskBuffer - self.job = job - self.site = site - - - # main - def run(self): - # start - try: - if self.job == None: - _logger.debug("start: %s" % self.dataset.name) - _logger.debug("callback from %s" % self.site) - # FIXME when callback from BNLPANDA disappeared - if self.site == 'BNLPANDA': - self.site = 'BNL-OSG2_ATLASMCDISK' - # instantiate site mapper - siteMapper = SiteMapper(self.taskBuffer) - # get computingSite/destinationSE - computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name) - if destinationSE == None: - # try to get computingSite/destinationSE from ARCH to delete sub - # even if no active jobs left - computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name,True) - 
if destinationSE == None: - _logger.error("cannot get source/destination for %s" % self.dataset.name) - _logger.debug("end: %s" % self.dataset.name) - return - _logger.debug("src: %s" % computingSite) - _logger.debug("dst: %s" % destinationSE) - # get corresponding token - tmpSrcSiteSpec = siteMapper.getSite(computingSite) - tmpDstSiteSpec = siteMapper.getSite(destinationSE) - _logger.debug(tmpDstSiteSpec.setokens) - destToken = None - for tmpToken,tmpDdmId in tmpDstSiteSpec.setokens.iteritems(): - if self.site == tmpDdmId: - destToken = tmpToken - break - _logger.debug("use Token=%s" % destToken) - # get required tokens - reqTokens = self.taskBuffer.getDestTokens(self.dataset.name) - if reqTokens == None: - _logger.error("cannot get required token for %s" % self.dataset.name) - _logger.debug("end: %s" % self.dataset.name) - return - _logger.debug("req Token=%s" % reqTokens) - # make bitmap for the token - bitMap = 1 - if len(reqTokens.split(','))>1: - for tmpReqToken in reqTokens.split(','): - if tmpReqToken == destToken: - break - # shift one bit - bitMap <<= 1 - # completed bitmap - compBitMap = (1 << len(reqTokens.split(',')))-1 - # ignore the lowest bit for T1, file on DISK is already there - if tmpSrcSiteSpec.ddm == tmpDstSiteSpec.ddm: - compBitMap = compBitMap & 0xFFFE - # update bitmap in DB - updatedBitMap = self.taskBuffer.updateTransferStatus(self.dataset.name,bitMap) - _logger.debug("transfer status:%s - comp:%s - bit:%s" % (hex(updatedBitMap),hex(compBitMap),hex(bitMap))) - # update output files - if (updatedBitMap & compBitMap) == compBitMap: - ids = self.taskBuffer.updateOutFilesReturnPandaIDs(self.dataset.name) - # set flag for T2 cleanup - self.dataset.status = 'cleanup' - self.taskBuffer.updateDatasets([self.dataset]) - else: - _logger.debug("end: %s" % self.dataset.name) - return - else: - _logger.debug("start: %s" % self.job.PandaID) - # update input files - ids = [self.job.PandaID] - _logger.debug("IDs: %s" % ids) - if len(ids) != 0: - # get 
job - if self.job == None: - jobs = self.taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False) - else: - jobs = [self.job] - # loop over all jobs - for job in jobs: - if job == None: - continue - _logger.debug("Job: %s" % job.PandaID) - if job.jobStatus == 'transferring': - jobReady = True - # check file status - for file in job.Files: - if file.type == 'output' or file.type == 'log': - if file.status != 'ready': - _logger.debug("Job: %s file:%s %s != ready" % (job.PandaID,file.lfn,file.status)) - jobReady = False - break - # finish job - if jobReady: - _logger.debug("Job: %s all files ready" % job.PandaID) - # create XML - try: - import xml.dom.minidom - dom = xml.dom.minidom.getDOMImplementation() - doc = dom.createDocument(None,'xml',None) - topNode = doc.createElement("POOLFILECATALOG") - for file in job.Files: - if file.type in ['output','log']: - # File - fileNode = doc.createElement("File") - fileNode.setAttribute("ID",file.GUID) - # LFN - logNode = doc.createElement("logical") - lfnNode = doc.createElement("lfn") - lfnNode.setAttribute('name',file.lfn) - # metadata - fsizeNode = doc.createElement("metadata") - fsizeNode.setAttribute("att_name","fsize") - fsizeNode.setAttribute("att_value",str(file.fsize)) - # checksum - if file.checksum.startswith('ad:'): - # adler32 - chksumNode = doc.createElement("metadata") - chksumNode.setAttribute("att_name","adler32") - chksumNode.setAttribute("att_value",re.sub('^ad:','',file.checksum)) - else: - # md5sum - chksumNode = doc.createElement("metadata") - chksumNode.setAttribute("att_name","md5sum") - chksumNode.setAttribute("att_value",re.sub('^md5:','',file.checksum)) - # append nodes - logNode.appendChild(lfnNode) - fileNode.appendChild(logNode) - fileNode.appendChild(fsizeNode) - fileNode.appendChild(chksumNode) - topNode.appendChild(fileNode) - # write to file - xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,job.PandaID,'finished',commands.getoutput('uuidgen')) - oXML = open(xmlFile,"w") - 
oXML.write(topNode.toxml()) - oXML.close() - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s : %s %s" % (job.PandaID,type,value)) - _logger.debug("Job: %s status: %s" % (job.PandaID,job.jobStatus)) - # end - if self.job == None: - _logger.debug("end: %s" % self.dataset.name) - else: - _logger.debug("end: %s" % self.job.PandaID) - except: - type, value, traceBack = sys.exc_info() - _logger.error("run() : %s %s" % (type,value)) - diff --git a/current/pandaserver/dataservice/MailUtils.py b/current/pandaserver/dataservice/MailUtils.py deleted file mode 100755 index 9a8dfd290..000000000 --- a/current/pandaserver/dataservice/MailUtils.py +++ /dev/null @@ -1,103 +0,0 @@ -''' -email utilities -''' - -import sys -import smtplib - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('MailUtils') - -class MailUtils: - # constructor - def __init__(self): - pass - - # main - def send(self,toAddr,mailSubject,mailBody): - _logger.debug("start SEND session") - try: - # remove duplicated address - listToAddr = [] - newToAddr = '' - for tmpToAddr in toAddr.split(','): - if not tmpToAddr in listToAddr: - listToAddr.append(tmpToAddr) - newToAddr += '%s,' % tmpToAddr - toAddr = newToAddr[:-1] - # make message - fromAdd = panda_config.emailSender - message = \ -"""Subject: %s -From: %s -To: %s - -%s -""" % (mailSubject,fromAdd,toAddr,mailBody) - message = self.addTailer(message) - # send mail - _logger.debug("send to %s\n%s" % (toAddr,message)) - server = smtplib.SMTP(panda_config.emailSMTPsrv) - server.set_debuglevel(1) - server.ehlo() - server.starttls() - #server.login(panda_config.emailLogin,panda_config.emailPass) - out = server.sendmail(fromAdd,listToAddr,message) - _logger.debug(out) - server.quit() - retVal = True - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s %s" % (type,value)) - retVal = False - _logger.debug("end SEND session") - return retVal - - - # 
send update notification to user - def sendSiteAccessUpdate(self,toAddr,newStatus,pandaSite): - # subject - mailSubject = "PANDA Update on Access Request for %s" % pandaSite - # message - mailBody = "Hello,\n\nYour access request for %s has been %s \n" % (pandaSite,newStatus.upper()) - # send - retVal = self.send(toAddr,mailSubject,mailBody) - # return - return retVal - - - # send requests to cloud responsible - def sendSiteAccessRequest(self,toAddr,requestsMap,cloud): - # subject - mailSubject = "PANDA Access Requests in %s" % cloud - # message - mailBody = "Hello,\n\nThere are access requests to be approved or rejected.\n\n" - for pandaSite,userNames in requestsMap.iteritems(): - mailBody += " %s\n" % pandaSite - userStr = '' - for userName in userNames: - userStr += ' %s,' % userName - userStr = userStr[:-1] - mailBody += " %s\n\n" % userStr - # send - retVal = self.send(toAddr,mailSubject,mailBody) - # return - return retVal - - - # add tailer - def addTailer(self,msg): - msg += """ -Report Panda problems of any sort to - - the eGroup for help request - hn-atlas-dist-analysis-help@cern.ch - - the Savannah for software bug - https://savannah.cern.ch/projects/panda/ -""" - return msg - diff --git a/current/pandaserver/dataservice/Merger.py b/current/pandaserver/dataservice/Merger.py deleted file mode 100644 index b8e1d60e5..000000000 --- a/current/pandaserver/dataservice/Merger.py +++ /dev/null @@ -1,692 +0,0 @@ -''' -merge files in dataset - -''' - -import re -import sys -import time -import commands - -import dq2.common -from dq2.clientapi import DQ2 -import dq2.container.exceptions - -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('Merger') - - -class Merger: - - # constructor - def __init__(self,taskBuffer,job,simulFlag=False,noSubmit=False): - self.taskBuffer = taskBuffer - self.job = job - 
self.mergeType = "" - self.mergeScript = "" - self.runDir = "." - self.mergeTypeMap = {} - self.supportedMergeType = ['hist','ntuple','pool','user','log','text'] - self.simulFlag = simulFlag - self.noSubmit = noSubmit - self.dsContMergeLog = "" - self.fileDestSeMap = {} - - - # parse jobParameters and get mergeType specified by the client - def getMergeType(self): - type = "" - try: - paramList = re.split('\W+',self.job.jobParameters.strip()) - type = paramList[ paramList.index('mergeType') + 1 ] - except: - _logger.debug("%s cannot find --mergeType parameter from parent job" % self.job.PandaID) - return type - - - # parse jobParameters and get mergeScript specified by the client - def getUserMergeScript(self): - script = "" - try: - match = re.search("--mergeScript\s(([^\'\"\s]+)|(\"[^\"]+\")|(\'[^\']+\'))",self.job.jobParameters) - if match != None: - script = match.group(1) - except: - _logger.debug("%s cannot find --mergeScript parameter from parent job" % self.job.PandaID) - return script - - # parse jobParameters and get rundir specified by the client - def getRunDir(self): - rundir = "." 
- try: - m = re.match(r'.*\-r\s+(\S+)\s+.*', self.job.jobParameters.strip()) - if m: - rundir = re.sub(r'[\'"]','',m.group(1)) - except: - _logger.debug("%s cannot find -r parameter from parent job" % self.job.PandaID) - return rundir - - # parse jobParameters and get ROOT version - def getRootVer(self): - ver = "" - try: - m = re.match(r'.*\--rootVer\s+(\S+)\s+.*', self.job.jobParameters.strip()) - if m: - ver = m.group(1) - except: - _logger.debug("%s cannot find --rootVer parameter from parent job" % self.job.PandaID) - return ver - - # get file type - def getFileType(self,tmpLFN): - tmpLFN = re.sub('\.\d+$','',tmpLFN) - tmpMatch = re.search('^(.+)\._\d+\.(.+)$',tmpLFN) - if tmpMatch != None: - return (tmpMatch.group(1),tmpMatch.group(2)) - return None - - - # parse jobSpec to get merge type automatically - def getMergeTypeAuto(self): - # look for outmap - try: - tmpMatch = re.search('-o \"([^\"]+)\"',self.job.jobParameters) - outMapStr = tmpMatch.group(1) - exec "outMap="+outMapStr - except: - errType,errValue = sys.exc_info()[:2] - _logger.debug("%s cannot extract outMap from jobParameters=%s %s:%s" % \ - (self.job.PandaID,self.job.jobParameters,errType,errValue)) - return False - # convert output type to merge type - if '/runGen-' in self.job.transformation: - # loop over all output files for runGen - for oldName,newName in outMap.iteritems(): - # get file type - tmpKey = self.getFileType(newName) - if tmpKey != None: - # check extension - if re.search('\.pool\.root(\.\d+)*$',newName) != None: - # POOL - tmpType = 'pool' - elif re.search('\.root(\.\d+)*$',newName) != None: - # map all root files to ntuple - tmpType = 'ntuple' - else: - # catch all using zip - tmpType = 'text' - # append - self.mergeTypeMap[tmpKey] = tmpType - else: - # hist - if outMap.has_key('hist'): - tmpType = 'hist' - tmpKey = self.getFileType(outMap['hist']) - if tmpKey != None: - # append - self.mergeTypeMap[tmpKey] = tmpType - # ntuple - if outMap.has_key('ntuple'): - tmpType = 
'ntuple' - for sName,fName in outMap['ntuple']: - tmpKey = self.getFileType(fName) - if tmpKey != None: - # append - self.mergeTypeMap[tmpKey] = tmpType - # AANT - if outMap.has_key('AANT'): - # map AANT to ntuple for now - tmpType = 'ntuple' - for aName,sName,fName in outMap['AANT']: - tmpKey = self.getFileType(fName) - if tmpKey != None: - # append - self.mergeTypeMap[tmpKey] = tmpType - # THIST - if outMap.has_key('THIST'): - tmpType = 'ntuple' - for aName,fName in outMap['THIST']: - tmpKey = self.getFileType(fName) - if tmpKey != None: - # append only when the stream is not used by AANT - if not self.mergeTypeMap.has_key(tmpKey): - self.mergeTypeMap[tmpKey] = tmpType - # POOL - for tmpOutType,tmpOutVal in outMap.iteritems(): - # TAG is mapped to POOL for now - if tmpOutType in ['RDO','ESD','AOD','TAG','Stream1','Stream2']: - tmpType = 'pool' - tmpKey = self.getFileType(tmpOutVal) - if tmpKey != None: - # append - self.mergeTypeMap[tmpKey] = tmpType - # general POOL stream - if outMap.has_key('StreamG'): - tmpType = 'pool' - for sName,fName in outMap['StreamG']: - tmpKey = self.getFileType(fName) - if tmpKey != None: - # append - self.mergeTypeMap[tmpKey] = tmpType - # meta - if outMap.has_key('Meta'): - tmpType = 'pool' - for sName,fName in outMap['Meta']: - tmpKey = self.getFileType(fName) - if tmpKey != None: - # append only when the stream is not used by another - if not self.mergeTypeMap.has_key(tmpKey): - self.mergeTypeMap[tmpKey] = tmpType - # UserData - if outMap.has_key('UserData'): - tmpType = 'pool' - for fName in outMap['UserData']: - tmpKey = self.getFileType(fName) - if tmpKey != None: - # append - self.mergeTypeMap[tmpKey] = tmpType - # BS - if outMap.has_key('BS'): - # ByteStream is mapped to text to use zip for now - tmpType = 'text' - tmpKey = self.getFileType(outMap['BS']) - if tmpKey != None: - # append - self.mergeTypeMap[tmpKey] = tmpType - # extra outputs - if outMap.has_key('IROOT'): - for oldName,newName in outMap['IROOT']: - tmpKey = 
self.getFileType(newName) - if tmpKey != None: - # check extension - if re.search('\.pool\.root(\.\d+)*$',newName) != None: - # POOL - tmpType = 'pool' - elif re.search('\.root(\.\d+)*$',newName) != None: - # map all root files to ntuple - tmpType = 'ntuple' - else: - # catch all using zip - tmpType = 'text' - # append - self.mergeTypeMap[tmpKey] = tmpType - # dump - _logger.debug("%s automatic merge type mapping -> %s" % (self.job.PandaID,str(self.mergeTypeMap))) - return True - - - # detect merge type with LFN prefix and suffix - def detectMergeTypeWithLFN(self,filePrefix,fileSuffix): - tmpKey = (filePrefix,fileSuffix) - if self.mergeTypeMap.has_key(tmpKey): - return self.mergeTypeMap[tmpKey] - # look for matching fileSuffix mainly for --useContElement which has differed prefix - for tmpKey in self.mergeTypeMap.keys(): - tmpFilePrefix,tmpFileSuffix = tmpKey - if tmpFileSuffix == fileSuffix: - _logger.debug("%s updated merge type mapping for %s:%s -> %s" % (self.job.PandaID,filePrefix,fileSuffix,str(self.mergeTypeMap))) - self.mergeTypeMap[(filePrefix,fileSuffix)] = self.mergeTypeMap[tmpKey] - return self.mergeTypeMap[tmpKey] - raise RuntimeError,'cannot find merge type for %s %s' % (filePrefix,fileSuffix) - - - # main returns None for unrecoverable - def run(self): - try: - _logger.debug("%s start" % self.job.PandaID) - # check source label - if not self.job.prodSourceLabel in ['user',]: - _logger.debug("%s do nothing for non-user job" % self.job.PandaID) - _logger.debug("%s end" % self.job.PandaID) - return None - # check command-line parameter - if not self.simulFlag and not "--mergeOutput" in self.job.jobParameters: - _logger.debug("%s skip no-merge" % self.job.PandaID) - _logger.debug("%s end" % self.job.PandaID) - return None - # get mergeType from jobParams - self.mergeType = self.getMergeType() - self.mergeScript = self.getUserMergeScript() - - # if mergeScript is given by user, it's equivalent to user mode mergeType - if self.mergeScript: - self.mergeType 
= 'user' - - if self.mergeType != '': - # check if the merging type is given and is supported - if self.mergeType not in self.supportedMergeType: - _logger.error("%s skip not supported merging type \"%s\"" % (self.job.PandaID, self.mergeType)) - _logger.debug("%s end" % self.job.PandaID) - return None - elif self.mergeType in ['user']: - self.runDir = self.getRunDir() - if not self.mergeScript: - _logger.error("%s skip: no merging command specified for merging type \"%s\"" % (self.job.PandaID, self.mergeType)) - _logger.debug("%s end" % self.job.PandaID) - return None - else: - # automatic merge type detection - tmpRet = self.getMergeTypeAuto() - if not tmpRet: - _logger.error("%s failed to detect merge type automatically" % self.job.PandaID) - _logger.debug("%s end" % self.job.PandaID) - return None - # instantiate DQ2 - self.dq2api = DQ2.DQ2() - # get list of datasets - dsList = [] - dsSubDsMap = {} - for tmpFile in self.job.Files: - # use output/log - if not tmpFile.type in ['log','output']: - continue - tmpContName = tmpFile.dataset - # extend logfile container name with ".merge.log" for storing logs of the merging operation - if tmpFile.type == 'log' and not self.dsContMergeLog: - self.dsContMergeLog = re.sub('/$','.merge.log/',tmpFile.dataset) - tmpSubDsName = tmpFile.destinationDBlock - # remove _sub - tmpDsName = re.sub('_sub\d+$','',tmpSubDsName) - tmpKey = (tmpContName,tmpDsName) - if not tmpKey in dsList: - dsList.append(tmpKey) - dsSubDsMap[tmpDsName] = tmpSubDsName - # get type - tmpMatch = self.getFileType(tmpFile.lfn) - if tmpMatch != None: - self.fileDestSeMap[tmpMatch] = tmpFile.destinationSE - # loop over all datasets - mergeJobList = {} - for tmpContName,tmpDsName in dsList: - # check prefix - if (not tmpDsName.startswith('user')) and (not tmpDsName.startswith('group')): - _logger.debug("%s ignore non-user/group DS %s" % (self.job.PandaID,tmpDsName)) - continue - # get list of files - _logger.debug("%s listFilesInDataset %s" % 
(self.job.PandaID,tmpDsName)) - tmpAllFileMap = {} - nTry = 3 - for iTry in range(nTry): - try: - tmpRetTimeStamp = self.dq2api.listFilesInDataset(tmpDsName) - except DQ2.DQUnknownDatasetException: - _logger.error("%s DQ2 doesn't know %s" % (self.job.PandaID,tmpDsName)) - _logger.debug("%s end" % self.job.PandaID) - return None - except: - if (iTry+1) == nTry: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s DQ2 failed with %s:%s to get file list for %s" % (self.job.PandaID,errType,errValue,tmpDsName)) - _logger.debug("%s end" % self.job.PandaID) - return False - # sleep - time.sleep(60) - # empty - if tmpRetTimeStamp == (): - # close dataset - varMap = {} - varMap[':name'] = tmpDsName - varMap[':status'] = 'tobeclosed' - uSQL = "UPDATE /*+ INDEX(tab DATASETS_NAME_IDX)*/ ATLAS_PANDA.Datasets " - uSQL += "SET status=:status,modificationdate=CURRENT_DATE WHERE name=:name " - self.taskBuffer.querySQLS(uSQL,varMap) - _logger.debug("%s %s is empty" % (self.job.PandaID,tmpDsName)) - continue - # loop over all GUIDs - tmpRet,tmpTimeStamp = tmpRetTimeStamp - for tmpGUID,tmpVal in tmpRet.iteritems(): - # set GUID - tmpVal['guid'] = tmpGUID - # get type - tmpMatch = self.getFileType(tmpVal['lfn']) - if tmpMatch == None: - _logger.error("%s cannot get type for %s" % (self.job.PandaID,tmpVal['lfn'])) - _logger.debug("%s end" % self.job.PandaID) - return None - tmpType = (tmpMatch[0],tmpMatch[1],tmpContName,tmpDsName) - # append - if not tmpAllFileMap.has_key(tmpType): - tmpAllFileMap[tmpType] = {} - tmpAllFileMap[tmpType][tmpVal['lfn']] = tmpVal - # max size of merged file - maxMergedFileSize = 5 * 1024 * 1024 * 1024 - # max number of files to be merged - maxNumToBeMerged = 200 - # loop over all types - for tmpType,tmpFileMap in tmpAllFileMap.iteritems(): - # sort LFNs - tmpFileList = tmpFileMap.keys() - tmpFileList.sort() - # split by size - subTotalSize = 0 - subFileList = [] - for tmpFileName in tmpFileList: - if (subTotalSize+tmpFileMap[tmpFileName]['filesize'] 
> maxMergedFileSize and subFileList != []) \ - or len(subFileList) >= maxNumToBeMerged: - # instantiate job - tmpMergeJob = self.makeMergeJob(subFileList,tmpFileMap,tmpType) - # append - if not mergeJobList.has_key(tmpDsName): - mergeJobList[tmpDsName] = [] - mergeJobList[tmpDsName].append(tmpMergeJob) - # reset - subTotalSize = 0 - subFileList = [] - # append - subTotalSize += tmpFileMap[tmpFileName]['filesize'] - subFileList.append(tmpFileName) - # remaining - if subFileList != []: - # instantiate job - tmpMergeJob = self.makeMergeJob(subFileList,tmpFileMap,tmpType) - # append - if not mergeJobList.has_key(tmpDsName): - mergeJobList[tmpDsName] = [] - mergeJobList[tmpDsName].append(tmpMergeJob) - # terminate simulation - if self.simulFlag and not self.noSubmit: - _logger.debug("%s end simulation" % self.job.PandaID) - return True - # get list of new datasets - newDatasetMap = {} - for tmpDsName,tmpJobList in mergeJobList.iteritems(): - # loop over all files - for tmpFile in tmpJobList[0].Files: - # ignore inputs - if not tmpFile.type in ['output','log']: - continue - # append - if not newDatasetMap.has_key(tmpFile.dataset): - newDatasetMap[tmpFile.dataset] = [] - if not tmpFile.destinationDBlock in newDatasetMap[tmpFile.dataset]: - newDatasetMap[tmpFile.dataset].append(tmpFile.destinationDBlock) - # remove /CN=proxy and /CN=limited from DN - tmpRealDN = self.job.prodUserID - tmpRealDN = re.sub('/CN=limited proxy','',tmpRealDN) - tmpRealDN = re.sub('/CN=proxy','',tmpRealDN) - tmpRealDN = dq2.common.parse_dn(tmpRealDN) - # register container for merge log files - if self.dsContMergeLog: - # register new container for the logs of merging operation - _logger.debug("%s registerContainer %s" % (self.job.PandaID, self.dsContMergeLog)) - nTry = 3 - unRecoverable = False - for iTry in range(nTry): - try: - self.dq2api.registerContainer(self.dsContMergeLog) - break - except DQ2.DQDatasetExistsException: - break - except: - errType,errValue = sys.exc_info()[:2] - if 'exceeds 
the maximum length' in str(errValue): - unRecoverable = True - if unRecoverable or (iTry+1) == nTry: - _logger.error("%s DQ2 failed with %s:%s to register new container %s" % (self.job.PandaID,errType,errValue,self.dsContMergeLog)) - _logger.debug("%s end" % self.job.PandaID) - if unRecoverable: - return None - return False - # sleep - time.sleep(60) - # set container owner - _logger.debug("%s setMetaDataAttribute %s %s" % (self.job.PandaID, self.dsContMergeLog, tmpRealDN)) - nTry = 3 - for iTry in range(nTry): - try: - self.dq2api.setMetaDataAttribute(self.dsContMergeLog, 'owner', tmpRealDN) - except: - if (iTry+1) == nTry: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s DQ2 failed with %s:%s to set owner for %s" % (self.job.PandaID,errType,errValue,self.dsContMergeLog)) - _logger.debug("%s end" % self.job.PandaID) - return False - # sleep - time.sleep(60) - # register datasets - for tmpDsContainer,tmpNewDatasets in newDatasetMap.iteritems(): - # loop over all datasets - for tmpNewDS in tmpNewDatasets: - # register - _logger.debug("%s registerNewDataset %s" % (self.job.PandaID,tmpNewDS)) - nTry = 3 - for iTry in range(nTry): - try: - self.dq2api.registerNewDataset(tmpNewDS) - except DQ2.DQDatasetExistsException: - pass - except: - if (iTry+1) == nTry: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s DQ2 failed with %s:%s to register %s" % (self.job.PandaID,errType,errValue,tmpNewDS)) - _logger.debug("%s end" % self.job.PandaID) - return False - # sleep - time.sleep(60) - # set owner - _logger.debug("%s setMetaDataAttribute %s %s" % (self.job.PandaID,tmpNewDS,tmpRealDN)) - nTry = 3 - for iTry in range(nTry): - try: - self.dq2api.setMetaDataAttribute(tmpNewDS,'owner',tmpRealDN) - except: - if (iTry+1) == nTry: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s DQ2 failed with %s:%s to set owner for %s" % (self.job.PandaID,errType,errValue,tmpNewDS)) - _logger.debug("%s end" % self.job.PandaID) - return False - # sleep - time.sleep(60) 
- # add to container - if tmpDsContainer.endswith('/'): - # add - _logger.debug("%s registerDatasetsInContainer %s %s" % (self.job.PandaID,tmpDsContainer,str(tmpNewDatasets))) - nTry = 3 - for iTry in range(nTry): - try: - self.dq2api.registerDatasetsInContainer(tmpDsContainer,tmpNewDatasets) - break - except dq2.container.exceptions.DQContainerAlreadyHasDataset: - break - except: - if (iTry+1) == nTry: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s DQ2 failed with %s:%s to add datasets to %s" % (self.job.PandaID,errType,errValue,tmpDsContainer)) - _logger.debug("%s end" % self.job.PandaID) - return False - # sleep - time.sleep(60) - # no submission - if self.noSubmit: - _logger.debug("%s end with no submission" % self.job.PandaID) - return True - # submit new jobs - _logger.debug("%s submit jobs" % self.job.PandaID) - # fake FQANs - fqans = [] - if not self.job.countryGroup in ['','NULL',None]: - fqans.append('/atlas/%s/Role=NULL' % self.job.countryGroup) - if self.job.destinationDBlock.startswith('group') and not self.job.workingGroup in ['','NULL',None]: - fqans.append('/atlas/%s/Role=production' % self.job.workingGroup) - # insert jobs - for tmpDsName,tmpJobList in mergeJobList.iteritems(): - ret = self.taskBuffer.storeJobs(tmpJobList,self.job.prodUserID,True,False,fqans, - self.job.creationHost,True,checkSpecialHandling=False) - if ret == []: - _logger.error("%s storeJobs failed with [] for %s" % (self.job.PandaID,tmpDsName)) - _logger.debug("%s end" % self.job.PandaID) - return False - else: - # set jobDefID - tmpJobDefID = ret[0][1] - if not tmpJobDefID in ['NULL','',None,-1]: - varMap = {} - varMap[':name'] = dsSubDsMap[tmpDsName] - varMap[':moverID'] = tmpJobDefID - uSQL = "UPDATE /*+ INDEX(tab DATASETS_NAME_IDX)*/ ATLAS_PANDA.Datasets " - uSQL += "SET moverID=:moverID WHERE name=:name " - self.taskBuffer.querySQLS(uSQL,varMap) - # dump - strPandaIDs = '' - for tmpItem in ret: - strPandaIDs += '%s,' % tmpItem[0] - _logger.debug("%s jobDefID=%s 
mergeJobs=%s" % (self.job.PandaID,tmpJobDefID,strPandaIDs[:-1])) - # return - _logger.debug("%s end" % self.job.PandaID) - return True - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s failed with %s:%s" % (self.job.PandaID,errType,errValue)) - _logger.debug("%s end" % self.job.PandaID) - return None - - - # make merge job - def makeMergeJob(self,fileList,fileMap,fileType): - # make job spec - tmpJob = JobSpec() - # set release and cache - if not self.job.AtlasRelease in ['','NULL',None]: - tmpJob.AtlasRelease = self.job.AtlasRelease - if not self.job.homepackage in ['','NULL',None]: - tmpJob.homepackage = self.job.homepackage - tmpJob.prodSourceLabel = 'user' - tmpJob.prodUserID = self.job.prodUserID - tmpJob.assignedPriority = 5000 - tmpJob.jobName = 'usermerge.%s' % commands.getoutput('uuidgen') - tmpJob.computingSite = self.job.computingSite - tmpJob.metadata = self.job.metadata - tmpJob.prodDBlock = self.job.prodDBlock - tmpJob.destinationDBlock = self.job.destinationDBlock - tmpJob.destinationSE = self.job.destinationSE - tmpJob.cloud = self.job.cloud - tmpJob.cmtConfig = self.job.cmtConfig - tmpJob.lockedby = self.job.lockedby - tmpJob.processingType = 'usermerge' - tmpJob.jobsetID = self.job.jobsetID - tmpJob.jobDefinitionID = 0 - tmpJob.transformation = "http://pandaserver.cern.ch:25080/trf/user/runMerge-00-00-01" - # decompose fileType - filePrefix,fileSuffix,containerName,datasetName = fileType - fileTypeKey = (filePrefix,fileSuffix) - # output dataset name - outDsName = datasetName+'.merge' - # job parameter - params = '--parentDS %s --parentContainer %s --outDS %s' % (datasetName,containerName,outDsName) - # look for lib.tgz - for tmpLibFile in self.job.Files: - if tmpLibFile.type == 'input' and tmpLibFile.lfn.endswith('.lib.tgz'): - tmpFile = FileSpec() - tmpFile.lfn = tmpLibFile.lfn - tmpFile.GUID = tmpLibFile.GUID - tmpFile.fsize = tmpLibFile.fsize - tmpFile.md5sum = tmpLibFile.md5sum - tmpFile.checksum = tmpLibFile.checksum - 
tmpFile.dataset = tmpLibFile.dataset - tmpFile.prodDBlock = tmpLibFile.prodDBlock - tmpFile.type = 'input' - tmpFile.status = 'ready' - tmpFile.prodDBlockToken = 'local' - tmpJob.addFile(tmpFile) - params += " --libTgz %s" % tmpFile.lfn - break - # reverse sort to use the largest SN in merged LFN, which is required to find SN offset when outDS is reused - fileList.reverse() - # input - serNum = None - attNum = None - for tmpFileName in fileList: - # extract serial number - if serNum == None: - tmpMatch = re.search('^'+filePrefix+'\.(_\d+)\.'+fileSuffix,tmpFileName) - if tmpMatch == None: - raise RuntimeError,'cannot extract SN from %s' % tmpFileName - serNum = tmpMatch.group(1) - # extract attempt number - tmpMatch = re.search('\.(\d+)$',tmpFileName) - if tmpMatch != None: - attNum = tmpMatch.group(1) - # make file spec - tmpFile = FileSpec() - vals = fileMap[tmpFileName] - tmpFile.lfn = tmpFileName - tmpFile.GUID = vals['guid'] - tmpFile.fsize = vals['filesize'] - tmpFile.md5sum = vals['checksum'] - tmpFile.checksum = vals['checksum'] - tmpFile.dataset = containerName - tmpFile.prodDBlock = tmpFile.dataset - tmpFile.type = 'input' - tmpFile.status = 'ready' - tmpFile.prodDBlockToken = 'local' - tmpJob.addFile(tmpFile) - - # merge type determination - if fileSuffix.endswith('log.tgz'): - # log - usedMergeType = 'log' - elif self.mergeType != '': - # user specified merging type - usedMergeType = self.mergeType - else: - # auto detection - usedMergeType = self.detectMergeTypeWithLFN(filePrefix,fileSuffix) - - if usedMergeType in ['user']: - ## run user mode merging given the merging script - params += ' -j %s -r %s' % (self.mergeScript, self.runDir) - - params += " -t %s" % usedMergeType - params += " -i \"%s\"" % repr(fileList) - - if self.getRootVer(): - params += " --rootVer %s" % self.getRootVer() - - if self.job.jobParameters.find('--useRootCore') >= 0: - params += " --useRootCore" - - # output - tmpFile = FileSpec() - if attNum == None: - tmpFile.lfn = 
"%s.%s.merge.%s" % (filePrefix,serNum,fileSuffix) - else: - tmpFile.lfn = "%s.%s.%s.merge.%s" % (filePrefix,serNum,attNum,fileSuffix) - - if usedMergeType == 'text' and \ - not tmpFile.lfn.endswith('.tgz') and \ - not tmpFile.lfn.endswith('.tar.gz'): - tmpFile.lfn += '.tgz' - tmpFile.destinationDBlock = outDsName - if self.fileDestSeMap.has_key(fileTypeKey): - tmpFile.destinationSE = self.fileDestSeMap[fileTypeKey] - else: - tmpFile.destinationSE = self.job.destinationSE - tmpFile.dataset = containerName - tmpFile.type = 'output' - tmpJob.addFile(tmpFile) - params += ' -o "%s"' % tmpFile.lfn - # log - tmpItems = filePrefix.split('.') - if len(tmpItems) > 3: - logPrefix = "%s.%s.%s" % tuple(tmpItems[:3]) - else: - logPrefix = filePrefix - tmpFile = FileSpec() - tmpFile.lfn = '%s._$PANDAID.log.tgz' % logPrefix - tmpFile.destinationDBlock = outDsName + ".log" - tmpFile.destinationSE = tmpJob.computingSite - tmpFile.dataset = self.dsContMergeLog - tmpFile.type = 'log' - tmpJob.addFile(tmpFile) - # set job parameter - tmpJob.jobParameters = params - if self.simulFlag: - _logger.debug("%s prams %s" % (self.job.PandaID,tmpJob.jobParameters)) - # return - return tmpJob diff --git a/current/pandaserver/dataservice/Notifier.py b/current/pandaserver/dataservice/Notifier.py deleted file mode 100755 index 44aa7cdcf..000000000 --- a/current/pandaserver/dataservice/Notifier.py +++ /dev/null @@ -1,396 +0,0 @@ -''' -notifier - -''' - -import re -import sys -import fcntl -import commands -import threading -import urllib -import shelve -import smtplib -import datetime -import time - -from config import panda_config -from taskbuffer.OraDBProxy import DBProxy -from pandalogger.PandaLogger import PandaLogger -from dataservice.DDM import dq2Info -import taskbuffer.ErrorCode - -# logger -_logger = PandaLogger().getLogger('Notifier') - -# lock file -_lockGetMail = open(panda_config.lockfile_getMail, 'w') - -# ignored DN -_ignoreList = [ - 'Nurcan Ozturk', - 'Xin Zhao', - 'Dietrich Liko', - 
] - -# NG words in email address -_ngWordsInMailAddr = ['support','system','stuff','service','secretariat','club','user','admin', - 'cvs','grid','librarian','svn','atlas','cms','lhcb','alice','alaelp'] - -# port for SMTP server -smtpPortList = [25,587] - -def initLogger(pLogger): - # redirect logging to parent as it doesn't work in nested threads - global _logger - _logger = pLogger - - -# wrapper to patch smtplib.stderr to send debug info to logger -class StderrLogger(object): - def __init__(self,token): - self.token = token - def write(self,message): - message = message.strip() - if message != '': - _logger.debug('%s %s' % (self.token,message)) - - -class Notifier: - # constructor - def __init__(self,taskBuffer,job,datasets,summary={},mailFile=None,mailFileName=''): - self.job = job - self.datasets = datasets - self.taskBuffer = taskBuffer - self.summary = summary - self.mailFile = mailFile - self.mailFileName = mailFileName - - # main - def run(self): - if self.mailFile == None: - _logger.debug("%s start" % self.job.PandaID) - try: - # check job type - if self.job.prodSourceLabel != 'user' and self.job.prodSourceLabel != 'panda': - _logger.error("Invalid job type : %s" % self.job.prodSourceLabel) - _logger.debug("%s end" % self.job.PandaID) - return - # ignore some DNs to avoid mail storm - for igName in _ignoreList: - if re.search(igName,self.job.prodUserID) != None: - _logger.debug("Ignore DN : %s" % self.job.prodUserID) - _logger.debug("%s end" % self.job.PandaID) - return - # get e-mail address - mailAddr = self.getEmail(self.job.prodUserID) - if mailAddr == '': - _logger.error("could not find email address for %s" % self.job.prodUserID) - _logger.debug("%s end" % self.job.PandaID) - return - # not send - if mailAddr in ['notsend','',None]: - _logger.debug("not send to %s" % self.job.prodUserID) - _logger.debug("%s end" % self.job.PandaID) - return - # use all datasets - if self.summary != {}: - self.datasets = [] - for tmpJobID,tmpDsList in 
self.summary.iteritems(): - if tmpDsList == []: - continue - self.datasets += tmpDsList - # get full jobSpec including metadata - self.job = self.taskBuffer.peekJobs([self.job.PandaID],fromDefined=False, - fromActive=False,fromWaiting=False)[0] - if self.job == None: - _logger.error('%s : not found in DB' % self.job.PandaID) - _logger.debug("%s end" % self.job.PandaID) - return - # get IDs - ids = [] - # from active tables - tmpIDs = self.taskBuffer.queryPandaIDwithDataset(self.datasets) - for tmpID in tmpIDs: - if not tmpID in ids: - ids.append(tmpID) - # from archived table - if self.job.jobsetID in [0,'NULL',None]: - tmpIDs = self.taskBuffer.getPandIDsWithIdInArch(self.job.prodUserName,self.job.jobDefinitionID,False) - else: - tmpIDs = self.taskBuffer.getPandIDsWithIdInArch(self.job.prodUserName,self.job.jobsetID,True) - for tmpID in tmpIDs: - if not tmpID in ids: - ids.append(tmpID) - _logger.debug("%s IDs: %s" % (self.job.PandaID,ids)) - if len(ids) != 0: - # get jobs - jobs = self.taskBuffer.getFullJobStatus(ids,fromDefined=False,fromActive=False, - fromWaiting=False,forAnal=False) - # statistics - nTotal = 0 - nSucceeded = 0 - nFailed = 0 - nPartial = 0 - nCancel = 0 - # time info - creationTime = self.job.creationTime - endTime = self.job.modificationTime - if isinstance(endTime,datetime.datetime): - endTime = endTime.strftime('%Y-%m-%d %H:%M:%S') - # datasets - iDSList = [] - oDSList = [] - siteMap = {} - logDS = None - for tmpJob in jobs: - if not siteMap.has_key(tmpJob.jobDefinitionID): - siteMap[tmpJob.jobDefinitionID] = tmpJob.computingSite - for file in tmpJob.Files: - if file.type == 'input': - if not file.dataset in iDSList: - iDSList.append(file.dataset) - else: - if not file.dataset in oDSList: - oDSList.append(file.dataset) - if file.type == 'log': - logDS = file.dataset - # job/jobset IDs and site - if self.summary == {}: - jobIDsite = "%s/%s" % (self.job.jobDefinitionID,self.job.computingSite) - jobsetID = self.job.jobDefinitionID - 
jobDefIDList = [self.job.jobDefinitionID] - else: - jobDefIDList = self.summary.keys() - jobDefIDList.sort() - jobIDsite = '' - tmpIndent = " " - for tmpJobID in jobDefIDList: - jobIDsite += '%s/%s\n%s' % (tmpJobID,siteMap[tmpJobID],tmpIndent) - remCount = len(tmpIndent) + 1 - jobIDsite = jobIDsite[:-remCount] - jobsetID = self.job.jobsetID - # count - for job in jobs: - if job == None: - continue - # ignore pilot-retried job - if job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]: - continue - # total - nTotal += 1 - # count per job status - if job.jobStatus == 'finished': - # check all files were used - allUses = True - for file in job.Files: - if file.type == 'input' and file.status in ['skipped']: - allUses = False - break - if allUses: - nSucceeded += 1 - else: - nPartial += 1 - elif job.jobStatus == 'failed': - nFailed += 1 - elif job.jobStatus == 'cancelled': - nCancel += 1 - # make message - if nSucceeded == nTotal: - finalStatInSub = "(All Succeeded)" - else: - finalStatInSub = "(%s/%s Succeeded)" % (nSucceeded,nTotal) - fromadd = panda_config.emailSender - if self.job.jobsetID in [0,'NULL',None]: - message = \ -"""Subject: PANDA notification for JobID : %s %s -From: %s -To: %s - -Summary of JobID : %s - -Site : %s""" % (self.job.jobDefinitionID,finalStatInSub,fromadd,mailAddr,self.job.jobDefinitionID,self.job.computingSite) - else: - message = \ -"""Subject: PANDA notification for JobsetID : %s %s -From: %s -To: %s - -Summary of JobsetID : %s - -JobID/Site : %s""" % (jobsetID,finalStatInSub,fromadd,mailAddr,jobsetID,jobIDsite) - message += \ -""" - -Created : %s (UTC) -Ended : %s (UTC) - -Total Number of Jobs : %s - Succeeded : %s - Partial : %s - Failed : %s - Cancelled : %s -""" % (creationTime,endTime,nTotal,nSucceeded,nPartial,nFailed,nCancel) - # input datasets - for iDS in iDSList: - message += \ -""" -In : %s""" % iDS - # output datasets - for oDS in oDSList: - message += \ -""" -Out : %s""" % oDS - # command - if not 
self.job.metadata in ['','NULL',None]: - message += \ -""" - -Parameters : %s""" % self.job.metadata - # URLs to PandaMon - if self.job.jobsetID in [0,'NULL',None]: - for tmpIdx,tmpJobID in enumerate(jobDefIDList): - urlData = {} - urlData['job'] = '*' - urlData['jobDefinitionID'] = tmpJobID - urlData['user'] = self.job.prodUserName - urlData['at'] = (str(creationTime)).split()[0] - if tmpIdx == 0: - message += \ -""" - -PandaMonURL : http://panda.cern.ch/server/pandamon/query?%s""" % urllib.urlencode(urlData) - else: - message += \ -""" - http://panda.cern.ch/server/pandamon/query?%s""" % urllib.urlencode(urlData) - else: - urlData = {} - urlData['job'] = '*' - urlData['jobsetID'] = self.job.jobsetID - urlData['user'] = self.job.prodUserName - urlData['at'] = (str(creationTime)).split()[0] - message += \ -""" - -PandaMonURL : http://panda.cern.ch/server/pandamon/query?%s""" % urllib.urlencode(urlData) - if logDS != None: - message += \ -""" -TaskMonitorURL : https://dashb-atlas-task.cern.ch/templates/task-analysis/#task=%s""" % logDS - - # tailer - message += \ -""" - - -Report Panda problems of any sort to - - the eGroup for help request - hn-atlas-dist-analysis-help@cern.ch - - the Savannah for software bug - https://savannah.cern.ch/projects/panda/ -""" - - # send mail - self.sendMail(self.job.PandaID,fromadd,mailAddr,message,1,True) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s %s %s" % (self.job.PandaID,errType,errValue)) - _logger.debug("%s end" % self.job.PandaID) - else: - try: - _logger.debug("start recovery for %s" % self.mailFileName) - # read from file - pandaID = self.mailFile.readline()[:-1] - fromadd = self.mailFile.readline()[:-1] - mailAddr = self.mailFile.readline()[:-1] - message = self.mailFile.read() - _logger.debug("%s start recovery" % pandaID) - if message != '': - self.sendMail(pandaID,fromadd,mailAddr,message,5,False) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s %s %s" % 
(self.mailFileName,errType,errValue)) - _logger.debug("end recovery for %s" % self.mailFileName) - - - # send mail - def sendMail(self,pandaID,fromadd,mailAddr,message,nTry,fileBackUp): - _logger.debug("%s send to %s\n%s" % (pandaID,mailAddr,message)) - for iTry in range(nTry): - try: - org_smtpstderr = smtplib.stderr - smtplib.stderr = StderrLogger(pandaID) - smtpPort = smtpPortList[iTry % len(smtpPortList)] - server = smtplib.SMTP(panda_config.emailSMTPsrv,smtpPort) - server.set_debuglevel(1) - server.ehlo() - server.starttls() - #server.login(panda_config.emailLogin,panda_config.emailPass) - out = server.sendmail(fromadd,mailAddr,message) - _logger.debug('%s %s' % (pandaID,str(out))) - server.quit() - break - except: - errType,errValue = sys.exc_info()[:2] - if iTry+1 < nTry: - # sleep for retry - _logger.debug("%s sleep %s due to %s %s" % (pandaID,iTry,errType,errValue)) - time.sleep(30) - else: - _logger.error("%s %s %s" % (pandaID,errType,errValue)) - if fileBackUp: - # write to file which is processed in add.py - mailFile = '%s/mail_%s_%s' % (panda_config.logdir,self.job.PandaID,commands.getoutput('uuidgen')) - oMail = open(mailFile,"w") - oMail.write(str(self.job.PandaID)+'\n'+fromadd+'\n'+mailAddr+'\n'+message) - oMail.close() - try: - smtplib.stderr = org_smtpstderr - except: - pass - - - - # get email - def getEmail(self,dn): - # get DN - _logger.debug("getDN for %s" % dn) - dbProxy = DBProxy() - distinguishedName = dbProxy.cleanUserID(dn) - _logger.debug("DN = %s" % distinguishedName) - if distinguishedName == "": - _logger.error("cannot get DN for %s" % dn) - return "" - # get email from MetaDB - mailAddr = self.taskBuffer.getEmailAddr(distinguishedName) - if mailAddr == 'notsend': - _logger.debug("email from MetaDB : '%s'" % mailAddr) - return mailAddr - # get email from DQ2 - realDN = re.sub('/CN=limited proxy','',dn) - realDN = re.sub('(/CN=proxy)+','',realDN) - try: - _logger.debug("dq2Info.finger(%s)" % realDN) - for iDDMTry in range(3): - 
status,out = dq2Info.finger(realDN) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(10) - else: - break - _logger.debug(out) - exec "userInfo=%s" % out - mailAddr = userInfo['email'] - _logger.debug("email from DQ2 : '%s'" % mailAddr) - return mailAddr - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s %s" % (errType,errValue)) - return "" - - - diff --git a/current/pandaserver/dataservice/ProcessLimiter.py b/current/pandaserver/dataservice/ProcessLimiter.py deleted file mode 100644 index 580fe9c39..000000000 --- a/current/pandaserver/dataservice/ProcessLimiter.py +++ /dev/null @@ -1,54 +0,0 @@ -import datetime -import commands -import threading - -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('ProcessLimiter') - - -# limit the number of processes -class ProcessLimiter: - # constructor - def __init__(self,maxProcess=3): - self.processLock = threading.Semaphore(maxProcess) - self.dataLock = threading.Lock() - self.summary = {'nQueued':0,'nRunning':0} - - - # update summary - def updateSummary(self,dataName,change): - # lock - self.dataLock.acquire() - # update - if self.summary.has_key(dataName): - self.summary[dataName] += change - # release - self.dataLock.release() - _logger.debug('Summary : %s' % str(self.summary)) - - - # execute command - def getstatusoutput(self,commandStr): - # time stamp - timestamp = datetime.datetime.utcnow().isoformat(' ') - _logger.debug('%s start for "%s"' % (timestamp,commandStr)) - self.updateSummary('nQueued',1) - _logger.debug('%s getting lock' % timestamp) - # get semaphore - self.processLock.acquire() - _logger.debug('%s got lock' % timestamp) - # execute - self.updateSummary('nRunning',1) - status,output = commands.getstatusoutput(commandStr) - _logger.debug('%s executed' % timestamp) - 
self.updateSummary('nRunning',-1) - # release queue - self.processLock.release() - _logger.debug('%s end' % timestamp) - self.updateSummary('nQueued',-1) - # return - return status,output - - diff --git a/current/pandaserver/dataservice/RetryMaker.py b/current/pandaserver/dataservice/RetryMaker.py deleted file mode 100755 index e6b69a6ce..000000000 --- a/current/pandaserver/dataservice/RetryMaker.py +++ /dev/null @@ -1,125 +0,0 @@ -''' -notifier - -''' - -import re -import sys -import commands -import urllib -import datetime -import time - -from config import panda_config -from userinterface import ReBroker -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('RetryMaker') - - -def initLogger(pLogger): - # redirect logging to parent as it doesn't work in nested threads - global _logger - _logger = pLogger - ReBroker.initLogger(_logger) - - -class RetryMaker: - # constructor - def __init__(self,taskBuffer,job): - self.job = job - self.taskBuffer = taskBuffer - - # main - def run(self): - _logger.debug("%s start" % self.job.PandaID) - try: - # check the number of server retry - nRetry = self.job.specialHandling.split(',').count('sretry') - _logger.debug("%s nRetry=%s" % (self.job.PandaID,nRetry)) - # too many reattempts - maxRetry = 2 - if nRetry >= maxRetry: - _logger.debug("%s end : too many reattempts %s>=%s" % (self.job.PandaID,nRetry,maxRetry)) - return True - # get all job status in Active - idStatus,buildID = self.taskBuffer.getPandIDsWithJobID(self.job.prodUserName, - self.job.jobDefinitionID, - {},0) - # count # of failed in active - nFailed = 0 - for tmpID,tmpVar in idStatus.iteritems(): - # ignore buildJob - if tmpID == buildID: - continue - # count - tmpStatus,tmpCommand = tmpVar - if tmpStatus == 'failed': - nFailed += 1 - elif tmpStatus == 'cancelled' or tmpCommand == 'tobekilled': - # killed - _logger.debug("%s end : cancelled" % self.job.PandaID) - return True - _logger.debug("%s : nFailed=%s in Active" % 
(self.job.PandaID,nFailed)) - # no failed - if nFailed == 0: - _logger.debug("%s end : no failed jobs" % self.job.PandaID) - return True - # get all job status including Archived - idStatus,buildID = self.taskBuffer.getPandIDsWithJobIDLog(self.job.prodUserName, - self.job.jobDefinitionID, - idStatus,0,buildID) - # count # of failed and others in archived - nFailed = 0 - nOthers = 0 - for tmpID,tmpVar in idStatus.iteritems(): - # ignore buildJob - if tmpID == buildID: - continue - # count - tmpStatus,tmpCommand = tmpVar - if tmpStatus == 'failed': - nFailed += 1 - elif tmpStatus == 'cancelled' or tmpCommand == 'tobekilled': - # killed - _logger.debug("%s end : cancelled" % self.job.PandaID) - return True - else: - nOthers += 1 - _logger.debug("%s : nFailed=%s nOthers=%s in Active+Archived" % (self.job.PandaID,nFailed,nOthers)) - # no successful jobs - if nOthers == 0: - _logger.debug("%s end : no successful jobs" % self.job.PandaID) - return True - # no failed jobs just in case - if nFailed == 0: - _logger.debug("%s end : no failed jobs" % self.job.PandaID) - return True - # check ratio - maxFailedRatio = 0.8 - failedRatio = float(nFailed) / float(nOthers+nFailed) - if failedRatio > maxFailedRatio: - _logger.debug("%s end : too many failed jobs %s/%s>%s" % (self.job.PandaID, - nFailed, - nOthers+nFailed, - maxFailedRatio)) - return True - # instantiate rebrokerage since server-side retry relies on that - rebro = ReBroker.ReBroker(self.taskBuffer,forFailed=True,avoidSameSite=True) - # lock job for retry - reSt,reVal = rebro.lockJob(self.job.prodUserID,self.job.jobDefinitionID) - if not reSt: - _logger.debug("%s end : failed to lock jobs with %s" % (self.job.PandaID,eVal)) - return False - # execute - _logger.debug("%s : execute ReBroker" % self.job.PandaID) - rebro.start() - rebro.join() - _logger.debug("%s end : successfully" % self.job.PandaID) - return True - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s %s %s" % 
(self.job.PandaID,errType,errValue)) - _logger.debug("%s end : failed" % self.job.PandaID) - return False diff --git a/current/pandaserver/dataservice/Setupper.py b/current/pandaserver/dataservice/Setupper.py deleted file mode 100755 index 6b2103fea..000000000 --- a/current/pandaserver/dataservice/Setupper.py +++ /dev/null @@ -1,2420 +0,0 @@ -''' -setup dataset - -''' - -import re -import sys -import time -import types -import urllib -import datetime -import commands -import threading -import traceback -import ErrorCode -import TaskAssigner -from DDM import ddm -from dataservice.DDM import dq2Common -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec -from taskbuffer.DatasetSpec import DatasetSpec -from brokerage.SiteMapper import SiteMapper -from brokerage.PandaSiteIDs import PandaMoverIDs -import brokerage.broker -import brokerage.broker_util -import DataServiceUtils - - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('Setupper') - - -# temporary -PandaDDMSource = ['BNLPANDA','BNL-OSG2_MCDISK','BNL-OSG2_DATADISK','BNL-OSG2_MCTAPE','BNL-OSG2_DATATAPE'] - - -class Setupper (threading.Thread): - # constructor - def __init__(self,taskBuffer,jobs,resubmit=False,pandaDDM=False,ddmAttempt=0,forkRun=False,onlyTA=False, - resetLocation=False,useNativeDQ2=True): - threading.Thread.__init__(self) - self.jobs = jobs - self.taskBuffer = taskBuffer - # VUIDs of dispatchDBlocks - self.vuidMap = {} - # resubmission or not - self.resubmit = resubmit - # time stamp - self.timestamp = datetime.datetime.utcnow().isoformat(' ') - # use PandaDDM - self.pandaDDM = pandaDDM - # file list for dispDS for PandaDDM - self.dispFileList = {} - # priority for ddm job - self.ddmAttempt = ddmAttempt - # site mapper - self.siteMapper = None - # fork another process because python doesn't release memory - self.forkRun = forkRun - # run task assignment only - self.onlyTA = onlyTA - # 
location map - self.replicaMap = {} - # all replica locations - self.allReplicaMap = {} - # reset locations - self.resetLocation = resetLocation - # replica map for special brokerage - self.replicaMapForBroker = {} - # available files at T2 - self.availableLFNsInT2 = {} - # use DQ2 in the same process - self.useNativeDQ2 = useNativeDQ2 - # list of missing datasets - self.missingDatasetList = {} - # lfn ds map - self.lfnDatasetMap = {} - - - # main - def run(self): - try: - _logger.debug('%s startRun' % self.timestamp) - self._memoryCheck() - # run main procedure in the same process - if not self.forkRun: - if self.jobs != None and len(self.jobs) > 0: - _logger.debug('%s PandaID:%s type:%s taskID:%s' % (self.timestamp, - self.jobs[0].PandaID, - self.jobs[0].prodSourceLabel, - self.jobs[0].taskID)) - # instantiate site mapper - self.siteMapper = SiteMapper(self.taskBuffer) - # use native DQ2 - if self.useNativeDQ2: - ddm.useDirectDQ2() - # correctLFN - self._correctLFN() - # run full Setupper - if not self.onlyTA: - # invoke brokerage - _logger.debug('%s brokerSchedule' % self.timestamp) - brokerage.broker.schedule(self.jobs,self.taskBuffer,self.siteMapper, - replicaMap=self.replicaMapForBroker, - t2FilesMap=self.availableLFNsInT2) - # remove waiting jobs - self.removeWaitingJobs() - # setup dispatch dataset - _logger.debug('%s setupSource' % self.timestamp) - self._setupSource() - # sort by site so that larger subs are created in the next step - if self.jobs != [] and self.jobs[0].prodSourceLabel in ['managed','test']: - tmpJobMap = {} - for tmpJob in self.jobs: - # add site - if not tmpJobMap.has_key(tmpJob.computingSite): - tmpJobMap[tmpJob.computingSite] = [] - # add job - tmpJobMap[tmpJob.computingSite].append(tmpJob) - # make new list - tmpJobList = [] - for tmpSiteKey in tmpJobMap.keys(): - tmpJobList += tmpJobMap[tmpSiteKey] - # set new list - self.jobs = tmpJobList - # create dataset for outputs and assign destination - if self.jobs != [] and 
self.jobs[0].prodSourceLabel in ['managed','test'] and self.jobs[0].cloud in ['DE']: - # count the number of jobs per _dis - iBunch = 0 - prevDisDsName = None - nJobsPerDisList = [] - for tmpJob in self.jobs: - if prevDisDsName != None and prevDisDsName != tmpJob.dispatchDBlock: - nJobsPerDisList.append(iBunch) - iBunch = 0 - # increment - iBunch += 1 - # set _dis name - prevDisDsName = tmpJob.dispatchDBlock - # remaining - if iBunch != 0: - nJobsPerDisList.append(iBunch) - # split sub datasets - iBunch = 0 - nBunchMax = 50 - tmpIndexJob = 0 - for nJobsPerDis in nJobsPerDisList: - # check _dis boundary so that the same _dis doesn't contribute to many _subs - if iBunch+nJobsPerDis > nBunchMax: - if iBunch != 0: - self._setupDestination(startIdx=tmpIndexJob,nJobsInLoop=iBunch) - tmpIndexJob += iBunch - iBunch = 0 - # increment - iBunch += nJobsPerDis - # remaining - if iBunch != 0: - self._setupDestination(startIdx=tmpIndexJob,nJobsInLoop=iBunch) - else: - # at a burst - self._setupDestination() - # make dis datasets for existing files - self._makeDisDatasetsForExistingfiles() - # update jobs - _logger.debug('%s updateJobs' % self.timestamp) - self._updateJobs() - # then subscribe sites distpatchDBlocks. 
this must be the last method - _logger.debug('%s subscribeDistpatchDB' % self.timestamp) - self._subscribeDistpatchDB() - # dynamic data placement for analysis jobs - self._dynamicDataPlacement() - # pin input datasets - self._pinInputDatasets() - # make subscription for missing - self._makeSubscriptionForMissing() - else: - # write jobs to file - import os - import cPickle as pickle - outFileName = '%s/set.%s_%s' % (panda_config.logdir,self.jobs[0].PandaID,commands.getoutput('uuidgen')) - outFile = open(outFileName,'w') - pickle.dump(self.jobs,outFile) - outFile.close() - # run main procedure in another process because python doesn't release memory - com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) - com += 'source /opt/glite/etc/profile.d/grid-env.sh; ' - com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \ - (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python, - panda_config.pandaPython_dir,outFileName) - if self.onlyTA: - com += " -t" - _logger.debug(com) - # exeute - status,output = self.taskBuffer.processLimiter.getstatusoutput(com) - _logger.debug("Ret from another process: %s %s" % (status,output)) - self._memoryCheck() - _logger.debug('%s endRun' % self.timestamp) - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s run() : %s %s" % (self.timestamp,type,value)) - - - # make dipatchDBlocks, insert prod/dispatchDBlock to database - def _setupSource(self): - fileList = {} - prodList = [] - prodError = {} - dispSiteMap = {} - dispError = {} - # extract prodDBlock - for job in self.jobs: - # ignore failed jobs - if job.jobStatus in ['failed','cancelled']: - continue - # production datablock - if job.prodDBlock != 'NULL' and (not self.pandaDDM) and (not job.prodSourceLabel in ['user','panda']): - # get VUID and record prodDBlock into DB - if not prodError.has_key(job.prodDBlock): - time.sleep(1) - 
_logger.debug((self.timestamp,'queryDatasetByName',job.prodDBlock)) - prodError[job.prodDBlock] = '' - for iDDMTry in range(3): - status,out = ddm.repositoryClient.main('queryDatasetByName',job.prodDBlock) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - _logger.debug("%s %s" % (self.timestamp,out)) - if status != 0 or out.find('Error') != -1: - prodError[job.prodDBlock] = "Setupper._setupSource() could not get VUID of prodDBlock" - _logger.error(out) - else: - try: - exec "vuids = %s['%s']['vuids']" % (out.split('\n')[0],job.prodDBlock) - nfiles = 0 - # dataset spec - ds = DatasetSpec() - ds.vuid = vuids[0] - ds.name = job.prodDBlock - ds.type = 'input' - ds.status = 'completed' - ds.numberfiles = nfiles - ds.currentfiles = nfiles - prodList.append(ds) - except: - type, value, traceBack = sys.exc_info() - _logger.error("_setupSource() : %s %s" % (type,value)) - prodError[job.prodDBlock] = "Setupper._setupSource() could not decode VUID of prodDBlock" - # error - if prodError[job.prodDBlock] != '': - job.jobStatus = 'failed' - job.ddmErrorCode = ErrorCode.EC_Setupper - job.ddmErrorDiag = prodError[job.prodDBlock] - continue - # dispatch datablock - if job.dispatchDBlock != 'NULL': - # src/dst sites - tmpSrcID = 'BNL_ATLAS_1' - if self.siteMapper.checkCloud(job.cloud): - # use cloud's source - tmpSrcID = self.siteMapper.getCloud(job.cloud)['source'] - srcDQ2ID = self.siteMapper.getSite(tmpSrcID).ddm - # use srcDQ2ID as dstDQ2ID when dst SE is same as src SE - srcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpSrcID).se) - dstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(job.computingSite).se) - if srcSEs == dstSEs: - dstDQ2ID = srcDQ2ID - else: - dstDQ2ID = self.siteMapper.getSite(job.computingSite).ddm - dispSiteMap[job.dispatchDBlock] = 
{'src':srcDQ2ID,'dst':dstDQ2ID,'site':job.computingSite} - # filelist - if not fileList.has_key(job.dispatchDBlock): - fileList[job.dispatchDBlock] = {'lfns':[],'guids':[],'fsizes':[],'md5sums':[],'chksums':[]} - # collect LFN and GUID - for file in job.Files: - if file.type == 'input' and file.status == 'pending': - if not file.lfn in fileList[job.dispatchDBlock]['lfns']: - fileList[job.dispatchDBlock]['lfns'].append(file.lfn) - fileList[job.dispatchDBlock]['guids'].append(file.GUID) - if file.fsize in ['NULL',0]: - fileList[job.dispatchDBlock]['fsizes'].append(None) - else: - fileList[job.dispatchDBlock]['fsizes'].append(long(file.fsize)) - if file.md5sum in ['NULL','']: - fileList[job.dispatchDBlock]['md5sums'].append(None) - elif file.md5sum.startswith("md5:"): - fileList[job.dispatchDBlock]['md5sums'].append(file.md5sum) - else: - fileList[job.dispatchDBlock]['md5sums'].append("md5:%s" % file.md5sum) - if file.checksum in ['NULL','']: - fileList[job.dispatchDBlock]['chksums'].append(None) - else: - fileList[job.dispatchDBlock]['chksums'].append(file.checksum) - # get replica locations - if not self.replicaMap.has_key(job.dispatchDBlock): - self.replicaMap[job.dispatchDBlock] = {} - if not self.allReplicaMap.has_key(file.dataset): - if file.dataset.endswith('/'): - status,out = self.getListDatasetReplicasInContainer(file.dataset) - else: - for iDDMTry in range(3): - _logger.debug((self.timestamp,'listDatasetReplicas',file.dataset)) - status,out = ddm.DQ2.main('listDatasetReplicas',file.dataset,0,None,False) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1 \ - or out == '()': - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error("%s %s" % (self.timestamp,out)) - dispError[job.dispatchDBlock] = 'could not get locations for %s' % file.dataset - _logger.error(dispError[job.dispatchDBlock]) 
- else: - _logger.debug("%s %s" % (self.timestamp,out)) - tmpRepSites = {} - try: - # convert res to map - exec "tmpRepSites = %s" % out - self.allReplicaMap[file.dataset] = tmpRepSites - except: - dispError[job.dispatchDBlock] = 'could not convert HTTP-res to replica map for %s' % file.dataset - _logger.error(dispError[job.dispatchDBlock]) - _logger.error(out) - if self.allReplicaMap.has_key(file.dataset): - self.replicaMap[job.dispatchDBlock][file.dataset] = self.allReplicaMap[file.dataset] - # register dispatch dataset - dispList = [] - for dispatchDBlock in fileList.keys(): - # ignore empty dataset - if len(fileList[dispatchDBlock]['lfns']) == 0: - continue - # use DQ2 - if (not self.pandaDDM) and (not dispSiteMap[dispatchDBlock]['src'] in PandaDDMSource or \ - self.siteMapper.getSite(dispSiteMap[dispatchDBlock]['site']).cloud != 'US') \ - and (job.prodSourceLabel != 'ddm') and (not dispSiteMap[dispatchDBlock]['site'].endswith("_REPRO")): - # register dispatch dataset - disFiles = fileList[dispatchDBlock] - _logger.debug((self.timestamp,'registerNewDataset',dispatchDBlock,disFiles['lfns'],disFiles['guids'], - disFiles['fsizes'],disFiles['chksums'],None,None,None,True)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('registerNewDataset',dispatchDBlock,disFiles['lfns'],disFiles['guids'], - disFiles['fsizes'],disFiles['chksums'],None,None,None,True) - if status != 0 and out.find('DQDatasetExistsException') != -1: - break - elif status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - _logger.debug("%s sleep %s for %s" % (self.timestamp,iDDMTry,dispatchDBlock)) - _logger.debug(status) - _logger.debug(out) - _logger.debug("-------------") - time.sleep(60) - else: - break - if status != 0 or out.find('Error') != -1: - _logger.error("%s %s" % (self.timestamp,out)) - dispError[dispatchDBlock] = "Setupper._setupSource() could not 
register dispatchDBlock" - continue - _logger.debug("%s %s" % (self.timestamp,out)) - vuidStr = out - # freezeDataset dispatch dataset - time.sleep(1) - _logger.debug((self.timestamp,'freezeDataset',dispatchDBlock)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('freezeDataset',dispatchDBlock) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - if status != 0 or (out.find('Error') != -1 and out.find("is frozen") == -1): - _logger.error("%s %s" % (self.timestamp,out)) - dispError[dispatchDBlock] = "Setupper._setupSource() could not freeze dispatchDBlock" - continue - _logger.debug("%s %s" % (self.timestamp,out)) - else: - # use PandaDDM - self.dispFileList[dispatchDBlock] = fileList[dispatchDBlock] - # create a fake vuidStr for PandaDDM - tmpMap = {'vuid':commands.getoutput('uuidgen')} - vuidStr = "%s" % tmpMap - # get VUID - try: - exec "vuid = %s['vuid']" % vuidStr - # dataset spec. 
currentfiles is used to count the number of failed jobs - ds = DatasetSpec() - ds.vuid = vuid - ds.name = dispatchDBlock - ds.type = 'dispatch' - ds.status = 'defined' - ds.numberfiles = len(fileList[dispatchDBlock])/2 - ds.currentfiles = 0 - dispList.append(ds) - self.vuidMap[ds.name] = ds.vuid - except: - type, value, traceBack = sys.exc_info() - _logger.error("_setupSource() : %s %s" % (type,value)) - dispError[dispatchDBlock] = "Setupper._setupSource() could not decode VUID dispatchDBlock" - # insert datasets to DB - self.taskBuffer.insertDatasets(prodList+dispList) - # job status - for job in self.jobs: - if dispError.has_key(job.dispatchDBlock) and dispError[job.dispatchDBlock] != '': - job.jobStatus = 'failed' - job.ddmErrorCode = ErrorCode.EC_Setupper - job.ddmErrorDiag = dispError[job.dispatchDBlock] - # delete explicitly some huge variables - del fileList - del prodList - del prodError - del dispSiteMap - - - # create dataset for outputs in the repository and assign destination - def _setupDestination(self,startIdx=-1,nJobsInLoop=50): - _logger.debug('%s setupDestination idx:%s n:%s' % (self.timestamp,startIdx,nJobsInLoop)) - destError = {} - datasetList = {} - newnameList = {} - snGottenDS = [] - if startIdx == -1: - jobsList = self.jobs - else: - jobsList = self.jobs[startIdx:startIdx+nJobsInLoop] - for job in jobsList: - # ignore failed jobs - if job.jobStatus in ['failed','cancelled']: - continue - for file in job.Files: - # ignore input files - if file.type == 'input': - continue - # don't touch with outDS for unmerge jobs - if job.prodSourceLabel == 'panda' and job.processingType == 'unmerge' and file.type != 'log': - continue - # extract destinationDBlock, destinationSE and computingSite - dest = (file.destinationDBlock,file.destinationSE,job.computingSite,file.destinationDBlockToken) - if not destError.has_key(dest): - destError[dest] = '' - originalName = '' - if (job.prodSourceLabel == 'panda') or (job.prodSourceLabel in ['ptest','rc_test'] and 
\ - job.processingType in ['pathena','prun','gangarobot-rctest']): - # keep original name - nameList = [file.destinationDBlock] - else: - # set freshness to avoid redundant DB lookup - definedFreshFlag = None - if file.destinationDBlock in snGottenDS: - # already checked - definedFreshFlag = False - elif job.prodSourceLabel in ['user','test','prod_test']: - # user or test datasets are always fresh in DB - definedFreshFlag = True - # get serial number - sn,freshFlag = self.taskBuffer.getSerialNumber(file.destinationDBlock,definedFreshFlag) - if sn == -1: - destError[dest] = "Setupper._setupDestination() could not get serial num for %s" % file.destinationDBlock - continue - if not file.destinationDBlock in snGottenDS: - snGottenDS.append(file.destinationDBlock) - # new dataset name - newnameList[dest] = "%s_sub0%s" % (file.destinationDBlock,sn) - if freshFlag or self.resetLocation: - # register original dataset and new dataset - nameList = [file.destinationDBlock,newnameList[dest]] - originalName = file.destinationDBlock - else: - # register new dataset only - nameList = [newnameList[dest]] - # create dataset - for name in nameList: - computingSite = job.computingSite - if name == originalName: - # for original dataset - computingSite = file.destinationSE - # use DQ2 - if (not self.pandaDDM) and (job.prodSourceLabel != 'ddm') and (job.destinationSE != 'local'): - # get src and dest DDM conversion is needed for unknown sites - if job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(computingSite): - # DQ2 ID was set by using --destSE for analysis job to transfer output - tmpSrcDDM = self.siteMapper.getSite(job.computingSite).ddm - else: - tmpSrcDDM = self.siteMapper.getSite(computingSite).ddm - if job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE): - # DQ2 ID was set by using --destSE for analysis job to transfer output - tmpDstDDM = tmpSrcDDM - else: - tmpDstDDM = 
self.siteMapper.getSite(file.destinationSE).ddm - # skip registration for _sub when src=dest - if tmpSrcDDM == tmpDstDDM and name != originalName and re.search('_sub\d+$',name) != None: - # create a fake vuidStr - vuidStr = 'vuid="%s"' % commands.getoutput('uuidgen') - else: - # register dataset - time.sleep(1) - # set hidden flag for _sub - tmpHiddenFlag = False - if name != originalName and re.search('_sub\d+$',name) != None: - tmpHiddenFlag = True - _logger.debug((self.timestamp,'registerNewDataset',name,[],[],[],[], - None,None,None,tmpHiddenFlag)) - atFailed = 0 - for iDDMTry in range(3): - status,out = ddm.DQ2.main('registerNewDataset',name,[],[],[],[], - None,None,None,tmpHiddenFlag) - if status != 0 and out.find('DQDatasetExistsException') != -1: - atFailed = iDDMTry - break - elif status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - _logger.debug("%s sleep %s for %s" % (self.timestamp,iDDMTry,name)) - _logger.debug(status) - _logger.debug(out) - _logger.debug("-------------") - time.sleep(60) - else: - break - if status != 0 or out.find('Error') != -1: - # unset vuidStr - vuidStr = "" - # ignore 'already exists' ERROR because original dataset may be registered by upstream. 
- # atFailed > 0 is for the case in which the first attempt succeeded but report failure - if (job.prodSourceLabel == 'panda' or (job.prodSourceLabel in ['ptest','rc_test'] and \ - job.processingType in ['pathena','prun','gangarobot-rctest']) \ - or name == originalName or atFailed > 0) and \ - out.find('DQDatasetExistsException') != -1: - _logger.debug('%s ignored DQDatasetExistsException' % self.timestamp) - else: - destError[dest] = "Setupper._setupDestination() could not register : %s" % name - _logger.error("%s %s" % (self.timestamp,out)) - continue - else: - _logger.debug("%s %s" % (self.timestamp,out)) - vuidStr = "vuid = %s['vuid']" % out - # get list of tokens - tmpTokenList = file.destinationDBlockToken.split(',') - # register datasetsets - if name == originalName or tmpSrcDDM != tmpDstDDM or \ - job.prodSourceLabel == 'panda' or (job.prodSourceLabel in ['ptest','rc_test'] and \ - job.processingType in ['pathena','prun','gangarobot-rctest']) \ - or len(tmpTokenList) > 1: - time.sleep(1) - # register location - usingT1asT2 = False - if job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(computingSite): - dq2IDList = [self.siteMapper.getSite(job.computingSite).ddm] - else: - if self.siteMapper.getSite(computingSite).cloud != job.cloud and \ - re.search('_sub\d+$',name) != None and \ - (not job.prodSourceLabel in ['user','panda']) and \ - (not self.siteMapper.getSite(computingSite).ddm.endswith('PRODDISK')): - # T1 used as T2. 
Use both DATADISK and PRODDISK as locations while T1 PRODDISK is phasing out - dq2IDList = [self.siteMapper.getSite(computingSite).ddm] - if self.siteMapper.getSite(computingSite).setokens.has_key('ATLASPRODDISK'): - dq2IDList += [self.siteMapper.getSite(computingSite).setokens['ATLASPRODDISK']] - usingT1asT2 = True - else: - dq2IDList = [self.siteMapper.getSite(computingSite).ddm] - # use another location when token is set - if (not usingT1asT2) and (not file.destinationDBlockToken in ['NULL','']): - dq2IDList = [] - for tmpToken in tmpTokenList: - # set default - dq2ID = self.siteMapper.getSite(computingSite).ddm - # convert token to DQ2ID - if self.siteMapper.getSite(computingSite).setokens.has_key(tmpToken): - dq2ID = self.siteMapper.getSite(computingSite).setokens[tmpToken] - # replace or append - if len(tmpTokenList) <= 1 or name != originalName: - # use location consistent with token - dq2IDList = [dq2ID] - break - else: - # use multiple locations for _tid - if not dq2ID in dq2IDList: - dq2IDList.append(dq2ID) - # loop over all locations - repLifeTime = None - if name != originalName and re.search('_sub\d+$',name) != None: - repLifeTime = "14 days" - for dq2ID in dq2IDList: - _logger.debug((self.timestamp,'registerDatasetLocation',name,dq2ID,0,0,None,None,None,repLifeTime)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('registerDatasetLocation',name,dq2ID,0,0,None,None,None,repLifeTime) - if status != 0 and out.find('DQLocationExistsException') != -1: - break - elif status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - # ignore "already exists at location XYZ" - if out.find('DQLocationExistsException') != -1: - _logger.debug('%s ignored DQLocationExistsException' % self.timestamp) - status,out = 0,'' - else: - _logger.debug("%s %s" % (self.timestamp,out)) - if status == 0 and 
out.find('Error') == -1: - # change replica ownership for user datasets - if self.resetLocation and ((name == originalName and job.prodSourceLabel == 'user') or \ - job.prodSourceLabel=='panda'): - # remove /CN=proxy and /CN=limited from DN - tmpRealDN = job.prodUserID - tmpRealDN = re.sub('/CN=limited proxy','',tmpRealDN) - tmpRealDN = re.sub('/CN=proxy','',tmpRealDN) - status,out = dq2Common.parse_dn(tmpRealDN) - if status != 0: - _logger.error("%s %s" % (self.timestamp,out)) - status,out = 1,'failed to truncate DN:%s' % job.prodUserID - else: - tmpRealDN = out - _logger.debug((self.timestamp,'setReplicaMetaDataAttribute',name,dq2ID,'owner',tmpRealDN)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',name,dq2ID,'owner',tmpRealDN) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - # failed - if status != 0 or out.find('Error') != -1: - _logger.error("%s %s" % (self.timestamp,out)) - break - # delete old replicas - tmpDelStat = self.deleteDatasetReplicas([name],[dq2ID]) - if not tmpDelStat: - status,out = 1,'failed to delete old replicas for %s' % name - break - # failed - if status != 0 or out.find('Error') != -1: - _logger.error("%s %s" % (self.timestamp,out)) - break - else: - # skip registerDatasetLocations - status,out = 0,'' - if status != 0 or out.find('Error') != -1: - destError[dest] = "Could not register location : %s %s" % (name,out.split('\n')[-1]) - elif job.prodSourceLabel == 'panda' or (job.prodSourceLabel in ['ptest','rc_test'] and \ - job.processingType in ['pathena','prun','gangarobot-rctest']): - # do nothing for "panda" job - pass - elif name == originalName and job.prodSourceLabel in ['managed','test','rc_test','ptest']: - # set metadata - time.sleep(1) - dq2ID = self.siteMapper.getSite(file.destinationSE).ddm - # use another location when 
token is set - if not file.destinationDBlockToken in ['NULL','']: - # register only the first token becasue it is used as the location - tmpFirstToken = file.destinationDBlockToken.split(',')[0] - if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpFirstToken): - dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpFirstToken] - _logger.debug((self.timestamp,'setMetaDataAttribute',name,'origin',dq2ID)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('setMetaDataAttribute',name,'origin',dq2ID) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - _logger.debug("%s %s" % (self.timestamp,out)) - if status != 0 or (out != 'None' and out.find('already exists') == -1): - _logger.error(out) - destError[dest] = "Setupper._setupDestination() could not set metadata : %s" % name - # use PandaDDM or non-DQ2 - else: - # create a fake vuidStr - vuidStr = 'vuid="%s"' % commands.getoutput('uuidgen') - # already failed - if destError[dest] != '' and name == originalName: - break - # get vuid - if vuidStr == '': - _logger.debug((self.timestamp,'queryDatasetByName',name)) - for iDDMTry in range(3): - status,out = ddm.repositoryClient.main('queryDatasetByName',name) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - _logger.debug("%s %s" % (self.timestamp,out)) - if status != 0 or out.find('Error') != -1: - _logger.error(out) - vuidStr = "vuid = %s['%s']['vuids'][0]" % (out.split('\n')[0],name) - try: - exec vuidStr - # dataset spec - ds = DatasetSpec() - ds.vuid = vuid - ds.name = name - ds.type = 'output' - ds.numberfiles = 0 - ds.currentfiles = 0 - ds.status = 'defined' - # append - 
datasetList[(name,file.destinationSE,computingSite)] = ds - except: - # set status - type, value, traceBack = sys.exc_info() - _logger.error("_setupDestination() : %s %s" % (type,value)) - destError[dest] = "Setupper._setupDestination() could not get VUID : %s" % name - # set new destDBlock - if newnameList.has_key(dest): - file.destinationDBlock = newnameList[dest] - # update job status if failed - if destError[dest] != '': - job.jobStatus = 'failed' - job.ddmErrorCode = ErrorCode.EC_Setupper - job.ddmErrorDiag = destError[dest] - else: - newdest = (file.destinationDBlock,file.destinationSE,job.computingSite) - # increment number of files - datasetList[newdest].numberfiles = datasetList[newdest].numberfiles + 1 - # dump - for tmpDsKey in datasetList.keys(): - if re.search('_sub\d+$',tmpDsKey[0]) != None: - _logger.debug('%s made sub:%s for nFiles=%s' % (self.timestamp,tmpDsKey[0],datasetList[tmpDsKey].numberfiles)) - # insert datasets to DB - return self.taskBuffer.insertDatasets(datasetList.values()) - - - # subscribe sites to distpatchDBlocks - def _subscribeDistpatchDB(self): - dispError = {} - failedJobs = [] - ddmJobs = [] - ddmUser = 'NULL' - for job in self.jobs: - # ignore failed jobs - if job.jobStatus in ['failed','cancelled']: - continue - # ignore no dispatch jobs - if job.dispatchDBlock=='NULL' or job.computingSite=='NULL': - continue - # extract dispatchDBlock and computingSite - disp = (job.dispatchDBlock,job.computingSite) - if dispError.has_key(disp) == 0: - dispError[disp] = '' - # DQ2 IDs - tmpSrcID = 'BNL_ATLAS_1' - if self.siteMapper.checkCloud(job.cloud): - # use cloud's source - tmpSrcID = self.siteMapper.getCloud(job.cloud)['source'] - srcDQ2ID = self.siteMapper.getSite(tmpSrcID).ddm - # destination - tmpDstID = job.computingSite - if srcDQ2ID != self.siteMapper.getSite(job.computingSite).ddm and \ - srcDQ2ID in self.siteMapper.getSite(job.computingSite).setokens.values(): - # direct usage of remote SE. 
Mainly for prestaging - tmpDstID = tmpSrcID - _logger.debug('%s use remote SiteSpec of %s for %s' % (self.timestamp,tmpDstID,job.computingSite)) - # use srcDQ2ID as dstDQ2ID when dst SE is same as src SE - srcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpSrcID).se) - dstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpDstID).se) - if srcSEs == dstSEs or job.computingSite.endswith("_REPRO"): - dstDQ2ID = srcDQ2ID - else: - dstDQ2ID = self.siteMapper.getSite(job.computingSite).ddm - # use DQ2 - if (not self.pandaDDM) and (not srcDQ2ID in PandaDDMSource or self.siteMapper.getSite(tmpDstID).cloud != 'US') \ - and (job.prodSourceLabel != 'ddm') and (not job.computingSite.endswith("_REPRO")): - # look for replica - dq2ID = srcDQ2ID - dq2IDList = [] - # register replica - if dq2ID != dstDQ2ID: - # make list - if self.replicaMap.has_key(job.dispatchDBlock): - # set DQ2 ID for DISK - if not srcDQ2ID.endswith('_DATADISK'): - hotID = re.sub('_MCDISK','_HOTDISK', srcDQ2ID) - diskID = re.sub('_MCDISK','_DATADISK',srcDQ2ID) - tapeID = re.sub('_MCDISK','_DATATAPE',srcDQ2ID) - mctapeID = re.sub('_MCDISK','_MCTAPE',srcDQ2ID) - else: - hotID = re.sub('_DATADISK','_HOTDISK', srcDQ2ID) - diskID = re.sub('_DATADISK','_DATADISK',srcDQ2ID) - tapeID = re.sub('_DATADISK','_DATATAPE',srcDQ2ID) - mctapeID = re.sub('_DATADISK','_MCTAPE',srcDQ2ID) - # DQ2 ID is mixed with TAIWAN-LCG2 and TW-FTT - if job.cloud in ['TW',]: - tmpSiteSpec = self.siteMapper.getSite(tmpSrcID) - if tmpSiteSpec.setokens.has_key('ATLASDATADISK'): - diskID = tmpSiteSpec.setokens['ATLASDATADISK'] - if tmpSiteSpec.setokens.has_key('ATLASDATATAPE'): - tapeID = tmpSiteSpec.setokens['ATLASDATATAPE'] - if tmpSiteSpec.setokens.has_key('ATLASMCTAPE'): - mctapeID = tmpSiteSpec.setokens['ATLASMCTAPE'] - hotID = 'TAIWAN-LCG2_HOTDISK' - for tmpDataset,tmpRepMap in self.replicaMap[job.dispatchDBlock].iteritems(): - if tmpRepMap.has_key(hotID): - # HOTDISK - if not hotID in dq2IDList: - 
dq2IDList.append(hotID) - if tmpRepMap.has_key(srcDQ2ID): - # MCDISK - if not srcDQ2ID in dq2IDList: - dq2IDList.append(srcDQ2ID) - if tmpRepMap.has_key(diskID): - # DATADISK - if not diskID in dq2IDList: - dq2IDList.append(diskID) - if job.cloud == 'US' and tmpRepMap.has_key('BNLPANDA'): - # BNLPANDA - if not 'BNLPANDA' in dq2IDList: - dq2IDList.append('BNLPANDA') - if tmpRepMap.has_key(tapeID): - # DATATAPE - if not tapeID in dq2IDList: - dq2IDList.append(tapeID) - if tmpRepMap.has_key(mctapeID): - # MCTAPE - if not mctapeID in dq2IDList: - dq2IDList.append(mctapeID) - # hack for split T1 - splitT1IDsHaveDS = [] - for tmpSplitT1Key in tmpRepMap.keys(): - if tmpSplitT1Key.startswith('NIKHEF-ELPROD'): - splitT1IDsHaveDS.append(tmpSplitT1Key) - if job.cloud == 'NL' and splitT1IDsHaveDS != [] \ - and not tmpRepMap.has_key('SARA-MATRIX_MCDISK') \ - and not tmpRepMap.has_key('SARA-MATRIX_DATADISK') \ - and not tmpRepMap.has_key('SARA-MATRIX_MCTAPE') \ - and not tmpRepMap.has_key('SARA-MATRIX_DATATAPE'): - for tmpSplitT1Key in splitT1IDsHaveDS: - if not tmpSplitT1Key in dq2IDList: - dq2IDList.append(tmpSplitT1Key) - # consider cloudconfig.tier1se - tmpCloudSEs = DataServiceUtils.getEndpointsAtT1(tmpRepMap,self.siteMapper,job.cloud) - useCloudSEs = [] - for tmpCloudSE in tmpCloudSEs: - if not tmpCloudSE in dq2IDList: - useCloudSEs.append(tmpCloudSE) - if useCloudSEs != []: - dq2IDList += useCloudSEs - _logger.debug('%s use additional endpoints %s from cloudconfig' % (self.timestamp,str(useCloudSEs))) - # use default location if empty - if dq2IDList == []: - dq2IDList = [dq2ID] - for dq2ID in dq2IDList: - time.sleep(1) - _logger.debug((self.timestamp,'registerDatasetLocation',job.dispatchDBlock,dq2ID,0,1,None,None,None,"7 days")) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('registerDatasetLocation',job.dispatchDBlock,dq2ID,0,1,None,None,None,"7 days") - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the 
central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - _logger.debug("%s %s" % (self.timestamp,out)) - # failure - if status != 0 or out.find('Error') != -1: - break - else: - # skip registerDatasetLocations - status,out = 0,'' - if status != 0 or out.find('Error') != -1: - _logger.error(out) - dispError[disp] = "Setupper._subscribeDistpatchDB() could not register location" - else: - # assign destination - time.sleep(1) - optSub = {'DATASET_COMPLETE_EVENT' : ['https://%s:%s/server/panda/datasetCompleted' % \ - (panda_config.pserverhost,panda_config.pserverport)]} - optSource = {} - optSrcPolicy = 001000 | 010000 - dq2ID = dstDQ2ID - # prestaging - if srcDQ2ID == dstDQ2ID: - # stage-in callback - optSub['DATASET_STAGED_EVENT'] = ['https://%s:%s/server/panda/datasetCompleted' % \ - (panda_config.pserverhost,panda_config.pserverport)] - # use ATLAS*TAPE - seTokens = self.siteMapper.getSite(tmpDstID).setokens - if seTokens.has_key('ATLASDATATAPE') and seTokens.has_key('ATLASMCTAPE'): - dq2ID = seTokens['ATLASDATATAPE'] - # use MCDISK if needed - for tmpDataset,tmpRepMap in self.replicaMap[job.dispatchDBlock].iteritems(): - if (not tmpRepMap.has_key(dq2ID)) and tmpRepMap.has_key(seTokens['ATLASMCTAPE']): - dq2ID = seTokens['ATLASMCTAPE'] - break - # for CERN and BNL - if job.cloud in ['CERN','US'] and self.replicaMap.has_key(job.dispatchDBlock): - setNewIDflag = False - if job.cloud == 'CERN': - otherIDs = ['CERN-PROD_DAQ','CERN-PROD_TZERO','CERN-PROD_TMPDISK'] - else: - otherIDs = ['BNLPANDA'] - for tmpDataset,tmpRepMap in self.replicaMap[job.dispatchDBlock].iteritems(): - if not tmpRepMap.has_key(dq2ID): - # look for another id - for cernID in otherIDs: - if tmpRepMap.has_key(cernID): - dq2ID = cernID - setNewIDflag = True - break - # break - if setNewIDflag: - break - optSrcPolicy = 000010 - optSource[dq2ID] = {'policy' : 0} - else: - # set sources to handle T2s in another cloud and to transfer dis datasets 
being split in multiple sites - for tmpDQ2ID in dq2IDList: - optSource[tmpDQ2ID] = {'policy' : 0} - # T1 used as T2 - if job.cloud != self.siteMapper.getSite(tmpDstID).cloud and \ - (not dstDQ2ID.endswith('PRODDISK')) and \ - (not job.prodSourceLabel in ['user','panda']) and \ - self.siteMapper.getSite(tmpDstID).cloud in ['US']: - seTokens = self.siteMapper.getSite(tmpDstID).setokens - # use T1_PRODDISK - if seTokens.has_key('ATLASPRODDISK'): - dq2ID = seTokens['ATLASPRODDISK'] - # register subscription - _logger.debug('%s %s %s %s' % (self.timestamp,'registerDatasetSubscription', - (job.dispatchDBlock,dq2ID), - {'version':0,'archived':0,'callbacks':optSub,'sources':optSource,'sources_policy':optSrcPolicy, - 'wait_for_sources':0,'destination':None,'query_more_sources':0,'sshare':"production",'group':None, - 'activity':"Production",'acl_alias':None,'replica_lifetime':"7 days"})) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('registerDatasetSubscription',job.dispatchDBlock,dq2ID,version=0,archived=0,callbacks=optSub, - sources=optSource,sources_policy=optSrcPolicy,wait_for_sources=0,destination=None, - query_more_sources=0,sshare="production",group=None,activity="Production", - acl_alias=None,replica_lifetime="7 days") - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - _logger.debug("%s %s" % (self.timestamp,out)) - if status != 0 or (out != 'None' and len(out) != 35): - _logger.error(out) - dispError[disp] = "Setupper._subscribeDistpatchDB() could not register subscription" - # logging - try: - # make message - dq2ID = dstDQ2ID - message = '%s - siteID:%s type:dispatch vuid:%s' % (commands.getoutput('hostname'),dq2ID, - self.vuidMap[job.dispatchDBlock]) - # get logger - _pandaLogger = PandaLogger() - _pandaLogger.lock() - _pandaLogger.setParams({'Type':'registerSubscription'}) - logger = 
_pandaLogger.getHttpLogger(panda_config.loggername) - # add message - logger.info(message) - # release HTTP handler - _pandaLogger.release() - except: - pass - # use PandaDDM - else: - # set DDM user DN - if ddmUser == 'NULL': - ddmUser = job.prodUserID - # create a DDM job - ddmjob = JobSpec() - ddmjob.jobDefinitionID = int(time.time()) % 10000 - ddmjob.jobName = "%s" % commands.getoutput('uuidgen') - ddmjob.transformation = 'http://pandaserver.cern.ch:25080/trf/mover/run_dq2_cr' - ddmjob.destinationDBlock = 'pandaddm_%s.%s' % (time.strftime('%y.%m.%d'),ddmjob.jobName) - if job.cloud == 'NULL': - ddmjob.cloud = 'US' - else: - ddmjob.cloud = job.cloud - if not PandaMoverIDs.has_key(job.cloud): - ddmjob.computingSite = "BNL_ATLAS_DDM" - else: - ddmjob.computingSite = PandaMoverIDs[job.cloud] - ddmjob.destinationSE = ddmjob.computingSite - ddmjob.assignedPriority = 200000 - if job.prodSourceLabel in ['software']: - # set higher priority for installation jobs - ddmjob.assignedPriority += 1000 - else: - ddmjob.assignedPriority += job.currentPriority - ddmjob.currentPriority = ddmjob.assignedPriority - if self.ddmAttempt != 0: - # keep count of attemptNr - ddmjob.attemptNr = self.ddmAttempt + 1 - else: - ddmjob.attemptNr = 1 - # check attemptNr to avoid endless loop - if ddmjob.attemptNr > 10: - err = "Too many attempts %s for %s" % (ddmjob.attemptNr,job.dispatchDBlock) - _logger.error(err) - dispError[disp] = err - continue - ddmjob.prodSourceLabel = 'ddm' - ddmjob.transferType = 'dis' - ddmjob.processingType = 'pandamover' - # append log file - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz.%s" % (ddmjob.destinationDBlock,ddmjob.attemptNr) - fileOL.destinationDBlock = ddmjob.destinationDBlock - fileOL.destinationSE = ddmjob.destinationSE - fileOL.dataset = ddmjob.destinationDBlock - fileOL.type = 'log' - ddmjob.addFile(fileOL) - # make arguments - callBackURL = 'https://%s:%s/server/panda/datasetCompleted?vuid=%s&site=%s' % \ - 
(panda_config.pserverhost,panda_config.pserverport, - self.vuidMap[job.dispatchDBlock],dstDQ2ID) - callBackURL = urllib.quote(callBackURL) - lfnsStr = '' - for tmpLFN in self.dispFileList[job.dispatchDBlock]['lfns']: - lfnsStr += '%s,' % tmpLFN - guidStr = '' - for tmpGUID in self.dispFileList[job.dispatchDBlock]['guids']: - guidStr += '%s,' % tmpGUID - guidStr = guidStr[:-1] - lfnsStr = lfnsStr[:-1] - # check input token - moverUseTape = False - for tmpFile in job.Files: - if tmpFile.type == 'input' and tmpFile.dispatchDBlockToken in ['ATLASDATATAPE']: - moverUseTape = True - break - if srcDQ2ID != dstDQ2ID: - # get destination dir - tmpSpec = self.siteMapper.getSite(job.computingSite) - destDir = brokerage.broker_util._getDefaultStorage(tmpSpec.dq2url,tmpSpec.se,tmpSpec.seprodpath) - if destDir == '': - err = "could not get default storage for %s" % job.computingSite - _logger.error(err) - dispError[disp] = err - continue - # normal jobs - argStr = "" - if moverUseTape: - argStr += "--useTape " - argStr += "-t 7200 -n 3 -s %s -r %s --guids %s --lfns %s --tapePriority %s --callBack %s -d %spanda/dis/%s%s %s" % \ - (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,job.currentPriority,callBackURL,destDir, - time.strftime('%y/%m/%d/'),job.dispatchDBlock,job.dispatchDBlock) - else: - # prestaging jobs - argStr = "" - if moverUseTape: - argStr += "--useTape " - argStr += "-t 540 -n 2 -s %s -r %s --guids %s --lfns %s --tapePriority %s --callBack %s --prestage --cloud %s %s" % \ - (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,job.currentPriority,callBackURL,job.cloud,job.dispatchDBlock) - # set job parameters - ddmjob.jobParameters = argStr - _logger.debug('%s pdq2_cr %s' % (self.timestamp,ddmjob.jobParameters)) - # set src/dest - ddmjob.sourceSite = srcDQ2ID - ddmjob.destinationSite = dstDQ2ID - ddmJobs.append(ddmjob) - # failed jobs - if dispError[disp] != '': - job.jobStatus = 'failed' - job.ddmErrorCode = ErrorCode.EC_Setupper - job.ddmErrorDiag = dispError[disp] - failedJobs.append(job) - # 
update failed jobs only. succeeded jobs should be activate by DDM callback - self.taskBuffer.updateJobs(failedJobs,True) - # submit ddm jobs - if ddmJobs != []: - ddmRet = self.taskBuffer.storeJobs(ddmJobs,ddmUser,joinThr=True) - # update datasets - ddmIndex = 0 - ddmDsList = [] - for ddmPandaID,ddmJobDef,ddmJobName in ddmRet: - # invalid PandaID - if ddmPandaID in ['NULL',None]: - continue - # get dispatch dataset - dsName = ddmJobs[ddmIndex].jobParameters.split()[-1] - ddmIndex += 1 - tmpDS = self.taskBuffer.queryDatasetWithMap({'name':dsName}) - if tmpDS != None: - # set MoverID - tmpDS.MoverID = ddmPandaID - ddmDsList.append(tmpDS) - # update - if ddmDsList != []: - self.taskBuffer.updateDatasets(ddmDsList) - - - # update jobs - def _updateJobs(self): - updateJobs = [] - failedJobs = [] - activateJobs = [] - # sort out jobs - for job in self.jobs: - # failed jobs - if job.jobStatus in ['failed','cancelled']: - failedJobs.append(job) - # no input jobs - elif job.dispatchDBlock=='NULL': - activateJobs.append(job) - # normal jobs - else: - # change status - job.jobStatus = "assigned" - updateJobs.append(job) - # update DB - self.taskBuffer.activateJobs(activateJobs) - self.taskBuffer.updateJobs(updateJobs,True) - self.taskBuffer.updateJobs(failedJobs,True) - # delete local values - del updateJobs - del failedJobs - del activateJobs - - - # correct LFN for attemptNr - def _correctLFN(self): - lfnMap = {} - valMap = {} - prodError = {} - missingDS = {} - jobsWaiting = [] - jobsFailed = [] - jobsProcessed = [] - allLFNs = {} - allGUIDs = {} - cloudMap = {} - lfnDsMap = {} - replicaMap = {} - _logger.debug('%s go into LFN correction' % self.timestamp) - for job in self.jobs: - if self.onlyTA: - _logger.debug("%s start TA session %s" % (self.timestamp,job.taskID)) - # check if sitename is known - if job.computingSite != 'NULL' and (not job.computingSite in self.siteMapper.siteSpecList.keys()): - job.jobStatus = 'failed' - job.ddmErrorCode = ErrorCode.EC_Setupper - 
job.ddmErrorDiag = "computingSite:%s is unknown" % job.computingSite - # append job for downstream process - jobsProcessed.append(job) - # error message for TA - if self.onlyTA: - _logger.error("%s %s" % (self.timestamp,job.ddmErrorDiag)) - continue - # ignore no prodDBlock jobs or container dataset - if job.prodDBlock == 'NULL': - # set cloud - if panda_config.enableDynamicTA and job.prodSourceLabel in ['managed','validation'] \ - and job.cloud in ['NULL',''] and (not job.taskID in [None,'NULL',0]): - # look into map to check if it is already gotten - if not cloudMap.has_key(job.taskID): - # instantiate TaskAssigner - cloudResolver = TaskAssigner.TaskAssigner(self.taskBuffer,self.siteMapper, - job.taskID,job.prodSourceLabel,job) - # check cloud - _logger.debug("%s check cloud for %s" % (self.timestamp,job.taskID)) - retCloud = cloudResolver.checkCloud() - _logger.debug("%s checkCloud() -> %s" % (self.timestamp,retCloud)) - # failed - if retCloud == None: - _logger.error("failed to check cloud for %s" % job.taskID) - # append job to waiting list - jobsWaiting.append(job) - continue - # to be set - elif retCloud == "": - # collect LFN/GUID - tmpLFNs = [] - tmpGUIDs = [] - # set cloud - _logger.debug("%s set cloud for %s" % (self.timestamp,job.taskID)) - retCloud = cloudResolver.setCloud(tmpLFNs,tmpGUIDs,metadata=job.metadata) - _logger.debug("%s setCloud() -> %s" % (self.timestamp,retCloud)) - if retCloud == None: - _logger.debug("failed to set cloud for %s" % job.taskID) - # append job to waiting list - jobsWaiting.append(job) - continue - # append to map - cloudMap[job.taskID] = retCloud - # set cloud - job.cloud = cloudMap[job.taskID] - # message for TA - if self.onlyTA: - _logger.debug("%s set %s:%s" % (self.timestamp,job.taskID,job.cloud)) - # append job to processed list - jobsProcessed.append(job) - continue - # collect datasets - datasets = [] - for file in job.Files: - if file.type == 'input' and file.dispatchDBlock == 'NULL' \ - and (file.GUID == 'NULL' or 
job.prodSourceLabel in ['managed','test','ptest']): - if not file.dataset in datasets: - datasets.append(file.dataset) - # get LFN list - for dataset in datasets: - if not dataset in lfnMap.keys(): - prodError[dataset] = '' - lfnMap[dataset] = {} - # get LFNs - time.sleep(1) - for iDDMTry in range(3): - _logger.debug((self.timestamp,'listFilesInDataset',dataset)) - status,out = ddm.DQ2.main('listFilesInDataset',dataset) - if out.find("DQUnknownDatasetException") != -1: - break - elif status == -1: - break - elif status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error(out) - prodError[dataset] = 'could not get file list of prodDBlock %s' % dataset - _logger.error(prodError[dataset]) - # doesn't exist in DQ2 - if out.find('DQUnknownDatasetException') != -1: - missingDS[dataset] = "DS:%s not found in DQ2" % dataset - elif status == -1: - missingDS[dataset] = out - else: - # make map (key: LFN w/o attemptNr, value: LFN with attemptNr) - items = {} - try: - # protection for empty dataset - if out != '()': - exec "items = %s[0]" % out - # keep values to avoid redundant lookup - self.lfnDatasetMap[dataset] = items - # loop over all files - for guid,vals in items.iteritems(): - valMap[vals['lfn']] = {'guid' : guid, 'fsize' : vals['filesize'], - 'md5sum' : vals['checksum'], - 'chksum' : vals['checksum'], - 'scope' : vals['scope']} - genLFN = re.sub('\.\d+$','',vals['lfn']) - if lfnMap[dataset].has_key(genLFN): - # get attemptNr - newAttNr = 0 - newMat = re.search('\.(\d+)$',vals['lfn']) - if newMat != None: - newAttNr = int(newMat.group(1)) - oldAttNr = 0 - oldMat = re.search('\.(\d+)$',lfnMap[dataset][genLFN]) - if oldMat != None: - oldAttNr = int(oldMat.group(1)) - # compare - if newAttNr > oldAttNr: - lfnMap[dataset][genLFN] = vals['lfn'] - else: - 
lfnMap[dataset][genLFN] = vals['lfn'] - # mapping from LFN to DS - lfnDsMap[lfnMap[dataset][genLFN]] = dataset - except: - prodError[dataset] = 'could not convert HTTP-res to map for prodDBlock %s' % dataset - _logger.error(prodError[dataset]) - _logger.error(out) - # get replica locations - if (self.onlyTA or job.prodSourceLabel in ['managed','test']) \ - and prodError[dataset] == '' and (not replicaMap.has_key(dataset)): - if dataset.endswith('/'): - status,out = self.getListDatasetReplicasInContainer(dataset) - else: - for iDDMTry in range(3): - _logger.debug((self.timestamp,'listDatasetReplicas',dataset)) - status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1 \ - or out == '()': - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - prodError[dataset] = 'could not get locations for %s' % dataset - _logger.error(prodError[dataset]) - _logger.error(out) - else: - tmpRepSites = {} - try: - # convert res to map - exec "tmpRepSites = %s" % out - replicaMap[dataset] = tmpRepSites - except: - prodError[dataset] = 'could not convert HTTP-res to replica map for %s' % dataset - _logger.error(prodError[dataset]) - _logger.error(out) - # append except DBR - if not dataset.startswith('ddo'): - self.replicaMapForBroker[dataset] = tmpRepSites - # error - isFailed = False - # check for failed - for dataset in datasets: - if missingDS.has_key(dataset): - job.jobStatus = 'failed' - job.ddmErrorCode = ErrorCode.EC_GUID - job.ddmErrorDiag = missingDS[dataset] - # set missing - for tmpFile in job.Files: - if tmpFile.dataset == dataset: - tmpFile.status = 'missing' - # append - jobsFailed.append(job) - isFailed = True - # message for TA - if self.onlyTA: - _logger.error("%s %s" % (self.timestamp,missingDS[dataset])) - self.sendTaMesg("%s %s" % 
(job.taskID,missingDS[dataset]),msgType='error') - else: - _logger.debug("%s %s failed with %s" % (self.timestamp,job.PandaID,missingDS[dataset])) - break - if isFailed: - continue - # check for waiting - for dataset in datasets: - if prodError[dataset] != '': - # append job to waiting list - jobsWaiting.append(job) - isFailed = True - # message for TA - if self.onlyTA: - _logger.error("%s %s" % (self.timestamp,prodError[dataset])) - break - if isFailed: - continue - # set cloud - if panda_config.enableDynamicTA and job.prodSourceLabel in ['managed','validation'] \ - and job.cloud in ['NULL',''] and (not job.taskID in [None,'NULL',0]): - # look into map to check if it is already gotten - if not cloudMap.has_key(job.taskID): - # instantiate TaskAssigner - cloudResolver = TaskAssigner.TaskAssigner(self.taskBuffer,self.siteMapper, - job.taskID,job.prodSourceLabel,job) - # check cloud - _logger.debug("%s check cloud for %s" % (self.timestamp,job.taskID)) - retCloud = cloudResolver.checkCloud() - _logger.debug("%s checkCloud() -> %s" % (self.timestamp,retCloud)) - # failed - if retCloud == None: - _logger.error("failed to check cloud for %s" % job.taskID) - # append job to waiting list - jobsWaiting.append(job) - continue - # to be set - elif retCloud == "": - # collect LFN/GUID - tmpLFNs = [] - tmpGUIDs = [] - tmpReLoc = {} - tmpCountMap = {} - for dataset in datasets: - # get LFNs - eachDSLFNs = lfnMap[dataset].values() - tmpLFNs += eachDSLFNs - # get GUIDs - for oneLFN in eachDSLFNs: - tmpGUIDs.append(valMap[oneLFN]['guid']) - # locations - tmpReLoc[dataset] = replicaMap[dataset] - # file counts - tmpCountMap[dataset] = len(eachDSLFNs) - # set cloud - _logger.debug("%s set cloud for %s" % (self.timestamp,job.taskID)) - retCloud = cloudResolver.setCloud(tmpLFNs,tmpGUIDs,tmpReLoc,metadata=job.metadata, - fileCounts=tmpCountMap) - _logger.debug("%s setCloud() -> %s" % (self.timestamp,retCloud)) - if retCloud == None: - _logger.debug("failed to set cloud for %s" % 
job.taskID) - # append job to waiting list - jobsWaiting.append(job) - continue - # append to map - cloudMap[job.taskID] = retCloud - # set cloud - job.cloud = cloudMap[job.taskID] - # message for TA - if self.onlyTA: - _logger.debug("%s set %s:%s" % (self.timestamp,job.taskID,job.cloud)) - _logger.debug('%s replacing generic LFNs' % self.timestamp) - # replace generic LFN with real LFN - replaceList = [] - isFailed = False - for file in job.Files: - if file.type == 'input' and file.dispatchDBlock == 'NULL': - addToLfnMap = True - if file.GUID == 'NULL': - # get LFN w/o attemptNr - basename = re.sub('\.\d+$','',file.lfn) - if basename == file.lfn: - # replace - if basename in lfnMap[file.dataset].keys(): - file.lfn = lfnMap[file.dataset][basename] - replaceList.append((basename,file.lfn)) - # set GUID - if file.lfn in valMap: - file.GUID = valMap[file.lfn]['guid'] - file.fsize = valMap[file.lfn]['fsize'] - file.md5sum = valMap[file.lfn]['md5sum'] - file.checksum = valMap[file.lfn]['chksum'] - file.scope = valMap[file.lfn]['scope'] - # remove white space - if file.md5sum != None: - file.md5sum = file.md5sum.strip() - if file.checksum != None: - file.checksum = file.checksum.strip() - else: - if not job.prodSourceLabel in ['managed','test']: - addToLfnMap = False - # check missing file - if file.GUID == 'NULL' or job.prodSourceLabel in ['managed','test']: - if not file.lfn in valMap: - # append job to waiting list - errMsg = "GUID for %s not found in DQ2" % file.lfn - _logger.debug("%s %s" % (self.timestamp,errMsg)) - file.status = 'missing' - if not job in jobsFailed: - job.jobStatus = 'failed' - job.ddmErrorCode = ErrorCode.EC_GUID - job.ddmErrorDiag = errMsg - jobsFailed.append(job) - isFailed = True - continue - # add to allLFNs/allGUIDs - if addToLfnMap: - if not allLFNs.has_key(job.cloud): - allLFNs[job.cloud] = [] - if not allGUIDs.has_key(job.cloud): - allGUIDs[job.cloud] = [] - allLFNs[job.cloud].append(file.lfn) - allGUIDs[job.cloud].append(file.GUID) - # 
modify jobParameters - if not isFailed: - for patt,repl in replaceList: - job.jobParameters = re.sub('%s ' % patt, '%s ' % repl, job.jobParameters) - # append job to processed list - jobsProcessed.append(job) - # return if TA only - if self.onlyTA: - _logger.debug("%s end TA sessions" % self.timestamp) - return - _logger.debug('%s checking missing files at T1' % self.timestamp) - # get missing LFNs from source LRC/LFC - missLFNs = {} - for cloudKey in allLFNs.keys(): - # use BNL by default - dq2URL = self.siteMapper.getSite('BNL_ATLAS_1').dq2url - dq2SE = [] - # use cloud's source - if self.siteMapper.checkCloud(cloudKey): - tmpSrcID = self.siteMapper.getCloud(cloudKey)['source'] - tmpSrcSite = self.siteMapper.getSite(tmpSrcID) - # get LRC/LFC URL - if not tmpSrcSite.lfchost in [None,'']: - # LFC - dq2URL = 'lfc://'+tmpSrcSite.lfchost+':/grid/atlas/' - if tmpSrcSite.se != None: - for tmpSrcSiteSE in tmpSrcSite.se.split(','): - match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE) - if match != None: - dq2SE.append(match.group(1)) - # hack for split T1 - if cloudKey == 'NL': - tmpSplitSite = self.siteMapper.getSite('NIKHEF-ELPROD') - if tmpSplitSite.se != None: - for tmpSrcSiteSE in tmpSplitSite.se.split(','): - match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE) - if match != None: - dq2SE.append(match.group(1)) - else: - # LRC - dq2URL = tmpSrcSite.dq2url - dq2SE = [] - # get missing files - tmpMissLFNs = brokerage.broker_util.getMissLFNsFromLRC(allLFNs[cloudKey],dq2URL,allGUIDs[cloudKey],dq2SE) - # append - if not missLFNs.has_key(cloudKey): - missLFNs[cloudKey] = [] - missLFNs[cloudKey] += tmpMissLFNs - _logger.debug('%s checking T2 LFC' % self.timestamp) - # check availability of files at T2 - for cloudKey,tmpAllLFNs in allLFNs.iteritems(): - if len(self.jobs) > 0 and (self.jobs[0].prodSourceLabel in ['user','panda','ddm'] or \ - self.jobs[0].processingType.startswith('gangarobot') or \ - self.jobs[0].processingType.startswith('hammercloud')): - continue - 
# add cloud - if not self.availableLFNsInT2.has_key(cloudKey): - self.availableLFNsInT2[cloudKey] = {} - # loop over all files to find datasets - for tmpCheckLFN in tmpAllLFNs: - # add dataset - if not lfnDsMap.has_key(tmpCheckLFN): - continue - tmpDsName = lfnDsMap[tmpCheckLFN] - if not self.availableLFNsInT2[cloudKey].has_key(tmpDsName): - # collect sites - tmpSiteNameDQ2Map = DataServiceUtils.getSitesWithDataset(tmpDsName,self.siteMapper,replicaMap,cloudKey,getDQ2ID=True) - if tmpSiteNameDQ2Map == {}: - continue - self.availableLFNsInT2[cloudKey][tmpDsName] = {'allfiles':[],'allguids':[],'sites':{}} - for tmpSiteName in tmpSiteNameDQ2Map.keys(): - self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName] = [] - self.availableLFNsInT2[cloudKey][tmpDsName]['siteDQ2IDs'] = tmpSiteNameDQ2Map - # add files - if not tmpCheckLFN in self.availableLFNsInT2[cloudKey][tmpDsName]: - self.availableLFNsInT2[cloudKey][tmpDsName]['allfiles'].append(tmpCheckLFN) - self.availableLFNsInT2[cloudKey][tmpDsName]['allguids'].append(allGUIDs[cloudKey][allLFNs[cloudKey].index(tmpCheckLFN)]) - # get available files at each T2 - for tmpDsName in self.availableLFNsInT2[cloudKey].keys(): - checkedDq2SiteMap = {} - checkLfcSeMap = {} - for tmpSiteName in self.availableLFNsInT2[cloudKey][tmpDsName]['sites'].keys(): - tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) - # add LFC - if not checkLfcSeMap.has_key(tmpSiteSpec.lfchost): - checkLfcSeMap[tmpSiteSpec.lfchost] = {} - # add site - if not checkLfcSeMap[tmpSiteSpec.lfchost].has_key(tmpSiteName): - checkLfcSeMap[tmpSiteSpec.lfchost][tmpSiteName] = [] - # add SE - if tmpSiteSpec.se != None: - for tmpSrcSiteSE in tmpSiteSpec.se.split(','): - match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE) - if match != None: - checkLfcSeMap[tmpSiteSpec.lfchost][tmpSiteName].append(match.group(1)) - # LFC lookup - for tmpLfcHost in checkLfcSeMap.keys(): - # get SEs - tmpSEList = [] - for tmpSiteName in checkLfcSeMap[tmpLfcHost].keys(): - 
tmpSEList += checkLfcSeMap[tmpLfcHost][tmpSiteName] - # get available file list - _logger.debug('%s checking T2 LFC=%s for %s' % (self.timestamp,tmpLfcHost,tmpSEList)) - bulkAvFiles = brokerage.broker_util.getFilesFromLRC(self.availableLFNsInT2[cloudKey][tmpDsName]['allfiles'], - 'lfc://'+tmpLfcHost+':/grid/atlas/', - self.availableLFNsInT2[cloudKey][tmpDsName]['allguids'], - storageName=tmpSEList,getPFN=True) - # check each site - for tmpSiteName in checkLfcSeMap[tmpLfcHost].keys(): - self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName] = [] - for tmpLFNck,tmpPFNlistck in bulkAvFiles.iteritems(): - siteHasFileFlag = False - for tmpPFNck in tmpPFNlistck: - # check se - for tmpSE in checkLfcSeMap[tmpLfcHost][tmpSiteName]: - if '://'+tmpSE in tmpPFNck: - siteHasFileFlag = True - break - # escape - if siteHasFileFlag: - break - # append - if siteHasFileFlag: - self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName].append(tmpLFNck) - _logger.debug('%s available %s files at %s T2=%s for %s' % \ - (self.timestamp, - len(self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName]), - cloudKey,tmpSiteName,tmpDsName)) - _logger.debug('%s missLFNs at T1 %s' % (self.timestamp,missLFNs)) - # check if files in source LRC/LFC - tmpJobList = tuple(jobsProcessed) - for job in tmpJobList: - # check only production/test jobs - if not job.prodSourceLabel in ['managed','test','software','rc_test','ptest']: - continue - # don't check if site is already set - if job.prodSourceLabel in ['managed','test'] and not job.computingSite in ['NULL','',None]: - continue - missingFlag = False - for file in job.Files: - if file.type == 'input': - if missLFNs.has_key(job.cloud) and file.lfn in missLFNs[job.cloud]: - # set file status - file.status = 'missing' - missingFlag = True - # check if missing files are available at T2s - goToT2 = None - if missingFlag: - tmpCandT2s = None - for tmpFile in job.Files: - if tmpFile.type == 'input' and tmpFile.status == 'missing': - 
# no cloud info - if not self.availableLFNsInT2.has_key(job.cloud): - goToT2 = False - break - # no dataset info - if not self.availableLFNsInT2[job.cloud].has_key(tmpFile.dataset): - goToT2 = False - break - # initial candidates - if tmpCandT2s == None: - tmpCandT2s = self.availableLFNsInT2[job.cloud][tmpFile.dataset]['sites'] - # check all candidates - newCandT2s = [] - for tmpCandT2 in tmpCandT2s: - # site doesn't have the dataset - if not self.availableLFNsInT2[job.cloud][tmpFile.dataset]['sites'].has_key(tmpCandT2): - continue - # site has the file - if tmpFile.lfn in self.availableLFNsInT2[job.cloud][tmpFile.dataset]['sites'][tmpCandT2]: - if not tmpCandT2 in newCandT2s: - newCandT2s.append(tmpCandT2) - # set new candidates - tmpCandT2s = newCandT2s - # no candidates left - if tmpCandT2s == []: - goToT2 = False - break - # go to T2 - if goToT2 == None: - goToT2 = True - # remove job not to process further - if missingFlag and goToT2 != True: - jobsProcessed.remove(job) - # revert - for oJob in self.jobs: - if oJob.PandaID == job.PandaID: - jobsWaiting.append(oJob) - break - # get missing datasets - if missingFlag: - if job.processingType.startswith('gangarobot') or \ - job.processingType.startswith('hammercloud'): - pass - elif not job.prodSourceLabel in ['managed']: - pass - else: - for tmpFile in job.Files: - if tmpFile.type == 'input' and tmpFile.status == 'missing' and \ - not tmpFile.dataset.startswith('ddo'): - # append - if not self.missingDatasetList.has_key(job.cloud): - self.missingDatasetList[job.cloud] = {} - if not self.missingDatasetList[job.cloud].has_key(tmpFile.dataset): - self.missingDatasetList[job.cloud][tmpFile.dataset] = [] - if not tmpFile.GUID in self.missingDatasetList[job.cloud][tmpFile.dataset]: - self.missingDatasetList[job.cloud][tmpFile.dataset].append(tmpFile.GUID) - # set data summary fields - for tmpJob in self.jobs: - try: - # set only for production/analysis/test - if not tmpJob.prodSourceLabel in 
['managed','test','rc_test','ptest','user']: - continue - # loop over all files - tmpJob.nInputDataFiles = 0 - tmpJob.inputFileBytes = 0 - tmpInputFileProject = None - tmpInputFileType = None - for tmpFile in tmpJob.Files: - # use input files and ignore DBR/lib.tgz - if tmpFile.type == 'input' and (not tmpFile.dataset.startswith('ddo')) \ - and not tmpFile.lfn.endswith('.lib.tgz'): - tmpJob.nInputDataFiles += 1 - if not tmpFile.fsize in ['NULL',None,0,'0']: - tmpJob.inputFileBytes += tmpFile.fsize - # get input type and project - if tmpInputFileProject == None: - tmpInputItems = tmpFile.dataset.split('.') - # input project - tmpInputFileProject = tmpInputItems[0] - # input type. ignore user/group/groupXY - if len(tmpInputItems) > 4 and (not tmpInputItems[0] in ['','NULL','user','group']) \ - and (not tmpInputItems[0].startswith('group')): - tmpInputFileType = tmpInputItems[4] - # set input type and project - if not tmpJob.prodDBlock in ['',None,'NULL']: - # input project - if tmpInputFileProject != None: - tmpJob.inputFileProject = tmpInputFileProject - # input type - if tmpInputFileType != None: - tmpJob.inputFileType = tmpInputFileType - # protection - maxInputFileBytes = 99999999999 - if tmpJob.inputFileBytes > maxInputFileBytes: - tmpJob.inputFileBytes = maxInputFileBytes - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("failed to set data summary fields for PandaID=%s: %s %s" % (tmpJob.PandaID,errType,errValue)) - # send jobs to jobsWaiting - self.taskBuffer.keepJobs(jobsWaiting) - # update failed job - self.taskBuffer.updateJobs(jobsFailed,True) - # remove waiting/failed jobs - self.jobs = jobsProcessed - # delete huge variables - del lfnMap - del valMap - del prodError - del jobsWaiting - del jobsProcessed - del allLFNs - del allGUIDs - del cloudMap - del missLFNs - - - # remove waiting jobs - def removeWaitingJobs(self): - jobsWaiting = [] - jobsProcessed = [] - for tmpJob in self.jobs: - if tmpJob.jobStatus == 'waiting': - 
jobsWaiting.append(tmpJob) - else: - jobsProcessed.append(tmpJob) - # send jobs to jobsWaiting - self.taskBuffer.keepJobs(jobsWaiting) - # remove waiting/failed jobs - self.jobs = jobsProcessed - - - # memory checker - def _memoryCheck(self): - try: - import os - proc_status = '/proc/%d/status' % os.getpid() - procfile = open(proc_status) - name = "" - vmSize = "" - vmRSS = "" - # extract Name,VmSize,VmRSS - for line in procfile: - if line.startswith("Name:"): - name = line.split()[-1] - continue - if line.startswith("VmSize:"): - vmSize = "" - for item in line.split()[1:]: - vmSize += item - continue - if line.startswith("VmRSS:"): - vmRSS = "" - for item in line.split()[1:]: - vmRSS += item - continue - procfile.close() - _logger.debug('%s MemCheck PID=%s Name=%s VSZ=%s RSS=%s' % (self.timestamp,os.getpid(),name,vmSize,vmRSS)) - except: - type, value, traceBack = sys.exc_info() - _logger.error("memoryCheck() : %s %s" % (type,value)) - _logger.debug('%s MemCheck PID=%s unknown' % (self.timestamp,os.getpid())) - return - - - # check DDM response - def isDQ2ok(self,out): - if out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1 \ - or out == '()': - return False - return True - - - # get list of files in dataset - def getListFilesInDataset(self,dataset): - # use cache data - if self.lfnDatasetMap.has_key(dataset): - return True,self.lfnDatasetMap[dataset] - for iDDMTry in range(3): - _logger.debug((self.timestamp,'listFilesInDataset',dataset)) - status,out = ddm.DQ2.main('listFilesInDataset',dataset) - if out.find("DQUnknownDatasetException") != -1: - break - elif status == -1: - break - elif status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error("%s %s" % (self.timestamp,out)) - return False,{} - # convert - items = {} - try: - exec "items = %s[0]" % out - except: - return 
False,{} - return True,items - - - # get list of datasets in container - def getListDatasetInContainer(self,container): - # get datasets in container - _logger.debug((self.timestamp,'listDatasetsInContainer',container)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('listDatasetsInContainer',container) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - _logger.debug('%s %s' % (self.timestamp,out)) - if status != 0 or out.startswith('Error'): - return False,out - datasets = [] - try: - # convert to list - exec "datasets = %s" % out - except: - return False,out - return True,datasets - - - def getListDatasetReplicasInContainer(self,container,getMap=False): - # get datasets in container - _logger.debug((self.timestamp,'listDatasetsInContainer',container)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('listDatasetsInContainer',container) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1 \ - or out == '()': - time.sleep(60) - else: - break - _logger.debug('%s %s' % (self.timestamp,out)) - if status != 0 or out.startswith('Error'): - return status,out - datasets = [] - try: - # convert to list - exec "datasets = %s" % out - except: - return status,out - # loop over all datasets - allRepMap = {} - for dataset in datasets: - _logger.debug((self.timestamp,'listDatasetReplicas',dataset)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1 \ - or out == '()': - time.sleep(60) - else: - break - _logger.debug('%s %s' % (self.timestamp,out)) - if status != 0 or out.startswith('Error'): - return status,out - tmpRepSites = {} - try: - # convert res to map - exec 
"tmpRepSites = %s" % out - except: - return status,out - # get map - if getMap: - allRepMap[dataset] = tmpRepSites - continue - # otherwise get sum - for siteId,statList in tmpRepSites.iteritems(): - if not allRepMap.has_key(siteId): - # append - allRepMap[siteId] = [statList[-1],] - else: - # add - newStMap = {} - for stName,stNum in allRepMap[siteId][0].iteritems(): - if statList[-1].has_key(stName): - # try mainly for archived=None - try: - newStMap[stName] = stNum + statList[-1][stName] - except: - newStMap[stName] = stNum - else: - newStMap[stName] = stNum - allRepMap[siteId] = [newStMap,] - # return - _logger.debug('%s %s' % (self.timestamp,str(allRepMap))) - if not getMap: - return 0,str(allRepMap) - else: - return 0,allRepMap - - - # get list of replicas for a dataset - def getListDatasetReplicas(self,dataset): - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug("%s %s/%s listDatasetReplicas %s" % (self.timestamp,iDDMTry,nTry,dataset)) - status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result - if status != 0 or out.startswith('Error'): - _logger.error(self.timestamp+' '+out) - _logger.error('%s bad DQ2 response for %s' % (self.timestamp,dataset)) - return False,{} - try: - # convert res to map - exec "tmpRepSites = %s" % out - _logger.debug('%s getListDatasetReplicas->%s' % (self.timestamp,str(tmpRepSites))) - return True,tmpRepSites - except: - _logger.error(self.timestamp+' '+out) - _logger.error('%s could not convert HTTP-res to replica map for %s' % (self.timestamp,dataset)) - return False,{} - - - # delete original locations - def deleteDatasetReplicas(self,datasets,keepSites): - # loop over all datasets - for dataset in datasets: - # get locations - status,tmpRepSites = self.getListDatasetReplicas(dataset) - if not status: - return False - # no replicas - if len(tmpRepSites.keys()) == 0: - continue - delSites = [] - for tmpRepSite in 
tmpRepSites.keys(): - if not tmpRepSite in keepSites: - delSites.append(tmpRepSite) - # no repilicas to be deleted - if delSites == []: - continue - # delete - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug("%s %s/%s deleteDatasetReplicas %s %s" % (self.timestamp,iDDMTry,nTry,dataset,str(delSites))) - status,out = ddm.DQ2.main('deleteDatasetReplicas',dataset,delSites) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result - if status != 0 or out.startswith('Error'): - _logger.error(self.timestamp+' '+out) - _logger.error('%s bad DQ2 response for %s' % (self.timestamp,dataset)) - return False - _logger.debug(self.timestamp+' '+out) - # return - _logger.debug('%s deleted replicas for %s' % (self.timestamp,str(datasets))) - return True - - - # dynamic data placement for analysis jobs - def _dynamicDataPlacement(self): - # no jobs - if len(self.jobs) == 0: - return - # only successful analysis - if self.jobs[0].jobStatus in ['failed','cancelled'] or (not self.jobs[0].prodSourceLabel in ['user','panda']): - return - # execute - _logger.debug('%s execute PD2P' % self.timestamp) - from DynDataDistributer import DynDataDistributer - ddd = DynDataDistributer(self.jobs,self.taskBuffer,self.siteMapper) - ddd.run() - _logger.debug('%s finished PD2P' % self.timestamp) - return - - - # make dis datasets for existing files to avoid deletion when jobs are queued - def _makeDisDatasetsForExistingfiles(self): - _logger.debug('%s make dis datasets for existing files' % self.timestamp) - # collect existing files - dsFileMap = {} - nMaxJobs = 20 - nJobsMap = {} - for tmpJob in self.jobs: - # use production or test jobs only - if not tmpJob.prodSourceLabel in ['managed','test']: - continue - # ignore inappropriate status - if tmpJob.jobStatus in ['failed','cancelled','waiting']: - continue - # check cloud - if (tmpJob.cloud == 'ND' and self.siteMapper.getSite(tmpJob.computingSite).cloud == 'ND') or \ - (tmpJob.cloud == 'US' and 
self.siteMapper.getSite(tmpJob.computingSite).cloud == 'US'): - continue - # check SE to use T2 only - tmpSrcID = self.siteMapper.getCloud(tmpJob.cloud)['source'] - srcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpSrcID).se) - dstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpJob.computingSite).se) - if srcSEs == dstSEs: - continue - # look for log _sub dataset to be used as a key - logSubDsName = '' - for tmpFile in tmpJob.Files: - if tmpFile.type == 'log': - logSubDsName = tmpFile.destinationDBlock - break - # append site - destDQ2ID = self.siteMapper.getSite(tmpJob.computingSite).ddm - # T1 used as T2 - if tmpJob.cloud != self.siteMapper.getSite(tmpJob.computingSite).cloud and \ - not destDQ2ID.endswith('PRODDISK') and \ - self.siteMapper.getSite(tmpJob.computingSite).cloud in ['US']: - tmpSeTokens = self.siteMapper.getSite(tmpJob.computingSite).setokens - if tmpSeTokens.has_key('ATLASPRODDISK'): - destDQ2ID = tmpSeTokens['ATLASPRODDISK'] - mapKeyJob = (destDQ2ID,logSubDsName) - # increment the number of jobs per key - if not nJobsMap.has_key(mapKeyJob): - nJobsMap[mapKeyJob] = 0 - mapKey = (destDQ2ID,logSubDsName,nJobsMap[mapKeyJob]/nMaxJobs) - nJobsMap[mapKeyJob] += 1 - if not dsFileMap.has_key(mapKey): - dsFileMap[mapKey] = {} - # add files - for tmpFile in tmpJob.Files: - if tmpFile.type != 'input': - continue - # if files are unavailable at the dest site normal dis datasets contain them - # or files are cached - if not tmpFile.status in ['ready']: - continue - # if available at T2 - realDestDQ2ID = (destDQ2ID,) - if self.availableLFNsInT2.has_key(tmpJob.cloud) and self.availableLFNsInT2[tmpJob.cloud].has_key(tmpFile.dataset) \ - and self.availableLFNsInT2[tmpJob.cloud][tmpFile.dataset]['sites'].has_key(tmpJob.computingSite) \ - and tmpFile.lfn in self.availableLFNsInT2[tmpJob.cloud][tmpFile.dataset]['sites'][tmpJob.computingSite]: - realDestDQ2ID = 
self.availableLFNsInT2[tmpJob.cloud][tmpFile.dataset]['siteDQ2IDs'][tmpJob.computingSite] - realDestDQ2ID = tuple(realDestDQ2ID) - # append - if not dsFileMap[mapKey].has_key(realDestDQ2ID): - dsFileMap[mapKey][realDestDQ2ID] = {'taskID':tmpJob.taskID, - 'PandaID':tmpJob.PandaID, - 'files':{}} - if not dsFileMap[mapKey][realDestDQ2ID]['files'].has_key(tmpFile.lfn): - dsFileMap[mapKey][realDestDQ2ID]['files'][tmpFile.lfn] = {'lfn' :tmpFile.lfn, - 'guid':tmpFile.GUID, - 'fileSpecs':[]} - # add file spec - dsFileMap[mapKey][realDestDQ2ID]['files'][tmpFile.lfn]['fileSpecs'].append(tmpFile) - # loop over all locations - dispList = [] - for tmpMapKey,tmpDumVal in dsFileMap.iteritems(): - tmpDumLocation,tmpLogSubDsName,tmpBunchIdx = tmpMapKey - for tmpLocationList,tmpVal in tmpDumVal.iteritems(): - for tmpLocation in tmpLocationList: - tmpFileList = tmpVal['files'] - if tmpFileList == {}: - continue - nMaxFiles = 500 - iFiles = 0 - iLoop = 0 - while iFiles < len(tmpFileList): - subFileNames = tmpFileList.keys()[iFiles:iFiles+nMaxFiles] - if len(subFileNames) == 0: - break - # dis name - disDBlock = "panda.%s.%s.%s.%s_dis0%s%s" % (tmpVal['taskID'],time.strftime('%m.%d'),'GEN', - commands.getoutput('uuidgen'),iLoop, - tmpVal['PandaID']) - iFiles += nMaxFiles - lfns = [] - guids = [] - fsizes = [] - chksums = [] - for tmpSubFileName in subFileNames: - lfns.append(tmpFileList[tmpSubFileName]['lfn']) - guids.append(tmpFileList[tmpSubFileName]['guid']) - fsizes.append(None) - chksums.append(None) - # set dis name - for tmpFileSpec in tmpFileList[tmpSubFileName]['fileSpecs']: - if tmpFileSpec.status in ['ready'] and tmpFileSpec.dispatchDBlock == 'NULL': - tmpFileSpec.dispatchDBlock = disDBlock - # register datasets - iLoop += 1 - _logger.debug((self.timestamp,'ext registerNewDataset',disDBlock,lfns,guids,fsizes,chksums, - None,None,None,True)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('registerNewDataset',disDBlock,lfns,guids,fsizes,chksums, - None,None,None,True) - 
if status != 0 and out.find('DQDatasetExistsException') != -1: - break - elif status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - _logger.debug("%s sleep %s for %s" % (self.timestamp,iDDMTry,disDBlock)) - _logger.debug(status) - _logger.debug(out) - time.sleep(60) - else: - break - if status != 0 or out.find('Error') != -1: - _logger.error("%s %s" % (self.timestamp,out)) - continue - _logger.debug("%s %s" % (self.timestamp,out)) - # get VUID - try: - exec "vuid = %s['vuid']" % out - # dataset spec. currentfiles is used to count the number of failed jobs - ds = DatasetSpec() - ds.vuid = vuid - ds.name = disDBlock - ds.type = 'dispatch' - ds.status = 'defined' - ds.numberfiles = len(lfns) - ds.currentfiles = 0 - dispList.append(ds) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("ext registerNewDataset : failed to decode VUID for %s - %s %s" % (disDBlock,errType,errValue)) - continue - # freezeDataset dispatch dataset - _logger.debug((self.timestamp,'freezeDataset',disDBlock)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('freezeDataset',disDBlock) - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - if status != 0 or (out.find('Error') != -1 and out.find("is frozen") == -1): - _logger.error("%s %s" % (self.timestamp,out)) - continue - _logger.debug("%s %s" % (self.timestamp,out)) - # register location - _logger.debug((self.timestamp,'registerDatasetLocation',disDBlock,tmpLocation,0,1,None,None,None,"7 days")) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('registerDatasetLocation',disDBlock,tmpLocation,0,1,None,None,None,"7 days") - if status != 0 or out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on 
the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1: - time.sleep(60) - else: - break - _logger.debug("%s %s" % (self.timestamp,out)) - # failure - if status != 0 or out.find('Error') != -1: - _logger.error("%s %s" % (self.timestamp,out)) - continue - # insert datasets to DB - self.taskBuffer.insertDatasets(dispList) - _logger.debug('%s finished to make dis datasets for existing files' % self.timestamp) - return - - - # pin input dataset - def _pinInputDatasets(self): - _logger.debug('%s pin input datasets' % self.timestamp) - # collect input datasets and locations - doneList = [] - allReplicaMap = {} - for tmpJob in self.jobs: - # ignore HC jobs - if tmpJob.processingType.startswith('gangarobot') or \ - tmpJob.processingType.startswith('hammercloud'): - continue - # use production or test or user jobs only - if not tmpJob.prodSourceLabel in ['managed','test','user']: - continue - # ignore inappropriate status - if tmpJob.jobStatus in ['failed','cancelled','waiting']: - continue - # set lifetime - if tmpJob.prodSourceLabel in ['managed','test']: - pinLifeTime = 7 - else: - pinLifeTime = 7 - # get source - if tmpJob.prodSourceLabel in ['managed','test']: - tmpSrcID = self.siteMapper.getCloud(tmpJob.cloud)['source'] - srcDQ2ID = self.siteMapper.getSite(tmpSrcID).ddm - else: - srcDQ2ID = self.siteMapper.getSite(tmpJob.computingSite).ddm - # prefix of DQ2 ID - srcDQ2IDprefix = re.sub('_[A-Z,0-9]+DISK$','',srcDQ2ID) - # loop over all files - for tmpFile in tmpJob.Files: - # use input files and ignore DBR/lib.tgz - if tmpFile.type == 'input' and \ - not tmpFile.lfn.endswith('.lib.tgz') and \ - not tmpFile.dataset.startswith('ddo') and \ - not tmpFile.dataset.startswith('user') and \ - not tmpFile.dataset.startswith('group'): - # get replica locations - if not allReplicaMap.has_key(tmpFile.dataset): - if tmpFile.dataset.endswith('/'): - status,tmpRepSitesMap = self.getListDatasetReplicasInContainer(tmpFile.dataset,getMap=True) - if status == 
0: - status = True - else: - status = False - else: - status,tmpRepSites = self.getListDatasetReplicas(tmpFile.dataset) - tmpRepSitesMap = {} - tmpRepSitesMap[tmpFile.dataset] = tmpRepSites - # append - if status: - allReplicaMap[tmpFile.dataset] = tmpRepSitesMap - else: - # set empty to avoid further lookup - allReplicaMap[tmpFile.dataset] = {} - # loop over constituent datasets - _logger.debug('%s pin DQ2 prefix=%s' % (self.timestamp,srcDQ2IDprefix)) - for tmpDsName,tmpRepSitesMap in allReplicaMap[tmpFile.dataset].iteritems(): - # loop over locations - for tmpRepSite in tmpRepSitesMap.keys(): - if tmpRepSite.startswith(srcDQ2IDprefix) \ - and not 'TAPE' in tmpRepSite \ - and not 'SCRATCH' in tmpRepSite: - tmpKey = (tmpDsName,tmpRepSite) - # already done - if tmpKey in doneList: - continue - # append to avoid repetition - doneList.append(tmpKey) - # get metadata - status,tmpMetadata = self.getReplicaMetadata(tmpDsName,tmpRepSite) - if not status: - continue - # check pin lifetime - if tmpMetadata.has_key('pin_expirationdate'): - if isinstance(tmpMetadata['pin_expirationdate'],types.StringType) and tmpMetadata['pin_expirationdate'] != 'None': - # keep original pin lifetime if it is longer - origPinLifetime = datetime.datetime.strptime(tmpMetadata['pin_expirationdate'],'%Y-%m-%d %H:%M:%S') - if origPinLifetime > datetime.datetime.utcnow()+datetime.timedelta(days=pinLifeTime): - _logger.debug('%s skip pinning for %s:%s due to longer lifetime %s' % (self.timestamp, - tmpDsName,tmpRepSite, - tmpMetadata['pin_expirationdate'])) - continue - # set pin lifetime - status = self.setReplicaMetadata(tmpDsName,tmpRepSite,'pin_lifetime','%s days' % pinLifeTime) - # retrun - _logger.debug('%s pin input datasets done' % self.timestamp) - return - - - # make T1 subscription for missing files - def _makeSubscriptionForMissing(self): - _logger.debug('%s make subscriptions for missing files' % self.timestamp) - # collect datasets - missingList = {} - for tmpCloud,tmpMissDatasets in 
self.missingDatasetList.iteritems(): - # append cloud - if not missingList.has_key(tmpCloud): - missingList[tmpCloud] = [] - # loop over all datasets - for tmpDsName,tmpMissFiles in tmpMissDatasets.iteritems(): - # check if datasets in container are used - if tmpDsName.endswith('/'): - # convert container to datasets - tmpStat,tmpDsList = self.getListDatasetInContainer(tmpDsName) - if not tmpStat: - _logger.error('%s failed to get datasets in container:%s' % (self.timestamp,tmpDsName)) - continue - # check if each dataset is actually used - for tmpConstDsName in tmpDsList: - # skip if already checked - if tmpDsName in missingList[tmpCloud]: - continue - # get files in each dataset - tmpStat,tmpFilesInDs = self.getListFilesInDataset(tmpConstDsName) - if not tmpStat: - _logger.error('%s failed to get files in dataset:%s' % (self.timestamp,tmpConstDsName)) - continue - # loop over all files to check the dataset is used - for tmpGUID in tmpMissFiles: - # append if used - if tmpFilesInDs.has_key(tmpGUID): - missingList[tmpCloud].append(tmpConstDsName) - break - else: - # append dataset w/o checking - if not tmpDsName in missingList[tmpCloud]: - missingList[tmpCloud].append(tmpDsName) - # make subscriptions - for tmpCloud,missDsNameList in missingList.iteritems(): - # get distination - tmpDstID = self.siteMapper.getCloud(tmpCloud)['source'] - dstDQ2ID = self.siteMapper.getSite(tmpDstID).ddm - # register subscription - for missDsName in missDsNameList: - _logger.debug('%s make subscription at %s for missing %s' % (self.timestamp,dstDQ2ID,missDsName)) - self.makeSubscription(missDsName,dstDQ2ID) - # retrun - _logger.debug('%s make subscriptions for missing files done' % self.timestamp) - return - - - # check DDM response - def isDQ2ok(self,out): - if out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1 \ - or out == '()': - return False - return True - - - # 
make subscription - def makeSubscription(self,dataset,dq2ID): - # return for failuer - retFailed = False - # make subscription - optSrcPolicy = 000001 - nTry = 3 - for iDDMTry in range(nTry): - # register subscription - _logger.debug('%s %s/%s registerDatasetSubscription %s %s' % (self.timestamp,iDDMTry,nTry,dataset,dq2ID)) - status,out = ddm.DQ2.main('registerDatasetSubscription',dataset,dq2ID,version=0,archived=0, - callbacks={},sources={},sources_policy=optSrcPolicy, - wait_for_sources=0,destination=None,query_more_sources=0, - sshare="production",group=None,activity='Production',acl_alias='secondary') - status,out = 0,'' - if out.find('DQSubscriptionExistsException') != -1: - break - elif out.find('DQLocationExistsException') != -1: - break - elif status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result - if out.find('DQSubscriptionExistsException') != -1: - pass - elif status != 0 or out.startswith('Error'): - _logger.error("%s %s" % (self.timestamp,out)) - return retFailed - # update - _logger.debug('%s %s %s' % (self.timestamp,status,out)) - # return - return True - - - # get replica metadata - def getReplicaMetadata(self,datasetName,locationName): - # response for failure - resForFailure = False,{} - # get metadata - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug('%s %s/%s listMetaDataReplica %s %s' % (self.timestamp,iDDMTry,nTry,datasetName,locationName)) - status,out = ddm.DQ2.main('listMetaDataReplica',locationName,datasetName) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error("%s %s" % (self.timestamp,out)) - return resForFailure - metadata = {} - try: - # convert to map - exec "metadata = %s" % out - except: - _logger.error('%s could not convert HTTP-res to replica metadata for %s:%s' % \ - (self.timestamp,datasetName,locationName)) - return resForFailure - # return - _logger.debug('%s getReplicaMetadata -> %s' % 
(self.timestamp,str(metadata))) - return True,metadata - - - # set replica metadata - def setReplicaMetadata(self,datasetName,locationName,attrname,attrvalue): - # response for failure - resForFailure = False - # get metadata - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug('%s %s/%s setReplicaMetaDataAttribute %s %s %s=%s' % (self.timestamp,iDDMTry,nTry,datasetName, - locationName,attrname,attrvalue)) - status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',datasetName,locationName,attrname,attrvalue) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error("%s %s" % (self.timestamp,out)) - return resForFailure - # return - _logger.debug('%s setReplicaMetadata done' % self.timestamp) - return True - - - # send task brokerage message to logger - def sendTaMesg(self,message,msgType=None): - try: - # get logger - tmpPandaLogger = PandaLogger() - # lock HTTP handler - tmpPandaLogger.lock() - tmpPandaLogger.setParams({'Type':'taskbrokerage'}) - # use bamboo for loggername - if panda_config.loggername == 'prod': - tmpLogger = tmpPandaLogger.getHttpLogger('bamboo') - else: - # for dev - tmpLogger = tmpPandaLogger.getHttpLogger(panda_config.loggername) - # add message - if msgType=='error': - tmpLogger.error(message) - elif msgType=='warning': - tmpLogger.warning(message) - elif msgType=='info': - tmpLogger.info(message) - else: - tmpLogger.debug(message) - # release HTTP handler - tmpPandaLogger.release() - except: - pass - time.sleep(1) - diff --git a/current/pandaserver/dataservice/TaLauncher.py b/current/pandaserver/dataservice/TaLauncher.py deleted file mode 100755 index e44a7bc72..000000000 --- a/current/pandaserver/dataservice/TaLauncher.py +++ /dev/null @@ -1,55 +0,0 @@ -''' -launcer for TaskAssigner - -''' - -import sys -import time -import commands -import threading -import cPickle as pickle - -from config import panda_config -from pandalogger.PandaLogger import 
PandaLogger - -# logger -_logger = PandaLogger().getLogger('TaLauncher') - - -class TaLauncher (threading.Thread): - # constructor - def __init__(self,taskBuffer,jobs): - threading.Thread.__init__(self) - self.jobs = jobs - self.taskBuffer = taskBuffer - # time stamp - self.timestamp = time.asctime() - - - # main - def run(self): - try: - _logger.debug('%s startRun' % self.timestamp) - # run setupper sequentially - for job in self.jobs: - # write jobs to file - outFileName = '%s/set.%s_%s' % (panda_config.logdir,job.PandaID,commands.getoutput('uuidgen')) - outFile = open(outFileName,'w') - pickle.dump([job],outFile) - outFile.close() - # run main procedure in another process because python doesn't release memory - com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) - com += 'source /opt/glite/etc/profile.d/grid-env.sh; ' - com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \ - (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python, - panda_config.pandaPython_dir,outFileName) - # add option for TA - com += " -t" - _logger.debug('%s taskID:%s %s' % (self.timestamp,job.taskID,com)) - # exeute - status,output = self.taskBuffer.processLimiter.getstatusoutput(com) - _logger.debug("%s Ret from child process: %s %s" % (self.timestamp,status,output)) - _logger.debug('%s endRun' % self.timestamp) - except: - type, value, traceBack = sys.exc_info() - _logger.error("run() : %s %s" % (type,value)) diff --git a/current/pandaserver/dataservice/TaskAssigner.py b/current/pandaserver/dataservice/TaskAssigner.py deleted file mode 100644 index 677cd4645..000000000 --- a/current/pandaserver/dataservice/TaskAssigner.py +++ /dev/null @@ -1,1180 +0,0 @@ -''' -setup cloud - -''' - -import re -import sys -import time -import types -import random -import commands -import datetime -import brokerage.broker_util -from DDM import ddm -from DDM import dq2Common -from DDM import 
toa -from config import panda_config -from taskbuffer import ProcessGroups -from pandalogger.PandaLogger import PandaLogger -import DataServiceUtils - - -# logger -_logger = PandaLogger().getLogger('TaskAssigner') - -# cutoff for RW -thr_RW_low = 400 -thr_RW_high = 8000 -thr_RW_sub = 600 - -# cutoff for disk -thr_space_low = (1 * 1024) - -# special reduction for TAPE -reductionForTape = 0.5 - -# task types using MC share -taskTypesMcShare = ['evgen'] - -# task types for subscriptions -taskTypesSub = ['simul'] - -# dataset type to ignore file availability check -datasetTypeToSkipCheck = ['log'] - -class TaskAssigner: - # constructor - def __init__(self,taskBuffer,siteMapper,taskID,prodSourceLabel,job): - self.taskBuffer = taskBuffer - self.siteMapper = siteMapper - self.taskID = taskID - self.cloudTask = None - self.prodSourceLabel = prodSourceLabel - self.cloudForSubs = [] - self.job = job - self.metadataMap = {} - self.contDsMap = {} - - - # check cloud - def checkCloud(self): - try: - _logger.info('%s checkCloud' % self.taskID) - # get CloudTask from DB - self.cloudTask = self.taskBuffer.getCloudTask(self.taskID) - if self.cloudTask == None: - _logger.error('%s cannot get CloudTask' % self.taskID) - return None - # if already assigned - if self.cloudTask.status == 'assigned': - _logger.info('%s checked Cloud -> %s' % (self.taskID,self.cloudTask.cloud)) - return self.cloudTask.cloud - # return "" to set cloud later - _logger.info('%s return Cloud=""' % self.taskID) - return "" - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s checkCloud : %s %s" % (self.taskID,type,value)) - return None - - - # set cloud - def setCloud(self,lfns,guids,locations={},metadata=None,fileCounts=None): - try: - _logger.info('%s setCloud' % self.taskID) - _logger.info('%s metadata="%s"' % (self.taskID,metadata)) - _logger.info('%s fileCounts="%s"' % (self.taskID,fileCounts)) - taskType = None - RWs = {} - expRWs = {} - highRWs = {} - prioMap = {} - fullRWs = {} - 
tt2Map = {} - diskCount = 0 - usingOpenDS = False - try: - # parse metadata - if not metadata in (None,'NULL'): - # task type - taskType = metadata.split(';')[0] - # RWs - exec "RWs = %s" % metadata.split(';')[1] - # expected RWs - exec "expRWs = %s" % metadata.split(';')[2] - # RWs for high priority tasks - exec "prioMap = %s" % metadata.split(';')[3] - # full RWs for space calcuration - exec "fullRWs = %s" % metadata.split(';')[4] - # tasktype2 map - exec "tt2Map = %s" % metadata.split(';')[5] - except: - pass - try: - diskCount = int(self.job.maxDiskCount) - except: - pass - message = '%s taskType==%s prio==%s RW==%s DiskCount==%s' % (self.taskID,taskType,prioMap[self.taskID], - expRWs[self.taskID],diskCount) - _logger.info(message) - self.sendMesg(message) - _logger.info('%s RWs = %s' % (self.taskID,str(RWs))) - _logger.info('%s expRWs = %s' % (self.taskID,str(expRWs))) - _logger.info('%s prioMap = %s' % (self.taskID,str(prioMap))) - _logger.info('%s fullRWs = %s' % (self.taskID,str(fullRWs))) - _logger.info('%s tt2Map = %s' % (self.taskID,str(tt2Map))) - # get cloud list - cloudList = self.siteMapper.getCloudList() - # get pilot statistics - nWNmap = self.taskBuffer.getCurrentSiteData() - # get process group - myTaskGroup = ProcessGroups.getProcessGroup(tt2Map[self.taskID]) - # recalculate RWs - for tmpTaskID,tmpExpRW in expRWs.iteritems(): - # skip myself - if tmpTaskID == self.taskID: - continue - # get cloud from DB - tmpCloudInDB = self.taskBuffer.seeCloudTask(tmpTaskID) - # not assigned - if tmpCloudInDB == '': - continue - # increase full RW - if not fullRWs.has_key(tmpCloudInDB): - fullRWs[tmpCloudInDB] = 0 - fullRWs[tmpCloudInDB] += tmpExpRW - # no priority info - if not prioMap.has_key(tmpTaskID): - continue - # lower priority - if prioMap[tmpTaskID] < prioMap[self.taskID]: - continue - # check tasktype2 - tmpTaskGroup = ProcessGroups.getProcessGroup(tt2Map[tmpTaskID]) - # check tasktype2 - if tmpTaskGroup != myTaskGroup: - continue - # increase RW - 
if not RWs.has_key(tmpCloudInDB): - RWs[tmpCloudInDB] = 0 - RWs[tmpCloudInDB] += tmpExpRW - _logger.info('%s newRWs =%s' % (self.taskID,str(RWs))) - _logger.info('%s fullRWs =%s' % (self.taskID,str(fullRWs))) - # remove offline clouds and check validation/fasttrack - tmpCloudList = [] - for tmpCloudName in cloudList: - # get cloud - tmpCloud = self.siteMapper.getCloud(tmpCloudName) - # skip offline clouds - if not tmpCloud['status'] in ['online']: - message = '%s %s skip : status==%s' % (self.taskID,tmpCloudName,tmpCloud['status']) - _logger.info(message) - self.sendMesg(message) - continue - # skip non-validation cloud if validation - if self.prodSourceLabel in ['validation'] and tmpCloud['validation'] != 'true': - message = "%s %s skip : validation=='%s'" % (self.taskID,tmpCloudName,tmpCloud['validation']) - _logger.info(message) - self.sendMesg(message) - continue - # check fast track - if ((taskType in ['evgen'] and prioMap[self.taskID] >= 700) or - (taskType in ['simul'] and prioMap[self.taskID] >= 800)) and tmpCloud['fasttrack'] != 'true': - message = "%s %s skip : fasttrack=='%s'" % (self.taskID,tmpCloudName,tmpCloud['fasttrack']) - _logger.info(message) - self.sendMesg(message) - continue - # check disk count - if diskCount != 0: - enoughSpace = self.checkDiskCount(diskCount,tmpCloudName) - if not enoughSpace: - message = "%s %s skip : no online sites have enough space for DiskCount==%s" % (self.taskID,tmpCloudName,diskCount) - _logger.info(message) - self.sendMesg(message,msgType='warning') - continue - # append - tmpCloudList.append(tmpCloudName) - self.cloudForSubs.append(tmpCloudName) - cloudList = tmpCloudList - # DQ2 location info - _logger.info('%s DQ2 locations %s' % (self.taskID,str(locations))) - # check immutable datasets - for tmpDataset,tmpSites in locations.iteritems(): - sitesForRefresh = [] - for tmpSite in tmpSites.keys(): - tmpStat = tmpSites[tmpSite][-1] - if tmpStat['total'] == -1 or tmpStat['found'] == None: - 
sitesForRefresh.append(tmpSite) - elif tmpStat['immutable'] == 0: - # using open datasets - usingOpenDS = True - _logger.info('%s open dataset : %s' % (self.taskID,tmpDataset)) - # refresh replica info - if sitesForRefresh != []: - # invoke listFileReplicasBySites to refresh replica info - _logger.info('%s listFileReplicasBySites %s:%s' % (self.taskID,tmpDataset,str(sitesForRefresh))) - tmpStat,tmpOut = ddm.DQ2_iter.listFileReplicasBySites(tmpDataset,0,sitesForRefresh,0,300) - _logger.info('%s listFileReplicasBySites end with %s:%s' % (self.taskID,tmpStat,tmpOut)) - # reset tmod to shorten retry interval - self.taskBuffer.resetTmodCloudTask(self.taskID) - removedDQ2Map = {} - t2ListForMissing = {} - diskCopyCloud = None - badMetaMap = {} - if locations != {}: - # sort datasets by the number of sites - numSitesDatasetMap = {} - for dataset,sites in locations.iteritems(): - numSites = len(sites) - if not numSitesDatasetMap.has_key(numSites): - numSitesDatasetMap[numSites] = [] - numSitesDatasetMap[numSites].append(dataset) - numSitesList = numSitesDatasetMap.keys() - numSitesList.sort() - sortedDatasetList = [] - for numSites in numSitesList: - sortedDatasetList += numSitesDatasetMap[numSites] - # loop over datasets starting with fewer replicas - removedCloud = [] - for dataset in sortedDatasetList: - sites = locations[dataset] - tmpDiskCopyCloud = [] - removedDQ2Map[dataset] = [] - _logger.info('%s DS:%s' % (self.taskID,dataset)) - datasetType = DataServiceUtils.getDatasetType(dataset) - for tmpCloudName in cloudList: - useCacheT1 = False - tmpCloud = self.siteMapper.getCloud(tmpCloudName) - if DataServiceUtils.isCachedFile(dataset,self.siteMapper.getSite(tmpCloud['source'])): - # use site's endpoint for CVMFS cache - foundSE = self.siteMapper.getSite(tmpCloud['source']).ddm - tmpDiskCopyCloud.append(tmpCloudName) - # using cached files at T1 - useCacheT1 = True - else: - # look for T1 SE which holds the max number of files - minFound = -1 - foundSE = '' - for 
tmpSePat in tmpCloud['tier1SE']: - # make regexp pattern - if '*' in tmpSePat: - tmpSePat = tmpSePat.replace('*','.*') - tmpSePat = '^' + tmpSePat +'$' - for tmpSE in sites.keys(): - # check name with regexp pattern - if re.search(tmpSePat,tmpSE) == None: - continue - # check metadata - metaOK = self.checkMetadata(dataset,tmpSE) - if not metaOK: - if not badMetaMap.has_key(dataset): - badMetaMap[dataset] = [] - badMetaMap[dataset].append(tmpSE) - _logger.info('%s skip %s due to ToBeDeleted' % (self.taskID,tmpSE)) - continue - # check the number of available files - tmpStat = sites[tmpSE][-1] - if tmpStat['found'] == None: - if minFound == -1: - foundSE = tmpSE - elif minFound < tmpStat['found']: - minFound = tmpStat['found'] - foundSE = tmpSE - # check if disk copy is available - tmpStatusSE,tmpRetSE = toa.getSiteProperty(tmpSE,'tape') - if tmpRetSE != 'True': - if tmpStat['found'] != None and tmpStat['found'] == tmpStat['total']: - tmpDiskCopyCloud.append(tmpCloudName) - else: - _logger.info('%s %s is on tape : %s' % (self.taskID,tmpSE,tmpRetSE)) - # get list of T2s where dataset is available - tmpT2List = [] - tmpT2Map = DataServiceUtils.getSitesWithDataset(dataset,self.siteMapper,locations, - tmpCloudName,True,getDQ2ID=True, - useOnlineSite=True) - for tmpT2Name,tmpT2DQ2List in tmpT2Map.iteritems(): - # skip redundant lookup - if t2ListForMissing.has_key(tmpCloudName) and \ - not tmpT2Name in t2ListForMissing[tmpCloudName]: - continue - # loop over all DQ2 IDs - for tmpT2DQ2 in tmpT2DQ2List: - # check metadata - metaOK = self.checkMetadata(dataset,tmpT2DQ2) - if metaOK: - tmpT2List.append(tmpT2Name) - break - else: - if not badMetaMap.has_key(dataset): - badMetaMap[dataset] = [] - badMetaMap[dataset].append(tmpT2DQ2) - _logger.info('%s skip %s due to ToBeDeleted' % (self.taskID,tmpT2DQ2)) - # take CVMFS cache into account - tmpT2CacheList = DataServiceUtils.getSitesWithCacheDS(tmpCloudName,tmpT2List,self.siteMapper,dataset) - tmpT2List += tmpT2CacheList - # 
remove cloud if T1SE or T2 is not a location - if foundSE == '': - # keep if T2 has the dataset - if tmpT2List == []: - if not tmpCloudName in removedCloud: - _logger.info('%s removed %s' % (self.taskID,tmpCloudName)) - removedCloud.append(tmpCloudName) - # add dataset to map for subscription when T2 has non-cached replica - if (tmpT2List != [] and len(tmpT2CacheList) != len(tmpT2List)) and not tmpCloudName in removedDQ2Map[dataset]: - removedDQ2Map[dataset].append(tmpCloudName) - else: - if not useCacheT1: - # check incomplete or not - tmpStat = sites[foundSE][-1] - if tmpStat['found'] == None or \ - (not datasetType in datasetTypeToSkipCheck and tmpStat['found'] < tmpStat['total']): - # add dataset to map which is subscribed when the task is used due to T2 files - if not tmpCloudName in removedDQ2Map[dataset]: - removedDQ2Map[dataset].append(tmpCloudName) - # aggregate T2 list - if not t2ListForMissing.has_key(tmpCloudName): - t2ListForMissing[tmpCloudName] = tmpT2List - else: - # use sites where all datasets are available - newTmpT2List = [] - for tmpT2 in t2ListForMissing[tmpCloudName]: - if tmpT2 in tmpT2List: - newTmpT2List.append(tmpT2) - t2ListForMissing[tmpCloudName] = newTmpT2List - # disk copy cloud - if diskCopyCloud == None: - diskCopyCloud = tmpDiskCopyCloud - else: - newDiskCopyCloud = [] - for tmpCloudName in diskCopyCloud: - if tmpCloudName in tmpDiskCopyCloud: - newDiskCopyCloud.append(tmpCloudName) - diskCopyCloud = newDiskCopyCloud - # remove clouds - for tmpCloudName in removedCloud: - if tmpCloudName in cloudList: - cloudList.remove(tmpCloudName) - _logger.info('%s new locations after DQ2 filter %s' % (self.taskID,str(cloudList))) - _logger.info('%s clouds where complete disk copies are available %s' % (self.taskID,str(diskCopyCloud))) - _logger.info('%s removed DQ2 map %s' % (self.taskID,str(removedDQ2Map))) - if cloudList == []: - # make subscription to empty cloud - if taskType in taskTypesSub: - _logger.info('%s makeSubscription start' % 
self.taskID) - retSub = self.makeSubscription(removedDQ2Map,RWs,fullRWs,expRWs) - _logger.info('%s makeSubscription end with %s' % (self.taskID,retSub)) - message = '%s no input data locations' % self.taskID - self.sendMesg(message,msgType='warning') - raise RuntimeError, '%s cloud list is empty after DQ2 filter' % self.taskID - message = '%s input data locations %s' % (self.taskID,str(cloudList)) - _logger.info(message) - self.sendMesg(message) - # calculate # of loops - nFile = 200 - nLoop = len(guids) / nFile - if len(guids) % nFile != 0: - nLoop += 1 - iFileList = [] - for iTmp in range(nLoop): - iFileList.append(iTmp*nFile) - # truncate list to avoid too many lookup - maxLoop = 100 - if len(iFileList) > maxLoop: - random.shuffle(iFileList) - iFileList = iFileList[:maxLoop] - iFileList.sort() - # count the number of files to be lookup - maxNFiles = 0 - if not usingOpenDS: - # if dataset is open, doesn't check nFiles - for iFile in iFileList: - maxNFiles += len(lfns[iFile:iFile+nFile]) - # loop over all cloud - weightParams = {} - foundCandidateWithT1 = [] - candidatesUsingT2 = [] - for tmpCloudName in cloudList: - _logger.info('%s calculate weight for %s' % (self.taskID,tmpCloudName)) - # add missing cloud in RWs - if not RWs.has_key(tmpCloudName): - RWs[tmpCloudName] = 0 - if not fullRWs.has_key(tmpCloudName): - fullRWs[tmpCloudName] = 0 - # get cloud - tmpCloud = self.siteMapper.getCloud(tmpCloudName) - weightParams[tmpCloudName] = {} - # get T1 site - tmpT1Site = self.siteMapper.getSite(tmpCloud['source']) - # get number of running jobs. 
Initially set 1 to avoid zero dividing - nPilot = 1 - for siteName in tmpCloud['sites']: - if nWNmap.has_key(siteName): - nPilot += (nWNmap[siteName]['getJob'] + nWNmap[siteName]['updateJob']) - weightParams[tmpCloudName]['nPilot'] = nPilot - _logger.info('%s # of pilots %s' % (self.taskID,nPilot)) - # available space - weightParams[tmpCloudName]['space'] = tmpT1Site.space - _logger.info('%s T1 space %s' % (self.taskID,tmpT1Site.space)) - # MC share - weightParams[tmpCloudName]['mcshare'] = tmpCloud['mcshare'] - _logger.info('%s MC share %s' % (self.taskID,tmpCloud['mcshare'])) - # calculate available space = totalT1space - ((RW(cloud)+RW(thistask))*GBperSI2kday)) - aveSpace,sizeCloud,sizeThis = self.getAvailableSpace(weightParams[tmpCloudName]['space'], - fullRWs[tmpCloudName], - expRWs[self.taskID]) - # no task is assigned if available space is less than 1TB - if aveSpace < thr_space_low: - message = '%s %s skip : space:%s (total:%s - assigned:%s - this:%s) < %sGB' % \ - (self.taskID,tmpCloudName,aveSpace,weightParams[tmpCloudName]['space'], - sizeCloud,sizeThis,thr_space_low) - _logger.info(message) - self.sendMesg(message,msgType='warning') - del weightParams[tmpCloudName] - continue - else: - _logger.info('%s %s pass : space:%s (total:%s - assigned:%s - this:%s)' % \ - (self.taskID,tmpCloudName,aveSpace,weightParams[tmpCloudName]['space'], - sizeCloud,sizeThis)) - # not assign tasks when RW is too high - if RWs.has_key(tmpCloudName) and RWs[tmpCloudName] > thr_RW_high*weightParams[tmpCloudName]['mcshare']: - message = '%s %s skip : too high RW==%s > %s' % \ - (self.taskID,tmpCloudName,RWs[tmpCloudName],thr_RW_high*weightParams[tmpCloudName]['mcshare']) - _logger.info(message) - self.sendMesg(message,msgType='warning') - del weightParams[tmpCloudName] - continue - # T1 - t1List = [tmpT1Site.sitename] - # hack for split T1 - if tmpCloudName == 'NL': - t1List.append('NIKHEF-ELPROD') - # get files - weightParams[tmpCloudName]['nFiles'] = 0 - # loop - tmpMaxNumFile 
= 0 - for tmpSiteNameScan in t1List: - tmpScanRet,tmpN = DataServiceUtils.getNumAvailableFilesSite(tmpSiteNameScan, - self.siteMapper, - locations,badMetaMap, - tmpCloud['tier1SE'], - noCheck=datasetTypeToSkipCheck, - fileCounts=fileCounts) - # failed - if not tmpScanRet: - raise RuntimeError, 'failed to get nFiles at %s due to %s' % (tmpSiteNameScan,tmpN) - # max - if tmpMaxNumFile < tmpN: - tmpMaxNumFile = tmpN - # set - weightParams[tmpCloudName]['nFiles'] = tmpMaxNumFile - _logger.info('%s # of files at T1 %s' % (self.taskID,weightParams[tmpCloudName]['nFiles'])) - # found candidate - foundCandidateT1 = False - if weightParams[tmpCloudName]['nFiles'] >= maxNFiles: - foundCandidateT1 = True - # avoid incomplete at T1 - for tmpDS,tmpT2CloudList in removedDQ2Map.iteritems(): - if tmpCloudName in tmpT2CloudList: - foundCandidateT1 = False - # reset nFiles at T1 - weightParams[tmpCloudName]['nFiles'] = 0 - break - if foundCandidateT1: - foundCandidateWithT1.append(tmpCloudName) - # check T2 if files are missing - if (not foundCandidateT1 or weightParams[tmpCloudName]['nFiles'] < maxNFiles) and \ - t2ListForMissing.has_key(tmpCloudName) and t2ListForMissing[tmpCloudName] != []: - _logger.info('%s T2 candidates %s' % (self.taskID,str(t2ListForMissing[tmpCloudName]))) - # loop - tmpMaxNumFile = 0 - for tmpSiteNameScan in t2ListForMissing[tmpCloudName]: - tmpScanRet,tmpN = DataServiceUtils.getNumAvailableFilesSite(tmpSiteNameScan, - self.siteMapper, - locations,badMetaMap, - noCheck=datasetTypeToSkipCheck, - fileCounts=fileCounts) - # failed - if not tmpScanRet: - raise RuntimeError, 'failed to get nFiles at %s due to %s' % (tmpSiteNameScan,tmpN) - # use larger value - _logger.info('%s # of files at T2:%s %s' % (self.taskID,tmpSiteNameScan,tmpN)) - if tmpN > weightParams[tmpCloudName]['nFiles']: - weightParams[tmpCloudName]['nFiles'] = tmpN - # found candidate - if weightParams[tmpCloudName]['nFiles'] >= maxNFiles: - candidatesUsingT2.append(tmpCloudName) - break - # 
compare parameters - definedCloud = "US" - maxClouds = [] - useMcShare = False - # use clouds where T1 have the data - maxClouds += foundCandidateWithT1 - # use clouds where T2 have the data - maxClouds += candidatesUsingT2 - # logging - _logger.info('%s check nFiles' % self.taskID) - for cloudName,params in weightParams.iteritems(): - if not cloudName in maxClouds: - if maxNFiles == 0: - message = '%s %s skip : missing files at DATA/GROUPDISK' % \ - (self.taskID,cloudName) - elif params['nFiles'] != maxNFiles: - message = '%s %s skip : nFiles==%s<%s' % \ - (self.taskID,cloudName,params['nFiles'],maxNFiles) - else: - message = '%s %s skip : no complete replica at DATA/GROUPDISK' % \ - (self.taskID,cloudName) - _logger.info(message) - self.sendMesg(message) - time.sleep(2) - # check RW - _logger.info('%s check RW' % self.taskID) - tmpInfClouds = [] - for cloudName in maxClouds: - # set weight to infinite when RW is too low - if not taskType in taskTypesMcShare: - if RWs[cloudName] < thr_RW_low*weightParams[cloudName]['mcshare']: - message = '%s %s infinite weight : RW==%s < %s' % \ - (self.taskID,cloudName,RWs[cloudName],thr_RW_low*weightParams[cloudName]['mcshare']) - _logger.info(message) - self.sendMesg(message) - tmpInfClouds.append(cloudName) - # use new list - if tmpInfClouds != []: - _logger.info('%s use infinite clouds after RW checking' % self.taskID) - maxClouds = tmpInfClouds - useMcShare = True - elif maxClouds == []: - messageEnd = '%s no candidates left' % self.taskID - self.sendMesg(messageEnd) - # make subscription to empty cloud - if taskType in taskTypesSub: - _logger.info('%s makeSubscription start' % self.taskID) - retSub = self.makeSubscription(removedDQ2Map,RWs,fullRWs,expRWs) - _logger.info('%s makeSubscription end with %s' % (self.taskID,retSub)) - if retSub: - message = '%s made subscription' % self.taskID - self.sendMesg(message,msgType='info') - else: - message = "%s didn't make subscription" % self.taskID - 
self.sendMesg(message,msgType='warning') - # return - _logger.info(messageEnd) - _logger.info("%s end" % self.taskID) - return None - # choose one - message = '%s candidates %s' % (self.taskID,str(maxClouds)) - _logger.info(message) - self.sendMesg(message) - if len(maxClouds) == 1: - definedCloud = maxClouds[0] - elif len(maxClouds) > 1: - # choose cloud according to weight - nWeightList = [] - totalWeight = 0 - for cloudName in maxClouds: - if (taskType in taskTypesMcShare): - # use MC share for evgen - tmpWeight = float(weightParams[cloudName]['mcshare']) - message = "%s %s weight==%s" % (self.taskID,cloudName,weightParams[cloudName]['mcshare']) - else: - # use nPilot/RW*MCshare - tmpWeight = float(weightParams[cloudName]['nPilot']) / float(1+RWs[cloudName]) - message = "%s %s weight==%s/%s" % (self.taskID,cloudName, - weightParams[cloudName]['nPilot'], - 1+RWs[cloudName]) - # use different weight if DISK is available - if diskCopyCloud != None and diskCopyCloud != [] and cloudName not in diskCopyCloud: - tmpWeight *= float(reductionForTape) - message += '*%s' % reductionForTape - self.sendMesg(message) - nWeightList.append(tmpWeight) - totalWeight += tmpWeight - # check total weight - if totalWeight == 0: - raise RuntimeError, 'totalWeight=0' - # determin cloud using random number - _logger.info('%s weights %s' % (self.taskID,str(nWeightList))) - rNumber = random.random() * totalWeight - _logger.info('%s totalW %s' % (self.taskID,totalWeight)) - _logger.info('%s rNumber %s' % (self.taskID,rNumber)) - for index,tmpWeight in enumerate(nWeightList): - rNumber -= tmpWeight - _logger.info('%s rNumber %s : Cloud=%s weight=%s' % - (self.taskID,rNumber,maxClouds[index],tmpWeight)) - if rNumber <= 0: - definedCloud = maxClouds[index] - break - # make subscription when T2 candidate is chosen - if definedCloud in candidatesUsingT2: - newT2DQ2Map = {} - for tmpDS,tmpT2CloudList in removedDQ2Map.iteritems(): - if definedCloud in tmpT2CloudList: - newT2DQ2Map[tmpDS] = 
[definedCloud] - if newT2DQ2Map == {}: - _logger.error('%s no subscription map to use T2 datasets cloud=%s map=%s' % (self.taskID,definedCloud,removedDQ2Map)) - return None - _logger.info('%s makeSubscription to use T2 start' % self.taskID) - retSub = self.makeSubscription(newT2DQ2Map,RWs,fullRWs,expRWs,noEmptyCheck=True,acceptInProcess=True) - if not retSub: - _logger.error('%s makeSubscription to use T2 failed with %s' % (self.taskID,retSub)) - return None - _logger.info('%s makeSubscription to use T2 end with %s' % (self.taskID,retSub)) - # set CloudTask in DB - self.cloudTask.cloud = definedCloud - retCloudTask = self.taskBuffer.setCloudTask(self.cloudTask) - if retCloudTask == None: - _logger.error('%s cannot set CloudTask' % self.taskID) - return None - # pin input dataset - pinSiteList = [] - if definedCloud in candidatesUsingT2: - # pin T2 replicas - if t2ListForMissing.has_key(definedCloud): - pinSiteList = t2ListForMissing[definedCloud] - else: - # pin T1 replica - pinSiteList = [self.siteMapper.getCloud(definedCloud)['tier1']] - if pinSiteList != []: - self.pinDataset(locations,pinSiteList,definedCloud) - message = '%s set Cloud -> %s' % (self.taskID,retCloudTask.cloud) - _logger.info(message) - self.sendMesg(message) - # return - return retCloudTask.cloud - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s setCloud : %s %s" % (self.taskID,type,value)) - return None - - - # send message to logger - def sendMesg(self,message,msgType=None): - try: - # get logger - tmpPandaLogger = PandaLogger() - # lock HTTP handler - tmpPandaLogger.lock() - tmpPandaLogger.setParams({'Type':'taskbrokerage'}) - # use bamboo for loggername - if panda_config.loggername == 'prod': - tmpLogger = tmpPandaLogger.getHttpLogger('bamboo') - else: - # for dev - tmpLogger = tmpPandaLogger.getHttpLogger(panda_config.loggername) - # add message - if msgType=='error': - tmpLogger.error(message) - elif msgType=='warning': - tmpLogger.warning(message) - elif 
msgType=='info': - tmpLogger.info(message) - else: - tmpLogger.debug(message) - # release HTTP handler - tmpPandaLogger.release() - except: - pass - time.sleep(1) - - - # check disk count - def checkDiskCount(self,diskCount,cloud): - scanSiteList = self.siteMapper.getCloud(cloud)['sites'] - # loop over all sites - for tmpSiteName in scanSiteList: - if 'test' in tmpSiteName.lower(): - continue - # get sitespec - tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) - # use online only - if not tmpSiteSpec.status in ['online']: - continue - # no size limit - if tmpSiteSpec.maxinputsize in [0,None,'']: - return True - # enough space for input - if int(tmpSiteSpec.maxinputsize) >= int(diskCount): - return True - # no sites have enough space - return False - - - # get available space - def getAvailableSpace(self,space,fullRW,expRW): - # calculate available space = totalT1space - ((RW(cloud)+RW(thistask))*GBperSI2kday)) - sizeCloud = fullRW * 0.2 - sizeThis = expRW * 0.2 - aveSpace = space - (sizeCloud + sizeThis) - return aveSpace,sizeCloud,sizeThis - - - # make subscription - def makeSubscription(self,dsCloudMap,RWs,fullRWs,expRWs,noEmptyCheck=False,acceptInProcess=False): - nDDMtry = 3 - cloudList = [] - # collect clouds which don't hold datasets - message = '%s possible clouds : %s' % (self.taskID,str(self.cloudForSubs)) - _logger.info(message) - for tmpDS,tmpClouds in dsCloudMap.iteritems(): - for tmpCloud in tmpClouds: - if (not tmpCloud in cloudList) and tmpCloud in self.cloudForSubs: - cloudList.append(tmpCloud) - message = '%s candidates for subscription : %s' % (self.taskID,str(cloudList)) - _logger.info(message) - self.sendMesg(message) - if cloudList == []: - _logger.info('%s no candidates for subscription' % self.taskID) - return False - # get DN - com = 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; ' - com+= 'source %s; grid-proxy-info -subject' % panda_config.glite_source - status,DN = commands.getstatusoutput(com) - 
_logger.info('%s %s' % (self.taskID,DN)) - # ignore AC issuer - if re.search('WARNING: Unable to verify signature!',DN) != None: - status = 0 - if status != 0: - _logger.error('%s could not get DN %s:%s' % (self.taskID,status,DN)) - return False - # check if there is in-process subscription - if not acceptInProcess: - # remove /CN=proxy and /CN=limited from DN - DN = DN.split('\n')[-1] - DN = re.sub('(/CN=proxy)+$','',DN) - DN = re.sub('/CN=limited proxy','',DN) - status,out = dq2Common.parse_dn(DN) - if status != 0: - _logger.error('%s could not truncate DN %s:%s' % (self.taskID,status,DN)) - return False - DN = out - # loop over all datasets - runningSub = {} - for tmpDS,tmpClouds in dsCloudMap.iteritems(): - # get running subscriptions - runningSub[tmpDS] = [] - _logger.info('%s listSubscriptions(%s)' % (self.taskID,tmpDS)) - iTry = 0 - while True: - status,outLoc = ddm.DQ2.listSubscriptions(tmpDS) - # succeed - if status == 0: - break - # failed - iTry += 1 - if iTry < nDDMtry: - time.sleep(30) - else: - _logger.error('%s %s' % (self.taskID,outLoc)) - return False - _logger.info('%s %s %s' % (self.taskID,status,outLoc)) - time.sleep(1) - # get subscription metadata - exec "outLoc = %s" % outLoc - for tmpLocation in outLoc: - t1Flag = False - # check T1 or not - for tmpCloudName4T1 in self.siteMapper.getCloudList(): - if tmpLocation in self.siteMapper.getCloud(tmpCloudName4T1)['tier1SE']: - t1Flag = True - break - # skip non-T1 - if not t1Flag: - continue - _logger.info('%s listSubscriptionInfo(%s,%s)' % (self.taskID,tmpDS,tmpLocation)) - iTry = 0 - while True: - status,outMeta = ddm.DQ2.listSubscriptionInfo(tmpDS,tmpLocation,0) - # succeed - if status == 0: - break - # skip non-existing ID - if re.search('not a Tiers of Atlas Destination',outMeta) != None: - _logger.info('%s ignore %s' % (self.taskID,outMeta.split('\n')[-1])) - status = 0 - outMeta = "()" - break - # failed - iTry += 1 - if iTry < nDDMtry: - time.sleep(30) - else: - _logger.error('%s %s' % 
(self.taskID,outMeta)) - return False - _logger.info('%s %s %s' % (self.taskID,status,outMeta)) - time.sleep(1) - # look for DN in metadata - exec "outMeta = %s" % outMeta - if DN in outMeta: - # get corrosponding cloud - for tmpCloudName in self.siteMapper.getCloudList(): - tmpCloudSpec = self.siteMapper.getCloud(tmpCloudName) - if tmpLocation in tmpCloudSpec['tier1SE']: - # append - if not tmpCloudName in runningSub[tmpDS]: - runningSub[tmpDS].append(tmpCloudName) - break - _logger.info('%s runningSub=%s' % (self.taskID,runningSub)) - # doesn't make subscriptions when another subscriptions is in process - subThr = 1 - for tmpDS,tmpClouds in runningSub.iteritems(): - if len(tmpClouds) > 0: - message = '%s subscription:%s to %s in process' % (self.taskID,tmpDS,str(tmpClouds)) - _logger.info(message) - self.sendMesg(message) - return False - # get size of datasets - dsSizeMap = {} - for tmpDS in dsCloudMap.keys(): - _logger.debug('%s listFilesInDataset(%s)' % (self.taskID,tmpDS)) - iTry = 0 - while True: - status,outList = ddm.DQ2.listFilesInDataset(tmpDS) - # succeed - if status == 0: - break - # failed - iTry += 1 - if iTry < nDDMtry: - time.sleep(30) - else: - _logger.error('%s %s %s' % (self.taskID,status,outList)) - return False - # get total size - dsSizeMap[tmpDS] = 0 - exec "outList = %s" % outList - for guid,vals in outList[0].iteritems(): - try: - dsSizeMap[tmpDS] += long(vals['filesize']) - except: - pass - # GB - _logger.info('%s %s %sB' % (self.taskID,tmpDS,dsSizeMap[tmpDS])) - dsSizeMap[tmpDS] /= (1024*1024*1024) - _logger.info('%s dsSize=%s' % (self.taskID,dsSizeMap)) - # check space and RW - minRW = None - minCloud = None - for tmpCloudName in cloudList: - # get cloud spec - tmpCloudSpec = self.siteMapper.getCloud(tmpCloudName) - # get T1 site - tmpT1Site = self.siteMapper.getSite(tmpCloudSpec['source']) - # calculate available space - if not fullRWs.has_key(tmpCloudName): - fullRWs[tmpCloudName] = 0 - aveSpace,sizeCloud,sizeThis = 
self.getAvailableSpace(tmpT1Site.space, - fullRWs[tmpCloudName], - expRWs[self.taskID]) - # reduce requred space - for tmpDS,tmpClouds in dsCloudMap.iteritems(): - if tmpCloudName in tmpClouds: - aveSpace -= dsSizeMap[tmpDS] - # check space - if aveSpace < thr_space_low: - message = '%s %s skip : space==%s total==%s' % (self.taskID,tmpCloudName,aveSpace, - tmpT1Site.space) - _logger.info(message) - self.sendMesg(message,msgType='warning') - continue - _logger.info('%s %s pass : space==%s total==%s' % (self.taskID,tmpCloudName,aveSpace, - tmpT1Site.space)) - # get cloud spec - tmpCloudSpec = self.siteMapper.getCloud(tmpCloudName) - # check MC share - if tmpCloudSpec['mcshare'] == 0: - message = '%s %s skip : mcshare==%s' % (self.taskID,tmpCloudName,tmpCloudSpec['mcshare']) - _logger.info(message) - continue - # get minimum RW - if not RWs.has_key(tmpCloudName): - RWs[tmpCloudName] = 0 - tmpRwThr = tmpCloudSpec['mcshare']*thr_RW_sub - _logger.info('%s %s RW==%s Thr==%s' % (self.taskID,tmpCloudName,RWs[tmpCloudName], - tmpRwThr)) - tmpRwRatio = float(RWs[tmpCloudName])/float(tmpRwThr) - if minRW == None or minRW > tmpRwRatio: - minRW = tmpRwRatio - minCloud = tmpCloudName - # check RW - if minCloud == None: - message = '%s no candidates left for subscription' % self.taskID - _logger.info(message) - self.sendMesg(message) - return False - # get cloud spec - tmpCloudSpec = self.siteMapper.getCloud(minCloud) - # check threshold - if minRW > 1.0 and not noEmptyCheck: - message = '%s no empty cloud : %s minRW==%s>%s' % \ - (self.taskID,minCloud,RWs[minCloud],thr_RW_sub*tmpCloudSpec['mcshare']) - _logger.info(message) - self.sendMesg(message) - return False - message = '%s %s for subscription : minRW==%s' % (self.taskID,minCloud,minRW) - _logger.info(message) - self.sendMesg(message) - # get cloud spec for subscription - tmpCloudSpec = self.siteMapper.getCloud(minCloud) - # get T1 site - tmpT1Site = self.siteMapper.getSite(tmpCloudSpec['source']) - # dest DQ2 ID - dq2ID = 
tmpT1Site.ddm - # make subscription - for tmpDsName,tmpClouds in dsCloudMap.iteritems(): - # skip if the dataset already exists in the cloud - if not minCloud in tmpClouds: - _logger.info('%s %s already exists in %s' % (self.taskID,tmpDS,minCloud)) - continue - # get constituents - if tmpDsName.endswith('/'): - tmpStat,repMap = self.getListDatasetReplicasInContainer(tmpDsName) - if not tmpStat: - _logger.info('%s failed to get datasets in %s ' % (self.taskID,tmpDsName)) - continue - else: - repMap = {tmpDsName:{dq2ID:[]}} - # loop over all constituents - for tmpDS in repMap.keys(): - # register subscription - optSrcPolicy = 001000 | 010000 - _logger.debug("%s %s %s" % ('registerDatasetSubscription',(tmpDS,dq2ID), - {'version':0,'archived':0,'callbacks':{},'sources':{}, - 'sources_policy':optSrcPolicy,'wait_for_sources':0, - 'destination':None,'query_more_sources':0,'sshare':"secondary", - 'group':None,'activity':"Production",'acl_alias':'secondary'})) - iTry = 0 - while True: - # execute - status,out = ddm.DQ2.main('registerDatasetSubscription',tmpDS,dq2ID,version=0,archived=0,callbacks={}, - sources={},sources_policy=optSrcPolicy,wait_for_sources=0,destination=None, - query_more_sources=0,sshare="secondary",group=None,activity="Production", - acl_alias='secondary') - # succeed - if status == 0 or 'DQSubscriptionExistsException' in out: - break - # failed - iTry += 1 - if iTry < nDDMtry: - time.sleep(30) - else: - _logger.error('%s %s %s' % (self.taskID,status,out)) - return False - if 'DQSubscriptionExistsException' in out: - _logger.info('%s %s %s' % (self.taskID,status,'DQSubscriptionExistsException')) - else: - _logger.info('%s %s %s' % (self.taskID,status,out)) - message = '%s registered subscription %s %s:%s' % (self.taskID,tmpDS,minCloud,dq2ID) - _logger.info(message) - self.sendMesg(message) - time.sleep(1) - # completed - return True - - - # pin dataset - def pinDataset(self,locationMap,siteList,cloudName): - _logger.info('%s start pin input datasets' % 
self.taskID) - pinLifeTime = 7 - # loop over all datasets - for tmpDsName,tmpDQ2Map in locationMap.iteritems(): - # skip DBR - if DataServiceUtils.isDBR(tmpDsName): - continue - # get DQ2 IDs in the cloud where dataset is available - tmpDq2Map = DataServiceUtils.getSitesWithDataset(tmpDsName,self.siteMapper,locationMap, - cloudName,useHomeCloud=True, - getDQ2ID=True, - useOnlineSite=True, - includeT1=True) - # loop over all sites - for tmpSiteName in siteList: - # pin dataset when the site has replicas - if tmpDq2Map.has_key(tmpSiteName): - # loop over all DQ2 IDs - for tmpRepSite in tmpDq2Map[tmpSiteName]: - # get constituents - if tmpDsName.endswith('/'): - tmpStat,repMap = self.getListDatasetReplicasInContainer(tmpDsName) - if not tmpStat: - _logger.info('%s failed to get datasets in %s ' % (self.taskID,tmpDsName)) - continue - else: - repMap = {tmpDsName:{tmpRepSite:[]}} - # loop over all datasets - for datasetName,locVal in repMap.iteritems(): - # check missing - if not repMap[datasetName].has_key(tmpRepSite): - _logger.info('%s skip pinning for %s at %s due to missing replica' % \ - (self.taskID,datasetName,tmpRepSite)) - continue - # get metadata - status,tmpMetadata = self.getReplicaMetadata(datasetName,tmpRepSite) - if not status: - continue - # check pin lifetime - if tmpMetadata.has_key('pin_expirationdate'): - if isinstance(tmpMetadata['pin_expirationdate'],types.StringType) and tmpMetadata['pin_expirationdate'] != 'None': - # keep original pin lifetime if it is longer - origPinLifetime = datetime.datetime.strptime(tmpMetadata['pin_expirationdate'],'%Y-%m-%d %H:%M:%S') - if origPinLifetime > datetime.datetime.utcnow()+datetime.timedelta(days=pinLifeTime): - _logger.info('%s skip pinning for %s:%s due to longer lifetime %s' % (self.taskID, - datasetName,tmpRepSite, - tmpMetadata['pin_expirationdate'])) - continue - # set pin lifetime - status = self.setReplicaMetadata(datasetName,tmpRepSite,'pin_lifetime','%s days' % pinLifeTime) - # return - 
_logger.info('%s end pin input datasets' % self.taskID) - return - - - # get replica metadata - def getReplicaMetadata(self,datasetName,locationName): - # use cached data - if self.metadataMap.has_key(datasetName) and self.metadataMap[datasetName].has_key(locationName): - return True,self.metadataMap[datasetName][locationName] - # response for failure - resForFailure = False,{} - # get metadata - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug('%s %s/%s listMetaDataReplica %s %s' % (self.taskID,iDDMTry,nTry,datasetName,locationName)) - status,out = ddm.DQ2.main('listMetaDataReplica',locationName,datasetName) - if status != 0 or (not DataServiceUtils.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error("%s %s" % (self.taskID,out)) - return resForFailure - metadata = {} - try: - # convert to map - exec "metadata = %s" % out - except: - _logger.error('%s could not convert HTTP-res to replica metadata for %s:%s' % \ - (self.taskID,datasetName,locationName)) - return resForFailure - # append - if not self.metadataMap.has_key(datasetName): - self.metadataMap[datasetName] = {} - self.metadataMap[datasetName][locationName] = metadata - # return - _logger.debug('%s getReplicaMetadata -> %s' % (self.taskID,str(metadata))) - return True,metadata - - - # check metadata - def checkMetadata(self,datasetName,tmpSE): - try: - # skip checking for DBR - if DataServiceUtils.isDBR(datasetName): - return True - # get constituents - if datasetName.endswith('/'): - tmpStat,repMap = self.getListDatasetReplicasInContainer(datasetName) - if not tmpStat: - raise RuntimeError, 'failed to get datasets in %s when checkMetadata' % datasetName - else: - repMap = {datasetName:{tmpSE:[]}} - # loop over all datasets - for dataset,locVal in repMap.iteritems(): - # check missing - if not locVal.has_key(tmpSE): - _logger.info('%s skip %s at %s due to missing replica when checkMetadata' % (self.taskID,dataset,tmpSE)) - # NG - return False - 
# get metadata - status,metaItem = self.getReplicaMetadata(dataset,tmpSE) - if not status: - raise RuntimeError, 'failed to get metadata at %s for %s when checkMetadata' % (tmpSE,dataset) - # check - if metaItem.has_key('archived') and isinstance(metaItem['archived'],types.StringType) \ - and metaItem['archived'].lower() in ['tobedeleted',]: - _logger.info('%s skip %s due to ToBeDeleted when checkMetadata' % (self.taskID,tmpSE)) - # NG - return False - except: - errtype,errvalue = sys.exc_info()[:2] - _logger.error("%s checkMetadata : %s %s" % (self.taskID,errtype,errvalue)) - # FIXME - #return False - # OK - return True - - - # set replica metadata - def setReplicaMetadata(self,datasetName,locationName,attrname,attrvalue): - # response for failure - resForFailure = False - # get metadata - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug('%s %s/%s setReplicaMetaDataAttribute %s %s %s=%s' % (self.taskID,iDDMTry,nTry,datasetName, - locationName,attrname,attrvalue)) - status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',datasetName,locationName,attrname,attrvalue) - if status != 0 or (not DataServiceUtils.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error("%s %s" % (self.taskID,out)) - return resForFailure - # return - _logger.info('%s setReplicaMetadata done for %s:%s' % (self.taskID,datasetName,locationName)) - return True - - - # get list of replicas in container - def getListDatasetReplicasInContainer(self,container): - # use cache - if self.contDsMap.has_key(container): - return True,self.contDsMap[container] - # get datasets in container - _logger.debug((self.taskID,'listDatasetsInContainer',container)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('listDatasetsInContainer',container) - if status != 0 or (not DataServiceUtils.isDQ2ok(out)): - time.sleep(60) - else: - break - _logger.debug('%s %s' % (self.taskID,out)) - if status != 0 or out.startswith('Error'): - return False,out - 
datasets = [] - try: - # convert to list - exec "datasets = %s" % out - except: - return False,out - # loop over all datasets - allRepMap = {} - for dataset in datasets: - _logger.debug((self.taskID,'listDatasetReplicas',dataset)) - for iDDMTry in range(3): - status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) - if status != 0 or (not DataServiceUtils.isDQ2ok(out)): - time.sleep(60) - else: - break - _logger.debug('%s %s' % (self.taskID,out)) - if status != 0 or out.startswith('Error'): - return False,out - tmpRepSites = {} - try: - # convert res to map - exec "tmpRepSites = %s" % out - except: - return False,out - # get map - allRepMap[dataset] = tmpRepSites - # return - _logger.debug('%s %s' % (self.taskID,str(allRepMap))) - self.contDsMap[container] = allRepMap - return True,allRepMap - - - diff --git a/current/pandaserver/dataservice/Waker.py b/current/pandaserver/dataservice/Waker.py deleted file mode 100755 index 93234bcd7..000000000 --- a/current/pandaserver/dataservice/Waker.py +++ /dev/null @@ -1,55 +0,0 @@ -''' -awake jobs in waiting table - -''' - -import time -import threading -from DDM import ddm - -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('Waker') - - -class Waker (threading.Thread): - # constructor - def __init__(self,taskBuffer,dataset): - threading.Thread.__init__(self) - self.dataset = dataset - self.taskBuffer = taskBuffer - - - # main - def run(self): - _logger.debug("start: %s" % self.dataset.name) - # get file list from DDM - for iDDMTry in range(3): - status,out = ddm.DQ2.main('listFilesInDataset',self.dataset.name) - if status != 0 and out.find("DQ2 unknown dataset exception") != -1: - break - elif status != 0 or out.find("DQ2 internal server exception") != -1: - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error(out) - _logger.debug("failed: %s" % self.dataset.name) - return - # parse - lfns = [] - try: - exec "resDQ=%s" % out - 
for guid,vals in resDQ[0].iteritems(): - lfns.append(vals['lfn']) - except: - _logger.error("could not parse %s" % out) - # get PandaIDs of jobs which use files with LFNs - if len(lfns) != 0: - ids = self.taskBuffer.queryPandaIDwithLFN(lfns) - _logger.debug("IDs: %s" % ids) - if len(ids) != 0: - # awake jobs - self.taskBuffer.awakeJobs(ids) - _logger.debug("finished: %s" % self.dataset.name) diff --git a/current/pandaserver/dataservice/__init__.py b/current/pandaserver/dataservice/__init__.py deleted file mode 100755 index e69de29bb..000000000 diff --git a/current/pandaserver/dataservice/countGuidsClient.py b/current/pandaserver/dataservice/countGuidsClient.py deleted file mode 100644 index a65489ce2..000000000 --- a/current/pandaserver/dataservice/countGuidsClient.py +++ /dev/null @@ -1,72 +0,0 @@ -import urllib, re, string, os, time -from eventLookupClient import eventLookupClient - -# client for countGuids Athenaeum service -# author: Marcin.Nowak@cern.ch - - -class countGuidsClient(eventLookupClient): - - #serverURL = "http://j2eeps.cern.ch/test-Athenaeum/" - serverURL = "http://j2eeps.cern.ch/atlas-project-Athenaeum/" - #serverURL = "http://j2eeps.cern.ch/test-eventPicking/" - servicePage = "CountGuids.jsp" - getPage = "EventLookupGet.jsp" - - def __init__(self): - eventLookupClient.__init__(self) - - def countGuids(self, datasetName, query='', tokens=''): - """ contact the server and return GUIDs count - tokens - token names - """ - query_args = { 'key': self.key, - 'worker': self.workerURL(), - 'cert_proxy': self.certProxy, - 'query': query, - 'dataset': datasetName, - 'tokens': tokens - } - self.talkToServer(self.serverURL + self.servicePage, query_args) - - self.remoteFile = None - for line in self.output: - m = re.search("FILE=(.+)$", line) - if m: - return self.waitForFile( m.group(1) ) - - return self.scanOutputForGuids() - - - def scanOutputForGuids(self): - """ Scan the server output looking for GUIDs - return None in case of errors - """ - 
self.countedGuids = [] - self.tokens = [] - stage = None - tokpat = re.compile(r'([0-9A-F]{8}-([0-9A-F]{4}-){3}[0-9A-F]{12})') - for line in self.output: - if re.search(self.errorPattern, line, re.I): - #print " -- Error line matched: " + line - return None - if stage == "readGuids": - try: - (count, guidline) = line.split(None,1) - guids = guidline.split() - if tokpat.match(guids[0]): - self.countedGuids.append( (count,guids,) ) - continue - except ValueError: - pass - # end of input, finish - break - if re.search("Event count per distinct GUIDs group:", line): - stage = "readAttribs" - continue - if stage == "readAttribs": - self.tokens = line.split()[1:] - stage = "readGuids" - continue - - return (self.tokens, self.countedGuids) diff --git a/current/pandaserver/dataservice/datriHandler.py b/current/pandaserver/dataservice/datriHandler.py deleted file mode 100644 index de6f8d407..000000000 --- a/current/pandaserver/dataservice/datriHandler.py +++ /dev/null @@ -1,207 +0,0 @@ -""" -DaTRI Handler for external applications (curl, python ver. >= 2.4) -CERN, ATLAS Distributed Computing (March 2010) - -@author: Mikhail Titov -@contact: mikhail.titov@cern.ch -@data: June 21, 2013 -@version: 0.97 -""" - -import os -import subprocess -from urllib import urlencode - -HTTPS_PORT = 25943 -PANDAMON_HOST = 'panda.cern.ch' -PANDAMON_URI = '/server/pandamon/query' - -# -s: Silent or quiet mode. Don't show progress meter or error messages. -# -S: When used with -s it makes curl show an error message if it fails. 
-CURL_SILENT_OPTION = '-s' - -PARAMS_LIST = ['mode', 'action', 'dpat', 'site', 'userid'] -PARAMS_LIST_ADDON = ['emails', 'comments'] -MODE = { - 'pathena': 'ddm_pathenareq', - 'ganga': 'ddm_gangareq', - 'group': 'ddm_groupreq'} - -RETRY_NUM = 2 - - -def execute(params): - """Returns tuple (out, err) - - @param params (@type list) - shell command (1st parameter) and its options - """ - try: - p = subprocess.Popen(params, stdout=subprocess.PIPE) - except (OSError, ValueError), e: - return '', 'SubprocessException: %s' % e - else: - return p.communicate() - - -class datriHandler(object): - - """Class datriHandler.""" - - def __init__(self, **kwargs): - """Initialization - - @param kwargs (@type dict) - has "type" with one of the next values: pathena/ganga/group - """ - self.curl = datriCurl() - self.info = {'mode': MODE.get(kwargs.get('type', 'pathena'), '')} - self.err_message = '' - if not self.info['mode']: - self.err_message = 'datriHandler: mode is incorrect' - - def __del__(self): - self.curl = None - self.info.clear() - self.err_message = '' - - def hasParams(self): - """Check that parameters are defined and are not null - - @return (@type bool) - True/False - """ - for p in PARAMS_LIST: - if not self.info.get(p, None): - return False - return True - - def setParameters(self, data_pattern, site, userid, **kwargs): - """Define request parameters - - @param data_pattern (@type str) - dataset | container | pattern - @param site (@type str) - destination site (see AGIS/TiersOfAtlas) - @param userid (@type str) - unique user identification (certificate dn | email) - """ - if data_pattern and site and userid: - self.info.update({'dpat': data_pattern, - 'site': site, - 'userid': userid}) - for p in PARAMS_LIST_ADDON: - if p in kwargs: - self.info[p] = kwargs[p] - else: - self.err_message = 'datriHandler: required data are not defined' - - def checkData(self): - """Check request data (send "Check"-request) - - @return (@type typle: int, str) - returns status code and 
info (error) message - """ - if not self.err_message: - self.info['action'] = 'Check' - if self.hasParams(): - return self.curl.get(**self.info) - else: - self.err_message = 'datriHandler: required data are not defined' - return 4, self.err_message - - def sendRequest(self): - """Send request to DaTRI (send "Submit"-request) - - @return (@type typle: int, str) - returns status code and info (error) message - """ - if not self.err_message: - self.info['action'] = 'Submit' - if self.hasParams(): - return self.curl.get(**self.info) - else: - self.err_message = 'datriHandler: required data are not defined' - return 4, self.err_message - -# - Class for https-request definition - - -class datriCurl(object): - - """Class datriCurl for curl-command creation.""" - - def __init__(self): - self.err_message = '' - self.cmd_params = ['curl', - '--user-agent', 'datricurl', - '--max-redirs', '5', - '--max-time', '90', - CURL_SILENT_OPTION, - '-G'] - self._user_proxy() - self._ca_path() - # - url definition - - self.url = 'https://%s:%s%s' % (PANDAMON_HOST, HTTPS_PORT, PANDAMON_URI) - - def _user_proxy(self): - cert = os.environ.get('X509_USER_PROXY') - if not cert: - cert = '/tmp/x509up_u%s' % os.getuid() - if not os.access(cert, os.R_OK): - cert = None - if cert: - self.cmd_params.extend(['--cert', cert, '--cacert', cert]) - else: - self.err_message += 'User proxy certificate is not defined; ' - - def _ca_path(self): - if os.environ.get('X509_CERT_DIR'): - self.cmd_params.extend(['--capath', os.environ['X509_CERT_DIR']]) - else: - self.err_message += 'CA-path is not defined; ' - - # - method GET - - def get(self, **kwargs): - """Returns status code and response message - - @param kwargs (@type dict) - parameters for DaTRI request definition (see PARAMS_LIST) - @return (@type typle: int, str) - returns status code and info (error) message - """ - if not self.err_message: - if not kwargs: - return 2, 'datriCurl: input parameters are not defined' - o, e = '', ' is not defined' - # 
- several attempts for cmd execution - begin - - cmd_params = (self.cmd_params + - ['--url', '%s?%s' % (self.url, urlencode(kwargs))]) - for i in range(RETRY_NUM): - o, e = execute(cmd_params) - if o and not e: - return (0, o) if o.startswith('OK.') else (1, o) - # - several attempts for cmd execution - end - - return 3, 'datriCurl: execution error (output=%s, error=%s)' % (o, e) - return 5, 'datriCurl: %s' % self.err_message - - -####################################################################################### -# datriHandler - Status code definition: # -# # -# 0 - DaTRI request - CREATED SUCCESSFULLY # -# # -# 1 - DaTRI request - NOT CREATED [due to incorrect input data] # -# datriHandler - EXECUTED SUCCESSFULLY # -# # -# 2 - DaTRI request - NOT CREATED # -# datriHandler - FAILED [due to lack of input data at datriCurl.get] # -# # -# 3 - DaTRI request - NOT CREATED # -# datriHandler - FAILED [due to failure at datriCurl.get] # -# # -# 4 - DaTRI request - NOT CREATED # -# datriHandler - FAILED [due to lack of input data at datriHandler.setParameters] # -# # -# 5 - DaTRI request - NOT CREATED # -# datriHandler - FAILED [due to failure at datriCurl] # -####################################################################################### diff --git a/current/pandaserver/dataservice/eventLookupClient.py b/current/pandaserver/dataservice/eventLookupClient.py deleted file mode 100644 index b7ae3391a..000000000 --- a/current/pandaserver/dataservice/eventLookupClient.py +++ /dev/null @@ -1,201 +0,0 @@ -import urllib, re, string, os, time - -# client for eventLookup Athenaeum service -# author: Marcin.Nowak@cern.ch - -class eventLookupClient: - - serverURL = "http://j2eeps.cern.ch/atlas-project-Athenaeum/" - #serverURL = "http://j2eeps.cern.ch/test-Athenaeum/" - #serverURL = "http://j2eeps.cern.ch/test-eventPicking/" - lookupPage = "EventLookup.jsp" - getPage = "EventLookupGet.jsp" - key = "insider" - workerHost = "atlas-tagservices.cern.ch" - #workerHost = 
"atlddm10.cern.ch" #this is at the moment the real host aliased by atlas-tagservices - #workerHost = "voatlas69.cern.ch" - workerPort = '10004' - connectionRefusedSleep = 20 - errorPattern = "(Exception)|(Error)|(Lookup cannot be run)|(invalid)|(NOT EXISTING)" - - - def __init__(self): - self.output = "" - self.guids = {} - self.guidsLine = "" - self.certProxyFileName = None - self.certProxy = "" - self.debug = None - self.remoteFile = None - try: - self.certProxyFileName = os.environ['X509_USER_PROXY'] - except KeyError: - print 'You do not seem to have a certificate proxy! (do voms-proxy-init)' - return - proxy = open(self.certProxyFileName) - try: - for line in proxy: - self.certProxy += line - finally: - proxy.close() - - - def workerURL(self): - if self.workerHost.find(":") > 0: - # port number together with the host name, possibly from commandline option - return "http://" + self.workerHost - else: - return "http://" + self.workerHost + ":" + self.workerPort - - - def doLookup(self, inputEvents, async=None, stream="", tokens="", - amitag="", extract=False): - """ contact the server and return a list of GUIDs - inputEvents - list of run-event pairs - async - request query procesing in a separate process, client will poll for results - stream - stream - tokens - token names - amitag - used to select reprocessing pass (default empty means the latest) - """ - if inputEvents == []: - return [] - - runs_events = "" - runs = set() - sep = "" - for run_ev in inputEvents: - runs_events += sep + run_ev[0] + " " + run_ev[1] - sep = "\n" - runs.add(run_ev[0]); - - if async is None: - if len(runs) > 50 or len(inputEvents) > 1000: - async = True - if async: - asyncStr = "true" - else: - asyncStr = "false" - - query_args = { 'key': self.key, - 'worker': self.workerURL(), - 'runs_events': runs_events, - 'cert_proxy': self.certProxy, - 'async': asyncStr, - 'stream': stream, - 'amitag': amitag, - 'tokens': tokens - } - if extract: - query_args['extract'] = "true" - - 
self.talkToServer(self.serverURL + self.lookupPage, query_args) - if not async: - for line in self.output: - if re.search("502 Bad Gateway", line): - # usually signifies a timeout on the J2EE server - print "Timeout detected. Retrying in asynchronous mode" - query_args['async'] = "true" - self.talkToServer(self.serverURL + self.lookupPage, query_args) - break - - self.remoteFile = None - for line in self.output: - m = re.search("FILE=(.+)$", line) - if m: - return self.waitForFile( m.group(1) ) - - return self.scanOutputForGuids() - - - def talkToServer(self, url, args): - encoded_args = urllib.urlencode(args) - if self.debug: - print "Contacting URL: " + url - print encoded_args - - for _try in range(1,6): - response = urllib.urlopen(url, encoded_args) - self.output = [] - retry = False - for line in response: - self.output.append(line) - if re.search("Connection refused", line): - retry = True - if retry: - if self.debug: - print "Failed to connect to the server, try " + str(_try) - time.sleep(self.connectionRefusedSleep) - else: - break - - - def scanOutputForGuids(self): - """ Scan the server output looking for a line with GUIDs - return list of GUIDs if line found, put GUIDs in self.guids - return None in case of errors - """ - self.guids = {} - self.tags = [] - self.tagAttributes = None - stage = None - tokpat = re.compile(r'[[]DB=(?P.*?)[]]') - for line in self.output: - if re.search(self.errorPattern, line, re.I): - #print " -- Error line matched: " + line - return None - if stage == "readTags": - if line[0:1] == ":": - # break the line up into attributes, extract GUIDs - values = [] - for attr in string.split(line[1:]): - tok = tokpat.match(attr) - if tok: - attr = tok.group('FID') - # self.guids - TODO - populate the guids dict - values.append(attr) - self.tags.append( values ) - continue - else: - return (self.tagAttributes, self.tags) - if re.match("\{.*\}$", line): - guids = eval(line) - if type(guids).__name__!='dict': - return None - self.guids = 
guids - return guids - if re.search("TAGs extracted:", line): - stage = "readAttribs" - continue - if stage == "readAttribs": - self.tagAttributes = string.split(line.strip(),",") - stage = "readTags" - continue - return None - - - def waitForFile(self, file): - """ Wait for the server to do EventLookup and store results in file - Retrieve the file and scan for GUIDs - return them if found - """ - query_args = { 'key': self.key, - 'worker': self.workerURL(), - 'file' : file, - 'wait_time' : "45" - } - self.remoteFile = file - if self.debug: - print "EventLookup waiting for server. Remote file=" + file - - ready = False - while not ready: - self.talkToServer(self.serverURL + self.getPage, query_args) - ready = True - for line in self.output: - if re.match("NOT READY", line): - if self.debug: - print "received NOT READY" - time.sleep(1) - ready = False - - return self.scanOutputForGuids() - diff --git a/current/pandaserver/dataservice/forkSetupper.py b/current/pandaserver/dataservice/forkSetupper.py deleted file mode 100755 index 415995de7..000000000 --- a/current/pandaserver/dataservice/forkSetupper.py +++ /dev/null @@ -1,74 +0,0 @@ -import os -import sys -import commands - -# exec -def run(inFile,v_onlyTA): - import cPickle as pickle - try: - # read Jobs from file - f = open(inFile) - jobs = pickle.load(f) - f.close() - except: - type, value, traceBack = sys.exc_info() - print("run() : %s %s" % (type,value)) - return - # password - from config import panda_config - passwd = panda_config.dbpasswd - # initialize cx_Oracle using dummy connection - from taskbuffer.Initializer import initializer - initializer.init() - # instantiate TB - from taskbuffer.TaskBuffer import taskBuffer - taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - # run Setupper - from dataservice.Setupper import Setupper - thr = Setupper(taskBuffer,jobs,onlyTA=v_onlyTA,useNativeDQ2=True) - thr.start() - thr.join() - return - - -# exit action -def _onExit(fname): - 
commands.getoutput('rm -rf %s' % fname) - - -#################################################################### -# main -def main(): - import getopt - import atexit - # option class - class _options: - def __init__(self): - pass - options = _options() - del _options - # set default values - options.inFile = "" - options.onlyTA = False - # get command-line parameters - try: - opts, args = getopt.getopt(sys.argv[1:],"i:t") - except: - print("ERROR : Invalid options") - sys.exit(1) - # set options - for o, a in opts: - if o in ("-i",): - options.inFile = a - if o in ("-t",): - options.onlyTA = True - # exit action - atexit.register(_onExit,options.inFile) - # run - run(options.inFile,options.onlyTA) - # return - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/current/pandaserver/jobdispatcher/ErrorCode.py b/current/pandaserver/jobdispatcher/ErrorCode.py deleted file mode 100755 index e58b1b444..000000000 --- a/current/pandaserver/jobdispatcher/ErrorCode.py +++ /dev/null @@ -1,11 +0,0 @@ -############## errror code - -# Watcher -EC_Watcher = 100 - -# recovery failed -EC_Recovery = 101 - -# send failed -EC_SendError = 102 - diff --git a/current/pandaserver/jobdispatcher/JobDispatcher.py b/current/pandaserver/jobdispatcher/JobDispatcher.py deleted file mode 100755 index 86f126921..000000000 --- a/current/pandaserver/jobdispatcher/JobDispatcher.py +++ /dev/null @@ -1,541 +0,0 @@ -""" -dispatch jobs - -""" - -import re -import types -import threading -import Protocol -import time -import datetime -import commands -from threading import Lock -from config import panda_config -from dataservice.Adder import Adder -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('JobDispatcher') -_pilotReqLogger = PandaLogger().getLogger('PilotRequests') - - -# a wrapper to install timpout into a method -class _TimedMethod: - def __init__(self,method,timeout): - self.method = method - self.timeout = timeout - self.result = 
Protocol.TimeOutToken - - # method emulation - def __call__(self,*var): - self.result = apply(self.method,var) - - # run - def run(self,*var): - thr = threading.Thread(target=self,args=var) - # run thread - thr.start() - thr.join() #self.timeout) - - -# job dipatcher -class JobDipatcher: - # constructor - def __init__(self): - # taskbuffer - self.taskBuffer = None - # DN/token map - self.tokenDN = None - # datetime of last updated - self.lastUpdated = datetime.datetime.utcnow() - # how frequently update DN/token map - self.timeInterval = datetime.timedelta(seconds=180) - # pilot owners - self.pilotOwners = None - # hostnames for authorization at grid-free sites - self.allowedNodes = None - # lock - self.lock = Lock() - - - # set task buffer - def init(self,taskBuffer): - # lock - self.lock.acquire() - # set TB - if self.taskBuffer == None: - self.taskBuffer = taskBuffer - # update DN/token map - if self.tokenDN == None: - self.tokenDN = self.taskBuffer.getListSchedUsers() - # get pilot owners - if self.pilotOwners == None: - self.pilotOwners = self.taskBuffer.getPilotOwners() - # get allowed nodes - if self.allowedNodes == None: - self.allowedNodes = self.taskBuffer.getAllowedNodes() - # release - self.lock.release() - - - # get job - def getJob(self,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, - atlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup,allowOtherCountry): - jobs = [] - # wrapper function for timeout - tmpWrapper = _TimedMethod(self.taskBuffer.getJobs,timeout) - tmpWrapper.run(1,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, - atlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup,allowOtherCountry) - if isinstance(tmpWrapper.result,types.ListType): - jobs = jobs + tmpWrapper.result - # make response - if len(jobs) > 0: - proxyKey = jobs[-1] - nSent = jobs[-2] - jobs = jobs[:-2] - if len(jobs) != 0: - # succeed - response=Protocol.Response(Protocol.SC_Success) - # append Job - 
response.appendJob(jobs[0]) - # append nSent - response.appendNode('nSent',nSent) - # set proxy key - if getProxyKey: - response.setProxyKey(proxyKey) - else: - if tmpWrapper.result == Protocol.TimeOutToken: - # timeout - response=Protocol.Response(Protocol.SC_TimeOut) - else: - # no available jobs - response=Protocol.Response(Protocol.SC_NoJobs) - # return - _logger.debug("getJob : %s %s ret -> %s" % (siteName,node,response.encode())) - return response.encode() - - - # update job status - def updateJob(self,jobID,jobStatus,timeout,xml,siteName,param,metadata,attemptNr=None,stdout=''): - # retry failed analysis job and ddm job - if jobStatus=='failed' \ - and ((param.has_key('pilotErrorCode') and (param['pilotErrorCode'] in ['1200','1201'] \ - or param['pilotErrorCode'].startswith('-'))) \ - or (siteName != None and siteName.find('DDM') != -1)): - # retry - if param.has_key('pilotErrorCode') and param['pilotErrorCode'].startswith('-'): - # pilot retry with new PandaID - ret = self.taskBuffer.retryJob(jobID,param,getNewPandaID=True,attemptNr=attemptNr) - else: - # old style - ret = self.taskBuffer.retryJob(jobID,param,attemptNr=attemptNr) - if ret: - # return succeed - response=Protocol.Response(Protocol.SC_Success) - return response.encode() - # add metadata - if metadata != '': - self.taskBuffer.addMetadata([jobID],[metadata]) - # add stdout - if stdout != '': - self.taskBuffer.addStdOut(jobID,stdout) - # update - tmpStatus = jobStatus - updateStateChange = False - if jobStatus == 'failed' or jobStatus == 'finished': - tmpStatus = 'holding' - # update stateChangeTime to prevent Watcher from finding this job - updateStateChange = True - if tmpStatus == 'holding': - tmpWrapper = _TimedMethod(self.taskBuffer.updateJobStatus,None) - else: - tmpWrapper = _TimedMethod(self.taskBuffer.updateJobStatus,timeout) - tmpWrapper.run(jobID,tmpStatus,param,updateStateChange,attemptNr) - # make response - if tmpWrapper.result == Protocol.TimeOutToken: - # timeout - 
response=Protocol.Response(Protocol.SC_TimeOut) - else: - if tmpWrapper.result: - # succeed - response=Protocol.Response(Protocol.SC_Success) - # set command - if isinstance(tmpWrapper.result,types.StringType): - response.appendNode('command',tmpWrapper.result) - else: - response.appendNode('command','NULL') - # add output to dataset - if tmpWrapper.result != "badattemptnr" and (jobStatus == 'failed' or jobStatus == 'finished'): - Adder(self.taskBuffer,jobID,xml,jobStatus,attemptNr=attemptNr).start() - else: - # failed - response=Protocol.Response(Protocol.SC_Failed) - _logger.debug("updateJob : %s ret -> %s" % (jobID,response.encode())) - return response.encode() - - - # get job status - def getStatus(self,strIDs,timeout): - # convert str to list - ids = strIDs.split() - # peek jobs - tmpWrapper = _TimedMethod(self.taskBuffer.peekJobs,timeout) - tmpWrapper.run(ids,False,True,True,False) - # make response - if tmpWrapper.result == Protocol.TimeOutToken: - # timeout - response=Protocol.Response(Protocol.SC_TimeOut) - else: - if isinstance(tmpWrapper.result,types.ListType): - # succeed - response=Protocol.Response(Protocol.SC_Success) - # make return - retStr = '' - attStr = '' - for job in tmpWrapper.result: - if job == None: - retStr += '%s+' % 'notfound' - attStr += '0+' - else: - retStr += '%s+' % job.jobStatus - attStr += '%s+' % job.attemptNr - response.appendNode('status',retStr[:-1]) - response.appendNode('attemptNr',attStr[:-1]) - else: - # failed - response=Protocol.Response(Protocol.SC_Failed) - _logger.debug("getStatus : %s ret -> %s" % (strIDs,response.encode())) - return response.encode() - - - # get DN/token map - def getDnTokenMap(self): - # get current datetime - current = datetime.datetime.utcnow() - # lock - self.lock.acquire() - # update DN map if old - if current-self.lastUpdated > self.timeInterval: - # get new map - self.tokenDN = self.taskBuffer.getListSchedUsers() - # reset - self.lastUpdated = current - # release - self.lock.release() - # 
return - return self.tokenDN - - - # generate pilot token - def genPilotToken(self,schedulerhost,scheduleruser,schedulerid): - retVal = self.taskBuffer.genPilotToken(schedulerhost,scheduleruser,schedulerid) - # failed - if retVal == None: - return "ERROR : failed to generate token" - return "SUCCEEDED : " + retVal - - -# Singleton -jobDispatcher = JobDipatcher() -del JobDipatcher - - -# get FQANs -def _getFQAN(req): - fqans = [] - for tmpKey,tmpVal in req.subprocess_env.iteritems(): - # compact credentials - if tmpKey.startswith('GRST_CRED_'): - # VOMS attribute - if tmpVal.startswith('VOMS'): - # FQAN - fqan = tmpVal.split()[-1] - # append - fqans.append(fqan) - # old style - elif tmpKey.startswith('GRST_CONN_'): - tmpItems = tmpVal.split(':') - # FQAN - if len(tmpItems)==2 and tmpItems[0]=='fqan': - fqans.append(tmpItems[-1]) - # return - return fqans - - -# check role -def _checkRole(fqans,dn,jdCore,withVomsPatch=True,site='',hostname=''): - prodManager = False - try: - # VOMS attributes of production and pilot roles - prodAttrs = ['/atlas/usatlas/Role=production', - '/atlas/usatlas/Role=pilot', - '/atlas/Role=production', - '/atlas/Role=pilot', - '/osg/Role=pilot', - '/Engage/LBNE/Role=pilot', - ] - if withVomsPatch: - # FIXEME once http://savannah.cern.ch/bugs/?47136 is solved - prodAttrs += ['/atlas/'] - prodAttrs += ['/osg/'] - prodAttrs += ['/Engage/LBNE/'] - for fqan in fqans: - # check atlas/usatlas production role - for rolePat in prodAttrs: - if fqan.startswith(rolePat): - prodManager = True - break - # escape - if prodManager: - break - # service proxy for CERNVM - if site in ['CERNVM']: - serviceSubjects = ['/DC=ch/DC=cern/OU=computers/CN=pilot/copilot.cern.ch'] - for tmpSub in serviceSubjects: - if dn.startswith(tmpSub): - prodManager = True - break - # grid-free authorization - if not prodManager: - if hostname != '' and jdCore.allowedNodes.has_key(site): - for tmpPat in jdCore.allowedNodes[site]: - if re.search(tmpPat,hostname) != None: - 
prodManager = True - break - # check DN with pilotOwners - if (not prodManager) and (not dn in [None]): - for owner in jdCore.pilotOwners: - # check - if re.search(owner,dn) != None: - prodManager = True - break - except: - pass - # return - return prodManager - - -# get DN -def _getDN(req): - realDN = None - if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - realDN = req.subprocess_env['SSL_CLIENT_S_DN'] - # remove redundant CN - realDN = re.sub('/CN=limited proxy','',realDN) - realDN = re.sub('/CN=proxy(/CN=proxy)+','/CN=proxy',realDN) - # return - return realDN - - -# check token -def _checkToken(token,jdCore): - # not check None until all pilots use tokens - if token == None: - return True - # get map - tokenDN = jdCore.getDnTokenMap() - # return - return tokenDN.has_key(token) - - - -""" -web service interface - -""" - -# get job -def getJob(req,siteName,token=None,timeout=60,cpu=None,mem=None,diskSpace=None,prodSourceLabel=None,node=None, - computingElement=None,AtlasRelease=None,prodUserID=None,getProxyKey=None,countryGroup=None, - workingGroup=None,allowOtherCountry=None): - _logger.debug("getJob(%s)" % siteName) - # get DN - realDN = _getDN(req) - # get FQANs - fqans = _getFQAN(req) - # check production role - if getProxyKey == 'True': - # don't use /atlas to prevent normal proxy getting credname - prodManager = _checkRole(fqans,realDN,jobDispatcher,False,site=siteName) - else: - prodManager = _checkRole(fqans,realDN,jobDispatcher,site=siteName, - hostname=req.get_remote_host()) - # check token - validToken = _checkToken(token,jobDispatcher) - # set DN for non-production user - if not prodManager: - prodUserID = realDN - # allow getProxyKey for production role - if getProxyKey == 'True' and prodManager: - getProxyKey = True - else: - getProxyKey = False - # convert mem and diskSpace - try: - mem = int(float(mem)) - if mem < 0: - mem = 0 - except: - mem = 0 - try: - diskSpace = int(float(diskSpace)) - if diskSpace < 0: - diskSpace = 0 - except: - diskSpace 
= 0 - _logger.debug("getJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s)" \ - % (siteName,cpu,mem,diskSpace,prodSourceLabel,node, - computingElement,AtlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup, - allowOtherCountry,realDN,prodManager,token,validToken,str(fqans))) - _pilotReqLogger.info('method=getJob,site=%s,node=%s,type=%s' % (siteName,node,prodSourceLabel)) - # invalid role - if (not prodManager) and (not prodSourceLabel in ['user']): - _logger.warning("getJob(%s) : invalid role" % siteName) - return Protocol.Response(Protocol.SC_Role).encode() - # invalid token - if not validToken: - _logger.warning("getJob(%s) : invalid token" % siteName) - return Protocol.Response(Protocol.SC_Invalid).encode() - # invoke JD - return jobDispatcher.getJob(siteName,prodSourceLabel,cpu,mem,diskSpace,node,int(timeout), - computingElement,AtlasRelease,prodUserID,getProxyKey,countryGroup, - workingGroup,allowOtherCountry) - - -# update job status -def updateJob(req,jobId,state,token=None,transExitCode=None,pilotErrorCode=None,pilotErrorDiag=None,timestamp=None,timeout=60, - xml='',node=None,workdir=None,cpuConsumptionTime=None,cpuConsumptionUnit=None,remainingSpace=None, - schedulerID=None,pilotID=None,siteName=None,messageLevel=None,pilotLog='',metaData='', - cpuConversionFactor=None,exeErrorCode=None,exeErrorDiag=None,pilotTiming=None,computingElement=None, - startTime=None,endTime=None,nEvents=None,nInputFiles=None,batchID=None,attemptNr=None,jobMetrics=None, - stdout=''): - _logger.debug("updateJob(%s)" % jobId) - # get DN - realDN = _getDN(req) - # get FQANs - fqans = _getFQAN(req) - # check production role - prodManager = _checkRole(fqans,realDN,jobDispatcher,site=siteName,hostname=req.get_remote_host()) - # check token - validToken = _checkToken(token,jobDispatcher) - 
_logger.debug("updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,attemptNr:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)" % - (jobId,state,transExitCode,pilotErrorCode,pilotErrorDiag,node,workdir,cpuConsumptionTime, - cpuConsumptionUnit,remainingSpace,schedulerID,pilotID,siteName,messageLevel,nEvents,nInputFiles, - cpuConversionFactor,exeErrorCode,exeErrorDiag,pilotTiming,computingElement,startTime,endTime, - batchID,attemptNr,realDN,prodManager,token,validToken,str(fqans),xml,pilotLog,metaData,jobMetrics, - stdout)) - _pilotReqLogger.info('method=updateJob,site=%s,node=%s,type=None' % (siteName,node)) - # invalid role - if not prodManager: - _logger.warning("updateJob(%s) : invalid role" % jobId) - return Protocol.Response(Protocol.SC_Role).encode() - # invalid token - if not validToken: - _logger.warning("updateJob(%s) : invalid token" % jobId) - return Protocol.Response(Protocol.SC_Invalid).encode() - # aborting message - if jobId=='NULL': - return Protocol.Response(Protocol.SC_Success).encode() - # check status - if not state in ['running','failed','finished','holding','starting','transferring']: - _logger.warning("invalid state=%s for updateJob" % state) - return Protocol.Response(Protocol.SC_Success).encode() - # pilot log - if pilotLog != '': - try: - # make message - message = pilotLog - # get logger - _pandaLogger = PandaLogger() - _pandaLogger.lock() - _pandaLogger.setParams({'Type':'pilotLog','PandaID':int(jobId)}) - logger = _pandaLogger.getHttpLogger(panda_config.loggername) - # add message - logger.info(message) - # release HTTP handler - _pandaLogger.release() - except: - pass - # create parameter map - param = {} - if cpuConsumptionTime != None: - param['cpuConsumptionTime']=cpuConsumptionTime - if cpuConsumptionUnit != None: - param['cpuConsumptionUnit']=cpuConsumptionUnit - if node != None: - param['modificationHost']=node - if transExitCode != 
None: - param['transExitCode']=transExitCode - if pilotErrorCode != None: - param['pilotErrorCode']=pilotErrorCode - if pilotErrorDiag != None: - param['pilotErrorDiag']=pilotErrorDiag[:500] - if jobMetrics != None: - param['jobMetrics']=jobMetrics[:500] - if schedulerID != None: - param['schedulerID']=schedulerID - if pilotID != None: - param['pilotID']=pilotID[:200] - if batchID != None: - param['batchID']=batchID - if exeErrorCode != None: - param['exeErrorCode']=exeErrorCode - if exeErrorDiag != None: - param['exeErrorDiag']=exeErrorDiag[:500] - if cpuConversionFactor != None: - param['cpuConversion']=cpuConversionFactor - if pilotTiming != None: - param['pilotTiming']=pilotTiming - if computingElement != None: - param['computingElement']=computingElement - if nEvents != None: - param['nEvents']=nEvents - if nInputFiles != None: - param['nInputFiles']=nInputFiles - if startTime != None: - try: - param['startTime']=datetime.datetime(*time.strptime(startTime,'%Y-%m-%d %H:%M:%S')[:6]) - except: - pass - if endTime != None: - try: - param['endTime']=datetime.datetime(*time.strptime(endTime,'%Y-%m-%d %H:%M:%S')[:6]) - except: - pass - if attemptNr != None: - try: - attemptNr = int(attemptNr) - except: - attemptNr = None - if stdout != '': - stdout = stdout[:2048] - # invoke JD - return jobDispatcher.updateJob(int(jobId),state,int(timeout),xml,siteName, - param,metaData,attemptNr,stdout) - - -# get job status -def getStatus(req,ids,timeout=60): - _logger.debug("getStatus(%s)" % ids) - return jobDispatcher.getStatus(ids,int(timeout)) - - -# generate pilot token -def genPilotToken(req,schedulerid,host=None): - # get DN - realDN = _getDN(req) - # get FQANs - fqans = _getFQAN(req) - # check production role - prodManager = _checkRole(fqans,realDN,jobDispatcher,False) - if not prodManager: - return "ERROR : production or pilot role is required" - if realDN == None: - return "ERROR : failed to retrive DN" - # hostname - if host == None: - host = req.get_remote_host() - # 
return - return jobDispatcher.genPilotToken(host,realDN,schedulerid) - diff --git a/current/pandaserver/jobdispatcher/Protocol.py b/current/pandaserver/jobdispatcher/Protocol.py deleted file mode 100755 index 42cbd9d4d..000000000 --- a/current/pandaserver/jobdispatcher/Protocol.py +++ /dev/null @@ -1,212 +0,0 @@ -import urllib - - -# constants -TimeOutToken = "TimeOut" -NoJobsToken = "NoJobs" - -########### status codes -# succeeded -SC_Success = 0 -# timeout -SC_TimeOut = 10 -# no available jobs -SC_NoJobs = 20 -# failed -SC_Failed = 30 -# Not secure connection -SC_NonSecure = 40 -# invalid token -SC_Invalid = 50 -# invalid role -SC_Role = 60 - - -# response -class Response: - # constructor - def __init__(self,statusCode): - # create data object - self.data = {'StatusCode':statusCode} - - - # URL encode - def encode(self): - return urllib.urlencode(self.data) - - - # append Node - def appendNode(self,name,value): - self.data[name]=value - - - # append job - def appendJob(self,job): - # PandaID - self.data['PandaID'] = job.PandaID - # prodSourceLabel - self.data['prodSourceLabel'] = job.prodSourceLabel - # swRelease - self.data['swRelease'] = job.AtlasRelease - # homepackage - self.data['homepackage'] = job.homepackage - # transformation - self.data['transformation'] = job.transformation - # job name - self.data['jobName'] = job.jobName - # job definition ID - self.data['jobDefinitionID'] = job.jobDefinitionID - # cloud - self.data['cloud'] = job.cloud - # files - strIFiles = '' - strOFiles = '' - strDispatch = '' - strDisToken = '' - strDisTokenForOutput = '' - strDestination = '' - strRealDataset = '' - strRealDatasetIn = '' - strDestToken = '' - strProdToken = '' - strGUID = '' - strFSize = '' - strCheckSum = '' - strScopeIn = '' - strScopeOut = '' - strScopeLog = '' - logFile = '' - logGUID = '' - for file in job.Files: - if file.type == 'input': - if strIFiles != '': - strIFiles += ',' - strIFiles += file.lfn - if strDispatch != '': - strDispatch += ',' - 
strDispatch += file.dispatchDBlock - if strDisToken != '': - strDisToken += ',' - strDisToken += file.dispatchDBlockToken - if strProdToken != '': - strProdToken += ',' - strProdToken += file.prodDBlockToken - if strGUID != '': - strGUID += ',' - strGUID += file.GUID - strRealDatasetIn += '%s,' % file.dataset - strFSize += '%s,' % file.fsize - if not file.checksum in ['','NULL',None]: - strCheckSum += '%s,' % file.checksum - else: - strCheckSum += '%s,' % file.md5sum - strScopeIn += '%s,' % file.scope - if file.type == 'output' or file.type == 'log': - if strOFiles != '': - strOFiles += ',' - strOFiles += file.lfn - if strDestination != '': - strDestination += ',' - strDestination += file.destinationDBlock - if strRealDataset != '': - strRealDataset += ',' - strRealDataset += file.dataset - if file.type == 'log': - logFile = file.lfn - logGUID = file.GUID - strScopeLog = file.scope - else: - strScopeOut += '%s,' % file.scope - if strDestToken != '': - strDestToken += ',' - strDestToken += file.destinationDBlockToken.split(',')[0] - strDisTokenForOutput += '%s,' % file.dispatchDBlockToken - # inFiles - self.data['inFiles'] = strIFiles - # dispatch DBlock - self.data['dispatchDblock'] = strDispatch - # dispatch DBlock space token - self.data['dispatchDBlockToken'] = strDisToken - # dispatch DBlock space token for output - self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1] - # outFiles - self.data['outFiles'] = strOFiles - # destination DBlock - self.data['destinationDblock'] = strDestination - # destination DBlock space token - self.data['destinationDBlockToken'] = strDestToken - # prod DBlock space token - self.data['prodDBlockToken'] = strProdToken - # real output datasets - self.data['realDatasets'] = strRealDataset - # real output datasets - self.data['realDatasetsIn'] = strRealDatasetIn[:-1] - # log filename - self.data['logFile'] = logFile - # log GUID - self.data['logGUID'] = logGUID - # jobPars - self.data['jobPars'] = job.jobParameters - # 
attempt number - self.data['attemptNr'] = job.attemptNr - # GUIDs - self.data['GUID'] = strGUID - # checksum - self.data['checksum'] = strCheckSum[:-1] - # fsize - self.data['fsize'] = strFSize[:-1] - # scope - self.data['scopeIn'] = strScopeIn[:-1] - self.data['scopeOut'] = strScopeOut[:-1] - self.data['scopeLog'] = strScopeLog - # destinationSE - self.data['destinationSE'] = job.destinationSE - # user ID - self.data['prodUserID'] = job.prodUserID - # CPU count - self.data['maxCpuCount'] = job.maxCpuCount - # RAM count - self.data['minRamCount'] = job.minRamCount - # disk count - self.data['maxDiskCount'] = job.maxDiskCount - # cmtconfig - self.data['cmtConfig'] = job.cmtConfig - # processingType - self.data['processingType'] = job.processingType - # transferType - self.data['transferType'] = job.transferType - # current priority - self.data['currentPriority'] = job.currentPriority - # taskID - self.data['taskID'] = job.taskID - # debug mode - if job.specialHandling != None and 'debug' in job.specialHandling: - self.data['debug'] = 'True' - - - # set proxy key - def setProxyKey(self,proxyKey): - names = ['credname','myproxy'] - for name in names: - if proxyKey.has_key(name): - self.data[name] = proxyKey[name] - else: - self.data[name] = '' - - -# check if secure connection -def isSecure(req): - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return False - return True - - -# get user DN -def getUserDN(req): - try: - return req.subprocess_env['SSL_CLIENT_S_DN'] - except: - return 'None' - - - diff --git a/current/pandaserver/jobdispatcher/Watcher.py b/current/pandaserver/jobdispatcher/Watcher.py deleted file mode 100755 index f07e6d922..000000000 --- a/current/pandaserver/jobdispatcher/Watcher.py +++ /dev/null @@ -1,172 +0,0 @@ -''' -watch job - -''' - -import re -import sys -import time -import commands -import datetime -import threading -import ErrorCode - -import taskbuffer.ErrorCode - -from brokerage.PandaSiteIDs import PandaSiteIDs - -from 
dataservice.Closer import Closer -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('Watcher') - - -class Watcher (threading.Thread): - # constructor - def __init__(self,taskBuffer,pandaID,single=False,sleepTime=360,sitemapper=None): - threading.Thread.__init__(self) - self.pandaID = pandaID - self.taskBuffer = taskBuffer - self.sleepTime = sleepTime - self.single = single - self.siteMapper = sitemapper - - # main - def run(self): - try: - while True: - _logger.debug('%s start' % self.pandaID) - # query job - job = self.taskBuffer.peekJobs([self.pandaID],fromDefined=False, - fromArchived=False,fromWaiting=False)[0] - # check job status - if job == None or (not job.jobStatus in ['running','sent','starting','holding', - 'stagein','stageout']): - _logger.debug('%s escape : %s' % (self.pandaID,job.jobStatus)) - return - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime) - if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit): - _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID,job.jobStatus, - str(job.modificationTime), - str(job.endTime))) - destDBList = [] - # retry analysis jobs - if (job.prodSourceLabel in ['user','panda']) and (job.attemptNr<2 or job.jobStatus == 'sent') \ - and job.commandToPilot != 'tobekilled' and (not job.processingType in ['ITB_INTEGRATION']) \ - and not job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_Reassigned, - taskbuffer.ErrorCode.EC_Retried, - taskbuffer.ErrorCode.EC_PilotRetried] \ - and not job.processingType.startswith('gangarobot') \ - and not job.processingType.startswith('hammercloud'): - # reset - _logger.debug(' -> reset %s job with %s : PandaID:%s #%s' % (job.prodSourceLabel,job.jobStatus,job.PandaID,job.attemptNr)) - job.jobStatus = 'activated' - job.startTime = None - job.endTime = None - job.attemptNr = job.attemptNr + 1 - # remove flag regarding to pledge-resource handling - if not 
job.specialHandling in [None,'NULL','']: - newSpecialHandling = re.sub(',*localpool','',job.specialHandling) - if newSpecialHandling == '': - job.specialHandling = None - else: - job.specialHandling = newSpecialHandling - # TEMPORARY : send it to long queue - oldComputingSite = job.computingSite - if job.jobStatus != 'sent' and job.computingSite.startswith('ANALY') and (not job.computingSite.startswith('ANALY_LONG_')): - tmpLongSiteList = [] - tmpLongSite = re.sub('^ANALY_','ANALY_LONG_',job.computingSite) - tmpLongSite = re.sub('_\d+$','',tmpLongSite) - tmpLongSiteList.append(tmpLongSite) - tmpLongSite = job.computingSite + '_LONG' - tmpLongSiteList.append(tmpLongSite) - tmpLongSite = re.sub('SHORT','LONG',job.computingSite) - if tmpLongSite != job.computingSite: - tmpLongSiteList.append(tmpLongSite) - for longSite in tmpLongSiteList: - if self.siteMapper.checkSite(longSite): - tmpSiteSpec = self.siteMapper.getSite(longSite) - if tmpSiteSpec.status == 'online': - job.computingSite = longSite - _logger.debug(' -> sending PandaID:%s to %s' % (job.PandaID,job.computingSite)) - # set destinationSE - if job.destinationSE == oldComputingSite: - job.destinationSE = job.computingSite - break - # modify LFNs and destinationSE - for file in job.Files: - modTypes = ('output','log') - if file.type in modTypes: - # set destinationSE - if file.destinationSE == oldComputingSite: - file.destinationSE = job.computingSite - if job.prodSourceLabel == 'panda': - # doesn't change output for buildJob - modTypes = ('log',) - if file.type in modTypes: - # set new GUID - if file.type == 'log': - file.GUID = commands.getoutput('uuidgen') - # add attempt nr - oldName = file.lfn - file.lfn = re.sub("\.\d+$","",file.lfn) - file.lfn = "%s.%d" % (file.lfn,job.attemptNr) - newName = file.lfn - # modify jobParameters - sepPatt = "(\'|\"|%20)" + oldName + "(\'|\"|%20)" - matches = re.findall(sepPatt,job.jobParameters) - for match in matches: - oldPatt = match[0]+oldName+match[-1] - newPatt = 
match[0]+newName+match[-1] - job.jobParameters = re.sub(oldPatt,newPatt,job.jobParameters) - else: - if job.jobStatus == 'sent': - # sent job didn't receive reply from pilot within 30 min - job.jobDispatcherErrorCode = ErrorCode.EC_SendError - job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min" - elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL': - # lost heartbeat - job.jobDispatcherErrorCode = ErrorCode.EC_Watcher - if job.jobDispatcherErrorDiag == 'NULL': - if job.endTime == 'NULL': - # normal lost heartbeat - job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime) - else: - # job recovery failed - job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime) - else: - # job recovery failed - job.jobDispatcherErrorCode = ErrorCode.EC_Recovery - job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime/60) - # set job status - job.jobStatus = 'failed' - # set endTime for lost heartbeat - if job.endTime == 'NULL': - # normal lost heartbeat - job.endTime = job.modificationTime - # set files status - for file in job.Files: - if file.type == 'output' or file.type == 'log': - file.status = 'failed' - if not file.destinationDBlock in destDBList: - destDBList.append(file.destinationDBlock) - # update job - self.taskBuffer.updateJobs([job],False) - # start closer - if job.jobStatus == 'failed': - cThr = Closer(self.taskBuffer,destDBList,job) - cThr.start() - cThr.join() - _logger.debug('%s end' % job.PandaID) - return - # single action - if self.single: - return - # sleep - time.sleep(60*self.sleepTime) - except: - type, value, traceBack = sys.exc_info() - _logger.error("run() : %s %s" % (type,value)) - return diff --git a/current/pandaserver/jobdispatcher/__init__.py b/current/pandaserver/jobdispatcher/__init__.py deleted file mode 100755 index e69de29bb..000000000 diff --git a/current/pandaserver/server/panda.py b/current/pandaserver/server/panda.py deleted file mode 
100755 index d8d9d4991..000000000 --- a/current/pandaserver/server/panda.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/python2.5 - -""" -entry point - -""" - -# config file -from config import panda_config - -# initialize cx_Oracle using dummy connection -from taskbuffer.Initializer import initializer -initializer.init() - -# initialzie TaskBuffer -from taskbuffer.TaskBuffer import taskBuffer -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,panda_config.nDBConnection,True) - -# initialize JobDispatcher -from jobdispatcher.JobDispatcher import jobDispatcher -if panda_config.nDBConnection != 0: - jobDispatcher.init(taskBuffer) - -# initialize DataService -from dataservice.DataService import dataService -if panda_config.nDBConnection != 0: - dataService.init(taskBuffer) - -# initialize UserIF -from userinterface.UserIF import userIF -if panda_config.nDBConnection != 0: - userIF.init(taskBuffer) - -# import web I/F -allowedMethods = [] - -from taskbuffer.Utils import isAlive,putFile,deleteFile,getServer,updateLog,fetchLog,\ - touchFile,getVomsAttr,putEventPickingRequest,getAttr,getFile -allowedMethods += ['isAlive','putFile','deleteFile','getServer','updateLog','fetchLog', - 'touchFile','getVomsAttr','putEventPickingRequest','getAttr','getFile'] - -from dataservice.DataService import datasetCompleted,updateFileStatusInDisp -allowedMethods += ['datasetCompleted','updateFileStatusInDisp'] - -from jobdispatcher.JobDispatcher import getJob,updateJob,getStatus,genPilotToken -allowedMethods += ['getJob','updateJob','getStatus','genPilotToken'] - -from userinterface.UserIF import submitJobs,getJobStatus,queryPandaIDs,killJobs,reassignJobs,\ - getJobStatistics,getJobStatisticsPerSite,resubmitJobs,queryLastFilesInDataset,getPandaIDsSite,\ - getJobsToBeUpdated,updateProdDBUpdateTimes,runTaskAssignment,getAssigningTask,getSiteSpecs,\ - getCloudSpecs,runBrokerage,seeCloudTask,queryJobInfoPerCloud,registerProxyKey,getProxyKey,\ - 
getJobIDsInTimeRange,getPandIDsWithJobID,getFullJobStatus,getJobStatisticsForBamboo,\ - getNUserJobs,addSiteAccess,listSiteAccess,getFilesInUseForAnal,updateSiteAccess,\ - getPandaClientVer,getSlimmedFileInfoPandaIDs,runReBrokerage,deleteFilesFromCacheDB,\ - addFilesToCacheDB,flushCacheDB,checkFilesWithCacheDB,getQueuedAnalJobs,getHighestPrioJobStat,\ - getActiveDatasets,setCloudTaskByUser,getSerialNumberForGroupJob,getCachePrefixes,\ - checkMergeGenerationStatus,sendLogInfo,getNumPilots,retryFailedJobsInActive,\ - getJobStatisticsWithLabel,getPandaIDwithJobExeID,getJobStatisticsPerUserSite,\ - getDisInUseForAnal,getLFNsInUseForAnal,getScriptOfflineRunning,setDebugMode,\ - insertSandboxFileInfo,checkSandboxFile,changeJobPriorities -allowedMethods += ['submitJobs','getJobStatus','queryPandaIDs','killJobs','reassignJobs', - 'getJobStatistics','getJobStatisticsPerSite','resubmitJobs','queryLastFilesInDataset','getPandaIDsSite', - 'getJobsToBeUpdated','updateProdDBUpdateTimes','runTaskAssignment','getAssigningTask','getSiteSpecs', - 'getCloudSpecs','runBrokerage','seeCloudTask','queryJobInfoPerCloud','registerProxyKey','getProxyKey', - 'getJobIDsInTimeRange','getPandIDsWithJobID','getFullJobStatus','getJobStatisticsForBamboo', - 'getNUserJobs','addSiteAccess','listSiteAccess','getFilesInUseForAnal','updateSiteAccess', - 'getPandaClientVer','getSlimmedFileInfoPandaIDs','runReBrokerage','deleteFilesFromCacheDB', - 'addFilesToCacheDB','flushCacheDB','checkFilesWithCacheDB','getQueuedAnalJobs','getHighestPrioJobStat', - 'getActiveDatasets','setCloudTaskByUser','getSerialNumberForGroupJob','getCachePrefixes', - 'checkMergeGenerationStatus','sendLogInfo','getNumPilots','retryFailedJobsInActive', - 'getJobStatisticsWithLabel','getPandaIDwithJobExeID','getJobStatisticsPerUserSite', - 'getDisInUseForAnal','getLFNsInUseForAnal','getScriptOfflineRunning','setDebugMode', - 'insertSandboxFileInfo','checkSandboxFile','changeJobPriorities'] - -# import error -import 
taskbuffer.ErrorCode - - -# FastCGI/WSGI entry -if panda_config.useFastCGI or panda_config.useWSGI: - - import os - import cgi - import sys - from pandalogger.PandaLogger import PandaLogger - - # logger - _logger = PandaLogger().getLogger('Entry') - - # dummy request object - class DummyReq: - def __init__(self,env,): - # environ - self.subprocess_env = env - # header - self.headers_in = {} - # content-length - if self.subprocess_env.has_key('CONTENT_LENGTH'): - self.headers_in["content-length"] = self.subprocess_env['CONTENT_LENGTH'] - - # get remote host - def get_remote_host(self): - if self.subprocess_env.has_key('REMOTE_HOST'): - return self.subprocess_env['REMOTE_HOST'] - return "" - - - # application - def application(environ, start_response): - # get method name - methodName = '' - if environ.has_key('SCRIPT_NAME'): - methodName = environ['SCRIPT_NAME'].split('/')[-1] - if panda_config.entryVerbose: - _logger.debug("PID=%s %s in" % (os.getpid(),methodName)) - # check method name - if not methodName in allowedMethods: - _logger.error("PID=%s %s is forbidden" % (os.getpid(),methodName)) - exeRes = "False : %s is forbidden" % methodName - else: - # get method object - tmpMethod = None - try: - exec "tmpMethod = %s" % methodName - except: - pass - # object not found - if tmpMethod == None: - _logger.error("PID=%s %s is undefined" % (os.getpid(),methodName)) - exeRes = "False" - else: - # get params - tmpPars = cgi.FieldStorage(environ['wsgi.input'], environ=environ, - keep_blank_values=1) - # convert to map - params = {} - for tmpKey in tmpPars.keys(): - if tmpPars[tmpKey].file != None and tmpPars[tmpKey].filename != None: - # file - params[tmpKey] = tmpPars[tmpKey] - else: - # string - params[tmpKey] = tmpPars.getfirst(tmpKey) - if panda_config.entryVerbose: - _logger.debug("PID=%s %s with %s" % (os.getpid(),methodName,str(params.keys()))) - # dummy request object - dummyReq = DummyReq(environ) - try: - # exec - exeRes = apply(tmpMethod,[dummyReq],params) - # 
convert bool to string - if exeRes in [True,False]: - exeRes = str(exeRes) - except: - errType,errValue = sys.exc_info()[:2] - errStr = "" - for tmpKey,tmpVal in environ.iteritems(): - errStr += "%s : %s\n" % (tmpKey,str(tmpVal)) - _logger.error("execution failure : %s %s" % (errType,errValue)) - _logger.error(errStr) - # return internal server error - start_response('500 INTERNAL SERVER ERROR', [('Content-Type', 'text/plain')]) - return ["%s %s" % (errType,errValue)] - if panda_config.entryVerbose: - _logger.debug("PID=%s %s out" % (os.getpid(),methodName)) - # return - if exeRes == taskbuffer.ErrorCode.EC_NotFound: - start_response('404 Not Found', [('Content-Type', 'text/plain')]) - return ['not found'] - elif isinstance(exeRes,taskbuffer.ErrorCode.EC_Redirect): - start_response('302 Redirect', [('Location', exeRes.url)]) - return ['redirect'] - else: - start_response('200 OK', [('Content-Type', 'text/plain')]) - return [exeRes] - - # start server - if panda_config.useFastCGI: - from flup.server.fcgi import WSGIServer - WSGIServer(application,multithreaded=False).run() diff --git a/current/pandaserver/taskbuffer/ArchiveDBProxyPool.py b/current/pandaserver/taskbuffer/ArchiveDBProxyPool.py deleted file mode 100644 index 8bfd014b0..000000000 --- a/current/pandaserver/taskbuffer/ArchiveDBProxyPool.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -pool for ArchiveDBProxies - -""" - -import time -import Queue -import random -import OraLogDBProxy as LogDBProxy -from config import panda_config - -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('ArchiveDBProxyPool') - -class ArchiveDBProxyPool: - - def __init__(self,nConnection=panda_config.nArchiveDBConnection): - # create Proxies - _logger.debug("init") - self.proxyList = Queue.Queue(nConnection) - for i in range(nConnection): - _logger.debug("connect -> %s " % i) - proxy = LogDBProxy.LogDBProxy() - nTry = 10 - for iTry in range(nTry): - if proxy.connect(dbhost = 
panda_config.archivedbhost, - dbpasswd = panda_config.archivedbpasswd, - dbuser = panda_config.archivedbuser, - dbname = panda_config.archivedbname): - break - _logger.debug("failed -> %s : try %s" % (i,iTry)) - if iTry+1 == nTry: - raise RuntimeError, 'ArchiveDBProxyPool.__init__ failed' - time.sleep(random.randint(10,20)) - self.proxyList.put(proxy) - time.sleep(1) - _logger.debug("ready") - - # return a free proxy. this method blocks until a proxy is available - def getProxy(self): - # get proxy - proxy = self.proxyList.get() - # wake up connection - proxy.wakeUp() - # return - return proxy - - - # put back a proxy - def putProxy(self,proxy): - # put - self.proxyList.put(proxy) - diff --git a/current/pandaserver/taskbuffer/CloudSpec.py b/current/pandaserver/taskbuffer/CloudSpec.py deleted file mode 100644 index bfb1927d3..000000000 --- a/current/pandaserver/taskbuffer/CloudSpec.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -cloud specification - -""" - -class CloudSpec(object): - # attributes - _attributes = ('name','tier1','tier1SE','relocation','weight','server','status','transtimelo', - 'transtimehi','waittime','validation','mcshare','countries','fasttrack','nprestage', - 'pilotowners') - - # constructor - def __init__(self): - # install attributes - for attr in self._attributes: - setattr(self,attr,None) - - # serialize - def __str__(self): - str = '' - for attr in self._attributes: - str += '%s:%s ' % (attr,getattr(self,attr)) - return str - - - - diff --git a/current/pandaserver/taskbuffer/CloudTaskSpec.py b/current/pandaserver/taskbuffer/CloudTaskSpec.py deleted file mode 100644 index 8fade3ce1..000000000 --- a/current/pandaserver/taskbuffer/CloudTaskSpec.py +++ /dev/null @@ -1,99 +0,0 @@ -""" -cloud/task specification - -""" - -class CloudTaskSpec(object): - # attributes - _attributes = ('id','taskname','taskid','cloud','status','tmod','tenter') - # slots - __slots__ = _attributes - - - # constructor - def __init__(self): - # install attributes - for attr in 
self._attributes: - setattr(self,attr,None) - - - # override __getattribute__ for SQL and PandaID - def __getattribute__(self,name): - ret = object.__getattribute__(self,name) - if ret == None: - return "NULL" - return ret - - - # return a tuple of values - def values(self): - ret = [] - for attr in self._attributes: - val = getattr(self,attr) - ret.append(val) - return tuple(ret) - - - # pack tuple into CloudTaskSpec - def pack(self,values): - for i in range(len(self._attributes)): - attr= self._attributes[i] - val = values[i] - setattr(self,attr,val) - - - # return state values to be pickled - def __getstate__(self): - state = [] - for attr in self._attributes: - val = getattr(self,attr) - state.append(val) - return state - - - # restore state from the unpickled state values - def __setstate__(self,state): - for i in range(len(self._attributes)): - if i+1 < len(state): - setattr(self,self._attributes[i],state[i]) - else: - setattr(self,self._attributes[i],'NULL') - - - # return column names for INSERT - def columnNames(cls): - ret = "" - for attr in cls._attributes: - if ret != "": - ret += ',' - ret += attr - return ret - columnNames = classmethod(columnNames) - - - # return expression of values for INSERT - def valuesExpression(cls): - ret = "VALUES(" - for attr in cls._attributes: - ret += "%s" - if attr != cls._attributes[len(cls._attributes)-1]: - ret += "," - ret += ")" - return ret - valuesExpression = classmethod(valuesExpression) - - - # return an expression for UPDATE - def updateExpression(cls): - ret = "" - for attr in cls._attributes: - ret = ret + attr + "=%s" - if attr != cls._attributes[len(cls._attributes)-1]: - ret += "," - return ret - updateExpression = classmethod(updateExpression) - - - - - diff --git a/current/pandaserver/taskbuffer/CloudURLMap.py b/current/pandaserver/taskbuffer/CloudURLMap.py deleted file mode 100644 index 27bdce567..000000000 --- a/current/pandaserver/taskbuffer/CloudURLMap.py +++ /dev/null @@ -1,36 +0,0 @@ -# cloud to 
Panda server's URL mapping -cloudURLMap = { - 'CA' : { - 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', - 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', - }, - 'ES' : { - 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', - 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', - }, - 'FR' : { - 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', - 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', - }, - 'IT' : { - 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', - 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', - }, - 'NL' : { - 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', - 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', - }, - 'TW' : { - 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', - 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', - }, - 'UK' : { - 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', - 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', - }, - 'US' : { - 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', - 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', - }, - } - diff --git a/current/pandaserver/taskbuffer/ConBridge.py b/current/pandaserver/taskbuffer/ConBridge.py deleted file mode 100644 index 3f4fd1abd..000000000 --- a/current/pandaserver/taskbuffer/ConBridge.py +++ /dev/null @@ -1,502 +0,0 @@ -import os -import re -import sys -import time -import types -import socket -import signal -import random -import threading -import cPickle as pickle - -import OraDBProxy as DBProxy - -from config import panda_config -from JobSpec import JobSpec -from FileSpec import FileSpec -from DatasetSpec import DatasetSpec -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('ConBridge') - - -# exception for normal termination -class HarmlessEx(Exception): - pass - - -# terminate child 
process by itself when master has gone -class Terminator (threading.Thread): - - # constructor - def __init__(self,consock): - threading.Thread.__init__(self) - self.consock = consock - - - # main - def run(self): - # watching control socket - try: - rcvSize = self.consock.recv(1) - except: - pass - # get PID - pid = os.getpid() - _logger.debug("child %s received termination" % pid) - # kill - try: - os.kill(pid,signal.SIGTERM) - except: - pass - try: - os.kill(pid,signal.SIGKILL) - except: - pass - - - -# connection bridge with with timeout -class ConBridge (object): - - # constructor - def __init__(self): - self.child_pid = 0 - self.isMaster = False - self.mysock = None - self.consock = None - self.pid = os.getpid() - # timeout - if hasattr(panda_config,'dbtimeout'): - self.timeout = int(panda_config.dbtimeout) - else: - self.timeout = 600 - # verbose - if hasattr(panda_config,'dbbridgeverbose'): - self.verbose = panda_config.dbbridgeverbose - else: - self.verbose = False - - - # destructor - def __del__(self): - # kill old child process - self.bridge_killChild() - - - # connect - def connect(self,dbhost=panda_config.dbhost,dbpasswd=panda_config.dbpasswd, - dbuser=panda_config.dbuser,dbname=panda_config.dbname, - dbtimeout=None,reconnect=False): - # kill old child process - self.bridge_killChild() - _logger.debug('master %s connecting' % self.pid) - # reset child PID and sockets - self.child_pid = 0 - self.mysock = None - self.consock = None - # create socket - datpair = socket.socketpair() - conpair = socket.socketpair() - # fork - self.child_pid = os.fork() - if self.child_pid == 0: - # child - self.isMaster = False - self.pid = os.getpid() - # keep socket - self.mysock = datpair[1] - self.consock = conpair[1] - datpair[0].close() - conpair[0].close() - # connect to database - _logger.debug('child %s connecting to database' % self.pid) - self.proxy = DBProxy.DBProxy() - if not self.proxy.connect(dbhost=dbhost,dbpasswd=dbpasswd,dbtimeout=60): - 
_logger.error('child %s failed to connect' % self.pid) - # send error - self.bridge_sendError((RuntimeError,'child %s connection failed' % self.pid)) - # exit - self.bridge_childExit() - # send OK just for ACK - _logger.debug('child %s connection is ready' % self.pid) - self.bridge_sendResponse(None) - # start terminator - Terminator(self.consock).start() - # go main loop - _logger.debug('child %s going into the main loop' % self.pid) - self.bridge_run() - # exit - self.bridge_childExit(0) - else: - # master - self.isMaster = True - # keep socket - self.mysock = datpair[0] - self.consock = conpair[0] - datpair[1].close() - conpair[1].close() - try: - # get ACK - _logger.debug('master %s waiting ack from child=%s' % (self.pid,self.child_pid)) - self.bridge_getResponse() - _logger.debug('master %s got ready from child=%s' % (self.pid,self.child_pid)) - return True - except: - errType,errValue = sys.exc_info()[:2] - _logger.error('master %s failed to setup child=%s : %s %s' % \ - (self.pid,self.child_pid,errType,errValue)) - # kill child - self.bridge_killChild() - return False - - - - ####################### - # communication methods - - # send packet - def bridge_send(self,val): - try: - # set timeout - if self.isMaster: - self.mysock.settimeout(self.timeout) - # serialize - tmpStr = pickle.dumps(val) - # send size - self.mysock.sendall("%50s" % len(tmpStr)) - # send body - self.mysock.sendall(tmpStr) - # set timeout back - if self.isMaster: - self.mysock.settimeout(None) - except: - errType,errValue = sys.exc_info()[:2] - if self.isMaster: - roleType = 'master' - else: - roleType = 'child ' - _logger.error('%s %s send error : val=%s - %s %s' % \ - (roleType,self.pid,str(val),errType,errValue)) - # terminate child - if not self.isMaster: - self.bridge_childExit() - raise errType,errValue - - - # receive packet - def bridge_recv(self): - try: - # set timeout - if self.isMaster: - self.mysock.settimeout(self.timeout) - # get size - strSize = '' - headSize = 50 - while 
len(strSize) < headSize: - tmpSize = headSize - len(strSize) - tmpStr = self.mysock.recv(tmpSize) - if tmpStr == '': - if self.isMaster: - raise socket.error,'empty packet' - else: - # master closed socket - raise HarmlessEx,'empty packet' - strSize += tmpStr - # get body - strBody = '' - bodySize = long(strSize) - while len(strBody) < bodySize: - tmpSize = bodySize - len(strBody) - tmpStr = self.mysock.recv(tmpSize) - if tmpStr == '': - if self.isMaster: - raise socket.error,'empty packet' - else: - # master closed socket - raise HarmlessEx,'empty packet' - strBody += tmpStr - # set timeout back - if self.isMaster: - self.mysock.settimeout(None) - # deserialize - retVal = pickle.loads(strBody) - return True,retVal - except: - if self.isMaster: - roleType = 'master' - else: - roleType = 'child ' - errType,errValue = sys.exc_info()[:2] - if errType == HarmlessEx: - _logger.debug('%s %s recv harmless ex : %s' % \ - (roleType,self.pid,errValue)) - else: - _logger.error('%s %s recv error : %s %s' % \ - (roleType,self.pid,errType,errValue)) - # terminate child - if not self.isMaster: - self.bridge_childExit() - raise errType,errValue - - - - ####################### - # child's methods - - # send error - def bridge_sendError(self,val): - # send status - self.bridge_send("NG") - # check if pickle-able - try: - pickle.dumps(val) - except: - # use RuntimeError - val = (RuntimeError,str(val[-1])) - # send exceptions - self.bridge_send(val) - - - # send response - def bridge_sendResponse(self,val): - # send status - self.bridge_send("OK") - # send response - self.bridge_send(val) - - - # termination of child - def bridge_childExit(self,exitCode=1): - if not self.isMaster: - _logger.debug("child %s closing sockets" % self.pid) - # close sockets - try: - self.mysock.shutdown(socket.SHUT_RDWR) - except: - pass - try: - self.consock.shutdown(socket.SHUT_RDWR) - except: - pass - # exit - _logger.debug("child %s going to exit" % self.pid) - os._exit(exitCode) - - - # child main - 
def bridge_run(self): - comStr = '' - while True: - try: - # get command - status,comStr = self.bridge_recv() - if not status: - raise RuntimeError,'invalid command' - # get variables - status,variables = self.bridge_recv() - if not status: - raise RuntimeError,'invalid variables' - except: - errType,errValue = sys.exc_info()[:2] - _logger.error('child %s died : %s %s' % (self.pid,errType,errValue)) - # exit - self.bridge_childExit() - if self.verbose: - _logger.debug('child %s method %s executing' % (self.pid,comStr)) - try: - # execute - method = getattr(self.proxy,comStr) - res = apply(method,variables[0],variables[1]) - # FIXME : modify response since cx_Oracle types cannot be picked - if comStr in ['querySQLS']: - newRes = [True]+res[1:] - res = newRes - if self.verbose: - _logger.debug('child %s method %s completed' % (self.pid,comStr)) - # return - self.bridge_sendResponse((res,variables[0],variables[1])) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error('child %s method %s failed : %s %s' % (self.pid,comStr,errType,errValue)) - if errType in [socket.error,socket.timeout]: - _logger.error('child %s died : %s %s' % (self.pid,errType,errValue)) - # exit - self.bridge_childExit() - # send error - self.bridge_sendError((errType,errValue)) - - - - ####################### - # master's methods - - # kill child - def bridge_killChild(self): - # kill old child process - if self.child_pid != 0: - # close sockets - _logger.debug('master %s closing sockets for child=%s' % (self.pid,self.child_pid)) - try: - if self.mysock != None: - self.mysock.shutdown(socket.SHUT_RDWR) - except: - pass - try: - if self.consock != None: - self.consock.shutdown(socket.SHUT_RDWR) - except: - pass - _logger.debug('master %s killing child=%s' % (self.pid,self.child_pid)) - # send SIGTERM - try: - os.kill(self.child_pid,signal.SIGTERM) - except: - pass - time.sleep(2) - # send SIGKILL - try: - os.kill(self.child_pid,signal.SIGKILL) - except: - pass - # wait for completion of 
child - _logger.debug('master %s waiting child=%s' % (self.pid,self.child_pid)) - try: - os.waitpid(self.child_pid,0) - except: - pass - # sleep to avoid burst reconnection - time.sleep(random.randint(5,15)) - _logger.debug('master %s killed child=%s' % (self.pid,self.child_pid)) - - - # get responce - def bridge_getResponse(self): - # get status - status,strStatus = self.bridge_recv() - if not status: - raise RuntimeError,'master %s got invalid status response from child=%s' % \ - (self.pid,self.child_pid) - if strStatus == 'OK': - # return res - status,ret = self.bridge_recv() - if not status: - raise RuntimeError,'master %s got invalid response body from child=%s' % \ - (self.pid,self.child_pid) - return ret - elif strStatus == 'NG': - # raise error - status,ret = self.bridge_recv() - if not status: - raise RuntimeError,'master %s got invalid response value from child=%s' % \ - (self.pid,self.child_pid) - raise ret[0],ret[1] - else: - raise RuntimeError,'master %s got invalid response from child=%s : %s' % \ - (self.pid,self.child_pid,str(strStatus)) - - - # method wrapper class - class bridge_masterMethod: - - # constructor - def __init__(self,name,parent): - self.name = name - self.parent = parent - self.pid = os.getpid() - - - # copy changes in taskbuff objects to master - def copyTbObjChanges(self,oldPar,newPar): - # check they have the same type - if type(oldPar) != type(newPar): - return False - # copy some Specs since they are passed via ref's - if isinstance(oldPar,JobSpec) or isinstance(oldPar,FileSpec) \ - or isinstance(oldPar,DatasetSpec): - if hasattr(oldPar,'__getstate__'): - tmpStat = newPar.__getstate__() - oldPar.__setstate__(tmpStat) - else: - tmpStat = newPar.values() - oldPar.pack(tmpStat) - return True - # copy Datasets - return False - - - # copy changes in objects to master - def copyChanges(self,oldPar,newPar): - if isinstance(oldPar,types.ListType): - # delete all elements first - while len(oldPar) > 0: - oldPar.pop() - # append - for 
tmpItem in newPar: - oldPar.append(tmpItem) - elif isinstance(oldPar,types.DictType): - # replace - for tmpKey in newPar.keys(): - oldPar[tmpKey] = newPar[tmpKey] - else: - self.copyTbObjChanges(oldPar,newPar) - - - # method emulation - def __call__(self,*args,**keywords): - while True: - try: - # send command name - self.parent.bridge_send(self.name) - # send variables - self.parent.bridge_send((args,keywords)) - # get response - retVal,newArgs,newKeywords = self.parent.bridge_getResponse() - # propagate child's changes in args to master - for idxArg,tmpArg in enumerate(args): - self.copyChanges(tmpArg,newArgs[idxArg]) - # propagate child's changes in keywords to master - for tmpKey,tmpArg in keywords.iteritems(): - self.copyChanges(tmpArg,newKeywords[tmpKey]) - # return - return retVal - except: - errType,errValue = sys.exc_info()[:2] - _logger.error('master %s method %s failed : %s %s' % \ - (self.pid,self.name,errType,errValue)) - # reconnect when socket has a problem - if not errType in [socket.error,socket.timeout]: - # kill old child process - self.parent.bridge_killChild() - _logger.error('master %s killed child' % self.pid) - #raise errType,errValue - # sleep - time.sleep(5) - # reconnect - try: - _logger.debug('master %s trying to reconnect' % self.pid) - self.parent.connect() - _logger.debug('master %s reconnect completed' % self.pid) - except: - _logger.error('master %s connect failed' % self.pid) - - - # get atter for cursor attributes - def __getattribute__(self,name): - if object.__getattribute__(self,'isMaster'): - try: - # return origianl attribute - return object.__getattribute__(self,name) - except: - # append methods - if not name.startswith('_') and hasattr(DBProxy.DBProxy,name) and \ - isinstance(getattr(DBProxy.DBProxy,name),types.UnboundMethodType): - # get DBProxy's method wrapper - method = ConBridge.bridge_masterMethod(name,self) - # set method - setattr(self,name,method) - # return - return method - # return origianl attribute for child 
- return object.__getattribute__(self,name) diff --git a/current/pandaserver/taskbuffer/DBProxy.py b/current/pandaserver/taskbuffer/DBProxy.py deleted file mode 100755 index 9d0981e15..000000000 --- a/current/pandaserver/taskbuffer/DBProxy.py +++ /dev/null @@ -1,3066 +0,0 @@ -""" -proxy for database connection - -""" - -import re -import os -import sys -import time -import fcntl -import random -import urllib -import MySQLdb -import datetime -import commands -import traceback -import warnings -import ErrorCode -from JobSpec import JobSpec -from FileSpec import FileSpec -from DatasetSpec import DatasetSpec -from CloudTaskSpec import CloudTaskSpec -from pandalogger.PandaLogger import PandaLogger -from config import panda_config -from brokerage.PandaSiteIDs import PandaSiteIDs - -warnings.filterwarnings('ignore') - -# logger -_logger = PandaLogger().getLogger('DBProxy') - -# lock file -_lockGetSN = open(panda_config.lockfile_getSN, 'w') -_lockSetDS = open(panda_config.lockfile_setDS, 'w') -_lockGetCT = open(panda_config.lockfile_getCT, 'w') - - -# proxy -class DBProxy: - - # constructor - def __init__(self): - # connection object - self.conn = None - # cursor object - self.cur = None - # host name - self.hostname = None - # retry count - self.nTry = 5 - - # connect to DB - def connect(self,dbhost=panda_config.dbhost,dbpasswd=panda_config.dbpasswd, - dbuser=panda_config.dbuser,dbname=panda_config.dbname, - dbtimeout=None,reconnect=False): - # keep parameters for reconnect - if not reconnect: - self.dbhost = dbhost - self.dbpasswd = dbpasswd - self.dbuser = dbuser - self.dbname = dbname - self.dbtimeout = dbtimeout - # connect - try: - if self.dbtimeout == None: - self.conn = MySQLdb.connect(host=self.dbhost,user=self.dbuser, - passwd=self.dbpasswd,db=self.dbname) - else: - self.conn = MySQLdb.connect(host=self.dbhost,user=self.dbuser, - passwd=self.dbpasswd,db=self.dbname, - connect_timeout=self.dbtimeout) - self.cur=self.conn.cursor() - # get hostname - 
self.cur.execute('SELECT USER()') - res = self.cur.fetchone() - match = re.search('^([^@]+)@([^@]+)$',res[0]) - if match != None: - self.hostname = match.group(2) - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("connect : %s %s" % (type,value)) - # roll back - self._rollback() - return False - - - # query an SQL - def querySQL(self,sql): - comment = ' /* DBProxy.querySQL */' - try: - _logger.debug("querySQL : %s " % sql) - # begin transaction - self.cur.execute("START TRANSACTION") - self.cur.execute(sql+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return res - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("querySQL : %s " % sql) - _logger.error("querySQL : %s %s" % (type,value)) - return None - - - # query an SQL return Status - def querySQLS(self,sql): - comment = ' /* DBProxy.querySQLS */' - try: - # begin transaction - self.cur.execute("SET AUTOCOMMIT=1") - ret = self.cur.execute(sql+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return ret,res - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("querySQLS : %s " % sql) - _logger.error("querySQLS : %s %s" % (type,value)) - return -1,None - - - # query an SQL with list return Status - def querySQLwList(self,sql,valList): - comment = ' /* DBProxy.querySQLwList */' - try: - # begin transaction - self.cur.execute("SET AUTOCOMMIT=1") - ret = self.cur.execute(sql+comment,valList) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return ret,res - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("querySQLwList : %s %s" % (sql,str(valList))) - _logger.error("querySQLwList : %s %s" % (type,value)) - return -1,None - - - # insert job to jobsDefined - def 
insertNewJob(self,job,user,serNum,weight=0.0,priorityOffset=0,userVO=None): - comment = ' /* DBProxy.insertNewJob */' - sql1 = "INSERT INTO jobsDefined4 (%s) " % JobSpec.columnNames() - sql1+= JobSpec.valuesExpression() - # make sure PandaID is NULL - job.PandaID = None - # job status - job.jobStatus='defined' - # host and time information - job.modificationHost = self.hostname - job.creationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - job.modificationTime = job.creationTime - job.stateChangeTime = job.creationTime - # DN - if job.prodUserID == "NULL" or job.prodSourceLabel in ['user','panda']: - job.prodUserID = user - # VO - job.VO = userVO - # priority - if job.assignedPriority != 'NULL': - job.currentPriority = job.assignedPriority - if job.prodSourceLabel == 'user': - job.currentPriority = 1000 + priorityOffset - (serNum / 5) - int(100 * weight) - elif job.prodSourceLabel == 'panda': - job.currentPriority = 2000 + priorityOffset - # usergroup - if job.prodSourceLabel == 'regional': - job.computingSite= "BNLPROD" - try: - # begin transaction - self.cur.execute("START TRANSACTION") - # insert - retI = self.cur.execute(sql1+comment, job.values()) - # set PandaID - job.PandaID = self.conn.insert_id() - # insert files - _logger.debug("insertNewJob : %s Label : %s ret : %s" % (job.PandaID,job.prodSourceLabel,retI)) - sqlFile = "INSERT INTO filesTable4 (%s) " % FileSpec.columnNames() - sqlFile+= FileSpec.valuesExpression() - for file in job.Files: - file.rowID = None - if file.status != 'ready': - file.status='unknown' - # replace $PANDAID with real PandaID - file.lfn = re.sub('\$PANDAID', '%05d' % job.PandaID, file.lfn) - self.cur.execute(sqlFile+comment, file.values()) - # get rowID - file.rowID = self.conn.insert_id() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("insertNewJob : %s File OK" % job.PandaID) - # update job info in MonALISA - Job Defined. 
- #aThr = apmonInterface(job) - #aThr.start() - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("insertNewJob : %s %s" % (type,value)) - # roll back - self._rollback() - return False - - - # simply insert job to a table - def insertJobSimple(self,job,table,fileTable): - comment = ' /* DBProxy.insertJobSimple */' - _logger.debug("insertJobSimple : %s" % job.PandaID) - sql1 = "INSERT INTO %s (%s) " % (table,JobSpec.columnNames()) - sql1+= JobSpec.valuesExpression() - try: - # begin transaction - self.cur.execute("START TRANSACTION") - # insert - self.cur.execute(sql1+comment, job.values()) - # files - sqlFile = "INSERT INTO %s " % fileTable - sqlFile+= "(%s) " % FileSpec.columnNames() - sqlFile+= FileSpec.valuesExpression() - for file in job.Files: - self.cur.execute(sqlFile+comment, file.values()) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("insertJobSimple : %s %s" % (type,value)) - # roll back - self._rollback() - return False - - - # activate job. 
move job from jobsDefined to jobsActive - def activateJob(self,job): - comment = ' /* DBProxy.activateJob */' - if job==None: - _logger.debug("activateJob : None") - return True - _logger.debug("activateJob : %s" % job.PandaID) - sql0 = "SELECT rowID FROM filesTable4 WHERE PandaID=%s AND type=%s AND status!=%s" - sql1 = "UPDATE jobsDefined4 SET jobStatus='activated' " - sql1+= "WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined') AND commandToPilot<>'tobekilled'" - sql2 = "INSERT INTO jobsActive4 (%s) " % JobSpec.columnNames() - sql2+= JobSpec.valuesExpression() - # host and time information - job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - # set stateChangeTime for defined->activated but not for assigned->activated - if job.jobStatus in ['defined']: - job.stateChangeTime = job.modificationTime - nTry=3 - for iTry in range(nTry): - try: - # check if all files are ready - allOK = True - for file in job.Files: - if file.type == 'input' and file.status != 'ready': - allOK = False - break - # begin transaction - self.cur.execute("START TRANSACTION") - # check all inputs are ready - self.cur.execute(sql0+comment, (job.PandaID,"input","ready")) - res = self.cur.fetchall() - if len(res) == 0 or allOK: - # change status - job.jobStatus = "activated" - # update. 
Not delete for InnoDB - n = self.cur.execute(sql1+comment, (job.PandaID,)) - if n==0: - # already killed or activated - _logger.debug("activateJob : Not found %s" % job.PandaID) - else: - # insert - self.cur.execute(sql2+comment, job.values()) - # update files - sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" - for file in job.Files: - self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) - else: - # update job - sqlJ = ("UPDATE jobsDefined4 SET %s " % JobSpec.updateExpression()) + \ - "WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')" - n = self.cur.execute(sqlJ+comment, job.values()+(job.PandaID,)) - if n==0: - # already killed or activated - _logger.debug("activateJob : Not found %s" % job.PandaID) - else: - # update files - sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" - for file in job.Files: - self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("activateJob : %s retry : %s" % (job.PandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("activateJob : %s %s" % (type,value)) - return False - - - # send job to jobsWaiting - def keepJob(self,job): - comment = ' /* DBProxy.keepJob */' - _logger.debug("keepJob : %s" % job.PandaID) - sql1 = "UPDATE jobsDefined4 SET jobStatus='waiting' " - sql1+= "WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined') AND commandToPilot<>'tobekilled'" - sql2 = "INSERT INTO jobsWaiting4 (%s) " % JobSpec.columnNames() - sql2+= JobSpec.valuesExpression() - # time information - job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - job.stateChangeTime = job.modificationTime - nTry=3 - for iTry in range(nTry): - try: - # begin transaction - self.cur.execute("START 
TRANSACTION") - # delete - n = self.cur.execute(sql1+comment, (job.PandaID,)) - if n==0: - # already killed - _logger.debug("keepJob : Not found %s" % job.PandaID) - else: - # set status - job.jobStatus = 'waiting' - # insert - self.cur.execute(sql2+comment, job.values()) - # update files - sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" - for file in job.Files: - self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # update job info in MonALISA - Job sent to waiting state - #aThr = apmonInterface(job) - #aThr.start() - return True - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("keepJob : %s retry : %s" % (job.PandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("keepJob : %s %s" % (type,value)) - return False - - - # archive job to jobArchived and remove the job from jobsActive or jobsDefined - def archiveJob(self,job,fromJobsDefined): - comment = ' /* DBProxy.archiveJob */' - _logger.debug("archiveJob : %s" % job.PandaID) - if fromJobsDefined: - sql1 = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')" - else: - sql1 = "DELETE FROM jobsActive4 WHERE PandaID=%s" - sql2 = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames() - sql2+= JobSpec.valuesExpression() - nTry=3 - for iTry in range(nTry): - try: - # begin transaction - self.cur.execute("START TRANSACTION") - # delete - n = self.cur.execute(sql1+comment, (job.PandaID,)) - if n==0: - # already killed - _logger.debug("archiveJob : Not found %s" % job.PandaID) - else: - # insert - job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - job.stateChangeTime = job.modificationTime - if job.endTime == 'NULL': - job.endTime = job.modificationTime - self.cur.execute(sql2+comment, job.values()) - # update files - sqlF = 
("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" - for file in job.Files: - self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) - # delete downstream jobs - ddmIDs = [] - newJob = None - ddmAttempt = 0 - if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed': - # look for outputs - upOutputs = [] - for file in job.Files: - if file.type == 'output': - upOutputs.append(file.lfn) - # look for downstream jobs - sqlD = "SELECT PandaID FROM filesTable4 WHERE type='input' AND lfn='%s' GROUP BY PandaID" - sqlDJS = "SELECT %s " % JobSpec.columnNames() - sqlDJS+= "FROM jobsDefined4 WHERE PandaID=%s" - sqlDJD = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s" - sqlDJI = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames() - sqlDJI+= JobSpec.valuesExpression() - for upFile in upOutputs: - _logger.debug("look for downstream jobs for %s" % upFile) - # select PandaID - self.cur.execute((sqlD+comment) % upFile) - res = self.cur.fetchall() - for downID in res: - _logger.debug("delete : %s" % downID) - # select jobs - self.cur.execute((sqlDJS+comment) % downID) - resJob = self.cur.fetchall() - if len(resJob) == 0: - continue - # instantiate JobSpec - dJob = JobSpec() - dJob.pack(resJob[0]) - # delete - retD = self.cur.execute((sqlDJD+comment) % downID) - if retD == 0: - continue - # error code - dJob.jobStatus = 'failed' - dJob.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - dJob.taskBufferErrorCode = ErrorCode.EC_Kill - dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed' - dJob.modificationTime = dJob.endTime - dJob.stateChangeTime = dJob.endTime - # insert - self.cur.execute(sqlDJI+comment, dJob.values()) - elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='dis': - # get corresponding jobs for production movers - vuid = '' - # extract vuid - match = re.search('--callBack (\S+)',job.jobParameters) - if match != None: - try: - callbackUrl = 
urllib.unquote(match.group(1)) - callbackUrl = re.sub('[&\?]',' ', callbackUrl) - # look for vuid= - for item in callbackUrl.split(): - if item.startswith('vuid='): - vuid = item.split('=')[-1] - break - except: - pass - if vuid == '': - _logger.error("cannot extract vuid from %s" % job.jobParameters) - else: - # get name - self.cur.execute(("SELECT name FROM Datasets WHERE vuid='%s' AND type='dispatch'" % vuid)+comment) - res = self.cur.fetchall() - if len(res) != 0: - disName = res[0] - # get PandaIDs - self.cur.execute(("SELECT PandaID FROM jobsDefined4 WHERE dispatchDBlock='%s' AND jobStatus='assigned'" % disName)+comment) - resDDM = self.cur.fetchall() - for tmpID, in resDDM: - ddmIDs.append(tmpID) - # get offset - ddmAttempt = job.attemptNr - _logger.debug("get PandaID for reassign : %s ddmAttempt=%s" % (str(ddmIDs),ddmAttempt)) - elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='ddm' and job.attemptNr<2 \ - and job.commandToPilot != 'tobekilled': - # instantiate new mover to retry subscription - newJob = JobSpec() - newJob.jobDefinitionID = job.jobDefinitionID - newJob.jobName = job.jobName - newJob.attemptNr = job.attemptNr + 1 - newJob.transformation = job.transformation - newJob.destinationDBlock = job.destinationDBlock - newJob.destinationSE = job.destinationSE - newJob.currentPriority = job.currentPriority - newJob.prodSourceLabel = job.prodSourceLabel - newJob.prodUserID = job.prodUserID - newJob.computingSite = job.computingSite - newJob.transferType = job.transferType - newJob.sourceSite = job.sourceSite - newJob.destinationSite = job.destinationSite - newJob.jobParameters = job.jobParameters - if job.Files != []: - file = job.Files[0] - fileOL = FileSpec() - # add attempt nr - fileOL.lfn = re.sub("\.\d+$","",file.lfn) - fileOL.lfn = "%s.%d" % (fileOL.lfn,job.attemptNr) - fileOL.destinationDBlock = file.destinationDBlock - fileOL.destinationSE = file.destinationSE - fileOL.dataset = file.dataset - fileOL.type = 
file.type - newJob.addFile(fileOL) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True,ddmIDs,ddmAttempt,newJob - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("archiveJob : %s retry : %s" % (job.PandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("archiveJob : %s" % job.PandaID) - _logger.error("archiveJob : %s %s" % (type,value)) - return False,[],0,None - - - # overload of archiveJob - def archiveJobLite(self,pandaID,jobStatus,param): - comment = ' /* DBProxy.archiveJobLite */' - _logger.debug("archiveJobLite : %s" % pandaID) - sql1 = "SELECT %s FROM jobsActive4 " % JobSpec.columnNames() - sql1+= "WHERE PandaID=%s" - sql2 = "DELETE FROM jobsActive4 WHERE PandaID=%s" - sql3 = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames() - sql3+= JobSpec.valuesExpression() - nTry=3 - for iTry in range(nTry): - try: - # begin transaction - self.cur.execute("START TRANSACTION") - # select - self.cur.execute(sql1+comment, (pandaID,)) - res = self.cur.fetchall() - if len(res) == 0: - _logger.error("archiveJobLite() : PandaID %d not found" % pandaID) - self._rollback() - return False - job = JobSpec() - job.pack(res[0]) - job.jobStatus = jobStatus - for key in param.keys(): - if param[key] != None: - setattr(job,key,param[key]) - job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - job.endTime = job.modificationTime - job.stateChangeTime = job.modificationTime - # delete - n = self.cur.execute(sql2+comment, (job.PandaID,)) - if n==0: - # already killed - _logger.debug("archiveJobLite : Not found %s" % pandaID) - else: - # insert - self.cur.execute(sql3+comment, job.values()) - # update files - sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" - for file in job.Files: - self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) - # delete downstream jobs - if job.prodSourceLabel == 
'panda' and job.jobStatus == 'failed': - # file select - sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=%s" - self.cur.execute(sqlFile+comment, (job.PandaID,)) - resFs = self.cur.fetchall() - for resF in resFs: - file = FileSpec() - file.pack(resF) - job.addFile(file) - # look for outputs - upOutputs = [] - for file in job.Files: - if file.type == 'output': - upOutputs.append(file.lfn) - # look for downstream jobs - sqlD = "SELECT PandaID FROM filesTable4 WHERE type='input' AND lfn='%s' GROUP BY PandaID" - sqlDJS = "SELECT %s " % JobSpec.columnNames() - sqlDJS+= "FROM jobsDefined4 WHERE PandaID=%s" - sqlDJD = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s" - sqlDJI = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames() - sqlDJI+= JobSpec.valuesExpression() - for upFile in upOutputs: - _logger.debug("look for downstream jobs for %s" % upFile) - # select PandaID - self.cur.execute((sqlD+comment) % upFile) - res = self.cur.fetchall() - for downID in res: - _logger.debug("delete : %s" % downID) - # select jobs - self.cur.execute((sqlDJS+comment) % downID) - resJob = self.cur.fetchall() - if len(resJob) == 0: - continue - # instantiate JobSpec - dJob = JobSpec() - dJob.pack(resJob[0]) - # delete - retD = self.cur.execute((sqlDJD+comment) % downID) - if retD == 0: - continue - # error code - dJob.jobStatus = 'failed' - dJob.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - dJob.taskBufferErrorCode = ErrorCode.EC_Kill - dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed' - dJob.modificationTime = dJob.endTime - dJob.stateChangeTime = dJob.endTime - # insert - self.cur.execute((sqlDJI+comment), dJob.values()) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("archiveJobLite : %s retry : %s" % (pandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, 
value, traceBack = sys.exc_info() - _logger.error("archiveJobLite : %s %s" % (type,value)) - return False - - - # update Job status in jobsActive - def updateJobStatus(self,pandaID,jobStatus,param): - comment = ' /* DBProxy.updateJobStatus */' - _logger.debug("updateJobStatus : %s" % pandaID) - sql1 = "UPDATE jobsActive4 SET jobStatus=%s,modificationTime=UTC_TIMESTAMP()" - if jobStatus in ['starting']: - sql1 += ",stateChangeTime=UTC_TIMESTAMP()" - values = [jobStatus] - for key in param.keys(): - if param[key] != None: - sql1 = sql1 + (',%s=' % key) + '%s' - values.append(param[key]) - sql1 += " WHERE PandaID=%s" - values.append(pandaID) - nTry=3 - for iTry in range(nTry): - try: - # begin transaction - self.cur.execute("START TRANSACTION") - # update - self.cur.execute (sql1+comment,tuple(values)) - # get command - self.cur.execute ('SELECT commandToPilot,endTime FROM jobsActive4 WHERE PandaID=%s'+comment,(pandaID,)) - res = self.cur.fetchone() - if res != None: - ret = res[0] - # update endTime - endTime = res[1] - if jobStatus == 'holding' and endTime==None: - self.cur.execute ("UPDATE jobsActive4 SET endTime=UTC_TIMESTAMP() WHERE PandaID=%s"+comment,(pandaID,)) - else: - # already deleted - ret = 'tobekilled' - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return ret - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("updateJobStatus : %s retry : %s" % (pandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("updateJobStatus : %s %s" % (type,value)) - _logger.error("updateJobStatus : %s" % pandaID) - return False - - - # update job information in jobsActive or jobsDefined - def updateJob(self,job,inJobsDefined): - comment = ' /* DBProxy.updateJob */' - _logger.debug("updateJob : %s" % job.PandaID) - if inJobsDefined: - sql1 = "UPDATE jobsDefined4 SET %s " % JobSpec.updateExpression() - else: - sql1 = "UPDATE jobsActive4 SET %s " % 
JobSpec.updateExpression() - sql1+= "WHERE PandaID=%s" - if inJobsDefined: - sql1+= " AND (jobStatus='assigned' OR jobStatus='defined')" - nTry=3 - for iTry in range(nTry): - try: - job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - # set stateChangeTime for defined->assigned - if inJobsDefined: - job.stateChangeTime = job.modificationTime - # begin transaction - self.cur.execute("START TRANSACTION") - # update - n = self.cur.execute(sql1+comment, job.values()+(job.PandaID,)) - if n==0: - # already killed or activated - _logger.debug("updateJob : Not found %s" % job.PandaID) - else: - sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" - for file in job.Files: - self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("updateJob : %s retry : %s" % (job.PandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("updateJob : %s %s" % (type,value)) - return False - - - # retry analysis job - def retryJob(self,pandaID,param): - comment = ' /* DBProxy.retryJob */' - _logger.debug("retryJob : %s" % pandaID) - sql1 = "SELECT %s FROM jobsActive4 " % JobSpec.columnNames() - sql1+= "WHERE PandaID=%s" - sql2 = "UPDATE jobsActive4 SET %s " % JobSpec.updateExpression() - sql2+= "WHERE PandaID=%s" - nTry=3 - for iTry in range(nTry): - try: - retValue = False - # begin transaction - self.cur.execute("START TRANSACTION") - # select - self.cur.execute(sql1+comment, (pandaID,)) - res = self.cur.fetchall() - if len(res) == 0: - _logger.debug("retryJob() : PandaID %d not found" % pandaID) - self._rollback() - return retValue - job = JobSpec() - job.pack(res[0]) - # check if it's analysis job - if (((job.prodSourceLabel == 'user' or job.prodSourceLabel == 'panda') \ - and 
job.computingSite.startswith('ANALY_') and param.has_key('pilotErrorCode') \ - and param['pilotErrorCode'] in ['1200','1201'] and (not job.computingSite.startswith('ANALY_LONG_')) \ - and job.attemptNr < 2) or (job.prodSourceLabel == 'ddm' and job.cloud == 'CA' and job.attemptNr <= 10)) \ - and job.commandToPilot != 'tobekilled': - _logger.debug(' -> reset PandaID:%s #%s' % (job.PandaID,job.attemptNr)) - # reset job - job.jobStatus = 'activated' - job.startTime = None - job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - job.attemptNr = job.attemptNr + 1 - # send it to long queue for analysis jobs - oldComputingSite = job.computingSite - if job.computingSite.startswith('ANALY') and (not job.computingSite.startswith('ANALY_LONG_')): - longSite = re.sub('^ANALY_','ANALY_LONG_',job.computingSite) - longSite = re.sub('_\d+$','',longSite) - if longSite in PandaSiteIDs.keys(): - job.computingSite = longSite - # set destinationSE if queue is changed - if oldComputingSite == job.destinationSE: - job.destinationSE = job.computingSite - # select files - sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=%s AND (type='log' OR type='output')" - self.cur.execute(sqlFile+comment, (job.PandaID,)) - resFs = self.cur.fetchall() - for resF in resFs: - # set PandaID - file = FileSpec() - file.pack(resF) - job.addFile(file) - # set new GUID - if file.type == 'log': - file.GUID = commands.getoutput('uuidgen') - # append attemptNr to LFN - oldName = file.lfn - file.lfn = re.sub('\.\d+$','',file.lfn) - file.lfn = '%s.%s' % (file.lfn,job.attemptNr) - newName = file.lfn - # set destinationSE - if oldComputingSite == file.destinationSE: - file.destinationSE = job.computingSite - # modify jobParameters - sepPatt = "(\'|\"|%20)" + oldName + "(\'|\"|%20)" - matches = re.findall(sepPatt,job.jobParameters) - for match in matches: - oldPatt = match[0]+oldName+match[-1] - newPatt = match[0]+newName+match[-1] - job.jobParameters = 
re.sub(oldPatt,newPatt,job.jobParameters) - # update - sqlFup = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" - self.cur.execute(sqlFup+comment, file.values()+(file.rowID,)) - # update job - self.cur.execute(sql2+comment, job.values()+(job.PandaID,)) - # set return - retValue = True - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return retValue - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("retryJob : %s retry : %s" % (pandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - # error report - type, value, traceBack = sys.exc_info() - _logger.error("retryJob : %s %s" % (type,value)) - return False - - - # get jobs - def getJobs(self,nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, - atlasRelease,prodUserID): - comment = ' /* DBProxy.getJobs */' - dynamicBrokering = False - sql1 = "WHERE jobStatus=%s AND computingSite=%s AND commandToPilot<>'tobekilled' " - if not mem in [0,'0']: - sql1+= "AND (minRamCount<=%s OR minRamCount=0) " % mem - if not diskSpace in [0,'0']: - sql1+= "AND (maxDiskCount<%s OR maxDiskCount=0) " % diskSpace - if prodSourceLabel == 'user': - sql1+= "AND (prodSourceLabel='user' OR prodSourceLabel='panda') " - elif prodSourceLabel == 'ddm': - dynamicBrokering = True - sql1+= "AND prodSourceLabel='ddm' " - elif prodSourceLabel in [None,'managed']: - sql1+= "AND (prodSourceLabel='managed' OR prodSourceLabel='test') " - elif prodSourceLabel == 'software': - sql1+= "AND prodSourceLabel='software' " - elif prodSourceLabel == 'test' and computingElement != None: - dynamicBrokering = True - sql1+= "AND (computingElement='%s' OR computingElement='to.be.set' OR processingType='prod_test' OR prodSourceLabel='test') " % computingElement - else: - sql1+= "AND prodSourceLabel='%s' " % prodSourceLabel - # user ID - if prodUserID != None: - sql1+= "AND prodUserID='%s' " % prodUserID - sql2 = "SELECT %s FROM jobsActive4 " % 
JobSpec.columnNames() - sql2+= "WHERE PandaID=%s" - retJobs = [] - nSent = 0 - try: - timeLimit = datetime.timedelta(seconds=timeout-10) - timeStart = datetime.datetime.utcnow() - strName = datetime.datetime.isoformat(timeStart) - attLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) - attSQL = "AND ((creationTime<'%s' AND attemptNr>1) OR attemptNr<=1) " % attLimit.strftime('%Y-%m-%d %H:%M:%S') - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # get nJobs - for iJob in range(nJobs): - pandaID = 0 - # select channel for ddm jobs - if prodSourceLabel == 'ddm': - sqlDDM = "SELECT count(*),jobStatus,sourceSite,destinationSite,transferType FROM jobsActive4 WHERE computingSite=%s AND prodSourceLabel='ddm' " + attSQL + "GROUP BY jobStatus,sourceSite,destinationSite,transferType" - _logger.debug((sqlDDM+comment) % siteName) - self.cur.execute(sqlDDM+comment,(siteName,)) - resDDM = self.cur.fetchall() - # make a channel map - channelMap = {} - for tmp_count,tmp_jobStatus,tmp_sourceSite,tmp_destinationSite,tmp_transferType in resDDM: - # use source,dest,type as the key - channel = (tmp_sourceSite,tmp_destinationSite,tmp_transferType) - if not channelMap.has_key(channel): - channelMap[channel] = {} - # ignore holding - if tmp_jobStatus == 'holding': - continue - # distinguish activate from other stats - if tmp_jobStatus != 'activated': - tmp_jobStatus = 'others' - # append - if not channelMap[channel].has_key(tmp_jobStatus): - channelMap[channel][tmp_jobStatus] = int(tmp_count) - else: - channelMap[channel][tmp_jobStatus] += int(tmp_count) - _logger.debug(channelMap) - # choose channel - channels = channelMap.keys() - random.shuffle(channels) - foundChannel = False - for channel in channels: - # no activated jobs - if (not channelMap[channel].has_key('activated')) or channelMap[channel]['activated'] == 0: - continue - maxRunning = 10 - # prestaging job - if channel[0] == channel[1] and channel[2] == 'dis': - maxRunning = 50 - if (not 
channelMap[channel].has_key('others')) or channelMap[channel]['others'] < maxRunning: - # set SQL - sql1+= "AND sourceSite='%s' AND destinationSite='%s' AND transferType='%s' " \ - % channel - foundChannel = True - break - # no proper channel - if not foundChannel: - _logger.debug("getJobs : no DDM jobs for Site %s" % siteName) - break - # get job - if prodSourceLabel in ['ddm']: - # to add some delay for attempts - sql1 += attSQL - nTry=1 - for iTry in range(nTry): - # set siteID - tmpSiteID = siteName - if siteName.startswith('ANALY_BNL_ATLAS'): - tmpSiteID = 'ANALY_BNL_ATLAS_1' - # get file lock - _logger.debug("getJobs : %s -> lock" % strName) - if (datetime.datetime.utcnow() - timeStart) < timeLimit: - toGetPandaIDs = True - pandaIDs = [] - # get max priority for analysis jobs - if prodSourceLabel in ['panda','user']: - sqlMX = "SELECT MAX(currentPriority) FROM jobsActive4 " - sqlMX+= sql1 - _logger.debug((sqlMX+comment) % ("activated",tmpSiteID)) - self.cur.execute(sqlMX+comment, ("activated",tmpSiteID)) - tmpPriority, = self.cur.fetchone() - # no jobs - if tmpPriority == None: - toGetPandaIDs = False - else: - # set priority - sql1 += "AND currentPriority=%s" % tmpPriority - if toGetPandaIDs: - # get PandaIDs - sqlP = "SELECT PandaID,currentPriority FROM jobsActive4 " - sqlP+= sql1 - _logger.debug((sqlP+comment) % ("activated",tmpSiteID)) - self.cur.execute(sqlP+comment, ("activated",tmpSiteID)) - resIDs = self.cur.fetchall() - maxCurrentPriority = None - # get max priority and min PandaID - for tmpPandaID,tmpCurrentPriority in resIDs: - if maxCurrentPriority==None or maxCurrentPriority < tmpCurrentPriority: - maxCurrentPriority = tmpCurrentPriority - pandaIDs = [tmpPandaID] - elif maxCurrentPriority == tmpCurrentPriority: - pandaIDs.append(tmpPandaID) - # sort - pandaIDs.sort() - if pandaIDs == []: - _logger.debug("getJobs : %s -> no PandaIDs" % strName) - retU = 0 - else: - # get nSent for production jobs - if prodSourceLabel in [None,'managed']: - 
sentLimit = timeStart - datetime.timedelta(seconds=60) - sqlSent = "SELECT count(*) FROM jobsActive4 WHERE jobStatus='sent' " - sqlSent += "AND prodSourceLabel IN ('managed','test') " - sqlSent += "AND computingSite='%s' " % tmpSiteID - sqlSent += "AND modificationTime>'%s' " % sentLimit.strftime('%Y-%m-%d %H:%M:%S') - self.cur.execute(sqlSent+comment) - resSent = self.cur.fetchone() - if resSent != None: - nSent, = resSent - # update - for indexID,tmpPandaID in enumerate(pandaIDs): - # max attempts - if indexID > 10: - break - # update - sqlJ = "UPDATE jobsActive4 " - sqlJ+= "SET jobStatus=%s,modificationTime=UTC_TIMESTAMP(),modificationHost=%s,startTime=UTC_TIMESTAMP()" - # set CE - if computingElement != None: - sqlJ+= ",computingElement='%s'" % computingElement - sqlJ+= " WHERE PandaID=%s AND jobStatus=%s" - _logger.debug((sqlJ+comment) % ("sent",node,tmpPandaID,"activated")) - retU = self.cur.execute(sqlJ+comment,("sent",node,tmpPandaID,"activated")) - # succeeded - if retU != 0: - pandaID = tmpPandaID - # increment nSent - if prodSourceLabel in [None,'managed']: - nSent += (indexID+1) - break - else: - _logger.debug("getJobs : %s -> do nothing" % strName) - retU = 0 - # release file lock - _logger.debug("getJobs : %s -> unlock" % strName) - # succeeded - if retU != 0: - break - if iTry+1 < nTry: - #time.sleep(0.5) - pass - # failed to UPDATE - if retU == 0: - # reset pandaID - pandaID = 0 - _logger.debug("getJobs : Site %s : retU %s : PandaID %s - %s" - % (siteName,retU,pandaID,prodSourceLabel)) - if pandaID == 0: - break - # select - self.cur.execute(sql2+comment, (pandaID,)) - res = self.cur.fetchone() - if len(res) == 0: - break - # instantiate Job - job = JobSpec() - job.pack(res) - # Files - sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=%s" - self.cur.execute(sqlFile+comment, (job.PandaID,)) - resFs = self.cur.fetchall() - for resF in resFs: - file = FileSpec() - file.pack(resF) - job.addFile(file) - # append 
- retJobs.append(job) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return retJobs,nSent - except: - # roll back - self._rollback() - # error report - type, value, traceBack = sys.exc_info() - _logger.error("getJobs : %s %s" % (type,value)) - return [],0 - - - # reset job in jobsActive or jobsWaiting - def resetJob(self,pandaID,activeTable=True,keepSite=False): - comment = ' /* DBProxy.resetJob */' - _logger.debug("resetJobs : %s" % pandaID) - # select table - table = 'jobsWaiting4' - if activeTable: - table = 'jobsActive4' - sql1 = "SELECT %s FROM %s " % (JobSpec.columnNames(),table) - sql1+= "WHERE PandaID=%s" - sql2 = "DELETE FROM %s " % table - sql2+= "WHERE PandaID=%s AND (jobStatus='waiting' OR jobStatus='activated')" - sql3 = "INSERT INTO jobsDefined4 (%s) " % JobSpec.columnNames() - sql3+= JobSpec.valuesExpression() - try: - # transaction causes Request ndbd time-out in jobsActive4 - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute(sql1+comment,(pandaID,)) - res = self.cur.fetchone() - # not found - if res == None: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return None - # instantiate Job - job = JobSpec() - job.pack(res) - # if already running - if job.jobStatus != 'waiting' and job.jobStatus != 'activated': - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return None - # delete - retD = self.cur.execute(sql2+comment,(pandaID,)) - # delete failed - _logger.debug("resetJobs : retD = %s" % retD) - if retD != 1: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return None - # delete from jobsDefined4 just in case - sqlD = "DELETE FROM jobsDefined4 WHERE PandaID=%s" - self.cur.execute(sqlD+comment,(pandaID,)) - # increase priority - if job.jobStatus == 'activated' and job.currentPriority < 100: - job.currentPriority = 100 - # reset computing site and dispatchDBlocks - job.jobStatus = 'defined' - 
job.dispatchDBlock = None - # erase old assignment - if (not keepSite) and job.relocationFlag != 1: - job.computingSite = None - job.computingElement = None - # host and time information - job.modificationHost = self.hostname - job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - job.stateChangeTime = job.modificationTime - # insert - self.cur.execute(sql3+comment, job.values()) - # Files - sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=%s" - self.cur.execute(sqlFile+comment, (job.PandaID,)) - resFs = self.cur.fetchall() - for resF in resFs: - file = FileSpec() - file.pack(resF) - # reset GUID to trigger LRC/LFC scanning - if file.status == 'missing': - file.GUID = None - # reset status, destinationDBlock and dispatchDBlock - file.status ='unknown' - file.dispatchDBlock = None - file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock) - # add file - job.addFile(file) - # update files - sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" - self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return job - except: - # roll back - self._rollback() - # error report - type, value, traceBack = sys.exc_info() - _logger.error("resetJobs : %s %s" % (type,value)) - _logger.error("resetJobs : %s" % pandaID) - return None - - - # reset jobs in jobsDefined - def resetDefinedJob(self,pandaID,keepSite=False): - comment = ' /* DBProxy.resetDefinedJob */' - _logger.debug("resetDefinedJob : %s" % pandaID) - sql1 = "UPDATE jobsDefined4 SET " - sql1 += "jobStatus='defined'," - sql1 += "modificationTime=UTC_TIMESTAMP()," - sql1 += "dispatchDBlock=NULL," - sql1 += "computingElement=NULL" - sql1 += " WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')" - sql2 = "SELECT %s FROM jobsDefined4 " % JobSpec.columnNames() - sql2+= "WHERE PandaID=%s" - try: - # begin transaction - 
self.cur.execute("START TRANSACTION") - # update - retU = self.cur.execute(sql1+comment,(pandaID,)) - # not found - job = None - if retU == 0: - _logger.debug("resetDefinedJob : Not found %s" % pandaID) - else: - # select - self.cur.execute(sql2+comment,(pandaID,)) - res = self.cur.fetchone() - # not found - if res == None: - raise RuntimeError, 'Could not SELECT : PandaID=%s' % pandaID - # instantiate Job - job = JobSpec() - job.pack(res) - job.dispatchDBlock = None - if (not keepSite) and job.relocationFlag != 1: - # erase old assignment - job.computingSite = None - job.computingElement = None - # Files - sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=%s" - self.cur.execute(sqlFile+comment, (job.PandaID,)) - resFs = self.cur.fetchall() - for resF in resFs: - file = FileSpec() - file.pack(resF) - # reset status, destinationDBlock and dispatchDBlock - file.status ='unknown' - file.dispatchDBlock = None - file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock) - # add file - job.addFile(file) - # update files - sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" - self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return job - except: - # error report - type, value, traceBack = sys.exc_info() - _logger.error("resetDefinedJobs : %s %s" % (type,value)) - #_logger.error(traceback.format_exc()) - # roll back - self._rollback() - return None - - - # kill job - def killJob(self,pandaID,user,code,prodManager): - comment = ' /* DBProxy.killJob */' - _logger.debug("killJob : %s %s %s %s" % (code,pandaID,prodManager,user)) - # check PandaID - try: - long(pandaID) - except: - _logger.error("not an integer : %s" % pandaID) - return False - sql0 = "SELECT prodUserID FROM %s WHERE PandaID=%s" - sql1 = "UPDATE %s SET commandToPilot='tobekilled' WHERE PandaID=%s AND commandToPilot<>'tobekilled'" - sql2 = 
"SELECT %s " % JobSpec.columnNames() - sql2+= "FROM %s WHERE PandaID=%s AND jobStatus<>'running'" - sql3 = "DELETE FROM %s WHERE PandaID=%s" - sqlU = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')" - sql4 = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames() - sql4+= JobSpec.valuesExpression() - try: - flagCommand = False - flagKilled = False - # begin transaction - self.cur.execute("START TRANSACTION") - for table in ('jobsDefined4','jobsActive4','jobsWaiting4'): - # get DN if user is not production DN - if (not prodManager) and (not user.startswith('/DC=org/DC=doegrids/OU=People/CN=Nurcan Ozturk')) \ - and (not user.startswith('/DC=org/DC=doegrids/OU=People/CN=Torre Wenaus')): - self.cur.execute((sql0+comment) % (table,pandaID)) - res = self.cur.fetchone() - # not found - if res == None: - continue - # owner? - def getCN(dn): - distinguishedName = '' - for line in dn.split('/'): - if line.startswith('CN='): - distinguishedName = re.sub('^CN=','',line) - distinguishedName = re.sub('\d+$','',distinguishedName) - distinguishedName = distinguishedName.strip() - break - return distinguishedName - cn1 = getCN(res[0]) - cn2 = getCN(user) - _logger.debug("Owner:%s - Requester:%s " % (cn1,cn2)) - if cn1 != cn2: - _logger.debug("ignore killJob -> Owner != Requester") - break - # update - retU = self.cur.execute((sql1+comment) % (table,pandaID)) - if retU == 0: - continue - # set flag - flagCommand = True - # select - self.cur.execute((sql2+comment) % (table,pandaID)) - res = self.cur.fetchall() - if len(res) == 0: - continue - # instantiate JobSpec - job = JobSpec() - job.pack(res[0]) - # delete - if table=='jobsDefined4': - retD = self.cur.execute((sqlU+comment) % (pandaID,)) - else: - retD = self.cur.execute((sql3+comment) % (table,pandaID)) - if retD == 0: - continue - # error code - job.jobStatus = 'failed' - job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - job.modificationTime = 
job.endTime - job.stateChangeTime = job.modificationTime - if code in ['2','4']: - # expire - if code == '2': - job.taskBufferErrorCode = ErrorCode.EC_Expire - job.taskBufferErrorDiag = 'expired after 7 days since submission' - else: - # waiting timeout - job.taskBufferErrorCode = ErrorCode.EC_Expire - #job.taskBufferErrorCode = ErrorCode.EC_WaitTimeout - job.taskBufferErrorDiag = 'expired after waiting for input data for 2 days' - elif code=='3': - # aborted - job.taskBufferErrorCode = ErrorCode.EC_Aborted - job.taskBufferErrorDiag = 'aborted by ExtIF' - else: - # killed - job.taskBufferErrorCode = ErrorCode.EC_Kill - job.taskBufferErrorDiag = 'killed by %s' % user - # insert - self.cur.execute(sql4+comment, job.values()) - flagKilled = True - break - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("killJob : com=%s kill=%s " % (flagCommand,flagKilled)) - return (flagCommand or flagKilled) - except: - type, value, traceBack = sys.exc_info() - _logger.error("killJob : %s %s" % (type,value)) - # roll back - self._rollback() - return False - - - # peek at job - def peekJob(self,pandaID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal=False): - comment = ' /* DBProxy.peekJob */' - _logger.debug("peekJob : %s" % pandaID) - # return None for NULL PandaID - if pandaID in ['NULL','','None',None]: - return None - sql1_0 = "SELECT %s FROM %s " - sql1_1 = "WHERE PandaID=%s" - try: - tables=[] - if fromActive: - tables.append('jobsActive4') - if fromArchived: - tables.append('jobsArchived4') - if fromWaiting: - tables.append('jobsWaiting4') - if fromDefined: - # defined needs to be the last one due to InnoDB's auto_increment - tables.append('jobsDefined4') - # select - for table in tables: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1 - self.cur.execute(sql+comment, (pandaID,)) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise 
RuntimeError, 'Commit error' - if len(res) != 0: - # Job - job = JobSpec() - job.pack(res[0]) - # Files - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=%s" - self.cur.execute(sqlFile+comment, (job.PandaID,)) - resFs = self.cur.fetchall() - # metadata - if table == 'jobsArchived4' and (not forAnal): - # read metadata only for finished/failed jobs - sqlMeta = "SELECT metaData FROM metaTable WHERE PandaID=%s" - self.cur.execute(sqlMeta+comment, (job.PandaID,)) - resMeta = self.cur.fetchone() - else: - resMeta = None - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # set files - for resF in resFs: - file = FileSpec() - file.pack(resF) - job.addFile(file) - # set metadata - if resMeta != None: - job.metadata = resMeta[0] - return job - _logger.debug("peekJob() : PandaID %s not found" % pandaID) - return None - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("peekJob : %s %s" % (type,value)) - # return None for analysis - if forAnal: - return None - # return 'unknown' - job = JobSpec() - job.PandaID = pandaID - job.jobStatus = 'unknown' - return job - - - # get JobIDs in a time range - def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs): - comment = ' /* DBProxy.getJobIDsInTimeRange */' - _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) - try: - tables = ['jobsArchived4','jobsActive4','jobsWaiting4','jobsDefined4'] - # select - for table in tables: - # make sql - sql = "SELECT jobDefinitionID FROM %s " % table - sql += "WHERE prodUserID=%s AND modificationTime>%s AND prodSourceLabel='user'" - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - _logger.debug(sql+comment+str((dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))) - self.cur.execute(sql+comment, (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) - resList = 
self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for tmpID, in resList: - if not tmpID in retJobIDs: - retJobIDs.append(tmpID) - _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs)) - return retJobIDs - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getJobIDsInTimeRange : %s %s" % (type,value)) - # return empty list - return [] - - - # get PandaIDs for a JobID - def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs): - comment = ' /* DBProxy.getPandIDsWithJobID */' - _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID)) - try: - tables = ['jobsArchived4','jobsActive4','jobsWaiting4','jobsDefined4'] - # select - for table in tables: - # skip if all jobs have already been gotten - if nJobs > 0 and len(idStatus) >= nJobs: - continue - # make sql - sql = "SELECT PandaID,jobStatus,commandToPilot FROM %s " % table - sql += "WHERE prodUserID=%s AND jobDefinitionID=%s " - sql += "AND prodSourceLabel in ('user','panda') " - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - _logger.debug(sql+comment+str((dn,jobID))) - self.cur.execute(sql+comment, (dn,jobID)) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for tmpID,tmpStatus,tmpCommand in resList: - if not idStatus.has_key(tmpID): - idStatus[tmpID] = (tmpStatus,tmpCommand) - _logger.debug("getPandIDsWithJobID : %s" % str(idStatus)) - return idStatus - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getPandIDsWithJobID : %s %s" % (type,value)) - # return empty list - return {} - - - # query PandaID - def queryPandaID(self,jobDefID): - comment = ' /* DBProxy.queryPandaID */' - _logger.debug("queryPandaID : %s" % jobDefID) - sql0 = "SELECT PandaID,attemptNr FROM %s WHERE attemptNr=(" - sql0+= "SELECT MAX(attemptNr) FROM %s" - sql1= " WHERE prodSourceLabel=%s AND 
jobDefinitionID=%s) AND prodSourceLabel=%s AND jobDefinitionID=%s" - try: - ids = [] - # select - for table in ['jobsDefined4','jobsActive4','jobsArchived4','jobsWaiting4']: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = sql0 % (table,table) + sql1 - self.cur.execute(sql+comment, ('managed',jobDefID,'managed',jobDefID)) - res = self.cur.fetchall() - ids += list(res) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # look for the latest attempt - preAtt =-1 - pandaID=None - for pID,att in ids: - if att > preAtt: - pandaID = pID - preAtt = att - if att == preAtt: - if pandaID < pID: - pandaID = pID - return pandaID - except: - type, value, traceBack = sys.exc_info() - _logger.error("queryPandaID : %s %s" % (type,value)) - # roll back - self._rollback() - return None - - - # query job info per cloud - def queryJobInfoPerCloud(self,cloud,schedulerID=None): - comment = ' /* DBProxy.queryJobInfoPerCloud */' - _logger.debug("queryJobInfoPerCloud : %s %s" % (cloud,schedulerID)) - attrs = ['PandaID','jobStatus','jobName'] - sql0 = "SELECT " - for attr in attrs: - sql0 += "%s," % attr - sql0 = "%s " % sql0[:-1] - sql0+= "FROM %s " - sql0+= "WHERE cloud='%s' " % cloud - if schedulerID != None: - sql0+= "AND schedulerID='%s' " % schedulerID - try: - ids = [] - returnList = [] - # select - for table in ['jobsActive4','jobsWaiting4','jobsDefined4']: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = sql0 % table - self.cur.execute(sql+comment) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # loop over all - for res in resList: - valMap = {} - # skip if already in the list - PandaID = res[0] - if PandaID in ids: - continue - # convert to map - for idx,attr in enumerate(attrs): - valMap[attr] = res[idx] - # append to list - ids.append(PandaID) - returnList.append(valMap) - # return - return returnList - except: - type, value, traceBack 
= sys.exc_info() - _logger.error("queryJobInfoPerCloud : %s %s" % (type,value)) - # roll back - self._rollback() - return None - - - # get PandaIDs at Site - def getPandaIDsSite(self,site,status,limit): - comment = ' /* DBProxy.getPandaIDsSite */' - _logger.debug("getPandaIDsSite : %s %s %s" % (site,status,limit)) - try: - ids = [] - # find table - if status in ['defined','assigned']: - table = 'jobsDefined4' - elif status in ['activated','running','holding','trasnferring']: - table = 'jobsActive4' - elif status in ['waiting']: - table = 'jobsWaiting4' - elif status in ['finished','failed']: - table = 'jobsArchived4' - else: - _logger.error("unknown status:%s" % status) - return ids - # limit - limit = int(limit) - # SQL - sql = "SELECT PandaID FROM %s " % table - sql += "WHERE computingSite=%s AND jobStatus=%s AND prodSourceLabel=%s " - sql += "LIMIT %d" % limit - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute(sql+comment, (site,status,'managed')) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # convert to list - for id, in res: - ids.append(id) - return ids - except: - type, value, traceBack = sys.exc_info() - _logger.error("getPandaIDsSite : %s %s" % (type,value)) - # roll back - self._rollback() - return [] - - - # get PandaIDs to be updated in prodDB - def getPandaIDsForProdDB(self,limit,lockedby): - comment = ' /* DBProxy.getPandaIDsForProdDB */' - _logger.debug("getPandaIDsForProdDB %s" % limit) - sql0 = "SELECT PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID FROM %s " - sql0+= "WHERE prodSourceLabel IN ('managed','rc_test') AND lockedby='%s' " % lockedby - sql0+= "AND stateChangeTime>prodDBUpdateTime AND stateChangeTime<>'0000-00-00 00:00:00'" - try: - retMap = {} - totalIDs = 0 - # select - for table in ['jobsArchived4','jobsActive4','jobsWaiting4','jobsDefined4']: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # 
select - sql = sql0 % table - self.cur.execute(sql+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - for PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID in res: - # ignore dummy jobs in jobsDefined4 - if table == 'jobsDefined4' and (not jobStatus in ['defined','assigned']): - continue - # add status - if not retMap.has_key(jobStatus): - retMap[jobStatus] = [] - # append - retMap[jobStatus].append({'PandaID':PandaID,'attemptNr':attemptNr, - 'stateChangeTime':stateChangeTime.strftime('%Y-%m-%d %H:%M:%S'), - 'jobDefinitionID':jobDefinitionID, - 'jobExecutionID':jobExecutionID}) - totalIDs += 1 - # limit - if totalIDs > limit: - break - _logger.debug("getPandaIDsForProdDB %s ret->%s" % (limit,totalIDs)) - return retMap - except: - type, value, traceBack = sys.exc_info() - _logger.error("getPandaIDsForProdDB : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # update prodDBUpdateTime - def updateProdDBUpdateTime(self,param): - comment = ' /* DBProxy.updateProdDBUpdateTime */' - _logger.debug("updateProdDBUpdateTime %s" % str(param)) - sql0 = "UPDATE %s " - sql0+= "SET prodDBUpdateTime='%s' " % param['stateChangeTime'] - sql0+= "WHERE PandaID=%s AND jobStatus='%s' AND stateChangeTime='%s'" % (param['PandaID'], - param['jobStatus'], - param['stateChangeTime']) - try: - if param['jobStatus'] in ['defined','assigned']: - table = 'jobsDefined4' - elif param['jobStatus'] in ['waiting']: - table = 'jobsWaiting4' - elif param['jobStatus'] in ['activated','sent','starting','running','holding','transferring']: - table = 'jobsActive4' - elif param['jobStatus'] in ['finished','failed']: - table = 'jobsArchived4' - else: - _logger.error("invalid status %s" % param['jobStatus']) - return False - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # update - sql = sql0 % table - _logger.debug(sql) - retU = self.cur.execute(sql+comment) - 
_logger.debug("updateProdDBUpdateTime %s ret=%s" % (param['PandaID'],retU)) - if retU == 1: - return True - return False - except: - type, value, traceBack = sys.exc_info() - _logger.error("updateProdDBUpdateTime : %s %s" % (type,value)) - # roll back - self._rollback() - return False - - - # add metadata - def addMetadata(self,pandaID,metadata): - comment = ' /* DBProxy.addMetaData */' - _logger.debug("addMetaData : %s" % pandaID) - sql0 = "SELECT PandaID FROM metaTable WHERE PandaID=%s" - sql1 = "INSERT INTO metaTable (PandaID,metaData) VALUE (%s,%s)" - nTry=3 - for iTry in range(nTry): - try: - # autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute(sql0+comment, (pandaID,)) - res = self.cur.fetchone() - # already exist - if res != None: - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - # insert - self.cur.execute(sql1+comment, (pandaID,metadata)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("addMetaData : %s retry : %s" % (pandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("addMetaData : %s %s" % (type,value)) - return False - - - # insert dataset - def insertDataset(self,dataset,tablename="Datasets"): - comment = ' /* DBProxy.insertDataset */' - _logger.debug("insertDataset(%s)" % dataset.name) - sql1 = "INSERT INTO %s " % tablename - sql1+= "(%s) " % DatasetSpec.columnNames() - sql1+= DatasetSpec.valuesExpression() - # time information - dataset.creationdate = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - dataset.modificationdate = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - try: - # get file lock - #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_EX) - # begin transaction - self.cur.execute("START TRANSACTION") - # avoid duplication - self.cur.execute("SELECT vuid FROM "+tablename+" WHERE vuid=%s"+comment, 
(dataset.vuid,)) - res = self.cur.fetchall() - if len(res) == 0: - # insert - self.cur.execute(sql1+comment, dataset.values()) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # release file lock - #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN) - return True - except: - # roll back - self._rollback() - # release file lock - #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN) - # error - type, value, traceBack = sys.exc_info() - _logger.error("insertDataset() : %s %s" % (type,value)) - return False - - - # query dataset with map - def queryDatasetWithMap(self,map): - comment = ' /* DBProxy.queryDatasetWithMap */' - _logger.debug("queryDatasetWithMap(%s)" % map) - sql1 = "SELECT %s FROM Datasets" % DatasetSpec.columnNames() - valueL = [] - for key in map.keys(): - if len(valueL)==0: - sql1+= " WHERE %s=" % key - else: - sql1+= " AND %s=" % key - sql1+= "%s" - valueL.append(map[key]) - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - nTry=5 - for iTry in range(nTry): - retS = self.cur.execute(sql1+comment, tuple(valueL)) - res = self.cur.fetchall() - if retS>=0 and res != None and retS==len(res): - break - if iTry+1 < nTry: - _logger.debug("queryDatasetWithMap : retS %s retry : %s" % (retS,iTry)) - time.sleep(random.randint(10,20)) - _logger.debug("queryDatasetWithMap(%s) : retS %s ret %s" % (str(map),retS,str(res))) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # instantiate Dataset - if res != None and len(res) != 0: - dataset = DatasetSpec() - dataset.pack(res[0]) - return dataset - _logger.error("queryDatasetWithMap(%s) : dataset not found" % map) - return None - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("queryDatasetWithMap(%s) : %s %s" % (map,type,value)) - return None - - - # update dataset - def updateDataset(self,datasets,withLock,withCriteria): - comment = ' /* DBProxy.updateDataset */' - 
_logger.debug("updateDataset()") - sql1 = "UPDATE Datasets SET %s " % DatasetSpec.updateExpression() - sql1+= "WHERE vuid=%s" - if withCriteria != "": - sql1+= " AND %s" % withCriteria - nTry=3 - for iTry in range(nTry): - try: - # get file lock - if withLock: - fcntl.flock(_lockSetDS.fileno(), fcntl.LOCK_EX) - retList = [] - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - for dataset in datasets: - _logger.debug("updateDataset(%s,%s)" % (dataset.name,dataset.status)) - # time information - dataset.modificationdate = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) - # update - retU = self.cur.execute(sql1+comment, dataset.values()+(dataset.vuid,)) - if retU != 0 and retU != 1: - raise RuntimeError, 'Invalid retrun %s' % retU - retList.append(retU) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # release file lock - if withLock: - fcntl.flock(_lockSetDS.fileno(), fcntl.LOCK_UN) - _logger.debug("updateDataset() ret:%s" % retList) - return retList - except: - # roll back - self._rollback() - # release file lock - if withLock: - fcntl.flock(_lockSetDS.fileno(), fcntl.LOCK_UN) - if iTry+1 < nTry: - _logger.debug("updateDataset : retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("updateDataset() : %s %s" % (type,value)) - return [] - - - # delete dataset - def deleteDataset(self,name): - comment = ' /* DBProxy.deleteDataset */' - sql1 = "DELETE FROM Datasets WHERE name=%s" - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # delete - self.cur.execute(sql1+comment,(name,)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("deleteDataset() : %s %s" % (type,value)) - return False - - - # get serial number for dataset, insert dummy datasets to increment SN - def getSerialNumber(self,datasetname): 
- comment = ' /* DBProxy.getSerialNumber */' - try: - _logger.debug("getSerialNumber(%s)" % datasetname) - # get file lock - #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_EX) - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = "SELECT COUNT(*) FROM Datasets WHERE type='output' AND name='%s'" % datasetname - nTry=3 - for iTry in range(nTry): - retS = self.cur.execute(sql+comment) - res = self.cur.fetchone() - _logger.debug("getSerialNumber : retS %s, res %s" % (retS,res)) - if retS>=0 and res != None: - break - if iTry+1 < nTry: - time.sleep(random.randint(10,20)) - # fresh dataset or not - if res != None and len(res) != 0 and res[0] > 0: - freshFlag = False - else: - freshFlag = True - # get serial number - sql = "INSERT INTO subCounter (subID) VALUES ('NULL')" - self.cur.execute(sql+comment) - sn = self.conn.insert_id() - # delete. '<' is needed for auto_incr of InnoDB - sql = "DELETE FROM subCounter where subID<%s" % sn - self.cur.execute(sql+comment) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # release file lock - #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN) - _logger.debug("getSerialNumber : %s %s" % (sn,freshFlag)) - return (sn,freshFlag) - except: - # roll back - self._rollback() - # release file lock - #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN) - # error - type, value, traceBack = sys.exc_info() - _logger.error("getSerialNumber() : %s %s" % (type,value)) - return (-1,False) - - - # update transfer status for a dataset - def updateTransferStatus(self,datasetname,bitMap): - comment = ' /* DBProxy.updateTransferStatus */' - try: - _logger.debug("updateTransferStatus(%s,%s)" % (datasetname,hex(bitMap))) - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - retTransSt = 0 - # update bitmap - sqlU = "UPDATE Datasets SET transferStatus=transferStatus|%s WHERE name='%s'" % (bitMap,datasetname) - retU = self.cur.execute(sqlU+comment) - # get transferStatus - sqlS = "SELECT transferStatus 
from Datasets WHERE name='%s'" % datasetname - retS = self.cur.execute(sqlS+comment) - resS = self.cur.fetchall() - if resS != None and len(resS) != 0: - retTransSt = resS[0][0] - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("updateTransferStatus : %s" % hex(retTransSt)) - return retTransSt - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("updateTransferStatus : %s %s" % (type,value)) - return 0 - - - # get CloudTask. If not exist, create it - def getCloudTask(self,tid): - comment = ' /* getCloudTask */' - try: - _logger.debug("getCloudTask(%s)" % tid) - # check tid - if tid in [None,'NULL']: - _logger.error("invalid TID : %s" % tid) - return None - # get file lock - fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_EX) - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = "SELECT %s FROM cloudtasks " % CloudTaskSpec.columnNames() - sql += "WHERE taskid=%s" % tid - nTry=5 - for iTry in range(nTry): - retS = self.cur.execute(sql+comment) - res = self.cur.fetchall() - _logger.debug("getCloudTask : retS %s" % retS) - if retS>=0 and res != None and retS==len(res): - break - if iTry+1 < nTry: - time.sleep(random.randint(10,20)) - # already exist - if res != None and len(res) != 0: - # instantiate CloudTask - cloudTask = CloudTaskSpec() - cloudTask.pack(res[0]) - # update tmod if status <> 'assigned' - if cloudTask.status <> 'assigned': - sql = "UPDATE cloudtasks SET tmod=UTC_TIMESTAMP() WHERE taskid=%s" % cloudTask.taskid - self.cur.execute(sql+comment) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # release file lock - fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_UN) - _logger.debug("return existing CloudTask") - return cloudTask - # insert new CloudTask - _logger.debug("insert new CloudTask") - cloudTask = CloudTaskSpec() - cloudTask.taskid = tid - cloudTask.status = 'defined' - sql = "INSERT INTO cloudtasks 
(taskid,status,tmod,tenter) VALUES(%s,%s,UTC_TIMESTAMP(),UTC_TIMESTAMP())" - self.cur.execute(sql+comment,(cloudTask.taskid,cloudTask.status)) - # get id - cloudTask.id = self.conn.insert_id() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # release file lock - fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_UN) - _logger.debug("return new CloudTask") - return cloudTask - except: - # roll back - self._rollback() - # release file lock - fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_UN) - # error - type, value, traceBack = sys.exc_info() - _logger.error("getCloudTask() : %s %s" % (type,value)) - return None - - - # set cloud to CloudTask - def setCloudTask(self,cloudTask): - comment = ' /* setCloudTask */' - try: - _logger.debug("setCloudTask(id=%s,taskid=%s)" % (cloudTask.id,cloudTask.taskid)) - sql = "UPDATE cloudtasks SET cloud=%s,status=%s,tmod=UTC_TIMESTAMP() WHERE id=%s AND status='defined'" - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # update - retU = self.cur.execute(sql+comment,(cloudTask.cloud,'assigned',cloudTask.id)) - # succeeded - if retU != 0: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return cloudTask - # read if it is already set by another thread - sql = "SELECT %s FROM cloudtasks " % CloudTaskSpec.columnNames() - sql += "WHERE id=%s" % cloudTask.id - # select - retS = self.cur.execute(sql+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # retrun CloudTask - if res != None and len(res) != 0: - # instantiate CloudTask - cloudTask = CloudTaskSpec() - cloudTask.pack(res[0]) - return cloudTask - _logger.error("setCloudTask() : cannot find CloudTask for %s" % cloudTask.id) - return None - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("setCloudTask() : %s %s" % (type,value)) - return None - - - # see CloudTask - def seeCloudTask(self,tid): - comment = ' /* 
seeCloudTask */' - try: - _logger.debug("seeCloudTask(%s)" % tid) - # check tid - if tid in [None,'NULL']: - _logger.error("invalid TID : %s" % tid) - return None - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = "SELECT cloud FROM cloudtasks WHERE taskid=%s" % tid - nTry=5 - for iTry in range(nTry): - retS = self.cur.execute(sql+comment) - res = self.cur.fetchall() - _logger.debug("seeCloudTask : retS %s" % retS) - if retS>=0 and res != None and retS==len(res): - break - if iTry+1 < nTry: - time.sleep(random.randint(10,20)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # existing task - if res != None and len(res) != 0: - # return cloud - return res[0][0] - else: - return None - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("seeCloudTask() : %s %s" % (type,value)) - return None - - - # get assigning task - def getAssigningTask(self): - comment = ' /* getAssigningTask */' - try: - _logger.debug("getAssigningTask") - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = "SELECT taskid FROM cloudtasks WHERE status<>'assigned' AND tmod>'%s'" % timeLimit.strftime('%Y-%m-%d %H:%M:%S') - self.cur.execute(sql+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # loop over all taskid - retList = [] - if res != None: - for tid, in res: - retList.append(tid) - # return - _logger.debug("getAssigningTask ret:%s" % retList) - return retList - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getAssigningTask : %s %s" % (type,value)) - return [] - - - # query files with map - def queryFilesWithMap(self,map): - comment = ' /* DBProxy.queryFilesWithMap */' - _logger.debug("queryFilesWithMap()") - sql1 = "SELECT PandaID,%s FROM filesTable4" % 
FileSpec.columnNames() - valueL = [] - for key in map.keys(): - if len(valueL)==0: - sql1+= " WHERE %s=" % key - else: - sql1+= " AND %s=" % key - sql1+= "%s" - valueL.append(map[key]) - nTry=3 - for iTry in range(nTry): - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute(sql1+comment, tuple(valueL)) - res = self.cur.fetchall() - _logger.debug("queryFilesWithMap() : %s" % str(res)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # instantiate files - retList = [] - for item in res: - # instantiate dummy JobSpec obj for PandaID - job = JobSpec() - job.PandaID = item[0] - # instantiate file - file = FileSpec() - file.pack(item[1:]) - # set owner - file.setOwner(job) - # append - retList.append(file) - return retList - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("queryFilesWithMap retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("queryFilesWithMap : %s %s" % (type,value)) - return [] - - - # count the number of files with map - def countFilesWithMap(self,map): - comment = ' /* DBProxy.countFilesWithMap */' - sql1 = "SELECT COUNT(*) FROM filesTable4" - valueL = [] - for key in map.keys(): - if len(valueL)==0: - sql1+= " WHERE %s=" % key - else: - sql1+= " AND %s=" % key - sql1+= "%s" - valueL.append(map[key]) - nTry=3 - for iTry in range(nTry): - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - _logger.debug("countFilesWithMap() : %s" % str(map)) - retS = self.cur.execute(sql1+comment, tuple(valueL)) - res = self.cur.fetchone() - _logger.debug("countFilesWithMap() : %s %s" % (retS,str(res))) - # check return - if retS != 1: - raise RuntimeError, 'Invalid return' - nFiles=0 - if res != None: - nFiles=res[0] - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return nFiles - except: - # roll back - self._rollback() - if iTry+1 < nTry: - 
_logger.debug("countFilesWithMap() retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("countFilesWithMap(%s) : %s %s" % (map,type,value)) - return -1 - - - # update input files and return corresponding PandaIDs - def updateInFilesReturnPandaIDs(self,dataset,status): - comment = ' /* DBProxy.updateInFilesReturnPandaIDs */' - _logger.debug("updateInFilesReturnPandaIDs(%s)" % dataset) - sql0 = "SELECT rowID,PandaID FROM filesTable4 WHERE status<>%s AND dispatchDBlock=%s" - for iTry in range(self.nTry): - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - retS = self.cur.execute(sql0+comment, (status,dataset)) - resS = self.cur.fetchall() - _logger.debug("updateInFilesReturnPandaIDs : retS %s" % retS) - if retS<0 or resS==None or retS!=len(resS): - raise RuntimeError, 'SQL error' - # avoid too long expression - nDiv = 10 - nRow,tmpMod = divmod(len(resS),nDiv) - if tmpMod != 0: - nRow += 1 - # update - retList = [] - for iRow in range(nRow): - rows = [] - pandaIDs = [] - for tmpRowID,tmpPandaID in resS[iRow*nDiv:(iRow+1)*nDiv]: - rows.append(tmpRowID) - if not tmpPandaID in pandaIDs: - pandaIDs.append(tmpPandaID) - # make SQL query - sql1 = "UPDATE filesTable4 SET status=%s WHERE " - for row in rows: - if row != rows[0]: - sql1+= "OR " - sql1+= "rowID=%s " - # update - retU = self.cur.execute(sql1+comment, tuple([status]+rows)) - _logger.debug("updateInFilesReturnPandaIDs : retU %s" % retU) - # append - for tmpPandaID in pandaIDs: - if not tmpPandaID in retList: - retList.append(tmpPandaID) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - _logger.debug("updateInFilesReturnPandaIDs : %s" % str(retList)) - return retList - except: - # roll back - self._rollback() - # error report - if iTry+1 < self.nTry: - _logger.debug("updateInFilesReturnPandaIDs retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, 
traceBack = sys.exc_info() - _logger.error("updateInFilesReturnPandaIDs : %s %s" % (type, value)) - return [] - - - # update output files and return corresponding PandaIDs - def updateOutFilesReturnPandaIDs(self,dataset): - comment = ' /* DBProxy.updateOutFilesReturnPandaIDs */' - _logger.debug("updateOutFilesReturnPandaIDs(%s)" % dataset) - sql0 = "SELECT rowID,PandaID FROM filesTable4 WHERE destinationDBlock=%s AND status='transferring'" - for iTry in range(self.nTry): - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - retS = self.cur.execute(sql0+comment, (dataset,)) - resS = self.cur.fetchall() - _logger.debug("updateOutFilesReturnPandaIDs : retS %s" % retS) - if retS<0 or resS==None or retS!=len(resS): - raise RuntimeError, 'SQL error' - # avoid too long expression - nDiv = 10 - nRow,tmpMod = divmod(len(resS),nDiv) - if tmpMod != 0: - nRow += 1 - # update - retList = [] - for iRow in range(nRow): - rows = [] - pandaIDs = [] - for tmpRowID,tmpPandaID in resS[iRow*nDiv:(iRow+1)*nDiv]: - rows.append(tmpRowID) - if not tmpPandaID in pandaIDs: - pandaIDs.append(tmpPandaID) - # make SQL query - sql1 = "UPDATE filesTable4 SET status=%s WHERE " - for row in rows: - if row != rows[0]: - sql1+= "OR " - sql1+= "rowID=%s " - # update - retU = self.cur.execute(sql1+comment, tuple(['ready']+rows)) - _logger.debug("updateOutFilesReturnPandaIDs : retU %s" % retU) - # append - for tmpPandaID in pandaIDs: - if not tmpPandaID in retList: - retList.append(tmpPandaID) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - _logger.debug("updateOutFilesReturnPandaIDs : %s" % str(retList)) - return retList - except: - # roll back - self._rollback() - # error report - if iTry+1 < self.nTry: - _logger.debug("updateOutFilesReturnPandaIDs retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("updateOutFilesReturnPandaIDs : %s %s" % (type, value)) - return [] 
- - - # set GUIDs - def setGUIDs(self,files): - comment = ' /* DBProxy.setGUIDs */' - _logger.debug("setGUIDs(%s)" % files) - sql0 = "UPDATE filesTable4 SET GUID=%s WHERE lfn=%s" - for iTry in range(self.nTry): - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # update - for file in files: - retU = self.cur.execute(sql0+comment, (file['guid'],file['lfn'])) - _logger.debug("setGUIDs : retU %s" % retU) - if retU<0: - raise RuntimeError, 'SQL error' - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - # error report - if iTry+1 < self.nTry: - _logger.debug("setGUIDs retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("setGUIDs : %s %s" % (type, value)) - return False - - - # query PandaID with Datasets - def queryPandaIDwithDataset(self,datasets): - comment = ' /* DBProxy.queryPandaIDwithDataset */' - _logger.debug("queryPandaIDwithDataset(%s)" % datasets) - if len(datasets) == 0: - return [] - # make SQL query - sql1 = "SELECT PandaID FROM filesTable4 WHERE " - for dataset in datasets: - if dataset != datasets[0]: - sql1+= "OR " - sql1+= "destinationDBlock='%s' " % dataset - sql1+= "GROUP BY PandaID" - # execute - for iTry in range(self.nTry): - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute(sql1+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - retList = [] - for r in res: - retList.append(r[0]) - # return - _logger.debug("queryPandaIDwithDataset : %s" % str(retList)) - return retList - except: - # roll back - self._rollback() - # error report - if iTry+1 < self.nTry: - _logger.debug("queryPandaIDwithDataset retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("queryPandaIDwithDataset : %s %s" % (type, value)) - 
return [] - - - # query last files in datasets - def queryLastFilesInDataset(self,datasets): - comment = ' /* DBProxy.queryLastFilesInDataset */' - _logger.debug("queryLastFilesInDataset(%s)" % datasets) - if len(datasets) == 0: - return [] - # make SQL query - sql1 = "SELECT MAX(PandaID) FROM filesTable4 WHERE dataset=%s AND type='output'" - sql2 = "SELECT lfn FROM filesTable4 WHERE PandaID=%s AND type='output'" - # execute - try: - retMap = {} - for dataset in datasets: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select PandaID - self.cur.execute(sql1+comment,(dataset,)) - res = self.cur.fetchone() - # found - retList = [] - if res != None: - pandaID = res[0] - # select LFNs - self.cur.execute(sql2+comment,(pandaID,)) - res = self.cur.fetchall() - for r in res: - retList.append(r[0]) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - retMap[dataset] = retList - # return - _logger.debug("queryLastFilesInDataset : %s" % str(retMap)) - return retMap - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("queryLastFilesInDataset : %s %s" % (type, value)) - return {} - - - # query PandaID with filenames - def queryPandaIDwithLFN(self,vlfns): - comment = ' /* DBProxy.queryPandaIDwithLFN */' - _logger.debug("queryPandaIDwithLFN(%s)" % vlfns) - if len(vlfns) == 0: - return [] - # avoid too long expression - nDiv = 15 - nLFN,tmpMod = divmod(len(vlfns),nDiv) - if tmpMod != 0: - nLFN += 1 - # execute - retList = [] - for iLFN in range(nLFN): - lfns = vlfns[iLFN*nDiv:(iLFN+1)*nDiv] - # make SQL query - sql1 = "SELECT PandaID FROM filesTable4 WHERE " - for lfn in lfns: - if lfn != lfns[0]: - sql1+= "OR " - sql1+= "lfn=%s " - sql1+= "GROUP BY PandaID" - # get generic LFNs - gLFNs = [] - for lfn in lfns: - gLFNs.append(re.sub('\.\d+$','',lfn)) - # try - for iTry in range(self.nTry): - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - 
self.cur.execute(sql1+comment, tuple(gLFNs)) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append IDs - for r in res: - if not r[0] in retList: - retList.append(r[0]) - break - except: - # roll back - self._rollback() - # error report - if iTry+1 < self.nTry: - _logger.debug("queryPandaIDwithLFN retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("queryPandaIDwithLFN : %s %s" % (type, value)) - return [] - # return - _logger.debug("queryPandaIDwithLFN : %s" % str(retList)) - return retList - - - # get job statistics - def getJobStatistics(self,archived=False,predefined=False): - comment = ' /* DBProxy.getJobStatistics */' - _logger.debug("getJobStatistics()") - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) - sql0 = "SELECT computingSite,jobStatus,COUNT(*) FROM %s WHERE prodSourceLabel in ('managed','rc_test','user','panda','ddm') " - if predefined: - sql0 += "AND relocationFlag=1 " - sql0 += "GROUP BY computingSite,jobStatus" - sqlA = "SELECT computingSite,jobStatus,COUNT(*) FROM jobsArchived4 WHERE modificationTime>'%s' AND prodSourceLabel in ('managed','rc_test','user','panda','ddm') " \ - % (timeLimit.strftime('%Y-%m-%d %H:%M:%S')) - if predefined: - sqlA += "AND relocationFlag=1 " - sqlA += "GROUP BY computingSite,jobStatus" - tables = ['jobsActive4','jobsDefined4'] - if archived: - tables.append('jobsArchived4') - ret = {} - nTry=3 - for iTry in range(nTry): - try: - for table in tables: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - if table != 'jobsArchived4': - self.cur.execute((sql0+comment) % table) - else: - self.cur.execute(sqlA+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for item in res: - if not ret.has_key(item[0]): - ret[item[0]] = {} - if not ret[item[0]].has_key(item[1]): - 
ret[item[0]][item[1]] = 0 - ret[item[0]][item[1]] += item[2] - # for zero - stateList = ['assigned','activated','running'] - if archived: - stateList += ['finished','failed'] - for site in ret.keys(): - for state in stateList: - if not ret[site].has_key(state): - ret[site][state] = 0 - # return - _logger.debug("getJobStatistics() : %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("getJobStatistics() retry : %s" % iTry) - time.sleep(2) - continue - type, value, traceBack = sys.exc_info() - _logger.error("getJobStatistics : %s %s" % (type, value)) - return {} - - - # get job statistics for brokerage - def getJobStatisticsBrokerage(self): - comment = ' /* DBProxy.getJobStatisticsBrokerage */' - _logger.debug("getJobStatisticsBrokerage()") - sql0 = "SELECT computingSite,jobStatus,processingType,COUNT(*) FROM %s WHERE prodSourceLabel IN ('managed','rc_test','user','panda','ddm') " - sql0 += "GROUP BY computingSite,jobStatus,processingType" - tables = ['jobsActive4','jobsDefined4'] - ret = {} - nTry=3 - for iTry in range(nTry): - try: - for table in tables: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute((sql0+comment) % table) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for computingSite,jobStatus,processingType,count in res: - # add site - if not ret.has_key(computingSite): - ret[computingSite] = {} - # add processingType - if not ret[computingSite].has_key(processingType): - ret[computingSite][processingType] = {} - # add jobStatus - if not ret[computingSite][processingType].has_key(jobStatus): - ret[computingSite][processingType][jobStatus] = count - # for zero - for site,siteVal in ret.iteritems(): - for pType,typeVal in siteVal.iteritems(): - for stateItem in ['assigned','activated','running']: - if not typeVal.has_key(stateItem): - typeVal[stateItem] = 0 - # return - return ret - except: - # 
roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("getJobStatisticsBrokerage retry : %s" % iTry) - time.sleep(2) - continue - type, value, traceBack = sys.exc_info() - _logger.error("getJobStatisticsBrokerage : %s %s" % (type, value)) - return {} - - - # get computingSite and destinationSE for a dataset - def getDestSE(self,dsname): - comment = ' /* DBProxy.getDestSE */' - _logger.debug("getDestSE(%s)" % dsname) - sql0 = "SELECT PandaID FROM filesTable4 WHERE destinationDBlock='%s' AND status='transferring' LIMIT 1" % dsname - sql1 = "SELECT computingSite,destinationSE FROM jobsActive4 WHERE PandaID=%s" - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute(sql0+comment) - res = self.cur.fetchall() - # get PandaID - pandaID = None - if len(res) != 0: - pandaID = res[0][0] - # get computingSite and destinationSE - destSE = None,None - if pandaID != None: - self.cur.execute((sql1+comment) % pandaID) - res = self.cur.fetchall() - if len(res) != 0: - destSE = res[0] - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - _logger.debug("getDestSE(%s) : %s" % (dsname,str(destSE))) - return destSE - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getDestSE : %s %s" % (type, value)) - return None,None - - - # get destinationDBlockToken for a dataset - def getDestTokens(self,dsname): - comment = ' /* DBProxy.getDestTokens */' - _logger.debug("getDestTokens(%s)" % dsname) - sql0 = "SELECT destinationDBlockToken FROM filesTable4 WHERE destinationDBlock='%s' LIMIT 1" % dsname - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute(sql0+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - retToken = None - if len(res) != 0: - retToken = res[0][0] - # return - _logger.debug("getDestTokens(%s) : %s" % (dsname,retToken)) - return 
retToken - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getDestTokens : %s %s" % (type, value)) - return None - - - # get the number of job for a user - def getNumberJobsUser(self,dn): - comment = ' /* DBProxy.getNumberJobsUser */' - _logger.debug("getNumberJobsUsers(%s)" % dn) - sql0 = "SELECT COUNT(*) FROM %s WHERE prodUserID='%s' AND prodSourceLabel='user'" - nTry = 1 - nJob = 0 - for iTry in range(nTry): - try: - for table in ('jobsActive4','jobsDefined4'): - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute((sql0+comment) % (table,dn)) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - if len(res) != 0: - nJob += res[0][0] - # return - _logger.debug("getNumberJobsUsers(%s) : %s" % (dn,nJob)) - return nJob - except: - # roll back - self._rollback() - if iTry+1 < nTry: - time.sleep(2) - continue - type, value, traceBack = sys.exc_info() - _logger.error("getNumberJobsUsers : %s %s" % (type, value)) - return 0 - - - # get job statistics for ExtIF - def getJobStatisticsForExtIF(self,sourcetype=None): - comment = ' /* DBProxy.getJobStatisticsForExtIF */' - _logger.debug("getJobStatisticsForExtIF()") - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) - if sourcetype == 'analysis': - sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel in ('user','panda') GROUP BY jobStatus,cloud" - sqlA = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel in ('user','panda') " - else: - sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN ('managed','rc_test') GROUP BY jobStatus,cloud" - sqlA = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN ('managed','rc_test') " - sqlA+= "AND modificationTime>'%s' GROUP BY jobStatus,cloud" % (timeLimit.strftime('%Y-%m-%d %H:%M:%S')) - ret = {} - try: - for table in 
('jobsActive4','jobsWaiting4','jobsArchived4','jobsDefined4'): - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - if table != 'jobsArchived4': - self.cur.execute((sql0+comment) % table) - else: - self.cur.execute((sqlA+comment) % table) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # change NULL to US for old jobs - newRes = [] - usMap = {} - for jobStatus,count,cloud in res: - if not cloud in ['US','NULL']: - # append since no conversion is required - newRes.append((jobStatus,count,cloud)) - else: - # sum - if not usMap.has_key(jobStatus): - usMap[jobStatus] = 0 - usMap[jobStatus] += count - # append US counts - for jobStatus,count in usMap.iteritems(): - newRes.append((jobStatus,count,'US')) - # create map - for item in newRes: - # add cloud - if not ret.has_key(item[2]): - ret[item[2]] = {} - # this is needed for auto_increment of InnoDB - if not ret[item[2]].has_key(item[0]): - ret[item[2]][item[0]] = item[1] - # return - _logger.debug("getJobStatisticsForExtIF() : %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getJobStatisticsForExtIF : %s %s" % (type, value)) - return {} - - - # get job statistics per processingType - def getJobStatisticsPerProcessingType(self): - comment = ' /* DBProxy.getJobStatisticsPerProcessingType */' - _logger.debug("getJobStatisticsPerProcessingType()") - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) - sql0 = "SELECT jobStatus,COUNT(*),cloud,processingType FROM %s " - sql0 += "WHERE prodSourceLabel IN ('managed','rc_test') " - sqlT = "AND modificationTime>'%s' " % timeLimit.strftime('%Y-%m-%d %H:%M:%S') - sql1 = "GROUP BY jobStatus,cloud,processingType" - sqlN = sql0 + sql1 - sqlA = sql0 + sqlT + sql1 - ret = {} - try: - for table in ('jobsActive4','jobsWaiting4','jobsArchived4','jobsDefined4'): - # set autocommit on - self.cur.execute("SET 
AUTOCOMMIT=1") - # select - if table == 'jobsArchived4': - self.cur.execute((sqlA+comment) % table) - else: - self.cur.execute((sqlN+comment) % table) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for jobStatus,count,cloud,processingType in res: - # add cloud - if not ret.has_key(cloud): - ret[cloud] = {} - # add processingType - if not ret[cloud].has_key(processingType): - ret[cloud][processingType] = {} - # this is needed for auto_increment of InnoDB - if not ret[cloud][processingType].has_key(jobStatus): - ret[cloud][processingType][jobStatus] = count - # return - _logger.debug("getJobStatisticsPerProcessingType() : %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getJobStatisticsPerProcessingType : %s %s" % (type, value)) - return {} - - - # get number of analysis jobs per user - def getNUserJobs(self,siteName,nJobs): - comment = ' /* DBProxy.getNUserJobs */' - _logger.debug("getNUserJobs(%s)" % siteName) - sql0 = "SELECT prodUserID FROM jobsActive4 WHERE jobStatus='activated' AND prodSourceLabel in ('user','panda') AND computingSite='%s' ORDER BY currentPriority DESC LIMIT %s" % (siteName,nJobs) - ret = {} - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute(sql0+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for prodUserID, in res: - if not ret.has_key(prodUserID): - ret[prodUserID] = 0 - ret[prodUserID] += 1 - # return - _logger.debug("getNUserJobs() : %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getNUserJobs : %s %s" % (type, value)) - return {} - - - # get number of activated analysis jobs - def getNAnalysisJobs(self,nProcesses): - comment = ' /* DBProxy.getNAnalysisJobs */' - 
_logger.debug("getNAnalysisJobs(%s)" % nProcesses) - sql0 = "SELECT computingSite,COUNT(*) FROM jobsActive4 WHERE jobStatus='activated' AND (prodSourceLabel='user' OR prodSourceLabel='panda') GROUP BY computingSite" - ret = {} - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute(sql0+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for item in res: - ret[item[0]] = float(item[1])/nProcesses - # return - _logger.debug("getNAnalysisJobs() : %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getNAnalysisJobs : %s %s" % (type, value)) - return {} - - - # count pilot requests - def countPilotRequests(self,ids,prodSourceLabel='None'): - comment = ' /* DBProxy.countPilotRequests */' - # prodSourceLabel - if prodSourceLabel=='user': - criteria = " AND MESSAGE REGEXP 'user$'" - else: - criteria = " AND MESSAGE REGEXP 'None$'" - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) - ret = {} - try: - for siteID in ids: - # begin transaction - self.cur.execute("START TRANSACTION") - # select - sql0 = "SELECT COUNT(*) FROM PANDALOG WHERE Type='getJob' AND BINTIME>'%s'" % \ - timeLimit.strftime('%Y-%m-%d %H:%M:%S') - sql0+= " AND MESSAGE REGEXP '%s'" % siteID - sql0+= criteria - self.cur.execute(sql0+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - ret[siteID] = res[0][0] - # return - _logger.debug("countPilotRequests() : %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("countPilotRequests : %s %s" % (type, value)) - # for zero - for siteID in ids: - if not ret.has_key(siteID): - ret[siteID]=0 - return ret - - - # generate pilot token - def 
genPilotToken(self,schedulerhost,scheduleruser,schedulerid): - comment = ' /* DBProxy.genPilotToken */' - try: - _logger.debug("genPilotToken(%s,%s,%s)" % (schedulerhost,scheduleruser,schedulerid)) - token = commands.getoutput('uuidgen') - timeNow = datetime.datetime.utcnow() - timeExp = timeNow + datetime.timedelta(days=4) - sql = "INSERT INTO pilottoken (token,schedulerhost,scheduleruser,schedulerid,created,expires) " - sql += "VALUES (%s,%s,%s,%s,%s,%s)" - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # execute - self.cur.execute(sql+comment,(token,schedulerhost,scheduleruser,schedulerid, - timeNow.strftime('%Y-%m-%d %H:%M:%S'), - timeExp.strftime('%Y-%m-%d %H:%M:%S'))) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - retVal = "token=%s,created=%s,expires=%s" % (token,timeNow.strftime('%Y-%m-%d %H:%M:%S'), - timeExp.strftime('%Y-%m-%d %H:%M:%S')) - _logger.debug("genPilotToken -> %s" % retVal) - return retVal - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("genPilotToken : %s %s" % (type, value)) - return None - - - # get list of scheduler users - def getListSchedUsers(self): - comment = ' /* DBProxy.getListSchedUsers */' - try: - _logger.debug("getListSchedUsers") - sql = "SELECT token,scheduleruser FROM pilottoken WHERE expires>UTC_TIMESTAMP()" - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # execute - self.cur.execute(sql+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - retVal = {} - for token,scheduleruser in res: - retVal[token] = scheduleruser - _logger.debug("getListSchedUsers->%s" % str(retVal)) - return retVal - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getListSchedUsers : %s %s" % (type, value)) - return {} - - - # wake up connection - def wakeUp(self): - for iTry in range(5): - try: 
- # check if the connection is working - self.conn.ping() - return - except: - type, value, traceBack = sys.exc_info() - _logger.debug("wakeUp %d : %s %s" % (iTry,type,value)) - # wait for reconnection - time.sleep(1) - self.connect(reconnect=True) - - - # commit - def _commit(self): - try: - self.conn.commit() - return True - except: - _logger.error("commit error") - return False - - - # rollback - def _rollback(self): - try: - self.conn.rollback() - return True - except: - _logger.error("rollback error") - return False - diff --git a/current/pandaserver/taskbuffer/DBProxyPool.py b/current/pandaserver/taskbuffer/DBProxyPool.py deleted file mode 100755 index 53aed84bd..000000000 --- a/current/pandaserver/taskbuffer/DBProxyPool.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -pool for DBProxies - -""" - -import inspect -import Queue -import OraDBProxy as DBProxy -import os -import time -import random -from threading import Lock -from config import panda_config -from taskbuffer.ConBridge import ConBridge -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('DBProxyPool') - -class DBProxyPool: - - def __init__(self,dbhost,dbpasswd,nConnection,useTimeout=False): - # crate lock for callers - self.lock = Lock() - self.callers = [] - # create Proxies - _logger.debug("init") - self.proxyList = Queue.Queue(nConnection) - for i in range(nConnection): - _logger.debug("connect -> %s " % i) - if useTimeout and hasattr(panda_config,'usedbtimeout') and \ - panda_config.usedbtimeout == True: - proxy = ConBridge() - else: - proxy = DBProxy.DBProxy() - iTry = 0 - while True: - if proxy.connect(dbhost,dbpasswd,dbtimeout=60): - break - iTry += 1 - _logger.debug("failed -> %s : try %s" % (i,iTry)) - time.sleep(random.randint(60,90)) - self.proxyList.put(proxy) - time.sleep(1) - # get PID - self.pid = os.getpid() - _logger.debug("ready") - - # return a free proxy. 
this method blocks until a proxy is available - def getProxy(self): - """ - # get caller - caller = inspect.stack()[1][3] - _logger.debug("PID=%s %s getting proxy used by %s" % (self.pid,caller,str(self.callers))) - """ - # get proxy - proxy = self.proxyList.get() - """ - # lock - self.lock.acquire() - # append - self.callers.append(caller) - # release - self.lock.release() - _logger.debug("PID=%s %s got proxy used by %s" % (self.pid,caller,str(self.callers))) - """ - # wake up connection - proxy.wakeUp() - # return - return proxy - - # put back a proxy - def putProxy(self,proxy): - """ - # get caller - caller = inspect.stack()[1][3] - _logger.debug("PID=%s %s releasing. used by %s" % (self.pid,caller,str(self.callers))) - """ - self.proxyList.put(proxy) - """ - # lock - self.lock.acquire() - # append - self.callers.remove(caller) - # release - self.lock.release() - _logger.debug("PID=%s %s released. used by %s" % (self.pid,caller,str(self.callers))) - """ diff --git a/current/pandaserver/taskbuffer/DatasetSpec.py b/current/pandaserver/taskbuffer/DatasetSpec.py deleted file mode 100755 index 815b98a59..000000000 --- a/current/pandaserver/taskbuffer/DatasetSpec.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -dataset specification - -""" - -class DatasetSpec(object): - # attributes - _attributes = ('vuid','name','version','type','status','numberfiles','currentfiles','creationdate', - 'modificationdate','MoverID','transferStatus','subType') - - # attributes which have 0 by default - _zeroAttrs = ('MoverID','transferStatus') - - - - # constructor - def __init__(self): - # install attributes - for attr in self._attributes: - setattr(self,attr,None) - - - # override __getattribute__ for SQL - def __getattribute__(self,name): - ret = object.__getattribute__(self,name) - if ret == None: - return "NULL" - return ret - - - # return a tuple of values - def values(self): - ret = [] - for attr in self._attributes: - val = getattr(self,attr) - ret.append(val) - return tuple(ret) - - - 
# return map of values - def valuesMap(self): - ret = {} - for attr in self._attributes: - val = getattr(self,attr) - if val == 'NULL': - if attr in self._zeroAttrs: - val = 0 - else: - val = None - ret[':%s' % attr] = val - return ret - - - # pack tuple into DatasetSpec - def pack(self,values): - for i in range(len(self._attributes)): - attr= self._attributes[i] - val = values[i] - setattr(self,attr,val) - - - # return column names for INSERT - def columnNames(cls): - ret = "" - for attr in cls._attributes: - if ret != "": - ret += ',' - ret += attr - return ret - columnNames = classmethod(columnNames) - - - # return expression of values for INSERT - def valuesExpression(cls): - ret = "VALUES(" - for attr in cls._attributes: - ret += "%s" - if attr != cls._attributes[len(cls._attributes)-1]: - ret += "," - ret += ")" - return ret - valuesExpression = classmethod(valuesExpression) - - - # return expression of bind values for INSERT - def bindValuesExpression(cls): - ret = "VALUES(" - for attr in cls._attributes: - ret += ":%s," % attr - ret = ret[:-1] - ret += ")" - return ret - bindValuesExpression = classmethod(bindValuesExpression) - - - # return an expression for UPDATE - def updateExpression(cls): - ret = "" - for attr in cls._attributes: - ret = ret + attr + "=%s" - if attr != cls._attributes[len(cls._attributes)-1]: - ret += "," - return ret - updateExpression = classmethod(updateExpression) - - - # return an expression of bind variables for UPDATE - def bindUpdateExpression(cls): - ret = "" - for attr in cls._attributes: - ret += '%s=:%s,' % (attr,attr) - ret = ret[:-1] - return ret - bindUpdateExpression = classmethod(bindUpdateExpression) - - - - diff --git a/current/pandaserver/taskbuffer/ErrorCode.py b/current/pandaserver/taskbuffer/ErrorCode.py deleted file mode 100755 index 08f72b116..000000000 --- a/current/pandaserver/taskbuffer/ErrorCode.py +++ /dev/null @@ -1,37 +0,0 @@ -############## errror code - -# killed -EC_Kill = 100 - -# transfer timeout 
-EC_Transfer = 101 - -# expire -EC_Expire = 102 - -# aborted -EC_Aborted = 103 - -# wait timeout -EC_WaitTimeout = 104 - -# reassigned by rebrokeage -EC_Reassigned = 105 - -# reassigned by server-side retry -EC_Retried = 106 - -# retried by pilot -EC_PilotRetried = 107 - -# lost file (=dataservice.ErrorCode.EC_LostFile) -EC_LostFile = 110 - -# file not found -class EC_NotFound: - pass - -# file relocated -class EC_Redirect: - def __init__(self,url): - self.url = url diff --git a/current/pandaserver/taskbuffer/FileSpec.py b/current/pandaserver/taskbuffer/FileSpec.py deleted file mode 100755 index 209b2ed65..000000000 --- a/current/pandaserver/taskbuffer/FileSpec.py +++ /dev/null @@ -1,213 +0,0 @@ -""" -file specification - -""" - - -class FileSpec(object): - # attributes - _attributes = ('row_ID','PandaID','GUID','lfn','type','dataset','status','prodDBlock', - 'prodDBlockToken','dispatchDBlock','dispatchDBlockToken','destinationDBlock', - 'destinationDBlockToken','destinationSE','fsize','md5sum','checksum','scope') - # slots - __slots__ = _attributes+('_owner','_changedAttrs','_oldPandaID') - # attributes which have 0 by default - _zeroAttrs = ('fsize',) - # mapping between sequence and attr - _seqAttrMap = {'row_ID':'ATLAS_PANDA.FILESTABLE4_ROW_ID_SEQ.nextval'} - - - # constructor - def __init__(self): - # install attributes - for attr in self._attributes: - object.__setattr__(self,attr,None) - # set owner to synchronize PandaID - object.__setattr__(self,'_owner',None) - # map of changed attributes - object.__setattr__(self,'_changedAttrs',{}) - # old PandaID - object.__setattr__(self,'_oldPandaID','NULL') - - - # override __getattribute__ for SQL and PandaID - def __getattribute__(self,name): - # PandaID - if name == 'PandaID': - if self._owner == None: - return 'NULL' - return self._owner.PandaID - # others - ret = object.__getattribute__(self,name) - if ret == None: - return "NULL" - return ret - - - # override __setattr__ to collecte the changed attributes - 
def __setattr__(self,name,value): - oldVal = getattr(self,name) - object.__setattr__(self,name,value) - newVal = getattr(self,name) - # collect changed attributes - if oldVal != newVal: - self._changedAttrs[name] = value - - - # set owner - def setOwner(self,owner): - self._owner = owner - self._oldPandaID = self.PandaID - - - # reset changed attribute list - def resetChangedList(self): - self._oldPandaID = self.PandaID - object.__setattr__(self,'_changedAttrs',{}) - - - # return a tuple of values - def values(self): - ret = [] - for attr in self._attributes: - val = getattr(self,attr) - ret.append(val) - return tuple(ret) - - - # return map of values - def valuesMap(self,useSeq=False,onlyChanged=False): - ret = {} - for attr in self._attributes: - if useSeq and self._seqAttrMap.has_key(attr): - continue - if onlyChanged: - if attr == 'PandaID': - if self.PandaID == self._oldPandaID: - continue - elif not self._changedAttrs.has_key(attr): - continue - val = getattr(self,attr) - if val == 'NULL': - if attr in self._zeroAttrs: - val = 0 - else: - val = None - ret[':%s' % attr] = val - return ret - - - # pack tuple into FileSpec - def pack(self,values): - for i in range(len(self._attributes)): - attr= self._attributes[i] - val = values[i] - object.__setattr__(self,attr,val) - - - # return state values to be pickled - def __getstate__(self): - state = [] - for attr in self._attributes: - val = getattr(self,attr) - state.append(val) - # append owner info - state.append(self._owner) - return state - - - # restore state from the unpickled state values - def __setstate__(self,state): - pandaID = 'NULL' - for i in range(len(self._attributes)): - if i+1 < len(state): - object.__setattr__(self,self._attributes[i],state[i]) - else: - object.__setattr__(self,self._attributes[i],'NULL') - if self._attributes[i] == 'PandaID': - pandaID = state[i] - object.__setattr__(self,'_owner',state[-1]) - object.__setattr__(self,'_changedAttrs',{}) - 
object.__setattr__(self,'_oldPandaID',pandaID) - - - # return column names for INSERT - def columnNames(cls,withMod=False): - ret = "" - for attr in cls._attributes: - if ret != "": - ret += ',' - ret += attr - # add modificationTime - if withMod: - ret += ",modificationTime" - return ret - columnNames = classmethod(columnNames) - - - # return expression of values for INSERT - def valuesExpression(cls): - ret = "VALUES(" - for attr in cls._attributes: - ret += "%s" - if attr != cls._attributes[len(cls._attributes)-1]: - ret += "," - ret += ")" - return ret - valuesExpression = classmethod(valuesExpression) - - - # return expression of bind variables for INSERT - def bindValuesExpression(cls,useSeq=False,withMod=False): - ret = "VALUES(" - for attr in cls._attributes: - if useSeq and cls._seqAttrMap.has_key(attr): - ret += "%s," % cls._seqAttrMap[attr] - else: - ret += ":%s," % attr - ret = ret[:-1] - # add modificationTime - if withMod: - ret += ",:modificationTime" - ret += ")" - return ret - bindValuesExpression = classmethod(bindValuesExpression) - - - # return an expression for UPDATE - def updateExpression(cls): - ret = "" - for attr in cls._attributes: - ret = ret + attr + "=%s" - if attr != cls._attributes[len(cls._attributes)-1]: - ret += "," - return ret - updateExpression = classmethod(updateExpression) - - - # return an expression of bind variables for UPDATE - def bindUpdateExpression(cls): - ret = "" - for attr in cls._attributes: - ret += '%s=:%s,' % (attr,attr) - ret = ret[:-1] - ret += ' ' - return ret - bindUpdateExpression = classmethod(bindUpdateExpression) - - - # return an expression of bind variables for UPDATE to update only changed attributes - def bindUpdateChangesExpression(self): - ret = "" - for attr in self._attributes: - if self._changedAttrs.has_key(attr) or \ - (attr == 'PandaID' and self.PandaID != self._oldPandaID): - ret += '%s=:%s,' % (attr,attr) - ret = ret[:-1] - ret += ' ' - return ret - - - - - diff --git 
a/current/pandaserver/taskbuffer/Initializer.py b/current/pandaserver/taskbuffer/Initializer.py deleted file mode 100644 index a9c158b43..000000000 --- a/current/pandaserver/taskbuffer/Initializer.py +++ /dev/null @@ -1,46 +0,0 @@ -import sys -import cx_Oracle -from threading import Lock - -from config import panda_config - -# logger -from pandalogger.PandaLogger import PandaLogger -_logger = PandaLogger().getLogger('Initializer') - -# initialize cx_Oracle using dummy connection to avoid "Unable to acquire Oracle environment handle" -class Initializer: - def __init__(self): - self.lock = Lock() - self.first = True - - def init(self): - _logger.debug("init new=%s" % self.first) - # do nothing when nDBConnection is 0 - if panda_config.nDBConnection == 0: - return True - # lock - self.lock.acquire() - if self.first: - self.first = False - try: - _logger.debug("connect") - # connect - conn = cx_Oracle.connect(dsn=panda_config.dbhost,user=panda_config.dbuser, - password=panda_config.dbpasswd,threaded=True) - # close - conn.close() - _logger.debug("done") - except: - self.lock.release() - type, value, traceBack = sys.exc_info() - _logger.error("connect : %s %s" % (type,value)) - return False - # release - self.lock.release() - return True - - -# singleton -initializer = Initializer() -del Initializer diff --git a/current/pandaserver/taskbuffer/JobSpec.py b/current/pandaserver/taskbuffer/JobSpec.py deleted file mode 100755 index 7eaa764ab..000000000 --- a/current/pandaserver/taskbuffer/JobSpec.py +++ /dev/null @@ -1,239 +0,0 @@ -""" -job specification - -""" - -class JobSpec(object): - # attributes - _attributes = ('PandaID','jobDefinitionID','schedulerID','pilotID','creationTime','creationHost', - 'modificationTime','modificationHost','AtlasRelease','transformation','homepackage', - 'prodSeriesLabel','prodSourceLabel','prodUserID','assignedPriority','currentPriority', - 'attemptNr','maxAttempt','jobStatus','jobName','maxCpuCount','maxCpuUnit','maxDiskCount', - 
'maxDiskUnit','ipConnectivity','minRamCount','minRamUnit','startTime','endTime', - 'cpuConsumptionTime','cpuConsumptionUnit','commandToPilot','transExitCode','pilotErrorCode', - 'pilotErrorDiag','exeErrorCode','exeErrorDiag','supErrorCode','supErrorDiag', - 'ddmErrorCode','ddmErrorDiag','brokerageErrorCode','brokerageErrorDiag', - 'jobDispatcherErrorCode','jobDispatcherErrorDiag','taskBufferErrorCode', - 'taskBufferErrorDiag','computingSite','computingElement','jobParameters', - 'metadata','prodDBlock','dispatchDBlock','destinationDBlock','destinationSE', - 'nEvents','grid','cloud','cpuConversion','sourceSite','destinationSite','transferType', - 'taskID','cmtConfig','stateChangeTime','prodDBUpdateTime','lockedby','relocationFlag', - 'jobExecutionID','VO','pilotTiming','workingGroup','processingType','prodUserName', - 'nInputFiles','countryGroup','batchID','parentID','specialHandling','jobsetID', - 'coreCount','nInputDataFiles','inputFileType','inputFileProject','inputFileBytes', - 'nOutputDataFiles','outputFileBytes','jobMetrics') - # slots - __slots__ = _attributes+('Files','_changedAttrs') - # attributes which have 0 by default - _zeroAttrs = ('assignedPriority','currentPriority','attemptNr','maxAttempt','maxCpuCount','maxDiskCount', - 'minRamCount','cpuConsumptionTime','pilotErrorCode','exeErrorCode','supErrorCode','ddmErrorCode', - 'brokerageErrorCode','jobDispatcherErrorCode','taskBufferErrorCode','nEvents','relocationFlag', - 'jobExecutionID','nOutputDataFiles','outputFileBytes') - # attribute to be suppressed. 
They are in another table - _suppAttrs = ('jobParameters','metadata') - # mapping between sequence and attr - _seqAttrMap = {'PandaID':'ATLAS_PANDA.JOBSDEFINED4_PANDAID_SEQ.nextval'} - # limit length - _limitLength = {'ddmErrorDiag' : 500, - 'taskBufferErrorDiag' : 300, - 'jobDispatcherErrorDiag' : 250, - 'brokerageErrorDiag' : 250, - 'pilotErrorDiag' : 500, - 'exeErrorDiag' : 500, - } - - - # constructor - def __init__(self): - # install attributes - for attr in self._attributes: - object.__setattr__(self,attr,None) - # files list - object.__setattr__(self,'Files',[]) - # map of changed attributes - object.__setattr__(self,'_changedAttrs',{}) - - - # override __getattribute__ for SQL - def __getattribute__(self,name): - ret = object.__getattribute__(self,name) - if ret == None: - return "NULL" - return ret - - - # override __setattr__ to collecte the changed attributes - def __setattr__(self,name,value): - oldVal = getattr(self,name) - object.__setattr__(self,name,value) - newVal = getattr(self,name) - # collect changed attributes - if oldVal != newVal and not name in self._suppAttrs: - self._changedAttrs[name] = value - - - # reset changed attribute list - def resetChangedList(self): - object.__setattr__(self,'_changedAttrs',{}) - - - # add File to files list - def addFile(self,file): - # set owner - file.setOwner(self) - # append - self.Files.append(file) - - - # pack tuple into JobSpec - def pack(self,values): - for i in range(len(self._attributes)): - attr= self._attributes[i] - val = values[i] - object.__setattr__(self,attr,val) - - - # return a tuple of values - def values(self): - ret = [] - for attr in self._attributes: - val = getattr(self,attr) - ret.append(val) - return tuple(ret) - - - # return map of values - def valuesMap(self,useSeq=False,onlyChanged=False): - ret = {} - for attr in self._attributes: - if useSeq and self._seqAttrMap.has_key(attr): - continue - if onlyChanged: - if not self._changedAttrs.has_key(attr): - continue - val = 
getattr(self,attr) - if val == 'NULL': - if attr in self._zeroAttrs: - val = 0 - else: - val = None - # jobParameters/metadata go to another table - if attr in self._suppAttrs: - val = None - # truncate too long values - if self._limitLength.has_key(attr): - if val != None: - val = val[:self._limitLength[attr]] - ret[':%s' % attr] = val - return ret - - - # return state values to be pickled - def __getstate__(self): - state = [] - for attr in self._attributes: - val = getattr(self,attr) - state.append(val) - # append File info - state.append(self.Files) - return state - - - # restore state from the unpickled state values - def __setstate__(self,state): - for i in range(len(self._attributes)): - # schema evolution is supported only when adding attributes - if i+1 < len(state): - object.__setattr__(self,self._attributes[i],state[i]) - else: - object.__setattr__(self,self._attributes[i],'NULL') - object.__setattr__(self,'Files',state[-1]) - object.__setattr__(self,'_changedAttrs',{}) - - - # return column names for INSERT or full SELECT - def columnNames(cls): - ret = "" - for attr in cls._attributes: - if ret != "": - ret += ',' - ret += attr - return ret - columnNames = classmethod(columnNames) - - - # return expression of values for INSERT - def valuesExpression(cls): - ret = "VALUES(" - for attr in cls._attributes: - ret += "%s" - if attr != cls._attributes[len(cls._attributes)-1]: - ret += "," - ret += ")" - return ret - valuesExpression = classmethod(valuesExpression) - - - # return expression of bind values for INSERT - def bindValuesExpression(cls,useSeq=False): - ret = "VALUES(" - for attr in cls._attributes: - if useSeq and cls._seqAttrMap.has_key(attr): - ret += "%s," % cls._seqAttrMap[attr] - else: - ret += ":%s," % attr - ret = ret[:-1] - ret += ")" - return ret - bindValuesExpression = classmethod(bindValuesExpression) - - - # return an expression for UPDATE - def updateExpression(cls): - ret = "" - for attr in cls._attributes: - ret = ret + attr + "=%s" 
- if attr != cls._attributes[len(cls._attributes)-1]: - ret += "," - return ret - updateExpression = classmethod(updateExpression) - - - # return an expression of bind variables for UPDATE - def bindUpdateExpression(cls): - ret = "" - for attr in cls._attributes: - ret += '%s=:%s,' % (attr,attr) - ret = ret[:-1] - ret += ' ' - return ret - bindUpdateExpression = classmethod(bindUpdateExpression) - - - # comparison function for sort - def compFunc(cls,a,b): - iPandaID = list(cls._attributes).index('PandaID') - iPriority = list(cls._attributes).index('currentPriority') - if a[iPriority] > b[iPriority]: - return -1 - elif a[iPriority] < b[iPriority]: - return 1 - else: - if a[iPandaID] > b[iPandaID]: - return 1 - elif a[iPandaID] < b[iPandaID]: - return -1 - else: - return 0 - compFunc = classmethod(compFunc) - - - # return an expression of bind variables for UPDATE to update only changed attributes - def bindUpdateChangesExpression(self): - ret = "" - for attr in self._attributes: - if self._changedAttrs.has_key(attr): - ret += '%s=:%s,' % (attr,attr) - ret = ret[:-1] - ret += ' ' - return ret diff --git a/current/pandaserver/taskbuffer/LogDBProxy.py b/current/pandaserver/taskbuffer/LogDBProxy.py deleted file mode 100755 index e32ef22fc..000000000 --- a/current/pandaserver/taskbuffer/LogDBProxy.py +++ /dev/null @@ -1,790 +0,0 @@ -""" -proxy for log database connection - -""" - -import re -import sys -import time -import datetime - -import MySQLdb - -from pandalogger.PandaLogger import PandaLogger -from config import panda_config - -import SiteSpec -import CloudSpec - -from JobSpec import JobSpec -from FileSpec import FileSpec - -# logger -_logger = PandaLogger().getLogger('LogDBProxy') - -# proxy -class LogDBProxy: - - # constructor - def __init__(self): - # connection object - self.conn = None - # cursor object - self.cur = None - - # connect to DB - def connect(self,dbhost=panda_config.logdbhost,dbpasswd=panda_config.logdbpasswd, - 
dbuser=panda_config.logdbuser,dbname=panda_config.logdbname,reconnect=False): - # keep parameters for reconnect - if not reconnect: - self.dbhost = dbhost - self.dbpasswd = dbpasswd - self.dbuser = dbuser - self.dbname = dbname - # connect - try: - self.conn = MySQLdb.connect(host=self.dbhost,user=self.dbuser, - passwd=self.dbpasswd,db=self.dbname) - self.cur=self.conn.cursor() - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("connect : %s %s" % (type,value)) - # roll back - self._rollback() - return False - - - # query an SQL - def querySQL(self,sql): - try: - # begin transaction - self.cur.execute("START TRANSACTION") - self.cur.execute(sql) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return res - except: - type, value, traceBack = sys.exc_info() - _logger.error("querySQL : %s %s" % (type,value)) - return None - - - # get site data - def getCurrentSiteData(self): - _logger.debug("getCurrentSiteData") - sql = "SELECT SITE,getJob,updateJob FROM SiteData WHERE FLAG='production' and HOURS=3" - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - self.cur.execute(sql) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - ret = {} - for item in res: - ret[item[0]] = {'getJob':item[1],'updateJob':item[2]} - _logger.debug(ret) - return ret - except: - type, value, traceBack = sys.exc_info() - _logger.error("getCurrentSiteData : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # get list of site - def getSiteList(self): - _logger.debug("getSiteList start") - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = "SELECT siteid,nickname FROM schedconfig WHERE siteid<>''" - self.cur.execute(sql) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - retMap = {} - if res != None and len(res) != 0: - for 
siteid,nickname in res: - # append - if not retMap.has_key(siteid): - retMap[siteid] = [] - retMap[siteid].append(nickname) - _logger.debug(retMap) - _logger.debug("getSiteList done") - return retMap - except: - type, value, traceBack = sys.exc_info() - _logger.error("getSiteList : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # get site info - def getSiteInfo(self): - _logger.debug("getSiteInfo start") - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = "SELECT nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory," - sql+= "maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec," - sql+= "priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue," - sql+= "validatedreleases,accesscontrol " - sql+= "FROM schedconfig WHERE siteid<>''" - self.cur.execute(sql) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - retList = {} - if resList != None: - # loop over all results - for res in resList: - nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory,\ - maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec,\ - priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue,\ - validatedreleases,accesscontrol \ - = res - # instantiate SiteSpec - ret = SiteSpec.SiteSpec() - ret.sitename = siteid - ret.nickname = nickname - ret.dq2url = dq2url - ret.cloud = cloud - ret.ddm = ddm.split(',')[0] - ret.lfchost = lfchost - ret.se = se - ret.gatekeeper = gatekeeper - ret.memory = memory - ret.maxtime = maxtime - ret.status = status - ret.space = space - ret.glexec = glexec - ret.queue = queue - ret.localqueue = localqueue - ret.accesscontrol = accesscontrol - # job recoverty - ret.retry = True - if retry == 'FALSE': - ret.retry = False - # convert releases to list - ret.releases = [] - for tmpRel in releases.split('|'): - # remove white space - tmpRel = tmpRel.strip() - if tmpRel != '': - 
ret.releases.append(tmpRel) - # convert validatedreleases to list - ret.validatedreleases = [] - for tmpRel in validatedreleases.split('|'): - # remove white space - tmpRel = tmpRel.strip() - if tmpRel != '': - ret.validatedreleases.append(tmpRel) - # cmtconfig - # add slc3 if the column is empty - ret.cmtconfig = ['i686-slc3-gcc323-opt'] - if cmtconfig != '': - ret.cmtconfig.append(cmtconfig) - # map between token and DQ2 ID - ret.setokens = {} - tmpTokens = setokens.split(',') - for idxToken,tmpddmID in enumerate(ddm.split(',')): - if idxToken < len(tmpTokens): - ret.setokens[tmpTokens[idxToken]] = tmpddmID - # expand [] in se path - match = re.search('([^\[]*)\[([^\]]+)\](.*)',seprodpath) - if match != None and len(match.groups()) == 3: - seprodpath = '' - for tmpBody in match.group(2).split(','): - seprodpath += '%s%s%s,' % (match.group(1),tmpBody,match.group(3)) - seprodpath = seprodpath[:-1] - # map between token and se path - ret.seprodpath = {} - tmpTokens = setokens.split(',') - for idxToken,tmpSePath in enumerate(seprodpath.split(',')): - if idxToken < len(tmpTokens): - ret.seprodpath[tmpTokens[idxToken]] = tmpSePath - # VO related params - ret.priorityoffset = priorityoffset - ret.allowedgroups = allowedgroups - ret.defaulttoken = defaulttoken - # append - retList[ret.nickname] = ret - _logger.debug("getSiteInfo done") - return retList - except: - type, value, traceBack = sys.exc_info() - _logger.error("getSiteInfo : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # get cloud list - def getCloudList(self): - _logger.debug("getCloudList start") - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = "SELECT name,tier1,tier1SE,relocation,weight,server,status,transtimelo," - sql += "transtimehi,waittime,validation,mcshare,countries,fasttrack,nprestage," - sql += "pilotowners " - sql+= "FROM cloudconfig" - self.cur.execute(sql) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise 
RuntimeError, 'Commit error' - ret = {} - if res != None and len(res) != 0: - for name,tier1,tier1SE,relocation,weight,server,status,transtimelo,transtimehi,\ - waittime,validation,mcshare,countries,fasttrack,nprestage,pilotowners in res: - # instantiate CloudSpec - tmpC = CloudSpec.CloudSpec() - tmpC.name = name - tmpC.tier1 = tier1 - tmpC.tier1SE = re.sub(' ','',tier1SE).split(',') - tmpC.relocation = relocation - tmpC.weight = weight - tmpC.server = server - tmpC.status = status - tmpC.transtimelo = transtimelo - tmpC.transtimehi = transtimehi - tmpC.waittime = waittime - tmpC.validation = validation - tmpC.mcshare = mcshare - tmpC.countries = countries - tmpC.fasttrack = fasttrack - tmpC.nprestage = nprestage - tmpC.pilotowners = pilotowners - # append - ret[name] = tmpC - _logger.debug("getCloudList done") - return ret - except: - type, value, traceBack = sys.exc_info() - _logger.error("getCloudList : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # extract name from DN - def cleanUserID(self, id): - try: - up = re.compile('/(DC|O|OU|C|L)=[^\/]+') - username = up.sub('', id) - up2 = re.compile('/CN=[0-9]+') - username = up2.sub('', username) - up3 = re.compile(' [0-9]+') - username = up3.sub('', username) - up4 = re.compile('_[0-9]+') - username = up4.sub('', username) - username = username.replace('/CN=proxy','') - username = username.replace('/CN=limited proxy','') - username = username.replace('limited proxy','') - pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)') - mat = pat.match(username) - if mat: - username = mat.group(2) - else: - username = username.replace('/CN=','') - if username.lower().find('/email') > 0: - username = username[:username.lower().find('/email')] - pat = re.compile('.*(limited.*proxy).*') - mat = pat.match(username) - if mat: - username = mat.group(1) - username = username.replace('(','') - username = username.replace(')','') - return username - except: - return id - - - # check quota - def 
checkQuota(self,dn): - _logger.debug("checkQuota %s" % dn) - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - name = self.cleanUserID(dn) - sql = "SELECT cpua1,cpua7,cpua30,quotaa1,quotaa7,quotaa30 FROM users WHERE name = '%s'" % name - self.cur.execute(sql) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - weight = 0.0 - if res != None and len(res) != 0: - item = res[0] - # cpu and quota - cpu1 = item[0] - cpu7 = item[1] - cpu30 = item[2] - quota1 = item[3] * 3600 - quota7 = item[4] * 3600 - quota30 = item[5] * 3600 - # CPU usage - if cpu1 == None: - cpu1 = 0.0 - # weight - weight = float(cpu1) / float(quota1) - # not exceeded the limit - if weight < 1.0: - weight = 0.0 - _logger.debug("checkQuota %s Weight:%s Quota:%s CPU:%s" % (dn,weight,quota1,cpu1)) - else: - _logger.debug("checkQuota cannot found %s" % dn) - return weight - except: - type, value, traceBack = sys.exc_info() - _logger.error("checkQuota : %s %s" % (type,value)) - # roll back - self._rollback() - return 0.0 - - - # get serialize JobID and status - def getUserParameter(self,dn,jobID): - _logger.debug("getUserParameter %s %s" % (dn,jobID)) - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - name = self.cleanUserID(dn) - sql = "SELECT jobid,status FROM users WHERE name = '%s'" % name - self.cur.execute(sql) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - retJobID = jobID - retStatus = True - if res != None and len(res) != 0: - item = res[0] - # JobID in DB - dbJobID = item[0] - # check status - if item[1] in ['disabled']: - retStatus = False - # use larger JobID - if dbJobID >= int(retJobID): - retJobID = dbJobID+1 - # update DB - sql = "UPDATE users SET jobid=%d WHERE name = '%s'" % (retJobID,name) - self.cur.execute(sql) - _logger.debug("getUserParameter set JobID=%s for %s" % (retJobID,dn)) - return retJobID,retStatus - except: - 
type, value, traceBack = sys.exc_info() - _logger.error("getUserParameter : %s %s" % (type,value)) - # roll back - self._rollback() - return jobID,True - - - # get email address for a user - def getEmailAddr(self,name): - _logger.debug("get email for %s" % name) - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = "SELECT email FROM users WHERE name='%s'" % name - self.cur.execute(sql) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if res != None and len(res) != 0: - return res[0][0] - # return empty string - return "" - except: - type, value, traceBack = sys.exc_info() - _logger.error("getEmailAddr : %s %s" % (type,value)) - # roll back - self._rollback() - return "" - - - # register proxy key - def registerProxyKey(self,params): - _logger.debug("register ProxyKey %s" % str(params)) - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # construct SQL - sql0 = 'INSERT INTO proxykey (' - sql1 = 'VALUES (' - vals = [] - for key,val in params.iteritems(): - sql0 += '%s,' % key - sql1 += '%s,' - vals.append(val) - sql0 = sql0[:-1] - sql1 = sql1[:-1] - sql = sql0 + ') ' + sql1 + ') ' - # insert - self.cur.execute(sql,tuple(vals)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return True - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("registerProxyKey : %s %s" % (type,value)) - # roll back - self._rollback() - return "" - - - # get proxy key - def getProxyKey(self,dn): - _logger.debug("get ProxyKey %s" % dn) - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # construct SQL - sql = 'SELECT credname,expires,origin,myproxy FROM proxykey WHERE dn=%s ORDER BY expires DESC' - # select - self.cur.execute(sql,(dn,)) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - retMap = {} - if res != None and len(res) != 0: - 
credname,expires,origin,myproxy = res[0] - retMap['credname'] = credname - retMap['expires'] = expires - retMap['origin'] = origin - retMap['myproxy'] = myproxy - _logger.debug(retMap) - return retMap - except: - type, value, traceBack = sys.exc_info() - _logger.error("getProxyKey : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # check site access - def checkSiteAccess(self,siteid,dn): - comment = ' /* LogDBProxy.checkSiteAccess */' - _logger.debug("checkSiteAccess %s:%s" % (siteid,dn)) - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # construct SQL - sql = 'SELECT poffset,rights,status FROM siteaccess WHERE dn=%s AND pandasite=%s' - # select - self.cur.execute(sql+comment,(dn,siteid)) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - retMap = {} - if res != None and len(res) != 0: - poffset,rights,status = res[0] - retMap['poffset'] = poffset - retMap['rights'] = rights - retMap['status'] = status - _logger.debug(retMap) - return retMap - except: - type, value, traceBack = sys.exc_info() - _logger.error("checkSiteAccess : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # add account to siteaccess - def addSiteAccess(self,siteID,dn): - comment = ' /* LogDBProxy.addSiteAccess */' - _logger.debug("addSiteAccess : %s %s" % (siteID,dn)) - try: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = 'SELECT status FROM siteaccess WHERE dn=%s AND pandasite=%s' - self.cur.execute(sql+comment, (dn,siteID)) - res = self.cur.fetchone() - if res != None: - _logger.debug("account already exists with status=%s" % res[0]) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return res[0] - # add - sql = 'INSERT INTO siteaccess (dn,pandasite,status) VALUES (%s,%s,%s)' - self.cur.execute(sql+comment, (dn,siteID,'requested')) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - 
_logger.debug("account was added") - return 0 - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("addSiteAccess( : %s %s" % (type,value)) - # return None - return -1 - - - # list site access - def listSiteAccess(self,siteid=None,dn=None): - comment = ' /* LogDBProxy.listSiteAccess */' - _logger.debug("listSiteAccess %s:%s" % (siteid,dn)) - try: - if siteid==None and dn==None: - return [] - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # construct SQL - if siteid != None: - varMap = (siteid,) - sql = 'SELECT dn,status FROM siteaccess WHERE pandasite=%s ORDER BY dn' - else: - varMap = (dn,) - sql = 'SELECT pandasite,status FROM siteaccess WHERE dn=%s ORDER BY pandasite' - # select - self.cur.execute(sql+comment,varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - ret = [] - if res != None and len(res) != 0: - for tmpRes in res: - ret.append(tmpRes) - _logger.debug(ret) - return ret - except: - type, value, traceBack = sys.exc_info() - _logger.error("listSiteAccess : %s %s" % (type,value)) - # roll back - self._rollback() - return [] - - - # get list of archived tables - def getArchiveTables(self): - tables = [] - cdate = datetime.datetime.utcnow() - for iCycle in range(2): # 2 = (1 months + 2 just in case)/2 - if cdate.month==1: - cdate = cdate.replace(year = (cdate.year-1)) - cdate = cdate.replace(month = 12, day = 1) - else: - cdate = cdate.replace(month = (cdate.month/2)*2, day = 1) - tableName = "jobsArchived_%s%s" % (cdate.strftime('%b'),cdate.year) - if not tableName in tables: - tables.append(tableName) - # one older table - if cdate.month > 2: - cdate = cdate.replace(month = (cdate.month-2)) - else: - cdate = cdate.replace(year = (cdate.year-1), month = 12) - # return - return tables - - - # get JobIDs in a time range - def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs): - comment = ' /* LogDBProxy.getJobIDsInTimeRange */' - 
_logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) - try: - # get list of archived tables - tables = self.getArchiveTables() - # select - for table in tables: - # make sql - sql = "SELECT jobDefinitionID FROM %s " % table - sql += "WHERE prodUserID=%s AND modificationTime>%s AND prodSourceLabel='user'" - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - _logger.debug(sql+comment+str((dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))) - self.cur.execute(sql+comment, (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for tmpID, in resList: - if not tmpID in retJobIDs: - retJobIDs.append(tmpID) - _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs)) - return retJobIDs - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getJobIDsInTimeRange : %s %s" % (type,value)) - # return empty list - return retJobIDs - - - # get PandaIDs for a JobID - def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs): - comment = ' /* LogProxy.getPandIDsWithJobID */' - _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID)) - try: - # get list of archived tables - tables = self.getArchiveTables() - # select - for table in tables: - # skip if all jobs have already been gotten - if nJobs > 0 and len(idStatus) >= nJobs: - continue - # make sql - sql = "SELECT PandaID,jobStatus,commandToPilot FROM %s " % table - sql += "WHERE prodUserID=%s AND jobDefinitionID=%s " - sql += "AND prodSourceLabel in ('user','panda') " - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - _logger.debug(sql+comment+str((dn,jobID))) - self.cur.execute(sql+comment, (dn,jobID)) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for tmpID,tmpStatus,tmpCommand in resList: - if not idStatus.has_key(tmpID): - 
idStatus[tmpID] = (tmpStatus,tmpCommand) - _logger.debug("getPandIDsWithJobID : %s" % str(idStatus)) - return idStatus - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getPandIDsWithJobID : %s %s" % (type,value)) - # return empty list - return {} - - - # peek at job - def peekJob(self,pandaID): - comment = ' /* LogDBProxy.peekJob */' - _logger.debug("peekJob : %s" % pandaID) - # return None for NULL PandaID - if pandaID in ['NULL','','None',None]: - return None - sql1_0 = "SELECT %s FROM %s " - sql1_1 = "WHERE PandaID=%s" - try: - # get list of archived tables - tables = self.getArchiveTables() - # select - for table in tables: - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1 - self.cur.execute(sql+comment, (pandaID,)) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if len(res) != 0: - # Job - job = JobSpec() - job.pack(res[0]) - # Files - # set autocommit on - self.cur.execute("SET AUTOCOMMIT=1") - # select - fileTableName = re.sub('jobsArchived','filesTable',table) - sqlFile = "SELECT %s " % FileSpec.columnNames() - sqlFile+= "FROM %s " % fileTableName - sqlFile+= "WHERE PandaID=%s" - self.cur.execute(sqlFile+comment, (job.PandaID,)) - resFs = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # set files - for resF in resFs: - file = FileSpec() - file.pack(resF) - job.addFile(file) - return job - _logger.debug("peekJob() : PandaID %s not found" % pandaID) - return None - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("peekJob : %s %s" % (type,value)) - # return None - return None - - - # wake up connection - def wakeUp(self): - for iTry in range(5): - try: - # check if the connection is working - self.conn.ping() - return - except: - type, value, traceBack = sys.exc_info() - 
_logger.debug("wakeUp %d : %s %s" % (iTry,type,value)) - # wait for reconnection - time.sleep(1) - self.connect(reconnect=True) - - - # close - def close(self): - try: - self.cur.close() - self.conn.close() - except: - type, value, traceBack = sys.exc_info() - _logger.error("close : %s %s" % (type,value)) - - - # commit - def _commit(self): - try: - self.conn.commit() - return True - except: - _logger.error("commit error") - return False - - - # rollback - def _rollback(self): - try: - self.conn.rollback() - return True - except: - _logger.error("rollback error") - return False - diff --git a/current/pandaserver/taskbuffer/LogDBProxyPool.py b/current/pandaserver/taskbuffer/LogDBProxyPool.py deleted file mode 100755 index c9f986741..000000000 --- a/current/pandaserver/taskbuffer/LogDBProxyPool.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -pool for LogDBProxies - -""" - -import time -import Queue -import random -import OraLogDBProxy as LogDBProxy -from config import panda_config - -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('LogDBProxyPool') - -class LogDBProxyPool: - - def __init__(self,nConnection=panda_config.nLogDBConnection): - # create Proxies - _logger.debug("init") - self.proxyList = Queue.Queue(nConnection) - for i in range(nConnection): - _logger.debug("connect -> %s " % i) - proxy = LogDBProxy.LogDBProxy() - nTry = 10 - for iTry in range(nTry): - if proxy.connect(): - break - _logger.debug("failed -> %s : try %s" % (i,iTry)) - if iTry+1 == nTry: - raise RuntimeError, 'LogDBProxyPool.__init__ failed' - time.sleep(random.randint(10,20)) - self.proxyList.put(proxy) - time.sleep(1) - _logger.debug("ready") - - # return a free proxy. 
this method blocks until a proxy is available - def getProxy(self): - # get proxy - proxy = self.proxyList.get() - # wake up connection - proxy.wakeUp() - # return - return proxy - - - # put back a proxy - def putProxy(self,proxy): - # put - self.proxyList.put(proxy) - diff --git a/current/pandaserver/taskbuffer/MemProxy.py b/current/pandaserver/taskbuffer/MemProxy.py deleted file mode 100644 index 025b6a815..000000000 --- a/current/pandaserver/taskbuffer/MemProxy.py +++ /dev/null @@ -1,205 +0,0 @@ -# proxy for memcached - -import sys - -from config import panda_config - -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('MemProxy') - - -# proxy -class MemProxy: - - # constructor - def __init__(self): - try: - import memcache - # initialize memcached client - _logger.debug("initialize memcache client with %s" % panda_config.memcached_srvs) - self.mclient = memcache.Client(panda_config.memcached_srvs.split(',')) - # server statistics - _logger.debug(self.mclient.get_stats()) - _logger.debug("memcache client is ready") - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("failed to initialize memcach client : %s %s" % (errType,errValue)) - - - # insert files - def setFiles(self,pandaID,site,node,files): - try: - _logger.debug("setFiles site=%s node=%s start" % (site,node)) - # key prefix - keyPrefix = self.getKeyPrefix(site,node) - # failed to get key prefix - if keyPrefix == None: - _logger.error("setFiles failed to get key prefix") - return False - # loop over all files - varMap = {} - for tmpFile in files: - newKey = tmpFile - varMap[newKey] = True - # bulk insert - failedList = self.mclient.set_multi(varMap,time=panda_config.memcached_exptime, - key_prefix=keyPrefix) - # failed - if failedList != []: - _logger.error("setFiles failed to insert %s values for site=%s node=%s" % \ - (len(failedList),site,node)) - return False - _logger.debug("setFiles site=%s node=%s completed" % (site,node)) - return True - 
except: - errType,errValue = sys.exc_info()[:2] - _logger.error("setFiles failed with %s %s" % (errType,errValue)) - return False - - - # delete files - def deleteFiles(self,site,node,files): - try: - fileList = files.split(',') - # remove '' - try: - fileList.remove('') - except: - pass - _logger.debug("deleteFiles for %s:%s:%s start" % (site,node,len(fileList))) - # empty list - if len(fileList) == 0: - _logger.debug("deleteFiles skipped for empty list") - return True - # key prefix - keyPrefix = self.getKeyPrefix(site,node) - # non-existing key - if keyPrefix == None: - _logger.debug("deleteFiles skipped for non-existing key") - return True - # get the number of bunches - nKeys = 100 - tmpDiv,tmpMod = divmod(len(fileList),nKeys) - if tmpMod != 0: - tmpDiv += 1 - # loop over all bunches - retMap = {True:0,False:0} - for idxB in range(tmpDiv): - # delete - retD = self.mclient.delete_multi(fileList[idxB*nKeys:(idxB+1)*nKeys], - key_prefix=keyPrefix) - if retD == 1: - retMap[True] += 1 - else: - retMap[False] += 1 - # failed - if retMap[False] != 0: - _logger.error("deleteFiles failed %s/%s" % (retMap[False], - retMap[True]+retMap[False])) - return False - _logger.debug("deleteFiles succeeded") - return True - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("deleteFiles failed with %s %s" % (errType,errValue)) - return False - - - # check files - def checkFiles(self,pandaID,files,site,node,keyPrefix='',getDetail=False): - try: - _logger.debug("checkFiles PandaID=%s with %s:%s start" % (pandaID,site,node)) - # get key prefix - if keyPrefix == '': - keyPrefix = self.getKeyPrefix(site,node) - # non-existing key - if keyPrefix == None: - _logger.debug("checkFiles PandaID=%s with %s:%s doesn't exist" % \ - (pandaID,site,node)) - return 0 - # loop over all files - keyList = [] - for tmpFile in files: - newKey = tmpFile - if not newKey in keyList: - keyList.append(newKey) - # bulk get - retMap = self.mclient.get_multi(keyList,key_prefix=keyPrefix) - 
_logger.debug("checkFiles PandaID=%s with %s:%s has %s files" % \ - (pandaID,site,node,len(retMap))) - # return detailed string - if getDetail: - retStr = '' - for tmpFile in files: - if retMap.has_key(tmpFile): - retStr += '1,' - else: - retStr += '0,' - retStr = retStr[:-1] - return retStr - # return number of files - return len(retMap) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("checkFiles failed with %s %s" % (errType,errValue)) - return 0 - - - # flush files - def flushFiles(self,site,node): - try: - _logger.debug("flushFiles for %s:%s start" % (site,node)) - # key prefix stored in memcached - keyPrefix = self.getInternalKeyPrefix(site,node) - # increment - serNum = self.mclient.incr(keyPrefix) - # return if not exist - if serNum == None: - _logger.debug("flushFiles skipped for non-existing key") - return True - # avoid overflow - if serNum > 1024: - serNum = 0 - # set - retS = self.mclient.set(keyPrefix,serNum,time=panda_config.memcached_exptime) - if retS == 0: - # failed - _logger.error("flushFiles failed to set new SN") - return False - _logger.error("flushFiles completed") - return True - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("flushFiles failed with %s %s" % (errType,errValue)) - return False - - - # get internal key prefix - def getInternalKeyPrefix(self,site,node): - # get short WN name - shortWN = node.split('.')[0] - # key prefix stored in memcached - keyPrefix = '%s_%s' % (site,shortWN) - return keyPrefix - - - # get key prefix - def getKeyPrefix(self,site,node): - # key prefix stored in memcached - keyPrefix = self.getInternalKeyPrefix(site,node) - # get serial number from memcached - serNum = self.mclient.get(keyPrefix) - # use 0 if not exist - if serNum == None: - serNum = 0 - # set to avoid expiration - retS = self.mclient.set(keyPrefix,serNum,time=panda_config.memcached_exptime) - if retS == 0: - # failed - return None - else: - # return prefix site_node_sn_ - newPrefix = '%s_%s' % 
(keyPrefix,serNum) - return newPrefix diff --git a/current/pandaserver/taskbuffer/OraDBProxy.py b/current/pandaserver/taskbuffer/OraDBProxy.py deleted file mode 100755 index 1295ef360..000000000 --- a/current/pandaserver/taskbuffer/OraDBProxy.py +++ /dev/null @@ -1,10739 +0,0 @@ -""" -proxy for database connection - -""" - -import re -import os -import sys -import time -import fcntl -import types -import random -import urllib -import socket -import datetime -import commands -import traceback -import warnings -import cx_Oracle -import ErrorCode -import SiteSpec -import CloudSpec -import PrioUtil -import ProcessGroups -from JobSpec import JobSpec -from FileSpec import FileSpec -from DatasetSpec import DatasetSpec -from CloudTaskSpec import CloudTaskSpec -from pandalogger.PandaLogger import PandaLogger -from config import panda_config -from brokerage.PandaSiteIDs import PandaSiteIDs - -warnings.filterwarnings('ignore') - -# logger -_logger = PandaLogger().getLogger('DBProxy') - -# lock file -_lockGetSN = open(panda_config.lockfile_getSN, 'w') -_lockSetDS = open(panda_config.lockfile_setDS, 'w') -_lockGetCT = open(panda_config.lockfile_getCT, 'w') - - -# proxy -class DBProxy: - - # constructor - def __init__(self,useOtherError=False): - # connection object - self.conn = None - # cursor object - self.cur = None - # host name - self.hostname = None - # retry count - self.nTry = 5 - # use special error codes for reconnection in querySQL - self.useOtherError = useOtherError - # memcached client - self.memcache = None - # pledge resource ratio - self.beyondPledgeRatio = {} - # update time for pledge resource ratio - self.updateTimeForPledgeRatio = None - # fareshare policy - self.faresharePolicy = {} - # update time for fareshare policy - self.updateTimeForFaresharePolicy = None - # hostname - self.myHostName = socket.getfqdn() - - - # connect to DB - def connect(self,dbhost=panda_config.dbhost,dbpasswd=panda_config.dbpasswd, - 
dbuser=panda_config.dbuser,dbname=panda_config.dbname, - dbtimeout=None,reconnect=False): - _logger.debug("connect : re=%s" % reconnect) - # keep parameters for reconnect - if not reconnect: - self.dbhost = dbhost - self.dbpasswd = dbpasswd - self.dbuser = dbuser - self.dbname = dbname - self.dbtimeout = dbtimeout - # close old connection - if reconnect: - _logger.debug("closing old connection") - try: - self.conn.close() - except: - _logger.debug("failed to close old connection") - # connect - try: - self.conn = cx_Oracle.connect(dsn=self.dbhost,user=self.dbuser, - password=self.dbpasswd,threaded=True) - self.cur=self.conn.cursor() - try: - # use SQL dumper - if panda_config.dump_sql: - import SQLDumper - self.cur = SQLDumper.SQLDumper(self.cur) - except: - pass - # get hostname - self.cur.execute("SELECT SYS_CONTEXT('USERENV','HOST') FROM dual") - res = self.cur.fetchone() - if res != None: - self.hostname = res[0] - # set TZ - self.cur.execute("ALTER SESSION SET TIME_ZONE='UTC'") - # set DATE format - self.cur.execute("ALTER SESSION SET NLS_DATE_FORMAT='YYYY/MM/DD HH24:MI:SS'") - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("connect : %s %s" % (type,value)) - return False - - - # query an SQL - def querySQL(self,sql,arraySize=1000): - comment = ' /* DBProxy.querySQL */' - try: - _logger.debug("querySQL : %s " % sql) - # begin transaction - self.conn.begin() - self.cur.arraysize = arraySize - self.cur.execute(sql+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return res - except: - # roll back - self._rollback(self.useOtherError) - type, value, traceBack = sys.exc_info() - _logger.error("querySQL : %s " % sql) - _logger.error("querySQL : %s %s" % (type,value)) - return None - - - # query an SQL return Status - def querySQLS(self,sql,varMap,arraySize=1000): - comment = ' /* DBProxy.querySQLS */' - try: - # begin transaction - self.conn.begin() - self.cur.arraysize = 
arraySize - ret = self.cur.execute(sql+comment,varMap) - if sql.startswith('INSERT') or sql.startswith('UPDATE') or \ - sql.startswith('DELETE'): - res = self.cur.rowcount - else: - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return ret,res - except: - # roll back - self._rollback(self.useOtherError) - type, value, traceBack = sys.exc_info() - _logger.error("querySQLS : %s %s" % (sql,str(varMap))) - _logger.error("querySQLS : %s %s" % (type,value)) - return -1,None - - - # get CLOB - def getClobObj(self,sql,varMap,arraySize=10000): - comment = ' /* DBProxy.getClobObj */' - try: - # begin transaction - self.conn.begin() - self.cur.arraysize = arraySize - ret = self.cur.execute(sql+comment,varMap) - res = [] - for items in self.cur: - resItem = [] - for item in items: - # read CLOB - resItem.append(item.read()) - # append - res.append(resItem) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return ret,res - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getClobObj : %s %s" % (sql,str(varMap))) - _logger.error("getClobObj : %s %s" % (type,value)) - return -1,None - - - # insert job to jobsDefined - def insertNewJob(self,job,user,serNum,weight=0.0,priorityOffset=0,userVO=None,groupJobSN=0,toPending=False): - comment = ' /* DBProxy.insertNewJob */' - if not toPending: - sql1 = "INSERT INTO ATLAS_PANDA.jobsDefined4 (%s) " % JobSpec.columnNames() - else: - sql1 = "INSERT INTO ATLAS_PANDA.jobsWaiting4 (%s) " % JobSpec.columnNames() - sql1+= JobSpec.bindValuesExpression(useSeq=True) - sql1+= " RETURNING PandaID INTO :newPandaID" - # make sure PandaID is NULL - job.PandaID = None - # job status - if not toPending: - job.jobStatus='defined' - else: - job.jobStatus='pending' - # host and time information - job.modificationHost = self.hostname - job.creationTime = datetime.datetime.utcnow() - job.modificationTime = job.creationTime - 
job.stateChangeTime = job.creationTime - job.prodDBUpdateTime = datetime.datetime(1,1,1) - # DN - if job.prodUserID == "NULL" or job.prodSourceLabel in ['user','panda']: - job.prodUserID = user - # compact user name - job.prodUserName = self.cleanUserID(job.prodUserID) - if job.prodUserName in ['','NULL']: - # use prodUserID as compact user name - job.prodUserName = job.prodUserID - # VO - job.VO = userVO - # priority - if job.assignedPriority != 'NULL': - job.currentPriority = job.assignedPriority - if job.prodSourceLabel == 'install': - job.currentPriority = 4100 - elif job.prodSourceLabel == 'user': - if job.processingType in ['usermerge'] and not job.currentPriority in ['NULL',None]: - # avoid prio reduction for merge jobs - pass - else: - job.currentPriority = PrioUtil.calculatePriority(priorityOffset,serNum,weight) - if 'express' in job.specialHandling: - job.currentPriority = 6000 - elif job.prodSourceLabel == 'panda': - job.currentPriority = 2000 + priorityOffset - if 'express' in job.specialHandling: - job.currentPriority = 6500 - # usergroup - if job.prodSourceLabel == 'regional': - job.computingSite= "BNLPROD" - # group job SN - groupJobSN = "%05d" % groupJobSN - # set attempt numbers - if job.prodSourceLabel in ['user','panda','ptest','rc_test']: - if job.attemptNr in [None,'NULL','']: - job.attemptNr = 0 - if job.maxAttempt in [None,'NULL','']: - job.maxAttempt = 0 - # set maxAttempt to attemptNr to disable server/pilot retry - if job.maxAttempt == -1: - job.maxAttempt = job.attemptNr - else: - # set maxAttempt to have server/pilot retries for retried jobs - if job.maxAttempt <= job.attemptNr: - job.maxAttempt = job.attemptNr + 2 - try: - # begin transaction - self.conn.begin() - # insert - varMap = job.valuesMap(useSeq=True) - varMap[':newPandaID'] = self.cur.var(cx_Oracle.NUMBER) - retI = self.cur.execute(sql1+comment, varMap) - # set PandaID - job.PandaID = long(varMap[':newPandaID'].getvalue()) - # get jobsetID - if job.jobsetID in 
[None,'NULL',-1]: - jobsetID = 0 - else: - jobsetID = job.jobsetID - jobsetID = '%06d' % jobsetID - # reset changed attribute list - job.resetChangedList() - # insert files - _logger.debug("insertNewJob : %s Label:%s prio:%s" % (job.PandaID,job.prodSourceLabel, - job.currentPriority)) - sqlFile = "INSERT INTO ATLAS_PANDA.filesTable4 (%s) " % FileSpec.columnNames() - sqlFile+= FileSpec.bindValuesExpression(useSeq=True) - sqlFile+= " RETURNING row_ID INTO :newRowID" - for file in job.Files: - file.row_ID = None - if file.status != 'ready': - file.status='unknown' - # replace $PANDAID with real PandaID - file.lfn = re.sub('\$PANDAID', '%05d' % job.PandaID, file.lfn) - # replace $JOBSETID with real jobsetID - if not job.prodSourceLabel in ['managed']: - file.lfn = re.sub('\$JOBSETID', jobsetID, file.lfn) - file.lfn = re.sub('\$GROUPJOBSN', groupJobSN, file.lfn) - # set scope - if file.type in ['output','log'] and job.VO in ['atlas']: - file.scope = self.extractScope(file.dataset) - # insert - varMap = file.valuesMap(useSeq=True) - varMap[':newRowID'] = self.cur.var(cx_Oracle.NUMBER) - self.cur.execute(sqlFile+comment, varMap) - # get rowID - file.row_ID = long(varMap[':newRowID'].getvalue()) - # reset changed attribute list - file.resetChangedList() - # metadata - if job.prodSourceLabel in ['user','panda'] and job.metadata != '': - sqlMeta = "INSERT INTO ATLAS_PANDA.metaTable (PandaID,metaData) VALUES (:PandaID,:metaData)" - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':metaData'] = job.metadata - self.cur.execute(sqlMeta+comment, varMap) - # job parameters - if not job.prodSourceLabel in ['managed']: - job.jobParameters = re.sub('\$JOBSETID', jobsetID, job.jobParameters) - job.jobParameters = re.sub('\$GROUPJOBSN', groupJobSN, job.jobParameters) - sqlJob = "INSERT INTO ATLAS_PANDA.jobParamsTable (PandaID,jobParameters) VALUES (:PandaID,:param)" - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':param'] = job.jobParameters - 
self.cur.execute(sqlJob+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("insertNewJob : %s File OK" % job.PandaID) - # record status change - try: - self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) - except: - _logger.error('recordStatusChange in insertNewJob') - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("insertNewJob : %s %s" % (type,value)) - # roll back - self._rollback() - return False - - - # simply insert job to a table - def insertJobSimple(self,job,table,fileTable,jobParamsTable,metaTable): - comment = ' /* DBProxy.insertJobSimple */' - _logger.debug("insertJobSimple : %s" % job.PandaID) - sql1 = "INSERT INTO %s (%s) " % (table,JobSpec.columnNames()) - sql1+= JobSpec.bindValuesExpression() - try: - # begin transaction - self.conn.begin() - # insert - self.cur.execute(sql1+comment, job.valuesMap()) - # files - sqlFile = "INSERT INTO %s " % fileTable - sqlFile+= "(%s) " % FileSpec.columnNames(withMod=True) - sqlFile+= FileSpec.bindValuesExpression(withMod=True) - for file in job.Files: - varMap = file.valuesMap() - varMap[':modificationTime'] = job.modificationTime - self.cur.execute(sqlFile+comment, varMap) - # job parameters - sqlJob = "INSERT INTO %s (PandaID,jobParameters,modificationTime) VALUES (:PandaID,:param,:modificationTime)" \ - % jobParamsTable - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':param'] = job.jobParameters - varMap[':modificationTime'] = job.modificationTime - self.cur.execute(sqlJob+comment, varMap) - # metadata - if not job.metadata in [None,'NULL','']: - sqlMeta = "INSERT INTO %s (PandaID,metaData,modificationTime) VALUES(:PandaID,:metaData,:modificationTime)" \ - % metaTable - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':metaData'] = job.metadata - varMap[':modificationTime'] = job.modificationTime - self.cur.execute(sqlMeta+comment,varMap) - # set flag to avoid duplicated insertion attempts - 
varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':archivedFlag'] = 1 - sqlArch = "UPDATE ATLAS_PANDA.jobsArchived4 SET archivedFlag=:archivedFlag WHERE PandaID=:PandaID" - self.cur.execute(sqlArch+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("insertJobSimple : %s %s" % (type,value)) - # roll back - self._rollback() - return False - - - # simply insert job to a table without reading - def insertJobSimpleUnread(self,pandaID,modTime): - comment = ' /* DBProxy.insertJobSimpleUnread */' - _logger.debug("insertJobSimpleUnread : %s" % pandaID) - # check - sqlC = "SELECT archivedFlag FROM ATLAS_PANDA.jobsArchived4 " - sqlC += "WHERE PandaID=:pandaID " - # job - sqlJ = "INSERT INTO ATLAS_PANDAARCH.jobsArchived (%s) " % JobSpec.columnNames() - sqlJ += "SELECT %s FROM ATLAS_PANDA.jobsArchived4 " % JobSpec.columnNames() - sqlJ += "WHERE PandaID=:pandaID " - # file - sqlF = "INSERT INTO ATLAS_PANDAARCH.filesTable_ARCH (%s) " % FileSpec.columnNames(withMod=True) - sqlF += "SELECT %s,:modTime FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames(withMod=False) - sqlF += "WHERE PandaID=:pandaID " - # parameters - sqlP = "INSERT INTO ATLAS_PANDAARCH.jobParamsTable_ARCH (PandaID,jobParameters,modificationTime) " - sqlP += "SELECT PandaID,jobParameters,:modTime FROM ATLAS_PANDA.jobParamsTable " - sqlP += "WHERE PandaID=:pandaID " - # metadata - sqlM1 = "SELECT PandaID FROM ATLAS_PANDA.metaTable " - sqlM1 += "WHERE PandaID=:pandaID AND rownum<=1 " - sqlM2 = "INSERT INTO ATLAS_PANDAARCH.metaTable_ARCH (PandaID,metaData,modificationTime) " - sqlM2 += "SELECT PandaID,metaData,:modTime FROM ATLAS_PANDA.metaTable " - sqlM2 += "WHERE PandaID=:pandaID " - try: - # begin transaction - self.conn.begin() - # check - varMap = {} - varMap[':pandaID'] = pandaID - self.cur.execute(sqlC+comment,varMap) - res = self.cur.fetchone() - if res == None or res[0] == 1: - if 
res == None: - _logger.error("insertJobSimpleUnread : %s cannot get archivedFlag" % pandaID) - else: - _logger.debug("insertJobSimpleUnread : %s skip" % pandaID) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - # insert - varMap = {} - varMap[':pandaID'] = pandaID - self.cur.execute(sqlJ+comment,varMap) - varMap = {} - varMap[':pandaID'] = pandaID - varMap[':modTime'] = modTime - self.cur.execute(sqlF+comment,varMap) - varMap = {} - varMap[':pandaID'] = pandaID - varMap[':modTime'] = modTime - self.cur.execute(sqlP+comment,varMap) - varMap = {} - varMap[':pandaID'] = pandaID - self.cur.execute(sqlM1+comment,varMap) - res = self.cur.fetchone() - if res != None: - varMap = {} - varMap[':pandaID'] = pandaID - varMap[':modTime'] = modTime - self.cur.execute(sqlM2+comment,varMap) - # set flag to avoid duplicated insertion attempts - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':archivedFlag'] = 1 - sqlArch = "UPDATE ATLAS_PANDA.jobsArchived4 SET archivedFlag=:archivedFlag WHERE PandaID=:PandaID" - self.cur.execute(sqlArch+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("insertJobSimpleUnread %s : %s %s" % (pandaID,type,value)) - # roll back - self._rollback() - return False - - - # delete job - def deleteJobSimple(self,pandaID): - comment = ' /* DBProxy.deleteJobSimple */' - _logger.debug("deleteJobSimple : %s" % pandaID) - try: - # begin transaction - self.conn.begin() - # delete - varMap = {} - varMap[':PandaID'] = pandaID - sql = 'DELETE from ATLAS_PANDA.jobsArchived4 WHERE PandaID=:PandaID' - self.cur.execute(sql+comment, varMap) - sql = "DELETE from ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID" - self.cur.execute(sql+comment, varMap) - sql = "DELETE from ATLAS_PANDA.metaTable WHERE PandaID=:PandaID" - self.cur.execute(sql+comment, varMap) - sql = "DELETE from ATLAS_PANDA.jobParamsTable WHERE 
PandaID=:PandaID" - self.cur.execute(sql+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - type, value = sys.exc_info()[:2] - _logger.error("deleteJobSimple %s : %s %s" % (pandaID,type,value)) - # roll back - self._rollback() - return False - - - # activate job. move job from jobsDefined to jobsActive - def activateJob(self,job): - comment = ' /* DBProxy.activateJob */' - updatedFlag = False - if job==None: - _logger.debug("activateJob : None") - return True - _logger.debug("activateJob : %s" % job.PandaID) - sql0 = "SELECT row_ID FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type AND NOT status IN (:status1,:status2) " - sql1 = "DELETE FROM ATLAS_PANDA.jobsDefined4 " - sql1+= "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2) AND commandToPilot IS NULL" - sql2 = "INSERT INTO ATLAS_PANDA.jobsActive4 (%s) " % JobSpec.columnNames() - sql2+= JobSpec.bindValuesExpression() - # host and time information - job.modificationTime = datetime.datetime.utcnow() - # set stateChangeTime for defined->activated but not for assigned->activated - if job.jobStatus in ['defined']: - job.stateChangeTime = job.modificationTime - nTry=3 - for iTry in range(nTry): - try: - # check if all files are ready - allOK = True - for file in job.Files: - if file.type == 'input' and not file.status in ['ready','cached']: - allOK = False - break - # begin transaction - self.conn.begin() - # check all inputs are ready - varMap = {} - varMap[':type'] = 'input' - varMap[':status1'] = 'ready' - varMap[':status2'] = 'cached' - varMap[':PandaID'] = job.PandaID - self.cur.arraysize = 100 - self.cur.execute(sql0+comment, varMap) - res = self.cur.fetchall() - if len(res) == 0 or allOK: - # change status - job.jobStatus = "activated" - # delete - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':oldJobStatus1'] = 'assigned' - varMap[':oldJobStatus2'] = 'defined' - 
self.cur.execute(sql1+comment, varMap) - n = self.cur.rowcount - if n==0: - # already killed or activated - _logger.debug("activateJob : Not found %s" % job.PandaID) - else: - # insert - self.cur.execute(sql2+comment, job.valuesMap()) - # update files - for file in job.Files: - sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" - varMap = file.valuesMap(onlyChanged=True) - if varMap != {}: - varMap[':row_ID'] = file.row_ID - _logger.debug(sqlF+comment+str(varMap)) - self.cur.execute(sqlF+comment, varMap) - # job parameters - sqlJob = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':param'] = job.jobParameters - self.cur.execute(sqlJob+comment, varMap) - updatedFlag = True - else: - # update job - sqlJ = ("UPDATE ATLAS_PANDA.jobsDefined4 SET %s " % job.bindUpdateChangesExpression()) + \ - "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)" - varMap = job.valuesMap(onlyChanged=True) - varMap[':PandaID'] = job.PandaID - varMap[':oldJobStatus1'] = 'assigned' - varMap[':oldJobStatus2'] = 'defined' - _logger.debug(sqlJ+comment+str(varMap)) - self.cur.execute(sqlJ+comment, varMap) - n = self.cur.rowcount - if n==0: - # already killed or activated - _logger.debug("activateJob : Not found %s" % job.PandaID) - else: - # update files - for file in job.Files: - sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" - varMap = file.valuesMap(onlyChanged=True) - if varMap != {}: - varMap[':row_ID'] = file.row_ID - _logger.debug(sqlF+comment+str(varMap)) - self.cur.execute(sqlF+comment, varMap) - # job parameters - sqlJob = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':param'] = job.jobParameters - self.cur.execute(sqlJob+comment, varMap) - 
updatedFlag = True - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # record status change - try: - if updatedFlag: - self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) - except: - _logger.error('recordStatusChange in activateJob') - return True - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("activateJob : %s retry : %s" % (job.PandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("activateJob : %s %s" % (type,value)) - return False - - - # send job to jobsWaiting - def keepJob(self,job): - comment = ' /* DBProxy.keepJob */' - _logger.debug("keepJob : %s" % job.PandaID) - sql1 = "DELETE FROM ATLAS_PANDA.jobsDefined4 " - sql1+= "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2) AND commandToPilot IS NULL" - sql2 = "INSERT INTO ATLAS_PANDA.jobsWaiting4 (%s) " % JobSpec.columnNames() - sql2+= JobSpec.bindValuesExpression() - # time information - job.modificationTime = datetime.datetime.utcnow() - job.stateChangeTime = job.modificationTime - updatedFlag = False - nTry=3 - for iTry in range(nTry): - try: - # begin transaction - self.conn.begin() - # delete - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':oldJobStatus1'] = 'assigned' - varMap[':oldJobStatus2'] = 'defined' - self.cur.execute(sql1+comment, varMap) - n = self.cur.rowcount - if n==0: - # already killed - _logger.debug("keepJob : Not found %s" % job.PandaID) - else: - # set status - job.jobStatus = 'waiting' - # insert - self.cur.execute(sql2+comment, job.valuesMap()) - # update files - for file in job.Files: - sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" - varMap = file.valuesMap(onlyChanged=True) - if varMap != {}: - varMap[':row_ID'] = file.row_ID - _logger.debug(sqlF+comment+str(varMap)) - self.cur.execute(sqlF+comment, varMap) - updatedFlag = True - # commit - if 
not self._commit(): - raise RuntimeError, 'Commit error' - # record status change - try: - if updatedFlag: - self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) - except: - _logger.error('recordStatusChange in keepJob') - return True - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("keepJob : %s retry : %s" % (job.PandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("keepJob : %s %s" % (type,value)) - return False - - - # archive job to jobArchived and remove the job from jobsActive or jobsDefined - def archiveJob(self,job,fromJobsDefined): - comment = ' /* DBProxy.archiveJob */' - _logger.debug("archiveJob : %s" % job.PandaID) - if fromJobsDefined: - sql1 = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)" - else: - sql1 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID" - sql2 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() - sql2+= JobSpec.bindValuesExpression() - updatedJobList = [] - nTry=3 - for iTry in range(nTry): - try: - # begin transaction - self.conn.begin() - # delete - varMap = {} - varMap[':PandaID'] = job.PandaID - if fromJobsDefined: - varMap[':oldJobStatus1'] = 'assigned' - varMap[':oldJobStatus2'] = 'defined' - self.cur.execute(sql1+comment, varMap) - n = self.cur.rowcount - if n==0: - # already killed - _logger.debug("archiveJob : Not found %s" % job.PandaID) - else: - # insert - job.modificationTime = datetime.datetime.utcnow() - job.stateChangeTime = job.modificationTime - if job.endTime == 'NULL': - job.endTime = job.modificationTime - self.cur.execute(sql2+comment, job.valuesMap()) - # update files - for file in job.Files: - sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" - varMap = file.valuesMap(onlyChanged=True) - if varMap != {}: - varMap[':row_ID'] = file.row_ID - 
_logger.debug(sqlF+comment+str(varMap)) - self.cur.execute(sqlF+comment, varMap) - # update metadata and parameters - sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':modificationTime'] = job.modificationTime - self.cur.execute(sqlFMod+comment,varMap) - self.cur.execute(sqlMMod+comment,varMap) - self.cur.execute(sqlPMod+comment,varMap) - # increment the number of failed jobs in _dis - myDisList = [] - if job.jobStatus == 'failed' and job.prodSourceLabel in ['managed','test']: - for tmpFile in job.Files: - if tmpFile.type == 'input' and not tmpFile.dispatchDBlock in ['','NULL',None] \ - and not tmpFile.dispatchDBlock in myDisList: - varMap = {} - varMap[':name'] = tmpFile.dispatchDBlock - # check currentfiles - sqlGetCurFiles = """SELECT /*+ BEGIN_OUTLINE_DATA """ - sqlGetCurFiles += """INDEX_RS_ASC(@"SEL$1" "TAB"@"SEL$1" ("DATASETS"."NAME")) """ - sqlGetCurFiles += """OUTLINE_LEAF(@"SEL$1") ALL_ROWS """ - sqlGetCurFiles += """OPTIMIZER_FEATURES_ENABLE('10.2.0.4') """ - sqlGetCurFiles += """IGNORE_OPTIM_EMBEDDED_HINTS """ - sqlGetCurFiles += """END_OUTLINE_DATA */ """ - sqlGetCurFiles += "currentfiles,vuid FROM ATLAS_PANDA.Datasets tab WHERE name=:name" - self.cur.execute(sqlGetCurFiles+comment,varMap) - resCurFiles = self.cur.fetchone() - _logger.debug("archiveJob : %s %s" % (job.PandaID,str(resCurFiles))) - if resCurFiles != None: - # increment currentfiles only for the first failed job since that is enough - tmpCurrentFiles,tmpVUID = resCurFiles - _logger.debug("archiveJob : %s %s currentfiles=%s" % (job.PandaID,tmpFile.dispatchDBlock,tmpCurrentFiles)) - if tmpCurrentFiles == 0: - _logger.debug("archiveJob : %s %s update currentfiles" % 
(job.PandaID,tmpFile.dispatchDBlock)) - varMap = {} - varMap[':vuid'] = tmpVUID - sqlFailedInDis = 'UPDATE ATLAS_PANDA.Datasets ' - sqlFailedInDis += 'SET currentfiles=currentfiles+1 WHERE vuid=:vuid' - self.cur.execute(sqlFailedInDis+comment,varMap) - myDisList.append(tmpFile.dispatchDBlock) - # collect to record state change - updatedJobList.append(job) - # delete downstream jobs - ddmIDs = [] - newJob = None - ddmAttempt = 0 - if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed': - # look for outputs - upOutputs = [] - for file in job.Files: - if file.type == 'output': - upOutputs.append(file.lfn) - toBeClosedSubList = {} - topUserDsList = [] - # look for downstream jobs - sqlD = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND lfn=:lfn GROUP BY PandaID" - sqlDJS = "SELECT %s " % JobSpec.columnNames() - sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" - sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" - sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() - sqlDJI+= JobSpec.bindValuesExpression() - sqlDFup = "UPDATE ATLAS_PANDA.filesTable4 SET status=:status WHERE PandaID=:PandaID AND type IN (:type1,:type2)" - sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlGetSub = "SELECT DISTINCT destinationDBlock FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND PandaID=:PandaID" - sqlCloseSub = 'UPDATE /*+ INDEX_RS_ASC(TAB("DATASETS"."NAME")) */ ATLAS_PANDA.Datasets tab ' - sqlCloseSub += 'SET status=:status,modificationDate=CURRENT_DATE WHERE name=:name' - for upFile in upOutputs: - _logger.debug("look for downstream jobs for %s" % upFile) - # select PandaID - varMap = {} - varMap[':lfn'] = upFile - 
varMap[':type'] = 'input' - self.cur.arraysize = 100000 - self.cur.execute(sqlD+comment, varMap) - res = self.cur.fetchall() - for downID, in res: - _logger.debug("delete : %s" % downID) - # select jobs - varMap = {} - varMap[':PandaID'] = downID - self.cur.arraysize = 10 - self.cur.execute(sqlDJS+comment, varMap) - resJob = self.cur.fetchall() - if len(resJob) == 0: - continue - # instantiate JobSpec - dJob = JobSpec() - dJob.pack(resJob[0]) - # delete - varMap = {} - varMap[':PandaID'] = downID - self.cur.execute(sqlDJD+comment, varMap) - retD = self.cur.rowcount - if retD == 0: - continue - # error code - dJob.jobStatus = 'cancelled' - dJob.endTime = datetime.datetime.utcnow() - dJob.taskBufferErrorCode = ErrorCode.EC_Kill - dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed' - dJob.modificationTime = dJob.endTime - dJob.stateChangeTime = dJob.endTime - # insert - self.cur.execute(sqlDJI+comment, dJob.valuesMap()) - # update file status - varMap = {} - varMap[':PandaID'] = downID - varMap[':status'] = 'failed' - varMap[':type1'] = 'output' - varMap[':type2'] = 'log' - self.cur.execute(sqlDFup+comment, varMap) - # update files,metadata,parametes - varMap = {} - varMap[':PandaID'] = downID - varMap[':modificationTime'] = dJob.modificationTime - self.cur.execute(sqlFMod+comment,varMap) - self.cur.execute(sqlMMod+comment,varMap) - self.cur.execute(sqlPMod+comment,varMap) - # collect to record state change - updatedJobList.append(dJob) - # set tobeclosed to sub datasets - if not toBeClosedSubList.has_key(dJob.jobDefinitionID): - # init - toBeClosedSubList[dJob.jobDefinitionID] = [] - # get sub datasets - varMap = {} - varMap[':type'] = 'output' - varMap[':PandaID'] = downID - self.cur.arraysize = 1000 - self.cur.execute(sqlGetSub+comment, varMap) - resGetSub = self.cur.fetchall() - if len(resGetSub) == 0: - continue - # loop over all sub datasets - for tmpDestinationDBlock, in resGetSub: - if re.search('_sub\d+$',tmpDestinationDBlock) == None: - 
continue - if not tmpDestinationDBlock in toBeClosedSubList[dJob.jobDefinitionID]: - # set tobeclosed - varMap = {} - varMap[':status'] = 'tobeclosed' - varMap[':name'] = tmpDestinationDBlock - self.cur.execute(sqlCloseSub+comment, varMap) - _logger.debug("set tobeclosed for %s" % tmpDestinationDBlock) - # append - toBeClosedSubList[dJob.jobDefinitionID].append(tmpDestinationDBlock) - # close top-level user dataset - topUserDsName = re.sub('_sub\d+$','',tmpDestinationDBlock) - if topUserDsName != tmpDestinationDBlock and not topUserDsName in topUserDsList: - # set tobeclosed - varMap = {} - if dJob.processingType.startswith('gangarobot') or \ - dJob.processingType.startswith('hammercloud'): - varMap[':status'] = 'completed' - else: - varMap[':status'] = 'tobeclosed' - varMap[':name'] = topUserDsName - self.cur.execute(sqlCloseSub+comment, varMap) - _logger.debug("set %s for %s" % (varMap[':status'],topUserDsName)) - # append - topUserDsList.append(topUserDsName) - elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='dis': - # get corresponding jobs for production movers - vuid = '' - # extract vuid - match = re.search('--callBack (\S+)',job.jobParameters) - if match != None: - try: - callbackUrl = urllib.unquote(match.group(1)) - callbackUrl = re.sub('[&\?]',' ', callbackUrl) - # look for vuid= - for item in callbackUrl.split(): - if item.startswith('vuid='): - vuid = item.split('=')[-1] - break - except: - pass - if vuid == '': - _logger.error("cannot extract vuid from %s" % job.jobParameters) - else: - # get name - varMap = {} - varMap[':vuid'] = vuid - varMap[':type'] = 'dispatch' - self.cur.arraysize = 10 - self.cur.execute("SELECT name FROM ATLAS_PANDA.Datasets WHERE vuid=:vuid AND type=:type "+comment, varMap) - res = self.cur.fetchall() - if len(res) != 0: - disName = res[0][0] - # check lost files - varMap = {} - varMap[':status'] = 'lost' - varMap[':dispatchDBlock'] = disName - sqlLost = "SELECT /*+ index(tab 
FILESTABLE4_DISPDBLOCK_IDX) */ distinct PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE status=:status AND dispatchDBlock=:dispatchDBlock" - self.cur.execute(sqlLost+comment,varMap) - resLost = self.cur.fetchall() - # fail jobs with lost files - sqlDJS = "SELECT %s " % JobSpec.columnNames() - sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" - sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" - sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() - sqlDJI+= JobSpec.bindValuesExpression() - sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - lostJobIDs = [] - for tmpID, in resLost: - _logger.debug("fail due to lost files : %s" % tmpID) - varMap = {} - varMap[':PandaID'] = tmpID - self.cur.arraysize = 10 - self.cur.execute(sqlDJS+comment, varMap) - resJob = self.cur.fetchall() - if len(resJob) == 0: - continue - # instantiate JobSpec - dJob = JobSpec() - dJob.pack(resJob[0]) - # delete - varMap = {} - varMap[':PandaID'] = tmpID - self.cur.execute(sqlDJD+comment, varMap) - retD = self.cur.rowcount - if retD == 0: - continue - # error code - dJob.jobStatus = 'failed' - dJob.endTime = datetime.datetime.utcnow() - dJob.ddmErrorCode = 101 #ErrorCode.EC_LostFile - dJob.ddmErrorDiag = 'lost file in SE' - dJob.modificationTime = dJob.endTime - dJob.stateChangeTime = dJob.endTime - # insert - self.cur.execute(sqlDJI+comment, dJob.valuesMap()) - # update files,metadata,parametes - varMap = {} - varMap[':PandaID'] = tmpID - varMap[':modificationTime'] = dJob.modificationTime - self.cur.execute(sqlFMod+comment,varMap) - self.cur.execute(sqlMMod+comment,varMap) - self.cur.execute(sqlPMod+comment,varMap) - # append - lostJobIDs.append(tmpID) - # collect to 
record state change - updatedJobList.append(dJob) - # get PandaIDs - varMap = {} - varMap[':jobStatus'] = 'assigned' - varMap[':dispatchDBlock'] = disName - self.cur.execute("SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE dispatchDBlock=:dispatchDBlock AND jobStatus=:jobStatus "+comment, - varMap) - resDDM = self.cur.fetchall() - for tmpID, in resDDM: - if not tmpID in lostJobIDs: - ddmIDs.append(tmpID) - # get offset - ddmAttempt = job.attemptNr - _logger.debug("get PandaID for reassign : %s ddmAttempt=%s" % (str(ddmIDs),ddmAttempt)) - elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='ddm' and job.attemptNr<2 \ - and job.commandToPilot != 'tobekilled': - # instantiate new mover to retry subscription - newJob = JobSpec() - newJob.jobDefinitionID = job.jobDefinitionID - newJob.jobName = job.jobName - newJob.attemptNr = job.attemptNr + 1 - newJob.transformation = job.transformation - newJob.destinationDBlock = job.destinationDBlock - newJob.destinationSE = job.destinationSE - newJob.currentPriority = job.currentPriority - newJob.prodSourceLabel = job.prodSourceLabel - newJob.prodUserID = job.prodUserID - newJob.computingSite = job.computingSite - newJob.transferType = job.transferType - newJob.sourceSite = job.sourceSite - newJob.destinationSite = job.destinationSite - newJob.jobParameters = job.jobParameters - if job.Files != []: - file = job.Files[0] - fileOL = FileSpec() - # add attempt nr - fileOL.lfn = re.sub("\.\d+$","",file.lfn) - fileOL.lfn = "%s.%d" % (fileOL.lfn,job.attemptNr) - fileOL.destinationDBlock = file.destinationDBlock - fileOL.destinationSE = file.destinationSE - fileOL.dataset = file.dataset - fileOL.type = file.type - newJob.addFile(fileOL) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # record status change - try: - for tmpJob in updatedJobList: - self.recordStatusChange(tmpJob.PandaID,tmpJob.jobStatus,jobInfo=tmpJob) - except: - _logger.error('recordStatusChange in 
archiveJob') - return True,ddmIDs,ddmAttempt,newJob - except: - # roll back - self._rollback(True) - if iTry+1 < nTry: - _logger.debug("archiveJob : %s retry : %s" % (job.PandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("archiveJob : %s" % job.PandaID) - _logger.error("archiveJob : %s %s" % (type,value)) - return False,[],0,None - - - # overload of archiveJob - def archiveJobLite(self,pandaID,jobStatus,param): - comment = ' /* DBProxy.archiveJobLite */' - _logger.debug("archiveJobLite : %s" % pandaID) - sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames() - sql1+= "WHERE PandaID=:PandaID" - sql2 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID" - sql3 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() - sql3+= JobSpec.bindValuesExpression() - sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - nTry=3 - for iTry in range(nTry): - try: - # begin transaction - self.conn.begin() - # select - varMap = {} - varMap[':PandaID'] = pandaID - self.cur.arraysize = 10 - self.cur.execute(sql1+comment, varMap) - res = self.cur.fetchall() - if len(res) == 0: - _logger.error("archiveJobLite() : PandaID %d not found" % pandaID) - self._rollback() - return False - job = JobSpec() - job.pack(res[0]) - job.jobStatus = jobStatus - for key in param.keys(): - if param[key] != None: - setattr(job,key,param[key]) - job.modificationTime = datetime.datetime.utcnow() - job.endTime = job.modificationTime - job.stateChangeTime = job.modificationTime - # delete - self.cur.execute(sql2+comment, varMap) - n = self.cur.rowcount - if n==0: - # already killed - _logger.debug("archiveJobLite : Not found %s" % 
pandaID) - else: - # insert - self.cur.execute(sql3+comment, job.valuesMap()) - # update files - for file in job.Files: - sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" - varMap = file.valuesMap(onlyChanged=True) - if varMap != {}: - varMap[':row_ID'] = file.row_ID - _logger.debug(sqlF+comment+str(varMap)) - self.cur.execute(sqlF+comment, varMap) - # update files,metadata,parametes - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':modificationTime'] = job.modificationTime - self.cur.execute(sqlFMod+comment,varMap) - self.cur.execute(sqlMMod+comment,varMap) - self.cur.execute(sqlPMod+comment,varMap) - # delete downstream jobs - if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed': - # file select - sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = pandaID - self.cur.arraysize = 100000 - self.cur.execute(sqlFile+comment, varMap) - resFs = self.cur.fetchall() - for resF in resFs: - file = FileSpec() - file.pack(resF) - job.addFile(file) - # look for outputs - upOutputs = [] - for file in job.Files: - if file.type == 'output': - upOutputs.append(file.lfn) - # look for downstream jobs - sqlD = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND lfn=:lfn GROUP BY PandaID" - sqlDJS = "SELECT %s " % JobSpec.columnNames() - sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" - sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" - sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() - sqlDJI+= JobSpec.bindValuesExpression() - for upFile in upOutputs: - _logger.debug("look for downstream jobs for %s" % upFile) - # select PandaID - varMap = {} - varMap[':lfn'] = upFile - varMap[':type'] = 'input' - self.cur.arraysize = 100000 - self.cur.execute(sqlD+comment, varMap) - res = self.cur.fetchall() - for downID, in res: - 
_logger.debug("delete : %s" % downID) - # select jobs - varMap = {} - varMap[':PandaID'] = downID - self.cur.arraysize = 10 - self.cur.execute(sqlDJS+comment, varMap) - resJob = self.cur.fetchall() - if len(resJob) == 0: - continue - # instantiate JobSpec - dJob = JobSpec() - dJob.pack(resJob[0]) - # delete - varMap = {} - varMap[':PandaID'] = downID - self.cur.execute(sqlDJD+comment, varMap) - retD = self.cur.rowcount - if retD == 0: - continue - # error code - dJob.jobStatus = 'failed' - dJob.endTime = datetime.datetime.utcnow() - dJob.taskBufferErrorCode = ErrorCode.EC_Kill - dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed' - dJob.modificationTime = dJob.endTime - dJob.stateChangeTime = dJob.endTime - # insert - self.cur.execute(sqlDJI+comment, dJob.valuesMap()) - # update files,metadata,parametes - varMap = {} - varMap[':PandaID'] = downID - varMap[':modificationTime'] = dJob.modificationTime - self.cur.execute(sqlFMod+comment,varMap) - self.cur.execute(sqlMMod+comment,varMap) - self.cur.execute(sqlPMod+comment,varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("archiveJobLite : %s retry : %s" % (pandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("archiveJobLite : %s %s" % (type,value)) - return False - - - # finalize pending jobs - def finalizePendingJobs(self,prodUserName,jobDefinitionID): - comment = ' /* DBProxy.finalizePendingJobs */' - _logger.debug("finalizePendingJobs : %s %s" % (prodUserName,jobDefinitionID)) - sql0 = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 " - sql0+= "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql0+= "AND prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus " - sqlU = "UPDATE ATLAS_PANDA.jobsActive4 SET jobStatus=:newJobStatus " - sqlU+= "WHERE PandaID=:PandaID AND 
jobStatus=:jobStatus " - sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames() - sql1+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus " - sql2 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID AND jobStatus=:jobStatus " - sql3 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() - sql3+= JobSpec.bindValuesExpression() - sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - try: - # begin transaction - self.conn.begin() - self.cur.arraysize = 100000 - # select - varMap = {} - varMap[':jobStatus'] = 'failed' - varMap[':prodUserName'] = prodUserName - varMap[':jobDefinitionID'] = jobDefinitionID - varMap[':prodSourceLabel'] = 'user' - self.cur.execute(sql0+comment,varMap) - resPending = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # lock - pPandaIDs = [] - for pandaID, in resPending: - # begin transaction - self.conn.begin() - # update - varMap = {} - varMap[':jobStatus'] = 'failed' - varMap[':newJobStatus'] = 'holding' - varMap[':PandaID'] = pandaID - self.cur.execute(sqlU+comment,varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - retU = self.cur.rowcount - if retU != 0: - pPandaIDs.append(pandaID) - # loop over all PandaIDs - for pandaID in pPandaIDs: - # begin transaction - self.conn.begin() - # get job - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':jobStatus'] = 'holding' - self.cur.arraysize = 10 - self.cur.execute(sql1+comment,varMap) - res = self.cur.fetchall() - if len(res) == 0: - _logger.debug("finalizePendingJobs : PandaID %d not found" % pandaID) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - continue - job = JobSpec() - 
job.pack(res[0]) - job.jobStatus = 'failed' - job.modificationTime = datetime.datetime.utcnow() - # delete - self.cur.execute(sql2+comment,varMap) - n = self.cur.rowcount - if n==0: - # already killed - _logger.debug("finalizePendingJobs : Not found %s" % pandaID) - else: - # insert - self.cur.execute(sql3+comment,job.valuesMap()) - # update files,metadata,parametes - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':modificationTime'] = job.modificationTime - self.cur.execute(sqlFMod+comment,varMap) - self.cur.execute(sqlMMod+comment,varMap) - self.cur.execute(sqlPMod+comment,varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("finalizePendingJobs : %s %s done for %s" % (prodUserName,jobDefinitionID,len(pPandaIDs))) - return True - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("finalizePendingJobs : %s %s" % (errType,errValue)) - return False - - - # delete stalled jobs - def deleteStalledJobs(self,libFileName): - comment = ' /* DBProxy.deleteStalledJobs */' - _logger.debug("deleteStalledJobs : %s" % libFileName) - sql2 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() - sql2+= JobSpec.bindValuesExpression() - nTry=3 - try: - # begin transaction - self.conn.begin() - # look for downstream jobs - sqlD = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND lfn=:lfn GROUP BY PandaID" - sqlDJS = "SELECT %s " % JobSpec.columnNames() - sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" - sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" - sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() - sqlDJI+= JobSpec.bindValuesExpression() - sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlPMod = "UPDATE 
ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - _logger.debug("deleteStalledJobs : look for downstream jobs for %s" % libFileName) - # select PandaID - varMap = {} - varMap[':lfn'] = libFileName - varMap[':type'] = 'input' - self.cur.arraysize = 100000 - self.cur.execute(sqlD+comment, varMap) - res = self.cur.fetchall() - for downID, in res: - _logger.debug("deleteStalledJobs : delete %s" % downID) - # select jobs - varMap = {} - varMap[':PandaID'] = downID - self.cur.arraysize = 10 - self.cur.execute(sqlDJS+comment, varMap) - resJob = self.cur.fetchall() - if len(resJob) == 0: - continue - # instantiate JobSpec - dJob = JobSpec() - dJob.pack(resJob[0]) - # delete - varMap = {} - varMap[':PandaID'] = downID - self.cur.execute(sqlDJD+comment, varMap) - retD = self.cur.rowcount - if retD == 0: - continue - # error code - dJob.jobStatus = 'cancelled' - dJob.endTime = datetime.datetime.utcnow() - dJob.taskBufferErrorCode = ErrorCode.EC_Kill - dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed' - dJob.modificationTime = dJob.endTime - dJob.stateChangeTime = dJob.endTime - # insert - self.cur.execute(sqlDJI+comment, dJob.valuesMap()) - # update files,metadata,parametes - varMap = {} - varMap[':PandaID'] = downID - varMap[':modificationTime'] = dJob.modificationTime - self.cur.execute(sqlFMod+comment,varMap) - self.cur.execute(sqlMMod+comment,varMap) - self.cur.execute(sqlPMod+comment,varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback(True) - errtype,errvalue = sys.exc_info()[:2] - _logger.error("deleteStalledJobs : %s %s" % (errtype,errvalue)) - return False - - - # update Job status in jobsActive - def updateJobStatus(self,pandaID,jobStatus,param,updateStateChange=False,attemptNr=None): - comment = ' /* DBProxy.updateJobStatus */' - _logger.debug("updateJobStatus : PandaID=%s attemptNr=%s status=%s" % 
(pandaID,attemptNr,jobStatus)) - sql0 = "SELECT commandToPilot,endTime,specialHandling,jobStatus,computingSite,cloud,prodSourceLabel FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID " - varMap0 = {} - varMap0[':PandaID'] = pandaID - sql1 = "UPDATE ATLAS_PANDA.jobsActive4 SET jobStatus=:jobStatus,modificationTime=CURRENT_DATE" - if updateStateChange or jobStatus in ['starting']: - sql1 += ",stateChangeTime=CURRENT_DATE" - varMap = {} - varMap[':jobStatus'] = jobStatus - presetEndTime = False - for key in param.keys(): - if param[key] != None: - sql1 += ',%s=:%s' % (key,key) - varMap[':%s' % key] = param[key] - if key == 'endTime': - presetEndTime = True - try: - # store positive error code even for pilot retry - if key == 'pilotErrorCode' and param[key].startswith('-'): - varMap[':%s' % key] = param[key][1:] - except: - pass - sql1W = " WHERE PandaID=:PandaID " - varMap[':PandaID'] = pandaID - if attemptNr != None: - sql0 += "AND attemptNr=:attemptNr " - sql1W += "AND attemptNr=:attemptNr " - varMap[':attemptNr'] = attemptNr - varMap0[':attemptNr'] = attemptNr - # prevent change from holding to transferring which doesn't register files to sub/tid - if jobStatus == 'transferring': - sql1W += "AND NOT jobStatus=:ngStatus " - varMap[':ngStatus'] = 'holding' - updatedFlag = False - nTry=3 - for iTry in range(nTry): - try: - # begin transaction - self.conn.begin() - # select - self.cur.arraysize = 10 - self.cur.execute (sql0+comment,varMap0) - res = self.cur.fetchone() - if res != None: - ret = '' - commandToPilot,endTime,specialHandling,oldJobStatus,computingSite,cloud,prodSourceLabel = res - # debug mode - """ - if not specialHandling in [None,''] and 'debug' in specialHandling: - ret += 'debugon,' - else: - ret += 'debugoff,' - """ - # kill command - if not commandToPilot in [None,'']: - ret += '%s,' % commandToPilot - ret = ret[:-1] - # convert empty to NULL - if ret == '': - ret = 'NULL' - # don't update holding - if oldJobStatus == 'holding' and jobStatus == 
'holding': - _logger.debug("updateJobStatus : PandaID=%s skip to reset holding" % pandaID) - else: - # set endTime if undefined for holding - if jobStatus == 'holding' and endTime==None and not presetEndTime: - sql1 += ',endTime=CURRENT_DATE ' - # update - self.cur.execute (sql1+sql1W+comment,varMap) - nUp = self.cur.rowcount - _logger.debug("updateJobStatus : PandaID=%s attemptNr=%s nUp=%s" % (pandaID,attemptNr,nUp)) - if nUp == 1: - updatedFlag = True - if nUp == 0 and jobStatus == 'transferring': - _logger.debug("updateJobStatus : PandaID=%s ignore to update for transferring" % pandaID) - else: - _logger.debug("updateJobStatus : PandaID=%s attemptNr=%s notFound" % (pandaID,attemptNr)) - # already deleted or bad attempt number - ret = "badattemptnr" - #ret = 'tobekilled' - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # record status change - try: - if updatedFlag and oldJobStatus != None and oldJobStatus != jobStatus: - self.recordStatusChange(pandaID,jobStatus, - infoMap={'computingSite':computingSite, - 'cloud':cloud, - 'prodSourceLabel':prodSourceLabel}) - except: - _logger.error('recordStatusChange in updateJobStatus') - return ret - except: - # roll back - self._rollback(True) - if iTry+1 < nTry: - _logger.debug("updateJobStatus : %s retry : %s" % (pandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("updateJobStatus : %s %s" % (type,value)) - _logger.error("updateJobStatus : %s" % pandaID) - return False - - - # update job information in jobsActive or jobsDefined - def updateJob(self,job,inJobsDefined): - comment = ' /* DBProxy.updateJob */' - _logger.debug("updateJob : %s" % job.PandaID) - updatedFlag = False - nTry=3 - for iTry in range(nTry): - try: - job.modificationTime = datetime.datetime.utcnow() - # set stateChangeTime for defined->assigned - if inJobsDefined: - job.stateChangeTime = job.modificationTime - # make SQL - if inJobsDefined: - sql1 = "UPDATE 
ATLAS_PANDA.jobsDefined4 SET %s " % job.bindUpdateChangesExpression() - else: - sql1 = "UPDATE ATLAS_PANDA.jobsActive4 SET %s " % job.bindUpdateChangesExpression() - sql1+= "WHERE PandaID=:PandaID " - if inJobsDefined: - sql1+= " AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2) " - # begin transaction - self.conn.begin() - # update - varMap = job.valuesMap(onlyChanged=True) - varMap[':PandaID'] = job.PandaID - if inJobsDefined: - varMap[':oldJobStatus1'] = 'assigned' - varMap[':oldJobStatus2'] = 'defined' - _logger.debug(sql1+comment+str(varMap)) - self.cur.execute(sql1+comment, varMap) - n = self.cur.rowcount - if n==0: - # already killed or activated - _logger.debug("updateJob : Not found %s" % job.PandaID) - else: - for file in job.Files: - sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" - varMap = file.valuesMap(onlyChanged=True) - if varMap != {}: - varMap[':row_ID'] = file.row_ID - _logger.debug(sqlF+comment+str(varMap)) - self.cur.execute(sqlF+comment, varMap) - # update job parameters - sqlJobP = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':param'] = job.jobParameters - self.cur.execute(sqlJobP+comment, varMap) - updatedFlag = True - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # record status change - try: - if updatedFlag: - self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) - except: - _logger.error('recordStatusChange in updateJob') - return True - except: - # roll back - self._rollback(True) - if iTry+1 < nTry: - _logger.debug("updateJob : %s retry : %s" % (job.PandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("updateJob : %s %s" % (type,value)) - return False - - - # retry analysis job - def retryJob(self,pandaID,param,failedInActive=False,changeJobInMem=False,inMemJob=None, 
- getNewPandaID=False,attemptNr=None): - comment = ' /* DBProxy.retryJob */' - _logger.debug("retryJob : %s inActive=%s" % (pandaID,failedInActive)) - sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames() - sql1+= "WHERE PandaID=:PandaID " - if failedInActive: - sql1+= "AND jobStatus=:jobStatus " - updatedFlag = False - nTry=3 - for iTry in range(nTry): - try: - retValue = False - if not changeJobInMem: - # begin transaction - self.conn.begin() - # select - varMap = {} - varMap[':PandaID'] = pandaID - if failedInActive: - varMap[':jobStatus'] = 'failed' - self.cur.arraysize = 10 - self.cur.execute(sql1+comment, varMap) - res = self.cur.fetchall() - if len(res) == 0: - _logger.debug("retryJob() : PandaID %d not found" % pandaID) - self._rollback() - return retValue - job = JobSpec() - job.pack(res[0]) - else: - job = inMemJob - # don't use getNewPandaID for buildJob since the order of PandaIDs is broken - if getNewPandaID and job.prodSourceLabel in ['panda']: - if not changeJobInMem: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return retValue - # convert attemptNr to int - try: - attemptNr = int(attemptNr) - except: - _logger.debug("retryJob : %s attemptNr=%s non-integer" % (pandaID,attemptNr)) - attemptNr = -999 - # check attemptNr - if attemptNr != None: - if job.attemptNr != attemptNr: - _logger.debug("retryJob : %s bad attemptNr job.%s != pilot.%s" % (pandaID,job.attemptNr,attemptNr)) - if not changeJobInMem: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return retValue - # check if already retried - if job.taskBufferErrorCode in [ErrorCode.EC_Reassigned,ErrorCode.EC_Retried,ErrorCode.EC_PilotRetried]: - _logger.debug("retryJob : %s already retried %s" % (pandaID,job.taskBufferErrorCode)) - if not changeJobInMem: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return retValue - # check pilot retry - usePilotRetry = False - if 
job.prodSourceLabel in ['user','panda','ptest','rc_test'] and \ - param.has_key('pilotErrorCode') and param['pilotErrorCode'].startswith('-') and \ - job.maxAttempt > job.attemptNr and \ - (not job.processingType.startswith('gangarobot') or job.processingType=='gangarobot-rctest') and \ - not job.processingType.startswith('hammercloud'): - usePilotRetry = True - # check if it's analysis job # FIXME once pilot retry works correctly the conditions below will be cleaned up - if (((job.prodSourceLabel == 'user' or job.prodSourceLabel == 'panda') \ - and not job.processingType.startswith('gangarobot') \ - and not job.processingType.startswith('hammercloud') \ - and job.computingSite.startswith('ANALY_') and param.has_key('pilotErrorCode') \ - and param['pilotErrorCode'] in ['1200','1201','1213'] and (not job.computingSite.startswith('ANALY_LONG_')) \ - and job.attemptNr < 2) or (job.prodSourceLabel == 'ddm' and job.cloud == 'CA' and job.attemptNr <= 10) \ - or failedInActive or usePilotRetry) \ - and job.commandToPilot != 'tobekilled': - _logger.debug('reset PandaID:%s #%s' % (job.PandaID,job.attemptNr)) - if not changeJobInMem: - # job parameters - sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = job.PandaID - self.cur.execute(sqlJobP+comment, varMap) - for clobJobP, in self.cur: - job.jobParameters = clobJobP.read() - break - # reset job - job.jobStatus = 'activated' - job.startTime = None - job.modificationTime = datetime.datetime.utcnow() - job.attemptNr = job.attemptNr + 1 - if usePilotRetry: - job.currentPriority -= 10 - if failedInActive: - job.endTime = None - job.transExitCode = None - for attr in job._attributes: - if attr.endswith('ErrorCode') or attr.endswith('ErrorDiag'): - setattr(job,attr,None) - # remove flag regarding to pledge-resource handling - if not job.specialHandling in [None,'NULL','']: - newSpecialHandling = re.sub(',*localpool','',job.specialHandling) - if 
newSpecialHandling == '': - job.specialHandling = None - else: - job.specialHandling = newSpecialHandling - # send it to long queue for analysis jobs - oldComputingSite = job.computingSite - if not changeJobInMem: - if job.computingSite.startswith('ANALY'): - longSite = None - tmpLongSiteList = [] - tmpLongSite = re.sub('^ANALY_','ANALY_LONG_',job.computingSite) - tmpLongSite = re.sub('_\d+$','',tmpLongSite) - tmpLongSiteList.append(tmpLongSite) - tmpLongSite = job.computingSite + '_LONG' - tmpLongSiteList.append(tmpLongSite) - tmpLongSite = re.sub('SHORT','LONG',job.computingSite) - if tmpLongSite != job.computingSite: - tmpLongSiteList.append(tmpLongSite) - # loop over all possible long sitenames - for tmpLongSite in tmpLongSiteList: - varMap = {} - varMap[':siteID'] = tmpLongSite - varMap[':status'] = 'online' - sqlSite = "SELECT COUNT(*) FROM ATLAS_PANDAMETA.schedconfig WHERE siteID=:siteID AND status=:status" - self.cur.execute(sqlSite+comment, varMap) - resSite = self.cur.fetchone() - if resSite != None and resSite[0] > 0: - longSite = tmpLongSite - break - # use long site if exists - if longSite != None: - _logger.debug('sending PandaID:%s to %s' % (job.PandaID,longSite)) - job.computingSite = longSite - # set destinationSE if queue is changed - if oldComputingSite == job.destinationSE: - job.destinationSE = job.computingSite - if not changeJobInMem: - # select files - varMap = {} - varMap[':PandaID'] = job.PandaID - if not getNewPandaID: - varMap[':type1'] = 'log' - varMap[':type2'] = 'output' - sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() - if not getNewPandaID: - sqlFile+= "WHERE PandaID=:PandaID AND (type=:type1 OR type=:type2)" - else: - sqlFile+= "WHERE PandaID=:PandaID" - self.cur.arraysize = 100 - self.cur.execute(sqlFile+comment, varMap) - resFs = self.cur.fetchall() - else: - # get log or output files only - resFs = [] - for tmpFile in job.Files: - if tmpFile.type in ['log','output']: - resFs.append(tmpFile) - # loop 
over all files - for resF in resFs: - if not changeJobInMem: - # set PandaID - file = FileSpec() - file.pack(resF) - job.addFile(file) - else: - file = resF - # set new GUID - if file.type == 'log': - file.GUID = commands.getoutput('uuidgen') - # don't change input and lib.tgz - if file.type == 'input' or (file.type == 'output' and job.prodSourceLabel == 'panda') or \ - (file.type == 'output' and file.lfn.endswith('.lib.tgz') and job.prodSourceLabel in ['rc_test','ptest']): - continue - # append attemptNr to LFN - oldName = file.lfn - file.lfn = re.sub('\.\d+$','',file.lfn) - file.lfn = '%s.%s' % (file.lfn,job.attemptNr) - newName = file.lfn - # set destinationSE - if oldComputingSite == file.destinationSE: - file.destinationSE = job.computingSite - # modify jobParameters - sepPatt = "(\'|\"|%20)" + oldName + "(\'|\"|%20)" - matches = re.findall(sepPatt,job.jobParameters) - for match in matches: - oldPatt = match[0]+oldName+match[-1] - newPatt = match[0]+newName+match[-1] - job.jobParameters = re.sub(oldPatt,newPatt,job.jobParameters) - if not changeJobInMem and not getNewPandaID: - # reset file status - if file.type in ['output','log']: - file.status = 'unknown' - # update files - sqlFup = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" - varMap = file.valuesMap(onlyChanged=True) - if varMap != {}: - varMap[':row_ID'] = file.row_ID - self.cur.execute(sqlFup+comment, varMap) - if not changeJobInMem: - # reuse original PandaID - if not getNewPandaID: - # update job - sql2 = "UPDATE ATLAS_PANDA.jobsActive4 SET %s " % job.bindUpdateChangesExpression() - sql2+= "WHERE PandaID=:PandaID " - varMap = job.valuesMap(onlyChanged=True) - varMap[':PandaID'] = job.PandaID - self.cur.execute(sql2+comment, varMap) - # update job parameters - sqlJobP = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':param'] = job.jobParameters - 
self.cur.execute(sqlJobP+comment, varMap) - updatedFlag = True - else: - # read metadata - sqlMeta = "SELECT metaData FROM ATLAS_PANDA.metaTable WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = job.PandaID - self.cur.execute(sqlMeta+comment, varMap) - for clobJobP, in self.cur: - job.metadata = clobJobP.read() - break - # insert job with new PandaID - sql1 = "INSERT INTO ATLAS_PANDA.jobsActive4 (%s) " % JobSpec.columnNames() - sql1+= JobSpec.bindValuesExpression(useSeq=True) - sql1+= " RETURNING PandaID INTO :newPandaID" - # set parentID - job.parentID = job.PandaID - varMap = job.valuesMap(useSeq=True) - varMap[':newPandaID'] = self.cur.var(cx_Oracle.NUMBER) - # insert - retI = self.cur.execute(sql1+comment, varMap) - # set PandaID - job.PandaID = long(varMap[':newPandaID'].getvalue()) - _logger.debug('Generate new PandaID %s -> %s #%s' % (job.parentID,job.PandaID,job.attemptNr)) - # insert files - sqlFile = "INSERT INTO ATLAS_PANDA.filesTable4 (%s) " % FileSpec.columnNames() - sqlFile+= FileSpec.bindValuesExpression(useSeq=True) - sqlFile+= " RETURNING row_ID INTO :newRowID" - for file in job.Files: - # reset rowID - file.row_ID = None - # insert - varMap = file.valuesMap(useSeq=True) - varMap[':newRowID'] = self.cur.var(cx_Oracle.NUMBER) - self.cur.execute(sqlFile+comment, varMap) - # update mod time for files - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':modificationTime'] = job.modificationTime - sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - self.cur.execute(sqlFMod+comment,varMap) - # metadata - sqlMeta = "INSERT INTO ATLAS_PANDA.metaTable (PandaID,metaData,modificationTime) VALUES (:PandaID,:metaData,:modTime)" - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':metaData'] = job.metadata - varMap[':modTime'] = job.modificationTime - self.cur.execute(sqlMeta+comment, varMap) - # job parameters - sqlJob = "INSERT INTO ATLAS_PANDA.jobParamsTable 
(PandaID,jobParameters,modificationTime) VALUES (:PandaID,:param,:modTime)" - varMap = {} - varMap[':PandaID'] = job.PandaID - varMap[':param'] = job.jobParameters - varMap[':modTime'] = job.modificationTime - self.cur.execute(sqlJob+comment, varMap) - # set error code to original job to avoid being retried by another process - sqlE = "UPDATE ATLAS_PANDA.jobsActive4 SET taskBufferErrorCode=:errCode,taskBufferErrorDiag=:errDiag WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = job.parentID - varMap[':errCode'] = ErrorCode.EC_PilotRetried - varMap[':errDiag'] = 'retrying at the same site. new PandaID=%s' % job.PandaID - self.cur.execute(sqlE+comment, varMap) - # set return - if not getNewPandaID: - retValue = True - if not changeJobInMem: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # record status change - try: - if updatedFlag: - self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) - except: - _logger.error('recordStatusChange in retryJob') - return retValue - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("retryJob : %s retry : %s" % (pandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - # error report - type, value, traceBack = sys.exc_info() - _logger.error("retryJob : %s %s" % (type,value)) - return False - - - # retry failed analysis jobs in Active4 - def retryJobsInActive(self,prodUserName,jobDefinitionID): - comment = ' /* DBProxy.retryJobsInActive */' - _logger.debug("retryJobsInActive : start - %s %s" % (prodUserName,jobDefinitionID)) - try: - # begin transaction - self.conn.begin() - # count the number of jobs in Defined - sqlC = "SELECT COUNT(*) FROM ATLAS_PANDA.jobsDefined4 " - sqlC += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sqlC += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " - varMap = {} - varMap[':prodUserName'] = prodUserName - varMap[':jobDefinitionID'] = jobDefinitionID - varMap[':prodSourceLabel1'] = 
'user' - varMap[':prodSourceLabel2'] = 'panda' - self.cur.arraysize = 10 - self.cur.execute(sqlC+comment,varMap) - res = self.cur.fetchone() - # failed to get the number of jobs in Defined - if res == None: - _logger.error("retryJobsInActive : %s %s - failed to get num of jobs in Def" % (prodUserName,jobDefinitionID)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return None for DB error - return None - nJobsInDef = res[0] - # get failed PandaIDs in Active - sql0 = "SELECT PandaID,jobStatus,taskBufferErrorCode,attemptNr FROM ATLAS_PANDA.jobsActive4 " - sql0+= "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql0+= "AND prodSourceLabel=:prodSourceLabel " - varMap = {} - varMap[':prodUserName'] = prodUserName - varMap[':jobDefinitionID'] = jobDefinitionID - varMap[':prodSourceLabel'] = 'user' - self.cur.execute(sql0+comment,varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # the number of jobs in Active - nJobsInAct = len(res) - # loop over all PandaID - failedPandaIDs = [] - for pandaID,tmpJobStatus,tmpTaskBufferErrorCode,tmpAttemptNr in res: - if tmpJobStatus == 'failed' and not tmpTaskBufferErrorCode in \ - [ErrorCode.EC_Reassigned,ErrorCode.EC_Retried,ErrorCode.EC_PilotRetried]: - failedPandaIDs.append((pandaID,tmpAttemptNr)) - _logger.debug("retryJobsInActive : %s %s - %s failed jobs" % (prodUserName,jobDefinitionID,len(failedPandaIDs))) - # there are some failed jobs in Active - if failedPandaIDs != []: - # get list of sub datasets to lock Closer - sqlF = "SELECT DISTINCT destinationDBlock FROM ATLAS_PANDA.filesTable4 " - sqlF += "WHERE PandaID=:PandaID AND type IN (:type1,:type2) " - varMap = {} - varMap[':PandaID'] = failedPandaIDs[0][0] - varMap[':type1'] = 'log' - varMap[':type2'] = 'output' - # begin transaction - self.conn.begin() - self.cur.arraysize = 100000 - self.cur.execute(sqlF+comment,varMap) - res = self.cur.fetchall() - # commit - 
if not self._commit(): - raise RuntimeError, 'Commit error' - subDsList = [] - for tmpDSname, in res: - tmpDS = self.queryDatasetWithMap({'name':tmpDSname}) - if tmpDS == None: - _logger.error("retryJobsInActive : %s %s - failed to get DS=%s" % (prodUserName,jobDefinitionID,tmpDSname)) - # return None for DB error - return None - # append - subDsList.append(tmpDS) - # lock datasets - lockedDS = True - ngStatus = ['closed','tobeclosed','completed','tobemerged','merging','cleanup'] - sqlD = "UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE " - sqlD+= "WHERE vuid=:vuid AND NOT status IN (" - for tmpIdx,tmpNgStat in enumerate(ngStatus): - sqlD += ':ngSt%s,' % tmpIdx - sqlD = sqlD[:-1] - sqlD += ") " - self.conn.begin() - self.cur.arraysize = 10 - for tmpDS in subDsList: - varMap = {} - varMap[':status'] = 'locked' - varMap[':vuid'] = tmpDS.vuid - for tmpIdx,tmpNgStat in enumerate(ngStatus): - tmpKey = ':ngSt%s' % tmpIdx - varMap[tmpKey] = tmpNgStat - # update - self.cur.execute(sqlD+comment,varMap) - retD = self.cur.rowcount - # datasets already closed - if retD == 0: - # roll back - self._rollback() - # failed to lock datasets - _logger.debug("retryJobsInActive : %s %s - %s is closed" % (prodUserName,jobDefinitionID,tmpDS.name)) - lockedDS = False - break - # retry jobs - if lockedDS: - # commit for dataset lock - if not self._commit(): - raise RuntimeError, 'Commit error' - # loop over all PandaIDs - for pandaID,tmpAttemptNr in failedPandaIDs: - retryRet = self.retryJob(pandaID,{},failedInActive=True,attemptNr=tmpAttemptNr) - _logger.debug("retryJobsInActive : %s %s - PandaID=%s %s" % (prodUserName,jobDefinitionID,pandaID,retryRet)) - # unlock datasets - sqlDU = "UPDATE ATLAS_PANDA.Datasets SET status=:nStatus,modificationdate=CURRENT_DATE " - sqlDU+= "WHERE vuid=:vuid AND status=:oStatus" - self.conn.begin() - self.cur.arraysize = 10 - for tmpDS in subDsList: - varMap = {} - varMap[':oStatus'] = 'locked' - varMap[':nStatus'] = 
tmpDS.status - varMap[':vuid'] = tmpDS.vuid - # update - self.cur.execute(sqlDU+comment,varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return True when job is active - retVal = False - if nJobsInAct > 0 or nJobsInDef > 0: - retVal = True - _logger.debug("retryJobsInActive : end %s - %s %s" % (retVal,prodUserName,jobDefinitionID)) - return retVal - except: - # roll back - self._rollback() - # error report - errType,errValue = sys.exc_info()[:2] - _logger.error("retryJobsInActive : %s %s" % (errType,errValue)) - return None - - - # get jobs - def getJobs(self,nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, - atlasRelease,prodUserID,countryGroup,workingGroup,allowOtherCountry): - comment = ' /* DBProxy.getJobs */' - # use memcache - useMemcache = False - try: - if panda_config.memcached_enable and siteName in ['MWT2_UC','ANALY_MWT2','BNL_ATLAS_test','ANALY_BNL_test', - 'ANALY_GLASGOW']: # FIXME - # initialize memcache - if self.memcache == None: - from MemProxy import MemProxy - self.memcache = MemProxy() - if not self.memcache in [None,False]: - useMemcache = True - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("failed to initialize memcached with %s %s" % (errType,errValue)) - # aggregated sites which use different appdirs - aggSiteMap = {'CERN-PROD':{'CERN-RELEASE':'release', - 'CERN-UNVALID':'unvalid', - 'CERN-BUILDS' :'builds', - }, - } - # construct where clause - dynamicBrokering = False - getValMap = {} - getValMap[':oldJobStatus'] = 'activated' - getValMap[':computingSite'] = siteName - if not aggSiteMap.has_key(siteName): - sql1 = "WHERE jobStatus=:oldJobStatus AND computingSite=:computingSite AND commandToPilot IS NULL " - else: - # aggregated sites - sql1 = "WHERE jobStatus=:oldJobStatus AND computingSite IN (:computingSite," - for tmpAggIdx,tmpAggSite in enumerate(aggSiteMap[siteName].keys()): - tmpKeyName = ':computingSite%s' % tmpAggIdx - sql1 += '%s,' % tmpKeyName - 
getValMap[tmpKeyName] = tmpAggSite - sql1 = sql1[:-1] - sql1 += ") AND commandToPilot IS NULL " - if not mem in [0,'0']: - sql1+= "AND (minRamCount<=:minRamCount OR minRamCount=0) " - getValMap[':minRamCount'] = mem - if not diskSpace in [0,'0']: - sql1+= "AND (maxDiskCount<=:maxDiskCount OR maxDiskCount=0) " - getValMap[':maxDiskCount'] = diskSpace - if prodSourceLabel == 'user': - sql1+= "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2,:prodSourceLabel3) " - getValMap[':prodSourceLabel1'] = 'user' - getValMap[':prodSourceLabel2'] = 'panda' - getValMap[':prodSourceLabel3'] = 'install' - elif prodSourceLabel == 'ddm': - dynamicBrokering = True - sql1+= "AND prodSourceLabel=:prodSourceLabel " - getValMap[':prodSourceLabel'] = 'ddm' - elif prodSourceLabel in [None,'managed']: - sql1+= "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2,:prodSourceLabel3,:prodSourceLabel4) " - getValMap[':prodSourceLabel1'] = 'managed' - getValMap[':prodSourceLabel2'] = 'test' - getValMap[':prodSourceLabel3'] = 'prod_test' - getValMap[':prodSourceLabel4'] = 'install' - elif prodSourceLabel == 'software': - sql1+= "AND prodSourceLabel=:prodSourceLabel " - getValMap[':prodSourceLabel'] = 'software' - elif prodSourceLabel == 'test' and computingElement != None: - dynamicBrokering = True - sql1+= "AND (processingType IN (:processingType1,:processingType2,:processingType3) " - sql1+= "OR prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2,:prodSourceLabel3)) " - getValMap[':processingType1'] = 'gangarobot' - getValMap[':processingType2'] = 'analy_test' - getValMap[':processingType3'] = 'prod_test' - getValMap[':prodSourceLabel1'] = 'test' - getValMap[':prodSourceLabel2'] = 'prod_test' - getValMap[':prodSourceLabel3'] = 'install' - else: - sql1+= "AND prodSourceLabel=:prodSourceLabel " - getValMap[':prodSourceLabel'] = prodSourceLabel - # user ID - if prodUserID != None: - # get compact DN - compactDN = self.cleanUserID(prodUserID) - if compactDN in 
['','NULL',None]: - compactDN = prodUserID - sql1+= "AND prodUserName=:prodUserName " - getValMap[':prodUserName'] = compactDN - # country group - specialHandled = False - if prodSourceLabel == 'user': - # update pledge resource ratio - self.getPledgeResourceRatio() - # other country is allowed to use the pilot - if allowOtherCountry=='True' and self.beyondPledgeRatio.has_key(siteName) and self.beyondPledgeRatio[siteName] > 0: - # check if countryGroup needs to be used for beyond-pledge - if self.checkCountryGroupForBeyondPledge(siteName): - countryGroup = self.beyondPledgeRatio[siteName]['countryGroup'] - specialHandled = True - else: - countryGroup = '' - # countryGroup - if not countryGroup in ['',None]: - sql1+= "AND countryGroup IN (" - idxCountry = 1 - for tmpCountry in countryGroup.split(','): - tmpKey = ":countryGroup%s" % idxCountry - sql1+= "%s," % tmpKey - getValMap[tmpKey] = tmpCountry - idxCountry += 1 - sql1 = sql1[:-1] - sql1+= ") " - # workingGroup - if not workingGroup in ['',None]: - sql1+= "AND workingGroup IN (" - idxWorking = 1 - for tmpWorking in workingGroup.split(','): - tmpKey = ":workingGroup%s" % idxWorking - sql1+= "%s," % tmpKey - getValMap[tmpKey] = tmpWorking - idxWorking += 1 - sql1 = sql1[:-1] - sql1+= ") " - # production share - if prodSourceLabel in ['managed',None,'sharetest']: - aggSitesForFairshare = [] - if aggSiteMap.has_key(siteName): - aggSitesForFairshare = aggSiteMap[siteName].keys() - shareSQL,shareVarMap = self.getCriteriaForProdShare(siteName,aggSitesForFairshare) - if shareVarMap != {}: - sql1 += shareSQL - for tmpShareKey in shareVarMap.keys(): - getValMap[tmpShareKey] = shareVarMap[tmpShareKey] - sql2 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames() - sql2+= "WHERE PandaID=:PandaID" - retJobs = [] - nSent = 0 - try: - timeLimit = datetime.timedelta(seconds=timeout-10) - timeStart = datetime.datetime.utcnow() - strName = datetime.datetime.isoformat(timeStart) - attLimit = datetime.datetime.utcnow() 
- datetime.timedelta(minutes=15) - attSQL = "AND ((creationTime<:creationTime AND attemptNr>1) OR attemptNr<=1) " - # get nJobs - for iJob in range(nJobs): - pandaID = 0 - fileMapForMem = {} - # select channel for ddm jobs - if prodSourceLabel == 'ddm': - sqlDDM = "SELECT count(*),jobStatus,sourceSite,destinationSite,transferType FROM ATLAS_PANDA.jobsActive4 WHERE computingSite=:computingSite AND prodSourceLabel=:prodSourceLabel " \ - + attSQL + "GROUP BY jobStatus,sourceSite,destinationSite,transferType" - ddmValMap = {} - ddmValMap[':computingSite'] = siteName - ddmValMap[':creationTime'] = attLimit - ddmValMap[':prodSourceLabel'] = 'ddm' - _logger.debug(sqlDDM+comment+str(ddmValMap)) - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 100 - self.cur.execute(sqlDDM+comment, ddmValMap) - resDDM = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # make a channel map - channelMap = {} - for tmp_count,tmp_jobStatus,tmp_sourceSite,tmp_destinationSite,tmp_transferType in resDDM: - # use source,dest,type as the key - channel = (tmp_sourceSite,tmp_destinationSite,tmp_transferType) - if not channelMap.has_key(channel): - channelMap[channel] = {} - # ignore holding - if tmp_jobStatus == 'holding': - continue - # distinguish activate from other stats - if tmp_jobStatus != 'activated': - tmp_jobStatus = 'others' - # append - if not channelMap[channel].has_key(tmp_jobStatus): - channelMap[channel][tmp_jobStatus] = int(tmp_count) - else: - channelMap[channel][tmp_jobStatus] += int(tmp_count) - _logger.debug(channelMap) - # choose channel - channels = channelMap.keys() - random.shuffle(channels) - foundChannel = False - for channel in channels: - # no activated jobs - if (not channelMap[channel].has_key('activated')) or channelMap[channel]['activated'] == 0: - continue - maxRunning = 15 - # prestaging job - if channel[0] == channel[1] and channel[2] == 'dis': - maxRunning = 50 - if (not 
channelMap[channel].has_key('others')) or channelMap[channel]['others'] < maxRunning: - # set SQL - sql1+= "AND sourceSite=:sourceSite AND destinationSite=:destinationSite AND transferType=:transferType " - getValMap[':sourceSite'] = channel[0] - getValMap[':destinationSite'] = channel[1] - getValMap[':transferType'] = channel[2] - foundChannel = True - break - # no proper channel - if not foundChannel: - _logger.debug("getJobs : no DDM jobs for Site %s" % siteName) - break - # get job - if prodSourceLabel in ['ddm']: - # to add some delay for attempts - sql1 += attSQL - getValMap[':creationTime'] = attLimit - nTry=1 - for iTry in range(nTry): - # set siteID - tmpSiteID = siteName - if siteName.startswith('ANALY_BNL_ATLAS'): - tmpSiteID = 'ANALY_BNL_ATLAS_1' - # get file lock - _logger.debug("getJobs : %s -> lock" % strName) - if (datetime.datetime.utcnow() - timeStart) < timeLimit: - toGetPandaIDs = True - pandaIDs = [] - specialHandlingMap = {} - # get max priority for analysis jobs - if prodSourceLabel in ['panda','user']: - sqlMX = "SELECT /*+ INDEX_RS_ASC(tab (PRODSOURCELABEL COMPUTINGSITE JOBSTATUS) ) */ MAX(currentPriority) FROM ATLAS_PANDA.jobsActive4 tab " - sqlMX+= sql1 - _logger.debug(sqlMX+comment+str(getValMap)) - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10 - self.cur.execute(sqlMX+comment, getValMap) - tmpPriority, = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # no jobs - if tmpPriority == None: - toGetPandaIDs = False - else: - # set priority - sql1 += "AND currentPriority=:currentPriority" - getValMap[':currentPriority'] = tmpPriority - maxAttemptIDx = 10 - if toGetPandaIDs: - # get PandaIDs - sqlP = "SELECT /*+ INDEX_RS_ASC(tab (PRODSOURCELABEL COMPUTINGSITE JOBSTATUS) ) */ PandaID,currentPriority,specialHandling FROM ATLAS_PANDA.jobsActive4 tab " - sqlP+= sql1 - _logger.debug(sqlP+comment+str(getValMap)) - # start transaction - self.conn.begin() - # select - 
self.cur.arraysize = 100000 - self.cur.execute(sqlP+comment, getValMap) - resIDs = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - maxCurrentPriority = None - # get max priority and min PandaID - for tmpPandaID,tmpCurrentPriority,tmpSpecialHandling in resIDs: - if maxCurrentPriority==None or maxCurrentPriority < tmpCurrentPriority: - maxCurrentPriority = tmpCurrentPriority - pandaIDs = [tmpPandaID] - elif maxCurrentPriority == tmpCurrentPriority: - pandaIDs.append(tmpPandaID) - specialHandlingMap[tmpPandaID] = tmpSpecialHandling - # sort - pandaIDs.sort() - if pandaIDs == []: - _logger.debug("getJobs : %s -> no PandaIDs" % strName) - retU = 0 - else: - # check the number of available files - if useMemcache: - _logger.debug("getJobs : %s -> memcache check start" % strName) - # truncate - pandaIDs = pandaIDs[:maxAttemptIDx] - # get input files - availableFileMap = {} - self.cur.arraysize = 100000 - sqlMemFile = "SELECT lfn FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type" - for tmpPandaID in pandaIDs: - varMap = {} - varMap[':type'] = 'input' - varMap[':PandaID'] = tmpPandaID - # start transaction - self.conn.begin() - # select - self.cur.execute(sqlMemFile+comment,varMap) - resFiles = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # get list - fileMapForMem[tmpPandaID] = [] - for tmpItem, in resFiles: - fileMapForMem[tmpPandaID].append(tmpItem) - # get number of available files - nAvailable = self.memcache.checkFiles(tmpPandaID,fileMapForMem[tmpPandaID], - siteName,node) - # append - if not nAvailable in availableFileMap: - availableFileMap[nAvailable] = [] - availableFileMap[nAvailable].append(tmpPandaID) - # sort by the number of available files - tmpAvaKeys = availableFileMap.keys() - tmpAvaKeys.sort() - tmpAvaKeys.reverse() - pandaIDs = [] - for tmpAvaKey in tmpAvaKeys: - pandaIDs += availableFileMap[tmpAvaKey] - _logger.debug("getJobs : %s -> 
memcache check done" % strName) - # update - for indexID,tmpPandaID in enumerate(pandaIDs): - # max attempts - if indexID > maxAttemptIDx: - break - # update - sqlJ = "UPDATE ATLAS_PANDA.jobsActive4 " - sqlJ+= "SET jobStatus=:newJobStatus,modificationTime=CURRENT_DATE,modificationHost=:modificationHost,startTime=CURRENT_DATE" - varMap = {} - varMap[':PandaID'] = tmpPandaID - varMap[':newJobStatus'] = 'sent' - varMap[':oldJobStatus'] = 'activated' - varMap[':modificationHost'] = node - # set CE - if computingElement != None: - sqlJ+= ",computingElement=:computingElement" - varMap[':computingElement'] = computingElement - # set special handlng - if specialHandled: - sqlJ+= ",specialHandling=:specialHandling" - spString = 'localpool' - if specialHandlingMap.has_key(tmpPandaID) and isinstance(specialHandlingMap[tmpPandaID],types.StringType): - if not spString in specialHandlingMap[tmpPandaID]: - varMap[':specialHandling'] = specialHandlingMap[tmpPandaID]+','+spString - else: - varMap[':specialHandling'] = specialHandlingMap[tmpPandaID] - else: - varMap[':specialHandling'] = spString - sqlJ+= " WHERE PandaID=:PandaID AND jobStatus=:oldJobStatus" - # SQL to get nSent - sentLimit = timeStart - datetime.timedelta(seconds=60) - sqlSent = "SELECT count(*) FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus " - sqlSent += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " - sqlSent += "AND computingSite=:computingSite " - sqlSent += "AND modificationTime>:modificationTime " - varMapSent = {} - varMapSent[':jobStatus'] = 'sent' - varMapSent[':computingSite'] = tmpSiteID - varMapSent[':modificationTime'] = sentLimit - varMapSent[':prodSourceLabel1'] = 'managed' - varMapSent[':prodSourceLabel2'] = 'test' - # start - _logger.debug(sqlJ+comment+str(varMap)) - # start transaction - self.conn.begin() - # update - self.cur.execute(sqlJ+comment, varMap) - retU = self.cur.rowcount - if retU != 0: - # get nSent for production jobs - if prodSourceLabel in 
[None,'managed']: - _logger.debug(sqlSent+comment+str(varMapSent)) - self.cur.execute(sqlSent+comment, varMapSent) - resSent = self.cur.fetchone() - if resSent != None: - nSent, = resSent - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # succeeded - if retU != 0: - pandaID = tmpPandaID - break - else: - _logger.debug("getJobs : %s -> do nothing" % strName) - retU = 0 - # release file lock - _logger.debug("getJobs : %s -> unlock" % strName) - # succeeded - if retU != 0: - break - if iTry+1 < nTry: - #time.sleep(0.5) - pass - # failed to UPDATE - if retU == 0: - # reset pandaID - pandaID = 0 - _logger.debug("getJobs : Site %s : retU %s : PandaID %s - %s" - % (siteName,retU,pandaID,prodSourceLabel)) - if pandaID == 0: - break - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[':PandaID'] = pandaID - self.cur.arraysize = 10 - self.cur.execute(sql2+comment, varMap) - res = self.cur.fetchone() - if len(res) == 0: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - break - # instantiate Job - job = JobSpec() - job.pack(res) - # Files - sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=:PandaID" - self.cur.arraysize = 10000 - self.cur.execute(sqlFile+comment, varMap) - resFs = self.cur.fetchall() - # job parameters - sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = job.PandaID - self.cur.execute(sqlJobP+comment, varMap) - for clobJobP, in self.cur: - job.jobParameters = clobJobP.read() - break - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - for resF in resFs: - file = FileSpec() - file.pack(resF) - job.addFile(file) - # overwrite processingType for appdir at aggrigates sites - if aggSiteMap.has_key(siteName): - if aggSiteMap[siteName].has_key(job.computingSite): - job.processingType = aggSiteMap[siteName][job.computingSite] - job.computingSite = 
job.computingSite - # append - retJobs.append(job) - # record status change - try: - self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) - except: - _logger.error('recordStatusChange in getJobs') - return retJobs,nSent - except: - # roll back - self._rollback() - # error report - type, value, traceBack = sys.exc_info() - _logger.error("getJobs : %s %s" % (type,value)) - return [],0 - - - # reset job in jobsActive or jobsWaiting - def resetJob(self,pandaID,activeTable=True,keepSite=False,getOldSubs=False,forPending=True): - comment = ' /* DBProxy.resetJob */' - _logger.debug("resetJobs : %s" % pandaID) - # select table - table = 'ATLAS_PANDA.jobsWaiting4' - if activeTable: - table = 'ATLAS_PANDA.jobsActive4' - sql1 = "SELECT %s FROM %s " % (JobSpec.columnNames(),table) - sql1+= "WHERE PandaID=:PandaID" - sql2 = "DELETE FROM %s " % table - sql2+= "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)" - sql3 = "INSERT INTO ATLAS_PANDA.jobsDefined4 (%s) " % JobSpec.columnNames() - sql3+= JobSpec.bindValuesExpression() - try: - # transaction causes Request ndbd time-out in ATLAS_PANDA.jobsActive4 - self.conn.begin() - # select - varMap = {} - varMap[':PandaID'] = pandaID - self.cur.arraysize = 10 - self.cur.execute(sql1+comment,varMap) - res = self.cur.fetchone() - # not found - if res == None: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return None - # instantiate Job - job = JobSpec() - job.pack(res) - # if already running - if job.jobStatus != 'waiting' and job.jobStatus != 'activated' \ - and (forPending and job.jobStatus != 'pending'): - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return None - # do nothing for analysis jobs - if job.prodSourceLabel in ['user','panda']: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return None - # delete - varMap = {} - varMap[':PandaID'] = pandaID - if not forPending: - 
varMap[':oldJobStatus1'] = 'waiting' - else: - varMap[':oldJobStatus1'] = 'pending' - varMap[':oldJobStatus2'] = 'activated' - self.cur.execute(sql2+comment,varMap) - retD = self.cur.rowcount - # delete failed - _logger.debug("resetJobs : retD = %s" % retD) - if retD != 1: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return None - # delete from jobsDefined4 just in case - varMap = {} - varMap[':PandaID'] = pandaID - sqlD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" - self.cur.execute(sqlD+comment,varMap) - # increase priority - if job.jobStatus == 'activated' and job.currentPriority < 100: - job.currentPriority = 100 - # reset computing site and dispatchDBlocks - job.jobStatus = 'defined' - job.dispatchDBlock = None - # erase old assignment - if (not keepSite) and job.relocationFlag != 1: - job.computingSite = None - job.computingElement = None - # host and time information - job.modificationHost = self.hostname - job.modificationTime = datetime.datetime.utcnow() - job.stateChangeTime = job.modificationTime - # reset - job.brokerageErrorDiag = None - job.brokerageErrorCode = None - # insert - self.cur.execute(sql3+comment, job.valuesMap()) - # job parameters - sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" - self.cur.execute(sqlJobP+comment, varMap) - for clobJobP, in self.cur: - job.jobParameters = clobJobP.read() - break - # Files - oldSubList = [] - sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=:PandaID" - self.cur.arraysize = 10000 - self.cur.execute(sqlFile+comment, varMap) - resFs = self.cur.fetchall() - for resF in resFs: - file = FileSpec() - file.pack(resF) - # reset GUID to trigger LRC/LFC scanning - if file.status == 'missing': - file.GUID = None - # collect old subs - if job.prodSourceLabel in ['managed','test'] and file.type in ['output','log'] \ - and re.search('_sub\d+$',file.destinationDBlock) != None: - 
if not file.destinationDBlock in oldSubList: - oldSubList.append(file.destinationDBlock) - # reset status, destinationDBlock and dispatchDBlock - file.status ='unknown' - file.dispatchDBlock = None - file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock) - # add file - job.addFile(file) - # update files - sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" - varMap = file.valuesMap(onlyChanged=True) - if varMap != {}: - varMap[':row_ID'] = file.row_ID - _logger.debug(sqlF+comment+str(varMap)) - self.cur.execute(sqlF+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # record status change - try: - self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) - except: - _logger.error('recordStatusChange in resetJobs') - if getOldSubs: - return job,oldSubList - return job - except: - # roll back - self._rollback() - # error report - type, value, traceBack = sys.exc_info() - _logger.error("resetJobs : %s %s" % (type,value)) - _logger.error("resetJobs : %s" % pandaID) - return None - - - # reset jobs in jobsDefined - def resetDefinedJob(self,pandaID,keepSite=False,getOldSubs=False): - comment = ' /* DBProxy.resetDefinedJob */' - _logger.debug("resetDefinedJob : %s" % pandaID) - sql1 = "UPDATE ATLAS_PANDA.jobsDefined4 SET " - sql1 += "jobStatus=:newJobStatus," - sql1 += "modificationTime=CURRENT_DATE," - sql1 += "dispatchDBlock=NULL," - sql1 += "computingElement=NULL" - sql1 += " WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)" - sql2 = "SELECT %s FROM ATLAS_PANDA.jobsDefined4 " % JobSpec.columnNames() - sql2+= "WHERE PandaID=:PandaID" - try: - oldSubList = [] - # begin transaction - self.conn.begin() - # update - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':newJobStatus'] = 'defined' - varMap[':oldJobStatus1'] = 'assigned' - varMap[':oldJobStatus2'] = 'defined' - self.cur.execute(sql1+comment,varMap) - retU = 
self.cur.rowcount - # not found - updatedFlag = False - job = None - if retU == 0: - _logger.debug("resetDefinedJob : Not found %s" % pandaID) - else: - # select - varMap = {} - varMap[':PandaID'] = pandaID - self.cur.arraysize = 10 - self.cur.execute(sql2+comment,varMap) - res = self.cur.fetchone() - # not found - if res == None: - raise RuntimeError, 'Could not SELECT : PandaID=%s' % pandaID - # instantiate Job - job = JobSpec() - job.pack(res) - # do nothing for analysis jobs - if job.prodSourceLabel in ['user','panda']: - _logger.debug('resetDefinedJob : rollback since PandaID=%s is analysis job' % pandaID) - # roll back - self._rollback() - return None - job.dispatchDBlock = None - if (not keepSite) and job.relocationFlag != 1: - # erase old assignment - job.computingSite = None - job.computingElement = None - # job parameters - sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" - self.cur.execute(sqlJobP+comment, varMap) - for clobJobP, in self.cur: - job.jobParameters = clobJobP.read() - break - # Files - sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=:PandaID" - self.cur.arraysize = 10000 - self.cur.execute(sqlFile+comment, varMap) - resFs = self.cur.fetchall() - for resF in resFs: - file = FileSpec() - file.pack(resF) - # collect old subs - if job.prodSourceLabel in ['managed','test'] and file.type in ['output','log'] \ - and re.search('_sub\d+$',file.destinationDBlock) != None: - if not file.destinationDBlock in oldSubList: - oldSubList.append(file.destinationDBlock) - # reset status, destinationDBlock and dispatchDBlock - file.status ='unknown' - file.dispatchDBlock = None - file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock) - # add file - job.addFile(file) - # update files - sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" - varMap = file.valuesMap(onlyChanged=True) - if varMap != 
{}: - varMap[':row_ID'] = file.row_ID - _logger.debug(sqlF+comment+str(varMap)) - self.cur.execute(sqlF+comment, varMap) - updatedFlag = True - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # record status change - try: - if updatedFlag: - self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) - except: - _logger.error('recordStatusChange in resetDefinedJobs') - if getOldSubs: - return job,oldSubList - return job - except: - # error report - type, value, traceBack = sys.exc_info() - _logger.error("resetDefinedJobs : %s %s" % (type,value)) - # roll back - self._rollback() - return None - - - # kill job - def killJob(self,pandaID,user,code,prodManager,getUserInfo=False,wgProdRole=[]): - # code - # 2 : expire - # 3 : aborted - # 4 : expire in waiting - # 7 : retry by server - # 8 : rebrokerage - # 9 : force kill - # 91 : kill user jobs with prod role - comment = ' /* DBProxy.killJob */' - _logger.debug("killJob : code=%s PandaID=%s role=%s user=%s wg=%s" % (code,pandaID,prodManager,user,wgProdRole)) - # check PandaID - try: - long(pandaID) - except: - _logger.error("not an integer : %s" % pandaID) - if getUserInfo: - return False,{} - return False - sql0 = "SELECT prodUserID,prodSourceLabel,jobDefinitionID,jobsetID,workingGroup FROM %s WHERE PandaID=:PandaID" - sql1 = "UPDATE %s SET commandToPilot=:commandToPilot,taskBufferErrorDiag=:taskBufferErrorDiag WHERE PandaID=:PandaID AND commandToPilot IS NULL" - sql1F = "UPDATE %s SET commandToPilot=:commandToPilot,taskBufferErrorDiag=:taskBufferErrorDiag WHERE PandaID=:PandaID" - sql2 = "SELECT %s " % JobSpec.columnNames() - sql2 += "FROM %s WHERE PandaID=:PandaID AND jobStatus<>:jobStatus" - sql3 = "DELETE FROM %s WHERE PandaID=:PandaID" - sqlU = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)" - sql4 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() - sql4 += JobSpec.bindValuesExpression() - 
sqlF = "UPDATE ATLAS_PANDA.filesTable4 SET status=:status WHERE PandaID=:PandaID AND type IN (:type1,:type2)" - sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" - try: - flagCommand = False - flagKilled = False - userProdUserID = '' - userProdSourceLabel = '' - userJobDefinitionID = '' - userJobsetID = '' - updatedFlag = False - # begin transaction - self.conn.begin() - for table in ('ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4'): - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # begin transaction - self.conn.begin() - # get DN if user is not production DN - varMap = {} - varMap[':PandaID'] = pandaID - self.cur.arraysize = 10 - self.cur.execute((sql0+comment) % table, varMap) - res = self.cur.fetchone() - # not found - if res == None: - continue - # owner? 
- def getCN(dn): - distinguishedName = '' - for line in dn.split('/'): - if line.startswith('CN='): - distinguishedName = re.sub('^CN=','',line) - distinguishedName = re.sub('\d+$','',distinguishedName) - distinguishedName = distinguishedName.strip() - break - if distinguishedName == '': - distinguishedName = dn - return distinguishedName - # prevent prod proxy from killing analysis jobs - userProdUserID,userProdSourceLabel,userJobDefinitionID,userJobsetID,workingGroup = res - # check group prod role - validGroupProdRole = False - if res[1] in ['managed','test'] and workingGroup != '': - for tmpGroupProdRole in wgProdRole: - if tmpGroupProdRole == '': - continue - if re.search('(^|_)'+tmpGroupProdRole+'$',workingGroup,re.I) != None: - validGroupProdRole = True - break - if prodManager: - if res[1] in ['user','panda'] and (not code in ['2','4','7','8','9','91']): - _logger.debug("ignore killJob -> prod proxy tried to kill analysis job type=%s" % res[1]) - break - _logger.debug("killJob : %s using prod role" % pandaID) - elif validGroupProdRole: - # WGs with prod role - _logger.debug("killJob : %s using group prod role for workingGroup=%s" % (pandaID,workingGroup)) - pass - else: - cn1 = getCN(res[0]) - cn2 = getCN(user) - _logger.debug("Owner:%s - Requester:%s " % (cn1,cn2)) - if cn1 != cn2: - _logger.debug("ignore killJob -> Owner != Requester") - break - # update - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':commandToPilot'] = 'tobekilled' - varMap[':taskBufferErrorDiag'] = 'killed by %s' % user - if userProdSourceLabel in ['managed','test'] and code in ['9',]: - # ignore commandToPilot for force kill - self.cur.execute((sql1F+comment) % table, varMap) - else: - self.cur.execute((sql1+comment) % table, varMap) - retU = self.cur.rowcount - if retU == 0: - continue - # set flag - flagCommand = True - # select - varMap = {} - varMap[':PandaID'] = pandaID - if (userProdSourceLabel in ['managed','test'] or 'test' in userProdSourceLabel) and code in ['9',]: - 
# use dummy for force kill - varMap[':jobStatus'] = 'dummy' - else: - varMap[':jobStatus'] = 'running' - self.cur.arraysize = 10 - self.cur.execute((sql2+comment) % table, varMap) - res = self.cur.fetchall() - if len(res) == 0: - continue - # instantiate JobSpec - job = JobSpec() - job.pack(res[0]) - # delete - if table=='ATLAS_PANDA.jobsDefined4': - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':oldJobStatus1'] = 'assigned' - varMap[':oldJobStatus2'] = 'defined' - self.cur.execute(sqlU+comment, varMap) - else: - varMap = {} - varMap[':PandaID'] = pandaID - self.cur.execute((sql3+comment) % table, varMap) - retD = self.cur.rowcount - if retD == 0: - continue - # error code - if job.jobStatus != 'failed': - # set status etc for non-failed jobs - job.endTime = datetime.datetime.utcnow() - job.modificationTime = job.endTime - if code in ['2','4']: - # expire - if code == '2': - job.taskBufferErrorCode = ErrorCode.EC_Expire - job.taskBufferErrorDiag = 'expired after 7 days since submission' - else: - # waiting timeout - job.taskBufferErrorCode = ErrorCode.EC_Expire - #job.taskBufferErrorCode = ErrorCode.EC_WaitTimeout - job.taskBufferErrorDiag = 'expired after waiting for input data for 2 days' - elif code=='3': - # aborted - job.taskBufferErrorCode = ErrorCode.EC_Aborted - job.taskBufferErrorDiag = 'aborted by ExtIF' - elif code=='8': - # reassigned by rebrokeage - job.taskBufferErrorCode = ErrorCode.EC_Reassigned - job.taskBufferErrorDiag = 'reassigned to another site by rebrokerage. new %s' % user - job.commandToPilot = None - else: - # killed - job.taskBufferErrorCode = ErrorCode.EC_Kill - job.taskBufferErrorDiag = 'killed by %s' % user - # set job status - job.jobStatus = 'cancelled' - else: - # keep status for failed jobs - job.modificationTime = datetime.datetime.utcnow() - if code=='7': - # retried by server - job.taskBufferErrorCode = ErrorCode.EC_Retried - job.taskBufferErrorDiag = 'retrying at another site. 
new %s' % user - job.commandToPilot = None - job.stateChangeTime = job.modificationTime - # insert - self.cur.execute(sql4+comment, job.valuesMap()) - # update file - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':status'] = 'failed' - varMap[':type1'] = 'output' - varMap[':type2'] = 'log' - self.cur.execute(sqlF+comment,varMap) - # update files,metadata,parametes - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':modificationTime'] = job.modificationTime - self.cur.execute(sqlFMod+comment,varMap) - self.cur.execute(sqlMMod+comment,varMap) - self.cur.execute(sqlPMod+comment,varMap) - flagKilled = True - updatedFlag = True - break - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("killJob : com=%s kill=%s " % (flagCommand,flagKilled)) - # record status change - try: - if updatedFlag: - self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) - except: - _logger.error('recordStatusChange in killJob') - if getUserInfo: - return (flagCommand or flagKilled),{'prodUserID':userProdUserID, - 'prodSourceLabel':userProdSourceLabel, - 'jobDefinitionID':userJobDefinitionID, - 'jobsetID':userJobsetID} - return (flagCommand or flagKilled) - except: - type, value, traceBack = sys.exc_info() - _logger.error("killJob : %s %s" % (type,value)) - # roll back - self._rollback() - if getUserInfo: - return False,{} - return False - - - # peek at job - def peekJob(self,pandaID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal=False): - comment = ' /* DBProxy.peekJob */' - _logger.debug("peekJob : %s" % pandaID) - # return None for NULL PandaID - if pandaID in ['NULL','','None',None]: - return None - # only int - try: - tmpID = int(pandaID) - except: - _logger.debug("peekJob : return None for %s:non-integer" % pandaID) - return None - sql1_0 = "SELECT %s FROM %s " - sql1_1 = "WHERE PandaID=:PandaID" - nTry=3 - for iTry in range(nTry): - try: - tables=[] - if fromDefined: - tables.append('ATLAS_PANDA.jobsDefined4') - if 
fromActive: - tables.append('ATLAS_PANDA.jobsActive4') - if fromArchived: - tables.append('ATLAS_PANDA.jobsArchived4') - if fromWaiting: - tables.append('ATLAS_PANDA.jobsWaiting4') - if fromDefined: - # for jobs which are just reset - tables.append('ATLAS_PANDA.jobsDefined4') - # select - varMap = {} - varMap[':PandaID'] = pandaID - for table in tables: - # start transaction - self.conn.begin() - # select - sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1 - self.cur.arraysize = 10 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if len(res) != 0: - # Job - job = JobSpec() - job.pack(res[0]) - # Files - # start transaction - self.conn.begin() - # select - sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=:PandaID" - self.cur.arraysize = 10000 - self.cur.execute(sqlFile+comment, varMap) - resFs = self.cur.fetchall() - # metadata - resMeta = None - if table == 'ATLAS_PANDA.jobsArchived4' or forAnal: - # read metadata only for finished/failed production jobs - sqlMeta = "SELECT metaData FROM ATLAS_PANDA.metaTable WHERE PandaID=:PandaID" - self.cur.execute(sqlMeta+comment, varMap) - for clobMeta, in self.cur: - if clobMeta != None: - resMeta = clobMeta.read() - break - # job parameters - job.jobParameters = None - sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" - varMap = {} - varMap[':PandaID'] = job.PandaID - self.cur.execute(sqlJobP+comment, varMap) - for clobJobP, in self.cur: - if clobJobP != None: - job.jobParameters = clobJobP.read() - break - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # set files - for resF in resFs: - file = FileSpec() - file.pack(resF) - job.addFile(file) - # set metadata - job.metadata = resMeta - return job - _logger.debug("peekJob() : PandaID %s not found" % pandaID) - return None - except: - # roll back - 
self._rollback() - if iTry+1 < nTry: - _logger.debug("peekJob : %s retry : %s" % (pandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("peekJob : %s %s %s" % (pandaID,type,value)) - # return None for analysis - if forAnal: - return None - # return 'unknown' - job = JobSpec() - job.PandaID = pandaID - job.jobStatus = 'unknown' - return job - - - # get PandaID with jobexeID - def getPandaIDwithJobExeID(self,jobexeID): - comment = ' /* DBProxy.getPandaIDwithJobExeID */' - _logger.debug("getPandaIDwithJobExeID : %s" % jobexeID) - failedRetVal = (None,None,'') - # return for wrong jobexeID - if jobexeID in ['NULL','','None',None]: - return failedRetVal - # SQL - sql = "SELECT PandaID,jobDefinitionID,jobName FROM ATLAS_PANDA.jobsWaiting4 " - sql += "WHERE jobExecutionID=:jobexeID AND prodSourceLabel=:prodSourceLabel " - sql += "AND jobStatus=:jobStatus " - varMap = {} - varMap[':jobexeID'] = jobexeID - varMap[':jobStatus'] = 'pending' - varMap[':prodSourceLabel'] = 'managed' - try: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10 - self.cur.execute(sql+comment,varMap) - res = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # not found - if res == None: - _logger.debug("getPandaIDwithJobExeID : jobexeID %s not found" % jobexeID) - return failedRetVal - _logger.debug("getPandaIDwithJobExeID : %s -> %s" % (jobexeID,str(res))) - return res - except: - # roll back - self._rollback() - errtype,errvalue = sys.exc_info()[:2] - _logger.error("getPandaIDwithJobExeID : %s %s %s" % (jobexeID,errtype,errvalue)) - return failedRetVal - - - # get express jobs - def getExpressJobs(self,dn): - comment = ' /* DBProxy.getExpressJobs */' - _logger.debug("getExpressJobs : %s" % dn) - sqlX = "SELECT specialHandling,COUNT(*) FROM %s " - sqlX += "WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel1 " - sqlX += "AND specialHandling IS 
NOT NULL " - sqlXJob = "SELECT PandaID,jobStatus,prodSourceLabel,modificationTime,jobDefinitionID,jobsetID,startTime,endTime FROM %s " - sqlXJob += "WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel1 " - sqlXJob += "AND specialHandling IS NOT NULL AND specialHandling=:specialHandling " - sqlQ = sqlX - sqlQ += "GROUP BY specialHandling " - sqlQJob = sqlXJob - sqlA = sqlX - sqlA += "AND modificationTime>:modificationTime GROUP BY specialHandling " - sqlAJob = sqlXJob - sqlAJob += "AND modificationTime>:modificationTime " - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - expressStr = 'express' - activeExpressU = [] - timeUsageU = datetime.timedelta(0) - executionTimeU = datetime.timedelta(hours=1) - jobCreditU = 3 - timeCreditU = executionTimeU * jobCreditU - timeNow = datetime.datetime.utcnow() - timeLimit = timeNow - datetime.timedelta(hours=6) - # loop over tables - for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']: - varMap = {} - varMap[':prodUserName'] = compactDN - varMap[':prodSourceLabel1'] = 'user' - if table == 'ATLAS_PANDA.jobsArchived4': - varMap[':modificationTime'] = timeLimit - sql = sqlA % table - sqlJob = sqlAJob % table - else: - sql = sqlQ % table - sqlJob = sqlQJob % table - # start transaction - self.conn.begin() - # get the number of jobs for each specialHandling - self.cur.arraysize = 10 - _logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - _logger.debug("getExpressJobs %s" % str(res)) - for specialHandling,countJobs in res: - if specialHandling == None: - continue - # look for express jobs - if expressStr in specialHandling: - varMap[':specialHandling'] = specialHandling - self.cur.arraysize = 1000 - self.cur.execute(sqlJob+comment, varMap) - resJobs = self.cur.fetchall() - _logger.debug("getExpressJobs %s" % str(resJobs)) - for 
tmp_PandaID,tmp_jobStatus,tmp_prodSourceLabel,tmp_modificationTime,\ - tmp_jobDefinitionID,tmp_jobsetID,tmp_startTime,tmp_endTime \ - in resJobs: - # collect active jobs - if not tmp_jobStatus in ['finished','failed','cancelled']: - activeExpressU.append((tmp_PandaID,tmp_jobsetID,tmp_jobDefinitionID)) - # get time usage - if not tmp_jobStatus in ['defined','activated']: - # check only jobs which actually use or used CPU on WN - if tmp_startTime != None: - # running or not - if tmp_endTime == None: - # job got started before/after the time limit - if timeLimit > tmp_startTime: - timeDelta = timeNow - timeLimit - else: - timeDelta = timeNow - tmp_startTime - else: - # job got started before/after the time limit - if timeLimit > tmp_startTime: - timeDelta = tmp_endTime - timeLimit - else: - timeDelta = tmp_endTime - tmp_startTime - # add - if timeDelta > datetime.timedelta(0): - timeUsageU += timeDelta - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # check quota - rRet = True - rRetStr = '' - rQuota = 0 - if len(activeExpressU) >= jobCreditU: - rRetStr += "The number of queued runXYZ exceeds the limit = %s. " % jobCreditU - rRet = False - if timeUsageU >= timeCreditU: - rRetStr += "The total execution time for runXYZ exceeds the limit = %s min. " % (timeCreditU.seconds / 60) - rRet = False - # calculate available quota - if rRet: - tmpQuota = jobCreditU - len(activeExpressU) - timeUsageU.seconds/executionTimeU.seconds - if tmpQuota < 0: - rRetStr += "Quota for runXYZ exceeds. 
" - rRet = False - else: - rQuota = tmpQuota - # return - retVal = {'status':rRet,'quota':rQuota,'output':rRetStr,'usage':timeUsageU,'jobs':activeExpressU} - _logger.debug("getExpressJobs : %s" % str(retVal)) - return retVal - except: - # roll back - self._rollback() - errtype,errvalue = sys.exc_info()[:2] - _logger.error("getExpressJobs : %s %s" % (errtype,errvalue)) - return None - - - # get active debug jobs - def getActiveDebugJobs(self,dn): - comment = ' /* DBProxy.getActiveDebugJobs */' - _logger.debug("getActiveDebugJobs : %s" % dn) - sqlX = "SELECT PandaID,jobStatus,specialHandling FROM %s " - sqlX += "WHERE prodUserName=:prodUserName " - sqlX += "AND specialHandling IS NOT NULL " - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - debugStr = 'debug' - activeDebugJobs = [] - # loop over tables - for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']: - varMap = {} - varMap[':prodUserName'] = compactDN - sql = sqlX % table - # start transaction - self.conn.begin() - # get jobs with specialHandling - self.cur.arraysize = 100000 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # loop over all PandaIDs - for pandaID,jobStatus,specialHandling in res: - if specialHandling == None: - continue - # only active jobs - if not jobStatus in ['defined','activated','running','sent','starting']: - continue - # look for debug jobs - if debugStr in specialHandling and not pandaID in activeDebugJobs: - activeDebugJobs.append(pandaID) - # return - activeDebugJobs.sort() - _logger.debug("getActiveDebugJobs : %s -> %s" % (dn,str(activeDebugJobs))) - return activeDebugJobs - except: - # roll back - self._rollback() - errtype,errvalue = sys.exc_info()[:2] - _logger.error("getActiveDebugJobs : %s %s" % (errtype,errvalue)) - return None - - - # set debug mode - def setDebugMode(self,dn,pandaID,prodManager,modeOn): 
- comment = ' /* DBProxy.setDebugMode */' - _logger.debug("turnDebugModeOn : dn=%s id=%s prod=%s mode=%s" % (dn,pandaID,prodManager,modeOn)) - sqlX = "SELECT prodUserName,jobStatus,specialHandling FROM %s " - sqlX += "WHERE PandaID=:PandaID " - sqlU = "UPDATE %s SET specialHandling=:specialHandling " - sqlU += "WHERE PandaID=:PandaID " - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - debugStr = 'debug' - retStr = '' - retCode = False - # loop over tables - for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']: - varMap = {} - varMap[':PandaID'] = pandaID - sql = sqlX % table - # start transaction - self.conn.begin() - # get jobs with specialHandling - self.cur.arraysize = 10 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchone() - # not found - if res == None: - retStr = 'Not found in active DB' - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - continue - prodUserName,jobStatus,specialHandling = res - # not active - if not jobStatus in ['defined','activated','running','sent','starting']: - retStr = 'Not in one of active job status' - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - break - # not owner - if not prodManager and prodUserName != compactDN: - retStr = 'Permission denied. 
Not the owner' - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - break - # set specialHandling - updateSH = True - if specialHandling in [None,'']: - if modeOn: - # set debug mode - specialHandling = debugStr - else: - # already disabled debug mode - updateSH = False - elif debugStr in specialHandling: - if modeOn: - # already in debug mode - updateSH = False - else: - # disable debug mode - specialHandling = re.sub(debugStr,'',specialHandling) - specialHandling = re.sub(',,',',',specialHandling) - specialHandling = re.sub('^,','',specialHandling) - specialHandling = re.sub(',$','',specialHandling) - else: - if modeOn: - # set debug mode - specialHandling = '%s,%s' % (debugStr,specialHandling) - else: - # already disabled debug mode - updateSH = False - - # no update - if not updateSH: - retStr = 'Already set accordingly' - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - break - # update - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':specialHandling'] = specialHandling - self.cur.execute((sqlU+comment) % table, varMap) - retD = self.cur.rowcount - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if retD == 0: - retStr = 'Failed to update DB' - else: - retStr = 'Succeeded' - break - # return - _logger.debug("setDebugMode : %s %s -> %s" % (dn,pandaID,retStr)) - return retStr - except: - # roll back - self._rollback() - errtype,errvalue = sys.exc_info()[:2] - _logger.error("setDebugMode : %s %s" % (errtype,errvalue)) - return None - - - # get PandaID with destinationDBlock - def getPandaIDwithDestDBlock(self,destinationDBlock): - comment = ' /* DBProxy.getPandaIDwithDestDBlock */' - _logger.debug("getPandaIDwithDestDBlock : %s" % destinationDBlock) - try: - sqlP = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab " - sqlP += "WHERE type IN (:type1,:type2) AND destinationDBlock=:destinationDBlock AND rownum<=1" - # start transaction - 
self.conn.begin() - pandaID = None - varMap = {} - varMap[':type1'] = 'log' - varMap[':type2'] = 'output' - varMap[':destinationDBlock'] = destinationDBlock - # select - self.cur.arraysize = 10 - self.cur.execute(sqlP+comment, varMap) - res = self.cur.fetchone() - # append - if res != None: - pandaID, = res - # commit to release tables - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return pandaID - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getPandaIDwithDestDBlock : %s %s" % (errType,errValue)) - # return empty list - return None - - - # get destSE with destinationDBlock - def getDestSEwithDestDBlock(self,destinationDBlock): - comment = ' /* DBProxy.getDestSEwithDestDBlock */' - _logger.debug("getDestSEwithDestDBlock : %s" % destinationDBlock) - try: - sqlP = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ destinationSE FROM ATLAS_PANDA.filesTable4 tab " - sqlP += "WHERE type IN (:type1,:type2) AND destinationDBlock=:destinationDBlock AND rownum<=1" - # start transaction - self.conn.begin() - varMap = {} - varMap[':type1'] = 'log' - varMap[':type2'] = 'output' - varMap[':destinationDBlock'] = destinationDBlock - # select - self.cur.arraysize = 10 - self.cur.execute(sqlP+comment, varMap) - res = self.cur.fetchone() - # append - destinationSE = None - if res != None: - destinationSE, = res - # commit to release tables - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return destinationSE - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getDestSEwithDestDBlock : %s %s" % (errType,errValue)) - # return empty list - return None - - - # get number of activated/defined jobs with output datasets - def getNumWaitingJobsWithOutDS(self,outputDSs): - comment = ' /* DBProxy.getNumWaitingJobsWithOutDS */' - _logger.debug("getNumWaitingJobsWithOutDS : %s" % str(outputDSs)) - try: - sqlD = "SELECT distinct 
destinationDBlock FROM ATLAS_PANDA.filesTable4 " - sqlD += "WHERE type IN (:type1,:type2) AND dataset=:dataset AND status IN (:status1,:status2)" - sqlP = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab " - sqlP += "WHERE type IN (:type1,:type2) AND destinationDBlock=:destinationDBlock AND status IN (:status1,:status2) AND rownum<=1" - sqlJ = "SELECT jobDefinitionID,taskID,prodUserName,jobStatus,prodSourceLabel FROM %s " - sqlJ += "WHERE PandaID=:PandaID" - sqlC = "SELECT count(*) FROM ATLAS_PANDA.jobsActive4 " - sqlC += "WHERE jobDefinitionID=:jobDefinitionID AND prodUserName=:prodUserName AND jobStatus IN (:jobStatus1)" - # start transaction - self.conn.begin() - # get sub datasets - subDSList = [] - for outputDS in outputDSs: - varMap = {} - varMap[':type1'] = 'log' - varMap[':type2'] = 'output' - varMap[':status1'] = 'unknown' - varMap[':status2'] = 'pending' - varMap[':dataset'] = outputDS - # select - self.cur.arraysize = 1000 - self.cur.execute(sqlD+comment, varMap) - resList = self.cur.fetchall() - # append - for destinationDBlock, in resList: - subDSList.append(destinationDBlock) - # get PandaIDs - pandaIDs = [] - for subDS in subDSList: - varMap = {} - varMap[':type1'] = 'log' - varMap[':type2'] = 'output' - varMap[':status1'] = 'unknown' - varMap[':status2'] = 'pending' - varMap[':destinationDBlock'] = subDS - # select - self.cur.arraysize = 10 - self.cur.execute(sqlP+comment, varMap) - res = self.cur.fetchone() - # append - if res != None: - pandaID, = res - pandaIDs.append(pandaID) - # commit to release tables - if not self._commit(): - raise RuntimeError, 'Commit error' - # loop over all PandaIDs - jobInfos = [] - for pandaID in pandaIDs: - varMap = {} - varMap[':PandaID'] = pandaID - # start transaction - self.conn.begin() - # get jobID,nJobs,jobStatus,userName - res = None - for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']: - # select - self.cur.arraysize = 10 - self.cur.execute((sqlJ 
% table)+comment,varMap) - res = self.cur.fetchone() - if res != None: - break - # commit to release tables - if not self._commit(): - raise RuntimeError, 'Commit error' - # not found - if res == None: - continue - # append - jobInfos.append(res) - # no jobs - if jobInfos == []: - _logger.error("getNumWaitingJobsWithOutDS : no jobs found") - return False,{} - # loop over all jobIDs - retMap = {} - for jobID,taskID,prodUserName,jobStatus,prodSourceLabel in jobInfos: - if retMap.has_key(jobID): - continue - retMap[jobID] = {} - retMap[jobID]['nJobs'] = taskID - retMap[jobID]['sourceLabel'] = prodSourceLabel - # don't check # of activated - if jobStatus in ['defined']: - retMap[jobID]['activated'] = False - retMap[jobID]['nActs'] = 0 - continue - retMap[jobID]['activated'] = True - # get # of activated jobs - varMap = {} - varMap[':prodUserName'] = prodUserName - varMap[':jobDefinitionID'] = jobID - varMap[':jobStatus1'] = 'activated' - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10 - self.cur.execute(sqlC+comment, varMap) - res = self.cur.fetchone() - # commit to release tables - if not self._commit(): - raise RuntimeError, 'Commit error' - if res == None: - _logger.error("getNumWaitingJobsWithOutDS : cannot get # of activated for %s:%s" % \ - (jobID,prodUserName)) - return False,{} - # set # of activated - nActs, = res - retMap[jobID]['nActs'] = nActs - # return - _logger.debug("getNumWaitingJobsWithOutDS -> %s" % str(retMap)) - return True,retMap - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getNumWaitingJobsWithOutDS : %s %s" % (errType,errValue)) - # return empty list - return False,{} - - - # get slimmed file info with PandaIDs - def getSlimmedFileInfoPandaIDs(self,pandaIDs): - comment = ' /* DBProxy.getSlimmedFileInfoPandaIDs */' - _logger.debug("getSlimmedFileInfoPandaIDs : %s len=%s" % (pandaIDs[0],len(pandaIDs))) - try: - sqlL = "SELECT lfn,type,dataset FROM 
ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID" - sqlA = "SELECT /*+ INDEX(tab FILES_ARCH_PANDAID_IDX)*/ lfn,type,dataset FROM ATLAS_PANDAARCH.filesTable_ARCH tab " - sqlA += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-60)" - retMap = {'inDS':[],'outDS':[]} - # start transaction - self.conn.begin() - # select - for pandaID in pandaIDs: - # make sql - varMap = {} - varMap[':PandaID'] = pandaID - # select - self.cur.arraysize = 10000 - self.cur.execute(sqlL+comment, varMap) - resList = self.cur.fetchall() - # try archived if not found in filesTable4 - if len(resList) == 0: - self.cur.execute(sqlA+comment, varMap) - resList = self.cur.fetchall() - # append - for tmp_lfn,tmp_type,tmp_dataset in resList: - # skip lib.tgz - if tmp_lfn.endswith('.lib.tgz'): - continue - if tmp_type == 'input': - if not tmp_dataset in retMap['inDS']: - retMap['inDS'].append(tmp_dataset) - elif tmp_type == 'output': - if not tmp_dataset in retMap['outDS']: - retMap['outDS'].append(tmp_dataset) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("getSlimmedFileInfoPandaIDs : %s" % str(retMap)) - return retMap - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getSlimmedFileInfoPandaIDs : %s %s" % (type,value)) - # return empty list - return {} - - - # get JobIDs in a time range - def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs): - comment = ' /* DBProxy.getJobIDsInTimeRange */' - _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - tables = ['ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4'] - # select - for table in tables: - # make sql - if table == 'ATLAS_PANDA.jobsArchived4': - sql = 'SELECT /*+ INDEX_RS_ASC(TAB("JOBSARCHIVED4"."PRODUSERNAME")) 
NO_INDEX(TAB("JOBSARCHIVED4"."MODIFICATIONTIME")) */ jobDefinitionID FROM %s tab ' % table - elif table == 'ATLAS_PANDA.jobsActive4': - sql = 'SELECT /*+ INDEX_RS_ASC(TAB("JOBSACTIVE4"."PRODUSERNAME")) NO_INDEX(TAB("JOBSACTIVE4"."MODIFICATIONTIME")) */ jobDefinitionID FROM %s tab ' % table - else: - sql = "SELECT jobDefinitionID FROM %s " % table - sql += "WHERE prodUserName=:prodUserName AND modificationTime>:modificationTime " - sql += "AND prodSourceLabel=:prodSourceLabel GROUP BY jobDefinitionID" - varMap = {} - varMap[':prodUserName'] = compactDN - varMap[':prodSourceLabel'] = 'user' - varMap[':modificationTime'] = timeRange - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - _logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for tmpID, in resList: - if not tmpID in retJobIDs: - retJobIDs.append(tmpID) - _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs)) - return retJobIDs - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getJobIDsInTimeRange : %s %s" % (type,value)) - # return empty list - return [] - - - # get PandaIDs for a JobID - def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs): - comment = ' /* DBProxy.getPandIDsWithJobID */' - _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID)) - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - tables = ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsArchived4'] - buildJobID = None - # select - for table in tables: - # skip if all jobs have already been gotten - if nJobs > 0 and len(idStatus) >= nJobs: - continue - # make sql - sql = "SELECT PandaID,jobStatus,commandToPilot,prodSourceLabel,taskBufferErrorCode FROM %s " % table - sql += "WHERE 
prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND prodSourceLabel in (:prodSourceLabel1,:prodSourceLabel2)" - varMap = {} - varMap[':prodUserName'] = compactDN - varMap[':jobDefinitionID'] = jobID - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - # select - _logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # append - for tmpID,tmpStatus,tmpCommand,tmpProdSourceLabel,tmpTaskBufferErrorCode in resList: - # ignore jobs retried by pilot since they have new PandaIDs with the same jobsetID/jobdefID - if tmpTaskBufferErrorCode in [ErrorCode.EC_PilotRetried]: - continue - # ignore old buildJob which was replaced by rebrokerage - if tmpProdSourceLabel == 'panda': - if buildJobID == None: - # first buildJob - buildJobID = tmpID - elif buildJobID >= tmpID: - # don't append old one - continue - else: - # delete old one - del idStatus[buildJobID] - buildJobID = tmpID - # append - idStatus[tmpID] = (tmpStatus,tmpCommand) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("getPandIDsWithJobID : %s" % str(idStatus)) - return idStatus,buildJobID - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getPandIDsWithJobID : %s %s" % (type,value)) - # return empty list - return {},None - - - # lock jobs for reassign - def lockJobsForReassign(self,tableName,timeLimit,statList,labels,processTypes,sites,clouds): - comment = ' /* DBProxy.lockJobsForReassign */' - _logger.debug("lockJobsForReassign : %s %s %s %s %s %s %s" % \ - (tableName,timeLimit,statList,labels,processTypes,sites,clouds)) - try: - # make sql - sql = "SELECT PandaID FROM %s " % tableName - sql += "WHERE modificationTime<:modificationTime " - varMap = {} - varMap[':modificationTime'] = timeLimit - if statList != []: - sql += 'AND 
jobStatus IN (' - tmpIdx = 0 - for tmpStat in statList: - tmpKey = ':stat%s' % tmpIdx - varMap[tmpKey] = tmpStat - sql += '%s,' % tmpKey - sql = sql[:-1] - sql += ') ' - if labels != []: - sql += 'AND prodSourceLabel IN (' - tmpIdx = 0 - for tmpStat in labels: - tmpKey = ':label%s' % tmpIdx - varMap[tmpKey] = tmpStat - sql += '%s,' % tmpKey - sql = sql[:-1] - sql += ') ' - if processTypes != []: - sql += 'AND processingType IN (' - tmpIdx = 0 - for tmpStat in processTypes: - tmpKey = ':processType%s' % tmpIdx - varMap[tmpKey] = tmpStat - sql += '%s,' % tmpKey - sql = sql[:-1] - sql += ') ' - if sites != []: - sql += 'AND computingSite IN (' - tmpIdx = 0 - for tmpStat in sites: - tmpKey = ':site%s' % tmpIdx - varMap[tmpKey] = tmpStat - sql += '%s,' % tmpKey - sql = sql[:-1] - sql += ') ' - if clouds != []: - sql += 'AND cloud IN (' - tmpIdx = 0 - for tmpStat in clouds: - tmpKey = ':cloud%s' % tmpIdx - varMap[tmpKey] = tmpStat - sql += '%s,' % tmpKey - sql = sql[:-1] - sql += ') ' - # sql for lock - sqlLock = 'UPDATE %s SET modificationTime=CURRENT_DATE WHERE PandaID=:PandaID' % tableName - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 1000000 - self.cur.execute(sql+comment,varMap) - resList = self.cur.fetchall() - retList = [] - # lock - for tmpID, in resList: - varLock = {':PandaID':tmpID} - self.cur.execute(sqlLock+comment,varLock) - retList.append((tmpID,)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # sort - retList.sort() - _logger.debug("lockJobsForReassign : %s" % (len(retList))) - return True,retList - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("lockJobsForReassign : %s %s" % (errType,errValue)) - # return empty - return False,[] - - - # lock jobs for finisher - def lockJobsForFinisher(self,timeNow,rownum,highPrio): - comment = ' /* DBProxy.lockJobsForFinisher */' - _logger.debug("lockJobsForFinisher : %s %s %s" % (timeNow,rownum,highPrio)) - try: 
- varMap = {} - varMap[':jobStatus'] = 'transferring' - varMap[':currentPriority'] = 800 - varMap[':prodSourceLabel'] = 'managed' - # make sql - sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 " - sql += "WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel " - if highPrio: - varMap[':modificationTime'] = timeNow - datetime.timedelta(hours=1) - sql += "AND currentPriority>=:currentPriority AND rownum<=%s " % rownum - else: - sql += "AND currentPriority<:currentPriority AND rownum<=%s " % rownum - varMap[':modificationTime'] = timeNow - datetime.timedelta(hours=12) - sql += "FOR UPDATE " - # sql for lock - sqlLock = 'UPDATE ATLAS_PANDA.jobsActive4 SET modificationTime=CURRENT_DATE WHERE PandaID=:PandaID' - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 1000 - self.cur.execute(sql+comment,varMap) - resList = self.cur.fetchall() - retList = [] - # lock - for tmpID, in resList: - varLock = {':PandaID':tmpID} - self.cur.execute(sqlLock+comment,varLock) - retList.append(tmpID) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # sort - retList.sort() - _logger.debug("lockJobsForFinisher : %s" % (len(retList))) - return True,retList - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("lockJobsForFinisher : %s %s" % (errType,errValue)) - # return empty - return False,[] - - - # get the number of waiting jobs with a dataset - def getNumWaitingJobsForPD2P(self,datasetName): - comment = ' /* DBProxy.getNumWaitingJobsForPD2P */' - _logger.debug("getNumWaitingJobsForPD2P : %s" % datasetName) - try: - tables = ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4'] - nJobs = 0 - # select - for table in tables: - # make sql - sql = "SELECT COUNT(*) FROM %s " % table - sql += "WHERE prodDBlock=:prodDBlock AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " - sql += "AND jobStatus IN (:jobStatus1,:jobStatus2) " - varMap = 
{} - varMap[':prodDBlock'] = datasetName - varMap[':jobStatus1'] = 'defined' - varMap[':jobStatus2'] = 'activated' - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if res != None: - tmpN, = res - nJobs += tmpN - _logger.debug("getNumWaitingJobsForPD2P : %s -> %s" % (datasetName,nJobs)) - return nJobs - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getNumWaitingJobsForPD2P : %s %s" % (errType,errValue)) - # return 0 - return 0 - - - # get the number of waiting jobsets with a dataset - def getNumWaitingJobsetsForPD2P(self,datasetName): - comment = ' /* DBProxy.getNumWaitingJobsetsForPD2P */' - _logger.debug("getNumWaitingJobsetsForPD2P : %s" % datasetName) - try: - tables = ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4'] - jobsetIDuserList = [] - # select - for table in tables: - # make sql - sql = "SELECT jobsetID,prodUserName FROM %s " % table - sql += "WHERE prodDBlock=:prodDBlock AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " - sql += "AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY jobsetID,prodUserName" - varMap = {} - varMap[':prodDBlock'] = datasetName - varMap[':jobStatus1'] = 'defined' - varMap[':jobStatus2'] = 'activated' - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - for jobsetID,prodUserName in resList: - tmpKey = (jobsetID,prodUserName) - if not tmpKey in jobsetIDuserList: - jobsetIDuserList.append(tmpKey) - _logger.debug("getNumWaitingJobsetsForPD2P : %s -> %s" % 
(datasetName,len(jobsetIDuserList))) - return len(jobsetIDuserList) - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getNumWaitingJobsetsForPD2P : %s %s" % (errType,errValue)) - # return 0 - return 0 - - - # lock job for re-brokerage - def lockJobForReBrokerage(self,dn,jobID,simulation,forceOpt,forFailed=False): - comment = ' /* lockJobForReBrokerage */' - _logger.debug("lockJobForReBrokerage : %s %s %s %s %s" % (dn,jobID,simulation,forceOpt,forFailed)) - try: - errMsg = '' - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - # start transaction - self.conn.begin() - buildJobPandaID = None - buildJobStatus = None - buildJobDefID = None - buildCreationTime = None - runPandaID = None - minPandaIDlibDS = None - maxPandaIDlibDS = None - # get one runXYZ job - if errMsg == '': - for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']: - sql = "SELECT PandaID FROM %s " % table - sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND prodSourceLabel=:prodSourceLabel1 AND jobStatus IN (:jobStatus1,:jobStatus2) " - sql += "AND rownum <= 1" - varMap = {} - varMap[':prodUserName'] = compactDN - varMap[':jobDefinitionID'] = jobID - varMap[':prodSourceLabel1'] = 'user' - if not forFailed: - # lock active jobs for normal rebrokerage - varMap[':jobStatus1'] = 'defined' - varMap[':jobStatus2'] = 'activated' - else: - # lock failed jobs for retry - varMap[':jobStatus1'] = 'failed' - varMap[':jobStatus2'] = 'dummy' - # select - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchone() - # not found - if res != None: - runPandaID, = res - break - if runPandaID == None: - if not forFailed: - errMsg = "no defined/activated jobs to reassign. 
running/finished/failed jobs are not reassigned by rebrokerage " - else: - errMsg = "could not get failed runXYZ jobs" - # get libDS - libDS = '' - if errMsg == '': - sql = "SELECT lfn,dataset FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND PandaID=:PandaID" - varMap = {} - varMap[':type'] = 'input' - varMap[':PandaID'] = runPandaID - # select - self.cur.arraysize = 10000 - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - for tmpLFN,tmpDS in resList: - if tmpLFN.endswith('.lib.tgz'): - libDS = tmpDS - break - # check status of corresponding buildJob - if libDS != '': - sql = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 " - sql += "WHERE type=:type AND dataset=:dataset" - varMap = {} - varMap[':type'] = 'output' - varMap[':dataset'] = libDS - # select - self.cur.arraysize = 10 - # select - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchone() - # not found in active table - if res == None: - # look for buildJob in archived table - sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODSOURCELABEL_IDX JOBS_PRODUSERNAME_IDX) */ " - sql += "PandaID,jobStatus,jobDefinitionID,creationTime " - sql += "FROM ATLAS_PANDAARCH.jobsArchived tab " - sql += "WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLable1 " - sql += "AND modificationTime>(CURRENT_DATE-10) ORDER BY PandaID DESC" - varMap = {} - varMap[':prodUserName'] = compactDN - varMap[':prodSourceLable1'] = 'panda' - # select - self.cur.arraysize = 10000 - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # loop over PandaIDs to find corresponding libDS - sql = "SELECT /*+ INDEX(tab FILES_ARCH_PANDAID_IDX)*/ PandaID FROM ATLAS_PANDAARCH.filesTable_ARCH tab " - sql += "WHERE PandaID=:PandaID AND type=:type AND dataset=:dataset AND status=:status " - sql += "AND modificationTime>(CURRENT_DATE-10)" - self.cur.arraysize = 10 - for tmpID,tmpJobStatus,tmpJobDefID,tmpCreationTime in resList: - varMap = {} - varMap[':PandaID'] = 
tmpID - varMap[':type'] = 'output' - varMap[':status'] = 'ready' - varMap[':dataset'] = libDS - # select - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchone() - if res != None: - # get PandaID of buildJob - buildJobPandaID, = res - buildJobStatus = tmpJobStatus - buildJobDefID = tmpJobDefID - buildCreationTime = tmpCreationTime - break - # not found - if buildJobPandaID == None: - errMsg = "could not find successful buildJob for %s" % libDS - else: - # get PandaID of buildJob - buildJobPandaID, = res - # found buildJob - if errMsg == '': - # get current buildJob status - if buildJobStatus == None: - for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsDefined4']: - # make sql - sql = "SELECT jobStatus,jobDefinitionID,creationTime FROM %s " % table - sql += "WHERE PandaID=:PandaID " - varMap = {} - varMap[':PandaID'] = buildJobPandaID - # select - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchone() - # found - if res != None: - buildJobStatus,buildJobDefID,buildCreationTime = res - break - # not found - if buildJobStatus == None: - errMsg = "could not find buildJob=%s in database" % buildJobPandaID - # check status - if errMsg != '': - if not buildJobStatus in ['defined','activated','finished','cancelled']: - errMsg = "status of buildJob is '%s' != defined/activated/finished/cancelled so that jobs cannot be reassigned" \ - % buildJobStatus - # get max/min PandaIDs using the libDS - if errMsg == '': - sql = "SELECT MAX(PandaID),MIN(PandaID) FROM ATLAS_PANDA.filesTable4 " - sql += "WHERE type=:type AND dataset=:dataset" - varMap = {} - varMap[':type'] = 'input' - varMap[':dataset'] = libDS - self.cur.arraysize = 10 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchone() - if res == None: - errMsg = "cannot get MAX/MIN PandaID for multiple usage for %s" % libDS - else: - maxPandaIDlibDS,minPandaIDlibDS = res - # check creationDate of buildJob - if errMsg == '': - # buildJob has already finished - 
timeLimit = datetime.datetime.utcnow()-datetime.timedelta(days=6) - if buildJobStatus in ['finished','cancelled'] and buildCreationTime < timeLimit: - errMsg = "corresponding buildJob %s is too old %s" % (buildJobPandaID,buildCreationTime.strftime('%Y-%m-%d %H:%M:%S')) - # check modificationTime - if errMsg == '': - # make sql - tables = ['ATLAS_PANDA.jobsDefined4'] - if not buildJobStatus in ['defined']: - tables.append('ATLAS_PANDA.jobsActive4') - sql = "SELECT modificationTime FROM %s " - sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus IN (:jobStatus1,:jobStatus2) " - sql += "FOR UPDATE " - varMap = {} - varMap[':prodUserName'] = compactDN - varMap[':jobDefinitionID'] = jobID - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - if not forFailed: - # normal rebrokerage - varMap[':jobStatus1'] = 'defined' - varMap[':jobStatus2'] = 'activated' - else: - # retry - varMap[':jobStatus1'] = 'failed' - varMap[':jobStatus2'] = 'dummy' - for tableName in tables: - # select - self.cur.execute((sql % tableName)+comment, varMap) - res = self.cur.fetchone() - if res != None: - break - if res == None: - if not forFailed: - errMsg = "no defined/activated jobs to be reassigned" - else: - errMsg = "no failed jobs to be retried" - else: - tmpModificationTime, = res - # prevent users from rebrokering more than once in one hour - timeLimit = datetime.datetime.utcnow()-datetime.timedelta(hours=1) - if timeLimit < tmpModificationTime and not forceOpt: - errMsg = "last mod time is %s > current-1hour. 
Cannot run (re)brokerage more than once in one hour" \ - % tmpModificationTime.strftime('%Y-%m-%d %H:%M:%S') - elif simulation: - pass - else: - # update modificationTime for locking - for tableName in tables: - sql = 'UPDATE %s ' % tableName - sql += 'SET modificationTime=CURRENT_DATE ' - sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus IN (:jobStatus1,:jobStatus2) " - self.cur.execute(sql+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return failure - if errMsg != '': - _logger.debug('lockJobForReBrokerage : '+errMsg) - return False,{'err':errMsg} - # return - retMap = {'bPandaID':buildJobPandaID,'bStatus':buildJobStatus,'userName':compactDN, - 'bJobID':buildJobDefID,'rPandaID':runPandaID, - 'maxPandaIDlibDS':maxPandaIDlibDS,'minPandaIDlibDS':minPandaIDlibDS} - _logger.debug("lockJobForReBrokerage %s" % str(retMap)) - return True,retMap - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("lockJobForReBrokerage : %s %s" % (type,value)) - # return empty list - return False,{'err':'database error'} - - - # get input datasets for rebrokerage - def getInDatasetsForReBrokerage(self,jobID,userName): - comment = ' /* DBProxy.getInDatasetsForReBrokerage */' - failedRet = False,{},None - try: - _logger.debug("getInDatasetsForReBrokerage(%s,%s)" % (jobID,userName)) - # start transaction - self.conn.begin() - # get pandaID - pandaIDs = [] - maxTotalFileSize = None - for table in ['jobsActive4','jobsDefined4']: - sql = "SELECT PandaID FROM ATLAS_PANDA.%s WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " % table - sql += "AND prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2)" - varMap = {} - varMap[':prodUserName'] = userName - varMap[':jobDefinitionID'] = jobID - varMap[':prodSourceLabel'] = 'user' - 
varMap[':jobStatus1'] = 'defined' - varMap[':jobStatus2'] = 'activated' - self.cur.arraysize = 10000 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - if res != []: - for tmpItem in res: - pandaIDs.append(tmpItem[0]) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # not found - if pandaIDs == []: - _logger.debug("getInDatasetsForReBrokerage : PandaIDs not found") - return failedRet - # get dataset and lfn - retMapLFN = {} - sql = "SELECT dataset,lfn,fsize FROM ATLAS_PANDA.filesTable4 " - sql += "WHERE PandaID=:PandaID AND type=:type" - for pandaID in pandaIDs: - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':type'] = 'input' - # start transaction - self.conn.begin() - self.cur.arraysize = 10000 - self.cur.execute(sql+comment, varMap) - resL = self.cur.fetchall() - # append - tmpTotalFileSize = 0 - for tmpDataset,tmpLFN,tmpFileSize in resL: - # ignore lib.tgz - if tmpLFN.endswith('.lib.tgz'): - continue - if not retMapLFN.has_key(tmpDataset): - retMapLFN[tmpDataset] = [] - if not tmpLFN in retMapLFN[tmpDataset]: - retMapLFN[tmpDataset].append(tmpLFN) - try: - tmpTotalFileSize += long(tmpFileSize) - except: - pass - if maxTotalFileSize == None or maxTotalFileSize < tmpTotalFileSize: - maxTotalFileSize = tmpTotalFileSize - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("getInDatasetsForReBrokerage : done") - # max size in MB - maxTotalFileSize /= (1024*1024) - # return - return True,retMapLFN,maxTotalFileSize - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getInDatasetsForReBrokerage(%s,%s) : %s %s" % (jobID,userName,errType,errValue)) - return failedRet - - - # move jobs to jobsDefine4 for re-brokerage - def resetBuildJobForReBrokerage(self,pandaID): - comment = ' /* resetBuildJobForReBrokerage */' - _logger.debug("resetBuildJobForReBrokerage : start %s" % pandaID) - try: - # make sql to move jobs - sql1 = "SELECT %s FROM 
ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames() - sql1+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus1" - sql3 = "INSERT INTO ATLAS_PANDA.jobsDefined4 (%s) " % JobSpec.columnNames() - sql3+= JobSpec.bindValuesExpression() - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':jobStatus1'] = 'activated' - self.cur.arraysize = 10 - self.cur.execute(sql1+comment,varMap) - res = self.cur.fetchone() - # not found - if res == None: - _logger.error("resetBuildJobForReBrokerage : PandaID=%s not found" % pandaID) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return False - # instantiate Job - job = JobSpec() - job.pack(res) - # delete from jobsDefined4 just in case - varMap = {} - varMap[':PandaID'] = pandaID - sqlD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" - self.cur.execute(sqlD+comment,varMap) - # reset job status - job.jobStatus = 'defined' - # host and time information - job.modificationHost = self.hostname - job.modificationTime = datetime.datetime.utcnow() - # insert to Defined - self.cur.execute(sql3+comment, job.valuesMap()) - # delete from Active - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':jobStatus1'] = 'activated' - sql2 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID AND jobStatus=:jobStatus1" - self.cur.execute(sql2+comment,varMap) - retD = self.cur.rowcount - # delete failed - if retD != 1: - _logger.error("resetBuildJobForReBrokerage : failed to delete PandaID=%s" % pandaID) - # rollback - self._rollback() - return False - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - _logger.debug("resetBuildJobForReBrokerage : end %s" % pandaID) - return True - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("resetBuildJobForReBrokerage : %s %s" % (type,value)) - # return empty list - return False - - - # get PandaIDs using userName/jobID 
for re-brokerage or retry - def getPandaIDsForReBrokerage(self,userName,jobID,fromActive,forFailed=False): - comment = ' /* DBProxy.getPandaIDsForReBrokerage */' - _logger.debug("getPandaIDsForReBrokerage : %s %s %s %s" % (userName,jobID,fromActive,forFailed)) - try: - returnList = [] - varMap = {} - varMap[':prodUserName'] = userName - varMap[':jobDefinitionID'] = jobID - if not forFailed: - varMap[':jobStatus1'] = 'activated' - else: - varMap[':jobStatus1'] = 'failed' - sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 " - sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND jobStatus=:jobStatus1" - # get IDs from Active table - if fromActive: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 20000 - self.cur.execute(sql+comment,varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for tmpID, in resList: - if not tmpID in returnList: - returnList.append(tmpID) - # set holding to prevent activated jobs from being picked up - if not forFailed: - sql = 'UPDATE ATLAS_PANDA.jobsActive4 SET jobStatus=:newStatus ' - sql += 'WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID ' - sql += "AND jobStatus=:jobStatus1" - varMap[':newStatus'] = 'holding' - # start transaction - self.conn.begin() - # update - self.cur.execute(sql+comment,varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # get IDs from Defined table just in case - varMap = {} - varMap[':prodUserName'] = userName - varMap[':jobDefinitionID'] = jobID - varMap[':jobStatus1'] = 'defined' - varMap[':jobStatus2'] = 'assgined' - sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 " - sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND jobStatus IN (:jobStatus1,:jobStatus2)" - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 20000 - 
self.cur.execute(sql+comment,varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for tmpID, in resList: - if not tmpID in returnList: - returnList.append(tmpID) - # sort - returnList.sort() - # return - return returnList - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getPandaIDsForReBrokerage : %s %s" % (type,value)) - # return empty list - return [] - - - # get outDSs with userName/jobID - def getOutDSsForReBrokerage(self,userName,jobID): - comment = ' /* DBProxy.getOutDSsForReBrokerage */' - _logger.debug("getOutDSsForReBrokerage : %s %s" % (userName,jobID)) - falseRet = (False,[],None,None) - try: - # get one PandaID - sql = "SELECT PandaID,computingSite,destinationSE FROM ATLAS_PANDA.jobsActive4 " - sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND prodSourceLabel=:prodSourceLabel AND rownum<=1" - varMap = {} - varMap[':prodUserName'] = userName - varMap[':jobDefinitionID'] = jobID - varMap[':prodSourceLabel'] = 'user' - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchone() - # not found - if res == None: - _logger.debug("getOutDSsForReBrokerage : failed to get PandaID") - if not self._commit(): - raise RuntimeError, 'Commit error' - return falseRet - pandaID,computingSite,destinationSE = res - # get outDSs - sql = "SELECT dataset FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type IN (:type1,:type2)" - varMap = {} - varMap[':type1'] = 'output' - varMap[':type2'] = 'log' - varMap[':PandaID'] = pandaID - self.cur.arraysize = 1000 - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - returnList = [] - for tmpOutDS, in resList: - if not tmpOutDS in returnList: - returnList.append(tmpOutDS) - # 
return - return True,returnList,computingSite,destinationSE - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getOutDSsForReBrokerage : %s %s" % (type,value)) - # return empty list - return falseRet - - - # query PandaID - def queryPandaID(self,jobDefID): - comment = ' /* DBProxy.queryPandaID */' - _logger.debug("queryPandaID : %s" % jobDefID) - sql0 = "SELECT PandaID,attemptNr FROM %s WHERE attemptNr=(" - sql0+= "SELECT MAX(attemptNr) FROM %s" - sql1= " WHERE prodSourceLabel=:prodSourceLabel AND jobDefinitionID=:jobDefinitionID)" - sql1+=" AND prodSourceLabel=:prodSourceLabel AND jobDefinitionID=:jobDefinitionID" - try: - ids = [] - # select - varMap = {} - varMap[':jobDefinitionID'] = jobDefID - varMap[':prodSourceLabel'] = 'managed' - for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsWaiting4']: - # start transaction - self.conn.begin() - # select - sql = sql0 % (table,table) + sql1 - self.cur.arraysize = 10 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - ids += list(res) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # look for the latest attempt - preAtt =-1 - pandaID=None - for pID,att in ids: - if att > preAtt: - pandaID = pID - preAtt = att - if att == preAtt: - if pandaID < pID: - pandaID = pID - return pandaID - except: - type, value, traceBack = sys.exc_info() - _logger.error("queryPandaID : %s %s" % (type,value)) - # roll back - self._rollback() - return None - - - # query job info per cloud - def queryJobInfoPerCloud(self,cloud,schedulerID=None): - comment = ' /* DBProxy.queryJobInfoPerCloud */' - _logger.debug("queryJobInfoPerCloud : %s %s" % (cloud,schedulerID)) - attrs = ['PandaID','jobStatus','jobName'] - sql0 = "SELECT " - for attr in attrs: - sql0 += "%s," % attr - sql0 = "%s " % sql0[:-1] - sql0+= "FROM %s " - sql0+= "WHERE cloud=:cloud " - varMap = {} - varMap[':cloud'] = cloud - if 
schedulerID != None: - sql0+= "AND schedulerID=:schedulerID " - varMap[':schedulerID'] = schedulerID - try: - ids = [] - returnList = [] - # select - for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: - # start transaction - self.conn.begin() - # select - sql = sql0 % table - self.cur.arraysize = 10000 - self.cur.execute(sql+comment,varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # loop over all - for res in resList: - valMap = {} - # skip if already in the list - PandaID = res[0] - if PandaID in ids: - continue - # convert to map - for idx,attr in enumerate(attrs): - valMap[attr] = res[idx] - # append to list - ids.append(PandaID) - returnList.append(valMap) - # return - return returnList - except: - type, value, traceBack = sys.exc_info() - _logger.error("queryJobInfoPerCloud : %s %s" % (type,value)) - # roll back - self._rollback() - return None - - - # get PandaIDs at Site - def getPandaIDsSite(self,site,status,limit): - comment = ' /* DBProxy.getPandaIDsSite */' - _logger.debug("getPandaIDsSite : %s %s %s" % (site,status,limit)) - try: - ids = [] - # find table - if status in ['defined','assigned']: - table = 'ATLAS_PANDA.jobsDefined4' - elif status in ['activated','running','holding','trasnferring']: - table = 'ATLAS_PANDA.jobsActive4' - elif status in ['waiting']: - table = 'ATLAS_PANDA.jobsWaiting4' - elif status in ['finished','failed']: - table = 'ATLAS_PANDA.jobsArchived4' - else: - _logger.error("unknown status:%s" % status) - return ids - # limit - limit = int(limit) - # SQL - sql = "SELECT PandaID FROM %s " % table - sql += "WHERE computingSite=:computingSite AND jobStatus=:jobStatus AND prodSourceLabel=:prodSourceLabel " - sql += "AND rownum<=:limit" - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[':computingSite'] = site - varMap[':jobStatus'] = status - varMap[':limit'] = limit - varMap[':prodSourceLabel'] = 
'managed' - self.cur.arraysize = limit - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # convert to list - for id, in res: - ids.append(id) - return ids - except: - type, value, traceBack = sys.exc_info() - _logger.error("getPandaIDsSite : %s %s" % (type,value)) - # roll back - self._rollback() - return [] - - - # get PandaIDs to be updated in prodDB - def getPandaIDsForProdDB(self,limit,lockedby): - comment = ' /* DBProxy.getPandaIDsForProdDB */' - _logger.debug("getPandaIDsForProdDB %s" % limit) - sql0 = "PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID FROM %s " - sqlW = "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND lockedby=:lockedby " - sqlX = "AND stateChangeTime>prodDBUpdateTime " - sqlA = "AND (CASE WHEN stateChangeTime>prodDBUpdateTime THEN 1 ELSE null END) = 1 " - sql1 = "AND rownum<=:limit " - varMap = {} - varMap[':lockedby'] = lockedby - varMap[':limit'] = limit - varMap[':prodSourceLabel1'] = 'managed' - varMap[':prodSourceLabel2'] = 'rc_test' - try: - retMap = {} - totalIDs = 0 - # select - for table in ['ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: - # start transaction - self.conn.begin() - # select - sql = sql0 % table - if table in ['ATLAS_PANDA.jobsArchived4']: - sql = "SELECT /*+ INDEX_RS_ASC(tab JOBSARCHIVED4_CHANGETIME) NO_INDEX(tab(PRODSOURCELABEL))*/ " + sql + " tab " + sqlW + sqlA - else: - sql = "SELECT " + sql + sqlW + sqlX - sql += sql1 - self.cur.arraysize = limit - _logger.debug("getPandaIDsForProdDB %s %s" % (sql+comment,str(varMap))) - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - _logger.debug("getPandaIDsForProdDB got %s" % len(res)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - for PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID in res: - # ignore 
dummy jobs in jobsDefined4 - if table == 'ATLAS_PANDA.jobsDefined4' and (not jobStatus in ['defined','assigned']): - continue - # add status - if not retMap.has_key(jobStatus): - retMap[jobStatus] = [] - # append - retMap[jobStatus].append({'PandaID':PandaID,'attemptNr':attemptNr, - 'stateChangeTime':stateChangeTime.strftime('%Y-%m-%d %H:%M:%S'), - 'jobDefinitionID':jobDefinitionID, - 'jobExecutionID':jobExecutionID}) - totalIDs += 1 - # limit - if totalIDs > limit: - break - _logger.debug("getPandaIDsForProdDB %s ret->%s" % (limit,totalIDs)) - return retMap - except: - type, value, traceBack = sys.exc_info() - _logger.error("getPandaIDsForProdDB : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # update prodDBUpdateTime - def updateProdDBUpdateTime(self,param): - comment = ' /* DBProxy.updateProdDBUpdateTime */' - _logger.debug("updateProdDBUpdateTime %s" % str(param)) - sql0 = "UPDATE %s " - sql0+= "SET prodDBUpdateTime=TO_TIMESTAMP(:prodDBUpdateTime,'YYYY-MM-DD HH24:MI:SS') " - sql0+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus AND stateChangeTime=TO_TIMESTAMP(:stateChangeTime,'YYYY-MM-DD HH24:MI:SS') " - varMap = {} - varMap[':prodDBUpdateTime'] = param['stateChangeTime'] - varMap[':PandaID'] = param['PandaID'] - varMap[':jobStatus'] = param['jobStatus'] - varMap[':stateChangeTime'] = param['stateChangeTime'] - try: - # convert to string - if isinstance(varMap[':prodDBUpdateTime'],datetime.datetime): - varMap[':prodDBUpdateTime'] = varMap[':prodDBUpdateTime'].strftime('%Y-%m-%d %H:%M:%S') - if isinstance(varMap[':stateChangeTime'],datetime.datetime): - varMap[':stateChangeTime'] = varMap[':stateChangeTime'].strftime('%Y-%m-%d %H:%M:%S') - # set table - if param['jobStatus'] in ['defined','assigned']: - table = 'ATLAS_PANDA.jobsDefined4' - elif param['jobStatus'] in ['waiting','pending']: - table = 'ATLAS_PANDA.jobsWaiting4' - elif param['jobStatus'] in ['activated','sent','starting','running','holding','transferring']: - table = 
'ATLAS_PANDA.jobsActive4' - elif param['jobStatus'] in ['finished','failed','cancelled']: - table = 'ATLAS_PANDA.jobsArchived4' - else: - _logger.error("invalid status %s" % param['jobStatus']) - return False - # set transaction - self.conn.begin() - # update - sql = sql0 % table - _logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - retU = self.cur.rowcount - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("updateProdDBUpdateTime %s ret=%s" % (param['PandaID'],retU)) - if retU == 1: - return True - return False - except: - type, value, traceBack = sys.exc_info() - _logger.error("updateProdDBUpdateTime : %s %s" % (type,value)) - # roll back - self._rollback() - return False - - - # add metadata - def addMetadata(self,pandaID,metadata): - comment = ' /* DBProxy.addMetaData */' - _logger.debug("addMetaData : %s" % pandaID) - sql0 = "SELECT PandaID FROM ATLAS_PANDA.metaTable WHERE PandaID=:PandaID" - sql1 = "INSERT INTO ATLAS_PANDA.metaTable (PandaID,metaData) VALUES (:PandaID,:metaData)" - nTry=3 - for iTry in range(nTry): - try: - # autocommit on - self.conn.begin() - # select - varMap = {} - varMap[':PandaID'] = pandaID - self.cur.arraysize = 10 - self.cur.execute(sql0+comment, varMap) - res = self.cur.fetchone() - # already exist - if res != None: - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - # insert - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':metaData'] = metadata - self.cur.execute(sql1+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("addMetaData : %s retry : %s" % (pandaID,iTry)) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("addMetaData : %s %s" % (type,value)) - return False - - - # add stdout - def addStdOut(self,pandaID,stdOut): - comment = ' /* 
DBProxy.addStdOut */' - _logger.debug("addStdOut : %s start" % pandaID) - sqlJ = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID FOR UPDATE " - sqlC = "SELECT PandaID FROM ATLAS_PANDA.jobsDebug WHERE PandaID=:PandaID " - sqlI = "INSERT INTO ATLAS_PANDA.jobsDebug (PandaID,stdOut) VALUES (:PandaID,:stdOut) " - sqlU = "UPDATE ATLAS_PANDA.jobsDebug SET stdOut=:stdOut WHERE PandaID=:PandaID " - try: - # autocommit on - self.conn.begin() - # select - varMap = {} - varMap[':PandaID'] = pandaID - self.cur.arraysize = 10 - # check job table - self.cur.execute(sqlJ+comment, varMap) - res = self.cur.fetchone() - if res == None: - _logger.debug("addStdOut : %s non active" % pandaID) - else: - # check debug table - self.cur.execute(sqlC+comment, varMap) - res = self.cur.fetchone() - # already exist - if res != None: - # update - sql = sqlU - else: - # insert - sql = sqlI - # write stdout - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':stdOut'] = stdOut - self.cur.execute(sql+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - errtype,errvalue = sys.exc_info()[:2] - _logger.error("addStdOut : %s %s" % (errtype,errvalue)) - return False - - - # insert sandbox file info - def insertSandboxFileInfo(self,userName,hostName,fileName,fileSize,checkSum): - comment = ' /* DBProxy.insertSandboxFileInfo */' - _logger.debug("insertSandboxFileInfo : %s %s %s %s %s" % (userName,hostName,fileName,fileSize,checkSum)) - sqlC = "SELECT userName,fileSize,checkSum FROM ATLAS_PANDAMETA.userCacheUsage " - sqlC += "WHERE hostName=:hostName AND fileName=:fileName FOR UPDATE" - sql = "INSERT INTO ATLAS_PANDAMETA.userCacheUsage " - sql += "(userName,hostName,fileName,fileSize,checkSum,creationTime,modificationTime) " - sql += "VALUES (:userName,:hostName,:fileName,:fileSize,:checkSum,CURRENT_DATE,CURRENT_DATE) " - try: - # begin transaction - self.conn.begin() - # check if it 
already exists - varMap = {} - varMap[':hostName'] = hostName - varMap[':fileName'] = fileName - self.cur.arraysize = 10 - self.cur.execute(sqlC+comment, varMap) - res = self.cur.fetchall() - if len(res) != 0: - _logger.debug("insertSandboxFileInfo : skip %s %s since already exists" % (hostName,fileName)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return "WARNING: file exist" - # insert - varMap = {} - varMap[':userName'] = userName - varMap[':hostName'] = hostName - varMap[':fileName'] = fileName - varMap[':fileSize'] = fileSize - varMap[':checkSum'] = checkSum - self.cur.execute(sql+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return "OK" - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("insertSandboxFileInfo : %s %s" % (type,value)) - return "ERROR: DB failure" - - - # check duplicated sandbox file - def checkSandboxFile(self,dn,fileSize,checkSum): - comment = ' /* DBProxy.checkSandboxFile */' - _logger.debug("checkSandboxFile : %s %s %s" % (dn,fileSize,checkSum)) - sqlC = "SELECT hostName,fileName FROM ATLAS_PANDAMETA.userCacheUsage " - sqlC += "WHERE userName=:userName AND fileSize=:fileSize AND checkSum=:checkSum " - sqlC += "AND hostName<>:ngHostName AND creationTime>CURRENT_DATE-3 " - sqlC += "AND creationTime>CURRENT_DATE-3 " - try: - retStr = 'NOTFOUND' - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - # begin transaction - self.conn.begin() - # check if it already exists - varMap = {} - varMap[':userName'] = compactDN - varMap[':fileSize'] = fileSize - varMap[':checkSum'] = checkSum - varMap[':ngHostName'] = 'localhost.localdomain' - self.cur.arraysize = 10 - self.cur.execute(sqlC+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if len(res) != 0: - hostName,fileName = res[0] - retStr = 
"FOUND:%s:%s" % (hostName,fileName) - _logger.debug("checkSandboxFile -> %s" % retStr) - return retStr - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("checkSandboxFile : %s %s" % (type,value)) - return "ERROR: DB failure" - - - # insert dataset - def insertDataset(self,dataset,tablename="ATLAS_PANDA.Datasets"): - comment = ' /* DBProxy.insertDataset */' - _logger.debug("insertDataset(%s)" % dataset.name) - sql0 = "SELECT COUNT(*) FROM %s WHERE vuid=:vuid" % tablename - sql1 = "INSERT INTO %s " % tablename - sql1+= "(%s) " % DatasetSpec.columnNames() - sql1+= DatasetSpec.bindValuesExpression() - # time information - dataset.creationdate = datetime.datetime.utcnow() - dataset.modificationdate = dataset.creationdate - try: - # subtype - if dataset.subType in ['','NULL',None]: - # define using name - if re.search('_dis\d+$',dataset.name) != None: - dataset.subType = 'dis' - elif re.search('_sub\d+$',dataset.name) != None: - dataset.subType= 'sub' - else: - dataset.subType= 'top' - # begin transaction - self.conn.begin() - # check if it already exists - varMap = {} - varMap[':vuid'] = dataset.vuid - self.cur.execute(sql0+comment, varMap) - nDS, = self.cur.fetchone() - _logger.debug("insertDataset nDS=%s with %s" % (nDS,dataset.vuid)) - if nDS == 0: - # insert - _logger.debug("insertDataset insert %s" % dataset.name) - self.cur.execute(sql1+comment, dataset.valuesMap()) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("insertDataset() : %s %s" % (type,value)) - return False - - - # get and lock dataset with a query - def getLockDatasets(self,sqlQuery,varMapGet,modTimeOffset='',getVersion=False): - comment = ' /* DBProxy.getLockDatasets */' - _logger.debug("getLockDatasets(%s,%s,%s)" % (sqlQuery,str(varMapGet),modTimeOffset)) - sqlGet = "SELECT /*+ 
INDEX_RS_ASC(tab(STATUS,TYPE,MODIFICATIONDATE)) */ vuid,name,modificationdate,version,transferStatus FROM ATLAS_PANDA.Datasets tab WHERE " + sqlQuery - sqlLock = "UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE" - if modTimeOffset != '': - sqlLock += "+%s" % modTimeOffset - sqlLock += ",transferStatus=MOD(transferStatus+1,10)" - if getVersion: - sqlLock += ",version=:version" - sqlLock += " WHERE vuid=:vuid AND transferStatus=:transferStatus" - retList = [] - try: - # begin transaction - self.conn.begin() - # get datasets - self.cur.arraysize = 1000000 - self.cur.execute(sqlGet+comment,varMapGet) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # loop over all datasets - if res != None and len(res) != 0: - for vuid,name,modificationdate,version,transferStatus in res: - # lock - varMapLock = {} - varMapLock[':vuid'] = vuid - varMapLock[':transferStatus'] = transferStatus - if getVersion: - try: - varMapLock[':version'] = str(int(version) + 1) - except: - varMapLock[':version'] = str(1) - # begin transaction - self.conn.begin() - # update for lock - self.cur.execute(sqlLock+comment,varMapLock) - retU = self.cur.rowcount - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if retU > 0: - # append - if not getVersion: - retList.append((vuid,name,modificationdate)) - else: - retList.append((vuid,name,modificationdate,version)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # retrun - return retList - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getLockDatasets : %s %s" % (type,value)) - return [] - - - # query dataset with map - def queryDatasetWithMap(self,map): - comment = ' /* DBProxy.queryDatasetWithMap */' - _logger.debug("queryDatasetWithMap(%s)" % map) - if map.has_key('name'): - sql1 = """SELECT /*+ BEGIN_OUTLINE_DATA """ - sql1 += """INDEX_RS_ASC(@"SEL$1" "TAB"@"SEL$1" 
("DATASETS"."NAME")) """ - sql1 += """OUTLINE_LEAF(@"SEL$1") ALL_ROWS """ - sql1 += """OPTIMIZER_FEATURES_ENABLE('10.2.0.4') """ - sql1 += """IGNORE_OPTIM_EMBEDDED_HINTS """ - sql1 += """END_OUTLINE_DATA */ """ - sql1 += "%s FROM ATLAS_PANDA.Datasets tab" % DatasetSpec.columnNames() - else: - sql1 = "SELECT %s FROM ATLAS_PANDA.Datasets" % DatasetSpec.columnNames() - varMap = {} - for key in map.keys(): - if len(varMap)==0: - sql1+= " WHERE %s=:%s" % (key,key) - else: - sql1+= " AND %s=:%s" % (key,key) - varMap[':%s' % key] = map[key] - try: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 100 - _logger.debug(sql1+comment+str(varMap)) - self.cur.execute(sql1+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # instantiate Dataset - if res != None and len(res) != 0: - dataset = DatasetSpec() - dataset.pack(res[0]) - return dataset - _logger.error("queryDatasetWithMap(%s) : dataset not found" % map) - return None - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("queryDatasetWithMap(%s) : %s %s" % (map,type,value)) - return None - - - # update dataset - def updateDataset(self,datasets,withLock,withCriteria,criteriaMap): - comment = ' /* DBProxy.updateDataset */' - _logger.debug("updateDataset()") - sql1 = "UPDATE ATLAS_PANDA.Datasets SET %s " % DatasetSpec.bindUpdateExpression() - sql1+= "WHERE vuid=:vuid" - if withCriteria != "": - sql1+= " AND %s" % withCriteria - retList = [] - try: - # start transaction - self.conn.begin() - for dataset in datasets: - _logger.debug("updateDataset(%s,%s)" % (dataset.name,dataset.status)) - # time information - dataset.modificationdate = datetime.datetime.utcnow() - # update - varMap = dataset.valuesMap() - varMap[':vuid'] = dataset.vuid - for cKey in criteriaMap.keys(): - varMap[cKey] = criteriaMap[cKey] - self.cur.execute(sql1+comment, varMap) - retU = self.cur.rowcount - if retU != 0 
and retU != 1: - raise RuntimeError, 'Invalid retrun %s' % retU - retList.append(retU) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("updateDataset() ret:%s" % retList) - return retList - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("updateDataset() : %s %s" % (type,value)) - return [] - - - # delete dataset - def deleteDataset(self,name): - comment = ' /* DBProxy.deleteDataset */' - sql1 = "DELETE /*+ INDEX(tab DATASETS_NAME_IDX)*/ FROM ATLAS_PANDA.Datasets tab WHERE name=:name" - try: - # start transaction - self.conn.begin() - # delete - varMap = {} - varMap[':name'] = name - self.cur.execute(sql1+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("deleteDataset() : %s %s" % (type,value)) - return False - - - # get serial number for dataset, insert dummy datasets to increment SN - def getSerialNumber(self,datasetname,definedFreshFlag=None): - comment = ' /* DBProxy.getSerialNumber */' - try: - _logger.debug("getSerialNumber(%s,%s)" % (datasetname,definedFreshFlag)) - # start transaction - self.conn.begin() - # check freashness - if definedFreshFlag == None: - # select - varMap = {} - varMap[':name'] = datasetname - varMap[':type'] = 'output' - sql = "SELECT /*+ INDEX_RS_ASC(TAB (DATASETS.NAME)) */ COUNT(*) FROM ATLAS_PANDA.Datasets tab WHERE type=:type AND name=:name" - self.cur.arraysize = 100 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchone() - # fresh dataset or not - if res != None and len(res) != 0 and res[0] > 0: - freshFlag = False - else: - freshFlag = True - else: - # use predefined flag - freshFlag = definedFreshFlag - # get serial number - sql = "SELECT ATLAS_PANDA.SUBCOUNTER_SUBID_SEQ.nextval FROM dual"; - self.cur.arraysize = 100 - self.cur.execute(sql+comment, {}) - sn, = 
self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # release file lock - _logger.debug("getSerialNumber : %s %s" % (sn,freshFlag)) - return (sn,freshFlag) - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getSerialNumber() : %s %s" % (type,value)) - return (-1,False) - - - # get serial number for group job - def getSerialNumberForGroupJob(self,name): - comment = ' /* DBProxy.getSerialNumberForGroupJob */' - retVal = {'sn':'','status':False} - try: - _logger.debug("getSerialNumberForGroupJob(%s)" % name) - # start transaction - self.conn.begin() - # get serial number - sql = "SELECT ATLAS_PANDA.GROUP_JOBID_SEQ.nextval FROM dual"; - self.cur.execute(sql+comment, {}) - sn, = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - retVal['sn'] = sn - retVal['status'] = True - _logger.debug("getSerialNumberForGroupJob : %s %s" % (name,str(retVal))) - return retVal - except: - # roll back - self._rollback() - # error - errtype,errvalue = sys.exc_info()[:2] - _logger.error("getSerialNumberForGroupJob : %s %s" % (errtype,errvalue)) - retVal['status'] = False - return retVal - - - # change job priorities - def changeJobPriorities(self,newPrioMap): - comment = ' /* DBProxy.changeJobPriorities */' - try: - _logger.debug("changeJobPriorities start") - sql = "UPDATE %s SET currentPriority=:currentPriority,assignedPriority=:assignedPriority " - sql += "WHERE PandaID=:PandaID" - # loop over all PandaIDs - for pandaID,newPrio in newPrioMap.iteritems(): - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':currentPriority'] = newPrio - varMap[':assignedPriority'] = newPrio - _logger.debug("changeJobPriorities PandaID=%s -> prio=%s" % (pandaID,newPrio)) - # start transaction - self.conn.begin() - # try active tables - retU = None - for tableName in 
['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsWaiting4']: - # execute - self.cur.execute((sql % tableName)+comment,varMap) - retU = self.cur.rowcount - if retU > 0: - break - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("changeJobPriorities PandaID=%s retU=%s" % (pandaID,retU)) - # return - _logger.debug("changeJobPriorities done") - return True,'' - except: - # roll back - self._rollback() - # error - errtype,errvalue = sys.exc_info()[:2] - _logger.error("changeJobPriorities : %s %s" % (errtype,errvalue)) - return False,'database error' - - - # update transfer status for a dataset - def updateTransferStatus(self,datasetname,bitMap): - comment = ' /* DBProxy.updateTransferStatus */' - try: - _logger.debug("updateTransferStatus(%s,%s)" % (datasetname,hex(bitMap))) - # start transaction - self.conn.begin() - retTransSt = 0 - # update bitmap - sqlU = 'UPDATE /*+ INDEX_RS_ASC(TAB("DATASETS"."NAME")) */ ATLAS_PANDA.Datasets tab SET transferStatus=ATLAS_PANDA.BITOR(transferStatus,:bitMap) WHERE name=:name' - varMap = {} - varMap[':bitMap'] = bitMap - varMap[':name'] = datasetname - retU = self.cur.execute(sqlU+comment, varMap) - # get transferStatus - sqlS = 'SELECT /*+ INDEX_RS_ASC(TAB("DATASETS"."NAME")) */ transferStatus FROM ATLAS_PANDA.Datasets tab WHERE name=:name' - varMap = {} - varMap[':name'] = datasetname - self.cur.arraysize = 10 - retS = self.cur.execute(sqlS+comment, varMap) - resS = self.cur.fetchall() - if resS != None and len(resS) != 0: - retTransSt = resS[0][0] - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("updateTransferStatus : %s" % hex(retTransSt)) - return retTransSt - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("updateTransferStatus : %s %s" % (type,value)) - return 0 - - - # get CloudTask. 
If not exist, create it - def getCloudTask(self,tid): - comment = ' /* getCloudTask */' - try: - _logger.debug("getCloudTask(%s)" % tid) - # check tid - if tid in [None,'NULL']: - _logger.error("invalid TID : %s" % tid) - return None - # start transaction - self.conn.begin() - # get CloudTask - sql = "SELECT %s FROM ATLAS_PANDA.cloudtasks " % CloudTaskSpec.columnNames() - sql += "WHERE taskid=:taskid" - varMap = {} - varMap[':taskid'] = tid - # select - self.cur.arraysize = 10 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # already exist - if res != None and len(res) != 0: - # instantiate CloudTask - cloudTask = CloudTaskSpec() - cloudTask.pack(res[0]) - # update tmod if status is defined - if cloudTask.status == 'defined': - sql = "UPDATE ATLAS_PANDA.cloudtasks SET tmod=CURRENT_DATE WHERE taskid=:taskid" - varMap = {} - varMap[':taskid'] = cloudTask.taskid - self.cur.execute(sql+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return cloudTask - # insert new CloudTask - _logger.debug("insert new CloudTask") - cloudTask = CloudTaskSpec() - cloudTask.taskid = tid - cloudTask.status = 'defined' - sql = "INSERT INTO ATLAS_PANDA.cloudtasks (id,taskid,status,tmod,tenter) VALUES(ATLAS_PANDA.CLOUDTASKS_ID_SEQ.nextval,:taskid,:status,CURRENT_DATE,CURRENT_DATE)" - sql+= " RETURNING id INTO :newID" - varMap = {} - varMap[':taskid'] = cloudTask.taskid - varMap[':status'] = cloudTask.status - varMap[':newID'] = self.cur.var(cx_Oracle.NUMBER) - self.cur.execute(sql+comment, varMap) - # get id - cloudTask.id = long(varMap[':newID'].getvalue()) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("return new CloudTask") - return cloudTask - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getCloudTask() : %s %s" % (type,value)) - return None - - - # set cloud to CloudTask - def setCloudTask(self,cloudTask): - comment = 
' /* setCloudTask */' - try: - _logger.debug("setCloudTask(id=%s,taskid=%s)" % (cloudTask.id,cloudTask.taskid)) - sql = "UPDATE ATLAS_PANDA.cloudtasks SET cloud=:cloud,status=:newStatus,tmod=CURRENT_DATE WHERE id=:id AND status=:oldStatus" - # start transaction - self.conn.begin() - # update - varMap = {} - varMap[':cloud'] = cloudTask.cloud - varMap[':id'] = cloudTask.id - varMap[':newStatus'] = 'assigned' - varMap[':oldStatus'] = 'defined' - self.cur.execute(sql+comment, varMap) - retU = self.cur.rowcount - # succeeded - if retU == 1: - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return cloudTask - # read if it is already set by another thread - sql = "SELECT %s FROM ATLAS_PANDA.cloudtasks " % CloudTaskSpec.columnNames() - sql += "WHERE id=:id" - varMap = {} - varMap[':id'] = cloudTask.id - # select - self.cur.arraysize = 10 - retS = self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # retrun CloudTask - if res != None and len(res) != 0: - # instantiate CloudTask - cloudTask = CloudTaskSpec() - cloudTask.pack(res[0]) - return cloudTask - _logger.error("setCloudTask() : cannot find CloudTask for %s" % cloudTask.id) - return None - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("setCloudTask() : %s %s" % (type,value)) - return None - - - # see CloudTask - def seeCloudTask(self,tid): - comment = ' /* seeCloudTask */' - try: - _logger.debug("seeCloudTask(%s)" % tid) - # check tid - if tid in [None,'NULL']: - _logger.error("invalid TID : %s" % tid) - return None - # start transaction - self.conn.begin() - # select - sql = "SELECT cloud FROM ATLAS_PANDA.cloudtasks WHERE taskid=:taskid" - varMap = {} - varMap[':taskid'] = tid - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # existing task - if res 
!= None and len(res) != 0: - # return cloud - return res[0][0] - else: - return None - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("seeCloudTask() : %s %s" % (type,value)) - return None - - - # reset modification time of a task to shorten retry interval - def resetTmodCloudTask(self,tid): - comment = ' /* resetTmodCloudTask */' - try: - _logger.debug("resetTmodCloudTask %s" % tid) - # check tid - if tid in [None,'NULL']: - _logger.error("invalid TID : %s" % tid) - return None - # start transaction - self.conn.begin() - # update - sql = "UPDATE ATLAS_PANDA.cloudtasks SET tmod=:tmod WHERE taskid=:taskid" - varMap = {} - varMap[':taskid'] = tid - varMap[':tmod'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=165) - self.cur.execute(sql+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("resetTmodCloudTask : %s %s" % (type,value)) - return False - - - # get assigning task - def getAssigningTask(self): - comment = ' /* getAssigningTask */' - try: - _logger.debug("getAssigningTask") - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) - # start transaction - self.conn.begin() - # select - sql = "SELECT taskid FROM ATLAS_PANDA.cloudtasks WHERE status=:status AND tmod>:tmod" - varMap = {} - varMap[':tmod'] = timeLimit - varMap[':status'] = 'defined' - self.cur.arraysize = 100 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # loop over all taskid - retList = [] - if res != None: - for tid, in res: - retList.append(tid) - # return - _logger.debug("getAssigningTask ret:%s" % retList) - return retList - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getAssigningTask : 
%s %s" % (type,value)) - return [] - - - # set CloudTask by user - def setCloudTaskByUser(self,user,tid,cloud,status): - comment = ' /* setCloudTaskByUser */' - try: - _logger.debug("setCloudTaskByUser(tid=%s,cloud=%s,status=%s) by %s" % (tid,cloud,status,user)) - # check tid - if tid in [None,'NULL']: - tmpMsg = "invalid TID : %s" % tid - _logger.error(tmpMsg) - return "ERROR: " + tmpMsg - # check status - statusList = ['tobeaborted'] - if not status in statusList: - tmpMsg = "invalid status=%s. Must be one of %s" (status,str(statusList)) - _logger.error(tmpMsg) - return "ERROR: " + tmpMsg - # start transaction - self.conn.begin() - # get CloudTask - sql = "SELECT %s FROM ATLAS_PANDA.cloudtasks " % CloudTaskSpec.columnNames() - sql += "WHERE taskid=:taskid" - varMap = {} - varMap[':taskid'] = tid - # select - self.cur.arraysize = 10 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # already exist - if res != None and len(res) != 0: - # set status - sql = "UPDATE ATLAS_PANDA.cloudtasks SET status=:status,tmod=CURRENT_DATE WHERE taskid=:taskid" - varMap = {} - varMap[':taskid'] = tid - varMap[':status'] = status - self.cur.execute(sql+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return "SUCCEEDED" - # insert new CloudTask - sql = "INSERT INTO ATLAS_PANDA.cloudtasks (id,taskid,status,tmod,tenter) VALUES(ATLAS_PANDA.CLOUDTASKS_ID_SEQ.nextval,:taskid,:status,CURRENT_DATE,CURRENT_DATE)" - varMap = {} - varMap[':taskid'] = tid - varMap[':status'] = status - self.cur.execute(sql+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return "SUCCEEDED" - except: - # roll back - self._rollback() - # error - errType,errValue = sys.exc_info()[:2] - _logger.error("setCloudTaskByUser() : %s %s" % (errType,errValue)) - return "ERROR: database error" - - - # query files with map - def queryFilesWithMap(self,map): - comment = ' /* DBProxy.queryFilesWithMap */' - 
_logger.debug("queryFilesWithMap()") - sql1 = "SELECT PandaID,%s FROM ATLAS_PANDA.filesTable4" % FileSpec.columnNames() - varMap = {} - for key in map.keys(): - if len(varMap)==0: - sql1+= " WHERE %s=:%s" % (key,key) - else: - sql1+= " AND %s=:%s" % (key,key) - varMap[':%s' % key] = map[key] - nTry=3 - for iTry in range(nTry): - try: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - self.cur.execute(sql1+comment, varMap) - res = self.cur.fetchall() - _logger.debug("queryFilesWithMap() : %s" % str(res)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # instantiate files - retList = [] - for item in res: - # instantiate dummy JobSpec obj for PandaID - job = JobSpec() - job.PandaID = item[0] - # instantiate file - file = FileSpec() - file.pack(item[1:]) - # set owner - file.setOwner(job) - # append - retList.append(file) - return retList - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("queryFilesWithMap retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("queryFilesWithMap : %s %s" % (type,value)) - return [] - - - # count the number of files with map - def countFilesWithMap(self,map): - comment = ' /* DBProxy.countFilesWithMap */' - sql1 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ COUNT(*) FROM ATLAS_PANDA.filesTable4 tab" - varMap = {} - for key in map.keys(): - if len(varMap)==0: - sql1+= " WHERE %s=:%s" % (key,key) - else: - sql1+= " AND %s=:%s" % (key,key) - varMap[':%s' % key] = map[key] - nTry=3 - for iTry in range(nTry): - try: - # start transaction - self.conn.begin() - # select - _logger.debug("countFilesWithMap() : %s %s" % (sql1,str(map))) - self.cur.arraysize = 10 - retS = self.cur.execute(sql1+comment, varMap) - res = self.cur.fetchone() - _logger.debug("countFilesWithMap() : %s %s" % (retS,str(res))) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - 
nFiles=0 - if res != None: - nFiles=res[0] - return nFiles - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("countFilesWithMap() retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("countFilesWithMap(%s) : %s %s" % (map,type,value)) - return -1 - - - # count the number of pending files - def countPendingFiles(self,pandaID,forInput=True): - comment = ' /* DBProxy.countPendingFiles */' - varMap = {} - varMap[':pandaID'] = pandaID - varMap[':status'] = 'ready' - if forInput: - sql1 = "SELECT COUNT(*) FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:pandaID AND type=:type AND status<>:status " - varMap[':type'] = 'input' - else: - sql1 = "SELECT COUNT(*) FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:pandaID AND type IN (:type1,:type2) AND status<>:status " - varMap[':type1'] = 'output' - varMap[':type2'] = 'log' - try: - # start transaction - self.conn.begin() - # select - _logger.debug("countPendingFiles : %s start" % pandaID) - self.cur.arraysize = 10 - retS = self.cur.execute(sql1+comment, varMap) - res = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - nFiles = -1 - if res != None: - nFiles=res[0] - _logger.debug("countPendingFiles : %s -> %s" % (pandaID,nFiles)) - return nFiles - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("countPendingFiles : %s : %s %s" % (pandaID,errType,errValue)) - return -1 - - - # get datasets associated with file - def getDatasetWithFile(self,lfn,jobPrioity=0): - comment = ' /* DBProxy.getDatasetWithFile */' - varMap = {} - varMap[':lfn'] = lfn - varMap[':status1'] = 'pending' - varMap[':status2'] = 'transferring' - sql1 = "SELECT PandaID,status,destinationDBlock,destinationDBlockToken,dispatchDBlock FROM ATLAS_PANDA.filesTable4 " - sql1 += "WHERE lfn=:lfn AND status IN (:status1,:status2) AND modificationTime %s" % (lfn,str(retMap))) - return 
retMap - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getDatasetWithFile : %s : %s %s" % (lfn,errType,errValue)) - return {} - - - # get input files currently in use for analysis - def getFilesInUseForAnal(self,outDataset): - comment = ' /* DBProxy.getFilesInUseForAnal */' - sqlSub = "SELECT destinationDBlock,PandaID FROM ATLAS_PANDA.filesTable4 " - sqlSub += "WHERE dataset=:dataset AND type IN (:type1,:type2) GROUP BY destinationDBlock,PandaID" - sqlPaA = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsDefined4 " - sqlPaA += "WHERE PandaID=:PandaID " - sqlPaA += "UNION " - sqlPaA += "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsActive4 " - sqlPaA += "WHERE PandaID=:PandaID " - sqlPan = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsArchived4 " - sqlPan += "WHERE PandaID=:PandaID AND modificationTime<=CURRENT_DATE " - sqlPan += "UNION " - sqlPan += "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDAARCH.jobsArchived " - sqlPan += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)" - sqlIdA = "SELECT PandaID,jobStatus FROM ATLAS_PANDA.jobsArchived4 " - sqlIdA += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sqlIdA += "AND prodSourceLabel=:prodSourceLabel1 " - sqlIdL = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ " - sqlIdL += "PandaID,jobStatus FROM ATLAS_PANDAARCH.jobsArchived tab " - sqlIdL += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sqlIdL += "AND prodSourceLabel=:prodSourceLabel1 AND modificationTime>(CURRENT_DATE-30) " - sqlDis = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 " - sqlDis += "WHERE PandaID=:PandaID AND type=:type AND dispatchDBlock IS NOT NULL AND modificationTime <= CURRENT_DATE" - sqlLfn = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ lfn,PandaID FROM ATLAS_PANDA.filesTable4 tab " - sqlLfn += "WHERE 
dispatchDBlock=:dispatchDBlock AND type=:type " - sqlLfn += "AND (destinationDBlockToken IS NULL OR destinationDBlockToken<>:noshadow) AND modificationTime<=CURRENT_DATE" - nTry=3 - for iTry in range(nTry): - inputFilesList = [] - try: - # start transaction - self.conn.begin() - # get sub datasets - varMap = {} - varMap[':dataset'] = outDataset - varMap[':type1'] = 'output' - varMap[':type2'] = 'log' - _logger.debug("getFilesInUseForAnal : %s %s" % (sqlSub,str(varMap))) - self.cur.arraysize = 100000 - retS = self.cur.execute(sqlSub+comment, varMap) - res = self.cur.fetchall() - subDSpandaIDmap = {} - checkedPandaIDs = {} - for subDataset,pandaID in res: - # avoid redundunt lookup - if checkedPandaIDs.has_key(pandaID): - continue - if subDSpandaIDmap.has_key(subDataset): - # append jobs as running since they are not in archived tables - if not pandaID in subDSpandaIDmap[subDataset]: - checkedPandaIDs[pandaID] = 'running' - subDSpandaIDmap[subDataset].append(pandaID) - continue - # look for jobdefID and userName - varMap = {} - varMap[':PandaID'] = pandaID - _logger.debug("getFilesInUseForAnal : %s %s" % (sqlPaA,str(varMap))) - retP = self.cur.execute(sqlPaA+comment, varMap) - resP = self.cur.fetchall() - if len(resP) != 0: - jobDefinitionID,prodUserName = resP[0] - else: - _logger.debug("getFilesInUseForAnal : %s %s" % (sqlPan,str(varMap))) - retP = self.cur.execute(sqlPan+comment, varMap) - resP = self.cur.fetchall() - if len(resP) != 0: - jobDefinitionID,prodUserName = resP[0] - else: - continue - # get PandaIDs with obdefID and userName - tmpPandaIDs = [] - varMap = {} - varMap[':prodUserName'] = prodUserName - varMap[':jobDefinitionID'] = jobDefinitionID - varMap[':prodSourceLabel1'] = 'user' - _logger.debug("getFilesInUseForAnal : %s %s" % (sqlIdA,str(varMap))) - retID = self.cur.execute(sqlIdA+comment, varMap) - resID = self.cur.fetchall() - for tmpPandaID,tmpJobStatus in resID: - checkedPandaIDs[tmpPandaID] = tmpJobStatus - tmpPandaIDs.append(tmpPandaID) - 
_logger.debug("getFilesInUseForAnal : %s %s" % (sqlIdL,str(varMap))) - retID = self.cur.execute(sqlIdL+comment, varMap) - resID = self.cur.fetchall() - for tmpPandaID,tmpJobStatus in resID: - if not tmpPandaID in tmpPandaIDs: - checkedPandaIDs[tmpPandaID] = tmpJobStatus - tmpPandaIDs.append(tmpPandaID) - # append - if not subDSpandaIDmap.has_key(subDataset): - subDSpandaIDmap[subDataset] = [] - for tmpPandaID in tmpPandaIDs: - # reuse failed files if jobs are in Archived since they cannot change back to active - if checkedPandaIDs[tmpPandaID] in ['failed','cancelled']: - continue - # collect PandaIDs - subDSpandaIDmap[subDataset].append(tmpPandaID) - # loop over all sub datasets - for subDataset,activePandaIDs in subDSpandaIDmap.iteritems(): - # skip empty - if activePandaIDs == []: - continue - # get dispatchDBlocks - pandaID = activePandaIDs[0] - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':type'] = 'input' - _logger.debug("getFilesInUseForAnal : %s %s" % (sqlDis,str(varMap))) - self.cur.arraysize = 10000 - retD = self.cur.execute(sqlDis+comment, varMap) - resD = self.cur.fetchall() - # get LFNs - for disDataset, in resD: - # use new style only - if not disDataset.startswith('user_disp.'): - continue - varMap = {} - varMap[':dispatchDBlock'] = disDataset - varMap[':type'] = 'input' - varMap[':noshadow'] = 'noshadow' - _logger.debug("getFilesInUseForAnal : %s %s" % (sqlLfn,str(varMap))) - self.cur.arraysize = 100000 - retL = self.cur.execute(sqlLfn+comment, varMap) - resL = self.cur.fetchall() - # append - for lfn,filePandaID in resL: - # skip files used by archived failed or cancelled jobs - if filePandaID in activePandaIDs and not lfn in inputFilesList: - inputFilesList.append(lfn) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("getFilesInUseForAnal : %s" % len(inputFilesList)) - return inputFilesList - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("inputFilesList retry : %s" % 
iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("inputFilesList(%s) : %s %s" % (outDataset,type,value)) - return [] - - - # get list of dis dataset to get input files in shadow - def getDisInUseForAnal(self,outDataset): - comment = ' /* DBProxy.getDisInUseForAnal */' - sqlSub = "SELECT destinationDBlock,PandaID,status FROM ATLAS_PANDA.filesTable4 " - sqlSub += "WHERE dataset=:dataset AND type=:type1 GROUP BY destinationDBlock,PandaID,status" - sqlPaA = "SELECT jobStatus FROM ATLAS_PANDA.jobsDefined4 " - sqlPaA += "WHERE PandaID=:PandaID " - sqlPaA += "UNION " - sqlPaA += "SELECT jobStatus FROM ATLAS_PANDA.jobsActive4 " - sqlPaA += "WHERE PandaID=:PandaID " - sqlPan = "SELECT jobStatus FROM ATLAS_PANDA.jobsArchived4 " - sqlPan += "WHERE PandaID=:PandaID AND modificationTime<=CURRENT_DATE " - sqlPan += "UNION " - sqlPan += "SELECT jobStatus FROM ATLAS_PANDAARCH.jobsArchived " - sqlPan += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)" - sqlDis = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 " - sqlDis += "WHERE PandaID=:PandaID AND type=:type AND dispatchDBlock IS NOT NULL AND modificationTime <= CURRENT_DATE" - inputDisList = [] - try: - timeStart = datetime.datetime.utcnow() - _logger.debug("getDisInUseForAnal start for %s" % outDataset) - # start transaction - self.conn.begin() - # get sub datasets - varMap = {} - varMap[':dataset'] = outDataset - varMap[':type1'] = 'log' - _logger.debug("getDisInUseForAnal : %s %s" % (sqlSub,str(varMap))) - self.cur.arraysize = 100000 - retS = self.cur.execute(sqlSub+comment, varMap) - res = self.cur.fetchall() - subDSpandaIDmap = {} - checkedPandaIDs = {} - for subDataset,pandaID,fileStatus in res: - # add map - if not subDSpandaIDmap.has_key(subDataset): - subDSpandaIDmap[subDataset] = [] - # check job status - if fileStatus != 'ready': - varMap = {} - varMap[':PandaID'] = pandaID - _logger.debug("getDisInUseForAnal : %s %s" % 
(sqlPaA,str(varMap))) - retP = self.cur.execute(sqlPaA+comment, varMap) - resP = self.cur.fetchall() - if len(resP) != 0: - # append jobs as running since they are not in archived tables yet - checkedPandaIDs[pandaID] = 'running' - subDSpandaIDmap[subDataset].append(pandaID) - else: - _logger.debug("getDisInUseForAnal : %s %s" % (sqlPan,str(varMap))) - retP = self.cur.execute(sqlPan+comment, varMap) - resP = self.cur.fetchall() - if len(resP) != 0: - checkedPandaIDs[pandaID], = resP[0] - # reuse failed files if jobs are in Archived since they cannot change back to active - if checkedPandaIDs[pandaID] in ['failed','cancelled']: - continue - # collect PandaIDs - subDSpandaIDmap[subDataset].append(pandaID) - else: - # not found - continue - else: - # no job lookup since file was sucessfully finished - checkedPandaIDs[pandaID] = 'finished' - # collect PandaIDs - subDSpandaIDmap[subDataset].append(pandaID) - # loop over all sub datasets - for subDataset,activePandaIDs in subDSpandaIDmap.iteritems(): - # skip empty - if activePandaIDs == []: - continue - resDisList = [] - # get dispatchDBlocks - pandaID = activePandaIDs[0] - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':type'] = 'input' - _logger.debug("getDisInUseForAnal : %s %s" % (sqlDis,str(varMap))) - self.cur.arraysize = 10000 - retD = self.cur.execute(sqlDis+comment, varMap) - resD = self.cur.fetchall() - # get shadow dis - for disDataset, in resD: - # use new style only - if not disDataset.startswith('user_disp.'): - continue - if not disDataset in resDisList: - resDisList.append(disDataset) - # append - if resDisList != []: - inputDisList.append((resDisList,activePandaIDs)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - timeDelta = datetime.datetime.utcnow()-timeStart - _logger.debug("getDisInUseForAnal end for %s len=%s time=%ssec" % (outDataset,len(inputDisList),timeDelta.seconds)) - return inputDisList - except: - # roll back - self._rollback() - errtype,errvalue = 
sys.exc_info()[:2] - _logger.error("getDisInUseForAnal(%s) : %s %s" % (outDataset,errtype,errvalue)) - return None - - - # get input LFNs currently in use for analysis with shadow dis - def getLFNsInUseForAnal(self,inputDisList): - comment = ' /* DBProxy.getLFNsInUseForAnal */' - sqlLfn = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ lfn,PandaID FROM ATLAS_PANDA.filesTable4 tab " - sqlLfn += "WHERE dispatchDBlock=:dispatchDBlock AND type=:type " - sqlLfn += "AND (destinationDBlockToken IS NULL OR destinationDBlockToken<>:noshadow) AND modificationTime<=CURRENT_DATE" - inputFilesList = [] - try: - token = datetime.datetime.utcnow().isoformat('/') - # loop over all shadow dis datasets - pandaIdLfnMap = {} - for disDatasetList,activePandaIDs in inputDisList: - for disDataset in disDatasetList: - # use new style only - if not disDataset.startswith('user_disp.'): - continue - # read LFNs and PandaIDs - if not pandaIdLfnMap.has_key(disDataset): - # start transaction - self.conn.begin() - varMap = {} - varMap[':dispatchDBlock'] = disDataset - varMap[':type'] = 'input' - varMap[':noshadow'] = 'noshadow' - _logger.debug("getLFNsInUseForAnal : <%s> %s %s" % (token,sqlLfn,str(varMap))) - timeStart = datetime.datetime.utcnow() - self.cur.arraysize = 100000 - retL = self.cur.execute(sqlLfn+comment, varMap) - resL = self.cur.fetchall() - # commit - timeDelta = datetime.datetime.utcnow()-timeStart - _logger.debug("getLFNsInUseForAnal : <%s> %s time=%ssec commit" % (token,disDataset,timeDelta.seconds)) - if not self._commit(): - raise RuntimeError, 'Commit error' - # make map - pandaIdLfnMap[disDataset] = {} - for lfn,filePandaID in resL: - if not pandaIdLfnMap[disDataset].has_key(filePandaID): - pandaIdLfnMap[disDataset][filePandaID] = [] - pandaIdLfnMap[disDataset][filePandaID].append(lfn) - _logger.debug("getLFNsInUseForAnal : <%s> %s map made with len=%s" % \ - (token,disDataset,len(resL))) - # append - for disDataset in disDatasetList: - 
_logger.debug("getLFNsInUseForAnal : <%s> %s list making pandaIDs=%s fileLen=%s" % \ - (token,disDataset,len(activePandaIDs),len(inputFilesList))) - for activePandaID in activePandaIDs: - # skip files used by archived failed or cancelled jobs - if pandaIdLfnMap[disDataset].has_key(activePandaID): - inputFilesList += pandaIdLfnMap[disDataset][activePandaID] - _logger.debug("getLFNsInUseForAnal : <%s> %s done" % (token,disDataset)) - _logger.debug("getLFNsInUseForAnal : <%s> %s" % (token,len(inputFilesList))) - return inputFilesList - except: - # roll back - self._rollback() - errtype,errvalue = sys.exc_info()[:2] - _logger.error("getLFNsInUseForAnal(%s) : %s %s" % (str(inputDisList),errtype,errvalue)) - return None - - - # update input files and return corresponding PandaIDs - def updateInFilesReturnPandaIDs(self,dataset,status,fileLFN=''): - comment = ' /* DBProxy.updateInFilesReturnPandaIDs */' - _logger.debug("updateInFilesReturnPandaIDs(%s,%s)" % (dataset,fileLFN)) - sql0 = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ row_ID,PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE status<>:status AND dispatchDBlock=:dispatchDBlock" - sql1 = "UPDATE /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ ATLAS_PANDA.filesTable4 tab SET status=:status WHERE status<>:status AND dispatchDBlock=:dispatchDBlock" - varMap = {} - varMap[':status'] = status - varMap[':dispatchDBlock'] = dataset - if fileLFN != '': - sql0 += " AND lfn=:lfn" - sql1 += " AND lfn=:lfn" - varMap[':lfn'] = fileLFN - for iTry in range(self.nTry): - try: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - retS = self.cur.execute(sql0+comment, varMap) - resS = self.cur.fetchall() - # update - retU = self.cur.execute(sql1+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # collect PandaIDs - retList = [] - for tmpRowID,tmpPandaID in resS: - # append - if not tmpPandaID in retList: - retList.append(tmpPandaID) - # return - 
_logger.debug("updateInFilesReturnPandaIDs : %s" % str(retList)) - return retList - except: - # roll back - self._rollback() - # error report - if iTry+1 < self.nTry: - _logger.debug("updateInFilesReturnPandaIDs retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("updateInFilesReturnPandaIDs : %s %s" % (type, value)) - return [] - - - # update file status in dispatch dataset - def updateFileStatusInDisp(self,dataset,fileStatusMap): - comment = ' /* DBProxy.updateFileStatusInDisp */' - _logger.debug("updateFileStatusInDisp(%s,%s)" % (dataset,fileStatusMap)) - sql1 = "UPDATE /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ ATLAS_PANDA.filesTable4 tab SET status=:status WHERE dispatchDBlock=:dispatchDBlock AND lfn=:lfn" - nTry = 1 - for iTry in range(nTry): - try: - # start transaction - self.conn.begin() - # update - for status,lfns in fileStatusMap.iteritems(): - varMap = {} - varMap[':status'] = status - varMap[':dispatchDBlock'] = dataset - # loop over all files - for lfn in lfns: - varMap['lfn'] = lfn - # update - retU = self.cur.execute(sql1+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - _logger.debug("updateFileStatusInDisp : done") - return True - except: - # roll back - self._rollback() - # error report - if iTry+1 < nTry: - _logger.debug("updateFileStatusInDisp retry : %s" % iTry) - time.sleep(random.randint(5,10)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("updateFileStatusInDisp : %s %s" % (type, value)) - return False - - - # update output files and return corresponding PandaIDs - def updateOutFilesReturnPandaIDs(self,dataset,fileLFN=''): - comment = ' /* DBProxy.updateOutFilesReturnPandaIDs */' - _logger.debug("updateOutFilesReturnPandaIDs(%s,%s)" % (dataset,fileLFN)) - sql0 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ row_ID,PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE 
destinationDBlock=:destinationDBlock AND status=:status" - sql1 = "UPDATE /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ ATLAS_PANDA.filesTable4 tab SET status='ready' WHERE destinationDBlock=:destinationDBlock AND status=:status" - varMap = {} - varMap[':status'] = 'transferring' - varMap[':destinationDBlock'] = dataset - if fileLFN != '': - sql0 += " AND lfn=:lfn" - sql1 += " AND lfn=:lfn" - varMap[':lfn'] = fileLFN - for iTry in range(self.nTry): - try: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - retS = self.cur.execute(sql0+comment, varMap) - resS = self.cur.fetchall() - # update - retList = [] - retU = self.cur.execute(sql1+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # collect PandaIDs - retList = [] - for tmpRowID,tmpPandaID in resS: - # append - if not tmpPandaID in retList: - retList.append(tmpPandaID) - # return - _logger.debug("updateOutFilesReturnPandaIDs : %s" % str(retList)) - return retList - except: - # roll back - self._rollback() - # error report - if iTry+1 < self.nTry: - _logger.debug("updateOutFilesReturnPandaIDs retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("updateOutFilesReturnPandaIDs : %s %s" % (type, value)) - return [] - - - # get _dis datasets associated to _sub - def getAssociatedDisDatasets(self,subDsName): - comment = ' /* DBProxy.getAssociatedDisDatasets */' - _logger.debug("getAssociatedDisDatasets(%s)" % subDsName) - sqlF = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ distinct PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock" - sqlJ = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type" - try: - # start transaction - self.conn.begin() - # get PandaIDs - varMap = {} - varMap[':destinationDBlock'] = subDsName - self.cur.arraysize = 10000 - self.cur.execute(sqlF+comment,varMap) - resS 
= self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # loop over all PandaIDs - retList = [] - for pandaID, in resS: - # start transaction - self.conn.begin() - # get _dis name - varMap = {} - varMap[':type'] = 'input' - varMap[':PandaID'] = pandaID - self.cur.arraysize = 1000 - self.cur.execute(sqlJ+comment,varMap) - resD = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for disName, in resD: - if disName != None and not disName in retList: - retList.append(disName) - # return - _logger.debug("getAssociatedDisDatasets : %s" % str(retList)) - return retList - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getAssociatedDisDatasets : %s : %s %s" % (subDsName,errType,errValue)) - return [] - - - # set GUIDs - def setGUIDs(self,files): - comment = ' /* DBProxy.setGUIDs */' - _logger.debug("setGUIDs(%s)" % files) - sql0 = "UPDATE ATLAS_PANDA.filesTable4 SET GUID=:GUID,fsize=:fsize,checksum=:checksum,scope=:scope WHERE lfn=:lfn" - for iTry in range(self.nTry): - try: - # start transaction - self.conn.begin() - # update - for file in files: - varMap = {} - varMap[':GUID'] = file['guid'] - varMap[':lfn'] = file['lfn'] - if file['checksum'] in ['','NULL']: - varMap[':checksum'] = None - else: - varMap[':checksum'] = file['checksum'] - varMap[':fsize'] = file['fsize'] - if not file.has_key('scope') or file['scope'] in ['','NULL']: - varMap[':scope'] = None - else: - varMap[':scope'] = file['scope'] - self.cur.execute(sql0+comment, varMap) - retU = self.cur.rowcount - _logger.debug("setGUIDs : retU %s" % retU) - if retU<0: - raise RuntimeError, 'SQL error' - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - # error report - if iTry+1 < self.nTry: - _logger.debug("setGUIDs retry : %s" % iTry) - time.sleep(random.randint(10,20)) - continue - type, 
value, traceBack = sys.exc_info() - _logger.error("setGUIDs : %s %s" % (type, value)) - return False - - - # query PandaID with Datasets - def queryPandaIDwithDataset(self,datasets): - comment = ' /* DBProxy.queryPandaIDwithDataset */' - _logger.debug("queryPandaIDwithDataset(%s)" % datasets) - if len(datasets) == 0: - return [] - # make SQL query - sql1 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock GROUP BY PandaID" - # execute - try: - retList = [] - for dataset in datasets: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - varMap = {} - varMap[':destinationDBlock'] = dataset - self.cur.execute(sql1+comment,varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # get IDs - for r in res: - retList.append(r[0]) - # return - _logger.debug("queryPandaIDwithDataset : %s" % str(retList)) - return retList - except: - # roll back - self._rollback() - # error report - type, value, traceBack = sys.exc_info() - _logger.error("queryPandaIDwithDataset : %s %s" % (type, value)) - return [] - - - # query last files in datasets - def queryLastFilesInDataset(self,datasets): - comment = ' /* DBProxy.queryLastFilesInDataset */' - _logger.debug("queryLastFilesInDataset(%s)" % datasets) - if len(datasets) == 0: - return [] - # make SQL query - sql1 = "SELECT lfn,PandaID FROM ATLAS_PANDA.filesTable4 WHERE dataset=:dataset AND type=:type ORDER BY lfn DESC" - sqlL = "SELECT processingType FROM %s WHERE PandaID=:PandaID " - sqlA = "UNION SELECT processingType FROM ATLAS_PANDAARCH.jobsArchived WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)" - sql2 = "SELECT lfn FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type" - # execute - try: - retMap = {} - for dataset in datasets: - # start transaction - self.conn.begin() - # select max LFN - varMap = {} - varMap[':type'] = 'output' - 
varMap[':dataset'] = dataset - self.cur.arraysize = 100000 - self.cur.execute(sql1+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # found - retList = [] - for tmpLFN,pandaID in res: - # skip log.tgz - if re.search('\.log\.tgz(\.\d+)*$',tmpLFN) != None: - continue - # start transaction - self.conn.begin() - self.cur.arraysize = 10 - # check processingType - processingType = None - for tmpTable in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']: - varMap = {} - varMap[':PandaID'] = pandaID - if tmpTable == 'ATLAS_PANDA.jobsArchived4': - self.cur.execute((sqlL % tmpTable)+sqlA+comment, varMap) - else: - self.cur.execute((sqlL % tmpTable)+comment, varMap) - resP = self.cur.fetchone() - if resP != None: - processingType = resP[0] - break - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # job not found - if processingType == None: - continue - # ignore merge jobs - if processingType in ['usermerge']: - continue - # start transaction - self.conn.begin() - # select LFNs - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':type'] = 'output' - self.cur.arraysize = 1000 - self.cur.execute(sql2+comment, varMap) - res = self.cur.fetchall() - for r in res: - retList.append(r[0]) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # get only the largest one - break - # append - retMap[dataset] = retList - # return - _logger.debug("queryLastFilesInDataset : %s" % str(retMap)) - return retMap - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("queryLastFilesInDataset : %s %s" % (type, value)) - return {} - - - # query PandaID with filenames - def queryPandaIDwithLFN(self,vlfns): - comment = ' /* DBProxy.queryPandaIDwithLFN */' - _logger.debug("queryPandaIDwithLFN(%s)" % vlfns) - if len(vlfns) == 0: - return [] - # make SQL query - sql1 = "SELECT PandaID FROM 
ATLAS_PANDA.filesTable4 WHERE lfn=:lfn GROUP BY PandaID" - # execute - retList = [] - for lfn in vlfns: - # get generic LFNs - gLFN = re.sub('\.\d+$','',lfn) - # try - try: - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[':lfn'] = gLFN - self.cur.arraysize = 10000 - self.cur.execute(sql1+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append IDs - for tmpID, in res: - if not tmpID in retList: - retList.append(tmpID) - except: - # roll back - self._rollback() - # error report - type, value, traceBack = sys.exc_info() - _logger.error("queryPandaIDwithLFN : %s %s" % (type, value)) - return [] - # return - _logger.debug("queryPandaIDwithLFN : %s" % str(retList)) - return retList - - - # get job statistics - def getJobStatistics(self,archived=False,predefined=False,workingGroup='',countryGroup='',jobType='',forAnal=None,minPriority=None): - comment = ' /* DBProxy.getJobStatistics */' - _logger.debug("getJobStatistics(%s,%s,'%s','%s','%s',%s,%s)" % (archived,predefined,workingGroup,countryGroup,jobType,forAnal,minPriority)) - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) - sql0 = "SELECT computingSite,jobStatus,COUNT(*) FROM %s " - # processingType - tmpJobTypeMap = {} - sqlJobType = '' - useWhereInSQL = True - if forAnal == None or jobType != "": - useWhereInSQL = False - elif forAnal == True: - tmpJobTypeMap[':prodSourceLabel1'] = 'user' - tmpJobTypeMap[':prodSourceLabel2'] = 'panda' - sql0 += "WHERE prodSourceLabel IN (" - sqlJobType = ":prodSourceLabel1,:prodSourceLabel2) " - else: - tmpJobTypeMap[':prodSourceLabel1'] = 'managed' - sql0 += "WHERE prodSourceLabel IN (" - sqlJobType = ":prodSourceLabel1) " - sql0 += sqlJobType - # predefined - if predefined: - if useWhereInSQL: - sql0 += "AND relocationFlag=1 " - else: - sql0 += "WHERE relocationFlag=1 " - useWhereInSQL = True - # working group - tmpGroupMap = {} - sqlGroups = '' - if 
workingGroup != '': - if useWhereInSQL: - sqlGroups += "AND workingGroup IN (" - else: - sqlGroups += "WHERE workingGroup IN (" - useWhereInSQL = True - # loop over all groups - idxWG = 1 - for tmpWG in workingGroup.split(','): - tmpWGkey = ':workingGroup%s' % idxWG - sqlGroups += "%s," % tmpWGkey - tmpGroupMap[tmpWGkey] = tmpWG - idxWG += 1 - sqlGroups = sqlGroups[:-1] + ") " - # country group - if countryGroup != '': - if useWhereInSQL: - sqlGroups += "AND countryGroup IN (" - else: - sqlGroups += "WHERE countryGroup IN (" - useWhereInSQL = True - # loop over all groups - idxCG = 1 - for tmpCG in countryGroup.split(','): - tmpCGkey = ':countryGroup%s' % idxCG - sqlGroups += "%s," % tmpCGkey - tmpGroupMap[tmpCGkey] = tmpCG - idxCG += 1 - sqlGroups = sqlGroups[:-1] + ") " - sql0 += sqlGroups - # minimum priority - sqlPrio = '' - tmpPrioMap = {} - if minPriority != None: - if useWhereInSQL: - sqlPrio = "AND currentPriority>=:minPriority " - else: - sqlPrio = "WHERE currentPriority>=:minPriority " - useWhereInSQL = True - tmpPrioMap[':minPriority'] = minPriority - sql0 += sqlPrio - sql0 += "GROUP BY computingSite,jobStatus" - sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ computingSite,jobStatus,COUNT(*) FROM ATLAS_PANDA.jobsArchived4 tab WHERE modificationTime>:modificationTime " - if sqlJobType != "": - sqlA += "AND prodSourceLabel IN (" - sqlA += sqlJobType - if predefined: - sqlA += "AND relocationFlag=1 " - sqlA += sqlGroups - sqlA += sqlPrio - sqlA += "GROUP BY computingSite,jobStatus" - tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] - if archived: - tables.append('ATLAS_PANDA.jobsArchived4') - # sql for materialized view - sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0) - sqlMV = re.sub(':minPriority','TRUNC(:minPriority,-1)',sqlMV) - sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) - ret = {} - nTry=3 - for iTry in range(nTry): - try: - for table in tables: - # start transaction - 
self.conn.begin() - # select - varMap = {} - for tmpJobType in tmpJobTypeMap.keys(): - varMap[tmpJobType] = tmpJobTypeMap[tmpJobType] - for tmpGroup in tmpGroupMap.keys(): - varMap[tmpGroup] = tmpGroupMap[tmpGroup] - for tmpPrio in tmpPrioMap.keys(): - varMap[tmpPrio] = tmpPrioMap[tmpPrio] - if table != 'ATLAS_PANDA.jobsArchived4': - self.cur.arraysize = 10000 - if table == 'ATLAS_PANDA.jobsActive4': - sqlExeTmp = (sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS' - else: - sqlExeTmp = (sql0+comment) % table - _logger.debug("getJobStatistics : %s %s" % (sqlExeTmp,str(varMap))) - self.cur.execute(sqlExeTmp, varMap) - else: - varMap[':modificationTime'] = timeLimit - self.cur.arraysize = 10000 - self.cur.execute(sqlA+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for item in res: - if not ret.has_key(item[0]): - ret[item[0]] = {} - if not ret[item[0]].has_key(item[1]): - ret[item[0]][item[1]] = 0 - ret[item[0]][item[1]] += item[2] - # for zero - stateList = ['assigned','activated','running'] - if archived: - stateList += ['finished','failed'] - for site in ret.keys(): - for state in stateList: - if not ret[site].has_key(state): - ret[site][state] = 0 - # return - _logger.debug("getJobStatistics -> %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("getJobStatistics() retry : %s" % iTry) - time.sleep(2) - continue - type, value, traceBack = sys.exc_info() - _logger.error("getJobStatistics : %s %s" % (type, value)) - return {} - - - # get job statistics with label - def getJobStatisticsWithLabel(self,siteStr=''): - comment = ' /* DBProxy.getJobStatisticsWithLabel */' - _logger.debug("getJobStatisticsWithLabel(%s)" % siteStr) - sql0 = "SELECT computingSite,prodSourceLabel,jobStatus,COUNT(*) FROM %s " - # site - tmpSiteMap = {} - if siteStr != '': - sql0 += "WHERE computingSite IN (" - # loop over all sites - idxSite = 1 - for 
tmpSite in siteStr.split(','): - tmpSiteKey = ':site%s' % idxSite - sql0 += "%s," % tmpSiteKey - tmpSiteMap[tmpSiteKey] = tmpSite - idxSite += 1 - sql0 = sql0[:-1] + ") " - sql0 += "GROUP BY computingSite,prodSourceLabel,jobStatus " - sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0) - sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) - tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] - returnMap = {} - try: - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - self.cur.arraysize = 10000 - if table == 'ATLAS_PANDA.jobsActive4': - sqlExeTmp = (sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS' - else: - sqlExeTmp = (sql0+comment) % table - self.cur.execute(sqlExeTmp,tmpSiteMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for computingSite,prodSourceLabel,jobStatus,nCount in res: - # add site - if not returnMap.has_key(computingSite): - returnMap[computingSite] = {} - # add SourceLabel - if not returnMap[computingSite].has_key(prodSourceLabel): - returnMap[computingSite][prodSourceLabel] = {} - # add jobstatus - if not returnMap[computingSite][prodSourceLabel].has_key(jobStatus): - returnMap[computingSite][prodSourceLabel][jobStatus] = 0 - # add - returnMap[computingSite][prodSourceLabel][jobStatus] += nCount - # return - _logger.debug("getJobStatisticsWithLabel() : %s" % str(returnMap)) - return returnMap - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getJobStatisticsWithLabel : %s %s" % (errType,errValue)) - return {} - - - # get job statistics for brokerage - def getJobStatisticsBrokerage(self,minPriority=None): - comment = ' /* DBProxy.getJobStatisticsBrokerage */' - _logger.debug("getJobStatisticsBrokerage(%s)" % minPriority) - sql0 = "SELECT cloud,computingSite,jobStatus,processingType,COUNT(*) FROM %s WHERE " - sql0 += "prodSourceLabel IN (:prodSourceLabel1) " - 
tmpPrioMap = {} - if minPriority != None: - sql0 += "AND currentPriority>=:minPriority " - tmpPrioMap[':minPriority'] = minPriority - sql0 += "GROUP BY cloud,computingSite,jobStatus,processingType" - # sql for materialized view - sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0) - sqlMV = re.sub(':minPriority','TRUNC(:minPriority,-1)',sqlMV) - sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) - tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] - if minPriority != None: - # read the number of running jobs with prio<=MIN - tables.append('ATLAS_PANDA.jobsActive4') - sqlMVforRun = re.sub('currentPriority>=','currentPriority<=',sqlMV) - ret = {} - nTry=3 - iActive = 0 - for iTry in range(nTry): - try: - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[':prodSourceLabel1'] = 'managed' - for tmpPrio in tmpPrioMap.keys(): - varMap[tmpPrio] = tmpPrioMap[tmpPrio] - self.cur.arraysize = 10000 - useRunning = None - if table == 'ATLAS_PANDA.jobsActive4': - # first count non-running and then running if minPriority is specified - if minPriority != None: - if iActive == 0: - useRunning = False - else: - useRunning = True - iActive += 1 - if useRunning in [None,False]: - self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap) - else: - self.cur.execute((sqlMVforRun+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap) - else: - self.cur.execute((sql0+comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for cloud,computingSite,jobStatus,processingType,count in res: - # check jobstatus if minPriority isspecified - if minPriority != None: - # count the number of non-running with prio>=MIN - if useRunning == True and jobStatus != 'running': - continue - # count the number of running with prio<=MIN - if useRunning == False and jobStatus == 'running': - continue - # add cloud - if not 
ret.has_key(cloud): - ret[cloud] = {} - # add site - if not ret[cloud].has_key(computingSite): - ret[cloud][computingSite] = {} - # add processingType - if not ret[cloud][computingSite].has_key(processingType): - ret[cloud][computingSite][processingType] = {} - # add jobStatus - if not ret[cloud][computingSite][processingType].has_key(jobStatus): - ret[cloud][computingSite][processingType][jobStatus] = count - # for zero - for cloud,cloudVal in ret.iteritems(): - for site,siteVal in cloudVal.iteritems(): - for pType,typeVal in siteVal.iteritems(): - for stateItem in ['assigned','activated','running','transferring']: - if not typeVal.has_key(stateItem): - typeVal[stateItem] = 0 - # return - _logger.debug("getJobStatisticsBrokerage -> %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("getJobStatisticsBrokerage retry : %s" % iTry) - time.sleep(2) - continue - type, value, traceBack = sys.exc_info() - _logger.error("getJobStatisticsBrokerage : %s %s" % (type, value)) - return {} - - - # get job statistics for analysis brokerage - def getJobStatisticsAnalBrokerage(self,minPriority=None): - comment = ' /* DBProxy.getJobStatisticsAnalBrokerage */' - _logger.debug("getJobStatisticsAnalBrokerage(%s)" % minPriority) - sql0 = "SELECT computingSite,jobStatus,processingType,COUNT(*) FROM %s WHERE " - sql0 += "prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " - if minPriority != None: - sql0 += "AND currentPriority>=:minPriority " - sql0 += "GROUP BY cloud,computingSite,jobStatus,processingType" - # sql for materialized view - sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0) - sqlMV = re.sub(':minPriority','TRUNC(:minPriority,-1)',sqlMV) - sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) - tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] - ret = {} - nTry=3 - for iTry in range(nTry): - try: - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - 
varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - if minPriority != None: - varMap[':minPriority'] = minPriority - self.cur.arraysize = 10000 - if table == 'ATLAS_PANDA.jobsActive4': - self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap) - else: - self.cur.execute((sql0+comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for computingSite,jobStatus,processingType,count in res: - # add site - if not ret.has_key(computingSite): - ret[computingSite] = {} - # add processingType - if not ret[computingSite].has_key(processingType): - ret[computingSite][processingType] = {} - # add jobStatus - if not ret[computingSite][processingType].has_key(jobStatus): - ret[computingSite][processingType][jobStatus] = count - # for zero - for site,siteVal in ret.iteritems(): - for pType,typeVal in siteVal.iteritems(): - for stateItem in ['defined','assigned','activated','running']: - if not typeVal.has_key(stateItem): - typeVal[stateItem] = 0 - # return - _logger.debug("getJobStatisticsAnalBrokerage -> %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.debug("getJobStatisticsAnalBrokerage retry : %s" % iTry) - time.sleep(2) - continue - type, value, traceBack = sys.exc_info() - _logger.error("getJobStatisticsAnalBrokerage : %s %s" % (type, value)) - return {} - - - # get highest prio jobs - def getHighestPrioJobStat(self): - comment = ' /* DBProxy.getHighestPrioJobStat */' - _logger.debug("getHighestPrioJobStat()") - sql0 = "SELECT cloud,max(currentPriority) FROM %s WHERE " - sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY cloud" - sqlC = "SELECT COUNT(*) FROM %s WHERE " - sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " - sqlC += "cloud=:cloud AND currentPriority=:currentPriority" - tables = 
['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] - ret = {} - try: - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[':prodSourceLabel'] = 'managed' - if table == 'ATLAS_PANDA.jobsActive4': - varMap[':jobStatus1'] = 'activated' - varMap[':jobStatus2'] = 'dummy' - else: - varMap[':jobStatus1'] = 'defined' - varMap[':jobStatus2'] = 'assigned' - self.cur.arraysize = 100 - _logger.debug((sql0+comment) % table) - self.cur.execute((sql0+comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for cloud,maxPriority in res: - # add cloud - if not ret.has_key(cloud): - ret[cloud] = {} - # add max priority - prioKey = 'highestPrio' - nNotRunKey = 'nNotRun' - getNumber = False - if not ret[cloud].has_key(prioKey): - ret[cloud][prioKey] = maxPriority - ret[cloud][nNotRunKey] = 0 - getNumber = True - else: - # use highest one - if ret[cloud][prioKey] < maxPriority: - ret[cloud][prioKey] = maxPriority - # reset - ret[cloud][nNotRunKey] = 0 - getNumber = True - elif ret[cloud][prioKey] == maxPriority: - getNumber = True - # get number of jobs with highest prio - if getNumber: - varMap[':cloud'] = cloud - varMap[':currentPriority'] = maxPriority - self.cur.arraysize = 10 - _logger.debug((sqlC+comment) % table) - self.cur.execute((sqlC+comment) % table, varMap) - resC = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - ret[cloud][nNotRunKey] += resC[0] - # return - return ret - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getHighestPrioJobStat : %s %s" % (type, value)) - return {} - - - # get highest prio jobs per process group - def getHighestPrioJobStatPerPG(self,useMorePG=False): - comment = ' /* DBProxy.getHighestPrioJobStatPerPG */' - _logger.debug("getHighestPrioJobStatPerPG()") - if useMorePG == False: - sql0 = "SELECT 
cloud,max(currentPriority),processingType FROM %s WHERE " - sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY cloud,processingType" - sqlC = "SELECT COUNT(*) FROM %s WHERE " - sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " - sqlC += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType" - else: - sql0 = "SELECT cloud,max(currentPriority),processingType,coreCount,workingGroup FROM %s WHERE " - sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) " - sql0 += "GROUP BY cloud,processingType,coreCount,workingGroup" - sqlC = "SELECT COUNT(*) FROM %s WHERE " - sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " - sqlC += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType AND " - sqlC += "coreCount=:coreCount AND workingGroup=:workingGroup" - sqlCN = "SELECT COUNT(*) FROM %s WHERE " - sqlCN += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " - sqlCN += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType AND " - sqlCN += "coreCount IS NULL AND workingGroup=:workingGroup" - tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] - ret = {} - try: - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[':prodSourceLabel'] = 'managed' - if table == 'ATLAS_PANDA.jobsActive4': - varMap[':jobStatus1'] = 'activated' - varMap[':jobStatus2'] = 'dummy' - else: - varMap[':jobStatus1'] = 'defined' - varMap[':jobStatus2'] = 'assigned' - self.cur.arraysize = 100 - _logger.debug((sql0+comment) % table+str(varMap)) - self.cur.execute((sql0+comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for tmpItem in res: - if useMorePG == False: - 
cloud,maxPriority,processingType = tmpItem - origCloud = cloud - origProcessingType = processingType - else: - origCloud,maxPriority,origProcessingType,coreCount,workingGroup = tmpItem - # convert cloud and processingType for extended process group - if useMorePG == ProcessGroups.extensionLevel_1: - # extension level 1 - cloud,processingType = ProcessGroups.converCPTforEPG(origCloud,origProcessingType, - coreCount) - else: - # extension level 2 - cloud,processingType = ProcessGroups.converCPTforEPG(origCloud,origProcessingType, - coreCount,workingGroup) - # add cloud - if not ret.has_key(cloud): - ret[cloud] = {} - # get process group - processGroup = ProcessGroups.getProcessGroup(processingType) - # add process group - if not ret[cloud].has_key(processGroup): - ret[cloud][processGroup] = {} - # add max priority - prioKey = 'highestPrio' - nNotRunKey = 'nNotRun' - getNumber = False - if not ret[cloud][processGroup].has_key(prioKey): - ret[cloud][processGroup][prioKey] = maxPriority - ret[cloud][processGroup][nNotRunKey] = 0 - getNumber = True - else: - # use highest one - if ret[cloud][processGroup][prioKey] < maxPriority: - ret[cloud][processGroup][prioKey] = maxPriority - # reset - ret[cloud][processGroup][nNotRunKey] = 0 - getNumber = True - elif ret[cloud][processGroup][prioKey] == maxPriority: - getNumber = True - # get number of jobs with highest prio - if getNumber: - varMap[':cloud'] = origCloud - varMap[':currentPriority'] = maxPriority - varMap[':processingType'] = origProcessingType - if useMorePG != False: - varMap[':workingGroup'] = workingGroup - if coreCount != None: - varMap[':coreCount'] = coreCount - self.cur.arraysize = 10 - _logger.debug((sqlC+comment) % table+str(varMap)) - self.cur.execute((sqlC+comment) % table, varMap) - resC = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - ret[cloud][processGroup][nNotRunKey] += resC[0] - # return - _logger.debug("getHighestPrioJobStatPerPG -> %s" % ret) - 
return ret - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getHighestPrioJobStatPerPG : %s %s" % (type, value)) - return {} - - - # get queued analysis jobs at a site - def getQueuedAnalJobs(self,site,dn): - comment = ' /* DBProxy.getQueuedAnalJobs */' - _logger.debug("getQueuedAnalJobs(%s,%s)" % (site,dn)) - sql0 = "SELECT COUNT(*),jobStatus FROM %s WHERE " - sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) " - sql0 += "AND computingSite=:computingSite AND prodUserName != :prodUserName " - sql0 += "GROUP BY jobStatus " - tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - nQueued = 0 - nRunning = 0 - # loop over all tables - for table in tables: - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[':prodSourceLabel'] = 'user' - varMap[':computingSite'] = site - varMap[':prodUserName'] = compactDN - if table == 'ATLAS_PANDA.jobsActive4': - varMap[':jobStatus1'] = 'activated' - varMap[':jobStatus2'] = 'running' - else: - varMap[':jobStatus1'] = 'defined' - varMap[':jobStatus2'] = 'assigned' - self.cur.arraysize = 10 - self.cur.execute((sql0+comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # sum - for cnt,jobStatus in res: - if jobStatus == 'running': - nRunning += cnt - else: - nQueued += cnt - # return - return {'queued':nQueued, 'running':nRunning} - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getQueuedAnalJobs : %s %s" % (errType,errValue)) - return {} - - - # get computingSite and destinationSE for a dataset - def getDestSE(self,dsname,fromArch=False): - comment = ' /* DBProxy.getDestSE */' - _logger.debug("getDestSE(%s,%s)" % (dsname,fromArch)) - sql0 = "SELECT /*+ index(tab 
FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock " - if not fromArch: - sql0 += "AND status=:status " - sql0 += "AND rownum=1" - sql1 = "SELECT computingSite,destinationSE FROM %s WHERE PandaID=:PandaID" - actTableList = ['ATLAS_PANDA.jobsActive4'] - if fromArch: - actTableList.append("ATLAS_PANDA.jobsArchived4") - try: - # start transaction - self.conn.begin() - # select - varMap = {} - if not fromArch: - varMap[':status'] = 'transferring' - varMap[':destinationDBlock'] = dsname - self.cur.arraysize = 10 - self.cur.execute(sql0+comment, varMap) - res = self.cur.fetchall() - # get PandaID - pandaID = None - if len(res) != 0: - pandaID = res[0][0] - # get computingSite and destinationSE - destSE = None,None - if pandaID != None: - varMap = {} - varMap[':PandaID'] = pandaID - # loop over all active tables - foundInActive = False - for actTable in actTableList: - self.cur.execute((sql1 % actTable)+comment, varMap) - res = self.cur.fetchall() - if len(res) != 0: - destSE = res[0] - foundInActive = True - break - # look into ARCH table - if not foundInActive: - if fromArch: - sqlA = "SELECT computingSite,destinationSE FROM ATLAS_PANDAARCH.jobsArchived WHERE PandaID=:PandaID " - sqlA += "AND modificationTime>(CURRENT_DATE-30) " - self.cur.execute(sqlA+comment, varMap) - res = self.cur.fetchall() - if len(res) != 0: - destSE = res[0] - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - _logger.debug("getDestSE(%s) : %s" % (dsname,str(destSE))) - return destSE - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getDestSE : %s %s" % (type, value)) - return None,None - - - # get destinationDBlockToken for a dataset - def getDestTokens(self,dsname): - comment = ' /* DBProxy.getDestTokens */' - _logger.debug("getDestTokens(%s)" % dsname) - sql0 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ destinationDBlockToken FROM 
ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND rownum=1" - try: - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[':destinationDBlock'] = dsname - self.cur.arraysize = 10 - self.cur.execute(sql0+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - retToken = None - if len(res) != 0: - retToken = res[0][0] - # convert None to NULL - if retToken == None: - retToken = 'NULL' - # return - _logger.debug("getDestTokens(%s) : %s" % (dsname,retToken)) - return retToken - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getDestTokens : %s %s" % (type, value)) - return None - - - # get the number of job for a user - def getNumberJobsUser(self,dn,workingGroup=None): - comment = ' /* DBProxy.getNumberJobsUser */' - _logger.debug("getNumberJobsUsers(%s,%s)" % (dn,workingGroup)) - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - if workingGroup != None: - sql0 = "SELECT COUNT(*) FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel AND workingGroup=:workingGroup" - else: - sql0 = "SELECT COUNT(*) FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel AND workingGroup IS NULL" - nTry = 1 - nJob = 0 - for iTry in range(nTry): - try: - for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'): - # start transaction - self.conn.begin() - # select - varMap = {} - varMap[':prodUserName'] = compactDN - varMap[':prodSourceLabel'] = 'user' - if workingGroup != None: - varMap[':workingGroup'] = workingGroup - self.cur.arraysize = 10 - self.cur.execute((sql0+comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - if len(res) != 0: - nJob += res[0][0] - # return - _logger.debug("getNumberJobsUsers(%s) : %s" 
% (dn,nJob)) - return nJob - except: - # roll back - self._rollback() - if iTry+1 < nTry: - time.sleep(2) - continue - type, value, traceBack = sys.exc_info() - _logger.error("getNumberJobsUsers : %s %s" % (type, value)) - return 0 - - - # get job statistics for ExtIF - def getJobStatisticsForExtIF(self,sourcetype=None): - comment = ' /* DBProxy.getJobStatisticsForExtIF */' - _logger.debug("getJobStatisticsForExtIF()") - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) - if sourcetype == 'analysis': - sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) GROUP BY jobStatus,cloud" - sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ jobStatus,COUNT(*),cloud FROM %s tab WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " - else: - sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) GROUP BY jobStatus,cloud" - sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ jobStatus,COUNT(*),cloud FROM %s tab WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " - sqlA+= "AND modificationTime>:modificationTime GROUP BY jobStatus,cloud" - # sql for materialized view - sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0) - sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) - ret = {} - try: - for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsDefined4'): - # start transaction - self.conn.begin() - # select - varMap = {} - if sourcetype == 'analysis': - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - else: - varMap[':prodSourceLabel1'] = 'managed' - varMap[':prodSourceLabel2'] = 'rc_test' - if table != 'ATLAS_PANDA.jobsArchived4': - self.cur.arraysize = 10000 - if table == 'ATLAS_PANDA.jobsActive4': - self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', 
varMap) - else: - self.cur.execute((sql0+comment) % table, varMap) - else: - varMap[':modificationTime'] = timeLimit - self.cur.arraysize = 10000 - self.cur.execute((sqlA+comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # change NULL to US for old jobs - newRes = [] - usMap = {} - for jobStatus,count,cloud in res: - if not cloud in ['US','NULL']: - # append since no conversion is required - newRes.append((jobStatus,count,cloud)) - else: - # sum - if not usMap.has_key(jobStatus): - usMap[jobStatus] = 0 - usMap[jobStatus] += count - # append US counts - for jobStatus,count in usMap.iteritems(): - newRes.append((jobStatus,count,'US')) - # create map - for item in newRes: - # add cloud - if not ret.has_key(item[2]): - ret[item[2]] = {} - # this is needed for auto_increment of InnoDB - if not ret[item[2]].has_key(item[0]): - ret[item[2]][item[0]] = item[1] - # return - _logger.debug("getJobStatisticsForExtIF -> %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getJobStatisticsForExtIF : %s %s" % (type, value)) - return {} - - - # get job statistics per processingType - def getJobStatisticsPerProcessingType(self,useMorePG=False): - comment = ' /* DBProxy.getJobStatisticsPerProcessingType */' - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) - _logger.debug("getJobStatisticsPerProcessingType()") - if useMorePG == False: - sqlN = "SELECT jobStatus,COUNT(*),cloud,processingType FROM %s " - sqlN += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) GROUP BY jobStatus,cloud,processingType" - sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ jobStatus,COUNT(*),cloud,processingType FROM %s tab " - sqlA += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>:modificationTime GROUP BY jobStatus,cloud,processingType" - 
else: - sqlN = "SELECT jobStatus,COUNT(*),cloud,processingType,coreCount,workingGroup FROM %s " - sqlN += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " - sqlN += "GROUP BY jobStatus,cloud,processingType,coreCount,workingGroup" - sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ " - sqlA += "jobStatus,COUNT(*),cloud,processingType,coreCount,workingGroup FROM %s tab " - sqlA += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>:modificationTime " - sqlA += "GROUP BY jobStatus,cloud,processingType,coreCount,workingGroup" - # sql for materialized view - sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sqlN) - sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) - ret = {} - try: - for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsDefined4'): - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - # select - varMap = {} - varMap[':prodSourceLabel1'] = 'managed' - varMap[':prodSourceLabel2'] = 'rc_test' - if table == 'ATLAS_PANDA.jobsArchived4': - varMap[':modificationTime'] = timeLimit - self.cur.execute((sqlA+comment) % table, varMap) - else: - if table == 'ATLAS_PANDA.jobsActive4' and useMorePG == False: - self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap) - else: - # use real table since coreCount is unavailable in MatView - self.cur.execute((sqlN+comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for tmpItem in res: - if useMorePG == False: - jobStatus,count,cloud,processingType = tmpItem - else: - jobStatus,count,cloud,processingType,coreCount,workingGroup = tmpItem - # convert cloud and processingType for extended process group - if useMorePG == ProcessGroups.extensionLevel_1: - # extension level 1 - cloud,processingType = 
ProcessGroups.converCPTforEPG(cloud,processingType, - coreCount) - else: - # extension level 2 - cloud,processingType = ProcessGroups.converCPTforEPG(cloud,processingType, - coreCount,workingGroup) - - # add cloud - if not ret.has_key(cloud): - ret[cloud] = {} - # add processingType - if not ret[cloud].has_key(processingType): - ret[cloud][processingType] = {} - # add status - if not ret[cloud][processingType].has_key(jobStatus): - ret[cloud][processingType][jobStatus] = 0 - ret[cloud][processingType][jobStatus] += count - # return - _logger.debug("getJobStatisticsPerProcessingType -> %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getJobStatisticsPerProcessingType : %s %s" % (type, value)) - return {} - - - # get the number of waiting jobs per site and user - def getJobStatisticsPerUserSite(self): - comment = ' /* DBProxy.getJobStatisticsPerUserSite */' - _logger.debug("getJobStatisticsPerUserSite()") - sqlN = "SELECT COUNT(*),prodUserID,computingSite FROM %s " - sqlN += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus=:jobStatus GROUP BY prodUserID,computingSite" - ret = {} - try: - for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'): - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 100000 - # select - if table == 'ATLAS_PANDA.jobsActive4': - jobStatus = 'activated' - else: - jobStatus = 'assigned' - varMap = {} - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - varMap[':jobStatus'] = jobStatus - self.cur.execute((sqlN+comment) % table, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for cnt,prodUserName,computingSite in res: - # add site - if not ret.has_key(computingSite): - ret[computingSite] = {} - # add user - if not ret[computingSite].has_key(prodUserName): - 
ret[computingSite][prodUserName] = {'assigned':0,'activated':0} - # add info - ret[computingSite][prodUserName][jobStatus] = cnt - # return - _logger.debug("getJobStatisticsPerUserSite -> %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - # error - errtype,errvalue = sys.exc_info()[:2] - _logger.error("getJobStatisticsPerUserSite : %s %s" % (errtype,errvalue)) - return {} - - - # get number of analysis jobs per user - def getNUserJobs(self,siteName,nJobs): - comment = ' /* DBProxy.getNUserJobs */' - _logger.debug("getNUserJobs(%s)" % siteName) - sql0 = "SELECT * FROM (SELECT prodUserID FROM ATLAS_PANDA.jobsActive4 " - sql0 += "WHERE jobStatus=:jobStatus AND prodSourceLabel in (:prodSourceLabel1,:prodSourceLabel2) " - sql0 += "AND computingSite=:computingSite ORDER BY currentPriority DESC) WHERE rownum<=:nJobs" - varMap = {} - varMap[':computingSite'] = siteName - varMap[':nJobs'] = nJobs - varMap[':jobStatus'] = 'activated' - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - ret = {} - try: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - self.cur.execute(sql0+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for prodUserID, in res: - if not ret.has_key(prodUserID): - ret[prodUserID] = 0 - ret[prodUserID] += 1 - # return - _logger.debug("getNUserJobs() : %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getNUserJobs : %s %s" % (type, value)) - return {} - - - # get number of activated analysis jobs - def getNAnalysisJobs(self,nProcesses): - comment = ' /* DBProxy.getNAnalysisJobs */' - _logger.debug("getNAnalysisJobs(%s)" % nProcesses) - sql0 = "SELECT computingSite,COUNT(*) FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus " - sql0 += "AND (prodSourceLabel=:prodSourceLabel1 OR 
prodSourceLabel=:prodSourceLabel2) GROUP BY computingSite" - varMap = {} - varMap[':jobStatus'] = 'activated' - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - ret = {} - try: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - self.cur.execute(sql0+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # create map - for item in res: - ret[item[0]] = float(item[1])/nProcesses - # return - _logger.debug("getNAnalysisJobs() : %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getNAnalysisJobs : %s %s" % (type, value)) - return {} - - - # generate pilot token - def genPilotToken(self,schedulerhost,scheduleruser,schedulerid): - comment = ' /* DBProxy.genPilotToken */' - try: - _logger.debug("genPilotToken(%s,%s,%s)" % (schedulerhost,scheduleruser,schedulerid)) - token = commands.getoutput('uuidgen') - timeNow = datetime.datetime.utcnow() - timeExp = timeNow + datetime.timedelta(days=4) - sql = "INSERT INTO ATLAS_PANDA.pilottoken (token,schedulerhost,scheduleruser,schedulerid,created,expires) " - sql += "VALUES (:token,:schedulerhost,:scheduleruser,:schedulerid,:created,:expires)" - # start transaction - self.conn.begin() - # execute - varMap = {':token':token,':schedulerhost':schedulerhost,':scheduleruser':scheduleruser, - ':schedulerid':schedulerid,':created':timeNow,':expires':timeExp} - self.cur.execute(sql+comment,varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - retVal = "token=%s,created=%s,expires=%s" % (token,timeNow.strftime('%Y-%m-%d %H:%M:%S'), - timeExp.strftime('%Y-%m-%d %H:%M:%S')) - _logger.debug("genPilotToken -> %s" % retVal) - return retVal - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("genPilotToken : %s %s" % (type, value)) - return 
None - - - # get list of scheduler users - def getListSchedUsers(self): - comment = ' /* DBProxy.getListSchedUsers */' - try: - _logger.debug("getListSchedUsers") - sql = "SELECT token,scheduleruser FROM ATLAS_PANDA.pilottoken WHERE expires>CURRENT_DATE" - # start transaction - self.conn.begin() - # execute - self.cur.arraysize = 100 - self.cur.execute(sql+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - retVal = {} - for token,scheduleruser in res: - retVal[token] = scheduleruser - _logger.debug("getListSchedUsers->%s" % str(retVal)) - return retVal - except: - # roll back - self._rollback() - # error - type, value, traceBack = sys.exc_info() - _logger.error("getListSchedUsers : %s %s" % (type, value)) - return {} - - - ########################################################################### - # - # LogDBProxy stuff - - # update site data - def updateSiteData(self,hostID,pilotRequests): - comment = ' /* DBProxy.updateSiteData */' - _logger.debug("updateSiteData start") - sqlDel = "DELETE FROM ATLAS_PANDAMETA.SiteData WHERE HOURS=:HOURS AND LASTMOD<:LASTMOD" - sqlCh = "SELECT count(*) FROM ATLAS_PANDAMETA.SiteData WHERE FLAG=:FLAG AND HOURS=:HOURS AND SITE=:SITE" - sqlIn = "INSERT INTO ATLAS_PANDAMETA.SiteData (SITE,FLAG,HOURS,GETJOB,UPDATEJOB,LASTMOD," - sqlIn += "NSTART,FINISHED,FAILED,DEFINED,ASSIGNED,WAITING,ACTIVATED,HOLDING,RUNNING,TRANSFERRING) " - sqlIn += "VALUES (:SITE,:FLAG,:HOURS,:GETJOB,:UPDATEJOB,CURRENT_DATE," - sqlIn += "0,0,0,0,0,0,0,0,0,0)" - sqlUp = "UPDATE ATLAS_PANDAMETA.SiteData SET GETJOB=:GETJOB,UPDATEJOB=:UPDATEJOB,LASTMOD=CURRENT_DATE " - sqlUp += "WHERE FLAG=:FLAG AND HOURS=:HOURS AND SITE=:SITE" - sqlAll = "SELECT getJob,updateJob,FLAG FROM ATLAS_PANDAMETA.SiteData WHERE HOURS=:HOURS AND SITE=:SITE" - try: - # delete old records - varMap = {} - varMap[':HOURS'] = 3 - varMap[':LASTMOD'] = datetime.datetime.utcnow()-datetime.timedelta(hours=varMap[':HOURS']) - 
self.conn.begin() - self.cur.execute(sqlDel+comment,varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # shuffle to avoid concatenation - tmpSiteList = pilotRequests.keys() - random.shuffle(tmpSiteList) - # loop over all sites - for tmpSite in tmpSiteList: - tmpVal = pilotRequests[tmpSite] - # start transaction - self.conn.begin() - # check individual host info first - varMap = {} - varMap[':FLAG'] = hostID - varMap[':SITE'] = tmpSite - varMap[':HOURS'] = 3 - self.cur.arraysize = 10 - self.cur.execute(sqlCh+comment,varMap) - res = self.cur.fetchone() - # row exists or not - if res[0] == 0: - sql = sqlIn - else: - sql = sqlUp - if tmpVal.has_key('getJob'): - varMap[':GETJOB'] = len(tmpVal['getJob']) - else: - varMap[':GETJOB'] = 0 - if tmpVal.has_key('updateJob'): - varMap[':UPDATEJOB'] = len(tmpVal['updateJob']) - else: - varMap[':UPDATEJOB'] = 0 - # update - self.cur.execute(sql+comment,varMap) - # get all info - sumExist = False - varMap = {} - varMap[':SITE'] = tmpSite - varMap[':HOURS'] = 3 - self.cur.arraysize = 100 - self.cur.execute(sqlAll+comment,varMap) - res = self.cur.fetchall() - # get total getJob/updateJob - varMap[':GETJOB'] = 0 - varMap[':UPDATEJOB'] = 0 - nCol = 0 - for tmpGetJob,tmpUpdateJob,tmpFlag in res: - # don't use summed info - if tmpFlag == 'production': - sumExist = True - continue - if tmpFlag == 'analysis': - if tmpSite.startswith('ANALY_'): - sumExist = True - continue - if tmpFlag in ['test']: - continue - # sum - varMap[':GETJOB'] += tmpGetJob - varMap[':UPDATEJOB'] += tmpUpdateJob - nCol += 1 - # get average - if nCol != 0: - if varMap[':GETJOB'] >= nCol: - varMap[':GETJOB'] /= nCol - if varMap[':UPDATEJOB'] >= nCol: - varMap[':UPDATEJOB'] /= nCol - if tmpSite.startswith('ANALY_'): - varMap[':FLAG'] = 'analysis' - else: - varMap[':FLAG'] = 'production' - # row exists or not - if sumExist: - sql = sqlUp - else: - sql = sqlIn - # update - self.cur.execute(sql+comment,varMap) - 
_logger.debug('updateSiteData : %s getJob=%s updateJob=%s' % \ - (tmpSite,varMap[':GETJOB'],varMap[':UPDATEJOB'])) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("updateSiteData done") - return True - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("updateSiteData : %s %s" % (type,value)) - return False - - - # get site data - def getCurrentSiteData(self): - comment = ' /* DBProxy.getCurrentSiteData */' - _logger.debug("getCurrentSiteData") - sql = "SELECT SITE,getJob,updateJob,FLAG FROM ATLAS_PANDAMETA.SiteData WHERE FLAG IN (:FLAG1,:FLAG2) and HOURS=3" - varMap = {} - varMap[':FLAG1'] = 'production' - varMap[':FLAG2'] = 'analysis' - try: - # set autocommit on - self.conn.begin() - # select - self.cur.arraysize = 10000 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - ret = {} - for site,getJob,updateJob,flag in res: - if site.startswith('ANALY_'): - if flag != 'analysis': - continue - else: - if flag != 'production': - continue - ret[site] = {'getJob':getJob,'updateJob':updateJob} - return ret - except: - type, value, traceBack = sys.exc_info() - _logger.error("getCurrentSiteData : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # insert nRunning in site data - def insertnRunningInSiteData(self): - comment = ' /* DBProxy.insertnRunningInSiteData */' - _logger.debug("insertnRunningInSiteData start") - sqlDel = "DELETE FROM ATLAS_PANDAMETA.SiteData WHERE FLAG IN (:FLAG1,:FLAG2) AND LASTMOD= nSiteRow: - continue - tmpIdx += 1 - if usingGroup: - workingGroup = tmpItem[tmpIdx] - tmpIdx += 1 - else: - workingGroup = None - if usingType: - processingType = tmpItem[tmpIdx] - tmpIdx += 1 - # get process group - processGroup = ProcessGroups.getProcessGroup(processingType) - else: - processingType = None - processGroup = None - if usingPrio: - currentPriority = 
tmpItem[tmpIdx] - tmpIdx += 1 - else: - currentPriority = None - cnt = tmpItem[tmpIdx] - tmpIdx += 1 - maxPriority = tmpItem[tmpIdx] - # append processingType list - if not processGroupInQueueMap.has_key(processGroup): - processGroupInQueueMap[processGroup] = [] - if not processingType in processGroupInQueueMap[processGroup]: - processGroupInQueueMap[processGroup].append(processingType) - # count the number of jobs for each policy - for tmpShareDef in shareDefList: - policyName = tmpShareDef['policy']['name'] - # use different list based on usage of priority - if tmpShareDef['policy']['priority'] == None: - groupInDefList = self.faresharePolicy[siteName]['groupList'] - typeInDefList = self.faresharePolicy[siteName]['typeList'][tmpShareDef['policy']['group']] - else: - groupInDefList = self.faresharePolicy[siteName]['groupListWithPrio'] - typeInDefList = self.faresharePolicy[siteName]['typeListWithPrio'][tmpShareDef['policy']['group']] - # check working group - if usingGroup: - if tmpShareDef['policy']['group'] == None: - # catchall doesn't contain WGs used by other policies - if workingGroup != None and workingGroup in groupInDefList: - continue - # check for wildcard - toBeSkippedFlag = False - for tmpPattern in groupInDefList: - if '*' in tmpPattern: - tmpPattern = '^' + tmpPattern.replace('*','.*') + '$' - # don't use WG if it is included in other policies - if re.search(tmpPattern,workingGroup) != None: - toBeSkippedFlag = True - break - if toBeSkippedFlag: - continue - else: - # needs to be matched if it is specified in the policy - if '*' in tmpShareDef['policy']['group']: - # using wild card - tmpPattern = '^' + tmpShareDef['policy']['group'].replace('*','.*') + '$' - if re.search(tmpPattern,workingGroup) == None: - continue - else: - if tmpShareDef['policy']['group'] != workingGroup: - continue - # collect real WGs per defined WG mainly for wildcard - if not workingGroupInQueueMap.has_key(tmpShareDef['policy']['group']): - 
workingGroupInQueueMap[tmpShareDef['policy']['group']] = [] - if not workingGroup in workingGroupInQueueMap[tmpShareDef['policy']['group']]: - workingGroupInQueueMap[tmpShareDef['policy']['group']].append(workingGroup) - # check processingType - if usingType: - if tmpShareDef['policy']['type'] == None: - # catchall doesn't contain processGroups used by other policies - if processGroup != None and processGroup in typeInDefList: - continue - else: - # needs to be matched if it is specified in the policy - if tmpShareDef['policy']['type'] != processGroup: - continue - # check priority - if usingPrio: - if currentPriority != None and tmpShareDef['policy']['priority'] != None: - if tmpShareDef['policy']['prioCondition'] == '>': - if currentPriority <= tmpShareDef['policy']['priority']: - continue - elif tmpShareDef['policy']['prioCondition'] == '>=': - if currentPriority < tmpShareDef['policy']['priority']: - continue - elif tmpShareDef['policy']['prioCondition'] == '<=': - if currentPriority > tmpShareDef['policy']['priority']: - continue - elif tmpShareDef['policy']['prioCondition'] == '<': - if currentPriority >= tmpShareDef['policy']['priority']: - continue - # append job status - if not tmpShareDef['count'].has_key(jobStatus): - tmpShareDef['count'][jobStatus] = 0 - # sum - tmpShareDef['count'][jobStatus] += cnt - # max priority - if not tmpShareDef['maxprio'].has_key(jobStatus): - tmpShareDef['maxprio'][jobStatus] = maxPriority - elif tmpShareDef['maxprio'][jobStatus] < maxPriority: - tmpShareDef['maxprio'][jobStatus] = maxPriority - # loop over all policies to calcurate total number of running jobs and total share - totalRunning = 0 - shareMap = {} - msgShare = 'share->' - msgShareMap = {} - totalShareNonGP = 0 - totalRunningNonGP = 0 - totalActiveShareNonGP = 0 - for tmpShareDef in shareDefList: - tmpNumMap = tmpShareDef['count'] - policyName = tmpShareDef['policy']['name'] - # policies with priorities are used only to limit the numer of jobs - if 
tmpShareDef['policy']['priority'] != None: - continue - # the number of activated jobs - if not tmpNumMap.has_key('activated') or tmpNumMap['activated'] == 0: - tmpNumActivated = 0 - else: - tmpNumActivated = tmpNumMap['activated'] - # get share, removing % - tmpShareValue = tmpShareDef['policy']['share'][:-1] - tmpShareValue = int(tmpShareValue) - # get the number of runnig - if not tmpNumMap.has_key('running'): - tmpNumRunning = 0 - else: - tmpNumRunning = tmpNumMap['running'] - # debug message for share - msgShareMap[policyName] = '%s:activated=%s:running=%s' % (policyName,tmpNumActivated,tmpNumRunning) - # get total share and total number of running jobs for non-GP - if tmpShareDef['policy']['group'] == None: - totalShareNonGP += tmpShareValue - totalRunningNonGP += tmpNumRunning - # get total share for active non-GP - if tmpNumActivated != 0: - totalActiveShareNonGP += tmpShareValue - # sum - totalRunning += tmpNumRunning - # not use the policy if no activated jobs - if tmpNumActivated == 0: - continue - # max priority - maxPriority = 0 - if tmpShareDef['maxprio'].has_key('activated'): - maxPriority = tmpShareDef['maxprio']['activated'] - # append - shareMap[policyName] = { - 'share':tmpShareValue, - 'running':tmpNumRunning, - 'policy':tmpShareDef['policy'], - 'maxprio':maxPriority, - } - # re-normalize when some non-GP policies are inactive - if totalShareNonGP != totalActiveShareNonGP and totalActiveShareNonGP != 0: - for policyName,tmpVarMap in shareMap.iteritems(): - # essentially non-GP share is multiplied by totalShareNonGP/totalActiveShareNonGP - if tmpVarMap['policy']['group'] == None: - tmpVarMap['share'] *= totalShareNonGP - else: - tmpVarMap['share'] *= totalActiveShareNonGP - # make message with share info - for policyName in msgShareMap.keys(): - if shareMap.has_key(policyName): - msgShare += '%s:share=%s,' % (msgShareMap[policyName],shareMap[policyName]['share']) - else: - msgShare += '%s:share=0,' % msgShareMap[policyName] - # get total share - 
totalShare = 0 - for policyName,tmpVarMap in shareMap.iteritems(): - totalShare += tmpVarMap['share'] - msgShare = msgShare[:-1] - # loop over all policies to check if the priority constraint should be activated - prioToBeImposed = [] - msgPrio = '' - if usingPrio: - msgPrio += 'prio->' - for tmpShareDef in shareDefList: - tmpNumMap = tmpShareDef['count'] - policyName = tmpShareDef['policy']['name'] - # only policies with priorities are used to limit the numer of jobs - if tmpShareDef['policy']['priority'] == None: - continue - # get the number of runnig - if not tmpNumMap.has_key('running'): - tmpNumRunning = 0 - else: - tmpNumRunning = tmpNumMap['running'] - # the number of activated jobs - if not tmpNumMap.has_key('activated') or tmpNumMap['activated'] == 0: - tmpNumActivated = 0 - else: - tmpNumActivated = tmpNumMap['activated'] - # get limit - tmpLimitValue = tmpShareDef['policy']['share'] - # check if more jobs are running than the limit - toBeImposed = False - if tmpLimitValue.endswith('%'): - # percentage based - tmpLimitValue = tmpLimitValue[:-1] - if float(tmpNumRunning) > float(totalRunning) * float(tmpLimitValue) / 100.0: - toBeImposed = True - # debug message for prio - msgPrio += '%s:total=%s:running=%s:impose=%s,' % (policyName,totalRunning,tmpNumRunning,toBeImposed) - else: - # number based - if tmpNumRunning > int(tmpLimitValue): - toBeImposed = True - # debug message for prio - msgPrio += '%s:running=%s:impose=%s,' % (policyName,tmpNumRunning,toBeImposed) - # append - if toBeImposed: - prioToBeImposed.append(tmpShareDef['policy']) - msgPrio = msgPrio[:-1] - # no activated - if shareMap == {}: - _logger.debug("getCriteriaForProdShare %s : ret=None - no activated" % siteName) - return retForNone - # no running - if totalRunning == 0: - _logger.debug("getCriteriaForProdShare %s : ret=None - no running" % siteName) - return retForNone - # zero share - if totalShare == 0: - _logger.debug("getCriteriaForProdShare %s : ret=None - zero share" % siteName) 
- return retForNone - # select the group where share most diverges from the definition - lowestShareRatio = None - lowestSharePolicy = None - for policyName,tmpVarMap in shareMap.iteritems(): - # ignore zero share - if tmpVarMap['share'] == 0: - continue - tmpShareDef = float(tmpVarMap['share']) / float(totalShare) - tmpShareNow = float(tmpVarMap['running']) / float(totalRunning) - tmpShareRatio = tmpShareNow / tmpShareDef - # take max priority into account for cloud share - if usingCloud != '': - # skip over share - if tmpShareNow > tmpShareDef: - continue - tmpShareRatio /= float(1000 + tmpVarMap['maxprio']) - if lowestShareRatio == None or lowestShareRatio > tmpShareRatio: - lowestShareRatio = tmpShareRatio - lowestSharePolicy = policyName - # make criteria - retVarMap = {} - retStr = '' - if lowestSharePolicy != None: - tmpShareDef = shareMap[lowestSharePolicy]['policy'] - # working group - if tmpShareDef['group'] == None: - groupInDefList = self.faresharePolicy[siteName]['groupList'] - # catch all except WGs used by other policies - if groupInDefList != []: - groupUsedInClause = [] - tmpIdx = 0 - # use real name of workingGroup - for tmpGroupIdx in groupInDefList: - if not workingGroupInQueueMap.has_key(tmpGroupIdx): - continue - for tmpGroup in workingGroupInQueueMap[tmpGroupIdx]: - if tmpGroup in groupUsedInClause: - continue - # add AND at the first WG - if groupUsedInClause == []: - retStr += 'AND workingGroup NOT IN (' - # add WG - tmpKey = ':shareWG%s' % tmpIdx - retVarMap[tmpKey] = tmpGroup - retStr += '%s,' % tmpKey - tmpIdx += 1 - # append - groupUsedInClause.append(tmpGroup) - if groupUsedInClause != []: - retStr = retStr[:-1] - retStr += ') ' - else: - # match with one WG - if workingGroupInQueueMap.has_key(tmpShareDef['group']): - groupUsedInClause = [] - tmpIdx = 0 - # use real name of workingGroup - for tmpGroup in workingGroupInQueueMap[tmpShareDef['group']]: - if tmpGroup in groupUsedInClause: - continue - # add AND at the first WG - if 
groupUsedInClause == []: - retStr += 'AND workingGroup IN (' - # add WG - tmpKey = ':shareWG%s' % tmpIdx - retVarMap[tmpKey] = tmpGroup - retStr += '%s,' % tmpKey - tmpIdx += 1 - # append - groupUsedInClause.append(tmpGroup) - if groupUsedInClause != []: - retStr = retStr[:-1] - retStr += ') ' - # processing type - if tmpShareDef['type'] == None: - typeInDefList = self.faresharePolicy[siteName]['typeList'][tmpShareDef['group']] - # catch all except WGs used by other policies - if typeInDefList != []: - # get the list of processingTypes from the list of processGroups - retVarMapP = {} - retStrP = 'AND processingType NOT IN (' - tmpIdx = 0 - for tmpTypeGroup in typeInDefList: - if processGroupInQueueMap.has_key(tmpTypeGroup): - for tmpType in processGroupInQueueMap[tmpTypeGroup]: - tmpKey = ':sharePT%s' % tmpIdx - retVarMapP[tmpKey] = tmpType - retStrP += '%s,' % tmpKey - tmpIdx += 1 - retStrP = retStrP[:-1] - retStrP += ') ' - # copy - if retVarMapP != {}: - retStr += retStrP - for tmpKey,tmpType in retVarMapP.iteritems(): - retVarMap[tmpKey] = tmpType - else: - # match with one processingGroup - if processGroupInQueueMap.has_key(tmpShareDef['type']) and processGroupInQueueMap[tmpShareDef['type']] != []: - retStr += 'AND processingType IN (' - tmpIdx = 0 - for tmpType in processGroupInQueueMap[tmpShareDef['type']]: - tmpKey = ':sharePT%s' % tmpIdx - retVarMap[tmpKey] = tmpType - retStr += '%s,' % tmpKey - tmpIdx += 1 - retStr = retStr[:-1] - retStr += ') ' - # priority - tmpIdx = 0 - for tmpDefItem in prioToBeImposed: - if tmpDefItem['group'] in [None,tmpShareDef['group']] and \ - tmpDefItem['type'] in [None,tmpShareDef['type']]: - if tmpDefItem['prioCondition'] == '>': - retStrP = '<=' - elif tmpDefItem['prioCondition'] == '>=': - retStrP = '<' - elif tmpDefItem['prioCondition'] == '<=': - retStrP = '>' - elif tmpDefItem['prioCondition'] == '<': - retStrP = '>=' - else: - continue - tmpKey = ':sharePrio%s' % tmpIdx - retVarMap[tmpKey] = tmpDefItem['priority'] - 
retStr += ('AND currentPriority%s%s' % (retStrP,tmpKey)) - tmpIdx += 1 - _logger.debug("getCriteriaForProdShare %s : sql='%s' var=%s cloud=%s %s %s" % \ - (siteName,retStr,str(retVarMap),usingCloud,msgShare,msgPrio)) - # append criteria for test jobs - if retStr != '': - retVarMap[':shareLabel1'] = 'managed' - retVarMap[':shareLabel2'] = 'test' - retVarMap[':shareLabel3'] = 'prod_test' - retVarMap[':shareLabel4'] = 'install' - retStr = 'AND (prodSourceLabel IN (:shareLabel2,:shareLabel3,:shareLabel4) OR (prodSourceLabel=:shareLabel1 ' + retStr + '))' - return retStr,retVarMap - except: - errtype,errvalue = sys.exc_info()[:2] - _logger.error("getCriteriaForProdShare %s : %s %s" % (siteName,errtype,errvalue)) - # roll back - self._rollback() - return retForNone - - - # get beyond pledge resource ratio - def getPledgeResourceRatio(self): - comment = ' /* DBProxy.getPledgeResourceRatio */' - # check utime - if self.updateTimeForPledgeRatio != None and (datetime.datetime.utcnow()-self.updateTimeForPledgeRatio) < datetime.timedelta(hours=3): - return - # update utime - self.updateTimeForPledgeRatio = datetime.datetime.utcnow() - _logger.debug("getPledgeResourceRatio") - try: - # set autocommit on - self.conn.begin() - # select - sql = "SELECT siteid,countryGroup,availableCPU,availableStorage,pledgedCPU,pledgedStorage " - sql += "FROM ATLAS_PANDAMETA.schedconfig WHERE countryGroup IS NOT NULL AND siteid LIKE 'ANALY_%' " - self.cur.arraysize = 100000 - self.cur.execute(sql+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # update ratio - self.beyondPledgeRatio = {} - if res != None and len(res) != 0: - for siteid,countryGroup,tmp_availableCPU,tmp_availableStorage,tmp_pledgedCPU,tmp_pledgedStorage in res: - # ignore when countryGroup is undefined - if countryGroup in ['',None]: - continue - # append - self.beyondPledgeRatio[siteid] = {} - self.beyondPledgeRatio[siteid]['countryGroup'] = countryGroup - # convert 
to float - try: - availableCPU = float(tmp_availableCPU) - except: - availableCPU = 0 - try: - pledgedCPU = float(tmp_pledgedCPU) - except: - pledgedCPU = 0 - # calculate ratio - if availableCPU == 0 or pledgedCPU == 0: - # set 0% when CPU ratio is undefined - self.beyondPledgeRatio[siteid]['ratio'] = 0 - else: - # ratio = (availableCPU-pledgedCPU)/availableCPU*(1-storageTerm) - self.beyondPledgeRatio[siteid]['ratio'] = (availableCPU-pledgedCPU)/availableCPU - _logger.debug("getPledgeResourceRatio -> %s" % str(self.beyondPledgeRatio)) - return - except: - errtype,errvalue = sys.exc_info()[:2] - _logger.error("getPledgeResourceRatio : %s %s" % (errtype,errvalue)) - # roll back - self._rollback() - return - - - # get fareshare policy - def getFaresharePolicy(self,getNewMap=False): - comment = ' /* DBProxy.getFaresharePolicy */' - # check utime - if not getNewMap and self.updateTimeForFaresharePolicy != None and \ - (datetime.datetime.utcnow()-self.updateTimeForFaresharePolicy) < datetime.timedelta(hours=3): - return - if not getNewMap: - # update utime - self.updateTimeForFaresharePolicy = datetime.datetime.utcnow() - _logger.debug("getFaresharePolicy") - try: - # set autocommit on - self.conn.begin() - # get default share - cloudShareMap = {} - cloudTier1Map = {} - sqlD = "SELECT name,fairshare,tier1 FROM ATLAS_PANDAMETA.cloudconfig" - self.cur.arraysize = 100000 - self.cur.execute(sqlD+comment) - res = self.cur.fetchall() - for cloudName,cloudShare,cloudTier1 in res: - try: - cloudTier1Map[cloudName] = cloudTier1.split(',') - except: - pass - if not cloudShare in ['',None]: - cloudShareMap[cloudName] = cloudShare - # get share per site - sql = "SELECT siteid,fairsharePolicy,cloud " - sql += "FROM ATLAS_PANDAMETA.schedconfig WHERE NOT siteid LIKE 'ANALY_%' GROUP BY siteid,fairsharePolicy,cloud" - self.cur.execute(sql+comment) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # update policy - faresharePolicy = {} 
- for siteid,faresharePolicyStr,cloudName in res: - try: - # share is undefined - usingCloudShare = '' - if faresharePolicyStr in ['',None]: - # skip if share is not defined at site or cloud - if not cloudShareMap.has_key(cloudName): - continue - # skip if T1 doesn't define share - if cloudTier1Map.has_key(cloudName) and siteid in cloudTier1Map[cloudName]: - continue - # use cloud share - faresharePolicyStr = cloudShareMap[cloudName] - usingCloudShare = cloudName - # decompose - hasNonPrioPolicy = False - for tmpItem in faresharePolicyStr.split(','): - # skip empty - tmpItem = tmpItem.strip() - if tmpItem == '': - continue - # keep name - tmpPolicy = {'name':tmpItem} - # group - tmpPolicy['group'] = None - tmpMatch = re.search('group=([^:]+)',tmpItem) - if tmpMatch != None: - if tmpMatch.group(1) in ['','central','*','any']: - # use None for catchall - pass - else: - tmpPolicy['group'] = tmpMatch.group(1) - # type - tmpPolicy['type'] = None - tmpMatch = re.search('type=([^:]+)',tmpItem) - if tmpMatch != None: - if tmpMatch.group(1) in ['*','any']: - # use None for catchall - pass - else: - tmpPolicy['type'] = tmpMatch.group(1) - # priority - tmpPolicy['priority'] = None - tmpPolicy['prioCondition'] = None - tmpMatch = re.search('priority([=<>]+)(\d+)',tmpItem) - if tmpMatch != None: - tmpPolicy['priority'] = int(tmpMatch.group(2)) - tmpPolicy['prioCondition'] = tmpMatch.group(1) - else: - hasNonPrioPolicy = True - # share - tmpPolicy['share'] = tmpItem.split(':')[-1] - # append - if not faresharePolicy.has_key(siteid): - faresharePolicy[siteid] = {'policyList':[]} - faresharePolicy[siteid]['policyList'].append(tmpPolicy) - # add any:any if only priority policies - if not hasNonPrioPolicy: - tmpPolicy = {'name' : 'type=any', - 'group' : None, - 'type' : None, - 'priority' : None, - 'prioCondition' : None, - 'share' : '100%'} - faresharePolicy[siteid]['policyList'].append(tmpPolicy) - # some translation - faresharePolicy[siteid]['usingGroup'] = False - 
faresharePolicy[siteid]['usingType'] = False - faresharePolicy[siteid]['usingPrio'] = False - faresharePolicy[siteid]['usingCloud'] = usingCloudShare - faresharePolicy[siteid]['groupList'] = [] - faresharePolicy[siteid]['typeList'] = {} - faresharePolicy[siteid]['groupListWithPrio'] = [] - faresharePolicy[siteid]['typeListWithPrio'] = {} - for tmpDefItem in faresharePolicy[siteid]['policyList']: - # using WG - if tmpDefItem['group'] != None: - faresharePolicy[siteid]['usingGroup'] = True - # using PG - if tmpDefItem['type'] != None: - faresharePolicy[siteid]['usingType'] = True - # using prio - if tmpDefItem['priority'] != None: - faresharePolicy[siteid]['usingPrio'] = True - # get list of WG and PG with/without priority - if tmpDefItem['priority'] == None: - # get list of woringGroups - if tmpDefItem['group'] != None and not tmpDefItem['group'] in faresharePolicy[siteid]['groupList']: - faresharePolicy[siteid]['groupList'].append(tmpDefItem['group']) - # get list of processingGroups - if not faresharePolicy[siteid]['typeList'].has_key(tmpDefItem['group']): - faresharePolicy[siteid]['typeList'][tmpDefItem['group']] = [] - if tmpDefItem['type'] != None and not tmpDefItem['type'] in faresharePolicy[siteid]['typeList'][tmpDefItem['group']]: - faresharePolicy[siteid]['typeList'][tmpDefItem['group']].append(tmpDefItem['type']) - else: - # get list of woringGroups - if tmpDefItem['group'] != None and not tmpDefItem['group'] in faresharePolicy[siteid]['groupListWithPrio']: - faresharePolicy[siteid]['groupListWithPrio'].append(tmpDefItem['group']) - # get list of processingGroups - if not faresharePolicy[siteid]['typeListWithPrio'].has_key(tmpDefItem['group']): - faresharePolicy[siteid]['typeListWithPrio'][tmpDefItem['group']] = [] - if tmpDefItem['type'] != None and not tmpDefItem['type'] in faresharePolicy[siteid]['typeListWithPrio'][tmpDefItem['group']]: - faresharePolicy[siteid]['typeListWithPrio'][tmpDefItem['group']].append(tmpDefItem['type']) - except: - 
errtype,errvalue = sys.exc_info()[:2] - _logger.warning("getFaresharePolicy : wrond definition '%s' for %s : %s %s" % (faresharePolicy,siteid,errtype,errvalue)) - _logger.debug("getFaresharePolicy -> %s" % str(faresharePolicy)) - if not getNewMap: - self.faresharePolicy = faresharePolicy - return - else: - return faresharePolicy - except: - errtype,errvalue = sys.exc_info()[:2] - _logger.error("getFaresharePolicy : %s %s" % (errtype,errvalue)) - # roll back - self._rollback() - if not getNewMap: - return - else: - return {} - - - # get cloud list - def getCloudList(self): - comment = ' /* DBProxy.getCloudList */' - _logger.debug("getCloudList start") - try: - # set autocommit on - self.conn.begin() - # select - sql = "SELECT name,tier1,tier1SE,relocation,weight,server,status,transtimelo," - sql += "transtimehi,waittime,validation,mcshare,countries,fasttrack,nprestage," - sql += "pilotowners " - sql+= "FROM ATLAS_PANDAMETA.cloudconfig" - self.cur.arraysize = 10000 - self.cur.execute(sql+comment) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - ret = {} - if resList != None and len(resList) != 0: - for res in resList: - # change None to '' - resTmp = [] - for tmpItem in res: - if tmpItem == None: - tmpItem = '' - resTmp.append(tmpItem) - name,tier1,tier1SE,relocation,weight,server,status,transtimelo,transtimehi,\ - waittime,validation,mcshare,countries,fasttrack,nprestage,pilotowners = resTmp - # instantiate CloudSpec - tmpC = CloudSpec.CloudSpec() - tmpC.name = name - tmpC.tier1 = tier1 - tmpC.tier1SE = re.sub(' ','',tier1SE).split(',') - tmpC.relocation = relocation - tmpC.weight = weight - tmpC.server = server - tmpC.status = status - tmpC.transtimelo = transtimelo - tmpC.transtimehi = transtimehi - tmpC.waittime = waittime - tmpC.validation = validation - tmpC.mcshare = mcshare - tmpC.countries = countries - tmpC.fasttrack = fasttrack - tmpC.nprestage = nprestage - tmpC.pilotowners = pilotowners - # append 
- ret[name] = tmpC - _logger.debug("getCloudList done") - return ret - except: - type, value, traceBack = sys.exc_info() - _logger.error("getCloudList : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # check sites with release/cache - def checkSitesWithRelease(self,sites,releases,caches,cmtConfig=None): - comment = ' /* DBProxy.checkSitesWithRelease */' - try: - relStr = releases - if releases != None: - relStr = releases.replace('\n',' ') - caStr = caches - if caches != None: - caStr = caches.replace('\n',' ') - _logger.debug("checkSitesWithRelease(%s,%s,%s,%s)" % (sites,relStr,caStr,cmtConfig)) - # select - sql = "SELECT distinct siteid FROM ATLAS_PANDAMETA.InstalledSW WHERE " - loopKey2 = None - loopValues2 = [] - if not caches in ['','NULL',None]: - loopKey = ':cache' - loopValues = caches.split('\n') - sql += "cache=:cache " - if not releases in ['','NULL',None]: - loopKey2 = ':release' - loopValues2 = releases.split('\n') - sql += "AND release=:release " - elif not releases in ['','NULL',None]: - loopKey = ':release' - loopValues = releases.split('\n') - sql += "release=:release AND cache='None' " - else: - # don't check - return sites - checkCMT = False - if not cmtConfig in ['','NULL',None]: - sql += "AND cmtConfig=:cmtConfig " - checkCMT = True - sql += "AND siteid IN (" - # start transaction - self.conn.begin() - self.cur.arraysize = 1000 - # loop over all releases/caches - for loopIdx,loopVal in enumerate(loopValues): - # remove Atlas- - loopVal = re.sub('^Atlas-','',loopVal) - sqlSite = sql - varMap = {} - varMap[loopKey] = loopVal - if loopKey2 != None: - loopVal2 = loopValues2[loopIdx] - loopVal2 = re.sub('^Atlas-','',loopVal2) - varMap[loopKey2] = loopVal2 - if checkCMT: - varMap[':cmtConfig'] = cmtConfig - tmpRetSites = [] - # loop over sites - nSites = 10 - iSite = 0 - for siteIndex,site in enumerate(sites): - iSite += 1 - tmpSiteKey = ':siteid%s' % iSite - varMap[tmpSiteKey] = site - sqlSite += '%s,' % tmpSiteKey - if 
iSite == nSites or (siteIndex+1) == len(sites): - iSite = 0 - # close bracket in SQL - sqlSite = sqlSite[:-1] - sqlSite += ')' - # execute - _logger.debug(sqlSite+comment+str(varMap)) - self.cur.execute(sqlSite+comment, varMap) - resList = self.cur.fetchall() - # collect candidates - if len(resList) > 0: - for tmpSite, in resList: - # append - tmpRetSites.append(tmpSite) - # reset - sqlSite = sql - varMap = {} - varMap[loopKey] = loopVal - if loopKey2 != None: - varMap[loopKey2] = loopVal2 - if checkCMT: - varMap[':cmtConfig'] = cmtConfig - # set - sites = tmpRetSites - # escape - if sites == []: - break - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("checkSitesWithRelease -> %s" % sites) - return sites - except: - # roll back - self._rollback() - type,value,traceBack = sys.exc_info() - _logger.error("checkSitesWithRelease : %s %s" % (type,value)) - return [] - - - # get sites with release/cache in cloud - def getSitesWithReleaseInCloud(self,cloud,releases,caches,validation): - comment = ' /* DBProxy.getSitesWithReleaseInCloud */' - try: - relStr = releases - if releases != None: - relStr = releases.replace('\n',' ') - caStr = caches - if caches != None: - caStr = caches.replace('\n',' ') - _logger.debug("getSitesWithReleaseInCloud(%s,%s,%s,%s)" % (cloud,relStr,caStr,validation)) - # select - sql = "SELECT distinct siteid FROM ATLAS_PANDAMETA.InstalledSW WHERE cloud=:cloud AND " - varMap = {} - varMap[':cloud'] = cloud - if not caches in ['','NULL',None]: - loopKey = ':cache' - loopValues = caches.split('\n') - sql += "cache=:cache " - else: - loopKey = ':release' - loopValues = releases.split('\n') - sql += "release=:release AND cache='None' " - # validation - if validation: - sql += "validation=:validation " - varMap[':validation'] = 'validated' - # start transaction - self.conn.begin() - self.cur.arraysize = 100 - # loop over all releases/caches - retSites = None - for loopVal in loopValues: - # remove Atlas- - loopVal = 
re.sub('^Atlas-','',loopVal) - varMap[loopKey] = loopVal - # execute - _logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # append - tmpRetSites = [] - for tmpItem, in resList: - if retSites == None or (tmpItem in retSites): - tmpRetSites.append(tmpItem) - # set - retSites = tmpRetSites - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - retSites = [] - for tmpItem, in resList: - retSites.append(tmpItem) - _logger.debug("getSitesWithReleaseInCloud -> %s" % retSites) - return retSites - except: - # roll back - self._rollback() - type,value,traceBack = sys.exc_info() - _logger.error("getSitesWithReleaseInCloud : %s %s" % (type,value)) - return [] - - - # get list of cache prefix - def getCachePrefixes(self): - comment = ' /* DBProxy.getCachePrefixes */' - try: - _logger.debug("getCachePrefixes") - # select - sql = "SELECT distinct cache FROM ATLAS_PANDAMETA.installedSW WHERE cache IS NOT NULL" - # start transaction - self.conn.begin() - self.cur.arraysize = 10000 - # execute - self.cur.execute(sql+comment, {}) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - tmpList = [] - for tmpItem, in resList: - match = re.search('^([^-]+)-',tmpItem) - if match != None: - tmpPrefix = match.group(1) - if not tmpPrefix in tmpList: - tmpList.append(tmpPrefix) - _logger.debug("getCachePrefixes -> %s" % tmpList) - return tmpList - except: - # roll back - self._rollback() - type,value,traceBack = sys.exc_info() - _logger.error("getCachePrefixes : %s %s" % (type,value)) - return [] - - - # get pilot owners - def getPilotOwners(self): - comment = ' /* DBProxy.getPilotOwners */' - _logger.debug("getPilotOwners") - try: - # set autocommit on - self.conn.begin() - # select - sql = "SELECT pilotowners FROM ATLAS_PANDAMETA.cloudconfig" - self.cur.arraysize = 100 - self.cur.execute(sql+comment) - resList = self.cur.fetchall() - # 
commit - if not self._commit(): - raise RuntimeError, 'Commit error' - ret = [] - for tmpItem, in resList: - if tmpItem != None: - for tmpOwner in tmpItem.split('|'): - if tmpOwner != '': - ret.append(tmpOwner) - _logger.debug("getPilotOwners -> %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - type,value,traceBack = sys.exc_info() - _logger.error("getPilotOwners : %s %s" % (type,value)) - return [] - - - # get allowed nodes - def getAllowedNodes(self): - comment = ' /* DBProxy.getAllowedNodes */' - _logger.debug("getAllowedNodes") - try: - # set autocommit on - self.conn.begin() - # select - sql = "SELECT siteid,allowedNode FROM ATLAS_PANDAMETA.schedconfig " - sql += "WHERE siteid IS NOT NULL AND allowedNode IS NOT NULL" - self.cur.arraysize = 1000 - self.cur.execute(sql+comment) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - ret = {} - for tmpSiteID,tmpAllowedNode in resList: - if not ret.has_key(tmpSiteID): - ret[tmpSiteID] = tmpAllowedNode.split(',') - _logger.debug("getAllowedNodes -> %s" % str(ret)) - return ret - except: - # roll back - self._rollback() - tmpType,tmpValue = sys.exc_info()[:2] - _logger.error("getAllowedNodes : %s %s" % (tmpType,tmpValue)) - return {} - - - # extract name from DN - def cleanUserID(self, id): - try: - up = re.compile('/(DC|O|OU|C|L)=[^\/]+') - username = up.sub('', id) - up2 = re.compile('/CN=[0-9]+') - username = up2.sub('', username) - up3 = re.compile(' [0-9]+') - username = up3.sub('', username) - up4 = re.compile('_[0-9]+') - username = up4.sub('', username) - username = username.replace('/CN=proxy','') - username = username.replace('/CN=limited proxy','') - username = username.replace('limited proxy','') - username = re.sub('/CN=Robot:[^/]+','',username) - pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)') - mat = pat.match(username) - if mat: - username = mat.group(2) - else: - username = username.replace('/CN=','') - if 
username.lower().find('/email') > 0: - username = username[:username.lower().find('/email')] - pat = re.compile('.*(limited.*proxy).*') - mat = pat.match(username) - if mat: - username = mat.group(1) - username = username.replace('(','') - username = username.replace(')','') - username = username.replace("'",'') - return username - except: - return id - - - # extract scope from dataset name - def extractScope(self,name): - try: - if name.lower().startswith('user') or \ - name.lower().startswith('group'): - # return None if there are not enough fields - if len(name.split('.')) < 2: - return None - return name.lower().split('.')[0] + '.' + name.lower().split('.')[1] - return name.split('.')[0] - except: - return None - - - # check quota - def checkQuota(self,dn): - comment = ' /* DBProxy.checkQuota */' - _logger.debug("checkQuota %s" % dn) - try: - # set autocommit on - self.conn.begin() - # select - name = self.cleanUserID(dn) - sql = "SELECT cpua1,cpua7,cpua30,quotaa1,quotaa7,quotaa30 FROM ATLAS_PANDAMETA.users WHERE name=:name" - varMap = {} - varMap[':name'] = name - self.cur.arraysize = 10 - self.cur.execute(sql+comment,varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - weight = 0.0 - if res != None and len(res) != 0: - item = res[0] - # cpu and quota - cpu1 = item[0] - cpu7 = item[1] - cpu30 = item[2] - if item[3] in [0,None]: - quota1 = 0 - else: - quota1 = item[3] * 3600 - if item[4] in [0,None]: - quota7 = 0 - else: - quota7 = item[4] * 3600 - if item[5] in [0,None]: - quota30 = 0 - else: - quota30 = item[5] * 3600 - # CPU usage - if cpu1 == None: - cpu1 = 0.0 - # weight - if quota1 > 0: - weight = float(cpu1) / float(quota1) - # not exceeded the limit - if weight < 1.0: - weight = 0.0 - _logger.debug("checkQuota %s Weight:%s Quota:%s CPU:%s" % (dn,weight,quota1,cpu1)) - else: - _logger.debug("checkQuota cannot found %s" % dn) - return weight - except: - type, value, traceBack = sys.exc_info() - 
_logger.error("checkQuota : %s %s" % (type,value)) - # roll back - self._rollback() - return 0.0 - - - # get serialize JobID and status - def getUserParameter(self,dn,jobID,jobsetID): - comment = ' /* DBProxy.getUserParameter */' - _logger.debug("getUserParameter %s JobID=%s JobsetID=%s" % (dn,jobID,jobsetID)) - try: - # set initial values - retStatus = True - if jobsetID == -1: - # generate new jobsetID - retJobsetID = jobID - # new jobID = 1 + new jobsetID - retJobID = retJobsetID + 1 - elif jobsetID in ['NULL',None,0]: - # no jobsetID - retJobsetID = None - retJobID = jobID - else: - # user specified jobsetID - retJobsetID = jobsetID - retJobID = jobID - # set autocommit on - self.conn.begin() - # select - name = self.cleanUserID(dn) - sql = "SELECT jobid,status FROM ATLAS_PANDAMETA.users WHERE name=:name " - sql += "FOR UPDATE " - sqlAdd = "INSERT INTO ATLAS_PANDAMETA.users " - sqlAdd += "(ID,NAME,LASTMOD,FIRSTJOB,LATESTJOB,CACHETIME,NCURRENT,JOBID) " - sqlAdd += "VALUES(ATLAS_PANDAMETA.USERS_ID_SEQ.nextval,:name," - sqlAdd += "CURRENT_DATE,CURRENT_DATE,CURRENT_DATE,CURRENT_DATE,0,1) " - varMap = {} - varMap[':name'] = name - self.cur.execute(sql+comment,varMap) - self.cur.arraysize = 10 - res = self.cur.fetchall() - # insert if no record - if res == None or len(res) == 0: - try: - self.cur.execute(sqlAdd+comment,varMap) - retI = self.cur.rowcount - _logger.debug("getUserParameter %s inserted new row with %s" % (dn,retI)) - # emulate DB response - res = [[1,'']] - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("getUserParameter %s failed to insert new row with %s:%s" % (dn,errType,errValue)) - if res != None and len(res) != 0: - item = res[0] - # JobID in DB - dbJobID = item[0] - # check status - if item[1] in ['disabled']: - retStatus = False - # use larger JobID - if dbJobID >= int(retJobID) or (jobsetID == -1 and dbJobID >= int(retJobsetID)): - if jobsetID == -1: - # generate new jobsetID = 1 + exsiting jobID - retJobsetID = dbJobID+1 - # 
new jobID = 1 + new jobsetID - retJobID = retJobsetID + 1 - else: - # new jobID = 1 + exsiting jobID - retJobID = dbJobID+1 - # update DB - varMap = {} - varMap[':name'] = name - varMap[':jobid'] = retJobID - sql = "UPDATE ATLAS_PANDAMETA.users SET jobid=:jobid WHERE name=:name" - self.cur.execute(sql+comment,varMap) - _logger.debug("getUserParameter set JobID=%s for %s" % (retJobID,dn)) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("getUserParameter %s return JobID=%s JobsetID=%s Status=%s" % (dn,retJobID,retJobsetID,retStatus)) - return retJobID,retJobsetID,retStatus - except: - type, value, traceBack = sys.exc_info() - _logger.error("getUserParameter : %s %s" % (type,value)) - # roll back - self._rollback() - return retJobID,retJobsetID,retStatus - - - # get JobID for user - def getJobIdUser(self,dn): - comment = ' /* DBProxy.getJobIdUser */' - _logger.debug("getJobIdUser %s" % dn) - jobID = 0 - try: - # set autocommit on - self.conn.begin() - # select - name = self.cleanUserID(dn) - sql = "SELECT jobid FROM ATLAS_PANDAMETA.users WHERE name=:name" - varMap = {} - varMap[':name'] = name - self.cur.arraysize = 10 - self.cur.execute(sql+comment,varMap) - res = self.cur.fetchone() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if res != None: - jobID, = res - _logger.debug("getJobIdUser %s -> %s" % (name,jobID)) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("getJobIdUser : %s %s" % (errType,errValue)) - # roll back - self._rollback() - return jobID - - - # check ban user - def checkBanUser(self,dn,sourceLabel): - comment = ' /* DBProxy.checkBanUser */' - _logger.debug("checkBanUser %s %s" % (dn,sourceLabel)) - try: - # set initial values - retStatus = True - # set autocommit on - self.conn.begin() - # select - name = self.cleanUserID(dn) - sql = "SELECT status FROM ATLAS_PANDAMETA.users WHERE name=:name" - varMap = {} - varMap[':name'] = name - 
self.cur.execute(sql+comment,varMap) - self.cur.arraysize = 10 - res = self.cur.fetchone() - if res != None: - # check status - tmpStatus, = res - if tmpStatus in ['disabled']: - retStatus = False - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("checkBanUser %s %s Status=%s" % (dn,sourceLabel,retStatus)) - return retStatus - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("checkBanUser %s %s : %s %s" % (dn,sourceLabel,errType,errValue)) - # roll back - self._rollback() - return retStatus - - - # get email address for a user - def getEmailAddr(self,name): - comment = ' /* DBProxy.getEmailAddr */' - _logger.debug("get email for %s" % name) - try: - # set autocommit on - self.conn.begin() - # select - sql = "SELECT email FROM ATLAS_PANDAMETA.users WHERE name=:name" - varMap = {} - varMap[':name'] = name - self.cur.execute(sql+comment,varMap) - self.cur.arraysize = 10 - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if res != None and len(res) != 0: - return res[0][0] - # return empty string - return "" - except: - type, value, traceBack = sys.exc_info() - _logger.error("getEmailAddr : %s %s" % (type,value)) - # roll back - self._rollback() - return "" - - - # get client version - def getPandaClientVer(self): - comment = ' /* DBProxy.getPandaClientVer */' - _logger.debug("getPandaClientVer") - try: - # set autocommit on - self.conn.begin() - # select - sql = "SELECT pathena FROM ATLAS_PANDAMETA.pandaconfig WHERE name=:name" - varMap = {} - varMap[':name'] = 'current' - self.cur.execute(sql+comment,varMap) - self.cur.arraysize = 10 - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - retStr = '' - if res != None and len(res) != 0: - retStr = res[0][0] - _logger.debug("getPandaClientVer -> %s" % retStr) - return retStr - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - 
_logger.error("getPandaClientVer : %s %s" % (type,value)) - return "" - - - # add files to memcached - def addFilesToMemcached(self,site,node,files): - _logger.debug("addFilesToMemcached start %s %s" % (site,node)) - # memcached is unused - if not panda_config.memcached_enable: - _logger.debug("addFilesToMemcached skip %s %s" % (site,node)) - return True - try: - # initialize memcache if needed - if self.memcache == None: - from MemProxy import MemProxy - self.memcache = MemProxy() - # convert string to list - fileList = files.split(',') - # remove '' - try: - fileList.remove('') - except: - pass - # empty list - if len(fileList) == 0: - _logger.debug("addFilesToMemcached skipped for empty list") - return True - # list of siteIDs - siteIDs = site.split(',') - # loop over all siteIDs - for tmpSite in siteIDs: - # add - iFiles = 0 - nFiles = 100 - retS = True - while iFiles < len(fileList): - tmpRetS = self.memcache.setFiles(None,tmpSite,node,fileList[iFiles:iFiles+nFiles]) - if not tmpRetS: - retS = False - iFiles += nFiles - _logger.debug("addFilesToMemcached done %s %s with %s" % (site,node,retS)) - return retS - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("addFilesToMemcached : %s %s" % (errType,errValue)) - return False - - - # delete files from memcached - def deleteFilesFromMemcached(self,site,node,files): - _logger.debug("deleteFilesFromMemcached start %s %s" % (site,node)) - # memcached is unused - if not panda_config.memcached_enable: - _logger.debug("deleteFilesFromMemcached skip %s %s" % (site,node)) - return True - try: - # initialize memcache if needed - if self.memcache == None: - from MemProxy import MemProxy - self.memcache = MemProxy() - # list of siteIDs - siteIDs = site.split(',') - # loop over all siteIDs - for tmpSite in siteIDs: - # delete - self.memcache.deleteFiles(tmpSite,node,files) - _logger.debug("deleteFilesFromMemcached done %s %s" % (site,node)) - return True - except: - errType,errValue = sys.exc_info()[:2] - 
_logger.error("deleteFilesFromMemcached : %s %s" % (errType,errValue)) - return False - - - # flush memcached - def flushMemcached(self,site,node): - _logger.debug("flushMemcached start %s %s" % (site,node)) - # memcached is unused - if not panda_config.memcached_enable: - _logger.debug("flushMemcached skip %s %s" % (site,node)) - return True - try: - # initialize memcache if needed - if self.memcache == None: - from MemProxy import MemProxy - self.memcache = MemProxy() - # list of siteIDs - siteIDs = site.split(',') - # loop over all siteIDs - for tmpSite in siteIDs: - # flush - self.memcache.flushFiles(tmpSite,node) - _logger.debug("flushMemcached done %s %s" % (site,node)) - return True - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("flushMemcached : %s %s" % (errType,errValue)) - return False - - - # check files with memcached - def checkFilesWithMemcached(self,site,node,files): - _logger.debug("checkFilesWithMemcached start %s %s" % (site,node)) - # convert string to list - fileList = files.split(',') - # remove '' - try: - fileList.remove('') - except: - pass - # memcached is unused - if not panda_config.memcached_enable: - _logger.debug("checkFilesWithMemcached skip %s %s" % (site,node)) - # return 0 - retStr = '' - for tmpF in fileList: - retStr += '0,' - retStr = retStr[:-1] - return retStr - try: - # initialize memcache if needed - if self.memcache == None: - from MemProxy import MemProxy - self.memcache = MemProxy() - # empty list - if len(fileList) == 0: - _logger.debug("checkFilesWithMemcached skipped for empty list") - return '' - # check - iFiles = 0 - nFiles = 100 - retS = '' - while iFiles < len(fileList): - retS += self.memcache.checkFiles(None,fileList[iFiles:iFiles+nFiles],site,node,getDetail=True) - retS += ',' - iFiles += nFiles - retS = retS[:-1] - _logger.debug("checkFilesWithMemcached done %s %s with %s" % (site,node,retS)) - return retS - except: - errType,errValue = sys.exc_info()[:2] - 
_logger.error("checkFilesWithMemcached : %s %s" % (errType,errValue)) - return False - - - # register proxy key - def registerProxyKey(self,params): - comment = ' /* DBProxy.registerProxyKey */' - _logger.debug("register ProxyKey %s" % str(params)) - try: - # set autocommit on - self.conn.begin() - # construct SQL - sql0 = 'INSERT INTO ATLAS_PANDAMETA.proxykey (id,' - sql1 = 'VALUES (ATLAS_PANDAMETA.PROXYKEY_ID_SEQ.nextval,' - vals = {} - for key,val in params.iteritems(): - sql0 += '%s,' % key - sql1 += ':%s,' % key - vals[':%s' % key] = val - sql0 = sql0[:-1] - sql1 = sql1[:-1] - sql = sql0 + ') ' + sql1 + ') ' - # insert - self.cur.execute(sql+comment,vals) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return True - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("registerProxyKey : %s %s" % (type,value)) - # roll back - self._rollback() - return "" - - - # get proxy key - def getProxyKey(self,dn): - comment = ' /* DBProxy.getProxyKey */' - _logger.debug("get ProxyKey %s" % dn) - try: - # set autocommit on - self.conn.begin() - # construct SQL - sql = 'SELECT credname,expires,origin,myproxy FROM ATLAS_PANDAMETA.proxykey WHERE dn=:dn ORDER BY expires DESC' - varMap = {} - varMap[':dn'] = dn - # select - self.cur.execute(sql+comment,varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - retMap = {} - if res != None and len(res) != 0: - credname,expires,origin,myproxy = res[0] - retMap['credname'] = credname - retMap['expires'] = expires - retMap['origin'] = origin - retMap['myproxy'] = myproxy - _logger.debug(retMap) - return retMap - except: - type, value, traceBack = sys.exc_info() - _logger.error("getProxyKey : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # check site access - def checkSiteAccess(self,siteid,longDN): - comment = ' /* DBProxy.checkSiteAccess */' - _logger.debug("checkSiteAccess %s:%s" % 
(siteid,longDN)) - try: - # use compact DN - dn = self.cleanUserID(longDN) - # construct SQL - sql = 'SELECT poffset,rights,status,workingGroups FROM ATLAS_PANDAMETA.siteaccess WHERE dn=:dn AND pandasite=:pandasite' - varMap = {} - varMap[':dn'] = dn - varMap[':pandasite'] = siteid - # set autocommit on - self.conn.begin() - # select - self.cur.execute(sql+comment,varMap) - self.cur.arraysize = 10 - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - retMap = {} - if res != None and len(res) != 0: - poffset,rights,status,workingGroups = res[0] - retMap['poffset'] = poffset - retMap['rights'] = rights - retMap['status'] = status - if workingGroups in ['',None]: - workingGroups = [] - else: - workingGroups = workingGroups.split(',') - retMap['workingGroups'] = workingGroups - _logger.debug(retMap) - return retMap - except: - type, value, traceBack = sys.exc_info() - _logger.error("checkSiteAccess : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # add account to siteaccess - def addSiteAccess(self,siteID,longDN): - comment = ' /* DBProxy.addSiteAccess */' - _logger.debug("addSiteAccess : %s %s" % (siteID,longDN)) - try: - # use compact DN - dn = self.cleanUserID(longDN) - # set autocommit on - self.conn.begin() - # select - sql = 'SELECT status FROM ATLAS_PANDAMETA.siteaccess WHERE dn=:dn AND pandasite=:pandasite' - varMap = {} - varMap[':dn'] = dn - varMap[':pandasite'] = siteID - self.cur.execute(sql+comment,varMap) - self.cur.arraysize = 10 - res = self.cur.fetchone() - if res != None: - _logger.debug("account already exists with status=%s" % res[0]) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return res[0] - # add - sql = 'INSERT INTO ATLAS_PANDAMETA.siteaccess (id,dn,pandasite,status,created) VALUES (ATLAS_PANDAMETA.SITEACCESS_ID_SEQ.nextval,:dn,:pandasite,:status,CURRENT_DATE)' - varMap = {} - varMap[':dn'] = dn - varMap[':pandasite'] = siteID - 
varMap[':status'] = 'requested' - self.cur.execute(sql+comment,varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("account was added") - return 0 - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("addSiteAccess : %s %s" % (type,value)) - # return None - return -1 - - - # list site access - def listSiteAccess(self,siteid=None,dn=None,longFormat=False): - comment = ' /* DBProxy.listSiteAccess */' - _logger.debug("listSiteAccess %s:%s" % (siteid,dn)) - try: - if siteid==None and dn==None: - return [] - longAttributes = 'status,poffset,rights,workingGroups,created' - # set autocommit on - self.conn.begin() - # construct SQL - if siteid != None: - varMap = {':pandasite':siteid} - if not longFormat: - sql = 'SELECT dn,status FROM ATLAS_PANDAMETA.siteaccess WHERE pandasite=:pandasite ORDER BY dn' - else: - sql = 'SELECT dn,%s FROM ATLAS_PANDAMETA.siteaccess ' % longAttributes - sql += 'WHERE pandasite=:pandasite ORDER BY dn' - else: - shortDN = self.cleanUserID(dn) - varMap = {':dn':shortDN} - if not longFormat: - sql = 'SELECT pandasite,status FROM ATLAS_PANDAMETA.siteaccess WHERE dn=:dn ORDER BY pandasite' - else: - sql = 'SELECT pandasite,%s FROM ATLAS_PANDAMETA.siteaccess ' % longAttributes - sql += 'WHERE dn=:dn ORDER BY pandasite' - # select - self.cur.execute(sql+comment,varMap) - self.cur.arraysize = 1000 - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - ret = [] - if res != None and len(res) != 0: - for tmpRes in res: - if not longFormat: - ret.append(tmpRes) - else: - # create map for long format - tmpRetMap = {} - # use first value as a primary key - tmpRetMap['primKey'] = tmpRes[0] - idxVal = 1 - for tmpKey in longAttributes.split(','): - tmpRetMap[tmpKey] = tmpRes[idxVal] - idxVal += 1 - ret.append(tmpRetMap) - _logger.debug(ret) - return ret - except: - # roll back - self._rollback() - type, value, 
traceBack = sys.exc_info() - _logger.error("listSiteAccess : %s %s" % (type,value)) - return [] - - - # update site access - def updateSiteAccess(self,method,siteid,requesterDN,userName,attrValue): - comment = ' /* DBProxy.updateSiteAccess */' - _logger.debug("updateSiteAccess %s:%s:%s:%s:%s" % (method,siteid,requesterDN,userName,attrValue)) - try: - # set autocommit on - self.conn.begin() - # check existence - varMap = {} - varMap[':pandasite'] = siteid - varMap[':dn'] = userName - sql = 'SELECT count(*) FROM ATLAS_PANDAMETA.siteaccess WHERE pandasite=:pandasite AND dn=:dn' - self.cur.execute(sql+comment,varMap) - self.cur.arraysize = 10 - res = self.cur.fetchall() - if res == None or res[0][0] == 0: - _logger.error("updateSiteAccess : No request for %s" % varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return 'No request for %s:%s' % (siteid,userName) - # get cloud - varMap = {':pandasite':siteid} - sql = 'SELECT cloud,dn FROM ATLAS_PANDAMETA.schedconfig WHERE siteid=:pandasite AND rownum<=1' - self.cur.execute(sql+comment,varMap) - res = self.cur.fetchall() - if res == None or len(res) == 0: - _logger.error("updateSiteAccess : No cloud in schedconfig for %s" % siteid) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return "No cloud in schedconfig for %s" % siteid - cloud = res[0][0] - siteContact = res[0][1] - # get cloud responsible - varMap = {':cloud':cloud} - sql = 'SELECT dn FROM ATLAS_PANDAMETA.cloudconfig WHERE name=:cloud' - self.cur.execute(sql+comment,varMap) - res = self.cur.fetchall() - if res == None or len(res) == 0: - _logger.error("updateSiteAccess : No contact in cloudconfig for %s" % cloud) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - return "No contact in cloudconfig for %s" % cloud - contactNames = res[0][0] - if contactNames in [None,'']: - contactNames = [] - else: - contactNames = contactNames.split(',') - # get site 
responsible - if not siteContact in [None,'']: - contactNames += siteContact.split(',') - # check privilege - if not self.cleanUserID(requesterDN) in contactNames: - _logger.error("updateSiteAccess : %s is not one of contacts %s" % (requesterDN,str(contactNames))) - # return - return "Insufficient privilege" - # update - varMap = {} - varMap[':pandasite'] = siteid - varMap[':dn'] = userName - if method in ['approve','reject']: - # update status - sql = 'UPDATE ATLAS_PANDAMETA.siteaccess SET status=:newStatus WHERE pandasite=:pandasite AND dn=:dn' - if method == 'approve': - varMap[':newStatus'] = 'tobeapproved' - else: - varMap[':newStatus'] = 'toberejected' - elif method == 'delete': - # delete - sql = 'DELETE FROM ATLAS_PANDAMETA.siteaccess WHERE pandasite=:pandasite AND dn=:dn' - elif method == 'set': - # check value - if re.search('^[a-z,A-Z]+:[a-z,A-Z,0-9,\,_\-]+$',attrValue) == None: - errStr = "Invalid argument for set : %s. Must be key:value" % attrValue - _logger.error("updateSiteAccess : %s" % errStr) - # retrun - return errStr - # decompose to key and value - tmpKey = attrValue.split(':')[0].lower() - tmpVal = attrValue.split(':')[-1] - # check key - changeableKeys = ['poffset','workinggroups','rights'] - if not tmpKey in changeableKeys: - errStr = "%s cannot be set. 
Only %s are allowed" % (tmpKey,str(changeableKeys)) - _logger.error("updateSiteAccess : %s" % errStr) - # retrun - return errStr - # set value map - varMap[':%s' % tmpKey] = tmpVal - sql = 'UPDATE ATLAS_PANDAMETA.siteaccess SET %s=:%s WHERE pandasite=:pandasite AND dn=:dn' % (tmpKey,tmpKey) - else: - _logger.error("updateSiteAccess : Unknown method '%s'" % method) - # return - return "Unknown method '%s'" % method - # execute - _logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("updateSiteAccess : completed") - return True - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("updateSiteAccess : %s %s" % (type,value)) - return 'DB error %s %s' % (type,value) - - - # get list of archived tables - def getArchiveTables(self): - # return - return ['ATLAS_PANDAARCH.jobsArchived'] - - - # get JobIDs in a time range - def getJobIDsInTimeRangeLog(self,dn,timeRange,retJobIDs): - comment = ' /* DBProxy.getJobIDsInTimeRangeLog */' - _logger.debug("getJobIDsInTimeRangeLog : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - # get list of archived tables - tables = self.getArchiveTables() - # select - for table in tables: - # make sql - sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODSOURCELABEL_IDX JOBS_PRODUSERNAME_IDX) */ " - sql += "jobDefinitionID FROM %s tab " % table - sql += "WHERE prodUserName=:prodUserName AND modificationTime>:modificationTime " - sql += "AND prodSourceLabel=:prodSourceLabel GROUP BY jobDefinitionID" - varMap = {} - varMap[':prodUserName'] = compactDN - varMap[':prodSourceLabel'] = 'user' - varMap[':modificationTime'] = timeRange - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - 
_logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for tmpID, in resList: - if not tmpID in retJobIDs: - retJobIDs.append(tmpID) - _logger.debug("getJobIDsInTimeRangeLog : %s" % str(retJobIDs)) - return retJobIDs - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getJobIDsInTimeRangeLog : %s %s" % (type,value)) - # return empty list - return retJobIDs - - - # get PandaIDs for a JobID - def getPandIDsWithJobIDLog(self,dn,jobID,idStatus,nJobs,buildJobID=None): - comment = ' /* Proxy.getPandIDsWithJobIDLog */' - _logger.debug("getPandIDsWithJobIDLog : %s %s" % (dn,jobID)) - try: - # get compact DN - compactDN = self.cleanUserID(dn) - if compactDN in ['','NULL',None]: - compactDN = dn - # get list of archived tables - tables = self.getArchiveTables() - # select - for table in tables: - # skip if all jobs have already been gotten - if nJobs > 0 and len(idStatus) >= nJobs: - continue - # make sql - sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ " - sql += "PandaID,jobStatus,commandToPilot,prodSourceLabel,taskBufferErrorCode FROM %s tab " % table - sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>(CURRENT_DATE-30) " - varMap = {} - varMap[':prodUserName'] = compactDN - varMap[':jobDefinitionID'] = jobID - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - # select - _logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for 
tmpID,tmpStatus,tmpCommand,tmpProdSourceLabel,tmpTaskBufferErrorCode in resList: - # ignore jobs retried by pilot since they have new PandaIDs with the same jobsetID/jobdefID - if tmpTaskBufferErrorCode in [ErrorCode.EC_PilotRetried]: - continue - # ignore old buildJob which was replaced by rebrokerage - if tmpProdSourceLabel == 'panda': - if buildJobID == None: - # first buildJob - buildJobID = tmpID - elif buildJobID >= tmpID: - # don't append old one - continue - else: - # delete old one - del idStatus[buildJobID] - buildJobID = tmpID - # append - if not idStatus.has_key(tmpID): - idStatus[tmpID] = (tmpStatus,tmpCommand) - _logger.debug("getPandIDsWithJobIDLog : %s" % str(idStatus)) - return idStatus - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getPandIDsWithJobIDLog : %s %s" % (type,value)) - # return empty list - return {} - - - # get PandaIDs for a JobsetID or JobdefID in jobsArchived - def getPandIDsWithIdInArch(self,prodUserName,id,isJobset): - comment = ' /* Proxy.getPandIDsWithIdInArch */' - _logger.debug("getPandIDsWithIdInArch : %s %s %s" % (prodUserName,id,isJobset)) - try: - # make sql - if isJobset: - sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBSETID_IDX) */ " - else: - sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ " - sql += "PandaID FROM ATLAS_PANDAARCH.jobsArchived tab " - sql += "WHERE prodUserName=:prodUserName " - sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>(CURRENT_DATE-30) " - if isJobset: - sql += "AND jobsetID=:jobID " - else: - sql += "AND jobDefinitionID=:jobID " - varMap = {} - varMap[':prodUserName'] = prodUserName - varMap[':jobID'] = id - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 1000000 - # select - 
_logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - pandaIDs = [] - for tmpID, in resList: - pandaIDs.append(tmpID) - _logger.debug("getPandIDsWithIdInArch : %s %s -> %s" % (prodUserName,id,str(pandaIDs))) - return pandaIDs - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getPandIDsWithIdInArch : %s %s" % (errType,errValue)) - # return empty list - return [] - - - # peek at job - def peekJobLog(self,pandaID): - comment = ' /* DBProxy.peekJobLog */' - _logger.debug("peekJobLog : %s" % pandaID) - # return None for NULL PandaID - if pandaID in ['NULL','','None',None]: - return None - sql1_0 = "SELECT %s FROM %s " - sql1_1 = "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30) " - # select - varMap = {} - varMap[':PandaID'] = pandaID - nTry=3 - for iTry in range(nTry): - try: - # get list of archived tables - tables = self.getArchiveTables() - # select - for table in tables: - # start transaction - self.conn.begin() - # select - sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1 - self.cur.arraysize = 10 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if len(res) != 0: - # Job - job = JobSpec() - job.pack(res[0]) - # Files - # start transaction - self.conn.begin() - # select - fileTableName = re.sub('jobsArchived','filesTable_ARCH',table) - sqlFile = "SELECT /*+ INDEX(tab FILES_ARCH_PANDAID_IDX)*/ %s " % FileSpec.columnNames() - sqlFile+= "FROM %s tab " % fileTableName - # put constraint on modificationTime to avoid full table scan - sqlFile+= "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-60)" - self.cur.arraysize = 10000 - self.cur.execute(sqlFile+comment, varMap) - resFs = self.cur.fetchall() - # metadata - job.metadata = None - metaTableName = 
re.sub('jobsArchived','metaTable_ARCH',table) - sqlMeta = "SELECT metaData FROM %s WHERE PandaID=:PandaID" % metaTableName - self.cur.execute(sqlMeta+comment, varMap) - for clobMeta, in self.cur: - if clobMeta != None: - job.metadata = clobMeta.read() - break - # job parameters - job.jobParameters = None - jobParamTableName = re.sub('jobsArchived','jobParamsTable_ARCH',table) - sqlJobP = "SELECT jobParameters FROM %s WHERE PandaID=:PandaID" % jobParamTableName - varMap = {} - varMap[':PandaID'] = job.PandaID - self.cur.execute(sqlJobP+comment, varMap) - for clobJobP, in self.cur: - if clobJobP != None: - job.jobParameters = clobJobP.read() - break - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # set files - for resF in resFs: - file = FileSpec() - file.pack(resF) - # remove redundant white spaces - try: - file.md5sum = file.md5sum.strip() - except: - pass - try: - file.checksum = file.checksum.strip() - except: - pass - job.addFile(file) - return job - _logger.debug("peekJobLog() : PandaID %s not found" % pandaID) - return None - except: - # roll back - self._rollback() - if iTry+1 < nTry: - _logger.error("peekJobLog : %s" % pandaID) - time.sleep(random.randint(10,20)) - continue - type, value, traceBack = sys.exc_info() - _logger.error("peekJobLog : %s %s" % (type,value)) - # return None - return None - - - # get user subscriptions - def getUserSubscriptions(self,datasetName,timeRange): - comment = ' /* DBProxy.getUserSubscriptions */' - _logger.debug("getUserSubscriptions(%s,%s)" % (datasetName,timeRange)) - sql0 = "SELECT site FROM ATLAS_PANDAMETA.UserSubs " - sql0 += "WHERE datasetName=:datasetName and modificationDate>CURRENT_DATE-:timeRange" - varMap = {} - varMap[':datasetName'] = datasetName - varMap[':timeRange'] = timeRange - try: - # start transaction - self.conn.begin() - # select - self.cur.execute(sql0+comment, varMap) - resSs = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - 
retList = [] - for tmpSite, in resSs: - retList.append(tmpSite) - return retList - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getUserSubscriptions : %s %s" % (errType,errValue)) - return [] - - - # get the number of user subscriptions - def getNumUserSubscriptions(self): - comment = ' /* DBProxy.getNumUserSubscriptions */' - _logger.debug("getNumUserSubscriptions") - sql0 = "SELECT site,COUNT(*) FROM ATLAS_PANDAMETA.UserSubs " - sql0 += "WHERE creationDate>CURRENT_DATE-2 GROUP BY site" - try: - # start transaction - self.conn.begin() - # select - self.cur.execute(sql0+comment,{}) - resSs = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - retList = {} - for tmpSite,countNum in resSs: - retList[tmpSite] = countNum - return retList - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getNumUserSubscriptions : %s %s" % (errType,errValue)) - return [] - - - # add user subscriptions - def addUserSubscription(self,datasetName,dq2IDs): - comment = ' /* DBProxy.addUserSubscription */' - _logger.debug("addUserSubscription(%s,%s)" % (datasetName,dq2IDs)) - sql0 = "INSERT INTO ATLAS_PANDAMETA.UserSubs " - sql0 += "(datasetName,site,creationDate,modificationDate,nUsed) " - sql0 += "VALUES (:datasetName,:site,CURRENT_DATE,CURRENT_DATE,:nUsed)" - try: - # start transaction - self.conn.begin() - for site in dq2IDs: - varMap = {} - varMap[':datasetName'] = datasetName - varMap[':site'] = site - varMap[':nUsed'] = 0 - # insert - self.cur.execute(sql0+comment, varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return True - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("addUserSubscription : %s %s" % (errType,errValue)) - return False - - - # increment counter for subscription - def incrementUsedCounterSubscription(self,datasetName): - comment = ' /* 
DBProxy.incrementUsedCounterSubscription */' - _logger.debug("incrementUsedCounterSubscription(%s)" % datasetName) - sql0 = "UPDATE ATLAS_PANDAMETA.UserSubs SET nUsed=nUsed+1 " - sql0 += "WHERE datasetName=:datasetName AND nUsed IS NOT NULL" - sqlU = "SELECT MAX(nUsed) FROM ATLAS_PANDAMETA.UserSubs " - sqlU += "WHERE datasetName=:datasetName" - try: - # start transaction - self.conn.begin() - varMap = {} - varMap[':datasetName'] = datasetName - # update - self.cur.execute(sql0+comment,varMap) - # get nUsed - nUsed = 0 - retU = self.cur.rowcount - if retU > 0: - # get nUsed - self.cur.execute(sqlU+comment,varMap) - self.cur.arraysize = 10 - res = self.cur.fetchone() - if res != None: - nUsed = res[0] - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return nUsed - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("incrementUsedCounterSubscription : %s %s" % (errType,errValue)) - return -1 - - - # get active datasets - def getActiveDatasets(self,computingSite,prodSourceLabel): - comment = ' /* DBProxy.getActiveDatasets */' - _logger.debug("getActiveDatasets(%s,%s)" % (computingSite,prodSourceLabel)) - varMap = {} - varMap[':computingSite'] = computingSite - varMap[':jobStatus1'] = 'assigned' - varMap[':jobStatus2'] = 'activated' - varMap[':jobStatus3'] = 'waiting' - varMap[':prodSourceLabel'] = prodSourceLabel - try: - retList = [] - for table in ['jobsActive4','jobsDefined4','jobsWaiting4']: - if table == 'jobsActive4': - sql0 = "SELECT distinct prodDBlock FROM ATLAS_PANDA.%s " % table - else: - sql0 = "SELECT distinct prodDBlock FROM ATLAS_PANDA.%s " % table - sql0 += "WHERE computingSite=:computingSite AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3) " - sql0 += "AND prodSourceLabel=:prodSourceLabel" - # start transaction - self.conn.begin() - # select - self.cur.execute(sql0+comment, varMap) - resSs = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit 
error' - # append - for prodDBlock, in resSs: - if not prodDBlock in retList: - retList.append(prodDBlock) - # make string - retStr = '' - for tmpItem in retList: - retStr += '%s,' % tmpItem - retStr = retStr[:-1] - return retStr - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getActiveDatasets : %s %s" % (errType,errValue)) - return "" - - - # check status of all sub datasets to trigger Notifier - def checkDatasetStatusForNotifier(self,jobsetID,jobDefinitionID,prodUserName): - comment = ' /* DBProxy.checkDatasetStatusForNotifier */' - _logger.debug("checkDatasetStatusForNotifier(%s,%s,%s)" % (jobsetID,jobDefinitionID,prodUserName)) - try: - # get PandaIDs to get all associated destinationDBlocks - varMap = {} - varMap[':jobsetID'] = jobsetID - varMap[':prodUserName'] = prodUserName - sql = "SELECT MAX(PandaID),jobDefinitionID FROM %s WHERE prodUserName=:prodUserName AND jobsetID=:jobsetID GROUP BY jobDefinitionID" - pandaIDs = {} - for table in ['ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsWaiting4']: - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 1000 - self.cur.execute((sql % table)+comment, varMap) - resSs = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # get PandaIDs - for tmpPandaID,tmpJobDefID in resSs: - if (not pandaIDs.has_key(tmpJobDefID)) or tmpPandaID > pandaIDs[tmpJobDefID]: - pandaIDs[tmpJobDefID] = tmpPandaID - # get all destinationDBlocks - varMap = {} - varMap[':type1'] = 'log' - varMap[':type2'] = 'output' - sql = 'SELECT DISTINCT destinationDBlock FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type IN (:type1,:type2)' - datasetMap = {} - # start transaction - self.conn.begin() - self.cur.arraysize = 1000 - for tmpJobDefID,tmpPandaID in pandaIDs.iteritems(): - varMap[':PandaID'] = tmpPandaID - # select - self.cur.execute(sql+comment, varMap) - resSs = 
self.cur.fetchall() - # get destinationDBlock - for tmpDestDBlock, in resSs: - if not datasetMap.has_key(tmpJobDefID): - datasetMap[tmpJobDefID] = [] - if not tmpDestDBlock in datasetMap[tmpJobDefID]: - datasetMap[tmpJobDefID].append(tmpDestDBlock) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # check dataset status - allClosed = True - retInfo = {} - latestUpdate = None - latestJobDefID = None - varMap = {} - varMap[':type1'] = 'log' - varMap[':type2'] = 'output' - sql = 'SELECT status,modificationDate FROM ATLAS_PANDA.Datasets WHERE name=:name AND type IN (:type1,:type2)' - sqlJ = "SELECT MAX(modificationTime) FROM ATLAS_PANDA.jobsArchived4 " - sqlJ += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID" - # start transaction - self.conn.begin() - self.cur.arraysize = 1000 - for tmpJobDefID,tmpDatasets in datasetMap.iteritems(): - retInfo[tmpJobDefID] = [] - for tmpDataset in tmpDatasets: - if not tmpDataset in retInfo[tmpJobDefID]: - retInfo[tmpJobDefID].append(tmpDataset) - varMap[':name'] = tmpDataset - # select - self.cur.execute(sql+comment, varMap) - resSs = self.cur.fetchall() - # check status and mod time - for tmpStatus,tmpModificationDate in resSs: - _logger.debug("checkDatasetStatusForNotifier(%s,%s) %s has %s with %s at %s" % \ - (jobsetID,jobDefinitionID,tmpJobDefID,tmpDataset,tmpStatus,tmpModificationDate)) - if not tmpStatus in ['closed','tobeclosed','completed']: - # some datasets are still active - allClosed = False - _logger.debug("checkDatasetStatusForNotifier(%s,%s) wait due to %s %s %s" % \ - (jobsetID,jobDefinitionID,tmpJobDefID,tmpDataset,tmpStatus)) - break - elif tmpStatus == 'tobeclosed': - # select latest modificationTime in job table - varMapJ = {} - varMapJ[':prodUserName'] = prodUserName - varMapJ[':jobDefinitionID'] = tmpJobDefID - self.cur.execute(sqlJ+comment, varMapJ) - resJ = self.cur.fetchone() - if resJ == None: - # error - allClosed = False - 
_logger.error("checkDatasetStatusForNotifier(%s,%s) %s cannot find job" % \ - (jobsetID,jobDefinitionID,tmpJobDefID)) - break - tmpModificationTime, = resJ - _logger.debug("checkDatasetStatusForNotifier(%s,%s) %s modtime:%s" % \ - (jobsetID,jobDefinitionID,tmpJobDefID,tmpModificationTime)) - if latestUpdate == None or latestUpdate < tmpModificationTime: - # use the latest updated jobDefID - latestUpdate = tmpModificationTime - latestJobDefID = tmpJobDefID - elif latestUpdate == tmpModificationTime and latestJobDefID < tmpJobDefID: - # use larger jobDefID when datasets are closed at the same time - latestJobDefID = tmpJobDefID - # escape - if not allClosed: - break - # escape - if not allClosed: - break - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - _logger.debug("checkDatasetStatusForNotifier(%s,%s) -> all:%s %s latest:%s" % \ - (jobsetID,jobDefinitionID,allClosed,latestJobDefID, - jobDefinitionID == latestJobDefID)) - # return - if not allClosed or jobDefinitionID != latestJobDefID: - return False,{} - return True,retInfo - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("checkDatasetStatusForNotifier : %s %s" % (errType,errValue)) - return False,{} - - - # get MoU share for T2 PD2P - def getMouShareForT2PD2P(self): - comment = ' /* DBProxy.getMouShareForT2PD2P */' - _logger.debug("getMouShareForT2PD2P start") - sqlG = "SELECT gid,ntup_share FROM ATLAS_GRISLI.t_tier2_groups " - sqlT = "SELECT tier2,t2group,status FROM ATLAS_GRISLI.t_m4regions_replication" - try: - # start transaction - self.conn.begin() - self.cur.arraysize = 100000 - # get weight for each group - self.cur.execute(sqlG+comment) - resG = self.cur.fetchall() - gidShareMap = {} - for gid,ntup_share in resG: - gidShareMap[gid] = {'ntup_share':ntup_share,'nSites':0} - # get group for each site - self.cur.execute(sqlT+comment) - resT = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' 
- siteGroupMap = {} - # loop over all sites - for tier2,t2group,t2status in resT: - # unknown group - if not gidShareMap.has_key(t2group): - _logger.error("getMouShareForT2PD2P unknown group %s for %s" % (t2group,tier2)) - continue - # use only DATADISK - if not tier2.endswith('_DATADISK'): - continue - # count the number of ready sites per group - if t2status in ['ready']: - gidShareMap[t2group]['nSites'] += 1 - # append - siteGroupMap[tier2] = {'group':t2group,'status':t2status} - # normalize - _logger.debug("getMouShareForT2PD2P normalize factor = %s" % str(gidShareMap)) - weightsMap = {} - for tier2,t2Val in siteGroupMap.iteritems(): - t2group = t2Val['group'] - t2status = t2Val['status'] - if gidShareMap[t2group]['ntup_share'] == 0: - # set 0 to be skipped in the brokerage - tmpWeight = 0 - elif gidShareMap[t2group]['nSites'] > 0: - # normalize - tmpWeight = float(gidShareMap[t2group]['ntup_share']) / float(gidShareMap[t2group]['nSites']) - else: - # no site is ready in this group - tmpWeight = 0 - weightsMap[tier2] = {'weight':tmpWeight,'status':t2status} - _logger.debug("getMouShareForT2PD2P -> %s" % str(weightsMap)) - return weightsMap - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("getMouShareForT2PD2P : %s %s" % (errType,errValue)) - return {} - - - # record status change - def recordStatusChange(self,pandaID,jobStatus,jobInfo=None,infoMap={}): - comment = ' /* DBProxy.recordStatusChange */' - # check config - if not hasattr(panda_config,'record_statuschange') or panda_config.record_statuschange != True: - return - # get job info - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':jobStatus'] = jobStatus - varMap[':modificationHost'] = self.myHostName - if jobInfo != None: - varMap[':computingSite'] = jobInfo.computingSite - varMap[':cloud'] = jobInfo.cloud - varMap[':prodSourceLabel'] = jobInfo.prodSourceLabel - elif infoMap != None: - varMap[':computingSite'] = infoMap['computingSite'] - 
varMap[':cloud'] = infoMap['cloud'] - varMap[':prodSourceLabel'] = infoMap['prodSourceLabel'] - else: - # no info - return - # convert NULL to None - for tmpKey in varMap.keys(): - if varMap[tmpKey] == 'NULL': - varMap[tmpKey] = None - # insert - sql = "INSERT INTO ATLAS_PANDA.jobs_StatusLog " - sql += "(PandaID,modificationTime,jobStatus,prodSourceLabel,cloud,computingSite,modificationHost) " - sql += "VALUES (:PandaID,CURRENT_DATE,:jobStatus,:prodSourceLabel,:cloud,:computingSite,:modificationHost) " - try: - # start transaction - self.conn.begin() - self.cur.execute(sql+comment,varMap) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - except: - # roll back - self._rollback() - errType,errValue = sys.exc_info()[:2] - _logger.error("recordStatusChange %s %s: %s %s" % (pandaID,jobStatus,errType,errValue)) - return - - - # wake up connection - def wakeUp(self): - for iTry in range(5): - try: - # check if the connection is working - self.conn.ping() - return - except: - type, value, traceBack = sys.exc_info() - _logger.debug("wakeUp %d : %s %s" % (iTry,type,value)) - # wait for reconnection - time.sleep(1) - self.connect(reconnect=True) - - - # commit - def _commit(self): - try: - self.conn.commit() - return True - except: - _logger.error("commit error") - return False - - - # rollback - def _rollback(self,useOtherError=False): - retVal = True - # rollback - _logger.debug("rollback") - try: - self.conn.rollback() - except: - _logger.error("rollback error") - retVal = False - # reconnect if needed - try: - # get ORA ErrorCode - errType,errValue = sys.exc_info()[:2] - oraErrCode = str(errValue).split()[0] - oraErrCode = oraErrCode[:-1] - _logger.debug("rollback EC:%s %s" % (oraErrCode,errValue)) - # error codes for connection error - error_Codes = ['ORA-01012','ORA-01033','ORA-01034','ORA-01089', - 'ORA-03113','ORA-03114','ORA-12203','ORA-12500', - 'ORA-12571','ORA-03135','ORA-25402'] - # other errors are apperantly given when connection lost 
contact - if useOtherError: - error_Codes += ['ORA-01861','ORA-01008'] - if oraErrCode in error_Codes: - # reconnect - retFlag = self.connect(reconnect=True) - _logger.debug("rollback reconnected %s" % retFlag) - except: - pass - # return - return retVal diff --git a/current/pandaserver/taskbuffer/OraLogDBProxy.py b/current/pandaserver/taskbuffer/OraLogDBProxy.py deleted file mode 100755 index 8f397db40..000000000 --- a/current/pandaserver/taskbuffer/OraLogDBProxy.py +++ /dev/null @@ -1,727 +0,0 @@ -""" -proxy for log database connection - -""" - -import re -import sys -import time - -import cx_Oracle - -from pandalogger.PandaLogger import PandaLogger -from config import panda_config - -import SiteSpec -import CloudSpec - -from JobSpec import JobSpec -from FileSpec import FileSpec - -# logger -_logger = PandaLogger().getLogger('LogDBProxy') - -# proxy -class LogDBProxy: - - # constructor - def __init__(self): - # connection object - self.conn = None - # cursor object - self.cur = None - - # connect to DB - def connect(self,dbhost=panda_config.logdbhost,dbpasswd=panda_config.logdbpasswd, - dbuser=panda_config.logdbuser,dbname=panda_config.logdbname,reconnect=False): - # keep parameters for reconnect - if not reconnect: - self.dbhost = dbhost - self.dbpasswd = dbpasswd - self.dbuser = dbuser - self.dbname = dbname - # connect - try: - self.conn = cx_Oracle.connect(dsn=self.dbhost,user=self.dbuser, - password=self.dbpasswd,threaded=True) - self.cur=self.conn.cursor() - # set TZ - self.cur.execute("ALTER SESSION SET TIME_ZONE='UTC'") - # set DATE format - self.cur.execute("ALTER SESSION SET NLS_DATE_FORMAT='YYYY/MM/DD HH24:MI:SS'") - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("connect : %s %s" % (type,value)) - # roll back - self._rollback() - return False - - - # query an SQL - def querySQL(self,sql,arraySize=1000): - try: - # begin transaction - self.conn.begin() - self.cur.arraysize = arraySize - self.cur.execute(sql) - res = 
self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - return res - except: - type, value, traceBack = sys.exc_info() - _logger.error("querySQL : %s %s" % (type,value)) - return None - - - # get site data - def getCurrentSiteData(self): - _logger.debug("getCurrentSiteData") - sql = "SELECT SITE,getJob,updateJob FROM SiteData WHERE FLAG='production' and HOURS=3" - try: - # set autocommit on - self.conn.begin() - # select - self.cur.arraysize = 10000 - self.cur.execute(sql) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - ret = {} - for item in res: - ret[item[0]] = {'getJob':item[1],'updateJob':item[2]} - _logger.debug(ret) - return ret - except: - type, value, traceBack = sys.exc_info() - _logger.error("getCurrentSiteData : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # get list of site - def getSiteList(self): - _logger.debug("getSiteList start") - try: - # set autocommit on - self.conn.begin() - # select - sql = "SELECT siteid,nickname FROM schedconfig WHERE siteid IS NOT NULL" - self.cur.arraysize = 10000 - self.cur.execute(sql) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - retMap = {} - if res != None and len(res) != 0: - for siteid,nickname in res: - # skip invalid siteid - if siteid in [None,'']: - continue - # append - if not retMap.has_key(siteid): - retMap[siteid] = [] - retMap[siteid].append(nickname) - _logger.debug(retMap) - _logger.debug("getSiteList done") - return retMap - except: - type, value, traceBack = sys.exc_info() - _logger.error("getSiteList : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # get site info - def getSiteInfo(self): - _logger.debug("getSiteInfo start") - try: - # set autocommit on - self.conn.begin() - # select - sql = "SELECT nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory," - sql+= 
"maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec," - sql+= "priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue " - sql+= "FROM schedconfig WHERE siteid IS NOT NULL" - self.cur.arraysize = 10000 - self.cur.execute(sql) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - retList = {} - if resList != None: - # loop over all results - for res in resList: - # change None to '' - resTmp = [] - for tmpItem in res: - if tmpItem == None: - tmpItem = '' - resTmp.append(tmpItem) - nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory,\ - maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec,\ - priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue \ - = resTmp - # skip invalid siteid - if siteid in [None,'']: - continue - # instantiate SiteSpec - ret = SiteSpec.SiteSpec() - ret.sitename = siteid - ret.nickname = nickname - ret.dq2url = dq2url - ret.cloud = cloud - ret.ddm = ddm.split(',')[0] - ret.lfchost = lfchost - ret.se = se - ret.gatekeeper = gatekeeper - ret.memory = memory - ret.maxtime = maxtime - ret.status = status - ret.space = space - ret.glexec = glexec - ret.queue = queue - ret.localqueue = localqueue - # job recoverty - ret.retry = True - if retry == 'FALSE': - ret.retry = False - # convert releases to list - ret.releases = [] - for tmpRel in releases.split('|'): - # remove white space - tmpRel = tmpRel.strip() - if tmpRel != '': - ret.releases.append(tmpRel) - # cmtconfig - # add slc3 if the column is empty - ret.cmtconfig = ['i686-slc3-gcc323-opt'] - if cmtconfig != '': - ret.cmtconfig.append(cmtconfig) - # map between token and DQ2 ID - ret.setokens = {} - tmpTokens = setokens.split(',') - for idxToken,tmpddmID in enumerate(ddm.split(',')): - if idxToken < len(tmpTokens): - ret.setokens[tmpTokens[idxToken]] = tmpddmID - # expand [] in se path - match = re.search('([^\[]*)\[([^\]]+)\](.*)',seprodpath) - if match != None and len(match.groups()) 
== 3: - seprodpath = '' - for tmpBody in match.group(2).split(','): - seprodpath += '%s%s%s,' % (match.group(1),tmpBody,match.group(3)) - seprodpath = seprodpath[:-1] - # map between token and se path - ret.seprodpath = {} - tmpTokens = setokens.split(',') - for idxToken,tmpSePath in enumerate(seprodpath.split(',')): - if idxToken < len(tmpTokens): - ret.seprodpath[tmpTokens[idxToken]] = tmpSePath - # VO related params - ret.priorityoffset = priorityoffset - ret.allowedgroups = allowedgroups - ret.defaulttoken = defaulttoken - # append - retList[ret.nickname] = ret - _logger.debug("getSiteInfo done") - return retList - except: - type, value, traceBack = sys.exc_info() - _logger.error("getSiteInfo : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # get cloud list - def getCloudList(self): - _logger.debug("getCloudList start") - try: - # set autocommit on - self.conn.begin() - # select - sql = "SELECT name,tier1,tier1SE,relocation,weight,server,status,transtimelo," - sql += "transtimehi,waittime,validation,mcshare,countries,fasttrack " - sql+= "FROM cloudconfig" - self.cur.arraysize = 10000 - self.cur.execute(sql) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - ret = {} - if resList != None and len(resList) != 0: - for res in resList: - # change None to '' - resTmp = [] - for tmpItem in res: - if tmpItem == None: - tmpItem = '' - resTmp.append(tmpItem) - name,tier1,tier1SE,relocation,weight,server,status,transtimelo,transtimehi,\ - waittime,validation,mcshare,countries,fasttrack = resTmp - # instantiate CloudSpec - tmpC = CloudSpec.CloudSpec() - tmpC.name = name - tmpC.tier1 = tier1 - tmpC.tier1SE = re.sub(' ','',tier1SE).split(',') - tmpC.relocation = relocation - tmpC.weight = weight - tmpC.server = server - tmpC.status = status - tmpC.transtimelo = transtimelo - tmpC.transtimehi = transtimehi - tmpC.waittime = waittime - tmpC.validation = validation - tmpC.mcshare = mcshare - 
tmpC.countries = countries - tmpC.fasttrack = fasttrack - # append - ret[name] = tmpC - _logger.debug("getCloudList done") - return ret - except: - type, value, traceBack = sys.exc_info() - _logger.error("getCloudList : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # extract name from DN - def cleanUserID(self, id): - try: - up = re.compile('/(DC|O|OU|C|L)=[^\/]+') - username = up.sub('', id) - up2 = re.compile('/CN=[0-9]+') - username = up2.sub('', username) - up3 = re.compile(' [0-9]+') - username = up3.sub('', username) - up4 = re.compile('_[0-9]+') - username = up4.sub('', username) - username = username.replace('/CN=proxy','') - username = username.replace('/CN=limited proxy','') - username = username.replace('limited proxy','') - pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)') - mat = pat.match(username) - if mat: - username = mat.group(2) - else: - username = username.replace('/CN=','') - if username.lower().find('/email') > 0: - username = username[:username.lower().find('/email')] - pat = re.compile('.*(limited.*proxy).*') - mat = pat.match(username) - if mat: - username = mat.group(1) - username = username.replace('(','') - username = username.replace(')','') - return username - except: - return id - - - # check quota - def checkQuota(self,dn): - _logger.debug("checkQuota %s" % dn) - try: - # set autocommit on - self.conn.begin() - # select - name = self.cleanUserID(dn) - sql = "SELECT cpua1,cpua7,cpua30,quotaa1,quotaa7,quotaa30 FROM users WHERE name = :name" - varMap = {} - varMap[':name'] = name - self.cur.arraysize = 10 - self.cur.execute(sql,varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - weight = 0.0 - if res != None and len(res) != 0: - item = res[0] - # cpu and quota - cpu1 = item[0] - cpu7 = item[1] - cpu30 = item[2] - quota1 = item[3] * 3600 - quota7 = item[4] * 3600 - quota30 = item[5] * 3600 - # CPU usage - if cpu1 == None: - cpu1 = 0.0 - # weight - weight 
= float(cpu1) / float(quota1) - # not exceeded the limit - if weight < 1.0: - weight = 0.0 - _logger.debug("checkQuota %s Weight:%s Quota:%s CPU:%s" % (dn,weight,quota1,cpu1)) - else: - _logger.debug("checkQuota cannot found %s" % dn) - return weight - except: - type, value, traceBack = sys.exc_info() - _logger.error("checkQuota : %s %s" % (type,value)) - # roll back - self._rollback() - return 0.0 - - - # get serialize JobID and status - def getUserParameter(self,dn,jobID): - _logger.debug("getUserParameter %s %s" % (dn,jobID)) - try: - # set autocommit on - self.conn.begin() - # select - name = self.cleanUserID(dn) - sql = "SELECT jobid,status FROM users WHERE name = :name" - varMap = {} - varMap[':name'] = name - self.cur.execute(sql,varMap) - self.cur.arraysize = 10 - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - retJobID = jobID - retStatus = True - if res != None and len(res) != 0: - item = res[0] - # JobID in DB - dbJobID = item[0] - # check status - if item[1] in ['disabled']: - retStatus = False - # use larger JobID - if dbJobID >= int(retJobID): - retJobID = dbJobID+1 - # update DB - sql = "UPDATE users SET jobid=%d WHERE name = '%s'" % (retJobID,name) - self.cur.execute(sql) - _logger.debug("getUserParameter set JobID=%s for %s" % (retJobID,dn)) - return retJobID,retStatus - except: - type, value, traceBack = sys.exc_info() - _logger.error("getUserParameter : %s %s" % (type,value)) - # roll back - self._rollback() - return jobID,True - - - # get email address for a user - def getEmailAddr(self,name): - _logger.debug("get email for %s" % name) - try: - # set autocommit on - self.conn.begin() - # select - sql = "SELECT email FROM users WHERE name=:name" - varMap = {} - varMap[':name'] = name - self.cur.execute(sql,varMap) - self.cur.arraysize = 10 - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if res != None and len(res) != 0: - return res[0][0] - 
# return empty string - return "" - except: - type, value, traceBack = sys.exc_info() - _logger.error("getEmailAddr : %s %s" % (type,value)) - # roll back - self._rollback() - return "" - - - # register proxy key - def registerProxyKey(self,params): - _logger.debug("register ProxyKey %s" % str(params)) - try: - # set autocommit on - self.conn.begin() - # construct SQL - sql0 = 'INSERT INTO proxykey (' - sql1 = 'VALUES (' - vals = {} - for key,val in params.iteritems(): - sql0 += '%s,' % key - sql1 += ':%s,' % key - vals[':%s' % key] = val - sql0 = sql0[:-1] - sql1 = sql1[:-1] - sql = sql0 + ') ' + sql1 + ') ' - # insert - self.cur.execute(sql,vals) - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return True - return True - except: - type, value, traceBack = sys.exc_info() - _logger.error("registerProxyKey : %s %s" % (type,value)) - # roll back - self._rollback() - return "" - - - # get proxy key - def getProxyKey(self,dn): - _logger.debug("get ProxyKey %s" % dn) - try: - # set autocommit on - self.conn.begin() - # construct SQL - sql = 'SELECT credname,expires,origin,myproxy FROM proxykey WHERE dn=:dn ORDER BY expires DESC' - varMap = {} - varMap[':dn'] = dn - # select - self.cur.execute(sql,varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # return - retMap = {} - if res != None and len(res) != 0: - credname,expires,origin,myproxy = res[0] - retMap['credname'] = credname - retMap['expires'] = expires - retMap['origin'] = origin - retMap['myproxy'] = myproxy - _logger.debug(retMap) - return retMap - except: - type, value, traceBack = sys.exc_info() - _logger.error("getProxyKey : %s %s" % (type,value)) - # roll back - self._rollback() - return {} - - - # get list of archived tables - def getArchiveTables(self): - tables = [] - cdate = datetime.datetime.utcnow() - for iCycle in range(2): # 2 = (1 months + 2 just in case)/2 - if cdate.month==1: - cdate = cdate.replace(year = 
(cdate.year-1)) - cdate = cdate.replace(month = 12, day = 1) - else: - cdate = cdate.replace(month = (cdate.month/2)*2, day = 1) - tableName = "jobsArchived_%s%s" % (cdate.strftime('%b'),cdate.year) - if not tableName in tables: - tables.append(tableName) - # one older table - if cdate.month > 2: - cdate = cdate.replace(month = (cdate.month-2)) - else: - cdate = cdate.replace(year = (cdate.year-1), month = 12) - # return - return tables - - - # get JobIDs in a time range - def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs): - comment = ' /* LogDBProxy.getJobIDsInTimeRange */' - _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) - try: - # get list of archived tables - tables = self.getArchiveTables() - # select - for table in tables: - # make sql - sql = "SELECT jobDefinitionID FROM %s " % table - sql += "WHERE prodUserID=:prodUserID AND modificationTime>:modificationTime " - sql += "AND prodSourceLabel='user' GROUP BY jobDefinitionID" - varMap = {} - varMap[':prodUserID'] = dn - varMap[':modificationTime'] = timeRange - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 10000 - _logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for tmpID, in resList: - if not tmpID in retJobIDs: - retJobIDs.append(tmpID) - _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs)) - return retJobIDs - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getJobIDsInTimeRange : %s %s" % (type,value)) - # return empty list - return retJobIDs - - - # get PandaIDs for a JobID - def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs): - comment = ' /* LogProxy.getPandIDsWithJobID */' - _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID)) - try: - # get list of archived tables - tables = self.getArchiveTables() - # select - for 
table in tables: - # skip if all jobs have already been gotten - if nJobs > 0 and len(idStatus) >= nJobs: - continue - # make sql - sql = "SELECT PandaID,jobStatus,commandToPilot FROM %s " % table - sql += "WHERE prodUserID=:prodUserID AND jobDefinitionID=:jobDefinitionID " - sql += "AND prodSourceLabel in ('user','panda') " - varMap = {} - varMap[':prodUserID'] = dn - varMap[':jobDefinitionID'] = jobID - # start transaction - self.conn.begin() - # select - self.cur.arraysize = 5000 - # select - _logger.debug(sql+comment+str(varMap)) - self.cur.execute(sql+comment, varMap) - resList = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # append - for tmpID,tmpStatus,tmpCommand in resList: - if not idStatus.has_key(tmpID): - idStatus[tmpID] = (tmpStatus,tmpCommand) - _logger.debug("getPandIDsWithJobID : %s" % str(idStatus)) - return idStatus - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("getPandIDsWithJobID : %s %s" % (type,value)) - # return empty list - return {} - - - # peek at job - def peekJob(self,pandaID): - comment = ' /* LogDBProxy.peekJob */' - _logger.debug("peekJob : %s" % pandaID) - # return None for NULL PandaID - if pandaID in ['NULL','','None',None]: - return None - sql1_0 = "SELECT %s FROM %s " - sql1_1 = "WHERE PandaID=:PandaID" - # select - varMap = {} - varMap[':PandaID'] = pandaID - try: - # get list of archived tables - tables = self.getArchiveTables() - # select - for table in tables: - # start transaction - self.conn.begin() - # select - sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1 - self.cur.arraysize = 10 - self.cur.execute(sql+comment, varMap) - res = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - if len(res) != 0: - # Job - job = JobSpec() - job.pack(res[0]) - # Files - # start transaction - self.conn.begin() - # select - fileTableName = re.sub('jobsArchived','filesTable',table) - 
sqlFile = "SELECT %s " % FileSpec.columnNames() - sqlFile+= "FROM %s " % fileTableName - sqlFile+= "WHERE PandaID=:PandaID" - self.cur.arraysize = 10000 - self.cur.execute(sqlFile+comment, varMap) - resFs = self.cur.fetchall() - # commit - if not self._commit(): - raise RuntimeError, 'Commit error' - # set files - for resF in resFs: - file = FileSpec() - file.pack(resF) - job.addFile(file) - return job - _logger.debug("peekJob() : PandaID %s not found" % pandaID) - return None - except: - # roll back - self._rollback() - type, value, traceBack = sys.exc_info() - _logger.error("peekJob : %s %s" % (type,value)) - # return None - return None - - - # wake up connection - def wakeUp(self): - for iTry in range(5): - try: - # check if the connection is working - self.cur.execute("select user from dual") - return - except: - type, value, traceBack = sys.exc_info() - _logger.debug("wakeUp %d : %s %s" % (iTry,type,value)) - # wait for reconnection - time.sleep(1) - self.connect(reconnect=True) - - - # close - def close(self): - try: - self.cur.close() - self.conn.close() - except: - type, value, traceBack = sys.exc_info() - _logger.error("close : %s %s" % (type,value)) - - - # commit - def _commit(self): - try: - self.conn.commit() - return True - except: - _logger.error("commit error") - return False - - - # rollback - def _rollback(self): - try: - self.conn.rollback() - return True - except: - _logger.error("rollback error") - return False - diff --git a/current/pandaserver/taskbuffer/PrioUtil.py b/current/pandaserver/taskbuffer/PrioUtil.py deleted file mode 100644 index ac8d99d5f..000000000 --- a/current/pandaserver/taskbuffer/PrioUtil.py +++ /dev/null @@ -1,4 +0,0 @@ -# calculate priority for user jobs -def calculatePriority(priorityOffset,serNum,weight): - priority = 1000 + priorityOffset - (serNum / 5) - int(100 * weight) - return priority diff --git a/current/pandaserver/taskbuffer/ProcessGroups.py b/current/pandaserver/taskbuffer/ProcessGroups.py deleted file mode 
100644 index 1318ca0d1..000000000 --- a/current/pandaserver/taskbuffer/ProcessGroups.py +++ /dev/null @@ -1,101 +0,0 @@ -processGroups = [('others', []), - ('evgensimul', ['evgen','simul']), - ('reprocessing', ['reprocessing']), - ('test', ['prod_test','rc_test','validation']), - ('mcore', ['mcore']), - ('group', ['group']), - ] - -# source labels used for panda internal purpose -internalSourceLabels = ['ddm'] - -# maximum number of debug jobs per user -maxDebugJobs = 3 - -# maximum number of debug jobs for prod role -maxDebugProdJobs = 30 - -# extension level for GP -extensionLevel_1 = 1 - - -# get corresponding group -def getProcessGroup(valGroup): - tmpGroup = None - for tmpKey,tmpList in processGroups: - # set default - if tmpGroup == None: - tmpGroup = tmpKey - continue - if valGroup in tmpList: - tmpGroup = tmpKey - break - # return - return tmpGroup - - -# convert cloud and processingType for extended PG -def converCPTforEPG(cloud,processingType,coreCount,workingGroup=None): - if coreCount in [0,1,None]: - # use group queue for GP jobs - if workingGroup != None and workingGroup.startswith('GP_'): - return cloud,'group' - return cloud,processingType - else: - # use MCORE queue for MPC jobs in all clouds - return "ALL","mcore" - - -# count the number of jobs per group -def countJobsPerGroup(valMap): - ret = {} - # loop over all clouds - for cloud,cloudVal in valMap.iteritems(): - # add cloud - if not ret.has_key(cloud): - ret[cloud] = {} - # loop over all sites - for site,siteVal in cloudVal.iteritems(): - # add site - if not ret[cloud].has_key(site): - ret[cloud][site] = {} - # loop over all types - for pType,typeVal in siteVal.iteritems(): - # get process group - tmpGroup = getProcessGroup(pType) - # add group - if not ret[cloud][site].has_key(tmpGroup): - ret[cloud][site][tmpGroup] = {} - # loop over all status - for jobStatus,statVal in typeVal.iteritems(): - if not ret[cloud][site][tmpGroup].has_key(jobStatus): - ret[cloud][site][tmpGroup][jobStatus] = 0 
- # add - ret[cloud][site][tmpGroup][jobStatus] += statVal - # return - return ret - - -# count the number of jobs per group for analysis -def countJobsPerGroupForAnal(valMap): - ret = {} - # loop over all sites - for site,siteVal in valMap.iteritems(): - # add site - if not ret.has_key(site): - ret[site] = {} - # loop over all types - for pType,typeVal in siteVal.iteritems(): - # get process group - tmpGroup = getProcessGroup(pType) - # add group - if not ret[site].has_key(tmpGroup): - ret[site][tmpGroup] = {} - # loop over all status - for jobStatus,statVal in typeVal.iteritems(): - if not ret[site][tmpGroup].has_key(jobStatus): - ret[site][tmpGroup][jobStatus] = 0 - # add - ret[site][tmpGroup][jobStatus] += statVal - # return - return ret diff --git a/current/pandaserver/taskbuffer/SQLDumper.py b/current/pandaserver/taskbuffer/SQLDumper.py deleted file mode 100644 index 16240d1be..000000000 --- a/current/pandaserver/taskbuffer/SQLDumper.py +++ /dev/null @@ -1,22 +0,0 @@ -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('SQLDumper') - -class SQLDumper(object): - def __init__(self,cur): - self.cursor = cur - def __iter__(self): - return self - def next(self): - return self.cursor.next() - def my_execute(self,sql,var={}): - _logger.debug('SQL=%s var=%s' % (sql,str(var))) - return self.cursor.execute(sql,var) - def __getattribute__(self,name): - if name == 'execute': - return object.__getattribute__(self,'my_execute') - elif name in ['cursor','__iter__','next']: - return object.__getattribute__(self,name) - else: - return getattr(self.cursor,name) diff --git a/current/pandaserver/taskbuffer/SiteSpec.py b/current/pandaserver/taskbuffer/SiteSpec.py deleted file mode 100644 index e261e08d0..000000000 --- a/current/pandaserver/taskbuffer/SiteSpec.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -site specification - -""" - -class SiteSpec(object): - # attributes - _attributes = 
('sitename','nickname','dq2url','cloud','ddm','lfchost','se','type','gatekeeper', - 'releases','memory','maxtime','status','space','retry','cmtconfig','setokens', - 'seprodpath','glexec','priorityoffset','allowedgroups','defaulttoken','queue', - 'localqueue','validatedreleases','accesscontrol','copysetup','maxinputsize', - 'cachedse','allowdirectaccess','comment','cloudlist','statusmodtime','lfcregister', - 'countryGroup','availableCPU','pledgedCPU','coreCount','reliabilityLevel', - 'iscvmfs','transferringlimit') - - # constructor - def __init__(self): - # install attributes - for attr in self._attributes: - setattr(self,attr,None) - - # serialize - def __str__(self): - str = '' - for attr in self._attributes: - str += '%s:%s ' % (attr,getattr(self,attr)) - return str - - - - diff --git a/current/pandaserver/taskbuffer/TaskBuffer.py b/current/pandaserver/taskbuffer/TaskBuffer.py deleted file mode 100755 index 9c03a1b35..000000000 --- a/current/pandaserver/taskbuffer/TaskBuffer.py +++ /dev/null @@ -1,2294 +0,0 @@ -import re -import sys -import types -import shlex -import datetime -import ProcessGroups -from threading import Lock -from DBProxyPool import DBProxyPool -from brokerage.SiteMapper import SiteMapper -from dataservice.Setupper import Setupper -from dataservice.Closer import Closer -from dataservice.TaLauncher import TaLauncher -from dataservice.ProcessLimiter import ProcessLimiter - -# logger -from pandalogger.PandaLogger import PandaLogger -_logger = PandaLogger().getLogger('TaskBuffer') - - -class TaskBuffer: - """ - task queue - - """ - - # constructor - def __init__(self): - self.proxyPool = None - self.lock = Lock() - self.processLimiter = None - - - # initialize - def init(self,dbname,dbpass,nDBConnection=10,useTimeout=False): - # lock - self.lock.acquire() - # create Proxy Pool - if self.proxyPool == None: - self.proxyPool = DBProxyPool(dbname,dbpass,nDBConnection,useTimeout) - # create process limiter - if self.processLimiter == None: - 
self.processLimiter = ProcessLimiter() - # release - self.lock.release() - - - # check production role - def checkProdRole(self,fqans): - for fqan in fqans: - # check production role - match = re.search('/([^/]+)/Role=production',fqan) - if match != None: - return True,match.group(1) - return False,None - - - # get priority parameters for user - def getPrioParameters(self,jobs,user,fqans,userDefinedWG,validWorkingGroup): - withProdRole = False - workingGroup = None - priorityOffset = 0 - serNum = 0 - weight = None - # get DB proxy - proxy = self.proxyPool.getProxy() - # check production role - withProdRole,workingGroup = self.checkProdRole(fqans) - if withProdRole: - # check dataset name - for tmpFile in jobs[-1].Files: - if tmpFile.type in ['output','log'] and not tmpFile.lfn.startswith('group'): - # reset - withProdRole,workingGroup = False,None - break - # set high prioryty for production role - """ - if withProdRole: - serNum = 0 - weight = 0.0 - priorityOffset = 2000 - """ - # reset nJob/weight for HC - if jobs[0].processingType in ['hammercloud','gangarobot'] \ - or jobs[0].processingType.startswith('gangarobot-'): - serNum = 0 - weight = 0.0 - if jobs[0].processingType in ['gangarobot','gangarobot-pft']: - priorityOffset = 3000 - # check quota - if weight == None: - weight = proxy.checkQuota(user) - # get nJob - if userDefinedWG and validWorkingGroup: - serNum = proxy.getNumberJobsUser(user,workingGroup=jobs[0].workingGroup) - else: - serNum = proxy.getNumberJobsUser(user,workingGroup=None) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return withProdRole,workingGroup,priorityOffset,serNum,weight - - - # store Jobs into DB - def storeJobs(self,jobs,user,joinThr=False,forkSetupper=False,fqans=[],hostname='',resetLocInSetupper=False, - checkSpecialHandling=True,toPending=False): - try: - _logger.debug("storeJobs : start for %s nJobs=%s" % (user,len(jobs))) - # check quota for priority calculation - weight = 0.0 - userJobID = -1 - userJobsetID 
= -1 - userStatus = True - priorityOffset = 0 - userVO = 'atlas' - userCountry = None - useExpress = False - nExpressJobs = 0 - useDebugMode = False - # check ban user except internally generated jobs - if len(jobs) > 0 and not jobs[0].prodSourceLabel in ProcessGroups.internalSourceLabels: - # get DB proxy - proxy = self.proxyPool.getProxy() - # check user status - tmpStatus = proxy.checkBanUser(user,jobs[0].prodSourceLabel) - # release proxy - self.proxyPool.putProxy(proxy) - # return if DN is blocked - if not tmpStatus: - _logger.debug("storeJobs : end for %s DN is blocked 1" % user) - return [] - # set parameters for user jobs - if len(jobs) > 0 and (jobs[0].prodSourceLabel in ['user','panda','ptest','rc_test','ssc']) \ - and (not jobs[0].processingType in ['merge','unmerge']): - # get DB proxy - proxy = self.proxyPool.getProxy() - # get JobID and status - userJobID,userJobsetID,userStatus = proxy.getUserParameter(user,jobs[0].jobDefinitionID,jobs[0].jobsetID) - # get site access - userSiteAccess = proxy.checkSiteAccess(jobs[0].computingSite,user) - # check quota for express jobs - if 'express' in jobs[0].specialHandling: - expressQuota = proxy.getExpressJobs(user) - if expressQuota != None and expressQuota['status'] and expressQuota['quota'] > 0: - nExpressJobs = expressQuota['quota'] - if nExpressJobs > 0: - useExpress = True - # debug mode - if 'debug' in jobs[0].specialHandling: - debugJobList = proxy.getActiveDebugJobs(user) - if len(debugJobList) < ProcessGroups.maxDebugJobs: - useDebugMode = True - # release proxy - self.proxyPool.putProxy(proxy) - # get site spec - siteMapper = SiteMapper(self) - tmpSiteSpec = siteMapper.getSite(jobs[0].computingSite) - # check allowed groups - if userStatus and hasattr(tmpSiteSpec,'allowedgroups') and (not tmpSiteSpec.allowedgroups in ['',None]): - # set status to False when allowedgroups is defined - userStatus = False - # loop over all groups - for tmpGroup in tmpSiteSpec.allowedgroups.split(','): - if tmpGroup == '': 
- continue - # loop over all FQANs - for tmpFQAN in fqans: - if re.search('^%s' % tmpGroup,tmpFQAN) != None: - userStatus = True - break - # escape - if userStatus: - break - # get priority offset - if hasattr(tmpSiteSpec,'priorityoffset') and (not tmpSiteSpec.priorityoffset in ['',None]): - # loop over all groups - for tmpGP in tmpSiteSpec.priorityoffset.split(','): - if tmpGP == '': - continue - # get group and offset - tmpGroup = tmpGP.split(':')[0] - try: - tmpOffset = int(tmpGP.split(':')[-1]) - except: - tmpOffset = 0 - # loop over all FQANs - for tmpFQAN in fqans: - _logger.debug(tmpFQAN) - if re.search('^%s/' % tmpGroup,tmpFQAN) != None or \ - re.search('%s$' % tmpGroup,tmpFQAN) != None: - # use the largest offset - if tmpOffset > priorityOffset: - priorityOffset = tmpOffset - break - # check site access - if hasattr(tmpSiteSpec,'accesscontrol') and tmpSiteSpec.accesscontrol == 'grouplist': - if userSiteAccess == {} or userSiteAccess['status'] != 'approved': - # user is not allowed - userStatus = False - # set priority offset - if userStatus: - if userSiteAccess.has_key('poffset') and userSiteAccess['poffset'] > priorityOffset: - priorityOffset = userSiteAccess['poffset'] - # extract country group - for tmpFQAN in fqans: - match = re.search('^/atlas/([^/]+)/',tmpFQAN) - if match != None: - tmpCountry = match.group(1) - # use country code or usatlas - if len(tmpCountry) == 2: - userCountry = tmpCountry - break - # usatlas - if tmpCountry in ['usatlas']: - userCountry = 'us' - break - # return if DN is blocked - if not userStatus: - _logger.debug("storeJobs : end for %s DN is blocked 2" % user) - return [] - # extract VO - for tmpFQAN in fqans: - match = re.search('^/([^/]+)/',tmpFQAN) - if match != None: - userVO = match.group(1) - break - # get number of jobs currently in PandaDB - serNum = 0 - userDefinedWG = False - validWorkingGroup = False - usingBuild = False - withProdRole = False - workingGroup = None - if len(jobs) > 0 and (jobs[0].prodSourceLabel 
in ['user','panda']) \ - and (not jobs[0].processingType in ['merge','unmerge']): - # check workingGroup - if not jobs[0].workingGroup in ['',None,'NULL']: - userDefinedWG = True - if userSiteAccess != {}: - if userSiteAccess['status'] == 'approved' and jobs[0].workingGroup in userSiteAccess['workingGroups']: - # valid workingGroup - validWorkingGroup = True - # using build for analysis - if jobs[0].prodSourceLabel == 'panda': - usingBuild = True - # get priority parameters for user - withProdRole,workingGroup,priorityOffset,serNum,weight = self.getPrioParameters(jobs,user,fqans,userDefinedWG, - validWorkingGroup) - # get DB proxy - proxy = self.proxyPool.getProxy() - # get group job serial number - groupJobSerialNum = 0 - if len(jobs) > 0 and (jobs[0].prodSourceLabel in ['user','panda']) \ - and (not jobs[0].processingType in ['merge','unmerge']): - for tmpFile in jobs[-1].Files: - if tmpFile.type in ['output','log'] and '$GROUPJOBSN' in tmpFile.lfn: - tmpSnRet = proxy.getSerialNumberForGroupJob(user) - if tmpSnRet['status']: - groupJobSerialNum = tmpSnRet['sn'] - break - # loop over all jobs - ret =[] - newJobs=[] - usePandaDDM = False - firstLiveLog = True - nRunJob = 0 - for job in jobs: - # set JobID. 
keep original JobID when retry - if userJobID != -1 and job.prodSourceLabel in ['user','panda'] \ - and (job.attemptNr in [0,'0','NULL'] or (not job.jobExecutionID in [0,'0','NULL'])) \ - and (not jobs[0].processingType in ['merge','unmerge']): - job.jobDefinitionID = userJobID - # set jobsetID - if job.prodSourceLabel in ['user','panda','ptest','rc_test']: - job.jobsetID = userJobsetID - # set specialHandling - if job.prodSourceLabel in ['user','panda']: - if checkSpecialHandling: - specialHandling = '' - # debug mode - if useDebugMode and nRunJob == 0 and job.prodSourceLabel == 'user': - specialHandling += 'debug,' - # express mode - if useExpress and (nRunJob < nExpressJobs or job.prodSourceLabel == 'panda'): - specialHandling += 'express,' - # reset specialHandling - specialHandling = specialHandling[:-1] - job.specialHandling = specialHandling - if job.prodSourceLabel != 'panda': - nRunJob += 1 - # set relocation flag - if job.computingSite != 'NULL': - job.relocationFlag = 1 - # protection agains empty jobParameters - if job.jobParameters in ['',None,'NULL']: - job.jobParameters = ' ' - # set country group and nJobs (=taskID) - if job.prodSourceLabel in ['user','panda']: - job.countryGroup = userCountry - # set workingGroup - if not validWorkingGroup: - if withProdRole: - # set country group if submitted with production role - job.workingGroup = workingGroup - else: - if userDefinedWG: - # reset invalid working group - job.workingGroup = None - # set nJobs (=taskID) - if usingBuild: - tmpNumBuild = 1 - tmpNunRun = len(jobs) - 1 - else: - tmpNumBuild = 0 - tmpNunRun = len(jobs) - # encode - job.taskID = tmpNumBuild + (tmpNunRun << 1) - # change TRF URL just in case - if job.transformation.startswith('http://www.usatlas.bnl.gov/svn/panda/pathena/trf'): - job.transformation = re.sub('^http://www.usatlas.bnl.gov/svn/panda/pathena/trf/', - 'http://pandaserver.cern.ch:25080/trf/user/', - job.transformation) - # set hostname - if hostname != '': - job.creationHost = 
hostname - # insert job to DB - if not proxy.insertNewJob(job,user,serNum,weight,priorityOffset,userVO,groupJobSerialNum, - toPending): - # reset if failed - job.PandaID = None - else: - # live log - if job.prodSourceLabel in ['user','panda']: - if ' --liveLog ' in job.jobParameters: - # enable liveLog only for the first one - if firstLiveLog: - # set file name - repPatt = ' --liveLog stdout.%s ' % job.PandaID - else: - # remove the option - repPatt = ' ' - job.jobParameters = re.sub(' --liveLog ',repPatt,job.jobParameters) - firstLiveLog = False - # append - newJobs.append(job) - if job.prodSourceLabel in ['user','panda','ptest','rc_test']: - ret.append((job.PandaID,job.jobDefinitionID,{'jobsetID':job.jobsetID})) - else: - ret.append((job.PandaID,job.jobDefinitionID,job.jobName)) - serNum += 1 - # release DB proxy - self.proxyPool.putProxy(proxy) - # set up dataset - if not toPending: - if joinThr: - thr = Setupper(self,newJobs,pandaDDM=usePandaDDM,forkRun=forkSetupper,resetLocation=resetLocInSetupper) - thr.start() - thr.join() - else: - # cannot use 'thr =' because it may trigger garbage collector - Setupper(self,newJobs,pandaDDM=usePandaDDM,forkRun=forkSetupper,resetLocation=resetLocInSetupper).start() - # return jobIDs - _logger.debug("storeJobs : end for %s succeeded" % user) - return ret - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("storeJobs : %s %s" % (errType,errValue)) - return "ERROR: ServerError with storeJobs" - - - # lock jobs for reassign - def lockJobsForReassign(self,tableName,timeLimit,statList,labels,processTypes,sites,clouds): - # get DB proxy - proxy = self.proxyPool.getProxy() - # exec - res = proxy.lockJobsForReassign(tableName,timeLimit,statList,labels,processTypes,sites,clouds) - # release DB proxy - self.proxyPool.putProxy(proxy) - # return - return res - - - # lock jobs for finisher - def lockJobsForFinisher(self,timeNow,rownum,highPrio): - # get DB proxy - proxy = self.proxyPool.getProxy() - # exec - res = 
proxy.lockJobsForFinisher(timeNow,rownum,highPrio) - # release DB proxy - self.proxyPool.putProxy(proxy) - # return - return res - - - # get number of activated/defined jobs with output datasets - def getNumWaitingJobsWithOutDS(self,outputDSs): - # get DB proxy - proxy = self.proxyPool.getProxy() - # exec - res = proxy.getNumWaitingJobsWithOutDS(outputDSs) - # release DB proxy - self.proxyPool.putProxy(proxy) - # return - return res - - - # resubmit jobs - def resubmitJobs(self,jobIDs): - # get DB proxy - proxy = self.proxyPool.getProxy() - jobs=[] - # get jobs - for jobID in jobIDs: - res = proxy.peekJob(jobID,True,False,False,False) - if res: - jobs.append(res) - # release DB proxy - self.proxyPool.putProxy(proxy) - # set up dataset - if len(jobs) > 0: - Setupper(self,jobs).start() - # return jobIDs - return True - - - # update overall job information - def updateJobs(self,jobs,inJobsDefined): - # get DB proxy - proxy = self.proxyPool.getProxy() - # loop over all jobs - returns = [] - ddmIDs = [] - ddmAttempt = 0 - newMover = None - for job in jobs: - # update DB - tmpddmIDs = [] - if job.jobStatus == 'failed' and job.prodSourceLabel == 'user' and not inJobsDefined: - # keep failed analy jobs in Active4 - ret = proxy.updateJob(job,inJobsDefined) - elif job.jobStatus in ['finished','failed','cancelled']: - ret,tmpddmIDs,ddmAttempt,newMover = proxy.archiveJob(job,inJobsDefined) - else: - ret = proxy.updateJob(job,inJobsDefined) - returns.append(ret) - # collect IDs for reassign - if ret: - ddmIDs += tmpddmIDs - # release proxy - self.proxyPool.putProxy(proxy) - # retry mover - if newMover != None: - self.storeJobs([newMover],None,joinThr=True) - # reassign jobs when ddm failed - if ddmIDs != []: - self.reassignJobs(ddmIDs,ddmAttempt,joinThr=True) - # return - return returns - - - # update job jobStatus only - def updateJobStatus(self,jobID,jobStatus,param,updateStateChange=False,attemptNr=None): - # get DB proxy - proxy = self.proxyPool.getProxy() - # update DB and 
buffer - if re.match('^finished$',jobStatus,re.I) or re.match('^failed$',jobStatus,re.I): - ret = proxy.archiveJobLite(jobID,jobStatus,param) - else: - ret = proxy.updateJobStatus(jobID,jobStatus,param,updateStateChange,attemptNr) - # release proxy - self.proxyPool.putProxy(proxy) - return ret - - - # finalize pending analysis jobs - def finalizePendingJobs(self,prodUserName,jobDefinitionID): - # get DB proxy - proxy = self.proxyPool.getProxy() - # update DB - ret = proxy.finalizePendingJobs(prodUserName,jobDefinitionID) - # release proxy - self.proxyPool.putProxy(proxy) - return ret - - - # retry job - def retryJob(self,jobID,param,failedInActive=False,changeJobInMem=False,inMemJob=None, - getNewPandaID=False,attemptNr=None): - # get DB proxy - proxy = self.proxyPool.getProxy() - # update DB - ret = proxy.retryJob(jobID,param,failedInActive,changeJobInMem,inMemJob, - getNewPandaID,attemptNr) - # release proxy - self.proxyPool.putProxy(proxy) - return ret - - - # retry failed analysis jobs in Active4 - def retryJobsInActive(self,prodUserName,jobDefinitionID): - # get DB proxy - proxy = self.proxyPool.getProxy() - # update DB - ret = proxy.retryJobsInActive(prodUserName,jobDefinitionID) - # release proxy - self.proxyPool.putProxy(proxy) - return ret - - - # activate jobs - def activateJobs(self,jobs): - # get DB proxy - proxy = self.proxyPool.getProxy() - # loop over all jobs - returns = [] - for job in jobs: - # update DB - ret = proxy.activateJob(job) - returns.append(ret) - # release proxy - self.proxyPool.putProxy(proxy) - return returns - - - # send jobs to jobsWaiting - def keepJobs(self,jobs): - # get DB proxy - proxy = self.proxyPool.getProxy() - # loop over all jobs - returns = [] - for job in jobs: - # update DB - ret = proxy.keepJob(job) - returns.append(ret) - # release proxy - self.proxyPool.putProxy(proxy) - return returns - - - # delete stalled jobs - def deleteStalledJobs(self,libFileName): - # get DB proxy - proxy = self.proxyPool.getProxy() - # 
execute - ret = proxy.deleteStalledJobs(libFileName) - # release proxy - self.proxyPool.putProxy(proxy) - return ret - - - # set debug mode - def setDebugMode(self,dn,pandaID,prodManager,modeOn): - # get DB proxy - proxy = self.proxyPool.getProxy() - # check the number of debug jobs - if modeOn == True: - jobList = proxy.getActiveDebugJobs(dn) - else: - jobList = [] - if (not prodManager and len(jobList) >= ProcessGroups.maxDebugJobs) or \ - (prodManager and len(jobList) >= ProcessGroups.maxDebugProdJobs): - # exceeded - retStr = 'You already hit the limit on the maximum number of debug subjobs per ' - if not prodManager: - retStr += 'user (%s). ' % ProcessGroups.maxDebugJobs - else: - retStr += 'prod user (%s). ' % ProcessGroups.maxDebugProdJobs - retStr += 'Please set the debug mode off for one of the following PandaIDs : ' - for tmpID in jobList: - retStr += '%s,' % tmpID - retStr = retStr[:-1] - else: - # execute - retStr = proxy.setDebugMode(dn,pandaID,prodManager,modeOn) - # release proxy - self.proxyPool.putProxy(proxy) - return retStr - - - # get jobs - def getJobs(self,nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, - atlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup,allowOtherCountry): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get waiting jobs - jobs,nSent = proxy.getJobs(nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, - atlasRelease,prodUserID,countryGroup,workingGroup,allowOtherCountry) - # release proxy - self.proxyPool.putProxy(proxy) - # get Proxy Key - proxyKey = {} - if getProxyKey and len(jobs) > 0: - # get MetaDB proxy - proxy = self.proxyPool.getProxy() - # get Proxy Key - proxyKey = proxy.getProxyKey(jobs[0].prodUserID) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return jobs+[nSent,proxyKey] - - - # run task assignment - def runTaskAssignment(self,jobs): - # get DB proxy - proxy = self.proxyPool.getProxy() - # loop over all jobs - 
retList =[] - newJobs =[] - for job in jobs: - ret = None - if not job.taskID in ['NULL',0,'']: - # get cloud - cloudTask = proxy.getCloudTask(job.taskID) - if cloudTask != None and cloudTask.status == 'assigned': - ret = cloudTask.cloud - if ret == None: - # append for TA - newJobs.append(job) - retList.append(ret) - # release DB proxy - self.proxyPool.putProxy(proxy) - # run setupper - if newJobs != []: - TaLauncher(self,newJobs).start() - # return clouds - return retList - - - # reset modification time of a task to shorten retry interval - def resetTmodCloudTask(self,tid): - # get DBproxy - proxy = self.proxyPool.getProxy() - # run - res = proxy.resetTmodCloudTask(tid) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return res - - - # get assigning task - def getAssigningTask(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # run - res = proxy.getAssigningTask() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return res - - - # get fareshare policy - def getFaresharePolicy(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # run - res = proxy.getFaresharePolicy(True) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return res - - - # check merge job generation status - def checkMergeGenerationStatus(self,dn,jobID): - # return for NA - retNA = {'status':'NA','mergeIDs':[]} - try: - # get at most 2 PandaIDs - idStatus = self.getPandIDsWithJobID(dn,jobID,2) - if idStatus == {}: - return retNA - # use larger PandaID which corresponds to runXYZ - tmpKeys = idStatus.keys() - tmpKeys.sort() - pandaID = tmpKeys[-1] - # get job - tmpJobs = self.getFullJobStatus([pandaID]) - if tmpJobs == [] or tmpJobs[0] == None: - return retNA - pandaJob = tmpJobs[0] - # non-merge job - if not '--mergeOutput' in pandaJob.jobParameters: - return retNA - # loop over all sub datasets - subDsList = [] - mergeStatus = None - mergeIDs = [] - for tmpFile in pandaJob.Files: - if tmpFile.type in ['output','log']: - if not 
tmpFile.destinationDBlock in subDsList: - subDsList.append(tmpFile.destinationDBlock) - # get dataset - tmpDsSpec = self.queryDatasetWithMap({'name':tmpFile.destinationDBlock}) - if tmpDsSpec != None: - if tmpDsSpec.status in ['tobemerged']: - # going to be merged - mergeStatus = 'generating' - mergeIDs = [] - elif tmpDsSpec.status in ['tobeclosed','closed','completed']: - # another dataset from --individualOutDS is waiting for Merger - if mergeStatus == 'generating': - continue - # set status - mergeStatus = 'generated' - # collect JobIDs of merge jobs - tmpMergeID = tmpDsSpec.MoverID - if not tmpMergeID in [0,None,'NULL']+mergeIDs: - mergeIDs.append(tmpMergeID) - # no merger most likely because jobs were killed - if mergeStatus == 'generated' and mergeIDs == []: - mergeStatus = 'aborted' - # jobs are still runnign - if mergeStatus == None: - mergeStatus = 'standby' - # return - return {'status':mergeStatus,'mergeIDs':mergeIDs} - except: - return retNA - - - # get job status - def getJobStatus(self,jobIDs,fromDefined=True,fromActive=True,fromArchived=True,fromWaiting=True): - # get DBproxy - proxy = self.proxyPool.getProxy() - retStatus = [] - # peek at job - for jobID in jobIDs: - res = proxy.peekJob(jobID,fromDefined,fromActive,fromArchived,fromWaiting) - if res: - retStatus.append(res.jobStatus) - else: - retStatus.append(None) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retStatus - - - # peek at jobs - def peekJobs(self,jobIDs,fromDefined=True,fromActive=True,fromArchived=True,fromWaiting=True,forAnal=False): - # get DBproxy - proxy = self.proxyPool.getProxy() - retJobs = [] - # peek at job - for jobID in jobIDs: - res = proxy.peekJob(jobID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal) - if res: - retJobs.append(res) - else: - retJobs.append(None) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retJobs - - - # get PandaID with jobexeID - def getPandaIDwithJobExeID(self,jobexeIDs): - # get DBproxy - 
proxy = self.proxyPool.getProxy() - retJobs = [] - # peek at job - for jobexeID in jobexeIDs: - res = proxy.getPandaIDwithJobExeID(jobexeID) - retJobs.append(res) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retJobs - - - # get slimmed file info with PandaIDs - def getSlimmedFileInfoPandaIDs(self,pandaIDs): - iPandaID = 0 - nPandaID = 100 - retInfo = {} - while iPandaID < len(pandaIDs): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - tmpRetInfo = proxy.getSlimmedFileInfoPandaIDs(pandaIDs[iPandaID:iPandaID+nPandaID]) - # release proxy - self.proxyPool.putProxy(proxy) - iPandaID += nPandaID - if retInfo == {}: - retInfo = tmpRetInfo - else: - for outKey in tmpRetInfo.keys(): - if not retInfo.has_key(outKey): - retInfo[outKey] = [] - # append - for tmpItemRetInfo in tmpRetInfo[outKey]: - if not tmpItemRetInfo in retInfo[outKey]: - retInfo[outKey].append(tmpItemRetInfo) - # return - return retInfo - - - # get JobIDs in a time range - def getJobIDsInTimeRange(self,dn,timeRangeStr): - # check DN - if dn in ['NULL','','None',None]: - return [] - # check timeRange - match = re.match('^(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)$',timeRangeStr) - if match == None: - return [] - timeRange = datetime.datetime(year = int(match.group(1)), - month = int(match.group(2)), - day = int(match.group(3)), - hour = int(match.group(4)), - minute = int(match.group(5)), - second = int(match.group(6))) - # max range is 3 months - maxRange = datetime.datetime.utcnow() - datetime.timedelta(days=30) - if timeRange < maxRange: - timeRange = maxRange - retJobIDs = [] - # get DBproxy - proxy = self.proxyPool.getProxy() - # get JobIDs - retJobIDs = proxy.getJobIDsInTimeRange(dn,timeRange,retJobIDs) - # release proxy - self.proxyPool.putProxy(proxy) - # read ARCH when time window is more than 3days (- 3 hours as a margin) - if timeRange < datetime.datetime.utcnow() - datetime.timedelta(days=2,hours=21) : - # get ArchiveDBproxy - proxy = self.proxyPool.getProxy() - # 
get JobIDs - retJobIDs = proxy.getJobIDsInTimeRangeLog(dn,timeRange,retJobIDs) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retJobIDs - - - # get PandaIDs for a JobID - def getPandIDsWithJobID(self,dn,jobID,nJobs): - idStatus = {} - # check DN - if dn in ['NULL','','None',None]: - return idStatus - # check JobID - try: - jobID = long(jobID) - nJobs = long(nJobs) - except: - return idStatus - # get DBproxy - proxy = self.proxyPool.getProxy() - # get IDs - idStatus,buildJobID = proxy.getPandIDsWithJobID(dn,jobID,idStatus,nJobs) - # release proxy - self.proxyPool.putProxy(proxy) - # get ArchiveDBproxy - proxy = self.proxyPool.getProxy() - # get IDs - idStatus = proxy.getPandIDsWithJobIDLog(dn,jobID,idStatus,nJobs,buildJobID) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return idStatus - - - # get PandaIDs for a JobsetID or JobdefID in jobsArchived - def getPandIDsWithIdInArch(self,prodUserName,id,isJobset): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getPandIDsWithIdInArch(prodUserName,id,isJobset) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get beyond pledge resource ratio - # ! 
this method is not thread-safe - def getPledgeResourceRatio(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getPledgeResourceRatio() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return proxy.beyondPledgeRatio - - - # get the number of waiting jobs with a dataset - def getNumWaitingJobsForPD2P(self,datasetName): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - nJobs = proxy.getNumWaitingJobsForPD2P(datasetName) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return nJobs - - - # get the number of waiting jobsets with a dataset - def getNumWaitingJobsetsForPD2P(self,datasetName): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - nJobs = proxy.getNumWaitingJobsetsForPD2P(datasetName) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return nJobs - - - # lock job for re-brokerage - def lockJobForReBrokerage(self,dn,jobID,simulation,forceOpt,forFailed=False): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get IDs - ret = proxy.lockJobForReBrokerage(dn,jobID,simulation,forceOpt,forFailed) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # reset buildJob for re-brokerage - def resetBuildJobForReBrokerage(self,pandaID): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get IDs - ret = proxy.resetBuildJobForReBrokerage(pandaID) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get PandaIDs using libDS for re-brokerage - def getPandaIDsForReBrokerage(self,userName,jobID,fromActive,forFailed=False): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get IDs - ret = proxy.getPandaIDsForReBrokerage(userName,jobID,fromActive,forFailed) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get input datasets for rebroerage - def getInDatasetsForReBrokerage(self,jobID,userName): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get IDs - ret = 
proxy.getInDatasetsForReBrokerage(jobID,userName) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get outDSs with userName/jobID - def getOutDSsForReBrokerage(self,userName,jobID): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get IDs - ret = proxy.getOutDSsForReBrokerage(userName,jobID) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get full job status - def getFullJobStatus(self,jobIDs,fromDefined=True,fromActive=True,fromArchived=True,fromWaiting=True,forAnal=True): - retJobMap = {} - # peek at job - for jobID in jobIDs: - # get DBproxy for each job to avoid occupying connection for long time - proxy = self.proxyPool.getProxy() - # peek job - res = proxy.peekJob(jobID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal) - retJobMap[jobID] = res - # release proxy - self.proxyPool.putProxy(proxy) - # get IDs - for jobID in jobIDs: - if retJobMap[jobID] == None: - # get ArchiveDBproxy - proxy = self.proxyPool.getProxy() - # peek job - res = proxy.peekJobLog(jobID) - retJobMap[jobID] = res - # release proxy - self.proxyPool.putProxy(proxy) - # sort - retJobs = [] - for jobID in jobIDs: - retJobs.append(retJobMap[jobID]) - # return - return retJobs - - - # get script for offline running - def getScriptOfflineRunning(self,pandaID): - try: - # get job - tmpJobs = self.getFullJobStatus([pandaID]) - if tmpJobs == [] or tmpJobs[0] == None: - return "ERROR: Cannot get PandaID=%s in DB for the last 30 days" % pandaID - tmpJob = tmpJobs[0] - # check prodSourceLabel - if not tmpJob.prodSourceLabel in ['managed','test']: - return "ERROR: Non production job : prodSourceLabel=%s. 
This method is only for production jobs" % tmpJob.prodSourceLabel - # release and trf - tmpRels = tmpJob.homepackage.split("\n") - tmpPars = tmpJob.jobParameters.split("\n") - tmpTrfs = tmpJob.transformation.split("\n") - if not (len(tmpRels) == len(tmpPars) == len(tmpTrfs)): - return "ERROR: The number of releases or parameters or trfs is inconsitent with others" - # construct script - scrStr = "#retrieve inputs\n\n" - # collect inputs - dsFileMap = {} - for tmpFile in tmpJob.Files: - if tmpFile.type=='input': - if not dsFileMap.has_key(tmpFile.dataset): - dsFileMap[tmpFile.dataset] = [] - if not tmpFile.lfn in dsFileMap[tmpFile.dataset]: - dsFileMap[tmpFile.dataset].append(tmpFile.lfn) - # dq2 - for tmpDS,tmpFileList in dsFileMap.iteritems(): - scrStr += "dq2-get --files " - for tmpLFN in tmpFileList: - scrStr += "%s," % tmpLFN - scrStr = scrStr[:-1] - scrStr += " %s\n" % tmpDS - # ln - for tmpLFN in tmpFileList: - scrStr += "ln -fs %s*/%s ./%s\n" % (tmpDS.rstrip("/"),tmpLFN,tmpLFN) - scrStr += "\n#transform commands\n\n" - bitNum = '32' - if 'x86_64' in tmpJob.cmtConfig: - bitNum = '64' - for tmpIdx,tmpRel in enumerate(tmpRels): - # asetup - scrStr += "asetup %s,%s,%s\n" % tuple(tmpRel.split("/")+[bitNum]) - # athenaMP - if not tmpJob.coreCount in ['NULL',None] and tmpJob.coreCount > 1: - scrStr += "export ATHENA_PROC_NUMBER=%s\n" % tmpJob.coreCount - # add double quotes for zsh - tmpParamStr = tmpPars[tmpIdx] - tmpSplitter = shlex.shlex(tmpParamStr, posix=True) - tmpSplitter.whitespace = ' ' - tmpSplitter.whitespace_split = True - # loop for params - for tmpItem in tmpSplitter: - tmpMatch = re.search('^([^=]+=)(.+)$',tmpItem) - if tmpMatch != None: - tmpArgName = tmpMatch.group(1) - tmpArgVal = tmpMatch.group(2) - tmpArgIdx = tmpParamStr.find(tmpArgName) + len(tmpArgName) - # add " - if tmpParamStr[tmpArgIdx] != '"': - tmpParamStr = tmpParamStr.replace(tmpMatch.group(0), - tmpArgName+'"'+tmpArgVal+'"') - # run trf - scrStr += "%s %s\n\n" % 
(tmpTrfs[tmpIdx],tmpParamStr) - return scrStr - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("getScriptOfflineRunning : %s %s" % (errType,errValue)) - return "ERROR: ServerError with getScriptOfflineRunning" - - - # kill jobs - def killJobs(self,ids,user,code,prodManager,wgProdRole=[]): - # get DBproxy - proxy = self.proxyPool.getProxy() - rets = [] - # kill jobs - pandaIDforCloserMap = {} - for id in ids: - ret,userInfo = proxy.killJob(id,user,code,prodManager,True,wgProdRole) - rets.append(ret) - if ret and userInfo['prodSourceLabel'] in ['user','managed','test']: - jobIDKey = (userInfo['prodUserID'],userInfo['jobDefinitionID'],userInfo['jobsetID']) - if not pandaIDforCloserMap.has_key(jobIDKey): - pandaIDforCloserMap[jobIDKey] = id - # release proxy - self.proxyPool.putProxy(proxy) - # run Closer - try: - if pandaIDforCloserMap != {}: - for pandaIDforCloser in pandaIDforCloserMap.values(): - tmpJobs = self.peekJobs([pandaIDforCloser]) - tmpJob = tmpJobs[0] - if tmpJob != None: - tmpDestDBlocks = [] - # get destDBlock - for tmpFile in tmpJob.Files: - if tmpFile.type in ['output','log']: - if not tmpFile.destinationDBlock in tmpDestDBlocks: - tmpDestDBlocks.append(tmpFile.destinationDBlock) - # run - cThr = Closer(self,tmpDestDBlocks,tmpJob) - cThr.start() - cThr.join() - except: - pass - # return - return rets - - - # reassign jobs - def reassignJobs(self,ids,attempt=0,joinThr=False,forkSetupper=False,forPending=False): - # get DBproxy - proxy = self.proxyPool.getProxy() - jobs = [] - oldSubMap = {} - # keep old assignment - keepSiteFlag = False - if (attempt % 2) != 0: - keepSiteFlag = True - # reset jobs - for id in ids: - try: - # try to reset active job - if not forPending: - tmpRet = proxy.resetJob(id,keepSite=keepSiteFlag,getOldSubs=True) - if isinstance(tmpRet,types.TupleType): - ret,tmpOldSubList = tmpRet - else: - ret,tmpOldSubList = tmpRet,[] - if ret != None: - jobs.append(ret) - for tmpOldSub in tmpOldSubList: - if not 
oldSubMap.has_key(tmpOldSub): - oldSubMap[tmpOldSub] = ret - continue - # try to reset waiting job - tmpRet = proxy.resetJob(id,False,keepSite=keepSiteFlag,getOldSubs=False,forPending=forPending) - if isinstance(tmpRet,types.TupleType): - ret,tmpOldSubList = tmpRet - else: - ret,tmpOldSubList = tmpRet,[] - if ret != None: - jobs.append(ret) - # waiting jobs don't create sub or dis - continue - # try to reset defined job - if not forPending: - tmpRet = proxy.resetDefinedJob(id,keepSite=keepSiteFlag,getOldSubs=True) - if isinstance(tmpRet,types.TupleType): - ret,tmpOldSubList = tmpRet - else: - ret,tmpOldSubList = tmpRet,[] - if ret != None: - jobs.append(ret) - for tmpOldSub in tmpOldSubList: - if not oldSubMap.has_key(tmpOldSub): - oldSubMap[tmpOldSub] = ret - continue - except: - pass - # release DB proxy - self.proxyPool.putProxy(proxy) - # run Closer for old sub datasets - if not forPending: - for tmpOldSub,tmpJob in oldSubMap.iteritems(): - cThr = Closer(self,[tmpOldSub],tmpJob) - cThr.start() - cThr.join() - # setup dataset - if jobs != []: - if joinThr: - thr = Setupper(self,jobs,resubmit=True,ddmAttempt=attempt,forkRun=forkSetupper) - thr.start() - thr.join() - else: - # cannot use 'thr =' because it may trigger garbage collector - Setupper(self,jobs,resubmit=True,ddmAttempt=attempt,forkRun=forkSetupper).start() - # return - return True - - - # awake jobs in jobsWaiting - def awakeJobs(self,ids): - # get DBproxy - proxy = self.proxyPool.getProxy() - jobs = [] - # reset jobs - for id in ids: - # try to reset waiting job - ret = proxy.resetJob(id,False) - if ret != None: - jobs.append(ret) - # release DB proxy - self.proxyPool.putProxy(proxy) - # setup dataset - Setupper(self,jobs).start() - # return - return True - - - # query PandaIDs - def queryPandaIDs(self,jobDefIDs): - # get DBproxy - proxy = self.proxyPool.getProxy() - pandaIDs = [] - # query PandaID - for jobDefID in jobDefIDs: - id = proxy.queryPandaID(jobDefID) - pandaIDs.append(id) - # release proxy 
- self.proxyPool.putProxy(proxy) - # return - return pandaIDs - - - # query job info per cloud - def queryJobInfoPerCloud(self,cloud,schedulerID=None): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query job info - ret = proxy.queryJobInfoPerCloud(cloud,schedulerID) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get PandaIDs to be updated in prodDB - def getPandaIDsForProdDB(self,limit,lockedby): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query PandaID - ret = proxy.getPandaIDsForProdDB(limit,lockedby) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # update prodDBUpdateTime - def updateProdDBUpdateTimes(self,paramList): - retList = [] - # get DBproxy - proxy = self.proxyPool.getProxy() - # update - for param in paramList: - ret = proxy.updateProdDBUpdateTime(param) - retList.append(ret) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # get PandaIDs at Site - def getPandaIDsSite(self,site,status,limit): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query PandaID - ids = proxy.getPandaIDsSite(site,status,limit) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ids - - - # get input files currently in used for analysis - def getFilesInUseForAnal(self,outDataset): - # get DBproxy - proxy = self.proxyPool.getProxy() - retList = [] - # query LFNs - retList = proxy.getFilesInUseForAnal(outDataset) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # get list of dis dataset to get input files in shadow - def getDisInUseForAnal(self,outDataset): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query dis - retList = proxy.getDisInUseForAnal(outDataset) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # get input LFNs currently in use for analysis with shadow dis - def getLFNsInUseForAnal(self,inputDisList): - # get DBproxy - proxy = 
self.proxyPool.getProxy() - # query dis - retList = proxy.getLFNsInUseForAnal(inputDisList) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # update input files and return corresponding PandaIDs - def updateInFilesReturnPandaIDs(self,dataset,status,fileLFN=''): - # get DBproxy - proxy = self.proxyPool.getProxy() - retList = [] - # query PandaID - retList = proxy.updateInFilesReturnPandaIDs(dataset,status,fileLFN) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # update file status in dispatch dataset - def updateFileStatusInDisp(self,dataset,fileStatusMap): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query PandaID - retVal = proxy.updateFileStatusInDisp(dataset,fileStatusMap) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retVal - - - # update output files and return corresponding PandaIDs - def updateOutFilesReturnPandaIDs(self,dataset,fileLFN=''): - # get DBproxy - proxy = self.proxyPool.getProxy() - retList = [] - # query PandaID - retList = proxy.updateOutFilesReturnPandaIDs(dataset,fileLFN) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # get datasets associated with file - def getDatasetWithFile(self,lfn,jobPrioity=0): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query PandaID - retList = proxy.getDatasetWithFile(lfn,jobPrioity) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # get _dis datasets associated to _sub - def getAssociatedDisDatasets(self,subDsName): - # get DBproxy - proxy = self.proxyPool.getProxy() - retList = [] - # query - retList = proxy.getAssociatedDisDatasets(subDsName) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # insert sandbox file info - def insertSandboxFileInfo(self,userName,hostName,fileName,fileSize,checkSum): - # get DBproxy - proxy = self.proxyPool.getProxy() - # exec - ret= 
proxy.insertSandboxFileInfo(userName,hostName,fileName,fileSize,checkSum) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # check duplicated sandbox file - def checkSandboxFile(self,userName,fileSize,checkSum): - # get DBproxy - proxy = self.proxyPool.getProxy() - # exec - ret= proxy.checkSandboxFile(userName,fileSize,checkSum) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # insert datasets - def insertDatasets(self,datasets): - # get DBproxy - proxy = self.proxyPool.getProxy() - retList = [] - # insert - for dataset in datasets: - ret= proxy.insertDataset(dataset) - retList.append(ret) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # query Dataset - def queryDatasetWithMap(self,map): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query Dataset - ret = proxy.queryDatasetWithMap(map) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # query last files in a dataset - def queryLastFilesInDataset(self,datasets): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query files - ret = proxy.queryLastFilesInDataset(datasets) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # set GUIDs - def setGUIDs(self,files): - # get DBproxy - proxy = self.proxyPool.getProxy() - # set GUIDs - ret = proxy.setGUIDs(files) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # query PandaID with dataset - def queryPandaIDwithDataset(self,datasets): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query Dataset - ret = proxy.queryPandaIDwithDataset(datasets) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # query PandaID with filenames - def queryPandaIDwithLFN(self,lfns): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query Dataset - ret = proxy.queryPandaIDwithLFN(lfns) - # release proxy - self.proxyPool.putProxy(proxy) - # return - 
return ret - - - # update dataset - def updateDatasets(self,datasets,withLock=False,withCriteria="",criteriaMap={}): - # get DBproxy - proxy = self.proxyPool.getProxy() - # update Dataset - retList = proxy.updateDataset(datasets,withLock,withCriteria,criteriaMap) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # delete dataset - def deleteDatasets(self,datasets): - # get DBproxy - proxy = self.proxyPool.getProxy() - retList = [] - # query Dataset - for dataset in datasets: - ret = proxy.deleteDataset(dataset) - retList.append(ret) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # query files with map - def queryFilesWithMap(self,map): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query files - ret = proxy.queryFilesWithMap(map) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # count the number of files with map - def countFilesWithMap(self,map): - # get DBproxy - proxy = self.proxyPool.getProxy() - # query files - ret = proxy.countFilesWithMap(map) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # count the number of pending files - def countPendingFiles(self,pandaID,forInput=True): - # get DBproxy - proxy = self.proxyPool.getProxy() - # count files - ret = proxy.countPendingFiles(pandaID,forInput) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get serial number for dataset - def getSerialNumber(self,datasetname,definedFreshFlag=None): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.getSerialNumber(datasetname,definedFreshFlag) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get serial number for group job - def getSerialNumberForGroupJob(self,name): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.getSerialNumberForGroupJob(name) - # release proxy - 
self.proxyPool.putProxy(proxy) - # return - return ret - - - # add metadata - def addMetadata(self,ids,metadataList): - # get DBproxy - proxy = self.proxyPool.getProxy() - # add metadata - index = 0 - retList = [] - for id in ids: - ret = proxy.addMetadata(id,metadataList[index]) - retList.append(ret) - index += 1 - # release proxy - self.proxyPool.putProxy(proxy) - # return - return retList - - - # add stdout - def addStdOut(self,id,stdout): - # get DBproxy - proxy = self.proxyPool.getProxy() - # add - ret = proxy.addStdOut(id,stdout) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # extract name from DN - def cleanUserID(self,id): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.cleanUserID(id) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # extract scope from dataset name - def extractScope(self,name): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.extractScope(name) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # change job priorities - def changeJobPriorities(self,newPrioMap): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.changeJobPriorities(newPrioMap) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get destinationDBlockToken for a dataset - def getDestTokens(self,dsname): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get token - ret = proxy.getDestTokens(dsname) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get destinationSE for a dataset - def getDestSE(self,dsname,fromArch=False): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get token - ret = proxy.getDestSE(dsname,fromArch) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get job statistics - def 
getJobStatistics(self,archived=False,predefined=False,workingGroup='',countryGroup='',jobType='',forAnal=None,minPriority=None): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.getJobStatistics(archived,predefined,workingGroup,countryGroup,jobType,forAnal,minPriority) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get job statistics with label - def getJobStatisticsWithLabel(self,siteStr=''): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.getJobStatisticsWithLabel(siteStr) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get job statistics for brokerage - def getJobStatisticsBrokerage(self,minPrio=None): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get stat - ret = proxy.getJobStatisticsBrokerage(minPrio) - # release proxy - self.proxyPool.putProxy(proxy) - # convert - conRet = ProcessGroups.countJobsPerGroup(ret) - # return - return conRet - - - # get job statistics for analysis brokerage - def getJobStatisticsAnalBrokerage(self,minPriority=None): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get stat - ret = proxy.getJobStatisticsAnalBrokerage(minPriority=minPriority) - # release proxy - self.proxyPool.putProxy(proxy) - # convert - conRet = ProcessGroups.countJobsPerGroupForAnal(ret) - # return - return conRet - - - # get the number of waiting jobs per site and user - def getJobStatisticsPerUserSite(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get stat - ret = proxy.getJobStatisticsPerUserSite() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get highest prio jobs - def getHighestPrioJobStat(self,perPG=False,useMorePG=False): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get stat - if not perPG: - ret = proxy.getHighestPrioJobStat() - else: - ret = proxy.getHighestPrioJobStatPerPG(useMorePG) - # release proxy - 
self.proxyPool.putProxy(proxy) - # return - return ret - - - # get queued analysis jobs at a site - def getQueuedAnalJobs(self,site,dn): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get stat - ret = proxy.getQueuedAnalJobs(site,dn) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get job statistics for ExtIF - def getJobStatisticsForExtIF(self,sourcetype=None): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.getJobStatisticsForExtIF(sourcetype) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get job statistics for Bamboo - def getJobStatisticsForBamboo(self,useMorePG=False): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.getJobStatisticsPerProcessingType(useMorePG) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get number of analysis jobs per user - def getNUserJobs(self,siteName,nJobs): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get number of analysis jobs per user - tmpRet = proxy.getNUserJobs(siteName,nJobs) - # release proxy - self.proxyPool.putProxy(proxy) - # get log proxy - proxy = self.proxyPool.getProxy() - # get Proxy Key - ret = {} - for userID,nJobs in tmpRet.iteritems(): - proxyKey = proxy.getProxyKey(userID) - if proxyKey != {}: - # add nJobs - proxyKey['nJobs'] = nJobs - # append - ret[userID] = proxyKey - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get number of activated analysis jobs - def getNAnalysisJobs(self,nProcesses): - # get DBproxy - proxy = self.proxyPool.getProxy() - # count - ret = proxy.getNAnalysisJobs(nProcesses) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # update transfer status for a dataset - def updateTransferStatus(self,datasetname,bitMap): - # get DBproxy - proxy = self.proxyPool.getProxy() - # update - ret = 
proxy.updateTransferStatus(datasetname,bitMap) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get CloudTask - def getCloudTask(self,tid): - # get DBproxy - proxy = self.proxyPool.getProxy() - # count - ret = proxy.getCloudTask(tid) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # set cloud to CloudTask - def setCloudTask(self,cloudTask): - # get DBproxy - proxy = self.proxyPool.getProxy() - # count - ret = proxy.setCloudTask(cloudTask) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # see CloudTask - def seeCloudTask(self,tid): - # get DBproxy - proxy = self.proxyPool.getProxy() - # count - ret = proxy.seeCloudTask(tid) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # set cloud to CloudTask by user - def setCloudTaskByUser(self,user,tid,cloud,status): - # get DBproxy - proxy = self.proxyPool.getProxy() - # count - ret = proxy.setCloudTaskByUser(user,tid,cloud,status) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # update site data - def updateSiteData(self,hostID,pilotRequests): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.updateSiteData(hostID,pilotRequests) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get current site data - def getCurrentSiteData(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.getCurrentSiteData() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # insert nRunning in site data - def insertnRunningInSiteData(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get serial number - ret = proxy.insertnRunningInSiteData() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get nRunning in site data - def getnRunningInSiteData(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - 
# get serial number - ret = proxy.getnRunningInSiteData() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get site list - def getSiteList(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get site info - ret = proxy.getSiteList() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get site info - def getSiteInfo(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get site info - ret = proxy.getSiteInfo() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get cloud list - def getCloudList(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get cloud list - ret = proxy.getCloudList() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # check sites with release/cache - def checkSitesWithRelease(self,sites,releases=None,caches=None,cmtConfig=None): - # get DBproxy - proxy = self.proxyPool.getProxy() - # check - ret = proxy.checkSitesWithRelease(sites,releases,caches,cmtConfig) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get sites with release/cache in cloud - def getSitesWithReleaseInCloud(self,cloud,releases=None,caches=None,validation=False): - # get DBproxy - proxy = self.proxyPool.getProxy() - # check - ret = proxy.getSitesWithReleaseInCloud(cloud,releases,caches,validation) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get list of cache prefix - def getCachePrefixes(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # check - ret = proxy.getCachePrefixes() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get pilot owners - def getPilotOwners(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get pilot owners - ret = proxy.getPilotOwners() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get allowed nodes - def getAllowedNodes(self): - # get 
DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getAllowedNodes() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get email address - def getEmailAddr(self,name): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getEmailAddr(name) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get client version - def getPandaClientVer(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getPandaClientVer() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # register proxy key - def registerProxyKey(self,params): - # get DBproxy - proxy = self.proxyPool.getProxy() - # register proxy key - ret = proxy.registerProxyKey(params) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # register proxy key - def registerProxyKey(self,params): - # get DBproxy - proxy = self.proxyPool.getProxy() - # register proxy key - ret = proxy.registerProxyKey(params) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get proxy key - def getProxyKey(self,dn): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get proxy key - ret = proxy.getProxyKey(dn) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # add account to siteaccess - def addSiteAccess(self,siteID,dn): - # get DBproxy - proxy = self.proxyPool.getProxy() - # add account to siteaccess - ret = proxy.addSiteAccess(siteID,dn) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # list site access - def listSiteAccess(self,siteid,dn,longFormat=False): - # get DBproxy - proxy = self.proxyPool.getProxy() - # list site access - ret = proxy.listSiteAccess(siteid,dn,longFormat) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # update site access - def updateSiteAccess(self,method,siteid,requesterDN,userName,attrValue): - # get 
DBproxy - proxy = self.proxyPool.getProxy() - # update site access - ret = proxy.updateSiteAccess(method,siteid,requesterDN,userName,attrValue) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # generate pilot token - def genPilotToken(self,schedulerhost,scheduleruser,schedulerid): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.genPilotToken(schedulerhost,scheduleruser,schedulerid) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # add files to memcached - def addFilesToMemcached(self,site,node,files): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.addFilesToMemcached(site,node,files) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # delete files from memcached - def deleteFilesFromMemcached(self,site,node,files): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.deleteFilesFromMemcached(site,node,files) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # flush memcached - def flushMemcached(self,site,node): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.flushMemcached(site,node) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - # check files with memcached - def checkFilesWithMemcached(self,site,node,files): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.checkFilesWithMemcached(site,node,files) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get list of scheduler users - def getListSchedUsers(self): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getListSchedUsers() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # query an SQL return Status - def querySQLS(self,sql,varMap,arraySize=1000): - # get DBproxy - proxy = self.proxyPool.getProxy() - # get - ret = 
proxy.querySQLS(sql,varMap,arraySize) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # check quota - def checkQuota(self,dn): - # query an SQL return Status - proxy = self.proxyPool.getProxy() - # get - ret = proxy.checkQuota(dn) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get JobID for user - def getJobIdUser(self,dn): - # query an SQL return Status - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getJobIdUser(dn) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get user subscriptions - def getUserSubscriptions(self,datasetName,timeRange): - # query an SQL return Status - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getUserSubscriptions(datasetName,timeRange) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get the number of user subscriptions - def getNumUserSubscriptions(self): - # query an SQL return Status - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getNumUserSubscriptions() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # add user subscriptions - def addUserSubscription(self,datasetName,dq2IDs): - # query an SQL return Status - proxy = self.proxyPool.getProxy() - # get - ret = proxy.addUserSubscription(datasetName,dq2IDs) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # increment counter for subscription - def incrementUsedCounterSubscription(self,datasetName): - # query an SQL return Status - proxy = self.proxyPool.getProxy() - # get - ret = proxy.incrementUsedCounterSubscription(datasetName) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get active datasets - def getActiveDatasets(self,computingSite,prodSourceLabel): - # query an SQL return Status - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getActiveDatasets(computingSite,prodSourceLabel) - # release proxy - 
self.proxyPool.putProxy(proxy) - # return - return ret - - - # check status of all sub datasets to trigger Notifier - def checkDatasetStatusForNotifier(self,jobsetID,jobDefinitionID,prodUserName): - # query an SQL return Status - proxy = self.proxyPool.getProxy() - # get - ret = proxy.checkDatasetStatusForNotifier(jobsetID,jobDefinitionID,prodUserName) - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - - # get MoU share for T2 PD2P - def getMouShareForT2PD2P(self): - # query an SQL return Status - proxy = self.proxyPool.getProxy() - # get - ret = proxy.getMouShareForT2PD2P() - # release proxy - self.proxyPool.putProxy(proxy) - # return - return ret - - -# Singleton -taskBuffer = TaskBuffer() - diff --git a/current/pandaserver/taskbuffer/Utils.py b/current/pandaserver/taskbuffer/Utils.py deleted file mode 100755 index e3ad1efe9..000000000 --- a/current/pandaserver/taskbuffer/Utils.py +++ /dev/null @@ -1,512 +0,0 @@ -""" -utility service - -""" -import os -import re -import sys -import zlib -import uuid -import time -import socket -import struct -import datetime -import jobdispatcher.Protocol as Protocol -import ErrorCode -from userinterface import Client -from config import panda_config - -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('Utils') - -# check if server is alive -def isAlive(req): - return "alive=yes" - - -# extract name from DN -def cleanUserID(id): - try: - up = re.compile('/(DC|O|OU|C|L)=[^\/]+') - username = up.sub('', id) - up2 = re.compile('/CN=[0-9]+') - username = up2.sub('', username) - up3 = re.compile(' [0-9]+') - username = up3.sub('', username) - up4 = re.compile('_[0-9]+') - username = up4.sub('', username) - username = username.replace('/CN=proxy','') - username = username.replace('/CN=limited proxy','') - username = username.replace('limited proxy','') - username = re.sub('/CN=Robot:[^/]+','',username) - pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)') - mat = 
pat.match(username) - if mat: - username = mat.group(2) - else: - username = username.replace('/CN=','') - if username.lower().find('/email') > 0: - username = username[:username.lower().find('/email')] - pat = re.compile('.*(limited.*proxy).*') - mat = pat.match(username) - if mat: - username = mat.group(1) - username = username.replace('(','') - username = username.replace(')','') - username = username.replace("'",'') - return username - except: - return id - - -# insert with rety -def insertWithRetryCassa(familyName,keyName,valMap,msgStr,nTry=3): - for iTry in range(nTry): - try: - familyName.insert(keyName,valMap) - except pycassa.MaximumRetryException,tmpE: - if iTry+1 < nTry: - _logger.debug("%s sleep %s/%s" % (msgStr,iTry,nTry)) - time.sleep(30) - else: - raise pycassa.MaximumRetryException,tmpE.value - else: - break - - -# touch in Cassandra -def touchFileCassa(filefamily,fileKeyName,timeNow): - try: - # get old timestamp - oldFileInfo = filefamily.get(fileKeyName) - except: - _logger.warning('cannot get old fileinfo for %s from Cassandra' % fileKeyName) - return False - try: - # update time in fileTable - for splitIdx in range(oldFileInfo['nSplit']): - tmpFileKeyName = fileKeyName - if splitIdx != 0: - tmpFileKeyName += '_%s' % splitIdx - insertWithRetryCassa(filefamily,tmpFileKeyName, - {'year' : timeNow.year, - 'month' : timeNow.month, - 'day' : timeNow.day, - 'hour' : timeNow.hour, - 'minute' : timeNow.minute, - 'second' : timeNow.second}, - 'touchFileCassa : %s' % fileKeyName - ) - return True - except: - errType,errValue = sys.exc_info()[:2] - errStr = "cannot touch %s due to %s %s" % (fileKeyName,errType,errValue) - _logger.error(errStr) - return False - - -# upload file -def putFile(req,file): - if not Protocol.isSecure(req): - return False - if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']: - return False - _logger.debug("putFile : start %s %s" % (req.subprocess_env['SSL_CLIENT_S_DN'],file.filename)) - # size check - fullSizeLimit = 
768*1024*1024 - if not file.filename.startswith('sources.'): - noBuild = True - sizeLimit = 10*1024*1024 - else: - noBuild = False - sizeLimit = fullSizeLimit - # get file size - contentLength = 0 - try: - contentLength = long(req.headers_in["content-length"]) - except: - if req.headers_in.has_key("content-length"): - _logger.error("cannot get CL : %s" % req.headers_in["content-length"]) - else: - _logger.error("no CL") - _logger.debug("size %s" % contentLength) - if contentLength > sizeLimit: - errStr = "ERROR : Upload failure. Exceeded size limit %s>%s." % (contentLength,sizeLimit) - if noBuild: - errStr += " Please submit the job without --noBuild/--libDS since those options impose a tighter size limit" - else: - errStr += " Please remove redundant files from your workarea" - _logger.error(errStr) - _logger.debug("putFile : end") - return errStr - try: - fileFullPath = '%s/%s' % (panda_config.cache_dir,file.filename.split('/')[-1]) - # avoid overwriting - if os.path.exists(fileFullPath): - # touch - os.utime(fileFullPath,None) - # send error message - errStr = "ERROR : Cannot overwrite file" - _logger.debug('putFile : cannot overwrite file %s' % file.filename) - _logger.debug("putFile : end") - return errStr - # write - fo = open(fileFullPath,'wb') - fileContent = file.file.read() - fo.write(fileContent) - fo.close() - except: - errStr = "ERROR : Cannot write file" - _logger.error(errStr) - _logger.debug("putFile : end") - return errStr - # checksum - try: - # decode Footer - footer = fileContent[-8:] - checkSum,isize = struct.unpack("II",footer) - _logger.debug("CRC from gzip Footer %s" % checkSum) - except: - # calculate on the fly - """ - import zlib - checkSum = zlib.adler32(fileContent) & 0xFFFFFFFF - """ - # use None to avoid delay for now - checkSum = None - _logger.debug("CRC calculated %s" % checkSum) - # file size - fileSize = len(fileContent) - # user name - username = cleanUserID(req.subprocess_env['SSL_CLIENT_S_DN']) - _logger.debug("putFile : 
written dn=%s file=%s size=%s crc=%s" % \ - (username,file.filename,fileSize,checkSum)) - # put file info to DB - statClient,outClient = Client.insertSandboxFileInfo(username,file.filename, - fileSize,checkSum) - if statClient != 0 or outClient.startswith("ERROR"): - _logger.error("putFile : failed to put sandbox to DB with %s %s" % (statClient,outClient)) - #_logger.debug("putFile : end") - #return "ERROR : Cannot insert sandbox to DB" - else: - _logger.debug("putFile : inserted sandbox to DB with %s" % outClient) - # store to cassandra - if hasattr(panda_config,'cacheUseCassandra') and panda_config.cacheUseCassandra == True: - try: - # time-stamp - timeNow = datetime.datetime.utcnow() - creationTime = timeNow.strftime('%Y-%m-%d %H:%M:%S') - # user name - username = req.subprocess_env['SSL_CLIENT_S_DN'] - username = username.replace('/CN=proxy','') - username = username.replace('/CN=limited proxy','') - # file size - fileSize = len(fileContent) - # key - fileKeyName = file.filename.split('/')[-1] - sizeCheckSum = '%s:%s' % (fileSize,checkSum) - # insert to cassandra - import pycassa - pool = pycassa.ConnectionPool(panda_config.cacheKeySpace) - filefamily = pycassa.ColumnFamily(pool,panda_config.cacheFileTable) - # avoid overwriting - gotoNextCassa = True - if filefamily.get_count(fileKeyName) > 0: - # touch - touchFlag = touchFileCassa(filefamily,fileKeyName,timeNow) - if touchFlag: - gotoNextCassa = False - # send error message - errStr = "ERROR : Cannot overwrite file in Cassandra" - _logger.error(errStr) - if not panda_config.cacheIgnoreCassandraError: - _logger.debug("putFile : end") - return errStr - # check uniqueness with size and checksum - if gotoNextCassa: - try: - uniqExp = pycassa.index.create_index_expression('uniqID',sizeCheckSum) - userExp = pycassa.index.create_index_expression('user',username) - tmpClause = pycassa.index.create_index_clause([uniqExp,userExp]) - tmpResults = filefamily.get_indexed_slices(tmpClause,columns=['creationTime']) - for 
oldFileKeyName,tmpDict in tmpResults: - _logger.debug('The same size and chksum %s found in old:%s and new:%s' % \ - (sizeCheckSum,oldFileKeyName,fileKeyName)) - # touch - touchFlag = touchFileCassa(filefamily,oldFileKeyName,timeNow) - if touchFlag: - # make alias - _logger.debug('Making alias %s->%s' % (fileKeyName,oldFileKeyName)) - insertWithRetryCassa(filefamily,fileKeyName, - {'alias':oldFileKeyName, - 'creationTime':creationTime, - 'nSplit':0, - }, - 'putFile : make alias for %s' % file.filename - ) - # set time - touchFileCassa(filefamily,fileKeyName,timeNow) - _logger.debug("putFile : end") - return True - except: - gotoNextCassa = False - errType,errValue = sys.exc_info()[:2] - errStr = "cannot make alias for %s due to %s %s" % (fileKeyName,errType,errValue) - _logger.error(errStr) - if not panda_config.cacheIgnoreCassandraError: - _logger.debug("putFile : end") - return errStr - # insert new record - if gotoNextCassa: - splitIdx = 0 - splitSize = 5 * 1024 * 1024 - nSplit,tmpMod = divmod(len(fileContent),splitSize) - if tmpMod != 0: - nSplit += 1 - _logger.debug('Inserting %s with %s blocks' % (fileKeyName,nSplit)) - for splitIdx in range(nSplit): - # split to small chunks since cassandra is not good at large files - tmpFileContent = fileContent[splitSize*splitIdx:splitSize*(splitIdx+1)] - tmpFileKeyName = fileKeyName - tmpAttMap = {'file':tmpFileContent, - 'user':username, - 'creationTime':creationTime, - } - if splitIdx == 0: - tmpAttMap['size'] = fileSize - tmpAttMap['nSplit'] = nSplit - tmpAttMap['uniqID'] = sizeCheckSum - tmpAttMap['checkSum'] = str(checkSum) - else: - tmpFileKeyName += '_%s' % splitIdx - tmpAttMap['size'] = 0 - tmpAttMap['nSplit'] = 0 - # insert with retry - insertWithRetryCassa(filefamily,tmpFileKeyName,tmpAttMap, - 'putFile : insert %s' % file.filename) - # set time - touchFileCassa(filefamily,fileKeyName,timeNow) - except: - errType,errValue = sys.exc_info()[:2] - errStr = "cannot put %s into Cassandra due to %s %s" % 
(fileKeyName,errType,errValue) - _logger.error(errStr) - # send error message - errStr = "ERROR : " + errStr - if not panda_config.cacheIgnoreCassandraError: - _logger.debug("putFile : end") - return errStr - _logger.debug("putFile : %s end" % file.filename) - return True - - -# get file -def getFile(req,fileName): - _logger.debug("getFile : %s start" % fileName) - try: - # look into cassandra - import pycassa - pool = pycassa.ConnectionPool(panda_config.cacheKeySpace) - filefamily = pycassa.ColumnFamily(pool,panda_config.cacheFileTable) - fileInfo = filefamily.get(fileName) - # check alias - if fileInfo.has_key('alias') and fileInfo['alias'] != '': - realFileName = fileInfo['alias'] - fileInfo = filefamily.get(realFileName) - _logger.debug("getFile : %s use alias=%s" % (fileName,realFileName)) - else: - realFileName = fileName - # check cached file - hostKey = socket.gethostname() + '_cache' - if fileInfo.has_key(hostKey) and fileInfo[hostKey] != '': - _logger.debug("getFile : %s found cache=%s" % (fileName,fileInfo[hostKey])) - try: - fileFullPath = '%s%s' % (panda_config.cache_dir,fileInfo[hostKey]) - # touch - os.utime(fileFullPath,None) - _logger.debug("getFile : %s end" % fileName) - # return - return ErrorCode.EC_Redirect('/cache%s' % fileInfo[hostKey]) - except: - errtype,errvalue = sys.exc_info()[:2] - _logger.debug("getFile : %s failed to touch %s due to %s:%s" % (fileName,fileFullPath,errtype,errvalue)) - # write to cache file - fileRelPath = '/cassacache/%s' % str(uuid.uuid4()) - fileFullPath = '%s%s' % (panda_config.cache_dir,fileRelPath) - _logger.debug("getFile : %s write cache to %s" % (fileName,fileFullPath)) - fo = open(fileFullPath,'wb') - fo.write(fileInfo['file']) - if fileInfo['nSplit'] > 1: - for splitIdx in range(fileInfo['nSplit']): - if splitIdx == 0: - continue - fileInfo = filefamily.get(realFileName+'_%s' % splitIdx) - fo.write(fileInfo['file']) - fo.close() - # set cache name in DB - 
insertWithRetryCassa(filefamily,realFileName,{hostKey:fileRelPath}, - 'getFile : set cache for %s' % fileName) - _logger.debug("getFile : %s end" % fileName) - # return - return ErrorCode.EC_Redirect('/cache%s' % fileRelPath) - except pycassa.NotFoundException: - _logger.error("getFile : %s not found" % fileName) - return ErrorCode.EC_NotFound - except: - errtype,errvalue = sys.exc_info()[:2] - errStr = "getFile : %s %s for %s" % (errtype,errvalue,fileName) - _logger.error(errStr) - raise RuntimeError,errStr - - -# get event picking request -def putEventPickingRequest(req,runEventList='',eventPickDataType='',eventPickStreamName='', - eventPickDS='',eventPickAmiTag='',userDatasetName='',lockedBy='', - params='',inputFileList=''): - if not Protocol.isSecure(req): - return "ERROR : no HTTPS" - userName = req.subprocess_env['SSL_CLIENT_S_DN'] - creationTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') - _logger.debug("putEventPickingRequest : %s start" % userName) - # size check - sizeLimit = 10*1024*1024 - # get total size - try: - contentLength = long(req.headers_in["content-length"]) - except: - errStr = "cannot get content-length from HTTP request." - _logger.error("putEventPickingRequest : " + errStr + " " + userName) - _logger.debug("putEventPickingRequest : %s end" % userName) - return "ERROR : " + errStr - _logger.debug("size %s" % contentLength) - if contentLength > sizeLimit: - errStr = "Too large run/event list. Exceeded size limit %s>%s." 
% (contentLength,sizeLimit) - _logger.error("putEventPickingRequest : " + errStr + " " + userName) - _logger.debug("putEventPickingRequest : %s end" % userName) - return "ERROR : " + errStr - try: - # make filename - evpFileName = '%s/evp.%s' % (panda_config.cache_dir,str(uuid.uuid4())) - _logger.debug("putEventPickingRequest : %s -> %s" % (userName,evpFileName)) - # write - fo = open(evpFileName,'wb') - fo.write("userName=%s\n" % userName) - fo.write("creationTime=%s\n" % creationTime) - fo.write("eventPickDataType=%s\n" % eventPickDataType) - fo.write("eventPickStreamName=%s\n" % eventPickStreamName) - fo.write("eventPickDS=%s\n" % eventPickDS) - fo.write("eventPickAmiTag=%s\n" % eventPickAmiTag) - fo.write("userDatasetName=%s\n" % userDatasetName) - fo.write("lockedBy=%s\n" % lockedBy) - fo.write("params=%s\n" % params) - fo.write("inputFileList=%s\n" % inputFileList) - for tmpLine in runEventList.split('\n'): - tmpItems = tmpLine.split() - if len(tmpItems) != 2: - continue - fo.write("runEvent=%s,%s\n" % tuple(tmpItems)) - fo.close() - except: - errType,errValue = sys.exc_info()[:2] - errStr = "cannot put request due to %s %s" % (errType,errValue) - _logger.error("putEventPickingRequest : " + errStr + " " + userName) - return "ERROR : " + errStr - _logger.debug("putEventPickingRequest : %s end" % userName) - return True - - -# delete file -def deleteFile(req,file): - if not Protocol.isSecure(req): - return 'False' - try: - # may be reused for rebrokreage - #os.remove('%s/%s' % (panda_config.cache_dir,file.split('/')[-1])) - return 'True' - except: - return 'False' - - -# touch file -def touchFile(req,filename): - if not Protocol.isSecure(req): - return 'False' - try: - os.utime('%s/%s' % (panda_config.cache_dir,filename.split('/')[-1]),None) - return 'True' - except: - errtype,errvalue = sys.exc_info()[:2] - _logger.error("touchFile : %s %s" % (errtype,errvalue)) - return 'False' - - -# get server name:port for SSL -def getServer(req): - return "%s:%s" % 
(panda_config.pserverhost,panda_config.pserverport) - - -# update stdout -def updateLog(req,file): - _logger.debug("updateLog : %s start" % file.filename) - # write to file - try: - # expand - extStr = zlib.decompress(file.file.read()) - # stdout name - logName = '%s/%s' % (panda_config.cache_dir,file.filename.split('/')[-1]) - # append - ft = open(logName,'wa') - ft.write(extStr) - ft.close() - except: - type, value, traceBack = sys.exc_info() - _logger.error("updateLog : %s %s" % (type,value)) - _logger.debug("updateLog : %s end" % file.filename) - return True - - -# fetch stdout -def fetchLog(req,logName,offset=0): - _logger.debug("fetchLog : %s start offset=%s" % (logName,offset)) - # put dummy char to avoid Internal Server Error - retStr = ' ' - try: - # stdout name - fullLogName = '%s/%s' % (panda_config.cache_dir,logName.split('/')[-1]) - # read - ft = open(fullLogName,'r') - ft.seek(long(offset)) - retStr += ft.read() - ft.close() - except: - type, value, traceBack = sys.exc_info() - _logger.error("fetchLog : %s %s" % (type,value)) - _logger.debug("fetchLog : %s end read=%s" % (logName,len(retStr))) - return retStr - - -# get VOMS attributes -def getVomsAttr(req): - vomsAttrs = [] - for tmpKey,tmpVal in req.subprocess_env.iteritems(): - # compact credentials - if tmpKey.startswith('GRST_CRED_'): - vomsAttrs.append('%s : %s\n' % (tmpKey,tmpVal)) - vomsAttrs.sort() - retStr = '' - for tmpStr in vomsAttrs: - retStr += tmpStr - return retStr - - -# get all attributes -def getAttr(req): - allAttrs = [] - for tmpKey,tmpVal in req.subprocess_env.iteritems(): - allAttrs.append('%s : %s\n' % (tmpKey,tmpVal)) - allAttrs.sort() - retStr = '' - for tmpStr in allAttrs: - retStr += tmpStr - return retStr diff --git a/current/pandaserver/taskbuffer/WrappedPickle.py b/current/pandaserver/taskbuffer/WrappedPickle.py deleted file mode 100644 index a3e1fa12f..000000000 --- a/current/pandaserver/taskbuffer/WrappedPickle.py +++ /dev/null @@ -1,38 +0,0 @@ -import sys -import 
StringIO -import cPickle as pickle - -# wrapper to avoid de-serializing unsafe objects -class WrappedPickle(object): - # allowed modules and classes - allowedModClass = { - 'copy_reg' : ['_reconstructor'], - '__builtin__' : ['object'], - 'datetime' : ['datetime'], - 'taskbuffer.JobSpec' : ['JobSpec'], - 'taskbuffer.FileSpec' : ['FileSpec'], - } - - # check module and class - @classmethod - def find_class(cls,module,name): - # check module - if not cls.allowedModClass.has_key(module): - raise pickle.UnpicklingError,'Attempting to import disallowed module %s' % module - # import module - __import__(module) - mod = sys.modules[module] - # check class - if not name in cls.allowedModClass[module]: - raise pickle.UnpicklingError,'Attempting to get disallowed class %s in %s' % (name,module) - klass = getattr(mod,name) - return klass - - # loads - @classmethod - def loads(cls,pickle_string): - pickle_obj = pickle.Unpickler(StringIO.StringIO(pickle_string)) - pickle_obj.find_global = cls.find_class - return pickle_obj.load() - - diff --git a/current/pandaserver/taskbuffer/__init__.py b/current/pandaserver/taskbuffer/__init__.py deleted file mode 100755 index e69de29bb..000000000 diff --git a/current/pandaserver/test/XrdAna.py b/current/pandaserver/test/XrdAna.py deleted file mode 100755 index 37cea8021..000000000 --- a/current/pandaserver/test/XrdAna.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import re -import sys -import commands - -tarList = [] -realTime = [] -timeStamps = {} -for item in os.listdir('.'): - if item.endswith('log.tgz'): - commands.getoutput('tar xvfz %s' % item) - for dirItem in os.listdir('.'): - if os.path.isdir(dirItem): - foundTime = False - file = open('%s/pilot_child.stdout' % dirItem) - event = -1 - for line in file: - line = re.sub('\n','',line) - if line.startswith('AthenaEventLoopMgr INFO ===>>> start of event') \ - or line.startswith('Init Time :') or line.startswith('Wake Time :'): - #event = line.split()[-2] - event += 1 - match = 
re.search('Wake Time : \d{4}-\d{2}-\d{2} (\d{2}:\d{2}:\d{2}\.\d{3})',line) - if line.startswith('Exec Time :') or line.startswith('Init Time :') \ - or match != None: - if match != None: - timeVal = match.group(1) - else: - timeVal = line.split()[-1] - if not (int(event) < 10 or int(event) % 10 == 0): - continue - if not timeStamps.has_key(event): - timeStamps[event] = [] - timeStamps[event].append(timeVal) - if line.startswith('real'): - rT = re.sub('m',':',line.split()[-1]) - rT = re.sub('s','',rT) - realTime.append(rT) - file.close() - commands.getoutput('rm -rf %s' % dirItem) -outReal = open('real.txt','w') -for rT in realTime: - outReal.write('%s\n' % rT) -outReal.close() -nStamp = 0 -events = timeStamps.keys() -events.sort() -outStamp = open('stamp.txt','w') -for event in events: - stamps = timeStamps[event] - if nStamp == 0: - nStamp = len(stamps) - if nStamp != len(stamps): - print "ERROR : invalid nStamp %s %s" % (nStamp,len(stamps)) - str = '%s' % event - for s in stamps: - str += ',%s' % s - outStamp.write(str+'\n') -outStamp.close() diff --git a/current/pandaserver/test/XrdTest.py b/current/pandaserver/test/XrdTest.py deleted file mode 100755 index f377f46de..000000000 --- a/current/pandaserver/test/XrdTest.py +++ /dev/null @@ -1,65 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = "ANALY_BNL_ATLAS_1" - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = 'BNL_SE' - -jobDefinitionID = int(time.time()) % 10000 - -jobList = [] - -for i in range(2): - job = JobSpec() - job.jobDefinitionID = jobDefinitionID - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) - job.AtlasRelease = 'Atlas-12.0.6' - job.homepackage = 'AnalysisTransforms' - job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/runAthenaXrd' - job.destinationDBlock = 
datasetName - job.destinationSE = destName - job.currentPriority = 3000 - job.assignedPriority = 3000 - job.prodSourceLabel = 'user' - job.computingSite = site - - file = FileSpec() - file.lfn = "%s.AANT._%05d.root" % (job.jobName,i) - file.destinationDBlock = job.destinationDBlock - file.destinationSE = job.destinationSE - file.dataset = job.destinationDBlock - file.type = 'output' - job.addFile(file) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - job.addFile(fileOL) - - fileL = FileSpec() - fileL.dataset = 'user.TadashiMaeno.acas0003.lib._000134' - fileL.prodDBlock = fileL.dataset - fileL.lfn = 'user.TadashiMaeno.acas0003.lib._000134.lib.tgz' - fileL.type = 'input' - fileL.status = 'ready' - job.addFile(fileL) - - job.jobParameters=("-l %s " % fileL.lfn) + """-r run/ -j "%20AnalysisSkeleton_topOptions.py" -i "[]" -m "[]" -n "[]" -o "{'AANT': [('AANTupleStream', 'AANT', """ + ("""'%s')]}" -c""" % file.lfn) - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/activateBNL.py b/current/pandaserver/test/activateBNL.py deleted file mode 100755 index 55be46f85..000000000 --- a/current/pandaserver/test/activateBNL.py +++ /dev/null @@ -1,63 +0,0 @@ -import sys -import time -from dataservice.DDM import ddm -from taskbuffer.DBProxy import DBProxy -import userinterface.Client as Client -import urllib2,urllib,datetime,time -import jobscheduler.siteinfo -import jobscheduler.Site -import brokerage.broker_util - -# password -# A very minor edit. 
-from config import panda_config -passwd = panda_config.dbpasswd - -# instantiate DB proxies -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -# get PandaIDs from jobsDefined -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) -sql = "SELECT dispatchDBlock from jobsDefined4 WHERE jobStatus='assigned' AND prodSourceLabel='managed' " -sql += "AND (computingSite='BNL_ATLAS_1' OR computingSite='BNL_ATLAS_2') AND modificationTime<'%s' " -sql += "GROUP BY dispatchDBlock" - -res = proxyS.querySQL(sql % timeLimit.strftime('%Y-%m-%d %H:%M:%S')) - -# emulate DDM callbacks -for dispatchDBlock, in res: - print dispatchDBlock - time.sleep(5) - # get file list - status,out = ddm.dq2.main(['listFilesInDataset',dispatchDBlock]) - if status != 0 or out.startswith('Error'): - print out - continue - # make LFN list - lfns = [] - for line in out.split('\n'): - items = line.split() - if len(items) == 2: - lfns.append(items[1]) - # skip empty datasets - if len(lfns) == 0: - print "empty dataset" - continue - # get missing file - missLFNs = brokerage.broker_util.getMissLFNsFromLRC(lfns,jobscheduler.Site.KnownSite('BNL_ATLAS_2').getDQ2URL()) - if len(missLFNs) != 0: - print "some files are missing" - continue - # get VUID and creationdate - resvuid = proxyS.querySQL("SELECT vuid from Datasets WHERE name='%s'" % dispatchDBlock) - if len(resvuid) == 1: - vuid, = resvuid[0] - # make HTTP request - node={'vuid':vuid} - url=Client.baseURLSSL+'/datasetCompleted' - rdata=urllib.urlencode(node) - req=urllib2.Request(url) - # invoke callback - fd=urllib2.urlopen(req,rdata) - diff --git a/current/pandaserver/test/activateDefJobs.py b/current/pandaserver/test/activateDefJobs.py deleted file mode 100755 index d2d826c55..000000000 --- a/current/pandaserver/test/activateDefJobs.py +++ /dev/null @@ -1,36 +0,0 @@ -from taskbuffer.DBProxy import DBProxy -import userinterface.Client as Client -import 
urllib2,urllib,datetime,time - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# time limit -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) - -# instantiate DB proxies -proxyS = DBProxy() -proxyS.connect('adbpro.usatlas.bnl.gov',passwd,'panda-developer','PandaDevDB') - -# get PandaIDs from jobsDefined -res = proxyS.querySQL("SELECT dispatchDBlock from jobsDefined4 GROUP BY dispatchDBlock") - -# emulate DDM callbacks -jobs=[] -for dispatchDBlock, in res: - # get VUID and creationdate - resvuid = proxyS.querySQL("SELECT vuid,creationdate from Datasets WHERE name='%s'" % dispatchDBlock) - if len(resvuid) == 1: - vuid,creationdate = resvuid[0] - # convert creatindate to datetime - creation_datetime = datetime.datetime(*time.strptime(creationdate,'%Y-%m-%d %H:%M:%S')[:6]) - if creation_datetime < timeLimit: - # make HTTP request - node={'vuid':vuid} - url=Client.baseURLSSL+'/datasetCompleted' - rdata=urllib.urlencode(node) - req=urllib2.Request(url) - # invoke callback - fd=urllib2.urlopen(req,rdata) - diff --git a/current/pandaserver/test/activateDefJobs.sh b/current/pandaserver/test/activateDefJobs.sh deleted file mode 100755 index b2c1bc6bf..000000000 --- a/current/pandaserver/test/activateDefJobs.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -BASEPATH=/usatlas/u/sm/prod -BINPATH=/usatlas/u/sm/latest - -# for python -export PATH=$BINPATH/python/bin:$PATH -export PYTHONPATH=$BASEPATH/panda:$PYTHONPATH - -python $BASEPATH/panda/test/activateDefJobs.py diff --git a/current/pandaserver/test/activateJobs.py b/current/pandaserver/test/activateJobs.py deleted file mode 100755 index b33769d45..000000000 --- a/current/pandaserver/test/activateJobs.py +++ /dev/null @@ -1,41 +0,0 @@ -import sys - -from taskbuffer.DBProxy import DBProxy -import userinterface.Client as Client -import urllib2,urllib,datetime,time - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -if len(sys.argv) == 2: - 
startID = int(sys.argv[1]) - endID = startID -else: - startID = int(sys.argv[1]) - endID = int(sys.argv[2]) - if startID > endID: - print '%d is less than %d' % (endID,startID) - sys.exit(1) - -# instantiate DB proxies -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -# get PandaIDs from jobsDefined -res = proxyS.querySQL("SELECT dispatchDBlock from jobsDefined4 WHERE PandaID>=%s AND PandaID<=%s GROUP BY dispatchDBlock" % (startID,endID)) - -# emulate DDM callbacks -for dispatchDBlock, in res: - # get VUID and creationdate - resvuid = proxyS.querySQL("SELECT vuid from Datasets WHERE name='%s'" % dispatchDBlock) - if len(resvuid) == 1: - vuid, = resvuid[0] - # make HTTP request - node={'vuid':vuid} - url=Client.baseURLSSL+'/datasetCompleted' - rdata=urllib.urlencode(node) - req=urllib2.Request(url) - # invoke callback - fd=urllib2.urlopen(req,rdata) - diff --git a/current/pandaserver/test/activator.py b/current/pandaserver/test/activator.py deleted file mode 100755 index 8ad5292de..000000000 --- a/current/pandaserver/test/activator.py +++ /dev/null @@ -1,24 +0,0 @@ -import os -import re -import sys -import time -import datetime -import commands -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger -from dataservice.Activator import Activator - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -if len(sys.argv) != 2: - print "datasetname is required" - -dataset = taskBuffer.queryDatasetWithMap({'name':sys.argv[1]}) -thr = Activator(taskBuffer,dataset) -thr.start() -thr.join() diff --git a/current/pandaserver/test/add.py b/current/pandaserver/test/add.py deleted file mode 100755 index a3e1437e5..000000000 --- a/current/pandaserver/test/add.py +++ /dev/null @@ -1,434 +0,0 @@ -import os -import re -import sys -import time -import glob 
-import fcntl -import random -import datetime -import commands -import threading -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger -from dataservice.Adder2 import Adder -from brokerage.SiteMapper import SiteMapper - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# logger -_logger = PandaLogger().getLogger('add') - -_logger.debug("===================== start =====================") - -# overall timeout value -overallTimeout = 20 - -# current minute -currentMinute = datetime.datetime.utcnow().minute - -# kill old process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName) - for line in out.split('\n'): - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill process : %s %s" % (type,value)) - - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -# instantiate sitemapper -aSiteMapper = SiteMapper(taskBuffer) - -# delete -_logger.debug("Del session") -status,retSel = taskBuffer.querySQLS("SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4",{}) -if retSel != None: - try: - maxID = retSel[0][0] - _logger.debug("maxID : %s" % maxID) - if maxID != None: - varMap = {} - varMap[':maxID'] = maxID - 
varMap[':jobStatus1'] = 'activated' - varMap[':jobStatus2'] = 'waiting' - varMap[':jobStatus3'] = 'failed' - varMap[':jobStatus4'] = 'cancelled' - status,retDel = taskBuffer.querySQLS("DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",varMap) - except: - pass - -# count # of getJob/updateJob in dispatcher's log -try: - # don't update when logrotate is running - timeNow = datetime.datetime.utcnow() - logRotateTime = timeNow.replace(hour=3,minute=2,second=0,microsecond=0) - if (timeNow > logRotateTime and (timeNow-logRotateTime) < datetime.timedelta(minutes=5)) or \ - (logRotateTime > timeNow and (logRotateTime-timeNow) < datetime.timedelta(minutes=5)): - _logger.debug("skip pilotCounts session for logrotate") - else: - # log filename - dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) - # check if tgz is required - com = 'head -1 %s' % dispLogName - lostat,loout = commands.getstatusoutput(com) - useLogTgz = True - if lostat == 0: - match = re.search('^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',loout) - if match != None: - startTime = datetime.datetime(*time.strptime(match.group(0),'%Y-%m-%d %H:%M:%S')[:6]) - # current log contains all info - if startTime datetime.timedelta(minutes=1) and \ - (timeNow - modTime) < datetime.timedelta(hours=1): - cSt,cOut = commands.getstatusoutput('ps aux | grep fork | grep -v PYTH') - # if no process is running for the file - if cSt == 0 and not tmpName in cOut: - nThr += 1 - thr = ForkThr(tmpName) - thr.start() - forkThrList.append(thr) - if nThr > maxThr: - break - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s %s" % (errType,errValue)) - - -# thread pool -class ThreadPool: - def __init__(self): - self.lock = threading.Lock() - self.list = [] - - def add(self,obj): - self.lock.acquire() - self.list.append(obj) - self.lock.release() - - def 
remove(self,obj): - self.lock.acquire() - self.list.remove(obj) - self.lock.release() - - def join(self): - self.lock.acquire() - thrlist = tuple(self.list) - self.lock.release() - for thr in thrlist: - thr.join() - -# thread to adder -class AdderThr (threading.Thread): - def __init__(self,lock,pool,taskBuffer,aSiteMapper,pandaID,jobStatus,fileName,ignoreError=True): - threading.Thread.__init__(self) - self.lock = lock - self.pool = pool - self.pool.add(self) - self.adder = Adder(taskBuffer,pandaID,"",jobStatus,xmlFile=fileName, - ignoreDDMError=ignoreError,joinCloser=True,addOutput=True, - siteMapper=aSiteMapper) - - def run(self): - self.lock.acquire() - self.adder.start() - self.adder.join() - self.pool.remove(self) - self.lock.release() - - -# get buildJobs in the holding state -holdingAna = [] -varMap = {} -varMap[':prodSourceLabel'] = 'panda' -varMap[':jobStatus'] = 'holding' -status,res = taskBuffer.querySQLS("SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus",varMap) -if res != None: - for id, in res: - holdingAna.append(id) -_logger.debug("holding Ana %s " % holdingAna) - -# add files -_logger.debug("Adder session") -timeNow = datetime.datetime.utcnow() -timeInt = datetime.datetime.utcnow() -dirName = panda_config.logdir -fileList = os.listdir(dirName) -fileList.sort() -# remove duplicated files -tmpList = [] -uMap = {} -for file in fileList: - match = re.search('^(\d+)_([^_]+)_.{36}(_\d+)*$',file) - if match != None: - fileName = '%s/%s' % (dirName,file) - id = match.group(1) - if uMap.has_key(id): - try: - os.remove(fileName) - except: - pass - else: - uMap[id] = fileName - if long(id) in holdingAna: - # give a priority to buildJobs - tmpList.insert(0,file) - else: - tmpList.append(file) -nFixed = 50 -randTmp = tmpList[nFixed:] -random.shuffle(randTmp) -fileList = tmpList[:nFixed] + randTmp - -# create thread pool and semaphore -adderLock = threading.Semaphore(3) -adderThreadPool = ThreadPool() - 
-# add -while len(fileList) != 0: - # time limit to aviod too many copyArchve running at the sametime - if (datetime.datetime.utcnow() - timeNow) > datetime.timedelta(minutes=overallTimeout): - _logger.debug("time over in Adder session") - break - # try to get Semaphore - adderLock.acquire() - # get fileList - if (datetime.datetime.utcnow() - timeInt) > datetime.timedelta(minutes=15): - timeInt = datetime.datetime.utcnow() - # get file - fileList = os.listdir(dirName) - fileList.sort() - # remove duplicated files - tmpList = [] - uMap = {} - for file in fileList: - match = re.search('^(\d+)_([^_]+)_.{36}(_\d+)*$',file) - if match != None: - fileName = '%s/%s' % (dirName,file) - id = match.group(1) - if uMap.has_key(id): - try: - os.remove(fileName) - except: - pass - else: - uMap[id] = fileName - if long(id) in holdingAna: - # give a priority to buildJob - tmpList.insert(0,file) - else: - tmpList.append(file) - fileList = tmpList - # choose a file - file = fileList.pop(0) - # release lock - adderLock.release() - # check format - match = re.search('^(\d+)_([^_]+)_.{36}(_\d+)*$',file) - if match != None: - fileName = '%s/%s' % (dirName,file) - if not os.path.exists(fileName): - continue - try: - modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(fileName))[:7])) - if (timeNow - modTime) > datetime.timedelta(hours=24): - # last chance - _logger.debug("Last Add File : %s" % fileName) - thr = AdderThr(adderLock,adderThreadPool,taskBuffer,aSiteMapper,match.group(1), - match.group(2),fileName,False) - thr.start() - elif (timeInt - modTime) > datetime.timedelta(minutes=3): - # add - _logger.debug("Add File : %s" % fileName) - thr = AdderThr(adderLock,adderThreadPool,taskBuffer,aSiteMapper,match.group(1), - match.group(2),fileName) - thr.start() - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s %s" % (type,value)) - -# join all threads -adderThreadPool.join() - -# join sender -mailSender.join() - -# join fork threads -for thr in forkThrList: - 
thr.join() - -_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/add.sh b/current/pandaserver/test/add.sh deleted file mode 100755 index fed990df6..000000000 --- a/current/pandaserver/test/add.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# Panda home -export PANDA_HOME=/home/sm/prod - -# for python -export PYTHONPATH=$PANDA_HOME/panda:$PYTHONPATH - -python $PANDA_HOME/panda/test/add.py diff --git a/current/pandaserver/test/aho.xml b/current/pandaserver/test/aho.xml deleted file mode 100755 index 8bfd17333..000000000 --- a/current/pandaserver/test/aho.xml +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/current/pandaserver/test/analysis.py b/current/pandaserver/test/analysis.py deleted file mode 100755 index 91f498431..000000000 --- a/current/pandaserver/test/analysis.py +++ /dev/null @@ -1,78 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -jobList = [] -for i in range(2): - datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') - destName = 'ANALY_BNL_ATLAS_1' - - job = JobSpec() - job.jobDefinitionID = 1 - job.jobName = commands.getoutput('uuidgen') - job.AtlasRelease = 'Atlas-12.0.2' - job.homepackage = 'AnalysisTransforms' - job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/runAthena2' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.currentPriority = 3000 - job.prodSourceLabel = 'user' - job.computingSite = site - job.prodDBlock = 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103' - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen') - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - 
job.addFile(fileOL) - - fileOZ = FileSpec() - fileOZ.lfn = "AANT.%s.root" % commands.getoutput('uuidgen') - fileOZ.destinationDBlock = job.destinationDBlock - fileOZ.destinationSE = job.destinationSE - fileOZ.dataset = job.destinationDBlock - fileOZ.type = 'output' - job.addFile(fileOZ) - - files = [ - 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00001.pool.root.1', - 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00002.pool.root.1', - 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00003.pool.root.1', - ] - for lfn in files: - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - fileI.status = 'ready' - job.addFile(fileI) - - fileL = FileSpec() - fileL.dataset = 'user.TadashiMaeno.lib._000157' - fileL.prodDBlock = 'user.TadashiMaeno.lib._000157' - fileL.lfn = 'user.TadashiMaeno.lib._000157.lib.tgz' - fileL.type = 'input' - fileL.status = 'ready' - job.addFile(fileL) - - job.jobParameters=""" -l user.TadashiMaeno.lib._000157.lib.tgz -r run/ -j " AnalysisSkeleton_jobOptions.py" -i "['testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00001.pool.root.1', 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00002.pool.root.1', 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00003.pool.root.1']" -o "{'AANT': [('AANTupleStream', 'AANT', '%s')]}" """ % fileOZ.lfn - - jobList.append(job) - - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/analyzeLog.py b/current/pandaserver/test/analyzeLog.py deleted file mode 100755 index 8b9314e5c..000000000 --- a/current/pandaserver/test/analyzeLog.py +++ /dev/null @@ -1,55 +0,0 @@ -import re -from config import panda_config - -# analyze Setupper log -logSetupper = open('%s/panda-Setupper.log' % panda_config.logdir) -# extract subscriptions -mapSub = {} -mapDataset = {} -for line in logSetupper: - items = 
re.findall("'registerDatasetSubscription', '(.+_dis\d+)', '([^']+)'",line) - if len(items) != 0: - dataset = items[0][0] - siteID = items[0][1] - date = '%s %s' % tuple(re.split(' |,',line)[:2]) - if not mapSub.has_key(siteID): - mapSub[siteID] = [] - # append - mapSub[siteID].append(dataset) - mapDataset[dataset] = (date,False) -logSetupper.close() - -# analyze Activator log -logActivator = open('%s/panda-Activator.log' % panda_config.logdir) -# extract callbacks -for line in logActivator: - items = re.findall("start: (\S+_dis\d+)$",line) - if len(items) != 0: - dataset = items[0] - if dataset in mapDataset.keys(): - mapDataset[dataset] = mapDataset[dataset][:-1]+(True,) -logActivator.close() - -# print -for siteID in mapSub.keys(): - print "ID : %s" % siteID - nSucceed = 0 - failedSubs = [] - for dataset in mapSub[siteID]: - # succeeded - if mapDataset[dataset][-1:][0]: - nSucceed += 1 - # failed - else: - failedSubs.append((mapDataset[dataset][0],dataset)) - # statistics - print " Total:%d Succeeded:%d" % (len(mapSub[siteID]),nSucceed) - # not completed subscriptions - print " Not completed" - for item in failedSubs: - print " %s" % item[0] - print " %s" % item[1] - print - - - diff --git a/current/pandaserver/test/archivelogs.py b/current/pandaserver/test/archivelogs.py deleted file mode 100644 index 86d81d8ab..000000000 --- a/current/pandaserver/test/archivelogs.py +++ /dev/null @@ -1,45 +0,0 @@ -import re -import os -import glob -import stat -import commands - -from config import panda_config - -srcDir = panda_config.logdir -dstDir = '/tmp/logbackup' + srcDir - -logFiles = glob.glob(srcDir+'/*log.1.gz') - -# check time stamp -for logFile in logFiles: - baseName = logFile.split('/')[-1] - print "log name : %s" % baseName - targetFile = "%s/%s" % (dstDir,baseName) - # already exists - if os.path.exists(targetFile) and \ - os.stat(logFile)[stat.ST_SIZE] == os.stat(targetFile)[stat.ST_SIZE]: - com = 'cmp %s %s' % (logFile,targetFile) - cmpSt,cmpOut = 
commands.getstatusoutput(com) - if cmpSt == 0: - print " -> skip : already exists" - continue - # increment - maxIndex = 60 - if os.path.exists(targetFile): - templateName = re.sub('1\.gz$','%s.gz',baseName) - for tmpIdx in range(1,maxIndex): - renameSrc = dstDir + '/' + (templateName % (maxIndex-tmpIdx)) - renameDst = dstDir + '/' + (templateName % (maxIndex-tmpIdx+1)) - if os.path.exists(renameSrc): - com = 'mv -f %s %s' % (renameSrc,renameDst) - print com - print commands.getoutput(com) - # copy - com = 'cp -fp %s %s' % (logFile,dstDir) - print com - print commands.getoutput(com) - -# touch to avoid tmpwatch -com = 'touch %s/*' % dstDir -print commands.getoutput(com) diff --git a/current/pandaserver/test/backupJobArch.py b/current/pandaserver/test/backupJobArch.py deleted file mode 100755 index 6ebc8dac2..000000000 --- a/current/pandaserver/test/backupJobArch.py +++ /dev/null @@ -1,176 +0,0 @@ -import os -import re -import sys -import time -import fcntl -import types -import shelve -import random -import datetime -import commands -import threading -import userinterface.Client as Client -from dataservice.DDM import ddm -from dataservice.DDM import dashBorad -from taskbuffer.OraDBProxy import DBProxy -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger -from jobdispatcher.Watcher import Watcher -from brokerage.SiteMapper import SiteMapper -from dataservice.Adder import Adder -from dataservice.Finisher import Finisher -from dataservice.MailUtils import MailUtils -from taskbuffer import ProcessGroups -import brokerage.broker_util -import brokerage.broker -import taskbuffer.ErrorCode -import dataservice.DDM - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# logger -_logger = PandaLogger().getLogger('backupJobArch') - -_logger.debug("===================== start =====================") - -# memory checker -def _memoryCheck(str): - try: - proc_status = '/proc/%d/status' % os.getpid() - procfile = 
open(proc_status) - name = "" - vmSize = "" - vmRSS = "" - # extract Name,VmSize,VmRSS - for line in procfile: - if line.startswith("Name:"): - name = line.split()[-1] - continue - if line.startswith("VmSize:"): - vmSize = "" - for item in line.split()[1:]: - vmSize += item - continue - if line.startswith("VmRSS:"): - vmRSS = "" - for item in line.split()[1:]: - vmRSS += item - continue - procfile.close() - _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str)) - except: - type, value, traceBack = sys.exc_info() - _logger.error("memoryCheck() : %s %s" % (type,value)) - _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str)) - return - -_memoryCheck("start") - -# kill old dq2 process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('ps axo user,pid,lstart,args | grep dq2.clientapi | grep -v PYTHONPATH | grep -v grep') - for line in out.split('\n'): - if line == '': - continue - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old dq2 process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill dq2 process : %s %s" % (type,value)) - - -# kill old process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName) - for line in 
out.split('\n'): - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill process : %s %s" % (type,value)) - - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -# instantiate sitemapper -siteMapper = SiteMapper(taskBuffer) - - -# table names -jobATableName = "ATLAS_PANDAARCH.jobsArchived" -filesATableName = "ATLAS_PANDAARCH.filesTable_ARCH" -paramATableName = "ATLAS_PANDAARCH.jobParamsTable_ARCH" -metaATableName = "ATLAS_PANDAARCH.metaTable_ARCH" - -# time limit -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=3) - -# copy -_logger.debug("get PandaIDs for Archive") -varMap = {} -varMap[':archivedFlag'] = 0 -status,res = taskBuffer.querySQLS("SELECT PandaID,modificationTime FROM ATLAS_PANDA.jobsArchived4 WHERE archivedFlag=:archivedFlag ORDER BY PandaID", - varMap,arraySize=1000000) -if res == None: - _logger.debug("total %s " % res) -else: - _logger.debug("total %s " % len(res)) - # copy - tmpIndex = 0 - tmpTotal = len(res) - random.shuffle(res) - for (id,srcEndTime) in res: - tmpIndex += 1 - try: - # copy - proxyS = taskBuffer.proxyPool.getProxy() - proxyS.insertJobSimpleUnread(id,srcEndTime) - taskBuffer.proxyPool.putProxy(proxyS) - _logger.debug("INSERT %s" % id) - if tmpIndex % 100 == 1: - _logger.debug(" copied %s/%s" % (tmpIndex,tmpTotal)) - except: - pass - -_logger.debug("===================== end =====================") diff 
--git a/current/pandaserver/test/banUser.py b/current/pandaserver/test/banUser.py deleted file mode 100644 index 6217a058c..000000000 --- a/current/pandaserver/test/banUser.py +++ /dev/null @@ -1,41 +0,0 @@ -import sys -import time -import datetime -import optparse - -from taskbuffer.OraDBProxy import DBProxy -# password -from config import panda_config - -optP = optparse.OptionParser(conflict_handler="resolve") -optP.add_option('--user', action='store',dest='user', default=None,help='prodUserName') -optP.add_option('--unban',action='store_const',const=True,dest='unban',default=False,help='unban the user') - -options,args = optP.parse_args() - -if options.user == None: - print "--user= is required" - sys.exit(1) - -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -prodUserName = sys.argv[1] -import userinterface.Client as Client - -varMap = {} -varMap[':name'] = options.user -if options.unban: - varMap[':status'] = None -else: - varMap[':status'] = 'disabled' - -sql = "UPDATE ATLAS_PANDAMETA.users SET status=:status WHERE name=:name" - -status,res = proxyS.querySQLS(sql,varMap) -if res == None: - print "Failed with database error" -else: - print "%s rows updated" % res - - diff --git a/current/pandaserver/test/boostPrio.py b/current/pandaserver/test/boostPrio.py deleted file mode 100755 index 4bc13fda6..000000000 --- a/current/pandaserver/test/boostPrio.py +++ /dev/null @@ -1,20 +0,0 @@ -import time -import sys - -from taskbuffer.OraDBProxy import DBProxy - -# password -from config import panda_config - -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -varMap = {} -varMap[':prodSourceLabel'] = 'managed' -varMap[':taskID'] = sys.argv[1] -varMap[':prio'] = sys.argv[2] -sql = "UPDATE %s SET currentPriority=currentPriority+:prio WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID" -for table in 
['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: - status,res = proxyS.querySQLS(sql % table,varMap) - - diff --git a/current/pandaserver/test/boostUser.py b/current/pandaserver/test/boostUser.py deleted file mode 100755 index 17f6c1483..000000000 --- a/current/pandaserver/test/boostUser.py +++ /dev/null @@ -1,34 +0,0 @@ -import sys -from config import panda_config - -# initialize cx_Oracle using dummy connection -from taskbuffer.Initializer import initializer -initializer.init() - -from dataservice.Merger import Merger -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger - - -# logger -_logger = PandaLogger().getLogger('boostUser') -_logger.debug("================= start ==================") - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -user = sys.stdin.read() -user = user[:-1] - -sql = "UPDATE atlas_panda.%s set currentPriority=:prio where prodUserName=:uname and prodSourceLabel IN (:label1,:label2) and currentPriority<:prio" -varMap = {} -varMap[':prio'] = 4000 -varMap[':uname'] = user -varMap[':label1'] = 'user' -varMap[':label2'] = 'panda' -for table in ('jobsactive4','jobsdefined4'): - _logger.debug((sql % table) + str(varMap)) - ret = taskBuffer.querySQLS(sql % table,varMap) - _logger.debug('ret -> %s' % str(ret)) - -_logger.debug("================= end ==================") diff --git a/current/pandaserver/test/callbackDDM.py b/current/pandaserver/test/callbackDDM.py deleted file mode 100755 index 8564b272e..000000000 --- a/current/pandaserver/test/callbackDDM.py +++ /dev/null @@ -1,12 +0,0 @@ -import sys -import urllib2,urllib - -node={} -node['vuid']=sys.argv[1] -url='https://gridui01.usatlas.bnl.gov:25443/server/panda/datasetCompleted' -rdata=urllib.urlencode(node) -req=urllib2.Request(url) -fd=urllib2.urlopen(req,rdata) -data = fd.read() - -print data diff --git a/current/pandaserver/test/checkGetJob.py 
b/current/pandaserver/test/checkGetJob.py deleted file mode 100644 index 79d1a0ecf..000000000 --- a/current/pandaserver/test/checkGetJob.py +++ /dev/null @@ -1,18 +0,0 @@ -import sys -import re -import time -import datetime -timeLimit = datetime.timedelta(seconds=10) -f = open("../../httpd/logs/panda-DBProxy.log") -for line in f: - match = re.search('unlock',line) - if match: - timeM = re.search('^(\d+-\d+-\d+ \d+:\d+:\d+),(\d+)',line) - endTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%d %H:%M:%S')[:6]) - endTime = endTime.replace(microsecond = 1000*int(timeM.group(2))) - timeM = re.search('getJobs : (\d+-\d+-\d+T\d+:\d+:\d+)\.(\d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%dT%H:%M:%S')[:6]) - startTime = startTime.replace(microsecond = int(timeM.group(2))) - if (endTime-startTime) > timeLimit: - print '%s %s' % (startTime,endTime-startTime) -f.close() diff --git a/current/pandaserver/test/checkSetupper.py b/current/pandaserver/test/checkSetupper.py deleted file mode 100644 index 1f1dbfdd6..000000000 --- a/current/pandaserver/test/checkSetupper.py +++ /dev/null @@ -1,31 +0,0 @@ -import re -import time -import datetime -f = open("../../httpd/logs/panda-Setupper.log") -session = [] -timeList = {} -for line in f: - match = re.search('DEBUG (.*) startRun',line) - if match: - stamp = match.group(1) - stamp = stamp.strip() - session.append(stamp) - timeM = re.search('^(\d+-\d+-\d+ \d+:\d+:\d+),',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%d %H:%M:%S')[:6]) - timeList[stamp] = startTime - continue - match = re.search('DEBUG (.*) endRun',line) - if match: - stamp = match.group(1) - stamp = stamp.strip() - session.remove(stamp) - timeM = re.search('^(\d+-\d+-\d+ \d+:\d+:\d+),',line) - endTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%d %H:%M:%S')[:6]) - if timeList.has_key(stamp): - delta = endTime - timeList[stamp] - if delta > datetime.timedelta(minutes = 10): - print 
"Start : %s " % stamp - print " took -> %02d:%02d:%02d" % (delta.seconds/(60*60),(delta.seconds%(60*60))/60,delta.seconds%60) - continue - -print session diff --git a/current/pandaserver/test/cl_testEvgen.py b/current/pandaserver/test/cl_testEvgen.py deleted file mode 100644 index 137c496bd..000000000 --- a/current/pandaserver/test/cl_testEvgen.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# eg. python cl_testEvgen.py SACLAY FR -# -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)==2: - site = sys.argv[1] - cloud='CA' -elif len(sys.argv)==3: - site = sys.argv[1] - cloud=sys.argv[2] -else: - site = None - cloud = None - -datasetName = 'panda.destDB.%s_tid999991' % commands.getoutput('uuidgen') -taskid = 999989 - -jobList = [] - -for i in range(1): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) -# job.AtlasRelease = 'Atlas-12.0.6' -# job.homepackage = 'AtlasProduction/12.0.6.5' - job.AtlasRelease = 'Atlas-12.0.7' - job.homepackage = 'AtlasProduction/12.0.7.1' - - job.transformation = 'csc_evgen_trf.py' - job.destinationDBlock = datasetName -# job.destinationSE = destName -# job.cloud = 'CA' - job.cloud = cloud - job.taskID = taskid - job.currentPriority = 1000 - job.prodSourceLabel = 'test' -# job.prodSourceLabel = 'cloudtest' - job.computingSite = site - - file = FileSpec() - file.lfn = "%s.evgen.pool.root" % job.jobName - file.destinationDBlock = job.destinationDBlock - file.destinationSE = job.destinationSE - file.dataset = job.destinationDBlock - file.type = 'output' - job.addFile(file) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="8072 0 
5000 1 DC3.008072.JimmyPhotonJet1.py %s NONE NONE NONE" % file.lfn - jobList.append(job) - -for i in range(1): - s,o = Client.submitJobs(jobList) - print "---------------------" - print s - for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/cl_testG4sim.py b/current/pandaserver/test/cl_testG4sim.py deleted file mode 100644 index ed1db41ab..000000000 --- a/current/pandaserver/test/cl_testG4sim.py +++ /dev/null @@ -1,120 +0,0 @@ -# -# eg. python cl_testG4sim.py SACLAY FR -# - -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)==2: - site = sys.argv[1] - cloud='CA' -elif len(sys.argv)==3: - site = sys.argv[1] - cloud=sys.argv[2] -else: - site = None - cloud = None - -datasetName = 'panda.rod2.%s_tid999990' % commands.getoutput('uuidgen') -#destName = 'BNL_SE' - -if cloud=='UK': - files = { - 'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01035.pool.root.1':'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541', - } -# or mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01174.pool.root.1, mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01035.pool.root.1 -elif cloud=='CA': - files={'EVNT.012303._00901.pool.root.1':'mc12.005001.pythia_minbias.evgen.EVNT.v12000701_tid012303',} -elif cloud=='FR': - files={'EVNT.010822._00007.pool.root.1':'mc12.006873.PythiaWH140lnugamgam.evgen.EVNT.v12000701_tid010822',} -elif cloud in ['ES']: - files={'EVNT.016869._00187.pool.root.1':'mc12.005001.pythia_minbias.evgen.EVNT.v12000601_tid016869',} -elif cloud in ['DE']: - files={'EVNT.016869._00177.pool.root.2':'mc12.005001.pythia_minbias.evgen.EVNT.v12000601_tid016869',} -else: - print 'Cloud not known: %s'%cloud - cloud = None - files={'EVNT.012303._00545.pool.root.1':'rod.cloudtest1'} - -# UK 
-#'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01035.pool.root.1':'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541', -# CA -# 'EVNT.012303._00901.pool.root.1':'mc12.005001.pythia_minbias.evgen.EVNT.v12000701_tid012303', - - - -jobList = [] - -for i in range(1): - for lfn in files.keys(): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = commands.getoutput('uuidgen') - job.AtlasRelease = 'Atlas-12.0.7' - job.homepackage = 'AtlasProduction/12.0.7.1' -# Need different args too -# job.AtlasRelease = 'Atlas-13.0.30' -# job.homepackage = 'AtlasProduction/13.0.30.2' - job.transformation = 'csc_simul_trf.py' - job.destinationDBlock = datasetName - job.cloud = cloud - job.computingSite = site -# job.prodDBlock = 'mc12.005001.pythia_minbias.evgen.EVNT.v12000701_tid012303' - job.prodDBlock = files[lfn] - job.prodSourceLabel = 'test' -# job.prodSourceLabel = 'cloudtest' - job.currentPriority = 1001 - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - - fileD = FileSpec() - fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' - fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' - fileD.lfn = 'DBRelease-3.1.1.tar.gz' - fileD.type = 'input' - job.addFile(fileD) - - - fileOE = FileSpec() - fileOE.lfn = "%s.HITS.pool.root" % job.jobName - fileOE.destinationDBlock = job.destinationDBlock - fileOE.destinationSE = job.destinationSE - fileOE.dataset = job.destinationDBlock - fileOE.type = 'output' - job.addFile(fileOE) - - fileOA = FileSpec() - fileOA.lfn = "%s.RDO.pool.root" % job.jobName - fileOA.destinationDBlock = job.destinationDBlock - fileOA.destinationSE = job.destinationSE - fileOA.dataset = job.destinationDBlock - fileOA.type = 'output' - job.addFile(fileOA) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - 
fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="%s %s %s 1 4000 153781 ATLAS-CSC-01-02-00 NONE %s" % (fileI.lfn,fileOE.lfn,fileOA.lfn,fileD.lfn) - - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/cl_testMXreco.py b/current/pandaserver/test/cl_testMXreco.py deleted file mode 100644 index 1fb770bee..000000000 --- a/current/pandaserver/test/cl_testMXreco.py +++ /dev/null @@ -1,112 +0,0 @@ -# -# eg. python cl_testG4sim.py SACLAY FR -# - -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)==2: - site = sys.argv[1] - cloud='CA' -elif len(sys.argv)==3: - site = sys.argv[1] - cloud=sys.argv[2] -else: - site = None - cloud = None - -datasetName = 'panda.rod2.%s_tid999990' % commands.getoutput('uuidgen') -#destName = 'BNL_SE' - -files={'daq.m5_combined.0028997.Default.L1TT-b00000110.LB0000.SFO-1._0001.data':'M5.0028997.Default.L1TT-b00000110.RAW.v010803',} - -if cloud=='IT': - files={'daq.m5_combined.0029118.Default.L1TT-b00000010.LB0000.SFO-1._0001.data':'M5.0029118.Default.L1TT-b00000010.RAW.v010803'} - - -jobList = [] - -for i in range(1): - for lfn in files.keys(): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = commands.getoutput('uuidgen') - job.AtlasRelease = 'Atlas-13.0.35' - job.homepackage = 'AtlasPoint1/13.0.35.1' - job.transformation = 'csc_cosmics_trf.py' - job.destinationDBlock = datasetName - job.cloud = cloud - job.computingSite = site - job.prodDBlock = files[lfn] - job.prodSourceLabel = 'test' - job.currentPriority = 1001 - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - 
job.addFile(fileI) - - fileD = FileSpec() - fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' - fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' - fileD.lfn = 'DBRelease-3.1.1.tar.gz' - fileD.type = 'input' -# job.addFile(fileD) - - - fileO1 = FileSpec() - fileO1.lfn = "%s.ESD.pool.root" % job.jobName - fileO1.destinationDBlock = job.destinationDBlock - fileO1.destinationSE = job.destinationSE - fileO1.dataset = job.destinationDBlock - fileO1.type = 'output' - job.addFile(fileO1) - - fileO2 = FileSpec() - fileO2.lfn = "%s.ESDF.pool.root" % job.jobName - fileO2.destinationDBlock = job.destinationDBlock - fileO2.destinationSE = job.destinationSE - fileO2.dataset = job.destinationDBlock - fileO2.type = 'output' -# job.addFile(fileO2) - - fileO3 = FileSpec() - fileO3.lfn = "%s.NTUP.pool.root" % job.jobName - fileO3.destinationDBlock = job.destinationDBlock - fileO3.destinationSE = job.destinationSE - fileO3.dataset = job.destinationDBlock - fileO3.type = 'output' - job.addFile(fileO3) - - fileO4 = FileSpec() - fileO4.lfn = "%s.HIST.pool.root" % job.jobName - fileO4.destinationDBlock = job.destinationDBlock - fileO4.destinationSE = job.destinationSE - fileO4.dataset = job.destinationDBlock - fileO4.type = 'output' - job.addFile(fileO4) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="%s LAR_TILE_MUONS_LVL1C 10 %s NONE %s %s COMCOND-002-00 NONE" % (fileI.lfn,fileO1.lfn,fileO3.lfn,fileO4.lfn) - - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/cleanup.py b/current/pandaserver/test/cleanup.py deleted file mode 100644 index a1b170d11..000000000 --- a/current/pandaserver/test/cleanup.py +++ /dev/null @@ -1,10 
+0,0 @@ -import commands - -for patt in ['dq2.clientapi.cli.cliutil.getDQ2','forkSetupper.py','LFCclient.py']: - out = commands.getoutput('ps aux | grep python | grep %s' % patt) - for line in out.split('\n'): - items = line.split() - print items[1], items[8] - if items[8] in ['Sep04','Sep05']: - commands.getoutput('kill -9 %s' % items[1]) - diff --git a/current/pandaserver/test/closeDS.py b/current/pandaserver/test/closeDS.py deleted file mode 100755 index 4aeface4f..000000000 --- a/current/pandaserver/test/closeDS.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -import time -import datetime -import commands -import jobscheduler.Site -import userinterface.Client as Client -from dataservice.DDM import ddm -from taskbuffer.DBProxy import DBProxy -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger -from jobdispatcher.Watcher import Watcher - -# logger -_logger = PandaLogger().getLogger('closeDS') - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# instantiate DB proxies -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -# time limit for dataset closing -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7) - -# close datasets -while True: - sql = "SELECT vuid,name,modificationdate FROM Datasets " + \ - "WHERE type='output' AND (status='running' OR status='created' OR status='defined') " + \ - "AND modificationdate<'%s' AND name REGEXP '_sub[[:digit:]]+$'" - ret,res = proxyS.querySQLS(sql % timeLimit.strftime('%Y-%m-%d %H:%M:%S')) - _logger.debug("# of dataset : %s" % len(res)) - if len(res) == 0: - break - for (vuid,name,modDate) in res: - _logger.debug("start %s %s" % (modDate,name)) - retF,resF = proxyS.querySQLS("SELECT lfn FROM filesTable4 WHERE destinationDBlock='%s'" % name) - if retF<0 or retF == None or retF!=len(resF): - _logger.error("SQL error") - else: - # no files in filesTable - if len(resF) == 0: - 
_logger.debug("freeze %s " % name) - status,out = ddm.dq2.main(['freezeDataset',name]) - if status != 0 or (out.find('Error') != -1 and out.find('DQ2 unknown dataset exception') == -1 \ - and out.find('DQ2 security exception') == -1): - _logger.error(out) - else: - proxyS.querySQL("UPDATE Datasets SET status='completed',modificationdate=UTC_TIMESTAMP() WHERE vuid='%s'" % vuid) - else: - _logger.debug("wait %s " % name) - proxyS.querySQL("UPDATE Datasets SET modificationdate=UTC_TIMESTAMP() WHERE vuid='%s'" % vuid) - _logger.debug("end %s " % name) - time.sleep(1) diff --git a/current/pandaserver/test/copyArchive.py b/current/pandaserver/test/copyArchive.py deleted file mode 100755 index 486e28673..000000000 --- a/current/pandaserver/test/copyArchive.py +++ /dev/null @@ -1,1653 +0,0 @@ -import os -import re -import sys -import time -import fcntl -import types -import shelve -import random -import datetime -import commands -import threading -import userinterface.Client as Client -from dataservice.DDM import ddm -from dataservice.DDM import dashBorad -from taskbuffer.OraDBProxy import DBProxy -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger -from jobdispatcher.Watcher import Watcher -from brokerage.SiteMapper import SiteMapper -from dataservice.Adder import Adder -from dataservice.Finisher import Finisher -from dataservice.MailUtils import MailUtils -from taskbuffer import ProcessGroups -import brokerage.broker_util -import brokerage.broker -import taskbuffer.ErrorCode -import dataservice.DDM - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# logger -_logger = PandaLogger().getLogger('copyArchive') - -_logger.debug("===================== start =====================") - -# memory checker -def _memoryCheck(str): - try: - proc_status = '/proc/%d/status' % os.getpid() - procfile = open(proc_status) - name = "" - vmSize = "" - vmRSS = "" - # extract Name,VmSize,VmRSS - for line in procfile: - if 
line.startswith("Name:"): - name = line.split()[-1] - continue - if line.startswith("VmSize:"): - vmSize = "" - for item in line.split()[1:]: - vmSize += item - continue - if line.startswith("VmRSS:"): - vmRSS = "" - for item in line.split()[1:]: - vmRSS += item - continue - procfile.close() - _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str)) - except: - type, value, traceBack = sys.exc_info() - _logger.error("memoryCheck() : %s %s" % (type,value)) - _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str)) - return - -_memoryCheck("start") - -# kill old dq2 process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('ps axo user,pid,lstart,args | grep dq2.clientapi | grep -v PYTHONPATH | grep -v grep') - for line in out.split('\n'): - if line == '': - continue - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old dq2 process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill dq2 process : %s %s" % (type,value)) - - -# kill old process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=7) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName) - for line in out.split('\n'): - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: 
doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill process : %s %s" % (type,value)) - - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -# instantiate sitemapper -siteMapper = SiteMapper(taskBuffer) - - - -# send email for access requests -_logger.debug("Site Access") -try: - # get contact - contactAddr = {} - siteContactAddr = {} - sql = "SELECT name,email FROM ATLAS_PANDAMETA.cloudconfig" - status,res = taskBuffer.querySQLS(sql,{}) - for cloudName,cloudEmail in res: - contactAddr[cloudName] = cloudEmail - # get requests - sql = "SELECT pandaSite,status,dn FROM ATLAS_PANDAMETA.siteaccess WHERE status IN (:status1,:status2,:status3) " - sql += "ORDER BY pandaSite,status " - varMap = {} - varMap[':status1'] = 'requested' - varMap[':status2'] = 'tobeapproved' - varMap[':status3'] = 'toberejected' - status,res = taskBuffer.querySQLS(sql,varMap) - requestsInCloud = {} - mailUtils = MailUtils() - # loop over all requests - for pandaSite,reqStatus,userName in res: - cloud = siteMapper.getSite(pandaSite).cloud - _logger.debug("request : '%s' site=%s status=%s cloud=%s" % (userName,pandaSite,reqStatus,cloud)) - # send emails to user - if reqStatus in ['tobeapproved','toberejected']: - # set status - if reqStatus == 'tobeapproved': - newStatus = 'approved' - else: - newStatus = 'rejected' - # get mail address for user - userMailAddr = '' - sqlUM = "SELECT email FROM ATLAS_PANDAMETA.users WHERE name=:userName" - varMap = {} - varMap[':userName'] = userName - 
stUM,resUM = taskBuffer.querySQLS(sqlUM,varMap) - if resUM == None or len(resUM) == 0: - _logger.error("email address is unavailable for '%s'" % userName) - else: - userMailAddr = resUM[0][0] - # send - if not userMailAddr in ['',None,'None','notsend']: - _logger.debug("send update to %s" % userMailAddr) - retMail = mailUtils.sendSiteAccessUpdate(userMailAddr,newStatus,pandaSite) - _logger.debug(retMail) - # update database - sqlUp = "UPDATE ATLAS_PANDAMETA.siteaccess SET status=:newStatus " - sqlUp += "WHERE pandaSite=:pandaSite AND dn=:userName" - varMap = {} - varMap[':userName'] = userName - varMap[':newStatus'] = newStatus - varMap[':pandaSite'] = pandaSite - stUp,resUp = taskBuffer.querySQLS(sqlUp,varMap) - else: - # append cloud - if not requestsInCloud.has_key(cloud): - requestsInCloud[cloud] = {} - # append site - if not requestsInCloud[cloud].has_key(pandaSite): - requestsInCloud[cloud][pandaSite] = [] - # append user - requestsInCloud[cloud][pandaSite].append(userName) - # send requests to the cloud responsible - for cloud,requestsMap in requestsInCloud.iteritems(): - _logger.debug("requests for approval : cloud=%s" % cloud) - # send - if contactAddr.has_key(cloud) and (not contactAddr[cloud] in ['',None,'None']): - # get site contact - for pandaSite,userNames in requestsMap.iteritems(): - if not siteContactAddr.has_key(pandaSite): - varMap = {} - varMap[':siteid'] = pandaSite - sqlSite = "SELECT email FROM ATLAS_PANDAMETA.schedconfig WHERE siteid=:siteid AND rownum<=1" - status,res = taskBuffer.querySQLS(sqlSite,varMap) - siteContactAddr[pandaSite] = res[0][0] - # append - if not siteContactAddr[pandaSite] in ['',None,'None']: - contactAddr[cloud] += ',%s' % siteContactAddr[pandaSite] - # send - _logger.debug("send request to %s" % contactAddr[cloud]) - retMail = mailUtils.sendSiteAccessRequest(contactAddr[cloud],requestsMap,cloud) - _logger.debug(retMail) - # update database - if retMail: - sqlUp = "UPDATE ATLAS_PANDAMETA.siteaccess SET 
status=:newStatus " - sqlUp += "WHERE pandaSite=:pandaSite AND dn=:userName" - for pandaSite,userNames in requestsMap.iteritems(): - for userName in userNames: - varMap = {} - varMap[':userName'] = userName - varMap[':newStatus'] = 'inprocess' - varMap[':pandaSite'] = pandaSite - stUp,resUp = taskBuffer.querySQLS(sqlUp,varMap) - else: - _logger.error("contact email address is unavailable for %s" % cloud) -except: - type, value, traceBack = sys.exc_info() - _logger.error("Failed with %s %s" % (type,value)) -_logger.debug("Site Access : done") - - -# finalize failed jobs -_logger.debug("AnalFinalizer session") -try: - # get min PandaID for failed jobs in Active table - sql = "SELECT MIN(PandaID),prodUserName,jobDefinitionID FROM ATLAS_PANDA.jobsActive4 " - sql += "WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus " - sql += "GROUP BY prodUserName,jobDefinitionID " - varMap = {} - varMap[':jobStatus'] = 'failed' - varMap[':prodSourceLabel'] = 'user' - status,res = taskBuffer.querySQLS(sql,varMap) - if res != None: - # loop over all user/jobdefID - for pandaID,prodUserName,jobDefinitionID in res: - # check - _logger.debug("check finalization for %s %s" % (prodUserName,jobDefinitionID)) - sqlC = "SELECT COUNT(*) FROM ATLAS_PANDA.jobsActive4 " - sqlC += "WHERE prodSourceLabel=:prodSourceLabel AND prodUserName=:prodUserName " - sqlC += "AND jobDefinitionID=:jobDefinitionID AND jobStatus<>:jobStatus " - varMap = {} - varMap[':jobStatus'] = 'failed' - varMap[':prodSourceLabel'] = 'user' - varMap[':jobDefinitionID'] = jobDefinitionID - varMap[':prodUserName'] = prodUserName - statC,resC = taskBuffer.querySQLS(sqlC,varMap) - # finalize if there is no non-failed jobs - if resC != None: - _logger.debug("n of non-failed jobs : %s" % resC[0][0]) - if resC[0][0] == 0: - _logger.debug("finalize %s %s" % (prodUserName,jobDefinitionID)) - taskBuffer.finalizePendingJobs(prodUserName,jobDefinitionID) - else: - _logger.debug("n of non-failed jobs : None") -except: - 
errType,errValue = sys.exc_info()[:2] - _logger.error("AnalFinalizer failed with %s %s" % (errType,errValue)) - - -_memoryCheck("watcher") - -_logger.debug("Watcher session") -# check heartbeat for analysis jobs -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) -varMap = {} -varMap[':modificationTime'] = timeLimit -varMap[':prodSourceLabel1'] = 'panda' -varMap[':prodSourceLabel2'] = 'user' -varMap[':jobStatus1'] = 'running' -varMap[':jobStatus2'] = 'starting' -varMap[':jobStatus3'] = 'stagein' -varMap[':jobStatus4'] = 'stageout' -sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2) " -sql += "AND (jobStatus=:jobStatus1 OR jobStatus=:jobStatus2 OR jobStatus=:jobStatus3 OR jobStatus=:jobStatus4) AND modificationTime<:modificationTime" -status,res = taskBuffer.querySQLS(sql,varMap) -if res == None: - _logger.debug("# of Anal Watcher : %s" % res) -else: - _logger.debug("# of Anal Watcher : %s" % len(res)) - for (id,) in res: - _logger.debug("Anal Watcher %s" % id) - thr = Watcher(taskBuffer,id,single=True,sleepTime=60,sitemapper=siteMapper) - thr.start() - thr.join() - time.sleep(1) - -# check heartbeat for sent jobs -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) -varMap = {} -varMap[':jobStatus'] = 'sent' -varMap[':modificationTime'] = timeLimit -status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime", - varMap) -if res == None: - _logger.debug("# of Sent Watcher : %s" % res) -else: - _logger.debug("# of Sent Watcher : %s" % len(res)) - for (id,) in res: - _logger.debug("Sent Watcher %s" % id) - thr = Watcher(taskBuffer,id,single=True,sleepTime=30,sitemapper=siteMapper) - thr.start() - thr.join() - time.sleep(1) - -# check heartbeat for 'holding' analysis/ddm jobs -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) -# get XMLs -xmlIDs = [] 
-xmlFiles = os.listdir(panda_config.logdir) -for file in xmlFiles: - match = re.search('^(\d+)_([^_]+)_.{36}$',file) - if match != None: - id = match.group(1) - xmlIDs.append(int(id)) -sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND (modificationTime<:modificationTime OR (endTime IS NOT NULL AND endTime<:endTime)) AND (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2 OR prodSourceLabel=:prodSourceLabel3) AND stateChangeTime != modificationTime" -varMap = {} -varMap[':modificationTime'] = timeLimit -varMap[':endTime'] = timeLimit -varMap[':jobStatus'] = 'holding' -varMap[':prodSourceLabel1'] = 'panda' -varMap[':prodSourceLabel2'] = 'user' -varMap[':prodSourceLabel3'] = 'ddm' -status,res = taskBuffer.querySQLS(sql,varMap) -if res == None: - _logger.debug("# of Holding Anal/DDM Watcher : %s" % res) -else: - _logger.debug("# of Holding Anal/DDM Watcher : %s - XMLs : %s" % (len(res),len(xmlIDs))) - for (id,) in res: - _logger.debug("Holding Anal/DDM Watcher %s" % id) - if int(id) in xmlIDs: - _logger.debug(" found XML -> skip %s" % id) - continue - thr = Watcher(taskBuffer,id,single=True,sleepTime=180,sitemapper=siteMapper) - thr.start() - thr.join() - time.sleep(1) - -# check heartbeat for production jobs -timeOutVal = 48 -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=timeOutVal) -sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND (modificationTime<:modificationTime OR (endTime IS NOT NULL AND endTime<:endTime))" -varMap = {} -varMap[':modificationTime'] = timeLimit -varMap[':endTime'] = timeLimit -varMap[':jobStatus'] = 'holding' -status,res = taskBuffer.querySQLS(sql,varMap) -if res == None: - _logger.debug("# of Holding Watcher : %s" % res) -else: - _logger.debug("# of Holding Watcher : %s" % len(res)) - for (id,) in res: - _logger.debug("Holding Watcher %s" % id) - thr = Watcher(taskBuffer,id,single=True,sleepTime=60*timeOutVal,sitemapper=siteMapper) - 
thr.start() - thr.join() - time.sleep(1) - -# check heartbeat for ddm jobs -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) -varMap = {} -varMap[':modificationTime'] = timeLimit -varMap[':jobStatus1'] = 'running' -varMap[':jobStatus2'] = 'starting' -varMap[':jobStatus3'] = 'stagein' -varMap[':jobStatus4'] = 'stageout' -varMap[':prodSourceLabel'] = 'ddm' -status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (jobStatus=:jobStatus1 OR jobStatus=:jobStatus2 OR jobStatus=:jobStatus3 OR jobStatus=:jobStatus4) AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel", - varMap) -if res == None: - _logger.debug("# of DDM Watcher : %s" % res) -else: - _logger.debug("# of DDM Watcher : %s" % len(res)) - for (id,) in res: - _logger.debug("DDM Watcher %s" % id) - thr = Watcher(taskBuffer,id,single=True,sleepTime=120,sitemapper=siteMapper) - thr.start() - thr.join() - time.sleep(1) - -# check heartbeat for production jobs -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=6) -varMap = {} -varMap[':modificationTime'] = timeLimit -varMap[':jobStatus1'] = 'running' -varMap[':jobStatus2'] = 'starting' -varMap[':jobStatus3'] = 'stagein' -varMap[':jobStatus4'] = 'stageout' -status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (jobStatus=:jobStatus1 OR jobStatus=:jobStatus2 OR jobStatus=:jobStatus3 OR jobStatus=:jobStatus4) AND modificationTime<:modificationTime", - varMap) -if res == None: - _logger.debug("# of General Watcher : %s" % res) -else: - _logger.debug("# of General Watcher : %s" % len(res)) - for (id,) in res: - _logger.debug("General Watcher %s" % id) - thr = Watcher(taskBuffer,id,single=True,sitemapper=siteMapper) - thr.start() - thr.join() - time.sleep(1) - -_memoryCheck("reassign") - -# kill long-waiting jobs in defined table -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7) -status,res = taskBuffer.querySQLS("SELECT 
PandaID,cloud,prodSourceLabel FROM ATLAS_PANDA.jobsDefined4 WHERE creationTime<:creationTime", - {':creationTime':timeLimit}) -jobs=[] -dashFileMap = {} -if res != None: - for pandaID,cloud,prodSourceLabel in res: - # collect PandaIDs - jobs.append(pandaID) - try: - if cloud in ['US']: - # skip US since file info is not available in dashboard - continue - # check file status for production - if not prodSourceLabel in ['managed']: - pass - else: - # get T1 site - tmpT1siteID = siteMapper.getCloud(cloud)['source'] - t1Site = siteMapper.getSite(tmpT1siteID) - # get pending input files - sqlF = "SELECT lfn,GUID,dispatchDBlock FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID " - sqlF += "AND type=:type AND status=:status" - varMap = {} - varMap[':type'] = 'input' - varMap[':status'] = 'pending' - varMap[':PandaID'] = pandaID - stFile,resFile = taskBuffer.querySQLS(sqlF,varMap) - if resFile != None: - # loop over all files - for tmpLFN,tmpGUID,tmpDispDBlock in resFile: - # get file events - tmpDQ2IDs = t1Site.setokens.values() - tmpKey = (tuple(tmpDQ2IDs),tmpLFN) - if not dashFileMap.has_key(tmpKey): - _logger.debug('getting fileEvents for %s:%s' % tmpKey) - tmpStat,tmpOut = dashBorad.listFileEvents(tmpDQ2IDs,tmpGUID) - _logger.debug(tmpStat) - _logger.debug(tmpOut) - if tmpStat != 0: - # failed - continue - # convert to list - try: - exec "tmpEvens = %s" % tmpOut - if not isinstance(tmpEvens,types.ListType): - raise TypeError,"%s is not a list" % type(tmpEvens) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error(tmpOut) - _logger.error("invalid dashboard response %s %s" % (errType,errValue)) - continue - dashFileMap[tmpKey] = None - # look for latest events - tmpLastTime = '' - for tmpEvt in tmpEvens: - # pickup only DQ2 events - if not tmpEvt['tool_id'] in ['DQ2',None]: - continue - # pickup first one or newer - if tmpLastTime == '' or tmpLastTime < tmpEvt['modified_time']: - tmpLastTime = tmpEvt['modified_time'] - dashFileMap[tmpKey] = tmpEvt['state'] 
- _logger.debug('got status=%s' % dashFileMap[tmpKey]) - # update failed files - if dashFileMap[tmpKey] in ['FAILED_TRANSFER','BAD']: - sqlUpF = "UPDATE ATLAS_PANDA.filesTable4 SET status=:newStatus " - sqlUpF += "WHERE PandaID=:PandaID AND lfn=:lfn" - varMap = {} - varMap[':PandaID'] = pandaID - varMap[':lfn'] = tmpLFN - varMap[':newStatus'] = dashFileMap[tmpKey].lower() - taskBuffer.querySQLS(sqlUpF,varMap) - _logger.debug('set status=%s to %s:%s' % (dashFileMap[tmpKey],pandaID,tmpLFN)) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("dashboard access failed with %s %s" % (errType,errValue)) -if len(jobs): - _logger.debug("killJobs for Defined (%s)" % str(jobs)) - Client.killJobs(jobs,2) - -# kill long-waiting jobs in active table -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7) -varMap = {} -varMap[':jobStatus'] = 'activated' -varMap[':creationTime'] = timeLimit -status,res = taskBuffer.querySQLS("SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND creationTime<:creationTime", - varMap) -jobs=[] -if res != None: - for (id,) in res: - jobs.append(id) -if len(jobs): - _logger.debug("killJobs for Active (%s)" % str(jobs)) - Client.killJobs(jobs,2) - - -# kill long-waiting ddm jobs for dispatch -_logger.debug("kill PandaMovers") -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) -sql = "SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND transferType=:transferType AND creationTime<:creationTime" -varMap = {} -varMap[':creationTime'] = timeLimit -varMap[':prodSourceLabel'] = 'ddm' -varMap[':transferType'] = 'dis' -_logger.debug(sql+str(varMap)) -status,res = taskBuffer.querySQLS(sql,varMap) -_logger.debug(res) -jobs=[] -if res != None: - for (id,) in res: - jobs.append(id) -if len(jobs): - _logger.debug("kill DDM Jobs (%s)" % str(jobs)) - Client.killJobs(jobs,2) - -# kill hang-up movers -timeLimit = datetime.datetime.utcnow() - 
datetime.timedelta(hours=3) -sql = "SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND transferType=:transferType AND jobStatus=:jobStatus AND startTime<:startTime" -varMap = {} -varMap[':startTime'] = timeLimit -varMap[':prodSourceLabel'] = 'ddm' -varMap[':transferType'] = 'dis' -varMap[':jobStatus'] = 'running' -_logger.debug(sql+str(varMap)) -status,res = taskBuffer.querySQLS(sql,varMap) -_logger.debug(res) -jobs = [] -movers = [] -if res != None: - for id, in res: - movers.append(id) - # get dispatch dataset - sql = 'SELECT name FROM ATLAS_PANDA.Datasets WHERE MoverID=:MoverID' - stDS,resDS = taskBuffer.querySQLS(sql,{':MoverID':id}) - if resDS != None: - disDS = resDS[0][0] - # get PandaIDs associated to the dis dataset - sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND dispatchDBlock=:dispatchDBlock" - varMap = {} - varMap[':jobStatus'] = 'assigned' - varMap[':dispatchDBlock'] = disDS - stP,resP = taskBuffer.querySQLS(sql,varMap) - if resP != None: - for pandaID, in resP: - jobs.append(pandaID) -# kill movers -if len(movers): - _logger.debug("kill hangup DDM Jobs (%s)" % str(movers)) - Client.killJobs(movers,2) -# reassign jobs -if len(jobs): - nJob = 100 - iJob = 0 - while iJob < len(jobs): - _logger.debug('reassignJobs for hangup movers (%s)' % jobs[iJob:iJob+nJob]) - taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) - iJob += nJob - -# reassign defined jobs in defined table -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=4) -# get PandaIDs -status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsDefined4",timeLimit,['defined'],['managed'],[],[],[]) -jobs=[] -if res != None: - for (id,) in res: - jobs.append(id) -# reassign -_logger.debug('reassignJobs for defined jobs -> #%s' % len(jobs)) -if len(jobs) > 0: - nJob = 100 - iJob = 0 - while iJob < len(jobs): - _logger.debug('reassignJobs for defined jobs (%s)' % jobs[iJob:iJob+nJob]) - 
taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) - _logger.debug('reassignJobs for defined jobs done %s' % jobs[iJob]) - iJob += nJob - - -# reassign when ratio of running/notrunning is too unbalanced -""" -_logger.debug("reassign Unbalanced") -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=4) -jobStat = {} -rangeValues = ['all','limit'] -for rangeVal in rangeValues: - for jobStatus in ['running','activated','assigned']: - table = 'ATLAS_PANDA.jobsDefined4' - if jobStatus in ['running','activated']: - table = 'ATLAS_PANDA.jobsActive4' - varMap = {} - varMap[':prodSourceLabel'] = 'managed' - varMap[':jobStatus'] = jobStatus - if rangeVal == 'all': - sql = "SELECT computingSite,cloud,processingType,count(*) FROM %s WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus GROUP BY computingSite,cloud,processingType" \ - % table - else: - sql = "SELECT computingSite,cloud,processingType,count(*) FROM %s WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus AND modificationTime<:modificationTime GROUP BY computingSite,cloud,processingType" \ - % table - varMap[':modificationTime'] = timeLimit - # execute - status,res = taskBuffer.querySQLS(sql,varMap) - if res != None: - for computingSite,cloud,processingType,nJobs in res: - # add cloud - if not jobStat.has_key(cloud): - jobStat[cloud] = {} - # add site - if not jobStat[cloud].has_key(computingSite): - jobStat[cloud][computingSite] = {} - # add range - if not jobStat[cloud][computingSite].has_key(rangeVal): - jobStat[cloud][computingSite][rangeVal] = {} - # add process group - tmpProGroup = ProcessGroups.getProcessGroup(processingType) - if not jobStat[cloud][computingSite][rangeVal].has_key(tmpProGroup): - jobStat[cloud][computingSite][rangeVal][tmpProGroup] = {} - # set status - tmpStatus = jobStatus - if jobStatus != 'running': - tmpStatus = 'notrunning' - # add status - if not jobStat[cloud][computingSite][rangeVal][tmpProGroup].has_key(tmpStatus): - 
jobStat[cloud][computingSite][rangeVal][tmpProGroup][tmpStatus] = 0 - # add - jobStat[cloud][computingSite][rangeVal][tmpProGroup][tmpStatus] += nJobs -# look for unbalanced site -for cloud,siteVal in jobStat.iteritems(): - jobsCloud = {} - ngSites = {} - t1Site = siteMapper.getCloud(cloud)['source'] - _logger.debug("Cloud:%s" % cloud) - for computingSite,jobVal in siteVal.iteritems(): - # set 0 - for rangeVal in rangeValues: - for pgType,pgList in ProcessGroups.processGroups: - # add range - if not jobVal.has_key(rangeVal): - jobVal[rangeVal] = {} - # add process group - if not jobVal[rangeVal].has_key(pgType): - jobVal[rangeVal][pgType] = {} - # number of jobs - if not jobVal[rangeVal][pgType].has_key('running'): - jobVal[rangeVal][pgType]['running'] = 0 - if not jobVal[rangeVal][pgType].has_key('notrunning'): - jobVal[rangeVal][pgType]['notrunning'] = 0 - # check ratio - for pgType,pgList in ProcessGroups.processGroups: - # add process group to map - if not jobsCloud.has_key(pgType): - jobsCloud[pgType] = {'notrunning':0,'running':0,'notfull':False} - if not ngSites.has_key(pgType): - ngSites[pgType] = [] - # get ratio - checkRatio = jobVal['limit'][pgType]['notrunning'] > jobVal['all'][pgType]['running']*4 - jobsCloud[pgType]['running'] += jobVal['all'][pgType]['running'] - jobsCloud[pgType]['notrunning'] += jobVal['all'][pgType]['notrunning'] - # check ratio - if computingSite in [t1Site,'NULL']: - # skip T1 - statStr = '--' - else: - if checkRatio: - statStr = 'NG' - ngSites[pgType].append(computingSite) - else: - statStr = '--' - # not full - if jobVal['all'][pgType]['notrunning'] < jobVal['all'][pgType]['running']*2: - jobsCloud[pgType]['notfull'] = True - _logger.debug("%20s : %14s %s n:%-5s r:%-5s" % (computingSite,pgType,statStr,jobVal['limit'][pgType]['notrunning'], - jobVal['all'][pgType]['running'])) - # reassign - for pgType,pgList in ProcessGroups.processGroups: - _logger.debug(" %14s : n:%-5s r:%-5s %s" % (pgType,jobsCloud[pgType]['notrunning'], - 
jobsCloud[pgType]['running'],jobsCloud[pgType]['notfull'])) - if jobsCloud[pgType]['notrunning'] > jobsCloud[pgType]['running']*2 and ngSites[pgType] != [] and jobsCloud[pgType]['notfull']: - # reassign except reprocessing - if pgType in ['reprocessing']: - continue - # get PandaIDs - jobs = [] - for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']: - varMap = {} - varMap[':prodSourceLabel'] = 'managed' - varMap[':jobStatus1'] = 'activated' - varMap[':jobStatus2'] = 'assigned' - sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND computingSite IN (" % table - idxSite = 1 - for ngSite in ngSites[pgType]: - tmpSiteKey = ':computingSite%s' % idxSite - sql += "%s," % tmpSiteKey - varMap[tmpSiteKey] = ngSite - idxSite += 1 - sql = sql[:-1] - if pgList != []: - sql += ") AND processingType IN (" - tmpPgList = pgList - else: - sql += ") AND processingType NOT IN (" - # get types to be excluded - tmpPgList = [] - for tmpExPgType,tmpExPgList in ProcessGroups.processGroups: - if tmpExPgType != pgType: - tmpPgList += tmpExPgList - idxPro = 1 - for pgItem in tmpPgList: - tmpProKey = ':processingType%s' % idxPro - sql += "%s," % tmpProKey - varMap[tmpProKey] = pgItem - idxPro += 1 - sql = sql[:-1] - sql += ") AND modificationTime<:modificationTime ORDER BY PandaID" - varMap[':modificationTime'] = timeLimit - # execute - _logger.debug(sql+str(varMap)) - status,res = taskBuffer.querySQLS(sql,varMap) - if res != None: - # get IDs - for id, in res: - jobs.append(id) - # reassign - if jobs != []: - if len(jobs): - nJob = 100 - iJob = 0 - while iJob < len(jobs): - #_logger.debug('reassignJobs for Unbalanced (%s)' % jobs[iJob:iJob+nJob]) - #Client.reassignJobs(jobs[iJob:iJob+nJob]) - iJob += nJob - #time.sleep(60) -""" - - -# reassign long-waiting jobs in defined table -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) -status,res = 
taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsDefined4",timeLimit,[],['managed'],[],[],[]) -jobs=[] -if res != None: - for (id,) in res: - jobs.append(id) -# reassign -_logger.debug('reassignJobs for long in defined table -> #%s' % len(jobs)) -if len(jobs) > 0: - nJob = 100 - iJob = 0 - while iJob < len(jobs): - _logger.debug('reassignJobs for long in defined table (%s)' % jobs[iJob:iJob+nJob]) - taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) - iJob += nJob - - -# reassign too long-standing evgen/simul jobs with active state at T1 -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=6) -for tmpCloud in siteMapper.getCloudList(): - # ignore special clouds - if tmpCloud in ['CERN','OSG']: - continue - status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsActive4",timeLimit,['activated'],['managed'], - ['evgen','simul'],[siteMapper.getCloud(tmpCloud)['tier1']],[]) - jobs = [] - if res != None: - for (id,) in res: - jobs.append(id) - _logger.debug('reassignJobs for Active T1 evgensimul in %s -> #%s' % (tmpCloud,len(jobs))) - if len(jobs) != 0: - nJob = 100 - iJob = 0 - while iJob < len(jobs): - _logger.debug('reassignJobs for Active T1 evgensimul (%s)' % jobs[iJob:iJob+nJob]) - taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) - iJob += nJob - -# reassign too long-standing evgen/simul jobs with active state at T2 -try: - _logger.debug('looking for stuck T2s to reassign evgensimul') - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=6) - varMap = {} - varMap[':jobStatus1'] = 'activated' - varMap[':jobStatus2'] = 'running' - varMap[':prodSourceLabel'] = 'managed' - varMap[':processingType1'] = 'evgen' - varMap[':processingType2'] = 'simul' - status,res = taskBuffer.querySQLS("SELECT cloud,computingSite,jobStatus,COUNT(*) FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus IN (:jobStatus1,:jobStatus2) AND prodSourceLabel=:prodSourceLabel AND processingType IN (:processingType1,:processingType2) GROUP BY 
cloud,computingSite,jobStatus", - varMap) - if res != None: - # get ratio of activated/running - siteStatData = {} - for tmpCloud,tmpComputingSite,tmpJobStatus,tmpCount in res: - # skip T1 - if tmpComputingSite == siteMapper.getCloud(tmpCloud)['tier1']: - continue - # add cloud/site - tmpKey = (tmpCloud,tmpComputingSite) - if not siteStatData.has_key(tmpKey): - siteStatData[tmpKey] = {'activated':0,'running':0} - # add the number of jobs - if siteStatData[tmpKey].has_key(tmpJobStatus): - siteStatData[tmpKey][tmpJobStatus] += tmpCount - # look for stuck site - stuckThr = 10 - stuckSites = [] - for tmpKey,tmpStatData in siteStatData.iteritems(): - if tmpStatData['running'] == 0 or \ - float(tmpStatData['activated'])/float(tmpStatData['running']) > stuckThr: - tmpCloud,tmpComputingSite = tmpKey - _logger.debug(' %s:%s %s/%s > %s' % (tmpCloud,tmpComputingSite,tmpStatData['activated'],tmpStatData['running'],stuckThr)) - # get stuck jobs - status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsActive4",timeLimit,['activated'],['managed'], - ['evgen','simul'],[tmpComputingSite],[tmpCloud]) - jobs = [] - if res != None: - for (id,) in res: - jobs.append(id) - _logger.debug('reassignJobs for Active T2 evgensimul %s:%s -> #%s' % (tmpCloud,tmpComputingSite,len(jobs))) - if len(jobs) > 0: - nJob = 100 - iJob = 0 - while iJob < len(jobs): - _logger.debug('reassignJobs for Active T2 evgensimul (%s)' % jobs[iJob:iJob+nJob]) - taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) - iJob += nJob -except: - errType,errValue = sys.exc_info()[:2] - _logger.error("failed to reassign T2 evgensimul with %s:%s" % (errType,errValue)) - -# reassign too long-standing jobs in active table -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=2) -status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsActive4",timeLimit,['activated'],['managed'],[],[],[]) -jobs = [] -if res != None: - for (id,) in res: - jobs.append(id) -_logger.debug('reassignJobs for long in 
active table -> #%s' % len(jobs)) -if len(jobs) != 0: - nJob = 100 - iJob = 0 - while iJob < len(jobs): - _logger.debug('reassignJobs for long in active table (%s)' % jobs[iJob:iJob+nJob]) - taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) - iJob += nJob - - -# kill too long-standing analysis jobs in active table -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7) -varMap = {} -varMap[':prodSourceLabel1'] = 'test' -varMap[':prodSourceLabel2'] = 'panda' -varMap[':prodSourceLabel3'] = 'user' -varMap[':modificationTime'] = timeLimit -status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2 OR prodSourceLabel=:prodSourceLabel3) AND modificationTime<:modificationTime ORDER BY PandaID", - varMap) -jobs = [] -if res != None: - for (id,) in res: - jobs.append(id) -# kill -if len(jobs): - Client.killJobs(jobs,2) - _logger.debug("killJobs for Anal Active (%s)" % str(jobs)) - - -# kill too long pending jobs -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) -varMap = {} -varMap[':jobStatus'] = 'pending' -varMap[':creationTime'] = timeLimit -status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsWaiting4 WHERE jobStatus=:jobStatus AND creationTime<:creationTime", - varMap) -jobs = [] -if res != None: - for (id,) in res: - jobs.append(id) -# kill -if len(jobs): - Client.killJobs(jobs,4) - _logger.debug("killJobs for Pending (%s)" % str(jobs)) - -# kill too long waiting jobs -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=1) -status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsWaiting4 WHERE creationTime<:creationTime", - {':creationTime':timeLimit}) -jobs = [] -if res != None: - for (id,) in res: - jobs.append(id) -# kill -if len(jobs): - Client.killJobs(jobs,4) - _logger.debug("killJobs for Waiting (%s)" % str(jobs)) - - -# reassign long waiting jobs -timeLimit = 
datetime.datetime.utcnow() - datetime.timedelta(minutes=30) -status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsWaiting4",timeLimit,['waiting'],['managed'],[],[],[]) -jobs = [] -if res != None: - for (id,) in res: - jobs.append(id) -_logger.debug('reassignJobs for Waiting -> #%s' % len(jobs)) -if len(jobs): - nJob = 100 - iJob = 0 - while iJob < len(jobs): - _logger.debug('reassignJobs for Waiting (%s)' % jobs[iJob:iJob+nJob]) - taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) - iJob += nJob - -# kill too long running jobs -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=21) -status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE creationTime<:creationTime", - {':creationTime':timeLimit}) -jobs = [] -if res != None: - for (id,) in res: - jobs.append(id) -# kill -if len(jobs): - nJob = 100 - iJob = 0 - while iJob < len(jobs): - # set tobekill - _logger.debug('killJobs for Running (%s)' % jobs[iJob:iJob+nJob]) - Client.killJobs(jobs[iJob:iJob+nJob],2) - # run watcher - for id in jobs[iJob:iJob+nJob]: - thr = Watcher(taskBuffer,id,single=True,sitemapper=siteMapper,sleepTime=60*24*21) - thr.start() - thr.join() - time.sleep(1) - iJob += nJob - time.sleep(10) - -# kill too long waiting ddm jobs -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=5) -varMap = {} -varMap[':prodSourceLabel'] = 'ddm' -varMap[':creationTime'] = timeLimit -status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND creationTime<:creationTime", - varMap) -jobs = [] -if res != None: - for (id,) in res: - jobs.append(id) -# kill -if len(jobs): - Client.killJobs(jobs,2) - _logger.debug("killJobs for DDM (%s)" % str(jobs)) - -_memoryCheck("closing") - - -# delete old datasets -""" -timeLimitDnS = datetime.datetime.utcnow() - datetime.timedelta(days=60) -timeLimitTop = datetime.datetime.utcnow() - datetime.timedelta(days=90) -nDelDS = 1000 -for 
dsType,dsPrefix in [('','top'),]: - sql = 'DELETE FROM ATLAS_PANDA.Datasets ' - if dsType != '': - # dis or sub - sql += 'WHERE type=:type AND modificationdate<:modificationdate ' - sql += 'AND REGEXP_LIKE(name,:pattern) AND rownum <= %s' % nDelDS - varMap = {} - varMap[':modificationdate'] = timeLimitDnS - varMap[':type'] = dsType - varMap[':pattern'] = '_%s[[:digit:]]+$' % dsPrefix - else: - # top level datasets - sql+= 'WHERE modificationdate<:modificationdate AND rownum <= %s' % nDelDS - varMap = {} - varMap[':modificationdate'] = timeLimitTop - for i in range(100): - # del datasets - ret,res = taskBuffer.querySQLS(sql, varMap) - _logger.debug('# of %s datasets deleted: %s' % (dsPrefix,res)) - # no more datasets - if res != nDelDS: - break -""" - -# thread pool -class ThreadPool: - def __init__(self): - self.lock = threading.Lock() - self.list = [] - - def add(self,obj): - self.lock.acquire() - self.list.append(obj) - self.lock.release() - - def remove(self,obj): - self.lock.acquire() - self.list.remove(obj) - self.lock.release() - - def join(self): - self.lock.acquire() - thrlist = tuple(self.list) - self.lock.release() - for thr in thrlist: - thr.join() - - -# thread to close dataset -class CloserThr (threading.Thread): - def __init__(self,lock,proxyLock,datasets,pool): - threading.Thread.__init__(self) - self.datasets = datasets - self.lock = lock - self.proxyLock = proxyLock - self.pool = pool - self.pool.add(self) - - def run(self): - self.lock.acquire() - try: - # loop over all datasets - for vuid,name,modDate in self.datasets: - _logger.debug("Close %s %s" % (modDate,name)) - if not name.startswith('pandaddm_'): - status,out = ddm.DQ2.main('freezeDataset',name) - else: - status,out = 0,'' - if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: - 
_logger.error(out) - else: - self.proxyLock.acquire() - varMap = {} - varMap[':vuid'] = vuid - varMap[':status'] = 'completed' - taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", - varMap) - self.proxyLock.release() - if name.startswith('pandaddm_'): - continue - # count # of files - status,out = ddm.DQ2.main('getNumberOfFiles',name) - _logger.debug(out) - if status != 0: - _logger.error(out) - else: - try: - nFile = int(out) - _logger.debug(nFile) - if nFile == 0: - # erase dataset - _logger.debug('erase %s' % name) - status,out = ddm.DQ2.main('eraseDataset',name) - _logger.debug(out) - except: - pass - except: - pass - self.pool.remove(self) - self.lock.release() - -# close datasets -""" -timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) -timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3) -closeLock = threading.Semaphore(5) -closeProxyLock = threading.Lock() -closeThreadPool = ThreadPool() -while True: - # lock - closeLock.acquire() - # get datasets - closeProxyLock.acquire() - varMap = {} - varMap[':modificationdateU'] = timeLimitU - varMap[':modificationdateL'] = timeLimitL - varMap[':type'] = 'output' - varMap[':status'] = 'tobeclosed' - sqlQuery = 'type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= 500' - proxyS = taskBuffer.proxyPool.getProxy() - res = proxyS.getLockDatasets(sqlQuery,varMap) - taskBuffer.proxyPool.putProxy(proxyS) - if res == None: - _logger.debug('# of datasets to be closed: %s' % res) - else: - _logger.debug('# of datasets to be closed: %s' % len(res)) - if res==None or len(res)==0: - closeProxyLock.release() - closeLock.release() - break - # release - closeProxyLock.release() - closeLock.release() - # run thread - closerThr = CloserThr(closeLock,closeProxyLock,res,closeThreadPool) - closerThr.start() - -closeThreadPool.join() -""" - -# thread to freeze dataset -class 
Freezer (threading.Thread): - def __init__(self,lock,proxyLock,datasets,pool): - threading.Thread.__init__(self) - self.datasets = datasets - self.lock = lock - self.proxyLock = proxyLock - self.pool = pool - self.pool.add(self) - - def run(self): - self.lock.acquire() - try: - for vuid,name,modDate in self.datasets: - _logger.debug("start %s %s" % (modDate,name)) - self.proxyLock.acquire() - retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ lfn FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock", - {':destinationDBlock':name}) - self.proxyLock.release() - if retF<0: - _logger.error("SQL error") - else: - # no files in filesTable - if len(resF) == 0: - _logger.debug("freeze %s " % name) - if not name.startswith('pandaddm_'): - status,out = ddm.DQ2.main('freezeDataset',name) - else: - status,out = 0,'' - if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: - _logger.error(out) - else: - self.proxyLock.acquire() - varMap = {} - varMap[':vuid'] = vuid - varMap[':status'] = 'completed' - taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", - varMap) - self.proxyLock.release() - if name.startswith('pandaddm_'): - continue - # count # of files - status,out = ddm.DQ2.main('getNumberOfFiles',name) - _logger.debug(out) - if status != 0: - _logger.error(out) - else: - try: - nFile = int(out) - _logger.debug(nFile) - if nFile == 0: - # erase dataset - _logger.debug('erase %s' % name) - status,out = ddm.DQ2.main('eraseDataset',name) - _logger.debug(out) - except: - pass - else: - _logger.debug("wait %s " % name) - self.proxyLock.acquire() - taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", 
{':vuid':vuid}) - self.proxyLock.release() - _logger.debug("end %s " % name) - except: - pass - self.pool.remove(self) - self.lock.release() - -# freeze dataset -""" -timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(days=4) -timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=14) -freezeLock = threading.Semaphore(5) -freezeProxyLock = threading.Lock() -freezeThreadPool = ThreadPool() -while True: - # lock - freezeLock.acquire() - # get datasets - sqlQuery = 'type=:type AND status IN (:status1,:status2,:status3) ' + \ - 'AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND REGEXP_LIKE(name,:pattern) AND rownum <= 500' - varMap = {} - varMap[':modificationdateU'] = timeLimitU - varMap[':modificationdateL'] = timeLimitL - varMap[':type'] = 'output' - varMap[':status1'] = 'running' - varMap[':status2'] = 'created' - varMap[':status3'] = 'defined' - varMap[':pattern'] = '_sub[[:digit:]]+$' - freezeProxyLock.acquire() - proxyS = taskBuffer.proxyPool.getProxy() - res = proxyS.getLockDatasets(sqlQuery,varMap) - taskBuffer.proxyPool.putProxy(proxyS) - if res == None: - _logger.debug('# of datasets to be frozen: %s' % res) - else: - _logger.debug('# of datasets to be frozen: %s' % len(res)) - if res==None or len(res)==0: - freezeProxyLock.release() - freezeLock.release() - break - freezeProxyLock.release() - # release - freezeLock.release() - # run freezer - freezer = Freezer(freezeLock,freezeProxyLock,res,freezeThreadPool) - freezer.start() - -freezeThreadPool.join() -""" - -# thread to delete dataset replica from T2 -class T2Cleaner (threading.Thread): - def __init__(self,lock,proxyLock,datasets,pool): - threading.Thread.__init__(self) - self.datasets = datasets - self.lock = lock - self.proxyLock = proxyLock - self.pool = pool - self.pool.add(self) - - def run(self): - self.lock.acquire() - try: - for vuid,name,modDate in self.datasets: - _logger.debug("cleanT2 %s" % name) - # get list of replicas - status,out = 
ddm.DQ2.main('listDatasetReplicas',name,0,None,False) - if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: - _logger.error(out) - continue - else: - try: - # convert res to map - exec "tmpRepSites = %s" % out - except: - tmpRepSites = {} - _logger.error("cannot convert to replica map") - _logger.error(out) - continue - # check cloud - cloudName = None - for tmpCloudName in siteMapper.getCloudList(): - t1SiteName = siteMapper.getCloud(tmpCloudName)['source'] - t1SiteDDMs = siteMapper.getSite(t1SiteName).setokens.values() - for tmpDDM in t1SiteDDMs: - if tmpRepSites.has_key(tmpDDM): - cloudName = tmpCloudName - break - # cloud is not found - if cloudName == None: - _logger.error("cannot find cloud for %s : %s" % (name,str(tmpRepSites))) - elif not cloudName in ['DE','CA','ES','FR','IT','NL','UK','TW','RU']: - # FIXME : test only EGEE for now - pass - else: - # look for T2 IDs - t2DDMs = [] - for tmpDDM in tmpRepSites.keys(): - if not tmpDDM in t1SiteDDMs and tmpDDM.endswith('_PRODDISK'): - t2DDMs.append(tmpDDM) - # delete replica for sub - if re.search('_sub\d+$',name) != None and t2DDMs != []: - _logger.debug(('deleteDatasetReplicas',name,t2DDMs)) - status,out = ddm.DQ2.main('deleteDatasetReplicas',name,t2DDMs) - if status != 0: - _logger.error(out) - if out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \ - out.find("No replica found") == -1: - continue - # update - self.proxyLock.acquire() - varMap = {} - varMap[':vuid'] = vuid - varMap[':status'] = 'completed' - taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE 
vuid=:vuid", - varMap) - self.proxyLock.release() - _logger.debug("end %s " % name) - except: - pass - self.pool.remove(self) - self.lock.release() - -# delete dataset replica from T2 -""" -timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) -timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3) -t2cleanLock = threading.Semaphore(5) -t2cleanProxyLock = threading.Lock() -t2cleanThreadPool = ThreadPool() -while True: - # lock - t2cleanLock.acquire() - # get datasets - varMap = {} - varMap[':modificationdateU'] = timeLimitU - varMap[':modificationdateL'] = timeLimitL - varMap[':type'] = 'output' - varMap[':status'] = 'cleanup' - sqlQuery = 'type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= 500' - t2cleanProxyLock.acquire() - proxyS = taskBuffer.proxyPool.getProxy() - res = proxyS.getLockDatasets(sqlQuery,varMap) - taskBuffer.proxyPool.putProxy(proxyS) - if res == None: - _logger.debug('# of datasets to be deleted from T2: %s' % res) - else: - _logger.debug('# of datasets to be deleted from T2: %s' % len(res)) - if res==None or len(res)==0: - t2cleanProxyLock.release() - t2cleanLock.release() - break - t2cleanProxyLock.release() - # release - t2cleanLock.release() - # run t2cleanr - t2cleanr = T2Cleaner(t2cleanLock,t2cleanProxyLock,res,t2cleanThreadPool) - t2cleanr.start() - -t2cleanThreadPool.join() -""" - - -_memoryCheck("delete XML") - -# delete old files in DA cache -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7) -files = os.listdir(panda_config.cache_dir) -for file in files: - # skip special test file - if file == 'sources.72c48dc5-f055-43e5-a86e-4ae9f8ea3497.tar.gz': - continue - if file == 'sources.090f3f51-fc81-4e80-9749-a5e4b2bd58de.tar.gz': - continue - try: - # get timestamp - timestamp = datetime.datetime.fromtimestamp(os.stat('%s/%s' % (panda_config.cache_dir,file)).st_mtime) - # delete - if timestamp < timeLimit: - 
_logger.debug("delete %s " % file) - os.remove('%s/%s' % (panda_config.cache_dir,file)) - except: - pass - - -_memoryCheck("delete core") - -# delete core -dirName = '%s/..' % panda_config.logdir -for file in os.listdir(dirName): - if file.startswith('core.'): - _logger.debug("delete %s " % file) - try: - os.remove('%s/%s' % (dirName,file)) - except: - pass - - -_memoryCheck("finisher") - -# finish transferring jobs -""" -timeNow = datetime.datetime.utcnow() -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) -sql = 'SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime AND rownum<=20' -for ii in range(1000): - varMap = {} - varMap[':jobStatus'] = 'transferring' - varMap[':modificationTime'] = timeLimit - ret,res = taskBuffer.querySQLS(sql, varMap) - if res == None: - _logger.debug('# of jobs to be finished : %s' % res) - break - else: - _logger.debug('# of jobs to be finished : %s' % len(res)) - if len(res) == 0: - break - # get jobs from DB - ids = [] - for (id,) in res: - ids.append(id) - jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False) - # update modificationTime to lock jobs - for job in jobs: - if job != None and job.jobStatus != 'unknown': - taskBuffer.updateJobStatus(job.PandaID,job.jobStatus,{}) - upJobs = [] - finJobs = [] - for job in jobs: - if job == None or job.jobStatus == 'unknown': - continue - # use BNL by default - dq2URL = siteMapper.getSite('BNL_ATLAS_1').dq2url - dq2SE = [] - # get LFC and SEs - if job.prodSourceLabel == 'user' and not siteMapper.siteSpecList.has_key(job.destinationSE): - # using --destSE for analysis job to transfer output - try: - dq2URL = dataservice.DDM.toa.getLocalCatalog(job.destinationSE)[-1] - match = re.search('.+://([^:/]+):*\d*/*',dataservice.DDM.toa.getSiteProperty(job.destinationSE,'srm')[-1]) - if match != None: - dq2SE.append(match.group(1)) - except: - type, value, traceBack = sys.exc_info() - 
_logger.error('Failed to get DQ2/SE for %s with %s %s' % (job.PandaID,type,value)) - continue - elif siteMapper.checkCloud(job.cloud): - # normal production jobs - tmpDstID = siteMapper.getCloud(job.cloud)['dest'] - tmpDstSite = siteMapper.getSite(tmpDstID) - if not tmpDstSite.lfchost in [None,'']: - # LFC - dq2URL = 'lfc://'+tmpDstSite.lfchost+':/grid/atlas/' - if tmpDstSite.se != None: - for tmpDstSiteSE in tmpDstSite.se.split(','): - match = re.search('.+://([^:/]+):*\d*/*',tmpDstSiteSE) - if match != None: - dq2SE.append(match.group(1)) - else: - # LRC - dq2URL = tmpDstSite.dq2url - dq2SE = [] - # get LFN list - lfns = [] - guids = [] - nTokens = 0 - for file in job.Files: - # only output files are checked - if file.type == 'output' or file.type == 'log': - lfns.append(file.lfn) - guids.append(file.GUID) - nTokens += len(file.destinationDBlockToken.split(',')) - # get files in LRC - _logger.debug('Cloud:%s DQ2URL:%s' % (job.cloud,dq2URL)) - okFiles = brokerage.broker_util.getFilesFromLRC(lfns,dq2URL,guids,dq2SE,getPFN=True) - # count files - nOkTokens = 0 - for okLFN,okPFNs in okFiles.iteritems(): - nOkTokens += len(okPFNs) - # check all files are ready - _logger.debug(' nToken:%s nOkToken:%s' % (nTokens,nOkTokens)) - if nTokens <= nOkTokens: - _logger.debug('Finisher : Finish %s' % job.PandaID) - for file in job.Files: - if file.type == 'output' or file.type == 'log': - file.status = 'ready' - # append to run Finisher - finJobs.append(job) - else: - endTime = job.endTime - if endTime == 'NULL': - endTime = job.startTime - # priority-dependent timeout - tmpCloudSpec = siteMapper.getCloud(job.cloud) - if job.currentPriority >= 900 and (not job.prodSourceLabel in ['user']): - if tmpCloudSpec.has_key('transtimehi'): - timeOutValue = tmpCloudSpec['transtimehi'] - else: - timeOutValue = 1 - else: - if tmpCloudSpec.has_key('transtimelo'): - timeOutValue = tmpCloudSpec['transtimelo'] - else: - timeOutValue = 2 - # protection - if timeOutValue < 1: - timeOutValue = 1 - 
timeOut = timeNow - datetime.timedelta(days=timeOutValue) - _logger.debug(' Priority:%s Limit:%s End:%s' % (job.currentPriority,str(timeOut),str(endTime))) - if endTime < timeOut: - # timeout - _logger.debug('Finisher : Kill %s' % job.PandaID) - strMiss = '' - for lfn in lfns: - if not lfn in okFiles: - strMiss += ' %s' % lfn - job.jobStatus = 'failed' - job.taskBufferErrorCode = taskbuffer.ErrorCode.EC_Transfer - job.taskBufferErrorDiag = 'transfer timeout for '+strMiss - guidMap = {} - for file in job.Files: - # set file status - if file.status == 'transferring': - file.status = 'failed' - # collect GUIDs to delete files from _tid datasets - if file.type == 'output' or file.type == 'log': - if not guidMap.has_key(file.destinationDBlock): - guidMap[file.destinationDBlock] = [] - guidMap[file.destinationDBlock].append(file.GUID) - else: - # wait - _logger.debug('Finisher : Wait %s' % job.PandaID) - for lfn in lfns: - if not lfn in okFiles: - _logger.debug(' -> %s' % lfn) - upJobs.append(job) - # update - _logger.debug('updating ...') - taskBuffer.updateJobs(upJobs,False) - # run Finisher - for job in finJobs: - fThr = Finisher(taskBuffer,None,job) - fThr.start() - fThr.join() - _logger.debug('done') - time.sleep(random.randint(1,10)) -""" - -# update email DB -_memoryCheck("email") -_logger.debug("Update emails") - -# lock file -_lockGetMail = open(panda_config.lockfile_getMail, 'w') -# lock email DB -fcntl.flock(_lockGetMail.fileno(), fcntl.LOCK_EX) -# open email DB -pDB = shelve.open(panda_config.emailDB) -# read -mailMap = {} -for name,addr in pDB.iteritems(): - mailMap[name] = addr -# close DB -pDB.close() -# release file lock -fcntl.flock(_lockGetMail.fileno(), fcntl.LOCK_UN) -# set email address -for name,addr in mailMap.iteritems(): - # remove _ - name = re.sub('_$','',name) - status,res = taskBuffer.querySQLS("SELECT email FROM ATLAS_PANDAMETA.users WHERE name=:name",{':name':name}) - # failed or not found - if status == -1 or len(res) == 0: - 
_logger.error("%s not found in user DB" % name) - continue - # already set - if not res[0][0] in ['','None',None]: - continue - # update email - _logger.debug("set '%s' to %s" % (name,addr)) - status,res = taskBuffer.querySQLS("UPDATE ATLAS_PANDAMETA.users SET email=:addr WHERE name=:name",{':addr':addr,':name':name}) - -# reassign reprocessing jobs in defined table -_memoryCheck("repro") -class ReassginRepro (threading.Thread): - def __init__(self,taskBuffer,lock,jobs): - threading.Thread.__init__(self) - self.jobs = jobs - self.lock = lock - self.taskBuffer = taskBuffer - - def run(self): - self.lock.acquire() - try: - if len(self.jobs): - nJob = 100 - iJob = 0 - while iJob < len(self.jobs): - # reassign jobs one by one to break dis dataset formation - for job in self.jobs[iJob:iJob+nJob]: - _logger.debug('reassignJobs in Pepro (%s)' % [job]) - self.taskBuffer.reassignJobs([job],joinThr=True) - iJob += nJob - except: - pass - self.lock.release() - -reproLock = threading.Semaphore(3) - -nBunch = 20 -iBunch = 0 -timeLimitMod = datetime.datetime.utcnow() - datetime.timedelta(hours=8) -timeLimitCre = datetime.datetime.utcnow() - datetime.timedelta(hours=24) -firstFlag = True -while True: - # lock - reproLock.acquire() - # get jobs - varMap = {} - varMap[':jobStatus'] = 'assigned' - varMap[':prodSourceLabel'] = 'managed' - varMap[':modificationTime'] = timeLimitMod - varMap[':creationTime'] = timeLimitCre - varMap[':processingType'] = 'reprocessing' - if firstFlag: - firstFlag = False - status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND prodSourceLabel=:prodSourceLabel AND modificationTime<:modificationTime AND creationTime<:creationTime AND processingType=:processingType ORDER BY PandaID", - varMap) - if res != None: - _logger.debug('total Repro for reassignJobs : %s' % len(res)) - # get a bunch - status,res = taskBuffer.querySQLS("SELECT * FROM (SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE 
jobStatus=:jobStatus AND prodSourceLabel=:prodSourceLabel AND modificationTime<:modificationTime AND creationTime<:creationTime AND processingType=:processingType ORDER BY PandaID) WHERE rownum<=%s" % nBunch, - varMap) - # escape - if res == None or len(res) == 0: - reproLock.release() - break - - # get IDs - jobs=[] - for id, in res: - jobs.append(id) - - # reassign - _logger.debug('reassignJobs for Pepro %s' % (iBunch*nBunch)) - # lock - currentTime = datetime.datetime.utcnow() - for jobID in jobs: - varMap = {} - varMap[':PandaID'] = jobID - varMap[':modificationTime'] = currentTime - status,res = taskBuffer.querySQLS("UPDATE ATLAS_PANDA.jobsDefined4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID", - varMap) - reproLock.release() - # run thr - reproThr = ReassginRepro(taskBuffer,reproLock,jobs) - reproThr.start() - iBunch += 1 - -_memoryCheck("end") - -_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/copyArchive.sh b/current/pandaserver/test/copyArchive.sh deleted file mode 100755 index 220f01ee2..000000000 --- a/current/pandaserver/test/copyArchive.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# Panda home -export PANDA_HOME=/home/sm/prod - -# for python -export PYTHONPATH=$PANDA_HOME/panda:$PYTHONPATH - -python $PANDA_HOME/panda/test/copyArchive.py diff --git a/current/pandaserver/test/copyROOT.py b/current/pandaserver/test/copyROOT.py deleted file mode 100644 index aeca74801..000000000 --- a/current/pandaserver/test/copyROOT.py +++ /dev/null @@ -1,81 +0,0 @@ -import os -import re -import sys -from ftplib import FTP -from pandalogger.PandaLogger import PandaLogger - -# supported architectures -targetArchs = ['Linux-slc5-gcc4.3.tar.gz','Linux-slc5_amd64-gcc4.3.tar.gz'] - -# destination dir -destDir = '/data/atlpan/srv/var/appdir' - -# logger -_logger = PandaLogger().getLogger('copyROOT') - -_logger.debug("===================== start =====================") - -try: - # login to root 
repository - ftp = FTP('root.cern.ch') - output = ftp.login() - _logger.debug(output) - output = ftp.cwd('root') - _logger.debug(output) - # get list - flist = ftp.nlst() - # loop over all files - for tmpFile in flist: - # skip RC - if re.search('-rc\d\.',tmpFile) != None: - continue - # check arch - supportedFlag = False - for tmpArch in targetArchs: - if tmpFile.endswith(tmpArch): - supportedFlag = True - break - # copy - if supportedFlag: - _logger.debug('start %s' % tmpFile) - dstFileName = '%s/%s' % (destDir,tmpFile) - # check local - if os.path.exists(dstFileName): - # get remote size - rsize = ftp.size(tmpFile) - if rsize == None: - _logger.debug(' cannot get remote size for %s' % tmpFile) - else: - # local size - lsize = os.path.getsize(dstFileName) - if lsize == rsize: - _logger.debug('skip since alredy there %s' % tmpFile) - continue - # copy - _logger.debug('copy %s' % tmpFile) - outFile = open(dstFileName,'wb') - ftp.retrbinary('RETR %s' % tmpFile,outFile.write) - outFile.close() - _logger.debug('end %s' % tmpFile) - # quit - output = ftp.quit() - _logger.debug(output) - # make list - listFileName = 'applist' - listFilePath = '%s/%s' % (destDir,listFileName) - listFile = open(listFilePath,'w') - for tmpFile in os.listdir(destDir): - # skip hidden files - if tmpFile.startswith('.'): - continue - # skip applist - if tmpFile == listFileName: - continue - listFile.write('%s\n' % tmpFile) - listFile.close() -except: - errType,errValue = sys.exc_info()[:2] - _logger.error("Failed with %s %s" % (errType,errValue)) - - -_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/createPandaSiteIDs.py b/current/pandaserver/test/createPandaSiteIDs.py deleted file mode 100644 index 34f8ef816..000000000 --- a/current/pandaserver/test/createPandaSiteIDs.py +++ /dev/null @@ -1,54 +0,0 @@ -import re -from jobscheduler import siteinfo - -from taskbuffer.DBProxy import DBProxy - -# password -from config import panda_config 
-passwd = panda_config.dbpasswd - -proxyN = DBProxy() -proxyN.connect(panda_config.logdbhost,panda_config.logdbpasswd,panda_config.logdbuser,'PandaMetaDB') - -status,res = proxyN.querySQLS("SELECT nickname from schedconfig") - -nicknames = [] -for (nickname,) in res: - nicknames.append(nickname) - - -print "PandaSiteIDs = {" -sites = siteinfo.sites.keys() -sites.sort() -for site in sites: - vals = siteinfo.sites[site] - okFlag = vals[10] - fName = '' - sitePat = site - sitePat = re.sub('_PAUL','',sitePat) - sitePat = re.sub('_TEST$','',sitePat) - sitePat = re.sub('_test$','',sitePat) - sitePat = re.sub('^ANALY_LONG_','',sitePat) - sitePat = re.sub('^ANALY_','',sitePat) - if site == 'SLACXRD': - sitePat = 'slac' - if site == 'UVIC': - sitePat = 'VICTORIA' - if sitePat == 'LYON': - sitePat = 'IN2P3-CC-T2' - if sitePat == 'Purdue-ITB': - sitePat = 'Purdue' - if sitePat == "BNL": - sitePat = "BNL_ATLAS" - if sitePat == "RAL": - sitePat = "RAL-LCG2" - if sitePat == "SACLAY": - sitePat = "GRIF-DAPNIA" - for nickname in nicknames: - if re.search(sitePat,nickname,re.I) != None: - fName = nickname - if fName == '': - #print site, sitePat - fName = 'BNL_ATLAS_1-condor' - print " %-22s : {'nickname':'%s','status':'%s'}," % ("'"+site+"'",fName,okFlag) -print "}" diff --git a/current/pandaserver/test/datasetManager.py b/current/pandaserver/test/datasetManager.py deleted file mode 100644 index b5f8b7189..000000000 --- a/current/pandaserver/test/datasetManager.py +++ /dev/null @@ -1,924 +0,0 @@ -import os -import re -import sys -import time -import fcntl -import types -import shelve -import random -import datetime -import commands -import threading -import userinterface.Client as Client -from dataservice.DDM import ddm -from dataservice.DDM import dashBorad -from taskbuffer.OraDBProxy import DBProxy -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger -from jobdispatcher.Watcher import Watcher -from brokerage.SiteMapper import SiteMapper 
-from dataservice.Adder import Adder -from dataservice.Finisher import Finisher -from dataservice.MailUtils import MailUtils -from taskbuffer import ProcessGroups -import brokerage.broker_util -import brokerage.broker -import taskbuffer.ErrorCode -import dataservice.DDM - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# logger -_logger = PandaLogger().getLogger('datasetManager') - -_logger.debug("===================== start =====================") - -# use native DQ2 -ddm.useDirectDQ2() - -# memory checker -def _memoryCheck(str): - try: - proc_status = '/proc/%d/status' % os.getpid() - procfile = open(proc_status) - name = "" - vmSize = "" - vmRSS = "" - # extract Name,VmSize,VmRSS - for line in procfile: - if line.startswith("Name:"): - name = line.split()[-1] - continue - if line.startswith("VmSize:"): - vmSize = "" - for item in line.split()[1:]: - vmSize += item - continue - if line.startswith("VmRSS:"): - vmRSS = "" - for item in line.split()[1:]: - vmRSS += item - continue - procfile.close() - _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str)) - except: - type, value, traceBack = sys.exc_info() - _logger.error("memoryCheck() : %s %s" % (type,value)) - _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str)) - return - -_memoryCheck("start") - -# kill old dq2 process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('ps axo user,pid,lstart,args | grep dq2.clientapi | grep -v PYTHONPATH | grep -v grep') - for line in out.split('\n'): - if line == '': - continue - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - 
startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old dq2 process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill dq2 process : %s %s" % (type,value)) - - -# kill old process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=7) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName) - for line in out.split('\n'): - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill process : %s %s" % (type,value)) - - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -# instantiate sitemapper -siteMapper = SiteMapper(taskBuffer) - - -# list with lock -class ListWithLock: - def __init__(self): - self.lock = threading.Lock() - self.list = [] - - def __contains__(self,item): - self.lock.acquire() - ret = self.list.__contains__(item) - self.lock.release() - return ret - - def append(self,item): - appended = False - self.lock.acquire() - if not item in self.list: - self.list.append(item) - appended = True - self.lock.release() - return appended - - -# list of dis datasets to be deleted -deletedDisList = ListWithLock() - - -# set tobedeleted to dis 
dataset -def setTobeDeletedToDis(subDsName): - try: - # only production sub datasets - if subDsName.startswith('user') or subDsName.startswith('group') or \ - subDsName.startswith('pandaddm_') or re.search('_sub\d+$',subDsName)==None: - return - # get _dis names with _sub - disNameList = taskBuffer.getAssociatedDisDatasets(subDsName) - _logger.debug("setTobeDeletedToDis : sub:%s has dis:%s" % (subDsName,str(disNameList))) - # loop over all _dis datasets - for tmpDisName in disNameList: - # try to append to locked list - if not deletedDisList.append(tmpDisName): - # another thread already took care of the _dis - continue - # get dataset - _logger.debug("setTobeDeletedToDis : try to get %s in DB" % tmpDisName) - tmpDS = taskBuffer.queryDatasetWithMap({'name':tmpDisName}) - if tmpDS == None: - _logger.error("setTobeDeletedToDis : cannot get %s in DB" % tmpDisName) - continue - # check status - if tmpDS.status in ['tobedeleted','deleted']: - _logger.debug("setTobeDeletedToDis : skip %s since status=%s" % (tmpDisName,tmpDS.status)) - continue - # check the number of failed jobs associated to the _dis - if tmpDS.currentfiles == 0: - # all succeeded - tmpDS.status = 'deleting' - excStatus = 'deleted' - else: - # some failed, to reduce the lifetime - tmpDS.status = 'shortening' - excStatus = 'shortened' - # update dataset - retU = taskBuffer.updateDatasets([tmpDS],withLock=True,withCriteria="status<>:crStatus", - criteriaMap={':crStatus':excStatus}) - _logger.debug("setTobeDeletedToDis : set %s to %s with %s" % (tmpDS.status,tmpDisName,str(retU))) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("setTobeDeletedToDis : %s %s %s" % (subDsName,errType,errValue)) - - -# thread pool -class ThreadPool: - def __init__(self): - self.lock = threading.Lock() - self.list = [] - - def add(self,obj): - self.lock.acquire() - self.list.append(obj) - self.lock.release() - - def remove(self,obj): - self.lock.acquire() - self.list.remove(obj) - self.lock.release() - - def 
join(self): - self.lock.acquire() - thrlist = tuple(self.list) - self.lock.release() - for thr in thrlist: - thr.join() - - -# thread to close dataset -class CloserThr (threading.Thread): - def __init__(self,lock,proxyLock,datasets,pool): - threading.Thread.__init__(self) - self.datasets = datasets - self.lock = lock - self.proxyLock = proxyLock - self.pool = pool - self.pool.add(self) - - def run(self): - self.lock.acquire() - try: - # loop over all datasets - for vuid,name,modDate in self.datasets: - _logger.debug("Close %s %s" % (modDate,name)) - if not name.startswith('pandaddm_'): - status,out = ddm.DQ2.main('freezeDataset',name) - else: - status,out = 0,'' - if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: - _logger.error(out) - else: - self.proxyLock.acquire() - varMap = {} - varMap[':vuid'] = vuid - varMap[':newstatus'] = 'completed' - varMap[':oldstatus'] = 'tobeclosed' - taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:newstatus,modificationdate=CURRENT_DATE WHERE vuid=:vuid AND status=:oldstatus", - varMap) - self.proxyLock.release() - if name.startswith('pandaddm_'): - continue - # set tobedeleted to dis - setTobeDeletedToDis(name) - # count # of files - status,out = ddm.DQ2.main('getNumberOfFiles',name) - _logger.debug(out) - if status != 0: - _logger.error(out) - else: - try: - nFile = int(out) - _logger.debug(nFile) - if nFile == 0: - # erase dataset - _logger.debug('erase %s' % name) - status,out = ddm.DQ2.main('eraseDataset',name) - _logger.debug('OK with %s' % name) - except: - pass - except: - pass - self.pool.remove(self) - self.lock.release() - -# close datasets -_logger.debug("==== close datasets ====") -timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) -timeLimitL = datetime.datetime.utcnow() - 
datetime.timedelta(days=3) -closeLock = threading.Semaphore(5) -closeProxyLock = threading.Lock() -closeThreadPool = ThreadPool() -maxRows = 100000 -while True: - # lock - closeLock.acquire() - # get datasets - closeProxyLock.acquire() - varMap = {} - varMap[':modificationdateU'] = timeLimitU - varMap[':modificationdateL'] = timeLimitL - varMap[':type'] = 'output' - varMap[':status'] = 'tobeclosed' - sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows - proxyS = taskBuffer.proxyPool.getProxy() - res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60') - taskBuffer.proxyPool.putProxy(proxyS) - if res == None: - _logger.debug("# of datasets to be closed: %s" % res) - else: - _logger.debug("# of datasets to be closed: %s" % len(res)) - if res==None or len(res)==0: - closeProxyLock.release() - closeLock.release() - break - # release - closeProxyLock.release() - closeLock.release() - # run thread - iRows = 0 - nRows = 500 - while iRows < len(res): - closerThr = CloserThr(closeLock,closeProxyLock,res[iRows:iRows+nRows],closeThreadPool) - closerThr.start() - iRows += nRows - closeThreadPool.join() - if len(res) < maxRows: - break - - -# thread to freeze dataset -class Freezer (threading.Thread): - def __init__(self,lock,proxyLock,datasets,pool): - threading.Thread.__init__(self) - self.datasets = datasets - self.lock = lock - self.proxyLock = proxyLock - self.pool = pool - self.pool.add(self) - - def run(self): - self.lock.acquire() - try: - for vuid,name,modDate in self.datasets: - _logger.debug("start %s %s" % (modDate,name)) - self.proxyLock.acquire() - retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ lfn FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND NOT status IN (:status1,:status2,:status3)", - {':destinationDBlock':name,':status1':'ready',':status2':'failed',':status3':'skipped'}) - 
self.proxyLock.release() - if retF<0: - _logger.error("SQL error") - else: - # no files in filesTable - if len(resF) == 0: - _logger.debug("freeze %s " % name) - if not name.startswith('pandaddm_'): - status,out = ddm.DQ2.main('freezeDataset',name) - else: - status,out = 0,'' - if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: - _logger.error(out) - else: - self.proxyLock.acquire() - varMap = {} - varMap[':vuid'] = vuid - varMap[':status'] = 'completed' - taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", - varMap) - self.proxyLock.release() - if name.startswith('pandaddm_'): - continue - # set tobedeleted to dis - setTobeDeletedToDis(name) - # count # of files - status,out = ddm.DQ2.main('getNumberOfFiles',name) - _logger.debug(out) - if status != 0: - _logger.error(out) - else: - try: - nFile = int(out) - _logger.debug(nFile) - if nFile == 0: - # erase dataset - _logger.debug('erase %s' % name) - status,out = ddm.DQ2.main('eraseDataset',name) - _logger.debug('OK with %s' % name) - except: - pass - else: - _logger.debug("wait %s " % name) - self.proxyLock.acquire() - taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid}) - self.proxyLock.release() - _logger.debug("end %s " % name) - except: - pass - self.pool.remove(self) - self.lock.release() - -# freeze dataset -_logger.debug("==== freeze datasets ====") -timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(days=4) -timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=14) -freezeLock = threading.Semaphore(5) -freezeProxyLock = threading.Lock() -freezeThreadPool = ThreadPool() -maxRows = 100000 -while True: - # lock - freezeLock.acquire() - # get datasets - 
sqlQuery = "type=:type AND status IN (:status1,:status2,:status3,:status4) " + \ - "AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND subType=:subType AND rownum <= %s" % maxRows - varMap = {} - varMap[':modificationdateU'] = timeLimitU - varMap[':modificationdateL'] = timeLimitL - varMap[':type'] = 'output' - varMap[':status1'] = 'running' - varMap[':status2'] = 'created' - varMap[':status3'] = 'defined' - varMap[':status4'] = 'locked' - varMap[':subType'] = 'sub' - freezeProxyLock.acquire() - proxyS = taskBuffer.proxyPool.getProxy() - res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60') - taskBuffer.proxyPool.putProxy(proxyS) - if res == None: - _logger.debug("# of datasets to be frozen: %s" % res) - else: - _logger.debug("# of datasets to be frozen: %s" % len(res)) - if res==None or len(res)==0: - freezeProxyLock.release() - freezeLock.release() - break - freezeProxyLock.release() - # release - freezeLock.release() - # run freezer - iRows = 0 - nRows = 500 - while iRows < len(res): - freezer = Freezer(freezeLock,freezeProxyLock,res[iRows:iRows+nRows],freezeThreadPool) - freezer.start() - iRows += nRows - freezeThreadPool.join() - if len(res) < maxRows: - break - - -# thread to delete dataset replica from T2 -class T2Cleaner (threading.Thread): - def __init__(self,lock,proxyLock,datasets,pool): - threading.Thread.__init__(self) - self.datasets = datasets - self.lock = lock - self.proxyLock = proxyLock - self.pool = pool - self.pool.add(self) - - def run(self): - self.lock.acquire() - try: - for vuid,name,modDate in self.datasets: - _logger.debug("cleanT2 %s" % name) - # get list of replicas - status,out = ddm.DQ2.main('listDatasetReplicas',name,0,None,False) - if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: - 
_logger.error(out) - continue - else: - if out.find("DQUnknownDatasetException") == -1 and out.find("DQDeletedDatasetException") == -1: - listOut = out - try: - # convert res to map - exec "tmpRepSites = %s" % out - except: - tmpRepSites = {} - _logger.error("cannot convert to replica map") - _logger.error(out) - continue - # check if there is active subscription - _logger.debug('listSubscriptions %s' % name) - subStat,subOut = ddm.DQ2.main('listSubscriptions',name) - if subStat != 0: - _logger.error("cannot get subscriptions for %s" % name) - _logger.error(subOut) - _logger.debug('subscriptions for %s = %s' % (name,subOut)) - # active subscriotions - if subOut != '[]': - _logger.debug("wait %s due to active subscription" % name) - continue - # check cloud - self.proxyLock.acquire() - proxyS = taskBuffer.proxyPool.getProxy() - destSE = proxyS.getDestSEwithDestDBlock(name) - taskBuffer.proxyPool.putProxy(proxyS) - self.proxyLock.release() - cloudName = None - if siteMapper.checkSite(destSE): - cloudName = siteMapper.getSite(destSE).cloud - # cloud is not found - if cloudName == None: - _logger.error("cannot find cloud for %s : %s" % (name,str(tmpRepSites))) - else: - _logger.debug('cloud=%s for %s' % (cloudName,name)) - t1SiteDDMs = siteMapper.getSite(destSE).setokens.values() - # look for T2 IDs - t2DDMs = [] - for tmpDDM in tmpRepSites.keys(): - if not tmpDDM in t1SiteDDMs: - # check home cloud - notDeleteFlag = False - for tmpT2siteID,tmpT2siteSpec in siteMapper.siteSpecList.iteritems(): - if tmpT2siteSpec.ddm == tmpDDM: - # not delete if src and dest are in US. 
OSG is regarded as US due to tier1 - if tmpT2siteSpec.cloud in ['US'] and cloudName in ['US','OSG']: - notDeleteFlag = True - if not notDeleteFlag: - t2DDMs.append(tmpDDM) - # delete replica for sub - if re.search('_sub\d+$',name) != None and t2DDMs != []: - setMetaFlag = True - for tmpT2DDM in t2DDMs: - _logger.debug('setReplicaMetaDataAttribute %s %s' % (name,tmpT2DDM)) - status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',name,tmpT2DDM,'pin_lifetime','') - if status != 0: - _logger.error(out) - if out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \ - out.find("No replica found") == -1: - setMetaFlag = False - if not setMetaFlag: - continue - _logger.debug(('deleteDatasetReplicas',name,t2DDMs)) - status,out = ddm.DQ2.main('deleteDatasetReplicas',name,t2DDMs,0,False,False,False,False,False,'00:00:00') - if status != 0: - _logger.error(out) - if out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \ - out.find("No replica found") == -1: - continue - else: - _logger.debug('no delete for %s due to empty target in %s' % (name,listOut)) - # update - self.proxyLock.acquire() - varMap = {} - varMap[':vuid'] = vuid - varMap[':status'] = 'completed' - taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", - varMap) - self.proxyLock.release() - _logger.debug("end %s " % name) - except: - pass - self.pool.remove(self) - self.lock.release() - -# delete dataset replica from T2 -_logger.debug("==== delete datasets from T2 ====") -timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) -timeLimitL = 
datetime.datetime.utcnow() - datetime.timedelta(days=3) -t2cleanLock = threading.Semaphore(5) -t2cleanProxyLock = threading.Lock() -t2cleanThreadPool = ThreadPool() -maxRows = 100000 -while True: - # lock - t2cleanLock.acquire() - # get datasets - varMap = {} - varMap[':modificationdateU'] = timeLimitU - varMap[':modificationdateL'] = timeLimitL - varMap[':type'] = 'output' - varMap[':status'] = 'cleanup' - sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows - t2cleanProxyLock.acquire() - proxyS = taskBuffer.proxyPool.getProxy() - res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60') - taskBuffer.proxyPool.putProxy(proxyS) - if res == None: - _logger.debug("# of datasets to be deleted from T2: %s" % res) - else: - _logger.debug("# of datasets to be deleted from T2: %s" % len(res)) - if res==None or len(res)==0: - t2cleanProxyLock.release() - t2cleanLock.release() - break - t2cleanProxyLock.release() - # release - t2cleanLock.release() - # run t2cleanr - iRows = 0 - nRows = 500 - while iRows < len(res): - t2cleanr = T2Cleaner(t2cleanLock,t2cleanProxyLock,res[iRows:iRows+nRows],t2cleanThreadPool) - t2cleanr.start() - iRows += nRows - t2cleanThreadPool.join() - if len(res) < maxRows: - break - - -# delete dis datasets -class EraserThr (threading.Thread): - def __init__(self,lock,proxyLock,datasets,pool,operationType): - threading.Thread.__init__(self) - self.datasets = datasets - self.lock = lock - self.proxyLock = proxyLock - self.pool = pool - self.pool.add(self) - self.operationType = operationType - - def run(self): - self.lock.acquire() - try: - # loop over all datasets - for vuid,name,modDate in self.datasets: - # only dis datasets - if re.search('_dis\d+$',name) == None: - _logger.error("Eraser : non disDS %s" % name) - continue - # delete - _logger.debug("Eraser %s dis %s %s" % (self.operationType,modDate,name)) - # delete or shorten - if 
self.operationType == 'deleting': - # erase - endStatus = 'deleted' - status,out = ddm.DQ2.main('eraseDataset',name) - if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: - _logger.error(out) - continue - else: - # change replica lifetime - endStatus = 'shortened' - # get list of replicas - status,out = ddm.DQ2.main('listDatasetReplicas',name,0,None,False) - if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: - _logger.error(out) - continue - if out.find("DQUnknownDatasetException") == -1 and out.find("DQDeletedDatasetException") == -1: - try: - # convert res to map - exec "tmpRepSites = %s" % out - except: - tmpRepSites = {} - _logger.error("cannot convert to replica map") - _logger.error(out) - continue - # set replica lifetime - setMetaFlag = True - for tmpDDM in tmpRepSites.keys(): - _logger.debug('setReplicaMetaDataAttribute %s %s' % (name,tmpDDM)) - status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',name,tmpDDM,'lifetime','1 days') - if status != 0: - _logger.error(out) - if out.find('DQFrozenDatasetException') == -1 and \ - out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ - out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \ - out.find("No replica found") == -1: - setMetaFlag = False - if not setMetaFlag: - continue - _logger.debug('OK with %s' % name) - # update - self.proxyLock.acquire() - varMap = {} - varMap[':vuid'] = vuid - varMap[':status'] = endStatus - taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE 
vuid=:vuid", - varMap) - self.proxyLock.release() - except: - pass - self.pool.remove(self) - self.lock.release() - -# delete dis datasets -_logger.debug("==== delete dis datasets ====") -timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) -timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3) -disEraseLock = threading.Semaphore(5) -disEraseProxyLock = threading.Lock() -disEraseThreadPool = ThreadPool() -maxRows = 100000 -for targetStatus in ['deleting','shortening']: - # lock - disEraseLock.acquire() - # get datasets - varMap = {} - varMap[':modificationdateU'] = timeLimitU - varMap[':modificationdateL'] = timeLimitL - varMap[':type'] = 'dispatch' - varMap[':status'] = targetStatus - sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows - disEraseProxyLock.acquire() - proxyS = taskBuffer.proxyPool.getProxy() - res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60') - taskBuffer.proxyPool.putProxy(proxyS) - if res == None: - _logger.debug("# of dis datasets for %s: None" % targetStatus) - else: - _logger.debug("# of dis datasets for %s: %s" % (targetStatus,len(res))) - if res==None or len(res)==0: - disEraseProxyLock.release() - disEraseLock.release() - break - disEraseProxyLock.release() - # release - disEraseLock.release() - # run disEraser - iRows = 0 - nRows = 500 - while iRows < len(res): - disEraser = EraserThr(disEraseLock,disEraseProxyLock,res[iRows:iRows+nRows], - disEraseThreadPool,targetStatus) - disEraser.start() - iRows += nRows - disEraseThreadPool.join() - - -_memoryCheck("finisher") - -# finisher thread -class FinisherThr (threading.Thread): - def __init__(self,lock,proxyLock,ids,pool,timeNow): - threading.Thread.__init__(self) - self.ids = ids - self.lock = lock - self.proxyLock = proxyLock - self.pool = pool - self.timeNow = timeNow - self.pool.add(self) - - def run(self): - self.lock.acquire() - try: - # get 
jobs from DB - ids = self.ids - self.proxyLock.acquire() - jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False) - self.proxyLock.release() - upJobs = [] - finJobs = [] - for job in jobs: - if job == None or job.jobStatus == 'unknown': - continue - # use BNL by default - dq2URL = siteMapper.getSite('BNL_ATLAS_1').dq2url - dq2SE = [] - # get LFC and SEs - if job.prodSourceLabel == 'user' and not siteMapper.siteSpecList.has_key(job.destinationSE): - # using --destSE for analysis job to transfer output - try: - dq2URL = dataservice.DDM.toa.getLocalCatalog(job.destinationSE)[-1] - match = re.search('.+://([^:/]+):*\d*/*',dataservice.DDM.toa.getSiteProperty(job.destinationSE,'srm')[-1]) - if match != None: - dq2SE.append(match.group(1)) - except: - type, value, traceBack = sys.exc_info() - _logger.error("%s Failed to get DQ2/SE with %s %s" % (job.PandaID,type,value)) - continue - elif siteMapper.checkCloud(job.cloud): - # normal production jobs - tmpDstID = siteMapper.getCloud(job.cloud)['dest'] - tmpDstSite = siteMapper.getSite(tmpDstID) - if not tmpDstSite.lfchost in [None,'']: - # LFC - dq2URL = 'lfc://'+tmpDstSite.lfchost+':/grid/atlas/' - if tmpDstSite.se != None: - for tmpDstSiteSE in tmpDstSite.se.split(','): - match = re.search('.+://([^:/]+):*\d*/*',tmpDstSiteSE) - if match != None: - dq2SE.append(match.group(1)) - else: - # LRC - dq2URL = tmpDstSite.dq2url - dq2SE = [] - # get LFN list - lfns = [] - guids = [] - nTokens = 0 - for file in job.Files: - # only output files are checked - if file.type == 'output' or file.type == 'log': - lfns.append(file.lfn) - guids.append(file.GUID) - nTokens += len(file.destinationDBlockToken.split(',')) - # get files in LRC - _logger.debug("%s Cloud:%s DQ2URL:%s" % (job.PandaID,job.cloud,dq2URL)) - okFiles = brokerage.broker_util.getFilesFromLRC(lfns,dq2URL,guids,dq2SE,getPFN=True) - # count files - nOkTokens = 0 - for okLFN,okPFNs in okFiles.iteritems(): - nOkTokens += len(okPFNs) - # check all 
files are ready - _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID,nTokens,nOkTokens)) - if nTokens <= nOkTokens: - _logger.debug("%s Finisher : Finish" % job.PandaID) - for file in job.Files: - if file.type == 'output' or file.type == 'log': - file.status = 'ready' - # append to run Finisher - finJobs.append(job) - else: - endTime = job.endTime - if endTime == 'NULL': - endTime = job.startTime - # priority-dependent timeout - tmpCloudSpec = siteMapper.getCloud(job.cloud) - if job.currentPriority >= 800 and (not job.prodSourceLabel in ['user']): - if tmpCloudSpec.has_key('transtimehi'): - timeOutValue = tmpCloudSpec['transtimehi'] - else: - timeOutValue = 1 - else: - if tmpCloudSpec.has_key('transtimelo'): - timeOutValue = tmpCloudSpec['transtimelo'] - else: - timeOutValue = 2 - # protection - if timeOutValue < 1: - timeOutValue = 1 - timeOut = self.timeNow - datetime.timedelta(days=timeOutValue) - _logger.debug("%s Priority:%s Limit:%s End:%s" % (job.PandaID,job.currentPriority,str(timeOut),str(endTime))) - if endTime < timeOut: - # timeout - _logger.debug("%s Finisher : Kill" % job.PandaID) - strMiss = '' - for lfn in lfns: - if not lfn in okFiles: - strMiss += ' %s' % lfn - job.jobStatus = 'failed' - job.taskBufferErrorCode = taskbuffer.ErrorCode.EC_Transfer - job.taskBufferErrorDiag = 'transfer timeout for '+strMiss - guidMap = {} - for file in job.Files: - # set file status - if file.status == 'transferring': - file.status = 'failed' - # collect GUIDs to delete files from _tid datasets - if file.type == 'output' or file.type == 'log': - if not guidMap.has_key(file.destinationDBlock): - guidMap[file.destinationDBlock] = [] - guidMap[file.destinationDBlock].append(file.GUID) - else: - # wait - _logger.debug("%s Finisher : Wait" % job.PandaID) - for lfn in lfns: - if not lfn in okFiles: - _logger.debug("%s -> %s" % (job.PandaID,lfn)) - upJobs.append(job) - # update - _logger.debug("updating ...") - self.proxyLock.acquire() - 
taskBuffer.updateJobs(upJobs,False) - self.proxyLock.release() - # run Finisher - for job in finJobs: - fThr = Finisher(taskBuffer,None,job) - fThr.start() - fThr.join() - _logger.debug("done") - time.sleep(1) - except: - pass - self.pool.remove(self) - self.lock.release() - -# finish transferring jobs -_logger.debug("==== finish transferring jobs ====") -finisherLock = threading.Semaphore(3) -finisherProxyLock = threading.Lock() -finisherThreadPool = ThreadPool() -for loopIdx in ['low','high']: - timeNow = datetime.datetime.utcnow() - if loopIdx == 'high': - highPrioFlag = True - else: - highPrioFlag = False - # get jobs - for ii in range(1000): - # lock - finisherLock.acquire() - finisherProxyLock.acquire() - ret,res = taskBuffer.lockJobsForFinisher(timeNow,200,highPrioFlag) - finisherProxyLock.release() - finisherLock.release() - if res == None: - _logger.debug("# of jobs to be finished for %s : %s" % (loopIdx,res)) - else: - _logger.debug("# of jobs to be finished for %s : %s" % (loopIdx,len(res))) - if res == None or len(res) == 0: - break - # run thread - finThr = FinisherThr(finisherLock,finisherProxyLock,res,finisherThreadPool,timeNow) - finThr.start() - # wait - finisherThreadPool.join() - - -_memoryCheck("end") - -_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/deleteJobs.py b/current/pandaserver/test/deleteJobs.py deleted file mode 100755 index 18195c27c..000000000 --- a/current/pandaserver/test/deleteJobs.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import re -import sys -import time -import fcntl -import types -import shelve -import random -import datetime -import commands -import threading -import userinterface.Client as Client -from dataservice.DDM import ddm -from dataservice.DDM import dashBorad -from taskbuffer.OraDBProxy import DBProxy -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger -from jobdispatcher.Watcher import Watcher -from 
brokerage.SiteMapper import SiteMapper -from dataservice.Adder import Adder -from dataservice.Finisher import Finisher -from dataservice.MailUtils import MailUtils -from taskbuffer import ProcessGroups -import brokerage.broker_util -import brokerage.broker -import taskbuffer.ErrorCode -import dataservice.DDM - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# logger -_logger = PandaLogger().getLogger('deleteJobs') - -_logger.debug("===================== start =====================") - -# memory checker -def _memoryCheck(str): - try: - proc_status = '/proc/%d/status' % os.getpid() - procfile = open(proc_status) - name = "" - vmSize = "" - vmRSS = "" - # extract Name,VmSize,VmRSS - for line in procfile: - if line.startswith("Name:"): - name = line.split()[-1] - continue - if line.startswith("VmSize:"): - vmSize = "" - for item in line.split()[1:]: - vmSize += item - continue - if line.startswith("VmRSS:"): - vmRSS = "" - for item in line.split()[1:]: - vmRSS += item - continue - procfile.close() - _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str)) - except: - type, value, traceBack = sys.exc_info() - _logger.error("memoryCheck() : %s %s" % (type,value)) - _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str)) - return - -_memoryCheck("start") - -# kill old process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName) - for line in out.split('\n'): - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d 
%H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill process : %s %s" % (type,value)) - - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -# instantiate sitemapper -siteMapper = SiteMapper(taskBuffer) - - -# table names -jobATableName = "ATLAS_PANDAARCH.jobsArchived" -filesATableName = "ATLAS_PANDAARCH.filesTable_ARCH" -paramATableName = "ATLAS_PANDAARCH.jobParamsTable_ARCH" -metaATableName = "ATLAS_PANDAARCH.metaTable_ARCH" - -# time limit -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=3) - -# delete -_logger.debug("get PandaIDs for Delete") -sql = "SELECT COUNT(*) FROM ATLAS_PANDA.jobsArchived4 WHERE modificationTime<:modificationTime" -varMap = {} -varMap[':modificationTime'] = timeLimit -status,res = taskBuffer.querySQLS(sql,varMap) -if res != None: - tmpTotal = res[0][0] -else: - tmpTotal = None -maxBunch = 1000 -nBunch = 500 -tmpIndex = 0 -while True: - sql = "SELECT PandaID,modificationTime FROM ATLAS_PANDA.jobsArchived4 " - sql += "WHERE modificationTime<:modificationTime AND archivedFlag=:archivedFlag AND rownum<=:rowRange" - varMap = {} - varMap[':modificationTime'] = timeLimit - varMap[':archivedFlag'] = 1 - varMap[':rowRange'] = maxBunch - status,res = taskBuffer.querySQLS(sql,varMap) - if res == None: - _logger.error("failed to get PandaIDs to be deleted") - break - else: - _logger.debug("got %s for deletion" % len(res)) - if len(res) == 0: - _logger.debug("no jobs left for for deletion") - break - else: - maxBunch = len(res) - random.shuffle(res) - res = res[:nBunch] - # loop over all jobs - for (id,srcEndTime) in res: - tmpIndex += 1 - try: - # check - sql = "SELECT PandaID from %s WHERE PandaID=:PandaID" % jobATableName - varMap = {} - varMap[':PandaID'] = id - status,check = 
taskBuffer.querySQLS(sql,varMap) - if check == None or len(check) == 0: - # no record in ArchivedDB - _logger.error("No backup for %s" % id) - else: - # delete - _logger.debug("DEL %s : endTime %s" % (id,srcEndTime)) - proxyS = taskBuffer.proxyPool.getProxy() - proxyS.deleteJobSimple(id) - taskBuffer.proxyPool.putProxy(proxyS) - if tmpIndex % 1000 == 1: - _logger.debug(" deleted %s/%s" % (tmpIndex,tmpTotal)) - except: - pass - # terminate - if maxBunch < nBunch: - break -_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/directSubmit.py b/current/pandaserver/test/directSubmit.py deleted file mode 100755 index 81c96e953..000000000 --- a/current/pandaserver/test/directSubmit.py +++ /dev/null @@ -1,163 +0,0 @@ -import re -import sys -import time -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv) != 2: - print "task file is missing" - sys.exit(0) - -# open task file -taskFile = open(sys.argv[1]) - -# read common parameters -line = taskFile.readline() -items = line.split() - -# common parameters -taskID = items[0] -inTaskName = items[1] -taskName = items[2] -formats = items[3].split('.') -lparams = items[4].split(',') -vparams = items[5].split(',') -trf = items[7] -trfVer = items[8] -grid = items[10] -priority = items[11] -totalJob = items[14] -cpu = items[15] -memory = items[16] - - -# input dataset -iDataset = 'NULL' -m = re.search('(.+)\.([^\.]+)\.([^\.]+)$',inTaskName) -if m != None: - step = m.group(2) - if step == 'evgen': - format = 'EVENT' - elif step == 'digit': - format = 'RDO' - else: - format = 'AOO' - #### FIXME : _tidXXXX is missing - iDataset = '%s.%s.%s.%s' % (m.group(1),step,format,m.group(3)) - - -# output datasets -m = re.search('(.+)\.([^\.]+)\.([^\.]+)$',taskName) -oDatasets = [] -for format in formats: - step = m.group(2) - if format=='HITS': - step = 'simul' - # append - 
oDatasets.append('%s.%s.%s.%s_tid%06d' % (m.group(1),step,format,m.group(3),int(taskID))) - -# log dataset -lDataset = '%s.%s.%s.%s_tid%06d' % (m.group(1),m.group(2),'log',m.group(3),int(taskID)) - - -# instantiate JobSpecs -iJob = 0 -jobList = [] -for line in taskFile: - iJob += 1 - job = JobSpec() - # job ID ###### FIXME - job.jobDefinitionID = int(time.time()) % 10000 - # job name - job.jobName = "%s_%05d.job" % (taskName,iJob) - # AtlasRelease - if len(re.findall('\.',trfVer)) > 2: - match = re.search('^(\d+\.\d+\.\d+)',trfVer) - job.AtlasRelease = 'Atlas-%s' % match.group(1) - else: - job.AtlasRelease = 'Atlas-%s' % trfVer - # homepackage - vers = trfVer.split('.') - if int(vers[0]) <= 11: - job.homepackage = 'JobTransforms' - for ver in vers: - job.homepackage += "-%02d" % int(ver) - else: - job.homepackage = 'AtlasProduction/%s' % trfVer - # trf - job.transformation = trf - job.destinationDBlock = oDatasets[0] - # prod DBlock - job.prodDBlock = iDataset - # souce lavel - job.prodSeriesLabel = 'pandatest' - job.prodSourceLabel = 'managed' - # priority - job.assignedPriority = priority - job.currentPriority = priority - # CPU, memory,disk ### FIXME - - # attempt number ### FIXME - - # input files - if iDataset != 'NULL': - # remove _tidXXX - pat = re.sub('_tid\d+$','',iDataset) - # search - m = re.search('('+pat+'\S+)',line) - if m != None: - file = FileSpec() - file.lfn = m.group(1) - file.type = 'input' - file.dataset = iDataset - file.prodDBlock = iDataset - job.addFile(file) - # DB release - for i,lpar in enumerate(lparams): - if lpar == 'DBRelease': - file = FileSpec() - file.lfn = "%s-%s.tgz" % (lpar,vparams[i]) - file.type = 'input' - file.dataset = iDataset - file.prodDBlock = iDataset - job.addFile(file) - break - # output files - for oDataset in oDatasets: - # remove _tidXXX - pat = re.sub('_tid\d+$','',oDataset) - # search - m = re.search('('+pat+'\S+)',line) - if m != None: - file = FileSpec() - file.lfn = m.group(1) - file.type = 'output' - 
file.dataset = oDataset - file.destinationDBlock = oDataset - job.addFile(file) - # log - file = FileSpec() - file.lfn = "%s._%05d.log.tgz" % (lDataset,iJob) - file.type = 'log' - file.dataset = lDataset - file.destinationDBlock = lDataset - job.addFile(file) - - # job par - job.jobParameters = line[:-1] - - """ - print job.values() - for file in job.Files: - print file.values() - sys.exit(0) - """ - jobList.append(job) - - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/distributeDefJobs.py b/current/pandaserver/test/distributeDefJobs.py deleted file mode 100755 index c1cee20a2..000000000 --- a/current/pandaserver/test/distributeDefJobs.py +++ /dev/null @@ -1,53 +0,0 @@ -import datetime -from taskbuffer.DBProxy import DBProxy -import userinterface.Client as Client -import jobscheduler.Site -import random -import time - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# time limit -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) - -# instantiate DB proxies -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -# get PandaIDs from jobsDefined -res = proxyS.querySQL("SELECT PandaID,modificationTime from jobsDefined4 ORDER BY modificationTime") - -# list of known sites -tmpSites = jobscheduler.Site.KnownSite.getAllSitesID() -allSites = [] -for site in tmpSites: - # _allSites may conain NULL after sort() - if site == 'NULL': - continue - # ignore test sites - if site.endswith('test') or site.endswith('Test'): - continue - # append - allSites.append(site) - -# reassign jobs -jobs=[] -for (id,modTime) in res: - if modTime < timeLimit: - jobs.append(id) - -# reassign -if len(jobs): - nJob = 20 - iJob = 0 - while iJob < len(jobs): - print 'reassignJobs(%s)' % jobs[iJob:iJob+nJob] - index = random.randint(1,len(allSites)) - site = allSites[int(index)-1] 
- print 'site=%s' % site - Client.reassignJobs(jobs[iJob:iJob+nJob],site) - iJob += nJob - time.sleep(10) - diff --git a/current/pandaserver/test/dq2cr.py b/current/pandaserver/test/dq2cr.py deleted file mode 100755 index b28b0ccea..000000000 --- a/current/pandaserver/test/dq2cr.py +++ /dev/null @@ -1,45 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = 'BNL_SE' - -jobList = [] - -for i in range(1): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) - job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/run_dq2_cr' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.currentPriority = 100000 - #job.prodSourceLabel = 'test' - job.prodSourceLabel = 'user' - job.computingSite = site - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="8072 0 5000 1 DC3.008072.JimmyPhotonJet1.py NONE NONE NONE" - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/emailfix.py b/current/pandaserver/test/emailfix.py deleted file mode 100755 index a39bd3bc4..000000000 --- a/current/pandaserver/test/emailfix.py +++ /dev/null @@ -1,16 +0,0 @@ -''' -notifier - -''' - -import shelve - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger - -# open DB -pDB = shelve.open(panda_config.emailDB) - - - - diff --git a/current/pandaserver/test/evpPD2P.py 
b/current/pandaserver/test/evpPD2P.py deleted file mode 100644 index 27cb721f8..000000000 --- a/current/pandaserver/test/evpPD2P.py +++ /dev/null @@ -1,156 +0,0 @@ -import re -import sys -import glob -import time -import os.path -import commands -import datetime -import threading -from config import panda_config -from taskbuffer.TaskBuffer import taskBuffer -from brokerage import SiteMapper -from dataservice.EventPicker import EventPicker -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('evpPD2P') - -_logger.debug("===================== start =====================") - -# overall timeout value -overallTimeout = 60 -# prefix of evp files -prefixEVP = 'evp.' -# file pattern of evp files -evpFilePatt = panda_config.cache_dir + '/' + prefixEVP + '*' - -# kill old process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName) - for line in out.split('\n'): - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill process : %s %s" % (type,value)) - -# instantiate PD2P -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) -siteMapper = SiteMapper.SiteMapper(taskBuffer) - - -# thread pool -class ThreadPool: - def __init__(self): - self.lock = 
threading.Lock() - self.list = [] - - def add(self,obj): - self.lock.acquire() - self.list.append(obj) - self.lock.release() - - def remove(self,obj): - self.lock.acquire() - self.list.remove(obj) - self.lock.release() - - def join(self): - self.lock.acquire() - thrlist = tuple(self.list) - self.lock.release() - for thr in thrlist: - thr.join() - - -# thread to ev-pd2p -class EvpThr (threading.Thread): - def __init__(self,lock,pool,aTaskBuffer,aSiteMapper,fileName,ignoreError): - threading.Thread.__init__(self) - self.lock = lock - self.pool = pool - self.fileName = fileName - self.evp = EventPicker(aTaskBuffer,aSiteMapper,fileName,ignoreError) - self.pool.add(self) - - def run(self): - self.lock.acquire() - retRun = self.evp.run() - _logger.debug("%s : %s" % (retRun,self.fileName)) - self.pool.remove(self) - self.lock.release() - - -# get files -_logger.debug("EVP session") -timeNow = datetime.datetime.utcnow() -timeInt = datetime.datetime.utcnow() -fileList = glob.glob(evpFilePatt) -fileList.sort() - -# create thread pool and semaphore -adderLock = threading.Semaphore(3) -adderThreadPool = ThreadPool() - -# add -while len(fileList) != 0: - # time limit to aviod too many copyArchve running at the sametime - if (datetime.datetime.utcnow() - timeNow) > datetime.timedelta(minutes=overallTimeout): - _logger.debug("time over in EVP session") - break - # try to get Semaphore - adderLock.acquire() - # get fileList - if (datetime.datetime.utcnow() - timeInt) > datetime.timedelta(minutes=15): - timeInt = datetime.datetime.utcnow() - # get file - fileList = glob.glob(evpFilePatt) - fileList.sort() - # choose a file - fileName = fileList.pop(0) - # release lock - adderLock.release() - if not os.path.exists(fileName): - continue - try: - modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(fileName))[:7])) - if (timeNow - modTime) > datetime.timedelta(hours=24): - # last chance - _logger.debug("Last event picking : %s" % fileName) - thr = 
EvpThr(adderLock,adderThreadPool,taskBuffer,siteMapper,fileName,False) - thr.start() - elif (timeInt - modTime) > datetime.timedelta(minutes=1): - # try - _logger.debug("event picking : %s" % fileName) - thr = EvpThr(adderLock,adderThreadPool,taskBuffer,siteMapper,fileName,True) - thr.start() - else: - _logger.debug("%s : %s" % ((timeInt - modTime),fileName)) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("%s %s" % (errType,errValue)) - -# join all threads -adderThreadPool.join() - -_logger.debug("===================== end =====================") - diff --git a/current/pandaserver/test/execute.py b/current/pandaserver/test/execute.py deleted file mode 100755 index 8cc2f2429..000000000 --- a/current/pandaserver/test/execute.py +++ /dev/null @@ -1,66 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = 'BNL_ATLAS_2' - -jobList = [] -for i in range(20): - - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = commands.getoutput('uuidgen') - job.AtlasRelease = 'Atlas-11.0.41' - #job.AtlasRelease = 'Atlas-11.0.3' - job.homepackage = 'AnalysisTransforms' - job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/runAthena' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.currentPriority = 100 - job.prodSourceLabel = 'user' - job.computingSite = site - #job.prodDBlock = "pandatest.b1599dfa-cd36-4fc5-92f6-495781a94c66" - job.prodDBlock = "pandatest.f228b051-077b-4f81-90bf-496340644379" - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = "lib.f228b051-077b-4f81-90bf-496340644379.tgz" - fileI.type = 'input' - job.addFile(fileI) - - fileOL = FileSpec() - fileOL.lfn = 
"%s.job.log.tgz" % commands.getoutput('uuidgen') - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - job.addFile(fileOL) - - fileOZ = FileSpec() - fileOZ.lfn = "%s.pool.root" % commands.getoutput('uuidgen') - fileOZ.destinationDBlock = job.destinationDBlock - fileOZ.destinationSE = job.destinationSE - fileOZ.dataset = job.destinationDBlock - fileOZ.type = 'output' - job.addFile(fileOZ) - - job.jobParameters="""-l %s -r PhysicsAnalysis/AnalysisCommon/UserAnalysis/UserAnalysis-00-05-11/run -j " jobOptions.pythia.py" -i "[]" -o "{'Stream1': '%s'}" """ % (fileI.lfn,fileOZ.lfn) - - jobList.append(job) - - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/fileCallbackListener.py b/current/pandaserver/test/fileCallbackListener.py deleted file mode 100644 index bad0c76cd..000000000 --- a/current/pandaserver/test/fileCallbackListener.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -import re -import sys -import time -import signal -import socket -import commands -import optparse -import datetime -import cPickle as pickle - -from dq2.common import log as logging -from dq2.common import stomp -from config import panda_config -from brokerage.SiteMapper import SiteMapper -from dataservice.Finisher import Finisher - -# logger -from pandalogger.PandaLogger import PandaLogger -_logger = PandaLogger().getLogger('fileCallbackListener') - -# keep PID -pidFile = '%s/file_callback_listener.pid' % panda_config.logdir - -# overall timeout value -overallTimeout = 60 * 59 - -# expiration time -expirationTime = datetime.datetime.utcnow() + datetime.timedelta(minutes=overallTimeout) - - -# kill whole process -def catch_sig(sig, frame): - try: - os.remove(pidFile) - except: - pass - # kill - _logger.debug('terminating ...') - commands.getoutput('kill -9 -- -%s' % os.getpgrp()) - # exit - 
sys.exit(0) - - -# callback listener -class FileCallbackListener(stomp.ConnectionListener): - - def __init__(self,conn,tb,sm): - # connection - self.conn = conn - # task buffer - self.taskBuffer = tb - # site mapper - self.siteMapper = sm - - - def on_error(self,headers,body): - _logger.error("on_error : %s" % headers['message']) - - - def on_disconnected(self,headers,body): - _logger.error("on_disconnected : %s" % headers['message']) - - - def on_message(self, headers, message): - try: - lfn = 'UNKNOWN' - # send ack - id = headers['message-id'] - self.conn.ack({'message-id':id}) - # check message type - messageType = headers['cbtype'] - if not messageType in ['FileDoneMessage']: - _logger.debug('%s skip' % messageType) - return - _logger.debug('%s start' % messageType) - # re-construct message - messageObj = pickle.loads(message) - evtTime = datetime.datetime.utcfromtimestamp(messageObj.getItem('eventTime')) - lfn = messageObj.getItem('lfn') - guid = messageObj.getItem('guid') - ddmSite = messageObj.getItem('site') - _logger.debug('%s site=%s type=%s time=%s' % \ - (lfn,ddmSite,messageType,evtTime.strftime('%Y-%m-%d %H:%M:%S'))) - # ignore non production files - flagNgPrefix = False - for ngPrefix in ['user','step']: - if lfn.startswith(ngPrefix): - flagNgPrefix = True - break - if flagNgPrefix: - _logger.debug('%s skip' % lfn) - return - # get datasets associated with the file only for high priority jobs - dsNameMap = self.taskBuffer.getDatasetWithFile(lfn,800) - _logger.debug('%s ds=%s' % (lfn,str(dsNameMap))) - # loop over all datasets - for dsName,dsData in dsNameMap.iteritems(): - pandaSite,dsToken = dsData - # skip multiple destination since each file doesn't have - # transferStatus - if not dsToken in ['',None] and ',' in dsToken: - _logger.debug('%s ignore ds=%s token=%s' % (lfn,dsName,dsToken)) - continue - # check site - tmpSiteSpec = self.siteMapper.getSite(pandaSite) - if tmpSiteSpec.setokens.has_key(dsToken): - pandaSiteDdmID = 
tmpSiteSpec.setokens[dsToken] - else: - pandaSiteDdmID = tmpSiteSpec.ddm - if pandaSiteDdmID != ddmSite: - _logger.debug('%s ignore ds=%s site=%s:%s <> %s' % \ - (lfn,dsName,pandaSite,pandaSiteDdmID,ddmSite)) - continue - # update file - forInput = None - if re.search('_dis\d+$',dsName) != None: - # dispatch datasets - forInput = True - ids = self.taskBuffer.updateInFilesReturnPandaIDs(dsName,'ready',lfn) - elif re.search('_sub\d+$',dsName) != None: - # sub datasets - forInput = False - ids = self.taskBuffer.updateOutFilesReturnPandaIDs(dsName,lfn) - _logger.debug('%s ds=%s ids=%s' % (lfn,dsName,str(ids))) - # loop over all PandaIDs - if forInput != None and len(ids) != 0: - # remove None and unknown - targetIDs = [] - for tmpID in ids: - # count the number of pending files - nPending = self.taskBuffer.countPendingFiles(tmpID,forInput) - _logger.debug('%s PandaID=%s nPen=%s' % (lfn,tmpID,nPending)) - if nPending != 0: - continue - targetIDs.append(tmpID) - # get jobs - targetJobs = [] - if targetIDs != []: - if forInput: - jobs = self.taskBuffer.peekJobs(targetIDs,fromActive=False,fromArchived=False, - fromWaiting=False) - else: - jobs = self.taskBuffer.peekJobs(targetIDs,fromDefined=False,fromArchived=False, - fromWaiting=False) - for tmpJob in jobs: - if tmpJob == None or tmpJob.jobStatus == 'unknown': - continue - targetJobs.append(tmpJob) - # trigger subsequent processe - if targetJobs == []: - _logger.debug('%s no jobs to be triggerd for subsequent processe' % lfn) - else: - if forInput: - # activate - _logger.debug('%s activate %s' % (lfn,str(targetIDs))) - self.taskBuffer.activateJobs(targetJobs) - else: - # finish - _logger.debug('%s finish %s' % (lfn,str(targetIDs))) - for tmpJob in targetJobs: - fThr = Finisher(self.taskBuffer,None,tmpJob) - fThr.start() - fThr.join() - _logger.debug('%s done' % lfn) - except: - errtype,errvalue = sys.exc_info()[:2] - _logger.error("on_message : %s %s %s" % (lfn,errtype,errvalue)) - - -# main -def main(backGround=False): 
- _logger.debug('starting ...') - # register signal handler - signal.signal(signal.SIGINT, catch_sig) - signal.signal(signal.SIGHUP, catch_sig) - signal.signal(signal.SIGTERM,catch_sig) - signal.signal(signal.SIGALRM,catch_sig) - signal.alarm(overallTimeout) - # forking - pid = os.fork() - if pid != 0: - # watch child process - os.wait() - time.sleep(1) - else: - # main loop - from taskbuffer.TaskBuffer import taskBuffer - # initialize cx_Oracle using dummy connection - from taskbuffer.Initializer import initializer - initializer.init() - # instantiate TB - taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - # instantiate sitemapper - siteMapper = SiteMapper(taskBuffer) - # ActiveMQ params - clientid = 'PANDA-' + socket.getfqdn() - queue = '/queue/Consumer.PANDA.atlas.ddm.siteservices' - ssl_opts = {'use_ssl' : True, - 'ssl_cert_file' : '/data/atlpan/pandasv1_usercert.pem', - 'ssl_key_file' : '/data/atlpan/pandasv1_userkey.pem'} - # resolve multiple brokers - brokerList = socket.gethostbyname_ex('atlasddm-mb.cern.ch')[-1] - # set listener - for tmpBroker in brokerList: - try: - _logger.debug('setting listener on %s' % tmpBroker) - conn = stomp.Connection(host_and_ports = [(tmpBroker, 6162)], **ssl_opts) - conn.set_listener('FileCallbackListener', FileCallbackListener(conn,taskBuffer,siteMapper)) - conn.start() - conn.connect(headers = {'client-id': clientid}) - conn.subscribe(destination=queue, ack='client-individual') - #,headers = {'selector':"cbtype='FileDoneMessage'"}) - if not conn.is_connected(): - _logger.error("connection failure to %s" % tmpBroker) - except: - errtype,errvalue = sys.exc_info()[:2] - _logger.error("failed to set listener on %s : %s %s" % (tmpBroker,errtype,errvalue)) - catch_sig(None,None) - -# entry -if __name__ == "__main__": - optP = optparse.OptionParser(conflict_handler="resolve") - options,args = optP.parse_args() - try: - # time limit - timeLimit = datetime.datetime.utcnow() - 
datetime.timedelta(seconds=overallTimeout-180) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName) - for line in out.split('\n'): - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) - except: - errtype,errvalue = sys.exc_info()[:2] - _logger.error("kill process : %s %s" % (errtype,errvalue)) - # main loop - main() diff --git a/current/pandaserver/test/fileClean.py b/current/pandaserver/test/fileClean.py deleted file mode 100755 index edef84ea5..000000000 --- a/current/pandaserver/test/fileClean.py +++ /dev/null @@ -1,145 +0,0 @@ -import re -import sys -import datetime -from taskbuffer.DBProxy import DBProxy -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# table names -cdate = datetime.datetime.utcnow() -if cdate.month==1: - cdate = cdate.replace(year = (cdate.year-1)) - cdate = cdate.replace(month = 12, day = 1) -else: - cdate = cdate.replace(month = (cdate.month/2)*2, day = 1) -currentSuffix = "_%s%s" % (cdate.strftime('%b'),cdate.year) -if cdate.month > 2: - odate = cdate.replace(month = (cdate.month-2)) -else: - odate = cdate.replace(year = (cdate.year-1), month = 12) -previousSuffix = "_%s%s" % (odate.strftime('%b'),odate.year) - -# instantiate DB proxies -proxyS = DBProxy() -proxyN = DBProxy() 
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) -proxyN.connect(panda_config.logdbhost,panda_config.logdbpasswd,panda_config.logdbuser,'PandaArchiveDB') - -# get tables -fileTables = [] -jobsTables = {} -status,res = proxyN.querySQLS("show tables") -if res != None: - for table, in res: - if table.startswith('filesTable'): - fileTables.append(table) - if table.startswith('jobsArchived'): - # get MAX PandaID - statusJ,resJ = proxyN.querySQLS("SELECT MAX(PandaID) FROM %s" % table) - jobsTables[table] = resJ[0][0] - -# for the cumulative tables -cumulativeSuffix = '4_current' -cumulativePandaID = jobsTables['jobsArchived%s' % cumulativeSuffix] - -# create a map between MAX PandaID and suffix -suffixMap = {} -for table,maxPandaID in jobsTables.iteritems(): - # get suffix - match = re.search('(\d??_.+)$',table) - suffix = match.group(1) - # special treatment is required for the cumulative tables - if suffix == cumulativeSuffix: - continue - # name of corresponding file table - name = "filesTable%s" % suffix - if not name in fileTables: - print "%s is not found" % name - sys.exit(0) - # check duplication - if suffixMap.has_key(maxPandaID): - print "%s is already used by %s" % (maxPandaID,suffixMap[maxPandaID]) - sys.exit(0) - # append - suffixMap[maxPandaID] = suffix - -# print the cumulative -print "%8d %s" % (cumulativePandaID,cumulativeSuffix) -# sort by max PandaID -suffixKeys = suffixMap.keys() -suffixKeys.sort() -for key in suffixKeys: - print "%8d %s" % (key,suffixMap[key]) - -# get files -minPandaID = -1 -sql = "SELECT PandaID FROM filesTable4 WHERE PandaID > %s GROUP BY PandaID ORDER BY PandaID LIMIT 100" -#while True: -for i in range(5): - status,res = proxyS.querySQLS(sql % minPandaID) - # no more job - if len(res) == 0: - break - # set min - minPandaID = res[-1][0] - # loop over all PandaIDs - for id, in res: - # look for corresponding table - tableSuffix = '' - if id < cumulativePandaID: - # use the 
cumulative - tableSuffix = cumulativeSuffix - else: - for key in suffixKeys: - if id < key: - tableSuffix = suffixMap[key] - break - # check suffix - if tableSuffix in ['',currentSuffix,previousSuffix]: - print "Terminated since fresh PandID=%s found for '%s'" % (id,tableSuffix) - sys.exit(0) - print "PandaID:%s Suffix:%s" % (id,tableSuffix) - # get FileSpec - sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() - sqlFile+= "WHERE PandaID=%s" % id - statusF,resFs = proxyS.querySQLS(sqlFile) - for resF in resFs: - file = FileSpec() - file.pack(resF) - # create a dummy Job to set PandaID - job = JobSpec() - job.PandaID = id - job.addFile(file) - # file table - fileTable = 'filesTable%s' % tableSuffix - # check - sqlFileCheck = "SELECT PandaID FROM %s WHERE rowID=%s" % (fileTable,file.rowID) - statusC,resC = proxyN.querySQLS(sqlFileCheck) - if len(resC) != 0: - if resC[0][0] != id: - print "PandaID mismatch PandaArchive:%s PandaDB:%s for rowID=%s" % \ - (resC[0][0],id,file.rowID) - else: - print "rowID=%s not found" % file.rowID - """ - # construct SQL - sqlFileIn = "INSERT INTO %s " % fileTable - sqlFileIn+= "(%s) " % FileSpec.columnNames() - sqlFileIn+= FileSpec.valuesExpression() - try: - proxyN.cur.execute("SET AUTOCOMMIT=1") - ret = proxyN.cur.execute(sqlFileIn,file.values()) - res = proxyN.cur.fetchall() - # commit - if not proxyN._commit(): - raise RuntimeError, 'Commit error' - except: - type, value, traceBack = sys.exc_info() - print "insert error : %s %s" % (type,value) - # roll back - proxyN._rollback() - """ diff --git a/current/pandaserver/test/finishJob.py b/current/pandaserver/test/finishJob.py deleted file mode 100755 index 559bd61c3..000000000 --- a/current/pandaserver/test/finishJob.py +++ /dev/null @@ -1,74 +0,0 @@ -import os -import re -import sys -import urllib2,urllib - -import userinterface.Client as Client -from userinterface.Client import baseURLSSL - -import httplib -import commands - -id = sys.argv[1] -s,o = 
Client.getJobStatus([id]) - -if s != 0: - print "failed to get job with:%s" % s - sys.exit(0) - -job = o[0] - -if job == None: - print "got None" - sys.exit(0) - -xml = """ - - - -""" - -for file in job.Files: - if file.type in ['output','log']: - xml += """ - - - - - - - - """ % (commands.getoutput('uuidgen'),file.lfn,file.lfn) - -xml += """ - -""" - -node={} -node['jobId']=id -node['state']='finished' -node['metaData']='finished' -#node['state']='failed' -#node['pilotErrorCode']=1200 -node['siteName']='BNL_ATLAS_test' - -node['xml']=xml -url='%s/updateJob' % baseURLSSL - -match = re.search('[^:/]+://([^/]+)(/.+)',url) -host = match.group(1) -path = match.group(2) - -if os.environ.has_key('X509_USER_PROXY'): - certKey = os.environ['X509_USER_PROXY'] -else: - certKey = '/tmp/x509up_u%s' % os.getuid() - -rdata=urllib.urlencode(node) - -conn = httplib.HTTPSConnection(host,key_file=certKey,cert_file=certKey) -conn.request('POST',path,rdata) -resp = conn.getresponse() -data = resp.read() - -print data diff --git a/current/pandaserver/test/getJobs.py b/current/pandaserver/test/getJobs.py deleted file mode 100755 index 10fd553eb..000000000 --- a/current/pandaserver/test/getJobs.py +++ /dev/null @@ -1,54 +0,0 @@ -import sys -import time -import datetime -import commands -import threading -import urllib2,urllib - -import httplib - -import re -import os - -from userinterface.Client import baseURLSSL - -node={} -node['siteName']=sys.argv[1] -node['mem']=1000 -node['node']=commands.getoutput('hostname -f') -#node['prodSourceLabel']='user' -url='%s/getJob' % baseURLSSL - -match = re.search('[^:/]+://([^/]+)(/.+)',url) -host = match.group(1) -path = match.group(2) - -if os.environ.has_key('X509_USER_PROXY'): - certKey = os.environ['X509_USER_PROXY'] -else: - certKey = '/tmp/x509up_u%s' % os.getuid() - -rdata=urllib.urlencode(node) - -class Thr(threading.Thread): - def __init__(self): - threading.Thread.__init__(self) - - def run(self): - print 
datetime.datetime.utcnow().isoformat(' ') - conn = httplib.HTTPSConnection(host,key_file=certKey,cert_file=certKey) - conn.request('POST',path,rdata) - resp = conn.getresponse() - data = resp.read() - conn.close() - print datetime.datetime.utcnow().isoformat(' ') - import cgi - print cgi.parse_qs(data) - -nThr = 1 -thrs = [] -for i in range(nThr): - thrs.append(Thr()) - -for thr in thrs: - thr.start() diff --git a/current/pandaserver/test/input.data b/current/pandaserver/test/input.data deleted file mode 100755 index 08272e947..000000000 --- a/current/pandaserver/test/input.data +++ /dev/null @@ -1,2 +0,0 @@ -pandatest.000003.dd.input:pandatest.000003.dd.input._00047.junk -pandatest.000003.dd.input:pandatest.000003.dd.input._00001.junk diff --git a/current/pandaserver/test/installSW.py b/current/pandaserver/test/installSW.py deleted file mode 100755 index 1dbb349bf..000000000 --- a/current/pandaserver/test/installSW.py +++ /dev/null @@ -1,83 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -# extract pacball and site -argStr = "" -pacball = None -pacFlag = False -siteName = None -siteFlag = False -for arg in sys.argv[1:]: - if arg == '--pacball': - pacFlag = True - continue - if pacFlag: - pacball = arg - pacFlag = False - continue - if arg == '--sitename': - siteFlag = True - continue - if siteFlag: - siteName = arg - siteFlag = False - continue - argStr += "%s " % arg - -# check site -if siteName == None: - print "ERROR : --sitename needs to be specified" - sys.exit(1) -# append sitename -argStr += "--sitename %s " % siteName - -# check pacball format -if pacball != None and pacball.find(':') != -1: - pacDS = pacball.split(':')[0] - pacFile = pacball.split(':')[-1] -else: - pacDS = None - pacFile = pacball - -# append pacball to arg -if pacFile != None: - argStr += "--pacball %s " % pacFile - -job = JobSpec() -job.jobDefinitionID = 
int(time.time()) % 10000 -job.jobName = "%s_%s" % (siteName,commands.getoutput('uuidgen')) -job.transformation = 'http://www.usatlas.bnl.gov/svn/panda/apps/sw/installAtlasSW' -job.destinationDBlock = 'panda.%s' % job.jobName -job.currentPriority = 10000 -job.prodSourceLabel = 'software' -job.computingSite = siteName -job.cloud = 'US' - -fileOL = FileSpec() -fileOL.lfn = "%s.job.log.tgz" % job.jobName -fileOL.destinationDBlock = job.destinationDBlock -fileOL.dataset = job.destinationDBlock -fileOL.type = 'log' -job.addFile(fileOL) - -# pacball -if pacDS != None: - job.prodDBlock = pacDS - fileP = FileSpec() - fileP.dataset = pacDS - fileP.prodDBlock = pacDS - fileP.lfn = pacFile - fileP.type = 'input' - job.addFile(fileP) - -job.jobParameters = argStr - -s,o = Client.submitJobs([job]) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/killDefJobs.py b/current/pandaserver/test/killDefJobs.py deleted file mode 100755 index a646ea202..000000000 --- a/current/pandaserver/test/killDefJobs.py +++ /dev/null @@ -1,26 +0,0 @@ -import datetime -from taskbuffer.DBProxy import DBProxy -import userinterface.Client as Client - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# time limit -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=1) - -# instantiate DB proxies -proxyS = DBProxy() -proxyS.connect('adbpro.usatlas.bnl.gov',passwd,'panda-developer','PandaDevDB') - -# get PandaIDs from jobsDefined -res = proxyS.querySQL("SELECT PandaID,modificationTime from jobsDefined4 ORDER BY modificationTime") - -# kill f old -jobs=[] -for (id,modTime) in res: - if modTime < timeLimit: - jobs.append(id) - -Client.killJobs(jobs) - diff --git a/current/pandaserver/test/killJob.py b/current/pandaserver/test/killJob.py deleted file mode 100755 index 0238f2e79..000000000 --- a/current/pandaserver/test/killJob.py +++ /dev/null @@ -1,36 +0,0 @@ -import sys -import optparse -import 
userinterface.Client as Client - -optP = optparse.OptionParser(conflict_handler="resolve") -optP.add_option('-9',action='store_const',const=True,dest='forceKill', - default=False,help='kill jobs before next heartbeat is coming') -optP.add_option('--killOwnProdJobs',action='store_const',const=True,dest='killOwnProdJobs', - default=False,help='kill own production jobs without a production role') -optP.add_option('--killUserJobs',action='store_const',const=True,dest='killUserJobs', - default=False,help='kill user jobs using a production role') -options,args = optP.parse_args() - - -aSrvID = None - -codeV = None -useMailAsIDV = False - -if options.forceKill: - codeV = 9 -elif options.killUserJobs: - codeV = 91 -if options.killOwnProdJobs: - useMailAsIDV = True - -if len(args) == 1: - Client.killJobs([args[0]],code=codeV,useMailAsID=useMailAsIDV) -else: - startID = int(args[0]) - endID = int(args[1]) - if startID > endID: - print '%d is less than %d' % (endID,startID) - sys.exit(1) - Client.killJobs(range(startID,endID+1),code=codeV,useMailAsID=useMailAsIDV) - diff --git a/current/pandaserver/test/killJobLowPrio.py b/current/pandaserver/test/killJobLowPrio.py deleted file mode 100755 index 347da336a..000000000 --- a/current/pandaserver/test/killJobLowPrio.py +++ /dev/null @@ -1,86 +0,0 @@ -import time -import sys -import optparse - -import userinterface.Client as Client - -aSrvID = None - -from taskbuffer.OraDBProxy import DBProxy -# password -from config import panda_config - -usageStr = """%prog [options] - -Description: kill jobs with low priorities below a given value""" -optP = optparse.OptionParser(conflict_handler="resolve",usage=usageStr) -optP.add_option('-9',action='store_const',const=True,dest='forceKill', - default=False,help='kill jobs before next heartbeat is coming') -optP.add_option('--running',action='store_const',const=True,dest='killRunning', - default=False,help='kill running jobs to free up CPU slots. 
jobs will be killed regardless of job status if omitted') -optP.add_option('--site',action='store',dest='site',default=None,help='computingSite') -optP.add_option('--cloud',action='store',dest='cloud',default=None,help='cloud') -optP.add_option('--maxJobs',action='store',dest='maxJobs',default=None,help='max number of jobs to be killed') -options,args = optP.parse_args() - -if options.cloud == None and options.site == None: - optP.error("--site= and/or --cloud= is required") - -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -jobsMap = {} - -if len(args) == 0: - optP.error('priority is required') - -varMap = {} -varMap[':prodSourceLabel'] = 'managed' -varMap[':currentPriority'] = args[0] -sql = "SELECT PandaID,currentPriority FROM %s WHERE prodSourceLabel=:prodSourceLabel AND currentPriority<:currentPriority " -if options.killRunning: - sql += "AND jobStatus=:jobStatus " - varMap[':jobStatus'] = 'running' -if options.cloud != None: - sql += "AND cloud=:cloud " - varMap[':cloud'] = options.cloud -if options.site != None: - sql += "AND computingSite=:site " - varMap[':site'] = options.site -for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: - status,res = proxyS.querySQLS(sql % table,varMap) - if res != None: - for id,prio in res: - if not jobsMap.has_key(prio): - jobsMap[prio] = [] - if not id in jobsMap[prio]: - jobsMap[prio].append(id) - -# order by PandaID and currentPriority -jobs = [] -prioList = jobsMap.keys() -prioList.sort() -for prio in prioList: - # reverse order by PandaID to kill newer jobs - ids = jobsMap[prio] - ids.sort() - ids.reverse() - jobs += ids - -if options.maxJobs != None: - jobs = jobs[:int(options.maxJobs)] - -print 'The number of jobs with priorities below %s : %s' % (args[0],len(jobs)) -if len(jobs): - nJob = 100 - iJob = 0 - while iJob < len(jobs): - print 'kill %s' % str(jobs[iJob:iJob+nJob]) - if options.forceKill: - 
Client.killJobs(jobs[iJob:iJob+nJob],9) - else: - Client.killJobs(jobs[iJob:iJob+nJob]) - iJob += nJob - time.sleep(1) - - diff --git a/current/pandaserver/test/killJobsInTask.py b/current/pandaserver/test/killJobsInTask.py deleted file mode 100755 index 26c9ddb16..000000000 --- a/current/pandaserver/test/killJobsInTask.py +++ /dev/null @@ -1,53 +0,0 @@ -import time -import sys -import optparse - -import userinterface.Client as Client - -aSrvID = None - -from taskbuffer.OraDBProxy import DBProxy -# password -from config import panda_config - -optP = optparse.OptionParser(conflict_handler="resolve") -optP.add_option('-9',action='store_const',const=True,dest='forceKill', - default=False,help='kill jobs before next heartbeat is coming') -options,args = optP.parse_args() - -useMailAsIDV = False -if options.killOwnProdJobs: - useMailAsIDV = True - -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -jobs = [] - -varMap = {} -varMap[':prodSourceLabel'] = 'managed' -varMap[':taskID'] = args[0] -varMap[':pandaIDl'] = args[1] -varMap[':pandaIDu'] = args[2] -sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND PandaID BETWEEN :pandaIDl AND :pandaIDu ORDER BY PandaID" -for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: - status,res = proxyS.querySQLS(sql % table,varMap) - if res != None: - for id, in res: - if not id in jobs: - jobs.append(id) - -print 'The number of jobs to be killed : %s' % len(jobs) -if len(jobs): - nJob = 100 - iJob = 0 - while iJob < len(jobs): - print 'kill %s' % str(jobs[iJob:iJob+nJob]) - if options.forceKill: - Client.killJobs(jobs[iJob:iJob+nJob],9,useMailAsID=useMailAsIDV) - else: - Client.killJobs(jobs[iJob:iJob+nJob],useMailAsID=useMailAsIDV) - iJob += nJob - time.sleep(1) - - diff --git a/current/pandaserver/test/killProdJobs.py b/current/pandaserver/test/killProdJobs.py deleted file mode 100755 
index 85e8113ea..000000000 --- a/current/pandaserver/test/killProdJobs.py +++ /dev/null @@ -1,30 +0,0 @@ -import sys - -import userinterface.Client as Client - -if len(sys.argv) == 2: - jobDefIDs = [sys.argv[1]] -else: - startID = int(sys.argv[1]) - endID = int(sys.argv[2]) - if startID > endID: - print '%d is less than %d' % (endID,startID) - sys.exit(1) - jobDefIDs = range(startID,endID+1) - -# quesry PandaID -status, ids = Client.queryPandaIDs(jobDefIDs) - -if status != 0: - sys.exit(0) - -# remove None -while True: - if not None in ids: - break - ids.remove(None) - -# kill -if len(ids) != 0: - Client.killJobs(ids) - diff --git a/current/pandaserver/test/killTask.py b/current/pandaserver/test/killTask.py deleted file mode 100755 index 0784a18b9..000000000 --- a/current/pandaserver/test/killTask.py +++ /dev/null @@ -1,53 +0,0 @@ -import time -import sys -import optparse - -import userinterface.Client as Client - -aSrvID = None - -from taskbuffer.OraDBProxy import DBProxy -# password -from config import panda_config - -optP = optparse.OptionParser(conflict_handler="resolve") -optP.add_option('-9',action='store_const',const=True,dest='forceKill', - default=False,help='kill jobs even if they are still running') -optP.add_option('--noRunning',action='store_const',const=True,dest='noRunning', - default=False,help='kill only activated/assigned/waiting jobs') -options,args = optP.parse_args() - -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -jobs = [] - -varMap = {} -varMap[':prodSourceLabel'] = 'managed' -varMap[':taskID'] = args[0] -if not options.noRunning: - sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID ORDER BY PandaID" -else: - sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND jobStatus<>:jobStatus ORDER BY PandaID" - varMap[':jobStatus'] = 'running' -for table in 
['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: - status,res = proxyS.querySQLS(sql % table,varMap) - if res != None: - for id, in res: - if not id in jobs: - jobs.append(id) - -print 'The number of jobs to be killed : %s' % len(jobs) -if len(jobs): - nJob = 100 - iJob = 0 - while iJob < len(jobs): - print 'kill %s' % str(jobs[iJob:iJob+nJob]) - if options.forceKill: - Client.killJobs(jobs[iJob:iJob+nJob],9) - else: - Client.killJobs(jobs[iJob:iJob+nJob]) - iJob += nJob - time.sleep(1) - - diff --git a/current/pandaserver/test/killUser.py b/current/pandaserver/test/killUser.py deleted file mode 100644 index 4e3bbaa19..000000000 --- a/current/pandaserver/test/killUser.py +++ /dev/null @@ -1,71 +0,0 @@ -import sys -import time -import datetime -import optparse - -from taskbuffer.OraDBProxy import DBProxy -# password -from config import panda_config - -optP = optparse.OptionParser(conflict_handler="resolve") -optP.add_option('--user', action='store',dest='user', default=None,help='prodUserName') -optP.add_option('--jobID',action='store',dest='jobID',default=None,help='jobDefinitionID') -optP.add_option('--jobsetID',action='store',dest='jobsetID',default=None,help="jobsetID, or 'all' to kill all jobs") -optP.add_option('--prodSourceLabel',action='store',dest='prodSourceLabel',default=None,help='additional prodSourceLabel') - - -options,args = optP.parse_args() - -if options.user == None: - print "--user= is required" - sys.exit(1) -if options.jobID == None and options.jobsetID == None: - print "--jobID= or --jobsetID= is required" - sys.exit(1) - - -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -prodUserName = sys.argv[1] -import userinterface.Client as Client - -varMap = {} -varMap[':src1'] = 'user' -varMap[':src2'] = 'panda' -varMap[':prodUserName'] = options.user -srcSQL = '(:src1,:src2' -if options.jobID != None: - varMap[':jobDefinitionID'] = options.jobID 
-if not options.jobsetID in (None,'all'): - varMap[':jobsetID'] = options.jobsetID -if options.prodSourceLabel != None: - varMap[':src3'] = options.prodSourceLabel - srcSQL += ',:src3' -srcSQL += ')' - -jobs = [] -tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4'] -for table in tables: - sql = "SELECT PandaID FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel IN %s " % (table,srcSQL) - if options.jobID != None: - sql += "AND jobDefinitionID=:jobDefinitionID " - if not options.jobsetID in (None,'all'): - sql += "AND jobsetID=:jobsetID " - sql += "ORDER BY PandaID " - status,res = proxyS.querySQLS(sql,varMap) - if res != None: - for id, in res: - if not id in jobs: - jobs.append(id) -if len(jobs): - iJob = 0 - nJob = 1000 - while iJob < len(jobs): - subJobs = jobs[iJob:iJob+nJob] - print "kill %s %s/%s" % (str(subJobs),iJob,len(jobs)) - Client.killJobs(subJobs,code=9) - iJob += nJob -else: - print "no job was killed" - diff --git a/current/pandaserver/test/killWaiting.py b/current/pandaserver/test/killWaiting.py deleted file mode 100755 index fe76014a8..000000000 --- a/current/pandaserver/test/killWaiting.py +++ /dev/null @@ -1,35 +0,0 @@ -import sys -import time -import datetime -from taskbuffer.DBProxy import DBProxy -import userinterface.Client as Client - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -cloud = sys.argv[1] - -# instantiate DB proxies -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -while True: - # get PandaIDs - res = proxyS.querySQL("SELECT PandaID FROM jobsWaiting4 WHERE cloud='%s' ORDER BY PandaID" % cloud) - # escape - if len(res) == 0: - break - # convert to list - jobs = [] - for id, in res: - jobs.append(id) - # reassign - nJob = 300 - iJob = 0 - while iJob < len(jobs): - print 'killJobs(%s)' % jobs[iJob:iJob+nJob] - Client.killJobs(jobs[iJob:iJob+nJob]) - iJob += nJob - time.sleep(60) - 
diff --git a/current/pandaserver/test/logrotate.sh b/current/pandaserver/test/logrotate.sh deleted file mode 100755 index 51db686c0..000000000 --- a/current/pandaserver/test/logrotate.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -/usr/sbin/logrotate /usatlas/u/sm/prod/panda/config/logrotate.conf -s /usatlas/u/sm/logrotate.status diff --git a/current/pandaserver/test/missing.py b/current/pandaserver/test/missing.py deleted file mode 100755 index b77eaeecf..000000000 --- a/current/pandaserver/test/missing.py +++ /dev/null @@ -1,43 +0,0 @@ -import re -import commands - -stMap = [] -tmpMap = {} -nLog = 30 -for i in range(0,nLog): - if i == 0: - out = commands.getoutput('cat /data/sm/prod/httpd/logs/panda-Adder.log') - else: - out = commands.getoutput('zcat /data/sm/prod/httpd/logs/panda-Adder.log.%s.gz' % (nLog-i)) - for line in out.split('\n'): - stStr = re.search('start: finished',line) - idsStr = re.search('ids = .*$',line) - mapStr = re.search('idMap = .*$',line) - if stStr == None and idsStr == None and mapStr == None: - continue - items = line.split() - try: - pandaID = int(items[4]) - except: - continue - if stStr != None: - stMap.append(pandaID) - if idsStr != None: - exec idsStr.group(0) - tmpMap[pandaID] = ids - if mapStr != None: - exec mapStr.group(0) - if (pandaID in stMap) and idMap == {} and tmpMap[pandaID] != ([], []): - print pandaID - print tmpMap[pandaID] - try: - del tmpMap[pandaID] - except: - pass - try: - stMap.remove(pandaID) - except: - pass -if tmpMap != {}: - print tmpMap - diff --git a/current/pandaserver/test/pandadb.sql b/current/pandaserver/test/pandadb.sql deleted file mode 100644 index 5bc00b59d..000000000 --- a/current/pandaserver/test/pandadb.sql +++ /dev/null @@ -1,430 +0,0 @@ -DROP TABLE jobsDefined4; -DROP TABLE jobsActive4; -DROP TABLE jobsArchived4; -DROP TABLE jobsWaiting4; -DROP TABLE filesTable4; -DROP TABLE Datasets; -DROP TABLE metaTable; -DROP TABLE subCounter; - - -CREATE TABLE jobsDefined4 -( - PandaID NUMBER(11) 
default 0 primary key, - jobDefinitionID NUMBER(11) default 0, - schedulerID VARCHAR(128), - pilotID VARCHAR(128), - creationTime DATE, - creationHost VARCHAR(128), - modificationTime DATE, - modificationHost VARCHAR(128), - AtlasRelease VARCHAR(64), - transformation VARCHAR(250), - homepackage VARCHAR(64), - prodSeriesLabel VARCHAR(20) default 'pandatest', - prodSourceLabel VARCHAR(20) default 'managed', - prodUserID VARCHAR(250), - assignedPriority NUMBER(9) default 0, - currentPriority NUMBER(9) default 0, - attemptNr NUMBER(2) default 0, - maxAttempt NUMBER(2) default 0, - jobStatus VARCHAR(15) default 'defined', - jobName VARCHAR(128), - maxCpuCount NUMBER(9) default 0, - maxCpuUnit VARCHAR(32), - maxDiskCount NUMBER(9) default 0, - maxDiskUnit CHAR(2), - ipConnectivity CHAR(3), - minRamCount NUMBER(9) default 0, - minRamUnit CHAR(2), - startTime DATE, - endTime DATE, - cpuConsumptionTime NUMBER(20) default 0, - cpuConsumptionUnit VARCHAR(128), - commandToPilot VARCHAR(250), - transExitCode VARCHAR(128), - pilotErrorCode NUMBER(6) default 0, - pilotErrorDiag VARCHAR(250), - exeErrorCode NUMBER(6) default 0, - exeErrorDiag VARCHAR(250), - supErrorCode NUMBER(6) default 0, - supErrorDiag VARCHAR(250) default NULL, - ddmErrorCode NUMBER(6) default 0, - ddmErrorDiag VARCHAR(250) default NULL, - brokerageErrorCode NUMBER(6) default 0, - brokerageErrorDiag VARCHAR(250) default NULL, - jobDispatcherErrorCode NUMBER(6) default 0, - jobDispatcherErrorDiag VARCHAR(250) default NULL, - taskBufferErrorCode NUMBER(6) default 0, - taskBufferErrorDiag VARCHAR(250) default NULL, - computingSite VARCHAR(128), - computingElement VARCHAR(128), - jobParameters VARCHAR(4000) default NULL, - metadata VARCHAR(32) default NULL, - prodDBlock VARCHAR(250), - dispatchDBlock VARCHAR(250), - destinationDBlock VARCHAR(250), - destinationSE VARCHAR(250), - nEvents NUMBER(9) default 0, - grid VARCHAR(32), - cloud VARCHAR(32), - cpuConversion NUMBER(9,4) default NULL, - sourceSite 
VARCHAR(36), - destinationSite VARCHAR(36), - transferType VARCHAR(10), - taskID NUMBER(9) default NULL, - cmtConfig VARCHAR(250), - stateChangeTime DATE, - prodDBUpdateTime DATE, - lockedby VARCHAR(128), - relocationFlag NUMBER(1) default 0, - jobExecutionID NUMBER(11) default 0, - VO VARCHAR(16), - pilotTiming VARCHAR(100), - workingGroup VARCHAR(20) -); - - -CREATE TABLE jobsActive4 -( - PandaID NUMBER(11) default 0 primary key, - jobDefinitionID NUMBER(11) default 0, - schedulerID VARCHAR(128), - pilotID VARCHAR(128), - creationTime DATE, - creationHost VARCHAR(128), - modificationTime DATE, - modificationHost VARCHAR(128), - AtlasRelease VARCHAR(64), - transformation VARCHAR(250), - homepackage VARCHAR(64), - prodSeriesLabel VARCHAR(20) default 'pandatest', - prodSourceLabel VARCHAR(20) default 'managed', - prodUserID VARCHAR(250), - assignedPriority NUMBER(9) default 0, - currentPriority NUMBER(9) default 0, - attemptNr NUMBER(2) default 0, - maxAttempt NUMBER(2) default 0, - jobStatus VARCHAR(15) default 'activated', - jobName VARCHAR(128), - maxCpuCount NUMBER(9) default 0, - maxCpuUnit VARCHAR(32), - maxDiskCount NUMBER(9) default 0, - maxDiskUnit CHAR(2), - ipConnectivity CHAR(3), - minRamCount NUMBER(9) default 0, - minRamUnit CHAR(2), - startTime DATE, - endTime DATE, - cpuConsumptionTime NUMBER(20) default 0, - cpuConsumptionUnit VARCHAR(128), - commandToPilot VARCHAR(250), - transExitCode VARCHAR(128), - pilotErrorCode NUMBER(6) default 0, - pilotErrorDiag VARCHAR(250), - exeErrorCode NUMBER(6) default 0, - exeErrorDiag VARCHAR(250), - supErrorCode NUMBER(6) default 0, - supErrorDiag VARCHAR(250) default NULL, - ddmErrorCode NUMBER(6) default 0, - ddmErrorDiag VARCHAR(250) default NULL, - brokerageErrorCode NUMBER(6) default 0, - brokerageErrorDiag VARCHAR(250) default NULL, - jobDispatcherErrorCode NUMBER(6) default 0, - jobDispatcherErrorDiag VARCHAR(250) default NULL, - taskBufferErrorCode NUMBER(6) default 0, - taskBufferErrorDiag VARCHAR(250) 
default NULL, - computingSite VARCHAR(128), - computingElement VARCHAR(128), - jobParameters VARCHAR(4000) default NULL, - metadata VARCHAR(32) default NULL, - prodDBlock VARCHAR(250), - dispatchDBlock VARCHAR(250), - destinationDBlock VARCHAR(250), - destinationSE VARCHAR(250), - nEvents NUMBER(9) default 0, - grid VARCHAR(32), - cloud VARCHAR(32), - cpuConversion NUMBER(9,4) default NULL, - sourceSite VARCHAR(36), - destinationSite VARCHAR(36), - transferType VARCHAR(10), - taskID NUMBER(9) default NULL, - cmtConfig VARCHAR(250), - stateChangeTime DATE, - prodDBUpdateTime DATE, - lockedby VARCHAR(128), - relocationFlag NUMBER(1) default 0, - jobExecutionID NUMBER(11) default 0, - VO VARCHAR(16), - pilotTiming VARCHAR(100), - workingGroup VARCHAR(20) -); - -CREATE TABLE jobsWaiting4 -( - PandaID NUMBER(11) default 0 primary key, - jobDefinitionID NUMBER(11) default 0, - schedulerID VARCHAR(128), - pilotID VARCHAR(128), - creationTime DATE, - creationHost VARCHAR(128), - modificationTime DATE, - modificationHost VARCHAR(128), - AtlasRelease VARCHAR(64), - transformation VARCHAR(250), - homepackage VARCHAR(64), - prodSeriesLabel VARCHAR(20) default 'pandatest', - prodSourceLabel VARCHAR(20) default 'managed', - prodUserID VARCHAR(250), - assignedPriority NUMBER(9) default 0, - currentPriority NUMBER(9) default 0, - attemptNr NUMBER(2) default 0, - maxAttempt NUMBER(2) default 0, - jobStatus VARCHAR(15) default 'activated', - jobName VARCHAR(128), - maxCpuCount NUMBER(9) default 0, - maxCpuUnit VARCHAR(32), - maxDiskCount NUMBER(9) default 0, - maxDiskUnit CHAR(2), - ipConnectivity CHAR(3), - minRamCount NUMBER(9) default 0, - minRamUnit CHAR(2), - startTime DATE, - endTime DATE, - cpuConsumptionTime NUMBER(20) default 0, - cpuConsumptionUnit VARCHAR(128), - commandToPilot VARCHAR(250), - transExitCode VARCHAR(128), - pilotErrorCode NUMBER(6) default 0, - pilotErrorDiag VARCHAR(250), - exeErrorCode NUMBER(6) default 0, - exeErrorDiag VARCHAR(250), - supErrorCode 
NUMBER(6) default 0, - supErrorDiag VARCHAR(250) default NULL, - ddmErrorCode NUMBER(6) default 0, - ddmErrorDiag VARCHAR(250) default NULL, - brokerageErrorCode NUMBER(6) default 0, - brokerageErrorDiag VARCHAR(250) default NULL, - jobDispatcherErrorCode NUMBER(6) default 0, - jobDispatcherErrorDiag VARCHAR(250) default NULL, - taskBufferErrorCode NUMBER(6) default 0, - taskBufferErrorDiag VARCHAR(250) default NULL, - computingSite VARCHAR(128), - computingElement VARCHAR(128), - jobParameters VARCHAR(4000) default NULL, - metadata VARCHAR(32) default NULL, - prodDBlock VARCHAR(250), - dispatchDBlock VARCHAR(250), - destinationDBlock VARCHAR(250), - destinationSE VARCHAR(250), - nEvents NUMBER(9) default 0, - grid VARCHAR(32), - cloud VARCHAR(32), - cpuConversion NUMBER(9,4) default NULL, - sourceSite VARCHAR(36), - destinationSite VARCHAR(36), - transferType VARCHAR(10), - taskID NUMBER(9) default NULL, - cmtConfig VARCHAR(250), - stateChangeTime DATE, - prodDBUpdateTime DATE, - lockedby VARCHAR(128), - relocationFlag NUMBER(1) default 0, - jobExecutionID NUMBER(11) default 0, - VO VARCHAR(16), - pilotTiming VARCHAR(100), - workingGroup VARCHAR(20) -); - -CREATE TABLE jobsArchived4 -( - PandaID NUMBER(11) default 0 primary key, - jobDefinitionID NUMBER(11) default 0, - schedulerID VARCHAR(128), - pilotID VARCHAR(128), - creationTime DATE, - creationHost VARCHAR(128), - modificationTime DATE, - modificationHost VARCHAR(128), - AtlasRelease VARCHAR(64), - transformation VARCHAR(250), - homepackage VARCHAR(64), - prodSeriesLabel VARCHAR(20) default 'pandatest', - prodSourceLabel VARCHAR(20) default 'managed', - prodUserID VARCHAR(250), - assignedPriority NUMBER(9) default 0, - currentPriority NUMBER(9) default 0, - attemptNr NUMBER(2) default 0, - maxAttempt NUMBER(2) default 0, - jobStatus VARCHAR(15) default 'activated', - jobName VARCHAR(128), - maxCpuCount NUMBER(9) default 0, - maxCpuUnit VARCHAR(32), - maxDiskCount NUMBER(9) default 0, - maxDiskUnit CHAR(2), - 
ipConnectivity CHAR(3), - minRamCount NUMBER(9) default 0, - minRamUnit CHAR(2), - startTime DATE, - endTime DATE, - cpuConsumptionTime NUMBER(20) default 0, - cpuConsumptionUnit VARCHAR(128), - commandToPilot VARCHAR(250), - transExitCode VARCHAR(128), - pilotErrorCode NUMBER(6) default 0, - pilotErrorDiag VARCHAR(250), - exeErrorCode NUMBER(6) default 0, - exeErrorDiag VARCHAR(250), - supErrorCode NUMBER(6) default 0, - supErrorDiag VARCHAR(250) default NULL, - ddmErrorCode NUMBER(6) default 0, - ddmErrorDiag VARCHAR(250) default NULL, - brokerageErrorCode NUMBER(6) default 0, - brokerageErrorDiag VARCHAR(250) default NULL, - jobDispatcherErrorCode NUMBER(6) default 0, - jobDispatcherErrorDiag VARCHAR(250) default NULL, - taskBufferErrorCode NUMBER(6) default 0, - taskBufferErrorDiag VARCHAR(250) default NULL, - computingSite VARCHAR(128), - computingElement VARCHAR(128), - jobParameters VARCHAR(4000) default NULL, - metadata VARCHAR(32) default NULL, - prodDBlock VARCHAR(250), - dispatchDBlock VARCHAR(250), - destinationDBlock VARCHAR(250), - destinationSE VARCHAR(250), - nEvents NUMBER(9) default 0, - grid VARCHAR(32), - cloud VARCHAR(32), - cpuConversion NUMBER(9,4) default NULL, - sourceSite VARCHAR(36), - destinationSite VARCHAR(36), - transferType VARCHAR(10), - taskID NUMBER(9) default NULL, - cmtConfig VARCHAR(250), - stateChangeTime DATE, - prodDBUpdateTime DATE, - lockedby VARCHAR(128), - relocationFlag NUMBER(1) default 0, - jobExecutionID NUMBER(11) default 0, - VO VARCHAR(16), - pilotTiming VARCHAR(100), - workingGroup VARCHAR(20) -); - - -CREATE TABLE filesTable4 -( - row_ID NUMBER(11) default 0 primary key, - PandaID NUMBER(11) default 0, - GUID VARCHAR(64), - lfn VARCHAR(256), - type VARCHAR(20), - dataset VARCHAR(128), - status VARCHAR(64), - prodDBlock VARCHAR(250), - prodDBlockToken VARCHAR(250), - dispatchDBlock VARCHAR(250), - dispatchDBlockToken VARCHAR(250), - destinationDBlock VARCHAR(250), - destinationDBlockToken VARCHAR(250), - 
destinationSE VARCHAR(250), - fsize NUMBER(10) default 0, - md5sum CHAR(36), - checksum CHAR(36) -); - - -CREATE TABLE Datasets -( - vuid VARCHAR(40) default '' primary key, - name VARCHAR(250), - version VARCHAR(10) default NULL, - type VARCHAR(20) default NULL, - status VARCHAR(10) default NULL, - numberfiles NUMBER(9) default NULL, - currentfiles NUMBER(9) default NULL, - creationdate DATE, - modificationdate DATE, - MoverID NUMBER(11) default 0, - transferStatus NUMBER(2) default 0 -); - - -CREATE TABLE metaTable -( - PandaID NUMBER(11) default 0 primary key, - metaData VARCHAR(4000) default NULL -); - - -CREATE TABLE subCounter -( - subID NUMBER(11) default 0 -); - - - -CREATE INDEX jobsA4_currentPriority_IDX ON jobsActive4 (currentPriority); -CREATE INDEX jobsA4_jobStatus_IDX ON jobsActive4 (jobStatus); -CREATE INDEX jobsA4_computingSite_IDX ON jobsActive4 (computingSite); - -CREATE INDEX file4_PandaID_IDX ON filesTable4 (PandaID); -CREATE INDEX file4_status_IDX ON filesTable4 (status); -CREATE INDEX file4_dispDBlock_IDX ON filesTable4 (dispatchDBlock); -CREATE INDEX file4_destDBlock_IDX ON filesTable4 (destinationDBlock); - -CREATE INDEX Datasets_name_IDX ON Datasets (name); - -DROP SEQUENCE PandaID_SEQ; -DROP SEQUENCE rowID_SEQ; -DROP SEQUENCE subID_SEQ; - - -CREATE SEQUENCE PandaID_SEQ; -CREATE SEQUENCE rowID_SEQ; -CREATE SEQUENCE subID_SEQ; - - -CREATE OR REPLACE TRIGGER PandaID_TRIGGER -BEFORE INSERT ON jobsDefined4 -FOR EACH ROW -BEGIN - IF (:NEW.PandaID IS NULL) THEN - SELECT PandaID_SEQ.NEXTVAL INTO :NEW.PandaID FROM DUAL ; - END IF; -END; -/ - - -CREATE OR REPLACE TRIGGER rowID_TRIGGER -BEFORE INSERT ON filesTable4 -FOR EACH ROW -BEGIN - SELECT rowID_SEQ.NEXTVAL INTO :NEW.row_ID FROM DUAL ; -END; -/ - - -CREATE OR REPLACE TRIGGER subID_TRIGGER -BEFORE INSERT ON subCounter -FOR EACH ROW -BEGIN - SELECT subID_SEQ.NEXTVAL INTO :NEW.subID FROM DUAL ; -END; -/ - - -CREATE OR REPLACE FUNCTION BITOR( P_BITS1 IN NATURAL, P_BITS2 IN NATURAL ) -RETURN NATURAL 
-IS -BEGIN - RETURN UTL_RAW.CAST_TO_BINARY_INTEGER( - UTL_RAW.BIT_OR( - UTL_RAW.CAST_FROM_BINARY_INTEGER(P_BITS1), - UTL_RAW.CAST_FROM_BINARY_INTEGER(P_BITS2) - ) - ); -END; -/ diff --git a/current/pandaserver/test/pandameta.sql b/current/pandaserver/test/pandameta.sql deleted file mode 100644 index ed234a5d2..000000000 --- a/current/pandaserver/test/pandameta.sql +++ /dev/null @@ -1,97 +0,0 @@ -DROP TABLE cloudconfig; -DROP TABLE schedconfig; - - -CREATE TABLE cloudconfig -( - name VARCHAR(20) primary key, - description VARCHAR(50), - tier1 VARCHAR(20), - tier1SE VARCHAR(400), - relocation VARCHAR(10), - weight NUMBER(11) default 0, - server VARCHAR(100), - status VARCHAR(20), - transtimelo NUMBER(11) default 0, - transtimehi NUMBER(11) default 0, - waittime NUMBER(11) default 0, - cloudcomment VARCHAR(200), - space NUMBER(11) default 0, - moduser VARCHAR(30), - modtime DATE default CURRENT_DATE, - validation VARCHAR(20), - mcshare NUMBER(11) default 0, - countries VARCHAR(80) -); - - -CREATE TABLE schedconfig -( - name VARCHAR(60) default 'default', - nickname VARCHAR(60) primary key, - queue VARCHAR(60), - localqueue VARCHAR(20), - system VARCHAR(60), - sysconfig VARCHAR(20), - environ VARCHAR(250), - gatekeeper VARCHAR(40), - jobmanager VARCHAR(80), - se VARCHAR(250), - ddm VARCHAR(80), - jdladd CLOB default NULL, - globusadd VARCHAR(100), - jdl VARCHAR(60), - jdltxt CLOB default NULL, - version VARCHAR(60), - site VARCHAR(60), - region VARCHAR(60), - gstat VARCHAR(60), - tags VARCHAR(200), - cmd VARCHAR(200), - lastmod TIMESTAMP default CURRENT_TIMESTAMP, - errinfo VARCHAR(80), - nqueue NUMBER(11) default 0, - queuecomment CLOB default NULL, - appdir VARCHAR(80), - datadir VARCHAR(80), - tmpdir VARCHAR(80), - wntmpdir VARCHAR(80), - dq2url VARCHAR(80), - special_par VARCHAR(80), - python_path VARCHAR(80), - nodes NUMBER(11) default 0, - status VARCHAR(10), - copytool VARCHAR(80), - copysetup VARCHAR(200), - releases VARCHAR(500), - sepath VARCHAR(80), - 
envsetup VARCHAR(200), - copyprefix VARCHAR(160), - lfcpath VARCHAR(80), - seopt VARCHAR(60), - sein VARCHAR(60), - seinopt VARCHAR(60), - lfchost VARCHAR(80), - cloud VARCHAR(60), - siteid VARCHAR(60), - proxy VARCHAR(80), - retry VARCHAR(10), - queuehours NUMBER(9) default 0, - envsetupin VARCHAR(200), - copytoolin VARCHAR(180), - copysetupin VARCHAR(200), - seprodpath VARCHAR(200), - lfcprodpath VARCHAR(80), - copyprefixin VARCHAR(80), - recoverdir VARCHAR(80), - memory NUMBER(11) default 0, - maxtime NUMBER(11) default 0, - space NUMBER(11) default 0, - tspace TIMESTAMP default TO_DATE('0001-01-01 00:00:00','YYYY-MM-DD HH24:MI:SS'), - cmtconfig VARCHAR(250), - setokens VARCHAR(80), - glexec VARCHAR(10), - priorityoffset VARCHAR(60), - allowedgroups VARCHAR(100), - defaulttoken VARCHAR(100) -); diff --git a/current/pandaserver/test/pcron.sh b/current/pandaserver/test/pcron.sh deleted file mode 100755 index 4cb8f3653..000000000 --- a/current/pandaserver/test/pcron.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -"exec" "python" "$0" "$@" - -import os -import sys -import time -import commands - -_python = "/direct/usatlas+u/gfg/python-latest/python-2.4.1/python-2.4.1/bin/python" - -class Woker: - # constructor - def __init__(self): - pass - - # main - def run(self): - os.chdir('/direct/usatlas+u/sm/panda/pilot2') - com = "python pilot.py -a /usatlas/projects/OSG -d /tmp -l /usatlas/prodjob/share/ -q http://dms02.usatlas.bnl.gov:8000/dq2/ -s BNL_ATLAS_DDM" - os.spawnv(os.P_NOWAIT,_python,com.split()) - -# count # of processes -out = commands.getoutput('ps auxww | grep pilot.py | grep -v auxww | grep -v "sh -c" | grep -v grep' ) -if out == '': - nPilot = 0 -else: - nPilot = len(out.split('\n')) -maxPilot = 10 -print nPilot -if nPilot >= maxPilot: - sys.exit(0) - -for i in range(maxPilot-nPilot): - thr = Woker() - thr.run() - time.sleep(5) diff --git a/current/pandaserver/test/pdq2_cr b/current/pandaserver/test/pdq2_cr deleted file mode 100755 index 
538a6c5a6..000000000 --- a/current/pandaserver/test/pdq2_cr +++ /dev/null @@ -1,159 +0,0 @@ -#!/bin/bash - -"exec" "python" "$0" "$@" - - -def _usage(): - print \ -""" -NAME - pdq2_cr - copy and register DQ2 dataset via PANDA - -SYNOPSIS - - pdq2_cr [ -h | --help] - [ -p | --parallel n ] - [ -t | --timeout n ] - [ -d | --destination destination ] - [ -r | --remote remoteSite ] - [ -s | --source sourceSite ] - datasetname - [lfn1 [lfn2 [...]]] -DESCRIPTION - - dq2_cr copies and registers DQ2 dataset. It scans the LRC to find missing or corrupted - files in a dataset, copies the files to the local SE using 3rd-party transfers, and - registers the files to the LRC. - -OPTIONS - - -h | --help Print this message - - -p | --parallel Number of copy threads (default:3) - - -t | --timeout Timeout limit in second for each file transfer (default:1800) - - -d | --destination Directory in the storage element where files will be put. - - -r | --remote Specify remote site to which files get copied - - -s | --source Specify source site from which files get copied - -""" - -# error codes -EC_Configuration = 20 -EC_VUID = 30 -EC_QueryFiles = 40 -EC_Location = 50 -EC_Copy = 60 -EC_Main = 70 -EC_PFNfromLFC = 80 -EC_INVALIDSIZE = 90 -EC_RegisterLRC = 100 -EC_LS = 110 - -#################################################################### -# main -def main(): - import sys - import getopt - - # option class - class _options: - def __init__(self): - pass - options = _options() - del _options - # set default values - options.source = '' - options.destination = '' - options.remote = '' - # get command-line parameters - try: - opts, args = getopt.getopt(sys.argv[1:],"hvn:cd:p:t:s:r:l:u", - ["help","verbose","ntry=","choose", - "destination=","parallel=","timeout=", - "source=","remote=","location=","uber", - "noSleep","uberHost=","gsiHost=","srmHost=", - "guids=","lfns=","debug", - ]) - except: - _usage() - print "ERROR : Invalid options" - sys.exit(EC_Main) - # set options - for o, a in 
opts: - if o in ("-h","--help"): - _usage() - sys.exit() - if o in ("-s","--source"): - options.source = a - if o in ("-r","--remote"): - options.remote = a - if o in ("-d","--destination"): - options.destination = a - # datasetname - if len(args) == 0: - print "ERROR : no datasetname" - sys.exit(EC_Main) - # source - if options.source == "": - print "ERROR : no source. use -s" - sys.exit(EC_Main) - # destination - if options.destination == "": - print "ERROR : no destination. use -d" - sys.exit(EC_Main) - # remote - if options.remote == "": - print "ERROR : no remote. use -r" - sys.exit(EC_Main) - - # submit - import time - import commands - import userinterface.Client as Client - from taskbuffer.JobSpec import JobSpec - from taskbuffer.FileSpec import FileSpec - - site = "BNL_ATLAS_DDM" - - datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') - destName = 'BNL_SE' - - jobList = [] - - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s" % commands.getoutput('uuidgen') - job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/run_dq2_cr' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.currentPriority = 100000 - job.prodSourceLabel = 'test' - job.computingSite = site - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - job.addFile(fileOL) - - argStr = "" - for arg in sys.argv[1:]: - argStr += "%s " % arg - job.jobParameters = argStr - - jobList.append(job) - - s,o = Client.submitJobs(jobList) - print "---------------------" - print s - for x in o: - print "PandaID=%s" % x[0] - -if __name__ == "__main__": - main() diff --git a/current/pandaserver/test/plot.py b/current/pandaserver/test/plot.py deleted file mode 100755 index 9d37de977..000000000 --- a/current/pandaserver/test/plot.py +++ /dev/null @@ -1,51 +0,0 
@@ -import re -import time -import datetime -import pylab -file = open('panda-DBProxy.log') -datesMap = {} -valuesMap = {} -for line in file: - items = re.findall('countPilotRequests[^\']+\'([^\']+)\': (\d+)',line) - if len(items) != 0: - # statistics - site = items[0][0] - count = float(items[0][1]) - # date - items = re.split(' |,',line) - if len(items) >= 2: - strDate = '%s %s' % tuple(items[:2]) - datetimeTime = datetime.datetime(*time.strptime(strDate,'%Y-%m-%d %H:%M:%S')[:6]) - # assign - if not datesMap.has_key(site): - datesMap[site] = [] - valuesMap[site] = [] - datesMap[site].append(pylab.date2num(datetimeTime)) - valuesMap[site].append(count) -# close file -file.close() -# plot -nRow = 1 #len(datesMap.keys()) -nCol = 1 -nFig = 1 -tFig = 1 -sites = datesMap.keys() -sites.sort() -for site in sites: - if nFig == (nRow*nCol+1): - pylab.savefig('pilot%d.png' % tFig) - tFig += 1 - pylab.figure(tFig) - nFig = 1 - pylab.subplot(int('%d%d%d' % (nRow,nCol,nFig))) - pylab.title('Number of pilots @%s' % site) - pylab.plot_date(datesMap[site],valuesMap[site]) - nFig += 1 -# save the last figure -pylab.savefig('pilot%d.png' % tFig) -# show -#pylab.show() - - - - diff --git a/current/pandaserver/test/prioryMassage.py b/current/pandaserver/test/prioryMassage.py deleted file mode 100644 index 887bca19f..000000000 --- a/current/pandaserver/test/prioryMassage.py +++ /dev/null @@ -1,364 +0,0 @@ -import os -import re -import sys -import datetime -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# logger -_logger = PandaLogger().getLogger('prioryMassage') - -_logger.debug("================= start ==================") - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -# get usage breakdown -usageBreakDownPerUser = {} -usageBreakDownPerSite = {} -workingGroupList = [] -for table in 
['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']: - varMap = {} - varMap[':prodSourceLabel'] = 'user' - if table == 'ATLAS_PANDA.jobsActive4': - sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table - else: - # with time range for archived table - varMap[':modificationTime'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=60) - sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND modificationTime>:modificationTime GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table - # exec - status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10000) - if res == None: - _logger.debug("total %s " % res) - else: - _logger.debug("total %s " % len(res)) - # make map - for cnt,prodUserName,jobStatus,workingGroup,computingSite in res: - # use workingGroup name as prodUserName - if workingGroup != None: - if not workingGroup in workingGroupList: - workingGroupList.append(workingGroup) - prodUserName = workingGroup - workingGroup = None - # append to PerUser map - if not usageBreakDownPerUser.has_key(prodUserName): - usageBreakDownPerUser[prodUserName] = {} - if not usageBreakDownPerUser[prodUserName].has_key(workingGroup): - usageBreakDownPerUser[prodUserName][workingGroup] = {} - if not usageBreakDownPerUser[prodUserName][workingGroup].has_key(computingSite): - usageBreakDownPerUser[prodUserName][workingGroup][computingSite] = {'rundone':0,'activated':0} - # append to PerSite map - if not usageBreakDownPerSite.has_key(computingSite): - usageBreakDownPerSite[computingSite] = {} - if not usageBreakDownPerSite[computingSite].has_key(prodUserName): - usageBreakDownPerSite[computingSite][prodUserName] = {} - if not usageBreakDownPerSite[computingSite][prodUserName].has_key(workingGroup): - usageBreakDownPerSite[computingSite][prodUserName][workingGroup] = 
{'rundone':0,'activated':0} - # count # of running/done and activated - if jobStatus in ['activated']: - usageBreakDownPerUser[prodUserName][workingGroup][computingSite]['activated'] += cnt - usageBreakDownPerSite[computingSite][prodUserName][workingGroup]['activated'] += cnt - elif jobStatus in ['cancelled','holding']: - pass - else: - usageBreakDownPerUser[prodUserName][workingGroup][computingSite]['rundone'] += cnt - usageBreakDownPerSite[computingSite][prodUserName][workingGroup]['rundone'] += cnt - -# get total number of users and running/done jobs -totalUsers = 0 -totalRunDone = 0 -for prodUserName,wgValMap in usageBreakDownPerUser.iteritems(): - for workingGroup,siteValMap in wgValMap.iteritems(): - # ignore group production - if workingGroup != None: - continue - totalUsers += 1 - for computingSite,statValMap in siteValMap.iteritems(): - totalRunDone += statValMap['rundone'] - -_logger.debug("total users : %s" % totalUsers) -_logger.debug("total RunDone : %s" % totalRunDone) -_logger.debug("") - -if totalUsers == 0: - sys.exit(0) - -# global average -globalAverageRunDone = float(totalRunDone)/float(totalUsers) - -_logger.debug("global average : %s" % globalAverageRunDone) - -# count the number of users and run/done jobs for each site -siteRunDone = {} -siteUsers = {} -for computingSite,userValMap in usageBreakDownPerSite.iteritems(): - for prodUserName,wgValMap in userValMap.iteritems(): - for workingGroup,statValMap in wgValMap.iteritems(): - # ignore group production - if workingGroup != None: - continue - # count the number of users and running/done jobs - if not siteUsers.has_key(computingSite): - siteUsers[computingSite] = 0 - siteUsers[computingSite] += 1 - if not siteRunDone.has_key(computingSite): - siteRunDone[computingSite] = 0 - siteRunDone[computingSite] += statValMap['rundone'] - -# get site average -_logger.debug("site average") -siteAverageRunDone = {} -for computingSite,nRunDone in siteRunDone.iteritems(): - siteAverageRunDone[computingSite] 
= float(nRunDone)/float(siteUsers[computingSite]) - _logger.debug(" %-25s : %s" % (computingSite,siteAverageRunDone[computingSite])) - -# check if the number of user's jobs is lower than the average -for prodUserName,wgValMap in usageBreakDownPerUser.iteritems(): - _logger.debug("---> %s" % prodUserName) - # no private jobs - if not wgValMap.has_key(None): - _logger.debug("no private jobs") - continue - # count the number of running/done jobs - userTotalRunDone = 0 - for workingGroup,siteValMap in wgValMap.iteritems(): - if workingGroup != None: - continue - for computingSite,statValMap in siteValMap.iteritems(): - userTotalRunDone += statValMap['rundone'] - # no priority boost when the number of jobs is higher than the average - if userTotalRunDone >= globalAverageRunDone: - _logger.debug("enough running %s > %s (global average)" % (userTotalRunDone,globalAverageRunDone)) - continue - _logger.debug("user total:%s global average:%s" % (userTotalRunDone,globalAverageRunDone)) - # check with site average - toBeBoostedSites = [] - for computingSite,statValMap in wgValMap[None].iteritems(): - # the number of running/done jobs is lower than the average and activated jobs are waiting - if statValMap['rundone'] >= siteAverageRunDone[computingSite]: - _logger.debug("enough running %s > %s (site average) at %s" % \ - (statValMap['rundone'],siteAverageRunDone[computingSite],computingSite)) - elif statValMap['activated'] == 0: - _logger.debug("no activated jobs at %s" % computingSite) - else: - toBeBoostedSites.append(computingSite) - # no boost is required - if toBeBoostedSites == []: - _logger.debug("no sites to be boosted") - continue - # check special prioritized site - siteAccessForUser = {} - varMap = {} - varMap[':dn'] = prodUserName - sql = "SELECT pandaSite,pOffset,status,workingGroups FROM ATLAS_PANDAMETA.siteAccess WHERE dn=:dn" - status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10000) - if res != None: - for pandaSite,pOffset,pStatus,workingGroups in res: - 
# ignore special working group for now - if not workingGroups in ['',None]: - continue - # only approved sites - if pStatus != 'approved': - continue - # no priority boost - if pOffset == 0: - continue - # append - siteAccessForUser[pandaSite] = pOffset - # set weight - totalW = 0 - defaultW = 100 - for computingSite in toBeBoostedSites: - totalW += defaultW - if siteAccessForUser.has_key(computingSite): - totalW += siteAccessForUser[computingSite] - totalW = float(totalW) - # the total number of jobs to be boosted - numBoostedJobs = globalAverageRunDone - float(userTotalRunDone) - # get quota - quotaFactor = 1.0 + taskBuffer.checkQuota(prodUserName) - _logger.debug("quota factor:%s" % quotaFactor) - # make priority boost - nJobsPerPrioUnit = 5 - highestPrio = 1000 - for computingSite in toBeBoostedSites: - weight = float(defaultW) - if siteAccessForUser.has_key(computingSite): - weight += float(siteAccessForUser[computingSite]) - weight /= totalW - # the number of boosted jobs at the site - numBoostedJobsSite = int(numBoostedJobs * weight / quotaFactor) - _logger.debug("nSite:%s nAll:%s W:%s Q:%s at %s" % (numBoostedJobsSite,numBoostedJobs,weight,quotaFactor,computingSite)) - if numBoostedJobsSite/nJobsPerPrioUnit == 0: - _logger.debug("too small number of jobs %s to be boosted at %s" % (numBoostedJobsSite,computingSite)) - continue - # get the highest prio of activated jobs at the site - varMap = {} - varMap[':jobStatus'] = 'activated' - varMap[':prodSourceLabel'] = 'user' - varMap[':prodUserName'] = prodUserName - varMap[':computingSite'] = computingSite - sql = "SELECT MAX(currentPriority) FROM ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND prodUserName=:prodUserName AND workingGroup IS NULL AND jobStatus=:jobStatus AND computingSite=:computingSite" - status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10) - maxPrio = None - if res != None: - try: - maxPrio = res[0][0] - except: - pass - if maxPrio == None: - _logger.debug("cannot get the 
highest prio at %s" % computingSite) - continue - # delta for priority boost - prioDelta = highestPrio - maxPrio - # already boosted - if prioDelta <= 0: - _logger.debug("already boosted (prio=%s) at %s" % (maxPrio,computingSite)) - continue - # lower limit - minPrio = maxPrio - numBoostedJobsSite/nJobsPerPrioUnit - # SQL for priority boost - varMap = {} - varMap[':jobStatus'] = 'activated' - varMap[':prodSourceLabel'] = 'user' - varMap[':prodUserName'] = prodUserName - varMap[':computingSite'] = computingSite - varMap[':prioDelta'] = prioDelta - varMap[':maxPrio'] = maxPrio - varMap[':minPrio'] = minPrio - varMap[':rlimit'] = numBoostedJobsSite - sql = "UPDATE ATLAS_PANDA.jobsActive4 SET currentPriority=currentPriority+:prioDelta " - sql += "WHERE prodSourceLabel=:prodSourceLabel " - if prodUserName in workingGroupList: - sql += "AND workingGroup=:prodUserName " - else: - sql += "AND prodUserName=:prodUserName AND workingGroup IS NULL " - sql += "AND jobStatus=:jobStatus AND computingSite=:computingSite AND currentPriority>:minPrio " - sql += "AND currentPriority<=:maxPrio AND rownum<=:rlimit" - _logger.debug("boost %s" % str(varMap)) - status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10) - _logger.debug(" database return : %s" % res) - - -# redo stalled analysis jobs -_logger.debug("=== redo stalled jobs") -try: - varMap = {} - varMap[':prodSourceLabel'] = 'user' - sqlJ = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsDefined4 " - sqlJ += "WHERE prodSourceLabel=:prodSourceLabel AND modificationTime delete downstream jobs") - # FIXME - #taskBuffer.deleteStalledJobs(libLFN) - else: - # activate - if useLib and libStatus == 'ready' and (not libGUID in [None,'']) and (not libDSName in [None,'']): - # update GUID - _logger.debug(" set GUID:%s for %s" % (libGUID,libLFN)) - #retG = taskBuffer.setGUIDs([{'lfn':libLFN,'guid':libGUID}]) - # FIXME - retG = True - if not retG: - _logger.error(" failed to update GUID for %s" % libLFN) - else: - # get PandaID 
with lib.tgz - #ids = taskBuffer.updateInFilesReturnPandaIDs(libDSName,'ready') - ids = [] - # get jobs - jobs = taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False) - # remove None and unknown - acJobs = [] - for job in jobs: - if job == None or job.jobStatus == 'unknown': - continue - acJobs.append(job) - # activate - _logger.debug(" -> activate downstream jobs") - #taskBuffer.activateJobs(acJobs) - else: - # wait - _logger.debug(" -> wait") - varMap = {} - varMap[':prodSourceLabel'] = 'user' - varMap[':jobDefinitionID'] = jobDefinitionID - varMap[':prodUserName'] = prodUserName - # FIXME - #stU,resU = taskBuffer.querySQLS(sqlU,varMap) -except: - errtype,errvalue = sys.exc_info()[:2] - _logger.error("failed to redo stalled jobs with %s %s" % (errtype,errvalue)) - -_logger.debug("-------------- end") diff --git a/current/pandaserver/test/proxy.sh b/current/pandaserver/test/proxy.sh deleted file mode 100755 index 674e1d248..000000000 --- a/current/pandaserver/test/proxy.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -l - -echo '************** start' -date -source /afs/cern.ch/project/gd/LCG-share/current/external/etc/profile.d/grid-env.sh -echo '************** check proxy' -voms-proxy-info -all -echo '************** check novoms' -voms-proxy-info -all -file /tmp/x509up_u`id -u`_novoms -echo '************** voms-proxy-init' -voms-proxy-init -voms atlas:/atlas/usatlas/Role=production -valid 100000:0 -noregen -debug -cert /tmp/x509up_u`id -u`_novoms -echo '************** check new proxy' -voms-proxy-info -all -echo '************** end' -echo diff --git a/current/pandaserver/test/reassignDefJobs.py b/current/pandaserver/test/reassignDefJobs.py deleted file mode 100755 index 3aecd1374..000000000 --- a/current/pandaserver/test/reassignDefJobs.py +++ /dev/null @@ -1,63 +0,0 @@ -import sys -import time -import datetime -from taskbuffer.OraDBProxy import DBProxy -import userinterface.Client as Client -from dataservice.DDM import ddm - -timeL = 60 
-if len(sys.argv) == 2: - timeL = int(sys.argv[1]) - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# erase datasets -def eraseDispDatasets(ids): - datasets = [] - # get jobs - status,jobs = Client.getJobStatus(ids) - if status != 0: - return - # gather dispDBlcoks - for job in jobs: - for file in job.Files: - if not file.dispatchDBlock in datasets: - datasets.append(file.dispatchDBlock) - # erase - for dataset in datasets: - ddm.DQ2.main(['eraseDataset',datasets]) - -# time limit -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=int(timeL)) - -# instantiate DB proxies -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -while True: - # get PandaIDs - varMap = {} - varMap[':jobStatus'] = 'defined' - varMap[':modificationTime'] = timeLimit - varMap[':prodSourceLabel'] = 'managed' - sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" - status,res = proxyS.querySQLS(sql,varMap) - # escape - if len(res) == 0: - break - # convert to list - jobs = [] - for id, in res: - jobs.append(id) - # reassign - nJob = 100 - iJob = 0 - while iJob < len(jobs): - print 'reassignJobs(%s)' % jobs[iJob:iJob+nJob] - Client.reassignJobs(jobs[iJob:iJob+nJob]) - iJob += nJob - time.sleep(120) - - diff --git a/current/pandaserver/test/reassignJobs.py b/current/pandaserver/test/reassignJobs.py deleted file mode 100755 index ab17c5b42..000000000 --- a/current/pandaserver/test/reassignJobs.py +++ /dev/null @@ -1,14 +0,0 @@ -import sys - -import userinterface.Client as Client - -if len(sys.argv) == 2: - Client.reassignJobs([sys.argv[1]]) -else: - startID = int(sys.argv[1]) - endID = int(sys.argv[2]) - if startID > endID: - print '%d is less than %d' % (endID,startID) - sys.exit(1) - Client.reassignJobs(range(startID,endID+1)) - diff --git 
a/current/pandaserver/test/reassignSite.py b/current/pandaserver/test/reassignSite.py deleted file mode 100644 index 2d80aaa36..000000000 --- a/current/pandaserver/test/reassignSite.py +++ /dev/null @@ -1,64 +0,0 @@ -import sys -import time -import datetime - -from taskbuffer.OraDBProxy import DBProxy -# password -from config import panda_config - -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -site = sys.argv[1] -import userinterface.Client as Client - -# erase dispatch datasets -def eraseDispDatasets(ids): - print "eraseDispDatasets" - datasets = [] - # get jobs - status,jobs = Client.getJobStatus(ids) - if status != 0: - return - # gather dispDBlcoks - for job in jobs: - # dispatchDS is not a DQ2 dataset in US - if job.cloud == 'US': - continue - # erase disp datasets for production jobs only - if job.prodSourceLabel != 'managed': - continue - for file in job.Files: - if file.dispatchDBlock == 'NULL': - continue - if (not file.dispatchDBlock in datasets) and \ - re.search('_dis\d+$',file.dispatchDBlock) != None: - datasets.append(file.dispatchDBlock) - # erase - for dataset in datasets: - print 'erase %s' % dataset - status,out = ddm.DQ2.main('eraseDataset',dataset) - print out - -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=4) -varMap[':jobStatus'] = 'activated' -varMap[':modificationTime'] = timeLimit -varMap[':prodSourceLabel'] = 'managed' -varMap[':computingSite'] = site -sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND computingSite=:computingSite AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" -status,res = proxyS.querySQLS(sql,varMap) - -jobs = [] -if res != None: - for (id,) in res: - jobs.append(id) -if len(jobs): - nJob = 100 - iJob = 0 - while iJob < len(jobs): - print 'reassign %s' % str(jobs[iJob:iJob+nJob]) - eraseDispDatasets(jobs[iJob:iJob+nJob]) - 
Client.reassignJobs(jobs[iJob:iJob+nJob]) - iJob += nJob - time.sleep(10) - diff --git a/current/pandaserver/test/reassignTask.py b/current/pandaserver/test/reassignTask.py deleted file mode 100644 index 475975aeb..000000000 --- a/current/pandaserver/test/reassignTask.py +++ /dev/null @@ -1,60 +0,0 @@ -import re -import sys -import time -import datetime - -from taskbuffer.OraDBProxy import DBProxy -# password -from config import panda_config - -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -taskid = sys.argv[1] -import userinterface.Client as Client - -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) -varMap = {} -varMap[':modificationTime'] = timeLimit -varMap[':prodSourceLabel'] = 'managed' -varMap[':taskID'] = taskid -sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE taskID=:taskID AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" -status,res = proxyS.querySQLS(sql,varMap) - -jobs = [] -if res != None: - for (id,) in res: - jobs.append(id) -if len(jobs): - nJob = 100 - iJob = 0 - while iJob < len(jobs): - print 'reassign %s' % str(jobs[iJob:iJob+nJob]) - Client.reassignJobs(jobs[iJob:iJob+nJob]) - iJob += nJob - time.sleep(10) - -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) -varMap = {} -varMap[':jobStatus'] = 'activated' -varMap[':modificationTime'] = timeLimit -varMap[':prodSourceLabel'] = 'managed' -varMap[':taskID'] = taskid -sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND taskID=:taskID AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" -status,res = proxyS.querySQLS(sql,varMap) - -jobs = [] -if res != None: - for (id,) in res: - jobs.append(id) -if len(jobs): - nJob = 100 - iJob = 0 - while iJob < len(jobs): - print 'reassign %s' % str(jobs[iJob:iJob+nJob]) - Client.reassignJobs(jobs[iJob:iJob+nJob]) - iJob += 
nJob - time.sleep(10) - - - diff --git a/current/pandaserver/test/reassignWaiting.py b/current/pandaserver/test/reassignWaiting.py deleted file mode 100755 index 24c8a232f..000000000 --- a/current/pandaserver/test/reassignWaiting.py +++ /dev/null @@ -1,39 +0,0 @@ -import time -import datetime -from taskbuffer.OraDBProxy import DBProxy -import userinterface.Client as Client - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# time limit -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) - -# instantiate DB proxies -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -while True: - # get PandaIDs - varMap = {} - varMap[':modificationTime'] = timeLimit - sql = "SELECT PandaID FROM ATLAS_PANDA.jobsWaiting4 WHERE modificationTime<:modificationTime ORDER BY PandaID" - status,res = proxyS.querySQLS(sql,varMap) - - # escape - if len(res) == 0: - break - # convert to list - jobs = [] - for id, in res: - jobs.append(id) - # reassign - nJob = 300 - iJob = 0 - while iJob < len(jobs): - print 'reassignJobs(%s)' % jobs[iJob:iJob+nJob] - Client.reassignJobs(jobs[iJob:iJob+nJob]) - iJob += nJob - time.sleep(60) - diff --git a/current/pandaserver/test/redirectLog.py b/current/pandaserver/test/redirectLog.py deleted file mode 100755 index 351d4a192..000000000 --- a/current/pandaserver/test/redirectLog.py +++ /dev/null @@ -1,40 +0,0 @@ - -""" -redirect apache log to the logging server - -""" - -import re -from pandalogger.PandaLogger import PandaLogger - -# logger -_loggerMap = {} -pandaLogger = PandaLogger() - -while True: - # read line - line = raw_input() - # extract host, request and response - items = re.findall('(\S+) - - \[[^\]]+\] ("[^"]+") (\d+)',line) - if len(items) == 1: - # host - host = items[0][0] - # request - request = items[0][1].split()[1].split('/')[-1] - if request == 'isAlive': - # somehow isAlive is not recorded - request = 'IsAlive' - # set 
logtype - if request.startswith('datasetCompleted'): - logtype = 'datasetCompleted' - else: - logtype = request - # response - response = items[0][2] - # make message - message = '%s - %s %s' % (host,request,response) - # get logger - pandaLogger.setParam('Type',logtype) - logger = pandaLogger.getHttpLogger('prod') - # add message - logger.info(message) diff --git a/current/pandaserver/test/redirectLog.sh b/current/pandaserver/test/redirectLog.sh deleted file mode 100755 index c60e9ff27..000000000 --- a/current/pandaserver/test/redirectLog.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -BASEPATH=/usatlas/u/sm/prod -BINPATH=/usatlas/u/sm/latest -LOG=$BASEPATH/httpd/logs/access_log - -# for python -export PATH=$BINPATH/python/bin:$PATH -export PYTHONPATH=$BASEPATH/panda:$PYTHONPATH - -tail -F $LOG | python $BASEPATH/panda/test/redirectLog.py diff --git a/current/pandaserver/test/resubmitJobs.py b/current/pandaserver/test/resubmitJobs.py deleted file mode 100755 index 7272d19ca..000000000 --- a/current/pandaserver/test/resubmitJobs.py +++ /dev/null @@ -1,14 +0,0 @@ -import sys - -import userinterface.Client as Client - -if len(sys.argv) == 2: - Client.resubmitJobs([sys.argv[1]]) -else: - startID = int(sys.argv[1]) - endID = int(sys.argv[2]) - if startID > endID: - print '%d is less than %d' % (endID,startID) - sys.exit(1) - Client.resubmitJobs(range(startID,endID+1)) - diff --git a/current/pandaserver/test/runMerger.py b/current/pandaserver/test/runMerger.py deleted file mode 100644 index ba765b16f..000000000 --- a/current/pandaserver/test/runMerger.py +++ /dev/null @@ -1,219 +0,0 @@ -import os -import re -import sys -import time -import datetime -import commands -import threading - -from config import panda_config - -# initialize cx_Oracle using dummy connection -from taskbuffer.Initializer import initializer -initializer.init() - -from dataservice.Merger import Merger -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger - 
- -# logger -_logger = PandaLogger().getLogger('runMerger') - -_logger.debug("================= start ==================") - -# overall timeout value -overallTimeout = 60 - -# kill old process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout) - # get process list - scriptName = sys.argv[0] - out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName) - for line in out.split('\n'): - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill process : %s %s" % (type,value)) - -# time limit -timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=5) -timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(hours=12) -timeLimitX = datetime.datetime.utcnow() - datetime.timedelta(hours=6) - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -# thread pool -class ThreadPool: - def __init__(self): - self.lock = threading.Lock() - self.list = [] - - def add(self,obj): - self.lock.acquire() - self.list.append(obj) - self.lock.release() - - def remove(self,obj): - self.lock.acquire() - self.list.remove(obj) - self.lock.release() - - def join(self): - self.lock.acquire() - thrlist = tuple(self.list) - self.lock.release() - for thr in thrlist: - thr.join() - - -# thread to merge dataset -class MergerThr (threading.Thread): - def 
__init__(self,lock,proxyLock,datasets,pool): - threading.Thread.__init__(self) - self.datasets = datasets - self.lock = lock - self.proxyLock = proxyLock - self.pool = pool - self.maxTry = 3 - self.pool.add(self) - - def run(self): - self.lock.acquire() - try: - # loop over all datasets - for vuid,name,modDate,verNum in self.datasets: - try: - try: - verNum = int(verNum) - except: - verNum = 0 - _logger.debug("Merge %s %s %s" % (modDate,name,verNum)) - toBeClosed = False - # close old datasets anyway - if modDate < timeLimitX or verNum >= self.maxTry: - toBeClosed = True - # check version - dsSpec = taskBuffer.queryDatasetWithMap({'vuid':vuid}) - if dsSpec == None: - _logger.error("failed to get dataset spec for %s:%s" % (name,vuid)) - continue - try: - if int(dsSpec.version) != verNum+1: - _logger.debug("skip %s due to version mismatch %s != %s+1" % (name,dsSpec.version,verNum)) - continue - except: - _logger.error("failed to convert version='%s' to int for %s" % (dsSpec.version,name)) - continue - # get PandaID - self.proxyLock.acquire() - proxyS = taskBuffer.proxyPool.getProxy() - pandaID = proxyS.getPandaIDwithDestDBlock(name) - taskBuffer.proxyPool.putProxy(proxyS) - self.proxyLock.release() - if pandaID == None: - _logger.error("failed to find PandaID for %s" % name) - toBeClosed = True - else: - # get job - self.proxyLock.acquire() - pandaJob = taskBuffer.peekJobs([pandaID])[0] - self.proxyLock.release() - if pandaJob == None: - _logger.error("failed to get job for %s PandaID=%s" % (name,pandaID)) - toBeClosed = True - else: - # run merger - _logger.debug("run merger for %s" % name) - merger = Merger(taskBuffer,pandaJob) - mRet = merger.run() - if mRet == None: - _logger.debug("got unrecoverable for %s" % name) - toBeClosed = True - elif mRet == True: - _logger.debug("succeeded for %s" % name) - toBeClosed = True - else: - _logger.debug("failed for %s" % name) - # close dataset - if toBeClosed: - _logger.debug("close %s" % name) - self.proxyLock.acquire() - 
varMap = {} - varMap[':vuid'] = vuid - varMap[':status'] = 'tobeclosed' - taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", - varMap) - self.proxyLock.release() - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("Failed %s with %s:%s" % (name,errType,errValue)) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("MergerThr failed with %s:%s" % (errType,errValue)) - self.pool.remove(self) - self.lock.release() - - -# start merger -mergeLock = threading.Semaphore(3) -mergeProxyLock = threading.Lock() -mergeThreadPool = ThreadPool() -maxRows = 10000 -sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows -while True: - # lock - mergeLock.acquire() - # get datasets - mergeProxyLock.acquire() - varMap = {} - varMap[':modificationdateU'] = timeLimitU - varMap[':modificationdateL'] = timeLimitL - varMap[':type'] = 'output' - varMap[':status'] = 'tobemerged' - proxyS = taskBuffer.proxyPool.getProxy() - res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60',getVersion=True) - taskBuffer.proxyPool.putProxy(proxyS) - if res == None: - _logger.debug("# of datasets to be merged: %s" % res) - else: - _logger.debug("# of datasets to be merged: %s" % len(res)) - if res==None or len(res)==0: - mergeProxyLock.release() - mergeLock.release() - break - # release - mergeProxyLock.release() - mergeLock.release() - # run thread - iRows = 0 - nRows = 100 - while iRows < len(res): - mergerThr = MergerThr(mergeLock,mergeProxyLock,res[iRows:iRows+nRows],mergeThreadPool) - mergerThr.start() - iRows += nRows - mergeThreadPool.join() - if len(res) < maxRows: - break - - -_logger.debug("================= end ==================") diff --git a/current/pandaserver/test/runRebro.py b/current/pandaserver/test/runRebro.py deleted file mode 100755 index 494a0798d..000000000 --- 
a/current/pandaserver/test/runRebro.py +++ /dev/null @@ -1,198 +0,0 @@ -import os -import re -import sys -import pytz -import time -import fcntl -import types -import shelve -import random -import datetime -import commands -import threading -import userinterface.Client as Client -from dataservice.DDM import ddm -from dataservice.DDM import dashBorad -from taskbuffer.OraDBProxy import DBProxy -from taskbuffer.TaskBuffer import taskBuffer -from pandalogger.PandaLogger import PandaLogger -from jobdispatcher.Watcher import Watcher -from brokerage.SiteMapper import SiteMapper -from dataservice.Adder import Adder -from dataservice.Finisher import Finisher -from dataservice.MailUtils import MailUtils -from taskbuffer import ProcessGroups -import brokerage.broker_util -import brokerage.broker -import taskbuffer.ErrorCode -import dataservice.DDM - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# logger -_logger = PandaLogger().getLogger('runRebro') - -_logger.debug("===================== start =====================") - -# memory checker -def _memoryCheck(str): - try: - proc_status = '/proc/%d/status' % os.getpid() - procfile = open(proc_status) - name = "" - vmSize = "" - vmRSS = "" - # extract Name,VmSize,VmRSS - for line in procfile: - if line.startswith("Name:"): - name = line.split()[-1] - continue - if line.startswith("VmSize:"): - vmSize = "" - for item in line.split()[1:]: - vmSize += item - continue - if line.startswith("VmRSS:"): - vmRSS = "" - for item in line.split()[1:]: - vmRSS += item - continue - procfile.close() - _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str)) - except: - type, value, traceBack = sys.exc_info() - _logger.error("memoryCheck() : %s %s" % (type,value)) - _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str)) - return - -_memoryCheck("start") - -# kill old process -try: - # time limit - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=7) - 
# get process list - scriptName = sys.argv[0] - out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName) - for line in out.split('\n'): - items = line.split() - # owned process - if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron - continue - # look for python - if re.search('python',line) == None: - continue - # PID - pid = items[1] - # start time - timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) - startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) - # kill old process - if startTime < timeLimit: - _logger.debug("old process : %s %s" % (pid,startTime)) - _logger.debug(line) - commands.getoutput('kill -9 %s' % pid) -except: - type, value, traceBack = sys.exc_info() - _logger.error("kill process : %s %s" % (type,value)) - - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -# instantiate sitemapper -siteMapper = SiteMapper(taskBuffer) - -_memoryCheck("rebroker") - -# rebrokerage -_logger.debug("Rebrokerage start") -try: - normalTimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=24) - sortTimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) - sql = "SELECT jobDefinitionID,prodUserName,prodUserID,computingSite,MAX(modificationTime) FROM ATLAS_PANDA.jobsActive4 " - sql += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus=:jobStatus " - sql += "AND modificationTime<:modificationTime " - sql += "AND jobsetID IS NOT NULL " - sql += "AND processingType IN (:processingType1,:processingType2) " - sql += "GROUP BY jobDefinitionID,prodUserName,prodUserID,computingSite " - varMap = {} - varMap[':prodSourceLabel1'] = 'user' - varMap[':prodSourceLabel2'] = 'panda' - varMap[':modificationTime'] = sortTimeLimit - varMap[':processingType1'] = 'pathena' - varMap[':processingType2'] = 'prun' - varMap[':jobStatus'] = 'activated' - # get jobs older than threshold - ret,res = 
taskBuffer.querySQLS(sql, varMap) - sql = "SELECT PandaID,modificationTime FROM %s WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " - sql += "AND modificationTime>:modificationTime AND rownum <= 1" - if res != None: - from userinterface.ReBroker import ReBroker - recentRuntimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) - # loop over all user/jobID combinations - iComb = 0 - nComb = len(res) - _logger.debug("total combinations = %s" % nComb) - for jobDefinitionID,prodUserName,prodUserID,computingSite,maxModificationTime in res: - # check time if it is closed to log-rotate - timeNow = datetime.datetime.now(pytz.timezone('Europe/Zurich')) - timeCron = timeNow.replace(hour=4,minute=0,second=0,microsecond=0) - if (timeNow-timeCron) < datetime.timedelta(seconds=60*10) and \ - (timeCron-timeNow) < datetime.timedelta(seconds=60*30): - _logger.debug("terminate since close to log-rotate time") - break - # check if jobs with the jobID have run recently - varMap = {} - varMap[':prodUserName'] = prodUserName - varMap[':jobDefinitionID'] = jobDefinitionID - varMap[':modificationTime'] = recentRuntimeLimit - _logger.debug(" rebro:%s/%s:ID=%s:%s" % (iComb,nComb,jobDefinitionID,prodUserName)) - iComb += 1 - hasRecentJobs = False - # check site - if not siteMapper.checkSite(computingSite): - _logger.debug(" -> skip unknown site=%s" % computingSite) - continue - # check site status - tmpSiteStatus = siteMapper.getSite(computingSite).status - if not tmpSiteStatus in ['offline','test']: - # use normal time limit for nornal site status - if maxModificationTime > normalTimeLimit: - _logger.debug(" -> skip wait for normal timelimit=%s skip %s ran recently at %s" % (resU[0][0],resU[0][1])) - break - else: - _logger.debug(" -> immidiate rebro due to site status=%s" % tmpSiteStatus) - if hasRecentJobs: - # skip since some jobs have run recently - continue - else: - reBroker = ReBroker(taskBuffer) - # try to lock - rebRet,rebOut = 
reBroker.lockJob(prodUserID,jobDefinitionID) - if not rebRet: - # failed to lock - _logger.debug(" -> failed to lock : %s" % rebOut) - continue - else: - # start - _logger.debug(" -> start") - reBroker.start() - reBroker.join() -except: - errType,errValue = sys.exc_info()[:2] - _logger.error("rebrokerage failed with %s:%s" % (errType,errValue)) - -_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/setPriority.py b/current/pandaserver/test/setPriority.py deleted file mode 100755 index 7dab5b3c2..000000000 --- a/current/pandaserver/test/setPriority.py +++ /dev/null @@ -1,30 +0,0 @@ -import time -import sys -import optparse - - -from taskbuffer.OraDBProxy import DBProxy - -# password -from config import panda_config - -usage = """%prog - - Set a priority to jobs in a task""" - -optP = optparse.OptionParser(usage=usage,conflict_handler="resolve") -options,args = optP.parse_args() - - -proxyS = DBProxy() -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -varMap = {} -varMap[':prodSourceLabel'] = 'managed' -varMap[':taskID'] = sys.argv[1] -varMap[':prio'] = sys.argv[2] -sql = "UPDATE %s SET currentPriority=:prio WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID" -for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: - status,res = proxyS.querySQLS(sql % table,varMap) - - diff --git a/current/pandaserver/test/testDB.py b/current/pandaserver/test/testDB.py deleted file mode 100755 index 752bf3f77..000000000 --- a/current/pandaserver/test/testDB.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/python - -""" -test DB access - -""" - -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec -from taskbuffer.DatasetSpec import DatasetSpec -from taskbuffer.DBProxyPool import DBProxyPool - -import getpass -passwd = getpass.getpass() - -pool = DBProxyPool('adbpro.usatlas.bnl.gov',passwd,2) - -proxy = 
pool.getProxy() - -import sys -import commands - -job1 = JobSpec() -job1.PandaID='NULL' -job1.jobStatus='unknown' -job1.computingSite="aaa" -f11 = FileSpec() -f11.lfn = 'in1.pool.root' -f11.type = 'input' -job1.addFile(f11) -f12 = FileSpec() -f12.lfn = 'out1.pool.root' -f12.type = 'output' -job1.addFile(f12) - -job2 = JobSpec() -job2.PandaID='NULL' -job2.jobStatus='unknown' -job2.computingSite="bbb" -f21 = FileSpec() -f21.lfn = 'in2.pool.root' -f21.type = 'input' -job2.addFile(f21) -f22 = FileSpec() -f22.lfn = 'out2.pool.root' -f22.type = 'output' -job2.addFile(f22) - -proxy.insertNewJob(job1) -proxy.insertNewJob(job2) -print "Inserted %d %d" % (job1.PandaID,job2.PandaID) -proxy.activateJob(job1) -proxy.activateJob(job2) -print "activated" -ret = proxy.getJobs(1,"aaa") -print "Got Jobs" -for j in ret: - print j.PandaID -print proxy.peekJob(job1.PandaID).jobStatus -proxy.updateJobStatus(job1.PandaID,"unknown") -print " ->" ,proxy.peekJob(job1.PandaID).jobStatus - -print proxy.peekJob(job2.PandaID).jobStatus -job2.jobStatus = "running" -proxy.updateJob(job2,False) -print " ->" ,proxy.peekJob(job2.PandaID).jobStatus -print "Updated" -proxy.archiveJob(job1,False) -proxy.archiveJobLite(job2.PandaID,job2.jobStatus) -print "Archived" -proxy.querySQL("DELETE FROM jobsArchived3 WHERE PandaID=%d" % job1.PandaID) -proxy.querySQL("DELETE FROM jobsArchived3 WHERE PandaID=%d" % job2.PandaID) -print "job Deleted" - -print "dataset" -dataset = DatasetSpec() -dataset.vuid = commands.getoutput('/usr/bin/uuidgen') -dataset.name = 'test.%s' % dataset.vuid - -proxy.insertDataset(dataset) -print dataset.vuid -dataset2 = proxy.queryDataset(dataset.vuid) -print dataset2.values() -dataset2.type = 'test' -proxy.updateDataset(dataset2) -dataset3 = proxy.queryDataset(dataset.vuid) -print dataset3.values() -proxy.querySQL("DELETE FROM Datasets WHERE vuid='%s'" % dataset.vuid) diff --git a/current/pandaserver/test/testDQ.py b/current/pandaserver/test/testDQ.py deleted file mode 100755 index 
381cdece8..000000000 --- a/current/pandaserver/test/testDQ.py +++ /dev/null @@ -1,102 +0,0 @@ -import commands -from dataservice.DDM import ddm - -#print ddm.DQ2ProductionClient.generateUUID() -#print ddm.DQ2.getFilesFromCatalog('aho.xml') -#print ddm.DQ2ProductionClient.dq2_makeblocks('input.data') - -ids=['pandatest.000003.dd.input._00047.junk','09801b0a-9fd0-4237-8caf-a37932c26e39', - 'pandatest.000003.dd.input._00050.junk','6dd3d367-4aa3-4e1a-9ac3-9ad14b7311f4', - 'pandatest.000003.dd.input._00037.junk','817c2c92-467b-4a1b-9482-f2ec8468cf2e', - 'pandatest.000003.dd.input._00021.junk','7720527f-817e-40c7-9e29-ce237f59edfa', - 'pandatest.000003.dd.input._00023.junk','5f1f9982-85a3-4d1a-9ee9-f1de22c02544', - 'pandatest.000003.dd.input._00042.junk','610cc91a-c731-4bce-ac7a-ff5133e7d18b', - 'pandatest.000003.dd.input._00027.junk','bd987478-3c59-4551-b12b-2853bac25613', - 'pandatest.000003.dd.input._00032.junk','9d0424f3-7552-4282-92f2-dfe74e9a6c12', - 'pandatest.000003.dd.input._00009.junk','dce33d4a-4569-49ee-95c5-b619b161c777', - 'pandatest.000003.dd.input._00036.junk','2fc9836b-82d6-41b0-b966-a5c37662172d', - 'pandatest.000003.dd.input._00031.junk','65b957e0-5ecc-44bb-a1f9-cccb61ca2d16', - 'pandatest.000003.dd.input._00025.junk','be29fe82-17e2-4122-b4c8-f49a0b76c81f', - 'pandatest.000003.dd.input._00029.junk','afa4322f-409b-4327-9169-229d8d48ad5a', - 'pandatest.000003.dd.input._00013.junk','cf236d3b-45fd-4b58-bdfb-59abc983c886', - 'pandatest.000003.dd.input._00020.junk','b02f98da-0138-4b58-89ba-a88f37214a89', - 'pandatest.000003.dd.input._00001.junk','12ab5bb9-944e-4e75-bb90-b64c462d4cd8', - 'pandatest.000003.dd.input._00001.junk','12ab5bb9-944e-4e75-bb90-b64c462d4cd8', - 'pandatest.000003.dd.input._00006.junk','c0a422ad-e9f1-44bb-9539-cfef7e739da2', - 'pandatest.000003.dd.input._00034.junk','da670db3-3638-4f06-b650-a9315eb2bd63', - 'pandatest.000003.dd.input._00046.junk','2fcef270-2e41-472d-83c0-53749b401b74', - 
'pandatest.000003.dd.input._00012.junk','5e212fa1-201f-494d-a2b2-420b229b08fc', - 'pandatest.000003.dd.input._00044.junk','87c8ebcc-a637-4204-b77b-8219e68b98d7', - 'pandatest.000003.dd.input._00030.junk','87ad811f-7d39-43d9-8a13-e117079bb208', - 'pandatest.000003.dd.input._00022.junk','6b902506-1ee1-46b1-a105-1521a8c0dbca', - 'pandatest.000003.dd.input._00017.junk','2bbed213-943c-41be-b9d7-7d86a309b0b2', - 'pandatest.000003.dd.input._00049.junk','8366e269-f9ae-4b9c-bd98-df4027c992c7', - 'pandatest.000003.dd.input._00015.junk','f3c5f37c-b4c2-4933-9633-467ba3a7c364', - 'pandatest.000003.dd.input._00004.junk','35d66be2-9d21-44a3-96f7-903a7abf4a87', - 'pandatest.000003.dd.input._00010.junk','2279ea3e-ebbb-4b19-9a69-9868f0cce694', - 'pandatest.000003.dd.input._00040.junk','a847dbbb-4f98-4b5b-b353-e29e3e3b3fd5', - 'pandatest.000003.dd.input._00007.junk','abfef002-62ca-4d84-9813-6329764e38bd', - 'pandatest.000003.dd.input._00048.junk','52854023-67d8-4a0f-99ac-bb1f0bd1dc98', - 'pandatest.000003.dd.input._00016.junk','bddf7441-6ac9-4087-bafe-32e47448cdc1', - 'pandatest.000003.dd.input._00041.junk','c76999ba-4cdf-49e9-bfa5-ff3525fbf1ab', - 'pandatest.000003.dd.input._00003.junk','4865119e-367f-4dd8-bdff-505bd878dfde', - 'pandatest.000003.dd.input._00019.junk','b9fce1fd-8d4c-4fc4-932f-12b13263ca0c', - 'pandatest.000003.dd.input._00011.junk','f93a4e08-fd4f-45fc-b324-91ff59555b1c', - 'pandatest.000003.dd.input._00018.junk','e4894561-9589-40d8-871b-b57d70564384', - 'pandatest.000003.dd.input._00002.junk','58934980-5ab3-4a66-b3da-55f86d4b54bd', - 'pandatest.000003.dd.input._00005.junk','5993fe60-bc8c-4fd8-aac1-dfd55700c9c3', - 'pandatest.000003.dd.input._00028.junk','6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27', - 'pandatest.000003.dd.input._00033.junk','98f79ba1-1793-4253-aac7-bdf90a51d1ee', - 'pandatest.000003.dd.input._00039.junk','33660dd5-7cef-422a-a7fc-6c24cb10deb1', - 'pandatest.000003.dd.input._00014.junk','5c0e9ed8-05a6-41c4-8c07-39b2be33ebc1', - 
'pandatest.000003.dd.input._00008.junk','b0c184d1-5f5e-45a6-9cc8-8b0f20a85463', - 'pandatest.000003.dd.input._00038.junk','b9171997-4d2b-4075-b154-579ebe9438fa', - 'pandatest.000003.dd.input._00026.junk','89e5bdf1-15de-44ae-a388-06c1e7d7e2fc', - 'pandatest.000003.dd.input._00024.junk','c77b77a2-e6d1-4360-8751-19d9fb77e1f1', - 'pandatest.000003.dd.input._00043.junk','cc6ac2a1-4616-4551-80a7-d96f79252b64', - 'pandatest.000003.dd.input._00045.junk','ddbed17a-6d65-4e8d-890a-21e1eaa3e9d6', - 'pandatest.000003.dd.input._00035.junk','8ed1875a-eb90-4906-8fc4-0449d300ddfe' - ] - -for i in range(1): - datasetName='testDQ.%s' % commands.getoutput('/usr/bin/uuidgen') - print datasetName - - #['pandatest.000003.dd.input._00004.junk','35d66be2-9d21-44a3-96f7-903a7abf4a87'] - #'pandatest.000003.dd.input._00028.junk','6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27', - # 'pandatest.000003.dd.input._00033.junk','98f79ba1-1793-4253-aac7-bdf90a51d1ee'] - print (['registerNewDataset','-c',datasetName]+ids[i*2:i*2+2]) - ddm.DQ2.main(['registerNewDataset','-c',datasetName]+ids[i*2:i*2+2]) - ''' - status,out = ddm.RepositoryClient.main(['queryDatasetByName',datasetName]) - exec "vuids = %s" % out.split('\n')[0] - if vuids.has_key(datasetName): - vuid = vuids[datasetName] - print vuid - status,out = ddm.RepositoryClient.main(['resolveVUID',vuid]) - status,out = ddm.DQ2.getFilesFromCatalog('baka.xml') - exec "rets = %s" % out.split('\n')[0] - print rets[0] - exec "ids = %s" % out - print ddm.DQ2.main(['addFilesToDataset',datasetName]+ids) - status,out = ddm.DQ2.main(['listFilesInDataset',datasetName]) - print out - ''' - print (['registerDatasetLocations','-c',datasetName,'http://dms02.usatlas.bnl.gov/sites/bnl/lrc']) - ddm.DQ2.main(['registerDatasetLocations','-c',datasetName, - 'http://dms02.usatlas.bnl.gov/sites/bnl/lrc']) - print (['registerDatasetSubscription',datasetName,'http://doe-dhcp241.bu.edu:8000/dq2/']) - 
ddm.DQ2.main(['registerDatasetSubscription',datasetName,'http://doe-dhcp241.bu.edu:8000/dq2/']) -#print ddm.DQ2.main(['eraseDataset',datasetName]) - -#print ddm.DQ2.main(['eraseDataset',datasetName]) -#print ddm.DQ2ProductionClient.dq2_create_dataset(datasetName) -#status,out = ddm.DQ2ProductionClient.dq2_assign_destination(datasetName,'BNL_SE') -#print out -#print ddm.DQ2.main(['eraseDataset',datasetName]) -#status,out = ddm.DQ2.main(['listFilesInDataset','panda.destDB.11aed982-8079-4db9-964c-37a284b8597a']) -#print out - -ddm.DQ2_iter.listFileReplicasBySites('mc11_7TeV.151900.madgraph_SM_SG_SS_direct_1200_600_395.merge.AOD.e1095_a131_s1353_a145_r2993_tid723983_00', - 0,['SARA-MATRIX_DATADISK'], - 0,300) diff --git a/current/pandaserver/test/testEvgen.py b/current/pandaserver/test/testEvgen.py deleted file mode 100755 index db636a439..000000000 --- a/current/pandaserver/test/testEvgen.py +++ /dev/null @@ -1,59 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = None - -jobList = [] - -for i in range(1): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) - job.AtlasRelease = 'Atlas-14.1.0' - job.homepackage = 'AtlasProduction/14.1.0.3' - job.transformation = 'csc_evgen_trf.py' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.currentPriority = 100 - job.prodSourceLabel = 'test' - job.computingSite = site - job.cloud = 'US' - job.cmtConfig = 'i686-slc4-gcc34-opt' - - file = FileSpec() - file.lfn = "%s.evgen.pool.root" % job.jobName - file.destinationDBlock = job.destinationDBlock - file.destinationSE = job.destinationSE - file.dataset = job.destinationDBlock - file.destinationDBlockToken = 'ATLASDATADISK' - 
file.type = 'output' - job.addFile(file) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="5144 1 5000 1 CSC.005144.PythiaZee.py %s NONE NONE NONE" % file.lfn - jobList.append(job) - -for i in range(1): - s,o = Client.submitJobs(jobList) - print "---------------------" - print s - for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testEvgen14.py b/current/pandaserver/test/testEvgen14.py deleted file mode 100755 index af53c0e95..000000000 --- a/current/pandaserver/test/testEvgen14.py +++ /dev/null @@ -1,59 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = 'BNL_SE' - -jobList = [] - -for i in range(1): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) - job.AtlasRelease = 'Atlas-14.1.0' - job.homepackage = 'AtlasProduction/14.1.0.3' - job.transformation = 'csc_evgen_trf.py' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.currentPriority = 1000 - job.prodSourceLabel = 'test' - job.computingSite = site - job.processingType = 'test' - job.cmtConfig = 'i686-slc4-gcc34-opt' - - file = FileSpec() - file.lfn = "%s.evgen.pool.root" % job.jobName - file.destinationDBlock = job.destinationDBlock - file.destinationSE = job.destinationSE - file.dataset = job.destinationDBlock - file.destinationDBlockToken = 'ATLASDATADISK' - file.type = 'output' - job.addFile(file) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % 
job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="5144 1 5000 1 CSC.005144.PythiaZee.py %s NONE NONE NONE" % file.lfn - jobList.append(job) - -for i in range(1): - s,o = Client.submitJobs(jobList) - print "---------------------" - print s - for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testEvgen15.py b/current/pandaserver/test/testEvgen15.py deleted file mode 100755 index 0753e3329..000000000 --- a/current/pandaserver/test/testEvgen15.py +++ /dev/null @@ -1,57 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -site = sys.argv[1] -cloud = sys.argv[2] - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = None - -jobList = [] - -for i in range(1): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) - job.AtlasRelease = 'Atlas-15.6.10' - job.homepackage = 'AtlasProduction/15.6.10.1' - job.transformation = 'Evgen_trf.py' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.currentPriority = 10000 - job.prodSourceLabel = 'test' - job.computingSite = site - job.cloud = cloud - job.cmtConfig = 'i686-slc5-gcc43-opt' - - file = FileSpec() - file.lfn = "%s.evgen.pool.root" % job.jobName - file.destinationDBlock = job.destinationDBlock - file.destinationSE = job.destinationSE - file.dataset = job.destinationDBlock - file.destinationDBlockToken = 'ATLASDATADISK' - file.type = 'output' - job.addFile(file) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = 
job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="10000 105815 12330001 5000 12467 MC9.105815.JF140_pythia_jet_filter.py %s NONE NONE NONE MC09JobOpts-00-01-88.tar.gz" % file.lfn - jobList.append(job) - -for i in range(1): - s,o = Client.submitJobs(jobList) - print "---------------------" - print s - for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testEvgen16.py b/current/pandaserver/test/testEvgen16.py deleted file mode 100755 index 0c0cc67f4..000000000 --- a/current/pandaserver/test/testEvgen16.py +++ /dev/null @@ -1,57 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -site = sys.argv[1] -cloud = sys.argv[2] - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = None - -jobList = [] - -for i in range(1): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) - job.AtlasRelease = 'Atlas-16.6.2' - job.homepackage = 'AtlasProduction/16.6.2.1' - job.transformation = 'Evgen_trf.py' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.currentPriority = 10000 - job.prodSourceLabel = 'test' - job.computingSite = site - job.cloud = cloud - job.cmtConfig = 'i686-slc5-gcc43-opt' - - file = FileSpec() - file.lfn = "%s.evgen.pool.root" % job.jobName - file.destinationDBlock = job.destinationDBlock - file.destinationSE = job.destinationSE - file.dataset = job.destinationDBlock - file.destinationDBlockToken = 'ATLASDATADISK' - file.type = 'output' - job.addFile(file) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - 
fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="2760 105048 19901 101 200 MC10.105048.PythiaB_ccmu3mu1X.py %s NONE NONE NONE MC10JobOpts-latest-test.tar.gz" % file.lfn - jobList.append(job) - -for i in range(1): - s,o = Client.submitJobs(jobList) - print "---------------------" - print s - for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testEvgen17.py b/current/pandaserver/test/testEvgen17.py deleted file mode 100755 index ce808e4e6..000000000 --- a/current/pandaserver/test/testEvgen17.py +++ /dev/null @@ -1,58 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -site = sys.argv[1] -cloud = sys.argv[2] - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = None - -jobList = [] - -for i in range(1): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) - job.AtlasRelease = 'Atlas-17.0.5' - job.homepackage = 'AtlasProduction/17.0.5.6' - job.transformation = 'Evgen_trf.py' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.currentPriority = 10000 - job.prodSourceLabel = 'test' - job.computingSite = site - job.cloud = cloud - job.cmtConfig = 'i686-slc5-gcc43-opt' - - file = FileSpec() - file.lfn = "%s.evgen.pool.root" % job.jobName - file.destinationDBlock = job.destinationDBlock - file.destinationSE = job.destinationSE - file.dataset = job.destinationDBlock - file.destinationDBlockToken = 'ATLASDATADISK' - file.type = 'output' - job.addFile(file) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="7000 108316 1 5000 1 
MC11.108316.Pythia8_minbias_ND.py %s" % file.lfn - - jobList.append(job) - -for i in range(1): - s,o = Client.submitJobs(jobList) - print "---------------------" - print s - for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testFinder.py b/current/pandaserver/test/testFinder.py deleted file mode 100644 index 09bb9574d..000000000 --- a/current/pandaserver/test/testFinder.py +++ /dev/null @@ -1,69 +0,0 @@ -import sys -from taskbuffer.OraDBProxy import DBProxy - -from dataservice import AddressFinder - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# instantiate DB proxies -proxyS = DBProxy(True) -proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) - -# get DN and address -status,res = proxyS.querySQLS("SELECT dn,email,name FROM ATLAS_PANDAMETA.users",{},arraySize=1000000) -if res == None: - print "SQL error" - sys.exit(0) - -# to upper chrs -def toUpper(emails): - retA = [] - for email in emails: - retA.append(email.upper()) - return retA - -outF = open('newemail.sql','w') - -for dn,origEmail,name in res: - if dn == None: - dn = name - if dn == None: - continue - emailsP = AddressFinder.getEmailPhonebook(dn) - emailsX = AddressFinder.getEmailXwho(dn) - if toUpper(emailsP) != toUpper(emailsX) and len(emailsP) != 0: - print dn - print "ERROR : xwho != phone" - print "phone : %s" % str(emailsP) - print "xwho : %s" % str(emailsX) - print "DB : %s" % origEmail - print - elif len(emailsP) == 0: - print dn - print "ERROR : not found" - print "DB : %s" % origEmail - print - elif len(emailsP) > 1: - print dn - print "ERROR : non-unique %s" % str(emailsP) - print "DB : %s" % origEmail - print - elif origEmail == None or origEmail.upper() != emailsP[0].upper() and origEmail != 'notsend': - print dn - print "phone : %s" % str(emailsP) - print "xwho : %s" % str(emailsX) - print "ERROR : %-40s new: %s\n" % (origEmail,emailsP[0]) - outF.write("/* %-40s new: %s */\n" % 
(origEmail,emailsP[0])) - outF.write("UPDATE atlas_pandameta.users SET email='%s' WHERE name='%s';\n" % (emailsP[0],name)) - pass - else: - pass - #print dn - #print "OK" - -outF.write('COMMIT;') -outF.close() - - diff --git a/current/pandaserver/test/testG4sim.py b/current/pandaserver/test/testG4sim.py deleted file mode 100755 index b2f8f2f9a..000000000 --- a/current/pandaserver/test/testG4sim.py +++ /dev/null @@ -1,83 +0,0 @@ -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = 'BNL_ATLAS_2' -#destName = 'BU_ATLAS_Tier2' - -files = { - 'mc11.007204.singlepart_mu4.evgen.EVNT.v11000302._00037.pool.root.1':None, - 'mc11.007204.singlepart_mu4.evgen.EVNT.v11000302._00038.pool.root.1':None, - } - -jobList = [] - -for lfn in files.keys(): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = commands.getoutput('uuidgen') - job.AtlasRelease = 'Atlas-11.0.3' - job.homepackage = 'JobTransforms-11-00-03-02' - job.transformation = 'share/csc.simul.trf' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.computingSite = site - job.prodDBlock = 'mc11.007204.singlepart_mu4.evgen.EVNT.v11000302' - job.cmtConfig = 'i686-slc4-gcc34-opt' - - job.prodSourceLabel = 'test' - job.currentPriority = 1000 - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - - fileOE = FileSpec() - fileOE.lfn = "%s.HITS.pool.root" % commands.getoutput('uuidgen') - fileOE.destinationDBlock = job.destinationDBlock - fileOE.destinationSE = job.destinationSE - fileOE.dataset = job.destinationDBlock - fileOE.destinationDBlockToken = 'ATLASDATADISK' - fileOE.type = 'output' - job.addFile(fileOE) 
- - fileOA = FileSpec() - fileOA.lfn = "%s.RDO.pool.root" % commands.getoutput('uuidgen') - fileOA.destinationDBlock = job.destinationDBlock - fileOA.destinationSE = job.destinationSE - fileOA.dataset = job.destinationDBlock - fileOA.destinationDBlockToken = 'ATLASDATADISK' - fileOA.type = 'output' - job.addFile(fileOA) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen') - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="%s %s %s 100 700 2158" % (fileI.lfn,fileOE.lfn,fileOA.lfn) - - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testG4sim15.py b/current/pandaserver/test/testG4sim15.py deleted file mode 100644 index 19b8d4e4b..000000000 --- a/current/pandaserver/test/testG4sim15.py +++ /dev/null @@ -1,88 +0,0 @@ -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -site = sys.argv[1] -cloud = sys.argv[2] - -prodDBlock = 'mc09_10TeV.105807.JF35_pythia_jet_filter.evgen.EVNT.e469_tid095268' -inputFile = 'EVNT.095268._000110.pool.root.1' - -if len(sys.argv)==5: - site = sys.argv[1] - cloud = sys.argv[2] - prodDBlock = sys.argv[3] - inputFile = sys.argv[4] - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') - -files = { - inputFile:None, - } - -jobList = [] - -index = 0 -for lfn in files.keys(): - index += 1 - job = JobSpec() - job.jobDefinitionID = (time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) - job.AtlasRelease = 'Atlas-15.3.1' - job.homepackage = 'AtlasProduction/15.3.1.5' - job.transformation = 'csc_atlasG4_trf.py' - 
job.destinationDBlock = datasetName - job.computingSite = site - job.prodDBlock = prodDBlock - - job.prodSourceLabel = 'test' - job.processingType = 'test' - job.currentPriority = 10000 - job.cloud = cloud - job.cmtConfig = 'i686-slc4-gcc34-opt' - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - - fileD = FileSpec() - fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v070302' - fileD.prodDBlock = fileD.dataset - fileD.lfn = 'DBRelease-7.3.2.tar.gz' - fileD.type = 'input' - job.addFile(fileD) - - fileOA = FileSpec() - fileOA.lfn = "%s.HITS.pool.root" % job.jobName - fileOA.destinationDBlock = job.destinationDBlock - fileOA.destinationSE = job.destinationSE - fileOA.dataset = job.destinationDBlock - fileOA.destinationDBlockToken = 'ATLASDATADISK' - fileOA.type = 'output' - job.addFile(fileOA) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="%s %s 5 1850 8738 ATLAS-GEO-08-00-01 QGSP_BERT VertexPos.py %s OFLCOND-SIM-01-00-00 False s595" % \ - (fileI.lfn,fileOA.lfn,fileD.lfn) - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testG4sim16.py b/current/pandaserver/test/testG4sim16.py deleted file mode 100644 index c540c4cba..000000000 --- a/current/pandaserver/test/testG4sim16.py +++ /dev/null @@ -1,88 +0,0 @@ -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -site = sys.argv[1] -cloud = sys.argv[2] - -prodDBlock = 
'mc10_7TeV.105001.pythia_minbias.evgen.EVNT.e574_tid153937_00' -inputFile = 'EVNT.153937._000184.pool.root.1' - -if len(sys.argv)==5: - site = sys.argv[1] - cloud = sys.argv[2] - prodDBlock = sys.argv[3] - inputFile = sys.argv[4] - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') - -files = { - inputFile:None, - } - -jobList = [] - -index = 0 -for lfn in files.keys(): - index += 1 - job = JobSpec() - job.jobDefinitionID = (time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) - job.AtlasRelease = 'Atlas-16.6.2' - job.homepackage = 'AtlasProduction/16.6.2.1' - job.transformation = 'AtlasG4_trf.py' - job.destinationDBlock = datasetName - job.computingSite = site - job.prodDBlock = prodDBlock - - job.prodSourceLabel = 'test' - job.processingType = 'test' - job.currentPriority = 10000 - job.cloud = cloud - job.cmtConfig = 'i686-slc5-gcc43-opt' - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - - fileD = FileSpec() - fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v140201' - fileD.prodDBlock = fileD.dataset - fileD.lfn = 'DBRelease-14.2.1.tar.gz' - fileD.type = 'input' - job.addFile(fileD) - - fileOA = FileSpec() - fileOA.lfn = "%s.HITS.pool.root" % job.jobName - fileOA.destinationDBlock = job.destinationDBlock - fileOA.destinationSE = job.destinationSE - fileOA.dataset = job.destinationDBlock - fileOA.destinationDBlockToken = 'ATLASDATADISK' - fileOA.type = 'output' - job.addFile(fileOA) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters='inputEvgenFile=%s outputHitsFile=%s maxEvents=3 skipEvents=1700 DBRelease=%s 
preInclude=SimuJobTransforms/VertexFromCondDB.py postExec="from InDetBeamSpotService.InDetBeamSpotServiceConf import BeamCondSvc;ServiceMgr+=BeamCondSvc();ServiceMgr.BeamCondSvc.useDB=False;ServiceMgr.BeamCondSvc.posX=0.1352;ServiceMgr.BeamCondSvc.posY=1.1621;ServiceMgr.BeamCondSvc.posZ=2.87;ServiceMgr.BeamCondSvc.sigmaX=0;ServiceMgr.BeamCondSvc.sigmaY=0;ServiceMgr.BeamCondSvc.sigmaZ=0" geometryVersion=ATLAS-GEO-16-00-00 conditionsTag=OFLCOND-SDR-BS7T-02 AMITag=s1019 randomSeed=568 physicsList=QGSP_BERT firstEvent=1701 RunNumber=106047' % \ - (fileI.lfn,fileOA.lfn,fileD.lfn) - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testG4sim17.py b/current/pandaserver/test/testG4sim17.py deleted file mode 100644 index 0b53acb0d..000000000 --- a/current/pandaserver/test/testG4sim17.py +++ /dev/null @@ -1,88 +0,0 @@ -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -site = sys.argv[1] -cloud = sys.argv[2] - -prodDBlock = 'mc10_7TeV.105001.pythia_minbias.evgen.EVNT.e574_tid153937_00' -inputFile = 'EVNT.153937._000184.pool.root.1' - -if len(sys.argv)==5: - site = sys.argv[1] - cloud = sys.argv[2] - prodDBlock = sys.argv[3] - inputFile = sys.argv[4] - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') - -files = { - inputFile:None, - } - -jobList = [] - -index = 0 -for lfn in files.keys(): - index += 1 - job = JobSpec() - job.jobDefinitionID = (time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) - job.AtlasRelease = 'Atlas-17.0.5' - job.homepackage = 'AtlasProduction/17.0.5.6' - job.transformation = 'AtlasG4_trf.py' - job.destinationDBlock = datasetName - job.computingSite = site - job.prodDBlock = prodDBlock - - job.prodSourceLabel = 'test' - job.processingType = 'test' - 
job.currentPriority = 10000 - job.cloud = cloud - job.cmtConfig = 'i686-slc5-gcc43-opt' - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - - fileD = FileSpec() - fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v170602' - fileD.prodDBlock = fileD.dataset - fileD.lfn = 'DBRelease-17.6.2.tar.gz' - fileD.type = 'input' - job.addFile(fileD) - - fileOA = FileSpec() - fileOA.lfn = "%s.HITS.pool.root" % job.jobName - fileOA.destinationDBlock = job.destinationDBlock - fileOA.destinationSE = job.destinationSE - fileOA.dataset = job.destinationDBlock - fileOA.destinationDBlockToken = 'ATLASDATADISK' - fileOA.type = 'output' - job.addFile(fileOA) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters='inputEvgenFile=%s outputHitsFile=%s maxEvents=3 skipEvents=0 DBRelease=%s geometryVersion=ATLAS-GEO-18-01-03_VALIDATION conditionsTag=OFLCOND-SDR-BS7T-05-14 randomSeed=1 physicsList=QGSP_BERT RunNumber=116870 firstEvent=1' % (fileI.lfn,fileOA.lfn,fileD.lfn) - - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testGetJobStatus.py b/current/pandaserver/test/testGetJobStatus.py deleted file mode 100755 index 4e47c2547..000000000 --- a/current/pandaserver/test/testGetJobStatus.py +++ /dev/null @@ -1,17 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client - -id = sys.argv[1] - -s,o = Client.getJobStatus([id]) -print s -if s == 0: - for job in o: - if job == None: - continue - print job.PandaID - for file in job.Files: - print file.lfn,file.type - diff --git 
a/current/pandaserver/test/testMultiTRF.py b/current/pandaserver/test/testMultiTRF.py deleted file mode 100755 index c9fcd9853..000000000 --- a/current/pandaserver/test/testMultiTRF.py +++ /dev/null @@ -1,95 +0,0 @@ -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') - -index = 0 - -job = JobSpec() -job.jobDefinitionID = int(time.time()) % 10000 -job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) -job.AtlasRelease = 'Atlas-14.1.0\nAtlas-14.1.0' -job.homepackage = 'AtlasProduction/14.1.0.3\nAtlasProduction/14.1.0.3' -job.transformation = 'csc_digi_trf.py\ncsc_reco_trf.py' -job.destinationDBlock = datasetName - -job.computingSite = site - -job.prodDBlock = 'valid1.005200.T1_McAtNlo_Jimmy.simul.HITS.e322_s429_tid022081' - -job.prodSourceLabel = 'test' -job.currentPriority = 10000 -job.cloud = 'US' - -for lfn in ['HITS.022081._00001.pool.root','HITS.022081._00002.pool.root']: - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - -fileD1 = FileSpec() -fileD1.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v050001' -fileD1.prodDBlock = fileD1.dataset -fileD1.lfn = 'DBRelease-5.0.1.tar.gz' -fileD1.type = 'input' -job.addFile(fileD1) - -fileD2 = FileSpec() -fileD2.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v050101' -fileD2.prodDBlock = fileD2.dataset -fileD2.lfn = 'DBRelease-5.1.1.tar.gz' -fileD2.type = 'input' -job.addFile(fileD2) - -fileOE = FileSpec() -fileOE.lfn = "%s.ESD.pool.root" % job.jobName -fileOE.destinationDBlock = job.destinationDBlock -fileOE.destinationSE = job.destinationSE -fileOE.dataset = job.destinationDBlock -fileOE.type = 'output' -job.addFile(fileOE) - -fileOA = FileSpec() -fileOA.lfn = 
"%s.AOD.pool.root" % job.jobName -fileOA.destinationDBlock = job.destinationDBlock -fileOA.destinationSE = job.destinationSE -fileOA.dataset = job.destinationDBlock -fileOA.type = 'output' -job.addFile(fileOA) - -fileOC = FileSpec() -fileOC.lfn = "%s.NTUP.root" % job.jobName -fileOC.destinationDBlock = job.destinationDBlock -fileOC.destinationSE = job.destinationSE -fileOC.dataset = job.destinationDBlock -fileOC.type = 'output' -job.addFile(fileOC) - -fileOL = FileSpec() -fileOL.lfn = "%s.job.log.tgz" % job.jobName -fileOL.destinationDBlock = job.destinationDBlock -fileOL.destinationSE = job.destinationSE -fileOL.dataset = job.destinationDBlock -fileOL.type = 'log' -job.addFile(fileOL) - -job.jobParameters="HITS.022081._[00001,00002].pool.root RDO.TMP._00001_tmp.pool.root 250 0 ATLAS-CSC-05-00-00 1 1 NONE NONE None %s AtRndmGenSvc QGSP_EMV DEFAULT NONE NONE NONE NONE NONE\n RDO.TMP._00001_tmp.pool.root %s %s %s 250 0 ATLAS-CSC-05-00-00 DEFAULT None %s NONE" % \ - (fileD1.lfn,fileOE.lfn,fileOA.lfn,fileOC.lfn,fileD2.lfn) - -s,o = Client.submitJobs([job]) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testReco.py b/current/pandaserver/test/testReco.py deleted file mode 100755 index 0eb597e45..000000000 --- a/current/pandaserver/test/testReco.py +++ /dev/null @@ -1,106 +0,0 @@ -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = None - -files = { - 'misal1_mc12.005802.JF17_pythia_jet_filter.digit.RDO.v12000601_tid008610._11615.pool.root.1':None, - #'misal1_mc12.005802.JF17_pythia_jet_filter.digit.RDO.v12000601_tid008610._11639.pool.root.1':None, - 
#'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554._03634.pool.root.1':None, - #'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554._03248.pool.root.1':None, - #'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554._03634.pool.root.1':None, - } - -jobList = [] - -index = 0 -for lfn in files.keys(): - index += 1 - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) - job.AtlasRelease = 'Atlas-12.0.6' - job.homepackage = 'AtlasProduction/12.0.6.4' - job.transformation = 'csc_reco_trf.py' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.computingSite = site - #job.prodDBlock = 'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554' - job.prodDBlock = 'misal1_mc12.005802.JF17_pythia_jet_filter.digit.RDO.v12000601_tid008610' - job.cloud = 'US' - - job.prodSourceLabel = 'test' - job.currentPriority = 10000 - job.cmtConfig = 'i686-slc4-gcc34-opt' - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - - fileD = FileSpec() - fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' - fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' - fileD.lfn = 'DBRelease-3.1.1.tar.gz' - fileD.type = 'input' - job.addFile(fileD) - - fileOE = FileSpec() - fileOE.lfn = "%s.ESD.pool.root" % job.jobName - fileOE.destinationDBlock = job.destinationDBlock - fileOE.destinationSE = job.destinationSE - fileOE.dataset = job.destinationDBlock - fileOE.destinationDBlockToken = 'ATLASDATADISK' - fileOE.type = 'output' - job.addFile(fileOE) - - fileOA = FileSpec() - fileOA.lfn = "%s.AOD.pool.root" % job.jobName - fileOA.destinationDBlock = job.destinationDBlock - fileOA.destinationSE = job.destinationSE - fileOA.dataset = job.destinationDBlock - fileOA.destinationDBlockToken = 'ATLASDATADISK' - fileOA.type = 'output' - 
job.addFile(fileOA) - - fileOC = FileSpec() - fileOC.lfn = "%s.NTUP.root" % job.jobName - fileOC.destinationDBlock = job.destinationDBlock - fileOC.destinationSE = job.destinationSE - fileOC.dataset = job.destinationDBlock - fileOC.destinationDBlockToken = 'ATLASDATADISK' - fileOC.type = 'output' - job.addFile(fileOC) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="%s %s %s %s 250 0 ATLAS-CSC-01-02-00 CSC-06 NoRestrictedESDRecConfig.py %s" % \ - (fileI.lfn,fileOE.lfn,fileOA.lfn,fileOC.lfn,fileD.lfn) - - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testRepro.py b/current/pandaserver/test/testRepro.py deleted file mode 100755 index 9b0b7f679..000000000 --- a/current/pandaserver/test/testRepro.py +++ /dev/null @@ -1,116 +0,0 @@ -import re -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -cloud = sys.argv[1] -if len(sys.argv)>2: - site = sys.argv[2] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = None - -files = { - 'daq.ATLAS.0092045.physics.RPCwBeam.LB0016.SFO-2._0009.data':None, - } - -jobList = [] - -index = 0 -for lfn in files.keys(): - index += 1 - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) - job.AtlasRelease = 'Atlas-14.4.0' - job.homepackage = 'AtlasTier0/14.4.0.2' - job.transformation = 'Reco_trf.py' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.computingSite = site - 
job.prodDBlock = 'data08_cos.00092045.physics_RPCwBeam.daq.RAW.o4_T1224560091' - - job.prodSourceLabel = 'test' - job.processingType = 'reprocessing' - job.currentPriority = 10000 - job.cloud = cloud - job.cmtConfig = 'i686-slc4-gcc34-opt' - - origParams = """inputBSFile=daq.ATLAS.0092045.physics.RPCwBeam.LB0016.SFO-2._0009.data maxEvents=5 skipEvents=0 autoConfiguration=FieldAndGeo preInclude=RecExCommission/RecExCommission.py,RecExCommission/MinimalCommissioningSetup.py,RecJobTransforms/UseOracle.py preExec="jetFlags.Enabled.set_Value_and_Lock(False)" DBRelease=DBRelease-6.2.1.5.tar.gz conditionsTag=COMCOND-ES1C-000-00 RunNumber=92045 beamType=cosmics AMITag=r595 projectName=data08_cos trigStream=physics_RPCwBeam outputTypes=DPDCOMM outputESDFile=ESD.029868._01110.pool.root outputTAGComm=TAG_COMM.029868._01110.pool.root outputAODFile=AOD.029868._01110.pool.root outputMergedDQMonitorFile=DQM_MERGED.029868._01110.root DPD_PIXELCOMM=DPD_PIXELCOMM.029868._01110.pool.root DPD_SCTCOMM=DPD_SCTCOMM.029868._01110.pool.root DPD_IDCOMM=DPD_IDCOMM.029868._01110.pool.root DPD_IDPROJCOMM=DPD_IDPROJCOMM.029868._01110.pool.root DPD_CALOCOMM=DPD_CALOCOMM.029868._01110.pool.root DPD_TILECOMM=DPD_TILECOMM.029868._01110.pool.root DPD_EMCLUSTCOMM=DPD_EMCLUSTCOMM.029868._01110.pool.root DPD_EGAMMACOMM=DPD_EGAMMACOMM.029868._01110.pool.root DPD_RPCCOMM=DPD_RPCCOMM.029868._01110.pool.root DPD_TGCCOMM=DPD_TGCCOMM.029868._01110.pool.root --ignoreunknown""" - - match = re.findall("([^\s]+=[^\s]+)",origParams) - outMap = {} - for item in match: - arg = item.split('=')[0] - var = item.split('=')[-1] - # output - if arg.startswith('output') or arg.startswith('DPD_'): - # skip some keys - if arg in ['outputTypes']: - continue - prefix = var.split('.')[0] - sumatch = re.search('(\.[^\.]+\.[^\.]+)(\.\d+)*$',var) - suffix = sumatch.group(1) - newName = '%s.%s%s' % (job.jobName,prefix,suffix) - outMap[arg] = (var,newName) - # DBRelease - elif arg == 'DBRelease': - dbrMap = (arg,var) - # input - 
elif arg.startswith('input') and arg.endswith('File'): - inputMap = (arg,var) - - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - - fileD = FileSpec() - fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v06020105' - fileD.prodDBlock = fileD.dataset - fileD.lfn = 'DBRelease-6.2.1.5.tar.gz' - fileD.type = 'input' - job.addFile(fileD) - - newParams = origParams - newParams = newParams.replace(dbrMap[0]+'='+dbrMap[1],dbrMap[0]+'='+fileD.lfn) - newParams = newParams.replace(inputMap[0]+'='+inputMap[1],inputMap[0]+'='+fileI.lfn) - - for arg,vars in outMap.iteritems(): - fileO = FileSpec() - fileO.lfn = vars[1] - fileO.destinationDBlock = job.destinationDBlock - fileO.destinationSE = job.destinationSE - fileO.dataset = job.destinationDBlock - fileO.destinationDBlockToken = 'ATLASDATADISK' - fileO.type = 'output' - job.addFile(fileO) - newParams = newParams.replace(arg+'='+vars[0],arg+'='+fileO.lfn) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters=newParams - - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testScript.py b/current/pandaserver/test/testScript.py deleted file mode 100755 index 2299a441d..000000000 --- a/current/pandaserver/test/testScript.py +++ /dev/null @@ -1,45 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -aSrvID = None - -for idx,argv in enumerate(sys.argv): - if argv == '-s': - aSrvID = sys.argv[idx+1] - sys.argv = sys.argv[:idx] - 
break - -site = sys.argv[1] - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = None - -job = JobSpec() -job.jobDefinitionID = int(time.time()) % 10000 -job.jobName = "%s" % commands.getoutput('uuidgen') -job.transformation = 'https://atlpan.web.cern.ch/atlpan/test.sh' -job.destinationDBlock = datasetName -job.destinationSE = destName -job.currentPriority = 1000 -job.prodSourceLabel = 'test' -job.computingSite = site - -job.jobParameters="aaaaa" - -fileOL = FileSpec() -fileOL.lfn = "%s.job.log.tgz" % job.jobName -fileOL.destinationDBlock = job.destinationDBlock -fileOL.destinationSE = job.destinationSE -fileOL.dataset = job.destinationDBlock -fileOL.type = 'log' -job.addFile(fileOL) - - -s,o = Client.submitJobs([job],srvID=aSrvID) -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testSimul13.py b/current/pandaserver/test/testSimul13.py deleted file mode 100644 index 4b8ef5247..000000000 --- a/current/pandaserver/test/testSimul13.py +++ /dev/null @@ -1,81 +0,0 @@ -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = 'BNL_ATLAS_2' - -files = { - 'EVNT.019128._00011.pool.root.1':None, - } - -jobList = [] - -index = 0 -for lfn in files.keys(): - index += 1 - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) - job.AtlasRelease = 'Atlas-13.0.40' - job.homepackage = 'AtlasProduction/13.0.40.3' - job.transformation = 'csc_simul_trf.py' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.computingSite = site - job.prodDBlock = 'valid1.005001.pythia_minbias.evgen.EVNT.e306_tid019128' - - job.prodSourceLabel = 'test' - job.currentPriority = 10000 - 
job.cloud = 'IT' - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - - fileD = FileSpec() - fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v040701' - fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' - fileD.lfn = 'DBRelease-4.7.1.tar.gz' - fileD.type = 'input' - job.addFile(fileD) - - fileOE = FileSpec() - fileOE.lfn = "%s.HITS.pool.root" % job.jobName - fileOE.destinationDBlock = job.destinationDBlock - fileOE.destinationSE = job.destinationSE - fileOE.dataset = job.destinationDBlock - fileOE.destinationDBlockToken = 'ATLASDATADISK' - fileOE.type = 'output' - job.addFile(fileOE) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="%s %s NONE 1 3250 55866 ATLAS-CSC-02-01-00 55866 55866 QGSP_EMV None %s DEFAULT" % \ - (fileI.lfn,fileOE.lfn,fileD.lfn) - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testSimulReco14.py b/current/pandaserver/test/testSimulReco14.py deleted file mode 100644 index 41c78c68d..000000000 --- a/current/pandaserver/test/testSimulReco14.py +++ /dev/null @@ -1,101 +0,0 @@ -import sys -import time -import random -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] - cloud = None -else: - site = None - cloud = 'US' - - - -#cloud = 'TW' -#Recent changes (BNL migration to LFC?) 
forvce the cloud to be specified -cloud = 'US' - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = 'BNL_ATLAS_2' - -files = { - 'EVNT.023986._00001.pool.root.1':None, - #'EVNT.023989._00001.pool.root.1':None, - } - -jobList = [] - -index = 0 -for lfn in files.keys(): - index += 1 - job = JobSpec() - job.jobDefinitionID = (time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) - job.AtlasRelease = 'Atlas-14.2.20' - job.homepackage = 'AtlasProduction/14.2.20.1' - job.transformation = 'csc_simul_reco_trf.py' - job.destinationDBlock = datasetName - job.destinationSE = destName - job.computingSite = site - job.prodDBlock = 'mc08.105031.Jimmy_jetsJ2.evgen.EVNT.e347_tid023986' - #job.prodDBlock = 'mc08.105034.Jimmy_jetsJ5.evgen.EVNT.e347_tid023989' - - job.prodSourceLabel = 'test' - job.processingType = 'test' - job.currentPriority = 10000 - job.cloud = cloud - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - - fileD = FileSpec() - fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v050601' - fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v050601' - fileD.lfn = 'DBRelease-5.6.1.tar.gz' - fileD.type = 'input' - job.addFile(fileD) - - fileOA = FileSpec() - fileOA.lfn = "%s.AOD.pool.root" % job.jobName - fileOA.destinationDBlock = job.destinationDBlock - fileOA.destinationSE = job.destinationSE - fileOA.dataset = job.destinationDBlock - fileOA.destinationDBlockToken = 'ATLASDATADISK' - fileOA.type = 'output' - job.addFile(fileOA) - - fileOE = FileSpec() - fileOE.lfn = "%s.ESD.pool.root" % job.jobName - fileOE.destinationDBlock = job.destinationDBlock - fileOE.destinationSE = job.destinationSE - fileOE.dataset = job.destinationDBlock - fileOE.destinationDBlockToken = 'ATLASDATADISK' - fileOE.type = 'output' - job.addFile(fileOE) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - 
fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.destinationDBlockToken = 'ATLASDATADISK' - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="%s %s 30 500 3 ATLAS-GEO-02-01-00 3 3 QGSP_BERT jobConfig.VertexPosFastIDKiller.py FastSimulationJobTransforms/FastCaloSimAddCellsRecConfig.py,NoTrackSlimming.py %s OFF NONE NONE %s NONE" % (fileI.lfn, fileOA.lfn, fileD.lfn, fileOE.lfn) - - jobList.append(job) - -s,o = Client.submitJobs(jobList) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testSiteMap.py b/current/pandaserver/test/testSiteMap.py deleted file mode 100755 index f11053958..000000000 --- a/current/pandaserver/test/testSiteMap.py +++ /dev/null @@ -1,23 +0,0 @@ -import os -import re -import sys -import time -import random -import datetime -import commands -from taskbuffer.TaskBuffer import taskBuffer -from brokerage import SiteMapper - -# password -from config import panda_config -passwd = panda_config.dbpasswd - -# instantiate TB -taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - -siteMapper = SiteMapper.SiteMapper(taskBuffer) - -#x = siteMapper.getSite('BNL_ATLAS_1') -#print x - - diff --git a/current/pandaserver/test/testTB.py b/current/pandaserver/test/testTB.py deleted file mode 100755 index d94e06560..000000000 --- a/current/pandaserver/test/testTB.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -test TaskBuffer and JobDispatcher on local PC - -$ python -i testTB.py ->>> testGetJobs(10) ->>> testGetJobStatus(1) ->>> testUpdateJob(1,'running') ->>> testGetJobStatus(1) ->>> testUpdateJob(1,'finished') ->>> testGetJobStatus(1) ->>> taskBuffer.peekJobs([1,]) ->>> taskBuffer.queryPandaIDs([0,]) - - -""" - - -import time -import commands -import threading - -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -class TestThread 
(threading.Thread): - def __init__(self,tb,i,n,siteName): - threading.Thread.__init__(self) - self.taskbuffer = tb - self.interval = i - self.jobDefinitionID = n - self.siteName = siteName - - def run(self): - for i in range(1): - prodDBlock = 'rome.004201.evgen.ZeeJimmy' - destinationDBlock = 'pandatest.000123.test.simul' - destinationSE = 'BNL_SE' - jobs = [] - #for i in range(self.interval): - for i in range(2): - job = JobSpec() - job.jobDefinitionID=self.jobDefinitionID - job.AtlasRelease='Atlas-11.0.1' - job.prodDBlock=prodDBlock - job.destinationDBlock=destinationDBlock - job.destinationSE=destinationSE - job.currentPriority=i - - lfnI = 'rome.004201.evgen.ZeeJimmy._00001.pool.root' - file = FileSpec() - file.lfn = lfnI - file.dataset = 'rome.004201.evgen.ZeeJimmy' - file.type = 'input' - file.prodDBlock = prodDBlock - file.dataset = prodDBlock - job.addFile(file) - - lfnO ='%s.pool.root.1' % commands.getoutput('uuidgen') - file = FileSpec() - file.lfn = lfnO - file.type = 'output' - file.destinationDBlock = destinationDBlock - file.dataset = destinationDBlock - file.destinationSE = destinationSE - job.addFile(file) - - job.homepackage='JobTransforms-11-00-01-01' - job.transformation='share/rome.g4sim.standard.trf' - job.jobParameters='%s %s 1 2 14268' % (lfnI,lfnO) - jobs.append(job) - self.taskbuffer.storeJobs(jobs,None) - time.sleep(self.interval) - -from taskbuffer.TaskBuffer import taskBuffer -from jobdispatcher.JobDispatcher import jobDispatcher -from userinterface.UserIF import userIF - -import getpass -passwd = getpass.getpass() - -taskBuffer.init('adbpro.usatlas.bnl.gov',passwd,nDBConnection=3) - -jobDispatcher.init(taskBuffer) -userIF.init(taskBuffer) - -jobDefID = int(time.time()) % 10000 -thr1 = TestThread(taskBuffer,4,jobDefID,"myhost") -thr2 = TestThread(taskBuffer,3,jobDefID+1,"testsite") - -thr1.start() -#thr2.start() - -from jobdispatcher.JobDispatcher import getJob,updateJob -from userinterface.UserIF import 
submitJobs,getJobStatus,queryPandaIDs - - -### emulate HTTP requests - -class Request: - def __init__(self): - self.subprocess_env = {} - self.subprocess_env['SSL_CLIENT_S_DN'] = "aaa" - self.subprocess_env['HTTPS'] = "on" - -req = Request() - -def testGetJob(): - print getJob(req,"BNL_ATLAS_2") - -def testGetJobStatus(arg): - print getJobStatus(req,arg) - -def testSubmitJobs(arg): - print submitJobs(req,arg) - -def testUpdateJob(arg0,arg1): - print updateJob(req,arg0,arg1) - -def testQueryPandaIDs(arg): - print queryPandaIDs(req,arg) - -""" - -import cPickle as pickle -ids=[3023,3414] -testGetJobStatus(pickle.dumps(ids)) - -job = JobSpec() -job.jobDefinitionID='user.%s' % commands.getoutput('/usr/bin/uuidgen') -ids = {'pandatest.000003.dd.input._00028.junk':'6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27', - 'pandatest.000003.dd.input._00033.junk':'98f79ba1-1793-4253-aac7-bdf90a51d1ee', - 'pandatest.000003.dd.input._00039.junk':'33660dd5-7cef-422a-a7fc-6c24cb10deb1'} -for lfn in ids.keys(): - file = FileSpec() - file.lfn = lfn - file.GUID = ids[file.lfn] - file.dataset = 'pandatest.000003.dd.input' - file.type = 'input' - job.addFile(file) - -testSubmitJobs(pickle.dumps([job])) - -testQueryPandaIDs(pickle.dumps([10])) - -""" diff --git a/current/pandaserver/test/testTaskA2.py b/current/pandaserver/test/testTaskA2.py deleted file mode 100755 index e54e3948f..000000000 --- a/current/pandaserver/test/testTaskA2.py +++ /dev/null @@ -1,64 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -#destName = 'BNL_SE' - -jobList = [] - -for i in [999905,999906,999907]: - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) - job.AtlasRelease = 'Atlas-14.1.0' - 
job.homepackage = 'AtlasProduction/12.0.6.2' - job.transformation = 'csc_evgen_trf.py' - job.destinationDBlock = datasetName - #job.destinationSE = destName - job.currentPriority = 1000 - job.prodSourceLabel = 'managed' - #job.prodSourceLabel = 'test' - #job.computingSite = site - job.cmtConfig = 'i686-slc4-gcc34-opt' - job.metadata = 'evgen;%s;%s;%s' % (str({'FR': 46, 'NL': 45, 'NDGF': 300, 'CERN': 19, 'TW': 44110, 'CA': 2922, 'DE': 9903, 'IT': 1168, 'US': 6226, 'UK': 1026, 'ES': 26619}),str({999907:100,999906:200,999905:300}),str({999905:100,999906:910,999907:500})) - #job.metadata = 'evgen;%s' % str({'FR': 46, 'NL': 45, 'NDGF': 300, 'CERN': 19, 'TW': 44110, 'CA': 2922, 'DE': 9903, 'IT': 1168, 'US': 6226, 'UK': 1026, 'ES': 26619}) - - #job.cloud = "UK" - job.taskID = i - - file = FileSpec() - file.lfn = "%s.evgen.pool.root" % job.jobName - file.destinationDBlock = job.destinationDBlock - file.destinationSE = job.destinationSE - file.dataset = job.destinationDBlock - #file.destinationDBlockToken = 'ATLASDATADISK' - file.type = 'output' - job.addFile(file) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % job.jobName - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="7087 0 500000 1 DC3.007087.singlepart_fwdgamma_etaplus_E500.py %s NONE NONE NONE" % file.lfn - jobList.append(job) - -for i in range(1): - #s,o = Client.submitJobs(jobList) - s,outS = Client.runTaskAssignment(jobList) - print "---------------------" - print s - for tmpOut in outS: - print tmpOut diff --git a/current/pandaserver/test/testUser.py b/current/pandaserver/test/testUser.py deleted file mode 100755 index fd51cd1af..000000000 --- a/current/pandaserver/test/testUser.py +++ /dev/null @@ -1,44 +0,0 @@ -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import 
FileSpec - -job = JobSpec() -job.jobDefinitionID = int(time.time()) % 10000 -job.jobName = commands.getoutput('/usr/bin/uuidgen') -job.AtlasRelease = 'Atlas-9.0.4' -job.prodDBlock = 'pandatest.000003.dd.input' -job.destinationDBlock = 'panda.destDB.%s' % commands.getoutput('/usr/bin/uuidgen') -job.destinationSE = 'BNL_SE' - -ids = {'pandatest.000003.dd.input._00028.junk':'6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27', - 'pandatest.000003.dd.input._00033.junk':'98f79ba1-1793-4253-aac7-bdf90a51d1ee', - 'pandatest.000003.dd.input._00039.junk':'33660dd5-7cef-422a-a7fc-6c24cb10deb1'} -for lfn in ids.keys(): - file = FileSpec() - file.lfn = lfn - file.GUID = ids[file.lfn] - file.dataset = 'pandatest.000003.dd.input' - file.type = 'input' - job.addFile(file) - -s,o = Client.submitJobs([job]) -print "---------------------" -print s -print o -print "---------------------" -s,o = Client.getJobStatus([4934, 4766, 4767, 4768, 4769]) -print s -if s == 0: - for job in o: - if job == None: - continue - print job.PandaID - for file in job.Files: - print file.lfn,file.type -print "---------------------" -s,o = Client.queryPandaIDs([0]) -print s -print o - diff --git a/current/pandaserver/test/testWait.py b/current/pandaserver/test/testWait.py deleted file mode 100755 index adbd9c246..000000000 --- a/current/pandaserver/test/testWait.py +++ /dev/null @@ -1,119 +0,0 @@ -import sys -import time -import commands -import userinterface.Client as Client -from taskbuffer.JobSpec import JobSpec -from taskbuffer.FileSpec import FileSpec - -if len(sys.argv)>1: - site = sys.argv[1] -else: - site = None - -datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') -destName = 'BNL_SE' - -jobListE = [] -lfnListE = [] - -for i in range(2): - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = commands.getoutput('uuidgen') - job.AtlasRelease = 'Atlas-11.0.3' - job.homepackage = 'JobTransforms-11-00-03-03' - job.transformation = 'share/csc.evgen.trf' - 
job.destinationDBlock = datasetName - job.destinationSE = destName - job.currentPriority = 1000 - job.prodSourceLabel = 'test' - job.computingSite = site - - file = FileSpec() - file.lfn = "%s.evgen.pool.root" % commands.getoutput('uuidgen') - lfnListE.append(file.lfn) - file.lfn += ('.%d' % (i+1)) - file.destinationDBlock = job.destinationDBlock - file.destinationSE = job.destinationSE - file.dataset = job.destinationDBlock - file.type = 'output' - job.addFile(file) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen') - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="5056 %s NONE 81000 9000 10 DC3.005056.PythiaPhotonJet2.py NONE" % file.lfn - jobListE.append(job) - -s,o = Client.submitJobs(jobListE) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] - -time.sleep(20) - -datasetNameS = 'panda.simu.%s' % commands.getoutput('uuidgen') - -jobListS = [] - -for lfn in lfnListE: - job = JobSpec() - job.jobDefinitionID = int(time.time()) % 10000 - job.jobName = commands.getoutput('uuidgen') - job.AtlasRelease = 'Atlas-11.0.3' - job.homepackage = 'JobTransforms-11-00-03-04' - job.transformation = 'share/csc.simul.trf' - job.destinationDBlock = datasetNameS - job.destinationSE = destName - job.prodDBlock = datasetName - - job.prodSourceLabel = 'test' - job.currentPriority = 1000 - - fileI = FileSpec() - fileI.dataset = job.prodDBlock - fileI.prodDBlock = job.prodDBlock - fileI.lfn = lfn - fileI.type = 'input' - job.addFile(fileI) - - fileOE = FileSpec() - fileOE.lfn = "%s.HITS.pool.root" % commands.getoutput('uuidgen') - fileOE.destinationDBlock = job.destinationDBlock - fileOE.destinationSE = job.destinationSE - fileOE.dataset = job.destinationDBlock - fileOE.type = 'output' - job.addFile(fileOE) - - fileOA = FileSpec() - fileOA.lfn = "%s.RDO.pool.root" % 
commands.getoutput('uuidgen') - fileOA.destinationDBlock = job.destinationDBlock - fileOA.destinationSE = job.destinationSE - fileOA.dataset = job.destinationDBlock - fileOA.type = 'output' - job.addFile(fileOA) - - fileOL = FileSpec() - fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen') - fileOL.destinationDBlock = job.destinationDBlock - fileOL.destinationSE = job.destinationSE - fileOL.dataset = job.destinationDBlock - fileOL.type = 'log' - job.addFile(fileOL) - - job.jobParameters="%s %s %s 100 4900 400" % (fileI.lfn,fileOE.lfn,fileOA.lfn) - - jobListS.append(job) - -s,o = Client.submitJobs(jobListS) -print "---------------------" -print s -for x in o: - print "PandaID=%s" % x[0] - diff --git a/current/pandaserver/test/tmpwatch.py b/current/pandaserver/test/tmpwatch.py deleted file mode 100644 index ee75d2720..000000000 --- a/current/pandaserver/test/tmpwatch.py +++ /dev/null @@ -1,47 +0,0 @@ -import os -import glob -import optparse -import datetime - -# options -optP = optparse.OptionParser(conflict_handler="resolve") -optP.add_option('-t',action='store_const',const=True,dest='test',default=False, - help='test mode') -optP.add_option('-h',action='store',type='int',dest='limit',default=12, - help='time limit in hour') -options,args = optP.parse_args() - -# patterns of tmp files -tmpPatts = ['/tmp/tmp*','/tmp/atlpan/tmp*'] - -# limit -timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=options.limit) - -# loop over all pattern -for tmpPatt in tmpPatts: - tmpFiles = glob.glob(tmpPatt) - # loop over all files - for tmpFile in tmpFiles: - try: - print 'INFO: tmpfile -> %s' % tmpFile - # only file - if not os.path.isfile(tmpFile): - continue - # not symlink - if os.path.islink(tmpFile): - continue - # writable - if not os.access(tmpFile,os.W_OK): - continue - # check time stamp - timeStamp = os.path.getmtime(tmpFile) - timeStamp = datetime.datetime.fromtimestamp(timeStamp) - if timeStamp > timeLimit: - continue - # remove - print 'INFO: 
remove %s' % tmpFile - if not options.test: - os.remove(tmpFile) - except: - errType,errValue = sys.exc_info()[:2] - print 'ERROR: failed with %s:%s' % (errType,errValue) diff --git a/current/pandaserver/test/update.sh b/current/pandaserver/test/update.sh deleted file mode 100755 index c1edbf515..000000000 --- a/current/pandaserver/test/update.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/python - -import os -import sys - -os.chdir('..') - -option = '' -if len(sys.argv) > 1 and sys.argv[1] == '-n': - option = ' -n' - -packages = ['liveconfigparser','pandalogger','taskbuffer', - 'brokerage','jobdispatcher','userinterface', - 'dataservice','test','server'] #,'config'] - -for pack in packages: - com = 'cvs%s update %s' % (option,pack) - print com - os.system(com) diff --git a/current/pandaserver/test/valConf.py b/current/pandaserver/test/valConf.py deleted file mode 100644 index 69ec8688c..000000000 --- a/current/pandaserver/test/valConf.py +++ /dev/null @@ -1,15 +0,0 @@ -from config import panda_config -from config import panda_config_new - -for item in dir(panda_config): - if item.startswith('__'): - continue - old = getattr(panda_config,item) - if not hasattr(panda_config_new,item): - print "NG : %s not found" % item - continue - new = getattr(panda_config_new,item) - if old != new: - print "NG : %s missmatch" % item - print " old:%s" % old - print " new:%s" % new diff --git a/current/pandaserver/userinterface/Client.py b/current/pandaserver/userinterface/Client.py deleted file mode 100755 index 529e2d11c..000000000 --- a/current/pandaserver/userinterface/Client.py +++ /dev/null @@ -1,880 +0,0 @@ -''' -client methods - -''' - -import os -import re -import sys -import urllib -import commands -import cPickle as pickle - - -# configuration -try: - baseURL = os.environ['PANDA_URL'] -except: - baseURL = 'http://pandaserver.cern.ch:25080/server/panda' -try: - baseURLSSL = os.environ['PANDA_URL_SSL'] -except: - baseURLSSL = 'https://pandaserver.cern.ch:25443/server/panda' 
- - -# exit code -EC_Failed = 255 - - -# panda server URLs -if os.environ.has_key('PANDA_URL_MAP'): - serverURLs = {'default' : {'URL' : baseURL, - 'URLSSL' : baseURLSSL}, - } - # decode envvar to map - try: - for tmpCompStr in os.environ['PANDA_URL_MAP'].split('|'): - tmpKey,tmpURL,tmpURLSSL = tmpCompStr.split(',') - # append - serverURLs[tmpKey] = {'URL' : tmpURL, - 'URLSSL' : tmpURLSSL} - except: - pass -else: - # default - serverURLs = {'default' : {'URL' : baseURL, - 'URLSSL' : baseURLSSL}, - 'CERN' : {'URL' : 'http://pandaserver.cern.ch:25080/server/panda', - 'URLSSL' : 'https://pandaserver.cern.ch:25443/server/panda'}, - } - -# bamboo -baseURLBAMBOO = 'http://pandabamboo.cern.ch:25070/bamboo/bamboo' - - -# get URL -def _getURL(type,srvID=None): - if serverURLs.has_key(srvID): - urls = serverURLs[srvID] - else: - urls = serverURLs['default'] - return urls[type] - - -# get Panda srvIDs -def getPandas(): - srvs = serverURLs.keys() - # remove 'default' - try: - srvs.remove('default') - except: - pass - return srvs - - -# look for a grid proxy certificate -def _x509(): - # see X509_USER_PROXY - try: - return os.environ['X509_USER_PROXY'] - except: - pass - # see the default place - x509 = '/tmp/x509up_u%s' % os.getuid() - if os.access(x509,os.R_OK): - return x509 - # no valid proxy certificate - # FIXME - print "No valid grid proxy certificate found" - return '' - - -# curl class -class _Curl: - # constructor - def __init__(self): - # path to curl - self.path = 'curl' - # verification of the host certificate - self.verifyHost = False - # request a compressed response - self.compress = True - # SSL cert/key - self.sslCert = '' - self.sslKey = '' - # verbose - self.verbose = False - - - # GET method - def get(self,url,data): - # make command - com = '%s --silent --get' % self.path - if not self.verifyHost: - com += ' --insecure' - if self.compress: - com += ' --compressed' - if self.sslCert != '': - com += ' --cert %s' % self.sslCert - if self.sslKey != '': - com 
+= ' --key %s' % self.sslKey - # timeout - com += ' -m 600' - # data - strData = '' - for key in data.keys(): - strData += 'data="%s"\n' % urllib.urlencode({key:data[key]}) - # write data to temporary config file - try: - tmpName = os.environ['PANDA_TMP'] - except: - tmpName = '/tmp' - tmpName += '/%s_%s' % (commands.getoutput('whoami'),commands.getoutput('uuidgen')) - tmpFile = open(tmpName,'w') - tmpFile.write(strData) - tmpFile.close() - com += ' --config %s' % tmpName - com += ' %s' % url - # execute - if self.verbose: - print com - print commands.getoutput('cat %s' % tmpName) - ret = commands.getstatusoutput(com) - # remove temporary file - os.remove(tmpName) - if ret[0] != 0: - ret = (ret[0]%255,ret[1]) - if self.verbose: - print ret - return ret - - - # POST method - def post(self,url,data): - # make command - com = '%s --silent' % self.path - if not self.verifyHost: - com += ' --insecure' - if self.compress: - com += ' --compressed' - if self.sslCert != '': - com += ' --cert %s' % self.sslCert - if self.sslKey != '': - com += ' --key %s' % self.sslKey - # timeout - com += ' -m 600' - # data - strData = '' - for key in data.keys(): - strData += 'data="%s"\n' % urllib.urlencode({key:data[key]}) - # write data to temporary config file - try: - tmpName = os.environ['PANDA_TMP'] - except: - tmpName = '/tmp' - tmpName += '/%s_%s' % (commands.getoutput('whoami'),commands.getoutput('uuidgen')) - tmpFile = open(tmpName,'w') - tmpFile.write(strData) - tmpFile.close() - com += ' --config %s' % tmpName - com += ' %s' % url - # execute - if self.verbose: - print com - print commands.getoutput('cat %s' % tmpName) - ret = commands.getstatusoutput(com) - # remove temporary file - os.remove(tmpName) - if ret[0] != 0: - ret = (ret[0]%255,ret[1]) - if self.verbose: - print ret - return ret - - - # PUT method - def put(self,url,data): - # make command - com = '%s --silent' % self.path - if not self.verifyHost: - com += ' --insecure' - if self.compress: - com += ' --compressed' 
- if self.sslCert != '': - com += ' --cert %s' % self.sslCert - if self.sslKey != '': - com += ' --key %s' % self.sslKey - # emulate PUT - for key in data.keys(): - com += ' -F "%s=@%s"' % (key,data[key]) - com += ' %s' % url - # execute - if self.verbose: - print com - ret = commands.getstatusoutput(com) - if ret[0] != 0: - ret = (ret[0]%255,ret[1]) - if self.verbose: - print ret - return ret - - -''' -public methods - -''' - -# use web cache -def useWebCache(): - global baseURL - baseURL = 'http://pandaserver.cern.ch:25085/server/panda' - global serverURLs - for tmpKey,tmpVal in serverURLs.iteritems(): - tmpVal['URL'] = baseURL - - -# submit jobs -def submitJobs(jobs,srvID=None,toPending=False): - # set hostname - hostname = commands.getoutput('hostname') - for job in jobs: - job.creationHost = hostname - # serialize - strJobs = pickle.dumps(jobs) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = _getURL('URLSSL',srvID) + '/submitJobs' - data = {'jobs':strJobs} - if toPending: - data['toPending'] = True - status,output = curl.post(url,data) - if status!=0: - print output - return status,output - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR submitJobs : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# run task assignment -def runTaskAssignment(jobs): - # set hostname - hostname = commands.getoutput('hostname') - for job in jobs: - job.creationHost = hostname - # serialize - strJobs = pickle.dumps(jobs) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = baseURLSSL + '/runTaskAssignment' - data = {'jobs':strJobs} - status,output = curl.post(url,data) - if status!=0: - print output - return status,output - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR runTaskAssignment : %s %s" % (type,value) 
- print errStr - return EC_Failed,output+'\n'+errStr - - -# get job status -def getJobStatus(ids,srvID=None): - # serialize - strIDs = pickle.dumps(ids) - # instantiate curl - curl = _Curl() - # execute - url = _getURL('URL',srvID) + '/getJobStatus' - data = {'ids':strIDs} - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR getJobStatus : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# get PandaID with jobexeID -def getPandaIDwithJobExeID(ids): - # serialize - strIDs = pickle.dumps(ids) - # instantiate curl - curl = _Curl() - # execute - url = _getURL('URL') + '/getPandaIDwithJobExeID' - data = {'ids':strIDs} - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR getPandaIDwithJobExeID : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# get assigning task -def getAssigningTask(): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/getAssigningTask' - status,output = curl.get(url,{}) - try: - return status,pickle.loads(output) - except: - print output - type, value, traceBack = sys.exc_info() - errStr = "ERROR getAssigningTask : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# get assigned cloud for tasks -def seeCloudTask(ids): - # serialize - strIDs = pickle.dumps(ids) - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/seeCloudTask' - data = {'ids':strIDs} - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR seeCloudTask : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# kill jobs -def killJobs(ids,code=None,verbose=False,srvID=None,useMailAsID=False): - # serialize - strIDs = pickle.dumps(ids) - # 
instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - curl.verbose = verbose - # execute - url = _getURL('URLSSL',srvID) + '/killJobs' - data = {'ids':strIDs,'code':code,'useMailAsID':useMailAsID} - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR killJobs : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# reassign jobs -def reassignJobs(ids,forPending=False): - # serialize - strIDs = pickle.dumps(ids) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = baseURLSSL + '/reassignJobs' - data = {'ids':strIDs} - if forPending: - data['forPending'] = True - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR reassignJobs : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# query PandaIDs -def queryPandaIDs(ids): - # serialize - strIDs = pickle.dumps(ids) - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/queryPandaIDs' - data = {'ids':strIDs} - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR queryPandaIDs : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# query job info per cloud -def queryJobInfoPerCloud(cloud,schedulerID=None): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/queryJobInfoPerCloud' - data = {'cloud':cloud} - if schedulerID != None: - data['schedulerID'] = schedulerID - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR queryJobInfoPerCloud : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# get job statistics 
-def getJobStatistics(sourcetype=None): - # instantiate curl - curl = _Curl() - # execute - ret = {} - for srvID in getPandas(): - url = _getURL('URL',srvID) + '/getJobStatistics' - data = {} - if sourcetype != None: - data['sourcetype'] = sourcetype - status,output = curl.get(url,data) - try: - tmpRet = status,pickle.loads(output) - if status != 0: - return tmpRet - except: - print output - type, value, traceBack = sys.exc_info() - errStr = "ERROR getJobStatistics : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - # gather - for tmpCloud,tmpVal in tmpRet[1].iteritems(): - if not ret.has_key(tmpCloud): - # append cloud values - ret[tmpCloud] = tmpVal - else: - # sum statistics - for tmpStatus,tmpCount in tmpVal.iteritems(): - if ret[tmpCloud].has_key(tmpStatus): - ret[tmpCloud][tmpStatus] += tmpCount - else: - ret[tmpCloud][tmpStatus] = tmpCount - return 0,ret - - -# get job statistics for Bamboo -def getJobStatisticsForBamboo(useMorePG=False): - # instantiate curl - curl = _Curl() - # execute - ret = {} - for srvID in getPandas(): - url = _getURL('URL',srvID) + '/getJobStatisticsForBamboo' - data = {} - if useMorePG != False: - data['useMorePG'] = useMorePG - status,output = curl.get(url,data) - try: - tmpRet = status,pickle.loads(output) - if status != 0: - return tmpRet - except: - print output - type, value, traceBack = sys.exc_info() - errStr = "ERROR getJobStatisticsForBamboo : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - # gather - for tmpCloud,tmpMap in tmpRet[1].iteritems(): - if not ret.has_key(tmpCloud): - # append cloud values - ret[tmpCloud] = tmpMap - else: - # sum statistics - for tmpPType,tmpVal in tmpMap.iteritems(): - if not ret[tmpCloud].has_key(tmpPType): - ret[tmpCloud][tmpPType] = tmpVal - else: - for tmpStatus,tmpCount in tmpVal.iteritems(): - if ret[tmpCloud][tmpPType].has_key(tmpStatus): - ret[tmpCloud][tmpPType][tmpStatus] += tmpCount - else: - ret[tmpCloud][tmpPType][tmpStatus] = 
tmpCount - return 0,ret - - -# get highest prio jobs -def getHighestPrioJobStat(perPG=False,useMorePG=False): - # instantiate curl - curl = _Curl() - # execute - ret = {} - url = baseURL + '/getHighestPrioJobStat' - data = {'perPG':perPG} - if useMorePG != False: - data['useMorePG'] = useMorePG - status,output = curl.get(url,data) - try: - return status,pickle.loads(output) - except: - print output - type, value, traceBack = sys.exc_info() - errStr = "ERROR getHighestPrioJobStat : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# get jobs updated recently -def getJobsToBeUpdated(limit=5000,lockedby='',srvID=None): - # instantiate curl - curl = _Curl() - # execute - url = _getURL('URL',srvID) + '/getJobsToBeUpdated' - status,output = curl.get(url,{'limit':limit,'lockedby':lockedby}) - try: - return status,pickle.loads(output) - except: - print output - type, value, traceBack = sys.exc_info() - errStr = "ERROR getJobsToBeUpdated : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# update prodDBUpdateTimes -def updateProdDBUpdateTimes(params,verbose=False,srvID=None): - # serialize - strPar = pickle.dumps(params) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - curl.verbose = verbose - # execute - url = _getURL('URLSSL',srvID) + '/updateProdDBUpdateTimes' - data = {'params':strPar} - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR updateProdDBUpdateTimes : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# get PandaID at site -def getPandaIDsSite(site,status,limit=500): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/getPandaIDsSite' - status,output = curl.get(url,{'site':site,'status':status,'limit':limit}) - try: - return status,pickle.loads(output) - except: - print output - type, value, traceBack = 
sys.exc_info() - errStr = "ERROR getPandaIDsSite : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# get job statistics per site -def getJobStatisticsPerSite(predefined=False,workingGroup='',countryGroup='',jobType='',minPriority=None, - readArchived=None): - # instantiate curl - curl = _Curl() - # execute - ret = {} - for srvID in getPandas(): - url = _getURL('URL',srvID) + '/getJobStatisticsPerSite' - data = {'predefined':predefined} - if not workingGroup in ['',None]: - data['workingGroup'] = workingGroup - if not countryGroup in ['',None]: - data['countryGroup'] = countryGroup - if not jobType in ['',None]: - data['jobType'] = jobType - if not minPriority in ['',None]: - data['minPriority'] = minPriority - if not readArchived in ['',None]: - data['readArchived'] = readArchived - status,output = curl.get(url,data) - try: - tmpRet = status,pickle.loads(output) - if status != 0: - return tmpRet - except: - print output - type, value, traceBack = sys.exc_info() - errStr = "ERROR getJobStatisticsPerSite : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - # gather - for tmpSite,tmpVal in tmpRet[1].iteritems(): - if not ret.has_key(tmpSite): - # append site values - ret[tmpSite] = tmpVal - else: - # sum statistics - for tmpStatus,tmpCount in tmpVal.iteritems(): - if ret[tmpSite].has_key(tmpStatus): - ret[tmpSite][tmpStatus] += tmpCount - else: - ret[tmpSite][tmpStatus] = tmpCount - return 0,ret - - -# get job statistics per site with label -def getJobStatisticsWithLabel(site=''): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/getJobStatisticsWithLabel' - data = {} - if not site in ['',None]: - data['site'] = site - status,output = curl.get(url,data) - try: - return status,pickle.loads(output) - except: - print output - type, value, traceBack = sys.exc_info() - errStr = "ERROR getJobStatisticsWithLabel : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# get 
the number of waiting jobs per site and user -def getJobStatisticsPerUserSite(): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/getJobStatisticsPerUserSite' - data = {} - status,output = curl.get(url,data) - try: - return status,pickle.loads(output) - except: - print output - type, value, traceBack = sys.exc_info() - errStr = "ERROR getJobStatisticsPerUserSite : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# query last files in datasets -def queryLastFilesInDataset(datasets): - # serialize - strDSs = pickle.dumps(datasets) - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/queryLastFilesInDataset' - data = {'datasets':strDSs} - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - print "ERROR queryLastFilesInDataset : %s %s" % (type,value) - return EC_Failed,None - - -# insert sandbox file info -def insertSandboxFileInfo(userName,fileName,fileSize,checkSum,verbose=False): - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - curl.verbose = verbose - # execute - url = baseURLSSL + '/insertSandboxFileInfo' - data = {'userName':userName,'fileName':fileName,'fileSize':fileSize,'checkSum':checkSum} - return curl.post(url,data) - - -# put file -def putFile(file): - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = baseURLSSL + '/putFile' - data = {'file':file} - return curl.put(url,data) - - -# delete file -def deleteFile(file): - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = baseURLSSL + '/deleteFile' - data = {'file':file} - return curl.post(url,data) - - -# touch file -def touchFile(sourceURL,filename): - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = sourceURL + '/server/panda/touchFile' - data = 
{'filename':filename} - return curl.post(url,data) - - -# resubmit jobs -def resubmitJobs(ids): - # serialize - strIDs = pickle.dumps(ids) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = baseURLSSL + '/resubmitJobs' - data = {'ids':strIDs} - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - print "ERROR resubmitJobs : %s %s" % (type,value) - return EC_Failed,None - - -# get site specs -def getSiteSpecs(siteType=None): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/getSiteSpecs' - data = {} - if siteType != None: - data = {'siteType':siteType} - status,output = curl.get(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR getSiteSpecs : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# get cloud specs -def getCloudSpecs(): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/getCloudSpecs' - status,output = curl.get(url,{}) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR getCloudSpecs : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# get nPilots -def getNumPilots(): - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/getNumPilots' - status,output = curl.get(url,{}) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR getNumPilots : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# run brokerage -def runBrokerage(sites,atlasRelease,cmtConfig=None): - # serialize - strSites = pickle.dumps(sites) - # instantiate curl - curl = _Curl() - # execute - url = baseURL + '/runBrokerage' - data = {'sites':strSites, - 'atlasRelease':atlasRelease} - if cmtConfig != None: - data['cmtConfig'] = 
cmtConfig - return curl.get(url,data) - - -# get RW -def getRW(priority=0): - # instantiate curl - curl = _Curl() - # execute - url = baseURLBAMBOO + '/getRW' - # get RWs for high priority tasks - data = {'priority':priority} - status,output = curl.get(url,data) - try: - return status,pickle.loads(output) - except: - type, value, traceBack = sys.exc_info() - errStr = "ERROR getRW : %s %s" % (type,value) - print errStr - return EC_Failed,output+'\n'+errStr - - -# change job priorities -def changeJobPriorities(newPrioMap): - # serialize - newPrioMapStr = pickle.dumps(newPrioMap) - # instantiate curl - curl = _Curl() - curl.sslCert = _x509() - curl.sslKey = _x509() - # execute - url = baseURLSSL + '/changeJobPriorities' - data = {'newPrioMap':newPrioMapStr} - status,output = curl.post(url,data) - try: - return status,pickle.loads(output) - except: - errtype,errvalue = sys.exc_info()[:2] - errStr = "ERROR changeJobPriorities : %s %s" % (errtype,errvalue) - return EC_Failed,output+'\n'+errStr - - diff --git a/current/pandaserver/userinterface/RbLauncher.py b/current/pandaserver/userinterface/RbLauncher.py deleted file mode 100755 index a23a6fbcf..000000000 --- a/current/pandaserver/userinterface/RbLauncher.py +++ /dev/null @@ -1,52 +0,0 @@ -''' -launcer for ReBroker - -''' - -import sys -import time -import commands -import threading - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('RbLauncher') - - -class RbLauncher (threading.Thread): - # constructor - def __init__(self,dn,jobID,cloud=None,excludedSite=None): - threading.Thread.__init__(self) - self.dn = dn - self.jobID = jobID - self.cloud = cloud - self.excludedSite = excludedSite - # time stamp - self.timestamp = time.asctime() - - - # main - def run(self): - try: - _logger.debug('%s startRun' % self.timestamp) - # run - com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) - com += 
'source %s; ' % panda_config.glite_source - com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/userinterface/runReBroker.py ' % \ - (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python, - panda_config.pandaPython_dir) - com += '-j %s -d "%s" ' % (self.jobID,self.dn) - if self.cloud != None: - com += '-c %s ' % self.cloud - if self.excludedSite != None: - com += '-e %s ' % self.excludedSite - # exeute - _logger.debug('%s com=%s' % (self.timestamp,com)) - status,output = commands.getstatusoutput(com) - _logger.debug("%s Ret from another process: %s %s" % (self.timestamp,status,output)) - _logger.debug('%s endRun' % self.timestamp) - except: - type, value, traceBack = sys.exc_info() - _logger.error("run() : %s %s" % (type,value)) diff --git a/current/pandaserver/userinterface/ReBroker.py b/current/pandaserver/userinterface/ReBroker.py deleted file mode 100644 index 205b375ee..000000000 --- a/current/pandaserver/userinterface/ReBroker.py +++ /dev/null @@ -1,1022 +0,0 @@ -''' -find another candidate site for analysis - -''' - -import re -import sys -import time -import random -import datetime -import threading - -from dataservice.DDM import ddm -from dataservice.DDM import dq2Common -from taskbuffer.JobSpec import JobSpec -from taskbuffer.OraDBProxy import DBProxy -from dataservice.Setupper import Setupper -from brokerage.SiteMapper import SiteMapper -import brokerage.broker - -from config import panda_config -from pandalogger.PandaLogger import PandaLogger - -# logger -_logger = PandaLogger().getLogger('ReBroker') - -def initLogger(pLogger): - # redirect logging to parent as it doesn't work in nested threads - global _logger - _logger = pLogger - - -class ReBroker (threading.Thread): - - # constructor - def __init__(self,taskBuffer,cloud=None,excludedSite=None,overrideSite=True, - simulation=False,forceOpt=False,userRequest=False,forFailed=False, - avoidSameSite=False): - threading.Thread.__init__(self) - self.job = None - self.jobID = 
None - self.pandaID = None - self.cloud = cloud - self.pandaJobList = [] - self.buildStatus = None - self.taskBuffer = taskBuffer - self.token = None - self.newDatasetMap = {} - self.simulation = simulation - self.forceOpt = forceOpt - self.excludedSite = excludedSite - self.overrideSite = overrideSite - self.maxPandaIDlibDS = None - self.userRequest = userRequest - self.forFailed = forFailed - self.revNum = 0 - self.avoidSameSite = avoidSameSite - self.brokerageInfo = [] - - - # main - def run(self): - try: - # get job - tmpJobs = self.taskBuffer.getFullJobStatus([self.rPandaID]) - if tmpJobs == [] or tmpJobs[0] == None: - _logger.debug("cannot find job for PandaID=%s" % self.rPandaID) - return - self.job = tmpJobs[0] - _logger.debug("%s start %s:%s:%s" % (self.token,self.job.jobDefinitionID,self.job.prodUserName,self.job.computingSite)) - # using output container - if not self.job.destinationDBlock.endswith('/'): - _logger.debug("%s ouput dataset container is required" % self.token) - _logger.debug("%s end" % self.token) - return - # FIXEME : dont' touch group jobs for now - if self.job.destinationDBlock.startswith('group') and (not self.userRequest): - _logger.debug("%s skip group jobs" % self.token) - _logger.debug("%s end" % self.token) - return - # check processingType - typesForRebro = ['pathena','prun','ganga','ganga-rbtest'] - if not self.job.processingType in typesForRebro: - _logger.debug("%s skip processingType=%s not in %s" % \ - (self.token,self.job.processingType,str(typesForRebro))) - _logger.debug("%s end" % self.token) - return - # check jobsetID - if self.job.jobsetID in [0,'NULL',None]: - _logger.debug("%s jobsetID is undefined" % self.token) - _logger.debug("%s end" % self.token) - return - # check metadata - if self.job.metadata in [None,'NULL']: - _logger.debug("%s metadata is unavailable" % self.token) - _logger.debug("%s end" % self.token) - return - # check --disableRebrokerage - match = re.search("--disableRebrokerage",self.job.metadata) 
- if match != None and (not self.simulation) and (not self.forceOpt) \ - and (not self.userRequest): - _logger.debug("%s diabled rebrokerage" % self.token) - _logger.debug("%s end" % self.token) - return - # check --site - match = re.search("--site",self.job.metadata) - if match != None and (not self.simulation) and (not self.forceOpt) \ - and (not self.userRequest): - _logger.debug("%s --site is used" % self.token) - _logger.debug("%s end" % self.token) - return - # check --libDS - match = re.search("--libDS",self.job.metadata) - if match != None: - _logger.debug("%s --libDS is used" % self.token) - _logger.debug("%s end" % self.token) - return - # check --workingGroup since it is site-specific - match = re.search("--workingGroup",self.job.metadata) - if match != None: - _logger.debug("%s workingGroup is specified" % self.token) - _logger.debug("%s end" % self.token) - return - # avoid too many rebrokerage - if not self.checkRev(): - _logger.debug("%s avoid too many rebrokerage" % self.token) - _logger.debug("%s end" % self.token) - return - # check if multiple JobIDs use the same libDS - if self.bPandaID != None and self.buildStatus not in ['finished','failed']: - if self.minPandaIDlibDS == None or self.maxPandaIDlibDS == None: - _logger.debug("%s max/min PandaIDs are unavailable for the libDS" % self.token) - _logger.debug("%s end" % self.token) - return - tmpPandaIDsForLibDS = self.taskBuffer.getFullJobStatus([self.minPandaIDlibDS,self.maxPandaIDlibDS]) - if len(tmpPandaIDsForLibDS) != 2 or tmpPandaIDsForLibDS[0] == None or tmpPandaIDsForLibDS[1] == None: - _logger.debug("%s failed to get max/min PandaIDs for the libDS" % self.token) - _logger.debug("%s end" % self.token) - return - # check - if tmpPandaIDsForLibDS[0].jobDefinitionID != tmpPandaIDsForLibDS[1].jobDefinitionID: - _logger.debug("%s multiple JobIDs use the libDS %s:%s %s:%s" % (self.token,tmpPandaIDsForLibDS[0].jobDefinitionID, - self.minPandaIDlibDS,tmpPandaIDsForLibDS[1].jobDefinitionID, - 
self.maxPandaIDlibDS)) - _logger.debug("%s end" % self.token) - return - # check excludedSite - if self.excludedSite == None: - self.excludedSite = [] - match = re.search("--excludedSite( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata) - if match != None: - self.excludedSite = match.group(3).split(',') - # remove empty - try: - self.excludedSite.remove('') - except: - pass - _logger.debug("%s excludedSite=%s" % (self.token,str(self.excludedSite))) - # check cloud - if self.cloud == None: - match = re.search("--cloud( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata) - if match != None: - self.cloud = match.group(3) - _logger.debug("%s cloud=%s" % (self.token,self.cloud)) - # get inDS/LFNs - status,tmpMapInDS,maxFileSize = self.taskBuffer.getInDatasetsForReBrokerage(self.jobID,self.userName) - if not status: - # failed - _logger.error("%s failed to get inDS/LFN from DB" % self.token) - return - status,inputDS = self.getListDatasetsUsedByJob(tmpMapInDS) - if not status: - # failed - _logger.error("%s failed" % self.token) - return - # get relicas - replicaMap = {} - unknownSites = {} - for tmpDS in inputDS: - if tmpDS.endswith('/'): - # container - status,tmpRepMaps = self.getListDatasetReplicasInContainer(tmpDS) - else: - # normal dataset - status,tmpRepMap = self.getListDatasetReplicas(tmpDS) - tmpRepMaps = {tmpDS:tmpRepMap} - if not status: - # failed - _logger.debug("%s failed" % self.token) - return - # make map per site - for tmpDS,tmpRepMap in tmpRepMaps.iteritems(): - for tmpSite,tmpStat in tmpRepMap.iteritems(): - # ignore special sites - if tmpSite in ['CERN-PROD_TZERO','CERN-PROD_DAQ','CERN-PROD_TMPDISK']: - continue - # ignore tape sites - if tmpSite.endswith('TAPE'): - continue - # keep sites with unknown replica info - if tmpStat[-1]['found'] == None: - if not unknownSites.has_key(tmpDS): - unknownSites[tmpDS] = [] - unknownSites[tmpDS].append(tmpSite) - # ignore ToBeDeleted - if tmpStat[-1]['archived'] in ['ToBeDeleted',]: - continue - # change EOS - 
if tmpSite.startswith('CERN-PROD_EOS'): - tmpSite = 'CERN-PROD_EOS' - # change EOS TMP - if tmpSite.startswith('CERN-PROD_TMP'): - tmpSite = 'CERN-PROD_TMP' - # change DISK to SCRATCHDISK - tmpSite = re.sub('_[^_-]+DISK$','',tmpSite) - # change PERF-XYZ to SCRATCHDISK - tmpSite = re.sub('_PERF-[^_-]+$','',tmpSite) - # change PHYS-XYZ to SCRATCHDISK - tmpSite = re.sub('_PHYS-[^_-]+$','',tmpSite) - # patch for BNLPANDA - if tmpSite in ['BNLPANDA']: - tmpSite = 'BNL-OSG2' - # add to map - if not replicaMap.has_key(tmpSite): - replicaMap[tmpSite] = {} - replicaMap[tmpSite][tmpDS] = tmpStat[-1] - _logger.debug("%s replica map -> %s" % (self.token,str(replicaMap))) - # refresh replica info in needed - self.refreshReplicaInfo(unknownSites) - # instantiate SiteMapper - siteMapper = SiteMapper(self.taskBuffer) - # get original DDM - origSiteDDM = self.getAggName(siteMapper.getSite(self.job.computingSite).ddm) - # check all datasets - maxDQ2Sites = [] - if inputDS != []: - # loop over all sites - for tmpSite,tmpDsVal in replicaMap.iteritems(): - # loop over all datasets - appendFlag = True - for tmpOrigDS in inputDS: - # check completeness - if tmpDsVal.has_key(tmpOrigDS) and tmpDsVal[tmpOrigDS]['found'] != None and \ - tmpDsVal[tmpOrigDS]['total'] == tmpDsVal[tmpOrigDS]['found']: - pass - else: - appendFlag = False - # append - if appendFlag: - if not tmpSite in maxDQ2Sites: - maxDQ2Sites.append(tmpSite) - _logger.debug("%s candidate DQ2s -> %s" % (self.token,str(maxDQ2Sites))) - if inputDS != [] and maxDQ2Sites == []: - _logger.debug("%s no DQ2 candidate" % self.token) - else: - maxPandaSites = [] - # original maxinputsize - origMaxInputSize = siteMapper.getSite(self.job.computingSite).maxinputsize - # look for Panda siteIDs - for tmpSiteID,tmpSiteSpec in siteMapper.siteSpecList.iteritems(): - # use ANALY_ only - if not tmpSiteID.startswith('ANALY_'): - continue - # remove test and local - if re.search('_test',tmpSiteID,re.I) != None: - continue - if 
re.search('_local',tmpSiteID,re.I) != None: - continue - # avoid same site - if self.avoidSameSite and self.getAggName(tmpSiteSpec.ddm) == origSiteDDM: - continue - # check DQ2 ID - if self.cloud in [None,tmpSiteSpec.cloud] \ - and (self.getAggName(tmpSiteSpec.ddm) in maxDQ2Sites or inputDS == []): - # excluded sites - excludedFlag = False - for tmpExcSite in self.excludedSite: - if re.search(tmpExcSite,tmpSiteID) != None: - excludedFlag = True - break - if excludedFlag: - _logger.debug("%s skip %s since excluded" % (self.token,tmpSiteID)) - continue - # use online only - if tmpSiteSpec.status != 'online': - _logger.debug("%s skip %s status=%s" % (self.token,tmpSiteID,tmpSiteSpec.status)) - continue - # check maxinputsize - if (maxFileSize == None and origMaxInputSize > siteMapper.getSite(tmpSiteID).maxinputsize) or \ - maxFileSize > siteMapper.getSite(tmpSiteID).maxinputsize: - _logger.debug("%s skip %s due to maxinputsize" % (self.token,tmpSiteID)) - continue - # append - if not tmpSiteID in maxPandaSites: - maxPandaSites.append(tmpSiteID) - # choose at most 20 sites randomly to avoid too many lookup - random.shuffle(maxPandaSites) - maxPandaSites = maxPandaSites[:20] - _logger.debug("%s candidate PandaSites -> %s" % (self.token,str(maxPandaSites))) - # no Panda siteIDs - if maxPandaSites == []: - _logger.debug("%s no Panda site candidate" % self.token) - else: - # set AtlasRelease and cmtConfig to dummy job - tmpJobForBrokerage = JobSpec() - if self.job.AtlasRelease in ['NULL',None]: - tmpJobForBrokerage.AtlasRelease = '' - else: - tmpJobForBrokerage.AtlasRelease = self.job.AtlasRelease - # use nightlies - matchNight = re.search('^AnalysisTransforms-.*_(rel_\d+)$',self.job.homepackage) - if matchNight != None: - tmpJobForBrokerage.AtlasRelease += ':%s' % matchNight.group(1) - # use cache - else: - matchCache = re.search('^AnalysisTransforms-([^/]+)',self.job.homepackage) - if matchCache != None: - tmpJobForBrokerage.AtlasRelease = 
matchCache.group(1).replace('_','-') - if not self.job.cmtConfig in ['NULL',None]: - tmpJobForBrokerage.cmtConfig = self.job.cmtConfig - # memory size - if not self.job.minRamCount in ['NULL',None,0]: - tmpJobForBrokerage.minRamCount = self.job.minRamCount - # CPU count - if not self.job.maxCpuCount in ['NULL',None,0]: - tmpJobForBrokerage.maxCpuCount = self.job.maxCpuCount - # run brokerage - brokerage.broker.schedule([tmpJobForBrokerage],self.taskBuffer,siteMapper,forAnalysis=True, - setScanSiteList=maxPandaSites,trustIS=True,reportLog=True) - newSiteID = tmpJobForBrokerage.computingSite - self.brokerageInfo += tmpJobForBrokerage.brokerageErrorDiag - _logger.debug("%s runBrokerage - > %s" % (self.token,newSiteID)) - # unknown site - if not siteMapper.checkSite(newSiteID): - _logger.error("%s unknown site" % self.token) - _logger.debug("%s failed" % self.token) - return - # get new site spec - newSiteSpec = siteMapper.getSite(newSiteID) - # avoid repetition - if self.getAggName(newSiteSpec.ddm) == origSiteDDM: - _logger.debug("%s assigned to the same site %s " % (self.token,newSiteID)) - _logger.debug("%s end" % self.token) - return - # simulation mode - if self.simulation: - _logger.debug("%s end simulation" % self.token) - return - # prepare jobs - status = self.prepareJob(newSiteID,newSiteSpec.cloud) - if status: - # run SetUpper - statusSetUp = self.runSetUpper() - if not statusSetUp: - _logger.debug("%s runSetUpper failed" % self.token) - else: - _logger.debug("%s successfully assigned to %s" % (self.token,newSiteID)) - _logger.debug("%s end" % self.token) - except: - errType,errValue,errTraceBack = sys.exc_info() - _logger.error("%s run() : %s %s" % (self.token,errType,errValue)) - - - # get aggregated DQ2 ID - def getAggName(self,origName): - if origName.startswith('CERN-PROD_EOS'): - return 'CERN-PROD_EOS' - if origName.startswith('CERN-PROD_TMP'): - return 'CERN-PROD_TMP' - return re.sub('_[^_-]+DISK$','',origName) - - - # lock job to disable multiple 
broker running in parallel - def lockJob(self,dn,jobID): - # make token - tmpProxy = DBProxy() - self.token = "%s:%s:" % (tmpProxy.cleanUserID(dn),jobID) - _logger.debug("%s lockJob" % self.token) - # lock - resST,resVal = self.taskBuffer.lockJobForReBrokerage(dn,jobID,self.simulation,self.forceOpt, - forFailed=self.forFailed) - # failed - if not resST: - _logger.debug("%s lockJob failed since %s" % (self.token,resVal['err'])) - return False,resVal['err'] - # keep jobID - self.jobID = jobID - # set PandaID,buildStatus,userName - self.rPandaID = resVal['rPandaID'] - self.bPandaID = resVal['bPandaID'] - self.userName = resVal['userName'] - self.buildStatus = resVal['bStatus'] - self.buildJobID = resVal['bJobID'] - self.minPandaIDlibDS = resVal['minPandaIDlibDS'] - self.maxPandaIDlibDS = resVal['maxPandaIDlibDS'] - # use JobID as rev num - self.revNum = self.taskBuffer.getJobIdUser(dn) - _logger.debug("%s run PandaID=%s / build PandaID=%s Status=%s JobID=%s rev=%s" % \ - (self.token,self.rPandaID,self.bPandaID,self.buildStatus, - self.buildJobID,self.revNum)) - # return - return True,'' - - - # move build job to jobsDefined4 - def prepareJob(self,site,cloud): - _logger.debug("%s prepareJob" % self.token) - # reuse buildJob + all runJobs - if self.jobID == self.buildJobID and self.buildStatus in ['defined','activated']: - if self.buildStatus == 'activated': - # move build job to jobsDefined4 - ret = self.taskBuffer.resetBuildJobForReBrokerage(self.bPandaID) - if not ret: - _logger.error("%s failed to move build job %s to jobsDefined" % (self.token,self.bPandaID)) - return False - # get PandaIDs from jobsDefined4 - tmpPandaIDs = self.taskBuffer.getPandaIDsForReBrokerage(self.userName,self.jobID,False, - forFailed=self.forFailed) - if tmpPandaIDs == []: - _logger.error("%s cannot find PandaDSs" % self.token) - return False - # get jobSpecs - iBunchJobs = 0 - nBunchJobs = 500 - tmpJobsMap = {} - while iBunchJobs < len(tmpPandaIDs): - # get IDs - tmpJobs = 
self.taskBuffer.peekJobs(tmpPandaIDs[iBunchJobs:iBunchJobs+nBunchJobs],True,False,False,False) - for tmpJob in tmpJobs: - if tmpJob != None and tmpJob.jobStatus in ['defined','assigned']: - # remove _sub suffix - for tmpFile in tmpJob.Files: - if tmpFile.type != 'input': - tmpFile.destinationDBlock = re.sub('_sub\d+$','',tmpFile.destinationDBlock) - self.pandaJobList.append(tmpJob) - # increment index - iBunchJobs += nBunchJobs - # make new bunch - else: - # make new buildJob - if self.bPandaID != None: - tmpJobs = self.taskBuffer.getFullJobStatus([self.bPandaID]) - if tmpJobs == [] or tmpJobs[0] == None: - _logger.debug("cannot find build job for PandaID=%s" % self.bPandaID) - return False - # make - tmpBuildJob,oldLibDS,newLibDS = self.makeNewBuildJobForRebrokerage(tmpJobs[0]) - # set parameters - tmpBuildJob.jobExecutionID = self.jobID - tmpBuildJob.jobsetID = -1 - tmpBuildJob.sourceSite = self.job.jobsetID - # register - status = self.registerNewDataset(newLibDS) - if not status: - _logger.debug("%s failed to register new libDS" % self.token) - return False - # append - self.pandaJobList = [tmpBuildJob] - # prepare outputDS - status = self.prepareDS() - if not status: - _logger.error("%s failed to prepare outputDS" % self.token) - return False - # get PandaIDs - if self.buildStatus in ['finished',None]: - # from jobsActivated when buildJob already finished or noBuild - tmpPandaIDs = self.taskBuffer.getPandaIDsForReBrokerage(self.userName,self.jobID,True, - forFailed=self.forFailed) - else: - # from jobsDefined - tmpPandaIDs = self.taskBuffer.getPandaIDsForReBrokerage(self.userName,self.jobID,False, - forFailed=self.forFailed) - if tmpPandaIDs == []: - _logger.error("%s cannot find PandaDSs" % self.token) - return False - # get jobSpecs - iBunchJobs = 0 - nBunchJobs = 500 - tmpJobsMap = {} - while iBunchJobs < len(tmpPandaIDs): - # get jobs - tmpJobs = self.taskBuffer.peekJobs(tmpPandaIDs[iBunchJobs:iBunchJobs+nBunchJobs],True,True,False,False,True) - for tmpJob 
in tmpJobs: - # reset parameters for retry - if self.forFailed and tmpJob != None: - self.taskBuffer.retryJob(tmpJob.PandaID,{},failedInActive=True, - changeJobInMem=True,inMemJob=tmpJob) - # set holding to be compatible with rebro jobs - tmpJob.jobStatus = 'holding' - # check job status. activated jobs were changed to holding by getPandaIDsForReBrokerage - if tmpJob != None and tmpJob.jobStatus in ['defined','assigned','holding']: - # reset parameter - tmpJob.parentID = tmpJob.PandaID - tmpJob.PandaID = None - tmpJob.jobExecutionID = tmpJob.jobDefinitionID - tmpJob.jobsetID = -1 - tmpJob.sourceSite = self.job.jobsetID - if self.bPandaID != None: - tmpJob.jobParameters = re.sub(oldLibDS,newLibDS,tmpJob.jobParameters) - for tmpFile in tmpJob.Files: - tmpFile.row_ID = None - tmpFile.PandaID = None - if tmpFile.type == 'input': - if self.bPandaID != None and tmpFile.dataset == oldLibDS: - tmpFile.status = 'unknown' - tmpFile.GUID = None - tmpFile.dataset = newLibDS - tmpFile.dispatchDBlock = newLibDS - tmpFile.lfn = re.sub(oldLibDS,newLibDS,tmpFile.lfn) - else: - # use new dataset - tmpFile.destinationDBlock = re.sub('_sub\d+$','',tmpFile.destinationDBlock) - if not self.newDatasetMap.has_key(tmpFile.destinationDBlock): - _logger.error("%s cannot find new dataset for %s:%s" % (self.token,tmpFile.PandaID,tmpFile.destinationDBlock)) - return False - tmpFile.destinationDBlock = self.newDatasetMap[tmpFile.destinationDBlock] - # append - self.pandaJobList.append(tmpJob) - # increment index - iBunchJobs += nBunchJobs - # no jobs - if self.pandaJobList == []: - _logger.error("%s no jobs" % self.token) - return False - # set cloud, site, and specialHandling - for tmpJob in self.pandaJobList: - # set specialHandling - if tmpJob.specialHandling in [None,'NULL','']: - if not self.forFailed: - tmpJob.specialHandling = 'rebro' - else: - tmpJob.specialHandling = 'sretry' - else: - if not self.forFailed: - tmpJob.specialHandling += ',rebro' - else: - tmpJob.specialHandling += 
',sretry' - # check if --destSE is used - oldComputingSite = tmpJob.computingSite - if tmpJob.destinationSE == oldComputingSite: - tmpJob.destinationSE = site - # set site and cloud - tmpJob.computingSite = site - tmpJob.cloud = cloud - # reset destinationDBlock - for tmpFile in tmpJob.Files: - if tmpFile.type in ['output','log']: - # set destSE - if tmpFile.destinationSE == oldComputingSite: - tmpFile.destinationSE = site - # set the same specialHandling since new build may have different specialHandling - self.pandaJobList[0].specialHandling = self.pandaJobList[-1].specialHandling - # return - return True - - - # prepare libDS - def prepareDS(self): - _logger.debug("%s prepareDS" % self.token) - # get all outDSs - shadowDsName = None - for tmpFile in self.job.Files: - if tmpFile.type in ['output','log']: - tmpDS = re.sub('_sub\d+$','',tmpFile.destinationDBlock) - # append new rev number - match = re.search('_rev(\d+)$',tmpDS) - if match == None: - newDS = tmpDS + '_rev%s' % self.revNum - else: - newDS = re.sub('_rev(\d+)$','_rev%s' % self.revNum,tmpDS) - # add shadow - """ - if shadowDsName == None and tmpFile.type == 'log': - shadowDsName = "%s_shadow" % newDS - status = self.registerNewDataset(shadowDsName) - if not status: - _logger.debug("%s prepareDS failed for shadow" % self.token) - return False - """ - # add datasets - if not tmpDS in self.newDatasetMap: - # register - status = self.registerNewDataset(newDS,tmpFile.dataset) - if not status: - _logger.debug("%s prepareDS failed" % self.token) - return False - # append - self.newDatasetMap[tmpDS] = newDS - return True - - - # run SetUpper - def runSetUpper(self): - # reuse buildJob + all runJobs - reuseFlag = False - if self.jobID == self.buildJobID and self.buildStatus in ['defined','activated']: - reuseFlag = True - _logger.debug("%s start Setupper for JobID=%s" % (self.token,self.jobID)) - thr = Setupper(self.taskBuffer,self.pandaJobList,resetLocation=True) - thr.start() - thr.join() - # new bunch - 
else: - # fake FQANs - fqans = [] - if not self.job.countryGroup in ['','NULL',None]: - fqans.append('/atlas/%s/Role=NULL' % self.job.countryGroup) - if self.job.destinationDBlock.startswith('group') and not self.job.workingGroup in ['','NULL',None]: - fqans.append('/atlas/%s/Role=production' % self.job.workingGroup) - # insert jobs - _logger.debug("%s start storeJobs for JobID=%s" % (self.token,self.jobID)) - ret = self.taskBuffer.storeJobs(self.pandaJobList,self.job.prodUserID,True,False,fqans, - self.job.creationHost,True,checkSpecialHandling=False) - if ret == []: - _logger.error("%s storeJobs failed with [] for JobID=%s" % (self.token,self.jobID)) - return False - # get PandaIDs to be killed - pandaIDsTobeKilled = [] - newJobDefinitionID = None - newJobsetID = None - strNewIDsList = [] - for tmpIndex,tmpItem in enumerate(ret): - if not tmpItem[0] in ['NULL',None]: - tmpJob = self.pandaJobList[tmpIndex] - if not tmpJob.parentID in [0,None,'NULL']: - pandaIDsTobeKilled.append(tmpJob.parentID) - if newJobDefinitionID == None: - newJobDefinitionID = tmpItem[1] - if newJobsetID == None: - newJobsetID = tmpItem[2]['jobsetID'] - strNewIDs = 'PandaID=%s JobsetID=%s JobID=%s' % (tmpItem[0],newJobsetID,newJobDefinitionID) - strNewIDsList.append(strNewIDs) - if pandaIDsTobeKilled != []: - strNewJobIDs = "JobsetID=%s JobID=%s" % (newJobsetID,newJobDefinitionID) - _logger.debug("%s kill jobs for JobID=%s -> new %s : %s" % \ - (self.token,self.jobID,strNewJobIDs,str(pandaIDsTobeKilled))) - for tmpIdx,tmpPandaID in enumerate(pandaIDsTobeKilled): - if not self.forFailed: - self.taskBuffer.killJobs([tmpPandaID],strNewIDsList[tmpIdx],'8',True) - else: - self.taskBuffer.killJobs([tmpPandaID],strNewIDsList[tmpIdx],'7',True) - # send brokerage info - if not self.forFailed: - tmpMsg = 'action=rebrokerage ntry=%s ' % self.pandaJobList[0].specialHandling.split(',').count('rebro') - else: - tmpMsg = 'action=serverretry ntry=%s ' % 
self.pandaJobList[0].specialHandling.split(',').count('sretry') - tmpMsg += 'old_jobset=%s old_jobdef=%s old_site=%s' % (self.job.jobsetID,self.jobID,self.job.computingSite) - self.brokerageInfo.append(tmpMsg) - brokerage.broker.sendMsgToLoggerHTTP(self.brokerageInfo,self.pandaJobList[0]) - # succeeded - _logger.debug("%s completed for JobID=%s" % (self.token,self.jobID)) - return True - - - # check DDM response - def isDQ2ok(self,out): - if out.find("DQ2 internal server exception") != -1 \ - or out.find("An error occurred on the central catalogs") != -1 \ - or out.find("MySQL server has gone away") != -1 \ - or out == '()': - return False - return True - - - # get list of datasets - def getListDatasets(self,dataset): - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug("%s %s/%s listDatasets %s" % (self.token,iDDMTry,nTry,dataset)) - status,out = ddm.DQ2.main('listDatasets',dataset,0,True) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result - if status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s bad DQ2 response for %s' % (self.token,dataset)) - return False,{} - try: - # convert res to map - exec "tmpDatasets = %s" % out - # remove _sub/_dis - resList = [] - for tmpDS in tmpDatasets.keys(): - if re.search('(_sub|_dis)\d+$',tmpDS) == None and re.search('(_shadow$',tmpDS) == None: - resList.append(tmpDS) - _logger.debug('%s getListDatasets->%s' % (self.token,str(resList))) - return True,resList - except: - _logger.error(self.token+' '+out) - _logger.error('%s could not convert HTTP-res to datasets for %s' % (self.token,dataset)) - return False,{} - - - # get list of replicas for a dataset - def getListDatasetReplicas(self,dataset): - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug("%s %s/%s listDatasetReplicas %s" % (self.token,iDDMTry,nTry,dataset)) - status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) - if status != 0 or (not self.isDQ2ok(out)): - 
time.sleep(60) - else: - break - # result - if status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s bad DQ2 response for %s' % (self.token,dataset)) - return False,{} - try: - # convert res to map - exec "tmpRepSites = %s" % out - _logger.debug('%s getListDatasetReplicas->%s' % (self.token,str(tmpRepSites))) - return True,tmpRepSites - except: - _logger.error(self.token+' '+out) - _logger.error('%s could not convert HTTP-res to replica map for %s' % (self.token,dataset)) - return False,{} - - - # get replicas for a container - def getListDatasetReplicasInContainer(self,container): - # response for failure - resForFailure = False,{} - # get datasets in container - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug('%s %s/%s listDatasetsInContainer %s' % (self.token,iDDMTry,nTry,container)) - status,out = ddm.DQ2.main('listDatasetsInContainer',container) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s bad DQ2 response for %s' % (self.token,container)) - return resForFailure - datasets = [] - try: - # convert to list - exec "datasets = %s" % out - except: - _logger.error('%s could not convert HTTP-res to dataset list for %s' % (self.token,container)) - return resForFailure - # loop over all datasets - allRepMap = {} - for dataset in datasets: - # get replicas - status,tmpRepSites = self.getListDatasetReplicas(dataset) - if not status: - return resForFailure - # append - allRepMap[dataset] = tmpRepSites - # return - _logger.debug('%s getListDatasetReplicasInContainer done') - return True,allRepMap - - - # delete original locations - def deleteDatasetReplicas(self,datasets): - # loop over all datasets - for dataset in datasets: - # get locations - status,tmpRepSites = self.getListDatasetReplicas(dataset) - if not status: - return False - # no replicas - if len(tmpRepSites.keys()) == 0: - 
continue - # delete - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug("%s %s/%s deleteDatasetReplicas %s" % (self.token,iDDMTry,nTry,dataset)) - status,out = ddm.DQ2.main('deleteDatasetReplicas',dataset,tmpRepSites.keys()) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result - if status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s bad DQ2 response for %s' % (self.token,dataset)) - return False - _logger.debug(self.token+' '+out) - # return - _logger.debug('%s deleted replicas for %s' % (self.token,str(datasets))) - return True - - - # check if datasets are empty - def checkDatasetContents(self,datasets): - # loop over all datasets - for dataset in datasets: - # check - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug("%s %s/%s getNumberOfFiles %s" % (self.token,iDDMTry,nTry,dataset)) - status,out = ddm.DQ2.main('getNumberOfFiles',dataset) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result - if status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s bad DQ2 response for %s' % (self.token,dataset)) - return False - # convert to int - _logger.debug(self.token+' '+out) - try: - nFile = int(out) - # not empty - if nFile != 0: - _logger.error('%s %s is not empty' % (self.token,dataset)) - return False - except: - _logger.error("%s could not convert HTTP-res to nFiles" % (self.token,dataset)) - return False - # all OK - return True - - - # register dataset - def registerNewDataset(self,dataset,container=''): - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug("%s %s/%s registerNewDataset %s" % (self.token,iDDMTry,nTry,dataset)) - status,out = ddm.DQ2.main('registerNewDataset',dataset) - if out.find('DQDatasetExistsException') != -1: - break - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result - if out.find('DQDatasetExistsException') != -1: - # ignore 
DQDatasetExistsException - pass - elif status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s failed to register new dataset %s' % (self.token,dataset)) - return False - # remove /CN=proxy and /CN=limited from DN - tmpRealDN = self.job.prodUserID - tmpRealDN = re.sub('/CN=limited proxy','',tmpRealDN) - tmpRealDN = re.sub('/CN=proxy','',tmpRealDN) - status,out = dq2Common.parse_dn(tmpRealDN) - if status != 0: - _logger.error(self.token+' '+out) - _logger.error('%s failed to truncate DN:%s' % (self.token,self.job.prodUserID)) - return False - tmpRealDN = out - # set owner - for iDDMTry in range(nTry): - _logger.debug("%s %s/%s setMetaDataAttribute %s %s" % (self.token,iDDMTry,nTry,dataset,tmpRealDN)) - status,out = ddm.DQ2.main('setMetaDataAttribute',dataset,'owner',tmpRealDN) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s failed to set owner to dataset %s' % (self.token,dataset)) - return False - # add to contaner - if container != '' and container.endswith('/'): - for iDDMTry in range(nTry): - _logger.debug("%s %s/%s registerDatasetsInContainer %s to %s" % (self.token,iDDMTry,nTry,dataset,container)) - status,out = ddm.DQ2.main('registerDatasetsInContainer',container,[dataset]) - if out.find('DQContainerAlreadyHasDataset') != -1: - break - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if out.find('DQContainerAlreadyHasDataset') != -1: - # ignore DQContainerAlreadyHasDataset - pass - elif status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s add %s to container:%s' % (self.token,dataset,container)) - return False - # return - return True - - - # get list of dataset used by the job - def getListDatasetsUsedByJob(self,mapDsLFN): - # response for failure - resForFailure = False,[] - # loop over all datasets - retList = [] - 
for tmpDsContainer,tmpLFNs in mapDsLFN.iteritems(): - # not a container - if not tmpDsContainer.endswith('/'): - if not tmpDsContainer in retList: - retList.append(tmpDsContainer) - continue - # get datasets in container - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug('%s %s/%s listDatasetsInContainer %s' % (self.token,iDDMTry,nTry,tmpDsContainer)) - status,out = ddm.DQ2.main('listDatasetsInContainer',tmpDsContainer) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s bad DQ2 response for %s' % (self.token,tmpDsContainer)) - return resForFailure - tmpDatasets = [] - try: - # convert to list - exec "tmpDatasets = %s" % out - except: - _logger.error('%s could not convert HTTP-res to dataset list for %s' % (self.token,tmpDsContainer)) - return resForFailure - # get files in dataset - for tmpDS in tmpDatasets: - if tmpDS in retList: - continue - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug('%s %s/%s listFilesInDataset %s' % (self.token,iDDMTry,nTry,tmpDS)) - status,out = ddm.DQ2.main('listFilesInDataset',tmpDS) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - if status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s bad DQ2 response for %s' % (self.token,tmpDS)) - return resForFailure - # get LFN map - tmpMapDQ2 = {} - try: - # convert to list - exec "tmpMapDQ2 = %s[0]" % out - for tmpGUID,tmpVal in tmpMapDQ2.iteritems(): - # check if a file in DS is used by the job - if tmpVal['lfn'] in tmpLFNs: - # append - if not tmpDS in retList: - retList.append(tmpDS) - break - except: - _logger.error('%s could not convert HTTP-res to LFN map for %s' % (self.token,tmpDS)) - return resForFailure - # return - _logger.debug('%s getListDatasetsUsedByJob done %s' % (self.token,str(retList))) - return True,retList - - - # refresh replica info in needed - def 
refreshReplicaInfo(self,unknownSites): - for tmpDS,sites in unknownSites.iteritems(): - nTry = 3 - for iDDMTry in range(nTry): - _logger.debug("%s %s/%s listFileReplicasBySites %s %s" % (self.token,iDDMTry,nTry,tmpDS,str(sites))) - status,out = ddm.DQ2_iter.listFileReplicasBySites(tmpDS,0,sites,0,300) - if status != 0 or (not self.isDQ2ok(out)): - time.sleep(60) - else: - break - # result - if status != 0 or out.startswith('Error'): - _logger.error(self.token+' '+out) - _logger.error('%s bad DQ2 response for %s' % (self.token,dataset)) - # return - return True - - - # check rev to avoid too many rebrokerage - def checkRev(self): - # check specialHandling - if self.job.specialHandling in [None,'NULL','']: - revNum = 0 - else: - revNum = self.job.specialHandling.split(',').count('rebro') - revNum += self.job.specialHandling.split(',').count('sretry') - # check with limit - if revNum < 5: - return True - return False - - - # make buildJob for re-brokerage - def makeNewBuildJobForRebrokerage(self,buildJob): - # new libDS - oldLibDS = buildJob.destinationDBlock - match = re.search('_rev(\d+)$',oldLibDS) - if match == None: - newLibDS = oldLibDS + '__id%s_rev%s' % (self.job.jobDefinitionID,self.revNum) - else: - newLibDS = re.sub('_rev(\d+)$','_rev%s' % self.revNum,oldLibDS) - # reset parameters - buildJob.PandaID = None - buildJob.jobStatus = None - buildJob.commandToPilot = None - buildJob.schedulerID = None - buildJob.pilotID = None - for attr in buildJob._attributes: - if attr.endswith('ErrorCode') or attr.endswith('ErrorDiag'): - setattr(buildJob,attr,None) - buildJob.transExitCode = None - buildJob.creationTime = datetime.datetime.utcnow() - buildJob.modificationTime = buildJob.creationTime - buildJob.startTime = None - buildJob.endTime = None - buildJob.destinationDBlock = newLibDS - buildJob.jobParameters = re.sub(oldLibDS,newLibDS,buildJob.jobParameters) - for tmpFile in buildJob.Files: - tmpFile.row_ID = None - tmpFile.GUID = None - tmpFile.status = 'unknown' - 
tmpFile.PandaID = None - tmpFile.dataset = newLibDS - tmpFile.destinationDBlock = tmpFile.dataset - tmpFile.lfn = re.sub(oldLibDS,newLibDS,tmpFile.lfn) - return buildJob,oldLibDS,newLibDS diff --git a/current/pandaserver/userinterface/UserIF.py b/current/pandaserver/userinterface/UserIF.py deleted file mode 100755 index 31aa5cc0c..000000000 --- a/current/pandaserver/userinterface/UserIF.py +++ /dev/null @@ -1,1570 +0,0 @@ -''' -provide web interface to users - -''' - -import re -import sys -import time -import types -import cPickle as pickle -import jobdispatcher.Protocol as Protocol -import brokerage.broker -import taskbuffer.ProcessGroups -from config import panda_config -from taskbuffer.JobSpec import JobSpec -from taskbuffer.WrappedPickle import WrappedPickle -from brokerage.SiteMapper import SiteMapper -from pandalogger.PandaLogger import PandaLogger -from RbLauncher import RbLauncher -from ReBroker import ReBroker -from taskbuffer import PrioUtil -from dataservice.DDM import dq2Info - -# logger -_logger = PandaLogger().getLogger('UserIF') - - -# main class -class UserIF: - # constructor - def __init__(self): - self.taskBuffer = None - - - # initialize - def init(self,taskBuffer): - self.taskBuffer = taskBuffer - - - # submit jobs - def submitJobs(self,jobsStr,user,host,userFQANs,prodRole=False,toPending=False): - try: - # deserialize jobspecs - jobs = WrappedPickle.loads(jobsStr) - _logger.debug("submitJobs %s len:%s FQAN:%s" % (user,len(jobs),str(userFQANs))) - maxJobs = 5000 - if len(jobs) > maxJobs: - _logger.error("too may jobs more than %s" % maxJobs) - jobs = jobs[:maxJobs] - except: - type, value, traceBack = sys.exc_info() - _logger.error("submitJobs : %s %s" % (type,value)) - jobs = [] - # check prodSourceLabel - try: - goodProdSourceLabel = True - for tmpJob in jobs: - # prevent internal jobs from being submitted from outside - if tmpJob.prodSourceLabel in taskbuffer.ProcessGroups.internalSourceLabels: - _logger.error("submitJobs %s wrong 
prodSourceLabel=%s" % (user,tmpJob.prodSourceLabel)) - goodProdSourceLabel = False - break - # check production role - if tmpJob.prodSourceLabel in ['managed']: - if not prodRole: - _logger.error("submitJobs %s missing prod-role for prodSourceLabel=%s" % (user,tmpJob.prodSourceLabel)) - goodProdSourceLabel = False - break - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("submitJobs : checking goodProdSourceLabel %s %s" % (errType,errValue)) - goodProdSourceLabel = False - # reject injection for bad prodSourceLabel - if not goodProdSourceLabel: - return "ERROR: production role is required for production jobs" - # store jobs - ret = self.taskBuffer.storeJobs(jobs,user,forkSetupper=True,fqans=userFQANs, - hostname=host,toPending=toPending) - _logger.debug("submitJobs %s ->:%s" % (user,len(ret))) - # serialize - return pickle.dumps(ret) - - - # logger interface - def sendLogInfo(self,user,msgType,msgListStr): - try: - # deserialize message - msgList = WrappedPickle.loads(msgListStr) - # short user name - cUID = self.taskBuffer.cleanUserID(user) - # logging - iMsg = 0 - for msgBody in msgList: - # make message - message = "dn='%s' %s" % (cUID,msgBody) - # send message to logger - if msgType in ['analy_brokerage']: - brokerage.broker.sendMsgToLogger(message) - # get logger - _pandaLogger = PandaLogger() - _pandaLogger.lock() - _pandaLogger.setParams({'Type':msgType}) - logger = _pandaLogger.getHttpLogger(panda_config.loggername) - # add message - logger.info(message) - # release HTTP handler - _pandaLogger.release() - # sleep - iMsg += 1 - if iMsg % 5 == 0: - time.sleep(1) - except: - pass - # return - return True - - - # run task assignment - def runTaskAssignment(self,jobsStr): - try: - # deserialize jobspecs - jobs = WrappedPickle.loads(jobsStr) - except: - type, value, traceBack = sys.exc_info() - _logger.error("runTaskAssignment : %s %s" % (type,value)) - jobs = [] - # run - ret = self.taskBuffer.runTaskAssignment(jobs) - # serialize - return 
pickle.dumps(ret) - - - # get serial number for group job - def getSerialNumberForGroupJob(self,name): - # get - ret = self.taskBuffer.getSerialNumberForGroupJob(name) - # serialize - return pickle.dumps(ret) - - - # change job priorities - def changeJobPriorities(self,user,prodRole,newPrioMapStr): - # check production role - if not prodRole: - return False,"production role is required" - try: - # deserialize map - newPrioMap = WrappedPickle.loads(newPrioMapStr) - _logger.debug("changeJobPriorities %s : %s" % (user,str(newPrioMap))) - # change - ret = self.taskBuffer.changeJobPriorities(newPrioMap) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("changeJobPriorities : %s %s" % (errType,errValue)) - return False,'internal server error' - # serialize - return ret - - - # run rebrokerage - def runReBrokerage(self,dn,jobID,cloud,excludedSite,forceRebro): - returnVal = "True" - try: - # lock job in simulation mode to check - checker = ReBroker(self.taskBuffer,simulation=True,userRequest=True) - stLock,retLock = checker.lockJob(dn,jobID) - # failed - if not stLock: - returnVal = "ERROR: "+retLock - return returnVal - # continue to run rebrokerage in background - if excludedSite in [None,'']: - # use None for empty excludedSite - excludedSite = None - _logger.debug("runReBrokerage %s JobID:%s cloud=%s ex=%s forceOpt=%s" % (dn,jobID,cloud,str(excludedSite),forceRebro)) - # instantiate ReBroker - thr = RbLauncher(dn,jobID,cloud,excludedSite) - # start ReBroker - thr.start() - except: - errType,errValue,errTraceBack = sys.exc_info() - _logger.error("runReBrokerage: %s %s" % (errType,errValue)) - returnVal = "ERROR: runReBrokerage crashed" - # return - return returnVal - - - # retry failed subjobs in running job - def retryFailedJobsInActive(self,dn,jobID): - returnVal = False - try: - _logger.debug("retryFailedJobsInActive %s JobID:%s" % (dn,jobID)) - cUID = self.taskBuffer.cleanUserID(dn) - # instantiate ReBroker - tmpRet = 
self.taskBuffer.retryJobsInActive(cUID,jobID) - returnVal = True - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("retryFailedJobsInActive: %s %s" % (errType,errValue)) - returnVal = "ERROR: server side crash" - # return - return returnVal - - - # set debug mode - def setDebugMode(self,dn,pandaID,prodManager,modeOn): - ret = self.taskBuffer.setDebugMode(dn,pandaID,prodManager,modeOn) - # return - return ret - - - # insert sandbox file info - def insertSandboxFileInfo(self,userName,hostName,fileName,fileSize,checkSum): - ret = self.taskBuffer.insertSandboxFileInfo(userName,hostName,fileName,fileSize,checkSum) - # return - return ret - - - # check duplicated sandbox file - def checkSandboxFile(self,userName,fileSize,checkSum): - ret = self.taskBuffer.checkSandboxFile(userName,fileSize,checkSum) - # return - return ret - - - # get job status - def getJobStatus(self,idsStr): - try: - # deserialize jobspecs - ids = WrappedPickle.loads(idsStr) - _logger.debug("getJobStatus len : %s" % len(ids)) - maxIDs = 5500 - if len(ids) > maxIDs: - _logger.error("too long ID list more than %s" % maxIDs) - ids = ids[:maxIDs] - except: - type, value, traceBack = sys.exc_info() - _logger.error("getJobStatus : %s %s" % (type,value)) - ids = [] - _logger.debug("getJobStatus start : %s" % ids) - # peek jobs - ret = self.taskBuffer.peekJobs(ids) - _logger.debug("getJobStatus end") - # serialize - return pickle.dumps(ret) - - - # get PandaID with jobexeID - def getPandaIDwithJobExeID(self,idsStr): - try: - # deserialize jobspecs - ids = WrappedPickle.loads(idsStr) - _logger.debug("getPandaIDwithJobExeID len : %s" % len(ids)) - maxIDs = 5500 - if len(ids) > maxIDs: - _logger.error("too long ID list more than %s" % maxIDs) - ids = ids[:maxIDs] - except: - errtype,errvalue = sys.exc_info()[:2] - _logger.error("getPandaIDwithJobExeID : %s %s" % (errtype,errvalue)) - ids = [] - _logger.debug("getPandaIDwithJobExeID start : %s" % ids) - # peek jobs - ret = 
self.taskBuffer.getPandaIDwithJobExeID(ids) - _logger.debug("getPandaIDwithJobExeID end") - # serialize - return pickle.dumps(ret) - - - # get assigned cloud for tasks - def seeCloudTask(self,idsStr): - try: - # deserialize jobspecs - ids = WrappedPickle.loads(idsStr) - except: - type, value, traceBack = sys.exc_info() - _logger.error("seeCloudTask : %s %s" % (type,value)) - ids = [] - _logger.debug("seeCloudTask start : %s" % ids) - # peek jobs - ret = {} - for id in ids: - tmpRet = self.taskBuffer.seeCloudTask(id) - ret[id] = tmpRet - _logger.debug("seeCloudTask end") - # serialize - return pickle.dumps(ret) - - - # get active datasets - def getActiveDatasets(self,computingSite,prodSourceLabel): - # run - ret = self.taskBuffer.getActiveDatasets(computingSite,prodSourceLabel) - # return - return ret - - - # get assigning task - def getAssigningTask(self): - # run - ret = self.taskBuffer.getAssigningTask() - # serialize - return pickle.dumps(ret) - - - # set task by user - def setCloudTaskByUser(self,user,tid,cloud,status): - # run - ret = self.taskBuffer.setCloudTaskByUser(user,tid,cloud,status) - return ret - - - # add files to memcached - def addFilesToMemcached(self,site,node,files): - # add - ret = self.taskBuffer.addFilesToMemcached(site,node,files) - # return - return ret - - - # delete files from memcached - def deleteFilesFromMemcached(self,site,node,files): - # delete - ret = self.taskBuffer.deleteFilesFromMemcached(site,node,files) - # return - return ret - - - # flush memcached - def flushMemcached(self,site,node): - # flush - ret = self.taskBuffer.flushMemcached(site,node) - # return - return ret - - - # check files with memcached - def checkFilesWithMemcached(self,site,node,files): - # check - ret = self.taskBuffer.checkFilesWithMemcached(site,node,files) - # return - return ret - - - # get job statistics - def getJobStatistics(self,sourcetype=None): - # get job statistics - ret = self.taskBuffer.getJobStatisticsForExtIF(sourcetype) - # serialize - 
return pickle.dumps(ret) - - - # get highest prio jobs - def getHighestPrioJobStat(self,perPG=False,useMorePG=False): - # get job statistics - ret = self.taskBuffer.getHighestPrioJobStat(perPG,useMorePG) - # serialize - return pickle.dumps(ret) - - - # get queued analysis jobs at a site - def getQueuedAnalJobs(self,site,dn): - # get job statistics - ret = self.taskBuffer.getQueuedAnalJobs(site,dn) - # serialize - return pickle.dumps(ret) - - - # get job statistics for Bamboo - def getJobStatisticsForBamboo(self,useMorePG=False): - # get job statistics - ret = self.taskBuffer.getJobStatisticsForBamboo(useMorePG) - # serialize - return pickle.dumps(ret) - - - # get job statistics per site - def getJobStatisticsPerSite(self,predefined=False,workingGroup='',countryGroup='',jobType='', - minPriority=None,readArchived=True): - # get job statistics - ret = self.taskBuffer.getJobStatistics(readArchived,predefined,workingGroup,countryGroup,jobType, - minPriority=minPriority) - # serialize - return pickle.dumps(ret) - - - # get the number of waiting jobs per site and use - def getJobStatisticsPerUserSite(self): - # get job statistics - ret = self.taskBuffer.getJobStatisticsPerUserSite() - # serialize - return pickle.dumps(ret) - - - # get job statistics per site with label - def getJobStatisticsWithLabel(self,site): - # get job statistics - ret = self.taskBuffer.getJobStatisticsWithLabel(site) - # serialize - return pickle.dumps(ret) - - - # query PandaIDs - def queryPandaIDs(self,idsStr): - # deserialize IDs - ids = WrappedPickle.loads(idsStr) - # query PandaIDs - ret = self.taskBuffer.queryPandaIDs(ids) - # serialize - return pickle.dumps(ret) - - - # get number of analysis jobs per user - def getNUserJobs(self,siteName,nJobs): - # get - ret = self.taskBuffer.getNUserJobs(siteName,nJobs) - # serialize - return pickle.dumps(ret) - - - # query job info per cloud - def queryJobInfoPerCloud(self,cloud,schedulerID): - # query PandaIDs - ret = 
self.taskBuffer.queryJobInfoPerCloud(cloud,schedulerID) - # serialize - return pickle.dumps(ret) - - - # query PandaIDs at site - def getPandaIDsSite(self,site,status,limit): - # query PandaIDs - ret = self.taskBuffer.getPandaIDsSite(site,status,limit) - # serialize - return pickle.dumps(ret) - - - # get PandaIDs to be updated in prodDB - def getJobsToBeUpdated(self,limit,lockedby): - # query PandaIDs - ret = self.taskBuffer.getPandaIDsForProdDB(limit,lockedby) - # serialize - return pickle.dumps(ret) - - - # update prodDBUpdateTimes - def updateProdDBUpdateTimes(self,paramsStr): - # deserialize IDs - params = WrappedPickle.loads(paramsStr) - # get jobs - ret = self.taskBuffer.updateProdDBUpdateTimes(params) - # serialize - return pickle.dumps(True) - - - # query last files in datasets - def queryLastFilesInDataset(self,datasetStr): - # deserialize names - datasets = WrappedPickle.loads(datasetStr) - # get files - ret = self.taskBuffer.queryLastFilesInDataset(datasets) - # serialize - return pickle.dumps(ret) - - - # get input files currently in used for analysis - def getFilesInUseForAnal(self,outDataset): - # get files - ret = self.taskBuffer.getFilesInUseForAnal(outDataset) - # serialize - return pickle.dumps(ret) - - - # get list of dis dataset to get input files in shadow - def getDisInUseForAnal(self,outDataset): - # get files - ret = self.taskBuffer.getDisInUseForAnal(outDataset) - # serialize - return pickle.dumps(ret) - - - # get input LFNs currently in use for analysis with shadow dis - def getLFNsInUseForAnal(self,inputDisListStr): - # deserialize IDs - inputDisList = WrappedPickle.loads(inputDisListStr) - # get files - ret = self.taskBuffer.getLFNsInUseForAnal(inputDisList) - # serialize - return pickle.dumps(ret) - - - # kill jobs - def killJobs(self,idsStr,user,host,code,prodManager,useMailAsID,fqans): - # deserialize IDs - ids = WrappedPickle.loads(idsStr) - if not isinstance(ids,types.ListType): - ids = [ids] - _logger.debug("killJob : %s %s %s %s 
%s" % (user,code,prodManager,fqans,ids)) - try: - if useMailAsID: - _logger.debug("killJob : getting mail address for %s" % user) - realDN = re.sub('/CN=limited proxy','',user) - realDN = re.sub('(/CN=proxy)+','',realDN) - nTry = 3 - for iDDMTry in range(nTry): - status,out = dq2Info.finger(realDN) - if status == 0: - exec "userInfo=%s" % out - _logger.debug("killJob : %s is converted to %s" % (user,userInfo['email'])) - user = userInfo['email'] - break - time.sleep(1) - except: - errType,errValue = sys.exc_info()[:2] - _logger.error("killJob : failed to convert email address %s : %s %s" % (user,errType,errValue)) - # get working groups with prod role - wgProdRole = [] - for fqan in fqans: - tmpMatch = re.search('/atlas/([^/]+)/Role=production',fqan) - if tmpMatch != None: - # ignore usatlas since it is used as atlas prod role - tmpWG = tmpMatch.group(1) - if not tmpWG in ['','usatlas']+wgProdRole: - wgProdRole.append(tmpWG) - # group production - wgProdRole.append('gr_%s' % tmpWG) - # kill jobs - ret = self.taskBuffer.killJobs(ids,user,code,prodManager,wgProdRole) - # logging - try: - # make message - message = '%s - PandaID =' % host - maxID = 10 - for id in ids[:maxID]: - message += ' %s' % id - if len(ids) > maxID: - message += ' ...' 
- # get logger - _pandaLogger = PandaLogger() - _pandaLogger.lock() - _pandaLogger.setParams({'Type':'killJobs','User':user}) - logger = _pandaLogger.getHttpLogger(panda_config.loggername) - # add message - logger.info(message) - # release HTTP handler - _pandaLogger.release() - except: - pass - # serialize - return pickle.dumps(ret) - - - # reassign jobs - def reassignJobs(self,idsStr,user,host,forPending): - # deserialize IDs - ids = WrappedPickle.loads(idsStr) - # reassign jobs - ret = self.taskBuffer.reassignJobs(ids,forkSetupper=True,forPending=forPending) - # logging - try: - # make message - message = '%s - PandaID =' % host - maxID = 10 - for id in ids[:maxID]: - message += ' %s' % id - if len(ids) > maxID: - message += ' ...' - # get logger - _pandaLogger = PandaLogger() - _pandaLogger.lock() - _pandaLogger.setParams({'Type':'reassignJobs','User':user}) - logger = _pandaLogger.getHttpLogger(panda_config.loggername) - # add message - logger.info(message) - # release HTTP handler - _pandaLogger.release() - except: - pass - # serialize - return pickle.dumps(ret) - - - # resubmit jobs - def resubmitJobs(self,idsStr): - # deserialize IDs - ids = WrappedPickle.loads(idsStr) - # kill jobs - ret = self.taskBuffer.resubmitJobs(ids) - # serialize - return pickle.dumps(ret) - - - # get list of site spec - def getSiteSpecs(self,siteType='analysis'): - # get analysis site list - specList = {} - siteMapper = SiteMapper(self.taskBuffer) - for id,spec in siteMapper.siteSpecList.iteritems(): - if siteType == 'all' or spec.type == siteType: - # convert to map - tmpSpec = {} - for attr in spec._attributes: - tmpSpec[attr] = getattr(spec,attr) - specList[id] = tmpSpec - # serialize - return pickle.dumps(specList) - - - # get list of cloud spec - def getCloudSpecs(self): - # get cloud list - siteMapper = SiteMapper(self.taskBuffer) - # serialize - return pickle.dumps(siteMapper.cloudSpec) - - - # get list of cache prefix - def getCachePrefixes(self): - # get - ret = 
self.taskBuffer.getCachePrefixes() - # serialize - return pickle.dumps(ret) - - - # get nPilots - def getNumPilots(self): - # get nPilots - ret = self.taskBuffer.getCurrentSiteData() - numMap = {} - for siteID,siteNumMap in ret.iteritems(): - nPilots = 0 - # nPilots = getJob+updateJob - if siteNumMap.has_key('getJob'): - nPilots += siteNumMap['getJob'] - if siteNumMap.has_key('updateJob'): - nPilots += siteNumMap['updateJob'] - # append - numMap[siteID] = {'nPilots':nPilots} - # serialize - return pickle.dumps(numMap) - - - # run brokerage - def runBrokerage(self,sitesStr,cmtConfig,atlasRelease,trustIS=False,processingType=None, - dn=None,loggingFlag=False,memorySize=None,workingGroup=None,fqans=[], - nJobs=None,preferHomeCountry=False,siteReliability=None,maxCpuCount=None): - if not loggingFlag: - ret = 'NULL' - else: - ret = {'site':'NULL','logInfo':[]} - try: - # deserialize sites - sites = WrappedPickle.loads(sitesStr) - # instantiate siteMapper - siteMapper = SiteMapper(self.taskBuffer) - # instantiate job - job = JobSpec() - job.AtlasRelease = atlasRelease - job.cmtConfig = cmtConfig - if processingType != None: - job.processingType = processingType - if memorySize != None: - job.minRamCount = memorySize - if workingGroup != None: - userDefinedWG = True - validWorkingGroup = True - job.workingGroup = workingGroup - else: - userDefinedWG = False - validWorkingGroup = False - if maxCpuCount != None: - job.maxCpuCount = maxCpuCount - # get parameters related to priority - withProdRole,workingGroup,priorityOffset,serNum,weight = self.taskBuffer.getPrioParameters([job],dn,fqans, - userDefinedWG, - validWorkingGroup) - # get min priority using nJobs - try: - nJobs = long(nJobs) - except: - # use 200 as a default # of jobs - nJobs =200 - minPrio = PrioUtil.calculatePriority(priorityOffset,serNum+nJobs,weight) - # get countryGroup - prefCountries = [] - if preferHomeCountry: - for tmpFQAN in fqans: - match = re.search('^/atlas/([^/]+)/',tmpFQAN) - if match != None: - 
tmpCountry = match.group(1) - # use country code or usatlas - if len(tmpCountry) == 2: - prefCountries.append(tmpCountry) - break - # usatlas - if tmpCountry in ['usatlas']: - prefCountries.append('us') - break - # run brokerage - _logger.debug("runBrokerage for dn=%s FQAN=%s minPrio=%s preferred:%s:%s" % (dn,str(fqans),minPrio, - preferHomeCountry, - str(prefCountries))) - brokerage.broker.schedule([job],self.taskBuffer,siteMapper,True,sites,trustIS,dn, - reportLog=loggingFlag,minPriority=minPrio,preferredCountries=prefCountries, - siteReliability=siteReliability) - # get computingSite - if not loggingFlag: - ret = job.computingSite - else: - ret = pickle.dumps({'site':job.computingSite,'logInfo':job.brokerageErrorDiag}) - except: - type, value, traceBack = sys.exc_info() - _logger.error("runBrokerage : %s %s" % (type,value)) - return ret - - - # get script for offline running - def getScriptOfflineRunning(self,pandaID): - # register - ret = self.taskBuffer.getScriptOfflineRunning(pandaID) - # return - return ret - - - # register proxy key - def registerProxyKey(self,params): - # register - ret = self.taskBuffer.registerProxyKey(params) - # return - return ret - - - # get client version - def getPandaClientVer(self): - # get - ret = self.taskBuffer.getPandaClientVer() - # return - return ret - - - # get proxy key - def getProxyKey(self,dn): - # get files - ret = self.taskBuffer.getProxyKey(dn) - # serialize - return pickle.dumps(ret) - - - # get slimmed file info with PandaIDs - def getSlimmedFileInfoPandaIDs(self,pandaIDsStr,dn): - try: - # deserialize IDs - pandaIDs = WrappedPickle.loads(pandaIDsStr) - # truncate - maxIDs = 5500 - if len(pandaIDs) > maxIDs: - _logger.error("too long ID list more than %s" % maxIDs) - pandaIDs = pandaIDs[:maxIDs] - # get - _logger.debug("getSlimmedFileInfoPandaIDs start : %s %s" % (dn,len(pandaIDs))) - ret = self.taskBuffer.getSlimmedFileInfoPandaIDs(pandaIDs) - _logger.debug("getSlimmedFileInfoPandaIDs end") - except: - ret = {} 
- # serialize - return pickle.dumps(ret) - - - # get JobIDs in a time range - def getJobIDsInTimeRange(self,dn,timeRange): - # get IDs - ret = self.taskBuffer.getJobIDsInTimeRange(dn,timeRange) - # serialize - return pickle.dumps(ret) - - - # get PandaIDs for a JobID - def getPandIDsWithJobID(self,dn,jobID,nJobs): - # get IDs - ret = self.taskBuffer.getPandIDsWithJobID(dn,jobID,nJobs) - # serialize - return pickle.dumps(ret) - - - # check merge job generation status - def checkMergeGenerationStatus(self,dn,jobID): - # check - ret = self.taskBuffer.checkMergeGenerationStatus(dn,jobID) - # serialize - return pickle.dumps(ret) - - - # get full job status - def getFullJobStatus(self,idsStr,dn): - try: - # deserialize jobspecs - ids = WrappedPickle.loads(idsStr) - # truncate - maxIDs = 5500 - if len(ids) > maxIDs: - _logger.error("too long ID list more than %s" % maxIDs) - ids = ids[:maxIDs] - except: - type, value, traceBack = sys.exc_info() - _logger.error("getFullJobStatus : %s %s" % (type,value)) - ids = [] - _logger.debug("getFullJobStatus start : %s %s" % (dn,str(ids))) - # peek jobs - ret = self.taskBuffer.getFullJobStatus(ids) - _logger.debug("getFullJobStatus end") - # serialize - return pickle.dumps(ret) - - - # add account to siteaccess - def addSiteAccess(self,siteID,dn): - # add - ret = self.taskBuffer.addSiteAccess(siteID,dn) - # serialize - return pickle.dumps(ret) - - - # list site access - def listSiteAccess(self,siteID,dn,longFormat=False): - # list - ret = self.taskBuffer.listSiteAccess(siteID,dn,longFormat) - # serialize - return pickle.dumps(ret) - - - # update site access - def updateSiteAccess(self,method,siteid,requesterDN,userName,attrValue): - # list - ret = self.taskBuffer.updateSiteAccess(method,siteid,requesterDN,userName,attrValue) - # serialize - return str(ret) - - -# Singleton -userIF = UserIF() -del UserIF - - -# get FQANs -def _getFQAN(req): - fqans = [] - for tmpKey,tmpVal in req.subprocess_env.iteritems(): - # compact credentials - 
if tmpKey.startswith('GRST_CRED_'): - # VOMS attribute - if tmpVal.startswith('VOMS'): - # FQAN - fqan = tmpVal.split()[-1] - # append - fqans.append(fqan) - # old style - elif tmpKey.startswith('GRST_CONN_'): - tmpItems = tmpVal.split(':') - # FQAN - if len(tmpItems)==2 and tmpItems[0]=='fqan': - fqans.append(tmpItems[-1]) - # return - return fqans - - -# get DN -def _getDN(req): - realDN = '' - if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - realDN = req.subprocess_env['SSL_CLIENT_S_DN'] - # remove redundant CN - realDN = re.sub('/CN=limited proxy','',realDN) - realDN = re.sub('/CN=proxy(/CN=proxy)+','/CN=proxy',realDN) - return realDN - - -# check role -def _isProdRoleATLAS(req): - # check role - prodManager = False - # get FQANs - fqans = _getFQAN(req) - # loop over all FQANs - for fqan in fqans: - # check production role - for rolePat in ['/atlas/usatlas/Role=production','/atlas/Role=production']: - if fqan.startswith(rolePat): - return True - return False - - - -""" -web service interface - -""" - -# security check -def isSecure(req): - # check security - if not Protocol.isSecure(req): - return False - # disable limited proxy - if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']: - _logger.warning("access via limited proxy : %s" % req.subprocess_env['SSL_CLIENT_S_DN']) - return False - return True - - -# submit jobs -def submitJobs(req,jobs,toPending=None): - # check security - if not isSecure(req): - return False - # get DN - user = None - if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - user = _getDN(req) - # get FQAN - fqans = _getFQAN(req) - # hostname - host = req.get_remote_host() - # production Role - prodRole = _isProdRoleATLAS(req) - # to pending - if toPending == 'True': - toPending = True - else: - toPending = False - return userIF.submitJobs(jobs,user,host,fqans,prodRole,toPending) - - -# run task assignment -def runTaskAssignment(req,jobs): - # check security - if not isSecure(req): - return "False" - return 
userIF.runTaskAssignment(jobs) - - -# get job status -def getJobStatus(req,ids): - return userIF.getJobStatus(ids) - - -# get PandaID with jobexeID -def getPandaIDwithJobExeID(req,ids): - return userIF.getPandaIDwithJobExeID(ids) - - -# get queued analysis jobs at a site -def getQueuedAnalJobs(req,site): - # check security - if not isSecure(req): - return "ERROR: SSL is required" - # get DN - user = None - if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - user = _getDN(req) - return userIF.getQueuedAnalJobs(site,user) - - -# get active datasets -def getActiveDatasets(req,computingSite,prodSourceLabel='managed'): - return userIF.getActiveDatasets(computingSite,prodSourceLabel) - - -# get assigning task -def getAssigningTask(req): - return userIF.getAssigningTask() - - -# get assigned cloud for tasks -def seeCloudTask(req,ids): - return userIF.seeCloudTask(ids) - - -# set task by user -def setCloudTaskByUser(req,tid,cloud='',status=''): - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "ERROR: SSL connection is required" - user = _getDN(req) - # check role - if not _isProdRoleATLAS(req): - return "ERROR: production role is required" - return userIF.setCloudTaskByUser(user,tid,cloud,status) - - -# set debug mode -def setDebugMode(req,pandaID,modeOn): - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "ERROR: SSL connection is required" - user = _getDN(req) - # check role - prodManager = _isProdRoleATLAS(req) - # mode - if modeOn == 'True': - modeOn = True - else: - modeOn = False - # exec - return userIF.setDebugMode(user,pandaID,prodManager,modeOn) - - -# insert sandbox file info -def insertSandboxFileInfo(req,userName,fileName,fileSize,checkSum): - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "ERROR: SSL connection is required" - user = _getDN(req) - # check role - prodManager = _isProdRoleATLAS(req) - if not prodManager: - return "ERROR: missing role" - # hostname - hostName = 
req.get_remote_host() - # exec - return userIF.insertSandboxFileInfo(userName,hostName,fileName,fileSize,checkSum) - - -# check duplicated sandbox file -def checkSandboxFile(req,fileSize,checkSum): - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "ERROR: SSL connection is required" - user = _getDN(req) - # exec - return userIF.checkSandboxFile(user,fileSize,checkSum) - - -# add files to memcached -def addFilesToCacheDB(req,site,node,guids='',lfns=''): - # exec - return userIF.addFilesToMemcached(site,node,lfns) - - -# delete files from memcached -def deleteFilesFromCacheDB(req,site,node,guids='',lfns=''): - # exec - return userIF.deleteFilesFromMemcached(site,node,lfns) - - -# flush memcached -def flushCacheDB(req,site,node): - # exec - return userIF.flushMemcached(site,node) - - -# check files with memcached -def checkFilesWithCacheDB(req,site,node,guids='',lfns=''): - # exec - return userIF.checkFilesWithMemcached(site,node,lfns) - - -# query PandaIDs -def queryPandaIDs(req,ids): - return userIF.queryPandaIDs(ids) - - -# query job info per cloud -def queryJobInfoPerCloud(req,cloud,schedulerID=None): - return userIF.queryJobInfoPerCloud(cloud,schedulerID) - - -# get PandaIDs at site -def getPandaIDsSite(req,site,status,limit=500): - return userIF.getPandaIDsSite(site,status,limit) - - -# get PandaIDs to be updated in prodDB -def getJobsToBeUpdated(req,limit=5000,lockedby=''): - limit = int(limit) - return userIF.getJobsToBeUpdated(limit,lockedby) - - -# update prodDBUpdateTimes -def updateProdDBUpdateTimes(req,params): - # check security - if not isSecure(req): - return False - return userIF.updateProdDBUpdateTimes(params) - - -# get job statistics -def getJobStatistics(req,sourcetype=None): - return userIF.getJobStatistics(sourcetype) - - -# get highest prio jobs -def getHighestPrioJobStat(req,perPG=None,useMorePG=None): - if perPG == 'True': - perPG = True - else: - perPG = False - if useMorePG == 'True': - useMorePG = 
taskbuffer.ProcessGroups.extensionLevel_1 - elif useMorePG in ['False',None]: - useMorePG = False - else: - try: - useMorePG = int(useMorePG) - except: - useMorePG = False - return userIF.getHighestPrioJobStat(perPG,useMorePG) - - -# get job statistics for Babmoo -def getJobStatisticsForBamboo(req,useMorePG=None): - if useMorePG == 'True': - useMorePG = taskbuffer.ProcessGroups.extensionLevel_1 - elif useMorePG in ['False',None]: - useMorePG = False - else: - try: - useMorePG = int(useMorePG) - except: - useMorePG = False - return userIF.getJobStatisticsForBamboo(useMorePG) - - -# get the number of waiting jobs per site and user -def getJobStatisticsPerUserSite(req): - return userIF.getJobStatisticsPerUserSite() - - -# get job statistics per site -def getJobStatisticsPerSite(req,predefined='False',workingGroup='',countryGroup='',jobType='', - minPriority=None,readArchived=None): - if predefined=='True': - predefined=True - else: - predefined=False - if minPriority != None: - try: - minPriority = int(minPriority) - except: - minPriority = None - if readArchived=='True': - readArchived = True - elif readArchived=='False': - readArchived = False - else: - host = req.get_remote_host() - # read jobsArchived for panglia - if re.search('panglia.*\.triumf\.ca$',host) != None or host in ['gridweb.triumf.ca']: - readArchived = True - else: - readArchived = False - return userIF.getJobStatisticsPerSite(predefined,workingGroup,countryGroup,jobType, - minPriority,readArchived) - - -# get job statistics per site with label -def getJobStatisticsWithLabel(req,site=''): - return userIF.getJobStatisticsWithLabel(site) - - -# query last files in datasets -def queryLastFilesInDataset(req,datasets): - return userIF.queryLastFilesInDataset(datasets) - - -# get input files currently in used for analysis -def getFilesInUseForAnal(req,outDataset): - return userIF.getFilesInUseForAnal(outDataset) - - -# get list of dis dataset to get input files in shadow -def 
getDisInUseForAnal(req,outDataset): - return userIF.getDisInUseForAnal(outDataset) - - -# get input LFNs currently in use for analysis with shadow dis -def getLFNsInUseForAnal(req,inputDisList): - return userIF.getLFNsInUseForAnal(inputDisList) - - -# kill jobs -def killJobs(req,ids,code=None,useMailAsID=None): - # check security - if not isSecure(req): - return False - # get DN - user = None - if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - user = _getDN(req) - # check role - prodManager = False - # get FQANs - fqans = _getFQAN(req) - # loop over all FQANs - for fqan in fqans: - # check production role - for rolePat in ['/atlas/usatlas/Role=production','/atlas/Role=production']: - if fqan.startswith(rolePat): - prodManager = True - break - # escape - if prodManager: - break - # use email address as ID - if useMailAsID == 'True': - useMailAsID = True - else: - useMailAsID = False - # hostname - host = req.get_remote_host() - return userIF.killJobs(ids,user,host,code,prodManager,useMailAsID,fqans) - - -# reassign jobs -def reassignJobs(req,ids,forPending=None): - # check security - if not isSecure(req): - return False - # get DN - user = None - if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - user = _getDN(req) - # hostname - host = req.get_remote_host() - # for pending - if forPending == 'True': - forPending = True - else: - forPending = False - return userIF.reassignJobs(ids,user,host,forPending) - - -# resubmit jobs -def resubmitJobs(req,ids): - # check security - if not isSecure(req): - return False - return userIF.resubmitJobs(ids) - - -# change job priorities -def changeJobPriorities(req,newPrioMap=None): - # check security - if not isSecure(req): - return pickle.dumps((False,'secure connection is required')) - # get DN - user = None - if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - user = _getDN(req) - # check role - prodRole = _isProdRoleATLAS(req) - ret = userIF.changeJobPriorities(user,prodRole,newPrioMap) - return pickle.dumps(ret) - - -# get 
list of site spec -def getSiteSpecs(req,siteType=None): - if siteType != None: - return userIF.getSiteSpecs(siteType) - else: - return userIF.getSiteSpecs() - -# get list of cloud spec -def getCloudSpecs(req): - return userIF.getCloudSpecs() - -# get list of cache prefix -def getCachePrefixes(req): - return userIF.getCachePrefixes() - -# get client version -def getPandaClientVer(req): - return userIF.getPandaClientVer() - -# get nPilots -def getNumPilots(req): - return userIF.getNumPilots() - -# run brokerage -def runBrokerage(req,sites,cmtConfig=None,atlasRelease=None,trustIS=False,processingType=None, - loggingFlag=False,memorySize=None,workingGroup=None,nJobs=None, - siteGroup=None,maxCpuCount=None): - if trustIS=='True': - trustIS = True - else: - trustIS = False - if loggingFlag=='True': - loggingFlag = True - else: - loggingFlag = False - if memorySize != None: - try: - memorySize = long(memorySize) - except: - pass - if siteGroup != None: - try: - siteGroup = int(siteGroup) - except: - siteGroup = None - if maxCpuCount != None: - try: - maxCpuCount = int(maxCpuCount) - except: - maxCpuCount = None - preferHomeCountry = True - dn = _getDN(req) - fqans = _getFQAN(req) - return userIF.runBrokerage(sites,cmtConfig,atlasRelease,trustIS,processingType,dn, - loggingFlag,memorySize,workingGroup,fqans,nJobs,preferHomeCountry, - siteGroup,maxCpuCount) - -# run rebrokerage -def runReBrokerage(req,jobID,libDS='',cloud=None,excludedSite=None,forceOpt=None): - # check SSL - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "ERROR: SSL connection is required" - # get DN - dn = _getDN(req) - if dn == '': - return "ERROR: could not get DN" - # convert jobID to long - try: - jobID = long(jobID) - except: - return "ERROR: jobID is not an integer" - # force option - if forceOpt == 'True': - forceOpt = True - else: - forceOpt = False - return userIF.runReBrokerage(dn,jobID,cloud,excludedSite,forceOpt) - - -# retry failed subjobs in running job -def 
retryFailedJobsInActive(req,jobID): - # check SSL - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "ERROR: SSL connection is required" - # get DN - dn = _getDN(req) - if dn == '': - return "ERROR: could not get DN" - # convert jobID to long - try: - jobID = long(jobID) - except: - return "ERROR: jobID is not an integer" - return userIF.retryFailedJobsInActive(dn,jobID) - - -# logger interface -def sendLogInfo(req,msgType,msgList): - # check SSL - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "ERROR: SSL connection is required" - # get DN - dn = _getDN(req) - if dn == '': - return "ERROR: could not get DN" - return userIF.sendLogInfo(dn,msgType,msgList) - - -# get serial number for group job -def getSerialNumberForGroupJob(req): - # check SSL - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "ERROR: SSL connection is required" - # get DN - dn = _getDN(req) - if dn == '': - return "ERROR: could not get DN" - return userIF.getSerialNumberForGroupJob(dn) - - -# get script for offline running -def getScriptOfflineRunning(req,pandaID): - return userIF.getScriptOfflineRunning(pandaID) - - -# register proxy key -def registerProxyKey(req,credname,origin,myproxy): - # check security - if not isSecure(req): - return False - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return False - # get expiration date - if not req.subprocess_env.has_key('SSL_CLIENT_V_END'): - return False - params = {} - params['dn'] = _getDN(req) - # set parameters - params['credname'] = credname - params['origin'] = origin - params['myproxy'] = myproxy - # convert SSL_CLIENT_V_END - try: - expTime = req.subprocess_env['SSL_CLIENT_V_END'] - # remove redundant white spaces - expTime = re.sub('\s+',' ',expTime) - # convert to timestamp - expTime = time.strptime(expTime,'%b %d %H:%M:%S %Y %Z') - params['expires'] = time.strftime('%Y-%m-%d %H:%M:%S',expTime) - except: - _logger.error("registerProxyKey : failed to convert %s" % \ - 
req.subprocess_env['SSL_CLIENT_V_END']) - # execute - return userIF.registerProxyKey(params) - - -# register proxy key -def getProxyKey(req): - # check security - if not isSecure(req): - return False - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return False - dn = _getDN(req) - # execute - return userIF.getProxyKey(dn) - - -# get JobIDs in a time range -def getJobIDsInTimeRange(req,timeRange,dn=None): - # check security - if not isSecure(req): - return False - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return False - if dn == None: - dn = _getDN(req) - _logger.debug("getJobIDsInTimeRange %s %s" % (dn,timeRange)) - # execute - return userIF.getJobIDsInTimeRange(dn,timeRange) - - -# get PandaIDs for a JobID -def getPandIDsWithJobID(req,jobID,nJobs,dn=None): - # check security - if not isSecure(req): - return False - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return False - if dn == None: - dn = _getDN(req) - _logger.debug("getPandIDsWithJobID %s JobID=%s nJobs=%s" % (dn,jobID,nJobs)) - # execute - return userIF.getPandIDsWithJobID(dn,jobID,nJobs) - - -# check merge job generation status -def checkMergeGenerationStatus(req,jobID,dn=None): - # check security - if not isSecure(req): - return False - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return False - if dn == None: - dn = _getDN(req) - _logger.debug("checkMergeGenerationStatus %s JobID=%s" % (dn,jobID)) - # execute - return userIF.checkMergeGenerationStatus(dn,jobID) - - -# get slimmed file info with PandaIDs -def getSlimmedFileInfoPandaIDs(req,ids): - # check security - if not isSecure(req): - return False - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return False - dn = _getDN(req) - return userIF.getSlimmedFileInfoPandaIDs(ids,dn) - - -# get full job status -def getFullJobStatus(req,ids): - # check security - if not isSecure(req): - return False - # get DN - if not 
req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return False - dn = _getDN(req) - return userIF.getFullJobStatus(ids,dn) - - -# get number of analysis jobs per user -def getNUserJobs(req,siteName,nJobs=100): - # check security - prodManager = False - if not isSecure(req): - return "Failed : HTTPS connection is required" - # get FQANs - fqans = _getFQAN(req) - # loop over all FQANs - for fqan in fqans: - # check production role - for rolePat in ['/atlas/usatlas/Role=production', - '/atlas/Role=production', - '/atlas/usatlas/Role=pilot', - '/atlas/Role=pilot', - ]: - if fqan.startswith(rolePat): - prodManager = True - break - # escape - if prodManager: - break - # only prod managers can use this method - if not prodManager: - return "Failed : VOMS authorization failure" - # convert nJobs to int - try: - nJobs = int(nJobs) - except: - nJobs = 100 - # execute - return userIF.getNUserJobs(siteName,nJobs) - - -# add account to siteaccess -def addSiteAccess(req,siteID): - # check security - if not isSecure(req): - return "False" - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "False" - dn = req.subprocess_env['SSL_CLIENT_S_DN'] - return userIF.addSiteAccess(siteID,dn) - - -# list site access -def listSiteAccess(req,siteID=None,longFormat=False): - # check security - if not isSecure(req): - return "False" - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "False" - # set DN if siteID is none - dn = None - if siteID==None: - dn = req.subprocess_env['SSL_CLIENT_S_DN'] - # convert longFormat option - if longFormat == 'True': - longFormat = True - else: - longFormat = False - return userIF.listSiteAccess(siteID,dn,longFormat) - - -# update site access -def updateSiteAccess(req,method,siteid,userName,attrValue=''): - # check security - if not isSecure(req): - return "non HTTPS" - # get DN - if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): - return "invalid DN" - # set requester's DN - requesterDN = 
req.subprocess_env['SSL_CLIENT_S_DN'] - # update - return userIF.updateSiteAccess(method,siteid,requesterDN,userName,attrValue) diff --git a/current/pandaserver/userinterface/__init__.py b/current/pandaserver/userinterface/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/current/pandaserver/userinterface/runReBroker.py b/current/pandaserver/userinterface/runReBroker.py deleted file mode 100755 index e20e6d595..000000000 --- a/current/pandaserver/userinterface/runReBroker.py +++ /dev/null @@ -1,70 +0,0 @@ -# exec -def run(dn,jobID,cloud=None,excludedSite=None): - # check parameters - if dn == '': - return False - if jobID < 0: - return False - # password - from config import panda_config - passwd = panda_config.dbpasswd - # initialize cx_Oracle using dummy connection - from taskbuffer.Initializer import initializer - initializer.init() - # instantiate TB - from taskbuffer.TaskBuffer import taskBuffer - taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) - # run ReBroker - from userinterface.ReBroker import ReBroker - reThr = ReBroker(taskBuffer,cloud,excludedSite,userRequest=True) - # lock - stLock,retLock = reThr.lockJob(dn,jobID) - # failed - if not stLock: - return False - # start - reThr.start() - reThr.join() - return True - - -#################################################################### -# main -def main(): - import sys - import getopt - # option class - class _options: - def __init__(self): - pass - options = _options() - del _options - # set default values - options.jobID = -1 - options.dn = '' - options.cloud = None - options.excludedSite = None - # get command-line parameters - try: - opts, args = getopt.getopt(sys.argv[1:],"j:d:c:e:") - # set options - for o, a in opts: - if o in ("-j",): - options.jobID = long(a) - if o in ("-d",): - options.dn = a - if o in ("-c",): - options.cloud = a - if o in ("-e",): - options.excludedSite = a.split(',') - except: - print("ERROR : Invalid options") - 
sys.exit(1) - # run - run(options.dn,options.jobID,options.cloud,options.excludedSite) - # return - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/current/setup.cfg b/current/setup.cfg deleted file mode 100644 index 74c606520..000000000 --- a/current/setup.cfg +++ /dev/null @@ -1,7 +0,0 @@ -[global] - -[bdist_rpm] -provides = panda-server -release = 1 -packager = Panda Team -requires = python, panda-common diff --git a/current/setup.py b/current/setup.py deleted file mode 100755 index c88adbd41..000000000 --- a/current/setup.py +++ /dev/null @@ -1,203 +0,0 @@ -#!/usr/bin/env python -# -# Setup prog for Panda Server -# -# -release_version='0.0.5' - -import re -import sys -import commands -from distutils.core import setup -from distutils.command.install import install as install_org -from distutils.command.install_data import install_data as install_data_org - -# get panda specific params -optPanda = {} -newArgv = [] -idx = 0 -while idx < len(sys.argv): - tmpArg = sys.argv[idx] - if tmpArg.startswith('--panda_'): - # panda params - idx += 1 - if len(tmpArg.split('=')) == 2: - # split to par and val if = is contained - tmpVal = tmpArg.split('=')[-1] - tmpArg = tmpArg.split('=')[0] - elif len(tmpArg.split('=')) == 1: - tmpVal = sys.argv[idx] - idx += 1 - else: - raise RuntimeError,"invalid panda option : %s" % tmpArg - # get key - tmpKey = re.sub('--panda_','',tmpArg) - # set params - optPanda[tmpKey] = tmpVal - else: - # normal opts - idx += 1 - newArgv.append(tmpArg) -# set new argv -sys.argv = newArgv - - -# set overall prefix for bdist_rpm -class install_panda(install_org): - def initialize_options (self): - install_org.initialize_options(self) - self.prefix = '/data/atlpan/srv' - - -# generates files using templates and install them -class install_data_panda (install_data_org): - - def initialize_options (self): - install_data_org.initialize_options (self) - self.install_purelib = None - - def finalize_options (self): - # set install_purelib - 
self.set_undefined_options('install', - ('install_purelib','install_purelib')) - # set reaming params - install_data_org.finalize_options(self) - # set hostname - if optPanda.has_key('hostname') and optPanda['hostname'] != '': - self.hostname = optPanda['hostname'] - else: - self.hostname = commands.getoutput('hostname -f') - # set user and group - if optPanda.has_key('username') and optPanda['username'] != '': - self.username = optPanda['username'] - else: - self.username = commands.getoutput('id -un') - if optPanda.has_key('usergroup') and optPanda['usergroup'] != '': - self.usergroup = optPanda['usergroup'] - else: - self.usergroup = commands.getoutput('id -gn') - - - def run (self): - # remove /usr for bdist/bdist_rpm - match = re.search('(build/[^/]+/dumb)/usr',self.install_dir) - if match != None: - self.install_dir = re.sub(match.group(0),match.group(1),self.install_dir) - # remove /var/tmp/*-buildroot for bdist_rpm - match = re.search('(/var/tmp/.*-buildroot)/usr',self.install_dir) - if match != None: - self.install_dir = re.sub(match.group(0),match.group(1),self.install_dir) - # create tmp area - tmpDir = 'build/tmp' - self.mkpath(tmpDir) - new_data_files = [] - for destDir,dataFiles in self.data_files: - newFilesList = [] - for srcFile in dataFiles: - # check extension - if not srcFile.endswith('.template'): - raise RuntimeError,"%s doesn't have the .template extension" % srcFile - # dest filename - destFile = re.sub('(\.exe)*\.template$','',srcFile) - destFile = destFile.split('/')[-1] - destFile = '%s/%s' % (tmpDir,destFile) - # open src - inFile = open(srcFile) - # read - filedata=inFile.read() - # close - inFile.close() - # replace patterns - for item in re.findall('@@([^@]+)@@',filedata): - if not hasattr(self,item): - raise RuntimeError,'unknown pattern %s in %s' % (item,srcFile) - # get pattern - patt = getattr(self,item) - # remove build/*/dump for bdist - patt = re.sub('build/[^/]+/dumb','',patt) - # remove /var/tmp/*-buildroot for bdist_rpm - 
patt = re.sub('/var/tmp/.*-buildroot','',patt) - # replace - filedata = filedata.replace('@@%s@@' % item, patt) - # write to dest - oFile = open(destFile,'w') - oFile.write(filedata) - oFile.close() - # chmod for exe - if srcFile.endswith('.exe.template'): - commands.getoutput('chmod +x %s' % destFile) - # append - newFilesList.append(destFile) - # replace dataFiles to install generated file - new_data_files.append((destDir,newFilesList)) - # install - self.data_files = new_data_files - install_data_org.run(self) - - -# setup for distutils -setup( - name="panda-server", - version=release_version, - description=' PanDA Server Package', - long_description='''This package contains PanDA Server Components''', - license='GPL', - author='Panda Team', - author_email='hn-atlas-panda-pathena@cern.ch', - url='https://twiki.cern.ch/twiki/bin/view/Atlas/PanDA', - packages=[ 'pandaserver', - 'pandaserver.brokerage', - 'pandaserver.config', - 'pandaserver.dataservice', - 'pandaserver.jobdispatcher', - 'pandaserver.server', - 'pandaserver.taskbuffer', - 'pandaserver.test', - 'pandaserver.userinterface', - ], - data_files=[ - # config files - ('etc/panda', ['templates/panda_server-httpd.conf.rpmnew.template', - 'templates/panda_server-httpd-FastCGI.conf.rpmnew.template', - 'templates/panda_server.cfg.rpmnew.template', - 'templates/panda_server-grid-env.sh.template', - ] - ), - # sysconfig - ('etc/sysconfig', ['templates/panda_server-sysconfig.rpmnew.template', - ] - ), - # logrotate - ('etc/logrotate.d', ['templates/panda_server-logrotate.template', - ] - ), - # init script - ('etc/init.d', ['templates/panda_server-ctl.exe.template', - ] - ), - # crons - ('usr/bin', ['templates/panda_server-add.sh.exe.template', - 'templates/panda_server-priority.sh.exe.template', - 'templates/panda_server-copyArchive.sh.exe.template', - 'templates/panda_server-copyROOT.sh.exe.template', - 'templates/panda_server-vomsrenew.sh.exe.template', - 'templates/panda_server-archivelog.sh.exe.template', - 
'templates/panda_server-tmpwatch.sh.exe.template', - 'templates/panda_server-backupJobArch.sh.exe.template', - 'templates/panda_server-deleteJobs.sh.exe.template', - 'templates/panda_server-merge.sh.exe.template', - 'templates/panda_server-datasetManager.sh.exe.template', - 'templates/panda_server-evpPD2P.sh.exe.template', - 'templates/panda_server-callback.sh.exe.template', - 'templates/panda_server-makeSlsXml.exe.template', - 'templates/panda_server-boostUser.sh.exe.template', - 'templates/panda_server-runRebro.sh.exe.template', - ] - ), - # var dirs - #('var/log/panda', []), - #('var/cache/pandaserver', []), - ], - cmdclass={'install': install_panda, - 'install_data': install_data_panda} -) diff --git a/current/templates/panda_server-add.sh.exe.template b/current/templates/panda_server-add.sh.exe.template deleted file mode 100755 index cce611988..000000000 --- a/current/templates/panda_server-add.sh.exe.template +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -# setup grid stuff -source /opt/glite/etc/profile.d/grid-env.sh - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -# set PYTHONPATH for LFC.py -export PYTHONPATH=/opt/lcg/lib64/python2.5/site-packages:$PYTHONPATH - -python2.5 @@install_purelib@@/pandaserver/test/add.py diff --git a/current/templates/panda_server-archivelog.sh.exe.template b/current/templates/panda_server-archivelog.sh.exe.template deleted file mode 100755 index 8a0a2c5ab..000000000 --- a/current/templates/panda_server-archivelog.sh.exe.template +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python @@install_purelib@@/pandaserver/test/archivelogs.py diff --git a/current/templates/panda_server-backupJobArch.sh.exe.template b/current/templates/panda_server-backupJobArch.sh.exe.template deleted file mode 100644 index bc896d843..000000000 --- a/current/templates/panda_server-backupJobArch.sh.exe.template 
+++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python @@install_purelib@@/pandaserver/test/backupJobArch.py diff --git a/current/templates/panda_server-boostUser.sh.exe.template b/current/templates/panda_server-boostUser.sh.exe.template deleted file mode 100755 index f1541998e..000000000 --- a/current/templates/panda_server-boostUser.sh.exe.template +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -echo $1 | python2.5 @@install_purelib@@/pandaserver/test/boostUser.py diff --git a/current/templates/panda_server-callback.sh.exe.template b/current/templates/panda_server-callback.sh.exe.template deleted file mode 100755 index da833c70c..000000000 --- a/current/templates/panda_server-callback.sh.exe.template +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# setup grid stuff -source /opt/glite/etc/profile.d/grid-env.sh - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python2.5 @@install_purelib@@/pandaserver/test/fileCallbackListener.py diff --git a/current/templates/panda_server-copyArchive.sh.exe.template b/current/templates/panda_server-copyArchive.sh.exe.template deleted file mode 100755 index 8005b4d3e..000000000 --- a/current/templates/panda_server-copyArchive.sh.exe.template +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# setup grid stuff -source /opt/glite/etc/profile.d/grid-env.sh - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python2.5 @@install_purelib@@/pandaserver/test/copyArchive.py diff --git a/current/templates/panda_server-copyROOT.sh.exe.template b/current/templates/panda_server-copyROOT.sh.exe.template deleted file mode 100755 index efbd483be..000000000 --- a/current/templates/panda_server-copyROOT.sh.exe.template +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -# 
import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python @@install_purelib@@/pandaserver/test/copyROOT.py diff --git a/current/templates/panda_server-ctl.exe.template b/current/templates/panda_server-ctl.exe.template deleted file mode 100755 index 70a849b9c..000000000 --- a/current/templates/panda_server-ctl.exe.template +++ /dev/null @@ -1,139 +0,0 @@ -#!/bin/sh -# -# Copyright 2000-2004 The Apache Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# -# Apache control script designed to allow an easy command line interface -# to controlling Apache. Written by Marc Slemko, 1997/08/23 -# -# The exit codes returned are: -# XXX this doc is no longer correct now that the interesting -# XXX functions are handled by httpd -# 0 - operation completed successfully -# 1 - -# 2 - usage error -# 3 - httpd could not be started -# 4 - httpd could not be stopped -# 5 - httpd could not be started during a restart -# 6 - httpd could not be restarted during a restart -# 7 - httpd could not be restarted during a graceful restart -# 8 - configuration syntax error -# -# When multiple arguments are given, only the error from the _last_ -# one is reported. 
Run "apachectl help" for usage info -# -ARGV="$@" -# -# |||||||||||||||||||| START CONFIGURATION SECTION |||||||||||||||||||| -# -------------------- -------------------- -# -# the path to your httpd binary, including options if necessary -HTTPD='/usr/sbin/httpd.worker' - -# -# a command that outputs a formatted text version of the HTML at the -# url given on the command line. Designed for lynx, however other -# programs may work. -if [ -x /usr/bin/links ]; then - LYNX="links -dump" -elif [ -x /usr/bin/lynx ]; then - LYNX="lynx -dump" -else - LYNX="none" -fi - -# -# the URL to your server's mod_status status page. If you do not -# have one, then status and fullstatus will not work. -STATUSURL="http://localhost:80/server-status" - -# Source /etc/sysconfig/httpd for $HTTPD setting, etc. -if [ -r @@install_dir@@/etc/sysconfig/panda_server-sysconfig ]; then - . @@install_dir@@/etc/sysconfig/panda_server-sysconfig -fi - -ERROR=0 -if [ "x$ARGV" = "x" ] ; then - ARGV="-h" -fi - -function check13() { -# check for 1.3 configuration -GONE="(ServerType|BindAddress|Port|AddModule|ClearModuleList|" -GONE="${GONE}AgentLog|RefererLog|RefererIgnore|FancyIndexing|" -GONE="${GONE}AccessConfig|ResourceConfig)" -if grep -Eiq "^[[:space:]]*($GONE)" /etc/httpd/conf/httpd.conf; then - echo "$0: Apache 1.3 configuration directives found" - echo "$0: please read /usr/share/doc/httpd-2.0.52/migration.html" - exit 2 -fi -} - -function checklynx() { -if [ "$LYNX" = "none" ]; then - echo "The 'links' package is required for this functionality." - exit 8 -fi -} - -function testconfig() { -# httpd is denied terminal access in SELinux, so run in the -# current context to get stdout from $HTTPD -t. -if test -x /usr/sbin/selinuxenabled && /usr/sbin/selinuxenabled; then - runcon -- `id -Z` $HTTPD $OPTIONS -t -else - $HTTPD $OPTIONS -t -fi -ERROR=$? -} - -case $ARGV in -restart|graceful) - if $HTTPD -t >&/dev/null; then - $HTTPD $OPTIONS -k $ARGV - ERROR=$? 
- else - echo "apachectl: Configuration syntax error, will not run \"$ARGV\":" - testconfig - fi - ;; -start|stop) - check13 - $HTTPD $OPTIONS -k $ARGV - ERROR=$? - ;; -startssl|sslstart|start-SSL) - check13 - $HTTPD $OPTIONS -DSSL -k start - ERROR=$? - ;; -configtest) - testconfig - ;; -status) - checklynx - $LYNX $STATUSURL | awk ' /process$/ { print; exit } { print } ' - ;; -fullstatus) - checklynx - $LYNX $STATUSURL - ;; -*) - $HTTPD $OPTIONS $ARGV - ERROR=$? -esac - -exit $ERROR - diff --git a/current/templates/panda_server-datasetManager.sh.exe.template b/current/templates/panda_server-datasetManager.sh.exe.template deleted file mode 100644 index 32abd2976..000000000 --- a/current/templates/panda_server-datasetManager.sh.exe.template +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# setup grid stuff -source /opt/glite/etc/profile.d/grid-env.sh - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python2.5 @@install_purelib@@/pandaserver/test/datasetManager.py diff --git a/current/templates/panda_server-deleteJobs.sh.exe.template b/current/templates/panda_server-deleteJobs.sh.exe.template deleted file mode 100644 index fd48e9e7e..000000000 --- a/current/templates/panda_server-deleteJobs.sh.exe.template +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python @@install_purelib@@/pandaserver/test/deleteJobs.py diff --git a/current/templates/panda_server-evpPD2P.sh.exe.template b/current/templates/panda_server-evpPD2P.sh.exe.template deleted file mode 100755 index 8786da667..000000000 --- a/current/templates/panda_server-evpPD2P.sh.exe.template +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# setup grid stuff -source /opt/glite/etc/profile.d/grid-env.sh - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python2.5 @@install_purelib@@/pandaserver/test/evpPD2P.py diff --git 
a/current/templates/panda_server-grid-env.sh.template b/current/templates/panda_server-grid-env.sh.template deleted file mode 100644 index c1e0d3321..000000000 --- a/current/templates/panda_server-grid-env.sh.template +++ /dev/null @@ -1,3 +0,0 @@ -export LD_LIBRARY_PATH=/opt/glite/lib64:/opt/globus/lib:/opt/lcg/lib64:$LD_LIBRARY_PATH -export PYTHONPATH=/opt/glite/lib64/python:/opt/lcg/lib64/python:$PYTHONPATH -export PATH=/opt/edg/bin:/opt/glite/bin:/opt/globus/bin:/opt/lcg/bin:$PATH diff --git a/current/templates/panda_server-httpd-FastCGI.conf.rpmnew.template b/current/templates/panda_server-httpd-FastCGI.conf.rpmnew.template deleted file mode 100644 index 0148c1eb0..000000000 --- a/current/templates/panda_server-httpd-FastCGI.conf.rpmnew.template +++ /dev/null @@ -1,177 +0,0 @@ -LoadModule access_module modules/mod_access.so -LoadModule alias_module modules/mod_alias.so -LoadModule rewrite_module modules/mod_rewrite.so -LoadModule mime_magic_module modules/mod_mime_magic.so -LoadModule mime_module modules/mod_mime.so -LoadModule include_module modules/mod_include.so -LoadModule log_config_module modules/mod_log_config.so -LoadModule env_module modules/mod_env.so -LoadModule deflate_module modules/mod_deflate.so -LoadModule setenvif_module modules/mod_setenvif.so -LoadModule dir_module modules/mod_dir.so -LoadModule ssl_module modules/mod_ssl.so -LoadModule headers_module modules/mod_headers.so -LoadModule gridsite_module modules/mod_gridsite.so - -# FastCGI/WSGI -#LoadModule fastcgi_module modules/mod_fastcgi.so -LoadModule wsgi_module modules/mod_wsgi.so - - -User atlpan -Group zp - - -StartServers 25 -MinSpareServers 25 -ServerLimit 512 -MaxSpareServers 512 -MaxClients 512 -MaxRequestsPerChild 2000 - - -ServerName pandaserver.cern.ch - -DocumentRoot "@@install_purelib@@/pandaserver" - - - Order allow,deny - Deny from all - - -RedirectMatch 403 "/panda.py$" - - - Options FollowSymLinks - AllowOverride None - Order allow,deny - Allow from all - Deny from 
192.203.218.14 - - -Alias /trf/ "@@install_dir@@/var/trf/" -Alias /cache/ "@@install_dir@@/var/cache/pandaserver/" -Alias /appdir/ "@@install_dir@@/var/appdir/" - - - Options FollowSymLinks - AllowOverride None - Order allow,deny - Allow from all - Deny from 192.203.218.14 - - - - FastCgiIpcDir @@install_dir@@/var/log/panda/fastsocks - FastCgiServer @@install_purelib@@/pandaserver/server/panda.py \ - -processes 25 -idle-timeout 300 -listen-queue-depth 1 -flush \ - -initial-env PYTHONPATH \ - -initial-env TZ \ - -initial-env HOME \ - -initial-env PANDA_HOME \ - -initial-env X509_CERT_DIR \ - -initial-env X509_USER_PROXY \ - -initial-env PANDA_URL \ - -initial-env PANDA_URL_SSL - ScriptAliasMatch ^/server/panda/(.+)$ @@install_purelib@@/pandaserver/server/panda.py - - - - WSGIDaemonProcess pandasrv_daemon processes=25 threads=2 home=/home/atlpan - WSGIProcessGroup pandasrv_daemon - WSGIApplicationGroup %{GLOBAL} - WSGIScriptAliasMatch ^/server/panda/(.+)$ @@install_purelib@@/pandaserver/server/panda.py - WSGISocketPrefix @@install_dir@@/var/log/panda/wsgisocks/wsgi - - - -Listen 25080 - - -RewriteEngine on -RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK) -RewriteRule .* - [F] -# use Cassandra for cache -RewriteRule ^/cscache/(.*)$ /server/panda/getFile?fileName=$1 [PT,L] - - - - - Order allow,deny - Allow from all - Deny from 192.203.218.14 - - - # allow .py - - Order allow,deny - Allow from all - - - # enable CGI for FastCGI/WSGI - Options FollowSymLinks +ExecCGI - - # mod_gridsite - GridSiteIndexes on - GridSiteAuth on - GridSiteDNlists /etc/grid-security/dn-lists/ - GridSiteEnvs on - - - - - -Listen 25443 - - -RewriteEngine on -RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK) -RewriteRule .* - [F] -# use Cassandra for cache -RewriteRule ^/cscache/(.*)$ /server/panda/getFile?fileName=$1 [PT,L] - -# CERN security recommendation to only allow the seven strongest ssl ciphers -SSLProtocol -all +TLSv1 +SSLv3 -SSLCipherSuite HIGH:MEDIUM:+SSLv3 - -SSLEngine on 
-SSLCertificateFile /etc/grid-security/hostcert.pem -SSLCertificateKeyFile /etc/grid-security/hostkey.pem -SSLCACertificatePath /etc/grid-security/certificates -SSLVerifyClient optional -SSLVerifyDepth 10 -SSLOptions +ExportCertData +StdEnvVars - - - - # allow .py - - Order allow,deny - Allow from all - - - # enable CGI for FastCGI/WSGI - Options FollowSymLinks +ExecCGI - - # mod_gridsite - GridSiteIndexes on - GridSiteAuth on - GridSiteDNlists /etc/grid-security/dn-lists/ - GridSiteGSIProxyLimit 1 - GridSiteEnvs on - - - - -LogLevel info - -LogFormat "%t %h \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined -LogFormat "%t %h \"%r\" %>s %b" common -LogFormat "%{Referer}i -> %U" referer -LogFormat "%{User-agent}i" agent -CustomLog @@install_dir@@/var/log/panda/panda_server_access_log common -ErrorLog @@install_dir@@/var/log/panda/panda_server_error_log - -PidFile @@install_dir@@/var/log/panda/panda_server_httpd.pid - -TypesConfig /etc/mime.types diff --git a/current/templates/panda_server-httpd.conf.rpmnew.template b/current/templates/panda_server-httpd.conf.rpmnew.template deleted file mode 100644 index 6057f0cc4..000000000 --- a/current/templates/panda_server-httpd.conf.rpmnew.template +++ /dev/null @@ -1,141 +0,0 @@ -LoadModule access_module modules/mod_access.so -LoadModule alias_module modules/mod_alias.so -LoadModule rewrite_module modules/mod_rewrite.so -LoadModule mime_magic_module modules/mod_mime_magic.so -LoadModule mime_module modules/mod_mime.so -LoadModule include_module modules/mod_include.so -LoadModule log_config_module modules/mod_log_config.so -LoadModule env_module modules/mod_env.so -LoadModule deflate_module modules/mod_deflate.so -LoadModule setenvif_module modules/mod_setenvif.so -LoadModule dir_module modules/mod_dir.so -LoadModule ssl_module modules/mod_ssl.so -LoadModule python_module modules/mod_python.so -LoadModule gridsite_module modules/mod_gridsite.so - -User atlpan -Group zp - - -StartServers 50 -MinSpareServers 50 
-MaxSpareServers 50 -MaxClients 50 -MaxRequestsPerChild 0 - - - -ServerLimit 10 -StartServers 10 -MaxClients 50 -MinSpareThreads 50 -MaxSpareThreads 50 -ThreadsPerChild 5 -MaxRequestsPerChild 0 - - -ServerName pandaserver.cern.ch - -DocumentRoot "@@install_purelib@@/pandaserver" - - - Order allow,deny - Deny from all - - - - Options FollowSymLinks - AllowOverride None - Order allow,deny - Allow from all - Deny from 192.203.218.14 - - -Alias /cache/ "@@install_dir@@/var/cache/pandaserver/" - - - Options FollowSymLinks - AllowOverride None - Order allow,deny - Allow from all - Deny from 192.203.218.14 - - -Listen 25080 - - -RewriteEngine on -RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK) -RewriteRule .* - [F] - - - - - Order allow,deny - Allow from all - Deny from 192.203.218.14 - - - # mod_python - SetHandler python-program - PythonHandler mod_python.publisher - PythonDebug On - - # mod_gridsite - GridSiteIndexes on - GridSiteAuth on - GridSiteDNlists /etc/grid-security/dn-lists/ - GridSiteEnvs on - - - - - -Listen 25443 - - -RewriteEngine on -RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK) -RewriteRule .* - [F] - -# CERN security recommendation to only allow the seven strongest ssl ciphers -SSLProtocol -all +TLSv1 +SSLv3 -SSLCipherSuite HIGH:MEDIUM:+SSLv3 - -SSLEngine on -SSLCertificateFile /etc/grid-security/hostcert.pem -SSLCertificateKeyFile /etc/grid-security/hostkey.pem -SSLCACertificatePath /etc/grid-security/certificates -SSLVerifyClient optional -SSLVerifyDepth 10 -SSLOptions +ExportCertData +StdEnvVars - - - - # mod_python - SetHandler python-program - PythonHandler mod_python.publisher - PythonDebug On - - # mod_gridsite - GridSiteIndexes on - GridSiteAuth on - GridSiteDNlists /etc/grid-security/dn-lists/ - GridSiteGSIProxyLimit 1 - GridSiteEnvs on - - - - -LogLevel info - -LogFormat "%t %h \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined -LogFormat "%t %h \"%r\" %>s %b" common -LogFormat "%{Referer}i -> %U" referer -LogFormat "%{User-agent}i" agent 
-CustomLog @@install_dir@@/var/log/panda/panda_server_access_log common -ErrorLog @@install_dir@@/var/log/panda/panda_server_error_log - -PidFile @@install_dir@@/var/log/panda/panda_server_httpd.pid - -TypesConfig /etc/mime.types diff --git a/current/templates/panda_server-logrotate.template b/current/templates/panda_server-logrotate.template deleted file mode 100644 index a474741f6..000000000 --- a/current/templates/panda_server-logrotate.template +++ /dev/null @@ -1,14 +0,0 @@ -@@install_dir@@/var/log/panda/*log { - rotate 180 - daily - compress - missingok - notifempty - sharedscripts - daily - postrotate - killall -u atlpan python || true - killall -u atlpan python2.5 || true - /sbin/service httpd-pandasrv restart > /dev/null 2>/dev/null || true - endscript -} diff --git a/current/templates/panda_server-makeSlsXml.exe.template b/current/templates/panda_server-makeSlsXml.exe.template deleted file mode 100755 index 23e23b3d3..000000000 --- a/current/templates/panda_server-makeSlsXml.exe.template +++ /dev/null @@ -1,334 +0,0 @@ -#!/usr/bin/python2.5 - -import SLSxml -import socket -import subprocess -import re -import sys -import optparse - -########################################### -## define options -########################################### -parser = optparse.OptionParser() -parser.add_option( "-u", "--use", dest="use", type="string", - help="Use of xml, allowed values: 'mon', 'server' or 'bamboo'" ) -parser.add_option( "--host", dest="host", type="string", - help="Hostname of server to check, default is current machine hostname" ) -parser.add_option( "-d", "--dir", dest="dir", type="string", - help="Filename of the xml file output. Default is " + - "/data/atlpan/oracle/panda/monitoring" ) -parser.add_option( "--debug", action="store_true", dest="debug", - default=False, help="Print out debug statements." 
) - -( options, args ) = parser.parse_args() - -def __main__() : - - if( options.host ) : - host = options.host - else : - host = socket.gethostname() - host = re.sub( r'^(\w+).*', r'\1', host ) - - if( options.use == 'mon' ) : - tmp_xml = make_monitor( host ) - file_part = 'PandaMon' - elif( options.use == 'server' ) : - tmp_xml = make_server( host ) - file_part = 'PandaServer' - elif( options.use == 'bamboo' ) : - tmp_xml = make_bamboo( host ) - file_part = 'PandaBamboo' - else : - print "Err: please choose a use, 'mon', 'server' or 'bamboo'." - return - - if( options.dir ) : - file_dir = options.dir - else : - file_dir = '/data/atlpan/oracle/panda/monitoring' - - file_name = '%s/%s_%s.xml' % ( file_dir, file_part, host ) - tmp_file = open( file_name, 'w' ) - tmp_file.write( tmp_xml ) - tmp_file.close - -def make_server( host ) : - - if( options.debug ) : print "Creating the server monitoring xml" - - server_avail = server_availability( host ) - add_processes = count_add_processes() - num_holdings = count_holdings() - data_used = volume_use( 'data' ) - var_used = volume_use( 'var' ) - ave_regtime = registration_time() - ave_regtimeDQ2 = registration_time(onlyDQ2=True) - - sls_xml = SLSxml.xml_doc() - sls_xml.set_id( 'PandaServer_%s' % ( host ) ) - sls_xml.set_shortname( 'PandaServer monitoring service at %s' % ( host ) ) - sls_xml.set_fullname( 'PandaServer monitoring service at %s' % ( host ) ) - sls_xml.set_availability( str( server_avail ) ) - - sls_xml.add_data( "AddProcesses", "Number of processes for DQ2+LFC registration", - str( add_processes ) ) - sls_xml.add_data( "HoldingJobs", "Number of holding jobs to be registered", - str( num_holdings ) ) - sls_xml.add_data( "RegistrationTime", "Average time for DQ2+LFC registration in second", - str( ave_regtime ) ) - sls_xml.add_data( "RegistrationTimeDQ2", "Average time for DQ2 registration in second", - str( ave_regtimeDQ2 ) ) - sls_xml.add_data( "DataVolumeUse", "Percent use of the local /data volume", - str( 
data_used ) ) - sls_xml.add_data( "VarVolumeUse", "Percent use of the local /var volume", - str( var_used ) ) - - return sls_xml.print_xml() - -def make_bamboo( host ) : - - if( options.debug ) : print "Creating the server monitoring xml" - - server_avail = bamboo_availability( host ) - - sls_xml = SLSxml.xml_doc() - sls_xml.set_id( 'PandaBamboo_%s' % ( host ) ) - sls_xml.set_shortname( 'PandaBamboo monitoring service at %s' % ( host ) ) - sls_xml.set_fullname( 'PandaBamboo monitoring service at %s' % ( host ) ) - sls_xml.set_availability( str( server_avail ) ) - return sls_xml.print_xml() - -def make_monitor( host ) : - - if( options.debug ) : print "Creating the monitor monitoring xml" - - errormes = False - messagetext = '' - - http_avail = httpd_availability( host ) - if( http_avail == 0 ) : - errormes = True - messagetext += "Error: web server on %s not working\n" % ( host ) - - squid_avail = squid_availability() - if( squid_avail == 0 ) : - errormes = True - messagetext += "Error: squid server on %s not working\n" % ( host ) - - panda_avail = panda_availability( host ) - if( panda_avail == 0 ) : - errormes = True - messagetext += "Error: panda monitor on %s not working\n" % ( host ) - - http_processes = count_processes() - - data_used = volume_use( 'data' ) - var_used = volume_use( 'var' ) - - if( errormes ) : - error_mail( host, messagetext ) - - if( options.debug ) : - print 'web - %s, squid - %s, panda - %s' % ( http_avail, squid_avail, - panda_avail ) - - sls_xml = SLSxml.xml_doc() - sls_xml.set_id( 'PandaMon_%s' % ( host ) ) - sls_xml.set_shortname( 'PandaMonitor monitoring service at %s' % ( host ) ) - sls_xml.set_fullname( 'PandaMonitor monitoring service at %s' % ( host ) ) - sls_xml.set_availability( str( panda_avail ) ) - - #adding intervention by hand here - #sls_xml.add_intervention( "2011-01-16T20:00:00", "PT36H", - # "Panda services with be out for over a day due to database server changes." 
) - - sls_xml.add_data( "HttpdAvailability", "Availability of the httpd server", - str( http_avail ) ) - sls_xml.add_data( "SquidAvailability", "Availability of the squid server", - str( squid_avail ) ) - sls_xml.add_data( "PandaAvailability", "Availability of the panda monitor", - str( panda_avail ) ) - sls_xml.add_data( "HttpProcesses", "Number of processes for the panda monitor", - str( http_processes ) ) - sls_xml.add_data( "DataVolumeUse", "Percent use of the local /data volume", - str( data_used ) ) - sls_xml.add_data( "VarVolumeUse", "Percent use of the local /var volume", - str( var_used ) ) - return sls_xml.print_xml() - -def httpd_availability( host ) : - url = 'http://%s.cern.ch/robots.txt' % ( host ) - return check_url( url, "go away" ) - -def squid_availability() : - command = '/usr/bin/squidclient -p 25980 cache_object://localhost/info' - return check_command( command, 'OK' ) - -def panda_availability( host ) : - - port = '25980' - baseurl = 'http://' + host + ':' + port + '/server/pandamon/query?' - - reply = check_url( baseurl + 'isAlive', 'yes' ) - if( reply != '100' ) : return '0' - - return '100' - - #The above is a simpler test of the python code, for now, until the - #panda monitor migration is more stable, and all network tweaks are - #in quator, so things are stable on reboot/upgrade. Once that is - #true the below tests should be put back. 
- - reply = check_url( baseurl + 'dash=prod', 'CERN:OK' ) - if( reply != '100' ) : return '0' - - reply = check_url( baseurl + 'dash=clouds', 'Cloud status' ) - if( reply != '100' ) : return '0' - - reply = check_url( baseurl + 'overview=incidents', 'Recorded incidents' ) - if( reply != '100' ) : return '0' - - reply = check_url( baseurl + 'dash=ddm', 'Space available' ) - if( reply != '100' ) : return '0' - - return '100' - -def server_availability( host ) : - - tmp_url = '--no-check-certificate https://%s:25443/server/panda/isAlive' % ( host ) - reply = check_url( tmp_url, 'alive=yes' ) - if( reply != '100' ) : return '0' - - return '100' - -def bamboo_availability( host ) : - - tmp_url = 'http://%s:25070/bamboo/bamboo/isAlive' % ( host ) - reply = check_url( tmp_url, 'alive=yes' ) - if( reply != '100' ) : return '0' - - return '100' - -def check_url( url, check_string ) : - command = "wget -q -O - " + url - return check_command( command, check_string ) - -def check_command( command, check_string ) : - - if( options.debug ) : - print "Checking command : %s" % ( command ) - print "For string : %s" % ( check_string ) - - tmp_array = command.split() - output = subprocess.Popen( tmp_array, stdout=subprocess.PIPE ).communicate()[0] - - if( re.search( check_string, output ) ) : - if( options.debug ) : print "Found the string, return 100" - return '100' - else : - if( options.debug ) : print "String not found, return 0" - return '0' - -def count_processes() : - output = subprocess.Popen( ['ps', 'aux'], stdout=subprocess.PIPE ).communicate()[0] - count = 0 - for line in output.split( '\n' ) : - if( re.match( 'atlpan', line ) ) : - if( re.search( 'http', line ) ) : - count += 1 - return count - -def count_add_processes() : - output = subprocess.Popen( "pgrep -f add.py", - stdout=subprocess.PIPE,shell=True).communicate()[0] - count = 0 - for line in output.split( '\n' ) : - line = line.strip() - if line == '': - continue - count += 1 - return count - -def count_holdings() 
: - output = subprocess.Popen("ls /data/atlpan/srv/var/log/panda/ | egrep '(finished|failed)'", - stdout=subprocess.PIPE,shell=True).communicate()[0] - count =0 - for line in output.split( '\n' ) : - line = line.strip() - if line == '': - continue - count += 1 - return count - -def registration_time(timeSlice=False,onlyDQ2=False) : - aveRegTime = '0.0' - try: - if onlyDQ2: - com = "grep registraion /data/atlpan/srv/var/log/panda/panda-Adder.log | grep DQ2 | grep -v LFC" - else: - com = "grep 'LFC+DQ2' /data/atlpan/srv/var/log/panda/panda-Adder.log" - if not timeSlice: - com += ' | tail -1000' - output = subprocess.Popen(com,stdout=subprocess.PIPE,shell=True).communicate()[0] - regtimeMap = {} - for line in output.split('\n'): - try: - items = line.split() - timestamp = items[1][:2] - regtime = float(items[-2]) - if not regtimeMap.has_key(timestamp): - regtimeMap[timestamp] = {'totalTime':0.,'totalReg':0} - regtimeMap[timestamp]['totalTime'] += regtime - regtimeMap[timestamp]['totalReg'] += 1 - except: - pass - timestamps = regtimeMap.keys() - if timeSlice: - timestamps.sort() - for timestamp in timestamps: - print "%s %4.1fsec" % (timestamp,regtimeMap[timestamp]['totalTime']/float(regtimeMap[timestamp]['totalReg'])) - else: - totalTime = 0. 
- totalReg = 0 - for timestamp in timestamps: - totalTime += regtimeMap[timestamp]['totalTime'] - totalReg += regtimeMap[timestamp]['totalReg'] - if totalReg > 0: - aveRegTime = '%4.1f' % (totalTime/float(totalReg)) - except: - errtype,ervalue = sys.exc_info()[:2] - print "ERROR : %s:%s in registration_time" % (errtype,ervalue) - return aveRegTime - -def volume_use( volume_name ) : - command = "df -Pkh /" + volume_name - - tmp_array = command.split() - output = subprocess.Popen( tmp_array, stdout=subprocess.PIPE ).communicate()[0] - - for line in output.split( '\n' ) : - if( re.search( volume_name, line ) ) : - used_amount = re.search( r"(\d+)\%", line ).group(1) - - return used_amount - -def error_mail( host, message ) : - - mail_cmd = [] - mail_cmd.append( 'mail' ) - mail_cmd.append( '-s' ) - mail_cmd.append( 'Problems with %s' % ( host ) ) - mail_cmd.append( 'douglas@cern.ch' ) - - text = "Problems with %s :\n\n" % ( host ) - text += message - - p = subprocess.Popen( mail_cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE ) - p.stdin.write( text ) - p.stdin.close() - - -#run program -__main__() diff --git a/current/templates/panda_server-merge.sh.exe.template b/current/templates/panda_server-merge.sh.exe.template deleted file mode 100755 index 6acf67c5f..000000000 --- a/current/templates/panda_server-merge.sh.exe.template +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# setup grid stuff -source /opt/glite/etc/profile.d/grid-env.sh - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python2.5 @@install_purelib@@/pandaserver/test/runMerger.py diff --git a/current/templates/panda_server-priority.sh.exe.template b/current/templates/panda_server-priority.sh.exe.template deleted file mode 100755 index 70363d85b..000000000 --- a/current/templates/panda_server-priority.sh.exe.template +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - 
-python2.5 @@install_purelib@@/pandaserver/test/prioryMassage.py diff --git a/current/templates/panda_server-runRebro.sh.exe.template b/current/templates/panda_server-runRebro.sh.exe.template deleted file mode 100755 index 24dfc91c7..000000000 --- a/current/templates/panda_server-runRebro.sh.exe.template +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# setup grid stuff -source /opt/glite/etc/profile.d/grid-env.sh - -# import env vars from sysconfig -source @@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python2.5 @@install_purelib@@/pandaserver/test/runRebro.py diff --git a/current/templates/panda_server-sysconfig.rpmnew.template b/current/templates/panda_server-sysconfig.rpmnew.template deleted file mode 100644 index 7d4d5f482..000000000 --- a/current/templates/panda_server-sysconfig.rpmnew.template +++ /dev/null @@ -1,31 +0,0 @@ -# Configuration file for the httpd service. - -OPTIONS="-f @@install_dir@@/etc/panda/panda_server-httpd.conf" - -# for FastCGI/WSGI -#OPTIONS="-f @@install_dir@@/etc/panda/panda_server-httpd-FastCGI.conf" -#HTTPD='/usr/sbin/httpd' - -# for DQ2 -export X509_CERT_DIR=/etc/grid-security/certificates -export RUCIO_ACCOUNT=panda -export RUCIO_APPID=pandasrv - -# panda home -export PANDA_HOME=@@install_dir@@ - -# timezone -export TZ=UTC - -# import panda modules -export PYTHONPATH=@@install_purelib@@/pandacommon:@@install_purelib@@/pandaserver - -# avoid to use AFS -export HOME=/home/atlpan - -# set user's proxy -export X509_USER_PROXY=FIXME - -# panda server URLs -export PANDA_URL='http://localhost:25080/server/panda' -export PANDA_URL_SSL='https://localhost:25443/server/panda' diff --git a/current/templates/panda_server-tmpwatch.sh.exe.template b/current/templates/panda_server-tmpwatch.sh.exe.template deleted file mode 100644 index 40fbd2711..000000000 --- a/current/templates/panda_server-tmpwatch.sh.exe.template +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -# import env vars from sysconfig -source 
@@install_dir@@/etc/sysconfig/panda_server-sysconfig - -python @@install_purelib@@/pandaserver/test/tmpwatch.py diff --git a/current/templates/panda_server-vomsrenew.sh.exe.template b/current/templates/panda_server-vomsrenew.sh.exe.template deleted file mode 100755 index c4771655e..000000000 --- a/current/templates/panda_server-vomsrenew.sh.exe.template +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -source /etc/profile.d/grid-env.sh - -NOVOMS=/data/atlpan/x509up_u25606_novoms - -voms-proxy-init -voms atlas:/atlas/Role=production -out /data/atlpan/x509up_u25606 -valid 96:00 -cert=$NOVOMS - -# check lifetime of certificate -grid-proxy-info -e -h 504 -f $NOVOMS -if [ $? -ne 0 ]; then - echo $NOVOMS expires in 3 weeks on `hostname` | mail -s "WARNING : Grid certificate expires soon on panda server" atlas-adc-panda-support@cern.ch -fi - -# check lifetime of proxy -voms-proxy-info -exists -hours 72 -file /data/atlpan/x509up_u25606 -if [ $? -ne 0 ]; then - echo /data/atlpan/x509up_u25606 expires in 3 days on `hostname` | mail -s "WARNING : Grid proxy expires soon on panda server" atlas-adc-panda-support@cern.ch,atlas-adc-expert@cern.ch -fi - diff --git a/current/templates/panda_server.cfg.rpmnew.template b/current/templates/panda_server.cfg.rpmnew.template deleted file mode 100644 index f3274cec3..000000000 --- a/current/templates/panda_server.cfg.rpmnew.template +++ /dev/null @@ -1,258 +0,0 @@ -[server] - - -########################## -# -# Logger parameters -# - -# log directory -logdir=@@install_dir@@/var/log/panda - -# logger name -loggername = prod - - - -########################## -# -# Transaction parameters -# - -# lock file for getJobs -lockfile_getJobs = %(logdir)s/getJobs.lock - -# lock file for getSerialNumber -lockfile_getSN = %(logdir)s/getSN.lock - -# lock file for accessing email DB -lockfile_getMail = %(logdir)s/getMail.lock - -# lock file for updateDatasetStatus -lockfile_setDS = %(logdir)s/setDS.lock - -# lock file for getCloudTask -lockfile_getCT
= %(logdir)s/getCT.lock - -# lock file for uuidgen -lockfile_getUU = %(logdir)s/getUU.lock - - - -########################## -# -# DA parameters -# - -# cache space -cache_dir = @@install_dir@@/var/cache/pandaserver - - - -########################## -# -# DDM parameters -# - -# dq2 dir -dq2_dir = /opt/dq2 - -# globus dir -globus_dir = /opt/globus - -# path to native python -native_python = /data/atlpan/bin - -# path to python for lfc client (/data/atlpan/bin/python cannot be used due to lack of libpythonX.Y.so) -native_python32 = /usr/bin - -# glite source file -glite_source = /opt/glite/etc/profile.d/grid-env.sh - -# location for Panda common -pandaCommon_dir = @@install_purelib@@/pandacommon - -# location for Panda server -pandaPython_dir = @@install_purelib@@/pandaserver - -# location for LFCclient -lfcClient_dir = %(pandaPython_dir)s/brokerage - -# home dir to change CWD -home_dir_cwd = /home/atlpan - - - -########################## -# -# Database parameters -# - -# host -dbhost = ADCR_PANDA - -# user -dbuser = ATLAS_PANDA_WRITER - -# password -dbpasswd = FIXME - -# database -dbname = PandaDB - -# number of connections -nDBConnection = 2 - -# number of connections for FastCGI/WSGI -nDBConForFastCGIWSGI = 1 - -# use timeout -usedbtimeout = True - -# timeout value -dbtimeout = 300 - -# verbose in bridge -dbbridgeverbose = False - -# SQL dumper -dump_sql = False - - - -########################## -# -# Panda server parameters -# - -# port -pserverport = 25443 - - - -########################## -# -# proxy parameters -# - -# http -httpProxy = "" - - - -########################## -# -# E-mail DB parameters -# - -# database name for local caching -emailDB = %(logdir)s/email_db - -# SMTP server -emailSMTPsrv = cernmx.cern.ch - -# sender address for notification -emailSender = atlpan@cern.ch - -# login name for SMTP -emailLogin = atlpan - -# login password for SMTP -emailPass = FIXME - - - -########################## -# -# parameters for dynamic task assignment -# - -#
enable dynamic task assignment -enableDynamicTA = True - - - -########################## -# -# parameters for redirection service -# - -# enable redirection service -enableRedirection = False - - - -########################## -# -# parameters for FastCGI/WSGI -# - -# use FastCGI with flup -useFastCGI = False - -# use WSGI without flup -useWSGI = True - -# verbose in entry point -entryVerbose = False - - - -########################## -# -# parameters for memcached -# - -# use memcached -memcached_enable = True - -# memcached servers -memcached_srvs = voatlas248.cern.ch:11211,voatlas249.cern.ch:11211,voatlas250.cern.ch:11211,voatlas251.cern.ch:11211,voatlas252.cern.ch:11211,voatlas253.cern.ch:11211 - -# expiration time in memcached -memcached_exptime = 86400 - - - -########################## -# -# nRunning parameters -# - -# interval -nrun_interval = 5 - -# the number of hosts -nrun_hosts = 3 - -# serial number -nrun_snum = 999 - - - -########################## -# -# Cassandra -# - -# use Cassandra for PandaCache -cacheUseCassandra = False - -# ignore Cassandra error -cacheIgnoreCassandraError = True - -# keyspace for PandaCache -cacheKeySpace = PandaCacheKeySpace - -# column family for files -cacheFileTable = FileTable - - - -########################## -# -# Job Status Monitor -# - -# enable job status change monitoring -record_statuschange = False