diff --git a/current/INSTALL.txt b/current/INSTALL.txt
deleted file mode 100644
index e3358f664..000000000
--- a/current/INSTALL.txt
+++ /dev/null
@@ -1,137 +0,0 @@
-Installation
---------------------
-
-1. Check out panda-common and panda-server.
-
-$ svn co svn+ssh://svn.cern.ch/reps/panda/panda-common/tags/X.Y.Z panda-common
-$ svn co svn+ssh://svn.cern.ch/reps/panda/panda-server/tags/A.B.C panda-server
-
-* For tar-ball installation
-
-$ cd panda-common
-$ python setup.py install --prefix=INSTALLDIR
-$ cd ../panda-server
-$ python setup.py install --prefix=INSTALLDIR
-
-where INSTALLDIR is /data/atlpan/testsrv, for example.
-
-* For RPM installation
-
-$ cd panda-common
-$ python setup.py bdist_rpm
-$ sudo rpm -Uvh dist/panda-common-*.noarch.rpm
-$ cd ../panda-server
-$ python setup.py bdist_rpm
-$ sudo rpm -Uvh dist/panda-server-*.noarch.rpm
-
-INSTALLDIR is set to /data/atlpan/srv automatically for RPMs
-
-
-2. Modify config files
-
-$ cd INSTALLDIR/etc/panda
-$ mv panda_common.cfg.rpmnew panda_common.cfg
-$ mv panda_server.cfg.rpmnew panda_server.cfg
-$ mv panda_server-httpd.conf.rpmnew panda_server-httpd.conf
-$ emacs -nw panda_server.cfg
-
-Fix the FIXME entries, e.g.
-
-dq2_dir = /opt/dq2
-
-->
-
-dq2_dir = /data/atlpan/DQ2Clients/DQ2Clients
-
-$ emacs -nw panda_server-httpd.conf
-
-SSLCertificateFile InstallDir/etc/panda/server.crt
-SSLCertificateKeyFile InstallDir/etc/panda/server.key
-
-->
-
-SSLCertificateFile /etc/httpd/conf/ssl.crt/server.crt
-SSLCertificateKeyFile /etc/httpd/conf/ssl.key/server.key
-
-$ cd INSTALLDIR/etc/sysconfig
-$ mv panda_server-sysconfig.rpmnew panda_server-sysconfig
-$ emacs -nw panda_server-sysconfig
-
-Add the following line:
-
-export X509_USER_PROXY=/data/atlpan/x509up_u25606
-
-
-3. Add .gacl
-
-$ cd INSTALLDIR/lib/python*/site-packages/pandaserver/server/
-$ emacs -nw .gacl
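-
-The exact entries depend on who should be allowed to reach the server pages.
-A minimal, permissive sketch in GridSite's GACL format (illustrative only;
-tighten it to your own access policy):
-
-<gacl>
-  <entry>
-    <any-user/>
-    <allow><read/></allow>
-  </entry>
-</gacl>
-
-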
-4. Add grid-env.sh if needed
-
-e.g.,
-$ cat INSTALLDIR/etc/grid-env.sh
-export LD_LIBRARY_PATH=/opt/glite/lib64:/opt/globus/lib:/opt/lcg/lib64:$LD_LIBRARY_PATH
-export PYTHONPATH=/opt/glite/lib64/python:/opt/lcg/lib64/python:$PYTHONPATH
-export PATH=/opt/edg/bin:/opt/glite/bin:/opt/globus/bin:/opt/lcg/bin:$PATH
-
-and modify panda_server.cfg
-
-$ emacs -nw INSTALLDIR/etc/panda/panda_server.cfg
-
-glite_source = /opt/glite/etc/profile.d/grid-env.sh
-
-->
-
-glite_source = INSTALLDIR/etc/grid-env.sh
-
-
-5. Make log and cache dirs, and change owner if RPM is used
-
-mkdir -p INSTALLDIR/var/log/panda
-mkdir -p INSTALLDIR/var/log/panda/wsgisocks
-mkdir -p INSTALLDIR/var/cache/pandaserver
-chown atlpan:zp INSTALLDIR/var/log/panda
-chown atlpan:zp INSTALLDIR/var/log/panda/wsgisocks
-chown atlpan:zp INSTALLDIR/var/cache/pandaserver
-
-6. For voatlas
-
-cp ~/devsrv/share/httpd-pandasrv /etc/rc.d/init.d/
-/sbin/chkconfig --add httpd-pandasrv
-cp ~/devsrv/share/panda_server-httpd.conf.VM /data/atlpan/srv/etc/panda/panda_server-httpd.conf
-cp ~/devsrv/share/panda_server.cfg.VM /data/atlpan/srv/etc/panda/panda_server.cfg
-cp ~/devsrv/share/x509up_u25606_novoms /data/atlpan/
-chown atlpan:zp /data/atlpan/x509up_u25606_novoms
-cp ~/devsrv/share/pandasrv /etc/logrotate.d/
-cp ~/devsrv/share/pandasrv.cron /etc/cron.d/
-
-
-Start the server
---------------------
-
-Add the following to crontab.
-
-0-59/5 * * * * INSTALLDIR/usr/bin/panda_server-add.sh > /dev/null 2>&1
-15 0-21/3 * * * INSTALLDIR/usr/bin/panda_server-copyArchive.sh > /dev/null 2>&1
-
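-One way to install the entries above (a sketch; it assumes the cron jobs run
-under the same service account used for the install, e.g. atlpan):
-
-$ sudo crontab -u atlpan -e
-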
-Run the server.
-
-$ sudo INSTALLDIR/etc/init.d/panda_server-ctl start
-
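-A quick sanity check after starting (illustrative; the process name comes from
-panda_server-httpd.conf and the log location from step 5):
-
-$ ps aux | grep panda_server-httpd | grep -v grep
-$ ls -lt INSTALLDIR/var/log/panda/ | head
-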
-Stop the server.
-
-$ sudo INSTALLDIR/etc/init.d/panda_server-ctl stop
-
-
-
-
-
-
diff --git a/current/MANIFEST.in b/current/MANIFEST.in
deleted file mode 100644
index 46666ccfe..000000000
--- a/current/MANIFEST.in
+++ /dev/null
@@ -1,2 +0,0 @@
-include *.txt *.py *.cfg
-recursive-include templates *.template
diff --git a/current/README.txt b/current/README.txt
deleted file mode 100644
index 7df534fd3..000000000
--- a/current/README.txt
+++ /dev/null
@@ -1,967 +0,0 @@
-Release Notes
-
-* 0.0.18 (7/2/2013)
- * tagged for JEDI
- * fixed datriHandler for SLC6
- * improved getLFNsInUseForAnal
- * fixed getScriptOfflineRunning for athenaMP
- * fixed dispatcher so that install jobs can run on sites with status=test
- * fixed for ANALY_BNL_SHORT and ANALY_BNL_LONG
- * included group analysis jobs in the priority massager
- * removed priority boost for group analysis jobs
- * fixed brokerage to respect preset computingSite even for too many input
- jobs in cloud with negative t1weight
-
-* 0.0.17 (4/27/2013)
- * giving a higher prio to install jobs
- * split runRebro from copyArchived
- * fixed retryInActive to reset file status
- * modified dispatcher to send prodSourceLabel for getJob
- * changed ATLAS_PANDALOG.USERS_ID_SEQ to ATLAS_PANDAMETA.USERS_ID_SEQ
- * added TaskMonitor link to email notifications
- * changed getJob() to allow the prod/analy pilot to get installation jobs
- * fixed retryJobsInActive
- * fixed datasetManager to delete sub from foreign T1 instead of home T1
- * improved getDisInUseForAnal
- * added boostUser
- * improved fairshare to support per-cloud shares
- * changed Setupper to register both DATADISK and PRODDISK as locations for sub
- * changed job/task brokerages not to check DBR with DQ2 at CVMFS sites
- * changed the brokerage to skip release checks for releases=ANY
- * fixed for site.priorityoffset
- * fixed T2 cleanup to check if there is active subscription
- * fixed brokerage and copyArchive for RU
- * changed insertNewJob not to insert metadata when it is empty
- * fixed killUser to kill jobs gradually
- * fixed Setupper to make dis for pin at MCP sites in ND cloud
- * fixed Setupper to take cloudconfig.tier1se into account for dis subscriptions
- * set a limit on G/U in the brokerage
- * sending more info in PD2P logging
- * fixed LFC lookup in the brokerage
- * changed PD2P to be triggered by the second job
- * removed multiCloudFactor from the brokerage for NL
- * added a protection to updateJobStatus to prevent holding->transferring
- * fixed getUserParameter to insert new row if the user is missing
- * fixed Setupper to trigger prestaging when sites with multi-endpoints use TAPE
- * put all info to ErrorDiag in the brokerage
- * added modificationTime constraint to URL sent to the user by Notifier
- * introduced ProcessLimiter
- * changed TA to shorten retry interval after refreshing replica info
- * skipping file availability check for log datasets in TA
- * using cloudconfig.tier1SE to count files at T1
- * setting scope only for ATLAS
- * improved the task brokerage to check datasets with fewer replicas first
- * set limit on the number of IDs to be sent to the logger for reassign/killJobs
- * removed LFC lookup from TA
- * changed PD2P to use secondary share
- * fixed to use correct DQ2 site ID for pinning at sites with multi-endpoints
- * modified to send scopes for output files to the pilot
- * added changeJobPriorities
- * using DATADISK for MCP T1 input at all T1s except US
- * added filespec.scope
- * reducing lifetime of dis when corresponding jobs finished and some of them failed
- * improved the brokerage to count the number of running jobs per processingType
- * using transferringlimit in the brokerage
- * fixed the bulk OK file lookup again for unique ddm endpoint sites
- * reduced interval of PandaMover reattempts to 15min from 3h
- * fixed the bulk OK file lookup in the brokerage for multiple ddm endpoints
- * increased the number of PandaMover channels to 15
- * using DATADISK for MCP T1 input at CERN
- * using a default fairshare defined per cloud if T2 doesn't define share
- * added a protection against overwriting of dataset status by datasetMgr
- * implemented a nested fairshare management mechanism
- * fixed the brokerage message when release is missing for repro
- * fixed TA since non-DATADISK replicas at T1 prevented T2 replicas from being used
- * using DATADISK for MCP T1 input at ND,ES,DE,NL,TW
- * added a patch for MWT2 to associate MWT2_DATADISK in TA
- * allowed wildcards in cloudconfig.tier1SE
- * fixed Merger for standalone ROOT
- * fixed Closer to trigger merging for cancelled jobs
- * fixed Setupper to pin DBR as well
- * added a protection to Setupper for file lost after job submission
- * fixed getHighestPrioJobStatPerPG for group queue
- * added group queue to all clouds
- * added FOR UPDATE when getting jobdefID for users
- * removed hard-coded FZK-LCG2_DATATAPE removal in TA
- * set activity=Production to TA subscriptions
- * fixed weight reduction in TA for no input tasks
- * fixed the brokerage to send message to logger for too many transferring jobs
- * fixed wrong error message in TA when open dataset is incomplete
- * updated TA to use a special weight reduction when only TAPE is available
- * removed selector from fileCallbackListener
- * fixed for TMPDISK
- * fixed Setupper to scan T2 LFC per LFC host instead of per SE
- * fixed Setupper to use correct location when pinning dis at foreign T1
- * fixed sitemapper to allow multiple DQ2 site IDs to use the same token
- * added DQ2 registration time to SLS
- * fixed vomsrenew.sh to check certificate and proxy lifetime
- * fixed file-check in the brokerage for BNL@non-US
- * fixed brokerage not to overwrite file's destSE for destSE=local
- * introduced mcore queue in PG
- * added iscvmfs to SiteSpec
-
-* 0.0.16 (8/29/2012)
- * changed Setupper to make sub when data is available only at T2
- * changed Setupper to make sub when data is missing at T1
- * changed TA to pin input and skip replicas with ToBeDeleted
- * using share=secondary for non T2-close-source PD2P
- * added useWebCache() to Client
- * fixed getJobStatistics not to read archived via http by default
- * fixed Adder2 to skip destSE check for ddm=local
- * fixed LFCclient to randomly resolve DNS alias for LFC host
- * added makeSlsXml
- * patched smtplib.stderr to send debug info to logger
- * added 32/64 to getScriptOfflineRunning
- * changed JOBSARCHIVED4_MODTIME_IDX hint
- * enabled maxtime check for analysis brokerage
- * fixed to check T2 files when get reassigned
- * removed hints related to JOBSACTIVE4_JOBSTATUS_IDX
- * fixed setOK to check map
- * fixed resetDefinedJob for recordStatusChange
- * fixed updateJobStatus not to reset modificationTime of holding jobs
- * fixed file check not to use TAPE replicas when T1 is used as T2
- * disabled release check for CERN-RELEASE
- * enabled release check for CERN
- * removed EVNT from PD2P
- * removed the higher priority to phys-higgs
- * added _LONG as a suffix of hospital queue
- * fixed queryLastFilesInDataset against missing jobs which are still in fileDB
- * added setPriority.py
- * fixed updateJobStatus for endTime
- * updated the brokerage log to have timestamp
- * updated the brokerage to take maxtime into account
- * updated file-level callback
- * added Job Status Monitor
- * added --killUserJobs to killJob.py
- * added reliability-based brokerage for analysis jobs
- * fixed getDestSE to look into ARCH for sub datasets for failed log files
- * fixed rebrokerage when orig replica is set to ToBeDeleted
- * temporarily gave a higher priority to phys-higgs for ICHEP2012
- * added code=91 to allow prod role to kill user jobs gracefully
- * check LFC every hour for high prio transferring jobs
- * fixed datasetManager for T2 cleanup by recognizing T1 PRODDISK correctly
- * delete sub from PRODDISK except US cloud
- * added protection to ReBroker against redundant comma in excludedSite
- * added fatal errors for datri in Adder2
- * fixed Adder2 for missing src in schedconfig for analysis with destSE
- * changed brokerage to make a chunk for each diskCount/memory
- * added RbLauncher to run ReBroker in grid env
- * added more message to Finisher
- * fixed Adder2 for failed jobs to add files to sub
- * reduced the number of add.py
- * modified getHighestPrioJobStat to calculate per PG
- * added --noRunning to killTask
- * fixed insertSandboxInfo to use real file size
- * added checkSandboxFile
- * fixed brokerage for nightlies
- * extracting crc from input sandbox in putFile
- * added changes for debug mode
- * setting prestage sites with PandaMover dynamically
- * removed BNL_ATLAS_1 from SiteMapper
- * removed FILESTABLE4_DATASET_IDX
- * added more info to putFile
- * optimized getDisInUseForAnal in TB
- * fixed TA to ignore non-DATADISK replicas at T1
- * fixed brokerage for preassigned repro jobs
- * fixed dataset update timing check in Notifier
- * fixed zero suppression with wildcard in brokerage
- * fixed rebro to set the same specialHandling to build since new build may have different specialHandling
- * removed old hints
- * fixed DataServiceUtils to return an empty map when DQ2Map is set
- * using FOR UPDATE in lockJobForReBrokerage
- * added more debug INFO to Setupper
- * fixed DBProxy not to freeze top datasets for HC when build failed
- * fixed anal brokerage to take # of defined jobs into account
- * setting RUCIO_ACCOUNT and RUCIO_APPID
- * pin dis for foreign T2s in US cloud
- * removed special treatment for BNL from Adder
- * fixed the brokerage to get hospital queues automatically
- * updated brokerage to use coreCount
- * fixed Closer not to freeze any HC datasets
- * fixed Adder since Register2 gives DatasetExist error when it got deleted
- * enabled cap based on priority for CERN
- * not reset retried jobs in Watcher
- * check attemptNr in retryJob
- * added double quotes to all params in getScriptOfflineRunning
- * added jobMetrics
- * added a protection against non-integer PandaID in peekJob
- * changed to update only changed attributes in job tables
- * fixed runMerge not to be stopped due to a single dataset error
- * added debug message for execution time of DQ2(+LFC) registration
- * fixed storeJob to reset changed attribute list
- * disabled beyond-pledge for HC jobs
- * changed to update only changed attributes in filesTable4
- * added nOutputDataFiles and outputFileBytes to job tables
- * modified getScriptOfflineRunning to use parallel transfers
- * removed shadow lookup in Adder
- * disabled sub for computingSite=destinationSE
- * added getScriptOfflineRunning
- * added retry to Cassandra operations
- * changed killing with group prod role not to be case-sensitive
- * added getDis/LFNsInUseForAnal
- * added getPledgeResourceRatio to TB
- * added Cassandra file cache
- * added TAG support in EventPicker
- * added countGuidsClient
- * using SCRIPT_NAME in panda.py
- * removed _shadow creation in ReBroker
- * fixed queryLastFilesInDataset for the fileTable change
- * remove deleting datasets from the Datasets table
- * sending error log to the logger when TA cannot find dataset in DQ2
- * sending fsize and checksum to the pilot
- * added modificationTime<=CURRENT in getFilesInUseForAnal
- * added hint when deleting rows from Datasets
- * making larger subs by sorting jobs by site
- * instantiating dq2api in each thread
- * added hint to use 11g caching
- * removed constraint in TA to consider T1 and T2 equally
- * increased the lifetime of the proxy to 96h
- * fixed TA to select candidate T2s correctly
- * getting shadow info from filesTable
- * added vomsrenew.sh
- * fixed TA to count the number of files at US T2
- * check attemptNr
- * fixed for non-MC/DATA space at split T1
- * fixed TA to check completeness at T2
- * use correct locations for GEN dis when jobs directly go to T2
- * added protection to Adder2 against sites disappearance from schedconfig
- * added preferential analysis brokerage based on countryGroup
- * added more verbose message in Adder
- * Mikhail Titov updated datriHandler
- * fixed cloudlist to skip None
- * added getJobStatisticsPerUserSite
- * added 64bit in copyROOT
- * avoid priority reduction for merge jobs
- * use <= for maxDiskCount in getJob
- * fixed rebrokerage for --destSE
- * updated rebrokerage to be triggered 3 hours after the site is blacklisted
- * set maxAttempt to allow users to disable auto retry
- * changed global file map to local in brokerage
- * fixed Adder2 to use proper destination for token=TAPE when running at T1 as T2
- * updated killJob to take group prod role into account
- * updated brokerage to take priorities into account for prod jobs
- * using native DQ2 call in ToA
- * modified brokerage to do bulk LFC lookup per site
- * fixed brokerage_util to do LFC lookup per 1000 files instead of 100 files
- * fixed brokerageErrorDiag for repro + missingRel
- * fixed port of pandamon in email notification
- * fixed brokerageErrorDiag for useT2 + repro
- * set replica pin lifetime before deleting from T2
- * improved brokerage error diag
- * cleaned the brokerage for hospital queues
- * use 0 when memory=0 in one of online sites with the same siteID
- * fixed the brokerage to use RAL-LCG2_HIME as UK T1
- * touch input sandbox when tried to be overwritten
- * permit overwriting of input sandbox
- * reject limited proxy
- * added priority boost for gangarobot-pft
- * fixed getCriteria for aggregated sites
- * fixed brokerage for group=any:0%
- * fixed brokerage more for type=any:0%
- * fixed brokerage to take zero shares into account
- * fixed getCriteriaForProdShare for zero shares
- * added minPriority to Client.getJobStatisticsPerSite
- * using MV in getJobStatisticsWithLabel
- * added fairshare to getJob
- * fixed retryJob not to change the name of lib.tgz for ptest
- * fixed retryJob not to retry buildJob to keep the PandaID order
- * fixed TB to give higher prio to buildJob with prodRole
- * fixed Merger to use the largest SN for merged files
- * fixed queryLastFilesInDataset to ignore merged files
- * fixed brokerageErrorDiag for non missing release errors
- * added tmpwatch.py
- * changed hint in getJobs
- * fixed updateProdDBUpdateTime for pending jobs
- * fixed brokerage to accept test sites for prod_test jobs
- * changed getJobs for test pilots to get gangarobot jobs
- * setup glite in TaLauncher
- * added lock in lockDatasets
- * added version check in Merger to avoid duplicating merge jobs
- * changed Merger to fail when container name is too long
- * use lockJobsForReassign for reassign in copyArchive
- * use native DQ2 in copyArchive and datasetMgr
- * use python2.5 for copyArchive and prio-mgr
- * use native DQ2 in Setupper
- * fixed guid generation for user's log
- * introduced 2 staged submission for prod jobs
- * using T2 in TA
- * using materialized view for getJobStatistics family
- * updated Merger to put log files of merge jobs to a separate container
- * fixed Merger for --transferredDS
- * enabled rebrokerage for processingType=ganga
- * updated Adder for unique constraint error
- * added copyROOT
- * updated Adder to immediately go to failed when subscription failures
- * disabled prio boost for gangarobot derivatives
- * added protection to TA against undefined maxinputsize
- * updated TA and brokerage to use T2 datasets in prod
- * updated for DQ2 client 0.1.37
-
-* 0.0.15 (11/07/2011)
- * removed redundant freshness checks in getSN
- * changed hint in getSerialNumber
- * randomized job order in adder
- * decreased the number of adder processes
- * added more tight constraint to getJobStatistics family
- * reduced prio by 10 for pilot-retry jobs
- * increased the factor of the RW limit to 8000
- * updated Merger for --mexec
- * modified rebrokerage to send brokerage log
- * modified brokerage to send user's countryGroup and nJobs to logger
- * added a protection to httpd.conf for interesting panda.py
- * not attach attemptNr to lib.tgz for rc_test+buildJob
- * fixed parentID for retryJob with new PandaID
- * randomized the order of site check in analysis brokerage
- * added --killOwnProdJobs to killJob.py and killJobsInTask.py
- * fixed brokerage to require cache=None for release check
- * pinning input datasets
- * added limitation of exe/pilotErrorDiags in JD
- * fixed short->long mapping in retryJob
- * generates new PandaID for pilot-retried job
- * using negative errorcode for pilot-retry
- * added invalid character check to DDM
- * fixed the brokerage for --transferredDS
-
-* 0.0.14 (10/11/2011)
- * fixed TaskAssigner for MCshare=0
- * updated brokerage to consider priorities for analysis jobs
- * fixed brokerage for BNL_CVMFS_1
- * modified managed pilots to get prod_test as well
- * call addShadow even if DaTRI failed
- * fixed the error message of location registration in Setupper
- * modified ReBroker for server-side retry
- * reverted the brokerage change
- * changed brokerage to skip sites with memory=0 for analysis with memory
- * increased MaxClients
- * use DQ2 for foreign T2 in US cloud
- * use IN2P3-CC and IN2P3-CC_SGE_VL as FR T1 in brokerage
- * unset commandToPilot for jobs reassigned by rebrokerage
- * added retryJobsInActive
- * added --maxJobs and --running to killJobLowPrio.py
- * added killJobLowPrio.py
- * fixed killJob
- * simplified anal_finalizer
- * added SiteSpec.lfcregister
- * added getAttr
- * keep failed analysis jobs in Active until all jobs finished
-
-* 0.0.13 (8/30/2011)
- * fixed Adder2.removeUnmerged to catch DQ2 errors correctly
- * using subType in datasetManager
- * filling datasets.subtype
- * added protection against too large inputFileBytes
- * removed CN=Robot: from DN
- * added hint to DBProxy.getLockDatasets
- * reduced the number of table scan in datasetMgr and runMerge
- * fixed brokerage not to count jobs for usermerge or pandamover
- * changed brokerage to use ANALY_CERN_XROOTD and not to use ANALY_CERN
- * added Forker to add.py
- * updated dispatcher to send taskID
- * using schedconfig.multicloud
- * fixed brokerage for test sites
- * fixed brokerage not to count jobs for HC
- * fixed rebrokerage for CERN TMP
- * updated the brokerage to stop assigning prod jobs to sites which have many transferring
- * added jobdefID to libDS in ReBrokerage
- * disabled short -> long for HC
- * fixed SiteMapper to respect online even if another queue is not online
- * put attempt number to output file name in Merger
- * changed = to == in redundant messages
- * job-chaining for ptest+prun
- * added initLogger to Notifier
- * removed redundant suffix from DN for DaTRI request in EventPicker
- * added more message in EventPicker for DaTRI request
- * changed Notifier to non-thread
- * fixed Notifier to take into account old jobs in Arch
- * implemented new PD2P scheme using MoU and close sites
- * increased the number of concurrent Mergers
- * incrementing Datasets.currentfile only for the first failed job
- * fixed Watcher to append attemptNr when sent->activated
- * fixed resetDefJob
- * limited the number of jobs with the same GEN dis
- * fixed EventPicker to take input files into account
- * fixed Merger to use .tgz for text merging
- * added EventPicker
- * added statusmodtime to SiteSpec
- * updated Merger for runDir
- * updated rebrokerage to take --cloud into account
- * added tags into PD2P logging
- * updated Merger for mergeScript
- * fixed getFilesInUseForAnal to skip NULL dis datasets
- * updated analy_brokerage to use memory size
- * added cmtconfig to broker logging
- * enabled cross-cloud for US in PD2P
- * enabled banUser in storeJobs
- * enabled role-check in submitJobs
- * added WrappedPickle to avoid deserializing insecure objects
- * added banUser to storeJob
- * added prodSourceLabel check to UserIF
-
-* 0.0.12 (6/13/2011)
- * fixed Merger for --useContElement
- * fixed inputFileProject extraction for wildcard-uses
- * using basename in Utils methods
- * fixed fetchLog to disallow chdir
- * fixed panda.py to disallow unexpected methods
- * added getVomsAttr
- * updated getJob to decompose CERN-XYZ to CERN-PROD+processingType
- * updated the brokerage to use installedsw.cmtConfig
- * use MoU share for T1 PD2P
- * added getNumPilots
- * added prodSourceLabel=ssc as user's label
- * added --prodSourceLabel to killUser
- * fixed archiveJob for failed jobs with multiple dis
- * fixed Setupper to store GEN dis
- * disabled release check in the brokerage for x86_64-slc5-gcc43
- * implemented aggressive cleaning for PRODDISK
- * added priority boost for gangarobot
- * updated T2 cleanup to use grace_period='00:00:00'
- * cleanup copyArchive
- * changed analysis brokerage to use nRunning(max in last 24h)
- * increased # of active subscriptions to 2 in PD2P
- * added nRunning calculator to add.py
- * disabled priority reduction for merge jobs
- * sending analysis brokerage info to logger
- * updated PD2P not to check provenance since group datasets have mc*/data*
- * disabled PD2P to CERN-PROD_EOSDATADISK
- * added checkMergeGenerationStatus
- * enforce LFN-lookup to trigger getting replica map when reassigned
- * fixed brokerage for test jobs at test sites
- * use release matching for T2s in CERN cloud
- * skip release check for CERN and ND
- * set correct info to brokerageErrorDiag
- * send jobs to waiting when release/cache is missing
- * remove '' for |pilotOwners|
- * put cloud-boundary back to US
- * use SourcesPolicy.ALL_SOURCES for PD2P subscriptions
- * improved PD2P logger
- * included CERN to trigger PD2P
- * fixed typo in PD2P skip message
- * fixed zero-division in PD2P
- * enabled T1-T1 in PD2P
-
-* 0.0.11 (4/18/2011)
- * fixed getExpressJobs
- * use c-t-s for all files in merge jobs
- * modified runMerger to kill old process
- * disable Initializer when nDBConnection is 0
- * increased max attempt for rebrokerage to 5
- * changed the rebrokerage interval to 24h
- * skip init for jobDispatcher,dataService,userIF when nCon=0
- * added parameters in email notification
- * ignore LOCALGROUPDISK in PD2P
- * fixed auto type detection of Merger for THIST
- * use IN2P3-CC_VL for too many input or high prio jobs
- * gave T1 weight to IN2P3-CC_VL
- * added protection to Adder2 against DQ2 failure for jumbo datasets
- * updated Adder2 to avoid making DaTRI request for unmerged files
- * added protection against generating multiple Mergers for --individualOutDS
- * updated brokerage to give T1 weight to NIKHEF for repro jobs
- * fixed Merger for lib.tgz
- * added automatic merge type detection to Merger
- * updated Closer to redirect logging to parent as it doesn't work in nested threads
- * changed parameter convention for Merger
- * added merge job generation
- * set secondary for TA subscription
- * use TAIWAN-LCG2_HOTDISK for TW HOTDISK
- * disabled PD2P for ESD
- * set file.dispDBlock even if they are already available at the site
- * send jobDefID and cloud to the pilot
- * updated Setupper/Adder2 for T1 used as T2
- * set destDBlockToken to DATADISK
- * using home cloud to skip release check in the brokerage
- * reassign stuck T2 evgensimul more frequently
- * enabled release/cache check for US
- * using nRunning(cloud) in brokerage for multi-cloud
- * added fileGUID to updateInFilesReturnPandaIDs for file-level callback
- * set source to _subs for all clouds
- * using DQ2 API directly in Adder
- * added nInputDataFiles,inputFileType,inputFileProject,inputFileBytes
- * add hacks again to TA and Setupper for split T1
- * added EventLookup to PD2P
- * updated SiteMapper for multi-cloud
- * removed hacks from TA and Setupper for split T1
- * added forceOpt to runReBrokerage
- * fixed PD2P not to make sub when dataset is being deleted
- * changed PD2P not to send ESD to EOS
- * added a hint to getPandaIDsForProdDB to enforce function index
- * added comment_ to SiteSpec
- * put hacks back to TA and Setupper for split T1 which uses NIKHEF as src
- * set hidden metadata to _dis and _sub
- * removed REGEXP from Datasets cleanup
- * enabled rebrokerage for ganga-rbtest
- * fixed ReBroker for EOS
- * fixed ReBroker to add _shadow
- * use DATADISK for all PD2P subscriptions
- * close user datasets in container
- * set lifetime for dis and sub datasets
- * added --jobsetID to killUser.py
- * added protection against missing argument for jobID/jobsetID to killUser.py
- * trigger PD2P for EOS when nUsed >= 3
- * updated brokerage to take transferType into account
- * update modificationTime when going to Archived4
- * disabled extra replica making in PD2P
- * trigger PD2P for EOS when nUsed >= 2
- * added testG4sim16.py and testEvgen16.py
- * use diskThr=max(5%,3TB)-diskSize in PD2P
- * added killJobsInTask
- * set disk threshold in PD2P to 5GB
- * updated PD2P so that any analysis job using data makes subscriptions to CERN EOS
- * set specialHandling=rebro when reassigned by rebrokerage
- * fixed DQ2 ID conversion in PD2P for EOS
- * check free disk size in PD2P using DQ2.queryStorageUsage
- * use function index in getPandaIDsForProdDB
- * reduced the number of rotated logs
- * use cernmx.cern.ch
- * added getLockDatasets
- * added the number of succeeded jobs to the subject of Notification
- * added pd2p logging
- * added deleteJobs.py
- * split arch procedure to another cron
- * call taskbuffer.Initializer in forkSetupper.py to acquire Oracle environment handle correctly
- * use truncated DN when setting dataset owner
- * reassign evgen/simul with active state at T1 more aggressively
- * made SQLDumper iterable
- * added SQLDumper
- * added reassignTask
- * use getFullJobStatus in Notifier since some jobs can go to ARCH before notification
- * separate retry for Notifier
- * added retry to Notifier when failing to send notifications
- * express jobs
- * make new dis datasets even if files are already available at T2
- * short/long mapping for ANALY_LYON-T2
- * updated PD2P to use a negative weight based on the number of subs
- * ignore hidden datasets in PD2P
- * don't use modTime index on jobs_ARCH
- * set/increment nUsed in PD2P
- * use LFN for WN-level matchmaking
- * ignore datasets with provenance=GP for PD2P
- * don't reuse the same site in a single PD2P cycle
- * fixed brokerage to send warning when cache is missing
- * removed redundant holding for prod jobs in Watcher
- * more fixes to Setupper for rc_test
- * not reset holding analysis jobs when stateChangeTime=modTime
- * set stateChangeTime when job goes to holding for finished/failed
- * job chain for rc_test + gangarobot-rctest
- * added archivelogs
- * set tobeclosed to sub datasets of failed downstream jobs
- * rctest -> rc_test
- * reduced time interval to reassign waiting jobs to 30min
- * enabled user-triggered rebrokerage
- * send currentPriority in dispatcher
- * set localpool to specialHandling when beyond-pledge pilot got the job
- * fixed makeSub in TA for getAva change
- * added random sleep for Finisher in copyArchive
- * improved del in copyArchive to avoid redundant deletion
- * increased timelimit for copyArchive
- * added auto rebrokerage to copyArchive
- * report new PandaID to taskBufferErrorDiag when rebrokered
- * check processingType in rebrokerage
- * added code=8 to killJob for rebrokerage
- * first implementation of auto rebrokerage
- * added getCachePrefixes
- * removed apostrophes from prodUserName
- * fixed useNotifier in Closer for completed sub datasets
- * changed queryLastFilesInDataset to use MAX(lfn)
- * improved the space shortage message in TA
- * don't check missing files with LFC when site is already set
- * added -9 to killTask
- * added forceKill for prod jobs
- * changed the brokerage to use CERN-PROD_EOSDATADISK as the dest for CERN-EOS jobs
- * added enforce to Activator
- * changes for merge/unmerge jobs
- * rctest
- * deleteStalledJobs
- * removed hacks for last_insert_id of InnoDB
- * allowOtherCountry
- * updated datriHandler to prevent false http-requests
- * added a hint to getJobIDsInTimeRange against jobsActive4
- * added a hint to getJobIDsInTimeRange against jobsArchived4
- * changed hint in DBProxy.updateTransferStatus
- * changing TRF URL from BNL to CERN on the server side
- * fixed error message in brokerage for sites with status!=brokeroff
- * fixed brokerage for release check when schedconfig.rel != ''
- * changed countryGroup=ustlas to us
- * ignore gangarobot family in PD2P
- * disabled priority decreasing for HC jobs
- * use installedSW for base-release matching for analysis
- * $GROUPJOBSN
- * added getSerialNumberForGroupJob
- * use jobsetID in Notifier
- * use max memory/inputsize for each site
- * set jobsetID for ptest
- * changes for output container and short LFN for analysis
-
-* 0.0.10 (8/2/2010)
- * tagged for output container and short LFN for analysis
- * added setCloudTaskByUser
- * get list of PD2P clouds dynamically
- * send transferType to the pilot
- * imposed a size limit on uploaded files by users
- * fixed the task brokerage to take maxDiskCount into account
- * added a protection against empty jobParameters only for new jobs
- * fixed PD2P to remove the cloud boundary when counting nSites
- * disable brokerage for gangarobot
- * ignore HC and group jobs in PD2P
- * fixed PD2P to take non-PD2P sites into account when checking comp/incomp
- * fixed AtlasRelease for PD2P
- * enabled WN brokerage for ANALY_GLASGOW
- * updated Adder for --destSE=multiSites
- * use Data Brokering for PD2P
- * change MWT2_UC_DATADISK to MWT2_DATADISK in PD2P
- * delete replicas from T2 when locations != []
- * protection against meta/para=None in peekJob
- * kill ITB_INTEGRATION jobs in sent status
- * batchID
- * ignore dis/sub in PD2P
- * dispatchDBlockTokenForOut
- * added banUser.py and made --jobID optional in killUser.py
- * set activity='Data Consolidation' and acl_alias='secondary' to PD2P subscriptions
- * check replica at T1 in PD2P
- * added getActiveDatasets
- * don't move RAW,HITS,RDO by PD2P
- * allow prod proxy to kill anal jobs with 2 or 4
- * added PD2P
- * regard found=None as an incomplete replica
- * invoke listFileReplicasBySites only for incomplete sites in TA
- * fixed re-brokerage
- * fixed used file check for cancelled jobs
- * increased wait interval for reconnection in connection pool
- * updated ConBridge to kill child when connection failure
- * changed URL of panda mover trf
- * added a protection against method execution failure in panda.py
- * set dataset status for DaTRI requests
- * ignore DaTRI failure for duplicated requests
- * use DQ2 for email extraction
- * added -9 to killJob.py
- * added killUser.py
- * added alias to httpd.conf for trf URL
- * changed reading order in getPandaIDsWithJobID to avoid missing jobs
- * set taskBufferErrorDiag when running jobs are killed
- * prevent prod proxy from killing analysis jobs
- * added priority massager
- * added NG words to Notifier
- * avoid sending DaTRI requests for failed jobs
- * fixed replica registration for --destSE
- * set type in datriHandler for analysis system
- * testpanda -> panda
- * introduced datriHandler
- * delete sub datasets from EGEE T2 when callback is received
- * set REMOTE_HOST to creationHost
- * increased priority boost for activated jobs
- * delete cancelled from jobsDefined4
- * added boostPrio.py
- * added cvs,svn,grid,librarian to NG words
- * True/False for schedconfig.validation
- * added admin to NG words for Notifier
- * added cancelled state
-
-* 0.0.9 (4/13/2010)
- * increased the subscription limit to 600 in TA
- * protection against reassigning analysis jobs
- * enabled cache-matching brokerage for all EGEE clouds
- * enabled cache-matching brokerage for NL/FR
- * added a protection for containers composed of multiple datasets
- * added processingType to runBrokerage for HC
- * doesn't check release matching for CERN
- * cache-matching in the brokerage for DE
- * added getHighestPrioJobStat
- * changed weight for the task brokerage to use RW instead of fullRW
- * fixed getFilesInUseForAnal for --individualOutDS
- * added getQueuedAnalJobs
- * updated brokerage to assign one prod_test job to a site
- * disable prod role for non-group activity
- * use maxinputsize in the brokerage
- * added schedconfig stuff to template
- * removed cx_Oracle from FileSpec
- * removed MySQLdb from broker_utils
- * added maxinputsize
- * modified xyzCacheDB to take a list of siteIDs
- * suppressed warning messages in dashboard
-
-* 0.0.8 (2/2/2010)
- * tagging for SLC5 migration
- * added hostname matching for T3 pilots
- * use listFileReplicasBySites in TA
- * added checkFilesWithCacheDB
- * changed the default cmtconfig to SL4 for analysis in brokerage
- * updated the brokerage to allow slc4 jobs on slc5 sites
- * added killTask.py
- * added addFilesToCacheDB and flushCacheDB
- * modified dispatcher to accept service proxy
- * added WN-level file matching to getJob
- * added MemProxy
- * fixed brokerage to skip release/cache matching for ND
- * use all source locations for dis
- * use long hint for queryDatasetWithMap
- * added /Engage/LBNE/Role=pilot to acceptable roles
- * added analy_test to getJob for test pilots
- * use poffset regardless of accesscontrol
- * removed / from FQAN check in allowedgroups
- * limit the max number of files in sub dataset
- * use fasttrack only for evgen/simul
- * added cleanup in updateSiteData
- * added chdir to LFC
- * added chdir for dq2 and fork
- * removed logging updateJob/getJob from dispatcher
- * use averaged updateJob/getJob
- * ignore test when summing SiteData
- * don't update SiteData when logrotate is running
- * randomized the order of sites in updateSiteData to avoid contention
- * fixed checkSitesWithCache
- * multi-threads in adder.py
- * count number of updateJob/getJob in add.py
- * use taskBuffer in add.py for all DB access
- * use fasttrack for all tasktypes and prio>=700
- * use taskBuffer for reassignment in copyArchived
- * cleanup old PandaSiteIDs for UK
- * set the number of threads to 2 in wsgi daemon
- * set MaxRequestsPerChild
- * enabled KeepAlive for proxy sites
- * check filename FieldStorage when a param is treated as file
- * not delete dis datasets when jobs are reassigned
- * check useFastCGI before importing flup
- * introduced nDBConForFastCGIWSGI
- * fixed Setupper to re-register location at next attempt when previous was failed
- * changed logLevel in httpd
- * added flag to control verbosity of entry point
- * added FastCGI stuff
-
-* 0.0.7 (11/20/2009)
- * removed verbose message from DBProxyPool
- * more verbose info to DBProxyPool
- * fixed ReBrokerage to require the same distribution pattern of input datasets
- * set encoded nJobs to taskID for analysis jobs
- * fixed ReBrokerage
- * propagate bad state from dashboard
- * removed threading in dispatcher and dataservice
- * fixed typo in dashboard access
- * fixed CloudTaskSpec for serialization
- * close non-DQ2 destinationDBlock in Closer
- * use infinite loop in ProxyPool.__init__
- * add random sleep to ConBridge.connect
- * use TaskBuffer instead of DBProxy in copyArchive
- * added querySQLS to DBProxy
- * use ping for wakeUp
- * degrade message level of child termination in ConBridge
- * added ConBridge for database timeout
- * re-implemented rebrokerage to allow the case where build finished
-
-* 0.0.6 (11/13/2009)
- * destinationSE=local
- * propagate failed_transfer from dashboard
- * added activity to subscriptions
- * added cleanup for Datasets table
- * added workaround for x86_64-slc5-gcc43
- * removed TO_DATE for Datasets.modificationdate
- * set priority of buildJob back to 2000
- * renamed testpanda.ddm to pandaddm_
- * added /osg/Role=pilot
- * added lower limit for TO_DATE against Datasets table
- * added protection in JobDispatch against non-proxy pilots
- * added ReBroker
- * removed UAT stuff
- * use long queue in brokerage in addition
- * increased max subjobs in UserIF to 5000
- * send log message from brokerage when disk shortage
- * use ANALY_LONG_BNL_ATLAS for UAT
- * added temporary priority boost for UAT
- * added YY.MM.DD to destinationDBlock of PandaMover
- * skipped release check in brokerage when weight is negative
- * removed T1 constraint on high prio jobs in brokerage only for i686-slc5-gcc43-opt
- * limit matching of cmtconfig=i686-slc5-gcc43-opt to i686-slc5-gcc43-opt jobs only
- * changed brokerage to use only T1 for many input jobs when weight is negative
- * removed computingElement matching in getJob for test jobs
- * use transtimelo for timeout of analysis transfers
- * fixed for site->siteid in installedSW
- * added protection to _checkRole()
- * use cache version matching for analysis
- * added 'user' to NG words in Notifier
- * take '_' into account in Closer for new naming convention
- * use onlyNames in dq2.listDatasets
- * changes for destSE
- * changed cmtconfig for slc5 to match to slc4 and slc5
- * set pandamover priorities using original job priorities
- * added HOTDISK to Setupper
- * added PandaMonURL to email notification
- * send email notification to site contact in addition to cloud contact
- * use schedconfig.DN for privilege check in addition to cloudconfig
- * ptest for analy tests
- * use SARA-MATRIX for all T1 sources
- * more NG words in address finding
- * skip VUID lookup for analysis jobs
- * added getSlimmedFileInfoPandaIDs
- * added a hint for filesTable_ARCH
- * limited modificationTime on filesTable_ARCH queries
- * allowed the pilot to set status for failed input files
- * make subscription for ptest
- * use /atlas for auth of updateFileStatusInDisp
- * added updateFileStatusInDisp to flag lost files
- * removed double counting of jobs in Notifier
- * updated template
- * changed LogFormat for SLS
- * send prodDBlockToken to the pilot
- * modified Adder to take DQUnknownDatasetException into account
- * make subscriptions for rc_test
- * flagged all missing files in Setupper
- * added jobType to Client.getJobStatisticsPerSite
- * use stage-priority for prestaging
- * updated the brokerage to take input size into account
- * use cleanUserID in Notifier
- * add copysetup to SiteSpec
- * fixed getCurrentSiteData for analysis
- * use pilotowners for checkRole in dispatcher
- * ignore DBRelease when adding shadow
- * support getJobStatisticsPerSite(countryGroup=None,workingGroup=None)
- * added two more fields to dis datasetname
- * calculate priority for each workingGroup
- * added finder for email address using phonebook
- * reverted the change in Setupper
- * register location for _sub even when src=dest
- * workingGroup/countryGroup in getJobStatisticsPerSite
- * added getPandaClientVer
- * fixed MailUtils for multiple recipients
- * reuse unknown input files when build failed
- * use T1 in brokerage when too many inputs are required
- * added a timeout to Client
- * set sources of dis for all clouds
- * use MCTAPE for subscriptions
- * added trustIS to runBrokerage
- * added longFormat to listSiteAccess
- * added set to updateSiteAccess
- * verify workingGroup
- * send email update/request for site access
- * kill old dq2 processes
- * added updateSiteAccess
- * workingGroup
- * added MailUtils
- * prestaging for MCTAPE
- * set processingType for mover
- * get proxy for each job in getFullJobStatus
- * fixed address-check to trigger xwho
- * introduced NG words in email-address finding
- * put size limit in putFile
- * set higher priority for installation mover
- * skip files used by failed/finished jobs in getFilesInUseForAnal
- * removed BNL and old bamboo stuff from Client.py
- * added a hint to updateInFilesReturnPandaIDs
- * added getFilesInUseForAnal
- * set sources for ES
- * added a hint to getJobIDsInTimeRangeLog
- * removed white spaces from md5sum/checksum in peekJobLog
-
-* 0.0.5 (5/15/2009)
- * subtract N*250M from available space in brokerage
- * use tasktype2 for RW recalculation
- * allow transferring in updateJob
- * use job stat per process group in brokerage
- * added prodUserName
- * added validation to test
- * fixed TA
- * use prodUserName for users
- * added nEvents to JD
- * added pilotowners
- * added rc_test
- * added a hint for Datasets.name
- * enabled validatedReleases for all clouds
- * set high priority for production role
- * added realDatasetsIn
- * get empty list of LFNs for empty dataset
- * set modificationTime to ARCH tables
- * fixed getUserParameter
- * added nInputFiles for HC
- * added countryGroup for country share
- * use a hint for filesTable4.dataset
- * fixed lookup for mail addr
- * use PandaMover for US
- * give higher priorities to /atlas/xyz/Role=production
- * set workingGroup when jobs are submitted with prod role
- * fixed peekJobLog
- * replica location lookup for containers
- * fixed broker_util to use proper python
- * use jobParamsTable
- * fixed python path to use 64bit glite
- * fixed for ArchivedDB
- * fixed FQAN extraction for GRST_CONN
- * dispatchDBlockToken
- * converted datetime to str for stateChangeTime
- * use 12hr limit in getJobStatisticsForBamboo
- * use CERN-PROD_DAQ for prestaging when _DATATAPE is not a location
- * ignore token=ATLASDATATAPE when no tape copy
- * pandasrv -> pandaserver
- * set old=False for listDatasetReplicas
- * fixed copyArchived for ArchiveDB
- * added _zStr/_nullAttrs in JobSpec
- * fixed getJobStatisticsForExtIF()
- * fixed for schedID/pilotID
- * removed redundant debug message
- * fixed for Notification
- * input token for mover
- * set NULL for creationHost,AtlasRelease,transformation,homepackage
- * use sequences directly for PandaID and row_ID
- * use SUBCOUNTER_SUBID_SEQ directly
- * added a hint to countFilesWithMap
- * fixed getNUserJobs
- * removed log/cache dir creation
- * put alias to filesTable4 in countFilesWithMap
- * introduced PANDA_URL_MAP
- * suppressed meta in JobSpec
- * error handling in Adder
- * fixed enddate in Notifier
- * use CURRENT_DATE in copyArch
- * added nprestage
- * added startTime/endTime in updateJob
- * validatedreleases and accesscontrol
- * 3 -> 1hour for movers (discarded)
- * added 'IS NULL' to copyArch
- * added bulk reading for PandaID to copyArch to avoid redundant lookup
- * added a hint to updateOutFilesReturnPandaIDs
- * use Null instead of 'NULL'
- * don't reset jobParameters when reassigned
- * added a hint to all fileTable4+destinationDBlock
- * use JOBSARCHIVED4_MODTIME_IDX
- * addSiteAccess and listSiteAccess
- * hours=1 -> 3 for movers
- * retry in peekJob
- * reconnection in rollback
- * added hint to queryDatasetWithMap
- * use bind-variables for all queries
- * fixed freezeDS
- * fixed a duplicated variable in Closer
- * truncate ddmErrorDiag
- * hint to freezeDS
- * removed deleteFiles in copyArchived
- * not update modTime in copyArchived when peekJob failed
- * container-aware
- * validatedreleases and space check in brokerage
- * added deleteJobSimple
- * use validatedreleases for FR too
- * fixed reassignXYZ
- * use archivedFlag for copy/delete
- * fine lock for reassignRepro
- * threading for reassignRepro
- * improved expiration messages
- * failed when input dataset is not found in DQ2
- * debug messages in Setupper
- * added other error codes in rollback
-
-* 0.0.4 (2/23/2009)
- * GSI authentication for pilots
- * tag-based security mechanism for scheduler-pilot-server chain
- * fixed test/add.py to use Oracle instead of MySQL
- * fixed querySQLS for DELETE
- * added panda_server-grid-env.sh
- * merged DB proxies to reduce the number of connections
- * added lock for worker MPM
- * use common write account
-
-* 0.0.3 (2/16/2009)
- * sync to production version
-
-* 0.0.2 (12/18/2008)
- * adjustments for CERN
-
-* 0.0.1 (12/4/2008)
- * first import
-
- LocalWords: ConBridge
diff --git a/current/pandaserver/__init__.py b/current/pandaserver/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/current/pandaserver/brokerage/ErrorCode.py b/current/pandaserver/brokerage/ErrorCode.py
deleted file mode 100644
index ea80122e4..000000000
--- a/current/pandaserver/brokerage/ErrorCode.py
+++ /dev/null
@@ -1,9 +0,0 @@
-############## error code
-
-# release is not found
-EC_Release = 100
-
-# voms authentication failure
-EC_Voms = 101
-
-
diff --git a/current/pandaserver/brokerage/LFCclient.py b/current/pandaserver/brokerage/LFCclient.py
deleted file mode 100755
index 06a857e8e..000000000
--- a/current/pandaserver/brokerage/LFCclient.py
+++ /dev/null
@@ -1,152 +0,0 @@
-import re
-import os
-import sys
-import socket
-import random
-
-# error codes
-EC_Main = 70
-EC_LFC = 80
-
-# import lfc api
-try:
- import lfc
-except:
- print "ERROR : could not import lfc"
- sys.exit(EC_LFC)
-
-
-# get files from LFC
-def _getFilesLFC(files,lfcHost,storages,verbose=False):
- # randomly resolve DNS alias
- if lfcHost in ['prod-lfc-atlas.cern.ch']:
- lfcHost = random.choice(socket.gethostbyname_ex(lfcHost)[2])
- # set LFC HOST
- os.environ['LFC_HOST'] = lfcHost
- # timeout
- os.environ['LFC_CONNTIMEOUT'] = '60'
- os.environ['LFC_CONRETRY'] = '2'
- os.environ['LFC_CONRETRYINT'] = '6'
- # get PFN
- iGUID = 0
- nGUID = 1000
- pfnMap = {}
- listGUID = []
- for guid in files.keys():
- if verbose:
- sys.stdout.write('.')
- sys.stdout.flush()
- iGUID += 1
- listGUID.append(guid)
- if iGUID % nGUID == 0 or iGUID == len(files):
- # get replica
- ret,resList = lfc.lfc_getreplicas(listGUID,'')
- if ret == 0:
- for fr in resList:
- if fr != None and ((not hasattr(fr,'errcode')) or \
- (hasattr(fr,'errcode') and fr.errcode == 0)):
- # get host
- match = re.search('^[^:]+://([^:/]+):*\d*/',fr.sfn)
- if match==None:
- continue
- # check host
- host = match.group(1)
- if storages != [] and (not host in storages):
- continue
- # append
- if not pfnMap.has_key(fr.guid):
- pfnMap[fr.guid] = []
- pfnMap[fr.guid].append(fr.sfn)
- else:
- print "ERROR : %s" % lfc.sstrerror(lfc.cvar.serrno)
- sys.exit(EC_LFC)
- # reset
- listGUID = []
- # collect LFNs
- retLFNs = {}
- for guid,lfn in files.iteritems():
- if guid in pfnMap.keys():
- retLFNs[lfn] = pfnMap[guid]
- # return
- return retLFNs
-
-
-
-####################################################################
-# main
-def main():
- import sys
- import getopt
- # option class
- class _options:
- def __init__(self):
- pass
- options = _options()
- del _options
- # set default values
- options.verbose = False
- options.lfns = []
- options.guids = []
- options.lfchost = ''
- options.storages = []
- options.infile = None
- options.outfile = None
- # get command-line parameters
- try:
- opts, args = getopt.getopt(sys.argv[1:],"s:i:g:vl:o:f:")
- except:
- print "ERROR : Invalid options"
- sys.exit(EC_Main)
- # set options
- for o, a in opts:
- if o in ("-v",):
- options.verbose = True
- if o in ("-s",):
- options.storages = a.split(',')
- if o in ("-i",):
- options.lfns = a.split(',')
- if o in ("-g",):
- options.guids = a.split(',')
- if o in ("-l",):
- options.lfchost = a
- if o in ("-f",):
- options.infile = a
- if o in ("-o",):
- options.outfile = a
- # read GUID/LFN
- files = {}
- if options.infile == None:
- for idx in range(len(options.guids)):
- guid = options.guids[idx]
- lfn = options.lfns[idx]
- if guid != 'NULL':
- files[guid] = lfn
- else:
- try:
- # read from file
- ifile = open(options.infile)
- for line in ifile:
- items = line.split()
- if len(items) == 2:
- guid = items[1]
- lfn = items[0]
- if guid != 'NULL':
- files[guid] = lfn
- # close and delete
- ifile.close()
- os.remove(options.infile)
- except:
- errType,errValue = sys.exc_info()[:2]
- print "ERROR: %s:%s" % (errType,errValue)
- sys.exit(1)
- # get files
- retFiles = _getFilesLFC(files,options.lfchost,options.storages,options.verbose)
- print "LFCRet : %s " % retFiles
- # return
- sys.exit(0)
-
-
-if __name__ == "__main__":
- main()
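-
-# Example invocation (illustrative; the storage host and GUID/LFN values are
-# placeholders, while the options match the getopt string above):
-#
-#   $ python LFCclient.py -v -l prod-lfc-atlas.cern.ch -s se01.example.org \
-#       -g 9c3f7e1a-1234-5678-9abc-123456789abc -i myfile.root
-#
-# On success it prints "LFCRet : {...}", mapping each LFN to the PFNs found on
-# the requested storage hosts.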
-
diff --git a/current/pandaserver/brokerage/PandaSiteIDs.py b/current/pandaserver/brokerage/PandaSiteIDs.py
deleted file mode 100644
index 5819cc4c0..000000000
--- a/current/pandaserver/brokerage/PandaSiteIDs.py
+++ /dev/null
@@ -1,198 +0,0 @@
-# !!!!!!! This file is OBSOLETE. Its content has been absorbed into pilotController.py in the autopilot repository.
-# !!!!!!! Questions to Torre Wenaus.
-PandaSiteIDs = {
- 'AGLT2' : {'nickname':'AGLT2-condor','status':'OK'},
- 'ALBERTA-LCG2' : {'nickname':'ALBERTA-LCG2-lcgce01-atlas-lcgpbs','status':'OK'},
- 'ANALY_AGLT2' : {'nickname':'ANALY_AGLT2-condor','status':'OK'},
- 'ANALY_ALBERTA' : {'nickname':'ALBERTA-LCG2-lcgce01-atlas-lcgpbs','status':'OK'},
- 'ANALY_BEIJING' : {'nickname':'BEIJING-LCG2-lcg002-atlas-lcgpbs','status':'OK'},
- 'ANALY_BNL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'},
- 'ANALY_BNL_ATLAS_1' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'},
- 'ANALY_BNL_ATLAS_2' : {'nickname':'BNL_ATLAS_2-condor','status':'OK'},
- #'ANALY_BNL_LOCAL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'},
- 'ANALY_BNL_test' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'},
- 'ANALY_BNL_test2' : {'nickname':'ANALY_BNL_ATLAS_1-condor','status':'OK'},
- 'ANALY_BNL_test3' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'},
- 'ANALY_BRUNEL' : {'nickname':'UKI-LT2-Brunel-dgc-grid-44-atlas-lcgpbs','status':'notOK'},
- 'ANALY_CERN' : {'nickname':'CERN-PROD-ce123-grid_atlas-lcglsf','status':'notOK'},
- 'ANALY_CNAF' : {'nickname':'INFN-CNAF-gridit-ce-001-lcg-lcgpbs','status':'notOK'},
- 'ANALY_CPPM' : {'nickname':'IN2P3-CPPM-marce01-atlas-pbs','status':'OK'},
- 'ANALY_FZK' : {'nickname':'FZK-LCG2-ce-5-fzk-atlasXS-pbspro','status':'OK'},
- 'ANALY_GLASGOW' : {'nickname':'UKI-SCOTGRID-GLASGOW-svr021-q3d-lcgpbs','status':'OK'},
- 'ANALY_GLOW-ATLAS' : {'nickname':'GLOW-ATLAS-condor','status':'OK'},
- 'ANALY_GRIF-IRFU' : {'nickname':'GRIF-IRFU-node07-atlas-lcgpbs','status':'OK'},
- 'ANALY_GRIF-LAL' : {'nickname':'GRIF-LAL-grid10-atlasana-pbs','status':'notOK'},
- 'ANALY_GRIF-LPNHE' : {'nickname':'GRIF-LPNHE-lpnce-atlas-pbs','status':'notOK'},
- 'ANALY_HU_ATLAS_Tier2' : {'nickname':'ANALY_HU_ATLAS_Tier2-lsf','status':'OK'},
- 'ANALY_LANCS' : {'nickname':'UKI-NORTHGRID-LANCS-HEP-fal-pygrid-18-atlas-lcgpbs','status':'notOK'},
- 'ANALY_LAPP' : {'nickname':'IN2P3-LAPP-lapp-ce01-atlas-pbs','status':'notOK'},
- 'ANALY_LIV' : {'nickname':'UKI-NORTHGRID-LIV-HEP-hepgrid2-atlas-lcgpbs','status':'notOK'},
- 'ANALY_LONG_BNL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'},
- 'ANALY_LONG_BNL_ATLAS' : {'nickname':'BNL_ATLAS_2-condor','status':'OK'},
- 'ANALY_LONG_BNL_LOCAL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'},
- 'ANALY_LONG_LYON' : {'nickname':'IN2P3-CC-T2-cclcgceli05-long-bqs','status':'OK'},
- 'ANALY_LPC' : {'nickname':'IN2P3-LPC-clrlcgce03-atlas-lcgpbs','status':'notOK'},
- 'ANALY_LPSC' : {'nickname':'IN2P3-LPSC-lpsc-ce-atlas-pbs','status':'OK'},
- 'ANALY_LYON' : {'nickname':'IN2P3-CC-T2-cclcgceli05-medium-bqs','status':'OK'},
- 'ANALY_MANC' : {'nickname':'UKI-NORTHGRID-MAN-HEP-ce01-atlas-lcgpbs','status':'OK'},
- 'ANALY_MCGILL' : {'nickname':'MCGILL-LCG2-atlas-ce-atlas-lcgpbs','status':'OK'},
- 'ANALY_MWT2' : {'nickname':'ANALY_MWT2-condor','status':'notOK'},
- 'ANALY_MWT2_SHORT' : {'nickname':'ANALY_MWT2_SHORT-pbs','status':'notOK'},
- 'ANALY_NET2' : {'nickname':'ANALY_NET2-pbs','status':'OK'},
- 'ANALY_OU_OCHEP_SWT2' : {'nickname':'ANALY_OU_OCHEP_SWT2-condor','status':'notOK'},
- 'ANALY_PIC' : {'nickname':'pic-ce07-gshort-lcgpbs','status':'OK'},
- 'ANALY_RAL' : {'nickname':'RAL-LCG2-lcgce01-atlasL-lcgpbs','status':'OK'},
- 'ANALY_ROMANIA02' : {'nickname':'RO-02-NIPNE-tbat01-atlas-lcgpbs','status':'notOK'},
- 'ANALY_ROMANIA07' : {'nickname':'RO-07-NIPNE-tbit01-atlas-lcgpbs','status':'notOK'},
- 'ANALY_SARA' : {'nickname':'SARA-MATRIX-mu6-short-pbs','status':'notOK'},
- 'ANALY_SFU' : {'nickname':'SFU-LCG2-snowpatch-hep-atlas-lcgpbs','status':'notOK'},
- 'ANALY_SHEF' : {'nickname':'UKI-NORTHGRID-SHEF-HEP-lcgce0-atlas-lcgpbs','status':'OK'},
- 'ANALY_SLAC' : {'nickname':'ANALY_SLAC-lsf','status':'OK'},
- 'ANALY_SWT2_CPB' : {'nickname':'ANALY_SWT2_CPB-pbs','status':'OK'},
- 'ANALY_TAIWAN' : {'nickname':'Taiwan-LCG2-w-ce01-atlas-lcgpbs','status':'OK'},
- 'ANALY_TEST' : {'nickname':'ANALY_TEST','status':'OK'},
- 'ANALY_TORONTO' : {'nickname':'TORONTO-LCG2-bigmac-lcg-ce2-atlas-pbs','status':'OK'},
- 'ANALY_TOKYO' : {'nickname':'TOKYO-LCG2-lcg-ce01-atlas-lcgpbs','status':'OK'},
- 'ANALY_TRIUMF' : {'nickname':'TRIUMF-LCG2-ce1-atlas-lcgpbs','status':'OK'},
- 'ANALY_UBC' : {'nickname':'UBC-pbs','status':'OK'},
- 'ANALY_UIUC-HEP' : {'nickname':'ANALY_UIUC-HEP-condor','status':'OK'},
- 'ANALY_UTA' : {'nickname':'UTA-DPCC-pbs','status':'OK'},
- 'ANALY_UTA-DPCC' : {'nickname':'UTA-DPCC-test-pbs','status':'notOK'},
- 'ANALY_VICTORIA' : {'nickname':'VICTORIA-LCG2-lcg-ce-general-lcgpbs','status':'OK'},
- 'AUVERGRID' : {'nickname':'AUVERGRID-iut15auvergridce01-atlas-lcgpbs','status':'notOK'},
- 'ASGC' : {'nickname':'Taiwan-LCG2-w-ce01-atlas-lcgpbs','status':'OK'},
- 'ASGC_REPRO' : {'nickname':'ASGC_REPRO','status':'notOK'},
- 'Australia-ATLAS' : {'nickname':'Australia-ATLAS-agh2-atlas-lcgpbs','status':'OK'},
- 'BARNETT_TEST' : {'nickname':'BARNETT_TEST','status':'notOK'},
- 'BEIJING' : {'nickname':'BEIJING-LCG2-lcg002-atlas-lcgpbs','status':'OK'},
- 'BNLPROD' : {'nickname':'BNL_ATLAS_1-condor','status':'notOK'},
- 'BNL_ATLAS_1' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'},
- 'BNL_ATLAS_2' : {'nickname':'BNL_ATLAS_2-condor','status':'OK'},
- 'BNL_ATLAS_DDM' : {'nickname':'BNL_DDM-condor','status':'notOK'},
- 'BNL_ATLAS_test' : {'nickname':'BNL_ATLAS_2-condor','status':'notOK'},
- 'BU_ATLAS_Tier2' : {'nickname':'BU_ATLAS_Tier2-pbs','status':'OK'},
- 'BU_ATLAS_Tier2o' : {'nickname':'BU_ATLAS_Tier2o-pbs','status':'OK'},
- 'BU_ATLAS_test' : {'nickname':'BU_ATLAS_Tier2-pbs','status':'notOK'},
- 'HU_ATLAS_Tier2' : {'nickname':'HU_ATLAS_Tier2-lsf','status':'OK'},
- 'CERN-BUILDS' : {'nickname':'CERN-BUILDS','status':'notOK'},
- 'CERN-RELEASE' : {'nickname':'CERN-RELEASE','status':'notOK'},
- 'CERN-UNVALID' : {'nickname':'CERN-UNVALID','status':'notOK'},
- 'CGG' : {'nickname':'CGG-LCG2-ce1-atlas-lcgpbs','status':'notOK'},
- 'CHARMM' : {'nickname':'CHARMM','status':'notOK'},
- 'CNR-ILC-PISA' : {'nickname':'CNR-ILC-PISA-gridce-atlas-lcgpbs','status':'notOK'},
- 'CPPM' : {'nickname':'IN2P3-CPPM-marce01-atlas-pbs','status':'OK'},
- 'CSCS-LCG2' : {'nickname':'CSCS-LCG2-ce01-egee48h-lcgpbs','status':'OK'},
- 'csTCDie' : {'nickname':'csTCDie-gridgate-himem-pbs','status':'OK'},
- 'CYF' : {'nickname':'CYFRONET-LCG2-ce-atlas-pbs','status':'OK'},
- 'DESY-HH' : {'nickname':'DESY-HH-grid-ce3-default-lcgpbs','status':'OK'},
- 'DESY-ZN' : {'nickname':'DESY-ZN-lcg-ce0-atlas-lcgpbs','status':'OK'},
- 'EFDA-JET' : {'nickname':'EFDA-JET-grid002-atlas-lcgpbs','status':'notOK'},
- 'FZK-LCG2' : {'nickname':'FZK-LCG2-ce-1-fzk-atlasXL-pbspro','status':'OK'},
- 'FZK_REPRO' : {'nickname':'FZK_REPRO','status':'notOK'},
- 'FZU' : {'nickname':'praguelcg2-golias25-lcgatlas-lcgpbs','status':'OK'},
- 'GLOW' : {'nickname':'GLOW-CMS-cmsgrid02-atlas-condor','status':'notOK'},
- 'GLOW-ATLAS' : {'nickname':'GLOW-ATLAS-condor','status':'OK'},
- 'GoeGrid' : {'nickname':'GoeGrid-ce-goegrid-atlas-lcgpbs','status':'OK'},
- 'GRIF-IRFU' : {'nickname':'GRIF-IRFU-node07-atlas-lcgpbs','status':'OK'},
- 'GRIF-LAL' : {'nickname':'GRIF-LAL-grid10-atlas-pbs','status':'OK'},
- 'GRIF-LPNHE' : {'nickname':'GRIF-LPNHE-lpnce-atlas-pbs','status':'OK'},
- 'HEPHY-UIBK' : {'nickname':'HEPHY-UIBK-hepx4-atlas-lcgpbs','status':'OK'},
- 'IFAE' : {'nickname':'ifae-ifaece01-ifae-lcgpbs','status':'OK'},
- 'IFIC' : {'nickname':'IFIC-LCG2-ce01-atlas-pbs','status':'OK'},
- 'IHEP' : {'nickname':'BEIJING-LCG2-lcg002-atlas-lcgpbs','status':'OK'},
- 'ITEP' : {'nickname':'ITEP-ceglite-atlas-lcgpbs','status':'OK'},
- 'IN2P3-LPSC' : {'nickname':'IN2P3-LPSC-lpsc-ce-atlas-pbs','status':'OK'},
- 'JINR-LCG2' : {'nickname':'JINR-LCG2-lcgce01-atlas-lcgpbs', 'status':'OK'},
- 'LAPP' : {'nickname':'IN2P3-LAPP-lapp-ce01-atlas-pbs','status':'OK'},
- 'LIP-COIMBRA' : {'nickname':'LIP-Coimbra-grid006-atlas-lcgpbs','status':'OK'},
- 'LIP-LISBON' : {'nickname':'LIP-Lisbon-ce02-atlasgrid-lcgsge','status':'OK'},
- 'LLR' : {'nickname':'GRIF-LLR-polgrid1-atlas-pbs','status':'notOK'},
- 'LPC' : {'nickname':'IN2P3-LPC-clrlcgce03-atlas-lcgpbs','status':'OK'},
- 'LRZ' : {'nickname':'LRZ-LMU-lcg-lrz-ce-atlas-sge','status':'OK'},
- 'LYON' : {'nickname':'IN2P3-CC-cclcgceli02-long-bqs','status':'OK'},
- 'LYON_REPRO' : {'nickname':'LYON_REPRO','status':'notOK'},
- 'Lyon-T2' : {'nickname':'IN2P3-CC-T2-cclcgceli05-long-bqs','status':'OK'},
- 'LTU_CCT' : {'nickname':'LTU_CCT-pbs','status':'OK'},
- 'MANC' : {'nickname':'UKI-NORTHGRID-MAN-HEP-ce02-atlas-lcgpbs','status':'OK'},
- 'MCGILL-LCG2' : {'nickname':'MCGILL-LCG2-atlas-ce-atlas-pbs','status':'OK'},
- 'MONTREAL' : {'nickname':'Umontreal-LCG2-lcg-ce-atlas-lcgpbs','status':'notOK'},
- 'MPP' : {'nickname':'MPPMU-grid-ce-long-sge','status':'OK'},
- 'MWT2_IU' : {'nickname':'MWT2_IU-pbs','status':'OK'},
- 'MWT2_UC' : {'nickname':'MWT2_UC-pbs','status':'OK'},
- 'NDGF' : {'nickname':'NDGF-condor','status':'OK'},
- 'NIKHEF-ELPROD' : {'nickname':'NIKHEF-ELPROD-gazon-atlas-pbs','status':'OK'},
- 'NIKHEF_REPRO' : {'nickname':'NIKHEF_REPRO','status':'notOK'},
- 'OUHEP_ITB' : {'nickname':'OUHEP_ITB-condor','status':'notOK'},
- 'OU_PAUL_TEST' : {'nickname':'OU_OCHEP_SWT2-condor','status':'notOK'},
- 'OU_OCHEP_SWT2' : {'nickname':'OU_OCHEP_SWT2-condor','status':'OK'},
- 'OU_OSCER_ATLAS' : {'nickname':'OU_OSCER_ATLAS-lsf','status':'OK'},
- 'OU_OSCER_ATLASdeb' : {'nickname':'OU_OSCER_ATLASdeb-lsf','status':'notOK'},
- 'PSNC' : {'nickname':'PSNC-ce-atlas-pbs','status':'OK'},
- 'PIC' : {'nickname':'pic-ce05-glong-lcgpbs','status':'OK'},
- 'PIC_REPRO' : {'nickname':'PIC_REPRO','status':'notOK'},
- 'prague_cesnet_lcg2' : {'nickname':'prague_cesnet_lcg2-skurut17-egee_atlas-lcgpbs','status':'notOK'},
- 'RAL' : {'nickname':'RAL-LCG2-lcgce02-grid1000M-lcgpbs','status':'OK'},
- 'RAL_REPRO' : {'nickname':'RAL_REPRO','status':'notOK'},
- 'ru-Moscow-SINP-LCG2' : {'nickname':'ru-Moscow-SINP-LCG2-lcg02-atlas-lcgpbs','status':'OK'},
- 'ru-PNPI' : {'nickname':'ru-PNPI-cluster-atlas-pbs','status':'OK'},
- 'RDIGTEST' : {'nickname':'RDIGTEST','status':'notOK'},
- 'ROMANIA02' : {'nickname':'RO-02-NIPNE-tbat01-atlas-lcgpbs','status':'OK'},
- 'ROMANIA07' : {'nickname':'RO-07-NIPNE-tbit01-atlas-lcgpbs','status':'OK'},
- 'RRC-KI' : {'nickname':'RRC-KI-gate-atlas-lcgpbs','status':'OK'},
- 'RU-Protvino-IHEP' : {'nickname':'RU-Protvino-IHEP-ce0003-atlas-lcgpbs','status':'OK'},
- 'SARA_REPRO' : {'nickname':'SARA_REPRO','status':'notOK'},
- 'SFU-LCG2' : {'nickname':'SFU-LCG2-snowpatch-atlas-lcgpbs','status':'OK'},
- 'SLACXRD' : {'nickname':'SLACXRD-lsf','status':'OK'},
- 'SLAC_PAUL_TEST' : {'nickname':'SLACXRD-lsf','status':'notOK'},
- 'SNS-PISA' : {'nickname':'SNS-PISA-gridce-atlas-lcgpbs','status':'notOK'},
- 'SPACI-CS-IA64' : {'nickname':'SPACI-CS-IA64-square-atlas-lsf','status':'notOK'},
- 'SWT2_CPB' : {'nickname':'SWT2_CPB-pbs','status':'OK'},
- 'Taiwan-IPAS-LCG2' : {'nickname':'Taiwan-IPAS-LCG2-atlasce-atlas-lcgcondor','status':'notOK'},
- 'TEST1' : {'nickname':'TEST1','status':'notOK'},
- 'TEST2' : {'nickname':'TEST2','status':'notOK'},
- 'TEST3' : {'nickname':'TEST3','status':'notOK'},
- 'TEST4' : {'nickname':'TEST4','status':'notOK'},
- 'TESTCHARMM' : {'nickname':'TESTCHARMM','status':'notOK'},
- 'TESTGLIDE' : {'nickname':'TESTGLIDE','status':'notOK'},
- 'TOKYO' : {'nickname':'TOKYO-LCG2-lcg-ce01-atlas-lcgpbs','status':'OK'},
- 'TORONTO-LCG2' : {'nickname':'TORONTO-LCG2-bigmac-lcg-ce2-atlas-pbs','status':'OK'},
- 'TPATHENA' : {'nickname':'TPATHENA','status':'notOK'},
- 'TPPROD' : {'nickname':'TPPROD','status':'notOK'},
- 'TRIUMF' : {'nickname':'TRIUMF-LCG2-ce1-atlas-lcgpbs','status':'OK'},
- 'TRIUMF_DDM' : {'nickname':'TRIUMF_DDM','status':'notOK'},
- 'TRIUMF_REPRO' : {'nickname':'TRIUMF_REPRO','status':'notOK'},
- 'TW-FTT' : {'nickname':'TW-FTT-f-ce01-atlas-lcgpbs','status':'OK'},
- 'TWTEST' : {'nickname':'TWTEST','status':'notOK'},
- 'TestPilot' : {'nickname':'TestPilot','status':'notOK'},
- 'UAM-LCG2' : {'nickname':'UAM-LCG2-grid003-atlas-lcgpbs','status':'OK'},
- 'UBC' : {'nickname':'UBC-pbs','status':'OK'},
- 'UBC_PAUL_TEST' : {'nickname':'UBC-pbs','status':'notOK'},
- 'UIUC-HEP' : {'nickname':'UIUC-HEP-condor','status':'OK'},
- 'UCITB_EDGE7' : {'nickname':'UCITB_EDGE7-pbs','status':'OK'},
- 'UC_ATLAS_MWT2' : {'nickname':'UC_ATLAS_MWT2-condor','status':'OK'},
- 'UC_ATLAS_test' : {'nickname':'UC_ATLAS_MWT2-condor','status':'OK'},
- 'UC_Teraport' : {'nickname':'UC_Teraport-pbs','status':'notOK'},
- 'UMESHTEST' : {'nickname':'UMESHTEST','status':'notOK'},
- 'UNI-FREIBURG' : {'nickname':'UNI-FREIBURG-ce-atlas-pbs','status':'OK'},
- 'UTA-DPCC' : {'nickname':'UTA-DPCC-pbs','status':'OK'},
- 'UTA-DPCC-test' : {'nickname':'UTA-DPCC-test-pbs','status':'OK'},
- 'UTA_PAUL_TEST' : {'nickname':'UTA-SWT2-pbs','status':'notOK'},
- 'UTA_SWT2' : {'nickname':'UTA-SWT2-pbs','status':'OK'},
- 'UTD-HEP' : {'nickname':'UTD-HEP-pbs','status':'OK'},
- 'VICTORIA-LCG2' : {'nickname':'VICTORIA-LCG2-lcg-ce-general-lcgpbs','status':'OK'},
- 'Wuppertal' : {'nickname':'wuppertalprod-grid-ce-dg_long-lcgpbs','status':'OK'},
-}
-
-
-# cloud-MoverID mapping
-PandaMoverIDs = {
- 'US' : 'BNL_ATLAS_DDM',
- 'CA' : 'TRIUMF_DDM',
- 'FR' : 'TRIUMF_DDM',
- 'IT' : 'TRIUMF_DDM',
- 'NL' : 'TRIUMF_DDM',
- 'DE' : 'TRIUMF_DDM',
- 'TW' : 'TRIUMF_DDM',
- 'UK' : 'TRIUMF_DDM',
- 'ES' : 'TRIUMF_DDM',
- }
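
For reference, both tables above are plain module-level dictionaries, so callers
simply index them (broker.py and SiteMapper.py below do exactly that). The
snippet that follows is a minimal illustrative sketch, not part of the original
module: the helper names are made up, only the two dictionaries are real.

from PandaSiteIDs import PandaSiteIDs, PandaMoverIDs

def getMoverID(cloud):
    # clouds without an explicit PandaMover entry fall back to the US mover
    return PandaMoverIDs.get(cloud, PandaMoverIDs['US'])

def getUsableSites():
    # keep only the site IDs whose static status is 'OK'
    return sorted(sid for sid, val in PandaSiteIDs.items() if val['status'] == 'OK')

print(getMoverID('FR'))        # -> 'TRIUMF_DDM'
print(len(getUsableSites()))   # -> number of sites currently marked 'OK'
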
diff --git a/current/pandaserver/brokerage/SiteMapper.py b/current/pandaserver/brokerage/SiteMapper.py
deleted file mode 100644
index a0ad2c0a6..000000000
--- a/current/pandaserver/brokerage/SiteMapper.py
+++ /dev/null
@@ -1,205 +0,0 @@
-import re
-import sys
-
-# logger
-from pandalogger.PandaLogger import PandaLogger
-_logger = PandaLogger().getLogger('SiteMapper')
-
-# PandaIDs
-from PandaSiteIDs import PandaSiteIDs
-
-# default site
-from taskbuffer.SiteSpec import SiteSpec
-defSite = SiteSpec()
-defSite.sitename = 'BNL_ATLAS_1'
-defSite.nickname = 'BNL_ATLAS_1-condor'
-defSite.dq2url = 'http://dms02.usatlas.bnl.gov:8000/dq2/'
-defSite.ddm = 'PANDA_UNDEFINED'
-defSite.type = 'production'
-defSite.gatekeeper = 'gridgk01.racf.bnl.gov'
-defSite.status = 'online'
-defSite.setokens = {}
-
-
-########################################################################
-
-class SiteMapper:
-
- # constructor
- def __init__(self,taskBuffer,verbose=False):
- _logger.debug('__init__ SiteMapper')
- try:
- # site list
- self.siteSpecList = {}
-
- # sites not belonging to a cloud
- self.defCloudSites = []
-
- # cloud specification
- self.cloudSpec = {}
-
- # create CloudSpec list
- tmpCloudListDB = taskBuffer.getCloudList()
- for tmpName,tmpCloudSpec in tmpCloudListDB.iteritems():
- self.cloudSpec[tmpName] = {}
-                # copy attributes from CloudSpec
- for tmpAttr in tmpCloudSpec._attributes:
- self.cloudSpec[tmpName][tmpAttr] = getattr(tmpCloudSpec,tmpAttr)
- # append additional attributes
- # source : Panda siteID for source
- # dest : Panda siteID for dest
- # sites : Panda siteIDs in the cloud
- self.cloudSpec[tmpName]['source'] = self.cloudSpec[tmpName]['tier1']
- self.cloudSpec[tmpName]['dest'] = self.cloudSpec[tmpName]['tier1']
- self.cloudSpec[tmpName]['sites'] = []
- _logger.debug('Cloud->%s %s' % (tmpName,str(self.cloudSpec[tmpName])))
- # get list of PandaIDs
- siteIDsList = taskBuffer.getSiteList()
- firstDefault = True
- # read full list from DB
- siteFullList = taskBuffer.getSiteInfo()
-            # read DB to produce parameters in siteinfo dynamically
- for tmpID,tmpNicknameList in siteIDsList.iteritems():
- for tmpNickname in tmpNicknameList:
- # invalid nickname
- if not siteFullList.has_key(tmpNickname):
- continue
- # get full spec
- ret = siteFullList[tmpNickname]
- # append
- if ret == None:
- _logger.error('Could not read site info for %s:%s' % (tmpID,tmpNickname))
- elif (firstDefault and tmpID == defSite.sitename) or (not self.siteSpecList.has_key(tmpID)) \
- or (self.siteSpecList.has_key(tmpID) and self.siteSpecList[tmpID].status in ['offline','']):
- # overwrite default or remove existing offline
- if firstDefault and tmpID == defSite.sitename:
- del self.siteSpecList[tmpID]
- firstDefault = False
- elif self.siteSpecList.has_key(tmpID) and self.siteSpecList[tmpID].status in ['offline','']:
- del self.siteSpecList[tmpID]
- # append
- if not self.siteSpecList.has_key(tmpID):
- # determine type following a convention
- tmpType = 'production'
- if tmpID.startswith('ANALY_'):
- tmpType = 'analysis'
- elif re.search('test',tmpID,re.I) or \
- (PandaSiteIDs.has_key(tmpID) and PandaSiteIDs[tmpID]['status']!='OK'):
- tmpType = 'test'
- # set type
- ret.sitename = tmpID
- ret.type = tmpType
- # don't use site for production when cloud is undefined
- if ret.type == 'production' and ret.cloud == '':
- _logger.error('Empty cloud for %s:%s' % (tmpID,tmpNickname))
- else:
- self.siteSpecList[tmpID] = ret
- else:
- # overwrite status
- if not ret.status in ['offline','']:
- if self.siteSpecList[tmpID].status != 'online':
- self.siteSpecList[tmpID].status = ret.status
- # use larger maxinputsize and memory
- try:
- if ret.status in ['online']:
- if self.siteSpecList[tmpID].maxinputsize < ret.maxinputsize or \
- ret.maxinputsize == 0:
- self.siteSpecList[tmpID].maxinputsize = ret.maxinputsize
- if (self.siteSpecList[tmpID].memory != 0 and self.siteSpecList[tmpID].memory < ret.memory) or \
- ret.memory == 0:
- self.siteSpecList[tmpID].memory = ret.memory
- except:
- errtype, errvalue = sys.exc_info()[:2]
-                                _logger.error("%s memory/inputsize failure : %s %s" % (tmpID,errtype,errvalue))
- # make cloudSpec
- for siteSpec in self.siteSpecList.values():
- # choose only prod sites
- if siteSpec.type != 'production':
- continue
- # append prod site in cloud
- for tmpCloud in siteSpec.cloudlist:
- if self.cloudSpec.has_key(tmpCloud):
- if not siteSpec.sitename in self.cloudSpec[tmpCloud]['sites']:
- # append
- self.cloudSpec[tmpCloud]['sites'].append(siteSpec.sitename)
- else:
- # append to the default cloud
- if not siteSpec.sitename in self.defCloudSites:
- # append
- self.defCloudSites.append(siteSpec.sitename)
- # set defCloudSites for backward compatibility
- if self.cloudSpec.has_key('US'):
- # use US sites
- self.defCloudSites = self.cloudSpec['US']['sites']
- else:
- # add def site as a protection if defCloudSites is empty
- self.defCloudSites.append(defSite.sitename)
- # dump sites
- if verbose:
- _logger.debug('========= dump =========')
- for tmpSite,tmpSiteSpec in self.siteSpecList.iteritems():
- _logger.debug('Site->%s' % str(tmpSiteSpec))
- # check
- for tmpCloud,tmpVals in self.cloudSpec.iteritems():
- # set T1
- try:
- tmpVals['sites'].remove(tmpVals['dest'])
- except:
- pass
- tmpVals['sites'].insert(0,tmpVals['dest'])
- # dump
- _logger.debug('Cloud:%s has %s' % (tmpCloud,tmpVals['sites']))
- for tmpSite in tmpVals['sites']:
- if not self.siteSpecList.has_key(tmpSite):
- _logger.debug(" '%s' doesn't exist" % tmpSite)
- continue
- tmpSiteSpec = self.siteSpecList[tmpSite]
- if tmpSiteSpec.status in ['offline']:
- _logger.debug(' %s:%s' % (tmpSite,tmpSiteSpec.status))
- _logger.debug('Cloud:XX has %s' % self.defCloudSites)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("__init__ SiteMapper : %s %s" % (type,value))
- _logger.debug('__init__ SiteMapper done')
-
-
- # accessor for site
- def getSite(self,site):
- if self.siteSpecList.has_key(site):
- return self.siteSpecList[site]
- else:
- # return default site
- return defSite
-
-
- # check if site exists
- def checkSite(self,site):
- return self.siteSpecList.has_key(site)
-
-
- # accessor for cloud
- def getCloud(self,cloud):
- if self.cloudSpec.has_key(cloud):
- return self.cloudSpec[cloud]
- else:
- # return sites in default cloud
- ret = { 'source' : 'default',
- 'dest' : 'default',
- 'sites' : self.defCloudSites,
- 'transtimelo' : 2,
- 'transtimehi' : 1,
- }
- return ret
-
-
- # accessor for cloud
- def checkCloud(self,cloud):
- if self.cloudSpec.has_key(cloud):
- return True
- else:
- return False
-
-
- # accessor for cloud list
- def getCloudList(self):
- return self.cloudSpec.keys()
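
In short, SiteMapper is a read-only snapshot: everything is loaded from the task
buffer once in the constructor, and the get/check accessors above never query the
database again, falling back to the hard-coded BNL default spec when a site or
cloud is unknown. A rough usage sketch follows (the stub task buffer is purely
illustrative and only runs where the pandaserver modules are importable; the real
object is provided by the taskbuffer layer):

class StubTaskBuffer:
    # empty answers are enough to exercise the fallback behaviour
    def getCloudList(self):
        return {}
    def getSiteList(self):
        return {}
    def getSiteInfo(self):
        return {}

mapper = SiteMapper(StubTaskBuffer())
print(mapper.checkSite('BNL_ATLAS_1'))       # False: nothing was loaded
print(mapper.getSite('BNL_ATLAS_1').status)  # 'online': the default SiteSpec
print(mapper.getCloud('XX')['sites'])        # ['BNL_ATLAS_1']: default-cloud fallback
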
diff --git a/current/pandaserver/brokerage/VomsResolver.py b/current/pandaserver/brokerage/VomsResolver.py
deleted file mode 100644
index 7bc432002..000000000
--- a/current/pandaserver/brokerage/VomsResolver.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import re
-import sys
-
-# logger
-from pandalogger.PandaLogger import PandaLogger
-_logger = PandaLogger().getLogger('VomsResolver')
-
-
-########################################################################
-
-class VomsResolver:
-
- # constructor
- def __init__(self):
- self.vomsUserMap = {}
- try:
- # read grid-mapfile
- mapFile = open('/home/sm/grid-mapfile')
- vo = None
- for line in mapFile:
- if line.startswith("#----"):
- # get vo name
- vo = line.split()[-2]
- _logger.debug('get VO:%s' % vo)
- self.vomsUserMap[vo] = []
- else:
- # get DN
- match = re.search('^"([^"]+)"',line)
- if match != None:
- # append
-                        self.vomsUserMap[vo].append(match.group(1))
- # close grid-mapfile
- mapFile.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("init : %s %s" % (type,value))
-
-
- # check the user is on VO
- def checkUser(self,voms,dn):
- _logger.debug('checkUser VO:%s DN:%s' % (voms,dn))
- if not self.vomsUserMap.has_key(voms):
- _logger.debug(' NG - VO:%s is unsupported' % voms)
- return False
- # look for DN
- for tmpDN in self.vomsUserMap[voms]:
- if dn.startswith(tmpDN):
-                _logger.debug('    OK DN:%s' % dn)
- return True
- _logger.debug(' NG - DN:%s is not found' % dn)
- return False
-
-
- # check voms is supported
- def checkVoms(self,voms):
- return self.vomsUserMap.has_key(voms)
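
The resolver is driven entirely by the grid-mapfile it parses in the constructor:
a '#----' header line per VO (the VO name is taken as the second-to-last token of
that line) followed by one double-quoted DN per line. The fragment below is
illustrative only; the DNs and VO names are invented:

#---- members of vo: atlas ----
"/DC=ch/DC=cern/OU=Users/CN=someuser" .atlas
"/DC=org/DC=doegrids/OU=People/CN=other user" .atlas
#---- members of vo: cms ----
"/DC=ch/DC=cern/OU=Users/CN=cmsuser" .cms

With such a file in place, a caller would typically do:

resolver = VomsResolver()
if resolver.checkVoms('atlas'):
    print(resolver.checkUser('atlas', '/DC=ch/DC=cern/OU=Users/CN=someuser/CN=proxy'))  # True
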
diff --git a/current/pandaserver/brokerage/__init__.py b/current/pandaserver/brokerage/__init__.py
deleted file mode 100755
index e69de29bb..000000000
diff --git a/current/pandaserver/brokerage/broker.py b/current/pandaserver/brokerage/broker.py
deleted file mode 100755
index 0cfc98dee..000000000
--- a/current/pandaserver/brokerage/broker.py
+++ /dev/null
@@ -1,1684 +0,0 @@
-import re
-import sys
-import time
-import types
-import fcntl
-import random
-import datetime
-import commands
-import ErrorCode
-import broker_util
-import PandaSiteIDs
-from taskbuffer import ProcessGroups
-from dataservice import DataServiceUtils
-from config import panda_config
-
-from pandalogger.PandaLogger import PandaLogger
-_log = PandaLogger().getLogger('broker')
-
-# all known sites
-_allSites = PandaSiteIDs.PandaSiteIDs.keys()
-
-# sites for prestaging
-#prestageSites = ['BNL_ATLAS_test','BNL_ATLAS_1','BNL_ATLAS_2']
-
-# non LRC checking
-_disableLRCcheck = []
-
-# lock for uuidgen
-_lockGetUU = open(panda_config.lockfile_getUU, 'w')
-
-# short-long mapping
-shortLongMap = {'ANALY_BNL_ATLAS_1':'ANALY_LONG_BNL_ATLAS',
- 'ANALY_LYON-T2' :'ANALY_LONG_LYON-T2',
- 'ANALY_LYON_DCACHE':'ANALY_LONG_LYON_DCACHE',
- 'ANALY_BNL_SHORT' :'ANALY_BNL_LONG',
- }
-
-# processingType to skip brokerage
-skipBrokerageProTypes = ['prod_test']
-
-# comparison function for sort
-def _compFunc(jobA,jobB):
- # append site if not in list
- if not jobA.computingSite in _allSites:
- _allSites.append(jobA.computingSite)
- if not jobB.computingSite in _allSites:
- _allSites.append(jobB.computingSite)
- # compare
- indexA = _allSites.index(jobA.computingSite)
- indexB = _allSites.index(jobB.computingSite)
- if indexA > indexB:
- return 1
- elif indexA < indexB:
- return -1
- else:
- return 0
-
-
-# release checker
-def _checkRelease(jobRels,siteRels):
- # all on/off
- if "True" in siteRels:
- return True
- if "False" in siteRels:
- return False
- # loop over all releases
- for tmpRel in jobRels.split('\n'):
- relVer = re.sub('^Atlas-','',tmpRel)
- # not available releases
- if not relVer in siteRels:
- return False
- return True
-
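# Illustrative check of the matching rules in _checkRelease above (the release
# numbers are made up, not taken from any site configuration):
#   _checkRelease('Atlas-17.2.7', ['True'])                           -> True  (site accepts anything)
#   _checkRelease('Atlas-17.2.7', ['False'])                          -> False (site rejects everything)
#   _checkRelease('Atlas-17.2.7\nAtlas-17.2.8', ['17.2.7'])           -> False (17.2.8 not published)
#   _checkRelease('Atlas-17.2.7\nAtlas-17.2.8', ['17.2.7','17.2.8'])  -> True  (all releases available)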
-
-# get list of files which already exist at the site
-def _getOkFiles(v_ce,v_files,v_guids,allLFNs,allGUIDs,allOkFilesMap,tmpLog=None):
- # DQ2 URL
- dq2URL = v_ce.dq2url
- dq2IDs = v_ce.setokens.values()
- try:
- dq2IDs.remove('')
- except:
- pass
- dq2IDs.sort()
- if dq2IDs == []:
- dq2ID = v_ce.ddm
- else:
- dq2ID = ''
- for tmpID in dq2IDs:
- dq2ID += '%s,' % tmpID
- dq2ID = dq2ID[:-1]
- # set LFC and SE name
- tmpSE = []
- if not v_ce.lfchost in [None,'']:
- dq2URL = 'lfc://'+v_ce.lfchost+':/grid/atlas/'
- tmpSE = broker_util.getSEfromSched(v_ce.se)
- if tmpLog != None:
- tmpLog.debug('getOkFiles for %s with dq2ID:%s,LFC:%s,SE:%s' % (v_ce.sitename,dq2ID,dq2URL,str(tmpSE)))
- # use bulk lookup
- if allLFNs != []:
- # get bulk lookup data
- if not allOkFilesMap.has_key(dq2ID):
- # get files from LRC
- allOkFilesMap[dq2ID] = broker_util.getFilesFromLRC(allLFNs,dq2URL,guids=allGUIDs,
- storageName=tmpSE,getPFN=True)
- # make return map
- retMap = {}
- for tmpLFN in v_files:
- if allOkFilesMap[dq2ID].has_key(tmpLFN):
- retMap[tmpLFN] = allOkFilesMap[dq2ID][tmpLFN]
- # return
- return retMap
- else:
- # old style
- return broker_util.getFilesFromLRC(v_files,dq2URL,guids=v_guids,
- storageName=tmpSE,getPFN=True)
-
-
-# check reprocessing or not
-def _isReproJob(tmpJob):
- if tmpJob != None:
- if tmpJob.processingType in ['reprocessing']:
- return True
- if tmpJob.transformation in ['csc_cosmics_trf.py','csc_BSreco_trf.py','BStoESDAODDPD_trf.py']:
- return True
- return False
-
-
-# set 'ready' if files are already there
-def _setReadyToFiles(tmpJob,okFiles,siteMapper,tmpLog):
- allOK = True
- tmpSiteSpec = siteMapper.getSite(tmpJob.computingSite)
- tmpSrcSpec = siteMapper.getSite(siteMapper.getCloud(tmpJob.cloud)['source'])
- # direct usage of remote SE
- if tmpSiteSpec.ddm != tmpSrcSpec.ddm and tmpSrcSpec.ddm in tmpSiteSpec.setokens.values():
- tmpSiteSpec = tmpSrcSpec
- tmpLog.debug('%s uses remote SiteSpec of %s for %s' % (tmpJob.PandaID,tmpSrcSpec.sitename,tmpJob.computingSite))
- tmpLog.debug('%s %s' % (tmpJob.PandaID,str(tmpSiteSpec.seprodpath)))
- prestageSites = getPrestageSites(siteMapper)
- for tmpFile in tmpJob.Files:
- if tmpFile.type == 'input':
- if DataServiceUtils.isCachedFile(tmpFile.dataset,tmpSiteSpec):
- # cached file
- tmpFile.status = 'cached'
- tmpFile.dispatchDBlock = 'NULL'
- elif (tmpJob.computingSite.endswith('_REPRO') or tmpJob.computingSite == siteMapper.getCloud(tmpJob.cloud)['source'] \
- or tmpSiteSpec.ddm == tmpSrcSpec.ddm) \
- and (not tmpJob.computingSite in prestageSites):
- # EGEE T1. use DQ2 prestage only for on-tape files
- if tmpSiteSpec.seprodpath.has_key('ATLASDATATAPE') and tmpSiteSpec.seprodpath.has_key('ATLASMCTAPE') and \
- okFiles.has_key(tmpFile.lfn):
- tapeOnly = True
- tapeCopy = False
- for okPFN in okFiles[tmpFile.lfn]:
- if re.search(tmpSiteSpec.seprodpath['ATLASDATATAPE'],okPFN) == None and \
- re.search(tmpSiteSpec.seprodpath['ATLASMCTAPE'],okPFN) == None:
- # there is a disk copy
- if tmpJob.cloud == 'US':
- # check for BNLPANDA
- if (tmpSiteSpec.seprodpath.has_key('ATLASMCDISK') and \
- re.search(tmpSiteSpec.seprodpath['ATLASMCDISK'],okPFN) != None) or \
- (tmpSiteSpec.seprodpath.has_key('ATLASDATADISK') and
- re.search(tmpSiteSpec.seprodpath['ATLASDATADISK'],okPFN) != None):
- tapeOnly = False
- else:
- tapeOnly = False
- else:
- # there is a tape copy
- tapeCopy = True
- # trigger prestage when disk copy doesn't exist or token is TAPE
- if tapeOnly or (tapeCopy and tmpFile.dispatchDBlockToken in ['ATLASDATATAPE','ATLASMCTAPE']):
- allOK = False
- else:
- # set ready
- tmpFile.status = 'ready'
- tmpFile.dispatchDBlock = 'NULL'
- else:
- # set ready anyway even if LFC is down. i.e. okFiles doesn't contain the file
- tmpFile.status = 'ready'
- tmpFile.dispatchDBlock = 'NULL'
- elif (((tmpFile.lfn in okFiles) or (tmpJob.computingSite == tmpJob.destinationSE)) \
- and (not tmpJob.computingSite in prestageSites or \
- (tmpJob.computingSite in prestageSites and not tmpJob.cloud in ['US']))) \
- or tmpFile.status == 'missing':
- # don't use TAPE replicas when T1 is used as T2
- if okFiles.has_key(tmpFile.lfn) and \
- tmpSiteSpec.seprodpath.has_key('ATLASDATATAPE') and len(okFiles[tmpFile.lfn]) == 1 and \
- re.search(tmpSiteSpec.seprodpath['ATLASDATATAPE'],okFiles[tmpFile.lfn][0]) != None:
- allOK = False
- else:
- # set ready if the file exists and the site doesn't use prestage
- tmpFile.status = 'ready'
- tmpFile.dispatchDBlock = 'NULL'
- else:
- # prestage with PandaMover
- allOK = False
- # unset disp dataset
- if allOK:
- tmpJob.dispatchDBlock = 'NULL'
-
-
-
-# check number/size of inputs
-def _isTooManyInput(nFilesPerJob,inputSizePerJob):
- # the number of inputs is larger than 5 or
- # size of inputs is larger than 500MB
- if nFilesPerJob > 5 or inputSizePerJob > 500*1024*1024:
- return True
- return False
-
-
-# send analysis brokerage info
-def sendAnalyBrokeageInfo(results,prevRelease,diskThreshold,chosenSite,prevCmtConfig,
- siteReliability):
- # send log messages
- messageList = []
- for resultType,resultList in results.iteritems():
- for resultItem in resultList:
- if resultType == 'rel':
- if prevCmtConfig in ['','NULL',None]:
- msgBody = 'action=skip site=%s reason=missingapp - app=%s is missing' % (resultItem,prevRelease)
- else:
- msgBody = 'action=skip site=%s reason=missingapp - app=%s/%s is missing' % (resultItem,prevRelease,prevCmtConfig)
- elif resultType == 'pilot':
- msgBody = 'action=skip site=%s reason=nopilot - no pilots for last 3 hours' % resultItem
- elif resultType == 'disk':
- msgBody = 'action=skip site=%s reason=diskshortage - disk shortage < %sGB' % (resultItem,diskThreshold)
- elif resultType == 'memory':
- msgBody = 'action=skip site=%s reason=ramshortage - RAM shortage' % resultItem
- elif resultType == 'maxtime':
- msgBody = 'action=skip site=%s reason=maxtime - shorter walltime limit' % resultItem
- elif resultType == 'status':
- msgBody = 'action=skip site=%s reason=sitestatus - not online' % resultItem
- elif resultType == 'reliability':
-                msgBody = 'action=skip site=%s reason=reliability - insufficient>%s' % (resultItem,siteReliability)
- elif resultType == 'weight':
- tmpSite,tmpWeight = resultItem
- if tmpSite == chosenSite:
- msgBody = 'action=choose site=%s reason=maxweight - max weight=%s' % (tmpSite,tmpWeight)
- else:
- msgBody = 'action=skip site=%s reason=notmaxweight - weight=%s' % (tmpSite,tmpWeight)
- elif resultType == 'prefcountry':
- tmpSite,tmpCountry = resultItem
- if tmpSite == chosenSite:
- msgBody = 'action=prefer country=%s reason=countrygroup - preferential brokerage for beyond-pledge' % tmpCountry
- else:
- continue
- else:
- continue
- messageList.append(msgBody)
- # return
- return messageList
-
-
-# send analysis brokerage info to logger
-def sendMsgToLogger(message):
- _log.debug(message)
-
-
-# send analysis brokerage info to logger with HTTP
-def sendMsgToLoggerHTTP(msgList,job):
- try:
- # logging
- iMsg = 0
- # message type
- msgType = 'analy_brokerage'
- # make header
- if not job.jobsetID in [None,'NULL']:
- msgHead = "dn='%s' : jobset=%s jobdef=%s" % (job.prodUserName,job.jobsetID,job.jobDefinitionID)
- else:
- msgHead = "dn='%s' : jobdef=%s" % (job.prodUserName,job.jobDefinitionID)
- for msgBody in msgList:
- # make message
- message = msgHead + ' : ' + msgBody
- # dump locally
- _log.debug(message)
- # get logger
- _pandaLogger = PandaLogger()
- _pandaLogger.lock()
- _pandaLogger.setParams({'Type':msgType})
- logger = _pandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- logger.info(message)
- # release HTTP handler
- _pandaLogger.release()
- # sleep
- iMsg += 1
- if iMsg % 5 == 0:
- time.sleep(1)
- except:
- errType,errValue = sys.exc_info()[:2]
- _log.error("sendMsgToLoggerHTTP : %s %s" % (errType,errValue))
-
-
-# get T2 candidates when files are missing at T2
-def getT2CandList(tmpJob,siteMapper,t2FilesMap):
- if tmpJob == None:
- return []
- # no cloud info
- if not t2FilesMap.has_key(tmpJob.cloud):
- return []
- # loop over all files
- tmpCandT2s = None
- for tmpFile in tmpJob.Files:
- if tmpFile.type == 'input' and tmpFile.status == 'missing':
- # no dataset info
- if not t2FilesMap[tmpJob.cloud].has_key(tmpFile.dataset):
- return []
- # initial candidates
- if tmpCandT2s == None:
- tmpCandT2s = t2FilesMap[tmpJob.cloud][tmpFile.dataset]['sites']
- # check all candidates
- newCandT2s = []
- for tmpCandT2 in tmpCandT2s:
- # site doesn't have the dataset
- if not t2FilesMap[tmpJob.cloud][tmpFile.dataset]['sites'].has_key(tmpCandT2):
- continue
- # site has the file
- if tmpFile.lfn in t2FilesMap[tmpJob.cloud][tmpFile.dataset]['sites'][tmpCandT2]:
- if not tmpCandT2 in newCandT2s:
- newCandT2s.append(tmpCandT2)
- # set new candidates
- tmpCandT2s = newCandT2s
- if tmpCandT2s == []:
- break
- # return [] if no missing files
- if tmpCandT2s == None:
- return []
- # return
- tmpCandT2s.sort()
- return tmpCandT2s
-
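# For reference, getT2CandList above expects t2FilesMap to be nested as
# cloud -> dataset -> 'sites' -> siteID -> [LFNs held at that site], e.g.
# (values are illustrative only):
#   {'FR': {'data12.some.dataset': {'sites': {'ANALY_LYON': ['f1.root', 'f2.root'],
#                                             'ANALY_LPC' : ['f1.root']}}}}
# A T2 stays in the candidate list only if it holds every missing input file.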
-
-# get hospital queues
-def getHospitalQueues(siteMapper):
- retMap = {}
- # hospital words
- goodWordList = ['CORE$','VL$','MEM$','MP\d+$','LONG$']
- # loop over all clouds
- for tmpCloudName in siteMapper.getCloudList():
- # get cloud
- tmpCloudSpec = siteMapper.getCloud(tmpCloudName)
- # get T1
- tmpT1Name = tmpCloudSpec['source']
- tmpT1Spec = siteMapper.getSite(tmpT1Name)
- # skip if DDM is undefined
- if tmpT1Spec.ddm == []:
- continue
- # loop over all sites
- for tmpSiteName in tmpCloudSpec['sites']:
- # skip T1 defined in cloudconfig
- if tmpSiteName == tmpT1Name:
- continue
- # check hospital words
- checkHospWord = False
- for tmpGoodWord in goodWordList:
- if re.search(tmpGoodWord,tmpSiteName) != None:
- checkHospWord = True
- break
- if not checkHospWord:
- continue
- # check site
- if not siteMapper.checkSite(tmpSiteName):
- continue
- tmpSiteSpec = siteMapper.getSite(tmpSiteName)
- # check DDM
- if tmpT1Spec.ddm == tmpSiteSpec.ddm:
- # append
- if not retMap.has_key(tmpCloudName):
- retMap[tmpCloudName] = []
- if not tmpSiteName in retMap[tmpCloudName]:
- retMap[tmpCloudName].append(tmpSiteName)
- _log.debug('hospital queues : %s' % str(retMap))
- # return
- return retMap
-
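# The hospital patterns above are applied with re.search, so a site in the T1's
# cloud qualifies when its name ends in CORE, VL, MEM, LONG or MP<n> and it
# shares the T1's DDM endpoint. Quick illustration with invented site names:
#   import re
#   goodWordList = ['CORE$','VL$','MEM$','MP\d+$','LONG$']
#   for name in ['BNL_ATLAS_LONG', 'FZK-LCG2_MP8', 'TRIUMF-LCG2']:
#       print('%s -> %s' % (name, any(re.search(w, name) is not None for w in goodWordList)))
#   # -> True, True, False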
-
-# get prestage sites
-def getPrestageSites(siteMapper):
- retList = []
- # get cloud
- tmpCloudSpec = siteMapper.getCloud('US')
- # get T1
- tmpT1Name = tmpCloudSpec['source']
- tmpT1Spec = siteMapper.getSite(tmpT1Name)
- # loop over all sites
- for tmpSiteName in tmpCloudSpec['sites']:
- # check site
- if not siteMapper.checkSite(tmpSiteName):
- continue
- # get spec
- tmpSiteSpec = siteMapper.getSite(tmpSiteName)
- # add if DDM is the same as T1
- if tmpT1Spec.ddm == tmpSiteSpec.ddm and not tmpSiteName in retList:
- retList.append(tmpSiteName)
- _log.debug('US prestage sites : %s' % str(retList))
- # return
- return retList
-
-
-# make a compact diagnostic message
-def makeCompactDiagMessage(header,results):
- # limit
- maxSiteList = 5
- # types for compact format
- compactTypeList = ['status','cpucore']
- # message mapping
- messageMap = {'rel' : 'missing rel/cache',
- 'pilot' : 'no pilot',
- 'status' : 'not online',
- 'disk' : 'SE full',
- 'memory' : 'RAM shortage',
- 'transferring' : 'many transferring',
- 'share' : 'zero share',
- 'maxtime' : 'short walltime',
- 'cpucore' : 'CPU core mismatch',
- 'scratch' : 'small scratch disk'
- }
- # put header
- if header in ['',None]:
- retStr = 'No candidate - '
- else:
- retStr = 'special brokerage for %s - ' % header
- # count number of sites per type
- numTypeMap = {}
- for resultType,resultList in results.iteritems():
- # ignore empty
- if len(resultList) == 0:
- continue
- # add
- nSites = len(resultList)
- if not numTypeMap.has_key(nSites):
- numTypeMap[nSites] = []
- numTypeMap[nSites].append(resultType)
- # sort
- numTypeKeys = numTypeMap.keys()
- numTypeKeys.sort()
- # use compact format for largest one
- largeTypes = None
- if len(numTypeKeys) > 0:
- largeTypes = numTypeMap[numTypeKeys[-1]]
- # loop over all types
- for numTypeKey in numTypeKeys:
- for resultType in numTypeMap[numTypeKey]:
- # label
- if messageMap.has_key(resultType):
- retStr += '%s at ' % messageMap[resultType]
- else:
-                retStr += '%s at ' % resultType
-            # use compact format or not
- if (resultType in compactTypeList+largeTypes \
- or len(results[resultType]) >= maxSiteList) \
- and header in ['',None,'reprocessing'] :
- if len(results[resultType]) == 1:
- retStr += '%s site' % len(results[resultType])
- else:
- retStr += '%s sites' % len(results[resultType])
- else:
- for tmpSite in results[resultType]:
- retStr += '%s,' % tmpSite
- retStr = retStr[:-1]
- retStr += '. '
- retStr = retStr[:-2]
- # return
- return retStr
-
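# Rough sketch of the summary format produced above (the input values are made up):
#   makeCompactDiagMessage(None, {'rel'    : ['SITE_A', 'SITE_B'],
#                                 'status' : ['SITE_C'],
#                                 'pilot'  : []})
# is expected to yield something like
#   'No candidate - not online at 1 site. missing rel/cache at 2 sites'
# i.e. empty categories are dropped and the largest categories are compacted.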
-
-# message class
-class MsgWrapper:
- def __init__(self):
- self.timestamp = datetime.datetime.utcnow().isoformat('/')
-
- def info(self,msg):
- _log.info(self.timestamp + ' ' + msg)
-
- def debug(self,msg):
- _log.debug(self.timestamp + ' ' + msg)
-
- def error(self,msg):
- _log.error(self.timestamp + ' ' + msg)
-
- def warning(self,msg):
- _log.warning(self.timestamp + ' ' + msg)
-
-
-
-# schedule
-def schedule(jobs,taskBuffer,siteMapper,forAnalysis=False,setScanSiteList=[],trustIS=False,
- distinguishedName=None,specialWeight={},getWeight=False,sizeMapForCheck={},
- datasetSize=0,replicaMap={},pd2pT1=False,reportLog=False,minPriority=None,
- t2FilesMap={},preferredCountries=[],siteReliability=None):
- # make a message instance
- tmpLog = MsgWrapper()
- try:
- tmpLog.debug('start %s %s %s %s minPrio=%s pref=%s siteRel=%s' % (forAnalysis,str(setScanSiteList),trustIS,
- distinguishedName,minPriority,
- str(preferredCountries),
- siteReliability))
- if specialWeight != {}:
- tmpLog.debug('PD2P weight : %s' % str(specialWeight))
- tmpLog.debug('replicaMap : %s' % str(replicaMap))
- # no jobs
- if len(jobs) == 0:
- tmpLog.debug('finished : no jobs')
- return
- allOkFilesMap = {}
- # use ANALY_CERN_XROOTD and not ANALY_CERN for EOS migration
- if forAnalysis:
- if 'ANALY_CERN_XROOTD' in setScanSiteList and 'ANALY_CERN' in setScanSiteList:
- setScanSiteList.remove('ANALY_CERN')
- tmpLog.debug('remove ANALY_CERN since ANALY_CERN_XROOTD is also a candidate')
- nJob = 20
- iJob = 0
- nFile = 20
- fileList = []
- guidList = []
- okFiles = {}
- totalNumInputs = 0
- totalInputSize = 0
- chosen_ce = None
- prodDBlock = None
- computingSite = None
- dispatchDBlock = None
- previousCloud = None
- prevRelease = None
- prevMemory = None
- prevCmtConfig = None
- prevProType = None
- prevSourceLabel= None
- prevDiskCount = None
- prevHomePkg = None
- prevDirectAcc = None
- prevCoreCount = None
- prevBrokergageSiteList = None
- prevManualPreset = None
- prevGoToT2Flag = None
- prevWorkingGroup = None
- prevMaxCpuCount = None
- prevBrokerageNote = None
- prevPriority = None
-
- nWNmap = {}
- indexJob = 0
- vomsOK = None
-
- diskThreshold = 200
- diskThresholdPD2P = 1024 * 3
- manyInputsThr = 20
- weightUsedByBrokerage = {}
-
- prestageSites = getPrestageSites(siteMapper)
-
- # get statistics
- faresharePolicy = {}
- newJobStatWithPrio = {}
- jobStatBrokerCloudsWithPrio = {}
- if len(jobs) > 0 and (jobs[0].processingType.startswith('gangarobot') or \
- jobs[0].processingType.startswith('hammercloud') or \
- jobs[0].processingType in ['pandamover','usermerge']):
- # disable redundant counting for HC
- jobStatistics = {}
- jobStatBroker = {}
- jobStatBrokerClouds = {}
- nRunningMap = {}
- hospitalQueueMap = {}
- else:
- jobStatistics = taskBuffer.getJobStatistics(forAnal=forAnalysis)
- if not forAnalysis:
- jobStatBroker = {}
- jobStatBrokerClouds = taskBuffer.getJobStatisticsBrokerage()
- faresharePolicy = taskBuffer.getFaresharePolicy()
- else:
- if minPriority == None:
- jobStatBroker = taskBuffer.getJobStatisticsAnalBrokerage()
- else:
- jobStatBroker = taskBuffer.getJobStatisticsAnalBrokerage(minPriority=minPriority)
- nRunningMap = taskBuffer.getnRunningInSiteData()
- hospitalQueueMap = getHospitalQueues(siteMapper)
- # sort jobs by siteID. Some jobs may already define computingSite
- jobs.sort(_compFunc)
- # brokerage for analysis
- candidateForAnal = True
- relCloudMap = {}
- loggerMessages = []
- # get all input files for bulk LFC lookup
- allLFNs = []
- allGUIDs = []
- for tmpJob in jobs:
- if tmpJob.prodSourceLabel in ('test','managed'):
- for tmpFile in tmpJob.Files:
- if tmpFile.type == 'input' and not tmpFile.lfn in allLFNs:
- allLFNs.append(tmpFile.lfn)
- allGUIDs.append(tmpFile.GUID)
- # loop over all jobs + terminator(None)
- for job in jobs+[None]:
- indexJob += 1
- # ignore failed jobs
- if job == None:
- pass
- elif job.jobStatus == 'failed':
- continue
- # list of sites for special brokerage
- specialBrokergageSiteList = []
- # note for brokerage
- brokerageNote = ''
- # send jobs to T2 when files are missing at T1
- goToT2Flag = False
- if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \
- and specialBrokergageSiteList == []:
- currentT2CandList = getT2CandList(job,siteMapper,t2FilesMap)
- if currentT2CandList != []:
- goToT2Flag = True
- specialBrokergageSiteList = currentT2CandList
- tmpLog.debug('PandaID:%s -> set SiteList=%s to use T2 for missing files at T1' % (job.PandaID,specialBrokergageSiteList))
- brokerageNote = 'useT2'
- # hack for split T1
- if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \
- and job.cloud == 'NL' and specialBrokergageSiteList == []:
- # loop over all input datasets
- tmpCheckedDS = []
- useSplitT1 = None
- for tmpFile in job.Files:
- if tmpFile.type == 'input' and (not tmpFile.dataset.startswith('ddo')) \
- and (not tmpFile.dataset in tmpCheckedDS):
- # init
- if useSplitT1 == None:
- useSplitT1 = True
- # no replica map
- if not replicaMap.has_key(tmpFile.dataset):
- # not set
- useSplitT1 = False
- break
- # check if input datasets are available only at NIKHEF
- tmpRepMap = replicaMap[tmpFile.dataset]
- splitT1HasDS = False
- for tmpSplitT1Key in tmpRepMap.keys():
- if tmpSplitT1Key.startswith('NIKHEF-ELPROD'):
- splitT1HasDS = True
- break
- if splitT1HasDS \
- and not tmpRepMap.has_key('SARA-MATRIX_MCDISK') \
- and not tmpRepMap.has_key('SARA-MATRIX_DATADISK') \
- and not tmpRepMap.has_key('SARA-MATRIX_MCTAPE') \
- and not tmpRepMap.has_key('SARA-MATRIX_DATATAPE'):
- pass
- else:
- # not set
- useSplitT1 = False
- break
- # set
- if useSplitT1 == True:
- specialBrokergageSiteList = ['NIKHEF-ELPROD']
- tmpLog.debug('PandaID:%s -> set SiteList=%s for split T1' % (job.PandaID,specialBrokergageSiteList))
- brokerageNote = 'useSplitNLT1'
- # set computingSite to T1 for high priority jobs
- if job != None and job.currentPriority >= 950 and job.computingSite == 'NULL' \
- and job.prodSourceLabel in ('test','managed') and specialBrokergageSiteList == []:
- specialBrokergageSiteList = [siteMapper.getCloud(job.cloud)['source']]
- # set site list to use T1 and T1_VL
- if hospitalQueueMap.has_key(job.cloud):
- specialBrokergageSiteList += hospitalQueueMap[job.cloud]
- tmpLog.debug('PandaID:%s -> set SiteList=%s for high prio' % (job.PandaID,specialBrokergageSiteList))
- brokerageNote = 'highPrio'
- # set computingSite to T1 when too many inputs are required
- if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \
- and specialBrokergageSiteList == []:
- # counts # of inputs
- tmpTotalInput = 0
- for tmpFile in job.Files:
- if tmpFile.type == 'input':
- tmpTotalInput += 1
- if tmpTotalInput >= manyInputsThr:
- specialBrokergageSiteList = [siteMapper.getCloud(job.cloud)['source']]
- # set site list to use T1 and T1_VL
- if hospitalQueueMap.has_key(job.cloud):
- specialBrokergageSiteList += hospitalQueueMap[job.cloud]
- tmpLog.debug('PandaID:%s -> set SiteList=%s for too many inputs' % (job.PandaID,specialBrokergageSiteList))
- brokerageNote = 'manyInput'
- # use limited sites for reprocessing
- if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \
- and job.processingType in ['reprocessing'] and specialBrokergageSiteList == []:
- for tmpSiteName in siteMapper.getCloud(job.cloud)['sites']:
- if siteMapper.checkSite(tmpSiteName):
- tmpSiteSpec = siteMapper.getSite(tmpSiteName)
- if _checkRelease(job.AtlasRelease,tmpSiteSpec.validatedreleases):
- specialBrokergageSiteList.append(tmpSiteName)
- tmpLog.debug('PandaID:%s -> set SiteList=%s for processingType=%s' % (job.PandaID,specialBrokergageSiteList,job.processingType))
- brokerageNote = '%s' % job.processingType
- # use limited sites for MP jobs
- if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \
- and not job.coreCount in [None,'NULL'] and job.coreCount > 1 and specialBrokergageSiteList == []:
- for tmpSiteName in siteMapper.getCloud(job.cloud)['sites']:
- if siteMapper.checkSite(tmpSiteName):
- tmpSiteSpec = siteMapper.getSite(tmpSiteName)
- if tmpSiteSpec.coreCount > 1:
- specialBrokergageSiteList.append(tmpSiteName)
- tmpLog.debug('PandaID:%s -> set SiteList=%s for MP=%scores' % (job.PandaID,specialBrokergageSiteList,job.coreCount))
- brokerageNote = 'MP=%score' % job.coreCount
- # manually set site
- manualPreset = False
- if job != None and job.computingSite != 'NULL' and job.prodSourceLabel in ('test','managed') \
- and specialBrokergageSiteList == []:
- specialBrokergageSiteList = [job.computingSite]
- manualPreset = True
- brokerageNote = 'presetSite'
- overwriteSite = False
- # new bunch or terminator
- if job == None or len(fileList) >= nFile \
- or (dispatchDBlock == None and job.homepackage.startswith('AnalysisTransforms')) \
- or prodDBlock != job.prodDBlock or job.computingSite != computingSite or iJob > nJob \
- or previousCloud != job.cloud or prevRelease != job.AtlasRelease \
- or prevCmtConfig != job.cmtConfig \
- or (computingSite in ['RAL_REPRO','INFN-T1_REPRO'] and len(fileList)>=2) \
- or (prevProType in skipBrokerageProTypes and iJob > 0) \
- or prevDirectAcc != job.transferType \
- or prevMemory != job.minRamCount \
- or prevDiskCount != job.maxDiskCount \
- or prevCoreCount != job.coreCount \
- or prevWorkingGroup != job.workingGroup \
- or prevProType != job.processingType \
- or prevMaxCpuCount != job.maxCpuCount \
- or prevBrokergageSiteList != specialBrokergageSiteList:
- if indexJob > 1:
- tmpLog.debug('new bunch')
- tmpLog.debug(' iJob %s' % iJob)
- tmpLog.debug(' cloud %s' % previousCloud)
- tmpLog.debug(' rel %s' % prevRelease)
- tmpLog.debug(' sourceLabel %s' % prevSourceLabel)
- tmpLog.debug(' cmtConfig %s' % prevCmtConfig)
- tmpLog.debug(' memory %s' % prevMemory)
- tmpLog.debug(' priority %s' % prevPriority)
- tmpLog.debug(' prodDBlock %s' % prodDBlock)
- tmpLog.debug(' computingSite %s' % computingSite)
- tmpLog.debug(' processingType %s' % prevProType)
- tmpLog.debug(' workingGroup %s' % prevWorkingGroup)
- tmpLog.debug(' coreCount %s' % prevCoreCount)
- tmpLog.debug(' maxCpuCount %s' % prevMaxCpuCount)
- tmpLog.debug(' transferType %s' % prevDirectAcc)
- tmpLog.debug(' goToT2 %s' % prevGoToT2Flag)
- # brokerage decisions
- resultsForAnal = {'rel':[],'pilot':[],'disk':[],'status':[],'weight':[],'memory':[],
- 'share':[],'transferring':[],'prefcountry':[],'cpucore':[],
- 'reliability':[],'maxtime':[],'scratch':[]}
- # determine site
- if (iJob == 0 or chosen_ce != 'TOBEDONE') and prevBrokergageSiteList in [None,[]]:
- # file scan for pre-assigned jobs
- jobsInBunch = jobs[indexJob-iJob-1:indexJob-1]
- if jobsInBunch != [] and fileList != [] and (not computingSite in prestageSites) \
- and (jobsInBunch[0].prodSourceLabel in ['managed','software'] or \
- re.search('test',jobsInBunch[0].prodSourceLabel) != None):
- # get site spec
- tmp_chosen_ce = siteMapper.getSite(computingSite)
- # get files from LRC
- okFiles = _getOkFiles(tmp_chosen_ce,fileList,guidList,allLFNs,allGUIDs,allOkFilesMap,tmpLog)
- # loop over all jobs
- for tmpJob in jobsInBunch:
- # set 'ready' if files are already there
- _setReadyToFiles(tmpJob,okFiles,siteMapper,tmpLog)
- else:
- # load balancing
- minSites = {}
- nMinSites = 2
- if prevBrokergageSiteList != []:
- # special brokerage
- scanSiteList = prevBrokergageSiteList
- elif setScanSiteList == []:
- if siteMapper.checkCloud(previousCloud):
- # use cloud sites
- scanSiteList = siteMapper.getCloud(previousCloud)['sites']
- else:
- # use default sites
- scanSiteList = siteMapper.getCloud('default')['sites']
- else:
- # use given sites
- scanSiteList = setScanSiteList
- # add long queue
- for tmpShortQueue,tmpLongQueue in shortLongMap.iteritems():
- if tmpShortQueue in scanSiteList:
- if not tmpLongQueue in scanSiteList:
- scanSiteList.append(tmpLongQueue)
- # the number/size of inputs per job
- nFilesPerJob = float(totalNumInputs)/float(iJob)
- inputSizePerJob = float(totalInputSize)/float(iJob)
- # use T1 for jobs with many inputs when weight is negative
- if (not forAnalysis) and _isTooManyInput(nFilesPerJob,inputSizePerJob) and \
- siteMapper.getCloud(previousCloud)['weight'] < 0 and prevManualPreset == False:
- scanSiteList = [siteMapper.getCloud(previousCloud)['source']]
- # set site list to use T1 and T1_VL
- if hospitalQueueMap.has_key(previousCloud):
- scanSiteList += hospitalQueueMap[previousCloud]
-                # get available sites with cache
- useCacheVersion = False
- siteListWithCache = []
- if forAnalysis:
- if re.search('-\d+\.\d+\.\d+\.\d+',prevRelease) != None:
- useCacheVersion = True
- siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList,caches=prevRelease,cmtConfig=prevCmtConfig)
- tmpLog.debug(' using installSW for cache %s' % prevRelease)
- elif re.search('-\d+\.\d+\.\d+$',prevRelease) != None:
- useCacheVersion = True
- siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList,releases=prevRelease,cmtConfig=prevCmtConfig)
- tmpLog.debug(' using installSW for release %s' % prevRelease)
- elif re.search(':rel_\d+$$',prevRelease) != None:
- useCacheVersion = True
-                        siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList,
- releases=prevRelease.split(':')[0],
- caches=prevRelease.split(':')[1],
- cmtConfig=prevCmtConfig)
- tmpLog.debug(' using installSW for release:cache %s' % prevRelease)
- elif previousCloud in ['DE','NL','FR','CA','ES','IT','TW','UK','US','ND','CERN','RU']:
- useCacheVersion = True
- # change / to -
- convedPrevHomePkg = prevHomePkg.replace('/','-')
- if re.search('rel_\d+(\n|$)',prevHomePkg) == None:
- # only cache is used for normal jobs
- siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList,caches=convedPrevHomePkg,
- cmtConfig=prevCmtConfig)
- else:
- # both AtlasRelease and homepackage are used for nightlies
- siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList,
- releases=prevRelease,
- caches=convedPrevHomePkg,
- cmtConfig=prevCmtConfig)
- tmpLog.debug(' cache %s' % prevHomePkg)
- if useCacheVersion:
- tmpLog.debug(' cache/relSites %s' % str(siteListWithCache))
- # release/cmtconfig check
- foundRelease = False
- # found candidate
- foundOneCandidate = False
- # randomize the order
- if forAnalysis:
- random.shuffle(scanSiteList)
-                    # get candidates
- if True:
- # loop over all sites
- for site in scanSiteList:
- tmpLog.debug('calculate weight for site:%s' % site)
-                        # _allSites may contain NULL after sort()
- if site == 'NULL':
- continue
- # ignore test sites
- if (prevManualPreset == False) and (site.endswith('test') or \
- site.endswith('Test') or site.startswith('Test')):
- continue
- # ignore analysis queues
- if (not forAnalysis) and site.startswith('ANALY'):
- continue
- # get SiteSpec
- if siteMapper.checkSite(site):
- tmpSiteSpec = siteMapper.getSite(site)
- else:
- tmpLog.debug(" skip: %s doesn't exist in DB" % site)
- continue
- # check status
- if tmpSiteSpec.status in ['offline','brokeroff'] and computingSite in ['NULL',None,'']:
- if forAnalysis and tmpSiteSpec.status == 'brokeroff' and tmpSiteSpec.accesscontrol == 'grouplist':
- # ignore brokeroff for grouplist site
- pass
- elif forAnalysis and prevProType in ['hammercloud','gangarobot','gangarobot-squid']:
- # ignore site status for HC
- pass
- else:
- tmpLog.debug(' skip: status %s' % tmpSiteSpec.status)
- resultsForAnal['status'].append(site)
- continue
- if tmpSiteSpec.status == 'test' and (not prevProType in ['prod_test','hammercloud','gangarobot','gangarobot-squid']) \
- and not prevSourceLabel in ['test','prod_test']:
- tmpLog.debug(' skip: status %s for %s' % (tmpSiteSpec.status,prevProType))
- resultsForAnal['status'].append(site)
- continue
- tmpLog.debug(' status=%s' % tmpSiteSpec.status)
- # check core count
- if tmpSiteSpec.coreCount > 1:
- # use multi-core queue for MP jobs
- if not prevCoreCount in [None,'NULL'] and prevCoreCount > 1:
- pass
- else:
- tmpLog.debug(' skip: MP site (%s core) for job.coreCount=%s' % (tmpSiteSpec.coreCount,
- prevCoreCount))
- resultsForAnal['cpucore'].append(site)
- continue
- else:
- # use single core for non-MP jobs
- if not prevCoreCount in [None,'NULL'] and prevCoreCount > 1:
- tmpLog.debug(' skip: single core site (%s core) for job.coreCount=%s' % (tmpSiteSpec.coreCount,
- prevCoreCount))
- resultsForAnal['cpucore'].append(site)
- continue
- # check memory
- if tmpSiteSpec.memory != 0 and not prevMemory in [None,0,'NULL']:
- try:
- if int(tmpSiteSpec.memory) < int(prevMemory):
- tmpLog.debug(' skip: memory shortage %s<%s' % (tmpSiteSpec.memory,prevMemory))
- resultsForAnal['memory'].append(site)
- continue
- except:
- errtype,errvalue = sys.exc_info()[:2]
- tmpLog.error("memory check : %s %s" % (errtype,errvalue))
- # check maxcpucount
- if tmpSiteSpec.maxtime != 0 and not prevMaxCpuCount in [None,0,'NULL']:
- try:
- if int(tmpSiteSpec.maxtime) < int(prevMaxCpuCount):
- tmpLog.debug(' skip: insufficient maxtime %s<%s' % (tmpSiteSpec.maxtime,prevMaxCpuCount))
- resultsForAnal['maxtime'].append(site)
- continue
- except:
- errtype,errvalue = sys.exc_info()[:2]
- tmpLog.error("maxtime check : %s %s" % (errtype,errvalue))
- # check max input size
- if tmpSiteSpec.maxinputsize != 0 and (not prevDiskCount in [None,0,'NULL']):
- try:
- if int(tmpSiteSpec.maxinputsize) < int(prevDiskCount):
- tmpLog.debug(' skip: not enough disk %s<%s' % (tmpSiteSpec.maxinputsize,prevDiskCount))
- resultsForAnal['scratch'].append(site)
- continue
- except:
- errtype,errvalue = sys.exc_info()[:2]
- tmpLog.error("disk check : %s %s" % (errtype,errvalue))
- tmpLog.debug(' maxinput=%s' % tmpSiteSpec.maxinputsize)
- # reliability
- if forAnalysis and isinstance(siteReliability,types.IntType):
- if tmpSiteSpec.reliabilityLevel != None and tmpSiteSpec.reliabilityLevel > siteReliability:
- tmpLog.debug(' skip: insufficient reliability %s > %s' % (tmpSiteSpec.reliabilityLevel,siteReliability))
- resultsForAnal['reliability'].append(site)
- continue
- # change NULL cmtconfig to slc3/4
- if prevCmtConfig in ['NULL','',None]:
- if forAnalysis:
- tmpCmtConfig = 'i686-slc4-gcc34-opt'
- else:
- tmpCmtConfig = 'i686-slc3-gcc323-opt'
- else:
- tmpCmtConfig = prevCmtConfig
- # set release
- releases = tmpSiteSpec.releases
- origReleases = releases
- if prevProType in ['reprocessing']:
- # use validated releases for reprocessing
- releases = tmpSiteSpec.validatedreleases
- if not useCacheVersion:
- tmpLog.debug(' %s' % str(releases))
- if origReleases == ['ANY']:
- # doesn't check releases for catch all
- tmpLog.debug(' no release check due to releases=%s' % origReleases)
- foundRelease = True
- elif forAnalysis and (tmpSiteSpec.cloud in ['ND'] or prevRelease==''):
- # doesn't check releases for analysis
- tmpLog.debug(' no release check')
- pass
- elif forAnalysis and useCacheVersion:
- # cache matching
- if not site in siteListWithCache:
- tmpLog.debug(' skip: cache %s/%s not found' % (prevRelease.replace('\n',' '),prevCmtConfig))
- if trustIS:
- resultsForAnal['rel'].append(site)
- continue
- elif prevRelease != None and \
- (useCacheVersion and not tmpSiteSpec.cloud in ['ND'] and not site in ['CERN-RELEASE']) and \
- (not prevProType in ['reprocessing']) and \
- (not site in siteListWithCache):
- tmpLog.debug(' skip: cache %s/%s not found' % (prevHomePkg.replace('\n',' '),prevCmtConfig))
- # send message to logger
- try:
- if prevSourceLabel in ['managed','test']:
- resultsForAnal['rel'].append(site)
- # make message
- message = '%s - cache %s/%s not found' % (site,prevHomePkg.replace('\n',' '),prevCmtConfig)
- if not message in loggerMessages:
- loggerMessages.append(message)
- except:
- pass
- continue
- elif prevRelease != None and \
- ((not useCacheVersion and releases != [] and not tmpSiteSpec.cloud in ['ND'] and not site in ['CERN-RELEASE']) or prevProType in ['reprocessing']) and \
- (((not _checkRelease(prevRelease,releases) and prevManualPreset == False) or not site in siteListWithCache) and not tmpSiteSpec.cloud in ['ND'] and not site in ['CERN-RELEASE']):
- # release matching
- if not useCacheVersion:
- tmpLog.debug(' skip: release %s/%s not found' % (prevRelease.replace('\n',' '),prevCmtConfig))
- else:
- tmpLog.debug(' skip: repro cache %s/%s not found' % (prevHomePkg.replace('\n',' '),prevCmtConfig))
- resultsForAnal['rel'].append(site)
- continue
- elif not foundRelease:
- # found at least one site has the release
- foundRelease = True
- # direct access
- if prevDirectAcc == 'direct' and not tmpSiteSpec.allowdirectaccess:
- tmpLog.debug(' skip: no direct access support')
- continue
- # get pilot statistics
- nPilotsGet = 0
- nPilotsUpdate = 0
- if nWNmap == {}:
- nWNmap = taskBuffer.getCurrentSiteData()
- if nWNmap.has_key(site):
- nPilots = nWNmap[site]['getJob'] + nWNmap[site]['updateJob']
- nPilotsGet = nWNmap[site]['getJob']
- nPilotsUpdate = nWNmap[site]['updateJob']
- else:
- nPilots = 0
- tmpLog.debug(' original nPilots:%s get:%s update:%s' % (nPilots,nPilotsGet,nPilotsUpdate))
- # limit on (G+1)/(U+1)
- limitOnGUmax = 2.0
- limitOnGUmin = 0.5
- guRatio = float(1+nPilotsGet)/float(1+nPilotsUpdate)
- if guRatio > limitOnGUmax:
- nPilotsGet = limitOnGUmax * float(1+nPilotsUpdate) - 1.0
- elif guRatio < limitOnGUmin:
- nPilotsGet = limitOnGUmin * float(1+nPilotsUpdate) - 1.0
- tmpLog.debug(' limited nPilots:%s get:%s update:%s' % (nPilots,nPilotsGet,nPilotsUpdate))
- # if no pilots
- if nPilots == 0 and nWNmap != {}:
- tmpLog.debug(" skip: %s no pilot" % site)
- resultsForAnal['pilot'].append(site)
- continue
- # if no jobs in jobsActive/jobsDefined
- if not jobStatistics.has_key(site):
- jobStatistics[site] = {'assigned':0,'activated':0,'running':0,'transferring':0}
- # set nRunning
- if forAnalysis:
- if not nRunningMap.has_key(site):
- nRunningMap[site] = 0
- # check space
- if specialWeight != {}:
- # for PD2P
- if sizeMapForCheck.has_key(site):
- # threshold for PD2P max(5%,3TB)
- thrForThisSite = long(sizeMapForCheck[site]['total'] * 5 / 100)
- if thrForThisSite < diskThresholdPD2P:
- thrForThisSite = diskThresholdPD2P
- remSpace = sizeMapForCheck[site]['total'] - sizeMapForCheck[site]['used']
- tmpLog.debug(' space available=%s remain=%s thr=%s' % (sizeMapForCheck[site]['total'],
- remSpace,thrForThisSite))
- if remSpace-datasetSize < thrForThisSite:
- tmpLog.debug(' skip: disk shortage %s-%s< %s' % (remSpace,datasetSize,thrForThisSite))
- if getWeight:
- weightUsedByBrokerage[site] = "NA : disk shortage"
- continue
- elif site != siteMapper.getCloud(previousCloud)['source']:
- # for T2
- if tmpSiteSpec.space != 0:
- nRemJobs = jobStatistics[site]['assigned']+jobStatistics[site]['activated']+jobStatistics[site]['running']
- if not forAnalysis:
- # take assigned/activated/running jobs into account for production
- remSpace = tmpSiteSpec.space - 0.250*nRemJobs
- else:
- remSpace = tmpSiteSpec.space
- tmpLog.debug(' space available=%s remain=%s' % (tmpSiteSpec.space,remSpace))
- if remSpace < diskThreshold:
- tmpLog.debug(' skip: disk shortage < %s' % diskThreshold)
- resultsForAnal['disk'].append(site)
- # keep message to logger
- try:
- if prevSourceLabel in ['managed','test']:
- # make message
- message = '%s - disk %s < %s' % (site,remSpace,diskThreshold)
- if not message in loggerMessages:
- loggerMessages.append(message)
- except:
- pass
- continue
- # get the process group
- tmpProGroup = ProcessGroups.getProcessGroup(prevProType)
- if prevProType in skipBrokerageProTypes:
-                            # use the original processingType since prod_test falls in the test category and would otherwise be mixed with validation jobs
- tmpProGroup = prevProType
- # production share
- skipDueToShare = False
- try:
- if not forAnalysis and prevSourceLabel in ['managed'] and faresharePolicy.has_key(site):
- for tmpPolicy in faresharePolicy[site]['policyList']:
- # ignore priority policy
- if tmpPolicy['priority'] != None:
- continue
- # only zero share
- if tmpPolicy['share'] != '0%':
- continue
- # check group
- if tmpPolicy['group'] != None:
- if '*' in tmpPolicy['group']:
- # wildcard
- tmpPatt = '^' + tmpPolicy['group'].replace('*','.*') + '$'
- if re.search(tmpPatt,prevWorkingGroup) == None:
- continue
- else:
- # normal definition
- if prevWorkingGroup != tmpPolicy['group']:
- continue
- else:
- # catch all except WGs used by other policies
- groupInDefList = faresharePolicy[site]['groupList']
- usedByAnother = False
- # loop over all groups
- for groupInDefItem in groupInDefList:
- if '*' in groupInDefItem:
- # wildcard
- tmpPatt = '^' + groupInDefItem.replace('*','.*') + '$'
- if re.search(tmpPatt,prevWorkingGroup) != None:
- usedByAnother = True
- break
- else:
- # normal definition
- if prevWorkingGroup == groupInDefItem:
- usedByAnother = True
- break
- if usedByAnother:
- continue
- # check type
- if tmpPolicy['type'] != None:
- if tmpPolicy['type'] == tmpProGroup:
- skipDueToShare = True
- break
- else:
- # catch all except PGs used by other policies
- typeInDefList = faresharePolicy[site]['typeList'][tmpPolicy['group']]
- usedByAnother = False
- for typeInDefItem in typeInDefList:
- if typeInDefItem == tmpProGroup:
- usedByAnother = True
- break
- if not usedByAnother:
- skipDueToShare = True
- break
- # skip
- if skipDueToShare:
- tmpLog.debug(" skip: %s zero share" % site)
- resultsForAnal['share'].append(site)
- continue
- except:
- errtype,errvalue = sys.exc_info()[:2]
- tmpLog.error("share check : %s %s" % (errtype,errvalue))
- # the number of assigned and activated
- if not forAnalysis:
- if not jobStatBrokerClouds.has_key(previousCloud):
- jobStatBrokerClouds[previousCloud] = {}
- # use number of jobs in the cloud
- jobStatBroker = jobStatBrokerClouds[previousCloud]
- if not jobStatBroker.has_key(site):
- jobStatBroker[site] = {}
- if not jobStatBroker[site].has_key(tmpProGroup):
- jobStatBroker[site][tmpProGroup] = {'assigned':0,'activated':0,'running':0,'transferring':0}
- # count # of assigned and activated jobs for prod by taking priorities in to account
- nRunJobsPerGroup = None
- if not forAnalysis and prevSourceLabel in ['managed','test']:
- if not jobStatBrokerCloudsWithPrio.has_key(prevPriority):
- jobStatBrokerCloudsWithPrio[prevPriority] = taskBuffer.getJobStatisticsBrokerage(prevPriority)
- if not jobStatBrokerCloudsWithPrio[prevPriority].has_key(previousCloud):
- jobStatBrokerCloudsWithPrio[prevPriority][previousCloud] = {}
- if not jobStatBrokerCloudsWithPrio[prevPriority][previousCloud].has_key(site):
- jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site] = {}
- if not jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site].has_key(tmpProGroup):
- jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup] = {'assigned':0,'activated':0,'running':0,'transferring':0}
- nAssJobs = jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup]['assigned']
- nActJobs = jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup]['activated']
- nRunJobsPerGroup = jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup]['running']
- # add newly assigned jobs
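- # only count newly assigned jobs whose priority is equal to or higher than the current one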
- for tmpNewPriority in newJobStatWithPrio.keys():
- if tmpNewPriority < prevPriority:
- continue
- if not newJobStatWithPrio[tmpNewPriority].has_key(previousCloud):
- continue
- if not newJobStatWithPrio[tmpNewPriority][previousCloud].has_key(site):
- continue
- if not newJobStatWithPrio[tmpNewPriority][previousCloud][site].has_key(tmpProGroup):
- continue
- nAssJobs += newJobStatWithPrio[tmpNewPriority][previousCloud][site][tmpProGroup]
- else:
- nAssJobs = jobStatBroker[site][tmpProGroup]['assigned']
- if forAnalysis and jobStatBroker[site][tmpProGroup].has_key('defined'):
- nAssJobs += jobStatBroker[site][tmpProGroup]['defined']
- nActJobs = jobStatBroker[site][tmpProGroup]['activated']
- # number of jobs per node
- if not nWNmap.has_key(site):
- nJobsPerNode = 1
- elif jobStatistics[site]['running']==0 or nWNmap[site]['updateJob']==0:
- nJobsPerNode = 1
- else:
- if nRunJobsPerGroup == None:
- nJobsPerNode = float(jobStatistics[site]['running'])/float(nWNmap[site]['updateJob'])
- else:
- if nRunJobsPerGroup == 0:
- nJobsPerNode = 1.0/float(nWNmap[site]['updateJob'])
- else:
- nJobsPerNode = float(nRunJobsPerGroup)/float(nWNmap[site]['updateJob'])
- # limit of the number of transferring jobs
- if tmpSiteSpec.transferringlimit == 0:
- maxTransferring = 2000
- else:
- maxTransferring = tmpSiteSpec.transferringlimit
- # get ratio of transferring to running jobs
- if not forAnalysis and not tmpSiteSpec.cloud in ['ND']:
- nTraJobs = 0
- nRunJobs = 0
- for tmpGroupForTra,tmpCountsForTra in jobStatBroker[site].iteritems():
- if tmpCountsForTra.has_key('running'):
- nRunJobs += tmpCountsForTra['running']
- if tmpCountsForTra.has_key('transferring'):
- nTraJobs += tmpCountsForTra['transferring']
- tmpLog.debug(' running=%s transferring=%s max=%s' % (nRunJobs,nTraJobs,maxTransferring))
- if max(maxTransferring,2*nRunJobs) < nTraJobs:
- tmpLog.debug(" skip: %s too many transferring=%s > max(%s,2*running=%s)" % (site,nTraJobs,maxTransferring,nRunJobs))
- resultsForAnal['transferring'].append(site)
- if prevSourceLabel in ['managed','test']:
- # make message
- message = '%s - too many transferring' % site
- if not message in loggerMessages:
- loggerMessages.append(message)
- continue
- # get ratio of running jobs = run(all)/run(cloud) to penalize multi-cloud sites
- multiCloudFactor = 1
- if not forAnalysis and not previousCloud in ['NL']:
- tmpTotalRunningMulti = 0
- tmpNCloudMulti = 0
- for tmpCloudMulti,tmpCloudValMulti in jobStatBrokerClouds.iteritems():
- if tmpCloudValMulti.has_key(site):
- if tmpCloudValMulti[site].has_key(tmpProGroup):
- tmpNCloudMulti += 1
- if tmpCloudValMulti[site][tmpProGroup].has_key('running'):
- tmpTotalRunningMulti += tmpCloudValMulti[site][tmpProGroup]['running']
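- # penalize sites shared by several clouds: use the number of sharing clouds when nothing is running, otherwise (total running + 1)/(running in this cloud + 1)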
- # no running
- if tmpTotalRunningMulti == 0:
- if tmpNCloudMulti != 0:
- multiCloudFactor = tmpNCloudMulti
- else:
- multiCloudFactor = float(tmpTotalRunningMulti+1)/float(jobStatBroker[site][tmpProGroup]['running']+1)
- tmpLog.debug(' totalRun:%s cloudRun:%s multiCloud:%s' % (tmpTotalRunningMulti,
- jobStatBroker[site][tmpProGroup]['running'],
- multiCloudFactor))
- # country preference
- preferredCountryWeight = 1.0
- preferredCountryWeightStr = ''
- if forAnalysis:
- if preferredCountries != [] and tmpSiteSpec.countryGroup != []:
- for tmpCountry in preferredCountries:
- if tmpCountry in tmpSiteSpec.countryGroup:
- # avoid negative weight or zero-divide
- if tmpSiteSpec.availableCPU >= tmpSiteSpec.pledgedCPU and tmpSiteSpec.pledgedCPU > 0:
- preferredCountryWeight = float(tmpSiteSpec.availableCPU) / float(tmpSiteSpec.pledgedCPU)
- preferredCountryWeightStr = "*(%s/%s)" % (tmpSiteSpec.availableCPU,tmpSiteSpec.pledgedCPU)
- resultsForAnal['prefcountry'].append((site,tmpCountry))
- break
- tmpLog.debug(' country preference=%s' % preferredCountryWeightStr[1:])
- # calculate weight
- if specialWeight != {}:
- if not pd2pT1:
- # weight for T2 PD2P
- nSubs = 1
- if specialWeight.has_key(site):
- nSubs = specialWeight[site]
- tmpLog.debug(' %s nSubs:%s assigned:%s activated:%s running:%s nWNsG:%s nWNsU:%s' % \
- (site,nSubs,nAssJobs,nActJobs,nRunningMap[site],nPilotsGet,nPilotsUpdate))
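- # 1/weight grows with the number of subscriptions and queued (assigned+activated) jobs, and shrinks with running jobs and pilot get/update activity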
- winv = float(nSubs) * float(nAssJobs+nActJobs) / float(1+nRunningMap[site]) / (1.0+float(nPilotsGet)/float(1+nPilotsUpdate))
- if getWeight:
- weightUsedByBrokerage[site] = "(1+%s/%s)*%s/%s/%s" % (nPilotsGet,1+nPilotsUpdate,1+nRunningMap[site],nAssJobs+nActJobs,nSubs)
- else:
- # weight for T1 PD2P
- tmpLog.debug(' %s MoU:%s' % (site,specialWeight[site]))
- winv = 1.0 / float(specialWeight[site])
- if getWeight:
- weightUsedByBrokerage[site] = "%s" % specialWeight[site]
- else:
- if not forAnalysis:
- if nRunJobsPerGroup == None:
- tmpLog.debug(' %s assigned:%s activated:%s running:%s nPilotsGet:%s nPilotsUpdate:%s multiCloud:%s' %
- (site,nAssJobs,nActJobs,jobStatistics[site]['running'],nPilotsGet,nPilotsUpdate,multiCloudFactor))
- else:
- tmpLog.debug(' %s assigned:%s activated:%s runningGroup:%s nPilotsGet:%s nPilotsUpdate:%s multiCloud:%s' %
- (site,nAssJobs,nActJobs,nRunJobsPerGroup,nPilotsGet,nPilotsUpdate,multiCloudFactor))
- else:
- tmpLog.debug(' %s assigned:%s activated:%s running:%s nWNsG:%s nWNsU:%s' %
- (site,nAssJobs,nActJobs,nRunningMap[site],nPilotsGet,nPilotsUpdate))
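- # 1/weight = queued (assigned+activated) over running jobs, corrected for pilot get/update activity; the smallest 1/weight wins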
- if forAnalysis:
- winv = float(nAssJobs+nActJobs) / float(1+nRunningMap[site]) / (1.0+float(nPilotsGet)/float(1+nPilotsUpdate))
- else:
- if nRunJobsPerGroup == None:
- winv = float(nAssJobs+nActJobs) / float(1+jobStatistics[site]['running']) / (float(1+nPilotsGet)/float(1+nPilotsUpdate))
- else:
- winv = float(nAssJobs+nActJobs) / float(1+nRunJobsPerGroup) / (float(1+nPilotsGet)/float(1+nPilotsUpdate))
- winv *= float(multiCloudFactor)
- # send jobs to T1 when they require many or large inputs
- if _isTooManyInput(nFilesPerJob,inputSizePerJob):
- if site == siteMapper.getCloud(previousCloud)['source'] or \
- (site=='NIKHEF-ELPROD' and previousCloud=='NL' and prevProType=='reprocessing') or \
- (hospitalQueueMap.has_key(previousCloud) and site in hospitalQueueMap[previousCloud]):
- cloudT1Weight = 2.0
- # use weight in cloudconfig
- try:
- tmpCloudT1Weight = float(siteMapper.getCloud(previousCloud)['weight'])
- if tmpCloudT1Weight != 0.0:
- cloudT1Weight = tmpCloudT1Weight
- except:
- pass
- winv /= cloudT1Weight
- tmpLog.debug(' special weight for %s : nInputs/Job=%s inputSize/Job=%s weight=%s' %
- (site,nFilesPerJob,inputSizePerJob,cloudT1Weight))
- # found at least one candidate
- foundOneCandidate = True
- tmpLog.debug('Site:%s 1/Weight:%s' % (site,winv))
- if forAnalysis and trustIS and reportLog:
- resultsForAnal['weight'].append((site,'(1+%s/%s)*%s/%s%s' % (nPilotsGet,1+nPilotsUpdate,1+nRunningMap[site],
- nAssJobs+nActJobs,preferredCountryWeightStr)))
- # choose largest nMinSites weights
- minSites[site] = winv
- if len(minSites) > nMinSites:
- maxSite = site
- maxWinv = winv
- for tmpSite,tmpWinv in minSites.iteritems():
- if tmpWinv > maxWinv:
- maxSite = tmpSite
- maxWinv = tmpWinv
- # delete the largest one
- del minSites[maxSite]
- # remove too different weights
- if len(minSites) >= 2:
- # look for minimum
- minSite = minSites.keys()[0]
- minWinv = minSites[minSite]
- for tmpSite,tmpWinv in minSites.iteritems():
- if tmpWinv < minWinv:
- minSite = tmpSite
- minWinv = tmpWinv
- # look for too different weights
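- # a candidate is dropped when its 1/weight exceeds the smallest one by more than this factor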
- difference = 2
- removeSites = []
- for tmpSite,tmpWinv in minSites.iteritems():
- if tmpWinv > minWinv*difference:
- removeSites.append(tmpSite)
- # remove
- for tmpSite in removeSites:
- del minSites[tmpSite]
- # set default
- if len(minSites) == 0:
- # cloud's list
- if forAnalysis or siteMapper.checkCloud(previousCloud):
- minSites[scanSiteList[0]] = 0
- else:
- minSites['BNL_ATLAS_1'] = 0
- # release not found
- if forAnalysis and trustIS:
- candidateForAnal = False
- # use only one site for prod_test to skip LFC scan
- if prevProType in skipBrokerageProTypes:
- if len(minSites) > 1:
- minSites = {minSites.keys()[0]:0}
- # choose site
- tmpLog.debug('Min Sites:%s' % minSites)
- if len(fileList) ==0:
- # choose min 1/weight
- minSite = minSites.keys()[0]
- minWinv = minSites[minSite]
- for tmpSite,tmpWinv in minSites.iteritems():
- if tmpWinv < minWinv:
- minSite = tmpSite
- minWinv = tmpWinv
- chosenCE = siteMapper.getSite(minSite)
- else:
- # compare # of files in LRC
- maxNfiles = -1
- for site in minSites:
- tmp_chosen_ce = siteMapper.getSite(site)
- # search LRC
- if site in _disableLRCcheck:
- tmpOKFiles = {}
- else:
- # get files from LRC
- tmpOKFiles = _getOkFiles(tmp_chosen_ce,fileList,guidList,allLFNs,allGUIDs,allOkFilesMap,tmpLog)
- nFiles = len(tmpOKFiles)
- tmpLog.debug('site:%s - nFiles:%s/%s %s' % (site,nFiles,len(fileList),str(tmpOKFiles)))
- # choose site holding max # of files
- if nFiles > maxNfiles:
- chosenCE = tmp_chosen_ce
- maxNfiles = nFiles
- okFiles = tmpOKFiles
- # set job spec
- tmpLog.debug('indexJob : %s' % indexJob)
- tmpLog.debug('nInputs/Job : %s' % nFilesPerJob)
- tmpLog.debug('inputSize/Job : %s' % inputSizePerJob)
- for tmpJob in jobs[indexJob-iJob-1:indexJob-1]:
- # set computingSite
- if (not candidateForAnal) and forAnalysis and trustIS:
- resultsForAnalStr = 'ERROR : No candidate. '
- if resultsForAnal['rel'] != []:
- if prevCmtConfig in ['','NULL',None]:
- resultsForAnalStr += 'Release:%s was not found at %s. ' % (prevRelease,str(resultsForAnal['rel']))
- else:
- resultsForAnalStr += 'Release:%s/%s was not found at %s. ' % (prevRelease,prevCmtConfig,str(resultsForAnal['rel']))
- if resultsForAnal['pilot'] != []:
- resultsForAnalStr += '%s are inactive (no pilots for last 3 hours). ' % str(resultsForAnal['pilot'])
- if resultsForAnal['disk'] != []:
- resultsForAnalStr += 'Disk shortage < %sGB at %s. ' % (diskThreshold,str(resultsForAnal['disk']))
- if resultsForAnal['memory'] != []:
- resultsForAnalStr += 'Insufficient RAM at %s. ' % str(resultsForAnal['memory'])
- if resultsForAnal['maxtime'] != []:
- resultsForAnalStr += 'Shorter walltime limit than maxCpuCount:%s at ' % prevMaxCpuCount
- for tmpItem in resultsForAnal['maxtime']:
- if siteMapper.checkSite(tmpItem):
- resultsForAnalStr += '%s:%s,' % (tmpItem,siteMapper.getSite(tmpItem).maxtime)
- resultsForAnalStr = resultsForAnalStr[:-1]
- resultsForAnalStr += '. '
- if resultsForAnal['status'] != []:
- resultsForAnalStr += '%s are not online. ' % str(resultsForAnal['status'])
- if resultsForAnal['reliability'] != []:
- resultsForAnalStr += 'Insufficient reliability at %s. ' % str(resultsForAnal['reliability'])
- resultsForAnalStr = resultsForAnalStr[:-1]
- tmpJob.computingSite = resultsForAnalStr
- else:
- tmpJob.computingSite = chosenCE.sitename
- # send log
- if forAnalysis and trustIS and reportLog:
- # put logging info to ErrorDiag just to give it back to the caller
- tmpJob.brokerageErrorDiag = sendAnalyBrokeageInfo(resultsForAnal,prevRelease,diskThreshold,
- tmpJob.computingSite,prevCmtConfig,
- siteReliability)
- tmpLog.debug('PandaID:%s -> site:%s' % (tmpJob.PandaID,tmpJob.computingSite))
- if tmpJob.computingElement == 'NULL':
- if tmpJob.prodSourceLabel == 'ddm':
- # use nickname for ddm jobs
- tmpJob.computingElement = chosenCE.nickname
- else:
- tmpJob.computingElement = chosenCE.gatekeeper
- # fail jobs if no sites have the release
- if (not foundRelease or (tmpJob.relocationFlag != 1 and not foundOneCandidate)) and (tmpJob.prodSourceLabel in ['managed','test']):
- # reset
- if tmpJob.relocationFlag != 1:
- tmpJob.computingSite = None
- tmpJob.computingElement = None
- # go to waiting
- tmpJob.jobStatus = 'waiting'
- tmpJob.brokerageErrorCode = ErrorCode.EC_Release
- if tmpJob.relocationFlag == 1:
- try:
- if resultsForAnal['pilot'] != []:
- tmpJob.brokerageErrorDiag = '%s no pilots' % tmpJob.computingSite
- elif resultsForAnal['disk'] != []:
- tmpJob.brokerageErrorDiag = 'SE full at %s' % tmpJob.computingSite
- elif resultsForAnal['memory'] != []:
- tmpJob.brokerageErrorDiag = 'RAM shortage at %s' % tmpJob.computingSite
- elif resultsForAnal['status'] != []:
- tmpJob.brokerageErrorDiag = '%s not online' % tmpJob.computingSite
- elif resultsForAnal['share'] != []:
- tmpJob.brokerageErrorDiag = '%s zero share' % tmpJob.computingSite
- elif resultsForAnal['cpucore'] != []:
- tmpJob.brokerageErrorDiag = "CPU core mismatch at %s" % tmpJob.computingSite
- elif resultsForAnal['maxtime'] != []:
- tmpJob.brokerageErrorDiag = "short walltime at %s" % tmpJob.computingSite
- elif resultsForAnal['transferring'] != []:
- tmpJob.brokerageErrorDiag = 'too many transferring at %s' % tmpJob.computingSite
- elif resultsForAnal['scratch'] != []:
- tmpJob.brokerageErrorDiag = 'small scratch disk at %s' % tmpJob.computingSite
- elif useCacheVersion:
- tmpJob.brokerageErrorDiag = '%s/%s not found at %s' % (tmpJob.homepackage,tmpJob.cmtConfig,tmpJob.computingSite)
- else:
- tmpJob.brokerageErrorDiag = '%s/%s not found at %s' % (tmpJob.AtlasRelease,tmpJob.cmtConfig,tmpJob.computingSite)
- except:
- errtype,errvalue = sys.exc_info()[:2]
- tmpLog.error("failed to set diag for %s: %s %s" % (tmpJob.PandaID,errtype,errvalue))
- tmpJob.brokerageErrorDiag = 'failed to set diag. see brokerage log in the panda server'
- elif not prevBrokergageSiteList in [[],None]:
- try:
- # make message
- tmpJob.brokerageErrorDiag = makeCompactDiagMessage(prevBrokerageNote,resultsForAnal)
- except:
- errtype,errvalue = sys.exc_info()[:2]
- tmpLog.error("failed to set special diag for %s: %s %s" % (tmpJob.PandaID,errtype,errvalue))
- tmpJob.brokerageErrorDiag = 'failed to set diag. see brokerage log in the panda server'
- elif prevProType in ['reprocessing']:
- tmpJob.brokerageErrorDiag = '%s/%s not found at reprocessing sites' % (tmpJob.homepackage,tmpJob.cmtConfig)
- elif not useCacheVersion:
- tmpJob.brokerageErrorDiag = '%s/%s not found at online sites with enough memory and disk' % \
- (tmpJob.AtlasRelease,tmpJob.cmtConfig)
- else:
- try:
- tmpJob.brokerageErrorDiag = makeCompactDiagMessage('',resultsForAnal)
- except:
- errtype,errvalue = sys.exc_info()[:2]
- tmpLog.error("failed to set compact diag for %s: %s %s" % (tmpJob.PandaID,errtype,errvalue))
- tmpJob.brokerageErrorDiag = 'failed to set diag. see brokerage log in the panda server'
- tmpLog.debug('PandaID:%s %s' % (tmpJob.PandaID,tmpJob.brokerageErrorDiag))
- continue
- # set ready if files are already there
- _setReadyToFiles(tmpJob,okFiles,siteMapper,tmpLog)
- # update statistics
- tmpProGroup = ProcessGroups.getProcessGroup(tmpJob.processingType)
- if tmpJob.processingType in skipBrokerageProTypes:
- # use the original processingType since prod_test falls in the test process group and would otherwise be skewed by validation jobs
- tmpProGroup = tmpJob.processingType
- if not jobStatistics.has_key(tmpJob.computingSite):
- jobStatistics[tmpJob.computingSite] = {'assigned':0,'activated':0,'running':0}
- if not jobStatBroker.has_key(tmpJob.computingSite):
- jobStatBroker[tmpJob.computingSite] = {}
- if not jobStatBroker[tmpJob.computingSite].has_key(tmpProGroup):
- jobStatBroker[tmpJob.computingSite][tmpProGroup] = {'assigned':0,'activated':0,'running':0}
- jobStatistics[tmpJob.computingSite]['assigned'] += 1
- jobStatBroker[tmpJob.computingSite][tmpProGroup]['assigned'] += 1
- # update statistics by taking priorities into account
- if not forAnalysis and prevSourceLabel in ['managed','test']:
- if not newJobStatWithPrio.has_key(prevPriority):
- newJobStatWithPrio[prevPriority] = {}
- if not newJobStatWithPrio[prevPriority].has_key(tmpJob.cloud):
- newJobStatWithPrio[prevPriority][tmpJob.cloud] = {}
- if not newJobStatWithPrio[prevPriority][tmpJob.cloud].has_key(tmpJob.computingSite):
- newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite] = {}
- if not newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite].has_key(tmpProGroup):
- newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite][tmpProGroup] = 0
- newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite][tmpProGroup] += 1
- # terminate
- if job == None:
- break
- # reset iJob
- iJob = 0
- # reset file list
- fileList = []
- guidList = []
- okFiles = {}
- totalNumInputs = 0
- totalInputSize = 0
- # create new dispDBlock
- if job.prodDBlock != 'NULL':
- # get datatype
- try:
- tmpDataType = job.prodDBlock.split('.')[-2]
- except:
- # default
- tmpDataType = 'GEN'
- if len(tmpDataType) > 20:
- # avoid too long name
- tmpDataType = 'GEN'
- dispatchDBlock = "panda.%s.%s.%s.%s_dis%s" % (job.taskID,time.strftime('%m.%d'),tmpDataType,
- commands.getoutput('uuidgen'),job.PandaID)
- tmpLog.debug('New dispatchDBlock: %s' % dispatchDBlock)
- prodDBlock = job.prodDBlock
- # computingSite is already defined
- if job.computingSite != 'NULL':
- # instantiate KnownSite
- chosen_ce = siteMapper.getSite(job.computingSite)
- # if site doesn't exist, use ANALY_BNL_ATLAS_1
- if job.homepackage.startswith('AnalysisTransforms'):
- if chosen_ce.sitename == 'BNL_ATLAS_1':
- chosen_ce = siteMapper.getSite('ANALY_BNL_ATLAS_1')
- overwriteSite = True
- else:
- # default for Analysis jobs
- if job.homepackage.startswith('AnalysisTransforms'):
- chosen_ce = siteMapper.getSite('ANALY_BNL_ATLAS_1')
- overwriteSite = True
- else:
- # set chosen_ce
- chosen_ce = 'TOBEDONE'
- # increment iJob
- iJob += 1
- # reserve computingSite and cloud
- computingSite = job.computingSite
- previousCloud = job.cloud
- prevRelease = job.AtlasRelease
- prevMemory = job.minRamCount
- prevCmtConfig = job.cmtConfig
- prevProType = job.processingType
- prevSourceLabel = job.prodSourceLabel
- prevDiskCount = job.maxDiskCount
- prevHomePkg = job.homepackage
- prevDirectAcc = job.transferType
- prevCoreCount = job.coreCount
- prevMaxCpuCount = job.maxCpuCount
- prevBrokergageSiteList = specialBrokergageSiteList
- prevManualPreset = manualPreset
- prevGoToT2Flag = goToT2Flag
- prevWorkingGroup = job.workingGroup
- prevBrokerageNote = brokerageNote
- # truncate prio to avoid too many lookups
- if not job.currentPriority in [None,'NULL']:
- prevPriority = (job.currentPriority / 50) * 50
- # assign site
- if chosen_ce != 'TOBEDONE':
- job.computingSite = chosen_ce.sitename
- if job.computingElement == 'NULL':
- if job.prodSourceLabel == 'ddm':
- # use nickname for ddm jobs
- job.computingElement = chosen_ce.nickname
- else:
- job.computingElement = chosen_ce.gatekeeper
- # update statistics
- if not jobStatistics.has_key(job.computingSite):
- jobStatistics[job.computingSite] = {'assigned':0,'activated':0,'running':0}
- jobStatistics[job.computingSite]['assigned'] += 1
- tmpLog.debug('PandaID:%s -> preset site:%s' % (job.PandaID,chosen_ce.sitename))
- # set cloud
- if job.cloud in ['NULL',None,'']:
- job.cloud = chosen_ce.cloud
- # set destinationSE
- destSE = job.destinationSE
- if siteMapper.checkCloud(job.cloud):
- # use the cloud's dest for non-existing sites
- if job.prodSourceLabel != 'user' and (not job.destinationSE in siteMapper.siteSpecList.keys()) \
- and job.destinationSE != 'local':
- destSE = siteMapper.getCloud(job.cloud)['dest']
- job.destinationSE = destSE
- # use CERN-PROD_EOSDATADISK for CERN-EOS jobs
- if job.computingSite in ['CERN-EOS']:
- overwriteSite = True
- if overwriteSite:
- # overwrite SE for analysis jobs which set non-existing sites
- destSE = job.computingSite
- job.destinationSE = destSE
- # set dispatchDBlock and destinationSE
- first = True
- for file in job.Files:
- # dispatchDBlock. Set dispDB for prestaging jobs too
- if file.type == 'input' and file.dispatchDBlock == 'NULL' and \
- ((not file.status in ['ready','missing']) or job.computingSite in prestageSites):
- if first:
- first = False
- job.dispatchDBlock = dispatchDBlock
- file.dispatchDBlock = dispatchDBlock
- file.status = 'pending'
- if not file.lfn in fileList:
- fileList.append(file.lfn)
- guidList.append(file.GUID)
- try:
- # get total number/size of inputs except DBRelease
- # tgz inputs for evgen may be negligible
- if re.search('\.tar\.gz',file.lfn) == None:
- totalNumInputs += 1
- totalInputSize += file.fsize
- except:
- pass
- # destinationSE
- if file.type in ['output','log'] and destSE != '':
- if job.prodSourceLabel == 'user' and job.computingSite == file.destinationSE:
- pass
- elif destSE == 'local':
- pass
- else:
- file.destinationSE = destSE
- # pre-assign GUID to log
- if file.type == 'log':
- # get lock
- fcntl.flock(_lockGetUU.fileno(), fcntl.LOCK_EX)
- # generate GUID
- file.GUID = commands.getoutput('uuidgen')
- # release lock
- fcntl.flock(_lockGetUU.fileno(), fcntl.LOCK_UN)
- # send log messages
- try:
- for message in loggerMessages:
- # get logger
- _pandaLogger = PandaLogger()
- _pandaLogger.lock()
- _pandaLogger.setParams({'Type':'brokerage'})
- logger = _pandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- logger.warning(message)
- # release HTTP handler
- _pandaLogger.release()
- time.sleep(1)
- except:
- pass
- # send analysis brokerage info when jobs are submitted
- if len(jobs) > 0 and jobs[0] != None and not forAnalysis and not pd2pT1 and specialWeight=={}:
- # for analysis job. FIXME once ganga is updated to send analy brokerage info
- if jobs[0].prodSourceLabel in ['user','panda'] and jobs[0].processingType in ['pathena','prun']:
- # send countryGroup
- tmpMsgList = []
- tmpNumJobs = len(jobs)
- if jobs[0].prodSourceLabel == 'panda':
- tmpNumJobs -= 1
- tmpMsg = 'nJobs=%s ' % tmpNumJobs
- if jobs[0].countryGroup in ['NULL','',None]:
- tmpMsg += 'countryGroup=None'
- else:
- tmpMsg += 'countryGroup=%s' % jobs[0].countryGroup
- tmpMsgList.append(tmpMsg)
- # send log
- sendMsgToLoggerHTTP(tmpMsgList,jobs[0])
- # finished
- tmpLog.debug('finished')
- if getWeight:
- return weightUsedByBrokerage
- except:
- type, value, traceBack = sys.exc_info()
- tmpLog.error("schedule : %s %s" % (type,value))
- if getWeight:
- return {}
-
diff --git a/current/pandaserver/brokerage/broker_util.py b/current/pandaserver/brokerage/broker_util.py
deleted file mode 100755
index ca8564a91..000000000
--- a/current/pandaserver/brokerage/broker_util.py
+++ /dev/null
@@ -1,399 +0,0 @@
-import re
-import urllib
-import time
-import sys
-import types
-import commands
-import xml.dom.minidom
-
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-_log = PandaLogger().getLogger('broker_util')
-
-# curl class
-class _Curl:
- # constructor
- def __init__(self,useProxy=False):
- # path to curl
- self.path = 'curl --user-agent "dqcurl" -m 180'
- # verification of the host certificate
- self.verifyHost = False
- # use proxy
- if useProxy and panda_config.httpProxy != '':
- self.path = 'env http_proxy=%s %s' % (panda_config.httpProxy,self.path)
-
- # GET method
- def get(self,url,data={}):
- # make command
- com = '%s --silent --get' % self.path
- if not self.verifyHost:
- com += ' --insecure'
- # data
- for key,value in data.iteritems():
- com += ' --data "%s"' % urllib.urlencode({key:value})
- com += ' %s' % url
- # execute
- _log.debug(com)
- ret = commands.getstatusoutput(com)
- _log.debug(ret)
- return ret
-
-
-# get default storage
-def _getDefaultStorage(baseURL,sePath=None,seProdPath={}):
- _log.debug('_getDefaultStorage (%s %s %s)' % (baseURL,sePath,seProdPath))
- # use se+seprodpath when baseURL=''
- if baseURL=='':
- # get token
- match = re.search('^token:([^:]+):',sePath)
- if match == None:
- _log.error("could not get token from %s" % sePath)
- return ""
- token = match.group(1)
- # get corresponding path
- if not seProdPath.has_key(token):
- _log.error("could not find path for %s in %s" % (token,seProdPath))
- return ""
- # set se+seprodpath
- out = sePath+seProdPath[token]
- # append /
- if not out.endswith('/'):
- out += '/'
- _log.debug(out)
- else:
- # check port to set proxy
- useProxy = False
- if panda_config.httpProxy != '':
- pMatch = re.search('http://[^:/]+:*(\d+)/',baseURL)
- if pMatch == None:
- # default port
- useProxy = True
- elif pMatch.group(1) == '80':
- # standard port
- useProxy = True
- # instantiate curl
- curl = _Curl(useProxy)
- # get default storage
- url = baseURL + 'storages/default'
- status,out = curl.get(url)
- _log.debug(out)
- if status != 0:
- _log.error("could not get default storage from %s:%s" % (baseURL,status))
- return ""
- # parse
- match = re.search('^[^/]+://[^/]+(/.+)$',out)
- if match == None:
- _log.error("could not parse string : %s" % out)
- return ""
- return match.group(1)
-
-
-# get PoolFileCatalog
-def _getPoolFileCatalog(lfns,dq2url):
- _log.debug('_getPoolFileCatalog')
- # check port to set proxy
- useProxy = False
- if panda_config.httpProxy != '':
- pMatch = re.search('http://[^:/]+:*(\d+)/',dq2url)
- if pMatch == None:
- # default port
- useProxy = True
- elif pMatch.group(1) == '80':
- # standard port
- useProxy = True
- # instantiate curl
- curl = _Curl(useProxy)
- # get PoolFileCatalog
- iLFN = 0
- outXML =''
- strLFNs = ''
- if not dq2url.endswith('_'):
- url = dq2url + '/lrc/PoolFileCatalog'
- else:
- # NDGF LRC
- url = dq2url + 'lrc/PoolFileCatalog'
- for lfn in lfns:
- iLFN += 1
- # make argument
- strLFNs += '%s ' % lfn
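- # query the catalog in chunks of 40 LFNs to keep each request short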
- if iLFN % 40 == 0 or iLFN == len(lfns):
- # get PoolFileCatalog
- strLFNs = strLFNs.rstrip()
- data = {'lfns':strLFNs}
- # avoid too long argument
- strLFNs = ''
- # execute
- status,out = curl.get(url,data)
- _log.debug(status)
- # sleep
- time.sleep(2)
- if status != 0:
- _log.error("_getPoolFileCatalog : %s %s %s" % (dq2url,status,out))
- return status
- if status != 0 or out.startswith('Error'):
- continue
- if not out.startswith('<?xml version="1.0" encoding="UTF-8" standalone="no" ?>'):
- continue
- # append this chunk of the catalog
- outXML += out
- # strip the redundant POOL trailer/header markup that separates the concatenated
- # chunks so that outXML forms a single PoolFileCatalog document
- th = \
-"""
-</POOLFILECATALOG>
-<\?xml version="1.0" encoding="UTF-8" standalone="no" \?>
-<!-- Edited By POOL -->
-<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">
-<POOLFILECATALOG>
-"""
- outXML = re.sub(th,'',outXML)
-
- # return XML
- return outXML
-
-
-# get files from MySQL
-def _getPFNFromMySQL(lfns,dq2url):
- _log.debug('_getPFNFromMySQL')
- import MySQLdb
- comment = ' /* broker_util._getPFNFromMySQL */'
- outStr = ''
- # parse connection string
- match = re.search('^mysql://([^:]+):([^@]+)@([^/:]+):(\d+)/(.+)$',dq2url)
- if match == None:
- return outStr
- # parameters for DB connection
- connStr = "mysql -h %s -u %s -p%s -P %s %s"
- dbhost = match.group(3)
- dbuser = match.group(1)
- dbpswd = match.group(2)
- dbport = int(match.group(4))
- dbname = match.group(5)
- connStr = "mysql -h %s -u %s -p%s -P %s %s" % (dbhost,dbuser,dbpswd,dbport,dbname)
- try:
- _log.debug(connStr)
- # connect
- dbConn = MySQLdb.connect(db=dbname,host=dbhost,port=dbport,user=dbuser,passwd=dbpswd)
- # make cursor
- dbCur = dbConn.cursor()
- # query files
- iLFN = 0
- strLFNs = ''
- for lfn in lfns:
- iLFN += 1
- # make argument
- strLFNs += " lfname='%s' OR " % lfn
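- # query in chunks of 40 LFNs to keep each SQL statement short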
- if iLFN % 40 == 0 or iLFN == len(lfns):
- # query this chunk of LFNs
- strLFNs = strLFNs[:-3]
- # construct SQL
- sql = 'SELECT lfname FROM t_lfn WHERE %s' % strLFNs
- # reset
- strLFNs = ''
- # execute
- _log.debug(sql)
- dbCur.execute(sql+comment)
- res = dbCur.fetchall()
- _log.debug(res)
- # append LFNs
- if res != None and len(res) != 0:
- for resLFN in res:
- outStr += '%s ' % resLFN
- # close cursor
- dbCur.close()
- # close connection
- dbConn.close()
- except:
- type, value, traceBack = sys.exc_info()
- _log.error("_getPFNFromMySQL : %s %s %s" % (dq2url,type,value))
- return -1
- # return
- return outStr
-
-
-# get files from LFC
-def _getPFNFromLFC(lfns,dq2url,guids,storageName):
- _log.debug('_getPFNFromLFC')
- outStr = ''
- # check parameters
- if guids == [] or storageName == [] or (len(lfns) != len(guids)):
- return outStr
- # extract LFC host
- lfcHost = re.sub('[/:]',' ',dq2url).split()[1]
- # loop over all LFNs
- iLFN = 0
- nLFN = 1000
- strFiles = ''
- outStr = ''
- for iLFN in range(len(lfns)):
- strFiles += '%s %s\n' % (lfns[iLFN],guids[iLFN])
- # bulk operation
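- # flush every 1000 LFN/GUID pairs (or at the end of the list) via an external LFCclient.py call in a clean grid environment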
- if (iLFN+1) % nLFN == 0 or (iLFN+1) >= len(lfns):
- # write to file
- inFileName = '%s/lfcin.%s' % (panda_config.logdir,commands.getoutput('uuidgen'))
- ifile = open(inFileName,'w')
- ifile.write(strFiles)
- ifile.close()
- # construct commands
- strStorage = ''
- for storage in storageName:
- strStorage += '%s,' % storage
- strStorage = strStorage[:-1]
- com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd)
- com+= 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; '
- com+= 'source %s; %s/python -Wignore %s/LFCclient.py -f %s -l %s -s %s' % \
- (panda_config.glite_source,panda_config.native_python32,panda_config.lfcClient_dir,
- inFileName,lfcHost,strStorage)
- _log.debug(com)
- # execute
- status,output = commands.getstatusoutput(com)
- _log.debug(status)
- if status == 0:
- outStr += output
- else:
- _log.error("_getPFNFromLFC : %s %s %s" % (dq2url,status,output))
- # send message to logger
- try:
- # make message
- message = 'LFC access : %s %s %s' % (dq2url,status,output)
- # get logger
- _pandaLogger = PandaLogger()
- _pandaLogger.lock()
- _pandaLogger.setParams({'Type':'broker_util'})
- logger = _pandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- logger.error(message)
- # release HTTP handler
- _pandaLogger.release()
- except:
- pass
- return status
- # reset
- strFiles = ''
- # return
- return outStr
-
-
-# get files from LRC
-def getFilesFromLRC(files,url,guids=[],storageName=[],terminateWhenFailed=False,getPFN=False):
- _log.debug('getFilesFromLRC "%s" %s' % (url,str(storageName)))
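- # dispatch on the URL scheme: mysql:// queries the LRC DB directly, http:// uses the LRC HTTP interface, lfc:// goes through the LFC client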
- # get PFC
- outSTR = ''
- if url.startswith('mysql://'):
- # from MySQL
- outSTR = _getPFNFromMySQL(files,url)
- # get PFN
- if getPFN:
- outPFN = {}
- # FIXME
- _log.debug('RetPFN:%s ' % str(outPFN))
- return outPFN
- elif url.startswith('http://'):
- # from HTTP I/F
- outSTR = _getPoolFileCatalog(files,url)
- # get PFN
- if getPFN:
- outPFN = {}
- try:
- if not outSTR in ['',None]:
- root = xml.dom.minidom.parseString(outSTR)
- fileNodes = root.getElementsByTagName('File')
- for file in fileNodes:
- # get PFN and LFN nodes
- physical = file.getElementsByTagName('physical')[0]
- pfnNode = physical.getElementsByTagName('pfn')[0]
- logical = file.getElementsByTagName('logical')[0]
- lfnNode = logical.getElementsByTagName('lfn')[0]
- # convert UTF8 to Raw
- pfn = str(pfnNode.getAttribute('name'))
- lfn = str(lfnNode.getAttribute('name'))
- # assign
- if not outPFN.has_key(lfn):
- outPFN[lfn] = []
- outPFN[lfn].append(pfn)
- except:
- type, value, traceBack = sys.exc_info()
- _log.error(outSTR)
- _log.error("could not parse XML - %s %s" % (type, value))
- _log.debug('RetPFN:%s ' % str(outPFN))
- return outPFN
- elif url.startswith('lfc://'):
- # from LFC
- outSTR = _getPFNFromLFC(files,url,guids,storageName)
- # get PFN
- if getPFN:
- outPFN = {}
- try:
- if not outSTR in ['',None]:
- tmpItems = outSTR.split('LFCRet :')
- tmpItems.remove('')
- # loop over all returns
- for tmpItem in tmpItems:
- exec "tmpLFNmap = %s" % tmpItem
- for tmpLFN,tmpPFN in tmpLFNmap.iteritems():
- outPFN[tmpLFN] = tmpPFN
- except:
- type, value, traceBack = sys.exc_info()
- _log.error(outSTR)
- _log.error("could not parse LFC ret - %s %s" % (type, value))
- _log.debug('RetPFN:%s ' % str(outPFN))
- return outPFN
- # check return
- if not isinstance(outSTR,types.StringType):
- if terminateWhenFailed:
- return None
- # set empty string
- outSTR = ''
- # collect OK Files
- okFiles = []
- for file in files:
- if re.search(file,outSTR) != None:
- okFiles.append(file)
- _log.debug('Ret:%s ' % str(okFiles))
- return okFiles
-
-
-# get # of files from LRC
-def getNFilesFromLRC(files,url):
- _log.debug('getNFilesFromLRC')
- # get okFiles
- okFiles = getFilesFromLRC(files,url)
- nFiles = len(okFiles)
- _log.debug('Ret:%s ' % nFiles)
- return nFiles
-
-
-# get list of missing LFNs from LRC
-def getMissLFNsFromLRC(files,url,guids=[],storageName=[]):
- _log.debug('getMissLFNsFromLRC')
- # get OK files
- okFiles = getFilesFromLRC(files,url,guids,storageName)
- # collect missing files
- missFiles = []
- for file in files:
- if not file in okFiles:
- missFiles.append(file)
- _log.debug('Ret:%s ' % str(missFiles))
- return missFiles
-
-
-# extract list of se hosts from schedconfig
-def getSEfromSched(seStr):
- tmpSE = []
- if seStr != None:
- for tmpSrcSiteSE in seStr.split(','):
- # extract host
- match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE)
- if match != None:
- tmpSE.append(match.group(1))
- # sort
- tmpSE.sort()
- # return
- return tmpSE
-
-
diff --git a/current/pandaserver/config/__init__.py b/current/pandaserver/config/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/current/pandaserver/config/panda_config.py b/current/pandaserver/config/panda_config.py
deleted file mode 100755
index 68034b586..000000000
--- a/current/pandaserver/config/panda_config.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import re
-import sys
-import commands
-from liveconfigparser.LiveConfigParser import LiveConfigParser
-
-# get ConfigParser
-tmpConf = LiveConfigParser()
-
-# read
-tmpConf.read('panda_server.cfg')
-
-# get server section
-tmpDict = tmpConf.server
-
-# expand all values
-tmpSelf = sys.modules[ __name__ ]
-for tmpKey,tmpVal in tmpDict.iteritems():
- # convert string to bool/int
- if tmpVal == 'True':
- tmpVal = True
- elif tmpVal == 'False':
- tmpVal = False
- elif re.match('^\d+$',tmpVal):
- tmpVal = int(tmpVal)
- # update dict
- tmpSelf.__dict__[tmpKey] = tmpVal
-
-# set hostname
-tmpSelf.__dict__['pserverhost'] = commands.getoutput('hostname -f')
-
-# change the number of database connections for FastCGI/WSGI
-if tmpSelf.__dict__['useFastCGI'] or tmpSelf.__dict__['useWSGI']:
- tmpSelf.__dict__['nDBConnection'] = tmpSelf.__dict__['nDBConForFastCGIWSGI']
diff --git a/current/pandaserver/dataservice/Activator.py b/current/pandaserver/dataservice/Activator.py
deleted file mode 100755
index af3909050..000000000
--- a/current/pandaserver/dataservice/Activator.py
+++ /dev/null
@@ -1,47 +0,0 @@
-'''
-activate job
-
-'''
-
-import threading
-
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('Activator')
-
-
-class Activator (threading.Thread):
- # constructor
- def __init__(self,taskBuffer,dataset,enforce=False):
- threading.Thread.__init__(self)
- self.dataset = dataset
- self.taskBuffer = taskBuffer
- self.enforce = enforce
-
-
- # main
- def run(self):
- _logger.debug("start: %s" % self.dataset.name)
- if self.dataset.status in ['completed','deleting','deleted'] and not self.enforce:
- _logger.debug(" skip: %s" % self.dataset.name)
- else:
- # update input files
- ids = self.taskBuffer.updateInFilesReturnPandaIDs(self.dataset.name,'ready')
- _logger.debug("IDs: %s" % ids)
- if len(ids) != 0:
- # get job
- jobs = self.taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False)
- # remove None and unknown
- acJobs = []
- for job in jobs:
- if job == None or job.jobStatus == 'unknown':
- continue
- acJobs.append(job)
- # activate
- self.taskBuffer.activateJobs(acJobs)
- # update dataset in DB
- if self.dataset.type == 'dispatch':
- self.dataset.status = 'completed'
- self.taskBuffer.updateDatasets([self.dataset])
- _logger.debug("end: %s" % self.dataset.name)
diff --git a/current/pandaserver/dataservice/Adder.py b/current/pandaserver/dataservice/Adder.py
deleted file mode 100755
index 7209704e5..000000000
--- a/current/pandaserver/dataservice/Adder.py
+++ /dev/null
@@ -1,742 +0,0 @@
-'''
-add data to dataset
-
-'''
-
-import os
-import re
-import sys
-import time
-import fcntl
-import commands
-import threading
-import xml.dom.minidom
-import ErrorCode
-import brokerage.broker_util
-from DDM import ddm
-from Closer import Closer
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('Adder')
-
-
-class Adder (threading.Thread):
- # constructor
- def __init__(self,taskBuffer,jobID,fileCatalog,jobStatus,xmlFile='',ignoreDDMError=True,joinCloser=False,
- addOutput=False,pandaDDM=False,siteMapper=None,attemptNr=None):
- threading.Thread.__init__(self)
- self.job = None
- self.jobID = jobID
- self.jobStatus = jobStatus
- self.taskBuffer = taskBuffer
- self.ignoreDDMError = ignoreDDMError
- self.joinCloser = joinCloser
- self.addOutput = addOutput
- self.pandaDDM = pandaDDM
- self.lockXML = None
- self.datasetMap = {}
- self.siteMapper = siteMapper
- self.addToTopOnly = False
- self.goToTransferring = False
- self.subscriptionMap = {}
- self.attemptNr = attemptNr
- # dump Catalog into file
- if xmlFile=='':
- if attemptNr == None:
- self.xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,jobID,jobStatus,
- commands.getoutput('uuidgen'))
- else:
- self.xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,jobID,jobStatus,
- commands.getoutput('uuidgen'),attemptNr)
- file = open(self.xmlFile,'w')
- file.write(fileCatalog)
- file.close()
- else:
- self.xmlFile = xmlFile
-
-
- # main
- def run(self):
- try:
- _logger.debug("%s new start: %s" % (self.jobID,self.jobStatus))
- # lock XML except last trial
- if self.addOutput and self.ignoreDDMError:
- self.lockXML = open(self.xmlFile)
- try:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
- except:
- _logger.debug("%s cannot get lock : %s" % (self.jobID,self.xmlFile))
- self.lockXML.close()
- return
- # query job
- self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
- fromArchived=False,
- fromWaiting=False)[0]
- # check if job has finished
- if self.job == None:
- _logger.debug('%s : not found' % self.jobID)
- elif self.job.jobStatus in ['finished','failed','unknown','cancelled']:
- _logger.error('%s : invalid state -> %s' % (self.jobID,self.job.jobStatus))
- else:
- # add files only to top-level datasets for transferring jobs
- if self.job.jobStatus == 'transferring':
- self.addToTopOnly = True
- _logger.debug("%s adder for transferring" % self.jobID)
- # use PandaDDM for ddm jobs
- if self.job.prodSourceLabel == 'ddm':
- self.pandaDDM = True
- # set job status
- self.job.jobStatus = self.jobStatus
- # add outputs. Cannot add self.pandaDDM here since minidom.parse() produces seg-fault
- if self.addOutput:
- # check if the job should go to transferring
- tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm
- tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se)
- destSEwasSet = False
- if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(self.job.destinationSE):
- # DQ2 ID was set by using --destSE for analysis job to transfer output
- destSEwasSet = True
- tmpDstDDM = self.job.destinationSE
- tmpDstSEs = self.job.destinationSE
- else:
- tmpDstDDM = self.siteMapper.getSite(self.job.destinationSE).ddm
- tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.destinationSE).se)
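- # stay out of transferring when source and destination are effectively the same (same site, DQ2 ID, or SEs), for analysis/BNL/reprocessing sites, when only adding to top-level datasets, or when the job failed; otherwise flag it for transferring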
- if re.search('^ANALY_',self.job.computingSite) != None:
- # analysis site
- pass
- elif (re.search('BNL', self.job.computingSite) != None or self.job.computingSite == "TPATHENA"):
- # BNL
- pass
- elif self.job.computingSite == self.job.destinationSE:
- # same site ID for computingSite and destinationSE
- pass
- elif tmpSrcDDM == tmpDstDDM:
- # same DQ2ID for src/dest
- pass
- elif tmpSrcSEs == tmpDstSEs:
- # same SEs
- pass
- elif self.job.computingSite.endswith("_REPRO"):
- # reprocessing sites
- pass
- elif self.addToTopOnly:
- # already in transferring
- pass
- elif self.job.jobStatus == 'failed':
- # failed jobs
- pass
- else:
- self.goToTransferring = True
- self._updateOutputs()
- else:
- _logger.debug('%s : not added' % self.jobID)
- _logger.debug('%s escape' % self.jobID)
- return
- _logger.debug('%s updated outputs' % self.jobID)
- # ignore DDMError
- if self.ignoreDDMError and \
- (re.search('could not add files',self.job.ddmErrorDiag) != None or \
- re.search('could not register subscription',self.job.ddmErrorDiag) != None) and \
- re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None and \
- re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None and \
- re.search('DQUnknownDatasetException',self.job.ddmErrorDiag) == None and \
- re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None and \
- re.search('KeyError',self.job.ddmErrorDiag) == None:
- _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag))
- _logger.debug('%s escape' % self.jobID)
- # unlock XML
- try:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- self.lockXML.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s cannot unlock XML" % self.jobID)
- return
- # update shadow dataset
- if self.job.prodSourceLabel == 'user' and self.jobStatus == 'finished' and self.job.ddmErrorDiag == 'NULL' \
- and not self.goToTransferring:
- self._updateShadow()
- # ignore DDMError
- if self.ignoreDDMError and re.search('could not add files',self.job.ddmErrorDiag) != None \
- and re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None \
- and re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None \
- and re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None \
- and re.search('KeyError',self.job.ddmErrorDiag) == None:
- _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag))
- _logger.debug('%s escape' % self.jobID)
- # unlock XML
- try:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- self.lockXML.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s cannot unlock XML" % self.jobID)
- return
- # set file status
- if self.job.jobStatus == 'failed':
- for file in self.job.Files:
- if file.type == 'output' or file.type == 'log':
- file.status = 'failed'
- else:
- # reset errors
- self.job.jobDispatcherErrorCode = 0
- self.job.jobDispatcherErrorDiag = 'NULL'
- # set job status
- hasOutput = False
- if self.goToTransferring or self.subscriptionMap != {}:
- # set status to transferring
- for file in self.job.Files:
- if file.type == 'output' or file.type == 'log' or \
- self.subscriptionMap.has_key(file.destinationDBlock):
- file.status = 'transferring'
- hasOutput = True
- if hasOutput:
- self.job.jobStatus = 'transferring'
- # propagate transition to prodDB
- self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- # endtime
- if self.job.endTime=='NULL':
- self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- # set cancelled state
- if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
- self.job.jobStatus = 'cancelled'
- # update job
- retU = self.taskBuffer.updateJobs([self.job],False)
- _logger.debug("%s retU: %s" % (self.jobID,retU))
- # failed
- if not retU[0]:
- _logger.error('failed to update DB for %s' % self.jobID)
- # unlock XML
- try:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- self.lockXML.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s cannot unlock XML" % self.jobID)
- return
- # setup for closer
- destDBList = []
- guidList = []
- for file in self.job.Files:
- # ignore inputs
- if file.type == 'input':
- continue
- # start closer for output/log datasets
- if not file.destinationDBlock in destDBList:
- destDBList.append(file.destinationDBlock)
- # collect GUIDs
- if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test'] and \
- self.job.processingType in ['pathena','gangarobot-rctest'])) \
- and file.type == 'output':
- guidList.append({'lfn':file.lfn, 'guid':file.GUID, 'type':file.type})
- if guidList != []:
- retG = self.taskBuffer.setGUIDs(guidList)
- if destDBList != []:
- # start Closer
- cThr = Closer(self.taskBuffer,destDBList,self.job,pandaDDM=self.pandaDDM,
- datasetMap=self.datasetMap)
- _logger.debug("%s start Closer" % self.jobID)
- cThr.start()
- if self.joinCloser:
- cThr.join()
- _logger.debug("%s end Closer" % self.jobID)
- _logger.debug("%s end" % self.jobID)
- try:
- # remove Catalog
- os.remove(self.xmlFile)
- except:
- pass
- # unlock XML
- if self.lockXML != None:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- self.lockXML.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s except" % self.jobID)
- # unlock XML just in case
- try:
- if self.lockXML != None:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s cannot unlock XML" % self.jobID)
-
-
- # update output files
- def _updateOutputs(self):
- # get LFN and GUID
- _logger.debug("%s %s" % (self.jobID,self.xmlFile))
- # no outputs
- if self.job.Files == []:
- _logger.debug("%s has no outputs" % self.jobID)
- _logger.debug("%s addFiles end" % self.jobID)
- return
- # get input files
- inputLFNs = []
- for file in self.job.Files:
- if file.type == 'input':
- inputLFNs.append(file.lfn)
- # parse XML
- lfns = []
- guids = []
- fsizes = []
- md5sums = []
- chksums = []
- try:
- root = xml.dom.minidom.parse(self.xmlFile)
- files = root.getElementsByTagName('File')
- for file in files:
- # get GUID
- guid = str(file.getAttribute('ID'))
- _logger.debug(guid)
- # get PFN and LFN nodes
- logical = file.getElementsByTagName('logical')[0]
- lfnNode = logical.getElementsByTagName('lfn')[0]
- # convert UTF8 to Raw
- lfn = str(lfnNode.getAttribute('name'))
- # get metadata
- fsize = None
- md5sum = None
- adler32 = None
- for meta in file.getElementsByTagName('metadata'):
- # get fsize
- name = str(meta.getAttribute('att_name'))
- if name == 'fsize':
- fsize = long(meta.getAttribute('att_value'))
- elif name == 'md5sum':
- md5sum = str(meta.getAttribute('att_value'))
- # check
- if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
- md5sum = None
- elif name == 'adler32':
- adler32 = str(meta.getAttribute('att_value'))
- # error check
- if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)):
- raise RuntimeError, 'fsize/md5sum/adler32=None'
- # append
- lfns.append(lfn)
- guids.append(guid)
- fsizes.append(fsize)
- md5sums.append(md5sum)
- if adler32 != None:
- # use adler32 if available
- chksums.append("ad:%s" % adler32)
- else:
- chksums.append("md5:%s" % md5sum)
- except:
- # check if file exists
- if os.path.exists(self.xmlFile):
- type, value, traceBack = sys.exc_info()
- _logger.error("%s : %s %s" % (self.jobID,type,value))
- # set failed anyway
- self.job.jobStatus = 'failed'
- # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
- if (self.job.pilotErrorCode in [0,'0','NULL']) and \
- (self.job.transExitCode in [0,'0','NULL']):
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "Adder._updateOutputs() could not get GUID/LFN/MD5/FSIZE"
- return
- else:
- # XML was deleted
- self.job.ddmErrorDiag = "Adder._updateOutputs() could not add files"
- self.ignoreDDMError = True
- return
- # check files
- idMap = {}
- fileList = []
- subMap = {}
- for file in self.job.Files:
- if file.type == 'input':
- if file.lfn in lfns:
- if self.job.prodSourceLabel in ['user','panda']:
- # skipped file
- file.status = 'skipped'
- elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']:
- # failed by pilot
- file.status = 'failed'
- elif file.type == 'output' or file.type == 'log':
- # append to fileList
- fileList.append(file.lfn)
- # add only log file for failed jobs
- if self.jobStatus == 'failed' and file.type != 'log':
- continue
- # add only log file for unmerge jobs
- if self.job.prodSourceLabel == 'panda' and self.job.processingType in ['unmerge'] \
- and file.type != 'log':
- continue
- # look for GUID with LFN
- try:
- i = lfns.index(file.lfn)
- file.GUID = guids[i]
- file.fsize = fsizes[i]
- file.md5sum = md5sums[i]
- file.checksum = chksums[i]
- # status
- file.status = 'ready'
- # fsize
- fsize = None
- if not file.fsize in ['NULL','',0]:
- try:
- fsize = long(file.fsize)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s : %s %s" % (self.jobID,type,value))
- # append to map
- if not idMap.has_key(file.destinationDBlock):
- idMap[file.destinationDBlock] = []
- idMap[file.destinationDBlock].append({'guid' : file.GUID,
- 'lfn' : lfns[i],
- 'size' : fsize,
- 'checksum' : file.checksum})
- # for subscription
- if self.job.prodSourceLabel in ['managed','test','software','rc_test','ptest','user'] and \
- re.search('_sub\d+$',file.destinationDBlock) != None and (not self.addToTopOnly) and \
- self.job.destinationSE != 'local':
- if self.siteMapper == None:
- _logger.error("%s : SiteMapper==None" % self.jobID)
- else:
- # get dataset spec
- if not self.datasetMap.has_key(file.destinationDBlock):
- tmpDS = self.taskBuffer.queryDatasetWithMap({'name':file.destinationDBlock})
- self.datasetMap[file.destinationDBlock] = tmpDS
- # check if valid dataset
- if self.datasetMap[file.destinationDBlock] == None:
- _logger.error("%s : cannot find %s in DB" % (self.jobID,file.destinationDBlock))
- else:
- if not self.datasetMap[file.destinationDBlock].status in ['defined']:
- # not a fresh dataset
- _logger.debug("%s : subscription was already made for %s:%s" % \
- (self.jobID,self.datasetMap[file.destinationDBlock].status,
- file.destinationDBlock))
- else:
- # get DQ2 IDs
- tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm
- tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se)
- if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE):
- # DQ2 ID was set by using --destSE for analysis job to transfer output
- tmpDstDDM = file.destinationSE
- tmpDstSEs = file.destinationSE
- else:
- tmpDstDDM = self.siteMapper.getSite(file.destinationSE).ddm
- tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(file.destinationSE).se)
- # if src != dest or multi-token
- if (tmpSrcDDM != tmpDstDDM and tmpSrcSEs != tmpDstSEs) or \
- (tmpSrcDDM == tmpDstDDM and file.destinationDBlockToken.count(',') != 0):
- optSub = {'DATASET_COMPLETE_EVENT' : ['https://%s:%s/server/panda/datasetCompleted' % \
- (panda_config.pserverhost,panda_config.pserverport)]}
- # append
- if not subMap.has_key(file.destinationDBlock):
- subMap[file.destinationDBlock] = []
- # sources
- optSource = {}
- # set sources for NL/FR/ES to handle T2s in another cloud
- if self.job.cloud in ['NL','FR','ES']:
- if file.destinationDBlockToken in ['NULL','']:
- # use default DQ2 ID as source
- optSource[tmpSrcDDM] = {'policy' : 0}
- else:
- # convert token to DQ2 ID
- dq2ID = tmpSrcDDM
- # use the first token's location as source for T1D1
- tmpSrcToken = file.destinationDBlockToken.split(',')[0]
- if self.siteMapper.getSite(self.job.computingSite).setokens.has_key(tmpSrcToken):
- dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens[tmpSrcToken]
- optSource[dq2ID] = {'policy' : 0}
- # use another location when token is set
- if not file.destinationDBlockToken in ['NULL','']:
- tmpDQ2IDList = []
- tmpDstTokens = file.destinationDBlockToken.split(',')
- # remove the first one because it is already used as a location
- if tmpSrcDDM == tmpDstDDM:
- tmpDstTokens = tmpDstTokens[1:]
- # loop over all tokens
- for idxToken,tmpDstToken in enumerate(tmpDstTokens):
- dq2ID = tmpDstDDM
- if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpDstToken):
- dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpDstToken]
- # keep the first destination for multi-hop
- if idxToken == 0:
- firstDestDDM = dq2ID
- else:
- # use the first destination as source for T1D1
- optSource = {}
- optSource[firstDestDDM] = {'policy' : 0}
- # remove looping subscription
- if dq2ID == tmpSrcDDM:
- continue
- # avoid duplication
- if not dq2ID in tmpDQ2IDList:
- subMap[file.destinationDBlock].append((dq2ID,optSub,optSource))
- else:
- # use default DDM
- for dq2ID in tmpDstDDM.split(','):
- subMap[file.destinationDBlock].append((dq2ID,optSub,optSource))
- except:
- # status
- file.status = 'failed'
- type, value, traceBack = sys.exc_info()
- _logger.error("%s : %s %s" % (self.jobID,type,value))
- # cleanup submap
- tmpKeys = subMap.keys()
- for tmpKey in tmpKeys:
- if subMap[tmpKey] == []:
- del subMap[tmpKey]
- # check consistency between XML and filesTable
- for lfn in lfns:
- if (not lfn in fileList) and (not lfn in inputLFNs):
- _logger.error("%s %s is not found in filesTable" % (self.jobID,lfn))
- self.job.jobStatus = 'failed'
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "Adder._updateOutputs() XML is inconsistent with filesTable"
- return
- # return if PandaDDM is used or non-DQ2
- if self.pandaDDM or self.job.destinationSE == 'local':
- return
- # add data to original dataset
- for destinationDBlock in idMap.keys():
- match = re.findall('(.+)_sub\d+$',destinationDBlock)
- if len(match):
- # add files to top-level datasets
- if not self.goToTransferring:
- origDBlock = match[0]
- idMap[origDBlock] = idMap[destinationDBlock]
- # add files to top-level datasets only
- if self.addToTopOnly:
- del idMap[destinationDBlock]
- # print idMap
- _logger.debug("%s idMap = %s" % (self.jobID,idMap))
- # add data
- _logger.debug("%s addFiles start" % self.jobID)
- # number of retry
- nTry = 3
- for iTry in range(nTry):
- # empty
- if idMap == {}:
- break
- # add data to datasets
- time.sleep(1)
- _logger.debug((self.jobID, 'registerFilesInDatasets',idMap))
- status,out = ddm.DQ2.main('registerFilesInDatasets',idMap)
- isFailed = False
- if status != 0 and out.find('DQFileExistsInDatasetException') == -1 \
- and (out.find('The file LFN or GUID is already registered') == -1 or \
- out.find('already registered in vuid') == -1):
- isFailed = True
- if not isFailed:
- _logger.debug('%s %s' % (self.jobID,out))
- # failed
- if isFailed:
- _logger.error('%s %s' % (self.jobID,out))
- if (iTry+1) == nTry or out.find('DQClosedDatasetException') != 0 or \
- out.find('DQFrozenDatasetException') != 0 or \
- out.find('DQUnknownDatasetException') != 0 or \
- out.find('DQFileMetaDataMismatchException') != 0:
- self.job.jobStatus = 'failed'
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- errMsg = "Adder._updateOutputs() could not add files to %s\n" % idMap.keys()
- self.job.ddmErrorDiag = errMsg + out.split('\n')[-1]
- return
- _logger.error("%s Try:%s" % (self.jobID,iTry))
- # sleep
- time.sleep(120)
- else:
- break
- # register dataset subscription
- subActivity = 'Production'
- if not self.job.prodSourceLabel in ['user']:
- # make DQ2 subscription for prod jobs
- for tmpName,tmpVal in subMap.iteritems():
- for dq2ID,optSub,optSource in tmpVal:
- _logger.debug((self.jobID,'registerDatasetSubscription',tmpName,dq2ID,0,0,optSub,
- optSource,001000 | 010000,0,None,0,"production",None,subActivity,None,"14 days"))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('registerDatasetSubscription',tmpName,dq2ID,0,0,optSub,
- optSource,001000 | 010000,0,None,0,"production",None,subActivity,None,"14 days")
- if (status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1) and \
- out.find('DQSubscriptionExistsException') == -1:
- time.sleep(60)
- else:
- break
- if status != 0 and (out != 'None' and out.find('DQSubscriptionExistsException') == -1):
- _logger.error('%s %s' % (self.jobID,out))
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "Adder._updateOutputs() could not register subscription : %s" % tmpName
- return
- _logger.debug('%s %s' % (self.jobID,out))
- # set dataset status
- self.datasetMap[tmpName].status = 'running'
- # keep subscriptions
- self.subscriptionMap = subMap
- else:
- # send request to DaTRI
- tmpTopDatasets = {}
- # collect top-level datasets
- for tmpName,tmpVal in subMap.iteritems():
- for dq2ID,optSub,optSource in tmpVal:
- tmpTopName = re.sub('_sub\d+','',tmpName)
- # append
- if not tmpTopDatasets.has_key(tmpTopName):
- tmpTopDatasets[tmpTopName] = []
- if not dq2ID in tmpTopDatasets[tmpTopName]:
- tmpTopDatasets[tmpTopName].append(dq2ID)
- # remove redundant CN from DN
- tmpDN = self.job.prodUserID
- tmpDN = re.sub('/CN=limited proxy','',tmpDN)
- tmpDN = re.sub('(/CN=proxy)+$','',tmpDN)
- # send request
- if tmpTopDatasets != {} and self.jobStatus == 'finished':
- try:
- from datriHandler import datriHandler
- if self.job.lockedby.startswith('Ganga'):
- tmpHandler = datriHandler(type='ganga')
- else:
- tmpHandler = datriHandler(type='pathena')
- # loop over all output datasets
- for tmpDsName,dq2IDlist in tmpTopDatasets.iteritems():
- for tmpDQ2ID in dq2IDlist:
- tmpMsg = "%s %s ds=%s site=%s id=%s" % (self.jobID,'datriHandler.sendRequest',
- tmpDsName,tmpDQ2ID,tmpDN)
- _logger.debug(tmpMsg)
- tmpHandler.setParameters(data_pattern=tmpDsName,
- site=tmpDQ2ID,
- userid=tmpDN)
- # number of retry
- nTry = 3
- for iTry in range(nTry):
- dhStatus,dhOut = tmpHandler.sendRequest()
- # succeeded
- if dhStatus == 0 or "such request is exist" in dhOut:
- _logger.debug("%s %s %s" % (self.jobID,dhStatus,dhOut))
- break
- if iTry+1 < nTry:
- # sleep
- time.sleep(60)
- else:
- # final attempt failed
- tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,dhStatus,dhOut)
- _logger.error(tmpMsg)
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "DaTRI failed for %s with %s %s" % (tmpDsName,dhStatus,dhOut)
- return
- # set dataset status
- for tmpName,tmpVal in subMap.iteritems():
- self.datasetMap[tmpName].status = 'running'
- except:
- errType,errValue = sys.exc_info()[:2]
- tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,errType,errValue)
- _logger.error(tmpMsg)
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "DaTRI failed with %s %s" % (errType,errValue)
- return
- # properly finished
- _logger.debug("%s addFiles end" % self.jobID)
-
-
- # update shadow dataset
- def _updateShadow(self):
- # return if PandaDDM is used or non-DQ2
- if self.pandaDDM or self.job.destinationSE == 'local':
- return
- _logger.debug("%s updateShadow" % self.jobID)
- # get shadow DS and contents
- shadowList = []
- shadowFiles = []
- for file in self.job.Files:
- if file.type == 'output' or file.type == 'log':
- # get shadow name
- shadowDS = re.sub('_sub\d+$','',file.destinationDBlock) + '_shadow'
- if not shadowDS in shadowList:
- shadowList.append(shadowDS)
- elif file.type == 'input':
- # remove skipped files
- if file.status in ['skipped']:
- continue
- # ignore lib.tgz
- if re.search('lib\.tgz\.*\d*',file.lfn) != None:
- continue
- # ignore DBRelease
- if re.search('DBRelease',file.lfn) != None:
- continue
- # ignore when noshadow is set
- if file.destinationDBlockToken == 'noshadow':
- continue
- # fsize
- fsize = None
- if not file.fsize in ['NULL','',0]:
- try:
- fsize = long(file.fsize)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s : %s %s" % (self.jobID,type,value))
- # append
- if len(str(file.GUID))==36:
- shadowFiles.append({'guid' : file.GUID,
- 'lfn' : file.lfn,
- 'size' : fsize,
- 'checksum' : None})
- # create idMap
- idMap = {}
- for shadowDS in shadowList:
- nTry = 3
- findFlag = False
- for iTry in range(nTry):
- # check if shadow dataset exists
- _logger.debug((self.jobID, 'listDatasets',shadowDS,0,True))
- status,out = ddm.DQ2.main('listDatasets',shadowDS,0,True)
- if status == 0:
- if (out.find(shadowDS) == -1):
- _logger.debug("%s shadow %s doesn't exist" % (self.jobID,shadowDS))
- else:
- findFlag = True
- break
- # sleep
- time.sleep(120)
- # append
- if findFlag and shadowFiles != []:
- idMap[shadowDS] = shadowFiles
- # add data
- _logger.debug("%s shadow idMap = %s" % (self.jobID,idMap))
- if idMap == {}:
- return
- _logger.debug("%s addFilesToShadow start" % self.jobID)
- # number of retry
- nTry = 3
- for iTry in range(nTry):
- # add data to datasets
- time.sleep(1)
- _logger.debug((self.jobID, 'registerFilesInDatasets',idMap))
- status,out = ddm.DQ2.main('registerFilesInDatasets',idMap)
- isFailed = False
- if status != 0 and out.find('DQFileExistsInDatasetException') == -1 \
- and (out.find('The file LFN or GUID is already registered') == -1 or \
- out.find('already registered in vuid') == -1):
- isFailed = True
- if not isFailed:
- _logger.debug('%s %s' % (self.jobID,out))
- # failed
- if isFailed:
- _logger.error('%s %s' % (self.jobID,out))
- if (iTry+1) == nTry or out.find('DQClosedDatasetException') != 0 or \
- out.find('DQFrozenDatasetException') != 0 or \
- out.find('DQFileMetaDataMismatchException') != 0:
- self.job.jobStatus = 'failed'
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- errMsg = "Adder._updateOutputs() could not add files to %s\n" % idMap.keys()
- self.job.ddmErrorDiag = errMsg + out.split('\n')[-1]
- return
- _logger.error("%s shadow Try:%s" % (self.jobID,iTry))
- # sleep
- time.sleep(120)
- else:
- break
- _logger.debug("%s addFilesToShadow end" % self.jobID)
diff --git a/current/pandaserver/dataservice/Adder2.py b/current/pandaserver/dataservice/Adder2.py
deleted file mode 100644
index 521526d7b..000000000
--- a/current/pandaserver/dataservice/Adder2.py
+++ /dev/null
@@ -1,1014 +0,0 @@
-'''
-add data to dataset
-
-'''
-
-import os
-import re
-import sys
-import time
-import fcntl
-import datetime
-import commands
-import threading
-import xml.dom.minidom
-import ErrorCode
-from dq2.clientapi import DQ2
-try:
- from dq2.clientapi.cli import Register2
-except:
- pass
-
-import brokerage.broker_util
-import Closer
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('Adder')
-Closer.initLogger(_logger)
-
-
-class Adder (threading.Thread):
- # constructor
- def __init__(self,taskBuffer,jobID,fileCatalog,jobStatus,xmlFile='',ignoreDDMError=True,joinCloser=False,
- addOutput=False,pandaDDM=False,siteMapper=None,attemptNr=None):
- threading.Thread.__init__(self)
- self.job = None
- self.jobID = jobID
- self.jobStatus = jobStatus
- self.taskBuffer = taskBuffer
- self.ignoreDDMError = ignoreDDMError
- self.joinCloser = joinCloser
- self.addOutput = addOutput
- self.pandaDDM = pandaDDM
- self.lockXML = None
- self.datasetMap = {}
- self.siteMapper = siteMapper
- self.addToTopOnly = False
- self.goToTransferring = False
- self.logTransferring = False
- self.subscriptionMap = {}
- self.dq2api = None
- self.attemptNr = attemptNr
- # dump Catalog into file
- if xmlFile=='':
- if attemptNr == None:
- self.xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,jobID,jobStatus,
- commands.getoutput('uuidgen'))
- else:
- self.xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,jobID,jobStatus,
- commands.getoutput('uuidgen'),attemptNr)
- file = open(self.xmlFile,'w')
- file.write(fileCatalog)
- file.close()
- else:
- self.xmlFile = xmlFile
- # extract attemptNr
- try:
- tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
- if re.search('^\d+$',tmpAttemptNr) != None:
- self.attemptNr = int(tmpAttemptNr)
- except:
- pass
- # main
- def run(self):
- try:
- _logger.debug("%s new start: %s attemptNr=%s" % (self.jobID,self.jobStatus,self.attemptNr))
- # instantiate DQ2
- self.dq2api = DQ2.DQ2()
- # lock XML except last trial
- if self.addOutput and self.ignoreDDMError:
- self.lockXML = open(self.xmlFile)
- try:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
- except:
- _logger.debug("%s cannot get lock : %s" % (self.jobID,self.xmlFile))
- self.lockXML.close()
- return
- # query job
- self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
- fromArchived=False,
- fromWaiting=False)[0]
- # check if job has finished
- if self.job == None:
- _logger.debug('%s : not found' % self.jobID)
- elif self.job.jobStatus in ['finished','failed','unknown','cancelled']:
- _logger.error('%s : invalid state -> %s' % (self.jobID,self.job.jobStatus))
- elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
- _logger.error('%s : wrong attemptNr -> job=%s <> %s' % (self.jobID,self.job.attemptNr,self.attemptNr))
- else:
- # add files only to top-level datasets for transferring jobs
- if self.job.jobStatus == 'transferring':
- self.addToTopOnly = True
- _logger.debug("%s adder for transferring" % self.jobID)
- # use PandaDDM for ddm jobs
- if self.job.prodSourceLabel == 'ddm':
- self.pandaDDM = True
- # set job status
- self.job.jobStatus = self.jobStatus
- # add outputs. Cannot add self.pandaDDM here since minidom.parse() produces seg-fault
- if self.addOutput:
- # check if the job should go to transferring
- tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm
- tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se)
- destSEwasSet = False
- brokenSched = False
- if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(self.job.destinationSE):
- # DQ2 ID was set by using --destSE for analysis job to transfer output
- destSEwasSet = True
- tmpDstDDM = self.job.destinationSE
- tmpDstSEs = self.job.destinationSE
- else:
- tmpDstDDM = self.siteMapper.getSite(self.job.destinationSE).ddm
- tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.destinationSE).se)
- # protection against disappearance of dest from schedconfig
- if not self.siteMapper.checkSite(self.job.destinationSE) and self.job.destinationSE != 'local':
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "destinaitonSE %s is unknown in schedconfig" % self.job.destinationSE
- self.job.jobStatus = 'failed'
- self.jobStatus = 'failed'
- _logger.error("%s %s" % (self.jobID,self.job.ddmErrorDiag))
- brokenSched = True
- # protection against disappearance of src from schedconfig
- if not self.siteMapper.checkSite(self.job.computingSite):
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "computingSite %s is unknown in schedconfig" % self.job.computingSite
- self.job.jobStatus = 'failed'
- self.jobStatus = 'failed'
- _logger.error("%s %s" % (self.jobID,self.job.ddmErrorDiag))
- brokenSched = True
- _logger.debug('%s DDM src:%s dst:%s' % (self.jobID,tmpSrcDDM,tmpDstDDM))
- _logger.debug('%s SE src:%s dst:%s' % (self.jobID,tmpSrcSEs,tmpDstSEs))
- if re.search('^ANALY_',self.job.computingSite) != None:
- # analysis site
- pass
- elif self.job.computingSite == self.job.destinationSE:
- # same site ID for computingSite and destinationSE
- pass
- elif tmpSrcDDM == tmpDstDDM:
- # same DQ2ID for src/dest
- pass
- elif tmpSrcSEs == tmpDstSEs:
- # same SEs
- pass
- elif self.addToTopOnly:
- # already in transferring
- pass
- elif self.job.jobStatus == 'failed':
- # failed jobs
- if self.job.prodSourceLabel in ['managed','test']:
- self.logTransferring = True
- pass
- else:
- self.goToTransferring = True
- _logger.debug('%s goToTransferring=%s' % (self.jobID,self.goToTransferring))
- _logger.debug('%s logTransferring=%s' % (self.jobID,self.logTransferring))
- if not brokenSched:
- self._updateOutputs()
- else:
- _logger.debug('%s : not added' % self.jobID)
- _logger.debug('%s escape' % self.jobID)
- return
- _logger.debug('%s updated outputs' % self.jobID)
- # ignore DDMError
- if self.ignoreDDMError and \
- (re.search('could not add files',self.job.ddmErrorDiag) != None or \
- re.search('could not register subscription',self.job.ddmErrorDiag) != None) and \
- re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None and \
- re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None and \
- re.search('DQUnknownDatasetException',self.job.ddmErrorDiag) == None and \
- re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None and \
- re.search('DQDatasetExistsException',self.job.ddmErrorDiag) == None and \
- re.search('Exceeded the maximum number of files',self.job.ddmErrorDiag) == None and \
- re.search('KeyError',self.job.ddmErrorDiag) == None and \
- not self.job.ddmErrorCode in [ErrorCode.EC_Subscription]:
- _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag))
- _logger.debug('%s escape' % self.jobID)
- # unlock XML
- try:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- self.lockXML.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s cannot unlock XML" % self.jobID)
- return
- # update shadow dataset
- """
- if self.job.prodSourceLabel == 'user' and self.jobStatus == 'finished' and \
- (self.job.ddmErrorDiag == 'NULL' or re.search('DaTRI failed',self.job.ddmErrorDiag) != None) and \
- not self.goToTransferring:
- self._updateShadow()
- # ignore DDMError
- if self.ignoreDDMError and re.search('could not add files',self.job.ddmErrorDiag) != None \
- and re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None \
- and re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None \
- and re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None \
- and re.search('Exceeded the maximum number of files',self.job.ddmErrorDiag) == None \
- and re.search('KeyError',self.job.ddmErrorDiag) == None:
- _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag))
- _logger.debug('%s escape' % self.jobID)
- # unlock XML
- try:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- self.lockXML.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s cannot unlock XML" % self.jobID)
- return
- """
- # remove unmerged
- if self.job.processingType == 'usermerge' and self.job.prodSourceLabel == 'user' and \
- self.jobStatus == 'finished' and self.job.ddmErrorDiag == 'NULL':
- retMerge = self._removeUnmerged()
- # ignore DDMError
- if self.ignoreDDMError and retMerge == None:
- _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag))
- _logger.debug('%s escape' % self.jobID)
- # unlock XML
- try:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- self.lockXML.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s cannot unlock XML" % self.jobID)
- return
- # set file status
- if self.job.jobStatus == 'failed':
- for file in self.job.Files:
- if file.type == 'output' or file.type == 'log':
- file.status = 'failed'
- else:
- # reset errors
- self.job.jobDispatcherErrorCode = 0
- self.job.jobDispatcherErrorDiag = 'NULL'
- # set job status
- hasOutput = False
- if self.goToTransferring or self.subscriptionMap != {}:
- # set status to transferring
- for file in self.job.Files:
- if file.type == 'output' or file.type == 'log' or \
- self.subscriptionMap.has_key(file.destinationDBlock):
- file.status = 'transferring'
- hasOutput = True
- if hasOutput:
- self.job.jobStatus = 'transferring'
- # propagate transition to prodDB
- self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- # endtime
- if self.job.endTime=='NULL':
- self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- # output size and # of outputs
- self.job.nOutputDataFiles = 0
- self.job.outputFileBytes = 0
- for tmpFile in self.job.Files:
- if tmpFile.type == 'output':
- self.job.nOutputDataFiles += 1
- try:
- self.job.outputFileBytes += tmpFile.fsize
- except:
- pass
- # protection
- maxOutputFileBytes = 99999999999
- if self.job.outputFileBytes > maxOutputFileBytes:
- self.job.outputFileBytes = maxOutputFileBytes
- # set cancelled state
- if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
- self.job.jobStatus = 'cancelled'
- # update job
- retU = self.taskBuffer.updateJobs([self.job],False)
- _logger.debug("%s retU: %s" % (self.jobID,retU))
- # failed
- if not retU[0]:
- _logger.error('failed to update DB for %s' % self.jobID)
- # unlock XML
- try:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- self.lockXML.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s cannot unlock XML" % self.jobID)
- return
- # setup for closer
- destDBList = []
- guidList = []
- for file in self.job.Files:
- # ignore inputs
- if file.type == 'input':
- continue
- # start closer for output/log datasets
- if not file.destinationDBlock in destDBList:
- destDBList.append(file.destinationDBlock)
- # collect GUIDs
- if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test'] and \
- self.job.processingType in ['pathena','prun','gangarobot-rctest'])) \
- and file.type == 'output':
- guidList.append({'lfn':file.lfn,'guid':file.GUID,'type':file.type,
- 'checksum':file.checksum,'md5sum':file.md5sum,
- 'fsize':file.fsize,'scope':file.scope})
- if guidList != []:
- retG = self.taskBuffer.setGUIDs(guidList)
- if destDBList != []:
- # start Closer
- cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,pandaDDM=self.pandaDDM,
- datasetMap=self.datasetMap)
- _logger.debug("%s start Closer" % self.jobID)
- cThr.start()
- if self.joinCloser:
- cThr.join()
- _logger.debug("%s end Closer" % self.jobID)
- _logger.debug("%s end" % self.jobID)
- try:
- # remove Catalog
- os.remove(self.xmlFile)
- except:
- pass
- # unlock XML
- if self.lockXML != None:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- self.lockXML.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s except" % self.jobID)
- # unlock XML just in case
- try:
- if self.lockXML != None:
- fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("%s : %s %s" % (self.jobID,type,value))
- _logger.debug("%s cannot unlock XML" % self.jobID)
-
-
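run() above serializes concurrent Adder instances working on the same job by taking a non-blocking exclusive lock on the XML catalog file before touching it, and releasing it on every exit path. A stripped-down sketch of that idiom (the helper names are illustrative, not part of the original class):

import fcntl

def tryLockCatalog(xmlFile):
    """Return an open, exclusively locked file object, or None if another
    worker already holds the lock; LOCK_NB makes flock fail instead of block."""
    lockFile = open(xmlFile)
    try:
        fcntl.flock(lockFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        lockFile.close()
        return None
    return lockFile

def unlockCatalog(lockFile):
    # LOCK_UN releases the advisory lock before the file is closed
    fcntl.flock(lockFile.fileno(), fcntl.LOCK_UN)
    lockFile.close()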
- # update output files
- def _updateOutputs(self):
- # get LFN and GUID
- _logger.debug("%s %s" % (self.jobID,self.xmlFile))
- # no outputs
- if self.job.Files == []:
- _logger.debug("%s has no outputs" % self.jobID)
- _logger.debug("%s addFiles end" % self.jobID)
- return
- # get input files
- inputLFNs = []
- for file in self.job.Files:
- if file.type == 'input':
- inputLFNs.append(file.lfn)
- # parse XML
- lfns = []
- guids = []
- fsizes = []
- md5sums = []
- chksums = []
- surls = []
- try:
- root = xml.dom.minidom.parse(self.xmlFile)
- files = root.getElementsByTagName('File')
- for file in files:
- # get GUID
- guid = str(file.getAttribute('ID'))
- _logger.debug(guid)
- # get PFN and LFN nodes
- logical = file.getElementsByTagName('logical')[0]
- lfnNode = logical.getElementsByTagName('lfn')[0]
- # convert UTF8 to Raw
- lfn = str(lfnNode.getAttribute('name'))
- # get metadata
- fsize = None
- md5sum = None
- adler32 = None
- surl = None
- for meta in file.getElementsByTagName('metadata'):
- # get fsize
- name = str(meta.getAttribute('att_name'))
- if name == 'fsize':
- fsize = long(meta.getAttribute('att_value'))
- elif name == 'md5sum':
- md5sum = str(meta.getAttribute('att_value'))
- # check
- if re.search("^[a-fA-F0-9]{32}$",md5sum) == None:
- md5sum = None
- elif name == 'adler32':
- adler32 = str(meta.getAttribute('att_value'))
- elif name == 'surl':
- surl = str(meta.getAttribute('att_value'))
- # error check
- if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None) \
- or (self.useCentralLFC() and surl == None)):
- raise RuntimeError, 'fsize/md5sum/adler32/surl=None'
- # append
- lfns.append(lfn)
- guids.append(guid)
- fsizes.append(fsize)
- md5sums.append(md5sum)
- surls.append(surl)
- if adler32 != None:
- # use adler32 if available
- chksums.append("ad:%s" % adler32)
- else:
- chksums.append("md5:%s" % md5sum)
- except:
- # check if file exists
- if os.path.exists(self.xmlFile):
- type, value, traceBack = sys.exc_info()
- _logger.error("%s : %s %s" % (self.jobID,type,value))
- # set failed anyway
- self.job.jobStatus = 'failed'
- # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
- if (self.job.pilotErrorCode in [0,'0','NULL']) and \
- (self.job.transExitCode in [0,'0','NULL']):
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "Adder._updateOutputs() could not get GUID/LFN/MD5/FSIZE/SURL"
- return
- else:
- # XML was deleted
- self.job.ddmErrorDiag = "Adder._updateOutputs() could not add files"
- self.ignoreDDMError = True
- return
- # check files
- idMap = {}
- fileList = []
- subMap = {}
- for file in self.job.Files:
- if file.type == 'input':
- if file.lfn in lfns:
- if self.job.prodSourceLabel in ['user','panda']:
- # skipped file
- file.status = 'skipped'
- elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']:
- # failed by pilot
- file.status = 'failed'
- elif file.type == 'output' or file.type == 'log':
- # append to fileList
- fileList.append(file.lfn)
- # add only log file for failed jobs
- if self.jobStatus == 'failed' and file.type != 'log':
- continue
- # add only log file for unmerge jobs
- if self.job.prodSourceLabel == 'panda' and self.job.processingType in ['unmerge'] \
- and file.type != 'log':
- continue
- # look for GUID with LFN
- try:
- i = lfns.index(file.lfn)
- file.GUID = guids[i]
- file.fsize = fsizes[i]
- file.md5sum = md5sums[i]
- file.checksum = chksums[i]
- surl = surls[i]
- # status
- file.status = 'ready'
- # fsize
- fsize = None
- if not file.fsize in ['NULL','',0]:
- try:
- fsize = long(file.fsize)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s : %s %s" % (self.jobID,type,value))
- # append to map
- if not idMap.has_key(file.destinationDBlock):
- idMap[file.destinationDBlock] = []
- fileAttrs = {'guid' : file.GUID,
- 'lfn' : lfns[i],
- 'size' : fsize,
- 'checksum' : file.checksum}
- # add SURLs if LFC registration is required
- if self.useCentralLFC():
- fileAttrs['surl'] = surl
- idMap[file.destinationDBlock].append(fileAttrs)
- # for subscription
- if self.job.prodSourceLabel in ['managed','test','software','rc_test','ptest','user'] and \
- re.search('_sub\d+$',file.destinationDBlock) != None and (not self.addToTopOnly) and \
- self.job.destinationSE != 'local':
- if self.siteMapper == None:
- _logger.error("%s : SiteMapper==None" % self.jobID)
- else:
- # get dataset spec
- if not self.datasetMap.has_key(file.destinationDBlock):
- tmpDS = self.taskBuffer.queryDatasetWithMap({'name':file.destinationDBlock})
- self.datasetMap[file.destinationDBlock] = tmpDS
- # check if valid dataset
- if self.datasetMap[file.destinationDBlock] == None:
- _logger.error("%s : cannot find %s in DB" % (self.jobID,file.destinationDBlock))
- else:
- if not self.datasetMap[file.destinationDBlock].status in ['defined']:
- # not a fresh dataset
- _logger.debug("%s : subscription was already made for %s:%s" % \
- (self.jobID,self.datasetMap[file.destinationDBlock].status,
- file.destinationDBlock))
- else:
- # get DQ2 IDs
- tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm
- tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se)
- if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE):
- # DQ2 ID was set by using --destSE for analysis job to transfer output
- tmpDstDDM = file.destinationSE
- tmpDstSEs = file.destinationSE
- else:
- tmpDstDDM = self.siteMapper.getSite(file.destinationSE).ddm
- tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(file.destinationSE).se)
- # if src != dest or multi-token
- if (tmpSrcDDM != tmpDstDDM and tmpSrcSEs != tmpDstSEs) or \
- (tmpSrcDDM == tmpDstDDM and file.destinationDBlockToken.count(',') != 0):
- optSub = {'DATASET_COMPLETE_EVENT' : ['https://%s:%s/server/panda/datasetCompleted' % \
- (panda_config.pserverhost,panda_config.pserverport)]}
- # append
- if not subMap.has_key(file.destinationDBlock):
- subMap[file.destinationDBlock] = []
- # sources
- optSource = {}
- # set sources
- if file.destinationDBlockToken in ['NULL','']:
- # use default DQ2 ID as source
- optSource[tmpSrcDDM] = {'policy' : 0}
- else:
- # convert token to DQ2 ID
- dq2ID = tmpSrcDDM
- # use the first token's location as source for T1D1
- tmpSrcToken = file.destinationDBlockToken.split(',')[0]
- if self.siteMapper.getSite(self.job.computingSite).setokens.has_key(tmpSrcToken):
- dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens[tmpSrcToken]
- optSource[dq2ID] = {'policy' : 0}
- # T1 used as T2
- if self.siteMapper.getSite(self.job.computingSite).cloud != self.job.cloud and \
- (not tmpSrcDDM.endswith('PRODDISK')) and \
- (not self.job.prodSourceLabel in ['user','panda']):
- # register both DATADISK and PRODDISK as source locations
- if self.siteMapper.getSite(self.job.computingSite).setokens.has_key('ATLASPRODDISK'):
- dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens['ATLASPRODDISK']
- optSource[dq2ID] = {'policy' : 0}
- if not optSource.has_key(tmpSrcDDM):
- optSource[tmpSrcDDM] = {'policy' : 0}
- # use another location when token is set
- if not file.destinationDBlockToken in ['NULL','']:
- tmpDQ2IDList = []
- tmpDstTokens = file.destinationDBlockToken.split(',')
- # remove the first one because it is already used as a location
- if tmpSrcDDM == tmpDstDDM:
- tmpDstTokens = tmpDstTokens[1:]
- # loop over all tokens
- for idxToken,tmpDstToken in enumerate(tmpDstTokens):
- dq2ID = tmpDstDDM
- if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpDstToken):
- dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpDstToken]
- # keep the first destination for multi-hop
- if idxToken == 0:
- firstDestDDM = dq2ID
- else:
- # use the first destination as source for T1D1
- optSource = {}
- optSource[firstDestDDM] = {'policy' : 0}
- # remove looping subscription
- if dq2ID == tmpSrcDDM:
- continue
- # avoid duplication
- if not dq2ID in tmpDQ2IDList:
- subMap[file.destinationDBlock].append((dq2ID,optSub,optSource))
- else:
- # use default DDM
- for dq2ID in tmpDstDDM.split(','):
- subMap[file.destinationDBlock].append((dq2ID,optSub,optSource))
- except:
- # status
- file.status = 'failed'
- type, value, traceBack = sys.exc_info()
- _logger.error("%s : %s %s" % (self.jobID,type,value))
- # cleanup submap
- tmpKeys = subMap.keys()
- for tmpKey in tmpKeys:
- if subMap[tmpKey] == []:
- del subMap[tmpKey]
- # check consistency between XML and filesTable
- for lfn in lfns:
- if (not lfn in fileList) and (not lfn in inputLFNs):
- _logger.error("%s %s is not found in filesTable" % (self.jobID,lfn))
- self.job.jobStatus = 'failed'
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "Adder._updateOutputs() XML is inconsistent with filesTable"
- return
- # return if PandaDDM is used or non-DQ2
- if self.pandaDDM or self.job.destinationSE == 'local':
- return
- # add data to original dataset
- for destinationDBlock in idMap.keys():
- origDBlock = None
- match = re.search('^(.+)_sub\d+$',destinationDBlock)
- if match != None:
- # add files to top-level datasets
- origDBlock = match.group(1)
- if not self.goToTransferring:
- idMap[origDBlock] = idMap[destinationDBlock]
- # add files to top-level datasets only
- if self.addToTopOnly:
- del idMap[destinationDBlock]
- # skip sub unless getting transferred
- if origDBlock != None:
- if not self.goToTransferring and not self.logTransferring \
- and idMap.has_key(destinationDBlock):
- del idMap[destinationDBlock]
- # print idMap
- _logger.debug("%s idMap = %s" % (self.jobID,idMap))
- _logger.debug("%s subMap = %s" % (self.jobID,subMap))
- # add data
- _logger.debug("%s addFiles start" % self.jobID)
- # count the number of files
- regNumFiles = 0
- regFileList = []
- for tmpRegDS,tmpRegList in idMap.iteritems():
- for tmpRegItem in tmpRegList:
- if not tmpRegItem['lfn'] in regFileList:
- regNumFiles += 1
- regFileList.append(tmpRegItem['lfn'])
- # number of retry
- nTry = 3
- for iTry in range(nTry):
- # empty
- if idMap == {}:
- break
- # add data to datasets
- time.sleep(1)
- isFailed = False
- isFatal = False
- setErrorDiag = False
- out = 'OK'
- fatalErrStrs = ['[ORA-00001] unique constraint (ATLAS_DQ2.UQ_01_FILES_GUID) violated']
- regStart = datetime.datetime.utcnow()
- try:
- if not self.useCentralLFC():
- regMsgStr = "DQ2 registraion for %s files " % regNumFiles
- _logger.debug('%s %s %s' % (self.jobID,'registerFilesInDatasets',str(idMap)))
- self.dq2api.registerFilesInDatasets(idMap)
- else:
- regMsgStr = "LFC+DQ2 registraion for %s files " % regNumFiles
- _logger.debug('%s %s %s' % (self.jobID,'Register.registerFilesInDatasets',str(idMap)))
- registerAPI = Register2.Register(self.siteMapper.getSite(self.job.computingSite).ddm)
- out = registerAPI.registerFilesInDatasets(idMap)
- except DQ2.DQFileExistsInDatasetException:
- # harmless error
- errType,errValue = sys.exc_info()[:2]
- out = '%s : %s' % (errType,errValue)
- except (DQ2.DQClosedDatasetException,
- DQ2.DQFrozenDatasetException,
- DQ2.DQUnknownDatasetException,
- DQ2.DQFileMetaDataMismatchException):
- # fatal errors
- errType,errValue = sys.exc_info()[:2]
- out = '%s : %s' % (errType,errValue)
- isFatal = True
- except:
- # unknown errors
- errType,errValue = sys.exc_info()[:2]
- out = '%s : %s' % (errType,errValue)
- for tmpFatalErrStr in fatalErrStrs:
- if tmpFatalErrStr in str(errValue):
- self.job.ddmErrorDiag = 'failed to add files : ' + tmpFatalErrStr
- setErrorDiag = True
- break
- isFatal = True
- regTime = datetime.datetime.utcnow() - regStart
- _logger.debug('%s ' % self.jobID + regMsgStr + \
- 'took %s.%03d sec' % (regTime.seconds,regTime.microseconds/1000))
- # failed
- if isFailed or isFatal:
- _logger.error('%s %s' % (self.jobID,out))
- if (iTry+1) == nTry or isFatal:
- self.job.jobStatus = 'failed'
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- if not setErrorDiag:
- errMsg = "Adder._updateOutputs() could not add files : "
- self.job.ddmErrorDiag = errMsg + out.split('\n')[-1]
- return
- _logger.error("%s Try:%s" % (self.jobID,iTry))
- # sleep
- time.sleep(120)
- else:
- _logger.debug('%s %s' % (self.jobID,out))
- break
- # register dataset subscription
- subActivity = 'Production'
- if not self.job.prodSourceLabel in ['user']:
- # make DQ2 subscription for prod jobs
- for tmpName,tmpVal in subMap.iteritems():
- for dq2ID,optSub,optSource in tmpVal:
- _logger.debug("%s %s %s %s" % (self.jobID,'registerDatasetSubscription',
- (tmpName,dq2ID),
- {'version':0,'archived':0,'callbacks':optSub,
- 'sources':optSource,'sources_policy':(001000 | 010000),
- 'wait_for_sources':0,'destination':None,'query_more_sources':0,
- 'sshare':"production",'group':None,'activity':subActivity,
- 'acl_alias':None,'replica_lifetime':"14 days"}))
- for iDDMTry in range(3):
- out = 'OK'
- isFailed = False
- try:
- self.dq2api.registerDatasetSubscription(tmpName,dq2ID,version=0,archived=0,callbacks=optSub,
- sources=optSource,sources_policy=(001000 | 010000),
- wait_for_sources=0,destination=None,query_more_sources=0,
- sshare="production",group=None,activity=subActivity,
- acl_alias=None,replica_lifetime="14 days")
- except DQ2.DQSubscriptionExistsException:
- # harmless error
- errType,errValue = sys.exc_info()[:2]
- out = '%s : %s' % (errType,errValue)
- except:
- # unknown errors
- errType,errValue = sys.exc_info()[:2]
- out = '%s : %s' % (errType,errValue)
- isFailed = True
- if 'is not a Tiers of Atlas Destination' in str(errValue) or \
- 'is not in Tiers of Atlas' in str(errValue):
- # fatal error
- self.job.ddmErrorCode = ErrorCode.EC_Subscription
- else:
- # retry for temporary errors
- time.sleep(60)
- else:
- break
- if isFailed:
- _logger.error('%s %s' % (self.jobID,out))
- if self.job.ddmErrorCode == ErrorCode.EC_Subscription:
- # fatal error
- self.job.ddmErrorDiag = "subscription failure with %s" % out
- self.job.jobStatus = 'failed'
- else:
- # temporary errors
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "Adder._updateOutputs() could not register subscription : %s" % tmpName
- return
- _logger.debug('%s %s' % (self.jobID,out))
- # set dataset status
- self.datasetMap[tmpName].status = 'running'
- # keep subscriptions
- self.subscriptionMap = subMap
- elif not "--mergeOutput" in self.job.jobParameters:
- # send request to DaTRI unless files will be merged
- tmpTopDatasets = {}
- # collect top-level datasets
- for tmpName,tmpVal in subMap.iteritems():
- for dq2ID,optSub,optSource in tmpVal:
- tmpTopName = re.sub('_sub\d+','',tmpName)
- # append
- if not tmpTopDatasets.has_key(tmpTopName):
- tmpTopDatasets[tmpTopName] = []
- if not dq2ID in tmpTopDatasets[tmpTopName]:
- tmpTopDatasets[tmpTopName].append(dq2ID)
- # remove redundant CN from DN
- tmpDN = self.job.prodUserID
- tmpDN = re.sub('/CN=limited proxy','',tmpDN)
- tmpDN = re.sub('(/CN=proxy)+$','',tmpDN)
- # send request
- if tmpTopDatasets != {} and self.jobStatus == 'finished':
- try:
- from datriHandler import datriHandler
- if self.job.lockedby.startswith('Ganga'):
- tmpHandler = datriHandler(type='ganga')
- else:
- tmpHandler = datriHandler(type='pathena')
- # loop over all output datasets
- for tmpDsName,dq2IDlist in tmpTopDatasets.iteritems():
- for tmpDQ2ID in dq2IDlist:
- tmpMsg = "%s %s ds=%s site=%s id=%s" % (self.jobID,'datriHandler.sendRequest',
- tmpDsName,tmpDQ2ID,tmpDN)
- _logger.debug(tmpMsg)
- tmpHandler.setParameters(data_pattern=tmpDsName,
- site=tmpDQ2ID,
- userid=tmpDN)
- # number of retry
- nTry = 3
- for iTry in range(nTry):
- dhStatus,dhOut = tmpHandler.sendRequest()
- # succeeded
- if dhStatus == 0 or "such request is exist" in dhOut:
- _logger.debug("%s %s %s" % (self.jobID,dhStatus,dhOut))
- break
- # fatal errors
- if "No input data or input data is incorrect" in dhOut:
- tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,dhStatus,dhOut)
- _logger.error(tmpMsg)
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "DaTRI failed for %s with %s %s" % (tmpDsName,dhStatus,dhOut)
- return
- # retry
- if iTry+1 < nTry:
- # sleep
- time.sleep(60)
- else:
- # final attempt failed
- tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,dhStatus,dhOut)
- _logger.error(tmpMsg)
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "DaTRI failed for %s with %s %s" % (tmpDsName,dhStatus,dhOut)
- return
- # set dataset status
- for tmpName,tmpVal in subMap.iteritems():
- self.datasetMap[tmpName].status = 'running'
- except:
- errType,errValue = sys.exc_info()[:2]
- tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,errType,errValue)
- _logger.error(tmpMsg)
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- self.job.ddmErrorDiag = "DaTRI failed with %s %s" % (errType,errValue)
- return
- # properly finished
- _logger.debug("%s addFiles end" % self.jobID)
-
-
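_updateOutputs() above promotes files registered under a '..._subNNN' dataset into the corresponding top-level dataset and drops the sub entry unless the job is going to transferring. A standalone sketch of just that map rewrite (a simplification, not the original method):

import re

def promoteSubDatasets(idMap, goToTransferring=False, logTransferring=False,
                       addToTopOnly=False):
    """Copy each '<name>_subNNN' entry of idMap to '<name>' and drop the sub
    entry when nothing is being transferred, mirroring the loop above."""
    for destinationDBlock in list(idMap.keys()):
        match = re.search(r'^(.+)_sub\d+$', destinationDBlock)
        if match is None:
            continue
        origDBlock = match.group(1)
        # register the same files in the top-level dataset
        if not goToTransferring:
            idMap[origDBlock] = idMap[destinationDBlock]
        # keep only the top-level entry when no transfer will follow
        if addToTopOnly or (not goToTransferring and not logTransferring):
            if destinationDBlock in idMap:
                del idMap[destinationDBlock]
    return idMap

For example, {'user.foo.123_sub001': files} becomes {'user.foo.123': files} when neither transferring flag is set.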
- # update shadow dataset
- def _updateShadow(self):
- # return if PandaDDM is used or non-DQ2
- if self.pandaDDM or self.job.destinationSE == 'local':
- return
- _logger.debug("%s updateShadow" % self.jobID)
- # get shadow DS and contents
- shadowList = []
- shadowFiles = []
- for file in self.job.Files:
- if file.type == 'output' or file.type == 'log':
- # get shadow name
- shadowDS = re.sub('_sub\d+$','',file.destinationDBlock) + '_shadow'
- if not shadowDS in shadowList:
- shadowList.append(shadowDS)
- elif file.type == 'input':
- # remove skipped files
- if file.status in ['skipped']:
- continue
- # ignore lib.tgz
- if re.search('lib\.tgz\.*\d*',file.lfn) != None:
- continue
- # ignore DBRelease
- if re.search('DBRelease',file.lfn) != None:
- continue
- # ignore when noshadow is set
- if file.destinationDBlockToken == 'noshadow':
- continue
- # fsize
- fsize = None
- if not file.fsize in ['NULL','',0]:
- try:
- fsize = long(file.fsize)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s : %s %s" % (self.jobID,type,value))
- # append
- if len(str(file.GUID))==36:
- shadowFiles.append({'guid' : file.GUID,
- 'lfn' : file.lfn,
- 'size' : fsize,
- 'checksum' : None})
- # create idMap
- idMap = {}
- for shadowDS in shadowList:
- nTry = 3
- findFlag = False
- for iTry in range(nTry):
- # check if shadow dataset exists
- _logger.debug((self.jobID, 'listDatasets',shadowDS,0,True))
- try:
- out = self.dq2api.listDatasets(shadowDS,0,True)
- if not out.has_key(shadowDS):
- _logger.debug("%s shadow %s doesn't exist" % (self.jobID,shadowDS))
- else:
- findFlag = True
- break
- except:
- # sleep
- time.sleep(120)
- # append
- if findFlag and shadowFiles != []:
- idMap[shadowDS] = shadowFiles
- # add data
- _logger.debug("%s shadow idMap = %s" % (self.jobID,idMap))
- if idMap == {}:
- return
- _logger.debug("%s addFilesToShadow start" % self.jobID)
- # number of retry
- nTry = 3
- for iTry in range(nTry):
- # add data to datasets
- _logger.debug((self.jobID, 'registerFilesInDatasets',idMap))
- isFailed = False
- isFatal = False
- out = 'OK'
- try:
- self.dq2api.registerFilesInDatasets(idMap)
- except DQ2.DQFileExistsInDatasetException:
- # harmless error
- errType,errValue = sys.exc_info()[:2]
- out = '%s : %s' % (errType,errValue)
- except (DQ2.DQClosedDatasetException,
- DQ2.DQFrozenDatasetException,
- DQ2.DQUnknownDatasetException,
- DQ2.DQFileMetaDataMismatchException):
- # fatal errors
- errType,errValue = sys.exc_info()[:2]
- out = '%s : %s' % (errType,errValue)
- isFatal = True
- except:
- # unknown errors
- errType,errValue = sys.exc_info()[:2]
- out = '%s : %s' % (errType,errValue)
- isFatal = True
- # failed
- if isFailed or isFatal:
- _logger.error('%s %s' % (self.jobID,out))
- if (iTry+1) == nTry or isFatal:
- self.job.jobStatus = 'failed'
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- errMsg = "Adder._updateOutputs() could not add files : "
- self.job.ddmErrorDiag = errMsg + out.split('\n')[-1]
- return
- _logger.error("%s shadow Try:%s" % (self.jobID,iTry))
- # sleep
- time.sleep(120)
- else:
- _logger.debug('%s %s' % (self.jobID,out))
- break
- _logger.debug("%s addFilesToShadow end" % self.jobID)
-
-
- # use central LFC
- def useCentralLFC(self):
- tmpSiteSpec = self.siteMapper.getSite(self.job.computingSite)
- if not self.addToTopOnly and tmpSiteSpec.lfcregister in ['server']:
- return True
- return False
-
-
- # remove unmerged files
- def _removeUnmerged(self):
- _logger.debug("%s removeUnmerged" % self.jobID)
- # get input files
- inputFileGUIDs = []
- inputFileStr = ''
- for file in self.job.Files:
- if file.type == 'input':
- # remove skipped files
- if file.status in ['skipped']:
- continue
- # ignore lib.tgz
- if re.search('lib\.tgz\.*\d*',file.lfn) != None:
- continue
- # ignore DBRelease
- if re.search('DBRelease',file.lfn) != None:
- continue
- # append
- inputFileGUIDs.append(file.GUID)
- inputFileStr += '%s,' % file.lfn
- # extract parent dataset name
- tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters)
- # failed
- if tmpMatch == None:
- _logger.error("%s failed to extract parentDS from params=%s" % (self.jobID,self.job.jobParameters))
- return False
- parentDS = tmpMatch.group(1)
- # delete
- _logger.debug("%s deleteFilesFromDataset %s %s" % (self.jobID,parentDS,inputFileStr[:-1]))
- nTry = 3
- for iTry in range(nTry):
- # add data to datasets
- isFailed = False
- isFatal = False
- out = 'OK'
- try:
- self.dq2api.deleteFilesFromDataset(parentDS,inputFileGUIDs)
- except (DQ2.DQClosedDatasetException,
- DQ2.DQFrozenDatasetException,
- DQ2.DQUnknownDatasetException,
- DQ2.DQFileMetaDataMismatchException):
- # fatal errors
- errType,errValue = sys.exc_info()[:2]
- out = '%s : %s' % (errType,errValue)
- isFatal = True
- except:
- # unknown errors
- errType,errValue = sys.exc_info()[:2]
- out = '%s : %s' % (errType,errValue)
- isFailed = True
- # failed
- if isFailed or isFatal:
- _logger.error('%s %s' % (self.jobID,out))
- if (iTry+1) == nTry or isFatal:
- self.job.jobStatus = 'failed'
- self.job.ddmErrorCode = ErrorCode.EC_Adder
- errMsg = "failed to remove unmerged files : "
- self.job.ddmErrorDiag = errMsg + out.split('\n')[-1]
- if not isFatal:
- # return None to retry later
- return None
- return False
- _logger.error("%s removeUnmerged Try:%s" % (self.jobID,iTry))
- # sleep
- time.sleep(120)
- else:
- _logger.debug('%s %s' % (self.jobID,out))
- break
- # succeeded
- _logger.debug("%s removeUnmerged end" % self.jobID)
- return True
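The catalog parsed by _updateOutputs() is laid out as <File ID="guid"> elements, each carrying a <logical><lfn name="..."/></logical> node and <metadata att_name="..." att_value="..."/> entries for fsize, md5sum, adler32 and surl; that layout is inferred from the parsing code above. A compact sketch of the same parsing, with the error handling omitted:

import xml.dom.minidom

def parseCatalog(xmlFile):
    """Return {lfn: {'guid', 'fsize', 'md5sum', 'adler32', 'surl'}} from a
    catalog shaped the way _updateOutputs() expects."""
    catalog = {}
    root = xml.dom.minidom.parse(xmlFile)
    for fileNode in root.getElementsByTagName('File'):
        guid = str(fileNode.getAttribute('ID'))
        logical = fileNode.getElementsByTagName('logical')[0]
        lfn = str(logical.getElementsByTagName('lfn')[0].getAttribute('name'))
        attrs = {'guid': guid, 'fsize': None, 'md5sum': None,
                 'adler32': None, 'surl': None}
        for meta in fileNode.getElementsByTagName('metadata'):
            name = str(meta.getAttribute('att_name'))
            if name in attrs:
                attrs[name] = str(meta.getAttribute('att_value'))
        catalog[lfn] = attrs
    return catalog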
diff --git a/current/pandaserver/dataservice/AddressFinder.py b/current/pandaserver/dataservice/AddressFinder.py
deleted file mode 100644
index c96099bff..000000000
--- a/current/pandaserver/dataservice/AddressFinder.py
+++ /dev/null
@@ -1,308 +0,0 @@
-import re
-import sys
-import urllib
-import commands
-
-from config import panda_config
-from taskbuffer.OraDBProxy import DBProxy
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('AddressFinder')
-
-# NG words in email address
-_ngWordsInMailAddr = ['support','system','stuff','service','secretariat','club','user']
-
-
-# insert *
-def insertWC(str):
- retStr = ".*"
- for item in str:
- retStr += item
- retStr += ".*"
- return retStr
-
-
-# clean name
-def cleanName(dn):
- # extract First Last from DN
- dbProxy = DBProxy()
- extractedDN = dbProxy.cleanUserID(dn)
- # replace -.
- extractedDN = re.sub('-|\.',' ',extractedDN)
- # change to lower
- extractedDN = extractedDN.lower()
- # remove ATLAS
- extractedDN = re.sub('\(*atlas\)*','',extractedDN)
- # remove numbers
- extractedDN = re.sub('\d*','',extractedDN)
- # remove Jr
- extractedDN = re.sub(' jr( |$)',' ',extractedDN)
- # remove whitespaces
- extractedDN = re.sub(' +',' ',extractedDN)
- extractedDN = extractedDN.strip()
- # return
- return extractedDN
-
-
-# get email address using phonebook
-def getEmailPhonebook(dn):
- _logger.debug('Getting email via phonebook for %s' % dn)
- # clean DN
- extractedDN = cleanName(dn)
- # dump
- _logger.debug(extractedDN)
- # construct command
- for sTry in ['full','full_rev','fullwc','fullwc_rev,',
- 'suronly', 'firstonly','suronly_rev','firstonly_rev',
- 'email']:
- if sTry == 'full':
- # try full name
- com = '~atlpan/phonebook --firstname "%s" --surname "%s" --all' \
- % (extractedDN.split()[0],extractedDN.split()[-1])
- if sTry == 'full_rev':
- # try full name
- com = '~atlpan/phonebook --firstname "%s" --surname "%s" --all' \
- % (extractedDN.split()[-1],extractedDN.split()[0])
- elif sTry == 'fullwc':
- # try full name with wildcard
- com = '~atlpan/phonebook --firstname "*%s*" --surname "*%s*" --all' \
- % (extractedDN.split()[0],extractedDN.split()[-1])
- elif sTry == 'fullwc_rev':
- # try full name with wildcard
- com = '~atlpan/phonebook --firstname "*%s*" --surname "*%s*" --all' \
- % (extractedDN.split()[-1],extractedDN.split()[0])
- elif sTry == 'suronly':
- if len(extractedDN.split()) == 2:
- # try surname only
- com = '~atlpan/phonebook --surname "%s" --all' \
- % extractedDN.split()[-1]
- else:
- # try surname with wildcard
- com = '~atlpan/phonebook --surname "*%s*" --all' \
- % extractedDN.split()[-1]
- elif sTry == 'suronly_rev':
- if len(extractedDN.split()) == 2:
- # try surname only
- com = '~atlpan/phonebook --surname "%s" --all' \
- % extractedDN.split()[0]
- else:
- # try surname with wildcard
- com = '~atlpan/phonebook --surname "*%s*" --all' \
- % extractedDN.split()[0]
- elif sTry == 'firstonly':
- if len(extractedDN.split()) == 2:
- # try firstname only
- com = '~atlpan/phonebook --firstname "%s" --all' \
- % extractedDN.split()[0]
- else:
- # try firstname with wildcard
- com = '~atlpan/phonebook --firstname "*%s*" --all' \
- % extractedDN.split()[0]
- elif sTry == 'firstonly_rev':
- if len(extractedDN.split()) == 2:
- # try firstname only
- com = '~atlpan/phonebook --firstname "%s" --all' \
- % extractedDN.split()[-1]
- else:
- # try firstname with wildcard
- com = '~atlpan/phonebook --firstname "*%s*" --all' \
- % extractedDN.split()[-1]
- elif sTry == 'email':
- # try email
- mailPatt = re.sub(' +','*',extractedDN)
- com = '~atlpan/phonebook --email "*%s*" --all' \
- % mailPatt
- _logger.debug(com)
- # execute
- sStat,sOut = commands.getstatusoutput(com)
- _logger.debug(sOut)
- # failed
- if sStat != 0:
- _logger.debug('phonebook failed with %s' % sStat)
- return []
- # extract email
- emails = []
- groups = []
- dnames = []
- for line in sOut.split('\n'):
- if line.startswith('E-mail:'):
- # append
- tmpStr = line.split()[-1]
- emails.append(tmpStr)
- elif line.startswith('Group:'):
- # append
- tmpStr = line.split()[-1]
- groups.append(tmpStr)
- elif line.startswith('Display Name:'):
- # append
- tmpStr = re.sub('^[^:]+:','',line).strip()
- dnames.append(tmpStr)
- # check groups
- newGroups = []
- newEmails = []
- newDNames = []
- for idx,group in enumerate(groups):
- if group.startswith('A') or group in ['UAT','GS','-']:
- newGroups.append(group)
- newEmails.append(emails[idx])
- newDNames.append(dnames[idx])
- # replace
- groups = newGroups
- emails = newEmails
- dnames = newDNames
- # check dname
- if len(emails) > 1 and len(emails) == len(dnames):
- newGroups = []
- newEmails = []
- newDNames = []
- newGroupsWC = []
- newEmailsWC = []
- newDNamesWC = []
- for idx,dname in enumerate(dnames):
- # check fragments
- nameItems = extractedDN.split()
- nMatch = 0
- nMatchWC = 0
- for nameItem in nameItems:
- # check w/o wildcard
- if re.search(nameItem,dname,re.I) != None:
- nMatch += 1
- # check with wildcard
- if re.search(insertWC(nameItem),dname,re.I) != None:
- nMatchWC += 1
- # append if totally matched or partially matched ignoring middle-name etc
- if len(nameItems) == nMatch or (len(nameItems) > 2 and (len(nameItems)-nMatch) < 2):
- newGroups.append(groups[idx])
- newEmails.append(emails[idx])
- newDNames.append(dname)
- # append if matched with wildcard
- if len(nameItems) == nMatchWC or (len(nameItems) > 2 and (len(nameItems)-nMatchWC) < 2):
- newGroupsWC.append(groups[idx])
- newEmailsWC.append(emails[idx])
- newDNamesWC.append(dname)
- # replace
- if len(newGroups)>0:
- # use strict matching
- groups = newGroups
- emails = newEmails
- dnames = newDNames
- else:
- # use loose matching
- groups = newGroupsWC
- emails = newEmailsWC
- dnames = newDNamesWC
- _logger.debug('emails=%s' % str(emails))
- # return
- if len(emails) == 1:
- _logger.debug('Succeeded %s %s' % (groups[0],emails[0]))
- return emails
- # failed
- _logger.error('Failed for %s' % dn)
- return []
-
-
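insertWC() above turns a name fragment into a loose pattern by interleaving '.*' between its characters, which is what lets the display-name check in getEmailPhonebook() tolerate accents spelled out, middle initials and similar variations. A small illustration (the names are made up) using the insertWC() defined above:

import re

pattern = insertWC('muller')                                     # '.*m.*u.*l.*l.*e.*r.*'
matched = re.search(pattern, 'Mueller, Hans', re.I) is not None  # True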
-# get email address using xwho
-def getEmailXwho(dn):
- # get email from CERN/xwho
- _logger.debug('Getting email via xwho for %s' % dn)
- for sTry in ['full','firstlastonly']:
- try:
- # remove middle name
- encodedDN = cleanName(dn)
- encodedDN = re.sub(' . ',' ',encodedDN)
- # remove _
- encodedDN = encodedDN.replace('_',' ')
- # use first and last names only
- if sTry == 'firstlastonly':
- newEncodedDN = '%s %s' % (encodedDN.split()[0],encodedDN.split()[-1])
- # skip if it was already tried
- if encodedDN == newEncodedDN:
- continue
- encodedDN = newEncodedDN
- # URL encode
- encodedDN = encodedDN.replace(' ','%20')
- url = 'http://consult.cern.ch/xwho?'+encodedDN
- if panda_config.httpProxy != '':
- proxies = {'http': panda_config.httpProxy}
- else:
- proxies = {}
- opener = urllib.FancyURLopener(proxies)
- fd=opener.open(url)
- data = fd.read()
- if re.search(' not found',data,re.I) == None:
- break
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("xwho failure with %s %s" % (type,value))
- return []
- # parse HTML
- emails = []
- headerItem = ["Family Name","First Name","Phone","Dep"]
- findTable = False
- _logger.debug(data)
- for line in data.split('\n'):
- # look for table
- if not findTable:
- # look for header
- tmpFlag = True
- for item in headerItem:
- if re.search(item,line) == None:
- tmpFlag = False
- break
- findTable = tmpFlag
- continue
- else:
- # end of table
- if re.search(item,"") != None:
- findTable = False
- continue
- # look for link to individual page
- match = re.search('href="(/xwho/people/\d+)"',line)
- if match == None:
- continue
- link = match.group(1)
- try:
- url = 'http://consult.cern.ch'+link
- if panda_config.httpProxy != '':
- proxies = {'http': panda_config.httpProxy}
- else:
- proxies = {}
- opener = urllib.FancyURLopener(proxies)
- fd=opener.open(url)
- data = fd.read()
- _logger.debug(data)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("xwho failure with %s %s" % (type,value))
- return []
- # get mail address
- match = re.search("mailto:([^@]+@[^>]+)>",data)
- if match != None:
- adder = match.group(1)
- # check NG words
- okAddr = True
- for ngWord in _ngWordsInMailAddr:
- if re.search(ngWord,adder,re.I):
- _logger.error("%s has NG word:%s" % (adder,ngWord))
- okAddr = False
- break
- if okAddr and (not adder in emails):
- emails.append(adder)
- _logger.debug("emails from xwho : '%s'" % emails)
- # return
- if len(emails) == 1:
- _logger.debug('Succeeded : %s %s' % (str(emails),dn))
- return emails
- # multiple candidates
- if len(emails) > 1:
- _logger.error("non unique address : %s for %s" % (str(emails),dn))
- return []
- # failed
- _logger.error('Failed to find address for %s' % dn)
- return []
-
-
-
-
-
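cleanName() above leaves the DN-to-name extraction to DBProxy.cleanUserID() and then normalizes the result through a chain of substitutions. A self-contained sketch of just that normalization step, with an illustrative input:

import re

def normalizeName(extractedDN):
    """Mirror the regex chain in cleanName(): drop punctuation and case,
    strip '(ATLAS)' tags, digits and 'Jr', and collapse whitespace."""
    name = re.sub(r'-|\.', ' ', extractedDN)
    name = name.lower()
    name = re.sub(r'\(*atlas\)*', '', name)
    name = re.sub(r'\d*', '', name)
    name = re.sub(r' jr( |$)', ' ', name)
    name = re.sub(r' +', ' ', name)
    return name.strip()

# normalizeName('John A. Smith-Jones (ATLAS) 1234') -> 'john a smith jones'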
diff --git a/current/pandaserver/dataservice/Closer.py b/current/pandaserver/dataservice/Closer.py
deleted file mode 100755
index 8301945d3..000000000
--- a/current/pandaserver/dataservice/Closer.py
+++ /dev/null
@@ -1,290 +0,0 @@
-'''
-update dataset DB, and then close dataset and start Activator if needed
-
-'''
-
-import re
-import sys
-import time
-import urllib
-import commands
-import threading
-from DDM import ddm
-import Notifier
-import RetryMaker
-from Activator import Activator
-from pandalogger.PandaLogger import PandaLogger
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-from taskbuffer.DatasetSpec import DatasetSpec
-from brokerage.SiteMapper import SiteMapper
-from config import panda_config
-import brokerage.broker_util
-
-# logger
-_logger = PandaLogger().getLogger('Closer')
-
-def initLogger(pLogger):
- # redirect logging to parent as it doesn't work in nested threads
- global _logger
- _logger = pLogger
- Notifier.initLogger(_logger)
- RetryMaker.initLogger(_logger)
-
-
-class Closer (threading.Thread):
- # constructor
- def __init__(self,taskBuffer,destinationDBlocks,job,pandaDDM=False,datasetMap={}):
- threading.Thread.__init__(self)
- self.taskBuffer = taskBuffer
- self.destinationDBlocks = destinationDBlocks
- self.job = job
- self.pandaID = job.PandaID
- self.pandaDDM = pandaDDM
- self.siteMapper = None
- self.datasetMap = datasetMap
-
-
- # main
- def run(self):
- try:
- _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus))
- flagComplete = True
- ddmJobs = []
- topUserDsList = []
- usingMerger = False
- disableNotifier = False
- firstIndvDS = True
- for destinationDBlock in self.destinationDBlocks:
- dsList = []
- _logger.debug('%s start %s' % (self.pandaID,destinationDBlock))
- # ignore tid datasets
- if re.search('_tid[\d_]+$',destinationDBlock):
- _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock))
- continue
- # query dataset
- if self.datasetMap.has_key(destinationDBlock):
- dataset = self.datasetMap[destinationDBlock]
- else:
- dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock})
- if dataset == None:
- _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock))
- flagComplete = False
- continue
- # skip tobedeleted/tobeclosed
- if dataset.status in ['cleanup','tobeclosed','completed']:
- _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status))
- continue
- dsList.append(dataset)
- # sort
- dsList.sort()
- # count number of completed files
- notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock,
- 'status':'unknown'})
- if notFinish < 0:
- _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish))
- flagComplete = False
- continue
- # check if completed
- _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish))
- if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']:
- # close non-DQ2 destinationDBlock immediately
- finalStatus = 'closed'
- elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \
- and self.job.processingType != 'usermerge':
- # merge output files
- if firstIndvDS:
- # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS
- finalStatus = 'tobemerged'
- firstIndvDS = False
- else:
- finalStatus = 'tobeclosed'
- # set merging to top dataset
- usingMerger = True
- # disable Notifier
- disableNotifier = True
- else:
- # set status to 'tobeclosed' to trigger DQ2 closing
- finalStatus = 'tobeclosed'
- if notFinish==0:
- _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock))
- # set status
- dataset.status = finalStatus
- # update dataset in DB
- retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
- criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
- if len(retT) > 0 and retT[0]==1:
- # close user datasets
- if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \
- and (dataset.name.startswith('user') or dataset.name.startswith('group')):
- # get top-level user dataset
- topUserDsName = re.sub('_sub\d+$','',dataset.name)
- # update if it is the first attempt
- if topUserDsName != dataset.name and not topUserDsName in topUserDsList:
- topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName})
- if topUserDs != None:
- # check status
- if topUserDs.status in ['completed','cleanup','tobeclosed',
- 'tobemerged','merging']:
- _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status))
- else:
- # set status
- if self.job.processingType.startswith('gangarobot') or \
- self.job.processingType.startswith('hammercloud'):
- # do not trigger freezing for HC datasets so that files can be appended
- topUserDs.status = 'completed'
- elif not usingMerger:
- topUserDs.status = finalStatus
- else:
- topUserDs.status = 'merging'
- # append to avoid repetition
- topUserDsList.append(topUserDsName)
- # update DB
- retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus",
- criteriaMap={':crStatus':topUserDs.status})
- if len(retTopT) > 0 and retTopT[0]==1:
- _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName))
- else:
- _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName))
- # get parent dataset for merge job
- if self.job.processingType == 'usermerge':
- tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters)
- if tmpMatch == None:
- _logger.error('%s failed to extract parentDS' % self.pandaID)
- else:
- unmergedDsName = tmpMatch.group(1)
- # update if it is the first attempt
- if not unmergedDsName in topUserDsList:
- unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName})
- if unmergedDs == None:
- _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName))
- else:
- # check status
- if unmergedDs.status in ['completed','cleanup','tobeclosed']:
- _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status))
- else:
- # set status
- unmergedDs.status = finalStatus
- # append to avoid repetition
- topUserDsList.append(unmergedDsName)
- # update DB
- retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus",
- criteriaMap={':crStatus':unmergedDs.status})
- if len(retTopT) > 0 and retTopT[0]==1:
- _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName))
- else:
- _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName))
- if self.pandaDDM and self.job.prodSourceLabel=='managed':
- # instantiate SiteMapper
- if self.siteMapper == None:
- self.siteMapper = SiteMapper(self.taskBuffer)
- # get file list for PandaDDM
- retList = self.taskBuffer.queryFilesWithMap({'destinationDBlock':destinationDBlock})
- lfnsStr = ''
- guidStr = ''
- for tmpFile in retList:
- if tmpFile.type in ['log','output']:
- lfnsStr += '%s,' % tmpFile.lfn
- guidStr += '%s,' % tmpFile.GUID
- if lfnsStr != '':
- guidStr = guidStr[:-1]
- lfnsStr = lfnsStr[:-1]
- # create a DDM job
- ddmjob = JobSpec()
- ddmjob.jobDefinitionID = int(time.time()) % 10000
- ddmjob.jobName = "%s" % commands.getoutput('uuidgen')
- ddmjob.transformation = 'http://pandaserver.cern.ch:25080/trf/mover/run_dq2_cr'
- ddmjob.destinationDBlock = 'testpanda.%s' % ddmjob.jobName
- ddmjob.computingSite = "BNL_ATLAS_DDM"
- ddmjob.destinationSE = ddmjob.computingSite
- ddmjob.currentPriority = 200000
- ddmjob.prodSourceLabel = 'ddm'
- ddmjob.transferType = 'sub'
- # append log file
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % ddmjob.jobName
- fileOL.destinationDBlock = ddmjob.destinationDBlock
- fileOL.destinationSE = ddmjob.destinationSE
- fileOL.dataset = ddmjob.destinationDBlock
- fileOL.type = 'log'
- ddmjob.addFile(fileOL)
- # make arguments
- dstDQ2ID = 'BNLPANDA'
- srcDQ2ID = self.siteMapper.getSite(self.job.computingSite).ddm
- callBackURL = 'https://%s:%s/server/panda/datasetCompleted?vuid=%s&site=%s' % \
- (panda_config.pserverhost,panda_config.pserverport,
- dataset.vuid,dstDQ2ID)
- _logger.debug(callBackURL)
- # set src/dest
- ddmjob.sourceSite = srcDQ2ID
- ddmjob.destinationSite = dstDQ2ID
- # if src==dst, send callback without ddm job
- if dstDQ2ID == srcDQ2ID:
- comout = commands.getoutput('curl -k %s' % callBackURL)
- _logger.debug(comout)
- else:
- # run dq2_cr
- callBackURL = urllib.quote(callBackURL)
- # get destination dir
- destDir = brokerage.broker_util._getDefaultStorage(self.siteMapper.getSite(self.job.computingSite).dq2url)
- argStr = "-s %s -r %s --guids %s --lfns %s --callBack %s -d %s/%s %s" % \
- (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,callBackURL,destDir,
- destinationDBlock,destinationDBlock)
- # set job parameters
- ddmjob.jobParameters = argStr
- _logger.debug('%s pdq2_cr %s' % (self.pandaID,ddmjob.jobParameters))
- ddmJobs.append(ddmjob)
- # start Activator
- if re.search('_sub\d+$',dataset.name) == None:
- if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']:
- # don't trigger Activator for merge jobs
- pass
- else:
- if self.job.jobStatus == 'finished':
- aThr = Activator(self.taskBuffer,dataset)
- aThr.start()
- aThr.join()
- else:
- # unset flag since another thread already updated
- flagComplete = False
- else:
- # update dataset in DB
- self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
- criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
- # unset flag
- flagComplete = False
- # end
- _logger.debug('%s end %s' % (self.pandaID,destinationDBlock))
- # start DDM jobs
- if ddmJobs != []:
- self.taskBuffer.storeJobs(ddmJobs,self.job.prodUserID,joinThr=True)
- # change pending jobs to failed
- if flagComplete and self.job.prodSourceLabel=='user':
- #_logger.debug('%s call RetryMaker for %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID))
- #retryMaker = RetryMaker.RetryMaker(self.taskBuffer,self.job)
- #retryMaker.run()
- _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID))
- self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID)
- # start notifier
- _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete))
- if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \
- (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')):
- # don't send email for merge jobs
- if (not disableNotifier) and not self.job.processingType in ['merge','unmerge']:
- useNotifier = True
- summaryInfo = {}
- # check all jobDefIDs in jobsetID
- if not self.job.jobsetID in [0,None,'NULL']:
- useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID,
- self.job.prodUserName)
- _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier))
- if useNotifier:
- _logger.debug('%s start Notifier' % self.pandaID)
- nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo)
- nThr.run()
- _logger.debug('%s end Notifier' % self.pandaID)
- _logger.debug('%s End' % self.pandaID)
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s %s" % (errType,errValue))
-
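
When the source and destination DDM endpoints coincide, the code above skips the DDM
mover job and simply hits the datasetCompleted callback with "curl -k". Below is a
minimal sketch of the same callback done with Python's standard library instead of a
shell call; the host, port, vuid and site values are illustrative placeholders, not
taken from a real deployment.

import ssl
import urllib.parse
import urllib.request

def send_dataset_completed(host, port, vuid, site, verify_tls=True):
    """Roughly what the 'curl -k <callBackURL>' call above does, without a subprocess."""
    query = urllib.parse.urlencode({'vuid': vuid, 'site': site})
    url = 'https://%s:%s/server/panda/datasetCompleted?%s' % (host, port, query)
    context = ssl.create_default_context()
    if not verify_tls:
        # the equivalent of curl's -k: accept the server certificate without verification
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE
    with urllib.request.urlopen(url, context=context) as resp:
        return resp.status, resp.read()

# hypothetical values for illustration only
# send_dataset_completed('panda.example.org', 25443, 'some-vuid', 'BNLPANDA', verify_tls=False)
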
diff --git a/current/pandaserver/dataservice/DDM.py b/current/pandaserver/dataservice/DDM.py
deleted file mode 100755
index 5888a36b3..000000000
--- a/current/pandaserver/dataservice/DDM.py
+++ /dev/null
@@ -1,344 +0,0 @@
-"""
-provide primitive methods for DDM
-
-"""
-
-import sys
-import types
-import commands
-from config import panda_config
-
-
-# change cwd
-_cwd = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd)
-
-# environment variables
-_env = 'PATH=%s:%s:$PATH ' % (panda_config.native_python,panda_config.globus_dir+'/bin')
-_env+= 'LD_LIBRARY_PATH=%s ' % (panda_config.globus_dir+'/lib')
-_env+= 'DQ2_HOME=%s/opt/dq2 ' % panda_config.dq2_dir
-_env+= 'http_proxy=%s ' % panda_config.httpProxy
-_env+= 'https_proxy=%s ' % panda_config.httpProxy
-
-_env+= 'PYTHONPATH=%s/usr/lib/python2.3/site-packages:$PYTHONPATH' \
- % panda_config.dq2_dir
-
-# method object wrapping DQ2 method
-class _DQMethod:
- # constructor
- def __init__(self,moduleName,methodName):
- self.moduleName = moduleName
- self.methodName = methodName
-
- # method emulation
- def __call__(self,*args,**kwargs):
- # main method has disappeared since 0.3
- args = list(args)
- if self.methodName == 'main':
- self.methodName = args[0]
- args.pop(0)
- # build command
- com = 'import dq2.clientapi.cli.cliutil; '
- #com += 'import sys; sys.tracebacklimit=0; '
- com += 'dq2api = dq2.clientapi.cli.cliutil.getDQ2(None); '
- if self.moduleName == 'DQ2':
- # DQ2 is top-level module
- com += 'print dq2api.%s(' % self.methodName
- elif self.moduleName == 'DQ2_iter':
- # iterator
- com += 'iter = dq2api.%s(' % self.methodName
- else:
- com += 'print dq2api.%s.%s(' % (self.moduleName,self.methodName)
- # expand args
- for i in range(len(args)):
- arg = args[i]
- if isinstance(arg,types.StringType):
- # check invalid characters
- for invCh in ['"',"'",'(',')',';']:
- if invCh in arg:
- return -1,"invalid character %s in %s" % (invCh,arg)
- com = "%s'%s'," % (com,arg)
- else:
- com = '%s%s,' % (com,str(arg))
- for tmpK,tmpV in kwargs.iteritems():
- if isinstance(tmpV,types.StringType):
- com += "%s='%s'," % (tmpK,tmpV)
- else:
- com += "%s=%s," % (tmpK,tmpV)
- com = com[:-1]
- com += ")"
- # loop over iterator
- if self.moduleName == 'DQ2_iter':
- com += ";exec 'for item in iter:print item'"
- # execute
- return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com))
-
-
-# DQ module class
-class _DQModule:
- # constructor
- def __init__(self,moduleName):
- self.moduleName = moduleName
-
- # factory method
- def __getattr__(self,methodName):
- return _DQMethod(self.moduleName,methodName)
-
-
-# native DQ2 method class
-class NativeDQ2Method:
- # constructor
- def __init__(self):
- self.moduleName = None
- self.methodName = None
- # set module and method name
- def setNames(self,moduleName,methodName):
- self.moduleName = moduleName
- self.methodName = methodName
- # method emulation
- def __call__(self,*args,**kwargs):
- try:
- # make dq2api locally since global dq2 object is not thread-safe
- import dq2.clientapi.cli.cliutil
- dq2api = dq2.clientapi.cli.cliutil.getDQ2(None)
- # main method has disappeared since 0.3
- args = list(args)
- if self.methodName == 'main':
- self.methodName = args[0]
- args.pop(0)
- # get method object
- if self.moduleName in ['DQ2','DQ2_iter']:
- methodObj = getattr(dq2api,self.methodName)
- else:
- methodObj = getattr(getattr(dq2api,self.moduleName),self.methodName)
- # execute
- retVal = apply(methodObj,args,kwargs)
- # loop over for iterator
- if self.moduleName == 'DQ2_iter':
- strRet = ''
- for item in retVal:
- strRet += str(item)
- else:
- strRet = str(retVal)
- # return
- return 0,strRet
- except:
-            errType,errValue = sys.exc_info()[:2]
-            return 1,'%s %s' % (errType,errValue)
-
-
-
-# native DQ2 module class
-class NativeDQ2Module:
- # constructor
- def __init__(self):
- self.moduleName = None
- # set module name
- def setModName(self,moduleName):
- self.moduleName = moduleName
- # getter
- def __getattr__(self,methodName):
- # set method name
- api = NativeDQ2Method()
- api.setNames(self.moduleName,methodName)
- return api
-
-
-# factory class
-class DDM:
- # constructor
- def __init__(self):
- self.usingNativeDQ2 = False
- # switch to use DQ2 in the same session
- def useDirectDQ2(self):
- self.usingNativeDQ2 = True
- # getter
- def __getattr__(self,moduleName):
- if not self.usingNativeDQ2:
-            # run dq2 command in another session
- return _DQModule(moduleName)
- else:
- # run dq2 command in the same session
- nativeDQ2 = NativeDQ2Module()
- nativeDQ2.setModName(moduleName)
- return nativeDQ2
-
-# instantiate
-ddm = DDM()
-del DDM
-
-
-# method object wrapping TOA method
-class _TOAMethod:
- # constructor
- def __init__(self,methodName):
- self.methodName = methodName
-
- # method emulation
- def __call__(self,*args):
- args = list(args)
- # build command
- com = 'from dq2.info import TiersOfATLAS; '
- com += 'print TiersOfATLAS.%s(' % self.methodName
- # expand args
- for i in range(len(args)):
- arg = args[i]
- if isinstance(arg,types.StringType):
- com += "'%s'," % arg
- else:
-                com += '%s,' % str(arg)
- com = com[:-1]
- com += ")"
- # execute
- return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com))
-
-
-# native ToA method class
-class NativeTOAMethod:
- # constructor
- def __init__(self):
- self.methodName = None
- from dq2.info import TiersOfATLAS
- self.api = TiersOfATLAS
- # set method name
- def setName(self,methodName):
- self.methodName = methodName
- # method emulation
- def __call__(self,*args,**kwargs):
- try:
- methodObj = getattr(self.api,self.methodName)
- # execute
- retVal = apply(methodObj,args,kwargs)
- strRet = str(retVal)
- # return
- return 0,strRet
- except:
-            errType,errValue = sys.exc_info()[:2]
-            return 1,'%s %s' % (errType,errValue)
-
-
-# TOA module class
-class TOA:
- # constructor
- def __init__(self):
- self.usingNativeDQ2 = False
- self.nativeTOA = None
- # getter
- def __getattr__(self,methodName):
- if not ddm.usingNativeDQ2:
-            # run dq2 command in another session
- return _TOAMethod(methodName)
- else:
- # make method object
- if self.nativeTOA == None:
- self.nativeTOA = NativeTOAMethod()
- # run dq2 command in the same session
- self.nativeTOA.setName(methodName)
- return self.nativeTOA
-
-
-
-# instantiate
-toa = TOA()
-del TOA
-
-
-# method object wrapping Dashboard method
-class _DashBoradMethod:
- # constructor
- def __init__(self,methodName):
- self.methodName = methodName
-
- # method emulation
- def __call__(self,*args):
- args = list(args)
- # build command
- com = "import sys;sys.stderr=open('/dev/null','w');"
- com += "import datetime;from dashboard.api.data.DataQuery import DataQuery;"
- com += "sys.stderr=sys.__stderr__;"
- com += "dash=DataQuery('dashb-atlas-data.cern.ch', 80);"
- com += "print dash.%s(%s,'%s'," % (self.methodName,args[0],args[1])
- com += "startDate=datetime.datetime.utcnow()-datetime.timedelta(hours=24))"
- # execute
- return commands.getstatusoutput('%s python -c "%s"' % (_cwd,com))
-
-
-# Dashboard module class
-class DashBorad:
- def __getattr__(self,methodName):
- return _DashBoradMethod(methodName)
-
-# instantiate
-dashBorad = DashBorad()
-del DashBorad
-
-
-# method object wrapping DQ2Info method
-class _DQ2InfoMethod:
- # constructor
- def __init__(self,methodName):
- self.methodName = methodName
-
- # method emulation
- def __call__(self,*args):
- args = list(args)
- # build command
- com = 'from dq2.info.client.infoClient import infoClient; '
- com += 'print infoClient().%s(' % self.methodName
- # expand args
- for i in range(len(args)):
- arg = args[i]
- if isinstance(arg,types.StringType):
- com += "'%s'," % arg
- else:
-                com += '%s,' % str(arg)
- com = com[:-1]
- com += ")"
- # execute
- return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com))
-
-
-# DQ2 info module class
-class DQ2Info:
- def __getattr__(self,methodName):
- return _DQ2InfoMethod(methodName)
-
-
-# instantiate
-dq2Info = DQ2Info()
-del DQ2Info
-
-
-# method object wrapping dq2 common
-class _DQ2CommonMethod:
- # constructor
- def __init__(self,methodName):
- self.methodName = methodName
-
- # method emulation
- def __call__(self,*args):
- args = list(args)
- # build command
- com = 'from dq2.common import %s; ' % self.methodName
- com += 'print %s(' % self.methodName
- # expand args
- for i in range(len(args)):
- arg = args[i]
- if isinstance(arg,types.StringType):
- com += "'%s'," % arg
- else:
-                com += '%s,' % str(arg)
- com = com[:-1]
- com += ")"
- # execute
- return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com))
-
-
-# DQ2 common module class
-class DQ2Common:
- def __getattr__(self,methodName):
- return _DQ2CommonMethod(methodName)
-
-
-# instantiate
-dq2Common = DQ2Common()
-del DQ2Common
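
DDM.py above turns attribute access such as ddm.DQ2.listFilesInDataset(...) into a
"python -c" one-liner executed in a child process, so the DQ2 client never has to be
imported into the server's own interpreter. The sketch below reproduces that
__getattr__ proxy pattern in a stripped-down form; it assumes only simple literal
arguments, and the math module used in the demonstration is just a stand-in for the
DQ2 client API.

import subprocess

class _Method:
    """Callable that runs module.method(args) in a child python process, like _DQMethod."""
    def __init__(self, module, method):
        self.module = module
        self.method = method

    def __call__(self, *args):
        # render only simple literal arguments; the real wrapper also filters quote characters
        rendered = ','.join(repr(a) for a in args)
        code = 'import %s; print(%s.%s(%s))' % (self.module, self.module, self.method, rendered)
        proc = subprocess.run(['python3', '-c', code], capture_output=True, text=True)
        return proc.returncode, (proc.stdout or proc.stderr).strip()

class _Module:
    """Attribute access yields method proxies, mirroring the _DQModule factory above."""
    def __init__(self, module):
        self.module = module
    def __getattr__(self, method):
        return _Method(self.module, method)

# runs "import math; print(math.sqrt(2))" in a separate interpreter -> (0, '1.4142135623730951')
print(_Module('math').sqrt(2))
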
diff --git a/current/pandaserver/dataservice/DDMHandler.py b/current/pandaserver/dataservice/DDMHandler.py
deleted file mode 100755
index 165738c8e..000000000
--- a/current/pandaserver/dataservice/DDMHandler.py
+++ /dev/null
@@ -1,48 +0,0 @@
-'''
-master handler for DDM
-
-'''
-
-import re
-import threading
-
-from Waker import Waker
-from Finisher import Finisher
-from Activator import Activator
-
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('DDMHandler')
-
-
-class DDMHandler (threading.Thread):
- # constructor
- def __init__(self,taskBuffer,vuid,site=None):
- threading.Thread.__init__(self)
- self.vuid = vuid
- self.taskBuffer = taskBuffer
- self.site = site
-
-
- # main
- def run(self):
- # query dataset
- _logger.debug("start: %s %s" % (self.vuid,self.site))
- dataset = self.taskBuffer.queryDatasetWithMap({'vuid':self.vuid})
- if dataset == None:
- _logger.error("Not found : %s" % self.vuid)
- _logger.debug("end: %s" % self.vuid)
- return
- _logger.debug("vuid:%s type:%s name:%s" % (self.vuid,dataset.type,dataset.name))
- if dataset.type == 'dispatch':
- # activate jobs in jobsDefined
- Activator(self.taskBuffer,dataset).start()
- if dataset.type == 'output':
- if dataset.name != None and re.search('^panda\..*_zip$',dataset.name) != None:
- # start unmerge jobs
- Activator(self.taskBuffer,dataset,enforce=True).start()
- else:
- # finish transferring jobs
- Finisher(self.taskBuffer,dataset,site=self.site).start()
- _logger.debug("end: %s" % self.vuid)
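
DDMHandler.run() above is a three-way dispatch on the dataset record: dispatch datasets
activate the jobs waiting in jobsDefined, output datasets matching panda.*_zip trigger
unmerge jobs via Activator(enforce=True), and all other output datasets go to the
Finisher. The standalone sketch below restates that decision with made-up dataset names
so the split is easy to see; it is an illustration, not code from the original module.

import re

def pick_handler(ds_type, ds_name):
    """Mirror the branching in DDMHandler.run(): return which worker would be started."""
    if ds_type == 'dispatch':
        return 'Activator'                      # activate jobs waiting in jobsDefined
    if ds_type == 'output':
        if ds_name and re.search(r'^panda\..*_zip$', ds_name):
            return 'Activator(enforce=True)'    # kick off unmerge jobs
        return 'Finisher'                       # finish transferring jobs
    return None

# illustrative dataset names only
assert pick_handler('dispatch', 'panda.dis123') == 'Activator'
assert pick_handler('output', 'panda.0101_zip') == 'Activator(enforce=True)'
assert pick_handler('output', 'user.someone.test.log') == 'Finisher'
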
diff --git a/current/pandaserver/dataservice/DataService.py b/current/pandaserver/dataservice/DataService.py
deleted file mode 100755
index 540987e1a..000000000
--- a/current/pandaserver/dataservice/DataService.py
+++ /dev/null
@@ -1,99 +0,0 @@
-"""
-provide web service for DDM
-
-"""
-
-import re
-import sys
-import cPickle as pickle
-from config import panda_config
-from taskbuffer.WrappedPickle import WrappedPickle
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('DataService')
-
-
-class DataService:
- # constructor
- def __init__(self):
- self.taskBuffer = None
-
- # set taskbuffer
- def init(self,taskBuffer):
- self.taskBuffer = taskBuffer
-
-# Singleton
-dataService = DataService()
-del DataService
-
-
-'''
-web interface
-
-'''
-
-from DDMHandler import DDMHandler
-
-
-# callback for dataset verification
-def datasetCompleted(req,vuid,site=None):
- thr = DDMHandler(dataService.taskBuffer,vuid,site)
- thr.start()
- thr.join()
- return True
-
-
-# get FQANs
-def _getFQAN(req):
- fqans = []
- for tmpKey,tmpVal in req.subprocess_env.iteritems():
- # compact credentials
- if tmpKey.startswith('GRST_CRED_'):
- # VOMS attribute
- if tmpVal.startswith('VOMS'):
- # FQAN
- fqan = tmpVal.split()[-1]
- # append
- fqans.append(fqan)
- # old style
- elif tmpKey.startswith('GRST_CONN_'):
- tmpItems = tmpVal.split(':')
- # FQAN
- if len(tmpItems)==2 and tmpItems[0]=='fqan':
- fqans.append(tmpItems[-1])
- # return
- return fqans
-
-
-# set file status
-def updateFileStatusInDisp(req,dataset,fileStatus):
- try:
- # get FQAN
- fqans = _getFQAN(req)
- roleOK = False
- # loop over all FQANs
- for fqan in fqans:
- # check production role
- for rolePat in ['/atlas/usatlas/Role=production',
- '/atlas/Role=production',
- # use /atlas since delegation proxy doesn't inherit roles
- '/atlas/']:
- if fqan.startswith(rolePat):
- roleOK = True
- break
- if not roleOK:
- _logger.error('updateFileStatusInDisp : invalid proxy %s' % fqans)
- return "False"
- # deserialize fileStatus
- fileStatusMap = WrappedPickle.loads(fileStatus)
- _logger.debug('updateFileStatusInDisp : start %s - %s' % (dataset,fileStatusMap))
- # update status
- dataService.taskBuffer.updateFileStatusInDisp(dataset,fileStatusMap)
- _logger.debug('updateFileStatusInDisp : done')
- return "True"
- except:
- type,value,traceBack = sys.exc_info()
- _logger.error("updateFileStatusInDisp : %s %s" % (type,value))
- return "False"
-
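
_getFQAN() above recovers VOMS FQANs from the GRST_CRED_*/GRST_CONN_* variables that
mod_gridsite places in the request environment, and updateFileStatusInDisp() then
matches them against the production-role prefixes. The snippet below is a
self-contained copy of the extraction logic fed with a fabricated environment; the
exact value formats are inferred from the parsing code above and should be treated as
assumptions.

def extract_fqans(subprocess_env):
    """Standalone version of _getFQAN(): pull FQANs out of mod_gridsite variables."""
    fqans = []
    for key, val in subprocess_env.items():
        if key.startswith('GRST_CRED_') and val.startswith('VOMS'):
            fqans.append(val.split()[-1])           # last token is the FQAN
        elif key.startswith('GRST_CONN_'):
            items = val.split(':')
            if len(items) == 2 and items[0] == 'fqan':
                fqans.append(items[-1])
    return fqans

# fabricated environment, shaped after the parsing above
fake_env = {
    'GRST_CRED_1': 'VOMS 123 456 0 /atlas/Role=production',
    'GRST_CONN_2': 'fqan:/atlas/usatlas',
    'UNRELATED': 'ignored',
}
print(extract_fqans(fake_env))   # -> ['/atlas/Role=production', '/atlas/usatlas']
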
diff --git a/current/pandaserver/dataservice/DataServiceUtils.py b/current/pandaserver/dataservice/DataServiceUtils.py
deleted file mode 100644
index 0e4093cbb..000000000
--- a/current/pandaserver/dataservice/DataServiceUtils.py
+++ /dev/null
@@ -1,281 +0,0 @@
-import re
-import sys
-
-# get prefix for DQ2
-def getDQ2Prefix(dq2SiteID):
- # prefix of DQ2 ID
- tmpDQ2IDPrefix = re.sub('_[A-Z,0-9]+DISK$','',dq2SiteID)
- # remove whitespace
- tmpDQ2IDPrefix = tmpDQ2IDPrefix.strip()
-    # patch for MWT2
- if tmpDQ2IDPrefix == 'MWT2_UC':
- tmpDQ2IDPrefix = 'MWT2'
- return tmpDQ2IDPrefix
-
-
-# check if the file is cached
-def isCachedFile(datasetName,siteSpec):
- # using CVMFS
- if siteSpec.iscvmfs != True:
- return False
- # FIXME
- if not siteSpec.cloud in ['IT']:
- return False
- # look for DBR
- if not datasetName.startswith('ddo.'):
- return False
-    # look for a six-digit version suffix
- if re.search('v\d{6}$',datasetName) == None:
- return False
- return True
-
-
-# get the list of sites where dataset is available
-def getSitesWithDataset(tmpDsName,siteMapper,replicaMap,cloudKey,useHomeCloud=False,getDQ2ID=False,
- useOnlineSite=False,includeT1=False):
- retList = []
- retDQ2Map = {}
- # no replica map
- if not replicaMap.has_key(tmpDsName):
- if getDQ2ID:
- return retDQ2Map
- return retList
- # use valid cloud
- if not siteMapper.checkCloud(cloudKey):
- if getDQ2ID:
- return retDQ2Map
- return retList
- # check sites in the cloud
- for tmpSiteName in siteMapper.getCloud(cloudKey)['sites']:
- # skip T1
- if not includeT1:
- # T1
- if tmpSiteName == siteMapper.getCloud(cloudKey)['source']:
- continue
- # hospital queue
- if siteMapper.getSite(tmpSiteName).ddm == siteMapper.getSite(siteMapper.getCloud(cloudKey)['source']).ddm:
- continue
- # use home cloud
- if useHomeCloud:
- if siteMapper.getSite(tmpSiteName).cloud != cloudKey:
- continue
- # online
- if siteMapper.getSite(tmpSiteName).status != 'online':
- continue
- # check all associated DQ2 IDs
- tmpFoundFlag = False
- tmpSiteSpec = siteMapper.getSite(tmpSiteName)
- for tmpSiteDQ2ID in [tmpSiteSpec.ddm]+tmpSiteSpec.setokens.values():
- # prefix of DQ2 ID
- tmpDQ2IDPrefix = getDQ2Prefix(tmpSiteDQ2ID)
- # ignore empty
- if tmpDQ2IDPrefix == '':
- continue
- # loop over all replica DQ2 IDs
- for tmpDQ2ID in replicaMap[tmpDsName].keys():
- # use DATADISK or GROUPDISK
- if '_SCRATCHDISK' in tmpDQ2ID or \
- '_USERDISK' in tmpDQ2ID or \
- '_PRODDISK' in tmpDQ2ID or \
- '_LOCALGROUPDISK' in tmpDQ2ID or \
- 'TAPE' in tmpDQ2ID or \
- '_DAQ' in tmpDQ2ID or \
- '_TMPDISK' in tmpDQ2ID or \
- '_TZERO' in tmpDQ2ID:
- continue
- # check DQ2 prefix
- if tmpDQ2ID.startswith(tmpDQ2IDPrefix):
- tmpFoundFlag = True
- if not getDQ2ID:
- break
- # append map
- if not retDQ2Map.has_key(tmpSiteName):
- retDQ2Map[tmpSiteName] = []
- if not tmpDQ2ID in retDQ2Map[tmpSiteName]:
- retDQ2Map[tmpSiteName].append(tmpDQ2ID)
- # append
- if tmpFoundFlag:
- retList.append(tmpSiteName)
- # return map
- if getDQ2ID:
- return retDQ2Map
-    # return
- return retList
-
-
-# get the number of files available at the site
-def getNumAvailableFilesSite(siteName,siteMapper,replicaMap,badMetaMap,additionalSEs=[],
- noCheck=[],fileCounts=None):
- try:
- # get DQ2 endpoints
- tmpSiteSpec = siteMapper.getSite(siteName)
- prefixList = []
- for tmpSiteDQ2ID in [tmpSiteSpec.ddm]+tmpSiteSpec.setokens.values():
- # prefix of DQ2 ID
- tmpDQ2IDPrefix = getDQ2Prefix(tmpSiteDQ2ID)
- # ignore empty
- if tmpDQ2IDPrefix != '':
- prefixList.append(tmpDQ2IDPrefix)
- # loop over datasets
- totalNum = 0
- for tmpDsName,tmpSitesData in replicaMap.iteritems():
- # cached files
- if isCachedFile(tmpDsName,tmpSiteSpec) and fileCounts != None and \
- fileCounts.has_key(tmpDsName):
- # add with no check
- totalNum += fileCounts[tmpDsName]
- continue
- # dataset type
- datasetType = getDatasetType(tmpDsName)
- # use total num to effectively skip file availability check
- if datasetType in noCheck:
- columnName = 'total'
- else:
- columnName = 'found'
- # get num of files
- maxNumFile = 0
- # for T1 or T2
- if additionalSEs != []:
- # check T1 endpoints
- for tmpSePat in additionalSEs:
- # ignore empty
- if tmpSePat == '':
- continue
- # make regexp pattern
- if '*' in tmpSePat:
- tmpSePat = tmpSePat.replace('*','.*')
- tmpSePat = '^' + tmpSePat +'$'
- # loop over all sites
- for tmpSE in tmpSitesData.keys():
- # skip bad metadata
- if badMetaMap.has_key(tmpDsName) and tmpSE in badMetaMap[tmpDsName]:
- continue
- # check match
- if re.search(tmpSePat,tmpSE) == None:
- continue
- # get max num of files
- tmpN = tmpSitesData[tmpSE][0][columnName]
- if tmpN != None and tmpN > maxNumFile:
- maxNumFile = tmpN
- else:
- # check explicit endpoint name
- for tmpSiteDQ2ID in [tmpSiteSpec.ddm]+tmpSiteSpec.setokens.values():
- # skip bad metadata
- if badMetaMap.has_key(tmpDsName) and tmpSiteDQ2ID in badMetaMap[tmpDsName]:
- continue
- # ignore empty
- if tmpSiteDQ2ID == '':
- continue
- # get max num of files
- if tmpSitesData.has_key(tmpSiteDQ2ID):
- tmpN = tmpSitesData[tmpSiteDQ2ID][0][columnName]
- if tmpN != None and tmpN > maxNumFile:
- maxNumFile = tmpN
- # check prefix
- for tmpDQ2IDPrefix in prefixList:
- for tmpDQ2ID,tmpStat in tmpSitesData.iteritems():
- # skip bad metadata
- if badMetaMap.has_key(tmpDsName) and tmpDQ2ID in badMetaMap[tmpDsName]:
- continue
- # ignore NG
- if '_SCRATCHDISK' in tmpDQ2ID or \
- '_USERDISK' in tmpDQ2ID or \
- '_PRODDISK' in tmpDQ2ID or \
- '_LOCALGROUPDISK' in tmpDQ2ID or \
- '_DAQ' in tmpDQ2ID or \
- '_TMPDISK' in tmpDQ2ID or \
- '_TZERO' in tmpDQ2ID:
- continue
- # check prefix
- if tmpDQ2ID.startswith(tmpDQ2IDPrefix):
- tmpN = tmpSitesData[tmpDQ2ID][0][columnName]
- if tmpN != None and tmpN > maxNumFile:
- maxNumFile = tmpN
- # sum
- totalNum += maxNumFile
- # return
- return True,totalNum
- except:
- errtype,errvalue = sys.exc_info()[:2]
- return False,'%s:%s' % (errtype,errvalue)
-
-
-# get the list of T1 endpoints where the dataset is available
-def getEndpointsAtT1(tmpRepMap,siteMapper,cloudName):
- retList = []
- # get cloud SEs
- tmpCloud = siteMapper.getCloud(cloudName)
- cloudSEs = tmpCloud['tier1SE']
- # check T1 endpoints
- for tmpSePat in cloudSEs:
- # ignore empty
- if tmpSePat == '':
- continue
- # make regexp pattern
- if '*' in tmpSePat:
- tmpSePat = tmpSePat.replace('*','.*')
- tmpSePat = '^' + tmpSePat +'$'
- # loop over all sites
- for tmpSE in tmpRepMap.keys():
- # check match
- if re.search(tmpSePat,tmpSE) == None:
- continue
- # append
- if not tmpSE in retList:
- retList.append(tmpSE)
- # return
- return retList
-
-
-# check DDM response
-def isDQ2ok(out):
- if out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1 \
- or out == '()':
- return False
- return True
-
-
-# check if DBR
-def isDBR(datasetName):
- if datasetName.startswith('ddo'):
- return True
- return False
-
-
-# get the list of sites in a cloud which cache a dataset
-def getSitesWithCacheDS(cloudKey,excludedSites,siteMapper,datasetName):
- retList = []
- # check sites in the cloud
- for tmpSiteName in siteMapper.getCloud(cloudKey)['sites']:
- # excluded
- if tmpSiteName in excludedSites:
- continue
- # skip T1
- if tmpSiteName == siteMapper.getCloud(cloudKey)['source']:
- continue
- # hospital queue
- if siteMapper.getSite(tmpSiteName).ddm == siteMapper.getSite(siteMapper.getCloud(cloudKey)['source']).ddm:
- continue
- # not home cloud
- if siteMapper.getSite(tmpSiteName).cloud != cloudKey:
- continue
- # online
- if siteMapper.getSite(tmpSiteName).status != 'online':
- continue
- # check CVMFS
- if isCachedFile(datasetName,siteMapper.getSite(tmpSiteName)):
- retList.append(tmpSiteName)
- # return
- return retList
-
-
-# get dataset type
-def getDatasetType(dataset):
- datasetType = None
- try:
- datasetType = dataset.split('.')[4]
- except:
- pass
- return datasetType
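
Most helpers in DataServiceUtils.py above are string surgery on DDM endpoint and
dataset names: getDQ2Prefix() strips the space-token suffix (with an MWT2 special
case) and getDatasetType() picks the fifth dot-separated field. The snippet below
restates the two transformations with invented endpoint and dataset names to show the
expected shapes; it is illustrative only.

import re

def dq2_prefix(dq2_site_id):
    """Same idea as getDQ2Prefix(): strip the space-token suffix from an endpoint name."""
    prefix = re.sub('_[A-Z,0-9]+DISK$', '', dq2_site_id).strip()
    return 'MWT2' if prefix == 'MWT2_UC' else prefix

def dataset_type(dataset):
    """Same idea as getDatasetType(): the fifth dot-separated field, or None."""
    try:
        return dataset.split('.')[4]
    except IndexError:
        return None

# invented names for illustration
print(dq2_prefix('CERN-PROD_DATADISK'))      # -> 'CERN-PROD'
print(dq2_prefix('MWT2_UC_SCRATCHDISK'))     # -> 'MWT2'
print(dataset_type('data12_8TeV.00200000.physics_Egamma.merge.AOD.f432_m1108'))   # -> 'AOD'
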
diff --git a/current/pandaserver/dataservice/DynDataDistributer.py b/current/pandaserver/dataservice/DynDataDistributer.py
deleted file mode 100644
index 8a808a54c..000000000
--- a/current/pandaserver/dataservice/DynDataDistributer.py
+++ /dev/null
@@ -1,1657 +0,0 @@
-'''
-find candidate sites to which input datasets should be distributed
-
-'''
-
-import re
-import sys
-import time
-import math
-import types
-import random
-import datetime
-
-from dataservice.DDM import ddm
-from dataservice.DDM import toa
-from taskbuffer.JobSpec import JobSpec
-import brokerage.broker
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('DynDataDistributer')
-
-def initLogger(pLogger):
- # redirect logging to parent
- global _logger
- _logger = pLogger
-
-
-# NG datasets
-ngDataTypes = ['RAW','HITS','RDO','ESD','EVNT']
-
-# excluded provenance
-ngProvenance = []
-
-# protection for max number of replicas
-protectionMaxNumReplicas = 10
-
-# max number of waiting jobs
-maxWaitingJobs = 200
-
-# max number of waiting jobsets
-maxWaitingJobsets = 2
-
-# clouds with small T1 to make replica at T2
-cloudsWithSmallT1 = ['IT']
-
-# files in datasets
-g_filesInDsMap = {}
-
-
-class DynDataDistributer:
-
- # constructor
- def __init__(self,jobs,taskBuffer,siteMapper,simul=False,token=None):
- self.jobs = jobs
- self.taskBuffer = taskBuffer
- self.siteMapper = siteMapper
- if token == None:
- self.token = datetime.datetime.utcnow().isoformat(' ')
- else:
- self.token = token
- # use a fixed list since some clouds don't have active T2s
- self.pd2pClouds = ['CA','DE','ES','FR','IT','ND','NL','TW','UK','US']
- self.simul = simul
- self.lastMessage = ''
- self.cachedSizeMap = {}
- self.shareMoUForT2 = None
- self.mapTAGandParentGUIDs = {}
- self.tagParentInfo = {}
- self.parentLfnToTagMap = {}
-
-
- # main
- def run(self):
- try:
- self.putLog("start for %s" % self.jobs[0].PandaID)
- # check cloud
- if not self.jobs[0].cloud in self.pd2pClouds+['CERN',]:
- self.putLog("skip cloud=%s not one of PD2P clouds %s" % (self.jobs[0].cloud,str(self.pd2pClouds)))
- self.putLog("end for %s" % self.jobs[0].PandaID)
- return
-            # ignore HC and gangarobot jobs
- if self.jobs[0].processingType in ['hammercloud','gangarobot'] or self.jobs[0].processingType.startswith('gangarobot'):
- self.putLog("skip due to processingType=%s" % self.jobs[0].processingType)
- self.putLog("end for %s" % self.jobs[0].PandaID)
- return
-            # ignore group production
- if not self.jobs[0].workingGroup in ['NULL',None,'']:
- self.putLog("skip due to workingGroup=%s" % self.jobs[0].workingGroup)
- self.putLog("end for %s" % self.jobs[0].PandaID)
- return
- # get input datasets
- inputDatasets = []
- for tmpJob in self.jobs:
- if tmpJob.prodSourceLabel == 'user':
- for tmpFile in tmpJob.Files:
- if tmpFile.type == 'input' and not tmpFile.lfn.endswith('.lib.tgz'):
- if not tmpFile.dataset in inputDatasets:
- inputDatasets.append(tmpFile.dataset)
- # loop over all input datasets
- for inputDS in inputDatasets:
- # only mc/data datasets
- moveFlag = False
- for projectName in ['mc','data']:
- if inputDS.startswith(projectName):
- moveFlag = True
- if not moveFlag:
- self.putLog("skip non official dataset %s" % inputDS)
- continue
- if re.search('_sub\d+$',inputDS) != None or re.search('_dis\d+$',inputDS) != None:
- self.putLog("skip dis/sub dataset %s" % inputDS)
- continue
- # check type
- tmpItems = inputDS.split('.')
- if len(tmpItems) < 5:
- self.putLog("cannot get type from %s" % inputDS)
- continue
- if tmpItems[4] in ngDataTypes:
- self.putLog("don't move %s : %s" % (tmpItems[4],inputDS))
- continue
- # get candidate sites
- self.putLog("get candidates for %s" % inputDS)
- status,sitesMaps = self.getCandidates(inputDS,useCloseSites=True)
- if not status:
- self.putLog("failed to get candidates")
- continue
- # get size of input container
- totalInputSize = 0
- if inputDS.endswith('/'):
- status,totalInputSize = self.getDatasetSize(inputDS)
- if not status:
- self.putLog("failed to get size of %s" % inputDS)
- continue
- # get number of waiting jobs and jobsets
- nWaitingJobsAll = self.taskBuffer.getNumWaitingJobsForPD2P(inputDS)
- nWaitingJobsets = self.taskBuffer.getNumWaitingJobsetsForPD2P(inputDS)
- # loop over all datasets
- usedSites = []
- for tmpDS,tmpVal in sitesMaps.iteritems():
- self.putLog("triggered for %s" % tmpDS,sendLog=True)
- # increment used counter
- if not self.simul:
- nUsed = self.taskBuffer.incrementUsedCounterSubscription(tmpDS)
- else:
- nUsed = 5
- # insert dummy for new dataset which is used to keep track of usage even if subscription is not made
- if nUsed == 0:
- retAddUserSub = self.taskBuffer.addUserSubscription(tmpDS,['DUMMY'])
- if not retAddUserSub:
- self.putLog("failed to add dummy subscription to database for %s " % tmpDS,type='error',sendLog=True)
- continue
- # collect candidates
- allCandidates = []
- totalUserSub = 0
- allCompPd2pSites = []
- allOKClouds = []
- totalSecReplicas = 0
- allT1Candidates = []
- totalT1Sub = 0
- cloudCandMap = {}
- nReplicasInCloud = {}
- allCandidatesMoU = []
- nTier1Copies = 0
- for tmpCloud,(candSites,sitesComDS,sitesPd2pDS,nUserSub,t1HasReplica,t1HasPrimary,nSecReplicas,nT1Sub,candForMoU) in tmpVal.iteritems():
- self.putLog("%s sites with comp DS:%s compPD2P:%s candidates:%s nSub:%s T1:%s Pri:%s nSec:%s nT1Sub:%s candMoU:%s" % \
- (tmpCloud,str(sitesComDS),str(sitesPd2pDS),str(candSites),nUserSub,t1HasReplica,t1HasPrimary,
- nSecReplicas,nT1Sub,str(candForMoU)))
- # add
- totalUserSub += nUserSub
- totalT1Sub += nT1Sub
- allCompPd2pSites += sitesPd2pDS
- totalSecReplicas += nSecReplicas
- cloudCandMap[tmpCloud] = candSites
- nReplicasInCloud[tmpCloud] = len(sitesComDS) + len(sitesPd2pDS)
- # cloud is candidate for T1-T1 when T1 doesn't have primary or secondary replicas or old subscriptions
- if not t1HasPrimary and nSecReplicas == 0 and nT1Sub == 0:
- allT1Candidates.append(tmpCloud)
- # the number of T1s with replica
- if t1HasPrimary or nSecReplicas > 0:
- nTier1Copies += 1
- # add candidates
- for tmpCandSite in candSites:
- if not tmpCandSite in usedSites:
- allCandidates.append(tmpCandSite)
- # add candidates for MoU
- for tmpCandSite in candForMoU:
- if not tmpCandSite in usedSites:
- allCandidatesMoU.append(tmpCandSite)
- # add clouds
- if not tmpCloud in allOKClouds:
- allOKClouds.append(tmpCloud)
- self.putLog("PD2P sites with comp replicas : %s" % str(allCompPd2pSites))
- self.putLog("PD2P T2 candidates : %s" % str(allCandidates))
- self.putLog("PD2P T2 MoU candidates : %s" % str(allCandidatesMoU))
- self.putLog("PD2P # of T2 subscriptions : %s" % totalUserSub)
- self.putLog("PD2P # of T1 secondaries : %s" % totalSecReplicas)
- self.putLog("PD2P # of T1 subscriptions : %s" % nT1Sub)
- self.putLog("PD2P # of T1 replicas : %s" % nTier1Copies)
- self.putLog("PD2P T1 candidates : %s" % str(allT1Candidates))
- self.putLog("PD2P nUsed : %s" % nUsed)
- # get dataset size
- retDsSize,dsSize = self.getDatasetSize(tmpDS)
- if not retDsSize:
- self.putLog("failed to get dataset size of %s" % tmpDS,type='error',sendLog=True)
- continue
- self.putLog("PD2P nWaitingJobsets : %s" % nWaitingJobsets)
- if totalInputSize != 0:
- self.putLog("PD2P nWaitingJobs : %s = %s(all)*%s(dsSize)/%s(contSize)" % \
- (int((float(nWaitingJobsAll * dsSize) / float(totalInputSize))),
- nWaitingJobsAll,dsSize,totalInputSize))
- else:
- self.putLog("PD2P nWaitingJobs : %s = %s(all)" % \
- (nWaitingJobsAll,nWaitingJobsAll))
- # make T1-T1
- triggeredT1PD2P = False
- if nUsed > 0:
- # extract integer part. log10(nUsed) and log10(nUsed)+1 are used to avoid round-off error
- intLog10nUsed = int(math.log10(nUsed))
- if self.simul or (int(math.log10(nUsed)) > totalSecReplicas and \
- (nUsed == 10**intLog10nUsed or nUsed == 10**(intLog10nUsed+1)) and \
- nT1Sub == 0 and allT1Candidates != []):
- self.putLog("making T1-T1",sendLog=True)
- # make subscription
- retT1Sub,useSmallT1 = self.makeT1Subscription(allT1Candidates,tmpDS,dsSize,nUsed)
- self.putLog("done for T1-T1")
- triggeredT1PD2P = True
- # make a T2 copy when T1 PD2P was triggered
- if triggeredT1PD2P:
- # TODO
- retT2MoU,selectedSite = self.makeT2SubscriptionMoU(allCandidatesMoU,tmpDS,dsSize,'T1MOU',nUsed)
- if retT2MoU and selectedSite != None:
- # remove from candidate list
- if selectedSite in allCandidates:
- allCandidates.remove(selectedSite)
- if selectedSite in allCandidatesMoU:
- allCandidatesMoU.remove(selectedSite)
- # increment the number of T2 subscriptions
- totalUserSub += 1
- # set the number of T2 PD2P replicas
- maxSitesHaveDS = 1
- # additional replicas
- if nWaitingJobsets > maxWaitingJobsets:
- # the number of waiting jobs for this dataset
- if totalInputSize != 0:
- # dataset in container
- tmpN = float(nWaitingJobsAll * dsSize) / float(totalInputSize)
- else:
- # dataset
- tmpN = float(nWaitingJobsAll)
- tmpN = int(math.log10(tmpN/float(maxWaitingJobs))) + nTier1Copies
- maxSitesHaveDS = max(maxSitesHaveDS,tmpN)
- # protection against too many replications
- maxSitesHaveDS = min(maxSitesHaveDS,protectionMaxNumReplicas)
- self.putLog("PD2P maxSitesHaveDS : %s" % maxSitesHaveDS)
- # ignore the first job
- if nUsed == 0:
- self.putLog("skip the first job",
- sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'FIRSTJOB','dataset':tmpDS})
- if not self.simul:
- continue
- # check number of replicas
- if len(allCompPd2pSites) >= maxSitesHaveDS and nUsed != 1:
- self.putLog("skip since many T2 PD2P sites (%s>=%s) have the replica" % (len(allCompPd2pSites),maxSitesHaveDS),
- sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'TOO_MANY_T2_REPLICAS','dataset':tmpDS})
- if not self.simul:
- continue
- # check the number of subscriptions
- maxNumSubInAllCloud = max(0,maxSitesHaveDS-len(allCompPd2pSites))
- maxNumSubInAllCloud = min(2,maxNumSubInAllCloud)
- self.putLog("PD2P maxNumSubInAllCloud : %s" % maxNumSubInAllCloud)
- if totalUserSub >= maxNumSubInAllCloud:
- self.putLog("skip since enough subscriptions (%s>=%s) were already made for T2 PD2P" % \
- (totalUserSub,maxNumSubInAllCloud),
- sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'TOO_MANY_T2_SUBSCRIPTIONS','dataset':tmpDS})
- if not self.simul:
- continue
- # no candidates
- if len(allCandidates) == 0:
- self.putLog("skip since no candidates",sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'NO_T2_CANDIDATE','dataset':tmpDS})
- continue
- # get inverse weight for brokerage
- weightForBrokerage = self.getWeightForBrokerage(allCandidates,tmpDS,nReplicasInCloud)
- self.putLog("inverse weight %s" % str(weightForBrokerage))
- # get free disk size
- self.putLog("getting free disk size for T2 PD2P")
- retFreeSizeMap,freeSizeMap = self.getFreeDiskSize(tmpDS,allCandidates)
- if not retFreeSizeMap:
- self.putLog("failed to get free disk size",type='error',sendLog=True)
- continue
- # run brokerage
- tmpJob = JobSpec()
- tmpJob.AtlasRelease = ''
- self.putLog("run brokerage for %s" % tmpDS)
- usedWeight = brokerage.broker.schedule([tmpJob],self.taskBuffer,self.siteMapper,True,allCandidates,
- True,specialWeight=weightForBrokerage,getWeight=True,
- sizeMapForCheck=freeSizeMap,datasetSize=dsSize)
- selectedSite = tmpJob.computingSite
- for tmpWeightSite,tmpWeightStr in usedWeight.iteritems():
- tmpTagsMap = {'site':tmpWeightSite,'weight':tmpWeightStr,'dataset':tmpDS}
- if tmpWeightSite == selectedSite:
- if nUsed == 1:
- tmpActionTag = 'SELECTEDT2_JOB'
- elif len(allCompPd2pSites) == 0:
- tmpActionTag = 'SELECTEDT2_NOREP'
- else:
- tmpActionTag = 'SELECTEDT2_WAIT'
- tmpTagsMap['nused'] = nUsed
- tmpTagsMap['nwaitingjobs'] = nWaitingJobsAll
- tmpTagsMap['nwaitingjobsets'] = nWaitingJobsets
- tmpTagsMap['nsiteshaveds'] = len(allCompPd2pSites)
- else:
- tmpActionTag = 'UNSELECTEDT2'
- self.putLog("weight %s %s" % (tmpWeightSite,tmpWeightStr),sendLog=True,
- actionTag=tmpActionTag,tagsMap=tmpTagsMap)
- self.putLog("site for T2 PD2P -> %s" % selectedSite)
- # remove from candidate list
- if selectedSite in allCandidates:
- allCandidates.remove(selectedSite)
- if selectedSite in allCandidatesMoU:
- allCandidatesMoU.remove(selectedSite)
- # make subscription
- if not self.simul:
- subRet,dq2ID = self.makeSubscription(tmpDS,selectedSite,ddmShare='secondary')
- self.putLog("made subscription to %s:%s" % (selectedSite,dq2ID),sendLog=True)
- usedSites.append(selectedSite)
- # update database
- if subRet:
- self.taskBuffer.addUserSubscription(tmpDS,[dq2ID])
- # additional T2 copy with MoU share when it is the second submission
- if nUsed == 1 or self.simul:
- retT2MoU,selectedSite = self.makeT2SubscriptionMoU(allCandidatesMoU,tmpDS,dsSize,'T2MOU',nUsed)
- self.putLog("end for %s" % self.jobs[0].PandaID)
- except:
- errType,errValue = sys.exc_info()[:2]
- self.putLog("%s %s" % (errType,errValue),'error')
-
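
# --- illustrative sketch (editorial addition, not part of DynDataDistributer.py) -------
# run() above triggers an extra T1-T1 copy only when the dataset's usage counter hits an
# exact power of ten exceeding the number of existing T1 secondary replicas, and it sizes
# the T2 replica target from the log10 of the waiting-job backlog.  The helpers below
# restate that arithmetic in isolation (ignoring the container-size scaling and the
# candidate-cloud check) with maxWaitingJobs/maxWaitingJobsets/protectionMaxNumReplicas
# inlined as defaults.
import math

def t1_copy_wanted(n_used, n_t1_secondaries, n_t1_subscriptions):
    # trigger only on exact powers of ten beyond the existing secondaries; the second
    # power-of-ten test mirrors the round-off guard in the original condition
    if n_used <= 0 or n_t1_subscriptions != 0:
        return False
    exponent = int(math.log10(n_used))
    return exponent > n_t1_secondaries and n_used in (10 ** exponent, 10 ** (exponent + 1))

def target_t2_replicas(n_waiting_jobs, n_waiting_jobsets, n_tier1_copies,
                       max_waiting_jobs=200, max_waiting_jobsets=2, protection_cap=10):
    # at least one T2 PD2P replica, more when the backlog is deep, capped for protection
    target = 1
    if n_waiting_jobsets > max_waiting_jobsets and n_waiting_jobs > 0:
        target = max(target, int(math.log10(float(n_waiting_jobs) / max_waiting_jobs)) + n_tier1_copies)
    return min(target, protection_cap)

# e.g. t1_copy_wanted(100, 1, 0) -> True ; target_t2_replicas(5000, 3, 1) -> 2
# ----------------------------------------------------------------------------------------
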
-
- # get candidate sites for subscription
- def getCandidates(self,inputDS,checkUsedFile=True,useHidden=False,useCloseSites=False):
- # return for failure
- failedRet = False,{'':{'':([],[],[],0,False,False,0,0,[])}}
- # get replica locations
- if inputDS.endswith('/'):
- # container
- status,tmpRepMaps = self.getListDatasetReplicasInContainer(inputDS)
- # get used datasets
- if status and checkUsedFile:
- status,tmpUsedDsList = self.getUsedDatasets(tmpRepMaps)
- # remove unused datasets
- newRepMaps = {}
- for tmpKey,tmpVal in tmpRepMaps.iteritems():
- if tmpKey in tmpUsedDsList:
- newRepMaps[tmpKey] = tmpVal
- tmpRepMaps = newRepMaps
- else:
- # normal dataset
- status,tmpRepMap = self.getListDatasetReplicas(inputDS)
- tmpRepMaps = {inputDS:tmpRepMap}
- if not status:
- # failed
- self.putLog("failed to get replica locations for %s" % inputDS,'error')
- return failedRet
- # get close sites
- closeSitesMap = {}
- for tmpDS,tmpRepMap in tmpRepMaps.iteritems():
- # loop over all DQ2 IDs
- for tmpDQ2ID in tmpRepMap.keys():
- if not closeSitesMap.has_key(tmpDQ2ID):
- status,tmpCloseSiteList = toa.getCloseSites(tmpDQ2ID)
- exec "tmpCloseSiteList = %s" % tmpCloseSiteList
- closeSitesMap[tmpDQ2ID] = []
- # select only DATADISK
- for tmpCloseSite in tmpCloseSiteList:
- if tmpCloseSite.endswith('_DATADISK'):
- closeSitesMap[tmpDQ2ID].append(tmpCloseSite)
- # get all sites
- allSiteMap = {}
- for tmpSiteName,tmpSiteSpec in self.siteMapper.siteSpecList.iteritems():
- # check cloud
- if not tmpSiteSpec.cloud in self.pd2pClouds:
- continue
- # ignore test sites
- if 'test' in tmpSiteName.lower():
- continue
- # analysis only
- if not tmpSiteName.startswith('ANALY'):
- continue
- # online
- if not tmpSiteSpec.status in ['online']:
- self.putLog("skip %s due to status=%s" % (tmpSiteName,tmpSiteSpec.status))
- continue
- if not allSiteMap.has_key(tmpSiteSpec.cloud):
- allSiteMap[tmpSiteSpec.cloud] = []
- allSiteMap[tmpSiteSpec.cloud].append(tmpSiteSpec)
- # NG DQ2 IDs
- ngDQ2SuffixList = ['LOCALGROUPDISK']
- # loop over all clouds
- returnMap = {}
- checkedMetaMap = {}
- userSubscriptionsMap = {}
- for cloud in self.pd2pClouds:
- # DQ2 prefix of T1
- tmpT1SiteID = self.siteMapper.getCloud(cloud)['source']
- tmpT1DQ2ID = self.siteMapper.getSite(tmpT1SiteID).ddm
- prefixDQ2T1 = re.sub('[^_]+DISK$','',tmpT1DQ2ID)
- # loop over all datasets
- for tmpDS,tmpRepMap in tmpRepMaps.iteritems():
- candSites = []
- sitesComDS = []
- sitesCompPD2P = []
- # check metadata
- if not checkedMetaMap.has_key(tmpDS):
- checkedMetaMap[tmpDS] = self.getDatasetMetadata(tmpDS)
- retMeta,tmpMetadata = checkedMetaMap[tmpDS]
- if not retMeta:
- self.putLog("failed to get metadata for %s" % tmpDS,'error')
- return failedRet
- if tmpMetadata['provenance'] in ngProvenance:
- self.putLog("provenance=%s of %s is excluded" % (tmpMetadata['provenance'],tmpDS))
- continue
- if tmpMetadata['hidden'] in [True,'True'] and not useHidden:
- self.putLog("%s is hidden" % tmpDS)
- continue
- # check T1 has a replica and get close sites
- t1HasReplica = False
- t1HasPrimary = False
- nSecReplicas = 0
- closeSiteList = []
- candForMoU = []
- for tmpDQ2ID,tmpStatMap in tmpRepMap.iteritems():
- # check NG suffix
- ngSuffixFlag = False
- for tmpNGSuffix in ngDQ2SuffixList:
- if tmpDQ2ID.endswith(tmpNGSuffix):
- ngSuffixFlag = True
- break
- if ngSuffixFlag:
- continue
- # get close sites
- if closeSitesMap.has_key(tmpDQ2ID):
- for tmpCloseSiteID in closeSitesMap[tmpDQ2ID]:
- if not tmpCloseSiteID in closeSiteList:
- closeSiteList.append(tmpCloseSiteID)
- # checks for T1
- if tmpDQ2ID.startswith(prefixDQ2T1):
- if tmpStatMap[0]['total'] == tmpStatMap[0]['found']:
- t1HasReplica = True
- # check replica metadata to get archived info
- retRepMeta,tmpRepMetadata = self.getReplicaMetadata(tmpDS,tmpDQ2ID)
- if not retRepMeta:
- self.putLog("failed to get replica metadata for %s:%s" % \
- (tmpDS,tmpDQ2ID),'error')
- return failedRet
- # check archived field
- if isinstance(tmpRepMetadata,types.DictType) and tmpRepMetadata.has_key('archived') and \
- tmpRepMetadata['archived'] == 'primary':
- # primary
- t1HasPrimary = True
- break
- elif isinstance(tmpRepMetadata,types.DictType) and tmpRepMetadata.has_key('archived') and \
- tmpRepMetadata['archived'] == 'secondary':
- # secondary
- nSecReplicas += 1
- break
- self.putLog("close sites : %s" % str(closeSiteList))
- # get on-going subscriptions
- timeRangeSub = 7
- if not userSubscriptionsMap.has_key(tmpDS):
- userSubscriptionsMap[tmpDS] = self.taskBuffer.getUserSubscriptions(tmpDS,timeRangeSub)
- userSubscriptions = userSubscriptionsMap[tmpDS]
- # unused cloud
- if not allSiteMap.has_key(cloud):
- continue
- # count the number of T1 subscriptions
- nT1Sub = 0
- for tmpUserSub in userSubscriptions:
- if tmpUserSub.startswith(prefixDQ2T1):
- nT1Sub += 1
- # check sites
- nUserSub = 0
- for tmpSiteSpec in allSiteMap[cloud]:
- # check cloud
- if tmpSiteSpec.cloud != cloud:
- continue
- # prefix of DQ2 ID
- prefixDQ2 = re.sub('[^_]+DISK$','',tmpSiteSpec.ddm)
- # skip T1
- if prefixDQ2 == prefixDQ2T1:
- continue
- # check if corresponding DQ2 ID is a replica location
- hasReplica = False
- for tmpDQ2ID,tmpStatMap in tmpRepMap.iteritems():
- # check NG suffix
- ngSuffixFlag = False
- for tmpNGSuffix in ngDQ2SuffixList:
- if tmpDQ2ID.endswith(tmpNGSuffix):
- ngSuffixFlag = True
- break
- if ngSuffixFlag:
- continue
- if tmpDQ2ID.startswith(prefixDQ2):
- if tmpStatMap[0]['total'] == tmpStatMap[0]['found']:
- # complete
- sitesComDS.append(tmpSiteSpec.sitename)
- if tmpSiteSpec.cachedse == 1:
- sitesCompPD2P.append(tmpSiteSpec.sitename)
- hasReplica = True
- break
- # site doesn't have a replica
- if (not hasReplica) and tmpSiteSpec.cachedse == 1:
- candForMoU.append(tmpSiteSpec.sitename)
- if not useCloseSites:
- candSites.append(tmpSiteSpec.sitename)
- else:
- # use close sites only
- if self.getDQ2ID(tmpSiteSpec.sitename,tmpDS) in closeSiteList:
- candSites.append(tmpSiteSpec.sitename)
- # the number of subscriptions
- for tmpUserSub in userSubscriptions:
- if tmpUserSub.startswith(prefixDQ2):
- nUserSub += 1
- break
- # append
- if not returnMap.has_key(tmpDS):
- returnMap[tmpDS] = {}
- returnMap[tmpDS][cloud] = (candSites,sitesComDS,sitesCompPD2P,nUserSub,t1HasReplica,t1HasPrimary,
- nSecReplicas,nT1Sub,candForMoU)
- # return
- return True,returnMap
-
-
- # check DDM response
- def isDQ2ok(self,out):
- if out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1 \
- or out == '()':
- return False
- return True
-
-
- # get map of DQ2 IDs
- def getDQ2ID(self,sitename,dataset):
- # get DQ2 ID
- if not self.siteMapper.checkSite(sitename):
- self.putLog("cannot find SiteSpec for %s" % sitename)
- return ''
- dq2ID = self.siteMapper.getSite(sitename).ddm
- if True:
- # data
- matchEOS = re.search('_EOS[^_]+DISK$',dq2ID)
- if matchEOS != None:
- dq2ID = re.sub('_EOS[^_]+DISK','_EOSDATADISK',dq2ID)
- else:
- dq2ID = re.sub('_[^_]+DISK','_DATADISK',dq2ID)
- else:
- # unsupported prefix for subscription
- self.putLog('%s has unsupported prefix for subscription' % dataset,'error')
- return ''
- # patch for MWT2_UC
- if dq2ID == 'MWT2_UC_DATADISK':
- dq2ID = 'MWT2_DATADISK'
- # return
- return dq2ID
-
-
-    # make a subscription to a site
- def makeSubscription(self,dataset,sitename,givenDQ2ID=None,ddmShare='secondary'):
-        # return for failure
- retFailed = False,''
- # get DQ2 IDs
- if givenDQ2ID == None:
- dq2ID = self.getDQ2ID(sitename,dataset)
- else:
- dq2ID = givenDQ2ID
- if dq2ID == '':
- self.putLog("cannot find DQ2 ID for %s:%s" % (sitename,dataset))
- return retFailed
- # make subscription
- optSrcPolicy = 000001
- nTry = 3
- for iDDMTry in range(nTry):
- # register subscription
- self.putLog('%s/%s registerDatasetSubscription %s %s' % (iDDMTry,nTry,dataset,dq2ID))
- status,out = ddm.DQ2.main('registerDatasetSubscription',dataset,dq2ID,version=0,archived=0,
- callbacks={},sources={},sources_policy=optSrcPolicy,
- wait_for_sources=0,destination=None,query_more_sources=0,
- sshare=ddmShare,group=None,activity='Data Brokering',acl_alias='secondary')
- if out.find('DQSubscriptionExistsException') != -1:
- break
- elif out.find('DQLocationExistsException') != -1:
- break
- elif status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if out.find('DQSubscriptionExistsException') != -1:
- pass
- elif status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response for %s' % dataset,'error')
- return retFailed
- # update
- self.putLog('%s %s' % (status,out))
- return True,dq2ID
-
-
- # get weight for brokerage
- def getWeightForBrokerage(self,sitenames,dataset,nReplicasInCloud):
-        # return for failure
- retFailed = False,{}
- retMap = {}
- # get the number of subscriptions for last 24 hours
- numUserSubs = self.taskBuffer.getNumUserSubscriptions()
- # loop over all sites
- for sitename in sitenames:
- # get DQ2 ID
- dq2ID = self.getDQ2ID(sitename,dataset)
- if dq2ID == '':
- self.putLog("cannot find DQ2 ID for %s:%s" % (sitename,dataset))
- return retFailed
- # append
- if numUserSubs.has_key(dq2ID):
- retMap[sitename] = 1 + numUserSubs[dq2ID]
- else:
- retMap[sitename] = 1
- # negative weight if a cloud already has replicas
- tmpCloud = self.siteMapper.getSite(sitename).cloud
- retMap[sitename] *= (1 + nReplicasInCloud[tmpCloud])
- # return
- return retMap
-
-
- # get free disk size
- def getFreeDiskSize(self,dataset,siteList):
-        # return for failure
- retFailed = False,{}
- # loop over all sites
- sizeMap = {}
- for sitename in siteList:
- # reuse cached value
- if self.cachedSizeMap.has_key(sitename):
- sizeMap[sitename] = self.cachedSizeMap[sitename]
- continue
- # get DQ2 IDs
- dq2ID = self.getDQ2ID(sitename,dataset)
- if dq2ID == '':
- self.putLog("cannot find DQ2 ID for %s:%s" % (sitename,dataset))
- return retFailed
- for valueItem in ['used','total']:
- nTry = 3
- for iDDMTry in range(nTry):
- status,out = ddm.DQ2.main('queryStorageUsage','srm',valueItem,dq2ID)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if status != 0 or out.startswith('Error'):
- self.putLog("%s/%s queryStorageUsage key=%s value=%s site=%s" % (iDDMTry,nTry,'srm',valueItem,dq2ID))
- self.putLog(out,'error')
- self.putLog('bad DQ2 response for %s:%s' % (dq2ID,valueItem), 'error')
- return retFailed
- try:
- # convert res to map
- exec "tmpGigaVal = %s[0]['giga']" % out
- if not sizeMap.has_key(sitename):
- sizeMap[sitename] = {}
- # append
- sizeMap[sitename][valueItem] = tmpGigaVal
- # cache
- self.cachedSizeMap[sitename] = sizeMap[sitename]
- except:
- self.putLog("%s/%s queryStorageUsage key=%s value=%s site=%s" % (iDDMTry,nTry,'srm',valueItem,dq2ID))
- self.putLog(out,'error')
- self.putLog('could not convert HTTP-res to free size map for %s%s' % (dq2ID,valueItem), 'error')
- return retFailed
- # return
- self.putLog('getFreeDiskSize done->%s' % str(sizeMap))
- return True,sizeMap
-
-
-
- # get list of replicas for a dataset
- def getListDatasetReplicas(self,dataset):
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog("%s/%s listDatasetReplicas %s" % (iDDMTry,nTry,dataset))
- status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response for %s' % dataset, 'error')
- return False,{}
- try:
- # convert res to map
- exec "tmpRepSites = %s" % out
- self.putLog('getListDatasetReplicas->%s' % str(tmpRepSites))
- return True,tmpRepSites
- except:
- self.putLog(out,'error')
- self.putLog('could not convert HTTP-res to replica map for %s' % dataset, 'error')
- return False,{}
-
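
# --- illustrative sketch (editorial addition, not part of DynDataDistributer.py) -------
# getListDatasetReplicas() above, like most DQ2 helpers in this class, follows one shape:
# call the out-of-process DQ2 client up to three times with a pause between attempts,
# then eval the printed repr it returns.  The generic helper below restates that pattern;
# it parses the reply with ast.literal_eval, a safer way to turn a printed repr back
# into a Python object than the exec calls used in the original code.
import ast
import time

def call_with_retries(func, *args, n_try=3, sleep_sec=60, is_ok=lambda out: True):
    """Run func(*args) -> (status, out) with retries and parse out as a Python literal."""
    status, out = 1, ''
    for _ in range(n_try):
        status, out = func(*args)
        if status == 0 and is_ok(out):
            break
        time.sleep(sleep_sec)
    if status != 0 or out.startswith('Error'):
        return False, out                     # caller logs the raw output and gives up
    try:
        return True, ast.literal_eval(out)    # e.g. "{'SITE_DATADISK': [{...}]}" -> dict
    except (ValueError, SyntaxError):
        return False, out

# usage sketch with a stand-in for ddm.DQ2.main('listDatasetReplicas', ...)
fake_dq2 = lambda name: (0, "{'SOMESITE_DATADISK': [{'found': 10, 'total': 10}]}")
print(call_with_retries(fake_dq2, 'data12.some.dataset'))
# -> (True, {'SOMESITE_DATADISK': [{'found': 10, 'total': 10}]})
# ----------------------------------------------------------------------------------------
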
-
- # get replicas for a container
- def getListDatasetReplicasInContainer(self,container):
- # response for failure
- resForFailure = False,{}
- # get datasets in container
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s listDatasetsInContainer %s' % (iDDMTry,nTry,container))
- status,out = ddm.DQ2.main('listDatasetsInContainer',container)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response for %s' % container, 'error')
- return resForFailure
- datasets = []
- try:
- # convert to list
- exec "datasets = %s" % out
- except:
- self.putLog('could not convert HTTP-res to dataset list for %s' % container, 'error')
- return resForFailure
- # loop over all datasets
- allRepMap = {}
- for dataset in datasets:
- # get replicas
- status,tmpRepSites = self.getListDatasetReplicas(dataset)
- if not status:
- return resForFailure
- # append
- allRepMap[dataset] = tmpRepSites
- # return
- self.putLog('getListDatasetReplicasInContainer done')
- return True,allRepMap
-
-
- # get dataset metadata
- def getDatasetMetadata(self,datasetName):
- # response for failure
- resForFailure = False,{}
- metaDataAttrs = ['provenance','hidden']
-        # get metadata attributes
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s getMetaDataAttribute %s' % (iDDMTry,nTry,datasetName))
- status,out = ddm.DQ2.main('getMetaDataAttribute',datasetName,metaDataAttrs)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response for %s' % datasetName, 'error')
- return resForFailure
- metadata = {}
- try:
- # convert to map
- exec "metadata = %s" % out
- except:
- self.putLog('could not convert HTTP-res to metadata for %s' % datasetName, 'error')
- return resForFailure
- # check whether all attributes are available
- for tmpAttr in metaDataAttrs:
- if not metadata.has_key(tmpAttr):
- self.putLog('%s is missing in %s' % (tmpAttr,str(metadata)), 'error')
- return resForFailure
- # return
- self.putLog('getDatasetMetadata -> %s' % str(metadata))
- return True,metadata
-
-
- # get replica metadata
- def getReplicaMetadata(self,datasetName,locationName):
- # response for failure
- resForFailure = False,{}
- # get metadata
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s listMetaDataReplica %s %s' % (iDDMTry,nTry,datasetName,locationName))
- status,out = ddm.DQ2.main('listMetaDataReplica',locationName,datasetName)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response for %s' % datasetName, 'error')
- return resForFailure
- metadata = {}
- try:
- # convert to map
- exec "metadata = %s" % out
- except:
- self.putLog('could not convert HTTP-res to replica metadata for %s:%s' % \
- (datasetName,locationName), 'error')
- return resForFailure
- # return
- self.putLog('getReplicaMetadata -> %s' % str(metadata))
- return True,metadata
-
-
- # check subscription info
- def checkSubscriptionInfo(self,destDQ2ID,datasetName):
- resForFailure = (False,False)
-        # get subscription info
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s listSubscriptionInfo %s %s' % (iDDMTry,nTry,destDQ2ID,datasetName))
- status,out = ddm.DQ2.main('listSubscriptionInfo',datasetName,destDQ2ID,0)
- if status != 0:
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response for %s' % datasetName, 'error')
- return resForFailure
- self.putLog(out)
- if out == '()':
- # no subscription
- retVal = False
- else:
- # already exists
- retVal = True
- self.putLog('checkSubscriptionInfo -> %s' % retVal)
- return True,retVal
-
-
- # get size of dataset
- def getDatasetSize(self,datasetName):
- self.putLog("get size of %s" % datasetName)
- resForFailure = (False,0)
- # get size of datasets
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s listFilesInDataset %s' % (iDDMTry,nTry,datasetName))
- status,out = ddm.DQ2.listFilesInDataset(datasetName)
- if status != 0:
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response to get size of %s' % datasetName, 'error')
- return resForFailure
- self.putLog("OK")
- # get total size
- dsSize = 0
- try:
- exec "outList = %s" % out
- for guid,vals in outList[0].iteritems():
- dsSize += long(vals['filesize'])
- except:
- self.putLog('failed to get size from DQ2 response for %s' % datasetName, 'error')
- return resForFailure
- # GB
- dsSize /= (1024*1024*1024)
- self.putLog("dataset size = %s" % dsSize)
- return True,dsSize
-
-
- # get datasets used by jobs
- def getUsedDatasets(self,datasetMap):
- resForFailure = (False,[])
- # loop over all datasets
- usedDsList = []
- for datasetName in datasetMap.keys():
- # get file list
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s listFilesInDataset %s' % (iDDMTry,nTry,datasetName))
- status,out = ddm.DQ2.listFilesInDataset(datasetName)
- if status != 0:
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
-                self.putLog('bad DQ2 response to get file list of %s' % datasetName, 'error')
- return resForFailure
- # convert to map
- try:
- tmpLfnList = []
- exec "outList = %s" % out
- for guid,vals in outList[0].iteritems():
- tmpLfnList.append(vals['lfn'])
- except:
- self.putLog('failed to get file list from DQ2 response for %s' % datasetName, 'error')
- return resForFailure
- # check if jobs use the dataset
- usedFlag = False
- for tmpJob in self.jobs:
- for tmpFile in tmpJob.Files:
- if tmpFile.type == 'input' and tmpFile.lfn in tmpLfnList:
- usedFlag = True
- break
- # escape
- if usedFlag:
- break
- # used
- if usedFlag:
- usedDsList.append(datasetName)
- # return
- self.putLog("used datasets = %s" % str(usedDsList))
- return True,usedDsList
-
-
- # get file from dataset
- def getFileFromDataset(self,datasetName,guid,randomMode=False,nSamples=1):
- resForFailure = (False,None)
- # get files in datasets
- global g_filesInDsMap
- if not g_filesInDsMap.has_key(datasetName):
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s listFilesInDataset %s' % (iDDMTry,nTry,datasetName))
- status,out = ddm.DQ2.listFilesInDataset(datasetName)
- if status != 0:
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
-                self.putLog('bad DQ2 response to get file list of %s' % datasetName, 'error')
- return resForFailure
- # get file
- try:
- exec "outList = %s" % out
- # append
- g_filesInDsMap[datasetName] = outList[0]
- except:
- self.putLog('failed to get file list from DQ2 response for %s' % datasetName, 'error')
- return resForFailure
- # random mode
- if randomMode:
- tmpList = g_filesInDsMap[datasetName].keys()
- random.shuffle(tmpList)
- retList = []
- for iSamples in range(nSamples):
- if iSamples < len(tmpList):
- guid = tmpList[iSamples]
- retMap = g_filesInDsMap[datasetName][guid]
- retMap['guid'] = guid
- retMap['dataset'] = datasetName
- retList.append(retMap)
- return True,retList
- # return
- if g_filesInDsMap[datasetName].has_key(guid):
- retMap = g_filesInDsMap[datasetName][guid]
- retMap['guid'] = guid
- retMap['dataset'] = datasetName
- return True,retMap
- return resForFailure
-
-
- # make subscriptions to EOS
- def makeSubscriptionToEOS(self,datasetName):
- self.putLog("start making EOS subscription for %s" % datasetName)
- destDQ2IDs = ['CERN-PROD_EOSDATADISK']
- # get dataset replica locations
- if datasetName.endswith('/'):
- statRep,replicaMaps = self.getListDatasetReplicasInContainer(datasetName)
- else:
- statRep,replicaMap = self.getListDatasetReplicas(datasetName)
- replicaMaps = {datasetName:replicaMap}
- if not statRep:
- self.putLog("failed to get replica map for EOS",type='error')
- return False
- # loop over all datasets
- for tmpDsName,replicaMap in replicaMaps.iteritems():
- # check if replica is already there
- for destDQ2ID in destDQ2IDs:
- if replicaMap.has_key(destDQ2ID):
- self.putLog("skip EOS sub for %s:%s since replica is already there" % (destDQ2ID,tmpDsName))
- else:
- statSubEx,subExist = self.checkSubscriptionInfo(destDQ2ID,tmpDsName)
- if not statSubEx:
- self.putLog("failed to check subscription for %s:%s" % (destDQ2ID,tmpDsName),type='error')
- continue
- # make subscription
- if subExist:
- self.putLog("skip EOS sub for %s:%s since subscription is already there" % (destDQ2ID,tmpDsName))
- else:
- statMkSub,retMkSub = self.makeSubscription(tmpDsName,'',destDQ2ID)
- if statMkSub:
- self.putLog("made subscription to %s for %s" % (destDQ2ID,tmpDsName))
- else:
- self.putLog("failed to make subscription to %s for %s" % (destDQ2ID,tmpDsName),type='error')
- # return
- self.putLog("end making EOS subscription for %s" % datasetName)
- return True
-
-
- # register new dataset container with datasets
- def registerDatasetContainerWithDatasets(self,containerName,files,replicaMap):
- # sort by locations
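-        # group files by the sorted tuple of their replica locations so that each new dataset gets a single location set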
- filesMap = {}
- for tmpFile in files:
- tmpLocations = replicaMap[tmpFile['dataset']]
- tmpLocations.sort()
- tmpKey = tuple(tmpLocations)
- if not filesMap.has_key(tmpKey):
- filesMap[tmpKey] = []
- # append file
- filesMap[tmpKey].append(tmpFile)
- # register new datasets
- datasetNames = []
- tmpIndex = 1
- for tmpLocations,tmpFiles in filesMap.iteritems():
- tmpDsName = containerName[:-1] + '_%04d' % tmpIndex
- tmpRet = self.registerDatasetWithLocation(tmpDsName,tmpFiles,tmpLocations)
- # failed
- if not tmpRet:
- self.putLog('failed to register %s' % tmpDsName, 'error')
- return False
- # append dataset
- datasetNames.append(tmpDsName)
- tmpIndex += 1
- # register container
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s registerContainer %s' % (iDDMTry,nTry,containerName))
- status,out = ddm.DQ2.main('registerContainer',containerName,datasetNames)
- if status != 0 and out.find('DQDatasetExistsException') == -1:
- time.sleep(60)
- else:
- break
- if out.find('DQDatasetExistsException') != -1:
- pass
- elif status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response to register %s' % containerName, 'error')
- return False
- # return
- return True
-
-
-
- # register new dataset with locations
- def registerDatasetWithLocation(self,datasetName,files,locations):
- resForFailure = False
- # get file info
- guids = []
- lfns = []
- fsizes = []
- chksums = []
- for tmpFile in files:
- guids.append(tmpFile['guid'])
- lfns.append(tmpFile['lfn'])
- fsizes.append(None)
- chksums.append(None)
- # register new dataset
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s registerNewDataset %s' % (iDDMTry,nTry,datasetName))
- status,out = ddm.DQ2.main('registerNewDataset',datasetName,lfns,guids,fsizes,chksums,
- None,None,None,True)
- if status != 0 and out.find('DQDatasetExistsException') == -1:
- time.sleep(60)
- else:
- break
- if out.find('DQDatasetExistsException') != -1:
- pass
- elif status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response to register %s' % datasetName, 'error')
- return resForFailure
- # freeze dataset
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s freezeDataset %s' % (iDDMTry,nTry,datasetName))
- status,out = ddm.DQ2.main('freezeDataset',datasetName)
- if status != 0 and out.find('DQFrozenDatasetException') == -1:
- time.sleep(60)
- else:
- break
- if out.find('DQFrozenDatasetException') != -1:
- pass
- elif status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response to freeze %s' % datasetName, 'error')
- return resForFailure
- # register locations
- for tmpLocation in locations:
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s registerDatasetLocation %s %s' % (iDDMTry,nTry,datasetName,tmpLocation))
- status,out = ddm.DQ2.main('registerDatasetLocation',datasetName,tmpLocation,0,1,None,None,None,"14 days")
- if status != 0 and out.find('DQLocationExistsException') == -1:
- time.sleep(60)
- else:
- break
- if out.find('DQLocationExistsException') != -1:
- pass
- elif status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
-                self.putLog('bad DQ2 response to register location for %s' % datasetName, 'error')
- return resForFailure
- return True
-
-
- # list datasets by file GUIDs
- def listDatasetsByGUIDs(self,guids,dsFilters):
- resForFailure = (False,{})
-        # get datasets for the GUIDs
- nTry = 3
- for iDDMTry in range(nTry):
- self.putLog('%s/%s listDatasetsByGUIDs' % (iDDMTry,nTry))
- status,out = ddm.DQ2.listDatasetsByGUIDs(guids)
- if status != 0:
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- self.putLog(out,'error')
- self.putLog('bad DQ2 response to list datasets by GUIDs','error')
- return resForFailure
- self.putLog(out)
- # get map
- retMap = {}
- try:
- exec "outMap = %s" % out
- for guid in guids:
- tmpDsNames = []
- # GUID not found
- if not outMap.has_key(guid):
- self.putLog('GUID=%s not found' % guid,'error')
- return resForFailure
- # ignore junk datasets
- for tmpDsName in outMap[guid]:
- if tmpDsName.startswith('panda') or \
- tmpDsName.startswith('user') or \
- tmpDsName.startswith('group') or \
- re.search('_sub\d+$',tmpDsName) != None or \
- re.search('_dis\d+$',tmpDsName) != None or \
- re.search('_shadow$',tmpDsName) != None:
- continue
- # check with filters
- if dsFilters != []:
- flagMatch = False
- for tmpFilter in dsFilters:
- if re.search(tmpFilter,tmpDsName) != None:
- flagMatch = True
- break
- # not match
- if not flagMatch:
- continue
- # append
- tmpDsNames.append(tmpDsName)
- # empty
- if tmpDsNames == []:
- self.putLog('no datasets found for GUID=%s' % guid)
- continue
- # duplicated
- if len(tmpDsNames) != 1:
- self.putLog('there are multiple datasets %s for GUID:%s' % (str(tmpDsNames),guid),'error')
- return resForFailure
- # append
- retMap[guid] = tmpDsNames[0]
- except:
- self.putLog('failed to list datasets by GUIDs','error')
- return resForFailure
- return True,retMap
-
-
-    # convert event/run list to datasets
- def convertEvtRunToDatasets(self,runEvtList,dsType,streamName,dsFilters,amiTag):
- self.putLog('convertEvtRunToDatasets type=%s stream=%s dsPatt=%s amitag=%s' % \
- (dsType,streamName,str(dsFilters),amiTag))
- # check data type
- failedRet = False,{},[]
- if dsType == 'AOD':
- streamRef = 'StreamAOD_ref'
- elif dsType == 'ESD':
- streamRef = 'StreamESD_ref'
- elif dsType == 'RAW':
- streamRef = 'StreamRAW_ref'
- else:
- self.putLog("invalid data type %s for EventRun conversion" % dsType,type='error')
- return failedRet
- # import event lookup client
- from eventLookupClient import eventLookupClient
- elssiIF = eventLookupClient()
- # loop over all events
- runEvtGuidMap = {}
- nEventsPerLoop = 500
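-        # query the EventLookup service in chunks of nEventsPerLoop run/event pairs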
- iEventsTotal = 0
- while iEventsTotal < len(runEvtList):
- tmpRunEvtList = runEvtList[iEventsTotal:iEventsTotal+nEventsPerLoop]
- iEventsTotal += nEventsPerLoop
- if streamName == '':
- guidListELSSI = elssiIF.doLookup(tmpRunEvtList,tokens=streamRef,
- amitag=amiTag,extract=True)
- else:
- guidListELSSI = elssiIF.doLookup(tmpRunEvtList,stream=streamName,tokens=streamRef,
- amitag=amiTag,extract=True)
- # failed
- if guidListELSSI == None or len(guidListELSSI) == 0:
- errStr = ''
- for tmpLine in elssiIF.output:
- errStr += tmpLine
- self.putLog(errStr,type='error')
-                self.putLog("invalid return from EventLookup",type='error')
- return failedRet
- # check attribute
- attrNames, attrVals = guidListELSSI
- def getAttributeIndex(attr):
- for tmpIdx,tmpAttrName in enumerate(attrNames):
- if tmpAttrName.strip() == attr:
- return tmpIdx
- return None
- # get index
- indexEvt = getAttributeIndex('EventNumber')
- indexRun = getAttributeIndex('RunNumber')
- indexTag = getAttributeIndex(streamRef)
- if indexEvt == None or indexRun == None or indexTag == None:
- self.putLog("failed to get attribute index from %s" % str(attrNames),type='error')
- return failedRet
- # check events
- for runNr,evtNr in tmpRunEvtList:
- paramStr = 'Run:%s Evt:%s Stream:%s' % (runNr,evtNr,streamName)
- self.putLog(paramStr)
- # collect GUIDs
- tmpguids = []
- for attrVal in attrVals:
- if runNr == attrVal[indexRun] and evtNr == attrVal[indexEvt]:
- tmpGuid = attrVal[indexTag]
- # check non existing
- if tmpGuid == 'NOATTRIB':
- continue
- if not tmpGuid in tmpguids:
- tmpguids.append(tmpGuid)
- # not found
- if tmpguids == []:
- errStr = "no GUIDs were found in Event Lookup service for %s" % paramStr
- self.putLog(errStr,type='error')
- return failedRet
- # append
- runEvtGuidMap[(runNr,evtNr)] = tmpguids
- # convert to datasets
- allDatasets = []
- allFiles = []
- allLocations = {}
- for tmpIdx,tmpguids in runEvtGuidMap.iteritems():
- runNr,evtNr = tmpIdx
- tmpDsRet,tmpDsMap = self.listDatasetsByGUIDs(tmpguids,dsFilters)
- # failed
- if not tmpDsRet:
- self.putLog("failed to convert GUIDs to datasets",type='error')
- return failedRet
- # empty
- if tmpDsMap == {}:
- self.putLog("there is no dataset for Run:%s Evt:%s" % (runNr,evtNr),type='error')
- return failedRet
- if len(tmpDsMap) != 1:
- self.putLog("there are multiple datasets %s for Run:%s Evt:%s" % (str(tmpDsMap),runNr,evtNr),
- type='error')
- return failedRet
- # append
- for tmpGUID,tmpDsName in tmpDsMap.iteritems():
- # collect dataset names
- if not tmpDsName in allDatasets:
- allDatasets.append(tmpDsName)
- # get location
- statRep,replicaMap = self.getListDatasetReplicas(tmpDsName)
- # failed
- if not statRep:
- self.putLog("failed to get locations for DS:%s" % tmpDsName,type='error')
- return failedRet
- # collect locations
- tmpLocationList = []
- for tmpLocation in replicaMap.keys():
- if not tmpLocation in tmpLocationList:
- tmpLocationList.append(tmpLocation)
- allLocations[tmpDsName] = tmpLocationList
- # get file info
- tmpFileRet,tmpFileInfo = self.getFileFromDataset(tmpDsName,tmpGUID)
- # failed
- if not tmpFileRet:
- self.putLog("failed to get fileinfo for GUID:%s DS:%s" % (tmpGUID,tmpDsName),type='error')
- return failedRet
- # collect files
- allFiles.append(tmpFileInfo)
- # return
- self.putLog('converted to %s, %s, %s' % (str(allDatasets),str(allLocations),str(allFiles)))
- return True,allLocations,allFiles
-
-
- # get mapping between TAG and parent GUIDs
- def getMapTAGandParentGUIDs(self,dsName,tagQuery,streamRef):
- # remove _tidXYZ
- dsNameForLookUp = re.sub('_tid\d+(_\d+)*$','',dsName)
- # reuse
- if self.mapTAGandParentGUIDs.has_key(dsNameForLookUp):
- return self.mapTAGandParentGUIDs[dsNameForLookUp]
- # set
- from countGuidsClient import countGuidsClient
- tagIF = countGuidsClient()
- tagResults = tagIF.countGuids(dsNameForLookUp,tagQuery,streamRef+',StreamTAG_ref')
- if tagResults == None:
- errStr = ''
- for tmpLine in tagIF.output:
- if tmpLine == '\n':
- continue
- errStr += tmpLine
- self.putLog(errStr,type='error')
- errStr2 = "invalid return from Event Lookup service. "
- if "No collection in the catalog matches the dataset name" in errStr:
-                errStr2 += "Note that only merged TAG datasets are uploaded to the TAG DB, "
- errStr2 += "so you need to use merged TAG datasets (or container) for inDS. "
- errStr2 += "If this is already the case please contact atlas-event-metadata@cern.ch"
- self.putLog(errStr2,type='error')
- return None
- # empty
- if not tagResults[0]:
- errStr = "No GUIDs found for %s" % dsName
- self.putLog(errStr,type='error')
- return None
- # collect
- retMap = {}
- for guidCount,guids in tagResults[1]:
- self.putLog('%s %s' % (guidCount,guids))
- parentGUID,tagGUID = guids
- # append TAG GUID
- if not retMap.has_key(tagGUID):
- retMap[tagGUID] = {}
- # append parent GUID and the number of selected events
- if retMap[tagGUID].has_key(parentGUID):
-                errStr = "GUID=%s is duplicated" % parentGUID
- self.putLog(errStr,type='error')
- return None
- retMap[tagGUID][parentGUID] = long(guidCount)
- # keep to avoid redundant lookup
- self.mapTAGandParentGUIDs[dsNameForLookUp] = retMap
- # return
- return retMap
-
-
- # get TAG files and parent DS/files using TAG query
- def getTagParentInfoUsingTagQuery(self,tagDsList,tagQuery,streamRef):
- # return code for failure
- failedRet = False,{},[]
- allDatasets = []
- allFiles = []
- allLocations = {}
- # set empty if Query is undefined
- if tagQuery == False:
- tagQuery = ''
- # loop over all tags
- self.putLog('getting parent dataset names and LFNs from TAG DB using EventSelector.Query="%s"' % tagQuery)
- for tagDS in tagDsList:
- if tagDS.endswith('/'):
- # get elements in container
- tmpStat,elementMap = self.getListDatasetReplicasInContainer(tagDS)
- else:
- tmpStat,elementMap = self.getListDatasetReplicas(tagDS)
-            # loop over all elements
- for dsName in elementMap.keys():
- self.putLog("DS=%s Query=%s Ref:%s" % (dsName,tagQuery,streamRef))
- guidMap = self.getMapTAGandParentGUIDs(dsName,tagQuery,streamRef)
- # failed
- if guidMap == None:
-                    self.putLog("failed to get mapping between TAG and parent GUIDs",type='error')
- return failedRet
- # convert TAG GUIDs to LFNs
- tmpTagRet,tmpTagDsMap = self.listDatasetsByGUIDs(guidMap.keys(),[])
- # failed
- if not tmpTagRet:
- self.putLog("failed to convert GUIDs to datasets",type='error')
- return failedRet
- # empty
- if tmpTagDsMap == {}:
- self.putLog("there is no dataset for DS=%s Query=%s Ref:%s" % (dsName,tagQuery,streamRef),type='error')
- return failedRet
- # convert parent GUIDs for each TAG file
- for tagGUID in guidMap.keys():
- # not found
- if not tmpTagDsMap.has_key(tagGUID):
- errStr = 'TAG GUID=%s not found in DQ2' % tagGUID
- self.putLog(errStr,type='error')
- return failedRet
- # get TAG file info
- tagElementDS = tmpTagDsMap[tagGUID]
- tmpFileRet,tmpTagFileInfo = self.getFileFromDataset(tmpTagDsMap[tagGUID],tagGUID)
- # failed
- if not tmpFileRet:
- self.putLog("failed to get fileinfo for GUID:%s DS:%s" % (tagGUID,tmpTagDsMap[tagGUID]),type='error')
- return failedRet
- # convert parent GUIDs to DS/LFNs
- tmpParentRet,tmpParentDsMap = self.listDatasetsByGUIDs(guidMap[tagGUID].keys(),[])
- # failed
- if not tmpParentRet:
- self.putLog("failed to convert GUIDs:%s to parent datasets" % str(guidMap[tagGUID].keys()),type='error')
- return failedRet
- # empty
- if tmpParentDsMap == {}:
- self.putLog("there is no parent dataset for GUIDs:%s" % str(guidMap[tagGUID].keys()),type='error')
- return failedRet
- # loop over all parent GUIDs
- for parentGUID in guidMap[tagGUID].keys():
- # not found
- if not tmpParentDsMap.has_key(parentGUID):
- errStr = '%s GUID=%s not found in DQ2' % (re.sub('_ref$','',streamRef),parentGUID)
- self.putLog(errStr,type='error')
- return failedRet
- # get parent file info
- tmpParentDS = tmpParentDsMap[parentGUID]
- tmpFileRet,tmpParentFileInfo = self.getFileFromDataset(tmpParentDS,parentGUID)
- # failed
- if not tmpFileRet:
- self.putLog("failed to get parent fileinfo for GUID:%s DS:%s" % (parentGUID,tmpParentDS),
- type='error')
- return failedRet
- # collect files
- allFiles.append(tmpParentFileInfo)
- # get location
- if not tmpParentDS in allDatasets:
- allDatasets.append(tmpParentDS)
- # get location
- statRep,replicaMap = self.getListDatasetReplicas(tmpParentDS)
- # failed
- if not statRep:
- self.putLog("failed to get locations for DS:%s" % tmpParentDS,type='error')
- return failedRet
- # collect locations
- tmpLocationList = []
- for tmpLocation in replicaMap.keys():
- if not tmpLocation in tmpLocationList:
- tmpLocationList.append(tmpLocation)
- allLocations[tmpParentDS] = tmpLocationList
- # return
- self.putLog('converted to %s, %s, %s' % (str(allDatasets),str(allLocations),str(allFiles)))
- return True,allLocations,allFiles
-
-
- # put log
- def putLog(self,msg,type='debug',sendLog=False,actionTag='',tagsMap={}):
- tmpMsg = self.token+' '+msg
- if type == 'error':
- _logger.error(tmpMsg)
- # keep last error message
- self.lastMessage = tmpMsg
- else:
- _logger.debug(tmpMsg)
- # send to logger
- if sendLog:
- tmpMsg = self.token + ' - '
- if actionTag != '':
- tmpMsg += 'action=%s ' % actionTag
- for tmpTag,tmpTagVal in tagsMap.iteritems():
- tmpMsg += '%s=%s ' % (tmpTag,tmpTagVal)
- tmpMsg += '- ' + msg
- tmpPandaLogger = PandaLogger()
- tmpPandaLogger.lock()
- tmpPandaLogger.setParams({'Type':'pd2p'})
- tmpLog = tmpPandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- if type == 'error':
- tmpLog.error(tmpMsg)
- else:
- tmpLog.info(tmpMsg)
- # release HTTP handler
- tmpPandaLogger.release()
- time.sleep(1)
-
-
- # peek log
- def peekLog(self):
- return self.lastMessage
-
-
- # make T1 subscription
- def makeT1Subscription(self,allCloudCandidates,tmpDS,dsSize,
- nUsed=None,nWaitingJobs=None,nWaitingJobsets=None):
- useSmallT1 = None
- # no candidate
- if allCloudCandidates == []:
- return True,useSmallT1
- # convert to siteIDs
- t1Candidates = []
- t1Weights = {}
- siteToCloud = {}
- for tmpCloud in allCloudCandidates:
- tmpCloudSpec = self.siteMapper.getCloud(tmpCloud)
- tmpT1SiteID = tmpCloudSpec['source']
- t1Candidates.append(tmpT1SiteID)
- # use MoU share
- t1Weights[tmpT1SiteID] = tmpCloudSpec['mcshare']
- # reverse lookup
- siteToCloud[tmpT1SiteID] = tmpCloud
- # get free disk size
- self.putLog("getting free disk size for T1 PD2P")
- retFreeSizeMap,freeSizeMap = self.getFreeDiskSize(tmpDS,t1Candidates)
- if not retFreeSizeMap:
- self.putLog("failed to get free disk size",type='error',sendLog=True)
- return False,useSmallT1
- # run brokerage
- tmpJob = JobSpec()
- tmpJob.AtlasRelease = ''
- self.putLog("run brokerage for T1-T1 for %s" % tmpDS)
- selectedSite = self.chooseSite(t1Weights,freeSizeMap,dsSize)
- self.putLog("site for T1 PD2P -> %s" % selectedSite)
- # simulation
- if self.simul:
- return True,useSmallT1
- # no candidate
- if selectedSite == None:
- self.putLog("no candidate for T1-T1")
- return False,useSmallT1
- # make subscription
- tmpJob.computingSite = selectedSite
- subRet,dq2ID = self.makeSubscription(tmpDS,tmpJob.computingSite)
- tmpTagsMap = {'site':tmpJob.computingSite,'dataset':tmpDS}
- if nUsed != None:
- tmpTagsMap['nused'] = nUsed
- if nWaitingJobs != None:
- tmpTagsMap['nwaitingjobs'] = nWaitingJobs
- if nWaitingJobsets != None:
- tmpTagsMap['nwaitingjobsets'] = nWaitingJobsets
- self.putLog("made subscription for T1-T1 to %s:%s" % (tmpJob.computingSite,dq2ID),sendLog=True,
- actionTag='SELECTEDT1',tagsMap=tmpTagsMap)
- # check if small cloud is used
- if siteToCloud[tmpJob.computingSite] in cloudsWithSmallT1:
- useSmallT1 = siteToCloud[tmpJob.computingSite]
- # update database
- if subRet:
- self.taskBuffer.addUserSubscription(tmpDS,[dq2ID])
- return True,useSmallT1
- else:
- return False,useSmallT1
-
-
- # make T2 subscription with MoU share
- def makeT2SubscriptionMoU(self,allCandidates,tmpDS,dsSize,pd2pType,
- nUsed=None,nWaitingJobs=None,nWaitingJobsets=None):
- # no candidate
- if allCandidates == []:
- return True,None
- # get MoU share
- if self.shareMoUForT2 == None:
- self.shareMoUForT2 = self.taskBuffer.getMouShareForT2PD2P()
- # convert to DQ2 ID
- t2Candidates = []
- t2Weights = {}
- dq2List = []
- for tmpCandidate in allCandidates:
- tmpDQ2ID = self.getDQ2ID(tmpCandidate,tmpDS)
- if not tmpDQ2ID in dq2List:
- # append
- dq2List.append(tmpDQ2ID)
- # get MoU share
- if not self.shareMoUForT2.has_key(tmpDQ2ID):
- # site is undefined in t_regions_replication
- self.putLog("%s is not in MoU table" % tmpDQ2ID,type='error')
- continue
- if not self.shareMoUForT2[tmpDQ2ID]['status'] in ['ready']:
- # site is not ready
- self.putLog("%s is not ready in MoU table" % tmpDQ2ID)
- continue
- tmpWeight = self.shareMoUForT2[tmpDQ2ID]['weight']
- # skip if the weight is 0
- if tmpWeight == 0:
- self.putLog("%s has 0 weight in MoU table" % tmpDQ2ID)
- continue
- # collect siteIDs and weights for brokerage
- t2Candidates.append(tmpCandidate)
- t2Weights[tmpCandidate] = tmpWeight
- # sort for reproducibility
- t2Candidates.sort()
- # get free disk size
- self.putLog("getting free disk size for T2 %s PD2P" % pd2pType)
- retFreeSizeMap,freeSizeMap = self.getFreeDiskSize(tmpDS,t2Candidates)
- if not retFreeSizeMap:
- self.putLog("failed to get free disk size",type='error',sendLog=True)
- return False,None
- # run brokerage
- tmpJob = JobSpec()
- tmpJob.AtlasRelease = ''
- self.putLog("run brokerage for T2 with %s for %s" % (pd2pType,tmpDS))
- selectedSite = self.chooseSite(t2Weights,freeSizeMap,dsSize)
- self.putLog("site for T2 %s PD2P -> %s" % (pd2pType,selectedSite))
- # simulation
- if self.simul:
- return True,selectedSite
- # no candidate
- if selectedSite == None:
- self.putLog("no candidate for T2 with %s" % pd2pType)
- return False,None
- # make subscription
- subRet,dq2ID = self.makeSubscription(tmpDS,selectedSite)
- tmpTagsMap = {'site':selectedSite,'dataset':tmpDS}
- if nUsed != None:
- tmpTagsMap['nused'] = nUsed
- if nWaitingJobs != None:
- tmpTagsMap['nwaitingjobs'] = nWaitingJobs
- if nWaitingJobsets != None:
- tmpTagsMap['nwaitingjobsets'] = nWaitingJobsets
- self.putLog("made subscription for T2 with %s to %s:%s" % (pd2pType,selectedSite,dq2ID),sendLog=True,
- actionTag='SELECTEDT2_%s' % pd2pType,tagsMap=tmpTagsMap)
- # update database
- if subRet:
- self.taskBuffer.addUserSubscription(tmpDS,[dq2ID])
- return True,selectedSite
- else:
- return False,None
-
-
- # choose site
- def chooseSite(self,canWeights,freeSizeMap,datasetSize):
- # loop over all candidates
- totalW = 0
- allCandidates = []
- for tmpCan,tmpW in canWeights.iteritems():
- # size check
- if freeSizeMap.has_key(tmpCan):
- # disk threshold for PD2P max(5%,3TB)
- diskThresholdPD2P = 1024 * 3
- thrForThisSite = long(freeSizeMap[tmpCan]['total'] * 5 / 100)
- if thrForThisSite < diskThresholdPD2P:
- thrForThisSite = diskThresholdPD2P
- remSpace = freeSizeMap[tmpCan]['total'] - freeSizeMap[tmpCan]['used']
- if remSpace-datasetSize < thrForThisSite:
-                    self.putLog(' skip: disk shortage %s-%s < %s' % (remSpace,datasetSize,thrForThisSite))
- continue
- self.putLog('weight %s %s' % (tmpCan,tmpW))
- # get total weight
- totalW += tmpW
- # append candidate
- allCandidates.append(tmpCan)
- # no candidate
- if allCandidates == []:
- return None
- # sort for reproducibility
- allCandidates.sort()
- # choose site
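-        # weighted random selection: subtract weights from a random point in [0,totalW) and return the candidate where it drops to zero or below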
- rNumber = random.random() * totalW
- for tmpCan in allCandidates:
- rNumber -= canWeights[tmpCan]
- if rNumber <= 0:
- return tmpCan
- return allCandidates[-1]
-
-
diff --git a/current/pandaserver/dataservice/ErrorCode.py b/current/pandaserver/dataservice/ErrorCode.py
deleted file mode 100755
index 91faf46e1..000000000
--- a/current/pandaserver/dataservice/ErrorCode.py
+++ /dev/null
@@ -1,16 +0,0 @@
-############## error code
-
-# Setupper
-EC_Setupper = 100
-
-# Setupper (GUID error)
-EC_GUID = 101
-
-# Adder
-EC_Adder = 200
-
-# Subscription failures
-EC_Subscription = 201
-
-# lost file (=taskbuffer.ErrorCode.EC_LostFile)
-EC_LostFile = 110
diff --git a/current/pandaserver/dataservice/EventPicker.py b/current/pandaserver/dataservice/EventPicker.py
deleted file mode 100644
index 977be5be5..000000000
--- a/current/pandaserver/dataservice/EventPicker.py
+++ /dev/null
@@ -1,288 +0,0 @@
-'''
-pick up files that contain requested events and send a transfer request
-
-'''
-
-import os
-import re
-import sys
-import time
-import fcntl
-import datetime
-import commands
-import brokerage.broker
-from dataservice import DynDataDistributer
-from dataservice.MailUtils import MailUtils
-from dataservice.Notifier import Notifier
-from taskbuffer.JobSpec import JobSpec
-from dataservice.datriHandler import datriHandler
-
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('EventPicker')
-DynDataDistributer.initLogger(_logger)
-
-
-class EventPicker:
- # constructor
- def __init__(self,taskBuffer,siteMapper,evpFileName,ignoreError):
- self.taskBuffer = taskBuffer
- self.siteMapper = siteMapper
- self.ignoreError = ignoreError
- self.evpFileName = evpFileName
- self.token = datetime.datetime.utcnow().isoformat(' ')
- self.pd2p = DynDataDistributer.DynDataDistributer([],self.taskBuffer,self.siteMapper,
- token=self.token)
- self.userDatasetName = ''
- self.creationTime = ''
- self.params = ''
- self.lockedBy = ''
- self.evpFile = None
-
- # main
- def run(self):
- try:
- self.putLog('start %s' % self.evpFileName)
- # lock evp file
- self.evpFile = open(self.evpFileName)
- try:
- fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_EX|fcntl.LOCK_NB)
- except:
-                # release
- self.putLog("cannot lock %s" % self.evpFileName)
- self.evpFile.close()
- return True
- # options
- runEvtList = []
- eventPickDataType = ''
- eventPickStreamName = ''
- eventPickDS = []
- eventPickAmiTag = ''
- inputFileList = []
- tagDsList = []
- tagQuery = ''
- tagStreamRef = ''
- # read evp file
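-            # each line is a key=value pair, e.g. (illustrative values):
-            #   runEvent=167776,1234567
-            #   eventPickDataType=AOD
-            #   eventPickStreamName=physics_Egamma
-            #   eventPickDS=data11_7TeV%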
- for tmpLine in self.evpFile:
- tmpMatch = re.search('^([^=]+)=(.+)$',tmpLine)
- # check format
- if tmpMatch == None:
- continue
- tmpItems = tmpMatch.groups()
- if tmpItems[0] == 'runEvent':
- # get run and event number
- tmpRunEvt = tmpItems[1].split(',')
- if len(tmpRunEvt) == 2:
- runEvtList.append(tmpRunEvt)
- elif tmpItems[0] == 'eventPickDataType':
- # data type
- eventPickDataType = tmpItems[1]
- elif tmpItems[0] == 'eventPickStreamName':
- # stream name
- eventPickStreamName = tmpItems[1]
- elif tmpItems[0] == 'eventPickDS':
- # dataset pattern
- eventPickDS = tmpItems[1].split(',')
- elif tmpItems[0] == 'eventPickAmiTag':
- # AMI tag
- eventPickAmiTag = tmpItems[1]
- elif tmpItems[0] == 'userName':
- # user name
- self.userDN = tmpItems[1]
- self.putLog("user=%s" % self.userDN)
- elif tmpItems[0] == 'userDatasetName':
- # user dataset name
- self.userDatasetName = tmpItems[1]
- elif tmpItems[0] == 'lockedBy':
- # client name
- self.lockedBy = tmpItems[1]
- elif tmpItems[0] == 'creationTime':
- # creation time
- self.creationTime = tmpItems[1]
- elif tmpItems[0] == 'params':
- # parameters
- self.params = tmpItems[1]
- elif tmpItems[0] == 'inputFileList':
- # input file list
- inputFileList = tmpItems[1].split(',')
- try:
- inputFileList.remove('')
- except:
- pass
- elif tmpItems[0] == 'tagDS':
- # TAG dataset
- tagDsList = tmpItems[1].split(',')
- elif tmpItems[0] == 'tagQuery':
- # query for TAG
- tagQuery = tmpItems[1]
- elif tmpItems[0] == 'tagStreamRef':
- # StreamRef for TAG
- tagStreamRef = tmpItems[1]
- if not tagStreamRef.endswith('_ref'):
- tagStreamRef += '_ref'
- # convert
- if tagDsList == [] or tagQuery == '':
- # convert run/event list to dataset/file list
- tmpRet,locationMap,allFiles = self.pd2p.convertEvtRunToDatasets(runEvtList,
- eventPickDataType,
- eventPickStreamName,
- eventPickDS,
- eventPickAmiTag)
- if not tmpRet:
- self.endWithError('Failed to convert the run/event list to a dataset/file list')
- return False
- else:
- # get parent dataset/files with TAG
- tmpRet,locationMap,allFiles = self.pd2p.getTagParentInfoUsingTagQuery(tagDsList,tagQuery,tagStreamRef)
- if not tmpRet:
- self.endWithError('Failed to get parent dataset/file list with TAG')
- return False
- # use only files in the list
- if inputFileList != []:
- tmpAllFiles = []
- for tmpFile in allFiles:
- if tmpFile['lfn'] in inputFileList:
- tmpAllFiles.append(tmpFile)
- allFiles = tmpAllFiles
- # make dataset container
- tmpRet = self.pd2p.registerDatasetContainerWithDatasets(self.userDatasetName,allFiles,locationMap)
- if not tmpRet:
- self.endWithError('Failed to make a dataset container %s' % self.userDatasetName)
- return False
- # get candidates
- tmpRet,candidateMaps = self.pd2p.getCandidates(self.userDatasetName,checkUsedFile=False,
- useHidden=True)
- if not tmpRet:
- self.endWithError('Failed to find candidate for destination')
- return False
- # collect all candidates
- allCandidates = []
- for tmpDS,tmpDsVal in candidateMaps.iteritems():
- for tmpCloud,tmpCloudVal in tmpDsVal.iteritems():
- for tmpSiteName in tmpCloudVal[0]:
- if not tmpSiteName in allCandidates:
- allCandidates.append(tmpSiteName)
- if allCandidates == []:
- self.endWithError('No candidate for destination')
- return False
- # get size of dataset container
- tmpRet,totalInputSize = self.pd2p.getDatasetSize(self.userDatasetName)
- if not tmpRet:
- self.endWithError('Failed to get the size of %s' % self.userDatasetName)
- return False
- # run brokerage
- tmpJob = JobSpec()
- tmpJob.AtlasRelease = ''
-            self.putLog("run brokerage for %s" % self.userDatasetName)
- brokerage.broker.schedule([tmpJob],self.taskBuffer,self.siteMapper,True,allCandidates,
- True,datasetSize=totalInputSize)
- if tmpJob.computingSite.startswith('ERROR'):
- self.endWithError('brokerage failed with %s' % tmpJob.computingSite)
- return False
- self.putLog("site -> %s" % tmpJob.computingSite)
- # send request to DaTRI
- if self.lockedBy.startswith('ganga'):
- tmpHandler = datriHandler(type='ganga')
- else:
- tmpHandler = datriHandler(type='pathena')
- # remove redundant CN from DN
- tmpDN = self.userDN
- tmpDN = re.sub('/CN=limited proxy','',tmpDN)
- tmpDN = re.sub('(/CN=proxy)+$','',tmpDN)
- tmpMsg = "%s ds=%s site=%s id=%s" % ('datriHandler.sendRequest',
- self.userDatasetName,
- self.siteMapper.getSite(tmpJob.computingSite).ddm,
- tmpDN)
- self.putLog(tmpMsg)
- tmpHandler.setParameters(data_pattern=self.userDatasetName,
- site=self.siteMapper.getSite(tmpJob.computingSite).ddm,
- userid=tmpDN)
- nTry = 3
- for iTry in range(nTry):
- dhStatus,dhOut = tmpHandler.sendRequest()
- # succeeded
- if dhStatus == 0 or "such request is exist" in dhOut:
- self.putLog("%s %s" % (dhStatus,dhOut))
- break
- if iTry+1 < nTry:
- # sleep
- time.sleep(60)
- else:
- # final attempt failed
- self.endWithError('Failed to send request to DaTRI : %s %s' % (dhStatus,dhOut))
- return False
- # send email notification for success
- tmpMsg = 'A transfer request was successfully sent to DaTRI.\n'
-            tmpMsg += 'You will receive a notification from DaTRI when it has completed.'
- self.sendEmail(True,tmpMsg)
- try:
- # unlock and delete evp file
- fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN)
- self.evpFile.close()
- os.remove(self.evpFileName)
- except:
- pass
- # successfully terminated
- self.putLog("end %s" % self.evpFileName)
- return True
- except:
- errType,errValue = sys.exc_info()[:2]
- self.endWithError('Got exception %s:%s' % (errType,errValue))
- return False
-
-
- # end with error
- def endWithError(self,message):
- self.putLog(message,'error')
- # unlock evp file
- try:
- fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN)
- self.evpFile.close()
- if not self.ignoreError:
- # remove evp file
- os.remove(self.evpFileName)
- # send email notification
- self.sendEmail(False,message)
- except:
- pass
- self.putLog('end %s' % self.evpFileName)
-
-
- # put log
- def putLog(self,msg,type='debug'):
- tmpMsg = self.token+' '+msg
- if type == 'error':
- _logger.error(tmpMsg)
- else:
- _logger.debug(tmpMsg)
-
-
- # send email notification
- def sendEmail(self,isSucceeded,message):
- # mail address
- toAdder = Notifier(self.taskBuffer,None,[]).getEmail(self.userDN)
- if toAdder == '':
- self.putLog('cannot find email address for %s' % self.userDN,'error')
- return
- # subject
- mailSubject = "PANDA notification for Event-Picking Request"
- # message
- mailBody = "Hello,\n\nHere is your request status for event picking\n\n"
- if isSucceeded:
- mailBody += "Status : Passed to DaTRI\n"
- else:
- mailBody += "Status : Failed\n"
- mailBody += "Created : %s\n" % self.creationTime
- mailBody += "Ended : %s\n" % datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
- mailBody += "Dataset : %s\n" % self.userDatasetName
- mailBody += "\n"
- mailBody += "Parameters : %s %s\n" % (self.lockedBy,self.params)
- mailBody += "\n"
- mailBody += "%s\n" % message
- # send
- retVal = MailUtils().send(toAdder,mailSubject,mailBody)
- # return
- return
diff --git a/current/pandaserver/dataservice/Finisher.py b/current/pandaserver/dataservice/Finisher.py
deleted file mode 100755
index 64d5c30be..000000000
--- a/current/pandaserver/dataservice/Finisher.py
+++ /dev/null
@@ -1,178 +0,0 @@
-'''
-finish transferring jobs
-
-'''
-
-import re
-import sys
-import commands
-import threading
-from DDM import ddm
-from config import panda_config
-
-from brokerage.SiteMapper import SiteMapper
-
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('Finisher')
-
-
-class Finisher (threading.Thread):
- # constructor
- def __init__(self,taskBuffer,dataset,job=None,site=None):
- threading.Thread.__init__(self)
- self.dataset = dataset
- self.taskBuffer = taskBuffer
- self.job = job
- self.site = site
-
-
- # main
- def run(self):
- # start
- try:
- if self.job == None:
- _logger.debug("start: %s" % self.dataset.name)
- _logger.debug("callback from %s" % self.site)
-                # FIXME: remove this mapping once callbacks from BNLPANDA no longer come
- if self.site == 'BNLPANDA':
- self.site = 'BNL-OSG2_ATLASMCDISK'
- # instantiate site mapper
- siteMapper = SiteMapper(self.taskBuffer)
- # get computingSite/destinationSE
- computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name)
- if destinationSE == None:
- # try to get computingSite/destinationSE from ARCH to delete sub
- # even if no active jobs left
- computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name,True)
- if destinationSE == None:
- _logger.error("cannot get source/destination for %s" % self.dataset.name)
- _logger.debug("end: %s" % self.dataset.name)
- return
- _logger.debug("src: %s" % computingSite)
- _logger.debug("dst: %s" % destinationSE)
- # get corresponding token
- tmpSrcSiteSpec = siteMapper.getSite(computingSite)
- tmpDstSiteSpec = siteMapper.getSite(destinationSE)
- _logger.debug(tmpDstSiteSpec.setokens)
- destToken = None
- for tmpToken,tmpDdmId in tmpDstSiteSpec.setokens.iteritems():
- if self.site == tmpDdmId:
- destToken = tmpToken
- break
- _logger.debug("use Token=%s" % destToken)
- # get required tokens
- reqTokens = self.taskBuffer.getDestTokens(self.dataset.name)
- if reqTokens == None:
- _logger.error("cannot get required token for %s" % self.dataset.name)
- _logger.debug("end: %s" % self.dataset.name)
- return
- _logger.debug("req Token=%s" % reqTokens)
- # make bitmap for the token
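-                # each required destination token corresponds to one bit; set the bit of the token that sent this callback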
- bitMap = 1
- if len(reqTokens.split(','))>1:
- for tmpReqToken in reqTokens.split(','):
- if tmpReqToken == destToken:
- break
- # shift one bit
- bitMap <<= 1
- # completed bitmap
- compBitMap = (1 << len(reqTokens.split(',')))-1
- # ignore the lowest bit for T1, file on DISK is already there
- if tmpSrcSiteSpec.ddm == tmpDstSiteSpec.ddm:
- compBitMap = compBitMap & 0xFFFE
- # update bitmap in DB
- updatedBitMap = self.taskBuffer.updateTransferStatus(self.dataset.name,bitMap)
- _logger.debug("transfer status:%s - comp:%s - bit:%s" % (hex(updatedBitMap),hex(compBitMap),hex(bitMap)))
- # update output files
- if (updatedBitMap & compBitMap) == compBitMap:
- ids = self.taskBuffer.updateOutFilesReturnPandaIDs(self.dataset.name)
- # set flag for T2 cleanup
- self.dataset.status = 'cleanup'
- self.taskBuffer.updateDatasets([self.dataset])
- else:
- _logger.debug("end: %s" % self.dataset.name)
- return
- else:
- _logger.debug("start: %s" % self.job.PandaID)
- # update input files
- ids = [self.job.PandaID]
- _logger.debug("IDs: %s" % ids)
- if len(ids) != 0:
- # get job
- if self.job == None:
- jobs = self.taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
- else:
- jobs = [self.job]
- # loop over all jobs
- for job in jobs:
- if job == None:
- continue
- _logger.debug("Job: %s" % job.PandaID)
- if job.jobStatus == 'transferring':
- jobReady = True
- # check file status
- for file in job.Files:
- if file.type == 'output' or file.type == 'log':
- if file.status != 'ready':
- _logger.debug("Job: %s file:%s %s != ready" % (job.PandaID,file.lfn,file.status))
- jobReady = False
- break
- # finish job
- if jobReady:
- _logger.debug("Job: %s all files ready" % job.PandaID)
- # create XML
- try:
- import xml.dom.minidom
- dom = xml.dom.minidom.getDOMImplementation()
- doc = dom.createDocument(None,'xml',None)
- topNode = doc.createElement("POOLFILECATALOG")
- for file in job.Files:
- if file.type in ['output','log']:
- # File
- fileNode = doc.createElement("File")
- fileNode.setAttribute("ID",file.GUID)
- # LFN
- logNode = doc.createElement("logical")
- lfnNode = doc.createElement("lfn")
- lfnNode.setAttribute('name',file.lfn)
- # metadata
- fsizeNode = doc.createElement("metadata")
- fsizeNode.setAttribute("att_name","fsize")
- fsizeNode.setAttribute("att_value",str(file.fsize))
- # checksum
- if file.checksum.startswith('ad:'):
- # adler32
- chksumNode = doc.createElement("metadata")
- chksumNode.setAttribute("att_name","adler32")
- chksumNode.setAttribute("att_value",re.sub('^ad:','',file.checksum))
- else:
- # md5sum
- chksumNode = doc.createElement("metadata")
- chksumNode.setAttribute("att_name","md5sum")
- chksumNode.setAttribute("att_value",re.sub('^md5:','',file.checksum))
- # append nodes
- logNode.appendChild(lfnNode)
- fileNode.appendChild(logNode)
- fileNode.appendChild(fsizeNode)
- fileNode.appendChild(chksumNode)
- topNode.appendChild(fileNode)
- # write to file
- xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,job.PandaID,'finished',commands.getoutput('uuidgen'))
- oXML = open(xmlFile,"w")
- oXML.write(topNode.toxml())
- oXML.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s : %s %s" % (job.PandaID,type,value))
- _logger.debug("Job: %s status: %s" % (job.PandaID,job.jobStatus))
- # end
- if self.job == None:
- _logger.debug("end: %s" % self.dataset.name)
- else:
- _logger.debug("end: %s" % self.job.PandaID)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("run() : %s %s" % (type,value))
-
diff --git a/current/pandaserver/dataservice/MailUtils.py b/current/pandaserver/dataservice/MailUtils.py
deleted file mode 100755
index 9a8dfd290..000000000
--- a/current/pandaserver/dataservice/MailUtils.py
+++ /dev/null
@@ -1,103 +0,0 @@
-'''
-email utilities
-'''
-
-import sys
-import smtplib
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('MailUtils')
-
-class MailUtils:
- # constructor
- def __init__(self):
- pass
-
- # main
- def send(self,toAddr,mailSubject,mailBody):
- _logger.debug("start SEND session")
- try:
- # remove duplicated address
- listToAddr = []
- newToAddr = ''
- for tmpToAddr in toAddr.split(','):
- if not tmpToAddr in listToAddr:
- listToAddr.append(tmpToAddr)
- newToAddr += '%s,' % tmpToAddr
- toAddr = newToAddr[:-1]
- # make message
- fromAdd = panda_config.emailSender
- message = \
-"""Subject: %s
-From: %s
-To: %s
-
-%s
-""" % (mailSubject,fromAdd,toAddr,mailBody)
- message = self.addTailer(message)
- # send mail
- _logger.debug("send to %s\n%s" % (toAddr,message))
- server = smtplib.SMTP(panda_config.emailSMTPsrv)
- server.set_debuglevel(1)
- server.ehlo()
- server.starttls()
- #server.login(panda_config.emailLogin,panda_config.emailPass)
- out = server.sendmail(fromAdd,listToAddr,message)
- _logger.debug(out)
- server.quit()
- retVal = True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s %s" % (type,value))
- retVal = False
- _logger.debug("end SEND session")
- return retVal
-
-
- # send update notification to user
- def sendSiteAccessUpdate(self,toAddr,newStatus,pandaSite):
- # subject
- mailSubject = "PANDA Update on Access Request for %s" % pandaSite
- # message
- mailBody = "Hello,\n\nYour access request for %s has been %s \n" % (pandaSite,newStatus.upper())
- # send
- retVal = self.send(toAddr,mailSubject,mailBody)
- # return
- return retVal
-
-
- # send requests to cloud responsible
- def sendSiteAccessRequest(self,toAddr,requestsMap,cloud):
- # subject
- mailSubject = "PANDA Access Requests in %s" % cloud
- # message
- mailBody = "Hello,\n\nThere are access requests to be approved or rejected.\n\n"
- for pandaSite,userNames in requestsMap.iteritems():
- mailBody += " %s\n" % pandaSite
- userStr = ''
- for userName in userNames:
- userStr += ' %s,' % userName
- userStr = userStr[:-1]
- mailBody += " %s\n\n" % userStr
- # send
- retVal = self.send(toAddr,mailSubject,mailBody)
- # return
- return retVal
-
-
- # add tailer
- def addTailer(self,msg):
- msg += """
-Report Panda problems of any sort to
-
-  the eGroup for help requests
- hn-atlas-dist-analysis-help@cern.ch
-
-  the Savannah for software bugs
- https://savannah.cern.ch/projects/panda/
-"""
- return msg
-
diff --git a/current/pandaserver/dataservice/Merger.py b/current/pandaserver/dataservice/Merger.py
deleted file mode 100644
index b8e1d60e5..000000000
--- a/current/pandaserver/dataservice/Merger.py
+++ /dev/null
@@ -1,692 +0,0 @@
-'''
-merge files in dataset
-
-'''
-
-import re
-import sys
-import time
-import commands
-
-import dq2.common
-from dq2.clientapi import DQ2
-import dq2.container.exceptions
-
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('Merger')
-
-
-class Merger:
-
- # constructor
- def __init__(self,taskBuffer,job,simulFlag=False,noSubmit=False):
- self.taskBuffer = taskBuffer
- self.job = job
- self.mergeType = ""
- self.mergeScript = ""
- self.runDir = "."
- self.mergeTypeMap = {}
- self.supportedMergeType = ['hist','ntuple','pool','user','log','text']
- self.simulFlag = simulFlag
- self.noSubmit = noSubmit
- self.dsContMergeLog = ""
- self.fileDestSeMap = {}
-
-
- # parse jobParameters and get mergeType specified by the client
- def getMergeType(self):
- type = ""
- try:
- paramList = re.split('\W+',self.job.jobParameters.strip())
- type = paramList[ paramList.index('mergeType') + 1 ]
- except:
- _logger.debug("%s cannot find --mergeType parameter from parent job" % self.job.PandaID)
- return type
-
-
- # parse jobParameters and get mergeScript specified by the client
- def getUserMergeScript(self):
- script = ""
- try:
- match = re.search("--mergeScript\s(([^\'\"\s]+)|(\"[^\"]+\")|(\'[^\']+\'))",self.job.jobParameters)
- if match != None:
- script = match.group(1)
- except:
- _logger.debug("%s cannot find --mergeScript parameter from parent job" % self.job.PandaID)
- return script
-
- # parse jobParameters and get rundir specified by the client
- def getRunDir(self):
- rundir = "."
- try:
- m = re.match(r'.*\-r\s+(\S+)\s+.*', self.job.jobParameters.strip())
- if m:
- rundir = re.sub(r'[\'"]','',m.group(1))
- except:
- _logger.debug("%s cannot find -r parameter from parent job" % self.job.PandaID)
- return rundir
-
- # parse jobParameters and get ROOT version
- def getRootVer(self):
- ver = ""
- try:
- m = re.match(r'.*\--rootVer\s+(\S+)\s+.*', self.job.jobParameters.strip())
- if m:
- ver = m.group(1)
- except:
- _logger.debug("%s cannot find --rootVer parameter from parent job" % self.job.PandaID)
- return ver
-
- # get file type
- def getFileType(self,tmpLFN):
- tmpLFN = re.sub('\.\d+$','',tmpLFN)
- tmpMatch = re.search('^(.+)\._\d+\.(.+)$',tmpLFN)
- if tmpMatch != None:
- return (tmpMatch.group(1),tmpMatch.group(2))
- return None
-
-
- # parse jobSpec to get merge type automatically
- def getMergeTypeAuto(self):
- # look for outmap
- try:
- tmpMatch = re.search('-o \"([^\"]+)\"',self.job.jobParameters)
- outMapStr = tmpMatch.group(1)
- exec "outMap="+outMapStr
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.debug("%s cannot extract outMap from jobParameters=%s %s:%s" % \
- (self.job.PandaID,self.job.jobParameters,errType,errValue))
- return False
- # convert output type to merge type
- if '/runGen-' in self.job.transformation:
- # loop over all output files for runGen
- for oldName,newName in outMap.iteritems():
- # get file type
- tmpKey = self.getFileType(newName)
- if tmpKey != None:
- # check extension
- if re.search('\.pool\.root(\.\d+)*$',newName) != None:
- # POOL
- tmpType = 'pool'
- elif re.search('\.root(\.\d+)*$',newName) != None:
- # map all root files to ntuple
- tmpType = 'ntuple'
- else:
- # catch all using zip
- tmpType = 'text'
- # append
- self.mergeTypeMap[tmpKey] = tmpType
- else:
- # hist
- if outMap.has_key('hist'):
- tmpType = 'hist'
- tmpKey = self.getFileType(outMap['hist'])
- if tmpKey != None:
- # append
- self.mergeTypeMap[tmpKey] = tmpType
- # ntuple
- if outMap.has_key('ntuple'):
- tmpType = 'ntuple'
- for sName,fName in outMap['ntuple']:
- tmpKey = self.getFileType(fName)
- if tmpKey != None:
- # append
- self.mergeTypeMap[tmpKey] = tmpType
- # AANT
- if outMap.has_key('AANT'):
- # map AANT to ntuple for now
- tmpType = 'ntuple'
- for aName,sName,fName in outMap['AANT']:
- tmpKey = self.getFileType(fName)
- if tmpKey != None:
- # append
- self.mergeTypeMap[tmpKey] = tmpType
- # THIST
- if outMap.has_key('THIST'):
- tmpType = 'ntuple'
- for aName,fName in outMap['THIST']:
- tmpKey = self.getFileType(fName)
- if tmpKey != None:
- # append only when the stream is not used by AANT
- if not self.mergeTypeMap.has_key(tmpKey):
- self.mergeTypeMap[tmpKey] = tmpType
- # POOL
- for tmpOutType,tmpOutVal in outMap.iteritems():
- # TAG is mapped to POOL for now
- if tmpOutType in ['RDO','ESD','AOD','TAG','Stream1','Stream2']:
- tmpType = 'pool'
- tmpKey = self.getFileType(tmpOutVal)
- if tmpKey != None:
- # append
- self.mergeTypeMap[tmpKey] = tmpType
- # general POOL stream
- if outMap.has_key('StreamG'):
- tmpType = 'pool'
- for sName,fName in outMap['StreamG']:
- tmpKey = self.getFileType(fName)
- if tmpKey != None:
- # append
- self.mergeTypeMap[tmpKey] = tmpType
- # meta
- if outMap.has_key('Meta'):
- tmpType = 'pool'
- for sName,fName in outMap['Meta']:
- tmpKey = self.getFileType(fName)
- if tmpKey != None:
- # append only when the stream is not used by another
- if not self.mergeTypeMap.has_key(tmpKey):
- self.mergeTypeMap[tmpKey] = tmpType
- # UserData
- if outMap.has_key('UserData'):
- tmpType = 'pool'
- for fName in outMap['UserData']:
- tmpKey = self.getFileType(fName)
- if tmpKey != None:
- # append
- self.mergeTypeMap[tmpKey] = tmpType
- # BS
- if outMap.has_key('BS'):
- # ByteStream is mapped to text to use zip for now
- tmpType = 'text'
- tmpKey = self.getFileType(outMap['BS'])
- if tmpKey != None:
- # append
- self.mergeTypeMap[tmpKey] = tmpType
- # extra outputs
- if outMap.has_key('IROOT'):
- for oldName,newName in outMap['IROOT']:
- tmpKey = self.getFileType(newName)
- if tmpKey != None:
- # check extension
- if re.search('\.pool\.root(\.\d+)*$',newName) != None:
- # POOL
- tmpType = 'pool'
- elif re.search('\.root(\.\d+)*$',newName) != None:
- # map all root files to ntuple
- tmpType = 'ntuple'
- else:
- # catch all using zip
- tmpType = 'text'
- # append
- self.mergeTypeMap[tmpKey] = tmpType
- # dump
- _logger.debug("%s automatic merge type mapping -> %s" % (self.job.PandaID,str(self.mergeTypeMap)))
- return True
-
-
- # detect merge type with LFN prefix and suffix
- def detectMergeTypeWithLFN(self,filePrefix,fileSuffix):
- tmpKey = (filePrefix,fileSuffix)
- if self.mergeTypeMap.has_key(tmpKey):
- return self.mergeTypeMap[tmpKey]
-        # look for matching fileSuffix, mainly for --useContElement which has a different prefix
- for tmpKey in self.mergeTypeMap.keys():
- tmpFilePrefix,tmpFileSuffix = tmpKey
- if tmpFileSuffix == fileSuffix:
- _logger.debug("%s updated merge type mapping for %s:%s -> %s" % (self.job.PandaID,filePrefix,fileSuffix,str(self.mergeTypeMap)))
- self.mergeTypeMap[(filePrefix,fileSuffix)] = self.mergeTypeMap[tmpKey]
- return self.mergeTypeMap[tmpKey]
- raise RuntimeError,'cannot find merge type for %s %s' % (filePrefix,fileSuffix)
-
-
-    # main (returns None for unrecoverable errors)
- def run(self):
- try:
- _logger.debug("%s start" % self.job.PandaID)
- # check source label
- if not self.job.prodSourceLabel in ['user',]:
- _logger.debug("%s do nothing for non-user job" % self.job.PandaID)
- _logger.debug("%s end" % self.job.PandaID)
- return None
- # check command-line parameter
- if not self.simulFlag and not "--mergeOutput" in self.job.jobParameters:
- _logger.debug("%s skip no-merge" % self.job.PandaID)
- _logger.debug("%s end" % self.job.PandaID)
- return None
- # get mergeType from jobParams
- self.mergeType = self.getMergeType()
- self.mergeScript = self.getUserMergeScript()
-
- # if mergeScript is given by user, it's equivalent to user mode mergeType
- if self.mergeScript:
- self.mergeType = 'user'
-
- if self.mergeType != '':
- # check if the merging type is given and is supported
- if self.mergeType not in self.supportedMergeType:
- _logger.error("%s skip not supported merging type \"%s\"" % (self.job.PandaID, self.mergeType))
- _logger.debug("%s end" % self.job.PandaID)
- return None
- elif self.mergeType in ['user']:
- self.runDir = self.getRunDir()
- if not self.mergeScript:
- _logger.error("%s skip: no merging command specified for merging type \"%s\"" % (self.job.PandaID, self.mergeType))
- _logger.debug("%s end" % self.job.PandaID)
- return None
- else:
- # automatic merge type detection
- tmpRet = self.getMergeTypeAuto()
- if not tmpRet:
- _logger.error("%s failed to detect merge type automatically" % self.job.PandaID)
- _logger.debug("%s end" % self.job.PandaID)
- return None
- # instantiate DQ2
- self.dq2api = DQ2.DQ2()
- # get list of datasets
- dsList = []
- dsSubDsMap = {}
- for tmpFile in self.job.Files:
- # use output/log
- if not tmpFile.type in ['log','output']:
- continue
- tmpContName = tmpFile.dataset
- # extend logfile container name with ".merge.log" for storing logs of the merging operation
- if tmpFile.type == 'log' and not self.dsContMergeLog:
- self.dsContMergeLog = re.sub('/$','.merge.log/',tmpFile.dataset)
- tmpSubDsName = tmpFile.destinationDBlock
- # remove _sub
- tmpDsName = re.sub('_sub\d+$','',tmpSubDsName)
- tmpKey = (tmpContName,tmpDsName)
- if not tmpKey in dsList:
- dsList.append(tmpKey)
- dsSubDsMap[tmpDsName] = tmpSubDsName
- # get type
- tmpMatch = self.getFileType(tmpFile.lfn)
- if tmpMatch != None:
- self.fileDestSeMap[tmpMatch] = tmpFile.destinationSE
- # loop over all datasets
- mergeJobList = {}
- for tmpContName,tmpDsName in dsList:
- # check prefix
- if (not tmpDsName.startswith('user')) and (not tmpDsName.startswith('group')):
- _logger.debug("%s ignore non-user/group DS %s" % (self.job.PandaID,tmpDsName))
- continue
- # get list of files
- _logger.debug("%s listFilesInDataset %s" % (self.job.PandaID,tmpDsName))
- tmpAllFileMap = {}
- nTry = 3
- for iTry in range(nTry):
- try:
-                        tmpRetTimeStamp = self.dq2api.listFilesInDataset(tmpDsName)
-                        break
- except DQ2.DQUnknownDatasetException:
- _logger.error("%s DQ2 doesn't know %s" % (self.job.PandaID,tmpDsName))
- _logger.debug("%s end" % self.job.PandaID)
- return None
- except:
- if (iTry+1) == nTry:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s DQ2 failed with %s:%s to get file list for %s" % (self.job.PandaID,errType,errValue,tmpDsName))
- _logger.debug("%s end" % self.job.PandaID)
- return False
- # sleep
- time.sleep(60)
- # empty
- if tmpRetTimeStamp == ():
- # close dataset
- varMap = {}
- varMap[':name'] = tmpDsName
- varMap[':status'] = 'tobeclosed'
- uSQL = "UPDATE /*+ INDEX(tab DATASETS_NAME_IDX)*/ ATLAS_PANDA.Datasets "
- uSQL += "SET status=:status,modificationdate=CURRENT_DATE WHERE name=:name "
- self.taskBuffer.querySQLS(uSQL,varMap)
- _logger.debug("%s %s is empty" % (self.job.PandaID,tmpDsName))
- continue
- # loop over all GUIDs
- tmpRet,tmpTimeStamp = tmpRetTimeStamp
- for tmpGUID,tmpVal in tmpRet.iteritems():
- # set GUID
- tmpVal['guid'] = tmpGUID
- # get type
- tmpMatch = self.getFileType(tmpVal['lfn'])
- if tmpMatch == None:
- _logger.error("%s cannot get type for %s" % (self.job.PandaID,tmpVal['lfn']))
- _logger.debug("%s end" % self.job.PandaID)
- return None
- tmpType = (tmpMatch[0],tmpMatch[1],tmpContName,tmpDsName)
- # append
- if not tmpAllFileMap.has_key(tmpType):
- tmpAllFileMap[tmpType] = {}
- tmpAllFileMap[tmpType][tmpVal['lfn']] = tmpVal
- # max size of merged file
- maxMergedFileSize = 5 * 1024 * 1024 * 1024
- # max number of files to be merged
- maxNumToBeMerged = 200
- # loop over all types
- for tmpType,tmpFileMap in tmpAllFileMap.iteritems():
- # sort LFNs
- tmpFileList = tmpFileMap.keys()
- tmpFileList.sort()
- # split by size
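-                    # greedily pack the sorted files into merge jobs; start a new job when the size or file-count limit would be exceeded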
- subTotalSize = 0
- subFileList = []
- for tmpFileName in tmpFileList:
- if (subTotalSize+tmpFileMap[tmpFileName]['filesize'] > maxMergedFileSize and subFileList != []) \
- or len(subFileList) >= maxNumToBeMerged:
- # instantiate job
- tmpMergeJob = self.makeMergeJob(subFileList,tmpFileMap,tmpType)
- # append
- if not mergeJobList.has_key(tmpDsName):
- mergeJobList[tmpDsName] = []
- mergeJobList[tmpDsName].append(tmpMergeJob)
- # reset
- subTotalSize = 0
- subFileList = []
- # append
- subTotalSize += tmpFileMap[tmpFileName]['filesize']
- subFileList.append(tmpFileName)
- # remaining
- if subFileList != []:
- # instantiate job
- tmpMergeJob = self.makeMergeJob(subFileList,tmpFileMap,tmpType)
- # append
- if not mergeJobList.has_key(tmpDsName):
- mergeJobList[tmpDsName] = []
- mergeJobList[tmpDsName].append(tmpMergeJob)
- # terminate simulation
- if self.simulFlag and not self.noSubmit:
- _logger.debug("%s end simulation" % self.job.PandaID)
- return True
- # get list of new datasets
- newDatasetMap = {}
- for tmpDsName,tmpJobList in mergeJobList.iteritems():
- # loop over all files
- for tmpFile in tmpJobList[0].Files:
- # ignore inputs
- if not tmpFile.type in ['output','log']:
- continue
- # append
- if not newDatasetMap.has_key(tmpFile.dataset):
- newDatasetMap[tmpFile.dataset] = []
- if not tmpFile.destinationDBlock in newDatasetMap[tmpFile.dataset]:
- newDatasetMap[tmpFile.dataset].append(tmpFile.destinationDBlock)
- # remove /CN=proxy and /CN=limited from DN
- tmpRealDN = self.job.prodUserID
- tmpRealDN = re.sub('/CN=limited proxy','',tmpRealDN)
- tmpRealDN = re.sub('/CN=proxy','',tmpRealDN)
- tmpRealDN = dq2.common.parse_dn(tmpRealDN)
- # register container for merge log files
- if self.dsContMergeLog:
- # register new container for the logs of merging operation
- _logger.debug("%s registerContainer %s" % (self.job.PandaID, self.dsContMergeLog))
- nTry = 3
- unRecoverable = False
- for iTry in range(nTry):
- try:
- self.dq2api.registerContainer(self.dsContMergeLog)
- break
- except DQ2.DQDatasetExistsException:
- break
- except:
- errType,errValue = sys.exc_info()[:2]
- if 'exceeds the maximum length' in str(errValue):
- unRecoverable = True
- if unRecoverable or (iTry+1) == nTry:
- _logger.error("%s DQ2 failed with %s:%s to register new container %s" % (self.job.PandaID,errType,errValue,self.dsContMergeLog))
- _logger.debug("%s end" % self.job.PandaID)
- if unRecoverable:
- return None
- return False
- # sleep
- time.sleep(60)
- # set container owner
- _logger.debug("%s setMetaDataAttribute %s %s" % (self.job.PandaID, self.dsContMergeLog, tmpRealDN))
- nTry = 3
- for iTry in range(nTry):
- try:
-                        self.dq2api.setMetaDataAttribute(self.dsContMergeLog, 'owner', tmpRealDN)
-                        break
- except:
- if (iTry+1) == nTry:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s DQ2 failed with %s:%s to set owner for %s" % (self.job.PandaID,errType,errValue,self.dsContMergeLog))
- _logger.debug("%s end" % self.job.PandaID)
- return False
- # sleep
- time.sleep(60)
- # register datasets
- for tmpDsContainer,tmpNewDatasets in newDatasetMap.iteritems():
- # loop over all datasets
- for tmpNewDS in tmpNewDatasets:
- # register
- _logger.debug("%s registerNewDataset %s" % (self.job.PandaID,tmpNewDS))
- nTry = 3
- for iTry in range(nTry):
- try:
-                            self.dq2api.registerNewDataset(tmpNewDS)
-                            break
-                        except DQ2.DQDatasetExistsException:
-                            break
- except:
- if (iTry+1) == nTry:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s DQ2 failed with %s:%s to register %s" % (self.job.PandaID,errType,errValue,tmpNewDS))
- _logger.debug("%s end" % self.job.PandaID)
- return False
- # sleep
- time.sleep(60)
- # set owner
- _logger.debug("%s setMetaDataAttribute %s %s" % (self.job.PandaID,tmpNewDS,tmpRealDN))
- nTry = 3
- for iTry in range(nTry):
- try:
-                            self.dq2api.setMetaDataAttribute(tmpNewDS,'owner',tmpRealDN)
-                            break
- except:
- if (iTry+1) == nTry:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s DQ2 failed with %s:%s to set owner for %s" % (self.job.PandaID,errType,errValue,tmpNewDS))
- _logger.debug("%s end" % self.job.PandaID)
- return False
- # sleep
- time.sleep(60)
- # add to container
- if tmpDsContainer.endswith('/'):
- # add
- _logger.debug("%s registerDatasetsInContainer %s %s" % (self.job.PandaID,tmpDsContainer,str(tmpNewDatasets)))
- nTry = 3
- for iTry in range(nTry):
- try:
- self.dq2api.registerDatasetsInContainer(tmpDsContainer,tmpNewDatasets)
- break
- except dq2.container.exceptions.DQContainerAlreadyHasDataset:
- break
- except:
- if (iTry+1) == nTry:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s DQ2 failed with %s:%s to add datasets to %s" % (self.job.PandaID,errType,errValue,tmpDsContainer))
- _logger.debug("%s end" % self.job.PandaID)
- return False
- # sleep
- time.sleep(60)
- # no submission
- if self.noSubmit:
- _logger.debug("%s end with no submission" % self.job.PandaID)
- return True
- # submit new jobs
- _logger.debug("%s submit jobs" % self.job.PandaID)
- # fake FQANs
- fqans = []
- if not self.job.countryGroup in ['','NULL',None]:
- fqans.append('/atlas/%s/Role=NULL' % self.job.countryGroup)
- if self.job.destinationDBlock.startswith('group') and not self.job.workingGroup in ['','NULL',None]:
- fqans.append('/atlas/%s/Role=production' % self.job.workingGroup)
- # insert jobs
- for tmpDsName,tmpJobList in mergeJobList.iteritems():
- ret = self.taskBuffer.storeJobs(tmpJobList,self.job.prodUserID,True,False,fqans,
- self.job.creationHost,True,checkSpecialHandling=False)
- if ret == []:
- _logger.error("%s storeJobs failed with [] for %s" % (self.job.PandaID,tmpDsName))
- _logger.debug("%s end" % self.job.PandaID)
- return False
- else:
- # set jobDefID
- tmpJobDefID = ret[0][1]
- if not tmpJobDefID in ['NULL','',None,-1]:
- varMap = {}
- varMap[':name'] = dsSubDsMap[tmpDsName]
- varMap[':moverID'] = tmpJobDefID
- uSQL = "UPDATE /*+ INDEX(tab DATASETS_NAME_IDX)*/ ATLAS_PANDA.Datasets "
- uSQL += "SET moverID=:moverID WHERE name=:name "
- self.taskBuffer.querySQLS(uSQL,varMap)
- # dump
- strPandaIDs = ''
- for tmpItem in ret:
- strPandaIDs += '%s,' % tmpItem[0]
- _logger.debug("%s jobDefID=%s mergeJobs=%s" % (self.job.PandaID,tmpJobDefID,strPandaIDs[:-1]))
- # return
- _logger.debug("%s end" % self.job.PandaID)
- return True
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s failed with %s:%s" % (self.job.PandaID,errType,errValue))
- _logger.debug("%s end" % self.job.PandaID)
- return None
-
-
- # make merge job
- def makeMergeJob(self,fileList,fileMap,fileType):
- # make job spec
- tmpJob = JobSpec()
- # set release and cache
- if not self.job.AtlasRelease in ['','NULL',None]:
- tmpJob.AtlasRelease = self.job.AtlasRelease
- if not self.job.homepackage in ['','NULL',None]:
- tmpJob.homepackage = self.job.homepackage
- tmpJob.prodSourceLabel = 'user'
- tmpJob.prodUserID = self.job.prodUserID
- tmpJob.assignedPriority = 5000
- tmpJob.jobName = 'usermerge.%s' % commands.getoutput('uuidgen')
- tmpJob.computingSite = self.job.computingSite
- tmpJob.metadata = self.job.metadata
- tmpJob.prodDBlock = self.job.prodDBlock
- tmpJob.destinationDBlock = self.job.destinationDBlock
- tmpJob.destinationSE = self.job.destinationSE
- tmpJob.cloud = self.job.cloud
- tmpJob.cmtConfig = self.job.cmtConfig
- tmpJob.lockedby = self.job.lockedby
- tmpJob.processingType = 'usermerge'
- tmpJob.jobsetID = self.job.jobsetID
- tmpJob.jobDefinitionID = 0
- tmpJob.transformation = "http://pandaserver.cern.ch:25080/trf/user/runMerge-00-00-01"
- # decompose fileType
- filePrefix,fileSuffix,containerName,datasetName = fileType
- fileTypeKey = (filePrefix,fileSuffix)
- # output dataset name
- outDsName = datasetName+'.merge'
- # job parameter
- params = '--parentDS %s --parentContainer %s --outDS %s' % (datasetName,containerName,outDsName)
- # look for lib.tgz
- for tmpLibFile in self.job.Files:
- if tmpLibFile.type == 'input' and tmpLibFile.lfn.endswith('.lib.tgz'):
- tmpFile = FileSpec()
- tmpFile.lfn = tmpLibFile.lfn
- tmpFile.GUID = tmpLibFile.GUID
- tmpFile.fsize = tmpLibFile.fsize
- tmpFile.md5sum = tmpLibFile.md5sum
- tmpFile.checksum = tmpLibFile.checksum
- tmpFile.dataset = tmpLibFile.dataset
- tmpFile.prodDBlock = tmpLibFile.prodDBlock
- tmpFile.type = 'input'
- tmpFile.status = 'ready'
- tmpFile.prodDBlockToken = 'local'
- tmpJob.addFile(tmpFile)
- params += " --libTgz %s" % tmpFile.lfn
- break
- # reverse sort to use the largest SN in merged LFN, which is required to find SN offset when outDS is reused
- fileList.reverse()
- # input
- serNum = None
- attNum = None
- for tmpFileName in fileList:
- # extract serial number
- if serNum == None:
- tmpMatch = re.search('^'+filePrefix+'\.(_\d+)\.'+fileSuffix,tmpFileName)
- if tmpMatch == None:
- raise RuntimeError,'cannot extract SN from %s' % tmpFileName
- serNum = tmpMatch.group(1)
- # extract attempt number
- tmpMatch = re.search('\.(\d+)$',tmpFileName)
- if tmpMatch != None:
- attNum = tmpMatch.group(1)
- # make file spec
- tmpFile = FileSpec()
- vals = fileMap[tmpFileName]
- tmpFile.lfn = tmpFileName
- tmpFile.GUID = vals['guid']
- tmpFile.fsize = vals['filesize']
- tmpFile.md5sum = vals['checksum']
- tmpFile.checksum = vals['checksum']
- tmpFile.dataset = containerName
- tmpFile.prodDBlock = tmpFile.dataset
- tmpFile.type = 'input'
- tmpFile.status = 'ready'
- tmpFile.prodDBlockToken = 'local'
- tmpJob.addFile(tmpFile)
-
- # merge type determination
- if fileSuffix.endswith('log.tgz'):
- # log
- usedMergeType = 'log'
- elif self.mergeType != '':
- # user specified merging type
- usedMergeType = self.mergeType
- else:
- # auto detection
- usedMergeType = self.detectMergeTypeWithLFN(filePrefix,fileSuffix)
-
- if usedMergeType in ['user']:
-            ## run user-mode merging with the given merging script
- params += ' -j %s -r %s' % (self.mergeScript, self.runDir)
-
- params += " -t %s" % usedMergeType
- params += " -i \"%s\"" % repr(fileList)
-
- if self.getRootVer():
- params += " --rootVer %s" % self.getRootVer()
-
- if self.job.jobParameters.find('--useRootCore') >= 0:
- params += " --useRootCore"
-
- # output
- tmpFile = FileSpec()
- if attNum == None:
- tmpFile.lfn = "%s.%s.merge.%s" % (filePrefix,serNum,fileSuffix)
- else:
- tmpFile.lfn = "%s.%s.%s.merge.%s" % (filePrefix,serNum,attNum,fileSuffix)
-
- if usedMergeType == 'text' and \
- not tmpFile.lfn.endswith('.tgz') and \
- not tmpFile.lfn.endswith('.tar.gz'):
- tmpFile.lfn += '.tgz'
- tmpFile.destinationDBlock = outDsName
- if self.fileDestSeMap.has_key(fileTypeKey):
- tmpFile.destinationSE = self.fileDestSeMap[fileTypeKey]
- else:
- tmpFile.destinationSE = self.job.destinationSE
- tmpFile.dataset = containerName
- tmpFile.type = 'output'
- tmpJob.addFile(tmpFile)
- params += ' -o "%s"' % tmpFile.lfn
- # log
- tmpItems = filePrefix.split('.')
- if len(tmpItems) > 3:
- logPrefix = "%s.%s.%s" % tuple(tmpItems[:3])
- else:
- logPrefix = filePrefix
- tmpFile = FileSpec()
- tmpFile.lfn = '%s._$PANDAID.log.tgz' % logPrefix
- tmpFile.destinationDBlock = outDsName + ".log"
- tmpFile.destinationSE = tmpJob.computingSite
- tmpFile.dataset = self.dsContMergeLog
- tmpFile.type = 'log'
- tmpJob.addFile(tmpFile)
- # set job parameter
- tmpJob.jobParameters = params
- if self.simulFlag:
-            _logger.debug("%s params %s" % (self.job.PandaID,tmpJob.jobParameters))
- # return
- return tmpJob
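
For reference, makeMergeJob above derives the merged output LFN from the largest serial number among the input LFNs (hence the reverse of fileList). A minimal, self-contained sketch of that derivation, assuming LFNs of the form <prefix>._<NNNNN>.<suffix>[.<attemptNr>]; the helper name and sample values are illustrative, not part of the removed module:

import re

def build_merged_lfn(file_list, file_prefix, file_suffix):
    # reverse-sort so the first LFN carries the largest serial number
    lfns = sorted(file_list, reverse=True)
    tmp_match = re.search('^' + re.escape(file_prefix) + r'\.(_\d+)\.' + re.escape(file_suffix), lfns[0])
    if tmp_match is None:
        raise RuntimeError('cannot extract SN from %s' % lfns[0])
    ser_num = tmp_match.group(1)
    # optional attempt number appended at the end of the LFN
    tmp_match = re.search(r'\.(\d+)$', lfns[0])
    if tmp_match is None:
        return '%s.%s.merge.%s' % (file_prefix, ser_num, file_suffix)
    return '%s.%s.%s.merge.%s' % (file_prefix, ser_num, tmp_match.group(1), file_suffix)

# e.g. build_merged_lfn(['user.x.data._00002.pool.root', 'user.x.data._00001.pool.root'],
#                       'user.x.data', 'pool.root')  ->  'user.x.data._00002.merge.pool.root'
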
diff --git a/current/pandaserver/dataservice/Notifier.py b/current/pandaserver/dataservice/Notifier.py
deleted file mode 100755
index 44aa7cdcf..000000000
--- a/current/pandaserver/dataservice/Notifier.py
+++ /dev/null
@@ -1,396 +0,0 @@
-'''
-notifier
-
-'''
-
-import re
-import sys
-import fcntl
-import commands
-import threading
-import urllib
-import shelve
-import smtplib
-import datetime
-import time
-
-from config import panda_config
-from taskbuffer.OraDBProxy import DBProxy
-from pandalogger.PandaLogger import PandaLogger
-from dataservice.DDM import dq2Info
-import taskbuffer.ErrorCode
-
-# logger
-_logger = PandaLogger().getLogger('Notifier')
-
-# lock file
-_lockGetMail = open(panda_config.lockfile_getMail, 'w')
-
-# ignored DN
-_ignoreList = [
- 'Nurcan Ozturk',
- 'Xin Zhao',
- 'Dietrich Liko',
- ]
-
-# NG words in email address
-_ngWordsInMailAddr = ['support','system','stuff','service','secretariat','club','user','admin',
- 'cvs','grid','librarian','svn','atlas','cms','lhcb','alice','alaelp']
-
-# port for SMTP server
-smtpPortList = [25,587]
-
-def initLogger(pLogger):
- # redirect logging to parent as it doesn't work in nested threads
- global _logger
- _logger = pLogger
-
-
-# wrapper to patch smtplib.stderr to send debug info to logger
-class StderrLogger(object):
- def __init__(self,token):
- self.token = token
- def write(self,message):
- message = message.strip()
- if message != '':
- _logger.debug('%s %s' % (self.token,message))
-
-
-class Notifier:
- # constructor
- def __init__(self,taskBuffer,job,datasets,summary={},mailFile=None,mailFileName=''):
- self.job = job
- self.datasets = datasets
- self.taskBuffer = taskBuffer
- self.summary = summary
- self.mailFile = mailFile
- self.mailFileName = mailFileName
-
- # main
- def run(self):
- if self.mailFile == None:
- _logger.debug("%s start" % self.job.PandaID)
- try:
- # check job type
- if self.job.prodSourceLabel != 'user' and self.job.prodSourceLabel != 'panda':
- _logger.error("Invalid job type : %s" % self.job.prodSourceLabel)
- _logger.debug("%s end" % self.job.PandaID)
- return
- # ignore some DNs to avoid mail storm
- for igName in _ignoreList:
- if re.search(igName,self.job.prodUserID) != None:
- _logger.debug("Ignore DN : %s" % self.job.prodUserID)
- _logger.debug("%s end" % self.job.PandaID)
- return
- # get e-mail address
- mailAddr = self.getEmail(self.job.prodUserID)
- if mailAddr == '':
- _logger.error("could not find email address for %s" % self.job.prodUserID)
- _logger.debug("%s end" % self.job.PandaID)
- return
- # not send
- if mailAddr in ['notsend','',None]:
- _logger.debug("not send to %s" % self.job.prodUserID)
- _logger.debug("%s end" % self.job.PandaID)
- return
- # use all datasets
- if self.summary != {}:
- self.datasets = []
- for tmpJobID,tmpDsList in self.summary.iteritems():
- if tmpDsList == []:
- continue
- self.datasets += tmpDsList
- # get full jobSpec including metadata
- self.job = self.taskBuffer.peekJobs([self.job.PandaID],fromDefined=False,
- fromActive=False,fromWaiting=False)[0]
- if self.job == None:
- _logger.error('%s : not found in DB' % self.job.PandaID)
- _logger.debug("%s end" % self.job.PandaID)
- return
- # get IDs
- ids = []
- # from active tables
- tmpIDs = self.taskBuffer.queryPandaIDwithDataset(self.datasets)
- for tmpID in tmpIDs:
- if not tmpID in ids:
- ids.append(tmpID)
- # from archived table
- if self.job.jobsetID in [0,'NULL',None]:
- tmpIDs = self.taskBuffer.getPandIDsWithIdInArch(self.job.prodUserName,self.job.jobDefinitionID,False)
- else:
- tmpIDs = self.taskBuffer.getPandIDsWithIdInArch(self.job.prodUserName,self.job.jobsetID,True)
- for tmpID in tmpIDs:
- if not tmpID in ids:
- ids.append(tmpID)
- _logger.debug("%s IDs: %s" % (self.job.PandaID,ids))
- if len(ids) != 0:
- # get jobs
- jobs = self.taskBuffer.getFullJobStatus(ids,fromDefined=False,fromActive=False,
- fromWaiting=False,forAnal=False)
- # statistics
- nTotal = 0
- nSucceeded = 0
- nFailed = 0
- nPartial = 0
- nCancel = 0
- # time info
- creationTime = self.job.creationTime
- endTime = self.job.modificationTime
- if isinstance(endTime,datetime.datetime):
- endTime = endTime.strftime('%Y-%m-%d %H:%M:%S')
- # datasets
- iDSList = []
- oDSList = []
- siteMap = {}
- logDS = None
- for tmpJob in jobs:
- if not siteMap.has_key(tmpJob.jobDefinitionID):
- siteMap[tmpJob.jobDefinitionID] = tmpJob.computingSite
- for file in tmpJob.Files:
- if file.type == 'input':
- if not file.dataset in iDSList:
- iDSList.append(file.dataset)
- else:
- if not file.dataset in oDSList:
- oDSList.append(file.dataset)
- if file.type == 'log':
- logDS = file.dataset
- # job/jobset IDs and site
- if self.summary == {}:
- jobIDsite = "%s/%s" % (self.job.jobDefinitionID,self.job.computingSite)
- jobsetID = self.job.jobDefinitionID
- jobDefIDList = [self.job.jobDefinitionID]
- else:
- jobDefIDList = self.summary.keys()
- jobDefIDList.sort()
- jobIDsite = ''
- tmpIndent = " "
- for tmpJobID in jobDefIDList:
- jobIDsite += '%s/%s\n%s' % (tmpJobID,siteMap[tmpJobID],tmpIndent)
- remCount = len(tmpIndent) + 1
- jobIDsite = jobIDsite[:-remCount]
- jobsetID = self.job.jobsetID
- # count
- for job in jobs:
- if job == None:
- continue
- # ignore pilot-retried job
- if job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]:
- continue
- # total
- nTotal += 1
- # count per job status
- if job.jobStatus == 'finished':
- # check all files were used
- allUses = True
- for file in job.Files:
- if file.type == 'input' and file.status in ['skipped']:
- allUses = False
- break
- if allUses:
- nSucceeded += 1
- else:
- nPartial += 1
- elif job.jobStatus == 'failed':
- nFailed += 1
- elif job.jobStatus == 'cancelled':
- nCancel += 1
- # make message
- if nSucceeded == nTotal:
- finalStatInSub = "(All Succeeded)"
- else:
- finalStatInSub = "(%s/%s Succeeded)" % (nSucceeded,nTotal)
- fromadd = panda_config.emailSender
- if self.job.jobsetID in [0,'NULL',None]:
- message = \
-"""Subject: PANDA notification for JobID : %s %s
-From: %s
-To: %s
-
-Summary of JobID : %s
-
-Site : %s""" % (self.job.jobDefinitionID,finalStatInSub,fromadd,mailAddr,self.job.jobDefinitionID,self.job.computingSite)
- else:
- message = \
-"""Subject: PANDA notification for JobsetID : %s %s
-From: %s
-To: %s
-
-Summary of JobsetID : %s
-
-JobID/Site : %s""" % (jobsetID,finalStatInSub,fromadd,mailAddr,jobsetID,jobIDsite)
- message += \
-"""
-
-Created : %s (UTC)
-Ended : %s (UTC)
-
-Total Number of Jobs : %s
- Succeeded : %s
- Partial : %s
- Failed : %s
- Cancelled : %s
-""" % (creationTime,endTime,nTotal,nSucceeded,nPartial,nFailed,nCancel)
- # input datasets
- for iDS in iDSList:
- message += \
-"""
-In : %s""" % iDS
- # output datasets
- for oDS in oDSList:
- message += \
-"""
-Out : %s""" % oDS
- # command
- if not self.job.metadata in ['','NULL',None]:
- message += \
-"""
-
-Parameters : %s""" % self.job.metadata
- # URLs to PandaMon
- if self.job.jobsetID in [0,'NULL',None]:
- for tmpIdx,tmpJobID in enumerate(jobDefIDList):
- urlData = {}
- urlData['job'] = '*'
- urlData['jobDefinitionID'] = tmpJobID
- urlData['user'] = self.job.prodUserName
- urlData['at'] = (str(creationTime)).split()[0]
- if tmpIdx == 0:
- message += \
-"""
-
-PandaMonURL : http://panda.cern.ch/server/pandamon/query?%s""" % urllib.urlencode(urlData)
- else:
- message += \
-"""
- http://panda.cern.ch/server/pandamon/query?%s""" % urllib.urlencode(urlData)
- else:
- urlData = {}
- urlData['job'] = '*'
- urlData['jobsetID'] = self.job.jobsetID
- urlData['user'] = self.job.prodUserName
- urlData['at'] = (str(creationTime)).split()[0]
- message += \
-"""
-
-PandaMonURL : http://panda.cern.ch/server/pandamon/query?%s""" % urllib.urlencode(urlData)
- if logDS != None:
- message += \
-"""
-TaskMonitorURL : https://dashb-atlas-task.cern.ch/templates/task-analysis/#task=%s""" % logDS
-
-                    # trailer
- message += \
-"""
-
-
-Report Panda problems of any sort to
-
-  the eGroup for help requests
- hn-atlas-dist-analysis-help@cern.ch
-
-  the Savannah for software bugs
- https://savannah.cern.ch/projects/panda/
-"""
-
- # send mail
- self.sendMail(self.job.PandaID,fromadd,mailAddr,message,1,True)
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s %s %s" % (self.job.PandaID,errType,errValue))
- _logger.debug("%s end" % self.job.PandaID)
- else:
- try:
- _logger.debug("start recovery for %s" % self.mailFileName)
- # read from file
- pandaID = self.mailFile.readline()[:-1]
- fromadd = self.mailFile.readline()[:-1]
- mailAddr = self.mailFile.readline()[:-1]
- message = self.mailFile.read()
- _logger.debug("%s start recovery" % pandaID)
- if message != '':
- self.sendMail(pandaID,fromadd,mailAddr,message,5,False)
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s %s %s" % (self.mailFileName,errType,errValue))
- _logger.debug("end recovery for %s" % self.mailFileName)
-
-
- # send mail
- def sendMail(self,pandaID,fromadd,mailAddr,message,nTry,fileBackUp):
- _logger.debug("%s send to %s\n%s" % (pandaID,mailAddr,message))
- for iTry in range(nTry):
- try:
- org_smtpstderr = smtplib.stderr
- smtplib.stderr = StderrLogger(pandaID)
- smtpPort = smtpPortList[iTry % len(smtpPortList)]
- server = smtplib.SMTP(panda_config.emailSMTPsrv,smtpPort)
- server.set_debuglevel(1)
- server.ehlo()
- server.starttls()
- #server.login(panda_config.emailLogin,panda_config.emailPass)
- out = server.sendmail(fromadd,mailAddr,message)
- _logger.debug('%s %s' % (pandaID,str(out)))
- server.quit()
- break
- except:
- errType,errValue = sys.exc_info()[:2]
- if iTry+1 < nTry:
- # sleep for retry
- _logger.debug("%s sleep %s due to %s %s" % (pandaID,iTry,errType,errValue))
- time.sleep(30)
- else:
- _logger.error("%s %s %s" % (pandaID,errType,errValue))
- if fileBackUp:
- # write to file which is processed in add.py
- mailFile = '%s/mail_%s_%s' % (panda_config.logdir,self.job.PandaID,commands.getoutput('uuidgen'))
- oMail = open(mailFile,"w")
- oMail.write(str(self.job.PandaID)+'\n'+fromadd+'\n'+mailAddr+'\n'+message)
- oMail.close()
- try:
- smtplib.stderr = org_smtpstderr
- except:
- pass
-
-
-
- # get email
- def getEmail(self,dn):
- # get DN
- _logger.debug("getDN for %s" % dn)
- dbProxy = DBProxy()
- distinguishedName = dbProxy.cleanUserID(dn)
- _logger.debug("DN = %s" % distinguishedName)
- if distinguishedName == "":
- _logger.error("cannot get DN for %s" % dn)
- return ""
- # get email from MetaDB
- mailAddr = self.taskBuffer.getEmailAddr(distinguishedName)
- if mailAddr == 'notsend':
- _logger.debug("email from MetaDB : '%s'" % mailAddr)
- return mailAddr
- # get email from DQ2
- realDN = re.sub('/CN=limited proxy','',dn)
- realDN = re.sub('(/CN=proxy)+','',realDN)
- try:
- _logger.debug("dq2Info.finger(%s)" % realDN)
- for iDDMTry in range(3):
- status,out = dq2Info.finger(realDN)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(10)
- else:
- break
- _logger.debug(out)
- exec "userInfo=%s" % out
- mailAddr = userInfo['email']
- _logger.debug("email from DQ2 : '%s'" % mailAddr)
- return mailAddr
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s %s" % (errType,errValue))
- return ""
-
-
-
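
For reference, Notifier.sendMail above retries delivery by alternating between the configured SMTP ports and sleeping between attempts. A minimal sketch of that pattern using the standard smtplib API; host, ports and message below are placeholders, not values from the removed module:

import smtplib
import time

def send_with_retry(smtp_host, smtp_ports, fromaddr, toaddr, message, n_try=3, pause=30):
    for i_try in range(n_try):
        try:
            # alternate between the configured ports on each attempt
            server = smtplib.SMTP(smtp_host, smtp_ports[i_try % len(smtp_ports)])
            server.ehlo()
            server.starttls()
            out = server.sendmail(fromaddr, toaddr, message)
            server.quit()
            return out
        except Exception:
            if i_try + 1 == n_try:
                raise
            time.sleep(pause)

# e.g. send_with_retry('smtp.example.com', [25, 587], 'panda@example.com',
#                      'user@example.com', 'Subject: test\n\nhello')
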
diff --git a/current/pandaserver/dataservice/ProcessLimiter.py b/current/pandaserver/dataservice/ProcessLimiter.py
deleted file mode 100644
index 580fe9c39..000000000
--- a/current/pandaserver/dataservice/ProcessLimiter.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import datetime
-import commands
-import threading
-
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('ProcessLimiter')
-
-
-# limit the number of processes
-class ProcessLimiter:
- # constructor
- def __init__(self,maxProcess=3):
- self.processLock = threading.Semaphore(maxProcess)
- self.dataLock = threading.Lock()
- self.summary = {'nQueued':0,'nRunning':0}
-
-
- # update summary
- def updateSummary(self,dataName,change):
- # lock
- self.dataLock.acquire()
- # update
- if self.summary.has_key(dataName):
- self.summary[dataName] += change
- # release
- self.dataLock.release()
- _logger.debug('Summary : %s' % str(self.summary))
-
-
- # execute command
- def getstatusoutput(self,commandStr):
- # time stamp
- timestamp = datetime.datetime.utcnow().isoformat(' ')
- _logger.debug('%s start for "%s"' % (timestamp,commandStr))
- self.updateSummary('nQueued',1)
- _logger.debug('%s getting lock' % timestamp)
- # get semaphore
- self.processLock.acquire()
- _logger.debug('%s got lock' % timestamp)
- # execute
- self.updateSummary('nRunning',1)
- status,output = commands.getstatusoutput(commandStr)
- _logger.debug('%s executed' % timestamp)
- self.updateSummary('nRunning',-1)
- # release queue
- self.processLock.release()
- _logger.debug('%s end' % timestamp)
- self.updateSummary('nQueued',-1)
- # return
- return status,output
-
-
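
For reference, ProcessLimiter above bounds concurrency with a counting semaphore: callers block in getstatusoutput() until one of maxProcess slots frees up. A minimal sketch of the same pattern using subprocess; the class and method names are illustrative, not the removed module's API:

import subprocess
import threading

class CommandLimiter(object):
    def __init__(self, max_process=3):
        # at most max_process commands run at once; extra callers block
        self._slots = threading.Semaphore(max_process)

    def getstatusoutput(self, command_str):
        with self._slots:
            proc = subprocess.Popen(command_str, shell=True,
                                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            output, _ = proc.communicate()
            return proc.returncode, output

# e.g. CommandLimiter(2).getstatusoutput('echo hello')
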
diff --git a/current/pandaserver/dataservice/RetryMaker.py b/current/pandaserver/dataservice/RetryMaker.py
deleted file mode 100755
index e6b69a6ce..000000000
--- a/current/pandaserver/dataservice/RetryMaker.py
+++ /dev/null
@@ -1,125 +0,0 @@
-'''
-retry maker
-
-'''
-
-import re
-import sys
-import commands
-import urllib
-import datetime
-import time
-
-from config import panda_config
-from userinterface import ReBroker
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('RetryMaker')
-
-
-def initLogger(pLogger):
- # redirect logging to parent as it doesn't work in nested threads
- global _logger
- _logger = pLogger
- ReBroker.initLogger(_logger)
-
-
-class RetryMaker:
- # constructor
- def __init__(self,taskBuffer,job):
- self.job = job
- self.taskBuffer = taskBuffer
-
- # main
- def run(self):
- _logger.debug("%s start" % self.job.PandaID)
- try:
- # check the number of server retry
- nRetry = self.job.specialHandling.split(',').count('sretry')
- _logger.debug("%s nRetry=%s" % (self.job.PandaID,nRetry))
- # too many reattempts
- maxRetry = 2
- if nRetry >= maxRetry:
- _logger.debug("%s end : too many reattempts %s>=%s" % (self.job.PandaID,nRetry,maxRetry))
- return True
- # get all job status in Active
- idStatus,buildID = self.taskBuffer.getPandIDsWithJobID(self.job.prodUserName,
- self.job.jobDefinitionID,
- {},0)
- # count # of failed in active
- nFailed = 0
- for tmpID,tmpVar in idStatus.iteritems():
- # ignore buildJob
- if tmpID == buildID:
- continue
- # count
- tmpStatus,tmpCommand = tmpVar
- if tmpStatus == 'failed':
- nFailed += 1
- elif tmpStatus == 'cancelled' or tmpCommand == 'tobekilled':
- # killed
- _logger.debug("%s end : cancelled" % self.job.PandaID)
- return True
- _logger.debug("%s : nFailed=%s in Active" % (self.job.PandaID,nFailed))
- # no failed
- if nFailed == 0:
- _logger.debug("%s end : no failed jobs" % self.job.PandaID)
- return True
- # get all job status including Archived
- idStatus,buildID = self.taskBuffer.getPandIDsWithJobIDLog(self.job.prodUserName,
- self.job.jobDefinitionID,
- idStatus,0,buildID)
- # count # of failed and others in archived
- nFailed = 0
- nOthers = 0
- for tmpID,tmpVar in idStatus.iteritems():
- # ignore buildJob
- if tmpID == buildID:
- continue
- # count
- tmpStatus,tmpCommand = tmpVar
- if tmpStatus == 'failed':
- nFailed += 1
- elif tmpStatus == 'cancelled' or tmpCommand == 'tobekilled':
- # killed
- _logger.debug("%s end : cancelled" % self.job.PandaID)
- return True
- else:
- nOthers += 1
- _logger.debug("%s : nFailed=%s nOthers=%s in Active+Archived" % (self.job.PandaID,nFailed,nOthers))
- # no successful jobs
- if nOthers == 0:
- _logger.debug("%s end : no successful jobs" % self.job.PandaID)
- return True
- # no failed jobs just in case
- if nFailed == 0:
- _logger.debug("%s end : no failed jobs" % self.job.PandaID)
- return True
- # check ratio
- maxFailedRatio = 0.8
- failedRatio = float(nFailed) / float(nOthers+nFailed)
- if failedRatio > maxFailedRatio:
- _logger.debug("%s end : too many failed jobs %s/%s>%s" % (self.job.PandaID,
- nFailed,
- nOthers+nFailed,
- maxFailedRatio))
- return True
- # instantiate rebrokerage since server-side retry relies on that
- rebro = ReBroker.ReBroker(self.taskBuffer,forFailed=True,avoidSameSite=True)
- # lock job for retry
- reSt,reVal = rebro.lockJob(self.job.prodUserID,self.job.jobDefinitionID)
- if not reSt:
-                _logger.debug("%s end : failed to lock jobs with %s" % (self.job.PandaID,reVal))
- return False
- # execute
- _logger.debug("%s : execute ReBroker" % self.job.PandaID)
- rebro.start()
- rebro.join()
- _logger.debug("%s end : successfully" % self.job.PandaID)
- return True
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s %s %s" % (self.job.PandaID,errType,errValue))
- _logger.debug("%s end : failed" % self.job.PandaID)
- return False
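
For reference, RetryMaker.run above retries only when there is at least one failed job, at least one non-failed job, and the failed fraction stays at or below the threshold (0.8). A minimal sketch of that decision, with assumed names:

def should_retry(n_failed, n_others, max_failed_ratio=0.8):
    # no point retrying without failures, and never retry when everything failed
    if n_failed == 0 or n_others == 0:
        return False
    return float(n_failed) / float(n_others + n_failed) <= max_failed_ratio

# e.g. should_retry(2, 8) -> True ; should_retry(9, 1) -> False
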
diff --git a/current/pandaserver/dataservice/Setupper.py b/current/pandaserver/dataservice/Setupper.py
deleted file mode 100755
index 6b2103fea..000000000
--- a/current/pandaserver/dataservice/Setupper.py
+++ /dev/null
@@ -1,2420 +0,0 @@
-'''
-setup dataset
-
-'''
-
-import re
-import sys
-import time
-import types
-import urllib
-import datetime
-import commands
-import threading
-import traceback
-import ErrorCode
-import TaskAssigner
-from DDM import ddm
-from dataservice.DDM import dq2Common
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-from taskbuffer.DatasetSpec import DatasetSpec
-from brokerage.SiteMapper import SiteMapper
-from brokerage.PandaSiteIDs import PandaMoverIDs
-import brokerage.broker
-import brokerage.broker_util
-import DataServiceUtils
-
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('Setupper')
-
-
-# temporary
-PandaDDMSource = ['BNLPANDA','BNL-OSG2_MCDISK','BNL-OSG2_DATADISK','BNL-OSG2_MCTAPE','BNL-OSG2_DATATAPE']
-
-
-class Setupper (threading.Thread):
- # constructor
- def __init__(self,taskBuffer,jobs,resubmit=False,pandaDDM=False,ddmAttempt=0,forkRun=False,onlyTA=False,
- resetLocation=False,useNativeDQ2=True):
- threading.Thread.__init__(self)
- self.jobs = jobs
- self.taskBuffer = taskBuffer
- # VUIDs of dispatchDBlocks
- self.vuidMap = {}
- # resubmission or not
- self.resubmit = resubmit
- # time stamp
- self.timestamp = datetime.datetime.utcnow().isoformat(' ')
- # use PandaDDM
- self.pandaDDM = pandaDDM
- # file list for dispDS for PandaDDM
- self.dispFileList = {}
- # priority for ddm job
- self.ddmAttempt = ddmAttempt
- # site mapper
- self.siteMapper = None
- # fork another process because python doesn't release memory
- self.forkRun = forkRun
- # run task assignment only
- self.onlyTA = onlyTA
- # location map
- self.replicaMap = {}
- # all replica locations
- self.allReplicaMap = {}
- # reset locations
- self.resetLocation = resetLocation
- # replica map for special brokerage
- self.replicaMapForBroker = {}
- # available files at T2
- self.availableLFNsInT2 = {}
- # use DQ2 in the same process
- self.useNativeDQ2 = useNativeDQ2
- # list of missing datasets
- self.missingDatasetList = {}
- # lfn ds map
- self.lfnDatasetMap = {}
-
-
- # main
- def run(self):
- try:
- _logger.debug('%s startRun' % self.timestamp)
- self._memoryCheck()
- # run main procedure in the same process
- if not self.forkRun:
- if self.jobs != None and len(self.jobs) > 0:
- _logger.debug('%s PandaID:%s type:%s taskID:%s' % (self.timestamp,
- self.jobs[0].PandaID,
- self.jobs[0].prodSourceLabel,
- self.jobs[0].taskID))
- # instantiate site mapper
- self.siteMapper = SiteMapper(self.taskBuffer)
- # use native DQ2
- if self.useNativeDQ2:
- ddm.useDirectDQ2()
- # correctLFN
- self._correctLFN()
- # run full Setupper
- if not self.onlyTA:
- # invoke brokerage
- _logger.debug('%s brokerSchedule' % self.timestamp)
- brokerage.broker.schedule(self.jobs,self.taskBuffer,self.siteMapper,
- replicaMap=self.replicaMapForBroker,
- t2FilesMap=self.availableLFNsInT2)
- # remove waiting jobs
- self.removeWaitingJobs()
- # setup dispatch dataset
- _logger.debug('%s setupSource' % self.timestamp)
- self._setupSource()
- # sort by site so that larger subs are created in the next step
- if self.jobs != [] and self.jobs[0].prodSourceLabel in ['managed','test']:
- tmpJobMap = {}
- for tmpJob in self.jobs:
- # add site
- if not tmpJobMap.has_key(tmpJob.computingSite):
- tmpJobMap[tmpJob.computingSite] = []
- # add job
- tmpJobMap[tmpJob.computingSite].append(tmpJob)
- # make new list
- tmpJobList = []
- for tmpSiteKey in tmpJobMap.keys():
- tmpJobList += tmpJobMap[tmpSiteKey]
- # set new list
- self.jobs = tmpJobList
- # create dataset for outputs and assign destination
- if self.jobs != [] and self.jobs[0].prodSourceLabel in ['managed','test'] and self.jobs[0].cloud in ['DE']:
- # count the number of jobs per _dis
- iBunch = 0
- prevDisDsName = None
- nJobsPerDisList = []
- for tmpJob in self.jobs:
- if prevDisDsName != None and prevDisDsName != tmpJob.dispatchDBlock:
- nJobsPerDisList.append(iBunch)
- iBunch = 0
- # increment
- iBunch += 1
- # set _dis name
- prevDisDsName = tmpJob.dispatchDBlock
- # remaining
- if iBunch != 0:
- nJobsPerDisList.append(iBunch)
- # split sub datasets
- iBunch = 0
- nBunchMax = 50
- tmpIndexJob = 0
- for nJobsPerDis in nJobsPerDisList:
- # check _dis boundary so that the same _dis doesn't contribute to many _subs
- if iBunch+nJobsPerDis > nBunchMax:
- if iBunch != 0:
- self._setupDestination(startIdx=tmpIndexJob,nJobsInLoop=iBunch)
- tmpIndexJob += iBunch
- iBunch = 0
- # increment
- iBunch += nJobsPerDis
- # remaining
- if iBunch != 0:
- self._setupDestination(startIdx=tmpIndexJob,nJobsInLoop=iBunch)
- else:
-                        # all jobs in one go
- self._setupDestination()
- # make dis datasets for existing files
- self._makeDisDatasetsForExistingfiles()
- # update jobs
- _logger.debug('%s updateJobs' % self.timestamp)
- self._updateJobs()
-                    # then subscribe sites to dispatchDBlocks. this must be the last method
- _logger.debug('%s subscribeDistpatchDB' % self.timestamp)
- self._subscribeDistpatchDB()
- # dynamic data placement for analysis jobs
- self._dynamicDataPlacement()
- # pin input datasets
- self._pinInputDatasets()
- # make subscription for missing
- self._makeSubscriptionForMissing()
- else:
- # write jobs to file
- import os
- import cPickle as pickle
- outFileName = '%s/set.%s_%s' % (panda_config.logdir,self.jobs[0].PandaID,commands.getoutput('uuidgen'))
- outFile = open(outFileName,'w')
- pickle.dump(self.jobs,outFile)
- outFile.close()
- # run main procedure in another process because python doesn't release memory
- com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd)
- com += 'source /opt/glite/etc/profile.d/grid-env.sh; '
- com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
- (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
- panda_config.pandaPython_dir,outFileName)
- if self.onlyTA:
- com += " -t"
- _logger.debug(com)
-                # execute
- status,output = self.taskBuffer.processLimiter.getstatusoutput(com)
- _logger.debug("Ret from another process: %s %s" % (status,output))
- self._memoryCheck()
- _logger.debug('%s endRun' % self.timestamp)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s run() : %s %s" % (self.timestamp,type,value))
-
-
-    # make dispatchDBlocks, insert prod/dispatchDBlock into the database
- def _setupSource(self):
- fileList = {}
- prodList = []
- prodError = {}
- dispSiteMap = {}
- dispError = {}
- # extract prodDBlock
- for job in self.jobs:
- # ignore failed jobs
- if job.jobStatus in ['failed','cancelled']:
- continue
- # production datablock
- if job.prodDBlock != 'NULL' and (not self.pandaDDM) and (not job.prodSourceLabel in ['user','panda']):
- # get VUID and record prodDBlock into DB
- if not prodError.has_key(job.prodDBlock):
- time.sleep(1)
- _logger.debug((self.timestamp,'queryDatasetByName',job.prodDBlock))
- prodError[job.prodDBlock] = ''
- for iDDMTry in range(3):
- status,out = ddm.repositoryClient.main('queryDatasetByName',job.prodDBlock)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- _logger.debug("%s %s" % (self.timestamp,out))
- if status != 0 or out.find('Error') != -1:
- prodError[job.prodDBlock] = "Setupper._setupSource() could not get VUID of prodDBlock"
- _logger.error(out)
- else:
- try:
- exec "vuids = %s['%s']['vuids']" % (out.split('\n')[0],job.prodDBlock)
- nfiles = 0
- # dataset spec
- ds = DatasetSpec()
- ds.vuid = vuids[0]
- ds.name = job.prodDBlock
- ds.type = 'input'
- ds.status = 'completed'
- ds.numberfiles = nfiles
- ds.currentfiles = nfiles
- prodList.append(ds)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("_setupSource() : %s %s" % (type,value))
- prodError[job.prodDBlock] = "Setupper._setupSource() could not decode VUID of prodDBlock"
- # error
- if prodError[job.prodDBlock] != '':
- job.jobStatus = 'failed'
- job.ddmErrorCode = ErrorCode.EC_Setupper
- job.ddmErrorDiag = prodError[job.prodDBlock]
- continue
- # dispatch datablock
- if job.dispatchDBlock != 'NULL':
- # src/dst sites
- tmpSrcID = 'BNL_ATLAS_1'
- if self.siteMapper.checkCloud(job.cloud):
- # use cloud's source
- tmpSrcID = self.siteMapper.getCloud(job.cloud)['source']
- srcDQ2ID = self.siteMapper.getSite(tmpSrcID).ddm
- # use srcDQ2ID as dstDQ2ID when dst SE is same as src SE
- srcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpSrcID).se)
- dstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(job.computingSite).se)
- if srcSEs == dstSEs:
- dstDQ2ID = srcDQ2ID
- else:
- dstDQ2ID = self.siteMapper.getSite(job.computingSite).ddm
- dispSiteMap[job.dispatchDBlock] = {'src':srcDQ2ID,'dst':dstDQ2ID,'site':job.computingSite}
- # filelist
- if not fileList.has_key(job.dispatchDBlock):
- fileList[job.dispatchDBlock] = {'lfns':[],'guids':[],'fsizes':[],'md5sums':[],'chksums':[]}
- # collect LFN and GUID
- for file in job.Files:
- if file.type == 'input' and file.status == 'pending':
- if not file.lfn in fileList[job.dispatchDBlock]['lfns']:
- fileList[job.dispatchDBlock]['lfns'].append(file.lfn)
- fileList[job.dispatchDBlock]['guids'].append(file.GUID)
- if file.fsize in ['NULL',0]:
- fileList[job.dispatchDBlock]['fsizes'].append(None)
- else:
- fileList[job.dispatchDBlock]['fsizes'].append(long(file.fsize))
- if file.md5sum in ['NULL','']:
- fileList[job.dispatchDBlock]['md5sums'].append(None)
- elif file.md5sum.startswith("md5:"):
- fileList[job.dispatchDBlock]['md5sums'].append(file.md5sum)
- else:
- fileList[job.dispatchDBlock]['md5sums'].append("md5:%s" % file.md5sum)
- if file.checksum in ['NULL','']:
- fileList[job.dispatchDBlock]['chksums'].append(None)
- else:
- fileList[job.dispatchDBlock]['chksums'].append(file.checksum)
- # get replica locations
- if not self.replicaMap.has_key(job.dispatchDBlock):
- self.replicaMap[job.dispatchDBlock] = {}
- if not self.allReplicaMap.has_key(file.dataset):
- if file.dataset.endswith('/'):
- status,out = self.getListDatasetReplicasInContainer(file.dataset)
- else:
- for iDDMTry in range(3):
- _logger.debug((self.timestamp,'listDatasetReplicas',file.dataset))
- status,out = ddm.DQ2.main('listDatasetReplicas',file.dataset,0,None,False)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1 \
- or out == '()':
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error("%s %s" % (self.timestamp,out))
- dispError[job.dispatchDBlock] = 'could not get locations for %s' % file.dataset
- _logger.error(dispError[job.dispatchDBlock])
- else:
- _logger.debug("%s %s" % (self.timestamp,out))
- tmpRepSites = {}
- try:
- # convert res to map
- exec "tmpRepSites = %s" % out
- self.allReplicaMap[file.dataset] = tmpRepSites
- except:
- dispError[job.dispatchDBlock] = 'could not convert HTTP-res to replica map for %s' % file.dataset
- _logger.error(dispError[job.dispatchDBlock])
- _logger.error(out)
- if self.allReplicaMap.has_key(file.dataset):
- self.replicaMap[job.dispatchDBlock][file.dataset] = self.allReplicaMap[file.dataset]
- # register dispatch dataset
- dispList = []
- for dispatchDBlock in fileList.keys():
- # ignore empty dataset
- if len(fileList[dispatchDBlock]['lfns']) == 0:
- continue
- # use DQ2
- if (not self.pandaDDM) and (not dispSiteMap[dispatchDBlock]['src'] in PandaDDMSource or \
- self.siteMapper.getSite(dispSiteMap[dispatchDBlock]['site']).cloud != 'US') \
- and (job.prodSourceLabel != 'ddm') and (not dispSiteMap[dispatchDBlock]['site'].endswith("_REPRO")):
- # register dispatch dataset
- disFiles = fileList[dispatchDBlock]
- _logger.debug((self.timestamp,'registerNewDataset',dispatchDBlock,disFiles['lfns'],disFiles['guids'],
- disFiles['fsizes'],disFiles['chksums'],None,None,None,True))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('registerNewDataset',dispatchDBlock,disFiles['lfns'],disFiles['guids'],
- disFiles['fsizes'],disFiles['chksums'],None,None,None,True)
- if status != 0 and out.find('DQDatasetExistsException') != -1:
- break
- elif status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- _logger.debug("%s sleep %s for %s" % (self.timestamp,iDDMTry,dispatchDBlock))
- _logger.debug(status)
- _logger.debug(out)
- _logger.debug("-------------")
- time.sleep(60)
- else:
- break
- if status != 0 or out.find('Error') != -1:
- _logger.error("%s %s" % (self.timestamp,out))
- dispError[dispatchDBlock] = "Setupper._setupSource() could not register dispatchDBlock"
- continue
- _logger.debug("%s %s" % (self.timestamp,out))
- vuidStr = out
- # freezeDataset dispatch dataset
- time.sleep(1)
- _logger.debug((self.timestamp,'freezeDataset',dispatchDBlock))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('freezeDataset',dispatchDBlock)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- if status != 0 or (out.find('Error') != -1 and out.find("is frozen") == -1):
- _logger.error("%s %s" % (self.timestamp,out))
- dispError[dispatchDBlock] = "Setupper._setupSource() could not freeze dispatchDBlock"
- continue
- _logger.debug("%s %s" % (self.timestamp,out))
- else:
- # use PandaDDM
- self.dispFileList[dispatchDBlock] = fileList[dispatchDBlock]
- # create a fake vuidStr for PandaDDM
- tmpMap = {'vuid':commands.getoutput('uuidgen')}
- vuidStr = "%s" % tmpMap
- # get VUID
- try:
- exec "vuid = %s['vuid']" % vuidStr
- # dataset spec. currentfiles is used to count the number of failed jobs
- ds = DatasetSpec()
- ds.vuid = vuid
- ds.name = dispatchDBlock
- ds.type = 'dispatch'
- ds.status = 'defined'
-                ds.numberfiles = len(fileList[dispatchDBlock]['lfns'])
- ds.currentfiles = 0
- dispList.append(ds)
- self.vuidMap[ds.name] = ds.vuid
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("_setupSource() : %s %s" % (type,value))
- dispError[dispatchDBlock] = "Setupper._setupSource() could not decode VUID dispatchDBlock"
- # insert datasets to DB
- self.taskBuffer.insertDatasets(prodList+dispList)
- # job status
- for job in self.jobs:
- if dispError.has_key(job.dispatchDBlock) and dispError[job.dispatchDBlock] != '':
- job.jobStatus = 'failed'
- job.ddmErrorCode = ErrorCode.EC_Setupper
- job.ddmErrorDiag = dispError[job.dispatchDBlock]
- # delete explicitly some huge variables
- del fileList
- del prodList
- del prodError
- del dispSiteMap
-
-
- # create dataset for outputs in the repository and assign destination
- def _setupDestination(self,startIdx=-1,nJobsInLoop=50):
- _logger.debug('%s setupDestination idx:%s n:%s' % (self.timestamp,startIdx,nJobsInLoop))
- destError = {}
- datasetList = {}
- newnameList = {}
- snGottenDS = []
- if startIdx == -1:
- jobsList = self.jobs
- else:
- jobsList = self.jobs[startIdx:startIdx+nJobsInLoop]
- for job in jobsList:
- # ignore failed jobs
- if job.jobStatus in ['failed','cancelled']:
- continue
- for file in job.Files:
- # ignore input files
- if file.type == 'input':
- continue
- # don't touch with outDS for unmerge jobs
- if job.prodSourceLabel == 'panda' and job.processingType == 'unmerge' and file.type != 'log':
- continue
- # extract destinationDBlock, destinationSE and computingSite
- dest = (file.destinationDBlock,file.destinationSE,job.computingSite,file.destinationDBlockToken)
- if not destError.has_key(dest):
- destError[dest] = ''
- originalName = ''
- if (job.prodSourceLabel == 'panda') or (job.prodSourceLabel in ['ptest','rc_test'] and \
- job.processingType in ['pathena','prun','gangarobot-rctest']):
- # keep original name
- nameList = [file.destinationDBlock]
- else:
- # set freshness to avoid redundant DB lookup
- definedFreshFlag = None
- if file.destinationDBlock in snGottenDS:
- # already checked
- definedFreshFlag = False
- elif job.prodSourceLabel in ['user','test','prod_test']:
- # user or test datasets are always fresh in DB
- definedFreshFlag = True
- # get serial number
- sn,freshFlag = self.taskBuffer.getSerialNumber(file.destinationDBlock,definedFreshFlag)
- if sn == -1:
- destError[dest] = "Setupper._setupDestination() could not get serial num for %s" % file.destinationDBlock
- continue
- if not file.destinationDBlock in snGottenDS:
- snGottenDS.append(file.destinationDBlock)
- # new dataset name
- newnameList[dest] = "%s_sub0%s" % (file.destinationDBlock,sn)
- if freshFlag or self.resetLocation:
- # register original dataset and new dataset
- nameList = [file.destinationDBlock,newnameList[dest]]
- originalName = file.destinationDBlock
- else:
- # register new dataset only
- nameList = [newnameList[dest]]
- # create dataset
- for name in nameList:
- computingSite = job.computingSite
- if name == originalName:
- # for original dataset
- computingSite = file.destinationSE
- # use DQ2
- if (not self.pandaDDM) and (job.prodSourceLabel != 'ddm') and (job.destinationSE != 'local'):
-                            # get src and dest DDM; conversion is needed for unknown sites
- if job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(computingSite):
- # DQ2 ID was set by using --destSE for analysis job to transfer output
- tmpSrcDDM = self.siteMapper.getSite(job.computingSite).ddm
- else:
- tmpSrcDDM = self.siteMapper.getSite(computingSite).ddm
- if job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE):
- # DQ2 ID was set by using --destSE for analysis job to transfer output
- tmpDstDDM = tmpSrcDDM
- else:
- tmpDstDDM = self.siteMapper.getSite(file.destinationSE).ddm
- # skip registration for _sub when src=dest
- if tmpSrcDDM == tmpDstDDM and name != originalName and re.search('_sub\d+$',name) != None:
- # create a fake vuidStr
- vuidStr = 'vuid="%s"' % commands.getoutput('uuidgen')
- else:
- # register dataset
- time.sleep(1)
- # set hidden flag for _sub
- tmpHiddenFlag = False
- if name != originalName and re.search('_sub\d+$',name) != None:
- tmpHiddenFlag = True
- _logger.debug((self.timestamp,'registerNewDataset',name,[],[],[],[],
- None,None,None,tmpHiddenFlag))
- atFailed = 0
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('registerNewDataset',name,[],[],[],[],
- None,None,None,tmpHiddenFlag)
- if status != 0 and out.find('DQDatasetExistsException') != -1:
- atFailed = iDDMTry
- break
- elif status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- _logger.debug("%s sleep %s for %s" % (self.timestamp,iDDMTry,name))
- _logger.debug(status)
- _logger.debug(out)
- _logger.debug("-------------")
- time.sleep(60)
- else:
- break
- if status != 0 or out.find('Error') != -1:
- # unset vuidStr
- vuidStr = ""
- # ignore 'already exists' ERROR because original dataset may be registered by upstream.
-                                    # atFailed > 0 is for the case in which the first attempt succeeded but a failure was reported
- if (job.prodSourceLabel == 'panda' or (job.prodSourceLabel in ['ptest','rc_test'] and \
- job.processingType in ['pathena','prun','gangarobot-rctest']) \
- or name == originalName or atFailed > 0) and \
- out.find('DQDatasetExistsException') != -1:
- _logger.debug('%s ignored DQDatasetExistsException' % self.timestamp)
- else:
- destError[dest] = "Setupper._setupDestination() could not register : %s" % name
- _logger.error("%s %s" % (self.timestamp,out))
- continue
- else:
- _logger.debug("%s %s" % (self.timestamp,out))
- vuidStr = "vuid = %s['vuid']" % out
- # get list of tokens
- tmpTokenList = file.destinationDBlockToken.split(',')
-                            # register dataset locations
- if name == originalName or tmpSrcDDM != tmpDstDDM or \
- job.prodSourceLabel == 'panda' or (job.prodSourceLabel in ['ptest','rc_test'] and \
- job.processingType in ['pathena','prun','gangarobot-rctest']) \
- or len(tmpTokenList) > 1:
- time.sleep(1)
- # register location
- usingT1asT2 = False
- if job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(computingSite):
- dq2IDList = [self.siteMapper.getSite(job.computingSite).ddm]
- else:
- if self.siteMapper.getSite(computingSite).cloud != job.cloud and \
- re.search('_sub\d+$',name) != None and \
- (not job.prodSourceLabel in ['user','panda']) and \
- (not self.siteMapper.getSite(computingSite).ddm.endswith('PRODDISK')):
- # T1 used as T2. Use both DATADISK and PRODDISK as locations while T1 PRODDISK is phasing out
- dq2IDList = [self.siteMapper.getSite(computingSite).ddm]
- if self.siteMapper.getSite(computingSite).setokens.has_key('ATLASPRODDISK'):
- dq2IDList += [self.siteMapper.getSite(computingSite).setokens['ATLASPRODDISK']]
- usingT1asT2 = True
- else:
- dq2IDList = [self.siteMapper.getSite(computingSite).ddm]
- # use another location when token is set
- if (not usingT1asT2) and (not file.destinationDBlockToken in ['NULL','']):
- dq2IDList = []
- for tmpToken in tmpTokenList:
- # set default
- dq2ID = self.siteMapper.getSite(computingSite).ddm
- # convert token to DQ2ID
- if self.siteMapper.getSite(computingSite).setokens.has_key(tmpToken):
- dq2ID = self.siteMapper.getSite(computingSite).setokens[tmpToken]
- # replace or append
- if len(tmpTokenList) <= 1 or name != originalName:
- # use location consistent with token
- dq2IDList = [dq2ID]
- break
- else:
- # use multiple locations for _tid
- if not dq2ID in dq2IDList:
- dq2IDList.append(dq2ID)
- # loop over all locations
- repLifeTime = None
- if name != originalName and re.search('_sub\d+$',name) != None:
- repLifeTime = "14 days"
- for dq2ID in dq2IDList:
- _logger.debug((self.timestamp,'registerDatasetLocation',name,dq2ID,0,0,None,None,None,repLifeTime))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('registerDatasetLocation',name,dq2ID,0,0,None,None,None,repLifeTime)
- if status != 0 and out.find('DQLocationExistsException') != -1:
- break
- elif status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- # ignore "already exists at location XYZ"
- if out.find('DQLocationExistsException') != -1:
- _logger.debug('%s ignored DQLocationExistsException' % self.timestamp)
- status,out = 0,''
- else:
- _logger.debug("%s %s" % (self.timestamp,out))
- if status == 0 and out.find('Error') == -1:
- # change replica ownership for user datasets
- if self.resetLocation and ((name == originalName and job.prodSourceLabel == 'user') or \
- job.prodSourceLabel=='panda'):
- # remove /CN=proxy and /CN=limited from DN
- tmpRealDN = job.prodUserID
- tmpRealDN = re.sub('/CN=limited proxy','',tmpRealDN)
- tmpRealDN = re.sub('/CN=proxy','',tmpRealDN)
- status,out = dq2Common.parse_dn(tmpRealDN)
- if status != 0:
- _logger.error("%s %s" % (self.timestamp,out))
- status,out = 1,'failed to truncate DN:%s' % job.prodUserID
- else:
- tmpRealDN = out
- _logger.debug((self.timestamp,'setReplicaMetaDataAttribute',name,dq2ID,'owner',tmpRealDN))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',name,dq2ID,'owner',tmpRealDN)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- # failed
- if status != 0 or out.find('Error') != -1:
- _logger.error("%s %s" % (self.timestamp,out))
- break
- # delete old replicas
- tmpDelStat = self.deleteDatasetReplicas([name],[dq2ID])
- if not tmpDelStat:
- status,out = 1,'failed to delete old replicas for %s' % name
- break
- # failed
- if status != 0 or out.find('Error') != -1:
- _logger.error("%s %s" % (self.timestamp,out))
- break
- else:
- # skip registerDatasetLocations
- status,out = 0,''
- if status != 0 or out.find('Error') != -1:
- destError[dest] = "Could not register location : %s %s" % (name,out.split('\n')[-1])
- elif job.prodSourceLabel == 'panda' or (job.prodSourceLabel in ['ptest','rc_test'] and \
- job.processingType in ['pathena','prun','gangarobot-rctest']):
- # do nothing for "panda" job
- pass
- elif name == originalName and job.prodSourceLabel in ['managed','test','rc_test','ptest']:
- # set metadata
- time.sleep(1)
- dq2ID = self.siteMapper.getSite(file.destinationSE).ddm
- # use another location when token is set
- if not file.destinationDBlockToken in ['NULL','']:
-                                    # register only the first token because it is used as the location
- tmpFirstToken = file.destinationDBlockToken.split(',')[0]
- if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpFirstToken):
- dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpFirstToken]
- _logger.debug((self.timestamp,'setMetaDataAttribute',name,'origin',dq2ID))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('setMetaDataAttribute',name,'origin',dq2ID)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- _logger.debug("%s %s" % (self.timestamp,out))
- if status != 0 or (out != 'None' and out.find('already exists') == -1):
- _logger.error(out)
- destError[dest] = "Setupper._setupDestination() could not set metadata : %s" % name
- # use PandaDDM or non-DQ2
- else:
- # create a fake vuidStr
- vuidStr = 'vuid="%s"' % commands.getoutput('uuidgen')
- # already failed
- if destError[dest] != '' and name == originalName:
- break
- # get vuid
- if vuidStr == '':
- _logger.debug((self.timestamp,'queryDatasetByName',name))
- for iDDMTry in range(3):
- status,out = ddm.repositoryClient.main('queryDatasetByName',name)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- _logger.debug("%s %s" % (self.timestamp,out))
- if status != 0 or out.find('Error') != -1:
- _logger.error(out)
- vuidStr = "vuid = %s['%s']['vuids'][0]" % (out.split('\n')[0],name)
- try:
- exec vuidStr
- # dataset spec
- ds = DatasetSpec()
- ds.vuid = vuid
- ds.name = name
- ds.type = 'output'
- ds.numberfiles = 0
- ds.currentfiles = 0
- ds.status = 'defined'
- # append
- datasetList[(name,file.destinationSE,computingSite)] = ds
- except:
- # set status
- type, value, traceBack = sys.exc_info()
- _logger.error("_setupDestination() : %s %s" % (type,value))
- destError[dest] = "Setupper._setupDestination() could not get VUID : %s" % name
- # set new destDBlock
- if newnameList.has_key(dest):
- file.destinationDBlock = newnameList[dest]
- # update job status if failed
- if destError[dest] != '':
- job.jobStatus = 'failed'
- job.ddmErrorCode = ErrorCode.EC_Setupper
- job.ddmErrorDiag = destError[dest]
- else:
- newdest = (file.destinationDBlock,file.destinationSE,job.computingSite)
- # increment number of files
- datasetList[newdest].numberfiles = datasetList[newdest].numberfiles + 1
- # dump
- for tmpDsKey in datasetList.keys():
- if re.search('_sub\d+$',tmpDsKey[0]) != None:
- _logger.debug('%s made sub:%s for nFiles=%s' % (self.timestamp,tmpDsKey[0],datasetList[tmpDsKey].numberfiles))
- # insert datasets to DB
- return self.taskBuffer.insertDatasets(datasetList.values())
-
-
-    # subscribe sites to dispatchDBlocks
- def _subscribeDistpatchDB(self):
- dispError = {}
- failedJobs = []
- ddmJobs = []
- ddmUser = 'NULL'
- for job in self.jobs:
- # ignore failed jobs
- if job.jobStatus in ['failed','cancelled']:
- continue
- # ignore no dispatch jobs
- if job.dispatchDBlock=='NULL' or job.computingSite=='NULL':
- continue
- # extract dispatchDBlock and computingSite
- disp = (job.dispatchDBlock,job.computingSite)
- if dispError.has_key(disp) == 0:
- dispError[disp] = ''
- # DQ2 IDs
- tmpSrcID = 'BNL_ATLAS_1'
- if self.siteMapper.checkCloud(job.cloud):
- # use cloud's source
- tmpSrcID = self.siteMapper.getCloud(job.cloud)['source']
- srcDQ2ID = self.siteMapper.getSite(tmpSrcID).ddm
- # destination
- tmpDstID = job.computingSite
- if srcDQ2ID != self.siteMapper.getSite(job.computingSite).ddm and \
- srcDQ2ID in self.siteMapper.getSite(job.computingSite).setokens.values():
- # direct usage of remote SE. Mainly for prestaging
- tmpDstID = tmpSrcID
- _logger.debug('%s use remote SiteSpec of %s for %s' % (self.timestamp,tmpDstID,job.computingSite))
- # use srcDQ2ID as dstDQ2ID when dst SE is same as src SE
- srcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpSrcID).se)
- dstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpDstID).se)
- if srcSEs == dstSEs or job.computingSite.endswith("_REPRO"):
- dstDQ2ID = srcDQ2ID
- else:
- dstDQ2ID = self.siteMapper.getSite(job.computingSite).ddm
- # use DQ2
- if (not self.pandaDDM) and (not srcDQ2ID in PandaDDMSource or self.siteMapper.getSite(tmpDstID).cloud != 'US') \
- and (job.prodSourceLabel != 'ddm') and (not job.computingSite.endswith("_REPRO")):
- # look for replica
- dq2ID = srcDQ2ID
- dq2IDList = []
- # register replica
- if dq2ID != dstDQ2ID:
- # make list
- if self.replicaMap.has_key(job.dispatchDBlock):
- # set DQ2 ID for DISK
- if not srcDQ2ID.endswith('_DATADISK'):
- hotID = re.sub('_MCDISK','_HOTDISK', srcDQ2ID)
- diskID = re.sub('_MCDISK','_DATADISK',srcDQ2ID)
- tapeID = re.sub('_MCDISK','_DATATAPE',srcDQ2ID)
- mctapeID = re.sub('_MCDISK','_MCTAPE',srcDQ2ID)
- else:
- hotID = re.sub('_DATADISK','_HOTDISK', srcDQ2ID)
- diskID = re.sub('_DATADISK','_DATADISK',srcDQ2ID)
- tapeID = re.sub('_DATADISK','_DATATAPE',srcDQ2ID)
- mctapeID = re.sub('_DATADISK','_MCTAPE',srcDQ2ID)
- # DQ2 ID is mixed with TAIWAN-LCG2 and TW-FTT
- if job.cloud in ['TW',]:
- tmpSiteSpec = self.siteMapper.getSite(tmpSrcID)
- if tmpSiteSpec.setokens.has_key('ATLASDATADISK'):
- diskID = tmpSiteSpec.setokens['ATLASDATADISK']
- if tmpSiteSpec.setokens.has_key('ATLASDATATAPE'):
- tapeID = tmpSiteSpec.setokens['ATLASDATATAPE']
- if tmpSiteSpec.setokens.has_key('ATLASMCTAPE'):
- mctapeID = tmpSiteSpec.setokens['ATLASMCTAPE']
- hotID = 'TAIWAN-LCG2_HOTDISK'
- for tmpDataset,tmpRepMap in self.replicaMap[job.dispatchDBlock].iteritems():
- if tmpRepMap.has_key(hotID):
- # HOTDISK
- if not hotID in dq2IDList:
- dq2IDList.append(hotID)
- if tmpRepMap.has_key(srcDQ2ID):
- # MCDISK
- if not srcDQ2ID in dq2IDList:
- dq2IDList.append(srcDQ2ID)
- if tmpRepMap.has_key(diskID):
- # DATADISK
- if not diskID in dq2IDList:
- dq2IDList.append(diskID)
- if job.cloud == 'US' and tmpRepMap.has_key('BNLPANDA'):
- # BNLPANDA
- if not 'BNLPANDA' in dq2IDList:
- dq2IDList.append('BNLPANDA')
- if tmpRepMap.has_key(tapeID):
- # DATATAPE
- if not tapeID in dq2IDList:
- dq2IDList.append(tapeID)
- if tmpRepMap.has_key(mctapeID):
- # MCTAPE
- if not mctapeID in dq2IDList:
- dq2IDList.append(mctapeID)
- # hack for split T1
- splitT1IDsHaveDS = []
- for tmpSplitT1Key in tmpRepMap.keys():
- if tmpSplitT1Key.startswith('NIKHEF-ELPROD'):
- splitT1IDsHaveDS.append(tmpSplitT1Key)
- if job.cloud == 'NL' and splitT1IDsHaveDS != [] \
- and not tmpRepMap.has_key('SARA-MATRIX_MCDISK') \
- and not tmpRepMap.has_key('SARA-MATRIX_DATADISK') \
- and not tmpRepMap.has_key('SARA-MATRIX_MCTAPE') \
- and not tmpRepMap.has_key('SARA-MATRIX_DATATAPE'):
- for tmpSplitT1Key in splitT1IDsHaveDS:
- if not tmpSplitT1Key in dq2IDList:
- dq2IDList.append(tmpSplitT1Key)
- # consider cloudconfig.tier1se
- tmpCloudSEs = DataServiceUtils.getEndpointsAtT1(tmpRepMap,self.siteMapper,job.cloud)
- useCloudSEs = []
- for tmpCloudSE in tmpCloudSEs:
- if not tmpCloudSE in dq2IDList:
- useCloudSEs.append(tmpCloudSE)
- if useCloudSEs != []:
- dq2IDList += useCloudSEs
- _logger.debug('%s use additional endpoints %s from cloudconfig' % (self.timestamp,str(useCloudSEs)))
- # use default location if empty
- if dq2IDList == []:
- dq2IDList = [dq2ID]
- for dq2ID in dq2IDList:
- time.sleep(1)
- _logger.debug((self.timestamp,'registerDatasetLocation',job.dispatchDBlock,dq2ID,0,1,None,None,None,"7 days"))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('registerDatasetLocation',job.dispatchDBlock,dq2ID,0,1,None,None,None,"7 days")
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- _logger.debug("%s %s" % (self.timestamp,out))
- # failure
- if status != 0 or out.find('Error') != -1:
- break
- else:
- # skip registerDatasetLocations
- status,out = 0,''
- if status != 0 or out.find('Error') != -1:
- _logger.error(out)
- dispError[disp] = "Setupper._subscribeDistpatchDB() could not register location"
- else:
- # assign destination
- time.sleep(1)
- optSub = {'DATASET_COMPLETE_EVENT' : ['https://%s:%s/server/panda/datasetCompleted' % \
- (panda_config.pserverhost,panda_config.pserverport)]}
- optSource = {}
- optSrcPolicy = 001000 | 010000
- dq2ID = dstDQ2ID
- # prestaging
- if srcDQ2ID == dstDQ2ID:
- # stage-in callback
- optSub['DATASET_STAGED_EVENT'] = ['https://%s:%s/server/panda/datasetCompleted' % \
- (panda_config.pserverhost,panda_config.pserverport)]
- # use ATLAS*TAPE
- seTokens = self.siteMapper.getSite(tmpDstID).setokens
- if seTokens.has_key('ATLASDATATAPE') and seTokens.has_key('ATLASMCTAPE'):
- dq2ID = seTokens['ATLASDATATAPE']
- # use MCDISK if needed
- for tmpDataset,tmpRepMap in self.replicaMap[job.dispatchDBlock].iteritems():
- if (not tmpRepMap.has_key(dq2ID)) and tmpRepMap.has_key(seTokens['ATLASMCTAPE']):
- dq2ID = seTokens['ATLASMCTAPE']
- break
- # for CERN and BNL
- if job.cloud in ['CERN','US'] and self.replicaMap.has_key(job.dispatchDBlock):
- setNewIDflag = False
- if job.cloud == 'CERN':
- otherIDs = ['CERN-PROD_DAQ','CERN-PROD_TZERO','CERN-PROD_TMPDISK']
- else:
- otherIDs = ['BNLPANDA']
- for tmpDataset,tmpRepMap in self.replicaMap[job.dispatchDBlock].iteritems():
- if not tmpRepMap.has_key(dq2ID):
- # look for another id
- for cernID in otherIDs:
- if tmpRepMap.has_key(cernID):
- dq2ID = cernID
- setNewIDflag = True
- break
- # break
- if setNewIDflag:
- break
- optSrcPolicy = 000010
- optSource[dq2ID] = {'policy' : 0}
- else:
- # set sources to handle T2s in another cloud and to transfer dis datasets being split in multiple sites
- for tmpDQ2ID in dq2IDList:
- optSource[tmpDQ2ID] = {'policy' : 0}
- # T1 used as T2
- if job.cloud != self.siteMapper.getSite(tmpDstID).cloud and \
- (not dstDQ2ID.endswith('PRODDISK')) and \
- (not job.prodSourceLabel in ['user','panda']) and \
- self.siteMapper.getSite(tmpDstID).cloud in ['US']:
- seTokens = self.siteMapper.getSite(tmpDstID).setokens
- # use T1_PRODDISK
- if seTokens.has_key('ATLASPRODDISK'):
- dq2ID = seTokens['ATLASPRODDISK']
- # register subscription
- _logger.debug('%s %s %s %s' % (self.timestamp,'registerDatasetSubscription',
- (job.dispatchDBlock,dq2ID),
- {'version':0,'archived':0,'callbacks':optSub,'sources':optSource,'sources_policy':optSrcPolicy,
- 'wait_for_sources':0,'destination':None,'query_more_sources':0,'sshare':"production",'group':None,
- 'activity':"Production",'acl_alias':None,'replica_lifetime':"7 days"}))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('registerDatasetSubscription',job.dispatchDBlock,dq2ID,version=0,archived=0,callbacks=optSub,
- sources=optSource,sources_policy=optSrcPolicy,wait_for_sources=0,destination=None,
- query_more_sources=0,sshare="production",group=None,activity="Production",
- acl_alias=None,replica_lifetime="7 days")
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- _logger.debug("%s %s" % (self.timestamp,out))
- if status != 0 or (out != 'None' and len(out) != 35):
- _logger.error(out)
- dispError[disp] = "Setupper._subscribeDistpatchDB() could not register subscription"
- # logging
- try:
- # make message
- dq2ID = dstDQ2ID
- message = '%s - siteID:%s type:dispatch vuid:%s' % (commands.getoutput('hostname'),dq2ID,
- self.vuidMap[job.dispatchDBlock])
- # get logger
- _pandaLogger = PandaLogger()
- _pandaLogger.lock()
- _pandaLogger.setParams({'Type':'registerSubscription'})
- logger = _pandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- logger.info(message)
- # release HTTP handler
- _pandaLogger.release()
- except:
- pass
- # use PandaDDM
- else:
- # set DDM user DN
- if ddmUser == 'NULL':
- ddmUser = job.prodUserID
- # create a DDM job
- ddmjob = JobSpec()
- ddmjob.jobDefinitionID = int(time.time()) % 10000
- ddmjob.jobName = "%s" % commands.getoutput('uuidgen')
- ddmjob.transformation = 'http://pandaserver.cern.ch:25080/trf/mover/run_dq2_cr'
- ddmjob.destinationDBlock = 'pandaddm_%s.%s' % (time.strftime('%y.%m.%d'),ddmjob.jobName)
- if job.cloud == 'NULL':
- ddmjob.cloud = 'US'
- else:
- ddmjob.cloud = job.cloud
- if not PandaMoverIDs.has_key(job.cloud):
- ddmjob.computingSite = "BNL_ATLAS_DDM"
- else:
- ddmjob.computingSite = PandaMoverIDs[job.cloud]
- ddmjob.destinationSE = ddmjob.computingSite
- ddmjob.assignedPriority = 200000
- if job.prodSourceLabel in ['software']:
- # set higher priority for installation jobs
- ddmjob.assignedPriority += 1000
- else:
- ddmjob.assignedPriority += job.currentPriority
- ddmjob.currentPriority = ddmjob.assignedPriority
- if self.ddmAttempt != 0:
- # keep count of attemptNr
- ddmjob.attemptNr = self.ddmAttempt + 1
- else:
- ddmjob.attemptNr = 1
- # check attemptNr to avoid endless loop
- if ddmjob.attemptNr > 10:
- err = "Too many attempts %s for %s" % (ddmjob.attemptNr,job.dispatchDBlock)
- _logger.error(err)
- dispError[disp] = err
- continue
- ddmjob.prodSourceLabel = 'ddm'
- ddmjob.transferType = 'dis'
- ddmjob.processingType = 'pandamover'
- # append log file
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz.%s" % (ddmjob.destinationDBlock,ddmjob.attemptNr)
- fileOL.destinationDBlock = ddmjob.destinationDBlock
- fileOL.destinationSE = ddmjob.destinationSE
- fileOL.dataset = ddmjob.destinationDBlock
- fileOL.type = 'log'
- ddmjob.addFile(fileOL)
- # make arguments
- callBackURL = 'https://%s:%s/server/panda/datasetCompleted?vuid=%s&site=%s' % \
- (panda_config.pserverhost,panda_config.pserverport,
- self.vuidMap[job.dispatchDBlock],dstDQ2ID)
- callBackURL = urllib.quote(callBackURL)
- lfnsStr = ''
- for tmpLFN in self.dispFileList[job.dispatchDBlock]['lfns']:
- lfnsStr += '%s,' % tmpLFN
- guidStr = ''
- for tmpGUID in self.dispFileList[job.dispatchDBlock]['guids']:
- guidStr += '%s,' % tmpGUID
- guidStr = guidStr[:-1]
- lfnsStr = lfnsStr[:-1]
- # check input token
- moverUseTape = False
- for tmpFile in job.Files:
- if tmpFile.type == 'input' and tmpFile.dispatchDBlockToken in ['ATLASDATATAPE']:
- moverUseTape = True
- break
- if srcDQ2ID != dstDQ2ID:
- # get destination dir
- tmpSpec = self.siteMapper.getSite(job.computingSite)
- destDir = brokerage.broker_util._getDefaultStorage(tmpSpec.dq2url,tmpSpec.se,tmpSpec.seprodpath)
- if destDir == '':
- err = "could not get default storage for %s" % job.computingSite
- _logger.error(err)
- dispError[disp] = err
- continue
- # normal jobs
- argStr = ""
- if moverUseTape:
- argStr += "--useTape "
- argStr += "-t 7200 -n 3 -s %s -r %s --guids %s --lfns %s --tapePriority %s --callBack %s -d %spanda/dis/%s%s %s" % \
- (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,job.currentPriority,callBackURL,destDir,
- time.strftime('%y/%m/%d/'),job.dispatchDBlock,job.dispatchDBlock)
- else:
- # prestaging jobs
- argStr = ""
- if moverUseTape:
- argStr += "--useTape "
- argStr += "-t 540 -n 2 -s %s -r %s --guids %s --lfns %s --tapePriority %s --callBack %s --prestage --cloud %s %s" % \
- (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,job.currentPriority,callBackURL,job.cloud,job.dispatchDBlock)
- # set job parameters
- ddmjob.jobParameters = argStr
- _logger.debug('%s pdq2_cr %s' % (self.timestamp,ddmjob.jobParameters))
- # set src/dest
- ddmjob.sourceSite = srcDQ2ID
- ddmjob.destinationSite = dstDQ2ID
- ddmJobs.append(ddmjob)
- # failed jobs
- if dispError[disp] != '':
- job.jobStatus = 'failed'
- job.ddmErrorCode = ErrorCode.EC_Setupper
- job.ddmErrorDiag = dispError[disp]
- failedJobs.append(job)
-        # update failed jobs only. succeeded jobs should be activated by the DDM callback
- self.taskBuffer.updateJobs(failedJobs,True)
- # submit ddm jobs
- if ddmJobs != []:
- ddmRet = self.taskBuffer.storeJobs(ddmJobs,ddmUser,joinThr=True)
- # update datasets
- ddmIndex = 0
- ddmDsList = []
- for ddmPandaID,ddmJobDef,ddmJobName in ddmRet:
- # invalid PandaID
- if ddmPandaID in ['NULL',None]:
- continue
- # get dispatch dataset
- dsName = ddmJobs[ddmIndex].jobParameters.split()[-1]
- ddmIndex += 1
- tmpDS = self.taskBuffer.queryDatasetWithMap({'name':dsName})
- if tmpDS != None:
- # set MoverID
- tmpDS.MoverID = ddmPandaID
- ddmDsList.append(tmpDS)
- # update
- if ddmDsList != []:
- self.taskBuffer.updateDatasets(ddmDsList)
-
-
- # update jobs
- def _updateJobs(self):
- updateJobs = []
- failedJobs = []
- activateJobs = []
- # sort out jobs
- for job in self.jobs:
- # failed jobs
- if job.jobStatus in ['failed','cancelled']:
- failedJobs.append(job)
- # no input jobs
- elif job.dispatchDBlock=='NULL':
- activateJobs.append(job)
- # normal jobs
- else:
- # change status
- job.jobStatus = "assigned"
- updateJobs.append(job)
- # update DB
- self.taskBuffer.activateJobs(activateJobs)
- self.taskBuffer.updateJobs(updateJobs,True)
- self.taskBuffer.updateJobs(failedJobs,True)
- # delete local values
- del updateJobs
- del failedJobs
- del activateJobs
-
-
- # correct LFN for attemptNr
- def _correctLFN(self):
- lfnMap = {}
- valMap = {}
- prodError = {}
- missingDS = {}
- jobsWaiting = []
- jobsFailed = []
- jobsProcessed = []
- allLFNs = {}
- allGUIDs = {}
- cloudMap = {}
- lfnDsMap = {}
- replicaMap = {}
- _logger.debug('%s go into LFN correction' % self.timestamp)
- for job in self.jobs:
- if self.onlyTA:
- _logger.debug("%s start TA session %s" % (self.timestamp,job.taskID))
- # check if sitename is known
- if job.computingSite != 'NULL' and (not job.computingSite in self.siteMapper.siteSpecList.keys()):
- job.jobStatus = 'failed'
- job.ddmErrorCode = ErrorCode.EC_Setupper
- job.ddmErrorDiag = "computingSite:%s is unknown" % job.computingSite
- # append job for downstream process
- jobsProcessed.append(job)
- # error message for TA
- if self.onlyTA:
- _logger.error("%s %s" % (self.timestamp,job.ddmErrorDiag))
- continue
- # ignore no prodDBlock jobs or container dataset
- if job.prodDBlock == 'NULL':
- # set cloud
- if panda_config.enableDynamicTA and job.prodSourceLabel in ['managed','validation'] \
- and job.cloud in ['NULL',''] and (not job.taskID in [None,'NULL',0]):
- # look into map to check if it is already gotten
- if not cloudMap.has_key(job.taskID):
- # instantiate TaskAssigner
- cloudResolver = TaskAssigner.TaskAssigner(self.taskBuffer,self.siteMapper,
- job.taskID,job.prodSourceLabel,job)
- # check cloud
- _logger.debug("%s check cloud for %s" % (self.timestamp,job.taskID))
- retCloud = cloudResolver.checkCloud()
- _logger.debug("%s checkCloud() -> %s" % (self.timestamp,retCloud))
- # failed
- if retCloud == None:
- _logger.error("failed to check cloud for %s" % job.taskID)
- # append job to waiting list
- jobsWaiting.append(job)
- continue
- # to be set
- elif retCloud == "":
- # collect LFN/GUID
- tmpLFNs = []
- tmpGUIDs = []
- # set cloud
- _logger.debug("%s set cloud for %s" % (self.timestamp,job.taskID))
- retCloud = cloudResolver.setCloud(tmpLFNs,tmpGUIDs,metadata=job.metadata)
- _logger.debug("%s setCloud() -> %s" % (self.timestamp,retCloud))
- if retCloud == None:
- _logger.debug("failed to set cloud for %s" % job.taskID)
- # append job to waiting list
- jobsWaiting.append(job)
- continue
- # append to map
- cloudMap[job.taskID] = retCloud
- # set cloud
- job.cloud = cloudMap[job.taskID]
- # message for TA
- if self.onlyTA:
- _logger.debug("%s set %s:%s" % (self.timestamp,job.taskID,job.cloud))
- # append job to processed list
- jobsProcessed.append(job)
- continue
- # collect datasets
- datasets = []
- for file in job.Files:
- if file.type == 'input' and file.dispatchDBlock == 'NULL' \
- and (file.GUID == 'NULL' or job.prodSourceLabel in ['managed','test','ptest']):
- if not file.dataset in datasets:
- datasets.append(file.dataset)
- # get LFN list
- for dataset in datasets:
- if not dataset in lfnMap.keys():
- prodError[dataset] = ''
- lfnMap[dataset] = {}
- # get LFNs
- time.sleep(1)
- for iDDMTry in range(3):
- _logger.debug((self.timestamp,'listFilesInDataset',dataset))
- status,out = ddm.DQ2.main('listFilesInDataset',dataset)
- if out.find("DQUnknownDatasetException") != -1:
- break
- elif status == -1:
- break
- elif status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error(out)
- prodError[dataset] = 'could not get file list of prodDBlock %s' % dataset
- _logger.error(prodError[dataset])
- # doesn't exist in DQ2
- if out.find('DQUnknownDatasetException') != -1:
- missingDS[dataset] = "DS:%s not found in DQ2" % dataset
- elif status == -1:
- missingDS[dataset] = out
- else:
- # make map (key: LFN w/o attemptNr, value: LFN with attemptNr)
- items = {}
- try:
- # protection for empty dataset
- if out != '()':
- exec "items = %s[0]" % out
- # keep values to avoid redundant lookup
- self.lfnDatasetMap[dataset] = items
- # loop over all files
- for guid,vals in items.iteritems():
- valMap[vals['lfn']] = {'guid' : guid, 'fsize' : vals['filesize'],
- 'md5sum' : vals['checksum'],
- 'chksum' : vals['checksum'],
- 'scope' : vals['scope']}
- genLFN = re.sub('\.\d+$','',vals['lfn'])
- if lfnMap[dataset].has_key(genLFN):
- # get attemptNr
- newAttNr = 0
- newMat = re.search('\.(\d+)$',vals['lfn'])
- if newMat != None:
- newAttNr = int(newMat.group(1))
- oldAttNr = 0
- oldMat = re.search('\.(\d+)$',lfnMap[dataset][genLFN])
- if oldMat != None:
- oldAttNr = int(oldMat.group(1))
- # compare
- if newAttNr > oldAttNr:
- lfnMap[dataset][genLFN] = vals['lfn']
- else:
- lfnMap[dataset][genLFN] = vals['lfn']
- # mapping from LFN to DS
- lfnDsMap[lfnMap[dataset][genLFN]] = dataset
- except:
- prodError[dataset] = 'could not convert HTTP-res to map for prodDBlock %s' % dataset
- _logger.error(prodError[dataset])
- _logger.error(out)
- # get replica locations
- if (self.onlyTA or job.prodSourceLabel in ['managed','test']) \
- and prodError[dataset] == '' and (not replicaMap.has_key(dataset)):
- if dataset.endswith('/'):
- status,out = self.getListDatasetReplicasInContainer(dataset)
- else:
- for iDDMTry in range(3):
- _logger.debug((self.timestamp,'listDatasetReplicas',dataset))
- status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1 \
- or out == '()':
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- prodError[dataset] = 'could not get locations for %s' % dataset
- _logger.error(prodError[dataset])
- _logger.error(out)
- else:
- tmpRepSites = {}
- try:
- # convert res to map
- exec "tmpRepSites = %s" % out
- replicaMap[dataset] = tmpRepSites
- except:
- prodError[dataset] = 'could not convert HTTP-res to replica map for %s' % dataset
- _logger.error(prodError[dataset])
- _logger.error(out)
- # append except DBR
- if not dataset.startswith('ddo'):
- self.replicaMapForBroker[dataset] = tmpRepSites
- # error
- isFailed = False
- # check for failed
- for dataset in datasets:
- if missingDS.has_key(dataset):
- job.jobStatus = 'failed'
- job.ddmErrorCode = ErrorCode.EC_GUID
- job.ddmErrorDiag = missingDS[dataset]
- # set missing
- for tmpFile in job.Files:
- if tmpFile.dataset == dataset:
- tmpFile.status = 'missing'
- # append
- jobsFailed.append(job)
- isFailed = True
- # message for TA
- if self.onlyTA:
- _logger.error("%s %s" % (self.timestamp,missingDS[dataset]))
- self.sendTaMesg("%s %s" % (job.taskID,missingDS[dataset]),msgType='error')
- else:
- _logger.debug("%s %s failed with %s" % (self.timestamp,job.PandaID,missingDS[dataset]))
- break
- if isFailed:
- continue
- # check for waiting
- for dataset in datasets:
- if prodError[dataset] != '':
- # append job to waiting list
- jobsWaiting.append(job)
- isFailed = True
- # message for TA
- if self.onlyTA:
- _logger.error("%s %s" % (self.timestamp,prodError[dataset]))
- break
- if isFailed:
- continue
- # set cloud
- if panda_config.enableDynamicTA and job.prodSourceLabel in ['managed','validation'] \
- and job.cloud in ['NULL',''] and (not job.taskID in [None,'NULL',0]):
- # look into map to check if it is already gotten
- if not cloudMap.has_key(job.taskID):
- # instantiate TaskAssigner
- cloudResolver = TaskAssigner.TaskAssigner(self.taskBuffer,self.siteMapper,
- job.taskID,job.prodSourceLabel,job)
- # check cloud
- _logger.debug("%s check cloud for %s" % (self.timestamp,job.taskID))
- retCloud = cloudResolver.checkCloud()
- _logger.debug("%s checkCloud() -> %s" % (self.timestamp,retCloud))
- # failed
- if retCloud == None:
- _logger.error("failed to check cloud for %s" % job.taskID)
- # append job to waiting list
- jobsWaiting.append(job)
- continue
- # to be set
- elif retCloud == "":
- # collect LFN/GUID
- tmpLFNs = []
- tmpGUIDs = []
- tmpReLoc = {}
- tmpCountMap = {}
- for dataset in datasets:
- # get LFNs
- eachDSLFNs = lfnMap[dataset].values()
- tmpLFNs += eachDSLFNs
- # get GUIDs
- for oneLFN in eachDSLFNs:
- tmpGUIDs.append(valMap[oneLFN]['guid'])
- # locations
- tmpReLoc[dataset] = replicaMap[dataset]
- # file counts
- tmpCountMap[dataset] = len(eachDSLFNs)
- # set cloud
- _logger.debug("%s set cloud for %s" % (self.timestamp,job.taskID))
- retCloud = cloudResolver.setCloud(tmpLFNs,tmpGUIDs,tmpReLoc,metadata=job.metadata,
- fileCounts=tmpCountMap)
- _logger.debug("%s setCloud() -> %s" % (self.timestamp,retCloud))
- if retCloud == None:
- _logger.debug("failed to set cloud for %s" % job.taskID)
- # append job to waiting list
- jobsWaiting.append(job)
- continue
- # append to map
- cloudMap[job.taskID] = retCloud
- # set cloud
- job.cloud = cloudMap[job.taskID]
- # message for TA
- if self.onlyTA:
- _logger.debug("%s set %s:%s" % (self.timestamp,job.taskID,job.cloud))
- _logger.debug('%s replacing generic LFNs' % self.timestamp)
- # replace generic LFN with real LFN
- replaceList = []
- isFailed = False
- for file in job.Files:
- if file.type == 'input' and file.dispatchDBlock == 'NULL':
- addToLfnMap = True
- if file.GUID == 'NULL':
- # get LFN w/o attemptNr
- basename = re.sub('\.\d+$','',file.lfn)
- if basename == file.lfn:
- # replace
- if basename in lfnMap[file.dataset].keys():
- file.lfn = lfnMap[file.dataset][basename]
- replaceList.append((basename,file.lfn))
- # set GUID
- if file.lfn in valMap:
- file.GUID = valMap[file.lfn]['guid']
- file.fsize = valMap[file.lfn]['fsize']
- file.md5sum = valMap[file.lfn]['md5sum']
- file.checksum = valMap[file.lfn]['chksum']
- file.scope = valMap[file.lfn]['scope']
- # remove white space
- if file.md5sum != None:
- file.md5sum = file.md5sum.strip()
- if file.checksum != None:
- file.checksum = file.checksum.strip()
- else:
- if not job.prodSourceLabel in ['managed','test']:
- addToLfnMap = False
- # check missing file
- if file.GUID == 'NULL' or job.prodSourceLabel in ['managed','test']:
- if not file.lfn in valMap:
- # append job to waiting list
- errMsg = "GUID for %s not found in DQ2" % file.lfn
- _logger.debug("%s %s" % (self.timestamp,errMsg))
- file.status = 'missing'
- if not job in jobsFailed:
- job.jobStatus = 'failed'
- job.ddmErrorCode = ErrorCode.EC_GUID
- job.ddmErrorDiag = errMsg
- jobsFailed.append(job)
- isFailed = True
- continue
- # add to allLFNs/allGUIDs
- if addToLfnMap:
- if not allLFNs.has_key(job.cloud):
- allLFNs[job.cloud] = []
- if not allGUIDs.has_key(job.cloud):
- allGUIDs[job.cloud] = []
- allLFNs[job.cloud].append(file.lfn)
- allGUIDs[job.cloud].append(file.GUID)
- # modify jobParameters
- if not isFailed:
- for patt,repl in replaceList:
- job.jobParameters = re.sub('%s ' % patt, '%s ' % repl, job.jobParameters)
- # append job to processed list
- jobsProcessed.append(job)
- # return if TA only
- if self.onlyTA:
- _logger.debug("%s end TA sessions" % self.timestamp)
- return
- _logger.debug('%s checking missing files at T1' % self.timestamp)
- # get missing LFNs from source LRC/LFC
- missLFNs = {}
- for cloudKey in allLFNs.keys():
- # use BNL by default
- dq2URL = self.siteMapper.getSite('BNL_ATLAS_1').dq2url
- dq2SE = []
- # use cloud's source
- if self.siteMapper.checkCloud(cloudKey):
- tmpSrcID = self.siteMapper.getCloud(cloudKey)['source']
- tmpSrcSite = self.siteMapper.getSite(tmpSrcID)
- # get LRC/LFC URL
- if not tmpSrcSite.lfchost in [None,'']:
- # LFC
- dq2URL = 'lfc://'+tmpSrcSite.lfchost+':/grid/atlas/'
- if tmpSrcSite.se != None:
- for tmpSrcSiteSE in tmpSrcSite.se.split(','):
- match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE)
- if match != None:
- dq2SE.append(match.group(1))
- # hack for split T1
- if cloudKey == 'NL':
- tmpSplitSite = self.siteMapper.getSite('NIKHEF-ELPROD')
- if tmpSplitSite.se != None:
- for tmpSrcSiteSE in tmpSplitSite.se.split(','):
- match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE)
- if match != None:
- dq2SE.append(match.group(1))
- else:
- # LRC
- dq2URL = tmpSrcSite.dq2url
- dq2SE = []
- # get missing files
- tmpMissLFNs = brokerage.broker_util.getMissLFNsFromLRC(allLFNs[cloudKey],dq2URL,allGUIDs[cloudKey],dq2SE)
- # append
- if not missLFNs.has_key(cloudKey):
- missLFNs[cloudKey] = []
- missLFNs[cloudKey] += tmpMissLFNs
- _logger.debug('%s checking T2 LFC' % self.timestamp)
- # check availability of files at T2
- for cloudKey,tmpAllLFNs in allLFNs.iteritems():
- if len(self.jobs) > 0 and (self.jobs[0].prodSourceLabel in ['user','panda','ddm'] or \
- self.jobs[0].processingType.startswith('gangarobot') or \
- self.jobs[0].processingType.startswith('hammercloud')):
- continue
- # add cloud
- if not self.availableLFNsInT2.has_key(cloudKey):
- self.availableLFNsInT2[cloudKey] = {}
- # loop over all files to find datasets
- for tmpCheckLFN in tmpAllLFNs:
- # add dataset
- if not lfnDsMap.has_key(tmpCheckLFN):
- continue
- tmpDsName = lfnDsMap[tmpCheckLFN]
- if not self.availableLFNsInT2[cloudKey].has_key(tmpDsName):
- # collect sites
- tmpSiteNameDQ2Map = DataServiceUtils.getSitesWithDataset(tmpDsName,self.siteMapper,replicaMap,cloudKey,getDQ2ID=True)
- if tmpSiteNameDQ2Map == {}:
- continue
- self.availableLFNsInT2[cloudKey][tmpDsName] = {'allfiles':[],'allguids':[],'sites':{}}
- for tmpSiteName in tmpSiteNameDQ2Map.keys():
- self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName] = []
- self.availableLFNsInT2[cloudKey][tmpDsName]['siteDQ2IDs'] = tmpSiteNameDQ2Map
- # add files
- if not tmpCheckLFN in self.availableLFNsInT2[cloudKey][tmpDsName]:
- self.availableLFNsInT2[cloudKey][tmpDsName]['allfiles'].append(tmpCheckLFN)
- self.availableLFNsInT2[cloudKey][tmpDsName]['allguids'].append(allGUIDs[cloudKey][allLFNs[cloudKey].index(tmpCheckLFN)])
- # get available files at each T2
- for tmpDsName in self.availableLFNsInT2[cloudKey].keys():
- checkedDq2SiteMap = {}
- checkLfcSeMap = {}
- for tmpSiteName in self.availableLFNsInT2[cloudKey][tmpDsName]['sites'].keys():
- tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
- # add LFC
- if not checkLfcSeMap.has_key(tmpSiteSpec.lfchost):
- checkLfcSeMap[tmpSiteSpec.lfchost] = {}
- # add site
- if not checkLfcSeMap[tmpSiteSpec.lfchost].has_key(tmpSiteName):
- checkLfcSeMap[tmpSiteSpec.lfchost][tmpSiteName] = []
- # add SE
- if tmpSiteSpec.se != None:
- for tmpSrcSiteSE in tmpSiteSpec.se.split(','):
- match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE)
- if match != None:
- checkLfcSeMap[tmpSiteSpec.lfchost][tmpSiteName].append(match.group(1))
- # LFC lookup
- for tmpLfcHost in checkLfcSeMap.keys():
- # get SEs
- tmpSEList = []
- for tmpSiteName in checkLfcSeMap[tmpLfcHost].keys():
- tmpSEList += checkLfcSeMap[tmpLfcHost][tmpSiteName]
- # get available file list
- _logger.debug('%s checking T2 LFC=%s for %s' % (self.timestamp,tmpLfcHost,tmpSEList))
- bulkAvFiles = brokerage.broker_util.getFilesFromLRC(self.availableLFNsInT2[cloudKey][tmpDsName]['allfiles'],
- 'lfc://'+tmpLfcHost+':/grid/atlas/',
- self.availableLFNsInT2[cloudKey][tmpDsName]['allguids'],
- storageName=tmpSEList,getPFN=True)
- # check each site
- for tmpSiteName in checkLfcSeMap[tmpLfcHost].keys():
- self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName] = []
- for tmpLFNck,tmpPFNlistck in bulkAvFiles.iteritems():
- siteHasFileFlag = False
- for tmpPFNck in tmpPFNlistck:
- # check se
- for tmpSE in checkLfcSeMap[tmpLfcHost][tmpSiteName]:
- if '://'+tmpSE in tmpPFNck:
- siteHasFileFlag = True
- break
- # escape
- if siteHasFileFlag:
- break
- # append
- if siteHasFileFlag:
- self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName].append(tmpLFNck)
- _logger.debug('%s available %s files at %s T2=%s for %s' % \
- (self.timestamp,
- len(self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName]),
- cloudKey,tmpSiteName,tmpDsName))
- _logger.debug('%s missLFNs at T1 %s' % (self.timestamp,missLFNs))
- # check if files in source LRC/LFC
- tmpJobList = tuple(jobsProcessed)
- for job in tmpJobList:
- # check only production/test jobs
- if not job.prodSourceLabel in ['managed','test','software','rc_test','ptest']:
- continue
- # don't check if site is already set
- if job.prodSourceLabel in ['managed','test'] and not job.computingSite in ['NULL','',None]:
- continue
- missingFlag = False
- for file in job.Files:
- if file.type == 'input':
- if missLFNs.has_key(job.cloud) and file.lfn in missLFNs[job.cloud]:
- # set file status
- file.status = 'missing'
- missingFlag = True
- # check if missing files are available at T2s
- goToT2 = None
- if missingFlag:
- tmpCandT2s = None
- for tmpFile in job.Files:
- if tmpFile.type == 'input' and tmpFile.status == 'missing':
- # no cloud info
- if not self.availableLFNsInT2.has_key(job.cloud):
- goToT2 = False
- break
- # no dataset info
- if not self.availableLFNsInT2[job.cloud].has_key(tmpFile.dataset):
- goToT2 = False
- break
- # initial candidates
- if tmpCandT2s == None:
- tmpCandT2s = self.availableLFNsInT2[job.cloud][tmpFile.dataset]['sites']
- # check all candidates
- newCandT2s = []
- for tmpCandT2 in tmpCandT2s:
- # site doesn't have the dataset
- if not self.availableLFNsInT2[job.cloud][tmpFile.dataset]['sites'].has_key(tmpCandT2):
- continue
- # site has the file
- if tmpFile.lfn in self.availableLFNsInT2[job.cloud][tmpFile.dataset]['sites'][tmpCandT2]:
- if not tmpCandT2 in newCandT2s:
- newCandT2s.append(tmpCandT2)
- # set new candidates
- tmpCandT2s = newCandT2s
- # no candidates left
- if tmpCandT2s == []:
- goToT2 = False
- break
- # go to T2
- if goToT2 == None:
- goToT2 = True
- # remove job not to process further
- if missingFlag and goToT2 != True:
- jobsProcessed.remove(job)
- # revert
- for oJob in self.jobs:
- if oJob.PandaID == job.PandaID:
- jobsWaiting.append(oJob)
- break
- # get missing datasets
- if missingFlag:
- if job.processingType.startswith('gangarobot') or \
- job.processingType.startswith('hammercloud'):
- pass
- elif not job.prodSourceLabel in ['managed']:
- pass
- else:
- for tmpFile in job.Files:
- if tmpFile.type == 'input' and tmpFile.status == 'missing' and \
- not tmpFile.dataset.startswith('ddo'):
- # append
- if not self.missingDatasetList.has_key(job.cloud):
- self.missingDatasetList[job.cloud] = {}
- if not self.missingDatasetList[job.cloud].has_key(tmpFile.dataset):
- self.missingDatasetList[job.cloud][tmpFile.dataset] = []
- if not tmpFile.GUID in self.missingDatasetList[job.cloud][tmpFile.dataset]:
- self.missingDatasetList[job.cloud][tmpFile.dataset].append(tmpFile.GUID)
- # set data summary fields
- for tmpJob in self.jobs:
- try:
- # set only for production/analysis/test
- if not tmpJob.prodSourceLabel in ['managed','test','rc_test','ptest','user']:
- continue
- # loop over all files
- tmpJob.nInputDataFiles = 0
- tmpJob.inputFileBytes = 0
- tmpInputFileProject = None
- tmpInputFileType = None
- for tmpFile in tmpJob.Files:
- # use input files and ignore DBR/lib.tgz
- if tmpFile.type == 'input' and (not tmpFile.dataset.startswith('ddo')) \
- and not tmpFile.lfn.endswith('.lib.tgz'):
- tmpJob.nInputDataFiles += 1
- if not tmpFile.fsize in ['NULL',None,0,'0']:
- tmpJob.inputFileBytes += tmpFile.fsize
- # get input type and project
- if tmpInputFileProject == None:
- tmpInputItems = tmpFile.dataset.split('.')
- # input project
- tmpInputFileProject = tmpInputItems[0]
- # input type. ignore user/group/groupXY
- if len(tmpInputItems) > 4 and (not tmpInputItems[0] in ['','NULL','user','group']) \
- and (not tmpInputItems[0].startswith('group')):
- tmpInputFileType = tmpInputItems[4]
- # set input type and project
- if not tmpJob.prodDBlock in ['',None,'NULL']:
- # input project
- if tmpInputFileProject != None:
- tmpJob.inputFileProject = tmpInputFileProject
- # input type
- if tmpInputFileType != None:
- tmpJob.inputFileType = tmpInputFileType
- # protection
- maxInputFileBytes = 99999999999
- if tmpJob.inputFileBytes > maxInputFileBytes:
- tmpJob.inputFileBytes = maxInputFileBytes
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("failed to set data summary fields for PandaID=%s: %s %s" % (tmpJob.PandaID,errType,errValue))
- # send jobs to jobsWaiting
- self.taskBuffer.keepJobs(jobsWaiting)
- # update failed job
- self.taskBuffer.updateJobs(jobsFailed,True)
- # remove waiting/failed jobs
- self.jobs = jobsProcessed
- # delete huge variables
- del lfnMap
- del valMap
- del prodError
- del jobsWaiting
- del jobsProcessed
- del allLFNs
- del allGUIDs
- del cloudMap
- del missLFNs
-
-
- # remove waiting jobs
- def removeWaitingJobs(self):
- jobsWaiting = []
- jobsProcessed = []
- for tmpJob in self.jobs:
- if tmpJob.jobStatus == 'waiting':
- jobsWaiting.append(tmpJob)
- else:
- jobsProcessed.append(tmpJob)
- # send jobs to jobsWaiting
- self.taskBuffer.keepJobs(jobsWaiting)
- # remove waiting/failed jobs
- self.jobs = jobsProcessed
-
-
- # memory checker
- def _memoryCheck(self):
- try:
- import os
- proc_status = '/proc/%d/status' % os.getpid()
- procfile = open(proc_status)
- name = ""
- vmSize = ""
- vmRSS = ""
- # extract Name,VmSize,VmRSS
- for line in procfile:
- if line.startswith("Name:"):
- name = line.split()[-1]
- continue
- if line.startswith("VmSize:"):
- vmSize = ""
- for item in line.split()[1:]:
- vmSize += item
- continue
- if line.startswith("VmRSS:"):
- vmRSS = ""
- for item in line.split()[1:]:
- vmRSS += item
- continue
- procfile.close()
- _logger.debug('%s MemCheck PID=%s Name=%s VSZ=%s RSS=%s' % (self.timestamp,os.getpid(),name,vmSize,vmRSS))
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("memoryCheck() : %s %s" % (type,value))
- _logger.debug('%s MemCheck PID=%s unknown' % (self.timestamp,os.getpid()))
- return
-
-
- # check DDM response
- def isDQ2ok(self,out):
- if out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1 \
- or out == '()':
- return False
- return True
-
-
- # get list of files in dataset
- def getListFilesInDataset(self,dataset):
- # use cache data
- if self.lfnDatasetMap.has_key(dataset):
- return True,self.lfnDatasetMap[dataset]
- for iDDMTry in range(3):
- _logger.debug((self.timestamp,'listFilesInDataset',dataset))
- status,out = ddm.DQ2.main('listFilesInDataset',dataset)
- if out.find("DQUnknownDatasetException") != -1:
- break
- elif status == -1:
- break
- elif status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error("%s %s" % (self.timestamp,out))
- return False,{}
- # convert
- items = {}
- try:
- exec "items = %s[0]" % out
- except:
- return False,{}
- return True,items
-
-
- # get list of datasets in container
- def getListDatasetInContainer(self,container):
- # get datasets in container
- _logger.debug((self.timestamp,'listDatasetsInContainer',container))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('listDatasetsInContainer',container)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- _logger.debug('%s %s' % (self.timestamp,out))
- if status != 0 or out.startswith('Error'):
- return False,out
- datasets = []
- try:
- # convert to list
- exec "datasets = %s" % out
- except:
- return False,out
- return True,datasets
-
-
- def getListDatasetReplicasInContainer(self,container,getMap=False):
- # get datasets in container
- _logger.debug((self.timestamp,'listDatasetsInContainer',container))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('listDatasetsInContainer',container)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1 \
- or out == '()':
- time.sleep(60)
- else:
- break
- _logger.debug('%s %s' % (self.timestamp,out))
- if status != 0 or out.startswith('Error'):
- return status,out
- datasets = []
- try:
- # convert to list
- exec "datasets = %s" % out
- except:
- return status,out
- # loop over all datasets
- allRepMap = {}
- for dataset in datasets:
- _logger.debug((self.timestamp,'listDatasetReplicas',dataset))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1 \
- or out == '()':
- time.sleep(60)
- else:
- break
- _logger.debug('%s %s' % (self.timestamp,out))
- if status != 0 or out.startswith('Error'):
- return status,out
- tmpRepSites = {}
- try:
- # convert res to map
- exec "tmpRepSites = %s" % out
- except:
- return status,out
- # get map
- if getMap:
- allRepMap[dataset] = tmpRepSites
- continue
- # otherwise get sum
- for siteId,statList in tmpRepSites.iteritems():
- if not allRepMap.has_key(siteId):
- # append
- allRepMap[siteId] = [statList[-1],]
- else:
- # add
- newStMap = {}
- for stName,stNum in allRepMap[siteId][0].iteritems():
- if statList[-1].has_key(stName):
- # try mainly for archived=None
- try:
- newStMap[stName] = stNum + statList[-1][stName]
- except:
- newStMap[stName] = stNum
- else:
- newStMap[stName] = stNum
- allRepMap[siteId] = [newStMap,]
- # return
- _logger.debug('%s %s' % (self.timestamp,str(allRepMap)))
- if not getMap:
- return 0,str(allRepMap)
- else:
- return 0,allRepMap
-
-
- # get list of replicas for a dataset
- def getListDatasetReplicas(self,dataset):
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug("%s %s/%s listDatasetReplicas %s" % (self.timestamp,iDDMTry,nTry,dataset))
- status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if status != 0 or out.startswith('Error'):
- _logger.error(self.timestamp+' '+out)
- _logger.error('%s bad DQ2 response for %s' % (self.timestamp,dataset))
- return False,{}
- try:
- # convert res to map
- exec "tmpRepSites = %s" % out
- _logger.debug('%s getListDatasetReplicas->%s' % (self.timestamp,str(tmpRepSites)))
- return True,tmpRepSites
- except:
- _logger.error(self.timestamp+' '+out)
- _logger.error('%s could not convert HTTP-res to replica map for %s' % (self.timestamp,dataset))
- return False,{}
-
-
- # delete original locations
- def deleteDatasetReplicas(self,datasets,keepSites):
- # loop over all datasets
- for dataset in datasets:
- # get locations
- status,tmpRepSites = self.getListDatasetReplicas(dataset)
- if not status:
- return False
- # no replicas
- if len(tmpRepSites.keys()) == 0:
- continue
- delSites = []
- for tmpRepSite in tmpRepSites.keys():
- if not tmpRepSite in keepSites:
- delSites.append(tmpRepSite)
-            # no replicas to be deleted
- if delSites == []:
- continue
- # delete
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug("%s %s/%s deleteDatasetReplicas %s %s" % (self.timestamp,iDDMTry,nTry,dataset,str(delSites)))
- status,out = ddm.DQ2.main('deleteDatasetReplicas',dataset,delSites)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if status != 0 or out.startswith('Error'):
- _logger.error(self.timestamp+' '+out)
- _logger.error('%s bad DQ2 response for %s' % (self.timestamp,dataset))
- return False
- _logger.debug(self.timestamp+' '+out)
- # return
- _logger.debug('%s deleted replicas for %s' % (self.timestamp,str(datasets)))
- return True
-
-
- # dynamic data placement for analysis jobs
- def _dynamicDataPlacement(self):
- # no jobs
- if len(self.jobs) == 0:
- return
- # only successful analysis
- if self.jobs[0].jobStatus in ['failed','cancelled'] or (not self.jobs[0].prodSourceLabel in ['user','panda']):
- return
- # execute
- _logger.debug('%s execute PD2P' % self.timestamp)
- from DynDataDistributer import DynDataDistributer
- ddd = DynDataDistributer(self.jobs,self.taskBuffer,self.siteMapper)
- ddd.run()
- _logger.debug('%s finished PD2P' % self.timestamp)
- return
-
-
- # make dis datasets for existing files to avoid deletion when jobs are queued
- def _makeDisDatasetsForExistingfiles(self):
- _logger.debug('%s make dis datasets for existing files' % self.timestamp)
- # collect existing files
- dsFileMap = {}
- nMaxJobs = 20
- nJobsMap = {}
- for tmpJob in self.jobs:
- # use production or test jobs only
- if not tmpJob.prodSourceLabel in ['managed','test']:
- continue
- # ignore inappropriate status
- if tmpJob.jobStatus in ['failed','cancelled','waiting']:
- continue
- # check cloud
- if (tmpJob.cloud == 'ND' and self.siteMapper.getSite(tmpJob.computingSite).cloud == 'ND') or \
- (tmpJob.cloud == 'US' and self.siteMapper.getSite(tmpJob.computingSite).cloud == 'US'):
- continue
- # check SE to use T2 only
- tmpSrcID = self.siteMapper.getCloud(tmpJob.cloud)['source']
- srcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpSrcID).se)
- dstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpJob.computingSite).se)
- if srcSEs == dstSEs:
- continue
- # look for log _sub dataset to be used as a key
- logSubDsName = ''
- for tmpFile in tmpJob.Files:
- if tmpFile.type == 'log':
- logSubDsName = tmpFile.destinationDBlock
- break
- # append site
- destDQ2ID = self.siteMapper.getSite(tmpJob.computingSite).ddm
- # T1 used as T2
- if tmpJob.cloud != self.siteMapper.getSite(tmpJob.computingSite).cloud and \
- not destDQ2ID.endswith('PRODDISK') and \
- self.siteMapper.getSite(tmpJob.computingSite).cloud in ['US']:
- tmpSeTokens = self.siteMapper.getSite(tmpJob.computingSite).setokens
- if tmpSeTokens.has_key('ATLASPRODDISK'):
- destDQ2ID = tmpSeTokens['ATLASPRODDISK']
- mapKeyJob = (destDQ2ID,logSubDsName)
- # increment the number of jobs per key
- if not nJobsMap.has_key(mapKeyJob):
- nJobsMap[mapKeyJob] = 0
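-            # bucket jobs per (destDQ2ID,logSubDsName) into groups of at most nMaxJobs; Python 2 integer division gives the bucket index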
- mapKey = (destDQ2ID,logSubDsName,nJobsMap[mapKeyJob]/nMaxJobs)
- nJobsMap[mapKeyJob] += 1
- if not dsFileMap.has_key(mapKey):
- dsFileMap[mapKey] = {}
- # add files
- for tmpFile in tmpJob.Files:
- if tmpFile.type != 'input':
- continue
-                # skip files that are not ready: either they are unavailable at the destination site
-                # and thus covered by normal dis datasets, or they are cached
- if not tmpFile.status in ['ready']:
- continue
- # if available at T2
- realDestDQ2ID = (destDQ2ID,)
- if self.availableLFNsInT2.has_key(tmpJob.cloud) and self.availableLFNsInT2[tmpJob.cloud].has_key(tmpFile.dataset) \
- and self.availableLFNsInT2[tmpJob.cloud][tmpFile.dataset]['sites'].has_key(tmpJob.computingSite) \
- and tmpFile.lfn in self.availableLFNsInT2[tmpJob.cloud][tmpFile.dataset]['sites'][tmpJob.computingSite]:
- realDestDQ2ID = self.availableLFNsInT2[tmpJob.cloud][tmpFile.dataset]['siteDQ2IDs'][tmpJob.computingSite]
- realDestDQ2ID = tuple(realDestDQ2ID)
- # append
- if not dsFileMap[mapKey].has_key(realDestDQ2ID):
- dsFileMap[mapKey][realDestDQ2ID] = {'taskID':tmpJob.taskID,
- 'PandaID':tmpJob.PandaID,
- 'files':{}}
- if not dsFileMap[mapKey][realDestDQ2ID]['files'].has_key(tmpFile.lfn):
- dsFileMap[mapKey][realDestDQ2ID]['files'][tmpFile.lfn] = {'lfn' :tmpFile.lfn,
- 'guid':tmpFile.GUID,
- 'fileSpecs':[]}
- # add file spec
- dsFileMap[mapKey][realDestDQ2ID]['files'][tmpFile.lfn]['fileSpecs'].append(tmpFile)
- # loop over all locations
- dispList = []
- for tmpMapKey,tmpDumVal in dsFileMap.iteritems():
- tmpDumLocation,tmpLogSubDsName,tmpBunchIdx = tmpMapKey
- for tmpLocationList,tmpVal in tmpDumVal.iteritems():
- for tmpLocation in tmpLocationList:
- tmpFileList = tmpVal['files']
- if tmpFileList == {}:
- continue
- nMaxFiles = 500
- iFiles = 0
- iLoop = 0
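-                    # register the files in chunks of nMaxFiles, creating one dis dataset per chunk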
- while iFiles < len(tmpFileList):
- subFileNames = tmpFileList.keys()[iFiles:iFiles+nMaxFiles]
- if len(subFileNames) == 0:
- break
- # dis name
- disDBlock = "panda.%s.%s.%s.%s_dis0%s%s" % (tmpVal['taskID'],time.strftime('%m.%d'),'GEN',
- commands.getoutput('uuidgen'),iLoop,
- tmpVal['PandaID'])
- iFiles += nMaxFiles
- lfns = []
- guids = []
- fsizes = []
- chksums = []
- for tmpSubFileName in subFileNames:
- lfns.append(tmpFileList[tmpSubFileName]['lfn'])
- guids.append(tmpFileList[tmpSubFileName]['guid'])
- fsizes.append(None)
- chksums.append(None)
- # set dis name
- for tmpFileSpec in tmpFileList[tmpSubFileName]['fileSpecs']:
- if tmpFileSpec.status in ['ready'] and tmpFileSpec.dispatchDBlock == 'NULL':
- tmpFileSpec.dispatchDBlock = disDBlock
- # register datasets
- iLoop += 1
- _logger.debug((self.timestamp,'ext registerNewDataset',disDBlock,lfns,guids,fsizes,chksums,
- None,None,None,True))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('registerNewDataset',disDBlock,lfns,guids,fsizes,chksums,
- None,None,None,True)
- if status != 0 and out.find('DQDatasetExistsException') != -1:
- break
- elif status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- _logger.debug("%s sleep %s for %s" % (self.timestamp,iDDMTry,disDBlock))
- _logger.debug(status)
- _logger.debug(out)
- time.sleep(60)
- else:
- break
- if status != 0 or out.find('Error') != -1:
- _logger.error("%s %s" % (self.timestamp,out))
- continue
- _logger.debug("%s %s" % (self.timestamp,out))
- # get VUID
- try:
- exec "vuid = %s['vuid']" % out
- # dataset spec. currentfiles is used to count the number of failed jobs
- ds = DatasetSpec()
- ds.vuid = vuid
- ds.name = disDBlock
- ds.type = 'dispatch'
- ds.status = 'defined'
- ds.numberfiles = len(lfns)
- ds.currentfiles = 0
- dispList.append(ds)
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("ext registerNewDataset : failed to decode VUID for %s - %s %s" % (disDBlock,errType,errValue))
- continue
- # freezeDataset dispatch dataset
- _logger.debug((self.timestamp,'freezeDataset',disDBlock))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('freezeDataset',disDBlock)
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- if status != 0 or (out.find('Error') != -1 and out.find("is frozen") == -1):
- _logger.error("%s %s" % (self.timestamp,out))
- continue
- _logger.debug("%s %s" % (self.timestamp,out))
- # register location
- _logger.debug((self.timestamp,'registerDatasetLocation',disDBlock,tmpLocation,0,1,None,None,None,"7 days"))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('registerDatasetLocation',disDBlock,tmpLocation,0,1,None,None,None,"7 days")
- if status != 0 or out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1:
- time.sleep(60)
- else:
- break
- _logger.debug("%s %s" % (self.timestamp,out))
- # failure
- if status != 0 or out.find('Error') != -1:
- _logger.error("%s %s" % (self.timestamp,out))
- continue
- # insert datasets to DB
- self.taskBuffer.insertDatasets(dispList)
- _logger.debug('%s finished to make dis datasets for existing files' % self.timestamp)
- return
-
-
- # pin input dataset
- def _pinInputDatasets(self):
- _logger.debug('%s pin input datasets' % self.timestamp)
- # collect input datasets and locations
- doneList = []
- allReplicaMap = {}
- for tmpJob in self.jobs:
- # ignore HC jobs
- if tmpJob.processingType.startswith('gangarobot') or \
- tmpJob.processingType.startswith('hammercloud'):
- continue
- # use production or test or user jobs only
- if not tmpJob.prodSourceLabel in ['managed','test','user']:
- continue
- # ignore inappropriate status
- if tmpJob.jobStatus in ['failed','cancelled','waiting']:
- continue
- # set lifetime
- if tmpJob.prodSourceLabel in ['managed','test']:
- pinLifeTime = 7
- else:
- pinLifeTime = 7
- # get source
- if tmpJob.prodSourceLabel in ['managed','test']:
- tmpSrcID = self.siteMapper.getCloud(tmpJob.cloud)['source']
- srcDQ2ID = self.siteMapper.getSite(tmpSrcID).ddm
- else:
- srcDQ2ID = self.siteMapper.getSite(tmpJob.computingSite).ddm
- # prefix of DQ2 ID
- srcDQ2IDprefix = re.sub('_[A-Z,0-9]+DISK$','',srcDQ2ID)
- # loop over all files
- for tmpFile in tmpJob.Files:
- # use input files and ignore DBR/lib.tgz
- if tmpFile.type == 'input' and \
- not tmpFile.lfn.endswith('.lib.tgz') and \
- not tmpFile.dataset.startswith('ddo') and \
- not tmpFile.dataset.startswith('user') and \
- not tmpFile.dataset.startswith('group'):
- # get replica locations
- if not allReplicaMap.has_key(tmpFile.dataset):
- if tmpFile.dataset.endswith('/'):
- status,tmpRepSitesMap = self.getListDatasetReplicasInContainer(tmpFile.dataset,getMap=True)
- if status == 0:
- status = True
- else:
- status = False
- else:
- status,tmpRepSites = self.getListDatasetReplicas(tmpFile.dataset)
- tmpRepSitesMap = {}
- tmpRepSitesMap[tmpFile.dataset] = tmpRepSites
- # append
- if status:
- allReplicaMap[tmpFile.dataset] = tmpRepSitesMap
- else:
- # set empty to avoid further lookup
- allReplicaMap[tmpFile.dataset] = {}
- # loop over constituent datasets
- _logger.debug('%s pin DQ2 prefix=%s' % (self.timestamp,srcDQ2IDprefix))
- for tmpDsName,tmpRepSitesMap in allReplicaMap[tmpFile.dataset].iteritems():
- # loop over locations
- for tmpRepSite in tmpRepSitesMap.keys():
- if tmpRepSite.startswith(srcDQ2IDprefix) \
- and not 'TAPE' in tmpRepSite \
- and not 'SCRATCH' in tmpRepSite:
- tmpKey = (tmpDsName,tmpRepSite)
- # already done
- if tmpKey in doneList:
- continue
- # append to avoid repetition
- doneList.append(tmpKey)
- # get metadata
- status,tmpMetadata = self.getReplicaMetadata(tmpDsName,tmpRepSite)
- if not status:
- continue
- # check pin lifetime
- if tmpMetadata.has_key('pin_expirationdate'):
- if isinstance(tmpMetadata['pin_expirationdate'],types.StringType) and tmpMetadata['pin_expirationdate'] != 'None':
- # keep original pin lifetime if it is longer
- origPinLifetime = datetime.datetime.strptime(tmpMetadata['pin_expirationdate'],'%Y-%m-%d %H:%M:%S')
- if origPinLifetime > datetime.datetime.utcnow()+datetime.timedelta(days=pinLifeTime):
- _logger.debug('%s skip pinning for %s:%s due to longer lifetime %s' % (self.timestamp,
- tmpDsName,tmpRepSite,
- tmpMetadata['pin_expirationdate']))
- continue
- # set pin lifetime
- status = self.setReplicaMetadata(tmpDsName,tmpRepSite,'pin_lifetime','%s days' % pinLifeTime)
-        # return
- _logger.debug('%s pin input datasets done' % self.timestamp)
- return
-
-
- # make T1 subscription for missing files
- def _makeSubscriptionForMissing(self):
- _logger.debug('%s make subscriptions for missing files' % self.timestamp)
- # collect datasets
- missingList = {}
- for tmpCloud,tmpMissDatasets in self.missingDatasetList.iteritems():
- # append cloud
- if not missingList.has_key(tmpCloud):
- missingList[tmpCloud] = []
- # loop over all datasets
- for tmpDsName,tmpMissFiles in tmpMissDatasets.iteritems():
- # check if datasets in container are used
- if tmpDsName.endswith('/'):
- # convert container to datasets
- tmpStat,tmpDsList = self.getListDatasetInContainer(tmpDsName)
- if not tmpStat:
- _logger.error('%s failed to get datasets in container:%s' % (self.timestamp,tmpDsName))
- continue
- # check if each dataset is actually used
- for tmpConstDsName in tmpDsList:
- # skip if already checked
-                        if tmpConstDsName in missingList[tmpCloud]:
- continue
- # get files in each dataset
- tmpStat,tmpFilesInDs = self.getListFilesInDataset(tmpConstDsName)
- if not tmpStat:
- _logger.error('%s failed to get files in dataset:%s' % (self.timestamp,tmpConstDsName))
- continue
- # loop over all files to check the dataset is used
- for tmpGUID in tmpMissFiles:
- # append if used
- if tmpFilesInDs.has_key(tmpGUID):
- missingList[tmpCloud].append(tmpConstDsName)
- break
- else:
- # append dataset w/o checking
- if not tmpDsName in missingList[tmpCloud]:
- missingList[tmpCloud].append(tmpDsName)
- # make subscriptions
- for tmpCloud,missDsNameList in missingList.iteritems():
-            # get destination
- tmpDstID = self.siteMapper.getCloud(tmpCloud)['source']
- dstDQ2ID = self.siteMapper.getSite(tmpDstID).ddm
- # register subscription
- for missDsName in missDsNameList:
- _logger.debug('%s make subscription at %s for missing %s' % (self.timestamp,dstDQ2ID,missDsName))
- self.makeSubscription(missDsName,dstDQ2ID)
-        # return
- _logger.debug('%s make subscriptions for missing files done' % self.timestamp)
- return
-
-
- # check DDM response
- def isDQ2ok(self,out):
- if out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1 \
- or out == '()':
- return False
- return True
-
-
- # make subscription
- def makeSubscription(self,dataset,dq2ID):
-        # return for failure
- retFailed = False
- # make subscription
- optSrcPolicy = 000001
- nTry = 3
- for iDDMTry in range(nTry):
- # register subscription
- _logger.debug('%s %s/%s registerDatasetSubscription %s %s' % (self.timestamp,iDDMTry,nTry,dataset,dq2ID))
- status,out = ddm.DQ2.main('registerDatasetSubscription',dataset,dq2ID,version=0,archived=0,
- callbacks={},sources={},sources_policy=optSrcPolicy,
- wait_for_sources=0,destination=None,query_more_sources=0,
- sshare="production",group=None,activity='Production',acl_alias='secondary')
- status,out = 0,''
- if out.find('DQSubscriptionExistsException') != -1:
- break
- elif out.find('DQLocationExistsException') != -1:
- break
- elif status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if out.find('DQSubscriptionExistsException') != -1:
- pass
- elif status != 0 or out.startswith('Error'):
- _logger.error("%s %s" % (self.timestamp,out))
- return retFailed
- # update
- _logger.debug('%s %s %s' % (self.timestamp,status,out))
- # return
- return True
-
-
- # get replica metadata
- def getReplicaMetadata(self,datasetName,locationName):
- # response for failure
- resForFailure = False,{}
- # get metadata
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug('%s %s/%s listMetaDataReplica %s %s' % (self.timestamp,iDDMTry,nTry,datasetName,locationName))
- status,out = ddm.DQ2.main('listMetaDataReplica',locationName,datasetName)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error("%s %s" % (self.timestamp,out))
- return resForFailure
- metadata = {}
- try:
- # convert to map
- exec "metadata = %s" % out
- except:
- _logger.error('%s could not convert HTTP-res to replica metadata for %s:%s' % \
- (self.timestamp,datasetName,locationName))
- return resForFailure
- # return
- _logger.debug('%s getReplicaMetadata -> %s' % (self.timestamp,str(metadata)))
- return True,metadata
-
-
- # set replica metadata
- def setReplicaMetadata(self,datasetName,locationName,attrname,attrvalue):
- # response for failure
- resForFailure = False
- # get metadata
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug('%s %s/%s setReplicaMetaDataAttribute %s %s %s=%s' % (self.timestamp,iDDMTry,nTry,datasetName,
- locationName,attrname,attrvalue))
- status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',datasetName,locationName,attrname,attrvalue)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error("%s %s" % (self.timestamp,out))
- return resForFailure
- # return
- _logger.debug('%s setReplicaMetadata done' % self.timestamp)
- return True
-
-
- # send task brokerage message to logger
- def sendTaMesg(self,message,msgType=None):
- try:
- # get logger
- tmpPandaLogger = PandaLogger()
- # lock HTTP handler
- tmpPandaLogger.lock()
- tmpPandaLogger.setParams({'Type':'taskbrokerage'})
- # use bamboo for loggername
- if panda_config.loggername == 'prod':
- tmpLogger = tmpPandaLogger.getHttpLogger('bamboo')
- else:
- # for dev
- tmpLogger = tmpPandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- if msgType=='error':
- tmpLogger.error(message)
- elif msgType=='warning':
- tmpLogger.warning(message)
- elif msgType=='info':
- tmpLogger.info(message)
- else:
- tmpLogger.debug(message)
- # release HTTP handler
- tmpPandaLogger.release()
- except:
- pass
- time.sleep(1)
-
diff --git a/current/pandaserver/dataservice/TaLauncher.py b/current/pandaserver/dataservice/TaLauncher.py
deleted file mode 100755
index e44a7bc72..000000000
--- a/current/pandaserver/dataservice/TaLauncher.py
+++ /dev/null
@@ -1,55 +0,0 @@
-'''
-launcher for TaskAssigner
-
-'''
-
-import sys
-import time
-import commands
-import threading
-import cPickle as pickle
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('TaLauncher')
-
-
-class TaLauncher (threading.Thread):
- # constructor
- def __init__(self,taskBuffer,jobs):
- threading.Thread.__init__(self)
- self.jobs = jobs
- self.taskBuffer = taskBuffer
- # time stamp
- self.timestamp = time.asctime()
-
-
- # main
- def run(self):
- try:
- _logger.debug('%s startRun' % self.timestamp)
- # run setupper sequentially
- for job in self.jobs:
- # write jobs to file
- outFileName = '%s/set.%s_%s' % (panda_config.logdir,job.PandaID,commands.getoutput('uuidgen'))
- outFile = open(outFileName,'w')
- pickle.dump([job],outFile)
- outFile.close()
- # run main procedure in another process because python doesn't release memory
- com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd)
- com += 'source /opt/glite/etc/profile.d/grid-env.sh; '
- com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
- (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
- panda_config.pandaPython_dir,outFileName)
- # add option for TA
- com += " -t"
- _logger.debug('%s taskID:%s %s' % (self.timestamp,job.taskID,com))
-                # execute
- status,output = self.taskBuffer.processLimiter.getstatusoutput(com)
- _logger.debug("%s Ret from child process: %s %s" % (self.timestamp,status,output))
- _logger.debug('%s endRun' % self.timestamp)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("run() : %s %s" % (type,value))
diff --git a/current/pandaserver/dataservice/TaskAssigner.py b/current/pandaserver/dataservice/TaskAssigner.py
deleted file mode 100644
index 677cd4645..000000000
--- a/current/pandaserver/dataservice/TaskAssigner.py
+++ /dev/null
@@ -1,1180 +0,0 @@
-'''
-setup cloud
-
-'''
-
-import re
-import sys
-import time
-import types
-import random
-import commands
-import datetime
-import brokerage.broker_util
-from DDM import ddm
-from DDM import dq2Common
-from DDM import toa
-from config import panda_config
-from taskbuffer import ProcessGroups
-from pandalogger.PandaLogger import PandaLogger
-import DataServiceUtils
-
-
-# logger
-_logger = PandaLogger().getLogger('TaskAssigner')
-
-# cutoff for RW
-thr_RW_low = 400
-thr_RW_high = 8000
-thr_RW_sub = 600
-
-# cutoff for disk
-thr_space_low = (1 * 1024)
-
-# special reduction for TAPE
-reductionForTape = 0.5
-
-# task types using MC share
-taskTypesMcShare = ['evgen']
-
-# task types for subscriptions
-taskTypesSub = ['simul']
-
-# dataset type to ignore file availability check
-datasetTypeToSkipCheck = ['log']
-
-class TaskAssigner:
- # constructor
- def __init__(self,taskBuffer,siteMapper,taskID,prodSourceLabel,job):
- self.taskBuffer = taskBuffer
- self.siteMapper = siteMapper
- self.taskID = taskID
- self.cloudTask = None
- self.prodSourceLabel = prodSourceLabel
- self.cloudForSubs = []
- self.job = job
- self.metadataMap = {}
- self.contDsMap = {}
-
-
- # check cloud
- def checkCloud(self):
- try:
- _logger.info('%s checkCloud' % self.taskID)
- # get CloudTask from DB
- self.cloudTask = self.taskBuffer.getCloudTask(self.taskID)
- if self.cloudTask == None:
- _logger.error('%s cannot get CloudTask' % self.taskID)
- return None
- # if already assigned
- if self.cloudTask.status == 'assigned':
- _logger.info('%s checked Cloud -> %s' % (self.taskID,self.cloudTask.cloud))
- return self.cloudTask.cloud
- # return "" to set cloud later
- _logger.info('%s return Cloud=""' % self.taskID)
- return ""
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s checkCloud : %s %s" % (self.taskID,type,value))
- return None
-
-
- # set cloud
- def setCloud(self,lfns,guids,locations={},metadata=None,fileCounts=None):
- try:
- _logger.info('%s setCloud' % self.taskID)
- _logger.info('%s metadata="%s"' % (self.taskID,metadata))
- _logger.info('%s fileCounts="%s"' % (self.taskID,fileCounts))
- taskType = None
- RWs = {}
- expRWs = {}
- highRWs = {}
- prioMap = {}
- fullRWs = {}
- tt2Map = {}
- diskCount = 0
- usingOpenDS = False
- try:
- # parse metadata
- if not metadata in (None,'NULL'):
- # task type
- taskType = metadata.split(';')[0]
- # RWs
- exec "RWs = %s" % metadata.split(';')[1]
- # expected RWs
- exec "expRWs = %s" % metadata.split(';')[2]
- # RWs for high priority tasks
- exec "prioMap = %s" % metadata.split(';')[3]
- # full RWs for space calculation
- exec "fullRWs = %s" % metadata.split(';')[4]
- # tasktype2 map
- exec "tt2Map = %s" % metadata.split(';')[5]
- except:
- pass
- try:
- diskCount = int(self.job.maxDiskCount)
- except:
- pass
- message = '%s taskType==%s prio==%s RW==%s DiskCount==%s' % (self.taskID,taskType,prioMap[self.taskID],
- expRWs[self.taskID],diskCount)
- _logger.info(message)
- self.sendMesg(message)
- _logger.info('%s RWs = %s' % (self.taskID,str(RWs)))
- _logger.info('%s expRWs = %s' % (self.taskID,str(expRWs)))
- _logger.info('%s prioMap = %s' % (self.taskID,str(prioMap)))
- _logger.info('%s fullRWs = %s' % (self.taskID,str(fullRWs)))
- _logger.info('%s tt2Map = %s' % (self.taskID,str(tt2Map)))
- # get cloud list
- cloudList = self.siteMapper.getCloudList()
- # get pilot statistics
- nWNmap = self.taskBuffer.getCurrentSiteData()
- # get process group
- myTaskGroup = ProcessGroups.getProcessGroup(tt2Map[self.taskID])
- # recalculate RWs
- for tmpTaskID,tmpExpRW in expRWs.iteritems():
- # skip myself
- if tmpTaskID == self.taskID:
- continue
- # get cloud from DB
- tmpCloudInDB = self.taskBuffer.seeCloudTask(tmpTaskID)
- # not assigned
- if tmpCloudInDB == '':
- continue
- # increase full RW
- if not fullRWs.has_key(tmpCloudInDB):
- fullRWs[tmpCloudInDB] = 0
- fullRWs[tmpCloudInDB] += tmpExpRW
- # no priority info
- if not prioMap.has_key(tmpTaskID):
- continue
- # lower priority
- if prioMap[tmpTaskID] < prioMap[self.taskID]:
- continue
- # check tasktype2
- tmpTaskGroup = ProcessGroups.getProcessGroup(tt2Map[tmpTaskID])
- # skip if the process group is different
- if tmpTaskGroup != myTaskGroup:
- continue
- # increase RW
- if not RWs.has_key(tmpCloudInDB):
- RWs[tmpCloudInDB] = 0
- RWs[tmpCloudInDB] += tmpExpRW
- _logger.info('%s newRWs =%s' % (self.taskID,str(RWs)))
- _logger.info('%s fullRWs =%s' % (self.taskID,str(fullRWs)))
- # remove offline clouds and check validation/fasttrack
- tmpCloudList = []
- for tmpCloudName in cloudList:
- # get cloud
- tmpCloud = self.siteMapper.getCloud(tmpCloudName)
- # skip offline clouds
- if not tmpCloud['status'] in ['online']:
- message = '%s %s skip : status==%s' % (self.taskID,tmpCloudName,tmpCloud['status'])
- _logger.info(message)
- self.sendMesg(message)
- continue
- # skip non-validation cloud if validation
- if self.prodSourceLabel in ['validation'] and tmpCloud['validation'] != 'true':
- message = "%s %s skip : validation=='%s'" % (self.taskID,tmpCloudName,tmpCloud['validation'])
- _logger.info(message)
- self.sendMesg(message)
- continue
- # check fast track
- if ((taskType in ['evgen'] and prioMap[self.taskID] >= 700) or
- (taskType in ['simul'] and prioMap[self.taskID] >= 800)) and tmpCloud['fasttrack'] != 'true':
- message = "%s %s skip : fasttrack=='%s'" % (self.taskID,tmpCloudName,tmpCloud['fasttrack'])
- _logger.info(message)
- self.sendMesg(message)
- continue
- # check disk count
- if diskCount != 0:
- enoughSpace = self.checkDiskCount(diskCount,tmpCloudName)
- if not enoughSpace:
- message = "%s %s skip : no online sites have enough space for DiskCount==%s" % (self.taskID,tmpCloudName,diskCount)
- _logger.info(message)
- self.sendMesg(message,msgType='warning')
- continue
- # append
- tmpCloudList.append(tmpCloudName)
- self.cloudForSubs.append(tmpCloudName)
- cloudList = tmpCloudList
- # DQ2 location info
- _logger.info('%s DQ2 locations %s' % (self.taskID,str(locations)))
- # check immutable datasets
- for tmpDataset,tmpSites in locations.iteritems():
- sitesForRefresh = []
- for tmpSite in tmpSites.keys():
- tmpStat = tmpSites[tmpSite][-1]
- if tmpStat['total'] == -1 or tmpStat['found'] == None:
- sitesForRefresh.append(tmpSite)
- elif tmpStat['immutable'] == 0:
- # using open datasets
- usingOpenDS = True
- _logger.info('%s open dataset : %s' % (self.taskID,tmpDataset))
- # refresh replica info
- if sitesForRefresh != []:
- # invoke listFileReplicasBySites to refresh replica info
- _logger.info('%s listFileReplicasBySites %s:%s' % (self.taskID,tmpDataset,str(sitesForRefresh)))
- tmpStat,tmpOut = ddm.DQ2_iter.listFileReplicasBySites(tmpDataset,0,sitesForRefresh,0,300)
- _logger.info('%s listFileReplicasBySites end with %s:%s' % (self.taskID,tmpStat,tmpOut))
- # reset tmod to shorten retry interval
- self.taskBuffer.resetTmodCloudTask(self.taskID)
- removedDQ2Map = {}
- t2ListForMissing = {}
- diskCopyCloud = None
- badMetaMap = {}
- if locations != {}:
- # sort datasets by the number of sites
- numSitesDatasetMap = {}
- for dataset,sites in locations.iteritems():
- numSites = len(sites)
- if not numSitesDatasetMap.has_key(numSites):
- numSitesDatasetMap[numSites] = []
- numSitesDatasetMap[numSites].append(dataset)
- numSitesList = numSitesDatasetMap.keys()
- numSitesList.sort()
- sortedDatasetList = []
- for numSites in numSitesList:
- sortedDatasetList += numSitesDatasetMap[numSites]
- # loop over datasets starting with fewer replicas
- removedCloud = []
- for dataset in sortedDatasetList:
- sites = locations[dataset]
- tmpDiskCopyCloud = []
- removedDQ2Map[dataset] = []
- _logger.info('%s DS:%s' % (self.taskID,dataset))
- datasetType = DataServiceUtils.getDatasetType(dataset)
- for tmpCloudName in cloudList:
- useCacheT1 = False
- tmpCloud = self.siteMapper.getCloud(tmpCloudName)
- if DataServiceUtils.isCachedFile(dataset,self.siteMapper.getSite(tmpCloud['source'])):
- # use site's endpoint for CVMFS cache
- foundSE = self.siteMapper.getSite(tmpCloud['source']).ddm
- tmpDiskCopyCloud.append(tmpCloudName)
- # using cached files at T1
- useCacheT1 = True
- else:
- # look for T1 SE which holds the max number of files
- minFound = -1
- foundSE = ''
- for tmpSePat in tmpCloud['tier1SE']:
- # make regexp pattern
- if '*' in tmpSePat:
- tmpSePat = tmpSePat.replace('*','.*')
- tmpSePat = '^' + tmpSePat +'$'
- for tmpSE in sites.keys():
- # check name with regexp pattern
- if re.search(tmpSePat,tmpSE) == None:
- continue
- # check metadata
- metaOK = self.checkMetadata(dataset,tmpSE)
- if not metaOK:
- if not badMetaMap.has_key(dataset):
- badMetaMap[dataset] = []
- badMetaMap[dataset].append(tmpSE)
- _logger.info('%s skip %s due to ToBeDeleted' % (self.taskID,tmpSE))
- continue
- # check the number of available files
- tmpStat = sites[tmpSE][-1]
- if tmpStat['found'] == None:
- if minFound == -1:
- foundSE = tmpSE
- elif minFound < tmpStat['found']:
- minFound = tmpStat['found']
- foundSE = tmpSE
- # check if disk copy is available
- tmpStatusSE,tmpRetSE = toa.getSiteProperty(tmpSE,'tape')
- if tmpRetSE != 'True':
- if tmpStat['found'] != None and tmpStat['found'] == tmpStat['total']:
- tmpDiskCopyCloud.append(tmpCloudName)
- else:
- _logger.info('%s %s is on tape : %s' % (self.taskID,tmpSE,tmpRetSE))
- # get list of T2s where dataset is available
- tmpT2List = []
- tmpT2Map = DataServiceUtils.getSitesWithDataset(dataset,self.siteMapper,locations,
- tmpCloudName,True,getDQ2ID=True,
- useOnlineSite=True)
- for tmpT2Name,tmpT2DQ2List in tmpT2Map.iteritems():
- # skip redundant lookup
- if t2ListForMissing.has_key(tmpCloudName) and \
- not tmpT2Name in t2ListForMissing[tmpCloudName]:
- continue
- # loop over all DQ2 IDs
- for tmpT2DQ2 in tmpT2DQ2List:
- # check metadata
- metaOK = self.checkMetadata(dataset,tmpT2DQ2)
- if metaOK:
- tmpT2List.append(tmpT2Name)
- break
- else:
- if not badMetaMap.has_key(dataset):
- badMetaMap[dataset] = []
- badMetaMap[dataset].append(tmpT2DQ2)
- _logger.info('%s skip %s due to ToBeDeleted' % (self.taskID,tmpT2DQ2))
- # take CVMFS cache into account
- tmpT2CacheList = DataServiceUtils.getSitesWithCacheDS(tmpCloudName,tmpT2List,self.siteMapper,dataset)
- tmpT2List += tmpT2CacheList
- # remove cloud if neither the T1 SE nor a T2 holds the dataset
- if foundSE == '':
- # keep if T2 has the dataset
- if tmpT2List == []:
- if not tmpCloudName in removedCloud:
- _logger.info('%s removed %s' % (self.taskID,tmpCloudName))
- removedCloud.append(tmpCloudName)
- # add dataset to map for subscription when T2 has non-cached replica
- if (tmpT2List != [] and len(tmpT2CacheList) != len(tmpT2List)) and not tmpCloudName in removedDQ2Map[dataset]:
- removedDQ2Map[dataset].append(tmpCloudName)
- else:
- if not useCacheT1:
- # check whether the replica is incomplete
- tmpStat = sites[foundSE][-1]
- if tmpStat['found'] == None or \
- (not datasetType in datasetTypeToSkipCheck and tmpStat['found'] < tmpStat['total']):
- # add dataset to the map so it is subscribed if this cloud is chosen based on T2 files
- if not tmpCloudName in removedDQ2Map[dataset]:
- removedDQ2Map[dataset].append(tmpCloudName)
- # aggregate T2 list
- if not t2ListForMissing.has_key(tmpCloudName):
- t2ListForMissing[tmpCloudName] = tmpT2List
- else:
- # use sites where all datasets are available
- newTmpT2List = []
- for tmpT2 in t2ListForMissing[tmpCloudName]:
- if tmpT2 in tmpT2List:
- newTmpT2List.append(tmpT2)
- t2ListForMissing[tmpCloudName] = newTmpT2List
- # disk copy cloud
- if diskCopyCloud == None:
- diskCopyCloud = tmpDiskCopyCloud
- else:
- newDiskCopyCloud = []
- for tmpCloudName in diskCopyCloud:
- if tmpCloudName in tmpDiskCopyCloud:
- newDiskCopyCloud.append(tmpCloudName)
- diskCopyCloud = newDiskCopyCloud
- # remove clouds
- for tmpCloudName in removedCloud:
- if tmpCloudName in cloudList:
- cloudList.remove(tmpCloudName)
- _logger.info('%s new locations after DQ2 filter %s' % (self.taskID,str(cloudList)))
- _logger.info('%s clouds where complete disk copies are available %s' % (self.taskID,str(diskCopyCloud)))
- _logger.info('%s removed DQ2 map %s' % (self.taskID,str(removedDQ2Map)))
- if cloudList == []:
- # make subscription to empty cloud
- if taskType in taskTypesSub:
- _logger.info('%s makeSubscription start' % self.taskID)
- retSub = self.makeSubscription(removedDQ2Map,RWs,fullRWs,expRWs)
- _logger.info('%s makeSubscription end with %s' % (self.taskID,retSub))
- message = '%s no input data locations' % self.taskID
- self.sendMesg(message,msgType='warning')
- raise RuntimeError, '%s cloud list is empty after DQ2 filter' % self.taskID
- message = '%s input data locations %s' % (self.taskID,str(cloudList))
- _logger.info(message)
- self.sendMesg(message)
- # calculate # of loops
- nFile = 200
- nLoop = len(guids) / nFile
- if len(guids) % nFile != 0:
- nLoop += 1
- iFileList = []
- for iTmp in range(nLoop):
- iFileList.append(iTmp*nFile)
- # truncate list to avoid too many lookups
- maxLoop = 100
- if len(iFileList) > maxLoop:
- random.shuffle(iFileList)
- iFileList = iFileList[:maxLoop]
- iFileList.sort()
- # count the number of files to be looked up
- maxNFiles = 0
- if not usingOpenDS:
- # if the dataset is open, don't check nFiles
- for iFile in iFileList:
- maxNFiles += len(lfns[iFile:iFile+nFile])
- # loop over all clouds
- weightParams = {}
- foundCandidateWithT1 = []
- candidatesUsingT2 = []
- for tmpCloudName in cloudList:
- _logger.info('%s calculate weight for %s' % (self.taskID,tmpCloudName))
- # add missing cloud in RWs
- if not RWs.has_key(tmpCloudName):
- RWs[tmpCloudName] = 0
- if not fullRWs.has_key(tmpCloudName):
- fullRWs[tmpCloudName] = 0
- # get cloud
- tmpCloud = self.siteMapper.getCloud(tmpCloudName)
- weightParams[tmpCloudName] = {}
- # get T1 site
- tmpT1Site = self.siteMapper.getSite(tmpCloud['source'])
- # get number of running jobs. Initially set to 1 to avoid division by zero
- nPilot = 1
- for siteName in tmpCloud['sites']:
- if nWNmap.has_key(siteName):
- nPilot += (nWNmap[siteName]['getJob'] + nWNmap[siteName]['updateJob'])
- weightParams[tmpCloudName]['nPilot'] = nPilot
- _logger.info('%s # of pilots %s' % (self.taskID,nPilot))
- # available space
- weightParams[tmpCloudName]['space'] = tmpT1Site.space
- _logger.info('%s T1 space %s' % (self.taskID,tmpT1Site.space))
- # MC share
- weightParams[tmpCloudName]['mcshare'] = tmpCloud['mcshare']
- _logger.info('%s MC share %s' % (self.taskID,tmpCloud['mcshare']))
- # calculate available space = totalT1space - (RW(cloud)+RW(thistask))*GBperSI2kday
- aveSpace,sizeCloud,sizeThis = self.getAvailableSpace(weightParams[tmpCloudName]['space'],
- fullRWs[tmpCloudName],
- expRWs[self.taskID])
- # no task is assigned if available space is less than 1TB
- if aveSpace < thr_space_low:
- message = '%s %s skip : space:%s (total:%s - assigned:%s - this:%s) < %sGB' % \
- (self.taskID,tmpCloudName,aveSpace,weightParams[tmpCloudName]['space'],
- sizeCloud,sizeThis,thr_space_low)
- _logger.info(message)
- self.sendMesg(message,msgType='warning')
- del weightParams[tmpCloudName]
- continue
- else:
- _logger.info('%s %s pass : space:%s (total:%s - assigned:%s - this:%s)' % \
- (self.taskID,tmpCloudName,aveSpace,weightParams[tmpCloudName]['space'],
- sizeCloud,sizeThis))
- # don't assign tasks when RW is too high
- if RWs.has_key(tmpCloudName) and RWs[tmpCloudName] > thr_RW_high*weightParams[tmpCloudName]['mcshare']:
- message = '%s %s skip : too high RW==%s > %s' % \
- (self.taskID,tmpCloudName,RWs[tmpCloudName],thr_RW_high*weightParams[tmpCloudName]['mcshare'])
- _logger.info(message)
- self.sendMesg(message,msgType='warning')
- del weightParams[tmpCloudName]
- continue
- # T1
- t1List = [tmpT1Site.sitename]
- # hack for split T1
- if tmpCloudName == 'NL':
- t1List.append('NIKHEF-ELPROD')
- # get files
- weightParams[tmpCloudName]['nFiles'] = 0
- # loop
- tmpMaxNumFile = 0
- for tmpSiteNameScan in t1List:
- tmpScanRet,tmpN = DataServiceUtils.getNumAvailableFilesSite(tmpSiteNameScan,
- self.siteMapper,
- locations,badMetaMap,
- tmpCloud['tier1SE'],
- noCheck=datasetTypeToSkipCheck,
- fileCounts=fileCounts)
- # failed
- if not tmpScanRet:
- raise RuntimeError, 'failed to get nFiles at %s due to %s' % (tmpSiteNameScan,tmpN)
- # max
- if tmpMaxNumFile < tmpN:
- tmpMaxNumFile = tmpN
- # set
- weightParams[tmpCloudName]['nFiles'] = tmpMaxNumFile
- _logger.info('%s # of files at T1 %s' % (self.taskID,weightParams[tmpCloudName]['nFiles']))
- # found candidate
- foundCandidateT1 = False
- if weightParams[tmpCloudName]['nFiles'] >= maxNFiles:
- foundCandidateT1 = True
- # avoid incomplete replicas at T1
- for tmpDS,tmpT2CloudList in removedDQ2Map.iteritems():
- if tmpCloudName in tmpT2CloudList:
- foundCandidateT1 = False
- # reset nFiles at T1
- weightParams[tmpCloudName]['nFiles'] = 0
- break
- if foundCandidateT1:
- foundCandidateWithT1.append(tmpCloudName)
- # check T2 if files are missing
- if (not foundCandidateT1 or weightParams[tmpCloudName]['nFiles'] < maxNFiles) and \
- t2ListForMissing.has_key(tmpCloudName) and t2ListForMissing[tmpCloudName] != []:
- _logger.info('%s T2 candidates %s' % (self.taskID,str(t2ListForMissing[tmpCloudName])))
- # loop
- tmpMaxNumFile = 0
- for tmpSiteNameScan in t2ListForMissing[tmpCloudName]:
- tmpScanRet,tmpN = DataServiceUtils.getNumAvailableFilesSite(tmpSiteNameScan,
- self.siteMapper,
- locations,badMetaMap,
- noCheck=datasetTypeToSkipCheck,
- fileCounts=fileCounts)
- # failed
- if not tmpScanRet:
- raise RuntimeError, 'failed to get nFiles at %s due to %s' % (tmpSiteNameScan,tmpN)
- # use larger value
- _logger.info('%s # of files at T2:%s %s' % (self.taskID,tmpSiteNameScan,tmpN))
- if tmpN > weightParams[tmpCloudName]['nFiles']:
- weightParams[tmpCloudName]['nFiles'] = tmpN
- # found candidate
- if weightParams[tmpCloudName]['nFiles'] >= maxNFiles:
- candidatesUsingT2.append(tmpCloudName)
- break
- # compare parameters
- definedCloud = "US"
- maxClouds = []
- useMcShare = False
- # use clouds where the T1 has the data
- maxClouds += foundCandidateWithT1
- # use clouds where T2s have the data
- maxClouds += candidatesUsingT2
- # logging
- _logger.info('%s check nFiles' % self.taskID)
- for cloudName,params in weightParams.iteritems():
- if not cloudName in maxClouds:
- if maxNFiles == 0:
- message = '%s %s skip : missing files at DATA/GROUPDISK' % \
- (self.taskID,cloudName)
- elif params['nFiles'] != maxNFiles:
- message = '%s %s skip : nFiles==%s<%s' % \
- (self.taskID,cloudName,params['nFiles'],maxNFiles)
- else:
- message = '%s %s skip : no complete replica at DATA/GROUPDISK' % \
- (self.taskID,cloudName)
- _logger.info(message)
- self.sendMesg(message)
- time.sleep(2)
- # check RW
- _logger.info('%s check RW' % self.taskID)
- tmpInfClouds = []
- for cloudName in maxClouds:
- # set weight to infinite when RW is too low
- if not taskType in taskTypesMcShare:
- if RWs[cloudName] < thr_RW_low*weightParams[cloudName]['mcshare']:
- message = '%s %s infinite weight : RW==%s < %s' % \
- (self.taskID,cloudName,RWs[cloudName],thr_RW_low*weightParams[cloudName]['mcshare'])
- _logger.info(message)
- self.sendMesg(message)
- tmpInfClouds.append(cloudName)
- # use new list
- if tmpInfClouds != []:
- _logger.info('%s use infinite clouds after RW checking' % self.taskID)
- maxClouds = tmpInfClouds
- useMcShare = True
- elif maxClouds == []:
- messageEnd = '%s no candidates left' % self.taskID
- self.sendMesg(messageEnd)
- # make subscription to empty cloud
- if taskType in taskTypesSub:
- _logger.info('%s makeSubscription start' % self.taskID)
- retSub = self.makeSubscription(removedDQ2Map,RWs,fullRWs,expRWs)
- _logger.info('%s makeSubscription end with %s' % (self.taskID,retSub))
- if retSub:
- message = '%s made subscription' % self.taskID
- self.sendMesg(message,msgType='info')
- else:
- message = "%s didn't make subscription" % self.taskID
- self.sendMesg(message,msgType='warning')
- # return
- _logger.info(messageEnd)
- _logger.info("%s end" % self.taskID)
- return None
- # choose one
- message = '%s candidates %s' % (self.taskID,str(maxClouds))
- _logger.info(message)
- self.sendMesg(message)
- if len(maxClouds) == 1:
- definedCloud = maxClouds[0]
- elif len(maxClouds) > 1:
- # choose cloud according to weight
- nWeightList = []
- totalWeight = 0
- for cloudName in maxClouds:
- if (taskType in taskTypesMcShare):
- # use MC share for evgen
- tmpWeight = float(weightParams[cloudName]['mcshare'])
- message = "%s %s weight==%s" % (self.taskID,cloudName,weightParams[cloudName]['mcshare'])
- else:
- # use nPilot/RW*MCshare
- tmpWeight = float(weightParams[cloudName]['nPilot']) / float(1+RWs[cloudName])
- message = "%s %s weight==%s/%s" % (self.taskID,cloudName,
- weightParams[cloudName]['nPilot'],
- 1+RWs[cloudName])
- # reduce the weight when the cloud has no complete disk copy
- if diskCopyCloud != None and diskCopyCloud != [] and cloudName not in diskCopyCloud:
- tmpWeight *= float(reductionForTape)
- message += '*%s' % reductionForTape
- self.sendMesg(message)
- nWeightList.append(tmpWeight)
- totalWeight += tmpWeight
- # check total weight
- if totalWeight == 0:
- raise RuntimeError, 'totalWeight=0'
- # determine cloud using random number
- _logger.info('%s weights %s' % (self.taskID,str(nWeightList)))
- rNumber = random.random() * totalWeight
- _logger.info('%s totalW %s' % (self.taskID,totalWeight))
- _logger.info('%s rNumber %s' % (self.taskID,rNumber))
- for index,tmpWeight in enumerate(nWeightList):
- rNumber -= tmpWeight
- _logger.info('%s rNumber %s : Cloud=%s weight=%s' %
- (self.taskID,rNumber,maxClouds[index],tmpWeight))
- if rNumber <= 0:
- definedCloud = maxClouds[index]
- break
- # make subscription when T2 candidate is chosen
- if definedCloud in candidatesUsingT2:
- newT2DQ2Map = {}
- for tmpDS,tmpT2CloudList in removedDQ2Map.iteritems():
- if definedCloud in tmpT2CloudList:
- newT2DQ2Map[tmpDS] = [definedCloud]
- if newT2DQ2Map == {}:
- _logger.error('%s no subscription map to use T2 datasets cloud=%s map=%s' % (self.taskID,definedCloud,removedDQ2Map))
- return None
- _logger.info('%s makeSubscription to use T2 start' % self.taskID)
- retSub = self.makeSubscription(newT2DQ2Map,RWs,fullRWs,expRWs,noEmptyCheck=True,acceptInProcess=True)
- if not retSub:
- _logger.error('%s makeSubscription to use T2 failed with %s' % (self.taskID,retSub))
- return None
- _logger.info('%s makeSubscription to use T2 end with %s' % (self.taskID,retSub))
- # set CloudTask in DB
- self.cloudTask.cloud = definedCloud
- retCloudTask = self.taskBuffer.setCloudTask(self.cloudTask)
- if retCloudTask == None:
- _logger.error('%s cannot set CloudTask' % self.taskID)
- return None
- # pin input dataset
- pinSiteList = []
- if definedCloud in candidatesUsingT2:
- # pin T2 replicas
- if t2ListForMissing.has_key(definedCloud):
- pinSiteList = t2ListForMissing[definedCloud]
- else:
- # pin T1 replica
- pinSiteList = [self.siteMapper.getCloud(definedCloud)['tier1']]
- if pinSiteList != []:
- self.pinDataset(locations,pinSiteList,definedCloud)
- message = '%s set Cloud -> %s' % (self.taskID,retCloudTask.cloud)
- _logger.info(message)
- self.sendMesg(message)
- # return
- return retCloudTask.cloud
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s setCloud : %s %s" % (self.taskID,type,value))
- return None
-
-
- # send message to logger
- def sendMesg(self,message,msgType=None):
- try:
- # get logger
- tmpPandaLogger = PandaLogger()
- # lock HTTP handler
- tmpPandaLogger.lock()
- tmpPandaLogger.setParams({'Type':'taskbrokerage'})
- # use bamboo for loggername
- if panda_config.loggername == 'prod':
- tmpLogger = tmpPandaLogger.getHttpLogger('bamboo')
- else:
- # for dev
- tmpLogger = tmpPandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- if msgType=='error':
- tmpLogger.error(message)
- elif msgType=='warning':
- tmpLogger.warning(message)
- elif msgType=='info':
- tmpLogger.info(message)
- else:
- tmpLogger.debug(message)
- # release HTTP handler
- tmpPandaLogger.release()
- except:
- pass
- time.sleep(1)
-
-
- # check disk count
- def checkDiskCount(self,diskCount,cloud):
- scanSiteList = self.siteMapper.getCloud(cloud)['sites']
- # loop over all sites
- for tmpSiteName in scanSiteList:
- if 'test' in tmpSiteName.lower():
- continue
- # get sitespec
- tmpSiteSpec = self.siteMapper.getSite(tmpSiteName)
- # use online only
- if not tmpSiteSpec.status in ['online']:
- continue
- # no size limit
- if tmpSiteSpec.maxinputsize in [0,None,'']:
- return True
- # enough space for input
- if int(tmpSiteSpec.maxinputsize) >= int(diskCount):
- return True
- # no sites have enough space
- return False
-
-
- # get available space
- def getAvailableSpace(self,space,fullRW,expRW):
- # calculate available space = totalT1space - (RW(cloud)+RW(thistask))*GBperSI2kday
- sizeCloud = fullRW * 0.2
- sizeThis = expRW * 0.2
- aveSpace = space - (sizeCloud + sizeThis)
- return aveSpace,sizeCloud,sizeThis
-
-
- # make subscription
- def makeSubscription(self,dsCloudMap,RWs,fullRWs,expRWs,noEmptyCheck=False,acceptInProcess=False):
- nDDMtry = 3
- cloudList = []
- # collect clouds which don't hold datasets
- message = '%s possible clouds : %s' % (self.taskID,str(self.cloudForSubs))
- _logger.info(message)
- for tmpDS,tmpClouds in dsCloudMap.iteritems():
- for tmpCloud in tmpClouds:
- if (not tmpCloud in cloudList) and tmpCloud in self.cloudForSubs:
- cloudList.append(tmpCloud)
- message = '%s candidates for subscription : %s' % (self.taskID,str(cloudList))
- _logger.info(message)
- self.sendMesg(message)
- if cloudList == []:
- _logger.info('%s no candidates for subscription' % self.taskID)
- return False
- # get DN
- com = 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; '
- com+= 'source %s; grid-proxy-info -subject' % panda_config.glite_source
- status,DN = commands.getstatusoutput(com)
- _logger.info('%s %s' % (self.taskID,DN))
- # ignore AC issuer
- if re.search('WARNING: Unable to verify signature!',DN) != None:
- status = 0
- if status != 0:
- _logger.error('%s could not get DN %s:%s' % (self.taskID,status,DN))
- return False
- # check if there is an in-process subscription
- if not acceptInProcess:
- # remove /CN=proxy and /CN=limited from DN
- DN = DN.split('\n')[-1]
- DN = re.sub('(/CN=proxy)+$','',DN)
- DN = re.sub('/CN=limited proxy','',DN)
- status,out = dq2Common.parse_dn(DN)
- if status != 0:
- _logger.error('%s could not truncate DN %s:%s' % (self.taskID,status,DN))
- return False
- DN = out
- # loop over all datasets
- runningSub = {}
- for tmpDS,tmpClouds in dsCloudMap.iteritems():
- # get running subscriptions
- runningSub[tmpDS] = []
- _logger.info('%s listSubscriptions(%s)' % (self.taskID,tmpDS))
- iTry = 0
- while True:
- status,outLoc = ddm.DQ2.listSubscriptions(tmpDS)
- # succeed
- if status == 0:
- break
- # failed
- iTry += 1
- if iTry < nDDMtry:
- time.sleep(30)
- else:
- _logger.error('%s %s' % (self.taskID,outLoc))
- return False
- _logger.info('%s %s %s' % (self.taskID,status,outLoc))
- time.sleep(1)
- # get subscription metadata
- exec "outLoc = %s" % outLoc
- for tmpLocation in outLoc:
- t1Flag = False
- # check T1 or not
- for tmpCloudName4T1 in self.siteMapper.getCloudList():
- if tmpLocation in self.siteMapper.getCloud(tmpCloudName4T1)['tier1SE']:
- t1Flag = True
- break
- # skip non-T1
- if not t1Flag:
- continue
- _logger.info('%s listSubscriptionInfo(%s,%s)' % (self.taskID,tmpDS,tmpLocation))
- iTry = 0
- while True:
- status,outMeta = ddm.DQ2.listSubscriptionInfo(tmpDS,tmpLocation,0)
- # succeed
- if status == 0:
- break
- # skip non-existing ID
- if re.search('not a Tiers of Atlas Destination',outMeta) != None:
- _logger.info('%s ignore %s' % (self.taskID,outMeta.split('\n')[-1]))
- status = 0
- outMeta = "()"
- break
- # failed
- iTry += 1
- if iTry < nDDMtry:
- time.sleep(30)
- else:
- _logger.error('%s %s' % (self.taskID,outMeta))
- return False
- _logger.info('%s %s %s' % (self.taskID,status,outMeta))
- time.sleep(1)
- # look for DN in metadata
- exec "outMeta = %s" % outMeta
- if DN in outMeta:
- # get corresponding cloud
- for tmpCloudName in self.siteMapper.getCloudList():
- tmpCloudSpec = self.siteMapper.getCloud(tmpCloudName)
- if tmpLocation in tmpCloudSpec['tier1SE']:
- # append
- if not tmpCloudName in runningSub[tmpDS]:
- runningSub[tmpDS].append(tmpCloudName)
- break
- _logger.info('%s runningSub=%s' % (self.taskID,runningSub))
- # don't make subscriptions when another subscription is in progress
- subThr = 1
- for tmpDS,tmpClouds in runningSub.iteritems():
- if len(tmpClouds) > 0:
- message = '%s subscription:%s to %s in process' % (self.taskID,tmpDS,str(tmpClouds))
- _logger.info(message)
- self.sendMesg(message)
- return False
- # get size of datasets
- dsSizeMap = {}
- for tmpDS in dsCloudMap.keys():
- _logger.debug('%s listFilesInDataset(%s)' % (self.taskID,tmpDS))
- iTry = 0
- while True:
- status,outList = ddm.DQ2.listFilesInDataset(tmpDS)
- # succeed
- if status == 0:
- break
- # failed
- iTry += 1
- if iTry < nDDMtry:
- time.sleep(30)
- else:
- _logger.error('%s %s %s' % (self.taskID,status,outList))
- return False
- # get total size
- dsSizeMap[tmpDS] = 0
- exec "outList = %s" % outList
- for guid,vals in outList[0].iteritems():
- try:
- dsSizeMap[tmpDS] += long(vals['filesize'])
- except:
- pass
- # GB
- _logger.info('%s %s %sB' % (self.taskID,tmpDS,dsSizeMap[tmpDS]))
- dsSizeMap[tmpDS] /= (1024*1024*1024)
- _logger.info('%s dsSize=%s' % (self.taskID,dsSizeMap))
- # check space and RW
- minRW = None
- minCloud = None
- for tmpCloudName in cloudList:
- # get cloud spec
- tmpCloudSpec = self.siteMapper.getCloud(tmpCloudName)
- # get T1 site
- tmpT1Site = self.siteMapper.getSite(tmpCloudSpec['source'])
- # calculate available space
- if not fullRWs.has_key(tmpCloudName):
- fullRWs[tmpCloudName] = 0
- aveSpace,sizeCloud,sizeThis = self.getAvailableSpace(tmpT1Site.space,
- fullRWs[tmpCloudName],
- expRWs[self.taskID])
- # reduce required space
- for tmpDS,tmpClouds in dsCloudMap.iteritems():
- if tmpCloudName in tmpClouds:
- aveSpace -= dsSizeMap[tmpDS]
- # check space
- if aveSpace < thr_space_low:
- message = '%s %s skip : space==%s total==%s' % (self.taskID,tmpCloudName,aveSpace,
- tmpT1Site.space)
- _logger.info(message)
- self.sendMesg(message,msgType='warning')
- continue
- _logger.info('%s %s pass : space==%s total==%s' % (self.taskID,tmpCloudName,aveSpace,
- tmpT1Site.space))
- # get cloud spec
- tmpCloudSpec = self.siteMapper.getCloud(tmpCloudName)
- # check MC share
- if tmpCloudSpec['mcshare'] == 0:
- message = '%s %s skip : mcshare==%s' % (self.taskID,tmpCloudName,tmpCloudSpec['mcshare'])
- _logger.info(message)
- continue
- # get minimum RW
- if not RWs.has_key(tmpCloudName):
- RWs[tmpCloudName] = 0
- tmpRwThr = tmpCloudSpec['mcshare']*thr_RW_sub
- _logger.info('%s %s RW==%s Thr==%s' % (self.taskID,tmpCloudName,RWs[tmpCloudName],
- tmpRwThr))
- tmpRwRatio = float(RWs[tmpCloudName])/float(tmpRwThr)
- if minRW == None or minRW > tmpRwRatio:
- minRW = tmpRwRatio
- minCloud = tmpCloudName
- # check RW
- if minCloud == None:
- message = '%s no candidates left for subscription' % self.taskID
- _logger.info(message)
- self.sendMesg(message)
- return False
- # get cloud spec
- tmpCloudSpec = self.siteMapper.getCloud(minCloud)
- # check threshold
- if minRW > 1.0 and not noEmptyCheck:
- message = '%s no empty cloud : %s minRW==%s>%s' % \
- (self.taskID,minCloud,RWs[minCloud],thr_RW_sub*tmpCloudSpec['mcshare'])
- _logger.info(message)
- self.sendMesg(message)
- return False
- message = '%s %s for subscription : minRW==%s' % (self.taskID,minCloud,minRW)
- _logger.info(message)
- self.sendMesg(message)
- # get cloud spec for subscription
- tmpCloudSpec = self.siteMapper.getCloud(minCloud)
- # get T1 site
- tmpT1Site = self.siteMapper.getSite(tmpCloudSpec['source'])
- # dest DQ2 ID
- dq2ID = tmpT1Site.ddm
- # make subscription
- for tmpDsName,tmpClouds in dsCloudMap.iteritems():
- # skip if the dataset already exists in the cloud
- if not minCloud in tmpClouds:
- _logger.info('%s %s already exists in %s' % (self.taskID,tmpDsName,minCloud))
- continue
- # get constituents
- if tmpDsName.endswith('/'):
- tmpStat,repMap = self.getListDatasetReplicasInContainer(tmpDsName)
- if not tmpStat:
- _logger.info('%s failed to get datasets in %s ' % (self.taskID,tmpDsName))
- continue
- else:
- repMap = {tmpDsName:{dq2ID:[]}}
- # loop over all constituents
- for tmpDS in repMap.keys():
- # register subscription
- optSrcPolicy = 001000 | 010000
- _logger.debug("%s %s %s" % ('registerDatasetSubscription',(tmpDS,dq2ID),
- {'version':0,'archived':0,'callbacks':{},'sources':{},
- 'sources_policy':optSrcPolicy,'wait_for_sources':0,
- 'destination':None,'query_more_sources':0,'sshare':"secondary",
- 'group':None,'activity':"Production",'acl_alias':'secondary'}))
- iTry = 0
- while True:
- # execute
- status,out = ddm.DQ2.main('registerDatasetSubscription',tmpDS,dq2ID,version=0,archived=0,callbacks={},
- sources={},sources_policy=optSrcPolicy,wait_for_sources=0,destination=None,
- query_more_sources=0,sshare="secondary",group=None,activity="Production",
- acl_alias='secondary')
- # succeed
- if status == 0 or 'DQSubscriptionExistsException' in out:
- break
- # failed
- iTry += 1
- if iTry < nDDMtry:
- time.sleep(30)
- else:
- _logger.error('%s %s %s' % (self.taskID,status,out))
- return False
- if 'DQSubscriptionExistsException' in out:
- _logger.info('%s %s %s' % (self.taskID,status,'DQSubscriptionExistsException'))
- else:
- _logger.info('%s %s %s' % (self.taskID,status,out))
- message = '%s registered subscription %s %s:%s' % (self.taskID,tmpDS,minCloud,dq2ID)
- _logger.info(message)
- self.sendMesg(message)
- time.sleep(1)
- # completed
- return True
-
-
- # pin dataset
- def pinDataset(self,locationMap,siteList,cloudName):
- _logger.info('%s start pin input datasets' % self.taskID)
- pinLifeTime = 7
- # loop over all datasets
- for tmpDsName,tmpDQ2Map in locationMap.iteritems():
- # skip DBR
- if DataServiceUtils.isDBR(tmpDsName):
- continue
- # get DQ2 IDs in the cloud where dataset is available
- tmpDq2Map = DataServiceUtils.getSitesWithDataset(tmpDsName,self.siteMapper,locationMap,
- cloudName,useHomeCloud=True,
- getDQ2ID=True,
- useOnlineSite=True,
- includeT1=True)
- # loop over all sites
- for tmpSiteName in siteList:
- # pin dataset when the site has replicas
- if tmpDq2Map.has_key(tmpSiteName):
- # loop over all DQ2 IDs
- for tmpRepSite in tmpDq2Map[tmpSiteName]:
- # get constituents
- if tmpDsName.endswith('/'):
- tmpStat,repMap = self.getListDatasetReplicasInContainer(tmpDsName)
- if not tmpStat:
- _logger.info('%s failed to get datasets in %s ' % (self.taskID,tmpDsName))
- continue
- else:
- repMap = {tmpDsName:{tmpRepSite:[]}}
- # loop over all datasets
- for datasetName,locVal in repMap.iteritems():
- # check missing
- if not repMap[datasetName].has_key(tmpRepSite):
- _logger.info('%s skip pinning for %s at %s due to missing replica' % \
- (self.taskID,datasetName,tmpRepSite))
- continue
- # get metadata
- status,tmpMetadata = self.getReplicaMetadata(datasetName,tmpRepSite)
- if not status:
- continue
- # check pin lifetime
- if tmpMetadata.has_key('pin_expirationdate'):
- if isinstance(tmpMetadata['pin_expirationdate'],types.StringType) and tmpMetadata['pin_expirationdate'] != 'None':
- # keep original pin lifetime if it is longer
- origPinLifetime = datetime.datetime.strptime(tmpMetadata['pin_expirationdate'],'%Y-%m-%d %H:%M:%S')
- if origPinLifetime > datetime.datetime.utcnow()+datetime.timedelta(days=pinLifeTime):
- _logger.info('%s skip pinning for %s:%s due to longer lifetime %s' % (self.taskID,
- datasetName,tmpRepSite,
- tmpMetadata['pin_expirationdate']))
- continue
- # set pin lifetime
- status = self.setReplicaMetadata(datasetName,tmpRepSite,'pin_lifetime','%s days' % pinLifeTime)
- # return
- _logger.info('%s end pin input datasets' % self.taskID)
- return
-
-
- # get replica metadata
- def getReplicaMetadata(self,datasetName,locationName):
- # use cached data
- if self.metadataMap.has_key(datasetName) and self.metadataMap[datasetName].has_key(locationName):
- return True,self.metadataMap[datasetName][locationName]
- # response for failure
- resForFailure = False,{}
- # get metadata
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug('%s %s/%s listMetaDataReplica %s %s' % (self.taskID,iDDMTry,nTry,datasetName,locationName))
- status,out = ddm.DQ2.main('listMetaDataReplica',locationName,datasetName)
- if status != 0 or (not DataServiceUtils.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error("%s %s" % (self.taskID,out))
- return resForFailure
- metadata = {}
- try:
- # convert to map
- exec "metadata = %s" % out
- except:
- _logger.error('%s could not convert HTTP-res to replica metadata for %s:%s' % \
- (self.taskID,datasetName,locationName))
- return resForFailure
- # append
- if not self.metadataMap.has_key(datasetName):
- self.metadataMap[datasetName] = {}
- self.metadataMap[datasetName][locationName] = metadata
- # return
- _logger.debug('%s getReplicaMetadata -> %s' % (self.taskID,str(metadata)))
- return True,metadata
-
-
- # check metadata
- def checkMetadata(self,datasetName,tmpSE):
- try:
- # skip checking for DBR
- if DataServiceUtils.isDBR(datasetName):
- return True
- # get constituents
- if datasetName.endswith('/'):
- tmpStat,repMap = self.getListDatasetReplicasInContainer(datasetName)
- if not tmpStat:
- raise RuntimeError, 'failed to get datasets in %s when checkMetadata' % datasetName
- else:
- repMap = {datasetName:{tmpSE:[]}}
- # loop over all datasets
- for dataset,locVal in repMap.iteritems():
- # check missing
- if not locVal.has_key(tmpSE):
- _logger.info('%s skip %s at %s due to missing replica when checkMetadata' % (self.taskID,dataset,tmpSE))
- # NG
- return False
- # get metadata
- status,metaItem = self.getReplicaMetadata(dataset,tmpSE)
- if not status:
- raise RuntimeError, 'failed to get metadata at %s for %s when checkMetadata' % (tmpSE,dataset)
- # check
- if metaItem.has_key('archived') and isinstance(metaItem['archived'],types.StringType) \
- and metaItem['archived'].lower() in ['tobedeleted',]:
- _logger.info('%s skip %s due to ToBeDeleted when checkMetadata' % (self.taskID,tmpSE))
- # NG
- return False
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("%s checkMetadata : %s %s" % (self.taskID,errtype,errvalue))
- # FIXME
- #return False
- # OK
- return True
-
-
- # set replica metadata
- def setReplicaMetadata(self,datasetName,locationName,attrname,attrvalue):
- # response for failure
- resForFailure = False
- # get metadata
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug('%s %s/%s setReplicaMetaDataAttribute %s %s %s=%s' % (self.taskID,iDDMTry,nTry,datasetName,
- locationName,attrname,attrvalue))
- status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',datasetName,locationName,attrname,attrvalue)
- if status != 0 or (not DataServiceUtils.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error("%s %s" % (self.taskID,out))
- return resForFailure
- # return
- _logger.info('%s setReplicaMetadata done for %s:%s' % (self.taskID,datasetName,locationName))
- return True
-
-
- # get list of replicas in container
- def getListDatasetReplicasInContainer(self,container):
- # use cache
- if self.contDsMap.has_key(container):
- return True,self.contDsMap[container]
- # get datasets in container
- _logger.debug((self.taskID,'listDatasetsInContainer',container))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('listDatasetsInContainer',container)
- if status != 0 or (not DataServiceUtils.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- _logger.debug('%s %s' % (self.taskID,out))
- if status != 0 or out.startswith('Error'):
- return False,out
- datasets = []
- try:
- # convert to list
- exec "datasets = %s" % out
- except:
- return False,out
- # loop over all datasets
- allRepMap = {}
- for dataset in datasets:
- _logger.debug((self.taskID,'listDatasetReplicas',dataset))
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False)
- if status != 0 or (not DataServiceUtils.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- _logger.debug('%s %s' % (self.taskID,out))
- if status != 0 or out.startswith('Error'):
- return False,out
- tmpRepSites = {}
- try:
- # convert res to map
- exec "tmpRepSites = %s" % out
- except:
- return False,out
- # get map
- allRepMap[dataset] = tmpRepSites
- # return
- _logger.debug('%s %s' % (self.taskID,str(allRepMap)))
- self.contDsMap[container] = allRepMap
- return True,allRepMap
-
-
-
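
Note for reviewers: the cloud choice at the end of setCloud above reduces to a weighted random draw over the candidate clouds, with weight nPilot/(1+RW) and a reduction factor when the cloud holds only a tape copy. Below is a minimal, self-contained sketch of that selection mechanism with hypothetical cloud names and numbers; it is not the removed production path, which additionally folds in MC shares, space checks and subscriptions.

import random

def choose_cloud(weight_map, disk_copy_clouds=None, reduction_for_tape=0.5):
    """Pick a cloud with probability proportional to nPilot/(1+RW).

    weight_map: {cloud: {'nPilot': int, 'RW': int}} (hypothetical inputs)
    disk_copy_clouds: clouds holding a complete disk copy, if any
    """
    disk_copy_clouds = disk_copy_clouds or []
    names, weights = [], []
    for cloud, params in weight_map.items():
        weight = float(params['nPilot']) / float(1 + params['RW'])
        # penalize clouds that only have a tape copy
        if disk_copy_clouds and cloud not in disk_copy_clouds:
            weight *= reduction_for_tape
        names.append(cloud)
        weights.append(weight)
    total = sum(weights)
    if total == 0:
        raise RuntimeError('totalWeight=0')
    # walk down the cumulative weights with a single random number
    rnd = random.random() * total
    for name, weight in zip(names, weights):
        rnd -= weight
        if rnd <= 0:
            return name
    return names[-1]

# hypothetical example
print choose_cloud({'US': {'nPilot': 120, 'RW': 300},
                    'DE': {'nPilot': 80, 'RW': 100}},
                   disk_copy_clouds=['DE'])
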
diff --git a/current/pandaserver/dataservice/Waker.py b/current/pandaserver/dataservice/Waker.py
deleted file mode 100755
index 93234bcd7..000000000
--- a/current/pandaserver/dataservice/Waker.py
+++ /dev/null
@@ -1,55 +0,0 @@
-'''
-wake up jobs in the waiting table
-
-'''
-
-import time
-import threading
-from DDM import ddm
-
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('Waker')
-
-
-class Waker (threading.Thread):
- # constructor
- def __init__(self,taskBuffer,dataset):
- threading.Thread.__init__(self)
- self.dataset = dataset
- self.taskBuffer = taskBuffer
-
-
- # main
- def run(self):
- _logger.debug("start: %s" % self.dataset.name)
- # get file list from DDM
- for iDDMTry in range(3):
- status,out = ddm.DQ2.main('listFilesInDataset',self.dataset.name)
- if status != 0 and out.find("DQ2 unknown dataset exception") != -1:
- break
- elif status != 0 or out.find("DQ2 internal server exception") != -1:
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error(out)
- _logger.debug("failed: %s" % self.dataset.name)
- return
- # parse
- lfns = []
- try:
- exec "resDQ=%s" % out
- for guid,vals in resDQ[0].iteritems():
- lfns.append(vals['lfn'])
- except:
- _logger.error("could not parse %s" % out)
- # get PandaIDs of jobs which use files with LFNs
- if len(lfns) != 0:
- ids = self.taskBuffer.queryPandaIDwithLFN(lfns)
- _logger.debug("IDs: %s" % ids)
- if len(ids) != 0:
- # wake up jobs
- self.taskBuffer.awakeJobs(ids)
- _logger.debug("finished: %s" % self.dataset.name)
diff --git a/current/pandaserver/dataservice/__init__.py b/current/pandaserver/dataservice/__init__.py
deleted file mode 100755
index e69de29bb..000000000
diff --git a/current/pandaserver/dataservice/countGuidsClient.py b/current/pandaserver/dataservice/countGuidsClient.py
deleted file mode 100644
index a65489ce2..000000000
--- a/current/pandaserver/dataservice/countGuidsClient.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import urllib, re, string, os, time
-from eventLookupClient import eventLookupClient
-
-# client for countGuids Athenaeum service
-# author: Marcin.Nowak@cern.ch
-
-
-class countGuidsClient(eventLookupClient):
-
- #serverURL = "http://j2eeps.cern.ch/test-Athenaeum/"
- serverURL = "http://j2eeps.cern.ch/atlas-project-Athenaeum/"
- #serverURL = "http://j2eeps.cern.ch/test-eventPicking/"
- servicePage = "CountGuids.jsp"
- getPage = "EventLookupGet.jsp"
-
- def __init__(self):
- eventLookupClient.__init__(self)
-
- def countGuids(self, datasetName, query='', tokens=''):
- """ contact the server and return GUIDs count
- tokens - token names
- """
- query_args = { 'key': self.key,
- 'worker': self.workerURL(),
- 'cert_proxy': self.certProxy,
- 'query': query,
- 'dataset': datasetName,
- 'tokens': tokens
- }
- self.talkToServer(self.serverURL + self.servicePage, query_args)
-
- self.remoteFile = None
- for line in self.output:
- m = re.search("FILE=(.+)$", line)
- if m:
- return self.waitForFile( m.group(1) )
-
- return self.scanOutputForGuids()
-
-
- def scanOutputForGuids(self):
- """ Scan the server output looking for GUIDs
- return None in case of errors
- """
- self.countedGuids = []
- self.tokens = []
- stage = None
- tokpat = re.compile(r'([0-9A-F]{8}-([0-9A-F]{4}-){3}[0-9A-F]{12})')
- for line in self.output:
- if re.search(self.errorPattern, line, re.I):
- #print " -- Error line matched: " + line
- return None
- if stage == "readGuids":
- try:
- (count, guidline) = line.split(None,1)
- guids = guidline.split()
- if tokpat.match(guids[0]):
- self.countedGuids.append( (count,guids,) )
- continue
- except ValueError:
- pass
- # end of input, finish
- break
- if re.search("Event count per distinct GUIDs group:", line):
- stage = "readAttribs"
- continue
- if stage == "readAttribs":
- self.tokens = line.split()[1:]
- stage = "readGuids"
- continue
-
- return (self.tokens, self.countedGuids)
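
countGuidsClient follows the same request-and-poll pattern as eventLookupClient below; the caller gets back either None on error or a (tokens, countedGuids) pair. A usage sketch with a placeholder dataset name and token:

# hypothetical example; the dataset name and token are placeholders
from countGuidsClient import countGuidsClient

client = countGuidsClient()
ret = client.countGuids('data12_8TeV.00200000.physics_Muons.merge.TAG.f431_m1111',
                        query='', tokens='StreamAOD_ref')
if ret is None:
    print 'countGuids failed; see client.output for the raw server response'
else:
    tokens, counted_guids = ret
    for count, guids in counted_guids:
        print count, guids
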
diff --git a/current/pandaserver/dataservice/datriHandler.py b/current/pandaserver/dataservice/datriHandler.py
deleted file mode 100644
index de6f8d407..000000000
--- a/current/pandaserver/dataservice/datriHandler.py
+++ /dev/null
@@ -1,207 +0,0 @@
-"""
-DaTRI Handler for external applications (curl, python ver. >= 2.4)
-CERN, ATLAS Distributed Computing (March 2010)
-
-@author: Mikhail Titov
-@contact: mikhail.titov@cern.ch
-@date: June 21, 2013
-@version: 0.97
-"""
-
-import os
-import subprocess
-from urllib import urlencode
-
-HTTPS_PORT = 25943
-PANDAMON_HOST = 'panda.cern.ch'
-PANDAMON_URI = '/server/pandamon/query'
-
-# -s: Silent or quiet mode. Don't show progress meter or error messages.
-# -S: When used with -s it makes curl show an error message if it fails.
-CURL_SILENT_OPTION = '-s'
-
-PARAMS_LIST = ['mode', 'action', 'dpat', 'site', 'userid']
-PARAMS_LIST_ADDON = ['emails', 'comments']
-MODE = {
- 'pathena': 'ddm_pathenareq',
- 'ganga': 'ddm_gangareq',
- 'group': 'ddm_groupreq'}
-
-RETRY_NUM = 2
-
-
-def execute(params):
- """Returns tuple (out, err)
-
- @param params (@type list)
- shell command (1st parameter) and its options
- """
- try:
- p = subprocess.Popen(params, stdout=subprocess.PIPE)
- except (OSError, ValueError), e:
- return '', 'SubprocessException: %s' % e
- else:
- return p.communicate()
-
-
-class datriHandler(object):
-
- """Class datriHandler."""
-
- def __init__(self, **kwargs):
- """Initialization
-
- @param kwargs (@type dict)
- has "type" with one of the next values: pathena/ganga/group
- """
- self.curl = datriCurl()
- self.info = {'mode': MODE.get(kwargs.get('type', 'pathena'), '')}
- self.err_message = ''
- if not self.info['mode']:
- self.err_message = 'datriHandler: mode is incorrect'
-
- def __del__(self):
- self.curl = None
- self.info.clear()
- self.err_message = ''
-
- def hasParams(self):
- """Check that parameters are defined and are not null
-
- @return (@type bool)
- True/False
- """
- for p in PARAMS_LIST:
- if not self.info.get(p, None):
- return False
- return True
-
- def setParameters(self, data_pattern, site, userid, **kwargs):
- """Define request parameters
-
- @param data_pattern (@type str)
- dataset | container | pattern
- @param site (@type str)
- destination site (see AGIS/TiersOfAtlas)
- @param userid (@type str)
- unique user identification (certificate dn | email)
- """
- if data_pattern and site and userid:
- self.info.update({'dpat': data_pattern,
- 'site': site,
- 'userid': userid})
- for p in PARAMS_LIST_ADDON:
- if p in kwargs:
- self.info[p] = kwargs[p]
- else:
- self.err_message = 'datriHandler: required data are not defined'
-
- def checkData(self):
- """Check request data (send "Check"-request)
-
- @return (@type typle: int, str)
- returns status code and info (error) message
- """
- if not self.err_message:
- self.info['action'] = 'Check'
- if self.hasParams():
- return self.curl.get(**self.info)
- else:
- self.err_message = 'datriHandler: required data are not defined'
- return 4, self.err_message
-
- def sendRequest(self):
- """Send request to DaTRI (send "Submit"-request)
-
- @return (@type typle: int, str)
- returns status code and info (error) message
- """
- if not self.err_message:
- self.info['action'] = 'Submit'
- if self.hasParams():
- return self.curl.get(**self.info)
- else:
- self.err_message = 'datriHandler: required data are not defined'
- return 4, self.err_message
-
-# - Class for https-request definition -
-
-class datriCurl(object):
-
- """Class datriCurl for curl-command creation."""
-
- def __init__(self):
- self.err_message = ''
- self.cmd_params = ['curl',
- '--user-agent', 'datricurl',
- '--max-redirs', '5',
- '--max-time', '90',
- CURL_SILENT_OPTION,
- '-G']
- self._user_proxy()
- self._ca_path()
- # - url definition -
- self.url = 'https://%s:%s%s' % (PANDAMON_HOST, HTTPS_PORT, PANDAMON_URI)
-
- def _user_proxy(self):
- cert = os.environ.get('X509_USER_PROXY')
- if not cert:
- cert = '/tmp/x509up_u%s' % os.getuid()
- if not os.access(cert, os.R_OK):
- cert = None
- if cert:
- self.cmd_params.extend(['--cert', cert, '--cacert', cert])
- else:
- self.err_message += 'User proxy certificate is not defined; '
-
- def _ca_path(self):
- if os.environ.get('X509_CERT_DIR'):
- self.cmd_params.extend(['--capath', os.environ['X509_CERT_DIR']])
- else:
- self.err_message += 'CA-path is not defined; '
-
- # - method GET -
- def get(self, **kwargs):
- """Returns status code and response message
-
- @param kwargs (@type dict)
- parameters for DaTRI request definition (see PARAMS_LIST)
- @return (@type typle: int, str)
- returns status code and info (error) message
- """
- if not self.err_message:
- if not kwargs:
- return 2, 'datriCurl: input parameters are not defined'
- o, e = '', ' is not defined'
- # - several attempts for cmd execution - begin -
- cmd_params = (self.cmd_params +
- ['--url', '%s?%s' % (self.url, urlencode(kwargs))])
- for i in range(RETRY_NUM):
- o, e = execute(cmd_params)
- if o and not e:
- return (0, o) if o.startswith('OK.') else (1, o)
- # - several attempts for cmd execution - end -
- return 3, 'datriCurl: execution error (output=%s, error=%s)' % (o, e)
- return 5, 'datriCurl: %s' % self.err_message
-
-
-#######################################################################################
-# datriHandler - Status code definition: #
-# #
-# 0 - DaTRI request - CREATED SUCCESSFULLY #
-# #
-# 1 - DaTRI request - NOT CREATED [due to incorrect input data] #
-# datriHandler - EXECUTED SUCCESSFULLY #
-# #
-# 2 - DaTRI request - NOT CREATED #
-# datriHandler - FAILED [due to lack of input data at datriCurl.get] #
-# #
-# 3 - DaTRI request - NOT CREATED #
-# datriHandler - FAILED [due to failure at datriCurl.get] #
-# #
-# 4 - DaTRI request - NOT CREATED #
-# datriHandler - FAILED [due to lack of input data at datriHandler.setParameters] #
-# #
-# 5 - DaTRI request - NOT CREATED #
-# datriHandler - FAILED [due to failure at datriCurl] #
-#######################################################################################
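
For reference, a typical DaTRI caller constructed a datriHandler, set the request parameters, validated them with checkData, and only then submitted. A minimal sketch with placeholder dataset pattern, destination site and user DN; per the table above, status code 0 means the request was created:

# placeholder values for the dataset pattern, destination site and user DN
from datriHandler import datriHandler

handler = datriHandler(type='pathena')
handler.setParameters(data_pattern='user.someuser.test.dataset/',
                      site='CERN-PROD_SCRATCHDISK',
                      userid='/DC=ch/DC=cern/OU=Users/CN=someuser')
status, message = handler.checkData()
if status == 0:
    status, message = handler.sendRequest()
print status, message
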
diff --git a/current/pandaserver/dataservice/eventLookupClient.py b/current/pandaserver/dataservice/eventLookupClient.py
deleted file mode 100644
index b7ae3391a..000000000
--- a/current/pandaserver/dataservice/eventLookupClient.py
+++ /dev/null
@@ -1,201 +0,0 @@
-import urllib, re, string, os, time
-
-# client for eventLookup Athenaeum service
-# author: Marcin.Nowak@cern.ch
-
-class eventLookupClient:
-
- serverURL = "http://j2eeps.cern.ch/atlas-project-Athenaeum/"
- #serverURL = "http://j2eeps.cern.ch/test-Athenaeum/"
- #serverURL = "http://j2eeps.cern.ch/test-eventPicking/"
- lookupPage = "EventLookup.jsp"
- getPage = "EventLookupGet.jsp"
- key = "insider"
- workerHost = "atlas-tagservices.cern.ch"
- #workerHost = "atlddm10.cern.ch" #this is at the moment the real host aliased by atlas-tagservices
- #workerHost = "voatlas69.cern.ch"
- workerPort = '10004'
- connectionRefusedSleep = 20
- errorPattern = "(Exception)|(Error)|(Lookup cannot be run)|(invalid)|(NOT EXISTING)"
-
-
- def __init__(self):
- self.output = ""
- self.guids = {}
- self.guidsLine = ""
- self.certProxyFileName = None
- self.certProxy = ""
- self.debug = None
- self.remoteFile = None
- try:
- self.certProxyFileName = os.environ['X509_USER_PROXY']
- except KeyError:
- print 'You do not seem to have a certificate proxy! (do voms-proxy-init)'
- return
- proxy = open(self.certProxyFileName)
- try:
- for line in proxy:
- self.certProxy += line
- finally:
- proxy.close()
-
-
- def workerURL(self):
- if self.workerHost.find(":") > 0:
- # port number together with the host name, possibly from commandline option
- return "http://" + self.workerHost
- else:
- return "http://" + self.workerHost + ":" + self.workerPort
-
-
- def doLookup(self, inputEvents, async=None, stream="", tokens="",
- amitag="", extract=False):
- """ contact the server and return a list of GUIDs
- inputEvents - list of run-event pairs
- async - request query processing in a separate process, client will poll for results
- stream - stream
- tokens - token names
- amitag - used to select reprocessing pass (default empty means the latest)
- """
- if inputEvents == []:
- return []
-
- runs_events = ""
- runs = set()
- sep = ""
- for run_ev in inputEvents:
- runs_events += sep + run_ev[0] + " " + run_ev[1]
- sep = "\n"
- runs.add(run_ev[0]);
-
- if async is None:
- if len(runs) > 50 or len(inputEvents) > 1000:
- async = True
- if async:
- asyncStr = "true"
- else:
- asyncStr = "false"
-
- query_args = { 'key': self.key,
- 'worker': self.workerURL(),
- 'runs_events': runs_events,
- 'cert_proxy': self.certProxy,
- 'async': asyncStr,
- 'stream': stream,
- 'amitag': amitag,
- 'tokens': tokens
- }
- if extract:
- query_args['extract'] = "true"
-
- self.talkToServer(self.serverURL + self.lookupPage, query_args)
- if not async:
- for line in self.output:
- if re.search("502 Bad Gateway", line):
- # usually signifies a timeout on the J2EE server
- print "Timeout detected. Retrying in asynchronous mode"
- query_args['async'] = "true"
- self.talkToServer(self.serverURL + self.lookupPage, query_args)
- break
-
- self.remoteFile = None
- for line in self.output:
- m = re.search("FILE=(.+)$", line)
- if m:
- return self.waitForFile( m.group(1) )
-
- return self.scanOutputForGuids()
-
-
- def talkToServer(self, url, args):
- encoded_args = urllib.urlencode(args)
- if self.debug:
- print "Contacting URL: " + url
- print encoded_args
-
- for _try in range(1,6):
- response = urllib.urlopen(url, encoded_args)
- self.output = []
- retry = False
- for line in response:
- self.output.append(line)
- if re.search("Connection refused", line):
- retry = True
- if retry:
- if self.debug:
- print "Failed to connect to the server, try " + str(_try)
- time.sleep(self.connectionRefusedSleep)
- else:
- break
-
-
- def scanOutputForGuids(self):
- """ Scan the server output looking for a line with GUIDs
- return list of GUIDs if line found, put GUIDs in self.guids
- return None in case of errors
- """
- self.guids = {}
- self.tags = []
- self.tagAttributes = None
- stage = None
- tokpat = re.compile(r'[[]DB=(?P<FID>.*?)[]]')
- for line in self.output:
- if re.search(self.errorPattern, line, re.I):
- #print " -- Error line matched: " + line
- return None
- if stage == "readTags":
- if line[0:1] == ":":
- # break the line up into attributes, extract GUIDs
- values = []
- for attr in string.split(line[1:]):
- tok = tokpat.match(attr)
- if tok:
- attr = tok.group('FID')
- # self.guids - TODO - populate the guids dict
- values.append(attr)
- self.tags.append( values )
- continue
- else:
- return (self.tagAttributes, self.tags)
- if re.match("\{.*\}$", line):
- guids = eval(line)
- if type(guids).__name__!='dict':
- return None
- self.guids = guids
- return guids
- if re.search("TAGs extracted:", line):
- stage = "readAttribs"
- continue
- if stage == "readAttribs":
- self.tagAttributes = string.split(line.strip(),",")
- stage = "readTags"
- continue
- return None
-
-
- def waitForFile(self, file):
- """ Wait for the server to do EventLookup and store results in file
- Retrieve the file and scan for GUIDs - return them if found
- """
- query_args = { 'key': self.key,
- 'worker': self.workerURL(),
- 'file' : file,
- 'wait_time' : "45"
- }
- self.remoteFile = file
- if self.debug:
- print "EventLookup waiting for server. Remote file=" + file
-
- ready = False
- while not ready:
- self.talkToServer(self.serverURL + self.getPage, query_args)
- ready = True
- for line in self.output:
- if re.match("NOT READY", line):
- if self.debug:
- print "received NOT READY"
- time.sleep(1)
- ready = False
-
- return self.scanOutputForGuids()
-
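
eventLookupClient.doLookup takes a list of (run, event) string pairs and returns a GUID map (or TAG attributes when extract=True), with None signalling an error. A sketch with made-up run/event numbers; a valid grid proxy pointed to by X509_USER_PROXY is assumed:

# hypothetical run/event pairs and stream/token names
from eventLookupClient import eventLookupClient

client = eventLookupClient()
guids = client.doLookup([('200863', '12345'), ('200863', '67890')],
                        stream='physics_Muons', tokens='StreamAOD_ref')
if guids is None:
    print 'lookup failed; raw server output follows'
    for line in client.output:
        print line,
else:
    print guids
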
diff --git a/current/pandaserver/dataservice/forkSetupper.py b/current/pandaserver/dataservice/forkSetupper.py
deleted file mode 100755
index 415995de7..000000000
--- a/current/pandaserver/dataservice/forkSetupper.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import os
-import sys
-import commands
-
-# exec
-def run(inFile,v_onlyTA):
- import cPickle as pickle
- try:
- # read Jobs from file
- f = open(inFile)
- jobs = pickle.load(f)
- f.close()
- except:
- type, value, traceBack = sys.exc_info()
- print("run() : %s %s" % (type,value))
- return
- # password
- from config import panda_config
- passwd = panda_config.dbpasswd
- # initialize cx_Oracle using dummy connection
- from taskbuffer.Initializer import initializer
- initializer.init()
- # instantiate TB
- from taskbuffer.TaskBuffer import taskBuffer
- taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
- # run Setupper
- from dataservice.Setupper import Setupper
- thr = Setupper(taskBuffer,jobs,onlyTA=v_onlyTA,useNativeDQ2=True)
- thr.start()
- thr.join()
- return
-
-
-# exit action
-def _onExit(fname):
- commands.getoutput('rm -rf %s' % fname)
-
-
-####################################################################
-# main
-def main():
- import getopt
- import atexit
- # option class
- class _options:
- def __init__(self):
- pass
- options = _options()
- del _options
- # set default values
- options.inFile = ""
- options.onlyTA = False
- # get command-line parameters
- try:
- opts, args = getopt.getopt(sys.argv[1:],"i:t")
- except:
- print("ERROR : Invalid options")
- sys.exit(1)
- # set options
- for o, a in opts:
- if o in ("-i",):
- options.inFile = a
- if o in ("-t",):
- options.onlyTA = True
- # exit action
- atexit.register(_onExit,options.inFile)
- # run
- run(options.inFile,options.onlyTA)
- # return
- sys.exit(0)
-
-
-if __name__ == "__main__":
- main()
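
forkSetupper.py was meant to run as a child process: the parent pickles a list of job specs to a file and invokes the script with -i (input file) and optionally -t (task assignment only), as in the command assembled earlier in this diff. A hedged sketch of the parent side, with a hypothetical jobs list and a plain interpreter/path in place of the full grid environment the production command sets up:

# hypothetical parent-side invocation; `jobs` is a list of JobSpec objects
import commands
import cPickle as pickle

def fork_setupper(jobs, out_file_name, only_ta=False):
    # serialize the jobs for the child process
    out_file = open(out_file_name, 'w')
    pickle.dump(jobs, out_file)
    out_file.close()
    # build the command line; -t restricts the child to task assignment
    com = 'python forkSetupper.py -i %s' % out_file_name
    if only_ta:
        com += ' -t'
    return commands.getstatusoutput(com)
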
diff --git a/current/pandaserver/jobdispatcher/ErrorCode.py b/current/pandaserver/jobdispatcher/ErrorCode.py
deleted file mode 100755
index e58b1b444..000000000
--- a/current/pandaserver/jobdispatcher/ErrorCode.py
+++ /dev/null
@@ -1,11 +0,0 @@
-############## error code
-
-# Watcher
-EC_Watcher = 100
-
-# recovery failed
-EC_Recovery = 101
-
-# send failed
-EC_SendError = 102
-
diff --git a/current/pandaserver/jobdispatcher/JobDispatcher.py b/current/pandaserver/jobdispatcher/JobDispatcher.py
deleted file mode 100755
index 86f126921..000000000
--- a/current/pandaserver/jobdispatcher/JobDispatcher.py
+++ /dev/null
@@ -1,541 +0,0 @@
-"""
-dispatch jobs
-
-"""
-
-import re
-import types
-import threading
-import Protocol
-import time
-import datetime
-import commands
-from threading import Lock
-from config import panda_config
-from dataservice.Adder import Adder
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('JobDispatcher')
-_pilotReqLogger = PandaLogger().getLogger('PilotRequests')
-
-
-# a wrapper to install a timeout into a method
-class _TimedMethod:
- def __init__(self,method,timeout):
- self.method = method
- self.timeout = timeout
- self.result = Protocol.TimeOutToken
-
- # method emulation
- def __call__(self,*var):
- self.result = apply(self.method,var)
-
- # run
- def run(self,*var):
- thr = threading.Thread(target=self,args=var)
- # run thread
- thr.start()
- thr.join() #self.timeout)
-
-
-# job dispatcher
-class JobDipatcher:
- # constructor
- def __init__(self):
- # taskbuffer
- self.taskBuffer = None
- # DN/token map
- self.tokenDN = None
- # datetime of last updated
- self.lastUpdated = datetime.datetime.utcnow()
- # how frequently update DN/token map
- self.timeInterval = datetime.timedelta(seconds=180)
- # pilot owners
- self.pilotOwners = None
- # hostnames for authorization at grid-free sites
- self.allowedNodes = None
- # lock
- self.lock = Lock()
-
-
- # set task buffer
- def init(self,taskBuffer):
- # lock
- self.lock.acquire()
- # set TB
- if self.taskBuffer == None:
- self.taskBuffer = taskBuffer
- # update DN/token map
- if self.tokenDN == None:
- self.tokenDN = self.taskBuffer.getListSchedUsers()
- # get pilot owners
- if self.pilotOwners == None:
- self.pilotOwners = self.taskBuffer.getPilotOwners()
- # get allowed nodes
- if self.allowedNodes == None:
- self.allowedNodes = self.taskBuffer.getAllowedNodes()
- # release
- self.lock.release()
-
-
- # get job
- def getJob(self,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement,
- atlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup,allowOtherCountry):
- jobs = []
- # wrapper function for timeout
- tmpWrapper = _TimedMethod(self.taskBuffer.getJobs,timeout)
- tmpWrapper.run(1,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement,
- atlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup,allowOtherCountry)
- if isinstance(tmpWrapper.result,types.ListType):
- jobs = jobs + tmpWrapper.result
- # make response
- if len(jobs) > 0:
- proxyKey = jobs[-1]
- nSent = jobs[-2]
- jobs = jobs[:-2]
- if len(jobs) != 0:
- # succeed
- response=Protocol.Response(Protocol.SC_Success)
- # append Job
- response.appendJob(jobs[0])
- # append nSent
- response.appendNode('nSent',nSent)
- # set proxy key
- if getProxyKey:
- response.setProxyKey(proxyKey)
- else:
- if tmpWrapper.result == Protocol.TimeOutToken:
- # timeout
- response=Protocol.Response(Protocol.SC_TimeOut)
- else:
- # no available jobs
- response=Protocol.Response(Protocol.SC_NoJobs)
- # return
- _logger.debug("getJob : %s %s ret -> %s" % (siteName,node,response.encode()))
- return response.encode()
-
-
- # update job status
- def updateJob(self,jobID,jobStatus,timeout,xml,siteName,param,metadata,attemptNr=None,stdout=''):
- # retry failed analysis job and ddm job
- if jobStatus=='failed' \
- and ((param.has_key('pilotErrorCode') and (param['pilotErrorCode'] in ['1200','1201'] \
- or param['pilotErrorCode'].startswith('-'))) \
- or (siteName != None and siteName.find('DDM') != -1)):
- # retry
- if param.has_key('pilotErrorCode') and param['pilotErrorCode'].startswith('-'):
- # pilot retry with new PandaID
- ret = self.taskBuffer.retryJob(jobID,param,getNewPandaID=True,attemptNr=attemptNr)
- else:
- # old style
- ret = self.taskBuffer.retryJob(jobID,param,attemptNr=attemptNr)
- if ret:
- # return succeed
- response=Protocol.Response(Protocol.SC_Success)
- return response.encode()
- # add metadata
- if metadata != '':
- self.taskBuffer.addMetadata([jobID],[metadata])
- # add stdout
- if stdout != '':
- self.taskBuffer.addStdOut(jobID,stdout)
- # update
- tmpStatus = jobStatus
- updateStateChange = False
- if jobStatus == 'failed' or jobStatus == 'finished':
- tmpStatus = 'holding'
- # update stateChangeTime to prevent Watcher from finding this job
- updateStateChange = True
- if tmpStatus == 'holding':
- tmpWrapper = _TimedMethod(self.taskBuffer.updateJobStatus,None)
- else:
- tmpWrapper = _TimedMethod(self.taskBuffer.updateJobStatus,timeout)
- tmpWrapper.run(jobID,tmpStatus,param,updateStateChange,attemptNr)
- # make response
- if tmpWrapper.result == Protocol.TimeOutToken:
- # timeout
- response=Protocol.Response(Protocol.SC_TimeOut)
- else:
- if tmpWrapper.result:
- # succeed
- response=Protocol.Response(Protocol.SC_Success)
- # set command
- if isinstance(tmpWrapper.result,types.StringType):
- response.appendNode('command',tmpWrapper.result)
- else:
- response.appendNode('command','NULL')
- # add output to dataset
- if tmpWrapper.result != "badattemptnr" and (jobStatus == 'failed' or jobStatus == 'finished'):
- Adder(self.taskBuffer,jobID,xml,jobStatus,attemptNr=attemptNr).start()
- else:
- # failed
- response=Protocol.Response(Protocol.SC_Failed)
- _logger.debug("updateJob : %s ret -> %s" % (jobID,response.encode()))
- return response.encode()
-
-
- # get job status
- def getStatus(self,strIDs,timeout):
- # convert str to list
- ids = strIDs.split()
- # peek jobs
- tmpWrapper = _TimedMethod(self.taskBuffer.peekJobs,timeout)
- tmpWrapper.run(ids,False,True,True,False)
- # make response
- if tmpWrapper.result == Protocol.TimeOutToken:
- # timeout
- response=Protocol.Response(Protocol.SC_TimeOut)
- else:
- if isinstance(tmpWrapper.result,types.ListType):
- # succeed
- response=Protocol.Response(Protocol.SC_Success)
- # make return
- retStr = ''
- attStr = ''
- for job in tmpWrapper.result:
- if job == None:
- retStr += '%s+' % 'notfound'
- attStr += '0+'
- else:
- retStr += '%s+' % job.jobStatus
- attStr += '%s+' % job.attemptNr
- response.appendNode('status',retStr[:-1])
- response.appendNode('attemptNr',attStr[:-1])
- else:
- # failed
- response=Protocol.Response(Protocol.SC_Failed)
- _logger.debug("getStatus : %s ret -> %s" % (strIDs,response.encode()))
- return response.encode()
-
-
- # get DN/token map
- def getDnTokenMap(self):
- # get current datetime
- current = datetime.datetime.utcnow()
- # lock
- self.lock.acquire()
- # update DN map if old
- if current-self.lastUpdated > self.timeInterval:
- # get new map
- self.tokenDN = self.taskBuffer.getListSchedUsers()
- # reset
- self.lastUpdated = current
- # release
- self.lock.release()
- # return
- return self.tokenDN
-
-
- # generate pilot token
- def genPilotToken(self,schedulerhost,scheduleruser,schedulerid):
- retVal = self.taskBuffer.genPilotToken(schedulerhost,scheduleruser,schedulerid)
- # failed
- if retVal == None:
- return "ERROR : failed to generate token"
- return "SUCCEEDED : " + retVal
-
-
-# Singleton
-jobDispatcher = JobDipatcher()
-del JobDipatcher
-
-
-# get FQANs
-def _getFQAN(req):
- fqans = []
- for tmpKey,tmpVal in req.subprocess_env.iteritems():
- # compact credentials
- if tmpKey.startswith('GRST_CRED_'):
- # VOMS attribute
- if tmpVal.startswith('VOMS'):
- # FQAN
- fqan = tmpVal.split()[-1]
- # append
- fqans.append(fqan)
- # old style
- elif tmpKey.startswith('GRST_CONN_'):
- tmpItems = tmpVal.split(':')
- # FQAN
- if len(tmpItems)==2 and tmpItems[0]=='fqan':
- fqans.append(tmpItems[-1])
- # return
- return fqans
-
-
-# check role
-def _checkRole(fqans,dn,jdCore,withVomsPatch=True,site='',hostname=''):
- prodManager = False
- try:
- # VOMS attributes of production and pilot roles
- prodAttrs = ['/atlas/usatlas/Role=production',
- '/atlas/usatlas/Role=pilot',
- '/atlas/Role=production',
- '/atlas/Role=pilot',
- '/osg/Role=pilot',
- '/Engage/LBNE/Role=pilot',
- ]
- if withVomsPatch:
- # FIXME once http://savannah.cern.ch/bugs/?47136 is solved
- prodAttrs += ['/atlas/']
- prodAttrs += ['/osg/']
- prodAttrs += ['/Engage/LBNE/']
- for fqan in fqans:
- # check atlas/usatlas production role
- for rolePat in prodAttrs:
- if fqan.startswith(rolePat):
- prodManager = True
- break
- # escape
- if prodManager:
- break
- # service proxy for CERNVM
- if site in ['CERNVM']:
- serviceSubjects = ['/DC=ch/DC=cern/OU=computers/CN=pilot/copilot.cern.ch']
- for tmpSub in serviceSubjects:
- if dn.startswith(tmpSub):
- prodManager = True
- break
- # grid-free authorization
- if not prodManager:
- if hostname != '' and jdCore.allowedNodes.has_key(site):
- for tmpPat in jdCore.allowedNodes[site]:
- if re.search(tmpPat,hostname) != None:
- prodManager = True
- break
- # check DN with pilotOwners
- if (not prodManager) and (not dn in [None]):
- for owner in jdCore.pilotOwners:
- # check
- if re.search(owner,dn) != None:
- prodManager = True
- break
- except:
- pass
- # return
- return prodManager
-
-
-# get DN
-def _getDN(req):
- realDN = None
- if req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- realDN = req.subprocess_env['SSL_CLIENT_S_DN']
- # remove redundant CN
- realDN = re.sub('/CN=limited proxy','',realDN)
- realDN = re.sub('/CN=proxy(/CN=proxy)+','/CN=proxy',realDN)
- # return
- return realDN
-
-
-# check token
-def _checkToken(token,jdCore):
- # don't check None until all pilots use tokens
- if token == None:
- return True
- # get map
- tokenDN = jdCore.getDnTokenMap()
- # return
- return tokenDN.has_key(token)
-
-
-
-"""
-web service interface
-
-"""
-
-# get job
-def getJob(req,siteName,token=None,timeout=60,cpu=None,mem=None,diskSpace=None,prodSourceLabel=None,node=None,
- computingElement=None,AtlasRelease=None,prodUserID=None,getProxyKey=None,countryGroup=None,
- workingGroup=None,allowOtherCountry=None):
- _logger.debug("getJob(%s)" % siteName)
- # get DN
- realDN = _getDN(req)
- # get FQANs
- fqans = _getFQAN(req)
- # check production role
- if getProxyKey == 'True':
- # don't use /atlas to prevent normal proxy getting credname
- prodManager = _checkRole(fqans,realDN,jobDispatcher,False,site=siteName)
- else:
- prodManager = _checkRole(fqans,realDN,jobDispatcher,site=siteName,
- hostname=req.get_remote_host())
- # check token
- validToken = _checkToken(token,jobDispatcher)
- # set DN for non-production user
- if not prodManager:
- prodUserID = realDN
- # allow getProxyKey for production role
- if getProxyKey == 'True' and prodManager:
- getProxyKey = True
- else:
- getProxyKey = False
- # convert mem and diskSpace
- try:
- mem = int(float(mem))
- if mem < 0:
- mem = 0
- except:
- mem = 0
- try:
- diskSpace = int(float(diskSpace))
- if diskSpace < 0:
- diskSpace = 0
- except:
- diskSpace = 0
- _logger.debug("getJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s)" \
- % (siteName,cpu,mem,diskSpace,prodSourceLabel,node,
- computingElement,AtlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup,
- allowOtherCountry,realDN,prodManager,token,validToken,str(fqans)))
- _pilotReqLogger.info('method=getJob,site=%s,node=%s,type=%s' % (siteName,node,prodSourceLabel))
- # invalid role
- if (not prodManager) and (not prodSourceLabel in ['user']):
- _logger.warning("getJob(%s) : invalid role" % siteName)
- return Protocol.Response(Protocol.SC_Role).encode()
- # invalid token
- if not validToken:
- _logger.warning("getJob(%s) : invalid token" % siteName)
- return Protocol.Response(Protocol.SC_Invalid).encode()
- # invoke JD
- return jobDispatcher.getJob(siteName,prodSourceLabel,cpu,mem,diskSpace,node,int(timeout),
- computingElement,AtlasRelease,prodUserID,getProxyKey,countryGroup,
- workingGroup,allowOtherCountry)
-
-
-# update job status
-def updateJob(req,jobId,state,token=None,transExitCode=None,pilotErrorCode=None,pilotErrorDiag=None,timestamp=None,timeout=60,
- xml='',node=None,workdir=None,cpuConsumptionTime=None,cpuConsumptionUnit=None,remainingSpace=None,
- schedulerID=None,pilotID=None,siteName=None,messageLevel=None,pilotLog='',metaData='',
- cpuConversionFactor=None,exeErrorCode=None,exeErrorDiag=None,pilotTiming=None,computingElement=None,
- startTime=None,endTime=None,nEvents=None,nInputFiles=None,batchID=None,attemptNr=None,jobMetrics=None,
- stdout=''):
- _logger.debug("updateJob(%s)" % jobId)
- # get DN
- realDN = _getDN(req)
- # get FQANs
- fqans = _getFQAN(req)
- # check production role
- prodManager = _checkRole(fqans,realDN,jobDispatcher,site=siteName,hostname=req.get_remote_host())
- # check token
- validToken = _checkToken(token,jobDispatcher)
- _logger.debug("updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,attemptNr:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)" %
- (jobId,state,transExitCode,pilotErrorCode,pilotErrorDiag,node,workdir,cpuConsumptionTime,
- cpuConsumptionUnit,remainingSpace,schedulerID,pilotID,siteName,messageLevel,nEvents,nInputFiles,
- cpuConversionFactor,exeErrorCode,exeErrorDiag,pilotTiming,computingElement,startTime,endTime,
- batchID,attemptNr,realDN,prodManager,token,validToken,str(fqans),xml,pilotLog,metaData,jobMetrics,
- stdout))
- _pilotReqLogger.info('method=updateJob,site=%s,node=%s,type=None' % (siteName,node))
- # invalid role
- if not prodManager:
- _logger.warning("updateJob(%s) : invalid role" % jobId)
- return Protocol.Response(Protocol.SC_Role).encode()
- # invalid token
- if not validToken:
- _logger.warning("updateJob(%s) : invalid token" % jobId)
- return Protocol.Response(Protocol.SC_Invalid).encode()
- # aborting message
- if jobId=='NULL':
- return Protocol.Response(Protocol.SC_Success).encode()
- # check status
- if not state in ['running','failed','finished','holding','starting','transferring']:
- _logger.warning("invalid state=%s for updateJob" % state)
- return Protocol.Response(Protocol.SC_Success).encode()
- # pilot log
- if pilotLog != '':
- try:
- # make message
- message = pilotLog
- # get logger
- _pandaLogger = PandaLogger()
- _pandaLogger.lock()
- _pandaLogger.setParams({'Type':'pilotLog','PandaID':int(jobId)})
- logger = _pandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- logger.info(message)
- # release HTTP handler
- _pandaLogger.release()
- except:
- pass
- # create parameter map
- param = {}
- if cpuConsumptionTime != None:
- param['cpuConsumptionTime']=cpuConsumptionTime
- if cpuConsumptionUnit != None:
- param['cpuConsumptionUnit']=cpuConsumptionUnit
- if node != None:
- param['modificationHost']=node
- if transExitCode != None:
- param['transExitCode']=transExitCode
- if pilotErrorCode != None:
- param['pilotErrorCode']=pilotErrorCode
- if pilotErrorDiag != None:
- param['pilotErrorDiag']=pilotErrorDiag[:500]
- if jobMetrics != None:
- param['jobMetrics']=jobMetrics[:500]
- if schedulerID != None:
- param['schedulerID']=schedulerID
- if pilotID != None:
- param['pilotID']=pilotID[:200]
- if batchID != None:
- param['batchID']=batchID
- if exeErrorCode != None:
- param['exeErrorCode']=exeErrorCode
- if exeErrorDiag != None:
- param['exeErrorDiag']=exeErrorDiag[:500]
- if cpuConversionFactor != None:
- param['cpuConversion']=cpuConversionFactor
- if pilotTiming != None:
- param['pilotTiming']=pilotTiming
- if computingElement != None:
- param['computingElement']=computingElement
- if nEvents != None:
- param['nEvents']=nEvents
- if nInputFiles != None:
- param['nInputFiles']=nInputFiles
- if startTime != None:
- try:
- param['startTime']=datetime.datetime(*time.strptime(startTime,'%Y-%m-%d %H:%M:%S')[:6])
- except:
- pass
- if endTime != None:
- try:
- param['endTime']=datetime.datetime(*time.strptime(endTime,'%Y-%m-%d %H:%M:%S')[:6])
- except:
- pass
- if attemptNr != None:
- try:
- attemptNr = int(attemptNr)
- except:
- attemptNr = None
- if stdout != '':
- stdout = stdout[:2048]
- # invoke JD
- return jobDispatcher.updateJob(int(jobId),state,int(timeout),xml,siteName,
- param,metaData,attemptNr,stdout)
-
-
-# get job status
-def getStatus(req,ids,timeout=60):
- _logger.debug("getStatus(%s)" % ids)
- return jobDispatcher.getStatus(ids,int(timeout))
-
-
-# generate pilot token
-def genPilotToken(req,schedulerid,host=None):
- # get DN
- realDN = _getDN(req)
- # get FQANs
- fqans = _getFQAN(req)
- # check production role
- prodManager = _checkRole(fqans,realDN,jobDispatcher,False)
- if not prodManager:
- return "ERROR : production or pilot role is required"
- if realDN == None:
- return "ERROR : failed to retrieve DN"
- # hostname
- if host == None:
- host = req.get_remote_host()
- # return
- return jobDispatcher.genPilotToken(host,realDN,schedulerid)
-
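
_TimedMethod above is the building block for these calls: it runs a TaskBuffer method in a worker thread and leaves Protocol.TimeOutToken in place when no result ever arrives (note that in this deleted version the join() timeout is actually commented out, so the caller waits indefinitely). A self-contained Python 3 sketch of the same pattern with the timeout honoured; the names and the slow dummy call are illustrative:

import threading
import time

TIMEOUT_TOKEN = "TimeOut"  # sentinel, mirroring Protocol.TimeOutToken

class TimedMethod:
    """Run a callable in a thread and give up after `timeout` seconds."""
    def __init__(self, method, timeout):
        self.method = method
        self.timeout = timeout
        self.result = TIMEOUT_TOKEN

    def __call__(self, *args):
        # executed inside the worker thread; overwrites the sentinel on success
        self.result = self.method(*args)

    def run(self, *args):
        thr = threading.Thread(target=self, args=args, daemon=True)
        thr.start()
        thr.join(self.timeout)   # unlike the deleted code, honour the timeout

slow_call = TimedMethod(lambda: time.sleep(5) or "rows", timeout=1)
slow_call.run()
print(slow_call.result)          # prints "TimeOut": the worker was too slow
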
diff --git a/current/pandaserver/jobdispatcher/Protocol.py b/current/pandaserver/jobdispatcher/Protocol.py
deleted file mode 100755
index 42cbd9d4d..000000000
--- a/current/pandaserver/jobdispatcher/Protocol.py
+++ /dev/null
@@ -1,212 +0,0 @@
-import urllib
-
-
-# constants
-TimeOutToken = "TimeOut"
-NoJobsToken = "NoJobs"
-
-########### status codes
-# succeeded
-SC_Success = 0
-# timeout
-SC_TimeOut = 10
-# no available jobs
-SC_NoJobs = 20
-# failed
-SC_Failed = 30
-# Not secure connection
-SC_NonSecure = 40
-# invalid token
-SC_Invalid = 50
-# invalid role
-SC_Role = 60
-
-
-# response
-class Response:
- # constructor
- def __init__(self,statusCode):
- # create data object
- self.data = {'StatusCode':statusCode}
-
-
- # URL encode
- def encode(self):
- return urllib.urlencode(self.data)
-
-
- # append Node
- def appendNode(self,name,value):
- self.data[name]=value
-
-
- # append job
- def appendJob(self,job):
- # PandaID
- self.data['PandaID'] = job.PandaID
- # prodSourceLabel
- self.data['prodSourceLabel'] = job.prodSourceLabel
- # swRelease
- self.data['swRelease'] = job.AtlasRelease
- # homepackage
- self.data['homepackage'] = job.homepackage
- # transformation
- self.data['transformation'] = job.transformation
- # job name
- self.data['jobName'] = job.jobName
- # job definition ID
- self.data['jobDefinitionID'] = job.jobDefinitionID
- # cloud
- self.data['cloud'] = job.cloud
- # files
- strIFiles = ''
- strOFiles = ''
- strDispatch = ''
- strDisToken = ''
- strDisTokenForOutput = ''
- strDestination = ''
- strRealDataset = ''
- strRealDatasetIn = ''
- strDestToken = ''
- strProdToken = ''
- strGUID = ''
- strFSize = ''
- strCheckSum = ''
- strScopeIn = ''
- strScopeOut = ''
- strScopeLog = ''
- logFile = ''
- logGUID = ''
- for file in job.Files:
- if file.type == 'input':
- if strIFiles != '':
- strIFiles += ','
- strIFiles += file.lfn
- if strDispatch != '':
- strDispatch += ','
- strDispatch += file.dispatchDBlock
- if strDisToken != '':
- strDisToken += ','
- strDisToken += file.dispatchDBlockToken
- if strProdToken != '':
- strProdToken += ','
- strProdToken += file.prodDBlockToken
- if strGUID != '':
- strGUID += ','
- strGUID += file.GUID
- strRealDatasetIn += '%s,' % file.dataset
- strFSize += '%s,' % file.fsize
- if not file.checksum in ['','NULL',None]:
- strCheckSum += '%s,' % file.checksum
- else:
- strCheckSum += '%s,' % file.md5sum
- strScopeIn += '%s,' % file.scope
- if file.type == 'output' or file.type == 'log':
- if strOFiles != '':
- strOFiles += ','
- strOFiles += file.lfn
- if strDestination != '':
- strDestination += ','
- strDestination += file.destinationDBlock
- if strRealDataset != '':
- strRealDataset += ','
- strRealDataset += file.dataset
- if file.type == 'log':
- logFile = file.lfn
- logGUID = file.GUID
- strScopeLog = file.scope
- else:
- strScopeOut += '%s,' % file.scope
- if strDestToken != '':
- strDestToken += ','
- strDestToken += file.destinationDBlockToken.split(',')[0]
- strDisTokenForOutput += '%s,' % file.dispatchDBlockToken
- # inFiles
- self.data['inFiles'] = strIFiles
- # dispatch DBlock
- self.data['dispatchDblock'] = strDispatch
- # dispatch DBlock space token
- self.data['dispatchDBlockToken'] = strDisToken
- # dispatch DBlock space token for output
- self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1]
- # outFiles
- self.data['outFiles'] = strOFiles
- # destination DBlock
- self.data['destinationDblock'] = strDestination
- # destination DBlock space token
- self.data['destinationDBlockToken'] = strDestToken
- # prod DBlock space token
- self.data['prodDBlockToken'] = strProdToken
- # real output datasets
- self.data['realDatasets'] = strRealDataset
- # real input datasets
- self.data['realDatasetsIn'] = strRealDatasetIn[:-1]
- # log filename
- self.data['logFile'] = logFile
- # log GUID
- self.data['logGUID'] = logGUID
- # jobPars
- self.data['jobPars'] = job.jobParameters
- # attempt number
- self.data['attemptNr'] = job.attemptNr
- # GUIDs
- self.data['GUID'] = strGUID
- # checksum
- self.data['checksum'] = strCheckSum[:-1]
- # fsize
- self.data['fsize'] = strFSize[:-1]
- # scope
- self.data['scopeIn'] = strScopeIn[:-1]
- self.data['scopeOut'] = strScopeOut[:-1]
- self.data['scopeLog'] = strScopeLog
- # destinationSE
- self.data['destinationSE'] = job.destinationSE
- # user ID
- self.data['prodUserID'] = job.prodUserID
- # CPU count
- self.data['maxCpuCount'] = job.maxCpuCount
- # RAM count
- self.data['minRamCount'] = job.minRamCount
- # disk count
- self.data['maxDiskCount'] = job.maxDiskCount
- # cmtconfig
- self.data['cmtConfig'] = job.cmtConfig
- # processingType
- self.data['processingType'] = job.processingType
- # transferType
- self.data['transferType'] = job.transferType
- # current priority
- self.data['currentPriority'] = job.currentPriority
- # taskID
- self.data['taskID'] = job.taskID
- # debug mode
- if job.specialHandling != None and 'debug' in job.specialHandling:
- self.data['debug'] = 'True'
-
-
- # set proxy key
- def setProxyKey(self,proxyKey):
- names = ['credname','myproxy']
- for name in names:
- if proxyKey.has_key(name):
- self.data[name] = proxyKey[name]
- else:
- self.data[name] = ''
-
-
-# check if secure connection
-def isSecure(req):
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return False
- return True
-
-
-# get user DN
-def getUserDN(req):
- try:
- return req.subprocess_env['SSL_CLIENT_S_DN']
- except:
- return 'None'
-
-
-
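
Response.encode() above returns nothing more than a URL-encoded key/value string, so the pilot side only needs ordinary form decoding. A small Python 3 round-trip sketch (urllib.parse is the modern home of the calls used here; the field names follow appendJob/appendNode, the values are invented):

from urllib.parse import urlencode, parse_qs

SC_Success = 0  # mirrored from Protocol.py

# roughly what Response.encode() produces for a successful getJob answer
payload = urlencode({"StatusCode": SC_Success,
                     "PandaID": 1234567,
                     "nSent": 1,
                     "jobPars": "-i in.pool.root -o out.pool.root"})

# pilot-side decoding: parse_qs returns a list per key, so take the first item
decoded = {k: v[0] for k, v in parse_qs(payload, keep_blank_values=True).items()}
assert int(decoded["StatusCode"]) == SC_Success
print(decoded["PandaID"], decoded["jobPars"])
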
diff --git a/current/pandaserver/jobdispatcher/Watcher.py b/current/pandaserver/jobdispatcher/Watcher.py
deleted file mode 100755
index f07e6d922..000000000
--- a/current/pandaserver/jobdispatcher/Watcher.py
+++ /dev/null
@@ -1,172 +0,0 @@
-'''
-watch job
-
-'''
-
-import re
-import sys
-import time
-import commands
-import datetime
-import threading
-import ErrorCode
-
-import taskbuffer.ErrorCode
-
-from brokerage.PandaSiteIDs import PandaSiteIDs
-
-from dataservice.Closer import Closer
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('Watcher')
-
-
-class Watcher (threading.Thread):
- # constructor
- def __init__(self,taskBuffer,pandaID,single=False,sleepTime=360,sitemapper=None):
- threading.Thread.__init__(self)
- self.pandaID = pandaID
- self.taskBuffer = taskBuffer
- self.sleepTime = sleepTime
- self.single = single
- self.siteMapper = sitemapper
-
- # main
- def run(self):
- try:
- while True:
- _logger.debug('%s start' % self.pandaID)
- # query job
- job = self.taskBuffer.peekJobs([self.pandaID],fromDefined=False,
- fromArchived=False,fromWaiting=False)[0]
- # check job status
- if job == None or (not job.jobStatus in ['running','sent','starting','holding',
- 'stagein','stageout']):
- _logger.debug('%s escape : %s' % (self.pandaID,job.jobStatus))
- return
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime)
- if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit):
- _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID,job.jobStatus,
- str(job.modificationTime),
- str(job.endTime)))
- destDBList = []
- # retry analysis jobs
- if (job.prodSourceLabel in ['user','panda']) and (job.attemptNr<2 or job.jobStatus == 'sent') \
- and job.commandToPilot != 'tobekilled' and (not job.processingType in ['ITB_INTEGRATION']) \
- and not job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_Reassigned,
- taskbuffer.ErrorCode.EC_Retried,
- taskbuffer.ErrorCode.EC_PilotRetried] \
- and not job.processingType.startswith('gangarobot') \
- and not job.processingType.startswith('hammercloud'):
- # reset
- _logger.debug(' -> reset %s job with %s : PandaID:%s #%s' % (job.prodSourceLabel,job.jobStatus,job.PandaID,job.attemptNr))
- job.jobStatus = 'activated'
- job.startTime = None
- job.endTime = None
- job.attemptNr = job.attemptNr + 1
- # remove the flag for pledge-resource handling
- if not job.specialHandling in [None,'NULL','']:
- newSpecialHandling = re.sub(',*localpool','',job.specialHandling)
- if newSpecialHandling == '':
- job.specialHandling = None
- else:
- job.specialHandling = newSpecialHandling
- # TEMPORARY : send it to long queue
- oldComputingSite = job.computingSite
- if job.jobStatus != 'sent' and job.computingSite.startswith('ANALY') and (not job.computingSite.startswith('ANALY_LONG_')):
- tmpLongSiteList = []
- tmpLongSite = re.sub('^ANALY_','ANALY_LONG_',job.computingSite)
- tmpLongSite = re.sub('_\d+$','',tmpLongSite)
- tmpLongSiteList.append(tmpLongSite)
- tmpLongSite = job.computingSite + '_LONG'
- tmpLongSiteList.append(tmpLongSite)
- tmpLongSite = re.sub('SHORT','LONG',job.computingSite)
- if tmpLongSite != job.computingSite:
- tmpLongSiteList.append(tmpLongSite)
- for longSite in tmpLongSiteList:
- if self.siteMapper.checkSite(longSite):
- tmpSiteSpec = self.siteMapper.getSite(longSite)
- if tmpSiteSpec.status == 'online':
- job.computingSite = longSite
- _logger.debug(' -> sending PandaID:%s to %s' % (job.PandaID,job.computingSite))
- # set destinationSE
- if job.destinationSE == oldComputingSite:
- job.destinationSE = job.computingSite
- break
- # modify LFNs and destinationSE
- for file in job.Files:
- modTypes = ('output','log')
- if file.type in modTypes:
- # set destinationSE
- if file.destinationSE == oldComputingSite:
- file.destinationSE = job.computingSite
- if job.prodSourceLabel == 'panda':
- # doesn't change output for buildJob
- modTypes = ('log',)
- if file.type in modTypes:
- # set new GUID
- if file.type == 'log':
- file.GUID = commands.getoutput('uuidgen')
- # add attempt nr
- oldName = file.lfn
- file.lfn = re.sub("\.\d+$","",file.lfn)
- file.lfn = "%s.%d" % (file.lfn,job.attemptNr)
- newName = file.lfn
- # modify jobParameters
- sepPatt = "(\'|\"|%20)" + oldName + "(\'|\"|%20)"
- matches = re.findall(sepPatt,job.jobParameters)
- for match in matches:
- oldPatt = match[0]+oldName+match[-1]
- newPatt = match[0]+newName+match[-1]
- job.jobParameters = re.sub(oldPatt,newPatt,job.jobParameters)
- else:
- if job.jobStatus == 'sent':
- # sent job didn't receive reply from pilot within 30 min
- job.jobDispatcherErrorCode = ErrorCode.EC_SendError
- job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min"
- elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL':
- # lost heartbeat
- job.jobDispatcherErrorCode = ErrorCode.EC_Watcher
- if job.jobDispatcherErrorDiag == 'NULL':
- if job.endTime == 'NULL':
- # normal lost heartbeat
- job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime)
- else:
- # job recovery failed
- job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime)
- else:
- # job recovery failed
- job.jobDispatcherErrorCode = ErrorCode.EC_Recovery
- job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime/60)
- # set job status
- job.jobStatus = 'failed'
- # set endTime for lost heartbeat
- if job.endTime == 'NULL':
- # normal lost heartbeat
- job.endTime = job.modificationTime
- # set files status
- for file in job.Files:
- if file.type == 'output' or file.type == 'log':
- file.status = 'failed'
- if not file.destinationDBlock in destDBList:
- destDBList.append(file.destinationDBlock)
- # update job
- self.taskBuffer.updateJobs([job],False)
- # start closer
- if job.jobStatus == 'failed':
- cThr = Closer(self.taskBuffer,destDBList,job)
- cThr.start()
- cThr.join()
- _logger.debug('%s end' % job.PandaID)
- return
- # single action
- if self.single:
- return
- # sleep
- time.sleep(60*self.sleepTime)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("run() : %s %s" % (type,value))
- return
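
The heart of the Watcher is a staleness test: a job whose modificationTime (or endTime, when set) is older than now minus sleepTime minutes is treated as having lost its heartbeat and is failed or retried. The check reduced to a stand-alone Python 3 sketch, with a hypothetical job object:

import datetime

def lost_heartbeat(job, sleep_time_minutes=360):
    """True if the job has not been touched within the time limit."""
    time_limit = datetime.datetime.utcnow() - datetime.timedelta(minutes=sleep_time_minutes)
    if job.modificationTime < time_limit:
        return True
    # endTime is the string 'NULL' while unset in the deleted code, so guard it
    return job.endTime != 'NULL' and job.endTime < time_limit
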
diff --git a/current/pandaserver/jobdispatcher/__init__.py b/current/pandaserver/jobdispatcher/__init__.py
deleted file mode 100755
index e69de29bb..000000000
diff --git a/current/pandaserver/server/panda.py b/current/pandaserver/server/panda.py
deleted file mode 100755
index d8d9d4991..000000000
--- a/current/pandaserver/server/panda.py
+++ /dev/null
@@ -1,180 +0,0 @@
-#!/usr/bin/python2.5
-
-"""
-entry point
-
-"""
-
-# config file
-from config import panda_config
-
-# initialize cx_Oracle using dummy connection
-from taskbuffer.Initializer import initializer
-initializer.init()
-
-# initialize TaskBuffer
-from taskbuffer.TaskBuffer import taskBuffer
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,panda_config.nDBConnection,True)
-
-# initialize JobDispatcher
-from jobdispatcher.JobDispatcher import jobDispatcher
-if panda_config.nDBConnection != 0:
- jobDispatcher.init(taskBuffer)
-
-# initialize DataService
-from dataservice.DataService import dataService
-if panda_config.nDBConnection != 0:
- dataService.init(taskBuffer)
-
-# initialize UserIF
-from userinterface.UserIF import userIF
-if panda_config.nDBConnection != 0:
- userIF.init(taskBuffer)
-
-# import web I/F
-allowedMethods = []
-
-from taskbuffer.Utils import isAlive,putFile,deleteFile,getServer,updateLog,fetchLog,\
- touchFile,getVomsAttr,putEventPickingRequest,getAttr,getFile
-allowedMethods += ['isAlive','putFile','deleteFile','getServer','updateLog','fetchLog',
- 'touchFile','getVomsAttr','putEventPickingRequest','getAttr','getFile']
-
-from dataservice.DataService import datasetCompleted,updateFileStatusInDisp
-allowedMethods += ['datasetCompleted','updateFileStatusInDisp']
-
-from jobdispatcher.JobDispatcher import getJob,updateJob,getStatus,genPilotToken
-allowedMethods += ['getJob','updateJob','getStatus','genPilotToken']
-
-from userinterface.UserIF import submitJobs,getJobStatus,queryPandaIDs,killJobs,reassignJobs,\
- getJobStatistics,getJobStatisticsPerSite,resubmitJobs,queryLastFilesInDataset,getPandaIDsSite,\
- getJobsToBeUpdated,updateProdDBUpdateTimes,runTaskAssignment,getAssigningTask,getSiteSpecs,\
- getCloudSpecs,runBrokerage,seeCloudTask,queryJobInfoPerCloud,registerProxyKey,getProxyKey,\
- getJobIDsInTimeRange,getPandIDsWithJobID,getFullJobStatus,getJobStatisticsForBamboo,\
- getNUserJobs,addSiteAccess,listSiteAccess,getFilesInUseForAnal,updateSiteAccess,\
- getPandaClientVer,getSlimmedFileInfoPandaIDs,runReBrokerage,deleteFilesFromCacheDB,\
- addFilesToCacheDB,flushCacheDB,checkFilesWithCacheDB,getQueuedAnalJobs,getHighestPrioJobStat,\
- getActiveDatasets,setCloudTaskByUser,getSerialNumberForGroupJob,getCachePrefixes,\
- checkMergeGenerationStatus,sendLogInfo,getNumPilots,retryFailedJobsInActive,\
- getJobStatisticsWithLabel,getPandaIDwithJobExeID,getJobStatisticsPerUserSite,\
- getDisInUseForAnal,getLFNsInUseForAnal,getScriptOfflineRunning,setDebugMode,\
- insertSandboxFileInfo,checkSandboxFile,changeJobPriorities
-allowedMethods += ['submitJobs','getJobStatus','queryPandaIDs','killJobs','reassignJobs',
- 'getJobStatistics','getJobStatisticsPerSite','resubmitJobs','queryLastFilesInDataset','getPandaIDsSite',
- 'getJobsToBeUpdated','updateProdDBUpdateTimes','runTaskAssignment','getAssigningTask','getSiteSpecs',
- 'getCloudSpecs','runBrokerage','seeCloudTask','queryJobInfoPerCloud','registerProxyKey','getProxyKey',
- 'getJobIDsInTimeRange','getPandIDsWithJobID','getFullJobStatus','getJobStatisticsForBamboo',
- 'getNUserJobs','addSiteAccess','listSiteAccess','getFilesInUseForAnal','updateSiteAccess',
- 'getPandaClientVer','getSlimmedFileInfoPandaIDs','runReBrokerage','deleteFilesFromCacheDB',
- 'addFilesToCacheDB','flushCacheDB','checkFilesWithCacheDB','getQueuedAnalJobs','getHighestPrioJobStat',
- 'getActiveDatasets','setCloudTaskByUser','getSerialNumberForGroupJob','getCachePrefixes',
- 'checkMergeGenerationStatus','sendLogInfo','getNumPilots','retryFailedJobsInActive',
- 'getJobStatisticsWithLabel','getPandaIDwithJobExeID','getJobStatisticsPerUserSite',
- 'getDisInUseForAnal','getLFNsInUseForAnal','getScriptOfflineRunning','setDebugMode',
- 'insertSandboxFileInfo','checkSandboxFile','changeJobPriorities']
-
-# import error
-import taskbuffer.ErrorCode
-
-
-# FastCGI/WSGI entry
-if panda_config.useFastCGI or panda_config.useWSGI:
-
- import os
- import cgi
- import sys
- from pandalogger.PandaLogger import PandaLogger
-
- # logger
- _logger = PandaLogger().getLogger('Entry')
-
- # dummy request object
- class DummyReq:
- def __init__(self,env,):
- # environ
- self.subprocess_env = env
- # header
- self.headers_in = {}
- # content-length
- if self.subprocess_env.has_key('CONTENT_LENGTH'):
- self.headers_in["content-length"] = self.subprocess_env['CONTENT_LENGTH']
-
- # get remote host
- def get_remote_host(self):
- if self.subprocess_env.has_key('REMOTE_HOST'):
- return self.subprocess_env['REMOTE_HOST']
- return ""
-
-
- # application
- def application(environ, start_response):
- # get method name
- methodName = ''
- if environ.has_key('SCRIPT_NAME'):
- methodName = environ['SCRIPT_NAME'].split('/')[-1]
- if panda_config.entryVerbose:
- _logger.debug("PID=%s %s in" % (os.getpid(),methodName))
- # check method name
- if not methodName in allowedMethods:
- _logger.error("PID=%s %s is forbidden" % (os.getpid(),methodName))
- exeRes = "False : %s is forbidden" % methodName
- else:
- # get method object
- tmpMethod = None
- try:
- exec "tmpMethod = %s" % methodName
- except:
- pass
- # object not found
- if tmpMethod == None:
- _logger.error("PID=%s %s is undefined" % (os.getpid(),methodName))
- exeRes = "False"
- else:
- # get params
- tmpPars = cgi.FieldStorage(environ['wsgi.input'], environ=environ,
- keep_blank_values=1)
- # convert to map
- params = {}
- for tmpKey in tmpPars.keys():
- if tmpPars[tmpKey].file != None and tmpPars[tmpKey].filename != None:
- # file
- params[tmpKey] = tmpPars[tmpKey]
- else:
- # string
- params[tmpKey] = tmpPars.getfirst(tmpKey)
- if panda_config.entryVerbose:
- _logger.debug("PID=%s %s with %s" % (os.getpid(),methodName,str(params.keys())))
- # dummy request object
- dummyReq = DummyReq(environ)
- try:
- # exec
- exeRes = apply(tmpMethod,[dummyReq],params)
- # convert bool to string
- if exeRes in [True,False]:
- exeRes = str(exeRes)
- except:
- errType,errValue = sys.exc_info()[:2]
- errStr = ""
- for tmpKey,tmpVal in environ.iteritems():
- errStr += "%s : %s\n" % (tmpKey,str(tmpVal))
- _logger.error("execution failure : %s %s" % (errType,errValue))
- _logger.error(errStr)
- # return internal server error
- start_response('500 INTERNAL SERVER ERROR', [('Content-Type', 'text/plain')])
- return ["%s %s" % (errType,errValue)]
- if panda_config.entryVerbose:
- _logger.debug("PID=%s %s out" % (os.getpid(),methodName))
- # return
- if exeRes == taskbuffer.ErrorCode.EC_NotFound:
- start_response('404 Not Found', [('Content-Type', 'text/plain')])
- return ['not found']
- elif isinstance(exeRes,taskbuffer.ErrorCode.EC_Redirect):
- start_response('302 Redirect', [('Location', exeRes.url)])
- return ['redirect']
- else:
- start_response('200 OK', [('Content-Type', 'text/plain')])
- return [exeRes]
-
- # start server
- if panda_config.useFastCGI:
- from flup.server.fcgi import WSGIServer
- WSGIServer(application,multithreaded=False).run()
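
The entry point above resolves the handler with exec and invokes it with apply, both of which disappeared with Python 2; the same allowed-methods dispatch can be written as an explicit table. A simplified, hypothetical Python 3 sketch (the handler, port and dispatch table below are stand-ins, not the real server interface):

from urllib.parse import parse_qs
from wsgiref.simple_server import make_server

def isAlive(req, **params):                 # stand-in handler
    return "alive=yes"

# explicit dispatch table instead of exec() over the request path
ALLOWED_METHODS = {"isAlive": isAlive}

def application(environ, start_response):
    method_name = environ.get("PATH_INFO", "").split("/")[-1]
    method = ALLOWED_METHODS.get(method_name)
    if method is None:
        start_response("403 Forbidden", [("Content-Type", "text/plain")])
        return [("False : %s is forbidden" % method_name).encode()]
    # flatten the query string into simple key/value parameters
    params = {k: v[0] for k, v in parse_qs(environ.get("QUERY_STRING", "")).items()}
    result = str(method(environ, **params))
    start_response("200 OK", [("Content-Type", "text/plain")])
    return [result.encode()]

if __name__ == "__main__":
    make_server("localhost", 8080, application).serve_forever()
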
diff --git a/current/pandaserver/taskbuffer/ArchiveDBProxyPool.py b/current/pandaserver/taskbuffer/ArchiveDBProxyPool.py
deleted file mode 100644
index 8bfd014b0..000000000
--- a/current/pandaserver/taskbuffer/ArchiveDBProxyPool.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""
-pool for ArchiveDBProxies
-
-"""
-
-import time
-import Queue
-import random
-import OraLogDBProxy as LogDBProxy
-from config import panda_config
-
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('ArchiveDBProxyPool')
-
-class ArchiveDBProxyPool:
-
- def __init__(self,nConnection=panda_config.nArchiveDBConnection):
- # create Proxies
- _logger.debug("init")
- self.proxyList = Queue.Queue(nConnection)
- for i in range(nConnection):
- _logger.debug("connect -> %s " % i)
- proxy = LogDBProxy.LogDBProxy()
- nTry = 10
- for iTry in range(nTry):
- if proxy.connect(dbhost = panda_config.archivedbhost,
- dbpasswd = panda_config.archivedbpasswd,
- dbuser = panda_config.archivedbuser,
- dbname = panda_config.archivedbname):
- break
- _logger.debug("failed -> %s : try %s" % (i,iTry))
- if iTry+1 == nTry:
- raise RuntimeError, 'ArchiveDBProxyPool.__init__ failed'
- time.sleep(random.randint(10,20))
- self.proxyList.put(proxy)
- time.sleep(1)
- _logger.debug("ready")
-
- # return a free proxy. this method blocks until a proxy is available
- def getProxy(self):
- # get proxy
- proxy = self.proxyList.get()
- # wake up connection
- proxy.wakeUp()
- # return
- return proxy
-
-
- # put back a proxy
- def putProxy(self,proxy):
- # put
- self.proxyList.put(proxy)
-
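
The pool above is simply a blocking FIFO of proxies opened up front. The same pattern in generic Python 3 form, with a context manager so a borrowed proxy always goes back; the dict factory stands in for a real database connection:

import queue
from contextlib import contextmanager

class ProxyPool:
    def __init__(self, factory, size=2):
        # fill the pool at start-up, as ArchiveDBProxyPool does
        self._pool = queue.Queue(size)
        for _ in range(size):
            self._pool.put(factory())

    @contextmanager
    def proxy(self):
        # blocks until a proxy is free, then always returns it to the pool
        p = self._pool.get()
        try:
            yield p
        finally:
            self._pool.put(p)

pool = ProxyPool(factory=dict, size=2)
with pool.proxy() as p:
    p["touched"] = True
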
diff --git a/current/pandaserver/taskbuffer/CloudSpec.py b/current/pandaserver/taskbuffer/CloudSpec.py
deleted file mode 100644
index bfb1927d3..000000000
--- a/current/pandaserver/taskbuffer/CloudSpec.py
+++ /dev/null
@@ -1,27 +0,0 @@
-"""
-cloud specification
-
-"""
-
-class CloudSpec(object):
- # attributes
- _attributes = ('name','tier1','tier1SE','relocation','weight','server','status','transtimelo',
- 'transtimehi','waittime','validation','mcshare','countries','fasttrack','nprestage',
- 'pilotowners')
-
- # constructor
- def __init__(self):
- # install attributes
- for attr in self._attributes:
- setattr(self,attr,None)
-
- # serialize
- def __str__(self):
- str = ''
- for attr in self._attributes:
- str += '%s:%s ' % (attr,getattr(self,attr))
- return str
-
-
-
-
diff --git a/current/pandaserver/taskbuffer/CloudTaskSpec.py b/current/pandaserver/taskbuffer/CloudTaskSpec.py
deleted file mode 100644
index 8fade3ce1..000000000
--- a/current/pandaserver/taskbuffer/CloudTaskSpec.py
+++ /dev/null
@@ -1,99 +0,0 @@
-"""
-cloud/task specification
-
-"""
-
-class CloudTaskSpec(object):
- # attributes
- _attributes = ('id','taskname','taskid','cloud','status','tmod','tenter')
- # slots
- __slots__ = _attributes
-
-
- # constructor
- def __init__(self):
- # install attributes
- for attr in self._attributes:
- setattr(self,attr,None)
-
-
- # override __getattribute__ to return 'NULL' for unset attributes in SQL
- def __getattribute__(self,name):
- ret = object.__getattribute__(self,name)
- if ret == None:
- return "NULL"
- return ret
-
-
- # return a tuple of values
- def values(self):
- ret = []
- for attr in self._attributes:
- val = getattr(self,attr)
- ret.append(val)
- return tuple(ret)
-
-
- # pack tuple into CloudTaskSpec
- def pack(self,values):
- for i in range(len(self._attributes)):
- attr= self._attributes[i]
- val = values[i]
- setattr(self,attr,val)
-
-
- # return state values to be pickled
- def __getstate__(self):
- state = []
- for attr in self._attributes:
- val = getattr(self,attr)
- state.append(val)
- return state
-
-
- # restore state from the unpickled state values
- def __setstate__(self,state):
- for i in range(len(self._attributes)):
- if i+1 < len(state):
- setattr(self,self._attributes[i],state[i])
- else:
- setattr(self,self._attributes[i],'NULL')
-
-
- # return column names for INSERT
- def columnNames(cls):
- ret = ""
- for attr in cls._attributes:
- if ret != "":
- ret += ','
- ret += attr
- return ret
- columnNames = classmethod(columnNames)
-
-
- # return expression of values for INSERT
- def valuesExpression(cls):
- ret = "VALUES("
- for attr in cls._attributes:
- ret += "%s"
- if attr != cls._attributes[len(cls._attributes)-1]:
- ret += ","
- ret += ")"
- return ret
- valuesExpression = classmethod(valuesExpression)
-
-
- # return an expression for UPDATE
- def updateExpression(cls):
- ret = ""
- for attr in cls._attributes:
- ret = ret + attr + "=%s"
- if attr != cls._attributes[len(cls._attributes)-1]:
- ret += ","
- return ret
- updateExpression = classmethod(updateExpression)
-
-
-
-
-
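
columnNames() and valuesExpression() exist so that DBProxy can build parameterized INSERT statements without spelling the columns out by hand (insertNewJob below does exactly this with JobSpec). A cut-down illustration with a two-attribute stand-in class; the table name and values are examples only:

class MiniSpec(object):
    """Stand-in for CloudTaskSpec with only two attributes."""
    _attributes = ('id', 'taskname')

    def __init__(self, **kwargs):
        for attr in self._attributes:
            setattr(self, attr, kwargs.get(attr))

    def values(self):
        return tuple(getattr(self, attr) for attr in self._attributes)

    @classmethod
    def columnNames(cls):
        return ','.join(cls._attributes)

    @classmethod
    def valuesExpression(cls):
        return "VALUES(" + ",".join(["%s"] * len(cls._attributes)) + ")"

spec = MiniSpec(id=1, taskname='example.task')
sql = "INSERT INTO cloudtasks (%s) " % MiniSpec.columnNames()
sql += MiniSpec.valuesExpression()
# -> "INSERT INTO cloudtasks (id,taskname) VALUES(%s,%s)", executed with spec.values()
print(sql, spec.values())
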
diff --git a/current/pandaserver/taskbuffer/CloudURLMap.py b/current/pandaserver/taskbuffer/CloudURLMap.py
deleted file mode 100644
index 27bdce567..000000000
--- a/current/pandaserver/taskbuffer/CloudURLMap.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# cloud to Panda server's URL mapping
-cloudURLMap = {
- 'CA' : {
- 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda',
- 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda',
- },
- 'ES' : {
- 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda',
- 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda',
- },
- 'FR' : {
- 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda',
- 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda',
- },
- 'IT' : {
- 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda',
- 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda',
- },
- 'NL' : {
- 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda',
- 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda',
- },
- 'TW' : {
- 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda',
- 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda',
- },
- 'UK' : {
- 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda',
- 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda',
- },
- 'US' : {
- 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda',
- 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda',
- },
- }
-
diff --git a/current/pandaserver/taskbuffer/ConBridge.py b/current/pandaserver/taskbuffer/ConBridge.py
deleted file mode 100644
index 3f4fd1abd..000000000
--- a/current/pandaserver/taskbuffer/ConBridge.py
+++ /dev/null
@@ -1,502 +0,0 @@
-import os
-import re
-import sys
-import time
-import types
-import socket
-import signal
-import random
-import threading
-import cPickle as pickle
-
-import OraDBProxy as DBProxy
-
-from config import panda_config
-from JobSpec import JobSpec
-from FileSpec import FileSpec
-from DatasetSpec import DatasetSpec
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('ConBridge')
-
-
-# exception for normal termination
-class HarmlessEx(Exception):
- pass
-
-
-# terminate child process by itself when master has gone
-class Terminator (threading.Thread):
-
- # constructor
- def __init__(self,consock):
- threading.Thread.__init__(self)
- self.consock = consock
-
-
- # main
- def run(self):
- # watching control socket
- try:
- rcvSize = self.consock.recv(1)
- except:
- pass
- # get PID
- pid = os.getpid()
- _logger.debug("child %s received termination" % pid)
- # kill
- try:
- os.kill(pid,signal.SIGTERM)
- except:
- pass
- try:
- os.kill(pid,signal.SIGKILL)
- except:
- pass
-
-
-
-# connection bridge with timeout
-class ConBridge (object):
-
- # constructor
- def __init__(self):
- self.child_pid = 0
- self.isMaster = False
- self.mysock = None
- self.consock = None
- self.pid = os.getpid()
- # timeout
- if hasattr(panda_config,'dbtimeout'):
- self.timeout = int(panda_config.dbtimeout)
- else:
- self.timeout = 600
- # verbose
- if hasattr(panda_config,'dbbridgeverbose'):
- self.verbose = panda_config.dbbridgeverbose
- else:
- self.verbose = False
-
-
- # destructor
- def __del__(self):
- # kill old child process
- self.bridge_killChild()
-
-
- # connect
- def connect(self,dbhost=panda_config.dbhost,dbpasswd=panda_config.dbpasswd,
- dbuser=panda_config.dbuser,dbname=panda_config.dbname,
- dbtimeout=None,reconnect=False):
- # kill old child process
- self.bridge_killChild()
- _logger.debug('master %s connecting' % self.pid)
- # reset child PID and sockets
- self.child_pid = 0
- self.mysock = None
- self.consock = None
- # create socket
- datpair = socket.socketpair()
- conpair = socket.socketpair()
- # fork
- self.child_pid = os.fork()
- if self.child_pid == 0:
- # child
- self.isMaster = False
- self.pid = os.getpid()
- # keep socket
- self.mysock = datpair[1]
- self.consock = conpair[1]
- datpair[0].close()
- conpair[0].close()
- # connect to database
- _logger.debug('child %s connecting to database' % self.pid)
- self.proxy = DBProxy.DBProxy()
- if not self.proxy.connect(dbhost=dbhost,dbpasswd=dbpasswd,dbtimeout=60):
- _logger.error('child %s failed to connect' % self.pid)
- # send error
- self.bridge_sendError((RuntimeError,'child %s connection failed' % self.pid))
- # exit
- self.bridge_childExit()
- # send OK just for ACK
- _logger.debug('child %s connection is ready' % self.pid)
- self.bridge_sendResponse(None)
- # start terminator
- Terminator(self.consock).start()
- # go main loop
- _logger.debug('child %s going into the main loop' % self.pid)
- self.bridge_run()
- # exit
- self.bridge_childExit(0)
- else:
- # master
- self.isMaster = True
- # keep socket
- self.mysock = datpair[0]
- self.consock = conpair[0]
- datpair[1].close()
- conpair[1].close()
- try:
- # get ACK
- _logger.debug('master %s waiting ack from child=%s' % (self.pid,self.child_pid))
- self.bridge_getResponse()
- _logger.debug('master %s got ready from child=%s' % (self.pid,self.child_pid))
- return True
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error('master %s failed to setup child=%s : %s %s' % \
- (self.pid,self.child_pid,errType,errValue))
- # kill child
- self.bridge_killChild()
- return False
-
-
-
- #######################
- # communication methods
-
- # send packet
- def bridge_send(self,val):
- try:
- # set timeout
- if self.isMaster:
- self.mysock.settimeout(self.timeout)
- # serialize
- tmpStr = pickle.dumps(val)
- # send size
- self.mysock.sendall("%50s" % len(tmpStr))
- # send body
- self.mysock.sendall(tmpStr)
- # set timeout back
- if self.isMaster:
- self.mysock.settimeout(None)
- except:
- errType,errValue = sys.exc_info()[:2]
- if self.isMaster:
- roleType = 'master'
- else:
- roleType = 'child '
- _logger.error('%s %s send error : val=%s - %s %s' % \
- (roleType,self.pid,str(val),errType,errValue))
- # terminate child
- if not self.isMaster:
- self.bridge_childExit()
- raise errType,errValue
-
-
- # receive packet
- def bridge_recv(self):
- try:
- # set timeout
- if self.isMaster:
- self.mysock.settimeout(self.timeout)
- # get size
- strSize = ''
- headSize = 50
- while len(strSize) < headSize:
- tmpSize = headSize - len(strSize)
- tmpStr = self.mysock.recv(tmpSize)
- if tmpStr == '':
- if self.isMaster:
- raise socket.error,'empty packet'
- else:
- # master closed socket
- raise HarmlessEx,'empty packet'
- strSize += tmpStr
- # get body
- strBody = ''
- bodySize = long(strSize)
- while len(strBody) < bodySize:
- tmpSize = bodySize - len(strBody)
- tmpStr = self.mysock.recv(tmpSize)
- if tmpStr == '':
- if self.isMaster:
- raise socket.error,'empty packet'
- else:
- # master closed socket
- raise HarmlessEx,'empty packet'
- strBody += tmpStr
- # set timeout back
- if self.isMaster:
- self.mysock.settimeout(None)
- # deserialize
- retVal = pickle.loads(strBody)
- return True,retVal
- except:
- if self.isMaster:
- roleType = 'master'
- else:
- roleType = 'child '
- errType,errValue = sys.exc_info()[:2]
- if errType == HarmlessEx:
- _logger.debug('%s %s recv harmless ex : %s' % \
- (roleType,self.pid,errValue))
- else:
- _logger.error('%s %s recv error : %s %s' % \
- (roleType,self.pid,errType,errValue))
- # terminate child
- if not self.isMaster:
- self.bridge_childExit()
- raise errType,errValue
-
-
-
- #######################
- # child's methods
-
- # send error
- def bridge_sendError(self,val):
- # send status
- self.bridge_send("NG")
- # check if pickle-able
- try:
- pickle.dumps(val)
- except:
- # use RuntimeError
- val = (RuntimeError,str(val[-1]))
- # send exceptions
- self.bridge_send(val)
-
-
- # send response
- def bridge_sendResponse(self,val):
- # send status
- self.bridge_send("OK")
- # send response
- self.bridge_send(val)
-
-
- # termination of child
- def bridge_childExit(self,exitCode=1):
- if not self.isMaster:
- _logger.debug("child %s closing sockets" % self.pid)
- # close sockets
- try:
- self.mysock.shutdown(socket.SHUT_RDWR)
- except:
- pass
- try:
- self.consock.shutdown(socket.SHUT_RDWR)
- except:
- pass
- # exit
- _logger.debug("child %s going to exit" % self.pid)
- os._exit(exitCode)
-
-
- # child main
- def bridge_run(self):
- comStr = ''
- while True:
- try:
- # get command
- status,comStr = self.bridge_recv()
- if not status:
- raise RuntimeError,'invalid command'
- # get variables
- status,variables = self.bridge_recv()
- if not status:
- raise RuntimeError,'invalid variables'
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error('child %s died : %s %s' % (self.pid,errType,errValue))
- # exit
- self.bridge_childExit()
- if self.verbose:
- _logger.debug('child %s method %s executing' % (self.pid,comStr))
- try:
- # execute
- method = getattr(self.proxy,comStr)
- res = apply(method,variables[0],variables[1])
- # FIXME : modify response since cx_Oracle types cannot be pickled
- if comStr in ['querySQLS']:
- newRes = [True]+res[1:]
- res = newRes
- if self.verbose:
- _logger.debug('child %s method %s completed' % (self.pid,comStr))
- # return
- self.bridge_sendResponse((res,variables[0],variables[1]))
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error('child %s method %s failed : %s %s' % (self.pid,comStr,errType,errValue))
- if errType in [socket.error,socket.timeout]:
- _logger.error('child %s died : %s %s' % (self.pid,errType,errValue))
- # exit
- self.bridge_childExit()
- # send error
- self.bridge_sendError((errType,errValue))
-
-
-
- #######################
- # master's methods
-
- # kill child
- def bridge_killChild(self):
- # kill old child process
- if self.child_pid != 0:
- # close sockets
- _logger.debug('master %s closing sockets for child=%s' % (self.pid,self.child_pid))
- try:
- if self.mysock != None:
- self.mysock.shutdown(socket.SHUT_RDWR)
- except:
- pass
- try:
- if self.consock != None:
- self.consock.shutdown(socket.SHUT_RDWR)
- except:
- pass
- _logger.debug('master %s killing child=%s' % (self.pid,self.child_pid))
- # send SIGTERM
- try:
- os.kill(self.child_pid,signal.SIGTERM)
- except:
- pass
- time.sleep(2)
- # send SIGKILL
- try:
- os.kill(self.child_pid,signal.SIGKILL)
- except:
- pass
- # wait for completion of child
- _logger.debug('master %s waiting child=%s' % (self.pid,self.child_pid))
- try:
- os.waitpid(self.child_pid,0)
- except:
- pass
- # sleep to avoid burst reconnection
- time.sleep(random.randint(5,15))
- _logger.debug('master %s killed child=%s' % (self.pid,self.child_pid))
-
-
- # get response
- def bridge_getResponse(self):
- # get status
- status,strStatus = self.bridge_recv()
- if not status:
- raise RuntimeError,'master %s got invalid status response from child=%s' % \
- (self.pid,self.child_pid)
- if strStatus == 'OK':
- # return res
- status,ret = self.bridge_recv()
- if not status:
- raise RuntimeError,'master %s got invalid response body from child=%s' % \
- (self.pid,self.child_pid)
- return ret
- elif strStatus == 'NG':
- # raise error
- status,ret = self.bridge_recv()
- if not status:
- raise RuntimeError,'master %s got invalid response value from child=%s' % \
- (self.pid,self.child_pid)
- raise ret[0],ret[1]
- else:
- raise RuntimeError,'master %s got invalid response from child=%s : %s' % \
- (self.pid,self.child_pid,str(strStatus))
-
-
- # method wrapper class
- class bridge_masterMethod:
-
- # constructor
- def __init__(self,name,parent):
- self.name = name
- self.parent = parent
- self.pid = os.getpid()
-
-
- # copy changes in taskbuffer objects to master
- def copyTbObjChanges(self,oldPar,newPar):
- # check they have the same type
- if type(oldPar) != type(newPar):
- return False
- # copy some Specs since they are passed via ref's
- if isinstance(oldPar,JobSpec) or isinstance(oldPar,FileSpec) \
- or isinstance(oldPar,DatasetSpec):
- if hasattr(oldPar,'__getstate__'):
- tmpStat = newPar.__getstate__()
- oldPar.__setstate__(tmpStat)
- else:
- tmpStat = newPar.values()
- oldPar.pack(tmpStat)
- return True
- # copy Datasets
- return False
-
-
- # copy changes in objects to master
- def copyChanges(self,oldPar,newPar):
- if isinstance(oldPar,types.ListType):
- # delete all elements first
- while len(oldPar) > 0:
- oldPar.pop()
- # append
- for tmpItem in newPar:
- oldPar.append(tmpItem)
- elif isinstance(oldPar,types.DictType):
- # replace
- for tmpKey in newPar.keys():
- oldPar[tmpKey] = newPar[tmpKey]
- else:
- self.copyTbObjChanges(oldPar,newPar)
-
-
- # method emulation
- def __call__(self,*args,**keywords):
- while True:
- try:
- # send command name
- self.parent.bridge_send(self.name)
- # send variables
- self.parent.bridge_send((args,keywords))
- # get response
- retVal,newArgs,newKeywords = self.parent.bridge_getResponse()
- # propagate child's changes in args to master
- for idxArg,tmpArg in enumerate(args):
- self.copyChanges(tmpArg,newArgs[idxArg])
- # propagate child's changes in keywords to master
- for tmpKey,tmpArg in keywords.iteritems():
- self.copyChanges(tmpArg,newKeywords[tmpKey])
- # return
- return retVal
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error('master %s method %s failed : %s %s' % \
- (self.pid,self.name,errType,errValue))
- # reconnect when socket has a problem
- if not errType in [socket.error,socket.timeout]:
- # kill old child process
- self.parent.bridge_killChild()
- _logger.error('master %s killed child' % self.pid)
- #raise errType,errValue
- # sleep
- time.sleep(5)
- # reconnect
- try:
- _logger.debug('master %s trying to reconnect' % self.pid)
- self.parent.connect()
- _logger.debug('master %s reconnect completed' % self.pid)
- except:
- _logger.error('master %s connect failed' % self.pid)
-
-
- # __getattribute__ that wraps DBProxy methods for the master process
- def __getattribute__(self,name):
- if object.__getattribute__(self,'isMaster'):
- try:
- # return original attribute
- return object.__getattribute__(self,name)
- except:
- # append methods
- if not name.startswith('_') and hasattr(DBProxy.DBProxy,name) and \
- isinstance(getattr(DBProxy.DBProxy,name),types.UnboundMethodType):
- # get DBProxy's method wrapper
- method = ConBridge.bridge_masterMethod(name,self)
- # set method
- setattr(self,name,method)
- # return
- return method
- # return original attribute for child
- return object.__getattribute__(self,name)
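
bridge_send and bridge_recv above frame every pickled message with a fixed 50-character, space-padded length header followed by the body. A stand-alone Python 3 sketch of that framing over a socketpair, with the same wire format but simplified error handling; the sample payload is made up:

import pickle
import socket

HEAD_SIZE = 50  # fixed-width length header, as in ConBridge

def send_obj(sock, obj):
    body = pickle.dumps(obj)
    sock.sendall(("%50s" % len(body)).encode())   # space-padded length header
    sock.sendall(body)

def recv_exact(sock, size):
    buf = b""
    while len(buf) < size:
        chunk = sock.recv(size - len(buf))
        if not chunk:
            raise ConnectionError("peer closed the socket")
        buf += chunk
    return buf

def recv_obj(sock):
    size = int(recv_exact(sock, HEAD_SIZE).decode())
    return pickle.loads(recv_exact(sock, size))

master, child = socket.socketpair()
send_obj(master, {"method": "querySQLS", "args": ("SELECT 1",)})
print(recv_obj(child))
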
diff --git a/current/pandaserver/taskbuffer/DBProxy.py b/current/pandaserver/taskbuffer/DBProxy.py
deleted file mode 100755
index 9d0981e15..000000000
--- a/current/pandaserver/taskbuffer/DBProxy.py
+++ /dev/null
@@ -1,3066 +0,0 @@
-"""
-proxy for database connection
-
-"""
-
-import re
-import os
-import sys
-import time
-import fcntl
-import random
-import urllib
-import MySQLdb
-import datetime
-import commands
-import traceback
-import warnings
-import ErrorCode
-from JobSpec import JobSpec
-from FileSpec import FileSpec
-from DatasetSpec import DatasetSpec
-from CloudTaskSpec import CloudTaskSpec
-from pandalogger.PandaLogger import PandaLogger
-from config import panda_config
-from brokerage.PandaSiteIDs import PandaSiteIDs
-
-warnings.filterwarnings('ignore')
-
-# logger
-_logger = PandaLogger().getLogger('DBProxy')
-
-# lock file
-_lockGetSN = open(panda_config.lockfile_getSN, 'w')
-_lockSetDS = open(panda_config.lockfile_setDS, 'w')
-_lockGetCT = open(panda_config.lockfile_getCT, 'w')
-
-
-# proxy
-class DBProxy:
-
- # constructor
- def __init__(self):
- # connection object
- self.conn = None
- # cursor object
- self.cur = None
- # host name
- self.hostname = None
- # retry count
- self.nTry = 5
-
- # connect to DB
- def connect(self,dbhost=panda_config.dbhost,dbpasswd=panda_config.dbpasswd,
- dbuser=panda_config.dbuser,dbname=panda_config.dbname,
- dbtimeout=None,reconnect=False):
- # keep parameters for reconnect
- if not reconnect:
- self.dbhost = dbhost
- self.dbpasswd = dbpasswd
- self.dbuser = dbuser
- self.dbname = dbname
- self.dbtimeout = dbtimeout
- # connect
- try:
- if self.dbtimeout == None:
- self.conn = MySQLdb.connect(host=self.dbhost,user=self.dbuser,
- passwd=self.dbpasswd,db=self.dbname)
- else:
- self.conn = MySQLdb.connect(host=self.dbhost,user=self.dbuser,
- passwd=self.dbpasswd,db=self.dbname,
- connect_timeout=self.dbtimeout)
- self.cur=self.conn.cursor()
- # get hostname
- self.cur.execute('SELECT USER()')
- res = self.cur.fetchone()
- match = re.search('^([^@]+)@([^@]+)$',res[0])
- if match != None:
- self.hostname = match.group(2)
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("connect : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
-
- # query an SQL
- def querySQL(self,sql):
- comment = ' /* DBProxy.querySQL */'
- try:
- _logger.debug("querySQL : %s " % sql)
- # begin transaction
- self.cur.execute("START TRANSACTION")
- self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return res
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("querySQL : %s " % sql)
- _logger.error("querySQL : %s %s" % (type,value))
- return None
-
-
- # query an SQL return Status
- def querySQLS(self,sql):
- comment = ' /* DBProxy.querySQLS */'
- try:
- # begin transaction
- self.cur.execute("SET AUTOCOMMIT=1")
- ret = self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return ret,res
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("querySQLS : %s " % sql)
- _logger.error("querySQLS : %s %s" % (type,value))
- return -1,None
-
-
- # query an SQL with list return Status
- def querySQLwList(self,sql,valList):
- comment = ' /* DBProxy.querySQLwList */'
- try:
- # begin transaction
- self.cur.execute("SET AUTOCOMMIT=1")
- ret = self.cur.execute(sql+comment,valList)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return ret,res
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("querySQLwList : %s %s" % (sql,str(valList)))
- _logger.error("querySQLwList : %s %s" % (type,value))
- return -1,None
-
-
- # insert job to jobsDefined
- def insertNewJob(self,job,user,serNum,weight=0.0,priorityOffset=0,userVO=None):
- comment = ' /* DBProxy.insertNewJob */'
- sql1 = "INSERT INTO jobsDefined4 (%s) " % JobSpec.columnNames()
- sql1+= JobSpec.valuesExpression()
- # make sure PandaID is NULL
- job.PandaID = None
- # job status
- job.jobStatus='defined'
- # host and time information
- job.modificationHost = self.hostname
- job.creationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- job.modificationTime = job.creationTime
- job.stateChangeTime = job.creationTime
- # DN
- if job.prodUserID == "NULL" or job.prodSourceLabel in ['user','panda']:
- job.prodUserID = user
- # VO
- job.VO = userVO
- # priority
- if job.assignedPriority != 'NULL':
- job.currentPriority = job.assignedPriority
- if job.prodSourceLabel == 'user':
- job.currentPriority = 1000 + priorityOffset - (serNum / 5) - int(100 * weight)
- elif job.prodSourceLabel == 'panda':
- job.currentPriority = 2000 + priorityOffset
- # usergroup
- if job.prodSourceLabel == 'regional':
- job.computingSite= "BNLPROD"
- try:
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # insert
- retI = self.cur.execute(sql1+comment, job.values())
- # set PandaID
- job.PandaID = self.conn.insert_id()
- # insert files
- _logger.debug("insertNewJob : %s Label : %s ret : %s" % (job.PandaID,job.prodSourceLabel,retI))
- sqlFile = "INSERT INTO filesTable4 (%s) " % FileSpec.columnNames()
- sqlFile+= FileSpec.valuesExpression()
- for file in job.Files:
- file.rowID = None
- if file.status != 'ready':
- file.status='unknown'
- # replace $PANDAID with real PandaID
- file.lfn = re.sub('\$PANDAID', '%05d' % job.PandaID, file.lfn)
- self.cur.execute(sqlFile+comment, file.values())
- # get rowID
- file.rowID = self.conn.insert_id()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("insertNewJob : %s File OK" % job.PandaID)
- # update job info in MonALISA - Job Defined.
- #aThr = apmonInterface(job)
- #aThr.start()
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("insertNewJob : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
-
- # simply insert job to a table
- def insertJobSimple(self,job,table,fileTable):
- comment = ' /* DBProxy.insertJobSimple */'
- _logger.debug("insertJobSimple : %s" % job.PandaID)
- sql1 = "INSERT INTO %s (%s) " % (table,JobSpec.columnNames())
- sql1+= JobSpec.valuesExpression()
- try:
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # insert
- self.cur.execute(sql1+comment, job.values())
- # files
- sqlFile = "INSERT INTO %s " % fileTable
- sqlFile+= "(%s) " % FileSpec.columnNames()
- sqlFile+= FileSpec.valuesExpression()
- for file in job.Files:
- self.cur.execute(sqlFile+comment, file.values())
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("insertJobSimple : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
-
- # activate job. move job from jobsDefined to jobsActive
- def activateJob(self,job):
- comment = ' /* DBProxy.activateJob */'
- if job==None:
- _logger.debug("activateJob : None")
- return True
- _logger.debug("activateJob : %s" % job.PandaID)
- sql0 = "SELECT rowID FROM filesTable4 WHERE PandaID=%s AND type=%s AND status!=%s"
- sql1 = "UPDATE jobsDefined4 SET jobStatus='activated' "
- sql1+= "WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined') AND commandToPilot<>'tobekilled'"
- sql2 = "INSERT INTO jobsActive4 (%s) " % JobSpec.columnNames()
- sql2+= JobSpec.valuesExpression()
- # host and time information
- job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- # set stateChangeTime for defined->activated but not for assigned->activated
- if job.jobStatus in ['defined']:
- job.stateChangeTime = job.modificationTime
- nTry=3
- for iTry in range(nTry):
- try:
- # check if all files are ready
- allOK = True
- for file in job.Files:
- if file.type == 'input' and file.status != 'ready':
- allOK = False
- break
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # check all inputs are ready
- self.cur.execute(sql0+comment, (job.PandaID,"input","ready"))
- res = self.cur.fetchall()
- if len(res) == 0 or allOK:
- # change status
- job.jobStatus = "activated"
- # update. Not delete for InnoDB
- n = self.cur.execute(sql1+comment, (job.PandaID,))
- if n==0:
- # already killed or activated
- _logger.debug("activateJob : Not found %s" % job.PandaID)
- else:
- # insert
- self.cur.execute(sql2+comment, job.values())
- # update files
- sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s"
- for file in job.Files:
- self.cur.execute(sqlF+comment, file.values()+(file.rowID,))
- else:
- # update job
- sqlJ = ("UPDATE jobsDefined4 SET %s " % JobSpec.updateExpression()) + \
- "WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')"
- n = self.cur.execute(sqlJ+comment, job.values()+(job.PandaID,))
- if n==0:
- # already killed or activated
- _logger.debug("activateJob : Not found %s" % job.PandaID)
- else:
- # update files
- sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s"
- for file in job.Files:
- self.cur.execute(sqlF+comment, file.values()+(file.rowID,))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("activateJob : %s retry : %s" % (job.PandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("activateJob : %s %s" % (type,value))
- return False
-
-
- # send job to jobsWaiting
- def keepJob(self,job):
- comment = ' /* DBProxy.keepJob */'
- _logger.debug("keepJob : %s" % job.PandaID)
- sql1 = "UPDATE jobsDefined4 SET jobStatus='waiting' "
- sql1+= "WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined') AND commandToPilot<>'tobekilled'"
- sql2 = "INSERT INTO jobsWaiting4 (%s) " % JobSpec.columnNames()
- sql2+= JobSpec.valuesExpression()
- # time information
- job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- job.stateChangeTime = job.modificationTime
- nTry=3
- for iTry in range(nTry):
- try:
- # begin transaction
- self.cur.execute("START TRANSACTION")
-                # update. Not delete for InnoDB
- n = self.cur.execute(sql1+comment, (job.PandaID,))
- if n==0:
- # already killed
- _logger.debug("keepJob : Not found %s" % job.PandaID)
- else:
- # set status
- job.jobStatus = 'waiting'
- # insert
- self.cur.execute(sql2+comment, job.values())
- # update files
- sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s"
- for file in job.Files:
- self.cur.execute(sqlF+comment, file.values()+(file.rowID,))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # update job info in MonALISA - Job sent to waiting state
- #aThr = apmonInterface(job)
- #aThr.start()
- return True
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("keepJob : %s retry : %s" % (job.PandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("keepJob : %s %s" % (type,value))
- return False
-
-
- # archive job to jobArchived and remove the job from jobsActive or jobsDefined
- def archiveJob(self,job,fromJobsDefined):
- comment = ' /* DBProxy.archiveJob */'
- _logger.debug("archiveJob : %s" % job.PandaID)
- if fromJobsDefined:
- sql1 = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')"
- else:
- sql1 = "DELETE FROM jobsActive4 WHERE PandaID=%s"
- sql2 = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames()
- sql2+= JobSpec.valuesExpression()
- nTry=3
- for iTry in range(nTry):
- try:
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # delete
- n = self.cur.execute(sql1+comment, (job.PandaID,))
- if n==0:
- # already killed
- _logger.debug("archiveJob : Not found %s" % job.PandaID)
- else:
- # insert
- job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- job.stateChangeTime = job.modificationTime
- if job.endTime == 'NULL':
- job.endTime = job.modificationTime
- self.cur.execute(sql2+comment, job.values())
- # update files
- sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s"
- for file in job.Files:
- self.cur.execute(sqlF+comment, file.values()+(file.rowID,))
- # delete downstream jobs
- ddmIDs = []
- newJob = None
- ddmAttempt = 0
- if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed':
- # look for outputs
- upOutputs = []
- for file in job.Files:
- if file.type == 'output':
- upOutputs.append(file.lfn)
- # look for downstream jobs
- sqlD = "SELECT PandaID FROM filesTable4 WHERE type='input' AND lfn='%s' GROUP BY PandaID"
- sqlDJS = "SELECT %s " % JobSpec.columnNames()
- sqlDJS+= "FROM jobsDefined4 WHERE PandaID=%s"
- sqlDJD = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s"
- sqlDJI = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames()
- sqlDJI+= JobSpec.valuesExpression()
- for upFile in upOutputs:
- _logger.debug("look for downstream jobs for %s" % upFile)
- # select PandaID
- self.cur.execute((sqlD+comment) % upFile)
- res = self.cur.fetchall()
- for downID in res:
- _logger.debug("delete : %s" % downID)
- # select jobs
- self.cur.execute((sqlDJS+comment) % downID)
- resJob = self.cur.fetchall()
- if len(resJob) == 0:
- continue
- # instantiate JobSpec
- dJob = JobSpec()
- dJob.pack(resJob[0])
- # delete
- retD = self.cur.execute((sqlDJD+comment) % downID)
- if retD == 0:
- continue
- # error code
- dJob.jobStatus = 'failed'
- dJob.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- dJob.taskBufferErrorCode = ErrorCode.EC_Kill
- dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed'
- dJob.modificationTime = dJob.endTime
- dJob.stateChangeTime = dJob.endTime
- # insert
- self.cur.execute(sqlDJI+comment, dJob.values())
- elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='dis':
- # get corresponding jobs for production movers
- vuid = ''
- # extract vuid
- match = re.search('--callBack (\S+)',job.jobParameters)
- if match != None:
- try:
- callbackUrl = urllib.unquote(match.group(1))
- callbackUrl = re.sub('[&\?]',' ', callbackUrl)
- # look for vuid=
- for item in callbackUrl.split():
- if item.startswith('vuid='):
- vuid = item.split('=')[-1]
- break
- except:
- pass
- if vuid == '':
- _logger.error("cannot extract vuid from %s" % job.jobParameters)
- else:
- # get name
- self.cur.execute(("SELECT name FROM Datasets WHERE vuid='%s' AND type='dispatch'" % vuid)+comment)
- res = self.cur.fetchall()
- if len(res) != 0:
- disName = res[0]
- # get PandaIDs
- self.cur.execute(("SELECT PandaID FROM jobsDefined4 WHERE dispatchDBlock='%s' AND jobStatus='assigned'" % disName)+comment)
- resDDM = self.cur.fetchall()
- for tmpID, in resDDM:
- ddmIDs.append(tmpID)
- # get offset
- ddmAttempt = job.attemptNr
- _logger.debug("get PandaID for reassign : %s ddmAttempt=%s" % (str(ddmIDs),ddmAttempt))
- elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='ddm' and job.attemptNr<2 \
- and job.commandToPilot != 'tobekilled':
- # instantiate new mover to retry subscription
- newJob = JobSpec()
- newJob.jobDefinitionID = job.jobDefinitionID
- newJob.jobName = job.jobName
- newJob.attemptNr = job.attemptNr + 1
- newJob.transformation = job.transformation
- newJob.destinationDBlock = job.destinationDBlock
- newJob.destinationSE = job.destinationSE
- newJob.currentPriority = job.currentPriority
- newJob.prodSourceLabel = job.prodSourceLabel
- newJob.prodUserID = job.prodUserID
- newJob.computingSite = job.computingSite
- newJob.transferType = job.transferType
- newJob.sourceSite = job.sourceSite
- newJob.destinationSite = job.destinationSite
- newJob.jobParameters = job.jobParameters
- if job.Files != []:
- file = job.Files[0]
- fileOL = FileSpec()
- # add attempt nr
- fileOL.lfn = re.sub("\.\d+$","",file.lfn)
- fileOL.lfn = "%s.%d" % (fileOL.lfn,job.attemptNr)
- fileOL.destinationDBlock = file.destinationDBlock
- fileOL.destinationSE = file.destinationSE
- fileOL.dataset = file.dataset
- fileOL.type = file.type
- newJob.addFile(fileOL)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True,ddmIDs,ddmAttempt,newJob
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("archiveJob : %s retry : %s" % (job.PandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("archiveJob : %s" % job.PandaID)
- _logger.error("archiveJob : %s %s" % (type,value))
- return False,[],0,None
-
-
- # overload of archiveJob
- def archiveJobLite(self,pandaID,jobStatus,param):
- comment = ' /* DBProxy.archiveJobLite */'
- _logger.debug("archiveJobLite : %s" % pandaID)
- sql1 = "SELECT %s FROM jobsActive4 " % JobSpec.columnNames()
- sql1+= "WHERE PandaID=%s"
- sql2 = "DELETE FROM jobsActive4 WHERE PandaID=%s"
- sql3 = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames()
- sql3+= JobSpec.valuesExpression()
- nTry=3
- for iTry in range(nTry):
- try:
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # select
- self.cur.execute(sql1+comment, (pandaID,))
- res = self.cur.fetchall()
- if len(res) == 0:
- _logger.error("archiveJobLite() : PandaID %d not found" % pandaID)
- self._rollback()
- return False
- job = JobSpec()
- job.pack(res[0])
- job.jobStatus = jobStatus
- for key in param.keys():
- if param[key] != None:
- setattr(job,key,param[key])
- job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- job.endTime = job.modificationTime
- job.stateChangeTime = job.modificationTime
- # delete
- n = self.cur.execute(sql2+comment, (job.PandaID,))
- if n==0:
- # already killed
- _logger.debug("archiveJobLite : Not found %s" % pandaID)
- else:
- # insert
- self.cur.execute(sql3+comment, job.values())
- # update files
- sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s"
- for file in job.Files:
- self.cur.execute(sqlF+comment, file.values()+(file.rowID,))
- # delete downstream jobs
- if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed':
- # file select
- sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=%s"
- self.cur.execute(sqlFile+comment, (job.PandaID,))
- resFs = self.cur.fetchall()
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- job.addFile(file)
- # look for outputs
- upOutputs = []
- for file in job.Files:
- if file.type == 'output':
- upOutputs.append(file.lfn)
- # look for downstream jobs
- sqlD = "SELECT PandaID FROM filesTable4 WHERE type='input' AND lfn='%s' GROUP BY PandaID"
- sqlDJS = "SELECT %s " % JobSpec.columnNames()
- sqlDJS+= "FROM jobsDefined4 WHERE PandaID=%s"
- sqlDJD = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s"
- sqlDJI = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames()
- sqlDJI+= JobSpec.valuesExpression()
- for upFile in upOutputs:
- _logger.debug("look for downstream jobs for %s" % upFile)
- # select PandaID
- self.cur.execute((sqlD+comment) % upFile)
- res = self.cur.fetchall()
- for downID in res:
- _logger.debug("delete : %s" % downID)
- # select jobs
- self.cur.execute((sqlDJS+comment) % downID)
- resJob = self.cur.fetchall()
- if len(resJob) == 0:
- continue
- # instantiate JobSpec
- dJob = JobSpec()
- dJob.pack(resJob[0])
- # delete
- retD = self.cur.execute((sqlDJD+comment) % downID)
- if retD == 0:
- continue
- # error code
- dJob.jobStatus = 'failed'
- dJob.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- dJob.taskBufferErrorCode = ErrorCode.EC_Kill
- dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed'
- dJob.modificationTime = dJob.endTime
- dJob.stateChangeTime = dJob.endTime
- # insert
- self.cur.execute((sqlDJI+comment), dJob.values())
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("archiveJobLite : %s retry : %s" % (pandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("archiveJobLite : %s %s" % (type,value))
- return False
-
-
- # update Job status in jobsActive
- def updateJobStatus(self,pandaID,jobStatus,param):
- comment = ' /* DBProxy.updateJobStatus */'
- _logger.debug("updateJobStatus : %s" % pandaID)
- sql1 = "UPDATE jobsActive4 SET jobStatus=%s,modificationTime=UTC_TIMESTAMP()"
- if jobStatus in ['starting']:
- sql1 += ",stateChangeTime=UTC_TIMESTAMP()"
- values = [jobStatus]
- for key in param.keys():
- if param[key] != None:
- sql1 = sql1 + (',%s=' % key) + '%s'
- values.append(param[key])
- sql1 += " WHERE PandaID=%s"
- values.append(pandaID)
- nTry=3
- for iTry in range(nTry):
- try:
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # update
- self.cur.execute (sql1+comment,tuple(values))
- # get command
- self.cur.execute ('SELECT commandToPilot,endTime FROM jobsActive4 WHERE PandaID=%s'+comment,(pandaID,))
- res = self.cur.fetchone()
- if res != None:
- ret = res[0]
- # update endTime
- endTime = res[1]
- if jobStatus == 'holding' and endTime==None:
- self.cur.execute ("UPDATE jobsActive4 SET endTime=UTC_TIMESTAMP() WHERE PandaID=%s"+comment,(pandaID,))
- else:
- # already deleted
- ret = 'tobekilled'
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return ret
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("updateJobStatus : %s retry : %s" % (pandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("updateJobStatus : %s %s" % (type,value))
- _logger.error("updateJobStatus : %s" % pandaID)
- return False
-
-
- # update job information in jobsActive or jobsDefined
- def updateJob(self,job,inJobsDefined):
- comment = ' /* DBProxy.updateJob */'
- _logger.debug("updateJob : %s" % job.PandaID)
- if inJobsDefined:
- sql1 = "UPDATE jobsDefined4 SET %s " % JobSpec.updateExpression()
- else:
- sql1 = "UPDATE jobsActive4 SET %s " % JobSpec.updateExpression()
- sql1+= "WHERE PandaID=%s"
- if inJobsDefined:
- sql1+= " AND (jobStatus='assigned' OR jobStatus='defined')"
- nTry=3
- for iTry in range(nTry):
- try:
- job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- # set stateChangeTime for defined->assigned
- if inJobsDefined:
- job.stateChangeTime = job.modificationTime
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # update
- n = self.cur.execute(sql1+comment, job.values()+(job.PandaID,))
- if n==0:
- # already killed or activated
- _logger.debug("updateJob : Not found %s" % job.PandaID)
- else:
- sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s"
- for file in job.Files:
- self.cur.execute(sqlF+comment, file.values()+(file.rowID,))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("updateJob : %s retry : %s" % (job.PandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("updateJob : %s %s" % (type,value))
- return False
-
-
- # retry analysis job
- def retryJob(self,pandaID,param):
- comment = ' /* DBProxy.retryJob */'
- _logger.debug("retryJob : %s" % pandaID)
- sql1 = "SELECT %s FROM jobsActive4 " % JobSpec.columnNames()
- sql1+= "WHERE PandaID=%s"
- sql2 = "UPDATE jobsActive4 SET %s " % JobSpec.updateExpression()
- sql2+= "WHERE PandaID=%s"
- nTry=3
- for iTry in range(nTry):
- try:
- retValue = False
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # select
- self.cur.execute(sql1+comment, (pandaID,))
- res = self.cur.fetchall()
- if len(res) == 0:
- _logger.debug("retryJob() : PandaID %d not found" % pandaID)
- self._rollback()
- return retValue
- job = JobSpec()
- job.pack(res[0])
- # check if it's analysis job
- if (((job.prodSourceLabel == 'user' or job.prodSourceLabel == 'panda') \
- and job.computingSite.startswith('ANALY_') and param.has_key('pilotErrorCode') \
- and param['pilotErrorCode'] in ['1200','1201'] and (not job.computingSite.startswith('ANALY_LONG_')) \
- and job.attemptNr < 2) or (job.prodSourceLabel == 'ddm' and job.cloud == 'CA' and job.attemptNr <= 10)) \
- and job.commandToPilot != 'tobekilled':
- _logger.debug(' -> reset PandaID:%s #%s' % (job.PandaID,job.attemptNr))
- # reset job
- job.jobStatus = 'activated'
- job.startTime = None
- job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- job.attemptNr = job.attemptNr + 1
- # send it to long queue for analysis jobs
- oldComputingSite = job.computingSite
- if job.computingSite.startswith('ANALY') and (not job.computingSite.startswith('ANALY_LONG_')):
- longSite = re.sub('^ANALY_','ANALY_LONG_',job.computingSite)
- longSite = re.sub('_\d+$','',longSite)
- if longSite in PandaSiteIDs.keys():
- job.computingSite = longSite
- # set destinationSE if queue is changed
- if oldComputingSite == job.destinationSE:
- job.destinationSE = job.computingSite
- # select files
- sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=%s AND (type='log' OR type='output')"
- self.cur.execute(sqlFile+comment, (job.PandaID,))
- resFs = self.cur.fetchall()
- for resF in resFs:
- # set PandaID
- file = FileSpec()
- file.pack(resF)
- job.addFile(file)
- # set new GUID
- if file.type == 'log':
- file.GUID = commands.getoutput('uuidgen')
- # append attemptNr to LFN
- oldName = file.lfn
- file.lfn = re.sub('\.\d+$','',file.lfn)
- file.lfn = '%s.%s' % (file.lfn,job.attemptNr)
- newName = file.lfn
- # set destinationSE
- if oldComputingSite == file.destinationSE:
- file.destinationSE = job.computingSite
- # modify jobParameters
- sepPatt = "(\'|\"|%20)" + oldName + "(\'|\"|%20)"
- matches = re.findall(sepPatt,job.jobParameters)
- for match in matches:
- oldPatt = match[0]+oldName+match[-1]
- newPatt = match[0]+newName+match[-1]
- job.jobParameters = re.sub(oldPatt,newPatt,job.jobParameters)
- # update
- sqlFup = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s"
- self.cur.execute(sqlFup+comment, file.values()+(file.rowID,))
- # update job
- self.cur.execute(sql2+comment, job.values()+(job.PandaID,))
- # set return
- retValue = True
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return retValue
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("retryJob : %s retry : %s" % (pandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- # error report
- type, value, traceBack = sys.exc_info()
- _logger.error("retryJob : %s %s" % (type,value))
- return False
-
-
- # get jobs
- def getJobs(self,nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement,
- atlasRelease,prodUserID):
- comment = ' /* DBProxy.getJobs */'
- dynamicBrokering = False
- sql1 = "WHERE jobStatus=%s AND computingSite=%s AND commandToPilot<>'tobekilled' "
- if not mem in [0,'0']:
- sql1+= "AND (minRamCount<=%s OR minRamCount=0) " % mem
- if not diskSpace in [0,'0']:
- sql1+= "AND (maxDiskCount<%s OR maxDiskCount=0) " % diskSpace
- if prodSourceLabel == 'user':
- sql1+= "AND (prodSourceLabel='user' OR prodSourceLabel='panda') "
- elif prodSourceLabel == 'ddm':
- dynamicBrokering = True
- sql1+= "AND prodSourceLabel='ddm' "
- elif prodSourceLabel in [None,'managed']:
- sql1+= "AND (prodSourceLabel='managed' OR prodSourceLabel='test') "
- elif prodSourceLabel == 'software':
- sql1+= "AND prodSourceLabel='software' "
- elif prodSourceLabel == 'test' and computingElement != None:
- dynamicBrokering = True
- sql1+= "AND (computingElement='%s' OR computingElement='to.be.set' OR processingType='prod_test' OR prodSourceLabel='test') " % computingElement
- else:
- sql1+= "AND prodSourceLabel='%s' " % prodSourceLabel
- # user ID
- if prodUserID != None:
- sql1+= "AND prodUserID='%s' " % prodUserID
- sql2 = "SELECT %s FROM jobsActive4 " % JobSpec.columnNames()
- sql2+= "WHERE PandaID=%s"
- retJobs = []
- nSent = 0
- try:
- timeLimit = datetime.timedelta(seconds=timeout-10)
- timeStart = datetime.datetime.utcnow()
- strName = datetime.datetime.isoformat(timeStart)
- attLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
- attSQL = "AND ((creationTime<'%s' AND attemptNr>1) OR attemptNr<=1) " % attLimit.strftime('%Y-%m-%d %H:%M:%S')
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # get nJobs
- for iJob in range(nJobs):
- pandaID = 0
- # select channel for ddm jobs
- if prodSourceLabel == 'ddm':
- sqlDDM = "SELECT count(*),jobStatus,sourceSite,destinationSite,transferType FROM jobsActive4 WHERE computingSite=%s AND prodSourceLabel='ddm' " + attSQL + "GROUP BY jobStatus,sourceSite,destinationSite,transferType"
- _logger.debug((sqlDDM+comment) % siteName)
- self.cur.execute(sqlDDM+comment,(siteName,))
- resDDM = self.cur.fetchall()
- # make a channel map
- channelMap = {}
- for tmp_count,tmp_jobStatus,tmp_sourceSite,tmp_destinationSite,tmp_transferType in resDDM:
- # use source,dest,type as the key
- channel = (tmp_sourceSite,tmp_destinationSite,tmp_transferType)
- if not channelMap.has_key(channel):
- channelMap[channel] = {}
- # ignore holding
- if tmp_jobStatus == 'holding':
- continue
-                        # distinguish activated from other states
- if tmp_jobStatus != 'activated':
- tmp_jobStatus = 'others'
- # append
- if not channelMap[channel].has_key(tmp_jobStatus):
- channelMap[channel][tmp_jobStatus] = int(tmp_count)
- else:
- channelMap[channel][tmp_jobStatus] += int(tmp_count)
- _logger.debug(channelMap)
- # choose channel
- channels = channelMap.keys()
- random.shuffle(channels)
- foundChannel = False
- for channel in channels:
- # no activated jobs
- if (not channelMap[channel].has_key('activated')) or channelMap[channel]['activated'] == 0:
- continue
- maxRunning = 10
- # prestaging job
- if channel[0] == channel[1] and channel[2] == 'dis':
- maxRunning = 50
- if (not channelMap[channel].has_key('others')) or channelMap[channel]['others'] < maxRunning:
- # set SQL
- sql1+= "AND sourceSite='%s' AND destinationSite='%s' AND transferType='%s' " \
- % channel
- foundChannel = True
- break
- # no proper channel
- if not foundChannel:
- _logger.debug("getJobs : no DDM jobs for Site %s" % siteName)
- break
- # get job
- if prodSourceLabel in ['ddm']:
- # to add some delay for attempts
- sql1 += attSQL
- nTry=1
- for iTry in range(nTry):
- # set siteID
- tmpSiteID = siteName
- if siteName.startswith('ANALY_BNL_ATLAS'):
- tmpSiteID = 'ANALY_BNL_ATLAS_1'
- # get file lock
- _logger.debug("getJobs : %s -> lock" % strName)
- if (datetime.datetime.utcnow() - timeStart) < timeLimit:
- toGetPandaIDs = True
- pandaIDs = []
- # get max priority for analysis jobs
- if prodSourceLabel in ['panda','user']:
- sqlMX = "SELECT MAX(currentPriority) FROM jobsActive4 "
- sqlMX+= sql1
- _logger.debug((sqlMX+comment) % ("activated",tmpSiteID))
- self.cur.execute(sqlMX+comment, ("activated",tmpSiteID))
- tmpPriority, = self.cur.fetchone()
- # no jobs
- if tmpPriority == None:
- toGetPandaIDs = False
- else:
- # set priority
- sql1 += "AND currentPriority=%s" % tmpPriority
- if toGetPandaIDs:
- # get PandaIDs
- sqlP = "SELECT PandaID,currentPriority FROM jobsActive4 "
- sqlP+= sql1
- _logger.debug((sqlP+comment) % ("activated",tmpSiteID))
- self.cur.execute(sqlP+comment, ("activated",tmpSiteID))
- resIDs = self.cur.fetchall()
- maxCurrentPriority = None
- # get max priority and min PandaID
- for tmpPandaID,tmpCurrentPriority in resIDs:
- if maxCurrentPriority==None or maxCurrentPriority < tmpCurrentPriority:
- maxCurrentPriority = tmpCurrentPriority
- pandaIDs = [tmpPandaID]
- elif maxCurrentPriority == tmpCurrentPriority:
- pandaIDs.append(tmpPandaID)
- # sort
- pandaIDs.sort()
- if pandaIDs == []:
- _logger.debug("getJobs : %s -> no PandaIDs" % strName)
- retU = 0
- else:
- # get nSent for production jobs
- if prodSourceLabel in [None,'managed']:
- sentLimit = timeStart - datetime.timedelta(seconds=60)
- sqlSent = "SELECT count(*) FROM jobsActive4 WHERE jobStatus='sent' "
- sqlSent += "AND prodSourceLabel IN ('managed','test') "
- sqlSent += "AND computingSite='%s' " % tmpSiteID
- sqlSent += "AND modificationTime>'%s' " % sentLimit.strftime('%Y-%m-%d %H:%M:%S')
- self.cur.execute(sqlSent+comment)
- resSent = self.cur.fetchone()
- if resSent != None:
- nSent, = resSent
- # update
- for indexID,tmpPandaID in enumerate(pandaIDs):
- # max attempts
- if indexID > 10:
- break
- # update
- sqlJ = "UPDATE jobsActive4 "
- sqlJ+= "SET jobStatus=%s,modificationTime=UTC_TIMESTAMP(),modificationHost=%s,startTime=UTC_TIMESTAMP()"
- # set CE
- if computingElement != None:
- sqlJ+= ",computingElement='%s'" % computingElement
- sqlJ+= " WHERE PandaID=%s AND jobStatus=%s"
- _logger.debug((sqlJ+comment) % ("sent",node,tmpPandaID,"activated"))
- retU = self.cur.execute(sqlJ+comment,("sent",node,tmpPandaID,"activated"))
- # succeeded
- if retU != 0:
- pandaID = tmpPandaID
- # increment nSent
- if prodSourceLabel in [None,'managed']:
- nSent += (indexID+1)
- break
- else:
- _logger.debug("getJobs : %s -> do nothing" % strName)
- retU = 0
- # release file lock
- _logger.debug("getJobs : %s -> unlock" % strName)
- # succeeded
- if retU != 0:
- break
- if iTry+1 < nTry:
- #time.sleep(0.5)
- pass
- # failed to UPDATE
- if retU == 0:
- # reset pandaID
- pandaID = 0
- _logger.debug("getJobs : Site %s : retU %s : PandaID %s - %s"
- % (siteName,retU,pandaID,prodSourceLabel))
- if pandaID == 0:
- break
- # select
- self.cur.execute(sql2+comment, (pandaID,))
- res = self.cur.fetchone()
- if len(res) == 0:
- break
- # instantiate Job
- job = JobSpec()
- job.pack(res)
- # Files
- sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=%s"
- self.cur.execute(sqlFile+comment, (job.PandaID,))
- resFs = self.cur.fetchall()
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- job.addFile(file)
- # append
- retJobs.append(job)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return retJobs,nSent
- except:
- # roll back
- self._rollback()
- # error report
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobs : %s %s" % (type,value))
- return [],0
-
-
- # reset job in jobsActive or jobsWaiting
- def resetJob(self,pandaID,activeTable=True,keepSite=False):
- comment = ' /* DBProxy.resetJob */'
- _logger.debug("resetJobs : %s" % pandaID)
- # select table
- table = 'jobsWaiting4'
- if activeTable:
- table = 'jobsActive4'
- sql1 = "SELECT %s FROM %s " % (JobSpec.columnNames(),table)
- sql1+= "WHERE PandaID=%s"
- sql2 = "DELETE FROM %s " % table
- sql2+= "WHERE PandaID=%s AND (jobStatus='waiting' OR jobStatus='activated')"
- sql3 = "INSERT INTO jobsDefined4 (%s) " % JobSpec.columnNames()
- sql3+= JobSpec.valuesExpression()
- try:
- # transaction causes Request ndbd time-out in jobsActive4
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql1+comment,(pandaID,))
- res = self.cur.fetchone()
- # not found
- if res == None:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return None
- # instantiate Job
- job = JobSpec()
- job.pack(res)
- # if already running
- if job.jobStatus != 'waiting' and job.jobStatus != 'activated':
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return None
- # delete
- retD = self.cur.execute(sql2+comment,(pandaID,))
- # delete failed
- _logger.debug("resetJobs : retD = %s" % retD)
- if retD != 1:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return None
- # delete from jobsDefined4 just in case
- sqlD = "DELETE FROM jobsDefined4 WHERE PandaID=%s"
- self.cur.execute(sqlD+comment,(pandaID,))
- # increase priority
- if job.jobStatus == 'activated' and job.currentPriority < 100:
- job.currentPriority = 100
- # reset computing site and dispatchDBlocks
- job.jobStatus = 'defined'
- job.dispatchDBlock = None
- # erase old assignment
- if (not keepSite) and job.relocationFlag != 1:
- job.computingSite = None
- job.computingElement = None
- # host and time information
- job.modificationHost = self.hostname
- job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- job.stateChangeTime = job.modificationTime
- # insert
- self.cur.execute(sql3+comment, job.values())
- # Files
- sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=%s"
- self.cur.execute(sqlFile+comment, (job.PandaID,))
- resFs = self.cur.fetchall()
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- # reset GUID to trigger LRC/LFC scanning
- if file.status == 'missing':
- file.GUID = None
- # reset status, destinationDBlock and dispatchDBlock
- file.status ='unknown'
- file.dispatchDBlock = None
- file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock)
- # add file
- job.addFile(file)
- # update files
- sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s"
- self.cur.execute(sqlF+comment, file.values()+(file.rowID,))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return job
- except:
- # roll back
- self._rollback()
- # error report
- type, value, traceBack = sys.exc_info()
- _logger.error("resetJobs : %s %s" % (type,value))
- _logger.error("resetJobs : %s" % pandaID)
- return None
-
-
- # reset jobs in jobsDefined
- def resetDefinedJob(self,pandaID,keepSite=False):
- comment = ' /* DBProxy.resetDefinedJob */'
- _logger.debug("resetDefinedJob : %s" % pandaID)
- sql1 = "UPDATE jobsDefined4 SET "
- sql1 += "jobStatus='defined',"
- sql1 += "modificationTime=UTC_TIMESTAMP(),"
- sql1 += "dispatchDBlock=NULL,"
- sql1 += "computingElement=NULL"
- sql1 += " WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')"
- sql2 = "SELECT %s FROM jobsDefined4 " % JobSpec.columnNames()
- sql2+= "WHERE PandaID=%s"
- try:
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # update
- retU = self.cur.execute(sql1+comment,(pandaID,))
- # not found
- job = None
- if retU == 0:
- _logger.debug("resetDefinedJob : Not found %s" % pandaID)
- else:
- # select
- self.cur.execute(sql2+comment,(pandaID,))
- res = self.cur.fetchone()
- # not found
- if res == None:
- raise RuntimeError, 'Could not SELECT : PandaID=%s' % pandaID
- # instantiate Job
- job = JobSpec()
- job.pack(res)
- job.dispatchDBlock = None
- if (not keepSite) and job.relocationFlag != 1:
- # erase old assignment
- job.computingSite = None
- job.computingElement = None
- # Files
- sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=%s"
- self.cur.execute(sqlFile+comment, (job.PandaID,))
- resFs = self.cur.fetchall()
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- # reset status, destinationDBlock and dispatchDBlock
- file.status ='unknown'
- file.dispatchDBlock = None
- file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock)
- # add file
- job.addFile(file)
- # update files
- sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s"
- self.cur.execute(sqlF+comment, file.values()+(file.rowID,))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return job
- except:
- # error report
- type, value, traceBack = sys.exc_info()
- _logger.error("resetDefinedJobs : %s %s" % (type,value))
- #_logger.error(traceback.format_exc())
- # roll back
- self._rollback()
- return None
-
-
- # kill job
- def killJob(self,pandaID,user,code,prodManager):
- comment = ' /* DBProxy.killJob */'
- _logger.debug("killJob : %s %s %s %s" % (code,pandaID,prodManager,user))
- # check PandaID
- try:
- long(pandaID)
- except:
- _logger.error("not an integer : %s" % pandaID)
- return False
- sql0 = "SELECT prodUserID FROM %s WHERE PandaID=%s"
- sql1 = "UPDATE %s SET commandToPilot='tobekilled' WHERE PandaID=%s AND commandToPilot<>'tobekilled'"
- sql2 = "SELECT %s " % JobSpec.columnNames()
- sql2+= "FROM %s WHERE PandaID=%s AND jobStatus<>'running'"
- sql3 = "DELETE FROM %s WHERE PandaID=%s"
- sqlU = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')"
- sql4 = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames()
- sql4+= JobSpec.valuesExpression()
- try:
- flagCommand = False
- flagKilled = False
- # begin transaction
- self.cur.execute("START TRANSACTION")
- for table in ('jobsDefined4','jobsActive4','jobsWaiting4'):
- # get DN if user is not production DN
- if (not prodManager) and (not user.startswith('/DC=org/DC=doegrids/OU=People/CN=Nurcan Ozturk')) \
- and (not user.startswith('/DC=org/DC=doegrids/OU=People/CN=Torre Wenaus')):
- self.cur.execute((sql0+comment) % (table,pandaID))
- res = self.cur.fetchone()
- # not found
- if res == None:
- continue
- # owner?
- def getCN(dn):
- distinguishedName = ''
- for line in dn.split('/'):
- if line.startswith('CN='):
- distinguishedName = re.sub('^CN=','',line)
- distinguishedName = re.sub('\d+$','',distinguishedName)
- distinguishedName = distinguishedName.strip()
- break
- return distinguishedName
- cn1 = getCN(res[0])
- cn2 = getCN(user)
- _logger.debug("Owner:%s - Requester:%s " % (cn1,cn2))
- if cn1 != cn2:
- _logger.debug("ignore killJob -> Owner != Requester")
- break
- # update
- retU = self.cur.execute((sql1+comment) % (table,pandaID))
- if retU == 0:
- continue
- # set flag
- flagCommand = True
- # select
- self.cur.execute((sql2+comment) % (table,pandaID))
- res = self.cur.fetchall()
- if len(res) == 0:
- continue
- # instantiate JobSpec
- job = JobSpec()
- job.pack(res[0])
- # delete
- if table=='jobsDefined4':
- retD = self.cur.execute((sqlU+comment) % (pandaID,))
- else:
- retD = self.cur.execute((sql3+comment) % (table,pandaID))
- if retD == 0:
- continue
- # error code
- job.jobStatus = 'failed'
- job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- job.modificationTime = job.endTime
- job.stateChangeTime = job.modificationTime
- if code in ['2','4']:
- # expire
- if code == '2':
- job.taskBufferErrorCode = ErrorCode.EC_Expire
- job.taskBufferErrorDiag = 'expired after 7 days since submission'
- else:
- # waiting timeout
- job.taskBufferErrorCode = ErrorCode.EC_Expire
- #job.taskBufferErrorCode = ErrorCode.EC_WaitTimeout
- job.taskBufferErrorDiag = 'expired after waiting for input data for 2 days'
- elif code=='3':
- # aborted
- job.taskBufferErrorCode = ErrorCode.EC_Aborted
- job.taskBufferErrorDiag = 'aborted by ExtIF'
- else:
- # killed
- job.taskBufferErrorCode = ErrorCode.EC_Kill
- job.taskBufferErrorDiag = 'killed by %s' % user
- # insert
- self.cur.execute(sql4+comment, job.values())
- flagKilled = True
- break
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("killJob : com=%s kill=%s " % (flagCommand,flagKilled))
- return (flagCommand or flagKilled)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("killJob : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
-
- # peek at job
- def peekJob(self,pandaID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal=False):
- comment = ' /* DBProxy.peekJob */'
- _logger.debug("peekJob : %s" % pandaID)
- # return None for NULL PandaID
- if pandaID in ['NULL','','None',None]:
- return None
- sql1_0 = "SELECT %s FROM %s "
- sql1_1 = "WHERE PandaID=%s"
- try:
- tables=[]
- if fromActive:
- tables.append('jobsActive4')
- if fromArchived:
- tables.append('jobsArchived4')
- if fromWaiting:
- tables.append('jobsWaiting4')
- if fromDefined:
- # defined needs to be the last one due to InnoDB's auto_increment
- tables.append('jobsDefined4')
- # select
- for table in tables:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1
- self.cur.execute(sql+comment, (pandaID,))
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if len(res) != 0:
- # Job
- job = JobSpec()
- job.pack(res[0])
- # Files
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=%s"
- self.cur.execute(sqlFile+comment, (job.PandaID,))
- resFs = self.cur.fetchall()
- # metadata
- if table == 'jobsArchived4' and (not forAnal):
- # read metadata only for finished/failed jobs
- sqlMeta = "SELECT metaData FROM metaTable WHERE PandaID=%s"
- self.cur.execute(sqlMeta+comment, (job.PandaID,))
- resMeta = self.cur.fetchone()
- else:
- resMeta = None
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # set files
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- job.addFile(file)
- # set metadata
- if resMeta != None:
- job.metadata = resMeta[0]
- return job
- _logger.debug("peekJob() : PandaID %s not found" % pandaID)
- return None
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("peekJob : %s %s" % (type,value))
- # return None for analysis
- if forAnal:
- return None
- # return 'unknown'
- job = JobSpec()
- job.PandaID = pandaID
- job.jobStatus = 'unknown'
- return job
-
-
- # get JobIDs in a time range
- def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs):
- comment = ' /* DBProxy.getJobIDsInTimeRange */'
- _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))
- try:
- tables = ['jobsArchived4','jobsActive4','jobsWaiting4','jobsDefined4']
- # select
- for table in tables:
- # make sql
- sql = "SELECT jobDefinitionID FROM %s " % table
- sql += "WHERE prodUserID=%s AND modificationTime>%s AND prodSourceLabel='user'"
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- _logger.debug(sql+comment+str((dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))))
- self.cur.execute(sql+comment, (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID, in resList:
- if not tmpID in retJobIDs:
- retJobIDs.append(tmpID)
- _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs))
- return retJobIDs
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobIDsInTimeRange : %s %s" % (type,value))
- # return empty list
- return []
-
-
- # get PandaIDs for a JobID
- def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs):
- comment = ' /* DBProxy.getPandIDsWithJobID */'
- _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID))
- try:
- tables = ['jobsArchived4','jobsActive4','jobsWaiting4','jobsDefined4']
- # select
- for table in tables:
- # skip if all jobs have already been gotten
-                # skip if all jobs have already been retrieved
- continue
- # make sql
- sql = "SELECT PandaID,jobStatus,commandToPilot FROM %s " % table
- sql += "WHERE prodUserID=%s AND jobDefinitionID=%s "
- sql += "AND prodSourceLabel in ('user','panda') "
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- _logger.debug(sql+comment+str((dn,jobID)))
- self.cur.execute(sql+comment, (dn,jobID))
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID,tmpStatus,tmpCommand in resList:
- if not idStatus.has_key(tmpID):
- idStatus[tmpID] = (tmpStatus,tmpCommand)
- _logger.debug("getPandIDsWithJobID : %s" % str(idStatus))
- return idStatus
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandIDsWithJobID : %s %s" % (type,value))
- # return empty list
- return {}
-
-
- # query PandaID
- def queryPandaID(self,jobDefID):
- comment = ' /* DBProxy.queryPandaID */'
- _logger.debug("queryPandaID : %s" % jobDefID)
- sql0 = "SELECT PandaID,attemptNr FROM %s WHERE attemptNr=("
- sql0+= "SELECT MAX(attemptNr) FROM %s"
- sql1= " WHERE prodSourceLabel=%s AND jobDefinitionID=%s) AND prodSourceLabel=%s AND jobDefinitionID=%s"
- try:
- ids = []
- # select
- for table in ['jobsDefined4','jobsActive4','jobsArchived4','jobsWaiting4']:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = sql0 % (table,table) + sql1
- self.cur.execute(sql+comment, ('managed',jobDefID,'managed',jobDefID))
- res = self.cur.fetchall()
- ids += list(res)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # look for the latest attempt
- preAtt =-1
- pandaID=None
- for pID,att in ids:
- if att > preAtt:
- pandaID = pID
- preAtt = att
- if att == preAtt:
- if pandaID < pID:
- pandaID = pID
- return pandaID
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("queryPandaID : %s %s" % (type,value))
- # roll back
- self._rollback()
- return None
-
-
- # query job info per cloud
- def queryJobInfoPerCloud(self,cloud,schedulerID=None):
- comment = ' /* DBProxy.queryJobInfoPerCloud */'
- _logger.debug("queryJobInfoPerCloud : %s %s" % (cloud,schedulerID))
- attrs = ['PandaID','jobStatus','jobName']
- sql0 = "SELECT "
- for attr in attrs:
- sql0 += "%s," % attr
- sql0 = "%s " % sql0[:-1]
- sql0+= "FROM %s "
- sql0+= "WHERE cloud='%s' " % cloud
- if schedulerID != None:
- sql0+= "AND schedulerID='%s' " % schedulerID
- try:
- ids = []
- returnList = []
- # select
- for table in ['jobsActive4','jobsWaiting4','jobsDefined4']:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = sql0 % table
- self.cur.execute(sql+comment)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # loop over all
- for res in resList:
- valMap = {}
- # skip if already in the list
- PandaID = res[0]
- if PandaID in ids:
- continue
- # convert to map
- for idx,attr in enumerate(attrs):
- valMap[attr] = res[idx]
- # append to list
- ids.append(PandaID)
- returnList.append(valMap)
- # return
- return returnList
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("queryJobInfoPerCloud : %s %s" % (type,value))
- # roll back
- self._rollback()
- return None
-
-
- # get PandaIDs at Site
- def getPandaIDsSite(self,site,status,limit):
- comment = ' /* DBProxy.getPandaIDsSite */'
- _logger.debug("getPandaIDsSite : %s %s %s" % (site,status,limit))
- try:
- ids = []
- # find table
- if status in ['defined','assigned']:
- table = 'jobsDefined4'
-            elif status in ['activated','running','holding','transferring']:
- table = 'jobsActive4'
- elif status in ['waiting']:
- table = 'jobsWaiting4'
- elif status in ['finished','failed']:
- table = 'jobsArchived4'
- else:
- _logger.error("unknown status:%s" % status)
- return ids
- # limit
- limit = int(limit)
- # SQL
- sql = "SELECT PandaID FROM %s " % table
- sql += "WHERE computingSite=%s AND jobStatus=%s AND prodSourceLabel=%s "
- sql += "LIMIT %d" % limit
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql+comment, (site,status,'managed'))
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # convert to list
- for id, in res:
- ids.append(id)
- return ids
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandaIDsSite : %s %s" % (type,value))
- # roll back
- self._rollback()
- return []
-
-
- # get PandaIDs to be updated in prodDB
- def getPandaIDsForProdDB(self,limit,lockedby):
- comment = ' /* DBProxy.getPandaIDsForProdDB */'
- _logger.debug("getPandaIDsForProdDB %s" % limit)
- sql0 = "SELECT PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID FROM %s "
- sql0+= "WHERE prodSourceLabel IN ('managed','rc_test') AND lockedby='%s' " % lockedby
- sql0+= "AND stateChangeTime>prodDBUpdateTime AND stateChangeTime<>'0000-00-00 00:00:00'"
- try:
- retMap = {}
- totalIDs = 0
- # select
- for table in ['jobsArchived4','jobsActive4','jobsWaiting4','jobsDefined4']:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = sql0 % table
- self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- for PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID in res:
- # ignore dummy jobs in jobsDefined4
- if table == 'jobsDefined4' and (not jobStatus in ['defined','assigned']):
- continue
- # add status
- if not retMap.has_key(jobStatus):
- retMap[jobStatus] = []
- # append
- retMap[jobStatus].append({'PandaID':PandaID,'attemptNr':attemptNr,
- 'stateChangeTime':stateChangeTime.strftime('%Y-%m-%d %H:%M:%S'),
- 'jobDefinitionID':jobDefinitionID,
- 'jobExecutionID':jobExecutionID})
- totalIDs += 1
- # limit
- if totalIDs > limit:
- break
- _logger.debug("getPandaIDsForProdDB %s ret->%s" % (limit,totalIDs))
- return retMap
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandaIDsForProdDB : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # update prodDBUpdateTime
- def updateProdDBUpdateTime(self,param):
- comment = ' /* DBProxy.updateProdDBUpdateTime */'
- _logger.debug("updateProdDBUpdateTime %s" % str(param))
- sql0 = "UPDATE %s "
- sql0+= "SET prodDBUpdateTime='%s' " % param['stateChangeTime']
- sql0+= "WHERE PandaID=%s AND jobStatus='%s' AND stateChangeTime='%s'" % (param['PandaID'],
- param['jobStatus'],
- param['stateChangeTime'])
- try:
- if param['jobStatus'] in ['defined','assigned']:
- table = 'jobsDefined4'
- elif param['jobStatus'] in ['waiting']:
- table = 'jobsWaiting4'
- elif param['jobStatus'] in ['activated','sent','starting','running','holding','transferring']:
- table = 'jobsActive4'
- elif param['jobStatus'] in ['finished','failed']:
- table = 'jobsArchived4'
- else:
- _logger.error("invalid status %s" % param['jobStatus'])
- return False
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # update
- sql = sql0 % table
- _logger.debug(sql)
- retU = self.cur.execute(sql+comment)
- _logger.debug("updateProdDBUpdateTime %s ret=%s" % (param['PandaID'],retU))
- if retU == 1:
- return True
- return False
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("updateProdDBUpdateTime : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
-
- # add metadata
- def addMetadata(self,pandaID,metadata):
- comment = ' /* DBProxy.addMetaData */'
- _logger.debug("addMetaData : %s" % pandaID)
- sql0 = "SELECT PandaID FROM metaTable WHERE PandaID=%s"
- sql1 = "INSERT INTO metaTable (PandaID,metaData) VALUE (%s,%s)"
- nTry=3
- for iTry in range(nTry):
- try:
- # autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql0+comment, (pandaID,))
- res = self.cur.fetchone()
- # already exist
- if res != None:
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- # insert
- self.cur.execute(sql1+comment, (pandaID,metadata))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("addMetaData : %s retry : %s" % (pandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("addMetaData : %s %s" % (type,value))
- return False
-
-
- # insert dataset
- def insertDataset(self,dataset,tablename="Datasets"):
- comment = ' /* DBProxy.insertDataset */'
- _logger.debug("insertDataset(%s)" % dataset.name)
- sql1 = "INSERT INTO %s " % tablename
- sql1+= "(%s) " % DatasetSpec.columnNames()
- sql1+= DatasetSpec.valuesExpression()
- # time information
- dataset.creationdate = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- dataset.modificationdate = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- try:
- # get file lock
- #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_EX)
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # avoid duplication
- self.cur.execute("SELECT vuid FROM "+tablename+" WHERE vuid=%s"+comment, (dataset.vuid,))
- res = self.cur.fetchall()
- if len(res) == 0:
- # insert
- self.cur.execute(sql1+comment, dataset.values())
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # release file lock
- #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN)
- return True
- except:
- # roll back
- self._rollback()
- # release file lock
- #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN)
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("insertDataset() : %s %s" % (type,value))
- return False
-
-
- # query dataset with map
- def queryDatasetWithMap(self,map):
- comment = ' /* DBProxy.queryDatasetWithMap */'
- _logger.debug("queryDatasetWithMap(%s)" % map)
- sql1 = "SELECT %s FROM Datasets" % DatasetSpec.columnNames()
- valueL = []
- for key in map.keys():
- if len(valueL)==0:
- sql1+= " WHERE %s=" % key
- else:
- sql1+= " AND %s=" % key
- sql1+= "%s"
- valueL.append(map[key])
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- nTry=5
- for iTry in range(nTry):
- retS = self.cur.execute(sql1+comment, tuple(valueL))
- res = self.cur.fetchall()
- if retS>=0 and res != None and retS==len(res):
- break
- if iTry+1 < nTry:
- _logger.debug("queryDatasetWithMap : retS %s retry : %s" % (retS,iTry))
- time.sleep(random.randint(10,20))
- _logger.debug("queryDatasetWithMap(%s) : retS %s ret %s" % (str(map),retS,str(res)))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # instantiate Dataset
- if res != None and len(res) != 0:
- dataset = DatasetSpec()
- dataset.pack(res[0])
- return dataset
- _logger.error("queryDatasetWithMap(%s) : dataset not found" % map)
- return None
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("queryDatasetWithMap(%s) : %s %s" % (map,type,value))
- return None
-
-
- # update dataset
- def updateDataset(self,datasets,withLock,withCriteria):
- comment = ' /* DBProxy.updateDataset */'
- _logger.debug("updateDataset()")
- sql1 = "UPDATE Datasets SET %s " % DatasetSpec.updateExpression()
- sql1+= "WHERE vuid=%s"
- if withCriteria != "":
- sql1+= " AND %s" % withCriteria
- nTry=3
- for iTry in range(nTry):
- try:
- # get file lock
- if withLock:
- fcntl.flock(_lockSetDS.fileno(), fcntl.LOCK_EX)
- retList = []
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- for dataset in datasets:
- _logger.debug("updateDataset(%s,%s)" % (dataset.name,dataset.status))
- # time information
- dataset.modificationdate = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
- # update
- retU = self.cur.execute(sql1+comment, dataset.values()+(dataset.vuid,))
- if retU != 0 and retU != 1:
-                        raise RuntimeError, 'Invalid return %s' % retU
- retList.append(retU)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # release file lock
- if withLock:
- fcntl.flock(_lockSetDS.fileno(), fcntl.LOCK_UN)
- _logger.debug("updateDataset() ret:%s" % retList)
- return retList
- except:
- # roll back
- self._rollback()
- # release file lock
- if withLock:
- fcntl.flock(_lockSetDS.fileno(), fcntl.LOCK_UN)
- if iTry+1 < nTry:
- _logger.debug("updateDataset : retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("updateDataset() : %s %s" % (type,value))
- return []
-
-
- # delete dataset
- def deleteDataset(self,name):
- comment = ' /* DBProxy.deleteDataset */'
- sql1 = "DELETE FROM Datasets WHERE name=%s"
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # delete
- self.cur.execute(sql1+comment,(name,))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("deleteDataset() : %s %s" % (type,value))
- return False
-
-
- # get serial number for dataset, insert dummy datasets to increment SN
- def getSerialNumber(self,datasetname):
- comment = ' /* DBProxy.getSerialNumber */'
- try:
- _logger.debug("getSerialNumber(%s)" % datasetname)
- # get file lock
- #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_EX)
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = "SELECT COUNT(*) FROM Datasets WHERE type='output' AND name='%s'" % datasetname
- nTry=3
- for iTry in range(nTry):
- retS = self.cur.execute(sql+comment)
- res = self.cur.fetchone()
- _logger.debug("getSerialNumber : retS %s, res %s" % (retS,res))
- if retS>=0 and res != None:
- break
- if iTry+1 < nTry:
- time.sleep(random.randint(10,20))
- # fresh dataset or not
- if res != None and len(res) != 0 and res[0] > 0:
- freshFlag = False
- else:
- freshFlag = True
- # get serial number
- sql = "INSERT INTO subCounter (subID) VALUES ('NULL')"
- self.cur.execute(sql+comment)
- sn = self.conn.insert_id()
- # delete older rows; '<' keeps the latest row so the InnoDB auto_increment value is not reused
- sql = "DELETE FROM subCounter where subID<%s" % sn
- self.cur.execute(sql+comment)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # release file lock
- #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN)
- _logger.debug("getSerialNumber : %s %s" % (sn,freshFlag))
- return (sn,freshFlag)
- except:
- # roll back
- self._rollback()
- # release file lock
- #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN)
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getSerialNumber() : %s %s" % (type,value))
- return (-1,False)
-
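# A minimal standalone sketch of the sequence-emulation pattern used in
# getSerialNumber() above: insert a dummy row into the auto_increment table
# subCounter, read back the generated value, and prune older rows. The
# connection parameters are placeholders; only the table and column names are
# taken from the code above.
import MySQLdb

def next_serial(conn):
    cur = conn.cursor()
    # advance the auto_increment counter with a dummy row
    cur.execute("INSERT INTO subCounter (subID) VALUES (NULL)")
    sn = conn.insert_id()
    # prune older rows; keeping the newest one preserves the InnoDB counter
    cur.execute("DELETE FROM subCounter WHERE subID<%s", (sn,))
    conn.commit()
    return sn

conn = MySQLdb.connect(host='dbhost.example.org', user='panda', passwd='***', db='PandaDB')
sn = next_serial(conn)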
-
- # update transfer status for a dataset
- def updateTransferStatus(self,datasetname,bitMap):
- comment = ' /* DBProxy.updateTransferStatus */'
- try:
- _logger.debug("updateTransferStatus(%s,%s)" % (datasetname,hex(bitMap)))
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- retTransSt = 0
- # update bitmap
- sqlU = "UPDATE Datasets SET transferStatus=transferStatus|%s WHERE name='%s'" % (bitMap,datasetname)
- retU = self.cur.execute(sqlU+comment)
- # get transferStatus
- sqlS = "SELECT transferStatus from Datasets WHERE name='%s'" % datasetname
- retS = self.cur.execute(sqlS+comment)
- resS = self.cur.fetchall()
- if resS != None and len(resS) != 0:
- retTransSt = resS[0][0]
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("updateTransferStatus : %s" % hex(retTransSt))
- return retTransSt
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("updateTransferStatus : %s %s" % (type,value))
- return 0
-
-
- # get CloudTask. If not exist, create it
- def getCloudTask(self,tid):
- comment = ' /* getCloudTask */'
- try:
- _logger.debug("getCloudTask(%s)" % tid)
- # check tid
- if tid in [None,'NULL']:
- _logger.error("invalid TID : %s" % tid)
- return None
- # get file lock
- fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_EX)
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = "SELECT %s FROM cloudtasks " % CloudTaskSpec.columnNames()
- sql += "WHERE taskid=%s" % tid
- nTry=5
- for iTry in range(nTry):
- retS = self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- _logger.debug("getCloudTask : retS %s" % retS)
- if retS>=0 and res != None and retS==len(res):
- break
- if iTry+1 < nTry:
- time.sleep(random.randint(10,20))
- # already exist
- if res != None and len(res) != 0:
- # instantiate CloudTask
- cloudTask = CloudTaskSpec()
- cloudTask.pack(res[0])
- # update tmod if status != 'assigned'
- if cloudTask.status != 'assigned':
- sql = "UPDATE cloudtasks SET tmod=UTC_TIMESTAMP() WHERE taskid=%s" % cloudTask.taskid
- self.cur.execute(sql+comment)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # release file lock
- fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_UN)
- _logger.debug("return existing CloudTask")
- return cloudTask
- # insert new CloudTask
- _logger.debug("insert new CloudTask")
- cloudTask = CloudTaskSpec()
- cloudTask.taskid = tid
- cloudTask.status = 'defined'
- sql = "INSERT INTO cloudtasks (taskid,status,tmod,tenter) VALUES(%s,%s,UTC_TIMESTAMP(),UTC_TIMESTAMP())"
- self.cur.execute(sql+comment,(cloudTask.taskid,cloudTask.status))
- # get id
- cloudTask.id = self.conn.insert_id()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # release file lock
- fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_UN)
- _logger.debug("return new CloudTask")
- return cloudTask
- except:
- # roll back
- self._rollback()
- # release file lock
- fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_UN)
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getCloudTask() : %s %s" % (type,value))
- return None
-
-
- # set cloud to CloudTask
- def setCloudTask(self,cloudTask):
- comment = ' /* setCloudTask */'
- try:
- _logger.debug("setCloudTask(id=%s,taskid=%s)" % (cloudTask.id,cloudTask.taskid))
- sql = "UPDATE cloudtasks SET cloud=%s,status=%s,tmod=UTC_TIMESTAMP() WHERE id=%s AND status='defined'"
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # update
- retU = self.cur.execute(sql+comment,(cloudTask.cloud,'assigned',cloudTask.id))
- # succeeded
- if retU != 0:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return cloudTask
- # read if it is already set by another thread
- sql = "SELECT %s FROM cloudtasks " % CloudTaskSpec.columnNames()
- sql += "WHERE id=%s" % cloudTask.id
- # select
- retS = self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return CloudTask
- if res != None and len(res) != 0:
- # instantiate CloudTask
- cloudTask = CloudTaskSpec()
- cloudTask.pack(res[0])
- return cloudTask
- _logger.error("setCloudTask() : cannot find CloudTask for %s" % cloudTask.id)
- return None
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("setCloudTask() : %s %s" % (type,value))
- return None
-
-
- # see CloudTask
- def seeCloudTask(self,tid):
- comment = ' /* seeCloudTask */'
- try:
- _logger.debug("seeCloudTask(%s)" % tid)
- # check tid
- if tid in [None,'NULL']:
- _logger.error("invalid TID : %s" % tid)
- return None
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = "SELECT cloud FROM cloudtasks WHERE taskid=%s" % tid
- nTry=5
- for iTry in range(nTry):
- retS = self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- _logger.debug("seeCloudTask : retS %s" % retS)
- if retS>=0 and res != None and retS==len(res):
- break
- if iTry+1 < nTry:
- time.sleep(random.randint(10,20))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # existing task
- if res != None and len(res) != 0:
- # return cloud
- return res[0][0]
- else:
- return None
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("seeCloudTask() : %s %s" % (type,value))
- return None
-
-
- # get assigning task
- def getAssigningTask(self):
- comment = ' /* getAssigningTask */'
- try:
- _logger.debug("getAssigningTask")
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = "SELECT taskid FROM cloudtasks WHERE status<>'assigned' AND tmod>'%s'" % timeLimit.strftime('%Y-%m-%d %H:%M:%S')
- self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # loop over all taskid
- retList = []
- if res != None:
- for tid, in res:
- retList.append(tid)
- # return
- _logger.debug("getAssigningTask ret:%s" % retList)
- return retList
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getAssigningTask : %s %s" % (type,value))
- return []
-
-
- # query files with map
- def queryFilesWithMap(self,map):
- comment = ' /* DBProxy.queryFilesWithMap */'
- _logger.debug("queryFilesWithMap()")
- sql1 = "SELECT PandaID,%s FROM filesTable4" % FileSpec.columnNames()
- valueL = []
- for key in map.keys():
- if len(valueL)==0:
- sql1+= " WHERE %s=" % key
- else:
- sql1+= " AND %s=" % key
- sql1+= "%s"
- valueL.append(map[key])
- nTry=3
- for iTry in range(nTry):
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql1+comment, tuple(valueL))
- res = self.cur.fetchall()
- _logger.debug("queryFilesWithMap() : %s" % str(res))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # instantiate files
- retList = []
- for item in res:
- # instantiate dummy JobSpec obj for PandaID
- job = JobSpec()
- job.PandaID = item[0]
- # instantiate file
- file = FileSpec()
- file.pack(item[1:])
- # set owner
- file.setOwner(job)
- # append
- retList.append(file)
- return retList
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("queryFilesWithMap retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("queryFilesWithMap : %s %s" % (type,value))
- return []
-
-
- # count the number of files with map
- def countFilesWithMap(self,map):
- comment = ' /* DBProxy.countFilesWithMap */'
- sql1 = "SELECT COUNT(*) FROM filesTable4"
- valueL = []
- for key in map.keys():
- if len(valueL)==0:
- sql1+= " WHERE %s=" % key
- else:
- sql1+= " AND %s=" % key
- sql1+= "%s"
- valueL.append(map[key])
- nTry=3
- for iTry in range(nTry):
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- _logger.debug("countFilesWithMap() : %s" % str(map))
- retS = self.cur.execute(sql1+comment, tuple(valueL))
- res = self.cur.fetchone()
- _logger.debug("countFilesWithMap() : %s %s" % (retS,str(res)))
- # check return
- if retS != 1:
- raise RuntimeError, 'Invalid return'
- nFiles=0
- if res != None:
- nFiles=res[0]
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return nFiles
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("countFilesWithMap() retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("countFilesWithMap(%s) : %s %s" % (map,type,value))
- return -1
-
-
- # update input files and return corresponding PandaIDs
- def updateInFilesReturnPandaIDs(self,dataset,status):
- comment = ' /* DBProxy.updateInFilesReturnPandaIDs */'
- _logger.debug("updateInFilesReturnPandaIDs(%s)" % dataset)
- sql0 = "SELECT rowID,PandaID FROM filesTable4 WHERE status<>%s AND dispatchDBlock=%s"
- for iTry in range(self.nTry):
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- retS = self.cur.execute(sql0+comment, (status,dataset))
- resS = self.cur.fetchall()
- _logger.debug("updateInFilesReturnPandaIDs : retS %s" % retS)
- if retS<0 or resS==None or retS!=len(resS):
- raise RuntimeError, 'SQL error'
- # avoid too long expression
- nDiv = 10
- nRow,tmpMod = divmod(len(resS),nDiv)
- if tmpMod != 0:
- nRow += 1
- # update
- retList = []
- for iRow in range(nRow):
- rows = []
- pandaIDs = []
- for tmpRowID,tmpPandaID in resS[iRow*nDiv:(iRow+1)*nDiv]:
- rows.append(tmpRowID)
- if not tmpPandaID in pandaIDs:
- pandaIDs.append(tmpPandaID)
- # make SQL query
- sql1 = "UPDATE filesTable4 SET status=%s WHERE "
- for row in rows:
- if row != rows[0]:
- sql1+= "OR "
- sql1+= "rowID=%s "
- # update
- retU = self.cur.execute(sql1+comment, tuple([status]+rows))
- _logger.debug("updateInFilesReturnPandaIDs : retU %s" % retU)
- # append
- for tmpPandaID in pandaIDs:
- if not tmpPandaID in retList:
- retList.append(tmpPandaID)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- _logger.debug("updateInFilesReturnPandaIDs : %s" % str(retList))
- return retList
- except:
- # roll back
- self._rollback()
- # error report
- if iTry+1 < self.nTry:
- _logger.debug("updateInFilesReturnPandaIDs retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("updateInFilesReturnPandaIDs : %s %s" % (type, value))
- return []
-
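# The update methods above process the selected rows nDiv at a time so that
# the generated "rowID=... OR rowID=..." clause never gets too long. The same
# chunking arithmetic, shown standalone (the helper name is illustrative):
def chunks(items, nDiv=10):
    nChunk, rest = divmod(len(items), nDiv)
    if rest != 0:
        nChunk += 1
    return [items[i * nDiv:(i + 1) * nDiv] for i in range(nChunk)]

# chunks(range(23)) -> three groups holding 10, 10 and 3 row IDs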
-
- # update output files and return corresponding PandaIDs
- def updateOutFilesReturnPandaIDs(self,dataset):
- comment = ' /* DBProxy.updateOutFilesReturnPandaIDs */'
- _logger.debug("updateOutFilesReturnPandaIDs(%s)" % dataset)
- sql0 = "SELECT rowID,PandaID FROM filesTable4 WHERE destinationDBlock=%s AND status='transferring'"
- for iTry in range(self.nTry):
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- retS = self.cur.execute(sql0+comment, (dataset,))
- resS = self.cur.fetchall()
- _logger.debug("updateOutFilesReturnPandaIDs : retS %s" % retS)
- if retS<0 or resS==None or retS!=len(resS):
- raise RuntimeError, 'SQL error'
- # avoid too long expression
- nDiv = 10
- nRow,tmpMod = divmod(len(resS),nDiv)
- if tmpMod != 0:
- nRow += 1
- # update
- retList = []
- for iRow in range(nRow):
- rows = []
- pandaIDs = []
- for tmpRowID,tmpPandaID in resS[iRow*nDiv:(iRow+1)*nDiv]:
- rows.append(tmpRowID)
- if not tmpPandaID in pandaIDs:
- pandaIDs.append(tmpPandaID)
- # make SQL query
- sql1 = "UPDATE filesTable4 SET status=%s WHERE "
- for row in rows:
- if row != rows[0]:
- sql1+= "OR "
- sql1+= "rowID=%s "
- # update
- retU = self.cur.execute(sql1+comment, tuple(['ready']+rows))
- _logger.debug("updateOutFilesReturnPandaIDs : retU %s" % retU)
- # append
- for tmpPandaID in pandaIDs:
- if not tmpPandaID in retList:
- retList.append(tmpPandaID)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- _logger.debug("updateOutFilesReturnPandaIDs : %s" % str(retList))
- return retList
- except:
- # roll back
- self._rollback()
- # error report
- if iTry+1 < self.nTry:
- _logger.debug("updateOutFilesReturnPandaIDs retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("updateOutFilesReturnPandaIDs : %s %s" % (type, value))
- return []
-
-
- # set GUIDs
- def setGUIDs(self,files):
- comment = ' /* DBProxy.setGUIDs */'
- _logger.debug("setGUIDs(%s)" % files)
- sql0 = "UPDATE filesTable4 SET GUID=%s WHERE lfn=%s"
- for iTry in range(self.nTry):
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # update
- for file in files:
- retU = self.cur.execute(sql0+comment, (file['guid'],file['lfn']))
- _logger.debug("setGUIDs : retU %s" % retU)
- if retU<0:
- raise RuntimeError, 'SQL error'
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- # error report
- if iTry+1 < self.nTry:
- _logger.debug("setGUIDs retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("setGUIDs : %s %s" % (type, value))
- return False
-
-
- # query PandaID with Datasets
- def queryPandaIDwithDataset(self,datasets):
- comment = ' /* DBProxy.queryPandaIDwithDataset */'
- _logger.debug("queryPandaIDwithDataset(%s)" % datasets)
- if len(datasets) == 0:
- return []
- # make SQL query
- sql1 = "SELECT PandaID FROM filesTable4 WHERE "
- for dataset in datasets:
- if dataset != datasets[0]:
- sql1+= "OR "
- sql1+= "destinationDBlock='%s' " % dataset
- sql1+= "GROUP BY PandaID"
- # execute
- for iTry in range(self.nTry):
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql1+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retList = []
- for r in res:
- retList.append(r[0])
- # return
- _logger.debug("queryPandaIDwithDataset : %s" % str(retList))
- return retList
- except:
- # roll back
- self._rollback()
- # error report
- if iTry+1 < self.nTry:
- _logger.debug("queryPandaIDwithDataset retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("queryPandaIDwithDataset : %s %s" % (type, value))
- return []
-
-
- # query last files in datasets
- def queryLastFilesInDataset(self,datasets):
- comment = ' /* DBProxy.queryLastFilesInDataset */'
- _logger.debug("queryLastFilesInDataset(%s)" % datasets)
- if len(datasets) == 0:
- return []
- # make SQL query
- sql1 = "SELECT MAX(PandaID) FROM filesTable4 WHERE dataset=%s AND type='output'"
- sql2 = "SELECT lfn FROM filesTable4 WHERE PandaID=%s AND type='output'"
- # execute
- try:
- retMap = {}
- for dataset in datasets:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select PandaID
- self.cur.execute(sql1+comment,(dataset,))
- res = self.cur.fetchone()
- # found
- retList = []
- if res != None:
- pandaID = res[0]
- # select LFNs
- self.cur.execute(sql2+comment,(pandaID,))
- res = self.cur.fetchall()
- for r in res:
- retList.append(r[0])
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- retMap[dataset] = retList
- # return
- _logger.debug("queryLastFilesInDataset : %s" % str(retMap))
- return retMap
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("queryLastFilesInDataset : %s %s" % (type, value))
- return {}
-
-
- # query PandaID with filenames
- def queryPandaIDwithLFN(self,vlfns):
- comment = ' /* DBProxy.queryPandaIDwithLFN */'
- _logger.debug("queryPandaIDwithLFN(%s)" % vlfns)
- if len(vlfns) == 0:
- return []
- # avoid too long expression
- nDiv = 15
- nLFN,tmpMod = divmod(len(vlfns),nDiv)
- if tmpMod != 0:
- nLFN += 1
- # execute
- retList = []
- for iLFN in range(nLFN):
- lfns = vlfns[iLFN*nDiv:(iLFN+1)*nDiv]
- # make SQL query
- sql1 = "SELECT PandaID FROM filesTable4 WHERE "
- for lfn in lfns:
- if lfn != lfns[0]:
- sql1+= "OR "
- sql1+= "lfn=%s "
- sql1+= "GROUP BY PandaID"
- # get generic LFNs
- gLFNs = []
- for lfn in lfns:
- gLFNs.append(re.sub('\.\d+$','',lfn))
- # try
- for iTry in range(self.nTry):
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql1+comment, tuple(gLFNs))
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append IDs
- for r in res:
- if not r[0] in retList:
- retList.append(r[0])
- break
- except:
- # roll back
- self._rollback()
- # error report
- if iTry+1 < self.nTry:
- _logger.debug("queryPandaIDwithLFN retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("queryPandaIDwithLFN : %s %s" % (type, value))
- return []
- # return
- _logger.debug("queryPandaIDwithLFN : %s" % str(retList))
- return retList
-
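# queryPandaIDwithLFN() above matches on "generic" LFNs, i.e. with any
# trailing ".<digits>" suffix removed. A small sketch of that conversion
# (the file names are made up):
import re

def generic_lfn(lfn):
    return re.sub(r'\.\d+$', '', lfn)

assert generic_lfn('EVNT.017000._00001.pool.root.2') == 'EVNT.017000._00001.pool.root'
assert generic_lfn('EVNT.017000._00001.pool.root') == 'EVNT.017000._00001.pool.root'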
-
- # get job statistics
- def getJobStatistics(self,archived=False,predefined=False):
- comment = ' /* DBProxy.getJobStatistics */'
- _logger.debug("getJobStatistics()")
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
- sql0 = "SELECT computingSite,jobStatus,COUNT(*) FROM %s WHERE prodSourceLabel in ('managed','rc_test','user','panda','ddm') "
- if predefined:
- sql0 += "AND relocationFlag=1 "
- sql0 += "GROUP BY computingSite,jobStatus"
- sqlA = "SELECT computingSite,jobStatus,COUNT(*) FROM jobsArchived4 WHERE modificationTime>'%s' AND prodSourceLabel in ('managed','rc_test','user','panda','ddm') " \
- % (timeLimit.strftime('%Y-%m-%d %H:%M:%S'))
- if predefined:
- sqlA += "AND relocationFlag=1 "
- sqlA += "GROUP BY computingSite,jobStatus"
- tables = ['jobsActive4','jobsDefined4']
- if archived:
- tables.append('jobsArchived4')
- ret = {}
- nTry=3
- for iTry in range(nTry):
- try:
- for table in tables:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- if table != 'jobsArchived4':
- self.cur.execute((sql0+comment) % table)
- else:
- self.cur.execute(sqlA+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for item in res:
- if not ret.has_key(item[0]):
- ret[item[0]] = {}
- if not ret[item[0]].has_key(item[1]):
- ret[item[0]][item[1]] = 0
- ret[item[0]][item[1]] += item[2]
- # for zero
- stateList = ['assigned','activated','running']
- if archived:
- stateList += ['finished','failed']
- for site in ret.keys():
- for state in stateList:
- if not ret[site].has_key(state):
- ret[site][state] = 0
- # return
- _logger.debug("getJobStatistics() : %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("getJobStatistics() retry : %s" % iTry)
- time.sleep(2)
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobStatistics : %s %s" % (type, value))
- return {}
-
-
- # get job statistics for brokerage
- def getJobStatisticsBrokerage(self):
- comment = ' /* DBProxy.getJobStatisticsBrokerage */'
- _logger.debug("getJobStatisticsBrokerage()")
- sql0 = "SELECT computingSite,jobStatus,processingType,COUNT(*) FROM %s WHERE prodSourceLabel IN ('managed','rc_test','user','panda','ddm') "
- sql0 += "GROUP BY computingSite,jobStatus,processingType"
- tables = ['jobsActive4','jobsDefined4']
- ret = {}
- nTry=3
- for iTry in range(nTry):
- try:
- for table in tables:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute((sql0+comment) % table)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for computingSite,jobStatus,processingType,count in res:
- # add site
- if not ret.has_key(computingSite):
- ret[computingSite] = {}
- # add processingType
- if not ret[computingSite].has_key(processingType):
- ret[computingSite][processingType] = {}
- # add jobStatus
- if not ret[computingSite][processingType].has_key(jobStatus):
- ret[computingSite][processingType][jobStatus] = count
- # for zero
- for site,siteVal in ret.iteritems():
- for pType,typeVal in siteVal.iteritems():
- for stateItem in ['assigned','activated','running']:
- if not typeVal.has_key(stateItem):
- typeVal[stateItem] = 0
- # return
- return ret
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("getJobStatisticsBrokerage retry : %s" % iTry)
- time.sleep(2)
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobStatisticsBrokerage : %s %s" % (type, value))
- return {}
-
-
- # get computingSite and destinationSE for a dataset
- def getDestSE(self,dsname):
- comment = ' /* DBProxy.getDestSE */'
- _logger.debug("getDestSE(%s)" % dsname)
- sql0 = "SELECT PandaID FROM filesTable4 WHERE destinationDBlock='%s' AND status='transferring' LIMIT 1" % dsname
- sql1 = "SELECT computingSite,destinationSE FROM jobsActive4 WHERE PandaID=%s"
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql0+comment)
- res = self.cur.fetchall()
- # get PandaID
- pandaID = None
- if len(res) != 0:
- pandaID = res[0][0]
- # get computingSite and destinationSE
- destSE = None,None
- if pandaID != None:
- self.cur.execute((sql1+comment) % pandaID)
- res = self.cur.fetchall()
- if len(res) != 0:
- destSE = res[0]
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- _logger.debug("getDestSE(%s) : %s" % (dsname,str(destSE)))
- return destSE
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getDestSE : %s %s" % (type, value))
- return None,None
-
-
- # get destinationDBlockToken for a dataset
- def getDestTokens(self,dsname):
- comment = ' /* DBProxy.getDestTokens */'
- _logger.debug("getDestTokens(%s)" % dsname)
- sql0 = "SELECT destinationDBlockToken FROM filesTable4 WHERE destinationDBlock='%s' LIMIT 1" % dsname
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql0+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- retToken = None
- if len(res) != 0:
- retToken = res[0][0]
- # return
- _logger.debug("getDestTokens(%s) : %s" % (dsname,retToken))
- return retToken
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getDestTokens : %s %s" % (type, value))
- return None
-
-
- # get the number of job for a user
- def getNumberJobsUser(self,dn):
- comment = ' /* DBProxy.getNumberJobsUser */'
- _logger.debug("getNumberJobsUsers(%s)" % dn)
- sql0 = "SELECT COUNT(*) FROM %s WHERE prodUserID='%s' AND prodSourceLabel='user'"
- nTry = 1
- nJob = 0
- for iTry in range(nTry):
- try:
- for table in ('jobsActive4','jobsDefined4'):
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute((sql0+comment) % (table,dn))
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- if len(res) != 0:
- nJob += res[0][0]
- # return
- _logger.debug("getNumberJobsUsers(%s) : %s" % (dn,nJob))
- return nJob
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- time.sleep(2)
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("getNumberJobsUsers : %s %s" % (type, value))
- return 0
-
-
- # get job statistics for ExtIF
- def getJobStatisticsForExtIF(self,sourcetype=None):
- comment = ' /* DBProxy.getJobStatisticsForExtIF */'
- _logger.debug("getJobStatisticsForExtIF()")
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
- if sourcetype == 'analysis':
- sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel in ('user','panda') GROUP BY jobStatus,cloud"
- sqlA = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel in ('user','panda') "
- else:
- sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN ('managed','rc_test') GROUP BY jobStatus,cloud"
- sqlA = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN ('managed','rc_test') "
- sqlA+= "AND modificationTime>'%s' GROUP BY jobStatus,cloud" % (timeLimit.strftime('%Y-%m-%d %H:%M:%S'))
- ret = {}
- try:
- for table in ('jobsActive4','jobsWaiting4','jobsArchived4','jobsDefined4'):
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- if table != 'jobsArchived4':
- self.cur.execute((sql0+comment) % table)
- else:
- self.cur.execute((sqlA+comment) % table)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # change NULL to US for old jobs
- newRes = []
- usMap = {}
- for jobStatus,count,cloud in res:
- if not cloud in ['US','NULL']:
- # append since no conversion is required
- newRes.append((jobStatus,count,cloud))
- else:
- # sum
- if not usMap.has_key(jobStatus):
- usMap[jobStatus] = 0
- usMap[jobStatus] += count
- # append US counts
- for jobStatus,count in usMap.iteritems():
- newRes.append((jobStatus,count,'US'))
- # create map
- for item in newRes:
- # add cloud
- if not ret.has_key(item[2]):
- ret[item[2]] = {}
- # keep the first count seen for each (cloud,jobStatus) pair
- if not ret[item[2]].has_key(item[0]):
- ret[item[2]][item[0]] = item[1]
- # return
- _logger.debug("getJobStatisticsForExtIF() : %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobStatisticsForExtIF : %s %s" % (type, value))
- return {}
-
-
- # get job statistics per processingType
- def getJobStatisticsPerProcessingType(self):
- comment = ' /* DBProxy.getJobStatisticsPerProcessingType */'
- _logger.debug("getJobStatisticsPerProcessingType()")
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
- sql0 = "SELECT jobStatus,COUNT(*),cloud,processingType FROM %s "
- sql0 += "WHERE prodSourceLabel IN ('managed','rc_test') "
- sqlT = "AND modificationTime>'%s' " % timeLimit.strftime('%Y-%m-%d %H:%M:%S')
- sql1 = "GROUP BY jobStatus,cloud,processingType"
- sqlN = sql0 + sql1
- sqlA = sql0 + sqlT + sql1
- ret = {}
- try:
- for table in ('jobsActive4','jobsWaiting4','jobsArchived4','jobsDefined4'):
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- if table == 'jobsArchived4':
- self.cur.execute((sqlA+comment) % table)
- else:
- self.cur.execute((sqlN+comment) % table)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for jobStatus,count,cloud,processingType in res:
- # add cloud
- if not ret.has_key(cloud):
- ret[cloud] = {}
- # add processingType
- if not ret[cloud].has_key(processingType):
- ret[cloud][processingType] = {}
- # keep the first count seen for each (cloud,processingType,jobStatus)
- if not ret[cloud][processingType].has_key(jobStatus):
- ret[cloud][processingType][jobStatus] = count
- # return
- _logger.debug("getJobStatisticsPerProcessingType() : %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobStatisticsPerProcessingType : %s %s" % (type, value))
- return {}
-
-
- # get number of analysis jobs per user
- def getNUserJobs(self,siteName,nJobs):
- comment = ' /* DBProxy.getNUserJobs */'
- _logger.debug("getNUserJobs(%s)" % siteName)
- sql0 = "SELECT prodUserID FROM jobsActive4 WHERE jobStatus='activated' AND prodSourceLabel in ('user','panda') AND computingSite='%s' ORDER BY currentPriority DESC LIMIT %s" % (siteName,nJobs)
- ret = {}
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql0+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for prodUserID, in res:
- if not ret.has_key(prodUserID):
- ret[prodUserID] = 0
- ret[prodUserID] += 1
- # return
- _logger.debug("getNUserJobs() : %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getNUserJobs : %s %s" % (type, value))
- return {}
-
-
- # get number of activated analysis jobs
- def getNAnalysisJobs(self,nProcesses):
- comment = ' /* DBProxy.getNAnalysisJobs */'
- _logger.debug("getNAnalysisJobs(%s)" % nProcesses)
- sql0 = "SELECT computingSite,COUNT(*) FROM jobsActive4 WHERE jobStatus='activated' AND (prodSourceLabel='user' OR prodSourceLabel='panda') GROUP BY computingSite"
- ret = {}
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql0+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for item in res:
- ret[item[0]] = float(item[1])/nProcesses
- # return
- _logger.debug("getNAnalysisJobs() : %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getNAnalysisJobs : %s %s" % (type, value))
- return {}
-
-
- # count pilot requests
- def countPilotRequests(self,ids,prodSourceLabel='None'):
- comment = ' /* DBProxy.countPilotRequests */'
- # prodSourceLabel
- if prodSourceLabel=='user':
- criteria = " AND MESSAGE REGEXP 'user$'"
- else:
- criteria = " AND MESSAGE REGEXP 'None$'"
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
- ret = {}
- try:
- for siteID in ids:
- # begin transaction
- self.cur.execute("START TRANSACTION")
- # select
- sql0 = "SELECT COUNT(*) FROM PANDALOG WHERE Type='getJob' AND BINTIME>'%s'" % \
- timeLimit.strftime('%Y-%m-%d %H:%M:%S')
- sql0+= " AND MESSAGE REGEXP '%s'" % siteID
- sql0+= criteria
- self.cur.execute(sql0+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- ret[siteID] = res[0][0]
- # return
- _logger.debug("countPilotRequests() : %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("countPilotRequests : %s %s" % (type, value))
- # for zero
- for siteID in ids:
- if not ret.has_key(siteID):
- ret[siteID]=0
- return ret
-
-
- # generate pilot token
- def genPilotToken(self,schedulerhost,scheduleruser,schedulerid):
- comment = ' /* DBProxy.genPilotToken */'
- try:
- _logger.debug("genPilotToken(%s,%s,%s)" % (schedulerhost,scheduleruser,schedulerid))
- token = commands.getoutput('uuidgen')
- timeNow = datetime.datetime.utcnow()
- timeExp = timeNow + datetime.timedelta(days=4)
- sql = "INSERT INTO pilottoken (token,schedulerhost,scheduleruser,schedulerid,created,expires) "
- sql += "VALUES (%s,%s,%s,%s,%s,%s)"
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # execute
- self.cur.execute(sql+comment,(token,schedulerhost,scheduleruser,schedulerid,
- timeNow.strftime('%Y-%m-%d %H:%M:%S'),
- timeExp.strftime('%Y-%m-%d %H:%M:%S')))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- retVal = "token=%s,created=%s,expires=%s" % (token,timeNow.strftime('%Y-%m-%d %H:%M:%S'),
- timeExp.strftime('%Y-%m-%d %H:%M:%S'))
- _logger.debug("genPilotToken -> %s" % retVal)
- return retVal
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("genPilotToken : %s %s" % (type, value))
- return None
-
-
- # get list of scheduler users
- def getListSchedUsers(self):
- comment = ' /* DBProxy.getListSchedUsers */'
- try:
- _logger.debug("getListSchedUsers")
- sql = "SELECT token,scheduleruser FROM pilottoken WHERE expires>UTC_TIMESTAMP()"
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # execute
- self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- retVal = {}
- for token,scheduleruser in res:
- retVal[token] = scheduleruser
- _logger.debug("getListSchedUsers->%s" % str(retVal))
- return retVal
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getListSchedUsers : %s %s" % (type, value))
- return {}
-
-
- # wake up connection
- def wakeUp(self):
- for iTry in range(5):
- try:
- # check if the connection is working
- self.conn.ping()
- return
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("wakeUp %d : %s %s" % (iTry,type,value))
- # wait for reconnection
- time.sleep(1)
- self.connect(reconnect=True)
-
-
- # commit
- def _commit(self):
- try:
- self.conn.commit()
- return True
- except:
- _logger.error("commit error")
- return False
-
-
- # rollback
- def _rollback(self):
- try:
- self.conn.rollback()
- return True
- except:
- _logger.error("rollback error")
- return False
-
diff --git a/current/pandaserver/taskbuffer/DBProxyPool.py b/current/pandaserver/taskbuffer/DBProxyPool.py
deleted file mode 100755
index 53aed84bd..000000000
--- a/current/pandaserver/taskbuffer/DBProxyPool.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""
-pool for DBProxies
-
-"""
-
-import inspect
-import Queue
-import OraDBProxy as DBProxy
-import os
-import time
-import random
-from threading import Lock
-from config import panda_config
-from taskbuffer.ConBridge import ConBridge
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('DBProxyPool')
-
-class DBProxyPool:
-
- def __init__(self,dbhost,dbpasswd,nConnection,useTimeout=False):
- # create lock for callers
- self.lock = Lock()
- self.callers = []
- # create Proxies
- _logger.debug("init")
- self.proxyList = Queue.Queue(nConnection)
- for i in range(nConnection):
- _logger.debug("connect -> %s " % i)
- if useTimeout and hasattr(panda_config,'usedbtimeout') and \
- panda_config.usedbtimeout == True:
- proxy = ConBridge()
- else:
- proxy = DBProxy.DBProxy()
- iTry = 0
- while True:
- if proxy.connect(dbhost,dbpasswd,dbtimeout=60):
- break
- iTry += 1
- _logger.debug("failed -> %s : try %s" % (i,iTry))
- time.sleep(random.randint(60,90))
- self.proxyList.put(proxy)
- time.sleep(1)
- # get PID
- self.pid = os.getpid()
- _logger.debug("ready")
-
- # return a free proxy. this method blocks until a proxy is available
- def getProxy(self):
- """
- # get caller
- caller = inspect.stack()[1][3]
- _logger.debug("PID=%s %s getting proxy used by %s" % (self.pid,caller,str(self.callers)))
- """
- # get proxy
- proxy = self.proxyList.get()
- """
- # lock
- self.lock.acquire()
- # append
- self.callers.append(caller)
- # release
- self.lock.release()
- _logger.debug("PID=%s %s got proxy used by %s" % (self.pid,caller,str(self.callers)))
- """
- # wake up connection
- proxy.wakeUp()
- # return
- return proxy
-
- # put back a proxy
- def putProxy(self,proxy):
- """
- # get caller
- caller = inspect.stack()[1][3]
- _logger.debug("PID=%s %s releasing. used by %s" % (self.pid,caller,str(self.callers)))
- """
- self.proxyList.put(proxy)
- """
- # lock
- self.lock.acquire()
- # append
- self.callers.remove(caller)
- # release
- self.lock.release()
- _logger.debug("PID=%s %s released. used by %s" % (self.pid,caller,str(self.callers)))
- """
diff --git a/current/pandaserver/taskbuffer/DatasetSpec.py b/current/pandaserver/taskbuffer/DatasetSpec.py
deleted file mode 100755
index 815b98a59..000000000
--- a/current/pandaserver/taskbuffer/DatasetSpec.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""
-dataset specification
-
-"""
-
-class DatasetSpec(object):
- # attributes
- _attributes = ('vuid','name','version','type','status','numberfiles','currentfiles','creationdate',
- 'modificationdate','MoverID','transferStatus','subType')
-
- # attributes which have 0 by default
- _zeroAttrs = ('MoverID','transferStatus')
-
-
-
- # constructor
- def __init__(self):
- # install attributes
- for attr in self._attributes:
- setattr(self,attr,None)
-
-
- # override __getattribute__ for SQL
- def __getattribute__(self,name):
- ret = object.__getattribute__(self,name)
- if ret == None:
- return "NULL"
- return ret
-
-
- # return a tuple of values
- def values(self):
- ret = []
- for attr in self._attributes:
- val = getattr(self,attr)
- ret.append(val)
- return tuple(ret)
-
-
- # return map of values
- def valuesMap(self):
- ret = {}
- for attr in self._attributes:
- val = getattr(self,attr)
- if val == 'NULL':
- if attr in self._zeroAttrs:
- val = 0
- else:
- val = None
- ret[':%s' % attr] = val
- return ret
-
-
- # pack tuple into DatasetSpec
- def pack(self,values):
- for i in range(len(self._attributes)):
- attr= self._attributes[i]
- val = values[i]
- setattr(self,attr,val)
-
-
- # return column names for INSERT
- def columnNames(cls):
- ret = ""
- for attr in cls._attributes:
- if ret != "":
- ret += ','
- ret += attr
- return ret
- columnNames = classmethod(columnNames)
-
-
- # return expression of values for INSERT
- def valuesExpression(cls):
- ret = "VALUES("
- for attr in cls._attributes:
- ret += "%s"
- if attr != cls._attributes[len(cls._attributes)-1]:
- ret += ","
- ret += ")"
- return ret
- valuesExpression = classmethod(valuesExpression)
-
-
- # return expression of bind values for INSERT
- def bindValuesExpression(cls):
- ret = "VALUES("
- for attr in cls._attributes:
- ret += ":%s," % attr
- ret = ret[:-1]
- ret += ")"
- return ret
- bindValuesExpression = classmethod(bindValuesExpression)
-
-
- # return an expression for UPDATE
- def updateExpression(cls):
- ret = ""
- for attr in cls._attributes:
- ret = ret + attr + "=%s"
- if attr != cls._attributes[len(cls._attributes)-1]:
- ret += ","
- return ret
- updateExpression = classmethod(updateExpression)
-
-
- # return an expression of bind variables for UPDATE
- def bindUpdateExpression(cls):
- ret = ""
- for attr in cls._attributes:
- ret += '%s=:%s,' % (attr,attr)
- ret = ret[:-1]
- return ret
- bindUpdateExpression = classmethod(bindUpdateExpression)
-
-
-
-
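# A short sketch of how the classmethods above compose into SQL text, mirroring
# the way the DB proxies build their statements; the dataset name is made up
# and the import path is an assumption.
from taskbuffer.DatasetSpec import DatasetSpec

sqlIns = "INSERT INTO Datasets (%s) " % DatasetSpec.columnNames()
sqlIns += DatasetSpec.bindValuesExpression()
# -> INSERT INTO Datasets (vuid,name,...,subType) VALUES(:vuid,:name,...,:subType)

sqlUpd = "UPDATE Datasets SET %s WHERE vuid=:vuid" % DatasetSpec.bindUpdateExpression()
# -> UPDATE Datasets SET vuid=:vuid,name=:name,...,subType=:subType WHERE vuid=:vuid

ds = DatasetSpec()
ds.name = 'user.someone.test.000001'
ds.status = 'defined'
varMap = ds.valuesMap()   # {':vuid': None, ':name': 'user.someone.test.000001', ...}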
diff --git a/current/pandaserver/taskbuffer/ErrorCode.py b/current/pandaserver/taskbuffer/ErrorCode.py
deleted file mode 100755
index 08f72b116..000000000
--- a/current/pandaserver/taskbuffer/ErrorCode.py
+++ /dev/null
@@ -1,37 +0,0 @@
-############## error code
-
-# killed
-EC_Kill = 100
-
-# transfer timeout
-EC_Transfer = 101
-
-# expire
-EC_Expire = 102
-
-# aborted
-EC_Aborted = 103
-
-# wait timeout
-EC_WaitTimeout = 104
-
-# reassigned by rebrokerage
-EC_Reassigned = 105
-
-# reassigned by server-side retry
-EC_Retried = 106
-
-# retried by pilot
-EC_PilotRetried = 107
-
-# lost file (=dataservice.ErrorCode.EC_LostFile)
-EC_LostFile = 110
-
-# file not found
-class EC_NotFound:
- pass
-
-# file relocated
-class EC_Redirect:
- def __init__(self,url):
- self.url = url
diff --git a/current/pandaserver/taskbuffer/FileSpec.py b/current/pandaserver/taskbuffer/FileSpec.py
deleted file mode 100755
index 209b2ed65..000000000
--- a/current/pandaserver/taskbuffer/FileSpec.py
+++ /dev/null
@@ -1,213 +0,0 @@
-"""
-file specification
-
-"""
-
-
-class FileSpec(object):
- # attributes
- _attributes = ('row_ID','PandaID','GUID','lfn','type','dataset','status','prodDBlock',
- 'prodDBlockToken','dispatchDBlock','dispatchDBlockToken','destinationDBlock',
- 'destinationDBlockToken','destinationSE','fsize','md5sum','checksum','scope')
- # slots
- __slots__ = _attributes+('_owner','_changedAttrs','_oldPandaID')
- # attributes which have 0 by default
- _zeroAttrs = ('fsize',)
- # mapping between sequence and attr
- _seqAttrMap = {'row_ID':'ATLAS_PANDA.FILESTABLE4_ROW_ID_SEQ.nextval'}
-
-
- # constructor
- def __init__(self):
- # install attributes
- for attr in self._attributes:
- object.__setattr__(self,attr,None)
- # set owner to synchronize PandaID
- object.__setattr__(self,'_owner',None)
- # map of changed attributes
- object.__setattr__(self,'_changedAttrs',{})
- # old PandaID
- object.__setattr__(self,'_oldPandaID','NULL')
-
-
- # override __getattribute__ for SQL and PandaID
- def __getattribute__(self,name):
- # PandaID
- if name == 'PandaID':
- if self._owner == None:
- return 'NULL'
- return self._owner.PandaID
- # others
- ret = object.__getattribute__(self,name)
- if ret == None:
- return "NULL"
- return ret
-
-
- # override __setattr__ to collect the changed attributes
- def __setattr__(self,name,value):
- oldVal = getattr(self,name)
- object.__setattr__(self,name,value)
- newVal = getattr(self,name)
- # collect changed attributes
- if oldVal != newVal:
- self._changedAttrs[name] = value
-
-
- # set owner
- def setOwner(self,owner):
- self._owner = owner
- self._oldPandaID = self.PandaID
-
-
- # reset changed attribute list
- def resetChangedList(self):
- self._oldPandaID = self.PandaID
- object.__setattr__(self,'_changedAttrs',{})
-
-
- # return a tuple of values
- def values(self):
- ret = []
- for attr in self._attributes:
- val = getattr(self,attr)
- ret.append(val)
- return tuple(ret)
-
-
- # return map of values
- def valuesMap(self,useSeq=False,onlyChanged=False):
- ret = {}
- for attr in self._attributes:
- if useSeq and self._seqAttrMap.has_key(attr):
- continue
- if onlyChanged:
- if attr == 'PandaID':
- if self.PandaID == self._oldPandaID:
- continue
- elif not self._changedAttrs.has_key(attr):
- continue
- val = getattr(self,attr)
- if val == 'NULL':
- if attr in self._zeroAttrs:
- val = 0
- else:
- val = None
- ret[':%s' % attr] = val
- return ret
-
-
- # pack tuple into FileSpec
- def pack(self,values):
- for i in range(len(self._attributes)):
- attr= self._attributes[i]
- val = values[i]
- object.__setattr__(self,attr,val)
-
-
- # return state values to be pickled
- def __getstate__(self):
- state = []
- for attr in self._attributes:
- val = getattr(self,attr)
- state.append(val)
- # append owner info
- state.append(self._owner)
- return state
-
-
- # restore state from the unpickled state values
- def __setstate__(self,state):
- pandaID = 'NULL'
- for i in range(len(self._attributes)):
- if i+1 < len(state):
- object.__setattr__(self,self._attributes[i],state[i])
- else:
- object.__setattr__(self,self._attributes[i],'NULL')
- if self._attributes[i] == 'PandaID':
- pandaID = state[i]
- object.__setattr__(self,'_owner',state[-1])
- object.__setattr__(self,'_changedAttrs',{})
- object.__setattr__(self,'_oldPandaID',pandaID)
-
-
- # return column names for INSERT
- def columnNames(cls,withMod=False):
- ret = ""
- for attr in cls._attributes:
- if ret != "":
- ret += ','
- ret += attr
- # add modificationTime
- if withMod:
- ret += ",modificationTime"
- return ret
- columnNames = classmethod(columnNames)
-
-
- # return expression of values for INSERT
- def valuesExpression(cls):
- ret = "VALUES("
- for attr in cls._attributes:
- ret += "%s"
- if attr != cls._attributes[len(cls._attributes)-1]:
- ret += ","
- ret += ")"
- return ret
- valuesExpression = classmethod(valuesExpression)
-
-
- # return expression of bind variables for INSERT
- def bindValuesExpression(cls,useSeq=False,withMod=False):
- ret = "VALUES("
- for attr in cls._attributes:
- if useSeq and cls._seqAttrMap.has_key(attr):
- ret += "%s," % cls._seqAttrMap[attr]
- else:
- ret += ":%s," % attr
- ret = ret[:-1]
- # add modificationTime
- if withMod:
- ret += ",:modificationTime"
- ret += ")"
- return ret
- bindValuesExpression = classmethod(bindValuesExpression)
-
-
- # return an expression for UPDATE
- def updateExpression(cls):
- ret = ""
- for attr in cls._attributes:
- ret = ret + attr + "=%s"
- if attr != cls._attributes[len(cls._attributes)-1]:
- ret += ","
- return ret
- updateExpression = classmethod(updateExpression)
-
-
- # return an expression of bind variables for UPDATE
- def bindUpdateExpression(cls):
- ret = ""
- for attr in cls._attributes:
- ret += '%s=:%s,' % (attr,attr)
- ret = ret[:-1]
- ret += ' '
- return ret
- bindUpdateExpression = classmethod(bindUpdateExpression)
-
-
- # return an expression of bind variables for UPDATE to update only changed attributes
- def bindUpdateChangesExpression(self):
- ret = ""
- for attr in self._attributes:
- if self._changedAttrs.has_key(attr) or \
- (attr == 'PandaID' and self.PandaID != self._oldPandaID):
- ret += '%s=:%s,' % (attr,attr)
- ret = ret[:-1]
- ret += ' '
- return ret
-
-
-
-
-
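# A small sketch of the changed-attribute tracking above: only attributes
# assigned after construction (or after resetChangedList) appear in the
# partial UPDATE. The LFN and the import path are assumptions.
from taskbuffer.FileSpec import FileSpec

f = FileSpec()
f.lfn = 'EVNT.017000._00001.pool.root.1'
f.status = 'ready'
f.bindUpdateChangesExpression()   # -> "lfn=:lfn,status=:status "
f.valuesMap(onlyChanged=True)     # -> {':lfn': 'EVNT...', ':status': 'ready'}

f.resetChangedList()
f.bindUpdateChangesExpression()   # -> " "  (nothing changed since the reset)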
diff --git a/current/pandaserver/taskbuffer/Initializer.py b/current/pandaserver/taskbuffer/Initializer.py
deleted file mode 100644
index a9c158b43..000000000
--- a/current/pandaserver/taskbuffer/Initializer.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import sys
-import cx_Oracle
-from threading import Lock
-
-from config import panda_config
-
-# logger
-from pandalogger.PandaLogger import PandaLogger
-_logger = PandaLogger().getLogger('Initializer')
-
-# initialize cx_Oracle using dummy connection to avoid "Unable to acquire Oracle environment handle"
-class Initializer:
- def __init__(self):
- self.lock = Lock()
- self.first = True
-
- def init(self):
- _logger.debug("init new=%s" % self.first)
- # do nothing when nDBConnection is 0
- if panda_config.nDBConnection == 0:
- return True
- # lock
- self.lock.acquire()
- if self.first:
- self.first = False
- try:
- _logger.debug("connect")
- # connect
- conn = cx_Oracle.connect(dsn=panda_config.dbhost,user=panda_config.dbuser,
- password=panda_config.dbpasswd,threaded=True)
- # close
- conn.close()
- _logger.debug("done")
- except:
- self.lock.release()
- type, value, traceBack = sys.exc_info()
- _logger.error("connect : %s %s" % (type,value))
- return False
- # release
- self.lock.release()
- return True
-
-
-# singleton
-initializer = Initializer()
-del Initializer
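# Presumed usage of the singleton above (the import path is an assumption):
# call init() once per process, before any Oracle proxy is created, so that
# the dummy connection sets up the cx_Oracle environment.
from taskbuffer.Initializer import initializer

if not initializer.init():
    raise RuntimeError('could not initialize the Oracle environment')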
diff --git a/current/pandaserver/taskbuffer/JobSpec.py b/current/pandaserver/taskbuffer/JobSpec.py
deleted file mode 100755
index 7eaa764ab..000000000
--- a/current/pandaserver/taskbuffer/JobSpec.py
+++ /dev/null
@@ -1,239 +0,0 @@
-"""
-job specification
-
-"""
-
-class JobSpec(object):
- # attributes
- _attributes = ('PandaID','jobDefinitionID','schedulerID','pilotID','creationTime','creationHost',
- 'modificationTime','modificationHost','AtlasRelease','transformation','homepackage',
- 'prodSeriesLabel','prodSourceLabel','prodUserID','assignedPriority','currentPriority',
- 'attemptNr','maxAttempt','jobStatus','jobName','maxCpuCount','maxCpuUnit','maxDiskCount',
- 'maxDiskUnit','ipConnectivity','minRamCount','minRamUnit','startTime','endTime',
- 'cpuConsumptionTime','cpuConsumptionUnit','commandToPilot','transExitCode','pilotErrorCode',
- 'pilotErrorDiag','exeErrorCode','exeErrorDiag','supErrorCode','supErrorDiag',
- 'ddmErrorCode','ddmErrorDiag','brokerageErrorCode','brokerageErrorDiag',
- 'jobDispatcherErrorCode','jobDispatcherErrorDiag','taskBufferErrorCode',
- 'taskBufferErrorDiag','computingSite','computingElement','jobParameters',
- 'metadata','prodDBlock','dispatchDBlock','destinationDBlock','destinationSE',
- 'nEvents','grid','cloud','cpuConversion','sourceSite','destinationSite','transferType',
- 'taskID','cmtConfig','stateChangeTime','prodDBUpdateTime','lockedby','relocationFlag',
- 'jobExecutionID','VO','pilotTiming','workingGroup','processingType','prodUserName',
- 'nInputFiles','countryGroup','batchID','parentID','specialHandling','jobsetID',
- 'coreCount','nInputDataFiles','inputFileType','inputFileProject','inputFileBytes',
- 'nOutputDataFiles','outputFileBytes','jobMetrics')
- # slots
- __slots__ = _attributes+('Files','_changedAttrs')
- # attributes which have 0 by default
- _zeroAttrs = ('assignedPriority','currentPriority','attemptNr','maxAttempt','maxCpuCount','maxDiskCount',
- 'minRamCount','cpuConsumptionTime','pilotErrorCode','exeErrorCode','supErrorCode','ddmErrorCode',
- 'brokerageErrorCode','jobDispatcherErrorCode','taskBufferErrorCode','nEvents','relocationFlag',
- 'jobExecutionID','nOutputDataFiles','outputFileBytes')
- # attributes to be suppressed. They are stored in another table
- _suppAttrs = ('jobParameters','metadata')
- # mapping between sequence and attr
- _seqAttrMap = {'PandaID':'ATLAS_PANDA.JOBSDEFINED4_PANDAID_SEQ.nextval'}
- # limit length
- _limitLength = {'ddmErrorDiag' : 500,
- 'taskBufferErrorDiag' : 300,
- 'jobDispatcherErrorDiag' : 250,
- 'brokerageErrorDiag' : 250,
- 'pilotErrorDiag' : 500,
- 'exeErrorDiag' : 500,
- }
-
-
- # constructor
- def __init__(self):
- # install attributes
- for attr in self._attributes:
- object.__setattr__(self,attr,None)
- # files list
- object.__setattr__(self,'Files',[])
- # map of changed attributes
- object.__setattr__(self,'_changedAttrs',{})
-
-
- # override __getattribute__ for SQL
- def __getattribute__(self,name):
- ret = object.__getattribute__(self,name)
- if ret == None:
- return "NULL"
- return ret
-
-
- # override __setattr__ to collect the changed attributes
- def __setattr__(self,name,value):
- oldVal = getattr(self,name)
- object.__setattr__(self,name,value)
- newVal = getattr(self,name)
- # collect changed attributes
- if oldVal != newVal and not name in self._suppAttrs:
- self._changedAttrs[name] = value
-
-
- # reset changed attribute list
- def resetChangedList(self):
- object.__setattr__(self,'_changedAttrs',{})
-
-
- # add File to files list
- def addFile(self,file):
- # set owner
- file.setOwner(self)
- # append
- self.Files.append(file)
-
-
- # pack tuple into JobSpec
- def pack(self,values):
- for i in range(len(self._attributes)):
- attr= self._attributes[i]
- val = values[i]
- object.__setattr__(self,attr,val)
-
-
- # return a tuple of values
- def values(self):
- ret = []
- for attr in self._attributes:
- val = getattr(self,attr)
- ret.append(val)
- return tuple(ret)
-
-
- # return map of values
- def valuesMap(self,useSeq=False,onlyChanged=False):
- ret = {}
- for attr in self._attributes:
- if useSeq and self._seqAttrMap.has_key(attr):
- continue
- if onlyChanged:
- if not self._changedAttrs.has_key(attr):
- continue
- val = getattr(self,attr)
- if val == 'NULL':
- if attr in self._zeroAttrs:
- val = 0
- else:
- val = None
- # jobParameters/metadata go to another table
- if attr in self._suppAttrs:
- val = None
- # truncate too long values
- if self._limitLength.has_key(attr):
- if val != None:
- val = val[:self._limitLength[attr]]
- ret[':%s' % attr] = val
- return ret
-
-
- # return state values to be pickled
- def __getstate__(self):
- state = []
- for attr in self._attributes:
- val = getattr(self,attr)
- state.append(val)
- # append File info
- state.append(self.Files)
- return state
-
-
- # restore state from the unpickled state values
- def __setstate__(self,state):
- for i in range(len(self._attributes)):
- # schema evolution is supported only when adding attributes
- if i+1 < len(state):
- object.__setattr__(self,self._attributes[i],state[i])
- else:
- object.__setattr__(self,self._attributes[i],'NULL')
- object.__setattr__(self,'Files',state[-1])
- object.__setattr__(self,'_changedAttrs',{})
-
-
- # return column names for INSERT or full SELECT
- def columnNames(cls):
- ret = ""
- for attr in cls._attributes:
- if ret != "":
- ret += ','
- ret += attr
- return ret
- columnNames = classmethod(columnNames)
-
-
- # return expression of values for INSERT
- def valuesExpression(cls):
- ret = "VALUES("
- for attr in cls._attributes:
- ret += "%s"
- if attr != cls._attributes[len(cls._attributes)-1]:
- ret += ","
- ret += ")"
- return ret
- valuesExpression = classmethod(valuesExpression)
-
-
- # return expression of bind values for INSERT
- def bindValuesExpression(cls,useSeq=False):
- ret = "VALUES("
- for attr in cls._attributes:
- if useSeq and cls._seqAttrMap.has_key(attr):
- ret += "%s," % cls._seqAttrMap[attr]
- else:
- ret += ":%s," % attr
- ret = ret[:-1]
- ret += ")"
- return ret
- bindValuesExpression = classmethod(bindValuesExpression)
-
-
- # return an expression for UPDATE
- def updateExpression(cls):
- ret = ""
- for attr in cls._attributes:
- ret = ret + attr + "=%s"
- if attr != cls._attributes[len(cls._attributes)-1]:
- ret += ","
- return ret
- updateExpression = classmethod(updateExpression)
-
-
- # return an expression of bind variables for UPDATE
- def bindUpdateExpression(cls):
- ret = ""
- for attr in cls._attributes:
- ret += '%s=:%s,' % (attr,attr)
- ret = ret[:-1]
- ret += ' '
- return ret
- bindUpdateExpression = classmethod(bindUpdateExpression)
-
-
- # comparison function for sort
- def compFunc(cls,a,b):
- iPandaID = list(cls._attributes).index('PandaID')
- iPriority = list(cls._attributes).index('currentPriority')
- if a[iPriority] > b[iPriority]:
- return -1
- elif a[iPriority] < b[iPriority]:
- return 1
- else:
- if a[iPandaID] > b[iPandaID]:
- return 1
- elif a[iPandaID] < b[iPandaID]:
- return -1
- else:
- return 0
- compFunc = classmethod(compFunc)
-
-
- # return an expression of bind variables for UPDATE to update only changed attributes
- def bindUpdateChangesExpression(self):
- ret = ""
- for attr in self._attributes:
- if self._changedAttrs.has_key(attr):
- ret += '%s=:%s,' % (attr,attr)
- ret = ret[:-1]
- ret += ' '
- return ret
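# A short sketch of the job/file link above: JobSpec.addFile() registers the
# job as the owner of the FileSpec, so the file's PandaID is always read from
# the owning job rather than stored as a copy. The IDs, LFN and import paths
# are assumptions.
from taskbuffer.JobSpec import JobSpec
from taskbuffer.FileSpec import FileSpec

job = JobSpec()
job.PandaID = 1234567890
f = FileSpec()
f.lfn = 'EVNT.017000._00001.pool.root.1'
job.addFile(f)
f.PandaID            # -> 1234567890, delegated to the owning JobSpec
job.PandaID = 1234567891
f.PandaID            # -> 1234567891, follows the owner automatically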
diff --git a/current/pandaserver/taskbuffer/LogDBProxy.py b/current/pandaserver/taskbuffer/LogDBProxy.py
deleted file mode 100755
index e32ef22fc..000000000
--- a/current/pandaserver/taskbuffer/LogDBProxy.py
+++ /dev/null
@@ -1,790 +0,0 @@
-"""
-proxy for log database connection
-
-"""
-
-import re
-import sys
-import time
-import datetime
-
-import MySQLdb
-
-from pandalogger.PandaLogger import PandaLogger
-from config import panda_config
-
-import SiteSpec
-import CloudSpec
-
-from JobSpec import JobSpec
-from FileSpec import FileSpec
-
-# logger
-_logger = PandaLogger().getLogger('LogDBProxy')
-
-# proxy
-class LogDBProxy:
-
- # constructor
- def __init__(self):
- # connection object
- self.conn = None
- # cursor object
- self.cur = None
-
- # connect to DB
- def connect(self,dbhost=panda_config.logdbhost,dbpasswd=panda_config.logdbpasswd,
- dbuser=panda_config.logdbuser,dbname=panda_config.logdbname,reconnect=False):
- # keep parameters for reconnect
- if not reconnect:
- self.dbhost = dbhost
- self.dbpasswd = dbpasswd
- self.dbuser = dbuser
- self.dbname = dbname
- # connect
- try:
- self.conn = MySQLdb.connect(host=self.dbhost,user=self.dbuser,
- passwd=self.dbpasswd,db=self.dbname)
- self.cur=self.conn.cursor()
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("connect : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
-
- # query an SQL
- def querySQL(self,sql):
- try:
- # begin transaction
- self.cur.execute("START TRANSACTION")
- self.cur.execute(sql)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return res
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("querySQL : %s %s" % (type,value))
- return None
-
-
- # get site data
- def getCurrentSiteData(self):
- _logger.debug("getCurrentSiteData")
- sql = "SELECT SITE,getJob,updateJob FROM SiteData WHERE FLAG='production' and HOURS=3"
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- self.cur.execute(sql)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- ret = {}
- for item in res:
- ret[item[0]] = {'getJob':item[1],'updateJob':item[2]}
- _logger.debug(ret)
- return ret
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getCurrentSiteData : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # get list of site
- def getSiteList(self):
- _logger.debug("getSiteList start")
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = "SELECT siteid,nickname FROM schedconfig WHERE siteid<>''"
- self.cur.execute(sql)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retMap = {}
- if res != None and len(res) != 0:
- for siteid,nickname in res:
- # append
- if not retMap.has_key(siteid):
- retMap[siteid] = []
- retMap[siteid].append(nickname)
- _logger.debug(retMap)
- _logger.debug("getSiteList done")
- return retMap
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getSiteList : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # get site info
- def getSiteInfo(self):
- _logger.debug("getSiteInfo start")
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = "SELECT nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory,"
- sql+= "maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec,"
- sql+= "priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue,"
- sql+= "validatedreleases,accesscontrol "
- sql+= "FROM schedconfig WHERE siteid<>''"
- self.cur.execute(sql)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retList = {}
- if resList != None:
- # loop over all results
- for res in resList:
- nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory,\
- maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec,\
- priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue,\
- validatedreleases,accesscontrol \
- = res
- # instantiate SiteSpec
- ret = SiteSpec.SiteSpec()
- ret.sitename = siteid
- ret.nickname = nickname
- ret.dq2url = dq2url
- ret.cloud = cloud
- ret.ddm = ddm.split(',')[0]
- ret.lfchost = lfchost
- ret.se = se
- ret.gatekeeper = gatekeeper
- ret.memory = memory
- ret.maxtime = maxtime
- ret.status = status
- ret.space = space
- ret.glexec = glexec
- ret.queue = queue
- ret.localqueue = localqueue
- ret.accesscontrol = accesscontrol
-                # job recovery
- ret.retry = True
- if retry == 'FALSE':
- ret.retry = False
- # convert releases to list
- ret.releases = []
- for tmpRel in releases.split('|'):
- # remove white space
- tmpRel = tmpRel.strip()
- if tmpRel != '':
- ret.releases.append(tmpRel)
- # convert validatedreleases to list
- ret.validatedreleases = []
- for tmpRel in validatedreleases.split('|'):
- # remove white space
- tmpRel = tmpRel.strip()
- if tmpRel != '':
- ret.validatedreleases.append(tmpRel)
- # cmtconfig
- # add slc3 if the column is empty
- ret.cmtconfig = ['i686-slc3-gcc323-opt']
- if cmtconfig != '':
- ret.cmtconfig.append(cmtconfig)
- # map between token and DQ2 ID
- ret.setokens = {}
- tmpTokens = setokens.split(',')
- for idxToken,tmpddmID in enumerate(ddm.split(',')):
- if idxToken < len(tmpTokens):
- ret.setokens[tmpTokens[idxToken]] = tmpddmID
- # expand [] in se path
- match = re.search('([^\[]*)\[([^\]]+)\](.*)',seprodpath)
- if match != None and len(match.groups()) == 3:
- seprodpath = ''
- for tmpBody in match.group(2).split(','):
- seprodpath += '%s%s%s,' % (match.group(1),tmpBody,match.group(3))
- seprodpath = seprodpath[:-1]
- # map between token and se path
- ret.seprodpath = {}
- tmpTokens = setokens.split(',')
- for idxToken,tmpSePath in enumerate(seprodpath.split(',')):
- if idxToken < len(tmpTokens):
- ret.seprodpath[tmpTokens[idxToken]] = tmpSePath
- # VO related params
- ret.priorityoffset = priorityoffset
- ret.allowedgroups = allowedgroups
- ret.defaulttoken = defaulttoken
- # append
- retList[ret.nickname] = ret
- _logger.debug("getSiteInfo done")
- return retList
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getSiteInfo : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # get cloud list
- def getCloudList(self):
- _logger.debug("getCloudList start")
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = "SELECT name,tier1,tier1SE,relocation,weight,server,status,transtimelo,"
- sql += "transtimehi,waittime,validation,mcshare,countries,fasttrack,nprestage,"
- sql += "pilotowners "
- sql+= "FROM cloudconfig"
- self.cur.execute(sql)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- ret = {}
- if res != None and len(res) != 0:
- for name,tier1,tier1SE,relocation,weight,server,status,transtimelo,transtimehi,\
- waittime,validation,mcshare,countries,fasttrack,nprestage,pilotowners in res:
- # instantiate CloudSpec
- tmpC = CloudSpec.CloudSpec()
- tmpC.name = name
- tmpC.tier1 = tier1
- tmpC.tier1SE = re.sub(' ','',tier1SE).split(',')
- tmpC.relocation = relocation
- tmpC.weight = weight
- tmpC.server = server
- tmpC.status = status
- tmpC.transtimelo = transtimelo
- tmpC.transtimehi = transtimehi
- tmpC.waittime = waittime
- tmpC.validation = validation
- tmpC.mcshare = mcshare
- tmpC.countries = countries
- tmpC.fasttrack = fasttrack
- tmpC.nprestage = nprestage
- tmpC.pilotowners = pilotowners
- # append
- ret[name] = tmpC
- _logger.debug("getCloudList done")
- return ret
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getCloudList : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # extract name from DN
- def cleanUserID(self, id):
- try:
- up = re.compile('/(DC|O|OU|C|L)=[^\/]+')
- username = up.sub('', id)
- up2 = re.compile('/CN=[0-9]+')
- username = up2.sub('', username)
- up3 = re.compile(' [0-9]+')
- username = up3.sub('', username)
- up4 = re.compile('_[0-9]+')
- username = up4.sub('', username)
- username = username.replace('/CN=proxy','')
- username = username.replace('/CN=limited proxy','')
- username = username.replace('limited proxy','')
- pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)')
- mat = pat.match(username)
- if mat:
- username = mat.group(2)
- else:
- username = username.replace('/CN=','')
- if username.lower().find('/email') > 0:
- username = username[:username.lower().find('/email')]
- pat = re.compile('.*(limited.*proxy).*')
- mat = pat.match(username)
- if mat:
- username = mat.group(1)
- username = username.replace('(','')
- username = username.replace(')','')
- return username
- except:
- return id
-
-
- # check quota
- def checkQuota(self,dn):
- _logger.debug("checkQuota %s" % dn)
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- name = self.cleanUserID(dn)
- sql = "SELECT cpua1,cpua7,cpua30,quotaa1,quotaa7,quotaa30 FROM users WHERE name = '%s'" % name
- self.cur.execute(sql)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- weight = 0.0
- if res != None and len(res) != 0:
- item = res[0]
- # cpu and quota
- cpu1 = item[0]
- cpu7 = item[1]
- cpu30 = item[2]
- quota1 = item[3] * 3600
- quota7 = item[4] * 3600
- quota30 = item[5] * 3600
- # CPU usage
- if cpu1 == None:
- cpu1 = 0.0
- # weight
- weight = float(cpu1) / float(quota1)
-                # limit not exceeded
- if weight < 1.0:
- weight = 0.0
- _logger.debug("checkQuota %s Weight:%s Quota:%s CPU:%s" % (dn,weight,quota1,cpu1))
- else:
- _logger.debug("checkQuota cannot found %s" % dn)
- return weight
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("checkQuota : %s %s" % (type,value))
- # roll back
- self._rollback()
- return 0.0
-
-
- # get serialize JobID and status
- def getUserParameter(self,dn,jobID):
- _logger.debug("getUserParameter %s %s" % (dn,jobID))
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- name = self.cleanUserID(dn)
- sql = "SELECT jobid,status FROM users WHERE name = '%s'" % name
- self.cur.execute(sql)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retJobID = jobID
- retStatus = True
- if res != None and len(res) != 0:
- item = res[0]
- # JobID in DB
- dbJobID = item[0]
- # check status
- if item[1] in ['disabled']:
- retStatus = False
- # use larger JobID
- if dbJobID >= int(retJobID):
- retJobID = dbJobID+1
- # update DB
- sql = "UPDATE users SET jobid=%d WHERE name = '%s'" % (retJobID,name)
- self.cur.execute(sql)
- _logger.debug("getUserParameter set JobID=%s for %s" % (retJobID,dn))
- return retJobID,retStatus
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getUserParameter : %s %s" % (type,value))
- # roll back
- self._rollback()
- return jobID,True
-
-
- # get email address for a user
- def getEmailAddr(self,name):
- _logger.debug("get email for %s" % name)
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = "SELECT email FROM users WHERE name='%s'" % name
- self.cur.execute(sql)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if res != None and len(res) != 0:
- return res[0][0]
- # return empty string
- return ""
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getEmailAddr : %s %s" % (type,value))
- # roll back
- self._rollback()
- return ""
-
-
- # register proxy key
- def registerProxyKey(self,params):
- _logger.debug("register ProxyKey %s" % str(params))
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # construct SQL
- sql0 = 'INSERT INTO proxykey ('
- sql1 = 'VALUES ('
- vals = []
- for key,val in params.iteritems():
- sql0 += '%s,' % key
- sql1 += '%s,'
- vals.append(val)
- sql0 = sql0[:-1]
- sql1 = sql1[:-1]
- sql = sql0 + ') ' + sql1 + ') '
- # insert
- self.cur.execute(sql,tuple(vals))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return True
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("registerProxyKey : %s %s" % (type,value))
- # roll back
- self._rollback()
- return ""
-
-
- # get proxy key
- def getProxyKey(self,dn):
- _logger.debug("get ProxyKey %s" % dn)
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # construct SQL
- sql = 'SELECT credname,expires,origin,myproxy FROM proxykey WHERE dn=%s ORDER BY expires DESC'
- # select
- self.cur.execute(sql,(dn,))
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- retMap = {}
- if res != None and len(res) != 0:
- credname,expires,origin,myproxy = res[0]
- retMap['credname'] = credname
- retMap['expires'] = expires
- retMap['origin'] = origin
- retMap['myproxy'] = myproxy
- _logger.debug(retMap)
- return retMap
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getProxyKey : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # check site access
- def checkSiteAccess(self,siteid,dn):
- comment = ' /* LogDBProxy.checkSiteAccess */'
- _logger.debug("checkSiteAccess %s:%s" % (siteid,dn))
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # construct SQL
- sql = 'SELECT poffset,rights,status FROM siteaccess WHERE dn=%s AND pandasite=%s'
- # select
- self.cur.execute(sql+comment,(dn,siteid))
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- retMap = {}
- if res != None and len(res) != 0:
- poffset,rights,status = res[0]
- retMap['poffset'] = poffset
- retMap['rights'] = rights
- retMap['status'] = status
- _logger.debug(retMap)
- return retMap
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("checkSiteAccess : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # add account to siteaccess
- def addSiteAccess(self,siteID,dn):
- comment = ' /* LogDBProxy.addSiteAccess */'
- _logger.debug("addSiteAccess : %s %s" % (siteID,dn))
- try:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = 'SELECT status FROM siteaccess WHERE dn=%s AND pandasite=%s'
- self.cur.execute(sql+comment, (dn,siteID))
- res = self.cur.fetchone()
- if res != None:
- _logger.debug("account already exists with status=%s" % res[0])
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return res[0]
- # add
- sql = 'INSERT INTO siteaccess (dn,pandasite,status) VALUES (%s,%s,%s)'
- self.cur.execute(sql+comment, (dn,siteID,'requested'))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("account was added")
- return 0
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("addSiteAccess( : %s %s" % (type,value))
- # return None
- return -1
-
-
- # list site access
- def listSiteAccess(self,siteid=None,dn=None):
- comment = ' /* LogDBProxy.listSiteAccess */'
- _logger.debug("listSiteAccess %s:%s" % (siteid,dn))
- try:
- if siteid==None and dn==None:
- return []
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # construct SQL
- if siteid != None:
- varMap = (siteid,)
- sql = 'SELECT dn,status FROM siteaccess WHERE pandasite=%s ORDER BY dn'
- else:
- varMap = (dn,)
- sql = 'SELECT pandasite,status FROM siteaccess WHERE dn=%s ORDER BY pandasite'
- # select
- self.cur.execute(sql+comment,varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- ret = []
- if res != None and len(res) != 0:
- for tmpRes in res:
- ret.append(tmpRes)
- _logger.debug(ret)
- return ret
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("listSiteAccess : %s %s" % (type,value))
- # roll back
- self._rollback()
- return []
-
-
- # get list of archived tables
- def getArchiveTables(self):
- tables = []
- cdate = datetime.datetime.utcnow()
-        for iCycle in range(2): # 2 = (1 month + 2 just in case) / 2
- if cdate.month==1:
- cdate = cdate.replace(year = (cdate.year-1))
- cdate = cdate.replace(month = 12, day = 1)
- else:
- cdate = cdate.replace(month = (cdate.month/2)*2, day = 1)
- tableName = "jobsArchived_%s%s" % (cdate.strftime('%b'),cdate.year)
- if not tableName in tables:
- tables.append(tableName)
- # one older table
- if cdate.month > 2:
- cdate = cdate.replace(month = (cdate.month-2))
- else:
- cdate = cdate.replace(year = (cdate.year-1), month = 12)
- # return
- return tables
-
-
- # get JobIDs in a time range
- def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs):
- comment = ' /* LogDBProxy.getJobIDsInTimeRange */'
- _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))
- try:
- # get list of archived tables
- tables = self.getArchiveTables()
- # select
- for table in tables:
- # make sql
- sql = "SELECT jobDefinitionID FROM %s " % table
- sql += "WHERE prodUserID=%s AND modificationTime>%s AND prodSourceLabel='user'"
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- _logger.debug(sql+comment+str((dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))))
- self.cur.execute(sql+comment, (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID, in resList:
- if not tmpID in retJobIDs:
- retJobIDs.append(tmpID)
- _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs))
- return retJobIDs
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobIDsInTimeRange : %s %s" % (type,value))
- # return empty list
- return retJobIDs
-
-
- # get PandaIDs for a JobID
- def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs):
- comment = ' /* LogProxy.getPandIDsWithJobID */'
- _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID))
- try:
- # get list of archived tables
- tables = self.getArchiveTables()
- # select
- for table in tables:
-                # skip if all jobs have already been retrieved
- if nJobs > 0 and len(idStatus) >= nJobs:
- continue
- # make sql
- sql = "SELECT PandaID,jobStatus,commandToPilot FROM %s " % table
- sql += "WHERE prodUserID=%s AND jobDefinitionID=%s "
- sql += "AND prodSourceLabel in ('user','panda') "
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- _logger.debug(sql+comment+str((dn,jobID)))
- self.cur.execute(sql+comment, (dn,jobID))
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID,tmpStatus,tmpCommand in resList:
- if not idStatus.has_key(tmpID):
- idStatus[tmpID] = (tmpStatus,tmpCommand)
- _logger.debug("getPandIDsWithJobID : %s" % str(idStatus))
- return idStatus
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandIDsWithJobID : %s %s" % (type,value))
- # return empty list
- return {}
-
-
- # peek at job
- def peekJob(self,pandaID):
- comment = ' /* LogDBProxy.peekJob */'
- _logger.debug("peekJob : %s" % pandaID)
- # return None for NULL PandaID
- if pandaID in ['NULL','','None',None]:
- return None
- sql1_0 = "SELECT %s FROM %s "
- sql1_1 = "WHERE PandaID=%s"
- try:
- # get list of archived tables
- tables = self.getArchiveTables()
- # select
- for table in tables:
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1
- self.cur.execute(sql+comment, (pandaID,))
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if len(res) != 0:
- # Job
- job = JobSpec()
- job.pack(res[0])
- # Files
- # set autocommit on
- self.cur.execute("SET AUTOCOMMIT=1")
- # select
- fileTableName = re.sub('jobsArchived','filesTable',table)
- sqlFile = "SELECT %s " % FileSpec.columnNames()
- sqlFile+= "FROM %s " % fileTableName
- sqlFile+= "WHERE PandaID=%s"
- self.cur.execute(sqlFile+comment, (job.PandaID,))
- resFs = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # set files
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- job.addFile(file)
- return job
- _logger.debug("peekJob() : PandaID %s not found" % pandaID)
- return None
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("peekJob : %s %s" % (type,value))
- # return None
- return None
-
-
- # wake up connection
- def wakeUp(self):
- for iTry in range(5):
- try:
- # check if the connection is working
- self.conn.ping()
- return
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("wakeUp %d : %s %s" % (iTry,type,value))
- # wait for reconnection
- time.sleep(1)
- self.connect(reconnect=True)
-
-
- # close
- def close(self):
- try:
- self.cur.close()
- self.conn.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("close : %s %s" % (type,value))
-
-
- # commit
- def _commit(self):
- try:
- self.conn.commit()
- return True
- except:
- _logger.error("commit error")
- return False
-
-
- # rollback
- def _rollback(self):
- try:
- self.conn.rollback()
- return True
- except:
- _logger.error("rollback error")
- return False
-
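Note on the archive-table lookup removed above: getArchiveTables rounds the current month down to an even month and then walks back in two-month steps, because archived jobs live in bi-monthly tables named like jobsArchived_Dec2010. A self-contained sketch of that naming logic in Python 3 syntax (illustration only, not the production code):

    import datetime

    def archive_table_names(now=None, cycles=2):
        """Return the most recent bi-monthly archive table names, newest first."""
        cdate = now or datetime.datetime.utcnow()
        tables = []
        for _ in range(cycles):
            if cdate.month == 1:
                cdate = cdate.replace(year=cdate.year - 1, month=12, day=1)
            else:
                cdate = cdate.replace(month=(cdate.month // 2) * 2, day=1)
            name = 'jobsArchived_%s%s' % (cdate.strftime('%b'), cdate.year)
            if name not in tables:
                tables.append(name)
            # step back two months for the next, older table
            if cdate.month > 2:
                cdate = cdate.replace(month=cdate.month - 2)
            else:
                cdate = cdate.replace(year=cdate.year - 1, month=12)
        return tables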
diff --git a/current/pandaserver/taskbuffer/LogDBProxyPool.py b/current/pandaserver/taskbuffer/LogDBProxyPool.py
deleted file mode 100755
index c9f986741..000000000
--- a/current/pandaserver/taskbuffer/LogDBProxyPool.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""
-pool for LogDBProxies
-
-"""
-
-import time
-import Queue
-import random
-import OraLogDBProxy as LogDBProxy
-from config import panda_config
-
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('LogDBProxyPool')
-
-class LogDBProxyPool:
-
- def __init__(self,nConnection=panda_config.nLogDBConnection):
- # create Proxies
- _logger.debug("init")
- self.proxyList = Queue.Queue(nConnection)
- for i in range(nConnection):
- _logger.debug("connect -> %s " % i)
- proxy = LogDBProxy.LogDBProxy()
- nTry = 10
- for iTry in range(nTry):
- if proxy.connect():
- break
- _logger.debug("failed -> %s : try %s" % (i,iTry))
- if iTry+1 == nTry:
- raise RuntimeError, 'LogDBProxyPool.__init__ failed'
- time.sleep(random.randint(10,20))
- self.proxyList.put(proxy)
- time.sleep(1)
- _logger.debug("ready")
-
- # return a free proxy. this method blocks until a proxy is available
- def getProxy(self):
- # get proxy
- proxy = self.proxyList.get()
- # wake up connection
- proxy.wakeUp()
- # return
- return proxy
-
-
- # put back a proxy
- def putProxy(self,proxy):
- # put
- self.proxyList.put(proxy)
-
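Side note on the pool just removed: it is a plain blocking pool, i.e. a fixed number of connected proxies parked in a Queue, where getProxy blocks until one is free and putProxy hands it back. A generic sketch of that pattern using only the standard library (DummyProxy is a stand-in class, not part of PanDA):

    import queue

    class DummyProxy:
        def connect(self):
            return True
        def wakeUp(self):
            pass

    class ProxyPool:
        def __init__(self, n_connections=2, factory=DummyProxy):
            self._pool = queue.Queue(n_connections)
            for _ in range(n_connections):
                proxy = factory()
                proxy.connect()
                self._pool.put(proxy)

        def get_proxy(self):
            proxy = self._pool.get()   # blocks until a proxy is available
            proxy.wakeUp()
            return proxy

        def put_proxy(self, proxy):
            self._pool.put(proxy)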
diff --git a/current/pandaserver/taskbuffer/MemProxy.py b/current/pandaserver/taskbuffer/MemProxy.py
deleted file mode 100644
index 025b6a815..000000000
--- a/current/pandaserver/taskbuffer/MemProxy.py
+++ /dev/null
@@ -1,205 +0,0 @@
-# proxy for memcached
-
-import sys
-
-from config import panda_config
-
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('MemProxy')
-
-
-# proxy
-class MemProxy:
-
- # constructor
- def __init__(self):
- try:
- import memcache
- # initialize memcached client
- _logger.debug("initialize memcache client with %s" % panda_config.memcached_srvs)
- self.mclient = memcache.Client(panda_config.memcached_srvs.split(','))
- # server statistics
- _logger.debug(self.mclient.get_stats())
- _logger.debug("memcache client is ready")
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("failed to initialize memcach client : %s %s" % (errType,errValue))
-
-
- # insert files
- def setFiles(self,pandaID,site,node,files):
- try:
- _logger.debug("setFiles site=%s node=%s start" % (site,node))
- # key prefix
- keyPrefix = self.getKeyPrefix(site,node)
- # failed to get key prefix
- if keyPrefix == None:
- _logger.error("setFiles failed to get key prefix")
- return False
- # loop over all files
- varMap = {}
- for tmpFile in files:
- newKey = tmpFile
- varMap[newKey] = True
- # bulk insert
- failedList = self.mclient.set_multi(varMap,time=panda_config.memcached_exptime,
- key_prefix=keyPrefix)
- # failed
- if failedList != []:
- _logger.error("setFiles failed to insert %s values for site=%s node=%s" % \
- (len(failedList),site,node))
- return False
- _logger.debug("setFiles site=%s node=%s completed" % (site,node))
- return True
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("setFiles failed with %s %s" % (errType,errValue))
- return False
-
-
- # delete files
- def deleteFiles(self,site,node,files):
- try:
- fileList = files.split(',')
- # remove ''
- try:
- fileList.remove('')
- except:
- pass
- _logger.debug("deleteFiles for %s:%s:%s start" % (site,node,len(fileList)))
- # empty list
- if len(fileList) == 0:
- _logger.debug("deleteFiles skipped for empty list")
- return True
- # key prefix
- keyPrefix = self.getKeyPrefix(site,node)
- # non-existing key
- if keyPrefix == None:
- _logger.debug("deleteFiles skipped for non-existing key")
- return True
- # get the number of bunches
- nKeys = 100
- tmpDiv,tmpMod = divmod(len(fileList),nKeys)
- if tmpMod != 0:
- tmpDiv += 1
- # loop over all bunches
- retMap = {True:0,False:0}
- for idxB in range(tmpDiv):
- # delete
- retD = self.mclient.delete_multi(fileList[idxB*nKeys:(idxB+1)*nKeys],
- key_prefix=keyPrefix)
- if retD == 1:
- retMap[True] += 1
- else:
- retMap[False] += 1
- # failed
- if retMap[False] != 0:
- _logger.error("deleteFiles failed %s/%s" % (retMap[False],
- retMap[True]+retMap[False]))
- return False
- _logger.debug("deleteFiles succeeded")
- return True
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("deleteFiles failed with %s %s" % (errType,errValue))
- return False
-
-
- # check files
- def checkFiles(self,pandaID,files,site,node,keyPrefix='',getDetail=False):
- try:
- _logger.debug("checkFiles PandaID=%s with %s:%s start" % (pandaID,site,node))
- # get key prefix
- if keyPrefix == '':
- keyPrefix = self.getKeyPrefix(site,node)
- # non-existing key
- if keyPrefix == None:
- _logger.debug("checkFiles PandaID=%s with %s:%s doesn't exist" % \
- (pandaID,site,node))
- return 0
- # loop over all files
- keyList = []
- for tmpFile in files:
- newKey = tmpFile
- if not newKey in keyList:
- keyList.append(newKey)
- # bulk get
- retMap = self.mclient.get_multi(keyList,key_prefix=keyPrefix)
- _logger.debug("checkFiles PandaID=%s with %s:%s has %s files" % \
- (pandaID,site,node,len(retMap)))
- # return detailed string
- if getDetail:
- retStr = ''
- for tmpFile in files:
- if retMap.has_key(tmpFile):
- retStr += '1,'
- else:
- retStr += '0,'
- retStr = retStr[:-1]
- return retStr
- # return number of files
- return len(retMap)
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("checkFiles failed with %s %s" % (errType,errValue))
- return 0
-
-
- # flush files
- def flushFiles(self,site,node):
- try:
- _logger.debug("flushFiles for %s:%s start" % (site,node))
- # key prefix stored in memcached
- keyPrefix = self.getInternalKeyPrefix(site,node)
- # increment
- serNum = self.mclient.incr(keyPrefix)
- # return if not exist
- if serNum == None:
- _logger.debug("flushFiles skipped for non-existing key")
- return True
- # avoid overflow
- if serNum > 1024:
- serNum = 0
- # set
- retS = self.mclient.set(keyPrefix,serNum,time=panda_config.memcached_exptime)
- if retS == 0:
- # failed
- _logger.error("flushFiles failed to set new SN")
- return False
- _logger.error("flushFiles completed")
- return True
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("flushFiles failed with %s %s" % (errType,errValue))
- return False
-
-
- # get internal key prefix
- def getInternalKeyPrefix(self,site,node):
- # get short WN name
- shortWN = node.split('.')[0]
- # key prefix stored in memcached
- keyPrefix = '%s_%s' % (site,shortWN)
- return keyPrefix
-
-
- # get key prefix
- def getKeyPrefix(self,site,node):
- # key prefix stored in memcached
- keyPrefix = self.getInternalKeyPrefix(site,node)
- # get serial number from memcached
- serNum = self.mclient.get(keyPrefix)
- # use 0 if not exist
- if serNum == None:
- serNum = 0
- # set to avoid expiration
- retS = self.mclient.set(keyPrefix,serNum,time=panda_config.memcached_exptime)
- if retS == 0:
- # failed
- return None
- else:
- # return prefix site_node_sn_
- newPrefix = '%s_%s' % (keyPrefix,serNum)
- return newPrefix
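The memcached proxy deleted above relies on a serial-number prefix rather than explicit deletion: each (site, worker node) pair keeps a counter under 'site_shortnode', file keys are written with the prefix 'site_shortnode_<serial>', and flushFiles simply bumps the counter so every previously written key becomes unreachable and eventually expires. A rough sketch of that trick with a plain dict standing in for the memcached client (names and values are illustrative):

    store = {}   # stand-in for memcached

    def internal_prefix(site, node):
        return '%s_%s' % (site, node.split('.')[0])

    def key_prefix(site, node):
        prefix = internal_prefix(site, node)
        serial = store.setdefault(prefix, 0)        # per-(site, node) serial number
        return '%s_%s' % (prefix, serial)

    def flush_files(site, node):
        prefix = internal_prefix(site, node)
        if prefix in store:
            store[prefix] = (store[prefix] + 1) % 1024   # wrap, mirroring the original overflow guard

    # a file registered before a flush is invisible afterwards
    store[key_prefix('SITE', 'wn01.example.org') + 'file1.root'] = True
    flush_files('SITE', 'wn01.example.org')
    print(key_prefix('SITE', 'wn01.example.org') + 'file1.root' in store)   # -> False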
diff --git a/current/pandaserver/taskbuffer/OraDBProxy.py b/current/pandaserver/taskbuffer/OraDBProxy.py
deleted file mode 100755
index 1295ef360..000000000
--- a/current/pandaserver/taskbuffer/OraDBProxy.py
+++ /dev/null
@@ -1,10739 +0,0 @@
-"""
-proxy for database connection
-
-"""
-
-import re
-import os
-import sys
-import time
-import fcntl
-import types
-import random
-import urllib
-import socket
-import datetime
-import commands
-import traceback
-import warnings
-import cx_Oracle
-import ErrorCode
-import SiteSpec
-import CloudSpec
-import PrioUtil
-import ProcessGroups
-from JobSpec import JobSpec
-from FileSpec import FileSpec
-from DatasetSpec import DatasetSpec
-from CloudTaskSpec import CloudTaskSpec
-from pandalogger.PandaLogger import PandaLogger
-from config import panda_config
-from brokerage.PandaSiteIDs import PandaSiteIDs
-
-warnings.filterwarnings('ignore')
-
-# logger
-_logger = PandaLogger().getLogger('DBProxy')
-
-# lock file
-_lockGetSN = open(panda_config.lockfile_getSN, 'w')
-_lockSetDS = open(panda_config.lockfile_setDS, 'w')
-_lockGetCT = open(panda_config.lockfile_getCT, 'w')
-
-
-# proxy
-class DBProxy:
-
- # constructor
- def __init__(self,useOtherError=False):
- # connection object
- self.conn = None
- # cursor object
- self.cur = None
- # host name
- self.hostname = None
- # retry count
- self.nTry = 5
- # use special error codes for reconnection in querySQL
- self.useOtherError = useOtherError
- # memcached client
- self.memcache = None
- # pledge resource ratio
- self.beyondPledgeRatio = {}
- # update time for pledge resource ratio
- self.updateTimeForPledgeRatio = None
- # fareshare policy
- self.faresharePolicy = {}
- # update time for fareshare policy
- self.updateTimeForFaresharePolicy = None
- # hostname
- self.myHostName = socket.getfqdn()
-
-
- # connect to DB
- def connect(self,dbhost=panda_config.dbhost,dbpasswd=panda_config.dbpasswd,
- dbuser=panda_config.dbuser,dbname=panda_config.dbname,
- dbtimeout=None,reconnect=False):
- _logger.debug("connect : re=%s" % reconnect)
- # keep parameters for reconnect
- if not reconnect:
- self.dbhost = dbhost
- self.dbpasswd = dbpasswd
- self.dbuser = dbuser
- self.dbname = dbname
- self.dbtimeout = dbtimeout
- # close old connection
- if reconnect:
- _logger.debug("closing old connection")
- try:
- self.conn.close()
- except:
- _logger.debug("failed to close old connection")
- # connect
- try:
- self.conn = cx_Oracle.connect(dsn=self.dbhost,user=self.dbuser,
- password=self.dbpasswd,threaded=True)
- self.cur=self.conn.cursor()
- try:
- # use SQL dumper
- if panda_config.dump_sql:
- import SQLDumper
- self.cur = SQLDumper.SQLDumper(self.cur)
- except:
- pass
- # get hostname
- self.cur.execute("SELECT SYS_CONTEXT('USERENV','HOST') FROM dual")
- res = self.cur.fetchone()
- if res != None:
- self.hostname = res[0]
- # set TZ
- self.cur.execute("ALTER SESSION SET TIME_ZONE='UTC'")
- # set DATE format
- self.cur.execute("ALTER SESSION SET NLS_DATE_FORMAT='YYYY/MM/DD HH24:MI:SS'")
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("connect : %s %s" % (type,value))
- return False
-
-
- # query an SQL
- def querySQL(self,sql,arraySize=1000):
- comment = ' /* DBProxy.querySQL */'
- try:
- _logger.debug("querySQL : %s " % sql)
- # begin transaction
- self.conn.begin()
- self.cur.arraysize = arraySize
- self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return res
- except:
- # roll back
- self._rollback(self.useOtherError)
- type, value, traceBack = sys.exc_info()
- _logger.error("querySQL : %s " % sql)
- _logger.error("querySQL : %s %s" % (type,value))
- return None
-
-
- # query an SQL return Status
- def querySQLS(self,sql,varMap,arraySize=1000):
- comment = ' /* DBProxy.querySQLS */'
- try:
- # begin transaction
- self.conn.begin()
- self.cur.arraysize = arraySize
- ret = self.cur.execute(sql+comment,varMap)
- if sql.startswith('INSERT') or sql.startswith('UPDATE') or \
- sql.startswith('DELETE'):
- res = self.cur.rowcount
- else:
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return ret,res
- except:
- # roll back
- self._rollback(self.useOtherError)
- type, value, traceBack = sys.exc_info()
- _logger.error("querySQLS : %s %s" % (sql,str(varMap)))
- _logger.error("querySQLS : %s %s" % (type,value))
- return -1,None
-
-
- # get CLOB
- def getClobObj(self,sql,varMap,arraySize=10000):
- comment = ' /* DBProxy.getClobObj */'
- try:
- # begin transaction
- self.conn.begin()
- self.cur.arraysize = arraySize
- ret = self.cur.execute(sql+comment,varMap)
- res = []
- for items in self.cur:
- resItem = []
- for item in items:
- # read CLOB
- resItem.append(item.read())
- # append
- res.append(resItem)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return ret,res
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getClobObj : %s %s" % (sql,str(varMap)))
- _logger.error("getClobObj : %s %s" % (type,value))
- return -1,None
-
-
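Aside on the CLOB handling in getClobObj above: cx_Oracle hands CLOB columns back as LOB objects rather than strings, so each value has to be read while the cursor is still usable. A hedged, generic sketch of that pattern (the connection is assumed to exist already; nothing here reflects the server's real configuration):

    import cx_Oracle

    def fetch_clob_rows(conn, sql, bind_vars):
        """Run a SELECT that returns CLOB columns and materialise them as strings."""
        cur = conn.cursor()
        try:
            cur.execute(sql, bind_vars)
            rows = []
            for items in cur:
                # LOB handles must be read before the cursor goes away
                rows.append([item.read() if isinstance(item, cx_Oracle.LOB) else item
                             for item in items])
            return rows
        finally:
            cur.close()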
- # insert job to jobsDefined
- def insertNewJob(self,job,user,serNum,weight=0.0,priorityOffset=0,userVO=None,groupJobSN=0,toPending=False):
- comment = ' /* DBProxy.insertNewJob */'
- if not toPending:
- sql1 = "INSERT INTO ATLAS_PANDA.jobsDefined4 (%s) " % JobSpec.columnNames()
- else:
- sql1 = "INSERT INTO ATLAS_PANDA.jobsWaiting4 (%s) " % JobSpec.columnNames()
- sql1+= JobSpec.bindValuesExpression(useSeq=True)
- sql1+= " RETURNING PandaID INTO :newPandaID"
- # make sure PandaID is NULL
- job.PandaID = None
- # job status
- if not toPending:
- job.jobStatus='defined'
- else:
- job.jobStatus='pending'
- # host and time information
- job.modificationHost = self.hostname
- job.creationTime = datetime.datetime.utcnow()
- job.modificationTime = job.creationTime
- job.stateChangeTime = job.creationTime
- job.prodDBUpdateTime = datetime.datetime(1,1,1)
- # DN
- if job.prodUserID == "NULL" or job.prodSourceLabel in ['user','panda']:
- job.prodUserID = user
- # compact user name
- job.prodUserName = self.cleanUserID(job.prodUserID)
- if job.prodUserName in ['','NULL']:
- # use prodUserID as compact user name
- job.prodUserName = job.prodUserID
- # VO
- job.VO = userVO
- # priority
- if job.assignedPriority != 'NULL':
- job.currentPriority = job.assignedPriority
- if job.prodSourceLabel == 'install':
- job.currentPriority = 4100
- elif job.prodSourceLabel == 'user':
- if job.processingType in ['usermerge'] and not job.currentPriority in ['NULL',None]:
- # avoid prio reduction for merge jobs
- pass
- else:
- job.currentPriority = PrioUtil.calculatePriority(priorityOffset,serNum,weight)
- if 'express' in job.specialHandling:
- job.currentPriority = 6000
- elif job.prodSourceLabel == 'panda':
- job.currentPriority = 2000 + priorityOffset
- if 'express' in job.specialHandling:
- job.currentPriority = 6500
- # usergroup
- if job.prodSourceLabel == 'regional':
- job.computingSite= "BNLPROD"
- # group job SN
- groupJobSN = "%05d" % groupJobSN
- # set attempt numbers
- if job.prodSourceLabel in ['user','panda','ptest','rc_test']:
- if job.attemptNr in [None,'NULL','']:
- job.attemptNr = 0
- if job.maxAttempt in [None,'NULL','']:
- job.maxAttempt = 0
- # set maxAttempt to attemptNr to disable server/pilot retry
- if job.maxAttempt == -1:
- job.maxAttempt = job.attemptNr
- else:
- # set maxAttempt to have server/pilot retries for retried jobs
- if job.maxAttempt <= job.attemptNr:
- job.maxAttempt = job.attemptNr + 2
- try:
- # begin transaction
- self.conn.begin()
- # insert
- varMap = job.valuesMap(useSeq=True)
- varMap[':newPandaID'] = self.cur.var(cx_Oracle.NUMBER)
- retI = self.cur.execute(sql1+comment, varMap)
- # set PandaID
- job.PandaID = long(varMap[':newPandaID'].getvalue())
- # get jobsetID
- if job.jobsetID in [None,'NULL',-1]:
- jobsetID = 0
- else:
- jobsetID = job.jobsetID
- jobsetID = '%06d' % jobsetID
- # reset changed attribute list
- job.resetChangedList()
- # insert files
- _logger.debug("insertNewJob : %s Label:%s prio:%s" % (job.PandaID,job.prodSourceLabel,
- job.currentPriority))
- sqlFile = "INSERT INTO ATLAS_PANDA.filesTable4 (%s) " % FileSpec.columnNames()
- sqlFile+= FileSpec.bindValuesExpression(useSeq=True)
- sqlFile+= " RETURNING row_ID INTO :newRowID"
- for file in job.Files:
- file.row_ID = None
- if file.status != 'ready':
- file.status='unknown'
- # replace $PANDAID with real PandaID
- file.lfn = re.sub('\$PANDAID', '%05d' % job.PandaID, file.lfn)
- # replace $JOBSETID with real jobsetID
- if not job.prodSourceLabel in ['managed']:
- file.lfn = re.sub('\$JOBSETID', jobsetID, file.lfn)
- file.lfn = re.sub('\$GROUPJOBSN', groupJobSN, file.lfn)
- # set scope
- if file.type in ['output','log'] and job.VO in ['atlas']:
- file.scope = self.extractScope(file.dataset)
- # insert
- varMap = file.valuesMap(useSeq=True)
- varMap[':newRowID'] = self.cur.var(cx_Oracle.NUMBER)
- self.cur.execute(sqlFile+comment, varMap)
- # get rowID
- file.row_ID = long(varMap[':newRowID'].getvalue())
- # reset changed attribute list
- file.resetChangedList()
- # metadata
- if job.prodSourceLabel in ['user','panda'] and job.metadata != '':
- sqlMeta = "INSERT INTO ATLAS_PANDA.metaTable (PandaID,metaData) VALUES (:PandaID,:metaData)"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':metaData'] = job.metadata
- self.cur.execute(sqlMeta+comment, varMap)
- # job parameters
- if not job.prodSourceLabel in ['managed']:
- job.jobParameters = re.sub('\$JOBSETID', jobsetID, job.jobParameters)
- job.jobParameters = re.sub('\$GROUPJOBSN', groupJobSN, job.jobParameters)
- sqlJob = "INSERT INTO ATLAS_PANDA.jobParamsTable (PandaID,jobParameters) VALUES (:PandaID,:param)"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':param'] = job.jobParameters
- self.cur.execute(sqlJob+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("insertNewJob : %s File OK" % job.PandaID)
- # record status change
- try:
- self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job)
- except:
- _logger.error('recordStatusChange in insertNewJob')
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("insertNewJob : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
-
- # simply insert job to a table
- def insertJobSimple(self,job,table,fileTable,jobParamsTable,metaTable):
- comment = ' /* DBProxy.insertJobSimple */'
- _logger.debug("insertJobSimple : %s" % job.PandaID)
- sql1 = "INSERT INTO %s (%s) " % (table,JobSpec.columnNames())
- sql1+= JobSpec.bindValuesExpression()
- try:
- # begin transaction
- self.conn.begin()
- # insert
- self.cur.execute(sql1+comment, job.valuesMap())
- # files
- sqlFile = "INSERT INTO %s " % fileTable
- sqlFile+= "(%s) " % FileSpec.columnNames(withMod=True)
- sqlFile+= FileSpec.bindValuesExpression(withMod=True)
- for file in job.Files:
- varMap = file.valuesMap()
- varMap[':modificationTime'] = job.modificationTime
- self.cur.execute(sqlFile+comment, varMap)
- # job parameters
- sqlJob = "INSERT INTO %s (PandaID,jobParameters,modificationTime) VALUES (:PandaID,:param,:modificationTime)" \
- % jobParamsTable
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':param'] = job.jobParameters
- varMap[':modificationTime'] = job.modificationTime
- self.cur.execute(sqlJob+comment, varMap)
- # metadata
- if not job.metadata in [None,'NULL','']:
- sqlMeta = "INSERT INTO %s (PandaID,metaData,modificationTime) VALUES(:PandaID,:metaData,:modificationTime)" \
- % metaTable
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':metaData'] = job.metadata
- varMap[':modificationTime'] = job.modificationTime
- self.cur.execute(sqlMeta+comment,varMap)
- # set flag to avoid duplicated insertion attempts
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':archivedFlag'] = 1
- sqlArch = "UPDATE ATLAS_PANDA.jobsArchived4 SET archivedFlag=:archivedFlag WHERE PandaID=:PandaID"
- self.cur.execute(sqlArch+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("insertJobSimple : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
-
- # simply insert job to a table without reading
- def insertJobSimpleUnread(self,pandaID,modTime):
- comment = ' /* DBProxy.insertJobSimpleUnread */'
- _logger.debug("insertJobSimpleUnread : %s" % pandaID)
- # check
- sqlC = "SELECT archivedFlag FROM ATLAS_PANDA.jobsArchived4 "
- sqlC += "WHERE PandaID=:pandaID "
- # job
- sqlJ = "INSERT INTO ATLAS_PANDAARCH.jobsArchived (%s) " % JobSpec.columnNames()
- sqlJ += "SELECT %s FROM ATLAS_PANDA.jobsArchived4 " % JobSpec.columnNames()
- sqlJ += "WHERE PandaID=:pandaID "
- # file
- sqlF = "INSERT INTO ATLAS_PANDAARCH.filesTable_ARCH (%s) " % FileSpec.columnNames(withMod=True)
- sqlF += "SELECT %s,:modTime FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames(withMod=False)
- sqlF += "WHERE PandaID=:pandaID "
- # parameters
- sqlP = "INSERT INTO ATLAS_PANDAARCH.jobParamsTable_ARCH (PandaID,jobParameters,modificationTime) "
- sqlP += "SELECT PandaID,jobParameters,:modTime FROM ATLAS_PANDA.jobParamsTable "
- sqlP += "WHERE PandaID=:pandaID "
- # metadata
- sqlM1 = "SELECT PandaID FROM ATLAS_PANDA.metaTable "
- sqlM1 += "WHERE PandaID=:pandaID AND rownum<=1 "
- sqlM2 = "INSERT INTO ATLAS_PANDAARCH.metaTable_ARCH (PandaID,metaData,modificationTime) "
- sqlM2 += "SELECT PandaID,metaData,:modTime FROM ATLAS_PANDA.metaTable "
- sqlM2 += "WHERE PandaID=:pandaID "
- try:
- # begin transaction
- self.conn.begin()
- # check
- varMap = {}
- varMap[':pandaID'] = pandaID
- self.cur.execute(sqlC+comment,varMap)
- res = self.cur.fetchone()
- if res == None or res[0] == 1:
- if res == None:
- _logger.error("insertJobSimpleUnread : %s cannot get archivedFlag" % pandaID)
- else:
- _logger.debug("insertJobSimpleUnread : %s skip" % pandaID)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- # insert
- varMap = {}
- varMap[':pandaID'] = pandaID
- self.cur.execute(sqlJ+comment,varMap)
- varMap = {}
- varMap[':pandaID'] = pandaID
- varMap[':modTime'] = modTime
- self.cur.execute(sqlF+comment,varMap)
- varMap = {}
- varMap[':pandaID'] = pandaID
- varMap[':modTime'] = modTime
- self.cur.execute(sqlP+comment,varMap)
- varMap = {}
- varMap[':pandaID'] = pandaID
- self.cur.execute(sqlM1+comment,varMap)
- res = self.cur.fetchone()
- if res != None:
- varMap = {}
- varMap[':pandaID'] = pandaID
- varMap[':modTime'] = modTime
- self.cur.execute(sqlM2+comment,varMap)
- # set flag to avoid duplicated insertion attempts
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':archivedFlag'] = 1
- sqlArch = "UPDATE ATLAS_PANDA.jobsArchived4 SET archivedFlag=:archivedFlag WHERE PandaID=:PandaID"
- self.cur.execute(sqlArch+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("insertJobSimpleUnread %s : %s %s" % (pandaID,type,value))
- # roll back
- self._rollback()
- return False
-
-
- # delete job
- def deleteJobSimple(self,pandaID):
- comment = ' /* DBProxy.deleteJobSimple */'
- _logger.debug("deleteJobSimple : %s" % pandaID)
- try:
- # begin transaction
- self.conn.begin()
- # delete
- varMap = {}
- varMap[':PandaID'] = pandaID
- sql = 'DELETE from ATLAS_PANDA.jobsArchived4 WHERE PandaID=:PandaID'
- self.cur.execute(sql+comment, varMap)
- sql = "DELETE from ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID"
- self.cur.execute(sql+comment, varMap)
- sql = "DELETE from ATLAS_PANDA.metaTable WHERE PandaID=:PandaID"
- self.cur.execute(sql+comment, varMap)
- sql = "DELETE from ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID"
- self.cur.execute(sql+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- type, value = sys.exc_info()[:2]
- _logger.error("deleteJobSimple %s : %s %s" % (pandaID,type,value))
- # roll back
- self._rollback()
- return False
-
-
- # activate job. move job from jobsDefined to jobsActive
- def activateJob(self,job):
- comment = ' /* DBProxy.activateJob */'
- updatedFlag = False
- if job==None:
- _logger.debug("activateJob : None")
- return True
- _logger.debug("activateJob : %s" % job.PandaID)
- sql0 = "SELECT row_ID FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type AND NOT status IN (:status1,:status2) "
- sql1 = "DELETE FROM ATLAS_PANDA.jobsDefined4 "
- sql1+= "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2) AND commandToPilot IS NULL"
- sql2 = "INSERT INTO ATLAS_PANDA.jobsActive4 (%s) " % JobSpec.columnNames()
- sql2+= JobSpec.bindValuesExpression()
- # host and time information
- job.modificationTime = datetime.datetime.utcnow()
- # set stateChangeTime for defined->activated but not for assigned->activated
- if job.jobStatus in ['defined']:
- job.stateChangeTime = job.modificationTime
- nTry=3
- for iTry in range(nTry):
- try:
- # check if all files are ready
- allOK = True
- for file in job.Files:
- if file.type == 'input' and not file.status in ['ready','cached']:
- allOK = False
- break
- # begin transaction
- self.conn.begin()
- # check all inputs are ready
- varMap = {}
- varMap[':type'] = 'input'
- varMap[':status1'] = 'ready'
- varMap[':status2'] = 'cached'
- varMap[':PandaID'] = job.PandaID
- self.cur.arraysize = 100
- self.cur.execute(sql0+comment, varMap)
- res = self.cur.fetchall()
- if len(res) == 0 or allOK:
- # change status
- job.jobStatus = "activated"
- # delete
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':oldJobStatus1'] = 'assigned'
- varMap[':oldJobStatus2'] = 'defined'
- self.cur.execute(sql1+comment, varMap)
- n = self.cur.rowcount
- if n==0:
- # already killed or activated
- _logger.debug("activateJob : Not found %s" % job.PandaID)
- else:
- # insert
- self.cur.execute(sql2+comment, job.valuesMap())
- # update files
- for file in job.Files:
- sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID"
- varMap = file.valuesMap(onlyChanged=True)
- if varMap != {}:
- varMap[':row_ID'] = file.row_ID
- _logger.debug(sqlF+comment+str(varMap))
- self.cur.execute(sqlF+comment, varMap)
- # job parameters
- sqlJob = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':param'] = job.jobParameters
- self.cur.execute(sqlJob+comment, varMap)
- updatedFlag = True
- else:
- # update job
- sqlJ = ("UPDATE ATLAS_PANDA.jobsDefined4 SET %s " % job.bindUpdateChangesExpression()) + \
- "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)"
- varMap = job.valuesMap(onlyChanged=True)
- varMap[':PandaID'] = job.PandaID
- varMap[':oldJobStatus1'] = 'assigned'
- varMap[':oldJobStatus2'] = 'defined'
- _logger.debug(sqlJ+comment+str(varMap))
- self.cur.execute(sqlJ+comment, varMap)
- n = self.cur.rowcount
- if n==0:
- # already killed or activated
- _logger.debug("activateJob : Not found %s" % job.PandaID)
- else:
- # update files
- for file in job.Files:
- sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID"
- varMap = file.valuesMap(onlyChanged=True)
- if varMap != {}:
- varMap[':row_ID'] = file.row_ID
- _logger.debug(sqlF+comment+str(varMap))
- self.cur.execute(sqlF+comment, varMap)
- # job parameters
- sqlJob = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':param'] = job.jobParameters
- self.cur.execute(sqlJob+comment, varMap)
- updatedFlag = True
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # record status change
- try:
- if updatedFlag:
- self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job)
- except:
- _logger.error('recordStatusChange in activateJob')
- return True
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("activateJob : %s retry : %s" % (job.PandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("activateJob : %s %s" % (type,value))
- return False
-
-
- # send job to jobsWaiting
- def keepJob(self,job):
- comment = ' /* DBProxy.keepJob */'
- _logger.debug("keepJob : %s" % job.PandaID)
- sql1 = "DELETE FROM ATLAS_PANDA.jobsDefined4 "
- sql1+= "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2) AND commandToPilot IS NULL"
- sql2 = "INSERT INTO ATLAS_PANDA.jobsWaiting4 (%s) " % JobSpec.columnNames()
- sql2+= JobSpec.bindValuesExpression()
- # time information
- job.modificationTime = datetime.datetime.utcnow()
- job.stateChangeTime = job.modificationTime
- updatedFlag = False
- nTry=3
- for iTry in range(nTry):
- try:
- # begin transaction
- self.conn.begin()
- # delete
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':oldJobStatus1'] = 'assigned'
- varMap[':oldJobStatus2'] = 'defined'
- self.cur.execute(sql1+comment, varMap)
- n = self.cur.rowcount
- if n==0:
- # already killed
- _logger.debug("keepJob : Not found %s" % job.PandaID)
- else:
- # set status
- job.jobStatus = 'waiting'
- # insert
- self.cur.execute(sql2+comment, job.valuesMap())
- # update files
- for file in job.Files:
- sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID"
- varMap = file.valuesMap(onlyChanged=True)
- if varMap != {}:
- varMap[':row_ID'] = file.row_ID
- _logger.debug(sqlF+comment+str(varMap))
- self.cur.execute(sqlF+comment, varMap)
- updatedFlag = True
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # record status change
- try:
- if updatedFlag:
- self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job)
- except:
- _logger.error('recordStatusChange in keepJob')
- return True
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("keepJob : %s retry : %s" % (job.PandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("keepJob : %s %s" % (type,value))
- return False
-
-
- # archive job to jobArchived and remove the job from jobsActive or jobsDefined
- def archiveJob(self,job,fromJobsDefined):
- comment = ' /* DBProxy.archiveJob */'
- _logger.debug("archiveJob : %s" % job.PandaID)
- if fromJobsDefined:
- sql1 = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)"
- else:
- sql1 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID"
- sql2 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
- sql2+= JobSpec.bindValuesExpression()
- updatedJobList = []
- nTry=3
- for iTry in range(nTry):
- try:
- # begin transaction
- self.conn.begin()
- # delete
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- if fromJobsDefined:
- varMap[':oldJobStatus1'] = 'assigned'
- varMap[':oldJobStatus2'] = 'defined'
- self.cur.execute(sql1+comment, varMap)
- n = self.cur.rowcount
- if n==0:
- # already killed
- _logger.debug("archiveJob : Not found %s" % job.PandaID)
- else:
- # insert
- job.modificationTime = datetime.datetime.utcnow()
- job.stateChangeTime = job.modificationTime
- if job.endTime == 'NULL':
- job.endTime = job.modificationTime
- self.cur.execute(sql2+comment, job.valuesMap())
- # update files
- for file in job.Files:
- sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID"
- varMap = file.valuesMap(onlyChanged=True)
- if varMap != {}:
- varMap[':row_ID'] = file.row_ID
- _logger.debug(sqlF+comment+str(varMap))
- self.cur.execute(sqlF+comment, varMap)
- # update metadata and parameters
- sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':modificationTime'] = job.modificationTime
- self.cur.execute(sqlFMod+comment,varMap)
- self.cur.execute(sqlMMod+comment,varMap)
- self.cur.execute(sqlPMod+comment,varMap)
- # increment the number of failed jobs in _dis
- myDisList = []
- if job.jobStatus == 'failed' and job.prodSourceLabel in ['managed','test']:
- for tmpFile in job.Files:
- if tmpFile.type == 'input' and not tmpFile.dispatchDBlock in ['','NULL',None] \
- and not tmpFile.dispatchDBlock in myDisList:
- varMap = {}
- varMap[':name'] = tmpFile.dispatchDBlock
- # check currentfiles
- sqlGetCurFiles = """SELECT /*+ BEGIN_OUTLINE_DATA """
- sqlGetCurFiles += """INDEX_RS_ASC(@"SEL$1" "TAB"@"SEL$1" ("DATASETS"."NAME")) """
- sqlGetCurFiles += """OUTLINE_LEAF(@"SEL$1") ALL_ROWS """
- sqlGetCurFiles += """OPTIMIZER_FEATURES_ENABLE('10.2.0.4') """
- sqlGetCurFiles += """IGNORE_OPTIM_EMBEDDED_HINTS """
- sqlGetCurFiles += """END_OUTLINE_DATA */ """
- sqlGetCurFiles += "currentfiles,vuid FROM ATLAS_PANDA.Datasets tab WHERE name=:name"
- self.cur.execute(sqlGetCurFiles+comment,varMap)
- resCurFiles = self.cur.fetchone()
- _logger.debug("archiveJob : %s %s" % (job.PandaID,str(resCurFiles)))
- if resCurFiles != None:
- # increment currentfiles only for the first failed job since that is enough
- tmpCurrentFiles,tmpVUID = resCurFiles
- _logger.debug("archiveJob : %s %s currentfiles=%s" % (job.PandaID,tmpFile.dispatchDBlock,tmpCurrentFiles))
- if tmpCurrentFiles == 0:
- _logger.debug("archiveJob : %s %s update currentfiles" % (job.PandaID,tmpFile.dispatchDBlock))
- varMap = {}
- varMap[':vuid'] = tmpVUID
- sqlFailedInDis = 'UPDATE ATLAS_PANDA.Datasets '
- sqlFailedInDis += 'SET currentfiles=currentfiles+1 WHERE vuid=:vuid'
- self.cur.execute(sqlFailedInDis+comment,varMap)
- myDisList.append(tmpFile.dispatchDBlock)
- # collect to record state change
- updatedJobList.append(job)
- # delete downstream jobs
- ddmIDs = []
- newJob = None
- ddmAttempt = 0
- if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed':
- # look for outputs
- upOutputs = []
- for file in job.Files:
- if file.type == 'output':
- upOutputs.append(file.lfn)
- toBeClosedSubList = {}
- topUserDsList = []
- # look for downstream jobs
- sqlD = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND lfn=:lfn GROUP BY PandaID"
- sqlDJS = "SELECT %s " % JobSpec.columnNames()
- sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
- sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
- sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
- sqlDJI+= JobSpec.bindValuesExpression()
- sqlDFup = "UPDATE ATLAS_PANDA.filesTable4 SET status=:status WHERE PandaID=:PandaID AND type IN (:type1,:type2)"
- sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlGetSub = "SELECT DISTINCT destinationDBlock FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND PandaID=:PandaID"
- sqlCloseSub = 'UPDATE /*+ INDEX_RS_ASC(TAB("DATASETS"."NAME")) */ ATLAS_PANDA.Datasets tab '
- sqlCloseSub += 'SET status=:status,modificationDate=CURRENT_DATE WHERE name=:name'
- for upFile in upOutputs:
- _logger.debug("look for downstream jobs for %s" % upFile)
- # select PandaID
- varMap = {}
- varMap[':lfn'] = upFile
- varMap[':type'] = 'input'
- self.cur.arraysize = 100000
- self.cur.execute(sqlD+comment, varMap)
- res = self.cur.fetchall()
- for downID, in res:
- _logger.debug("delete : %s" % downID)
- # select jobs
- varMap = {}
- varMap[':PandaID'] = downID
- self.cur.arraysize = 10
- self.cur.execute(sqlDJS+comment, varMap)
- resJob = self.cur.fetchall()
- if len(resJob) == 0:
- continue
- # instantiate JobSpec
- dJob = JobSpec()
- dJob.pack(resJob[0])
- # delete
- varMap = {}
- varMap[':PandaID'] = downID
- self.cur.execute(sqlDJD+comment, varMap)
- retD = self.cur.rowcount
- if retD == 0:
- continue
- # error code
- dJob.jobStatus = 'cancelled'
- dJob.endTime = datetime.datetime.utcnow()
- dJob.taskBufferErrorCode = ErrorCode.EC_Kill
- dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed'
- dJob.modificationTime = dJob.endTime
- dJob.stateChangeTime = dJob.endTime
- # insert
- self.cur.execute(sqlDJI+comment, dJob.valuesMap())
- # update file status
- varMap = {}
- varMap[':PandaID'] = downID
- varMap[':status'] = 'failed'
- varMap[':type1'] = 'output'
- varMap[':type2'] = 'log'
- self.cur.execute(sqlDFup+comment, varMap)
-                            # update files,metadata,parameters
- varMap = {}
- varMap[':PandaID'] = downID
- varMap[':modificationTime'] = dJob.modificationTime
- self.cur.execute(sqlFMod+comment,varMap)
- self.cur.execute(sqlMMod+comment,varMap)
- self.cur.execute(sqlPMod+comment,varMap)
- # collect to record state change
- updatedJobList.append(dJob)
- # set tobeclosed to sub datasets
- if not toBeClosedSubList.has_key(dJob.jobDefinitionID):
- # init
- toBeClosedSubList[dJob.jobDefinitionID] = []
- # get sub datasets
- varMap = {}
- varMap[':type'] = 'output'
- varMap[':PandaID'] = downID
- self.cur.arraysize = 1000
- self.cur.execute(sqlGetSub+comment, varMap)
- resGetSub = self.cur.fetchall()
- if len(resGetSub) == 0:
- continue
- # loop over all sub datasets
- for tmpDestinationDBlock, in resGetSub:
- if re.search('_sub\d+$',tmpDestinationDBlock) == None:
- continue
- if not tmpDestinationDBlock in toBeClosedSubList[dJob.jobDefinitionID]:
- # set tobeclosed
- varMap = {}
- varMap[':status'] = 'tobeclosed'
- varMap[':name'] = tmpDestinationDBlock
- self.cur.execute(sqlCloseSub+comment, varMap)
- _logger.debug("set tobeclosed for %s" % tmpDestinationDBlock)
- # append
- toBeClosedSubList[dJob.jobDefinitionID].append(tmpDestinationDBlock)
- # close top-level user dataset
- topUserDsName = re.sub('_sub\d+$','',tmpDestinationDBlock)
- if topUserDsName != tmpDestinationDBlock and not topUserDsName in topUserDsList:
- # set tobeclosed
- varMap = {}
- if dJob.processingType.startswith('gangarobot') or \
- dJob.processingType.startswith('hammercloud'):
- varMap[':status'] = 'completed'
- else:
- varMap[':status'] = 'tobeclosed'
- varMap[':name'] = topUserDsName
- self.cur.execute(sqlCloseSub+comment, varMap)
- _logger.debug("set %s for %s" % (varMap[':status'],topUserDsName))
- # append
- topUserDsList.append(topUserDsName)
- elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='dis':
- # get corresponding jobs for production movers
- vuid = ''
- # extract vuid
- match = re.search('--callBack (\S+)',job.jobParameters)
- if match != None:
- try:
- callbackUrl = urllib.unquote(match.group(1))
- callbackUrl = re.sub('[&\?]',' ', callbackUrl)
- # look for vuid=
- for item in callbackUrl.split():
- if item.startswith('vuid='):
- vuid = item.split('=')[-1]
- break
- except:
- pass
- if vuid == '':
- _logger.error("cannot extract vuid from %s" % job.jobParameters)
- else:
- # get name
- varMap = {}
- varMap[':vuid'] = vuid
- varMap[':type'] = 'dispatch'
- self.cur.arraysize = 10
- self.cur.execute("SELECT name FROM ATLAS_PANDA.Datasets WHERE vuid=:vuid AND type=:type "+comment, varMap)
- res = self.cur.fetchall()
- if len(res) != 0:
- disName = res[0][0]
- # check lost files
- varMap = {}
- varMap[':status'] = 'lost'
- varMap[':dispatchDBlock'] = disName
- sqlLost = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ distinct PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE status=:status AND dispatchDBlock=:dispatchDBlock"
- self.cur.execute(sqlLost+comment,varMap)
- resLost = self.cur.fetchall()
- # fail jobs with lost files
- sqlDJS = "SELECT %s " % JobSpec.columnNames()
- sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
- sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
- sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
- sqlDJI+= JobSpec.bindValuesExpression()
- sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- lostJobIDs = []
- for tmpID, in resLost:
- _logger.debug("fail due to lost files : %s" % tmpID)
- varMap = {}
- varMap[':PandaID'] = tmpID
- self.cur.arraysize = 10
- self.cur.execute(sqlDJS+comment, varMap)
- resJob = self.cur.fetchall()
- if len(resJob) == 0:
- continue
- # instantiate JobSpec
- dJob = JobSpec()
- dJob.pack(resJob[0])
- # delete
- varMap = {}
- varMap[':PandaID'] = tmpID
- self.cur.execute(sqlDJD+comment, varMap)
- retD = self.cur.rowcount
- if retD == 0:
- continue
- # error code
- dJob.jobStatus = 'failed'
- dJob.endTime = datetime.datetime.utcnow()
- dJob.ddmErrorCode = 101 #ErrorCode.EC_LostFile
- dJob.ddmErrorDiag = 'lost file in SE'
- dJob.modificationTime = dJob.endTime
- dJob.stateChangeTime = dJob.endTime
- # insert
- self.cur.execute(sqlDJI+comment, dJob.valuesMap())
- # update files,metadata,parameters
- varMap = {}
- varMap[':PandaID'] = tmpID
- varMap[':modificationTime'] = dJob.modificationTime
- self.cur.execute(sqlFMod+comment,varMap)
- self.cur.execute(sqlMMod+comment,varMap)
- self.cur.execute(sqlPMod+comment,varMap)
- # append
- lostJobIDs.append(tmpID)
- # collect to record state change
- updatedJobList.append(dJob)
- # get PandaIDs
- varMap = {}
- varMap[':jobStatus'] = 'assigned'
- varMap[':dispatchDBlock'] = disName
- self.cur.execute("SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE dispatchDBlock=:dispatchDBlock AND jobStatus=:jobStatus "+comment,
- varMap)
- resDDM = self.cur.fetchall()
- for tmpID, in resDDM:
- if not tmpID in lostJobIDs:
- ddmIDs.append(tmpID)
- # get offset
- ddmAttempt = job.attemptNr
- _logger.debug("get PandaID for reassign : %s ddmAttempt=%s" % (str(ddmIDs),ddmAttempt))
- elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='ddm' and job.attemptNr<2 \
- and job.commandToPilot != 'tobekilled':
- # instantiate new mover to retry subscription
- newJob = JobSpec()
- newJob.jobDefinitionID = job.jobDefinitionID
- newJob.jobName = job.jobName
- newJob.attemptNr = job.attemptNr + 1
- newJob.transformation = job.transformation
- newJob.destinationDBlock = job.destinationDBlock
- newJob.destinationSE = job.destinationSE
- newJob.currentPriority = job.currentPriority
- newJob.prodSourceLabel = job.prodSourceLabel
- newJob.prodUserID = job.prodUserID
- newJob.computingSite = job.computingSite
- newJob.transferType = job.transferType
- newJob.sourceSite = job.sourceSite
- newJob.destinationSite = job.destinationSite
- newJob.jobParameters = job.jobParameters
- if job.Files != []:
- file = job.Files[0]
- fileOL = FileSpec()
- # add attempt nr
- fileOL.lfn = re.sub("\.\d+$","",file.lfn)
- fileOL.lfn = "%s.%d" % (fileOL.lfn,job.attemptNr)
- fileOL.destinationDBlock = file.destinationDBlock
- fileOL.destinationSE = file.destinationSE
- fileOL.dataset = file.dataset
- fileOL.type = file.type
- newJob.addFile(fileOL)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # record status change
- try:
- for tmpJob in updatedJobList:
- self.recordStatusChange(tmpJob.PandaID,tmpJob.jobStatus,jobInfo=tmpJob)
- except:
- _logger.error('recordStatusChange in archiveJob')
- return True,ddmIDs,ddmAttempt,newJob
- except:
- # roll back
- self._rollback(True)
- if iTry+1 < nTry:
- _logger.debug("archiveJob : %s retry : %s" % (job.PandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("archiveJob : %s" % job.PandaID)
- _logger.error("archiveJob : %s %s" % (type,value))
- return False,[],0,None
-
-
- # lightweight variant of archiveJob
- def archiveJobLite(self,pandaID,jobStatus,param):
- comment = ' /* DBProxy.archiveJobLite */'
- _logger.debug("archiveJobLite : %s" % pandaID)
- sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames()
- sql1+= "WHERE PandaID=:PandaID"
- sql2 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID"
- sql3 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
- sql3+= JobSpec.bindValuesExpression()
- sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- nTry=3
- for iTry in range(nTry):
- try:
- # begin transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- self.cur.arraysize = 10
- self.cur.execute(sql1+comment, varMap)
- res = self.cur.fetchall()
- if len(res) == 0:
- _logger.error("archiveJobLite() : PandaID %d not found" % pandaID)
- self._rollback()
- return False
- job = JobSpec()
- job.pack(res[0])
- job.jobStatus = jobStatus
- for key in param.keys():
- if param[key] != None:
- setattr(job,key,param[key])
- job.modificationTime = datetime.datetime.utcnow()
- job.endTime = job.modificationTime
- job.stateChangeTime = job.modificationTime
- # delete
- self.cur.execute(sql2+comment, varMap)
- n = self.cur.rowcount
- if n==0:
- # already killed
- _logger.debug("archiveJobLite : Not found %s" % pandaID)
- else:
- # insert
- self.cur.execute(sql3+comment, job.valuesMap())
- # update files
- for file in job.Files:
- sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID"
- varMap = file.valuesMap(onlyChanged=True)
- if varMap != {}:
- varMap[':row_ID'] = file.row_ID
- _logger.debug(sqlF+comment+str(varMap))
- self.cur.execute(sqlF+comment, varMap)
- # update files,metadata,parameters
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':modificationTime'] = job.modificationTime
- self.cur.execute(sqlFMod+comment,varMap)
- self.cur.execute(sqlMMod+comment,varMap)
- self.cur.execute(sqlPMod+comment,varMap)
- # delete downstream jobs
- if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed':
- # file select
- sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = pandaID
- self.cur.arraysize = 100000
- self.cur.execute(sqlFile+comment, varMap)
- resFs = self.cur.fetchall()
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- job.addFile(file)
- # look for outputs
- upOutputs = []
- for file in job.Files:
- if file.type == 'output':
- upOutputs.append(file.lfn)
- # look for downstream jobs
- sqlD = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND lfn=:lfn GROUP BY PandaID"
- sqlDJS = "SELECT %s " % JobSpec.columnNames()
- sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
- sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
- sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
- sqlDJI+= JobSpec.bindValuesExpression()
- for upFile in upOutputs:
- _logger.debug("look for downstream jobs for %s" % upFile)
- # select PandaID
- varMap = {}
- varMap[':lfn'] = upFile
- varMap[':type'] = 'input'
- self.cur.arraysize = 100000
- self.cur.execute(sqlD+comment, varMap)
- res = self.cur.fetchall()
- for downID, in res:
- _logger.debug("delete : %s" % downID)
- # select jobs
- varMap = {}
- varMap[':PandaID'] = downID
- self.cur.arraysize = 10
- self.cur.execute(sqlDJS+comment, varMap)
- resJob = self.cur.fetchall()
- if len(resJob) == 0:
- continue
- # instantiate JobSpec
- dJob = JobSpec()
- dJob.pack(resJob[0])
- # delete
- varMap = {}
- varMap[':PandaID'] = downID
- self.cur.execute(sqlDJD+comment, varMap)
- retD = self.cur.rowcount
- if retD == 0:
- continue
- # error code
- dJob.jobStatus = 'failed'
- dJob.endTime = datetime.datetime.utcnow()
- dJob.taskBufferErrorCode = ErrorCode.EC_Kill
- dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed'
- dJob.modificationTime = dJob.endTime
- dJob.stateChangeTime = dJob.endTime
- # insert
- self.cur.execute(sqlDJI+comment, dJob.valuesMap())
- # update files,metadata,parameters
- varMap = {}
- varMap[':PandaID'] = downID
- varMap[':modificationTime'] = dJob.modificationTime
- self.cur.execute(sqlFMod+comment,varMap)
- self.cur.execute(sqlMMod+comment,varMap)
- self.cur.execute(sqlPMod+comment,varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("archiveJobLite : %s retry : %s" % (pandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("archiveJobLite : %s %s" % (type,value))
- return False
-
-
- # finalize pending jobs
- def finalizePendingJobs(self,prodUserName,jobDefinitionID):
- comment = ' /* DBProxy.finalizePendingJobs */'
- _logger.debug("finalizePendingJobs : %s %s" % (prodUserName,jobDefinitionID))
- sql0 = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 "
- sql0+= "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql0+= "AND prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus "
- sqlU = "UPDATE ATLAS_PANDA.jobsActive4 SET jobStatus=:newJobStatus "
- sqlU+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus "
- sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames()
- sql1+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus "
- sql2 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID AND jobStatus=:jobStatus "
- sql3 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
- sql3+= JobSpec.bindValuesExpression()
- sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- try:
- # begin transaction
- self.conn.begin()
- self.cur.arraysize = 100000
- # select
- varMap = {}
- varMap[':jobStatus'] = 'failed'
- varMap[':prodUserName'] = prodUserName
- varMap[':jobDefinitionID'] = jobDefinitionID
- varMap[':prodSourceLabel'] = 'user'
- self.cur.execute(sql0+comment,varMap)
- resPending = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # lock
- pPandaIDs = []
- for pandaID, in resPending:
- # begin transaction
- self.conn.begin()
- # update
- varMap = {}
- varMap[':jobStatus'] = 'failed'
- varMap[':newJobStatus'] = 'holding'
- varMap[':PandaID'] = pandaID
- self.cur.execute(sqlU+comment,varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retU = self.cur.rowcount
- if retU != 0:
- pPandaIDs.append(pandaID)
- # loop over all PandaIDs
- for pandaID in pPandaIDs:
- # begin transaction
- self.conn.begin()
- # get job
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':jobStatus'] = 'holding'
- self.cur.arraysize = 10
- self.cur.execute(sql1+comment,varMap)
- res = self.cur.fetchall()
- if len(res) == 0:
- _logger.debug("finalizePendingJobs : PandaID %d not found" % pandaID)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- continue
- job = JobSpec()
- job.pack(res[0])
- job.jobStatus = 'failed'
- job.modificationTime = datetime.datetime.utcnow()
- # delete
- self.cur.execute(sql2+comment,varMap)
- n = self.cur.rowcount
- if n==0:
- # already killed
- _logger.debug("finalizePendingJobs : Not found %s" % pandaID)
- else:
- # insert
- self.cur.execute(sql3+comment,job.valuesMap())
- # update files,metadata,parameters
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':modificationTime'] = job.modificationTime
- self.cur.execute(sqlFMod+comment,varMap)
- self.cur.execute(sqlMMod+comment,varMap)
- self.cur.execute(sqlPMod+comment,varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("finalizePendingJobs : %s %s done for %s" % (prodUserName,jobDefinitionID,len(pPandaIDs)))
- return True
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("finalizePendingJobs : %s %s" % (errType,errValue))
- return False
-
-
- # delete stalled jobs
- def deleteStalledJobs(self,libFileName):
- comment = ' /* DBProxy.deleteStalledJobs */'
- _logger.debug("deleteStalledJobs : %s" % libFileName)
- sql2 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
- sql2+= JobSpec.bindValuesExpression()
- nTry=3
- try:
- # begin transaction
- self.conn.begin()
- # look for downstream jobs
- sqlD = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND lfn=:lfn GROUP BY PandaID"
- sqlDJS = "SELECT %s " % JobSpec.columnNames()
- sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
- sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
- sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
- sqlDJI+= JobSpec.bindValuesExpression()
- sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- _logger.debug("deleteStalledJobs : look for downstream jobs for %s" % libFileName)
- # select PandaID
- varMap = {}
- varMap[':lfn'] = libFileName
- varMap[':type'] = 'input'
- self.cur.arraysize = 100000
- self.cur.execute(sqlD+comment, varMap)
- res = self.cur.fetchall()
- for downID, in res:
- _logger.debug("deleteStalledJobs : delete %s" % downID)
- # select jobs
- varMap = {}
- varMap[':PandaID'] = downID
- self.cur.arraysize = 10
- self.cur.execute(sqlDJS+comment, varMap)
- resJob = self.cur.fetchall()
- if len(resJob) == 0:
- continue
- # instantiate JobSpec
- dJob = JobSpec()
- dJob.pack(resJob[0])
- # delete
- varMap = {}
- varMap[':PandaID'] = downID
- self.cur.execute(sqlDJD+comment, varMap)
- retD = self.cur.rowcount
- if retD == 0:
- continue
- # error code
- dJob.jobStatus = 'cancelled'
- dJob.endTime = datetime.datetime.utcnow()
- dJob.taskBufferErrorCode = ErrorCode.EC_Kill
- dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed'
- dJob.modificationTime = dJob.endTime
- dJob.stateChangeTime = dJob.endTime
- # insert
- self.cur.execute(sqlDJI+comment, dJob.valuesMap())
- # update files,metadata,parameters
- varMap = {}
- varMap[':PandaID'] = downID
- varMap[':modificationTime'] = dJob.modificationTime
- self.cur.execute(sqlFMod+comment,varMap)
- self.cur.execute(sqlMMod+comment,varMap)
- self.cur.execute(sqlPMod+comment,varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback(True)
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("deleteStalledJobs : %s %s" % (errtype,errvalue))
- return False
-
-
- # update Job status in jobsActive
- def updateJobStatus(self,pandaID,jobStatus,param,updateStateChange=False,attemptNr=None):
- comment = ' /* DBProxy.updateJobStatus */'
- _logger.debug("updateJobStatus : PandaID=%s attemptNr=%s status=%s" % (pandaID,attemptNr,jobStatus))
- sql0 = "SELECT commandToPilot,endTime,specialHandling,jobStatus,computingSite,cloud,prodSourceLabel FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID "
- varMap0 = {}
- varMap0[':PandaID'] = pandaID
- sql1 = "UPDATE ATLAS_PANDA.jobsActive4 SET jobStatus=:jobStatus,modificationTime=CURRENT_DATE"
- if updateStateChange or jobStatus in ['starting']:
- sql1 += ",stateChangeTime=CURRENT_DATE"
- varMap = {}
- varMap[':jobStatus'] = jobStatus
- presetEndTime = False
- for key in param.keys():
- if param[key] != None:
- sql1 += ',%s=:%s' % (key,key)
- varMap[':%s' % key] = param[key]
- if key == 'endTime':
- presetEndTime = True
- try:
- # store positive error code even for pilot retry
- if key == 'pilotErrorCode' and param[key].startswith('-'):
- varMap[':%s' % key] = param[key][1:]
- except:
- pass
- sql1W = " WHERE PandaID=:PandaID "
- varMap[':PandaID'] = pandaID
- if attemptNr != None:
- sql0 += "AND attemptNr=:attemptNr "
- sql1W += "AND attemptNr=:attemptNr "
- varMap[':attemptNr'] = attemptNr
- varMap0[':attemptNr'] = attemptNr
- # prevent change from holding to transferring which doesn't register files to sub/tid
- if jobStatus == 'transferring':
- sql1W += "AND NOT jobStatus=:ngStatus "
- varMap[':ngStatus'] = 'holding'
- updatedFlag = False
- nTry=3
- for iTry in range(nTry):
- try:
- # begin transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10
- self.cur.execute (sql0+comment,varMap0)
- res = self.cur.fetchone()
- if res != None:
- ret = ''
- commandToPilot,endTime,specialHandling,oldJobStatus,computingSite,cloud,prodSourceLabel = res
- # debug mode
- """
- if not specialHandling in [None,''] and 'debug' in specialHandling:
- ret += 'debugon,'
- else:
- ret += 'debugoff,'
- """
- # kill command
- if not commandToPilot in [None,'']:
- ret += '%s,' % commandToPilot
- ret = ret[:-1]
- # convert empty to NULL
- if ret == '':
- ret = 'NULL'
- # don't update holding
- if oldJobStatus == 'holding' and jobStatus == 'holding':
- _logger.debug("updateJobStatus : PandaID=%s skip to reset holding" % pandaID)
- else:
- # set endTime if undefined for holding
- if jobStatus == 'holding' and endTime==None and not presetEndTime:
- sql1 += ',endTime=CURRENT_DATE '
- # update
- self.cur.execute (sql1+sql1W+comment,varMap)
- nUp = self.cur.rowcount
- _logger.debug("updateJobStatus : PandaID=%s attemptNr=%s nUp=%s" % (pandaID,attemptNr,nUp))
- if nUp == 1:
- updatedFlag = True
- if nUp == 0 and jobStatus == 'transferring':
- _logger.debug("updateJobStatus : PandaID=%s ignore to update for transferring" % pandaID)
- else:
- _logger.debug("updateJobStatus : PandaID=%s attemptNr=%s notFound" % (pandaID,attemptNr))
- # already deleted or bad attempt number
- ret = "badattemptnr"
- #ret = 'tobekilled'
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # record status change
- try:
- if updatedFlag and oldJobStatus != None and oldJobStatus != jobStatus:
- self.recordStatusChange(pandaID,jobStatus,
- infoMap={'computingSite':computingSite,
- 'cloud':cloud,
- 'prodSourceLabel':prodSourceLabel})
- except:
- _logger.error('recordStatusChange in updateJobStatus')
- return ret
- except:
- # roll back
- self._rollback(True)
- if iTry+1 < nTry:
- _logger.debug("updateJobStatus : %s retry : %s" % (pandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("updateJobStatus : %s %s" % (type,value))
- _logger.error("updateJobStatus : %s" % pandaID)
- return False
-
-
- # update job information in jobsActive or jobsDefined
- def updateJob(self,job,inJobsDefined):
- comment = ' /* DBProxy.updateJob */'
- _logger.debug("updateJob : %s" % job.PandaID)
- updatedFlag = False
- nTry=3
- for iTry in range(nTry):
- try:
- job.modificationTime = datetime.datetime.utcnow()
- # set stateChangeTime for defined->assigned
- if inJobsDefined:
- job.stateChangeTime = job.modificationTime
- # make SQL
- if inJobsDefined:
- sql1 = "UPDATE ATLAS_PANDA.jobsDefined4 SET %s " % job.bindUpdateChangesExpression()
- else:
- sql1 = "UPDATE ATLAS_PANDA.jobsActive4 SET %s " % job.bindUpdateChangesExpression()
- sql1+= "WHERE PandaID=:PandaID "
- if inJobsDefined:
- sql1+= " AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2) "
- # begin transaction
- self.conn.begin()
- # update
- varMap = job.valuesMap(onlyChanged=True)
- varMap[':PandaID'] = job.PandaID
- if inJobsDefined:
- varMap[':oldJobStatus1'] = 'assigned'
- varMap[':oldJobStatus2'] = 'defined'
- _logger.debug(sql1+comment+str(varMap))
- self.cur.execute(sql1+comment, varMap)
- n = self.cur.rowcount
- if n==0:
- # already killed or activated
- _logger.debug("updateJob : Not found %s" % job.PandaID)
- else:
- for file in job.Files:
- sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID"
- varMap = file.valuesMap(onlyChanged=True)
- if varMap != {}:
- varMap[':row_ID'] = file.row_ID
- _logger.debug(sqlF+comment+str(varMap))
- self.cur.execute(sqlF+comment, varMap)
- # update job parameters
- sqlJobP = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':param'] = job.jobParameters
- self.cur.execute(sqlJobP+comment, varMap)
- updatedFlag = True
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # record status change
- try:
- if updatedFlag:
- self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job)
- except:
- _logger.error('recordStatusChange in updateJob')
- return True
- except:
- # roll back
- self._rollback(True)
- if iTry+1 < nTry:
- _logger.debug("updateJob : %s retry : %s" % (job.PandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("updateJob : %s %s" % (type,value))
- return False
-
-
- # retry analysis job
- def retryJob(self,pandaID,param,failedInActive=False,changeJobInMem=False,inMemJob=None,
- getNewPandaID=False,attemptNr=None):
- comment = ' /* DBProxy.retryJob */'
- _logger.debug("retryJob : %s inActive=%s" % (pandaID,failedInActive))
- sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames()
- sql1+= "WHERE PandaID=:PandaID "
- if failedInActive:
- sql1+= "AND jobStatus=:jobStatus "
- updatedFlag = False
- nTry=3
- for iTry in range(nTry):
- try:
- retValue = False
- if not changeJobInMem:
- # begin transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- if failedInActive:
- varMap[':jobStatus'] = 'failed'
- self.cur.arraysize = 10
- self.cur.execute(sql1+comment, varMap)
- res = self.cur.fetchall()
- if len(res) == 0:
- _logger.debug("retryJob() : PandaID %d not found" % pandaID)
- self._rollback()
- return retValue
- job = JobSpec()
- job.pack(res[0])
- else:
- job = inMemJob
- # don't use getNewPandaID for buildJob since the order of PandaIDs is broken
- if getNewPandaID and job.prodSourceLabel in ['panda']:
- if not changeJobInMem:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return retValue
- # convert attemptNr to int
- try:
- attemptNr = int(attemptNr)
- except:
- _logger.debug("retryJob : %s attemptNr=%s non-integer" % (pandaID,attemptNr))
- attemptNr = -999
- # check attemptNr
- if attemptNr != None:
- if job.attemptNr != attemptNr:
- _logger.debug("retryJob : %s bad attemptNr job.%s != pilot.%s" % (pandaID,job.attemptNr,attemptNr))
- if not changeJobInMem:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return retValue
- # check if already retried
- if job.taskBufferErrorCode in [ErrorCode.EC_Reassigned,ErrorCode.EC_Retried,ErrorCode.EC_PilotRetried]:
- _logger.debug("retryJob : %s already retried %s" % (pandaID,job.taskBufferErrorCode))
- if not changeJobInMem:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return retValue
- # check pilot retry
- usePilotRetry = False
- if job.prodSourceLabel in ['user','panda','ptest','rc_test'] and \
- param.has_key('pilotErrorCode') and param['pilotErrorCode'].startswith('-') and \
- job.maxAttempt > job.attemptNr and \
- (not job.processingType.startswith('gangarobot') or job.processingType=='gangarobot-rctest') and \
- not job.processingType.startswith('hammercloud'):
- usePilotRetry = True
- # check if it's analysis job # FIXME once pilot retry works correctly the conditions below will be cleaned up
- if (((job.prodSourceLabel == 'user' or job.prodSourceLabel == 'panda') \
- and not job.processingType.startswith('gangarobot') \
- and not job.processingType.startswith('hammercloud') \
- and job.computingSite.startswith('ANALY_') and param.has_key('pilotErrorCode') \
- and param['pilotErrorCode'] in ['1200','1201','1213'] and (not job.computingSite.startswith('ANALY_LONG_')) \
- and job.attemptNr < 2) or (job.prodSourceLabel == 'ddm' and job.cloud == 'CA' and job.attemptNr <= 10) \
- or failedInActive or usePilotRetry) \
- and job.commandToPilot != 'tobekilled':
- _logger.debug('reset PandaID:%s #%s' % (job.PandaID,job.attemptNr))
- if not changeJobInMem:
- # job parameters
- sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- self.cur.execute(sqlJobP+comment, varMap)
- for clobJobP, in self.cur:
- job.jobParameters = clobJobP.read()
- break
- # reset job
- job.jobStatus = 'activated'
- job.startTime = None
- job.modificationTime = datetime.datetime.utcnow()
- job.attemptNr = job.attemptNr + 1
- if usePilotRetry:
- job.currentPriority -= 10
- if failedInActive:
- job.endTime = None
- job.transExitCode = None
- for attr in job._attributes:
- if attr.endswith('ErrorCode') or attr.endswith('ErrorDiag'):
- setattr(job,attr,None)
- # remove flag regarding pledge-resource handling
- if not job.specialHandling in [None,'NULL','']:
- newSpecialHandling = re.sub(',*localpool','',job.specialHandling)
- if newSpecialHandling == '':
- job.specialHandling = None
- else:
- job.specialHandling = newSpecialHandling
- # send it to long queue for analysis jobs
- oldComputingSite = job.computingSite
- if not changeJobInMem:
- if job.computingSite.startswith('ANALY'):
- longSite = None
- tmpLongSiteList = []
- tmpLongSite = re.sub('^ANALY_','ANALY_LONG_',job.computingSite)
- tmpLongSite = re.sub('_\d+$','',tmpLongSite)
- tmpLongSiteList.append(tmpLongSite)
- tmpLongSite = job.computingSite + '_LONG'
- tmpLongSiteList.append(tmpLongSite)
- tmpLongSite = re.sub('SHORT','LONG',job.computingSite)
- if tmpLongSite != job.computingSite:
- tmpLongSiteList.append(tmpLongSite)
- # loop over all possible long sitenames
- for tmpLongSite in tmpLongSiteList:
- varMap = {}
- varMap[':siteID'] = tmpLongSite
- varMap[':status'] = 'online'
- sqlSite = "SELECT COUNT(*) FROM ATLAS_PANDAMETA.schedconfig WHERE siteID=:siteID AND status=:status"
- self.cur.execute(sqlSite+comment, varMap)
- resSite = self.cur.fetchone()
- if resSite != None and resSite[0] > 0:
- longSite = tmpLongSite
- break
- # use long site if exists
- if longSite != None:
- _logger.debug('sending PandaID:%s to %s' % (job.PandaID,longSite))
- job.computingSite = longSite
- # set destinationSE if queue is changed
- if oldComputingSite == job.destinationSE:
- job.destinationSE = job.computingSite
- if not changeJobInMem:
- # select files
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- if not getNewPandaID:
- varMap[':type1'] = 'log'
- varMap[':type2'] = 'output'
- sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames()
- if not getNewPandaID:
- sqlFile+= "WHERE PandaID=:PandaID AND (type=:type1 OR type=:type2)"
- else:
- sqlFile+= "WHERE PandaID=:PandaID"
- self.cur.arraysize = 100
- self.cur.execute(sqlFile+comment, varMap)
- resFs = self.cur.fetchall()
- else:
- # get log or output files only
- resFs = []
- for tmpFile in job.Files:
- if tmpFile.type in ['log','output']:
- resFs.append(tmpFile)
- # loop over all files
- for resF in resFs:
- if not changeJobInMem:
- # set PandaID
- file = FileSpec()
- file.pack(resF)
- job.addFile(file)
- else:
- file = resF
- # set new GUID
- if file.type == 'log':
- file.GUID = commands.getoutput('uuidgen')
- # don't change input and lib.tgz
- if file.type == 'input' or (file.type == 'output' and job.prodSourceLabel == 'panda') or \
- (file.type == 'output' and file.lfn.endswith('.lib.tgz') and job.prodSourceLabel in ['rc_test','ptest']):
- continue
- # append attemptNr to LFN
- oldName = file.lfn
- file.lfn = re.sub('\.\d+$','',file.lfn)
- file.lfn = '%s.%s' % (file.lfn,job.attemptNr)
- newName = file.lfn
- # set destinationSE
- if oldComputingSite == file.destinationSE:
- file.destinationSE = job.computingSite
- # modify jobParameters
- sepPatt = "(\'|\"|%20)" + oldName + "(\'|\"|%20)"
- matches = re.findall(sepPatt,job.jobParameters)
- for match in matches:
- oldPatt = match[0]+oldName+match[-1]
- newPatt = match[0]+newName+match[-1]
- job.jobParameters = re.sub(oldPatt,newPatt,job.jobParameters)
- if not changeJobInMem and not getNewPandaID:
- # reset file status
- if file.type in ['output','log']:
- file.status = 'unknown'
- # update files
- sqlFup = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID"
- varMap = file.valuesMap(onlyChanged=True)
- if varMap != {}:
- varMap[':row_ID'] = file.row_ID
- self.cur.execute(sqlFup+comment, varMap)
- if not changeJobInMem:
- # reuse original PandaID
- if not getNewPandaID:
- # update job
- sql2 = "UPDATE ATLAS_PANDA.jobsActive4 SET %s " % job.bindUpdateChangesExpression()
- sql2+= "WHERE PandaID=:PandaID "
- varMap = job.valuesMap(onlyChanged=True)
- varMap[':PandaID'] = job.PandaID
- self.cur.execute(sql2+comment, varMap)
- # update job parameters
- sqlJobP = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':param'] = job.jobParameters
- self.cur.execute(sqlJobP+comment, varMap)
- updatedFlag = True
- else:
- # read metadata
- sqlMeta = "SELECT metaData FROM ATLAS_PANDA.metaTable WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- self.cur.execute(sqlMeta+comment, varMap)
- for clobJobP, in self.cur:
- job.metadata = clobJobP.read()
- break
- # insert job with new PandaID
- sql1 = "INSERT INTO ATLAS_PANDA.jobsActive4 (%s) " % JobSpec.columnNames()
- sql1+= JobSpec.bindValuesExpression(useSeq=True)
- sql1+= " RETURNING PandaID INTO :newPandaID"
- # set parentID
- job.parentID = job.PandaID
- varMap = job.valuesMap(useSeq=True)
- varMap[':newPandaID'] = self.cur.var(cx_Oracle.NUMBER)
- # insert
- retI = self.cur.execute(sql1+comment, varMap)
- # set PandaID
- job.PandaID = long(varMap[':newPandaID'].getvalue())
- _logger.debug('Generate new PandaID %s -> %s #%s' % (job.parentID,job.PandaID,job.attemptNr))
- # insert files
- sqlFile = "INSERT INTO ATLAS_PANDA.filesTable4 (%s) " % FileSpec.columnNames()
- sqlFile+= FileSpec.bindValuesExpression(useSeq=True)
- sqlFile+= " RETURNING row_ID INTO :newRowID"
- for file in job.Files:
- # reset rowID
- file.row_ID = None
- # insert
- varMap = file.valuesMap(useSeq=True)
- varMap[':newRowID'] = self.cur.var(cx_Oracle.NUMBER)
- self.cur.execute(sqlFile+comment, varMap)
- # update mod time for files
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':modificationTime'] = job.modificationTime
- sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- self.cur.execute(sqlFMod+comment,varMap)
- # metadata
- sqlMeta = "INSERT INTO ATLAS_PANDA.metaTable (PandaID,metaData,modificationTime) VALUES (:PandaID,:metaData,:modTime)"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':metaData'] = job.metadata
- varMap[':modTime'] = job.modificationTime
- self.cur.execute(sqlMeta+comment, varMap)
- # job parameters
- sqlJob = "INSERT INTO ATLAS_PANDA.jobParamsTable (PandaID,jobParameters,modificationTime) VALUES (:PandaID,:param,:modTime)"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- varMap[':param'] = job.jobParameters
- varMap[':modTime'] = job.modificationTime
- self.cur.execute(sqlJob+comment, varMap)
- # set error code to original job to avoid being retried by another process
- sqlE = "UPDATE ATLAS_PANDA.jobsActive4 SET taskBufferErrorCode=:errCode,taskBufferErrorDiag=:errDiag WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = job.parentID
- varMap[':errCode'] = ErrorCode.EC_PilotRetried
- varMap[':errDiag'] = 'retrying at the same site. new PandaID=%s' % job.PandaID
- self.cur.execute(sqlE+comment, varMap)
- # set return
- if not getNewPandaID:
- retValue = True
- if not changeJobInMem:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # record status change
- try:
- if updatedFlag:
- self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job)
- except:
- _logger.error('recordStatusChange in retryJob')
- return retValue
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("retryJob : %s retry : %s" % (pandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- # error report
- type, value, traceBack = sys.exc_info()
- _logger.error("retryJob : %s %s" % (type,value))
- return False
-
-
- # retry failed analysis jobs in Active4
- def retryJobsInActive(self,prodUserName,jobDefinitionID):
- comment = ' /* DBProxy.retryJobsInActive */'
- _logger.debug("retryJobsInActive : start - %s %s" % (prodUserName,jobDefinitionID))
- try:
- # begin transaction
- self.conn.begin()
- # count the number of jobs in Defined
- sqlC = "SELECT COUNT(*) FROM ATLAS_PANDA.jobsDefined4 "
- sqlC += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sqlC += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) "
- varMap = {}
- varMap[':prodUserName'] = prodUserName
- varMap[':jobDefinitionID'] = jobDefinitionID
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- self.cur.arraysize = 10
- self.cur.execute(sqlC+comment,varMap)
- res = self.cur.fetchone()
- # failed to get the number of jobs in Defined
- if res == None:
- _logger.error("retryJobsInActive : %s %s - failed to get num of jobs in Def" % (prodUserName,jobDefinitionID))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return None for DB error
- return None
- nJobsInDef = res[0]
- # get failed PandaIDs in Active
- sql0 = "SELECT PandaID,jobStatus,taskBufferErrorCode,attemptNr FROM ATLAS_PANDA.jobsActive4 "
- sql0+= "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql0+= "AND prodSourceLabel=:prodSourceLabel "
- varMap = {}
- varMap[':prodUserName'] = prodUserName
- varMap[':jobDefinitionID'] = jobDefinitionID
- varMap[':prodSourceLabel'] = 'user'
- self.cur.execute(sql0+comment,varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # the number of jobs in Active
- nJobsInAct = len(res)
- # loop over all PandaID
- failedPandaIDs = []
- for pandaID,tmpJobStatus,tmpTaskBufferErrorCode,tmpAttemptNr in res:
- if tmpJobStatus == 'failed' and not tmpTaskBufferErrorCode in \
- [ErrorCode.EC_Reassigned,ErrorCode.EC_Retried,ErrorCode.EC_PilotRetried]:
- failedPandaIDs.append((pandaID,tmpAttemptNr))
- _logger.debug("retryJobsInActive : %s %s - %s failed jobs" % (prodUserName,jobDefinitionID,len(failedPandaIDs)))
- # there are some failed jobs in Active
- if failedPandaIDs != []:
- # get list of sub datasets to lock Closer
- sqlF = "SELECT DISTINCT destinationDBlock FROM ATLAS_PANDA.filesTable4 "
- sqlF += "WHERE PandaID=:PandaID AND type IN (:type1,:type2) "
- varMap = {}
- varMap[':PandaID'] = failedPandaIDs[0][0]
- varMap[':type1'] = 'log'
- varMap[':type2'] = 'output'
- # begin transaction
- self.conn.begin()
- self.cur.arraysize = 100000
- self.cur.execute(sqlF+comment,varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- subDsList = []
- for tmpDSname, in res:
- tmpDS = self.queryDatasetWithMap({'name':tmpDSname})
- if tmpDS == None:
- _logger.error("retryJobsInActive : %s %s - failed to get DS=%s" % (prodUserName,jobDefinitionID,tmpDSname))
- # return None for DB error
- return None
- # append
- subDsList.append(tmpDS)
- # lock datasets
- lockedDS = True
- ngStatus = ['closed','tobeclosed','completed','tobemerged','merging','cleanup']
- sqlD = "UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE "
- sqlD+= "WHERE vuid=:vuid AND NOT status IN ("
- for tmpIdx,tmpNgStat in enumerate(ngStatus):
- sqlD += ':ngSt%s,' % tmpIdx
- sqlD = sqlD[:-1]
- sqlD += ") "
- self.conn.begin()
- self.cur.arraysize = 10
- for tmpDS in subDsList:
- varMap = {}
- varMap[':status'] = 'locked'
- varMap[':vuid'] = tmpDS.vuid
- for tmpIdx,tmpNgStat in enumerate(ngStatus):
- tmpKey = ':ngSt%s' % tmpIdx
- varMap[tmpKey] = tmpNgStat
- # update
- self.cur.execute(sqlD+comment,varMap)
- retD = self.cur.rowcount
- # datasets already closed
- if retD == 0:
- # roll back
- self._rollback()
- # failed to lock datasets
- _logger.debug("retryJobsInActive : %s %s - %s is closed" % (prodUserName,jobDefinitionID,tmpDS.name))
- lockedDS = False
- break
- # retry jobs
- if lockedDS:
- # commit for dataset lock
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # loop over all PandaIDs
- for pandaID,tmpAttemptNr in failedPandaIDs:
- retryRet = self.retryJob(pandaID,{},failedInActive=True,attemptNr=tmpAttemptNr)
- _logger.debug("retryJobsInActive : %s %s - PandaID=%s %s" % (prodUserName,jobDefinitionID,pandaID,retryRet))
- # unlock datasets
- sqlDU = "UPDATE ATLAS_PANDA.Datasets SET status=:nStatus,modificationdate=CURRENT_DATE "
- sqlDU+= "WHERE vuid=:vuid AND status=:oStatus"
- self.conn.begin()
- self.cur.arraysize = 10
- for tmpDS in subDsList:
- varMap = {}
- varMap[':oStatus'] = 'locked'
- varMap[':nStatus'] = tmpDS.status
- varMap[':vuid'] = tmpDS.vuid
- # update
- self.cur.execute(sqlDU+comment,varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return True when job is active
- retVal = False
- if nJobsInAct > 0 or nJobsInDef > 0:
- retVal = True
- _logger.debug("retryJobsInActive : end %s - %s %s" % (retVal,prodUserName,jobDefinitionID))
- return retVal
- except:
- # roll back
- self._rollback()
- # error report
- errType,errValue = sys.exc_info()[:2]
- _logger.error("retryJobsInActive : %s %s" % (errType,errValue))
- return None
-
-
- # get jobs
- def getJobs(self,nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement,
- atlasRelease,prodUserID,countryGroup,workingGroup,allowOtherCountry):
- comment = ' /* DBProxy.getJobs */'
- # use memcache
- useMemcache = False
- try:
- if panda_config.memcached_enable and siteName in ['MWT2_UC','ANALY_MWT2','BNL_ATLAS_test','ANALY_BNL_test',
- 'ANALY_GLASGOW']: # FIXME
- # initialize memcache
- if self.memcache == None:
- from MemProxy import MemProxy
- self.memcache = MemProxy()
- if not self.memcache in [None,False]:
- useMemcache = True
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("failed to initialize memcached with %s %s" % (errType,errValue))
- # aggregated sites which use different appdirs
- aggSiteMap = {'CERN-PROD':{'CERN-RELEASE':'release',
- 'CERN-UNVALID':'unvalid',
- 'CERN-BUILDS' :'builds',
- },
- }
- # construct where clause
- dynamicBrokering = False
- getValMap = {}
- getValMap[':oldJobStatus'] = 'activated'
- getValMap[':computingSite'] = siteName
- if not aggSiteMap.has_key(siteName):
- sql1 = "WHERE jobStatus=:oldJobStatus AND computingSite=:computingSite AND commandToPilot IS NULL "
- else:
- # aggregated sites
- sql1 = "WHERE jobStatus=:oldJobStatus AND computingSite IN (:computingSite,"
- for tmpAggIdx,tmpAggSite in enumerate(aggSiteMap[siteName].keys()):
- tmpKeyName = ':computingSite%s' % tmpAggIdx
- sql1 += '%s,' % tmpKeyName
- getValMap[tmpKeyName] = tmpAggSite
- sql1 = sql1[:-1]
- sql1 += ") AND commandToPilot IS NULL "
- if not mem in [0,'0']:
- sql1+= "AND (minRamCount<=:minRamCount OR minRamCount=0) "
- getValMap[':minRamCount'] = mem
- if not diskSpace in [0,'0']:
- sql1+= "AND (maxDiskCount<=:maxDiskCount OR maxDiskCount=0) "
- getValMap[':maxDiskCount'] = diskSpace
- if prodSourceLabel == 'user':
- sql1+= "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2,:prodSourceLabel3) "
- getValMap[':prodSourceLabel1'] = 'user'
- getValMap[':prodSourceLabel2'] = 'panda'
- getValMap[':prodSourceLabel3'] = 'install'
- elif prodSourceLabel == 'ddm':
- dynamicBrokering = True
- sql1+= "AND prodSourceLabel=:prodSourceLabel "
- getValMap[':prodSourceLabel'] = 'ddm'
- elif prodSourceLabel in [None,'managed']:
- sql1+= "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2,:prodSourceLabel3,:prodSourceLabel4) "
- getValMap[':prodSourceLabel1'] = 'managed'
- getValMap[':prodSourceLabel2'] = 'test'
- getValMap[':prodSourceLabel3'] = 'prod_test'
- getValMap[':prodSourceLabel4'] = 'install'
- elif prodSourceLabel == 'software':
- sql1+= "AND prodSourceLabel=:prodSourceLabel "
- getValMap[':prodSourceLabel'] = 'software'
- elif prodSourceLabel == 'test' and computingElement != None:
- dynamicBrokering = True
- sql1+= "AND (processingType IN (:processingType1,:processingType2,:processingType3) "
- sql1+= "OR prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2,:prodSourceLabel3)) "
- getValMap[':processingType1'] = 'gangarobot'
- getValMap[':processingType2'] = 'analy_test'
- getValMap[':processingType3'] = 'prod_test'
- getValMap[':prodSourceLabel1'] = 'test'
- getValMap[':prodSourceLabel2'] = 'prod_test'
- getValMap[':prodSourceLabel3'] = 'install'
- else:
- sql1+= "AND prodSourceLabel=:prodSourceLabel "
- getValMap[':prodSourceLabel'] = prodSourceLabel
- # user ID
- if prodUserID != None:
- # get compact DN
- compactDN = self.cleanUserID(prodUserID)
- if compactDN in ['','NULL',None]:
- compactDN = prodUserID
- sql1+= "AND prodUserName=:prodUserName "
- getValMap[':prodUserName'] = compactDN
- # country group
- specialHandled = False
- if prodSourceLabel == 'user':
- # update pledge resource ratio
- self.getPledgeResourceRatio()
- # other country is allowed to use the pilot
- if allowOtherCountry=='True' and self.beyondPledgeRatio.has_key(siteName) and self.beyondPledgeRatio[siteName] > 0:
- # check if countryGroup needs to be used for beyond-pledge
- if self.checkCountryGroupForBeyondPledge(siteName):
- countryGroup = self.beyondPledgeRatio[siteName]['countryGroup']
- specialHandled = True
- else:
- countryGroup = ''
- # countryGroup
- if not countryGroup in ['',None]:
- sql1+= "AND countryGroup IN ("
- idxCountry = 1
- for tmpCountry in countryGroup.split(','):
- tmpKey = ":countryGroup%s" % idxCountry
- sql1+= "%s," % tmpKey
- getValMap[tmpKey] = tmpCountry
- idxCountry += 1
- sql1 = sql1[:-1]
- sql1+= ") "
- # workingGroup
- if not workingGroup in ['',None]:
- sql1+= "AND workingGroup IN ("
- idxWorking = 1
- for tmpWorking in workingGroup.split(','):
- tmpKey = ":workingGroup%s" % idxWorking
- sql1+= "%s," % tmpKey
- getValMap[tmpKey] = tmpWorking
- idxWorking += 1
- sql1 = sql1[:-1]
- sql1+= ") "
- # production share
- if prodSourceLabel in ['managed',None,'sharetest']:
- aggSitesForFairshare = []
- if aggSiteMap.has_key(siteName):
- aggSitesForFairshare = aggSiteMap[siteName].keys()
- shareSQL,shareVarMap = self.getCriteriaForProdShare(siteName,aggSitesForFairshare)
- if shareVarMap != {}:
- sql1 += shareSQL
- for tmpShareKey in shareVarMap.keys():
- getValMap[tmpShareKey] = shareVarMap[tmpShareKey]
- sql2 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames()
- sql2+= "WHERE PandaID=:PandaID"
- retJobs = []
- nSent = 0
- try:
- timeLimit = datetime.timedelta(seconds=timeout-10)
- timeStart = datetime.datetime.utcnow()
- strName = datetime.datetime.isoformat(timeStart)
- attLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=15)
- attSQL = "AND ((creationTime<:creationTime AND attemptNr>1) OR attemptNr<=1) "
- # get nJobs
- for iJob in range(nJobs):
- pandaID = 0
- fileMapForMem = {}
- # select channel for ddm jobs
- if prodSourceLabel == 'ddm':
- sqlDDM = "SELECT count(*),jobStatus,sourceSite,destinationSite,transferType FROM ATLAS_PANDA.jobsActive4 WHERE computingSite=:computingSite AND prodSourceLabel=:prodSourceLabel " \
- + attSQL + "GROUP BY jobStatus,sourceSite,destinationSite,transferType"
- ddmValMap = {}
- ddmValMap[':computingSite'] = siteName
- ddmValMap[':creationTime'] = attLimit
- ddmValMap[':prodSourceLabel'] = 'ddm'
- _logger.debug(sqlDDM+comment+str(ddmValMap))
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 100
- self.cur.execute(sqlDDM+comment, ddmValMap)
- resDDM = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # make a channel map
- channelMap = {}
- for tmp_count,tmp_jobStatus,tmp_sourceSite,tmp_destinationSite,tmp_transferType in resDDM:
- # use source,dest,type as the key
- channel = (tmp_sourceSite,tmp_destinationSite,tmp_transferType)
- if not channelMap.has_key(channel):
- channelMap[channel] = {}
- # ignore holding
- if tmp_jobStatus == 'holding':
- continue
- # distinguish activated from other states
- if tmp_jobStatus != 'activated':
- tmp_jobStatus = 'others'
- # append
- if not channelMap[channel].has_key(tmp_jobStatus):
- channelMap[channel][tmp_jobStatus] = int(tmp_count)
- else:
- channelMap[channel][tmp_jobStatus] += int(tmp_count)
- _logger.debug(channelMap)
- # choose channel
- channels = channelMap.keys()
- random.shuffle(channels)
- foundChannel = False
- for channel in channels:
- # no activated jobs
- if (not channelMap[channel].has_key('activated')) or channelMap[channel]['activated'] == 0:
- continue
- maxRunning = 15
- # prestaging job
- if channel[0] == channel[1] and channel[2] == 'dis':
- maxRunning = 50
- if (not channelMap[channel].has_key('others')) or channelMap[channel]['others'] < maxRunning:
- # set SQL
- sql1+= "AND sourceSite=:sourceSite AND destinationSite=:destinationSite AND transferType=:transferType "
- getValMap[':sourceSite'] = channel[0]
- getValMap[':destinationSite'] = channel[1]
- getValMap[':transferType'] = channel[2]
- foundChannel = True
- break
- # no proper channel
- if not foundChannel:
- _logger.debug("getJobs : no DDM jobs for Site %s" % siteName)
- break
- # get job
- if prodSourceLabel in ['ddm']:
- # to add some delay for attempts
- sql1 += attSQL
- getValMap[':creationTime'] = attLimit
- nTry=1
- for iTry in range(nTry):
- # set siteID
- tmpSiteID = siteName
- if siteName.startswith('ANALY_BNL_ATLAS'):
- tmpSiteID = 'ANALY_BNL_ATLAS_1'
- # get file lock
- _logger.debug("getJobs : %s -> lock" % strName)
- if (datetime.datetime.utcnow() - timeStart) < timeLimit:
- toGetPandaIDs = True
- pandaIDs = []
- specialHandlingMap = {}
- # get max priority for analysis jobs
- if prodSourceLabel in ['panda','user']:
- sqlMX = "SELECT /*+ INDEX_RS_ASC(tab (PRODSOURCELABEL COMPUTINGSITE JOBSTATUS) ) */ MAX(currentPriority) FROM ATLAS_PANDA.jobsActive4 tab "
- sqlMX+= sql1
- _logger.debug(sqlMX+comment+str(getValMap))
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10
- self.cur.execute(sqlMX+comment, getValMap)
- tmpPriority, = self.cur.fetchone()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # no jobs
- if tmpPriority == None:
- toGetPandaIDs = False
- else:
- # set priority
- sql1 += "AND currentPriority=:currentPriority"
- getValMap[':currentPriority'] = tmpPriority
- maxAttemptIDx = 10
- if toGetPandaIDs:
- # get PandaIDs
- sqlP = "SELECT /*+ INDEX_RS_ASC(tab (PRODSOURCELABEL COMPUTINGSITE JOBSTATUS) ) */ PandaID,currentPriority,specialHandling FROM ATLAS_PANDA.jobsActive4 tab "
- sqlP+= sql1
- _logger.debug(sqlP+comment+str(getValMap))
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 100000
- self.cur.execute(sqlP+comment, getValMap)
- resIDs = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- maxCurrentPriority = None
- # get max priority and min PandaID
- for tmpPandaID,tmpCurrentPriority,tmpSpecialHandling in resIDs:
- if maxCurrentPriority==None or maxCurrentPriority < tmpCurrentPriority:
- maxCurrentPriority = tmpCurrentPriority
- pandaIDs = [tmpPandaID]
- elif maxCurrentPriority == tmpCurrentPriority:
- pandaIDs.append(tmpPandaID)
- specialHandlingMap[tmpPandaID] = tmpSpecialHandling
- # sort
- pandaIDs.sort()
- if pandaIDs == []:
- _logger.debug("getJobs : %s -> no PandaIDs" % strName)
- retU = 0
- else:
- # check the number of available files
- if useMemcache:
- _logger.debug("getJobs : %s -> memcache check start" % strName)
- # truncate
- pandaIDs = pandaIDs[:maxAttemptIDx]
- # get input files
- availableFileMap = {}
- self.cur.arraysize = 100000
- sqlMemFile = "SELECT lfn FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type"
- for tmpPandaID in pandaIDs:
- varMap = {}
- varMap[':type'] = 'input'
- varMap[':PandaID'] = tmpPandaID
- # start transaction
- self.conn.begin()
- # select
- self.cur.execute(sqlMemFile+comment,varMap)
- resFiles = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # get list
- fileMapForMem[tmpPandaID] = []
- for tmpItem, in resFiles:
- fileMapForMem[tmpPandaID].append(tmpItem)
- # get number of available files
- nAvailable = self.memcache.checkFiles(tmpPandaID,fileMapForMem[tmpPandaID],
- siteName,node)
- # append
- if not nAvailable in availableFileMap:
- availableFileMap[nAvailable] = []
- availableFileMap[nAvailable].append(tmpPandaID)
- # sort by the number of available files
- tmpAvaKeys = availableFileMap.keys()
- tmpAvaKeys.sort()
- tmpAvaKeys.reverse()
- pandaIDs = []
- for tmpAvaKey in tmpAvaKeys:
- pandaIDs += availableFileMap[tmpAvaKey]
- _logger.debug("getJobs : %s -> memcache check done" % strName)
- # update
- for indexID,tmpPandaID in enumerate(pandaIDs):
- # max attempts
- if indexID > maxAttemptIDx:
- break
- # update
- sqlJ = "UPDATE ATLAS_PANDA.jobsActive4 "
- sqlJ+= "SET jobStatus=:newJobStatus,modificationTime=CURRENT_DATE,modificationHost=:modificationHost,startTime=CURRENT_DATE"
- varMap = {}
- varMap[':PandaID'] = tmpPandaID
- varMap[':newJobStatus'] = 'sent'
- varMap[':oldJobStatus'] = 'activated'
- varMap[':modificationHost'] = node
- # set CE
- if computingElement != None:
- sqlJ+= ",computingElement=:computingElement"
- varMap[':computingElement'] = computingElement
- # set special handling
- if specialHandled:
- sqlJ+= ",specialHandling=:specialHandling"
- spString = 'localpool'
- if specialHandlingMap.has_key(tmpPandaID) and isinstance(specialHandlingMap[tmpPandaID],types.StringType):
- if not spString in specialHandlingMap[tmpPandaID]:
- varMap[':specialHandling'] = specialHandlingMap[tmpPandaID]+','+spString
- else:
- varMap[':specialHandling'] = specialHandlingMap[tmpPandaID]
- else:
- varMap[':specialHandling'] = spString
- sqlJ+= " WHERE PandaID=:PandaID AND jobStatus=:oldJobStatus"
- # SQL to get nSent
- sentLimit = timeStart - datetime.timedelta(seconds=60)
- sqlSent = "SELECT count(*) FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus "
- sqlSent += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) "
- sqlSent += "AND computingSite=:computingSite "
- sqlSent += "AND modificationTime>:modificationTime "
- varMapSent = {}
- varMapSent[':jobStatus'] = 'sent'
- varMapSent[':computingSite'] = tmpSiteID
- varMapSent[':modificationTime'] = sentLimit
- varMapSent[':prodSourceLabel1'] = 'managed'
- varMapSent[':prodSourceLabel2'] = 'test'
- # start
- _logger.debug(sqlJ+comment+str(varMap))
- # start transaction
- self.conn.begin()
- # update
- self.cur.execute(sqlJ+comment, varMap)
- retU = self.cur.rowcount
- if retU != 0:
- # get nSent for production jobs
- if prodSourceLabel in [None,'managed']:
- _logger.debug(sqlSent+comment+str(varMapSent))
- self.cur.execute(sqlSent+comment, varMapSent)
- resSent = self.cur.fetchone()
- if resSent != None:
- nSent, = resSent
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # succeeded
- if retU != 0:
- pandaID = tmpPandaID
- break
- else:
- _logger.debug("getJobs : %s -> do nothing" % strName)
- retU = 0
- # release file lock
- _logger.debug("getJobs : %s -> unlock" % strName)
- # succeeded
- if retU != 0:
- break
- if iTry+1 < nTry:
- #time.sleep(0.5)
- pass
- # failed to UPDATE
- if retU == 0:
- # reset pandaID
- pandaID = 0
- _logger.debug("getJobs : Site %s : retU %s : PandaID %s - %s"
- % (siteName,retU,pandaID,prodSourceLabel))
- if pandaID == 0:
- break
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- self.cur.arraysize = 10
- self.cur.execute(sql2+comment, varMap)
- res = self.cur.fetchone()
- if len(res) == 0:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- break
- # instantiate Job
- job = JobSpec()
- job.pack(res)
- # Files
- sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=:PandaID"
- self.cur.arraysize = 10000
- self.cur.execute(sqlFile+comment, varMap)
- resFs = self.cur.fetchall()
- # job parameters
- sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- self.cur.execute(sqlJobP+comment, varMap)
- for clobJobP, in self.cur:
- job.jobParameters = clobJobP.read()
- break
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- job.addFile(file)
- # overwrite processingType for appdir at aggregated sites
- if aggSiteMap.has_key(siteName):
- if aggSiteMap[siteName].has_key(job.computingSite):
- job.processingType = aggSiteMap[siteName][job.computingSite]
- job.computingSite = job.computingSite
- # append
- retJobs.append(job)
- # record status change
- try:
- self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job)
- except:
- _logger.error('recordStatusChange in getJobs')
- return retJobs,nSent
- except:
- # roll back
- self._rollback()
- # error report
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobs : %s %s" % (type,value))
- return [],0
-
-
- # reset job in jobsActive or jobsWaiting
- def resetJob(self,pandaID,activeTable=True,keepSite=False,getOldSubs=False,forPending=True):
- comment = ' /* DBProxy.resetJob */'
- _logger.debug("resetJobs : %s" % pandaID)
- # select table
- table = 'ATLAS_PANDA.jobsWaiting4'
- if activeTable:
- table = 'ATLAS_PANDA.jobsActive4'
- sql1 = "SELECT %s FROM %s " % (JobSpec.columnNames(),table)
- sql1+= "WHERE PandaID=:PandaID"
- sql2 = "DELETE FROM %s " % table
- sql2+= "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)"
- sql3 = "INSERT INTO ATLAS_PANDA.jobsDefined4 (%s) " % JobSpec.columnNames()
- sql3+= JobSpec.bindValuesExpression()
- try:
- # transaction causes Request ndbd time-out in ATLAS_PANDA.jobsActive4
- self.conn.begin()
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- self.cur.arraysize = 10
- self.cur.execute(sql1+comment,varMap)
- res = self.cur.fetchone()
- # not found
- if res == None:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return None
- # instantiate Job
- job = JobSpec()
- job.pack(res)
- # if already running
- if job.jobStatus != 'waiting' and job.jobStatus != 'activated' \
- and (forPending and job.jobStatus != 'pending'):
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return None
- # do nothing for analysis jobs
- if job.prodSourceLabel in ['user','panda']:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return None
- # delete
- varMap = {}
- varMap[':PandaID'] = pandaID
- if not forPending:
- varMap[':oldJobStatus1'] = 'waiting'
- else:
- varMap[':oldJobStatus1'] = 'pending'
- varMap[':oldJobStatus2'] = 'activated'
- self.cur.execute(sql2+comment,varMap)
- retD = self.cur.rowcount
- # delete failed
- _logger.debug("resetJobs : retD = %s" % retD)
- if retD != 1:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return None
- # delete from jobsDefined4 just in case
- varMap = {}
- varMap[':PandaID'] = pandaID
- sqlD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
- self.cur.execute(sqlD+comment,varMap)
- # increase priority
- if job.jobStatus == 'activated' and job.currentPriority < 100:
- job.currentPriority = 100
- # reset computing site and dispatchDBlocks
- job.jobStatus = 'defined'
- job.dispatchDBlock = None
- # erase old assignment
- if (not keepSite) and job.relocationFlag != 1:
- job.computingSite = None
- job.computingElement = None
- # host and time information
- job.modificationHost = self.hostname
- job.modificationTime = datetime.datetime.utcnow()
- job.stateChangeTime = job.modificationTime
- # reset
- job.brokerageErrorDiag = None
- job.brokerageErrorCode = None
- # insert
- self.cur.execute(sql3+comment, job.valuesMap())
- # job parameters
- sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID"
- self.cur.execute(sqlJobP+comment, varMap)
- for clobJobP, in self.cur:
- job.jobParameters = clobJobP.read()
- break
- # Files
- oldSubList = []
- sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=:PandaID"
- self.cur.arraysize = 10000
- self.cur.execute(sqlFile+comment, varMap)
- resFs = self.cur.fetchall()
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- # reset GUID to trigger LRC/LFC scanning
- if file.status == 'missing':
- file.GUID = None
- # collect old subs
- if job.prodSourceLabel in ['managed','test'] and file.type in ['output','log'] \
- and re.search('_sub\d+$',file.destinationDBlock) != None:
- if not file.destinationDBlock in oldSubList:
- oldSubList.append(file.destinationDBlock)
- # reset status, destinationDBlock and dispatchDBlock
-                file.status = 'unknown'
- file.dispatchDBlock = None
- file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock)
- # add file
- job.addFile(file)
- # update files
- sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID"
- varMap = file.valuesMap(onlyChanged=True)
- if varMap != {}:
- varMap[':row_ID'] = file.row_ID
- _logger.debug(sqlF+comment+str(varMap))
- self.cur.execute(sqlF+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # record status change
- try:
- self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job)
- except:
- _logger.error('recordStatusChange in resetJobs')
- if getOldSubs:
- return job,oldSubList
- return job
- except:
- # roll back
- self._rollback()
- # error report
- type, value, traceBack = sys.exc_info()
- _logger.error("resetJobs : %s %s" % (type,value))
- _logger.error("resetJobs : %s" % pandaID)
- return None
-
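# resetJob() above collects the "_subNNN" destination blocks of output/log
# files before stripping the suffix, so the caller can clean up the old sub
# datasets. A reduced version of that bookkeeping (the prodSourceLabel and
# file-type checks are dropped here), with made-up dataset names:
import re

def collect_and_strip_subs(dest_blocks):
    """Return (stripped_blocks, old_subs) for a list of destinationDBlock names."""
    old_subs, stripped = [], []
    for block in dest_blocks:
        if re.search(r'_sub\d+$', block) is not None and block not in old_subs:
            old_subs.append(block)
        stripped.append(re.sub(r'_sub\d+$', '', block))
    return stripped, old_subs

blocks = ['user.test.dataset_sub012345', 'user.test.dataset_sub012345', 'user.test.log']
print(collect_and_strip_subs(blocks))
# -> (['user.test.dataset', 'user.test.dataset', 'user.test.log'],
#     ['user.test.dataset_sub012345'])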
-
- # reset jobs in jobsDefined
- def resetDefinedJob(self,pandaID,keepSite=False,getOldSubs=False):
- comment = ' /* DBProxy.resetDefinedJob */'
- _logger.debug("resetDefinedJob : %s" % pandaID)
- sql1 = "UPDATE ATLAS_PANDA.jobsDefined4 SET "
- sql1 += "jobStatus=:newJobStatus,"
- sql1 += "modificationTime=CURRENT_DATE,"
- sql1 += "dispatchDBlock=NULL,"
- sql1 += "computingElement=NULL"
- sql1 += " WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)"
- sql2 = "SELECT %s FROM ATLAS_PANDA.jobsDefined4 " % JobSpec.columnNames()
- sql2+= "WHERE PandaID=:PandaID"
- try:
- oldSubList = []
- # begin transaction
- self.conn.begin()
- # update
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':newJobStatus'] = 'defined'
- varMap[':oldJobStatus1'] = 'assigned'
- varMap[':oldJobStatus2'] = 'defined'
- self.cur.execute(sql1+comment,varMap)
- retU = self.cur.rowcount
- # not found
- updatedFlag = False
- job = None
- if retU == 0:
- _logger.debug("resetDefinedJob : Not found %s" % pandaID)
- else:
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- self.cur.arraysize = 10
- self.cur.execute(sql2+comment,varMap)
- res = self.cur.fetchone()
- # not found
- if res == None:
- raise RuntimeError, 'Could not SELECT : PandaID=%s' % pandaID
- # instantiate Job
- job = JobSpec()
- job.pack(res)
- # do nothing for analysis jobs
- if job.prodSourceLabel in ['user','panda']:
- _logger.debug('resetDefinedJob : rollback since PandaID=%s is analysis job' % pandaID)
- # roll back
- self._rollback()
- return None
- job.dispatchDBlock = None
- if (not keepSite) and job.relocationFlag != 1:
- # erase old assignment
- job.computingSite = None
- job.computingElement = None
- # job parameters
- sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID"
- self.cur.execute(sqlJobP+comment, varMap)
- for clobJobP, in self.cur:
- job.jobParameters = clobJobP.read()
- break
- # Files
- sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=:PandaID"
- self.cur.arraysize = 10000
- self.cur.execute(sqlFile+comment, varMap)
- resFs = self.cur.fetchall()
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- # collect old subs
- if job.prodSourceLabel in ['managed','test'] and file.type in ['output','log'] \
- and re.search('_sub\d+$',file.destinationDBlock) != None:
- if not file.destinationDBlock in oldSubList:
- oldSubList.append(file.destinationDBlock)
- # reset status, destinationDBlock and dispatchDBlock
-                file.status = 'unknown'
- file.dispatchDBlock = None
- file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock)
- # add file
- job.addFile(file)
- # update files
- sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID"
- varMap = file.valuesMap(onlyChanged=True)
- if varMap != {}:
- varMap[':row_ID'] = file.row_ID
- _logger.debug(sqlF+comment+str(varMap))
- self.cur.execute(sqlF+comment, varMap)
- updatedFlag = True
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # record status change
- try:
- if updatedFlag:
- self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job)
- except:
- _logger.error('recordStatusChange in resetDefinedJobs')
- if getOldSubs:
- return job,oldSubList
- return job
- except:
- # error report
- type, value, traceBack = sys.exc_info()
- _logger.error("resetDefinedJobs : %s %s" % (type,value))
- # roll back
- self._rollback()
- return None
-
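# resetDefinedJob() above relies on the rowcount of a conditional UPDATE to
# decide whether the job was really in a resettable state before re-reading the
# row. A compressed sqlite3 version of that guard; the schema and values are
# simplified assumptions.
import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE jobsDefined4 (PandaID INTEGER, jobStatus TEXT)")
cur.execute("INSERT INTO jobsDefined4 VALUES (4321, 'assigned')")
conn.commit()

sql1 = ("UPDATE jobsDefined4 SET jobStatus=:newJobStatus "
        "WHERE PandaID=:PandaID AND jobStatus IN (:oldJobStatus1,:oldJobStatus2)")
cur.execute(sql1, {'newJobStatus': 'defined', 'PandaID': 4321,
                   'oldJobStatus1': 'assigned', 'oldJobStatus2': 'defined'})
if cur.rowcount == 0:
    print("not found or not in a resettable state")
else:
    cur.execute("SELECT jobStatus FROM jobsDefined4 WHERE PandaID=:PandaID",
                {'PandaID': 4321})
    print(cur.fetchone())   # -> ('defined',)
conn.commit()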
-
- # kill job
- def killJob(self,pandaID,user,code,prodManager,getUserInfo=False,wgProdRole=[]):
- # code
- # 2 : expire
- # 3 : aborted
- # 4 : expire in waiting
- # 7 : retry by server
- # 8 : rebrokerage
- # 9 : force kill
- # 91 : kill user jobs with prod role
- comment = ' /* DBProxy.killJob */'
- _logger.debug("killJob : code=%s PandaID=%s role=%s user=%s wg=%s" % (code,pandaID,prodManager,user,wgProdRole))
- # check PandaID
- try:
- long(pandaID)
- except:
- _logger.error("not an integer : %s" % pandaID)
- if getUserInfo:
- return False,{}
- return False
- sql0 = "SELECT prodUserID,prodSourceLabel,jobDefinitionID,jobsetID,workingGroup FROM %s WHERE PandaID=:PandaID"
- sql1 = "UPDATE %s SET commandToPilot=:commandToPilot,taskBufferErrorDiag=:taskBufferErrorDiag WHERE PandaID=:PandaID AND commandToPilot IS NULL"
- sql1F = "UPDATE %s SET commandToPilot=:commandToPilot,taskBufferErrorDiag=:taskBufferErrorDiag WHERE PandaID=:PandaID"
- sql2 = "SELECT %s " % JobSpec.columnNames()
- sql2 += "FROM %s WHERE PandaID=:PandaID AND jobStatus<>:jobStatus"
- sql3 = "DELETE FROM %s WHERE PandaID=:PandaID"
- sqlU = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)"
- sql4 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
- sql4 += JobSpec.bindValuesExpression()
- sqlF = "UPDATE ATLAS_PANDA.filesTable4 SET status=:status WHERE PandaID=:PandaID AND type IN (:type1,:type2)"
- sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
- try:
- flagCommand = False
- flagKilled = False
- userProdUserID = ''
- userProdSourceLabel = ''
- userJobDefinitionID = ''
- userJobsetID = ''
- updatedFlag = False
- # begin transaction
- self.conn.begin()
- for table in ('ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4'):
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # begin transaction
- self.conn.begin()
- # get DN if user is not production DN
- varMap = {}
- varMap[':PandaID'] = pandaID
- self.cur.arraysize = 10
- self.cur.execute((sql0+comment) % table, varMap)
- res = self.cur.fetchone()
- # not found
- if res == None:
- continue
- # owner?
- def getCN(dn):
- distinguishedName = ''
- for line in dn.split('/'):
- if line.startswith('CN='):
- distinguishedName = re.sub('^CN=','',line)
- distinguishedName = re.sub('\d+$','',distinguishedName)
- distinguishedName = distinguishedName.strip()
- break
- if distinguishedName == '':
- distinguishedName = dn
- return distinguishedName
- # prevent prod proxy from killing analysis jobs
- userProdUserID,userProdSourceLabel,userJobDefinitionID,userJobsetID,workingGroup = res
- # check group prod role
- validGroupProdRole = False
- if res[1] in ['managed','test'] and workingGroup != '':
- for tmpGroupProdRole in wgProdRole:
- if tmpGroupProdRole == '':
- continue
- if re.search('(^|_)'+tmpGroupProdRole+'$',workingGroup,re.I) != None:
- validGroupProdRole = True
- break
- if prodManager:
- if res[1] in ['user','panda'] and (not code in ['2','4','7','8','9','91']):
- _logger.debug("ignore killJob -> prod proxy tried to kill analysis job type=%s" % res[1])
- break
- _logger.debug("killJob : %s using prod role" % pandaID)
- elif validGroupProdRole:
- # WGs with prod role
- _logger.debug("killJob : %s using group prod role for workingGroup=%s" % (pandaID,workingGroup))
- pass
- else:
- cn1 = getCN(res[0])
- cn2 = getCN(user)
- _logger.debug("Owner:%s - Requester:%s " % (cn1,cn2))
- if cn1 != cn2:
- _logger.debug("ignore killJob -> Owner != Requester")
- break
- # update
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':commandToPilot'] = 'tobekilled'
- varMap[':taskBufferErrorDiag'] = 'killed by %s' % user
- if userProdSourceLabel in ['managed','test'] and code in ['9',]:
- # ignore commandToPilot for force kill
- self.cur.execute((sql1F+comment) % table, varMap)
- else:
- self.cur.execute((sql1+comment) % table, varMap)
- retU = self.cur.rowcount
- if retU == 0:
- continue
- # set flag
- flagCommand = True
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- if (userProdSourceLabel in ['managed','test'] or 'test' in userProdSourceLabel) and code in ['9',]:
- # use dummy for force kill
- varMap[':jobStatus'] = 'dummy'
- else:
- varMap[':jobStatus'] = 'running'
- self.cur.arraysize = 10
- self.cur.execute((sql2+comment) % table, varMap)
- res = self.cur.fetchall()
- if len(res) == 0:
- continue
- # instantiate JobSpec
- job = JobSpec()
- job.pack(res[0])
- # delete
- if table=='ATLAS_PANDA.jobsDefined4':
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':oldJobStatus1'] = 'assigned'
- varMap[':oldJobStatus2'] = 'defined'
- self.cur.execute(sqlU+comment, varMap)
- else:
- varMap = {}
- varMap[':PandaID'] = pandaID
- self.cur.execute((sql3+comment) % table, varMap)
- retD = self.cur.rowcount
- if retD == 0:
- continue
- # error code
- if job.jobStatus != 'failed':
- # set status etc for non-failed jobs
- job.endTime = datetime.datetime.utcnow()
- job.modificationTime = job.endTime
- if code in ['2','4']:
- # expire
- if code == '2':
- job.taskBufferErrorCode = ErrorCode.EC_Expire
- job.taskBufferErrorDiag = 'expired after 7 days since submission'
- else:
- # waiting timeout
- job.taskBufferErrorCode = ErrorCode.EC_Expire
- #job.taskBufferErrorCode = ErrorCode.EC_WaitTimeout
- job.taskBufferErrorDiag = 'expired after waiting for input data for 2 days'
- elif code=='3':
- # aborted
- job.taskBufferErrorCode = ErrorCode.EC_Aborted
- job.taskBufferErrorDiag = 'aborted by ExtIF'
- elif code=='8':
-                        # reassigned by rebrokerage
- job.taskBufferErrorCode = ErrorCode.EC_Reassigned
- job.taskBufferErrorDiag = 'reassigned to another site by rebrokerage. new %s' % user
- job.commandToPilot = None
- else:
- # killed
- job.taskBufferErrorCode = ErrorCode.EC_Kill
- job.taskBufferErrorDiag = 'killed by %s' % user
- # set job status
- job.jobStatus = 'cancelled'
- else:
- # keep status for failed jobs
- job.modificationTime = datetime.datetime.utcnow()
- if code=='7':
- # retried by server
- job.taskBufferErrorCode = ErrorCode.EC_Retried
- job.taskBufferErrorDiag = 'retrying at another site. new %s' % user
- job.commandToPilot = None
- job.stateChangeTime = job.modificationTime
- # insert
- self.cur.execute(sql4+comment, job.valuesMap())
- # update file
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':status'] = 'failed'
- varMap[':type1'] = 'output'
- varMap[':type2'] = 'log'
- self.cur.execute(sqlF+comment,varMap)
-                # update files, metadata, parameters
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':modificationTime'] = job.modificationTime
- self.cur.execute(sqlFMod+comment,varMap)
- self.cur.execute(sqlMMod+comment,varMap)
- self.cur.execute(sqlPMod+comment,varMap)
- flagKilled = True
- updatedFlag = True
- break
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("killJob : com=%s kill=%s " % (flagCommand,flagKilled))
- # record status change
- try:
- if updatedFlag:
- self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job)
- except:
- _logger.error('recordStatusChange in killJob')
- if getUserInfo:
- return (flagCommand or flagKilled),{'prodUserID':userProdUserID,
- 'prodSourceLabel':userProdSourceLabel,
- 'jobDefinitionID':userJobDefinitionID,
- 'jobsetID':userJobsetID}
- return (flagCommand or flagKilled)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("killJob : %s %s" % (type,value))
- # roll back
- self._rollback()
- if getUserInfo:
- return False,{}
- return False
-
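# killJob() above maps the numeric kill code onto a taskBufferErrorCode /
# taskBufferErrorDiag pair for jobs that had not already failed. The mapping
# restated as a plain function; the EC_* constants are placeholders for the
# values defined in the real taskbuffer ErrorCode module.
EC_Expire, EC_Aborted, EC_Reassigned, EC_Kill = 100, 101, 102, 103  # placeholder values

def kill_diagnostics(code, user):
    """Return (errorCode, errorDiag) for a non-failed job killed with 'code'."""
    if code == '2':
        return EC_Expire, 'expired after 7 days since submission'
    if code == '4':
        return EC_Expire, 'expired after waiting for input data for 2 days'
    if code == '3':
        return EC_Aborted, 'aborted by ExtIF'
    if code == '8':
        return EC_Reassigned, 'reassigned to another site by rebrokerage. new %s' % user
    return EC_Kill, 'killed by %s' % user

print(kill_diagnostics('8', 'someuser'))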
-
- # peek at job
- def peekJob(self,pandaID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal=False):
- comment = ' /* DBProxy.peekJob */'
- _logger.debug("peekJob : %s" % pandaID)
- # return None for NULL PandaID
- if pandaID in ['NULL','','None',None]:
- return None
- # only int
- try:
- tmpID = int(pandaID)
- except:
- _logger.debug("peekJob : return None for %s:non-integer" % pandaID)
- return None
- sql1_0 = "SELECT %s FROM %s "
- sql1_1 = "WHERE PandaID=:PandaID"
- nTry=3
- for iTry in range(nTry):
- try:
- tables=[]
- if fromDefined:
- tables.append('ATLAS_PANDA.jobsDefined4')
- if fromActive:
- tables.append('ATLAS_PANDA.jobsActive4')
- if fromArchived:
- tables.append('ATLAS_PANDA.jobsArchived4')
- if fromWaiting:
- tables.append('ATLAS_PANDA.jobsWaiting4')
- if fromDefined:
- # for jobs which are just reset
- tables.append('ATLAS_PANDA.jobsDefined4')
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- for table in tables:
- # start transaction
- self.conn.begin()
- # select
- sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1
- self.cur.arraysize = 10
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if len(res) != 0:
- # Job
- job = JobSpec()
- job.pack(res[0])
- # Files
- # start transaction
- self.conn.begin()
- # select
- sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=:PandaID"
- self.cur.arraysize = 10000
- self.cur.execute(sqlFile+comment, varMap)
- resFs = self.cur.fetchall()
- # metadata
- resMeta = None
- if table == 'ATLAS_PANDA.jobsArchived4' or forAnal:
- # read metadata only for finished/failed production jobs
- sqlMeta = "SELECT metaData FROM ATLAS_PANDA.metaTable WHERE PandaID=:PandaID"
- self.cur.execute(sqlMeta+comment, varMap)
- for clobMeta, in self.cur:
- if clobMeta != None:
- resMeta = clobMeta.read()
- break
- # job parameters
- job.jobParameters = None
- sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID"
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- self.cur.execute(sqlJobP+comment, varMap)
- for clobJobP, in self.cur:
- if clobJobP != None:
- job.jobParameters = clobJobP.read()
- break
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # set files
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- job.addFile(file)
- # set metadata
- job.metadata = resMeta
- return job
- _logger.debug("peekJob() : PandaID %s not found" % pandaID)
- return None
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("peekJob : %s retry : %s" % (pandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("peekJob : %s %s %s" % (pandaID,type,value))
- # return None for analysis
- if forAnal:
- return None
- # return 'unknown'
- job = JobSpec()
- job.PandaID = pandaID
- job.jobStatus = 'unknown'
- return job
-
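# peekJob() above looks for the PandaID in a fixed priority order of tables and
# returns the first hit. A miniature sqlite3 version of that search, with two
# toy tables instead of the four job tables:
import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
for table in ('jobsActive4', 'jobsArchived4'):
    cur.execute("CREATE TABLE %s (PandaID INTEGER, jobStatus TEXT)" % table)
cur.execute("INSERT INTO jobsArchived4 VALUES (7777, 'finished')")
conn.commit()

def peek(pandaID):
    for table in ('jobsActive4', 'jobsArchived4'):   # priority order
        cur.execute("SELECT jobStatus FROM %s WHERE PandaID=:PandaID" % table,
                    {'PandaID': pandaID})
        res = cur.fetchone()
        if res is not None:
            return table, res[0]
    return None

print(peek(7777))   # -> ('jobsArchived4', 'finished')
print(peek(1))      # -> None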
-
- # get PandaID with jobexeID
- def getPandaIDwithJobExeID(self,jobexeID):
- comment = ' /* DBProxy.getPandaIDwithJobExeID */'
- _logger.debug("getPandaIDwithJobExeID : %s" % jobexeID)
- failedRetVal = (None,None,'')
- # return for wrong jobexeID
- if jobexeID in ['NULL','','None',None]:
- return failedRetVal
- # SQL
- sql = "SELECT PandaID,jobDefinitionID,jobName FROM ATLAS_PANDA.jobsWaiting4 "
- sql += "WHERE jobExecutionID=:jobexeID AND prodSourceLabel=:prodSourceLabel "
- sql += "AND jobStatus=:jobStatus "
- varMap = {}
- varMap[':jobexeID'] = jobexeID
- varMap[':jobStatus'] = 'pending'
- varMap[':prodSourceLabel'] = 'managed'
- try:
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10
- self.cur.execute(sql+comment,varMap)
- res = self.cur.fetchone()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # not found
- if res == None:
- _logger.debug("getPandaIDwithJobExeID : jobexeID %s not found" % jobexeID)
- return failedRetVal
- _logger.debug("getPandaIDwithJobExeID : %s -> %s" % (jobexeID,str(res)))
- return res
- except:
- # roll back
- self._rollback()
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getPandaIDwithJobExeID : %s %s %s" % (jobexeID,errtype,errvalue))
- return failedRetVal
-
-
- # get express jobs
- def getExpressJobs(self,dn):
- comment = ' /* DBProxy.getExpressJobs */'
- _logger.debug("getExpressJobs : %s" % dn)
- sqlX = "SELECT specialHandling,COUNT(*) FROM %s "
- sqlX += "WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel1 "
- sqlX += "AND specialHandling IS NOT NULL "
- sqlXJob = "SELECT PandaID,jobStatus,prodSourceLabel,modificationTime,jobDefinitionID,jobsetID,startTime,endTime FROM %s "
- sqlXJob += "WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel1 "
- sqlXJob += "AND specialHandling IS NOT NULL AND specialHandling=:specialHandling "
- sqlQ = sqlX
- sqlQ += "GROUP BY specialHandling "
- sqlQJob = sqlXJob
- sqlA = sqlX
- sqlA += "AND modificationTime>:modificationTime GROUP BY specialHandling "
- sqlAJob = sqlXJob
- sqlAJob += "AND modificationTime>:modificationTime "
- try:
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- expressStr = 'express'
- activeExpressU = []
- timeUsageU = datetime.timedelta(0)
- executionTimeU = datetime.timedelta(hours=1)
- jobCreditU = 3
- timeCreditU = executionTimeU * jobCreditU
- timeNow = datetime.datetime.utcnow()
- timeLimit = timeNow - datetime.timedelta(hours=6)
- # loop over tables
- for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']:
- varMap = {}
- varMap[':prodUserName'] = compactDN
- varMap[':prodSourceLabel1'] = 'user'
- if table == 'ATLAS_PANDA.jobsArchived4':
- varMap[':modificationTime'] = timeLimit
- sql = sqlA % table
- sqlJob = sqlAJob % table
- else:
- sql = sqlQ % table
- sqlJob = sqlQJob % table
- # start transaction
- self.conn.begin()
- # get the number of jobs for each specialHandling
- self.cur.arraysize = 10
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- _logger.debug("getExpressJobs %s" % str(res))
- for specialHandling,countJobs in res:
- if specialHandling == None:
- continue
- # look for express jobs
- if expressStr in specialHandling:
- varMap[':specialHandling'] = specialHandling
- self.cur.arraysize = 1000
- self.cur.execute(sqlJob+comment, varMap)
- resJobs = self.cur.fetchall()
- _logger.debug("getExpressJobs %s" % str(resJobs))
- for tmp_PandaID,tmp_jobStatus,tmp_prodSourceLabel,tmp_modificationTime,\
- tmp_jobDefinitionID,tmp_jobsetID,tmp_startTime,tmp_endTime \
- in resJobs:
- # collect active jobs
- if not tmp_jobStatus in ['finished','failed','cancelled']:
- activeExpressU.append((tmp_PandaID,tmp_jobsetID,tmp_jobDefinitionID))
- # get time usage
- if not tmp_jobStatus in ['defined','activated']:
- # check only jobs which actually use or used CPU on WN
- if tmp_startTime != None:
- # running or not
- if tmp_endTime == None:
- # job got started before/after the time limit
- if timeLimit > tmp_startTime:
- timeDelta = timeNow - timeLimit
- else:
- timeDelta = timeNow - tmp_startTime
- else:
- # job got started before/after the time limit
- if timeLimit > tmp_startTime:
- timeDelta = tmp_endTime - timeLimit
- else:
- timeDelta = tmp_endTime - tmp_startTime
- # add
- if timeDelta > datetime.timedelta(0):
- timeUsageU += timeDelta
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # check quota
- rRet = True
- rRetStr = ''
- rQuota = 0
- if len(activeExpressU) >= jobCreditU:
- rRetStr += "The number of queued runXYZ exceeds the limit = %s. " % jobCreditU
- rRet = False
- if timeUsageU >= timeCreditU:
- rRetStr += "The total execution time for runXYZ exceeds the limit = %s min. " % (timeCreditU.seconds / 60)
- rRet = False
- # calculate available quota
- if rRet:
- tmpQuota = jobCreditU - len(activeExpressU) - timeUsageU.seconds/executionTimeU.seconds
- if tmpQuota < 0:
-                    rRetStr += "Quota for runXYZ exceeded. "
- rRet = False
- else:
- rQuota = tmpQuota
- # return
- retVal = {'status':rRet,'quota':rQuota,'output':rRetStr,'usage':timeUsageU,'jobs':activeExpressU}
- _logger.debug("getExpressJobs : %s" % str(retVal))
- return retVal
- except:
- # roll back
- self._rollback()
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getExpressJobs : %s %s" % (errtype,errvalue))
- return None
-
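# getExpressJobs() above charges each express job for the wall time it used
# inside a sliding six-hour window and compares the total against a fixed
# credit of three one-hour jobs. The accounting arithmetic on its own, away
# from the database access:
import datetime

def charged_time(start, end, now, window_start):
    """Wall time a job is charged for inside the accounting window."""
    if start is None:
        return datetime.timedelta(0)
    effective_start = max(start, window_start)
    effective_end = end if end is not None else now
    return max(effective_end - effective_start, datetime.timedelta(0))

now = datetime.datetime.utcnow()
window_start = now - datetime.timedelta(hours=6)
job_credit = 3
time_credit = datetime.timedelta(hours=1) * job_credit

usage = charged_time(now - datetime.timedelta(hours=2), None, now, window_start)
print(usage, usage >= time_credit)   # -> 2:00:00 False (still within the credit)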
-
- # get active debug jobs
- def getActiveDebugJobs(self,dn):
- comment = ' /* DBProxy.getActiveDebugJobs */'
- _logger.debug("getActiveDebugJobs : %s" % dn)
- sqlX = "SELECT PandaID,jobStatus,specialHandling FROM %s "
- sqlX += "WHERE prodUserName=:prodUserName "
- sqlX += "AND specialHandling IS NOT NULL "
- try:
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- debugStr = 'debug'
- activeDebugJobs = []
- # loop over tables
- for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']:
- varMap = {}
- varMap[':prodUserName'] = compactDN
- sql = sqlX % table
- # start transaction
- self.conn.begin()
- # get jobs with specialHandling
- self.cur.arraysize = 100000
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # loop over all PandaIDs
- for pandaID,jobStatus,specialHandling in res:
- if specialHandling == None:
- continue
- # only active jobs
- if not jobStatus in ['defined','activated','running','sent','starting']:
- continue
- # look for debug jobs
- if debugStr in specialHandling and not pandaID in activeDebugJobs:
- activeDebugJobs.append(pandaID)
- # return
- activeDebugJobs.sort()
- _logger.debug("getActiveDebugJobs : %s -> %s" % (dn,str(activeDebugJobs)))
- return activeDebugJobs
- except:
- # roll back
- self._rollback()
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getActiveDebugJobs : %s %s" % (errtype,errvalue))
- return None
-
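# getActiveDebugJobs() above reduces the selected rows to a sorted list of
# PandaIDs that are still in an active state and carry the 'debug' token in
# specialHandling. The same filter over in-memory rows:
ACTIVE_STATES = ('defined', 'activated', 'running', 'sent', 'starting')

def active_debug_ids(rows):
    """rows: iterable of (PandaID, jobStatus, specialHandling) tuples."""
    ids = []
    for panda_id, status, special_handling in rows:
        if not special_handling or status not in ACTIVE_STATES:
            continue
        if 'debug' in special_handling and panda_id not in ids:
            ids.append(panda_id)
    ids.sort()
    return ids

rows = [(3, 'running', 'debug'), (1, 'finished', 'debug'), (2, 'activated', 'express,debug')]
print(active_debug_ids(rows))   # -> [2, 3]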
-
- # set debug mode
- def setDebugMode(self,dn,pandaID,prodManager,modeOn):
- comment = ' /* DBProxy.setDebugMode */'
-        _logger.debug("setDebugMode : dn=%s id=%s prod=%s mode=%s" % (dn,pandaID,prodManager,modeOn))
- sqlX = "SELECT prodUserName,jobStatus,specialHandling FROM %s "
- sqlX += "WHERE PandaID=:PandaID "
- sqlU = "UPDATE %s SET specialHandling=:specialHandling "
- sqlU += "WHERE PandaID=:PandaID "
- try:
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- debugStr = 'debug'
- retStr = ''
- retCode = False
- # loop over tables
- for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']:
- varMap = {}
- varMap[':PandaID'] = pandaID
- sql = sqlX % table
- # start transaction
- self.conn.begin()
- # get jobs with specialHandling
- self.cur.arraysize = 10
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchone()
- # not found
- if res == None:
- retStr = 'Not found in active DB'
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- continue
- prodUserName,jobStatus,specialHandling = res
- # not active
- if not jobStatus in ['defined','activated','running','sent','starting']:
- retStr = 'Not in one of active job status'
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- break
- # not owner
- if not prodManager and prodUserName != compactDN:
- retStr = 'Permission denied. Not the owner'
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- break
- # set specialHandling
- updateSH = True
- if specialHandling in [None,'']:
- if modeOn:
- # set debug mode
- specialHandling = debugStr
- else:
- # already disabled debug mode
- updateSH = False
- elif debugStr in specialHandling:
- if modeOn:
- # already in debug mode
- updateSH = False
- else:
- # disable debug mode
- specialHandling = re.sub(debugStr,'',specialHandling)
- specialHandling = re.sub(',,',',',specialHandling)
- specialHandling = re.sub('^,','',specialHandling)
- specialHandling = re.sub(',$','',specialHandling)
- else:
- if modeOn:
- # set debug mode
- specialHandling = '%s,%s' % (debugStr,specialHandling)
- else:
- # already disabled debug mode
- updateSH = False
-
- # no update
- if not updateSH:
- retStr = 'Already set accordingly'
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- break
- # update
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':specialHandling'] = specialHandling
- self.cur.execute((sqlU+comment) % table, varMap)
- retD = self.cur.rowcount
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if retD == 0:
- retStr = 'Failed to update DB'
- else:
- retStr = 'Succeeded'
- break
- # return
- _logger.debug("setDebugMode : %s %s -> %s" % (dn,pandaID,retStr))
- return retStr
- except:
- # roll back
- self._rollback()
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("setDebugMode : %s %s" % (errtype,errvalue))
- return None
-
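# setDebugMode() above stores the debug flag as one token inside the
# comma-separated specialHandling string. The token handling factored out as a
# function (with an exact-token membership test, slightly stricter than the
# substring check used in the method above):
import re

def toggle_debug(special_handling, mode_on, token='debug'):
    """Return (new_value, changed) after switching the debug token on or off."""
    sh = special_handling or ''
    has_token = token in sh.split(',') if sh else False
    if mode_on:
        if has_token:
            return sh, False                      # already in debug mode
        return (token if sh == '' else '%s,%s' % (token, sh)), True
    if not has_token:
        return sh, False                          # already disabled
    sh = re.sub(token, '', sh)
    sh = re.sub(',,', ',', sh)
    sh = re.sub('^,', '', sh)
    sh = re.sub(',$', '', sh)
    return sh, True

print(toggle_debug('express', True))          # -> ('debug,express', True)
print(toggle_debug('debug,express', False))   # -> ('express', True)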
-
- # get PandaID with destinationDBlock
- def getPandaIDwithDestDBlock(self,destinationDBlock):
- comment = ' /* DBProxy.getPandaIDwithDestDBlock */'
- _logger.debug("getPandaIDwithDestDBlock : %s" % destinationDBlock)
- try:
- sqlP = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab "
- sqlP += "WHERE type IN (:type1,:type2) AND destinationDBlock=:destinationDBlock AND rownum<=1"
- # start transaction
- self.conn.begin()
- pandaID = None
- varMap = {}
- varMap[':type1'] = 'log'
- varMap[':type2'] = 'output'
- varMap[':destinationDBlock'] = destinationDBlock
- # select
- self.cur.arraysize = 10
- self.cur.execute(sqlP+comment, varMap)
- res = self.cur.fetchone()
- # append
- if res != None:
- pandaID, = res
- # commit to release tables
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return pandaID
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getPandaIDwithDestDBlock : %s %s" % (errType,errValue))
- # return empty list
- return None
-
-
- # get destSE with destinationDBlock
- def getDestSEwithDestDBlock(self,destinationDBlock):
- comment = ' /* DBProxy.getDestSEwithDestDBlock */'
- _logger.debug("getDestSEwithDestDBlock : %s" % destinationDBlock)
- try:
- sqlP = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ destinationSE FROM ATLAS_PANDA.filesTable4 tab "
- sqlP += "WHERE type IN (:type1,:type2) AND destinationDBlock=:destinationDBlock AND rownum<=1"
- # start transaction
- self.conn.begin()
- varMap = {}
- varMap[':type1'] = 'log'
- varMap[':type2'] = 'output'
- varMap[':destinationDBlock'] = destinationDBlock
- # select
- self.cur.arraysize = 10
- self.cur.execute(sqlP+comment, varMap)
- res = self.cur.fetchone()
- # append
- destinationSE = None
- if res != None:
- destinationSE, = res
- # commit to release tables
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return destinationSE
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getDestSEwithDestDBlock : %s %s" % (errType,errValue))
- # return empty list
- return None
-
-
- # get number of activated/defined jobs with output datasets
- def getNumWaitingJobsWithOutDS(self,outputDSs):
- comment = ' /* DBProxy.getNumWaitingJobsWithOutDS */'
- _logger.debug("getNumWaitingJobsWithOutDS : %s" % str(outputDSs))
- try:
- sqlD = "SELECT distinct destinationDBlock FROM ATLAS_PANDA.filesTable4 "
- sqlD += "WHERE type IN (:type1,:type2) AND dataset=:dataset AND status IN (:status1,:status2)"
- sqlP = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab "
- sqlP += "WHERE type IN (:type1,:type2) AND destinationDBlock=:destinationDBlock AND status IN (:status1,:status2) AND rownum<=1"
- sqlJ = "SELECT jobDefinitionID,taskID,prodUserName,jobStatus,prodSourceLabel FROM %s "
- sqlJ += "WHERE PandaID=:PandaID"
- sqlC = "SELECT count(*) FROM ATLAS_PANDA.jobsActive4 "
- sqlC += "WHERE jobDefinitionID=:jobDefinitionID AND prodUserName=:prodUserName AND jobStatus IN (:jobStatus1)"
- # start transaction
- self.conn.begin()
- # get sub datasets
- subDSList = []
- for outputDS in outputDSs:
- varMap = {}
- varMap[':type1'] = 'log'
- varMap[':type2'] = 'output'
- varMap[':status1'] = 'unknown'
- varMap[':status2'] = 'pending'
- varMap[':dataset'] = outputDS
- # select
- self.cur.arraysize = 1000
- self.cur.execute(sqlD+comment, varMap)
- resList = self.cur.fetchall()
- # append
- for destinationDBlock, in resList:
- subDSList.append(destinationDBlock)
- # get PandaIDs
- pandaIDs = []
- for subDS in subDSList:
- varMap = {}
- varMap[':type1'] = 'log'
- varMap[':type2'] = 'output'
- varMap[':status1'] = 'unknown'
- varMap[':status2'] = 'pending'
- varMap[':destinationDBlock'] = subDS
- # select
- self.cur.arraysize = 10
- self.cur.execute(sqlP+comment, varMap)
- res = self.cur.fetchone()
- # append
- if res != None:
- pandaID, = res
- pandaIDs.append(pandaID)
- # commit to release tables
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # loop over all PandaIDs
- jobInfos = []
- for pandaID in pandaIDs:
- varMap = {}
- varMap[':PandaID'] = pandaID
- # start transaction
- self.conn.begin()
- # get jobID,nJobs,jobStatus,userName
- res = None
- for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']:
- # select
- self.cur.arraysize = 10
- self.cur.execute((sqlJ % table)+comment,varMap)
- res = self.cur.fetchone()
- if res != None:
- break
- # commit to release tables
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # not found
- if res == None:
- continue
- # append
- jobInfos.append(res)
- # no jobs
- if jobInfos == []:
- _logger.error("getNumWaitingJobsWithOutDS : no jobs found")
- return False,{}
- # loop over all jobIDs
- retMap = {}
- for jobID,taskID,prodUserName,jobStatus,prodSourceLabel in jobInfos:
- if retMap.has_key(jobID):
- continue
- retMap[jobID] = {}
- retMap[jobID]['nJobs'] = taskID
- retMap[jobID]['sourceLabel'] = prodSourceLabel
- # don't check # of activated
- if jobStatus in ['defined']:
- retMap[jobID]['activated'] = False
- retMap[jobID]['nActs'] = 0
- continue
- retMap[jobID]['activated'] = True
- # get # of activated jobs
- varMap = {}
- varMap[':prodUserName'] = prodUserName
- varMap[':jobDefinitionID'] = jobID
- varMap[':jobStatus1'] = 'activated'
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10
- self.cur.execute(sqlC+comment, varMap)
- res = self.cur.fetchone()
- # commit to release tables
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if res == None:
- _logger.error("getNumWaitingJobsWithOutDS : cannot get # of activated for %s:%s" % \
- (jobID,prodUserName))
- return False,{}
- # set # of activated
- nActs, = res
- retMap[jobID]['nActs'] = nActs
- # return
- _logger.debug("getNumWaitingJobsWithOutDS -> %s" % str(retMap))
- return True,retMap
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getNumWaitingJobsWithOutDS : %s %s" % (errType,errValue))
- # return empty list
- return False,{}
-
-
- # get slimmed file info with PandaIDs
- def getSlimmedFileInfoPandaIDs(self,pandaIDs):
- comment = ' /* DBProxy.getSlimmedFileInfoPandaIDs */'
- _logger.debug("getSlimmedFileInfoPandaIDs : %s len=%s" % (pandaIDs[0],len(pandaIDs)))
- try:
- sqlL = "SELECT lfn,type,dataset FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID"
- sqlA = "SELECT /*+ INDEX(tab FILES_ARCH_PANDAID_IDX)*/ lfn,type,dataset FROM ATLAS_PANDAARCH.filesTable_ARCH tab "
- sqlA += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-60)"
- retMap = {'inDS':[],'outDS':[]}
- # start transaction
- self.conn.begin()
- # select
- for pandaID in pandaIDs:
- # make sql
- varMap = {}
- varMap[':PandaID'] = pandaID
- # select
- self.cur.arraysize = 10000
- self.cur.execute(sqlL+comment, varMap)
- resList = self.cur.fetchall()
- # try archived if not found in filesTable4
- if len(resList) == 0:
- self.cur.execute(sqlA+comment, varMap)
- resList = self.cur.fetchall()
- # append
- for tmp_lfn,tmp_type,tmp_dataset in resList:
- # skip lib.tgz
- if tmp_lfn.endswith('.lib.tgz'):
- continue
- if tmp_type == 'input':
- if not tmp_dataset in retMap['inDS']:
- retMap['inDS'].append(tmp_dataset)
- elif tmp_type == 'output':
- if not tmp_dataset in retMap['outDS']:
- retMap['outDS'].append(tmp_dataset)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("getSlimmedFileInfoPandaIDs : %s" % str(retMap))
- return retMap
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getSlimmedFileInfoPandaIDs : %s %s" % (type,value))
- # return empty list
- return {}
-
-
- # get JobIDs in a time range
- def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs):
- comment = ' /* DBProxy.getJobIDsInTimeRange */'
- _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))
- try:
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- tables = ['ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']
- # select
- for table in tables:
- # make sql
- if table == 'ATLAS_PANDA.jobsArchived4':
- sql = 'SELECT /*+ INDEX_RS_ASC(TAB("JOBSARCHIVED4"."PRODUSERNAME")) NO_INDEX(TAB("JOBSARCHIVED4"."MODIFICATIONTIME")) */ jobDefinitionID FROM %s tab ' % table
- elif table == 'ATLAS_PANDA.jobsActive4':
- sql = 'SELECT /*+ INDEX_RS_ASC(TAB("JOBSACTIVE4"."PRODUSERNAME")) NO_INDEX(TAB("JOBSACTIVE4"."MODIFICATIONTIME")) */ jobDefinitionID FROM %s tab ' % table
- else:
- sql = "SELECT jobDefinitionID FROM %s " % table
- sql += "WHERE prodUserName=:prodUserName AND modificationTime>:modificationTime "
- sql += "AND prodSourceLabel=:prodSourceLabel GROUP BY jobDefinitionID"
- varMap = {}
- varMap[':prodUserName'] = compactDN
- varMap[':prodSourceLabel'] = 'user'
- varMap[':modificationTime'] = timeRange
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID, in resList:
- if not tmpID in retJobIDs:
- retJobIDs.append(tmpID)
- _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs))
- return retJobIDs
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobIDsInTimeRange : %s %s" % (type,value))
- # return empty list
- return []
-
-
- # get PandaIDs for a JobID
- def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs):
- comment = ' /* DBProxy.getPandIDsWithJobID */'
- _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID))
- try:
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- tables = ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsArchived4']
- buildJobID = None
- # select
- for table in tables:
-                # skip if all jobs have already been retrieved
- if nJobs > 0 and len(idStatus) >= nJobs:
- continue
- # make sql
- sql = "SELECT PandaID,jobStatus,commandToPilot,prodSourceLabel,taskBufferErrorCode FROM %s " % table
- sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql += "AND prodSourceLabel in (:prodSourceLabel1,:prodSourceLabel2)"
- varMap = {}
- varMap[':prodUserName'] = compactDN
- varMap[':jobDefinitionID'] = jobID
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- # select
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # append
- for tmpID,tmpStatus,tmpCommand,tmpProdSourceLabel,tmpTaskBufferErrorCode in resList:
- # ignore jobs retried by pilot since they have new PandaIDs with the same jobsetID/jobdefID
- if tmpTaskBufferErrorCode in [ErrorCode.EC_PilotRetried]:
- continue
- # ignore old buildJob which was replaced by rebrokerage
- if tmpProdSourceLabel == 'panda':
- if buildJobID == None:
- # first buildJob
- buildJobID = tmpID
- elif buildJobID >= tmpID:
- # don't append old one
- continue
- else:
- # delete old one
- del idStatus[buildJobID]
- buildJobID = tmpID
- # append
- idStatus[tmpID] = (tmpStatus,tmpCommand)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("getPandIDsWithJobID : %s" % str(idStatus))
- return idStatus,buildJobID
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandIDsWithJobID : %s %s" % (type,value))
- # return empty list
- return {},None
-
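# getPandIDsWithJobID() above keeps only the newest buildJob (prodSourceLabel
# 'panda') for a jobset and drops older ones replaced by rebrokerage. The
# pruning on its own, over made-up rows:
def prune_build_jobs(rows):
    """rows: (PandaID, jobStatus, prodSourceLabel) tuples; returns (idStatus, buildJobID)."""
    id_status, build_job_id = {}, None
    for panda_id, status, label in rows:
        if label == 'panda':
            if build_job_id is None:
                build_job_id = panda_id
            elif build_job_id >= panda_id:
                continue                        # older buildJob, skip it
            else:
                del id_status[build_job_id]     # replace the one kept so far
                build_job_id = panda_id
        id_status[panda_id] = status
    return id_status, build_job_id

rows = [(100, 'finished', 'panda'), (105, 'activated', 'panda'),
        (101, 'running', 'user'), (102, 'running', 'user')]
print(prune_build_jobs(rows))
# -> ({105: 'activated', 101: 'running', 102: 'running'}, 105)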
-
- # lock jobs for reassign
- def lockJobsForReassign(self,tableName,timeLimit,statList,labels,processTypes,sites,clouds):
- comment = ' /* DBProxy.lockJobsForReassign */'
- _logger.debug("lockJobsForReassign : %s %s %s %s %s %s %s" % \
- (tableName,timeLimit,statList,labels,processTypes,sites,clouds))
- try:
- # make sql
- sql = "SELECT PandaID FROM %s " % tableName
- sql += "WHERE modificationTime<:modificationTime "
- varMap = {}
- varMap[':modificationTime'] = timeLimit
- if statList != []:
- sql += 'AND jobStatus IN ('
- tmpIdx = 0
- for tmpStat in statList:
- tmpKey = ':stat%s' % tmpIdx
- varMap[tmpKey] = tmpStat
-                     sql += '%s,' % tmpKey
-                     tmpIdx += 1
- sql = sql[:-1]
- sql += ') '
- if labels != []:
- sql += 'AND prodSourceLabel IN ('
- tmpIdx = 0
- for tmpStat in labels:
- tmpKey = ':label%s' % tmpIdx
- varMap[tmpKey] = tmpStat
-                     sql += '%s,' % tmpKey
-                     tmpIdx += 1
- sql = sql[:-1]
- sql += ') '
- if processTypes != []:
- sql += 'AND processingType IN ('
- tmpIdx = 0
- for tmpStat in processTypes:
- tmpKey = ':processType%s' % tmpIdx
- varMap[tmpKey] = tmpStat
-                     sql += '%s,' % tmpKey
-                     tmpIdx += 1
- sql = sql[:-1]
- sql += ') '
- if sites != []:
- sql += 'AND computingSite IN ('
- tmpIdx = 0
- for tmpStat in sites:
- tmpKey = ':site%s' % tmpIdx
- varMap[tmpKey] = tmpStat
-                     sql += '%s,' % tmpKey
-                     tmpIdx += 1
- sql = sql[:-1]
- sql += ') '
- if clouds != []:
- sql += 'AND cloud IN ('
- tmpIdx = 0
- for tmpStat in clouds:
- tmpKey = ':cloud%s' % tmpIdx
- varMap[tmpKey] = tmpStat
-                     sql += '%s,' % tmpKey
-                     tmpIdx += 1
- sql = sql[:-1]
- sql += ') '
- # sql for lock
- sqlLock = 'UPDATE %s SET modificationTime=CURRENT_DATE WHERE PandaID=:PandaID' % tableName
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 1000000
- self.cur.execute(sql+comment,varMap)
- resList = self.cur.fetchall()
- retList = []
- # lock
- for tmpID, in resList:
- varLock = {':PandaID':tmpID}
- self.cur.execute(sqlLock+comment,varLock)
- retList.append((tmpID,))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # sort
- retList.sort()
- _logger.debug("lockJobsForReassign : %s" % (len(retList)))
- return True,retList
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("lockJobsForReassign : %s %s" % (errType,errValue))
- # return empty
- return False,[]
-
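# lockJobsForReassign() above builds its IN (...) filters dynamically, one
# named bind variable per value, each with its own index. The intended
# construction as a small helper:
def build_in_clause(column, prefix, values):
    """Return (sql_fragment, bind_map) for 'AND column IN (:prefix0,:prefix1,...)'."""
    bind_map = {}
    keys = []
    for idx, value in enumerate(values):
        key = ':%s%s' % (prefix, idx)
        bind_map[key] = value
        keys.append(key)
    return 'AND %s IN (%s) ' % (column, ','.join(keys)), bind_map

print(build_in_clause('jobStatus', 'stat', ['activated', 'defined']))
# -> ('AND jobStatus IN (:stat0,:stat1) ', {':stat0': 'activated', ':stat1': 'defined'})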
-
- # lock jobs for finisher
- def lockJobsForFinisher(self,timeNow,rownum,highPrio):
- comment = ' /* DBProxy.lockJobsForFinisher */'
- _logger.debug("lockJobsForFinisher : %s %s %s" % (timeNow,rownum,highPrio))
- try:
- varMap = {}
- varMap[':jobStatus'] = 'transferring'
- varMap[':currentPriority'] = 800
- varMap[':prodSourceLabel'] = 'managed'
- # make sql
- sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 "
- sql += "WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel "
- if highPrio:
- varMap[':modificationTime'] = timeNow - datetime.timedelta(hours=1)
- sql += "AND currentPriority>=:currentPriority AND rownum<=%s " % rownum
- else:
- sql += "AND currentPriority<:currentPriority AND rownum<=%s " % rownum
- varMap[':modificationTime'] = timeNow - datetime.timedelta(hours=12)
- sql += "FOR UPDATE "
- # sql for lock
- sqlLock = 'UPDATE ATLAS_PANDA.jobsActive4 SET modificationTime=CURRENT_DATE WHERE PandaID=:PandaID'
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 1000
- self.cur.execute(sql+comment,varMap)
- resList = self.cur.fetchall()
- retList = []
- # lock
- for tmpID, in resList:
- varLock = {':PandaID':tmpID}
- self.cur.execute(sqlLock+comment,varLock)
- retList.append(tmpID)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # sort
- retList.sort()
- _logger.debug("lockJobsForFinisher : %s" % (len(retList)))
- return True,retList
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("lockJobsForFinisher : %s %s" % (errType,errValue))
- # return empty
- return False,[]
-
-
- # get the number of waiting jobs with a dataset
- def getNumWaitingJobsForPD2P(self,datasetName):
- comment = ' /* DBProxy.getNumWaitingJobsForPD2P */'
- _logger.debug("getNumWaitingJobsForPD2P : %s" % datasetName)
- try:
- tables = ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']
- nJobs = 0
- # select
- for table in tables:
- # make sql
- sql = "SELECT COUNT(*) FROM %s " % table
- sql += "WHERE prodDBlock=:prodDBlock AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) "
- sql += "AND jobStatus IN (:jobStatus1,:jobStatus2) "
- varMap = {}
- varMap[':prodDBlock'] = datasetName
- varMap[':jobStatus1'] = 'defined'
- varMap[':jobStatus2'] = 'activated'
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchone()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if res != None:
- tmpN, = res
- nJobs += tmpN
- _logger.debug("getNumWaitingJobsForPD2P : %s -> %s" % (datasetName,nJobs))
- return nJobs
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getNumWaitingJobsForPD2P : %s %s" % (errType,errValue))
- # return 0
- return 0
-
-
- # get the number of waiting jobsets with a dataset
- def getNumWaitingJobsetsForPD2P(self,datasetName):
- comment = ' /* DBProxy.getNumWaitingJobsetsForPD2P */'
- _logger.debug("getNumWaitingJobsetsForPD2P : %s" % datasetName)
- try:
- tables = ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']
- jobsetIDuserList = []
- # select
- for table in tables:
- # make sql
- sql = "SELECT jobsetID,prodUserName FROM %s " % table
- sql += "WHERE prodDBlock=:prodDBlock AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) "
- sql += "AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY jobsetID,prodUserName"
- varMap = {}
- varMap[':prodDBlock'] = datasetName
- varMap[':jobStatus1'] = 'defined'
- varMap[':jobStatus2'] = 'activated'
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- for jobsetID,prodUserName in resList:
- tmpKey = (jobsetID,prodUserName)
- if not tmpKey in jobsetIDuserList:
- jobsetIDuserList.append(tmpKey)
- _logger.debug("getNumWaitingJobsetsForPD2P : %s -> %s" % (datasetName,len(jobsetIDuserList)))
- return len(jobsetIDuserList)
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getNumWaitingJobsetsForPD2P : %s %s" % (errType,errValue))
- # return 0
- return 0
-
-
- # lock job for re-brokerage
- def lockJobForReBrokerage(self,dn,jobID,simulation,forceOpt,forFailed=False):
- comment = ' /* lockJobForReBrokerage */'
- _logger.debug("lockJobForReBrokerage : %s %s %s %s %s" % (dn,jobID,simulation,forceOpt,forFailed))
- try:
- errMsg = ''
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- # start transaction
- self.conn.begin()
- buildJobPandaID = None
- buildJobStatus = None
- buildJobDefID = None
- buildCreationTime = None
- runPandaID = None
- minPandaIDlibDS = None
- maxPandaIDlibDS = None
- # get one runXYZ job
- if errMsg == '':
- for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']:
- sql = "SELECT PandaID FROM %s " % table
- sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql += "AND prodSourceLabel=:prodSourceLabel1 AND jobStatus IN (:jobStatus1,:jobStatus2) "
- sql += "AND rownum <= 1"
- varMap = {}
- varMap[':prodUserName'] = compactDN
- varMap[':jobDefinitionID'] = jobID
- varMap[':prodSourceLabel1'] = 'user'
- if not forFailed:
- # lock active jobs for normal rebrokerage
- varMap[':jobStatus1'] = 'defined'
- varMap[':jobStatus2'] = 'activated'
- else:
- # lock failed jobs for retry
- varMap[':jobStatus1'] = 'failed'
- varMap[':jobStatus2'] = 'dummy'
- # select
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchone()
- # not found
- if res != None:
- runPandaID, = res
- break
- if runPandaID == None:
- if not forFailed:
- errMsg = "no defined/activated jobs to reassign. running/finished/failed jobs are not reassigned by rebrokerage "
- else:
- errMsg = "could not get failed runXYZ jobs"
- # get libDS
- libDS = ''
- if errMsg == '':
- sql = "SELECT lfn,dataset FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND PandaID=:PandaID"
- varMap = {}
- varMap[':type'] = 'input'
- varMap[':PandaID'] = runPandaID
- # select
- self.cur.arraysize = 10000
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- for tmpLFN,tmpDS in resList:
- if tmpLFN.endswith('.lib.tgz'):
- libDS = tmpDS
- break
- # check status of corresponding buildJob
- if libDS != '':
- sql = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 "
- sql += "WHERE type=:type AND dataset=:dataset"
- varMap = {}
- varMap[':type'] = 'output'
- varMap[':dataset'] = libDS
- # select
- self.cur.arraysize = 10
- # select
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchone()
- # not found in active table
- if res == None:
- # look for buildJob in archived table
- sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODSOURCELABEL_IDX JOBS_PRODUSERNAME_IDX) */ "
- sql += "PandaID,jobStatus,jobDefinitionID,creationTime "
- sql += "FROM ATLAS_PANDAARCH.jobsArchived tab "
-                    sql += "WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel1 "
- sql += "AND modificationTime>(CURRENT_DATE-10) ORDER BY PandaID DESC"
- varMap = {}
- varMap[':prodUserName'] = compactDN
-                    varMap[':prodSourceLabel1'] = 'panda'
- # select
- self.cur.arraysize = 10000
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # loop over PandaIDs to find corresponding libDS
- sql = "SELECT /*+ INDEX(tab FILES_ARCH_PANDAID_IDX)*/ PandaID FROM ATLAS_PANDAARCH.filesTable_ARCH tab "
- sql += "WHERE PandaID=:PandaID AND type=:type AND dataset=:dataset AND status=:status "
- sql += "AND modificationTime>(CURRENT_DATE-10)"
- self.cur.arraysize = 10
- for tmpID,tmpJobStatus,tmpJobDefID,tmpCreationTime in resList:
- varMap = {}
- varMap[':PandaID'] = tmpID
- varMap[':type'] = 'output'
- varMap[':status'] = 'ready'
- varMap[':dataset'] = libDS
- # select
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchone()
- if res != None:
- # get PandaID of buildJob
- buildJobPandaID, = res
- buildJobStatus = tmpJobStatus
- buildJobDefID = tmpJobDefID
- buildCreationTime = tmpCreationTime
- break
- # not found
- if buildJobPandaID == None:
- errMsg = "could not find successful buildJob for %s" % libDS
- else:
- # get PandaID of buildJob
- buildJobPandaID, = res
- # found buildJob
- if errMsg == '':
- # get current buildJob status
- if buildJobStatus == None:
- for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsDefined4']:
- # make sql
- sql = "SELECT jobStatus,jobDefinitionID,creationTime FROM %s " % table
- sql += "WHERE PandaID=:PandaID "
- varMap = {}
- varMap[':PandaID'] = buildJobPandaID
- # select
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchone()
- # found
- if res != None:
- buildJobStatus,buildJobDefID,buildCreationTime = res
- break
- # not found
- if buildJobStatus == None:
- errMsg = "could not find buildJob=%s in database" % buildJobPandaID
- # check status
-            if errMsg == '':
- if not buildJobStatus in ['defined','activated','finished','cancelled']:
- errMsg = "status of buildJob is '%s' != defined/activated/finished/cancelled so that jobs cannot be reassigned" \
- % buildJobStatus
- # get max/min PandaIDs using the libDS
- if errMsg == '':
- sql = "SELECT MAX(PandaID),MIN(PandaID) FROM ATLAS_PANDA.filesTable4 "
- sql += "WHERE type=:type AND dataset=:dataset"
- varMap = {}
- varMap[':type'] = 'input'
- varMap[':dataset'] = libDS
- self.cur.arraysize = 10
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchone()
- if res == None:
- errMsg = "cannot get MAX/MIN PandaID for multiple usage for %s" % libDS
- else:
- maxPandaIDlibDS,minPandaIDlibDS = res
- # check creationDate of buildJob
- if errMsg == '':
- # buildJob has already finished
- timeLimit = datetime.datetime.utcnow()-datetime.timedelta(days=6)
- if buildJobStatus in ['finished','cancelled'] and buildCreationTime < timeLimit:
- errMsg = "corresponding buildJob %s is too old %s" % (buildJobPandaID,buildCreationTime.strftime('%Y-%m-%d %H:%M:%S'))
- # check modificationTime
- if errMsg == '':
- # make sql
- tables = ['ATLAS_PANDA.jobsDefined4']
- if not buildJobStatus in ['defined']:
- tables.append('ATLAS_PANDA.jobsActive4')
- sql = "SELECT modificationTime FROM %s "
- sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus IN (:jobStatus1,:jobStatus2) "
- sql += "FOR UPDATE "
- varMap = {}
- varMap[':prodUserName'] = compactDN
- varMap[':jobDefinitionID'] = jobID
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- if not forFailed:
- # normal rebrokerage
- varMap[':jobStatus1'] = 'defined'
- varMap[':jobStatus2'] = 'activated'
- else:
- # retry
- varMap[':jobStatus1'] = 'failed'
- varMap[':jobStatus2'] = 'dummy'
- for tableName in tables:
- # select
- self.cur.execute((sql % tableName)+comment, varMap)
- res = self.cur.fetchone()
- if res != None:
- break
- if res == None:
- if not forFailed:
- errMsg = "no defined/activated jobs to be reassigned"
- else:
- errMsg = "no failed jobs to be retried"
- else:
- tmpModificationTime, = res
- # prevent users from rebrokering more than once in one hour
- timeLimit = datetime.datetime.utcnow()-datetime.timedelta(hours=1)
- if timeLimit < tmpModificationTime and not forceOpt:
- errMsg = "last mod time is %s > current-1hour. Cannot run (re)brokerage more than once in one hour" \
- % tmpModificationTime.strftime('%Y-%m-%d %H:%M:%S')
- elif simulation:
- pass
- else:
- # update modificationTime for locking
- for tableName in tables:
- sql = 'UPDATE %s ' % tableName
- sql += 'SET modificationTime=CURRENT_DATE '
- sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus IN (:jobStatus1,:jobStatus2) "
- self.cur.execute(sql+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return failure
- if errMsg != '':
- _logger.debug('lockJobForReBrokerage : '+errMsg)
- return False,{'err':errMsg}
- # return
- retMap = {'bPandaID':buildJobPandaID,'bStatus':buildJobStatus,'userName':compactDN,
- 'bJobID':buildJobDefID,'rPandaID':runPandaID,
- 'maxPandaIDlibDS':maxPandaIDlibDS,'minPandaIDlibDS':minPandaIDlibDS}
- _logger.debug("lockJobForReBrokerage %s" % str(retMap))
- return True,retMap
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("lockJobForReBrokerage : %s %s" % (type,value))
- # return empty list
- return False,{'err':'database error'}
-
-
- # get input datasets for rebrokerage
- def getInDatasetsForReBrokerage(self,jobID,userName):
- comment = ' /* DBProxy.getInDatasetsForReBrokerage */'
- failedRet = False,{},None
- try:
- _logger.debug("getInDatasetsForReBrokerage(%s,%s)" % (jobID,userName))
- # start transaction
- self.conn.begin()
- # get pandaID
- pandaIDs = []
- maxTotalFileSize = None
- for table in ['jobsActive4','jobsDefined4']:
- sql = "SELECT PandaID FROM ATLAS_PANDA.%s WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " % table
- sql += "AND prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2)"
- varMap = {}
- varMap[':prodUserName'] = userName
- varMap[':jobDefinitionID'] = jobID
- varMap[':prodSourceLabel'] = 'user'
- varMap[':jobStatus1'] = 'defined'
- varMap[':jobStatus2'] = 'activated'
- self.cur.arraysize = 10000
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- if res != []:
- for tmpItem in res:
- pandaIDs.append(tmpItem[0])
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # not found
- if pandaIDs == []:
- _logger.debug("getInDatasetsForReBrokerage : PandaIDs not found")
- return failedRet
- # get dataset and lfn
- retMapLFN = {}
- sql = "SELECT dataset,lfn,fsize FROM ATLAS_PANDA.filesTable4 "
- sql += "WHERE PandaID=:PandaID AND type=:type"
- for pandaID in pandaIDs:
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':type'] = 'input'
- # start transaction
- self.conn.begin()
- self.cur.arraysize = 10000
- self.cur.execute(sql+comment, varMap)
- resL = self.cur.fetchall()
- # append
- tmpTotalFileSize = 0
- for tmpDataset,tmpLFN,tmpFileSize in resL:
- # ignore lib.tgz
- if tmpLFN.endswith('.lib.tgz'):
- continue
- if not retMapLFN.has_key(tmpDataset):
- retMapLFN[tmpDataset] = []
- if not tmpLFN in retMapLFN[tmpDataset]:
- retMapLFN[tmpDataset].append(tmpLFN)
- try:
- tmpTotalFileSize += long(tmpFileSize)
- except:
- pass
- if maxTotalFileSize == None or maxTotalFileSize < tmpTotalFileSize:
- maxTotalFileSize = tmpTotalFileSize
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("getInDatasetsForReBrokerage : done")
- # max size in MB
- maxTotalFileSize /= (1024*1024)
- # return
- return True,retMapLFN,maxTotalFileSize
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getInDatasetsForReBrokerage(%s,%s) : %s %s" % (jobID,userName,errType,errValue))
- return failedRet
-
-
- # move jobs to jobsDefine4 for re-brokerage
- def resetBuildJobForReBrokerage(self,pandaID):
- comment = ' /* resetBuildJobForReBrokerage */'
- _logger.debug("resetBuildJobForReBrokerage : start %s" % pandaID)
- try:
- # make sql to move jobs
- sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames()
- sql1+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus1"
- sql3 = "INSERT INTO ATLAS_PANDA.jobsDefined4 (%s) " % JobSpec.columnNames()
- sql3+= JobSpec.bindValuesExpression()
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':jobStatus1'] = 'activated'
- self.cur.arraysize = 10
- self.cur.execute(sql1+comment,varMap)
- res = self.cur.fetchone()
- # not found
- if res == None:
- _logger.error("resetBuildJobForReBrokerage : PandaID=%s not found" % pandaID)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return False
- # instantiate Job
- job = JobSpec()
- job.pack(res)
- # delete from jobsDefined4 just in case
- varMap = {}
- varMap[':PandaID'] = pandaID
- sqlD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
- self.cur.execute(sqlD+comment,varMap)
- # reset job status
- job.jobStatus = 'defined'
- # host and time information
- job.modificationHost = self.hostname
- job.modificationTime = datetime.datetime.utcnow()
- # insert to Defined
- self.cur.execute(sql3+comment, job.valuesMap())
- # delete from Active
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':jobStatus1'] = 'activated'
- sql2 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID AND jobStatus=:jobStatus1"
- self.cur.execute(sql2+comment,varMap)
- retD = self.cur.rowcount
- # delete failed
- if retD != 1:
- _logger.error("resetBuildJobForReBrokerage : failed to delete PandaID=%s" % pandaID)
- # rollback
- self._rollback()
- return False
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- _logger.debug("resetBuildJobForReBrokerage : end %s" % pandaID)
- return True
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("resetBuildJobForReBrokerage : %s %s" % (type,value))
- # return empty list
- return False
-
-
- # get PandaIDs using userName/jobID for re-brokerage or retry
- def getPandaIDsForReBrokerage(self,userName,jobID,fromActive,forFailed=False):
- comment = ' /* DBProxy.getPandaIDsForReBrokerage */'
- _logger.debug("getPandaIDsForReBrokerage : %s %s %s %s" % (userName,jobID,fromActive,forFailed))
- try:
- returnList = []
- varMap = {}
- varMap[':prodUserName'] = userName
- varMap[':jobDefinitionID'] = jobID
- if not forFailed:
- varMap[':jobStatus1'] = 'activated'
- else:
- varMap[':jobStatus1'] = 'failed'
- sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 "
- sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql += "AND jobStatus=:jobStatus1"
- # get IDs from Active table
- if fromActive:
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 20000
- self.cur.execute(sql+comment,varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID, in resList:
- if not tmpID in returnList:
- returnList.append(tmpID)
- # set holding to prevent activated jobs from being picked up
- if not forFailed:
- sql = 'UPDATE ATLAS_PANDA.jobsActive4 SET jobStatus=:newStatus '
- sql += 'WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID '
- sql += "AND jobStatus=:jobStatus1"
- varMap[':newStatus'] = 'holding'
- # start transaction
- self.conn.begin()
- # update
- self.cur.execute(sql+comment,varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # get IDs from Defined table just in case
- varMap = {}
- varMap[':prodUserName'] = userName
- varMap[':jobDefinitionID'] = jobID
- varMap[':jobStatus1'] = 'defined'
- varMap[':jobStatus2'] = 'assigned'
- sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 "
- sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql += "AND jobStatus IN (:jobStatus1,:jobStatus2)"
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 20000
- self.cur.execute(sql+comment,varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID, in resList:
- if not tmpID in returnList:
- returnList.append(tmpID)
- # sort
- returnList.sort()
- # return
- return returnList
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandaIDsForReBrokerage : %s %s" % (type,value))
- # return empty list
- return []
-
-
- # get outDSs with userName/jobID
- def getOutDSsForReBrokerage(self,userName,jobID):
- comment = ' /* DBProxy.getOutDSsForReBrokerage */'
- _logger.debug("getOutDSsForReBrokerage : %s %s" % (userName,jobID))
- falseRet = (False,[],None,None)
- try:
- # get one PandaID
- sql = "SELECT PandaID,computingSite,destinationSE FROM ATLAS_PANDA.jobsActive4 "
- sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql += "AND prodSourceLabel=:prodSourceLabel AND rownum<=1"
- varMap = {}
- varMap[':prodUserName'] = userName
- varMap[':jobDefinitionID'] = jobID
- varMap[':prodSourceLabel'] = 'user'
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchone()
- # not found
- if res == None:
- _logger.debug("getOutDSsForReBrokerage : failed to get PandaID")
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return falseRet
- pandaID,computingSite,destinationSE = res
- # get outDSs
- sql = "SELECT dataset FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type IN (:type1,:type2)"
- varMap = {}
- varMap[':type1'] = 'output'
- varMap[':type2'] = 'log'
- varMap[':PandaID'] = pandaID
- self.cur.arraysize = 1000
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- returnList = []
- for tmpOutDS, in resList:
- if not tmpOutDS in returnList:
- returnList.append(tmpOutDS)
- # return
- return True,returnList,computingSite,destinationSE
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getOutDSsForReBrokerage : %s %s" % (type,value))
- # return empty list
- return falseRet
-
-
- # query PandaID
- def queryPandaID(self,jobDefID):
- comment = ' /* DBProxy.queryPandaID */'
- _logger.debug("queryPandaID : %s" % jobDefID)
- sql0 = "SELECT PandaID,attemptNr FROM %s WHERE attemptNr=("
- sql0+= "SELECT MAX(attemptNr) FROM %s"
- sql1= " WHERE prodSourceLabel=:prodSourceLabel AND jobDefinitionID=:jobDefinitionID)"
- sql1+=" AND prodSourceLabel=:prodSourceLabel AND jobDefinitionID=:jobDefinitionID"
- try:
- ids = []
- # select
- varMap = {}
- varMap[':jobDefinitionID'] = jobDefID
- varMap[':prodSourceLabel'] = 'managed'
- for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsWaiting4']:
- # start transaction
- self.conn.begin()
- # select
- sql = sql0 % (table,table) + sql1
- self.cur.arraysize = 10
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- ids += list(res)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # look for the latest attempt
- preAtt =-1
- pandaID=None
- for pID,att in ids:
- if att > preAtt:
- pandaID = pID
- preAtt = att
- if att == preAtt:
- if pandaID < pID:
- pandaID = pID
- return pandaID
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("queryPandaID : %s %s" % (type,value))
- # roll back
- self._rollback()
- return None
-
-
- # query job info per cloud
- def queryJobInfoPerCloud(self,cloud,schedulerID=None):
- comment = ' /* DBProxy.queryJobInfoPerCloud */'
- _logger.debug("queryJobInfoPerCloud : %s %s" % (cloud,schedulerID))
- attrs = ['PandaID','jobStatus','jobName']
- sql0 = "SELECT "
- for attr in attrs:
- sql0 += "%s," % attr
- sql0 = "%s " % sql0[:-1]
- sql0+= "FROM %s "
- sql0+= "WHERE cloud=:cloud "
- varMap = {}
- varMap[':cloud'] = cloud
- if schedulerID != None:
- sql0+= "AND schedulerID=:schedulerID "
- varMap[':schedulerID'] = schedulerID
- try:
- ids = []
- returnList = []
- # select
- for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']:
- # start transaction
- self.conn.begin()
- # select
- sql = sql0 % table
- self.cur.arraysize = 10000
- self.cur.execute(sql+comment,varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # loop over all
- for res in resList:
- valMap = {}
- # skip if already in the list
- PandaID = res[0]
- if PandaID in ids:
- continue
- # convert to map
- for idx,attr in enumerate(attrs):
- valMap[attr] = res[idx]
- # append to list
- ids.append(PandaID)
- returnList.append(valMap)
- # return
- return returnList
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("queryJobInfoPerCloud : %s %s" % (type,value))
- # roll back
- self._rollback()
- return None
-
-
- # get PandaIDs at Site
- def getPandaIDsSite(self,site,status,limit):
- comment = ' /* DBProxy.getPandaIDsSite */'
- _logger.debug("getPandaIDsSite : %s %s %s" % (site,status,limit))
- try:
- ids = []
- # find table
- if status in ['defined','assigned']:
- table = 'ATLAS_PANDA.jobsDefined4'
- elif status in ['activated','running','holding','transferring']:
- table = 'ATLAS_PANDA.jobsActive4'
- elif status in ['waiting']:
- table = 'ATLAS_PANDA.jobsWaiting4'
- elif status in ['finished','failed']:
- table = 'ATLAS_PANDA.jobsArchived4'
- else:
- _logger.error("unknown status:%s" % status)
- return ids
- # limit
- limit = int(limit)
- # SQL
- sql = "SELECT PandaID FROM %s " % table
- sql += "WHERE computingSite=:computingSite AND jobStatus=:jobStatus AND prodSourceLabel=:prodSourceLabel "
- sql += "AND rownum<=:limit"
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':computingSite'] = site
- varMap[':jobStatus'] = status
- varMap[':limit'] = limit
- varMap[':prodSourceLabel'] = 'managed'
- self.cur.arraysize = limit
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # convert to list
- for id, in res:
- ids.append(id)
- return ids
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandaIDsSite : %s %s" % (type,value))
- # roll back
- self._rollback()
- return []
-
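For orientation, a minimal usage sketch of the accessor above, assuming a DBProxy instance named proxy; the site, status and limit values are illustrative, not taken from this file:

# Illustrative only: fetch up to 50 activated production job IDs at one site.
ids = proxy.getPandaIDsSite('CERN-PROD', 'activated', 50)
for pandaID in ids:
    print(pandaID)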
-
- # get PandaIDs to be updated in prodDB
- def getPandaIDsForProdDB(self,limit,lockedby):
- comment = ' /* DBProxy.getPandaIDsForProdDB */'
- _logger.debug("getPandaIDsForProdDB %s" % limit)
- sql0 = "PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID FROM %s "
- sqlW = "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND lockedby=:lockedby "
- sqlX = "AND stateChangeTime>prodDBUpdateTime "
- sqlA = "AND (CASE WHEN stateChangeTime>prodDBUpdateTime THEN 1 ELSE null END) = 1 "
- sql1 = "AND rownum<=:limit "
- varMap = {}
- varMap[':lockedby'] = lockedby
- varMap[':limit'] = limit
- varMap[':prodSourceLabel1'] = 'managed'
- varMap[':prodSourceLabel2'] = 'rc_test'
- try:
- retMap = {}
- totalIDs = 0
- # select
- for table in ['ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']:
- # start transaction
- self.conn.begin()
- # select
- sql = sql0 % table
- if table in ['ATLAS_PANDA.jobsArchived4']:
- sql = "SELECT /*+ INDEX_RS_ASC(tab JOBSARCHIVED4_CHANGETIME) NO_INDEX(tab(PRODSOURCELABEL))*/ " + sql + " tab " + sqlW + sqlA
- else:
- sql = "SELECT " + sql + sqlW + sqlX
- sql += sql1
- self.cur.arraysize = limit
- _logger.debug("getPandaIDsForProdDB %s %s" % (sql+comment,str(varMap)))
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- _logger.debug("getPandaIDsForProdDB got %s" % len(res))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- for PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID in res:
- # ignore dummy jobs in jobsDefined4
- if table == 'ATLAS_PANDA.jobsDefined4' and (not jobStatus in ['defined','assigned']):
- continue
- # add status
- if not retMap.has_key(jobStatus):
- retMap[jobStatus] = []
- # append
- retMap[jobStatus].append({'PandaID':PandaID,'attemptNr':attemptNr,
- 'stateChangeTime':stateChangeTime.strftime('%Y-%m-%d %H:%M:%S'),
- 'jobDefinitionID':jobDefinitionID,
- 'jobExecutionID':jobExecutionID})
- totalIDs += 1
- # limit
- if totalIDs > limit:
- break
- _logger.debug("getPandaIDsForProdDB %s ret->%s" % (limit,totalIDs))
- return retMap
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandaIDsForProdDB : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # update prodDBUpdateTime
- def updateProdDBUpdateTime(self,param):
- comment = ' /* DBProxy.updateProdDBUpdateTime */'
- _logger.debug("updateProdDBUpdateTime %s" % str(param))
- sql0 = "UPDATE %s "
- sql0+= "SET prodDBUpdateTime=TO_TIMESTAMP(:prodDBUpdateTime,'YYYY-MM-DD HH24:MI:SS') "
- sql0+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus AND stateChangeTime=TO_TIMESTAMP(:stateChangeTime,'YYYY-MM-DD HH24:MI:SS') "
- varMap = {}
- varMap[':prodDBUpdateTime'] = param['stateChangeTime']
- varMap[':PandaID'] = param['PandaID']
- varMap[':jobStatus'] = param['jobStatus']
- varMap[':stateChangeTime'] = param['stateChangeTime']
- try:
- # convert to string
- if isinstance(varMap[':prodDBUpdateTime'],datetime.datetime):
- varMap[':prodDBUpdateTime'] = varMap[':prodDBUpdateTime'].strftime('%Y-%m-%d %H:%M:%S')
- if isinstance(varMap[':stateChangeTime'],datetime.datetime):
- varMap[':stateChangeTime'] = varMap[':stateChangeTime'].strftime('%Y-%m-%d %H:%M:%S')
- # set table
- if param['jobStatus'] in ['defined','assigned']:
- table = 'ATLAS_PANDA.jobsDefined4'
- elif param['jobStatus'] in ['waiting','pending']:
- table = 'ATLAS_PANDA.jobsWaiting4'
- elif param['jobStatus'] in ['activated','sent','starting','running','holding','transferring']:
- table = 'ATLAS_PANDA.jobsActive4'
- elif param['jobStatus'] in ['finished','failed','cancelled']:
- table = 'ATLAS_PANDA.jobsArchived4'
- else:
- _logger.error("invalid status %s" % param['jobStatus'])
- return False
- # set transaction
- self.conn.begin()
- # update
- sql = sql0 % table
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- retU = self.cur.rowcount
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("updateProdDBUpdateTime %s ret=%s" % (param['PandaID'],retU))
- if retU == 1:
- return True
- return False
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("updateProdDBUpdateTime : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
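The two methods above are meant to be used together: the first reports jobs whose state changed since the last prodDB sync, the second stamps prodDBUpdateTime once the external update has succeeded. A hedged sketch of that round trip, assuming a DBProxy instance named proxy and an illustrative lockedby value:

retMap = proxy.getPandaIDsForProdDB(1000, 'panda')
for jobStatus, entries in retMap.iteritems():
    for entry in entries:
        # the returned entries carry everything except the status key itself
        entry['jobStatus'] = jobStatus
        proxy.updateProdDBUpdateTime(entry)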
-
- # add metadata
- def addMetadata(self,pandaID,metadata):
- comment = ' /* DBProxy.addMetaData */'
- _logger.debug("addMetaData : %s" % pandaID)
- sql0 = "SELECT PandaID FROM ATLAS_PANDA.metaTable WHERE PandaID=:PandaID"
- sql1 = "INSERT INTO ATLAS_PANDA.metaTable (PandaID,metaData) VALUES (:PandaID,:metaData)"
- nTry=3
- for iTry in range(nTry):
- try:
- # autocommit on
- self.conn.begin()
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- self.cur.arraysize = 10
- self.cur.execute(sql0+comment, varMap)
- res = self.cur.fetchone()
- # already exist
- if res != None:
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- # insert
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':metaData'] = metadata
- self.cur.execute(sql1+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("addMetaData : %s retry : %s" % (pandaID,iTry))
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("addMetaData : %s %s" % (type,value))
- return False
-
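addMetadata retries the whole transaction a few times with a randomized sleep between attempts. The helper below is a standalone sketch of that same pattern, not code from this module:

import random
import time

def retry_call(func, n_try=3, min_wait=10, max_wait=20):
    # Retry func() on any exception, sleeping 10-20 seconds between attempts,
    # and re-raise once the last attempt has failed.
    for i_try in range(n_try):
        try:
            return func()
        except Exception:
            if i_try + 1 < n_try:
                time.sleep(random.randint(min_wait, max_wait))
                continue
            raise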
-
- # add stdout
- def addStdOut(self,pandaID,stdOut):
- comment = ' /* DBProxy.addStdOut */'
- _logger.debug("addStdOut : %s start" % pandaID)
- sqlJ = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID FOR UPDATE "
- sqlC = "SELECT PandaID FROM ATLAS_PANDA.jobsDebug WHERE PandaID=:PandaID "
- sqlI = "INSERT INTO ATLAS_PANDA.jobsDebug (PandaID,stdOut) VALUES (:PandaID,:stdOut) "
- sqlU = "UPDATE ATLAS_PANDA.jobsDebug SET stdOut=:stdOut WHERE PandaID=:PandaID "
- try:
- # autocommit on
- self.conn.begin()
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- self.cur.arraysize = 10
- # check job table
- self.cur.execute(sqlJ+comment, varMap)
- res = self.cur.fetchone()
- if res == None:
- _logger.debug("addStdOut : %s non active" % pandaID)
- else:
- # check debug table
- self.cur.execute(sqlC+comment, varMap)
- res = self.cur.fetchone()
- # already exist
- if res != None:
- # update
- sql = sqlU
- else:
- # insert
- sql = sqlI
- # write stdout
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':stdOut'] = stdOut
- self.cur.execute(sql+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("addStdOut : %s %s" % (errtype,errvalue))
- return False
-
-
- # insert sandbox file info
- def insertSandboxFileInfo(self,userName,hostName,fileName,fileSize,checkSum):
- comment = ' /* DBProxy.insertSandboxFileInfo */'
- _logger.debug("insertSandboxFileInfo : %s %s %s %s %s" % (userName,hostName,fileName,fileSize,checkSum))
- sqlC = "SELECT userName,fileSize,checkSum FROM ATLAS_PANDAMETA.userCacheUsage "
- sqlC += "WHERE hostName=:hostName AND fileName=:fileName FOR UPDATE"
- sql = "INSERT INTO ATLAS_PANDAMETA.userCacheUsage "
- sql += "(userName,hostName,fileName,fileSize,checkSum,creationTime,modificationTime) "
- sql += "VALUES (:userName,:hostName,:fileName,:fileSize,:checkSum,CURRENT_DATE,CURRENT_DATE) "
- try:
- # begin transaction
- self.conn.begin()
- # check if it already exists
- varMap = {}
- varMap[':hostName'] = hostName
- varMap[':fileName'] = fileName
- self.cur.arraysize = 10
- self.cur.execute(sqlC+comment, varMap)
- res = self.cur.fetchall()
- if len(res) != 0:
- _logger.debug("insertSandboxFileInfo : skip %s %s since already exists" % (hostName,fileName))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return "WARNING: file exist"
- # insert
- varMap = {}
- varMap[':userName'] = userName
- varMap[':hostName'] = hostName
- varMap[':fileName'] = fileName
- varMap[':fileSize'] = fileSize
- varMap[':checkSum'] = checkSum
- self.cur.execute(sql+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return "OK"
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("insertSandboxFileInfo : %s %s" % (type,value))
- return "ERROR: DB failure"
-
-
- # check duplicated sandbox file
- def checkSandboxFile(self,dn,fileSize,checkSum):
- comment = ' /* DBProxy.checkSandboxFile */'
- _logger.debug("checkSandboxFile : %s %s %s" % (dn,fileSize,checkSum))
- sqlC = "SELECT hostName,fileName FROM ATLAS_PANDAMETA.userCacheUsage "
- sqlC += "WHERE userName=:userName AND fileSize=:fileSize AND checkSum=:checkSum "
- sqlC += "AND hostName<>:ngHostName AND creationTime>CURRENT_DATE-3 "
- sqlC += "AND creationTime>CURRENT_DATE-3 "
- try:
- retStr = 'NOTFOUND'
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- # begin transaction
- self.conn.begin()
- # check if it already exists
- varMap = {}
- varMap[':userName'] = compactDN
- varMap[':fileSize'] = fileSize
- varMap[':checkSum'] = checkSum
- varMap[':ngHostName'] = 'localhost.localdomain'
- self.cur.arraysize = 10
- self.cur.execute(sqlC+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if len(res) != 0:
- hostName,fileName = res[0]
- retStr = "FOUND:%s:%s" % (hostName,fileName)
- _logger.debug("checkSandboxFile -> %s" % retStr)
- return retStr
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("checkSandboxFile : %s %s" % (type,value))
- return "ERROR: DB failure"
-
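Together these two methods form a simple sandbox cache: uploads are registered with their size and checksum, and a later client can ask whether an identical file already exists before uploading again. A sketch under assumed names (proxy, the DN, host and file values are made up; whether the DN matches depends on how cleanUserID compacts it):

proxy.insertSandboxFileInfo('someuser', 'aipanda01.cern.ch',
                            'sources.12345.tar.gz', 123456, 'ad0beef1')
ret = proxy.checkSandboxFile('/DC=ch/DC=cern/CN=someuser', 123456, 'ad0beef1')
if ret.startswith('FOUND:'):
    _, hostName, fileName = ret.split(':', 2)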
-
- # insert dataset
- def insertDataset(self,dataset,tablename="ATLAS_PANDA.Datasets"):
- comment = ' /* DBProxy.insertDataset */'
- _logger.debug("insertDataset(%s)" % dataset.name)
- sql0 = "SELECT COUNT(*) FROM %s WHERE vuid=:vuid" % tablename
- sql1 = "INSERT INTO %s " % tablename
- sql1+= "(%s) " % DatasetSpec.columnNames()
- sql1+= DatasetSpec.bindValuesExpression()
- # time information
- dataset.creationdate = datetime.datetime.utcnow()
- dataset.modificationdate = dataset.creationdate
- try:
- # subtype
- if dataset.subType in ['','NULL',None]:
- # define using name
- if re.search('_dis\d+$',dataset.name) != None:
- dataset.subType = 'dis'
- elif re.search('_sub\d+$',dataset.name) != None:
- dataset.subType= 'sub'
- else:
- dataset.subType= 'top'
- # begin transaction
- self.conn.begin()
- # check if it already exists
- varMap = {}
- varMap[':vuid'] = dataset.vuid
- self.cur.execute(sql0+comment, varMap)
- nDS, = self.cur.fetchone()
- _logger.debug("insertDataset nDS=%s with %s" % (nDS,dataset.vuid))
- if nDS == 0:
- # insert
- _logger.debug("insertDataset insert %s" % dataset.name)
- self.cur.execute(sql1+comment, dataset.valuesMap())
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("insertDataset() : %s %s" % (type,value))
- return False
-
-
- # get and lock dataset with a query
- def getLockDatasets(self,sqlQuery,varMapGet,modTimeOffset='',getVersion=False):
- comment = ' /* DBProxy.getLockDatasets */'
- _logger.debug("getLockDatasets(%s,%s,%s)" % (sqlQuery,str(varMapGet),modTimeOffset))
- sqlGet = "SELECT /*+ INDEX_RS_ASC(tab(STATUS,TYPE,MODIFICATIONDATE)) */ vuid,name,modificationdate,version,transferStatus FROM ATLAS_PANDA.Datasets tab WHERE " + sqlQuery
- sqlLock = "UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE"
- if modTimeOffset != '':
- sqlLock += "+%s" % modTimeOffset
- sqlLock += ",transferStatus=MOD(transferStatus+1,10)"
- if getVersion:
- sqlLock += ",version=:version"
- sqlLock += " WHERE vuid=:vuid AND transferStatus=:transferStatus"
- retList = []
- try:
- # begin transaction
- self.conn.begin()
- # get datasets
- self.cur.arraysize = 1000000
- self.cur.execute(sqlGet+comment,varMapGet)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # loop over all datasets
- if res != None and len(res) != 0:
- for vuid,name,modificationdate,version,transferStatus in res:
- # lock
- varMapLock = {}
- varMapLock[':vuid'] = vuid
- varMapLock[':transferStatus'] = transferStatus
- if getVersion:
- try:
- varMapLock[':version'] = str(int(version) + 1)
- except:
- varMapLock[':version'] = str(1)
- # begin transaction
- self.conn.begin()
- # update for lock
- self.cur.execute(sqlLock+comment,varMapLock)
- retU = self.cur.rowcount
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if retU > 0:
- # append
- if not getVersion:
- retList.append((vuid,name,modificationdate))
- else:
- retList.append((vuid,name,modificationdate,version))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return retList
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getLockDatasets : %s %s" % (type,value))
- return []
-
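getLockDatasets implements an optimistic lock: the UPDATE only succeeds while transferStatus still has the value read in the SELECT (and bumps it modulo 10), so two concurrent callers cannot both claim the same dataset. A hedged caller sketch; the query, bind values and time offset are illustrative only:

sqlQuery = "type=:type AND status=:status AND modificationdate<CURRENT_DATE"
varMap = {':type': 'output', ':status': 'tobeclosed'}
for vuid, name, modDate in proxy.getLockDatasets(sqlQuery, varMap, modTimeOffset='90/24/60'):
    pass  # each returned dataset is now locked for this caller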
-
- # query dataset with map
- def queryDatasetWithMap(self,map):
- comment = ' /* DBProxy.queryDatasetWithMap */'
- _logger.debug("queryDatasetWithMap(%s)" % map)
- if map.has_key('name'):
- sql1 = """SELECT /*+ BEGIN_OUTLINE_DATA """
- sql1 += """INDEX_RS_ASC(@"SEL$1" "TAB"@"SEL$1" ("DATASETS"."NAME")) """
- sql1 += """OUTLINE_LEAF(@"SEL$1") ALL_ROWS """
- sql1 += """OPTIMIZER_FEATURES_ENABLE('10.2.0.4') """
- sql1 += """IGNORE_OPTIM_EMBEDDED_HINTS """
- sql1 += """END_OUTLINE_DATA */ """
- sql1 += "%s FROM ATLAS_PANDA.Datasets tab" % DatasetSpec.columnNames()
- else:
- sql1 = "SELECT %s FROM ATLAS_PANDA.Datasets" % DatasetSpec.columnNames()
- varMap = {}
- for key in map.keys():
- if len(varMap)==0:
- sql1+= " WHERE %s=:%s" % (key,key)
- else:
- sql1+= " AND %s=:%s" % (key,key)
- varMap[':%s' % key] = map[key]
- try:
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 100
- _logger.debug(sql1+comment+str(varMap))
- self.cur.execute(sql1+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # instantiate Dataset
- if res != None and len(res) != 0:
- dataset = DatasetSpec()
- dataset.pack(res[0])
- return dataset
- _logger.error("queryDatasetWithMap(%s) : dataset not found" % map)
- return None
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("queryDatasetWithMap(%s) : %s %s" % (map,type,value))
- return None
-
-
- # update dataset
- def updateDataset(self,datasets,withLock,withCriteria,criteriaMap):
- comment = ' /* DBProxy.updateDataset */'
- _logger.debug("updateDataset()")
- sql1 = "UPDATE ATLAS_PANDA.Datasets SET %s " % DatasetSpec.bindUpdateExpression()
- sql1+= "WHERE vuid=:vuid"
- if withCriteria != "":
- sql1+= " AND %s" % withCriteria
- retList = []
- try:
- # start transaction
- self.conn.begin()
- for dataset in datasets:
- _logger.debug("updateDataset(%s,%s)" % (dataset.name,dataset.status))
- # time information
- dataset.modificationdate = datetime.datetime.utcnow()
- # update
- varMap = dataset.valuesMap()
- varMap[':vuid'] = dataset.vuid
- for cKey in criteriaMap.keys():
- varMap[cKey] = criteriaMap[cKey]
- self.cur.execute(sql1+comment, varMap)
- retU = self.cur.rowcount
- if retU != 0 and retU != 1:
- raise RuntimeError, 'Invalid return %s' % retU
- retList.append(retU)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("updateDataset() ret:%s" % retList)
- return retList
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("updateDataset() : %s %s" % (type,value))
- return []
-
-
- # delete dataset
- def deleteDataset(self,name):
- comment = ' /* DBProxy.deleteDataset */'
- sql1 = "DELETE /*+ INDEX(tab DATASETS_NAME_IDX)*/ FROM ATLAS_PANDA.Datasets tab WHERE name=:name"
- try:
- # start transaction
- self.conn.begin()
- # delete
- varMap = {}
- varMap[':name'] = name
- self.cur.execute(sql1+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("deleteDataset() : %s %s" % (type,value))
- return False
-
-
- # get serial number for dataset, insert dummy datasets to increment SN
- def getSerialNumber(self,datasetname,definedFreshFlag=None):
- comment = ' /* DBProxy.getSerialNumber */'
- try:
- _logger.debug("getSerialNumber(%s,%s)" % (datasetname,definedFreshFlag))
- # start transaction
- self.conn.begin()
- # check freshness
- if definedFreshFlag == None:
- # select
- varMap = {}
- varMap[':name'] = datasetname
- varMap[':type'] = 'output'
- sql = "SELECT /*+ INDEX_RS_ASC(TAB (DATASETS.NAME)) */ COUNT(*) FROM ATLAS_PANDA.Datasets tab WHERE type=:type AND name=:name"
- self.cur.arraysize = 100
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchone()
- # fresh dataset or not
- if res != None and len(res) != 0 and res[0] > 0:
- freshFlag = False
- else:
- freshFlag = True
- else:
- # use predefined flag
- freshFlag = definedFreshFlag
- # get serial number
- sql = "SELECT ATLAS_PANDA.SUBCOUNTER_SUBID_SEQ.nextval FROM dual";
- self.cur.arraysize = 100
- self.cur.execute(sql+comment, {})
- sn, = self.cur.fetchone()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return serial number and fresh flag
- _logger.debug("getSerialNumber : %s %s" % (sn,freshFlag))
- return (sn,freshFlag)
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getSerialNumber() : %s %s" % (type,value))
- return (-1,False)
-
-
- # get serial number for group job
- def getSerialNumberForGroupJob(self,name):
- comment = ' /* DBProxy.getSerialNumberForGroupJob */'
- retVal = {'sn':'','status':False}
- try:
- _logger.debug("getSerialNumberForGroupJob(%s)" % name)
- # start transaction
- self.conn.begin()
- # get serial number
- sql = "SELECT ATLAS_PANDA.GROUP_JOBID_SEQ.nextval FROM dual";
- self.cur.execute(sql+comment, {})
- sn, = self.cur.fetchone()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- retVal['sn'] = sn
- retVal['status'] = True
- _logger.debug("getSerialNumberForGroupJob : %s %s" % (name,str(retVal)))
- return retVal
- except:
- # roll back
- self._rollback()
- # error
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getSerialNumberForGroupJob : %s %s" % (errtype,errvalue))
- retVal['status'] = False
- return retVal
-
-
- # change job priorities
- def changeJobPriorities(self,newPrioMap):
- comment = ' /* DBProxy.changeJobPriorities */'
- try:
- _logger.debug("changeJobPriorities start")
- sql = "UPDATE %s SET currentPriority=:currentPriority,assignedPriority=:assignedPriority "
- sql += "WHERE PandaID=:PandaID"
- # loop over all PandaIDs
- for pandaID,newPrio in newPrioMap.iteritems():
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':currentPriority'] = newPrio
- varMap[':assignedPriority'] = newPrio
- _logger.debug("changeJobPriorities PandaID=%s -> prio=%s" % (pandaID,newPrio))
- # start transaction
- self.conn.begin()
- # try active tables
- retU = None
- for tableName in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsWaiting4']:
- # execute
- self.cur.execute((sql % tableName)+comment,varMap)
- retU = self.cur.rowcount
- if retU > 0:
- break
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("changeJobPriorities PandaID=%s retU=%s" % (pandaID,retU))
- # return
- _logger.debug("changeJobPriorities done")
- return True,''
- except:
- # roll back
- self._rollback()
- # error
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("changeJobPriorities : %s %s" % (errtype,errvalue))
- return False,'database error'
-
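changeJobPriorities takes a map keyed by PandaID; each value is written to both currentPriority and assignedPriority, trying the active, defined and waiting tables in turn. A minimal sketch with made-up IDs, assuming a DBProxy instance named proxy:

ok, msg = proxy.changeJobPriorities({4262881234: 4000, 4262881235: 3500})
if not ok:
    print("priority change failed: %s" % msg)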
-
- # update transfer status for a dataset
- def updateTransferStatus(self,datasetname,bitMap):
- comment = ' /* DBProxy.updateTransferStatus */'
- try:
- _logger.debug("updateTransferStatus(%s,%s)" % (datasetname,hex(bitMap)))
- # start transaction
- self.conn.begin()
- retTransSt = 0
- # update bitmap
- sqlU = 'UPDATE /*+ INDEX_RS_ASC(TAB("DATASETS"."NAME")) */ ATLAS_PANDA.Datasets tab SET transferStatus=ATLAS_PANDA.BITOR(transferStatus,:bitMap) WHERE name=:name'
- varMap = {}
- varMap[':bitMap'] = bitMap
- varMap[':name'] = datasetname
- retU = self.cur.execute(sqlU+comment, varMap)
- # get transferStatus
- sqlS = 'SELECT /*+ INDEX_RS_ASC(TAB("DATASETS"."NAME")) */ transferStatus FROM ATLAS_PANDA.Datasets tab WHERE name=:name'
- varMap = {}
- varMap[':name'] = datasetname
- self.cur.arraysize = 10
- retS = self.cur.execute(sqlS+comment, varMap)
- resS = self.cur.fetchall()
- if resS != None and len(resS) != 0:
- retTransSt = resS[0][0]
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("updateTransferStatus : %s" % hex(retTransSt))
- return retTransSt
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("updateTransferStatus : %s %s" % (type,value))
- return 0
-
-
- # get CloudTask. If not exist, create it
- def getCloudTask(self,tid):
- comment = ' /* getCloudTask */'
- try:
- _logger.debug("getCloudTask(%s)" % tid)
- # check tid
- if tid in [None,'NULL']:
- _logger.error("invalid TID : %s" % tid)
- return None
- # start transaction
- self.conn.begin()
- # get CloudTask
- sql = "SELECT %s FROM ATLAS_PANDA.cloudtasks " % CloudTaskSpec.columnNames()
- sql += "WHERE taskid=:taskid"
- varMap = {}
- varMap[':taskid'] = tid
- # select
- self.cur.arraysize = 10
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # already exist
- if res != None and len(res) != 0:
- # instantiate CloudTask
- cloudTask = CloudTaskSpec()
- cloudTask.pack(res[0])
- # update tmod if status is defined
- if cloudTask.status == 'defined':
- sql = "UPDATE ATLAS_PANDA.cloudtasks SET tmod=CURRENT_DATE WHERE taskid=:taskid"
- varMap = {}
- varMap[':taskid'] = cloudTask.taskid
- self.cur.execute(sql+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return cloudTask
- # insert new CloudTask
- _logger.debug("insert new CloudTask")
- cloudTask = CloudTaskSpec()
- cloudTask.taskid = tid
- cloudTask.status = 'defined'
- sql = "INSERT INTO ATLAS_PANDA.cloudtasks (id,taskid,status,tmod,tenter) VALUES(ATLAS_PANDA.CLOUDTASKS_ID_SEQ.nextval,:taskid,:status,CURRENT_DATE,CURRENT_DATE)"
- sql+= " RETURNING id INTO :newID"
- varMap = {}
- varMap[':taskid'] = cloudTask.taskid
- varMap[':status'] = cloudTask.status
- varMap[':newID'] = self.cur.var(cx_Oracle.NUMBER)
- self.cur.execute(sql+comment, varMap)
- # get id
- cloudTask.id = long(varMap[':newID'].getvalue())
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("return new CloudTask")
- return cloudTask
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getCloudTask() : %s %s" % (type,value))
- return None
-
-
- # set cloud to CloudTask
- def setCloudTask(self,cloudTask):
- comment = ' /* setCloudTask */'
- try:
- _logger.debug("setCloudTask(id=%s,taskid=%s)" % (cloudTask.id,cloudTask.taskid))
- sql = "UPDATE ATLAS_PANDA.cloudtasks SET cloud=:cloud,status=:newStatus,tmod=CURRENT_DATE WHERE id=:id AND status=:oldStatus"
- # start transaction
- self.conn.begin()
- # update
- varMap = {}
- varMap[':cloud'] = cloudTask.cloud
- varMap[':id'] = cloudTask.id
- varMap[':newStatus'] = 'assigned'
- varMap[':oldStatus'] = 'defined'
- self.cur.execute(sql+comment, varMap)
- retU = self.cur.rowcount
- # succeeded
- if retU == 1:
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return cloudTask
- # read if it is already set by another thread
- sql = "SELECT %s FROM ATLAS_PANDA.cloudtasks " % CloudTaskSpec.columnNames()
- sql += "WHERE id=:id"
- varMap = {}
- varMap[':id'] = cloudTask.id
- # select
- self.cur.arraysize = 10
- retS = self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return CloudTask
- if res != None and len(res) != 0:
- # instantiate CloudTask
- cloudTask = CloudTaskSpec()
- cloudTask.pack(res[0])
- return cloudTask
- _logger.error("setCloudTask() : cannot find CloudTask for %s" % cloudTask.id)
- return None
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("setCloudTask() : %s %s" % (type,value))
- return None
-
-
- # see CloudTask
- def seeCloudTask(self,tid):
- comment = ' /* seeCloudTask */'
- try:
- _logger.debug("seeCloudTask(%s)" % tid)
- # check tid
- if tid in [None,'NULL']:
- _logger.error("invalid TID : %s" % tid)
- return None
- # start transaction
- self.conn.begin()
- # select
- sql = "SELECT cloud FROM ATLAS_PANDA.cloudtasks WHERE taskid=:taskid"
- varMap = {}
- varMap[':taskid'] = tid
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # existing task
- if res != None and len(res) != 0:
- # return cloud
- return res[0][0]
- else:
- return None
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("seeCloudTask() : %s %s" % (type,value))
- return None
-
-
- # reset modification time of a task to shorten retry interval
- def resetTmodCloudTask(self,tid):
- comment = ' /* resetTmodCloudTask */'
- try:
- _logger.debug("resetTmodCloudTask %s" % tid)
- # check tid
- if tid in [None,'NULL']:
- _logger.error("invalid TID : %s" % tid)
- return None
- # start transaction
- self.conn.begin()
- # update
- sql = "UPDATE ATLAS_PANDA.cloudtasks SET tmod=:tmod WHERE taskid=:taskid"
- varMap = {}
- varMap[':taskid'] = tid
- varMap[':tmod'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=165)
- self.cur.execute(sql+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("resetTmodCloudTask : %s %s" % (type,value))
- return False
-
-
- # get assigning task
- def getAssigningTask(self):
- comment = ' /* getAssigningTask */'
- try:
- _logger.debug("getAssigningTask")
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
- # start transaction
- self.conn.begin()
- # select
- sql = "SELECT taskid FROM ATLAS_PANDA.cloudtasks WHERE status=:status AND tmod>:tmod"
- varMap = {}
- varMap[':tmod'] = timeLimit
- varMap[':status'] = 'defined'
- self.cur.arraysize = 100
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # loop over all taskid
- retList = []
- if res != None:
- for tid, in res:
- retList.append(tid)
- # return
- _logger.debug("getAssigningTask ret:%s" % retList)
- return retList
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getAssigningTask : %s %s" % (type,value))
- return []
-
-
- # set CloudTask by user
- def setCloudTaskByUser(self,user,tid,cloud,status):
- comment = ' /* setCloudTaskByUser */'
- try:
- _logger.debug("setCloudTaskByUser(tid=%s,cloud=%s,status=%s) by %s" % (tid,cloud,status,user))
- # check tid
- if tid in [None,'NULL']:
- tmpMsg = "invalid TID : %s" % tid
- _logger.error(tmpMsg)
- return "ERROR: " + tmpMsg
- # check status
- statusList = ['tobeaborted']
- if not status in statusList:
- tmpMsg = "invalid status=%s. Must be one of %s" (status,str(statusList))
- _logger.error(tmpMsg)
- return "ERROR: " + tmpMsg
- # start transaction
- self.conn.begin()
- # get CloudTask
- sql = "SELECT %s FROM ATLAS_PANDA.cloudtasks " % CloudTaskSpec.columnNames()
- sql += "WHERE taskid=:taskid"
- varMap = {}
- varMap[':taskid'] = tid
- # select
- self.cur.arraysize = 10
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # already exist
- if res != None and len(res) != 0:
- # set status
- sql = "UPDATE ATLAS_PANDA.cloudtasks SET status=:status,tmod=CURRENT_DATE WHERE taskid=:taskid"
- varMap = {}
- varMap[':taskid'] = tid
- varMap[':status'] = status
- self.cur.execute(sql+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return "SUCCEEDED"
- # insert new CloudTask
- sql = "INSERT INTO ATLAS_PANDA.cloudtasks (id,taskid,status,tmod,tenter) VALUES(ATLAS_PANDA.CLOUDTASKS_ID_SEQ.nextval,:taskid,:status,CURRENT_DATE,CURRENT_DATE)"
- varMap = {}
- varMap[':taskid'] = tid
- varMap[':status'] = status
- self.cur.execute(sql+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return "SUCCEEDED"
- except:
- # roll back
- self._rollback()
- # error
- errType,errValue = sys.exc_info()[:2]
- _logger.error("setCloudTaskByUser() : %s %s" % (errType,errValue))
- return "ERROR: database error"
-
-
- # query files with map
- def queryFilesWithMap(self,map):
- comment = ' /* DBProxy.queryFilesWithMap */'
- _logger.debug("queryFilesWithMap()")
- sql1 = "SELECT PandaID,%s FROM ATLAS_PANDA.filesTable4" % FileSpec.columnNames()
- varMap = {}
- for key in map.keys():
- if len(varMap)==0:
- sql1+= " WHERE %s=:%s" % (key,key)
- else:
- sql1+= " AND %s=:%s" % (key,key)
- varMap[':%s' % key] = map[key]
- nTry=3
- for iTry in range(nTry):
- try:
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- self.cur.execute(sql1+comment, varMap)
- res = self.cur.fetchall()
- _logger.debug("queryFilesWithMap() : %s" % str(res))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # instantiate files
- retList = []
- for item in res:
- # instantiate dummy JobSpec obj for PandaID
- job = JobSpec()
- job.PandaID = item[0]
- # instantiate file
- file = FileSpec()
- file.pack(item[1:])
- # set owner
- file.setOwner(job)
- # append
- retList.append(file)
- return retList
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("queryFilesWithMap retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("queryFilesWithMap : %s %s" % (type,value))
- return []
-
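queryFilesWithMap and countFilesWithMap share the same convention for turning a column map into SQL: the first key opens the WHERE clause, later keys are ANDed, and the bind map is keyed by ':column'. The standalone helper below restates that logic as a sketch (the dataset name is made up):

def build_where(sql, column_map):
    # Append "WHERE col=:col" / "AND col=:col" for each key and build the bind map.
    varMap = {}
    for key in column_map.keys():
        if len(varMap) == 0:
            sql += " WHERE %s=:%s" % (key, key)
        else:
            sql += " AND %s=:%s" % (key, key)
        varMap[':%s' % key] = column_map[key]
    return sql, varMap

sql, varMap = build_where("SELECT COUNT(*) FROM ATLAS_PANDA.filesTable4 tab",
                          {'destinationDBlock': 'user.someuser.sub01234', 'status': 'ready'})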
-
- # count the number of files with map
- def countFilesWithMap(self,map):
- comment = ' /* DBProxy.countFilesWithMap */'
- sql1 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ COUNT(*) FROM ATLAS_PANDA.filesTable4 tab"
- varMap = {}
- for key in map.keys():
- if len(varMap)==0:
- sql1+= " WHERE %s=:%s" % (key,key)
- else:
- sql1+= " AND %s=:%s" % (key,key)
- varMap[':%s' % key] = map[key]
- nTry=3
- for iTry in range(nTry):
- try:
- # start transaction
- self.conn.begin()
- # select
- _logger.debug("countFilesWithMap() : %s %s" % (sql1,str(map)))
- self.cur.arraysize = 10
- retS = self.cur.execute(sql1+comment, varMap)
- res = self.cur.fetchone()
- _logger.debug("countFilesWithMap() : %s %s" % (retS,str(res)))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- nFiles=0
- if res != None:
- nFiles=res[0]
- return nFiles
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("countFilesWithMap() retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("countFilesWithMap(%s) : %s %s" % (map,type,value))
- return -1
-
-
- # count the number of pending files
- def countPendingFiles(self,pandaID,forInput=True):
- comment = ' /* DBProxy.countPendingFiles */'
- varMap = {}
- varMap[':pandaID'] = pandaID
- varMap[':status'] = 'ready'
- if forInput:
- sql1 = "SELECT COUNT(*) FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:pandaID AND type=:type AND status<>:status "
- varMap[':type'] = 'input'
- else:
- sql1 = "SELECT COUNT(*) FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:pandaID AND type IN (:type1,:type2) AND status<>:status "
- varMap[':type1'] = 'output'
- varMap[':type2'] = 'log'
- try:
- # start transaction
- self.conn.begin()
- # select
- _logger.debug("countPendingFiles : %s start" % pandaID)
- self.cur.arraysize = 10
- retS = self.cur.execute(sql1+comment, varMap)
- res = self.cur.fetchone()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- nFiles = -1
- if res != None:
- nFiles=res[0]
- _logger.debug("countPendingFiles : %s -> %s" % (pandaID,nFiles))
- return nFiles
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("countPendingFiles : %s : %s %s" % (pandaID,errType,errValue))
- return -1
-
-
- # get datasets associated with file
- def getDatasetWithFile(self,lfn,jobPrioity=0):
- comment = ' /* DBProxy.getDatasetWithFile */'
- varMap = {}
- varMap[':lfn'] = lfn
- varMap[':status1'] = 'pending'
- varMap[':status2'] = 'transferring'
- sql1 = "SELECT PandaID,status,destinationDBlock,destinationDBlockToken,dispatchDBlock FROM ATLAS_PANDA.filesTable4 "
- sql1 += "WHERE lfn=:lfn AND status IN (:status1,:status2) AND modificationTime=:currentPriority '
- varMap[':currentPriority'] = jobPrioity
- self.cur.execute(sqlP+comment, varMap)
- resP = self.cur.fetchone()
- # append
- if resP != None and resP[1] in ['managed','test']:
- retMap[dsName] = (resP[0],dsToken)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("getDatasetWithFile : %s -> %s" % (lfn,str(retMap)))
- return retMap
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getDatasetWithFile : %s : %s %s" % (lfn,errType,errValue))
- return {}
-
-
- # get input files currently in use for analysis
- def getFilesInUseForAnal(self,outDataset):
- comment = ' /* DBProxy.getFilesInUseForAnal */'
- sqlSub = "SELECT destinationDBlock,PandaID FROM ATLAS_PANDA.filesTable4 "
- sqlSub += "WHERE dataset=:dataset AND type IN (:type1,:type2) GROUP BY destinationDBlock,PandaID"
- sqlPaA = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsDefined4 "
- sqlPaA += "WHERE PandaID=:PandaID "
- sqlPaA += "UNION "
- sqlPaA += "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsActive4 "
- sqlPaA += "WHERE PandaID=:PandaID "
- sqlPan = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsArchived4 "
- sqlPan += "WHERE PandaID=:PandaID AND modificationTime<=CURRENT_DATE "
- sqlPan += "UNION "
- sqlPan += "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDAARCH.jobsArchived "
- sqlPan += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)"
- sqlIdA = "SELECT PandaID,jobStatus FROM ATLAS_PANDA.jobsArchived4 "
- sqlIdA += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sqlIdA += "AND prodSourceLabel=:prodSourceLabel1 "
- sqlIdL = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ "
- sqlIdL += "PandaID,jobStatus FROM ATLAS_PANDAARCH.jobsArchived tab "
- sqlIdL += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sqlIdL += "AND prodSourceLabel=:prodSourceLabel1 AND modificationTime>(CURRENT_DATE-30) "
- sqlDis = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 "
- sqlDis += "WHERE PandaID=:PandaID AND type=:type AND dispatchDBlock IS NOT NULL AND modificationTime <= CURRENT_DATE"
- sqlLfn = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ lfn,PandaID FROM ATLAS_PANDA.filesTable4 tab "
- sqlLfn += "WHERE dispatchDBlock=:dispatchDBlock AND type=:type "
- sqlLfn += "AND (destinationDBlockToken IS NULL OR destinationDBlockToken<>:noshadow) AND modificationTime<=CURRENT_DATE"
- nTry=3
- for iTry in range(nTry):
- inputFilesList = []
- try:
- # start transaction
- self.conn.begin()
- # get sub datasets
- varMap = {}
- varMap[':dataset'] = outDataset
- varMap[':type1'] = 'output'
- varMap[':type2'] = 'log'
- _logger.debug("getFilesInUseForAnal : %s %s" % (sqlSub,str(varMap)))
- self.cur.arraysize = 100000
- retS = self.cur.execute(sqlSub+comment, varMap)
- res = self.cur.fetchall()
- subDSpandaIDmap = {}
- checkedPandaIDs = {}
- for subDataset,pandaID in res:
- # avoid redundant lookup
- if checkedPandaIDs.has_key(pandaID):
- continue
- if subDSpandaIDmap.has_key(subDataset):
- # append jobs as running since they are not in archived tables
- if not pandaID in subDSpandaIDmap[subDataset]:
- checkedPandaIDs[pandaID] = 'running'
- subDSpandaIDmap[subDataset].append(pandaID)
- continue
- # look for jobdefID and userName
- varMap = {}
- varMap[':PandaID'] = pandaID
- _logger.debug("getFilesInUseForAnal : %s %s" % (sqlPaA,str(varMap)))
- retP = self.cur.execute(sqlPaA+comment, varMap)
- resP = self.cur.fetchall()
- if len(resP) != 0:
- jobDefinitionID,prodUserName = resP[0]
- else:
- _logger.debug("getFilesInUseForAnal : %s %s" % (sqlPan,str(varMap)))
- retP = self.cur.execute(sqlPan+comment, varMap)
- resP = self.cur.fetchall()
- if len(resP) != 0:
- jobDefinitionID,prodUserName = resP[0]
- else:
- continue
- # get PandaIDs with jobdefID and userName
- tmpPandaIDs = []
- varMap = {}
- varMap[':prodUserName'] = prodUserName
- varMap[':jobDefinitionID'] = jobDefinitionID
- varMap[':prodSourceLabel1'] = 'user'
- _logger.debug("getFilesInUseForAnal : %s %s" % (sqlIdA,str(varMap)))
- retID = self.cur.execute(sqlIdA+comment, varMap)
- resID = self.cur.fetchall()
- for tmpPandaID,tmpJobStatus in resID:
- checkedPandaIDs[tmpPandaID] = tmpJobStatus
- tmpPandaIDs.append(tmpPandaID)
- _logger.debug("getFilesInUseForAnal : %s %s" % (sqlIdL,str(varMap)))
- retID = self.cur.execute(sqlIdL+comment, varMap)
- resID = self.cur.fetchall()
- for tmpPandaID,tmpJobStatus in resID:
- if not tmpPandaID in tmpPandaIDs:
- checkedPandaIDs[tmpPandaID] = tmpJobStatus
- tmpPandaIDs.append(tmpPandaID)
- # append
- if not subDSpandaIDmap.has_key(subDataset):
- subDSpandaIDmap[subDataset] = []
- for tmpPandaID in tmpPandaIDs:
- # reuse failed files if jobs are in Archived since they cannot change back to active
- if checkedPandaIDs[tmpPandaID] in ['failed','cancelled']:
- continue
- # collect PandaIDs
- subDSpandaIDmap[subDataset].append(tmpPandaID)
- # loop over all sub datasets
- for subDataset,activePandaIDs in subDSpandaIDmap.iteritems():
- # skip empty
- if activePandaIDs == []:
- continue
- # get dispatchDBlocks
- pandaID = activePandaIDs[0]
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':type'] = 'input'
- _logger.debug("getFilesInUseForAnal : %s %s" % (sqlDis,str(varMap)))
- self.cur.arraysize = 10000
- retD = self.cur.execute(sqlDis+comment, varMap)
- resD = self.cur.fetchall()
- # get LFNs
- for disDataset, in resD:
- # use new style only
- if not disDataset.startswith('user_disp.'):
- continue
- varMap = {}
- varMap[':dispatchDBlock'] = disDataset
- varMap[':type'] = 'input'
- varMap[':noshadow'] = 'noshadow'
- _logger.debug("getFilesInUseForAnal : %s %s" % (sqlLfn,str(varMap)))
- self.cur.arraysize = 100000
- retL = self.cur.execute(sqlLfn+comment, varMap)
- resL = self.cur.fetchall()
- # append
- for lfn,filePandaID in resL:
- # skip files used by archived failed or cancelled jobs
- if filePandaID in activePandaIDs and not lfn in inputFilesList:
- inputFilesList.append(lfn)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("getFilesInUseForAnal : %s" % len(inputFilesList))
- return inputFilesList
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("inputFilesList retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("inputFilesList(%s) : %s %s" % (outDataset,type,value))
- return []
-
-
- # get list of dis dataset to get input files in shadow
- def getDisInUseForAnal(self,outDataset):
- comment = ' /* DBProxy.getDisInUseForAnal */'
- sqlSub = "SELECT destinationDBlock,PandaID,status FROM ATLAS_PANDA.filesTable4 "
- sqlSub += "WHERE dataset=:dataset AND type=:type1 GROUP BY destinationDBlock,PandaID,status"
- sqlPaA = "SELECT jobStatus FROM ATLAS_PANDA.jobsDefined4 "
- sqlPaA += "WHERE PandaID=:PandaID "
- sqlPaA += "UNION "
- sqlPaA += "SELECT jobStatus FROM ATLAS_PANDA.jobsActive4 "
- sqlPaA += "WHERE PandaID=:PandaID "
- sqlPan = "SELECT jobStatus FROM ATLAS_PANDA.jobsArchived4 "
- sqlPan += "WHERE PandaID=:PandaID AND modificationTime<=CURRENT_DATE "
- sqlPan += "UNION "
- sqlPan += "SELECT jobStatus FROM ATLAS_PANDAARCH.jobsArchived "
- sqlPan += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)"
- sqlDis = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 "
- sqlDis += "WHERE PandaID=:PandaID AND type=:type AND dispatchDBlock IS NOT NULL AND modificationTime <= CURRENT_DATE"
- inputDisList = []
- try:
- timeStart = datetime.datetime.utcnow()
- _logger.debug("getDisInUseForAnal start for %s" % outDataset)
- # start transaction
- self.conn.begin()
- # get sub datasets
- varMap = {}
- varMap[':dataset'] = outDataset
- varMap[':type1'] = 'log'
- _logger.debug("getDisInUseForAnal : %s %s" % (sqlSub,str(varMap)))
- self.cur.arraysize = 100000
- retS = self.cur.execute(sqlSub+comment, varMap)
- res = self.cur.fetchall()
- subDSpandaIDmap = {}
- checkedPandaIDs = {}
- for subDataset,pandaID,fileStatus in res:
- # add map
- if not subDSpandaIDmap.has_key(subDataset):
- subDSpandaIDmap[subDataset] = []
- # check job status
- if fileStatus != 'ready':
- varMap = {}
- varMap[':PandaID'] = pandaID
- _logger.debug("getDisInUseForAnal : %s %s" % (sqlPaA,str(varMap)))
- retP = self.cur.execute(sqlPaA+comment, varMap)
- resP = self.cur.fetchall()
- if len(resP) != 0:
- # append jobs as running since they are not in archived tables yet
- checkedPandaIDs[pandaID] = 'running'
- subDSpandaIDmap[subDataset].append(pandaID)
- else:
- _logger.debug("getDisInUseForAnal : %s %s" % (sqlPan,str(varMap)))
- retP = self.cur.execute(sqlPan+comment, varMap)
- resP = self.cur.fetchall()
- if len(resP) != 0:
- checkedPandaIDs[pandaID], = resP[0]
- # reuse failed files if jobs are in Archived since they cannot change back to active
- if checkedPandaIDs[pandaID] in ['failed','cancelled']:
- continue
- # collect PandaIDs
- subDSpandaIDmap[subDataset].append(pandaID)
- else:
- # not found
- continue
- else:
- # no job lookup since file was successfully finished
- checkedPandaIDs[pandaID] = 'finished'
- # collect PandaIDs
- subDSpandaIDmap[subDataset].append(pandaID)
- # loop over all sub datasets
- for subDataset,activePandaIDs in subDSpandaIDmap.iteritems():
- # skip empty
- if activePandaIDs == []:
- continue
- resDisList = []
- # get dispatchDBlocks
- pandaID = activePandaIDs[0]
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':type'] = 'input'
- _logger.debug("getDisInUseForAnal : %s %s" % (sqlDis,str(varMap)))
- self.cur.arraysize = 10000
- retD = self.cur.execute(sqlDis+comment, varMap)
- resD = self.cur.fetchall()
- # get shadow dis
- for disDataset, in resD:
- # use new style only
- if not disDataset.startswith('user_disp.'):
- continue
- if not disDataset in resDisList:
- resDisList.append(disDataset)
- # append
- if resDisList != []:
- inputDisList.append((resDisList,activePandaIDs))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- timeDelta = datetime.datetime.utcnow()-timeStart
- _logger.debug("getDisInUseForAnal end for %s len=%s time=%ssec" % (outDataset,len(inputDisList),timeDelta.seconds))
- return inputDisList
- except:
- # roll back
- self._rollback()
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getDisInUseForAnal(%s) : %s %s" % (outDataset,errtype,errvalue))
- return None
-
-
- # get input LFNs currently in use for analysis with shadow dis
- def getLFNsInUseForAnal(self,inputDisList):
- comment = ' /* DBProxy.getLFNsInUseForAnal */'
- sqlLfn = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ lfn,PandaID FROM ATLAS_PANDA.filesTable4 tab "
- sqlLfn += "WHERE dispatchDBlock=:dispatchDBlock AND type=:type "
- sqlLfn += "AND (destinationDBlockToken IS NULL OR destinationDBlockToken<>:noshadow) AND modificationTime<=CURRENT_DATE"
- inputFilesList = []
- try:
- token = datetime.datetime.utcnow().isoformat('/')
- # loop over all shadow dis datasets
- pandaIdLfnMap = {}
- for disDatasetList,activePandaIDs in inputDisList:
- for disDataset in disDatasetList:
- # use new style only
- if not disDataset.startswith('user_disp.'):
- continue
- # read LFNs and PandaIDs
- if not pandaIdLfnMap.has_key(disDataset):
- # start transaction
- self.conn.begin()
- varMap = {}
- varMap[':dispatchDBlock'] = disDataset
- varMap[':type'] = 'input'
- varMap[':noshadow'] = 'noshadow'
- _logger.debug("getLFNsInUseForAnal : <%s> %s %s" % (token,sqlLfn,str(varMap)))
- timeStart = datetime.datetime.utcnow()
- self.cur.arraysize = 100000
- retL = self.cur.execute(sqlLfn+comment, varMap)
- resL = self.cur.fetchall()
- # commit
- timeDelta = datetime.datetime.utcnow()-timeStart
- _logger.debug("getLFNsInUseForAnal : <%s> %s time=%ssec commit" % (token,disDataset,timeDelta.seconds))
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # make map
- pandaIdLfnMap[disDataset] = {}
- for lfn,filePandaID in resL:
- if not pandaIdLfnMap[disDataset].has_key(filePandaID):
- pandaIdLfnMap[disDataset][filePandaID] = []
- pandaIdLfnMap[disDataset][filePandaID].append(lfn)
- _logger.debug("getLFNsInUseForAnal : <%s> %s map made with len=%s" % \
- (token,disDataset,len(resL)))
- # append
- for disDataset in disDatasetList:
- _logger.debug("getLFNsInUseForAnal : <%s> %s list making pandaIDs=%s fileLen=%s" % \
- (token,disDataset,len(activePandaIDs),len(inputFilesList)))
- for activePandaID in activePandaIDs:
- # skip files used by archived failed or cancelled jobs
- if pandaIdLfnMap[disDataset].has_key(activePandaID):
- inputFilesList += pandaIdLfnMap[disDataset][activePandaID]
- _logger.debug("getLFNsInUseForAnal : <%s> %s done" % (token,disDataset))
- _logger.debug("getLFNsInUseForAnal : <%s> %s" % (token,len(inputFilesList)))
- return inputFilesList
- except:
- # roll back
- self._rollback()
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getLFNsInUseForAnal(%s) : %s %s" % (str(inputDisList),errtype,errvalue))
- return None
-
-
- # update input files and return corresponding PandaIDs
- def updateInFilesReturnPandaIDs(self,dataset,status,fileLFN=''):
- comment = ' /* DBProxy.updateInFilesReturnPandaIDs */'
- _logger.debug("updateInFilesReturnPandaIDs(%s,%s)" % (dataset,fileLFN))
- sql0 = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ row_ID,PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE status<>:status AND dispatchDBlock=:dispatchDBlock"
- sql1 = "UPDATE /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ ATLAS_PANDA.filesTable4 tab SET status=:status WHERE status<>:status AND dispatchDBlock=:dispatchDBlock"
- varMap = {}
- varMap[':status'] = status
- varMap[':dispatchDBlock'] = dataset
- if fileLFN != '':
- sql0 += " AND lfn=:lfn"
- sql1 += " AND lfn=:lfn"
- varMap[':lfn'] = fileLFN
- for iTry in range(self.nTry):
- try:
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- retS = self.cur.execute(sql0+comment, varMap)
- resS = self.cur.fetchall()
- # update
- retU = self.cur.execute(sql1+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # collect PandaIDs
- retList = []
- for tmpRowID,tmpPandaID in resS:
- # append
- if not tmpPandaID in retList:
- retList.append(tmpPandaID)
- # return
- _logger.debug("updateInFilesReturnPandaIDs : %s" % str(retList))
- return retList
- except:
- # roll back
- self._rollback()
- # error report
- if iTry+1 < self.nTry:
- _logger.debug("updateInFilesReturnPandaIDs retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("updateInFilesReturnPandaIDs : %s %s" % (type, value))
- return []
-
-
- # update file status in dispatch dataset
- def updateFileStatusInDisp(self,dataset,fileStatusMap):
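- # fileStatusMap maps a status string to the list of LFNs to be set to that status,
- # e.g. {'ready':[...],'failed':[...]}; returns True on success, False on error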
- comment = ' /* DBProxy.updateFileStatusInDisp */'
- _logger.debug("updateFileStatusInDisp(%s,%s)" % (dataset,fileStatusMap))
- sql1 = "UPDATE /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ ATLAS_PANDA.filesTable4 tab SET status=:status WHERE dispatchDBlock=:dispatchDBlock AND lfn=:lfn"
- nTry = 1
- for iTry in range(nTry):
- try:
- # start transaction
- self.conn.begin()
- # update
- for status,lfns in fileStatusMap.iteritems():
- varMap = {}
- varMap[':status'] = status
- varMap[':dispatchDBlock'] = dataset
- # loop over all files
- for lfn in lfns:
- varMap[':lfn'] = lfn
- # update
- retU = self.cur.execute(sql1+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- _logger.debug("updateFileStatusInDisp : done")
- return True
- except:
- # roll back
- self._rollback()
- # error report
- if iTry+1 < nTry:
- _logger.debug("updateFileStatusInDisp retry : %s" % iTry)
- time.sleep(random.randint(5,10))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("updateFileStatusInDisp : %s %s" % (type, value))
- return False
-
-
- # update output files and return corresponding PandaIDs
- def updateOutFilesReturnPandaIDs(self,dataset,fileLFN=''):
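- # flip output files of a destination dataset (optionally a single LFN) from
- # 'transferring' to 'ready' and return the distinct PandaIDs that owned them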
- comment = ' /* DBProxy.updateOutFilesReturnPandaIDs */'
- _logger.debug("updateOutFilesReturnPandaIDs(%s,%s)" % (dataset,fileLFN))
- sql0 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ row_ID,PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status=:status"
- sql1 = "UPDATE /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ ATLAS_PANDA.filesTable4 tab SET status='ready' WHERE destinationDBlock=:destinationDBlock AND status=:status"
- varMap = {}
- varMap[':status'] = 'transferring'
- varMap[':destinationDBlock'] = dataset
- if fileLFN != '':
- sql0 += " AND lfn=:lfn"
- sql1 += " AND lfn=:lfn"
- varMap[':lfn'] = fileLFN
- for iTry in range(self.nTry):
- try:
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- retS = self.cur.execute(sql0+comment, varMap)
- resS = self.cur.fetchall()
- # update
- retList = []
- retU = self.cur.execute(sql1+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # collect PandaIDs
- retList = []
- for tmpRowID,tmpPandaID in resS:
- # append
- if not tmpPandaID in retList:
- retList.append(tmpPandaID)
- # return
- _logger.debug("updateOutFilesReturnPandaIDs : %s" % str(retList))
- return retList
- except:
- # roll back
- self._rollback()
- # error report
- if iTry+1 < self.nTry:
- _logger.debug("updateOutFilesReturnPandaIDs retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("updateOutFilesReturnPandaIDs : %s %s" % (type, value))
- return []
-
-
- # get _dis datasets associated to _sub
- def getAssociatedDisDatasets(self,subDsName):
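- # resolve the PandaIDs writing to the given _sub dataset, then collect the
- # distinct input dispatchDBlock (_dis) names used by those jobs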
- comment = ' /* DBProxy.getAssociatedDisDatasets */'
- _logger.debug("getAssociatedDisDatasets(%s)" % subDsName)
- sqlF = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ distinct PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock"
- sqlJ = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type"
- try:
- # start transaction
- self.conn.begin()
- # get PandaIDs
- varMap = {}
- varMap[':destinationDBlock'] = subDsName
- self.cur.arraysize = 10000
- self.cur.execute(sqlF+comment,varMap)
- resS = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # loop over all PandaIDs
- retList = []
- for pandaID, in resS:
- # start transaction
- self.conn.begin()
- # get _dis name
- varMap = {}
- varMap[':type'] = 'input'
- varMap[':PandaID'] = pandaID
- self.cur.arraysize = 1000
- self.cur.execute(sqlJ+comment,varMap)
- resD = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for disName, in resD:
- if disName != None and not disName in retList:
- retList.append(disName)
- # return
- _logger.debug("getAssociatedDisDatasets : %s" % str(retList))
- return retList
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getAssociatedDisDatasets : %s : %s %s" % (subDsName,errType,errValue))
- return []
-
-
- # set GUIDs
- def setGUIDs(self,files):
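- # files is a list of dicts with 'lfn','guid','fsize','checksum' and optionally 'scope';
- # empty or 'NULL' values are stored as SQL NULL. Returns True on success.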
- comment = ' /* DBProxy.setGUIDs */'
- _logger.debug("setGUIDs(%s)" % files)
- sql0 = "UPDATE ATLAS_PANDA.filesTable4 SET GUID=:GUID,fsize=:fsize,checksum=:checksum,scope=:scope WHERE lfn=:lfn"
- for iTry in range(self.nTry):
- try:
- # start transaction
- self.conn.begin()
- # update
- for file in files:
- varMap = {}
- varMap[':GUID'] = file['guid']
- varMap[':lfn'] = file['lfn']
- if file['checksum'] in ['','NULL']:
- varMap[':checksum'] = None
- else:
- varMap[':checksum'] = file['checksum']
- varMap[':fsize'] = file['fsize']
- if not file.has_key('scope') or file['scope'] in ['','NULL']:
- varMap[':scope'] = None
- else:
- varMap[':scope'] = file['scope']
- self.cur.execute(sql0+comment, varMap)
- retU = self.cur.rowcount
- _logger.debug("setGUIDs : retU %s" % retU)
- if retU<0:
- raise RuntimeError, 'SQL error'
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- # error report
- if iTry+1 < self.nTry:
- _logger.debug("setGUIDs retry : %s" % iTry)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("setGUIDs : %s %s" % (type, value))
- return False
-
-
- # query PandaID with Datasets
- def queryPandaIDwithDataset(self,datasets):
- comment = ' /* DBProxy.queryPandaIDwithDataset */'
- _logger.debug("queryPandaIDwithDataset(%s)" % datasets)
- if len(datasets) == 0:
- return []
- # make SQL query
- sql1 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock GROUP BY PandaID"
- # execute
- try:
- retList = []
- for dataset in datasets:
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- varMap = {}
- varMap[':destinationDBlock'] = dataset
- self.cur.execute(sql1+comment,varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # get IDs
- for r in res:
- retList.append(r[0])
- # return
- _logger.debug("queryPandaIDwithDataset : %s" % str(retList))
- return retList
- except:
- # roll back
- self._rollback()
- # error report
- type, value, traceBack = sys.exc_info()
- _logger.error("queryPandaIDwithDataset : %s %s" % (type, value))
- return []
-
-
- # query last files in datasets
- def queryLastFilesInDataset(self,datasets):
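- # for each dataset return the output LFNs of the job that produced the
- # lexicographically largest non-log LFN, ignoring merge (usermerge) jobs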
- comment = ' /* DBProxy.queryLastFilesInDataset */'
- _logger.debug("queryLastFilesInDataset(%s)" % datasets)
- if len(datasets) == 0:
- return []
- # make SQL query
- sql1 = "SELECT lfn,PandaID FROM ATLAS_PANDA.filesTable4 WHERE dataset=:dataset AND type=:type ORDER BY lfn DESC"
- sqlL = "SELECT processingType FROM %s WHERE PandaID=:PandaID "
- sqlA = "UNION SELECT processingType FROM ATLAS_PANDAARCH.jobsArchived WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)"
- sql2 = "SELECT lfn FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type"
- # execute
- try:
- retMap = {}
- for dataset in datasets:
- # start transaction
- self.conn.begin()
- # select max LFN
- varMap = {}
- varMap[':type'] = 'output'
- varMap[':dataset'] = dataset
- self.cur.arraysize = 100000
- self.cur.execute(sql1+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # found
- retList = []
- for tmpLFN,pandaID in res:
- # skip log.tgz
- if re.search('\.log\.tgz(\.\d+)*$',tmpLFN) != None:
- continue
- # start transaction
- self.conn.begin()
- self.cur.arraysize = 10
- # check processingType
- processingType = None
- for tmpTable in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']:
- varMap = {}
- varMap[':PandaID'] = pandaID
- if tmpTable == 'ATLAS_PANDA.jobsArchived4':
- self.cur.execute((sqlL % tmpTable)+sqlA+comment, varMap)
- else:
- self.cur.execute((sqlL % tmpTable)+comment, varMap)
- resP = self.cur.fetchone()
- if resP != None:
- processingType = resP[0]
- break
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # job not found
- if processingType == None:
- continue
- # ignore merge jobs
- if processingType in ['usermerge']:
- continue
- # start transaction
- self.conn.begin()
- # select LFNs
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':type'] = 'output'
- self.cur.arraysize = 1000
- self.cur.execute(sql2+comment, varMap)
- res = self.cur.fetchall()
- for r in res:
- retList.append(r[0])
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # get only the largest one
- break
- # append
- retMap[dataset] = retList
- # return
- _logger.debug("queryLastFilesInDataset : %s" % str(retMap))
- return retMap
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("queryLastFilesInDataset : %s %s" % (type, value))
- return {}
-
-
- # query PandaID with filenames
- def queryPandaIDwithLFN(self,vlfns):
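- # a trailing '.N' suffix (typically an attempt number) is stripped from each LFN
- # before the lookup; returns the list of unique matching PandaIDs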
- comment = ' /* DBProxy.queryPandaIDwithLFN */'
- _logger.debug("queryPandaIDwithLFN(%s)" % vlfns)
- if len(vlfns) == 0:
- return []
- # make SQL query
- sql1 = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE lfn=:lfn GROUP BY PandaID"
- # execute
- retList = []
- for lfn in vlfns:
- # get generic LFNs
- gLFN = re.sub('\.\d+$','',lfn)
- # try
- try:
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':lfn'] = gLFN
- self.cur.arraysize = 10000
- self.cur.execute(sql1+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append IDs
- for tmpID, in res:
- if not tmpID in retList:
- retList.append(tmpID)
- except:
- # roll back
- self._rollback()
- # error report
- type, value, traceBack = sys.exc_info()
- _logger.error("queryPandaIDwithLFN : %s %s" % (type, value))
- return []
- # return
- _logger.debug("queryPandaIDwithLFN : %s" % str(retList))
- return retList
-
-
- # get job statistics
- def getJobStatistics(self,archived=False,predefined=False,workingGroup='',countryGroup='',jobType='',forAnal=None,minPriority=None):
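- # returns {computingSite: {jobStatus: count}} summed over jobsActive4 (via the
- # MV_JOBSACTIVE4_STATS materialized view), jobsDefined4 and, when archived=True,
- # jobsArchived4 restricted to the last 12 hours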
- comment = ' /* DBProxy.getJobStatistics */'
- _logger.debug("getJobStatistics(%s,%s,'%s','%s','%s',%s,%s)" % (archived,predefined,workingGroup,countryGroup,jobType,forAnal,minPriority))
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
- sql0 = "SELECT computingSite,jobStatus,COUNT(*) FROM %s "
- # processingType
- tmpJobTypeMap = {}
- sqlJobType = ''
- useWhereInSQL = True
- if forAnal == None or jobType != "":
- useWhereInSQL = False
- elif forAnal == True:
- tmpJobTypeMap[':prodSourceLabel1'] = 'user'
- tmpJobTypeMap[':prodSourceLabel2'] = 'panda'
- sql0 += "WHERE prodSourceLabel IN ("
- sqlJobType = ":prodSourceLabel1,:prodSourceLabel2) "
- else:
- tmpJobTypeMap[':prodSourceLabel1'] = 'managed'
- sql0 += "WHERE prodSourceLabel IN ("
- sqlJobType = ":prodSourceLabel1) "
- sql0 += sqlJobType
- # predefined
- if predefined:
- if useWhereInSQL:
- sql0 += "AND relocationFlag=1 "
- else:
- sql0 += "WHERE relocationFlag=1 "
- useWhereInSQL = True
- # working group
- tmpGroupMap = {}
- sqlGroups = ''
- if workingGroup != '':
- if useWhereInSQL:
- sqlGroups += "AND workingGroup IN ("
- else:
- sqlGroups += "WHERE workingGroup IN ("
- useWhereInSQL = True
- # loop over all groups
- idxWG = 1
- for tmpWG in workingGroup.split(','):
- tmpWGkey = ':workingGroup%s' % idxWG
- sqlGroups += "%s," % tmpWGkey
- tmpGroupMap[tmpWGkey] = tmpWG
- idxWG += 1
- sqlGroups = sqlGroups[:-1] + ") "
- # country group
- if countryGroup != '':
- if useWhereInSQL:
- sqlGroups += "AND countryGroup IN ("
- else:
- sqlGroups += "WHERE countryGroup IN ("
- useWhereInSQL = True
- # loop over all groups
- idxCG = 1
- for tmpCG in countryGroup.split(','):
- tmpCGkey = ':countryGroup%s' % idxCG
- sqlGroups += "%s," % tmpCGkey
- tmpGroupMap[tmpCGkey] = tmpCG
- idxCG += 1
- sqlGroups = sqlGroups[:-1] + ") "
- sql0 += sqlGroups
- # minimum priority
- sqlPrio = ''
- tmpPrioMap = {}
- if minPriority != None:
- if useWhereInSQL:
- sqlPrio = "AND currentPriority>=:minPriority "
- else:
- sqlPrio = "WHERE currentPriority>=:minPriority "
- useWhereInSQL = True
- tmpPrioMap[':minPriority'] = minPriority
- sql0 += sqlPrio
- sql0 += "GROUP BY computingSite,jobStatus"
- sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ computingSite,jobStatus,COUNT(*) FROM ATLAS_PANDA.jobsArchived4 tab WHERE modificationTime>:modificationTime "
- if sqlJobType != "":
- sqlA += "AND prodSourceLabel IN ("
- sqlA += sqlJobType
- if predefined:
- sqlA += "AND relocationFlag=1 "
- sqlA += sqlGroups
- sqlA += sqlPrio
- sqlA += "GROUP BY computingSite,jobStatus"
- tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']
- if archived:
- tables.append('ATLAS_PANDA.jobsArchived4')
- # sql for materialized view
- sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0)
- sqlMV = re.sub(':minPriority','TRUNC(:minPriority,-1)',sqlMV)
- sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV)
- ret = {}
- nTry=3
- for iTry in range(nTry):
- try:
- for table in tables:
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- for tmpJobType in tmpJobTypeMap.keys():
- varMap[tmpJobType] = tmpJobTypeMap[tmpJobType]
- for tmpGroup in tmpGroupMap.keys():
- varMap[tmpGroup] = tmpGroupMap[tmpGroup]
- for tmpPrio in tmpPrioMap.keys():
- varMap[tmpPrio] = tmpPrioMap[tmpPrio]
- if table != 'ATLAS_PANDA.jobsArchived4':
- self.cur.arraysize = 10000
- if table == 'ATLAS_PANDA.jobsActive4':
- sqlExeTmp = (sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS'
- else:
- sqlExeTmp = (sql0+comment) % table
- _logger.debug("getJobStatistics : %s %s" % (sqlExeTmp,str(varMap)))
- self.cur.execute(sqlExeTmp, varMap)
- else:
- varMap[':modificationTime'] = timeLimit
- self.cur.arraysize = 10000
- self.cur.execute(sqlA+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for item in res:
- if not ret.has_key(item[0]):
- ret[item[0]] = {}
- if not ret[item[0]].has_key(item[1]):
- ret[item[0]][item[1]] = 0
- ret[item[0]][item[1]] += item[2]
- # for zero
- stateList = ['assigned','activated','running']
- if archived:
- stateList += ['finished','failed']
- for site in ret.keys():
- for state in stateList:
- if not ret[site].has_key(state):
- ret[site][state] = 0
- # return
- _logger.debug("getJobStatistics -> %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("getJobStatistics() retry : %s" % iTry)
- time.sleep(2)
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobStatistics : %s %s" % (type, value))
- return {}
-
-
- # get job statistics with label
- def getJobStatisticsWithLabel(self,siteStr=''):
- comment = ' /* DBProxy.getJobStatisticsWithLabel */'
- _logger.debug("getJobStatisticsWithLabel(%s)" % siteStr)
- sql0 = "SELECT computingSite,prodSourceLabel,jobStatus,COUNT(*) FROM %s "
- # site
- tmpSiteMap = {}
- if siteStr != '':
- sql0 += "WHERE computingSite IN ("
- # loop over all sites
- idxSite = 1
- for tmpSite in siteStr.split(','):
- tmpSiteKey = ':site%s' % idxSite
- sql0 += "%s," % tmpSiteKey
- tmpSiteMap[tmpSiteKey] = tmpSite
- idxSite += 1
- sql0 = sql0[:-1] + ") "
- sql0 += "GROUP BY computingSite,prodSourceLabel,jobStatus "
- sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0)
- sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV)
- tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']
- returnMap = {}
- try:
- for table in tables:
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- self.cur.arraysize = 10000
- if table == 'ATLAS_PANDA.jobsActive4':
- sqlExeTmp = (sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS'
- else:
- sqlExeTmp = (sql0+comment) % table
- self.cur.execute(sqlExeTmp,tmpSiteMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for computingSite,prodSourceLabel,jobStatus,nCount in res:
- # add site
- if not returnMap.has_key(computingSite):
- returnMap[computingSite] = {}
- # add SourceLabel
- if not returnMap[computingSite].has_key(prodSourceLabel):
- returnMap[computingSite][prodSourceLabel] = {}
- # add jobstatus
- if not returnMap[computingSite][prodSourceLabel].has_key(jobStatus):
- returnMap[computingSite][prodSourceLabel][jobStatus] = 0
- # add
- returnMap[computingSite][prodSourceLabel][jobStatus] += nCount
- # return
- _logger.debug("getJobStatisticsWithLabel() : %s" % str(returnMap))
- return returnMap
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getJobStatisticsWithLabel : %s %s" % (errType,errValue))
- return {}
-
-
- # get job statistics for brokerage
- def getJobStatisticsBrokerage(self,minPriority=None):
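- # returns {cloud: {computingSite: {processingType: {jobStatus: count}}}} for managed jobs;
- # when minPriority is given, non-running jobs are counted with prio>=min and running
- # jobs with prio<=min in a second pass over jobsActive4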
- comment = ' /* DBProxy.getJobStatisticsBrokerage */'
- _logger.debug("getJobStatisticsBrokerage(%s)" % minPriority)
- sql0 = "SELECT cloud,computingSite,jobStatus,processingType,COUNT(*) FROM %s WHERE "
- sql0 += "prodSourceLabel IN (:prodSourceLabel1) "
- tmpPrioMap = {}
- if minPriority != None:
- sql0 += "AND currentPriority>=:minPriority "
- tmpPrioMap[':minPriority'] = minPriority
- sql0 += "GROUP BY cloud,computingSite,jobStatus,processingType"
- # sql for materialized view
- sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0)
- sqlMV = re.sub(':minPriority','TRUNC(:minPriority,-1)',sqlMV)
- sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV)
- tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']
- if minPriority != None:
- # read the number of running jobs with prio<=MIN
- tables.append('ATLAS_PANDA.jobsActive4')
- sqlMVforRun = re.sub('currentPriority>=','currentPriority<=',sqlMV)
- ret = {}
- nTry=3
- iActive = 0
- for iTry in range(nTry):
- try:
- for table in tables:
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':prodSourceLabel1'] = 'managed'
- for tmpPrio in tmpPrioMap.keys():
- varMap[tmpPrio] = tmpPrioMap[tmpPrio]
- self.cur.arraysize = 10000
- useRunning = None
- if table == 'ATLAS_PANDA.jobsActive4':
- # first count non-running and then running if minPriority is specified
- if minPriority != None:
- if iActive == 0:
- useRunning = False
- else:
- useRunning = True
- iActive += 1
- if useRunning in [None,False]:
- self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap)
- else:
- self.cur.execute((sqlMVforRun+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap)
- else:
- self.cur.execute((sql0+comment) % table, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for cloud,computingSite,jobStatus,processingType,count in res:
- # check jobStatus if minPriority is specified
- if minPriority != None:
- # count the number of non-running with prio>=MIN
- if useRunning == True and jobStatus != 'running':
- continue
- # count the number of running with prio<=MIN
- if useRunning == False and jobStatus == 'running':
- continue
- # add cloud
- if not ret.has_key(cloud):
- ret[cloud] = {}
- # add site
- if not ret[cloud].has_key(computingSite):
- ret[cloud][computingSite] = {}
- # add processingType
- if not ret[cloud][computingSite].has_key(processingType):
- ret[cloud][computingSite][processingType] = {}
- # add jobStatus
- if not ret[cloud][computingSite][processingType].has_key(jobStatus):
- ret[cloud][computingSite][processingType][jobStatus] = count
- # for zero
- for cloud,cloudVal in ret.iteritems():
- for site,siteVal in cloudVal.iteritems():
- for pType,typeVal in siteVal.iteritems():
- for stateItem in ['assigned','activated','running','transferring']:
- if not typeVal.has_key(stateItem):
- typeVal[stateItem] = 0
- # return
- _logger.debug("getJobStatisticsBrokerage -> %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("getJobStatisticsBrokerage retry : %s" % iTry)
- time.sleep(2)
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobStatisticsBrokerage : %s %s" % (type, value))
- return {}
-
-
- # get job statistics for analysis brokerage
- def getJobStatisticsAnalBrokerage(self,minPriority=None):
- comment = ' /* DBProxy.getJobStatisticsAnalBrokerage */'
- _logger.debug("getJobStatisticsAnalBrokerage(%s)" % minPriority)
- sql0 = "SELECT computingSite,jobStatus,processingType,COUNT(*) FROM %s WHERE "
- sql0 += "prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) "
- if minPriority != None:
- sql0 += "AND currentPriority>=:minPriority "
- sql0 += "GROUP BY cloud,computingSite,jobStatus,processingType"
- # sql for materialized view
- sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0)
- sqlMV = re.sub(':minPriority','TRUNC(:minPriority,-1)',sqlMV)
- sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV)
- tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']
- ret = {}
- nTry=3
- for iTry in range(nTry):
- try:
- for table in tables:
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- if minPriority != None:
- varMap[':minPriority'] = minPriority
- self.cur.arraysize = 10000
- if table == 'ATLAS_PANDA.jobsActive4':
- self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap)
- else:
- self.cur.execute((sql0+comment) % table, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for computingSite,jobStatus,processingType,count in res:
- # add site
- if not ret.has_key(computingSite):
- ret[computingSite] = {}
- # add processingType
- if not ret[computingSite].has_key(processingType):
- ret[computingSite][processingType] = {}
- # add jobStatus
- if not ret[computingSite][processingType].has_key(jobStatus):
- ret[computingSite][processingType][jobStatus] = count
- # for zero
- for site,siteVal in ret.iteritems():
- for pType,typeVal in siteVal.iteritems():
- for stateItem in ['defined','assigned','activated','running']:
- if not typeVal.has_key(stateItem):
- typeVal[stateItem] = 0
- # return
- _logger.debug("getJobStatisticsAnalBrokerage -> %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.debug("getJobStatisticsAnalBrokerage retry : %s" % iTry)
- time.sleep(2)
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobStatisticsAnalBrokerage : %s %s" % (type, value))
- return {}
-
-
- # get highest prio jobs
- def getHighestPrioJobStat(self):
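- # returns {cloud: {'highestPrio': maxPrio, 'nNotRun': count}} over managed jobs that are
- # activated in jobsActive4 or defined/assigned in jobsDefined4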
- comment = ' /* DBProxy.getHighestPrioJobStat */'
- _logger.debug("getHighestPrioJobStat()")
- sql0 = "SELECT cloud,max(currentPriority) FROM %s WHERE "
- sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY cloud"
- sqlC = "SELECT COUNT(*) FROM %s WHERE "
- sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND "
- sqlC += "cloud=:cloud AND currentPriority=:currentPriority"
- tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']
- ret = {}
- try:
- for table in tables:
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':prodSourceLabel'] = 'managed'
- if table == 'ATLAS_PANDA.jobsActive4':
- varMap[':jobStatus1'] = 'activated'
- varMap[':jobStatus2'] = 'dummy'
- else:
- varMap[':jobStatus1'] = 'defined'
- varMap[':jobStatus2'] = 'assigned'
- self.cur.arraysize = 100
- _logger.debug((sql0+comment) % table)
- self.cur.execute((sql0+comment) % table, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for cloud,maxPriority in res:
- # add cloud
- if not ret.has_key(cloud):
- ret[cloud] = {}
- # add max priority
- prioKey = 'highestPrio'
- nNotRunKey = 'nNotRun'
- getNumber = False
- if not ret[cloud].has_key(prioKey):
- ret[cloud][prioKey] = maxPriority
- ret[cloud][nNotRunKey] = 0
- getNumber = True
- else:
- # use highest one
- if ret[cloud][prioKey] < maxPriority:
- ret[cloud][prioKey] = maxPriority
- # reset
- ret[cloud][nNotRunKey] = 0
- getNumber = True
- elif ret[cloud][prioKey] == maxPriority:
- getNumber = True
- # get number of jobs with highest prio
- if getNumber:
- varMap[':cloud'] = cloud
- varMap[':currentPriority'] = maxPriority
- self.cur.arraysize = 10
- _logger.debug((sqlC+comment) % table)
- self.cur.execute((sqlC+comment) % table, varMap)
- resC = self.cur.fetchone()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- ret[cloud][nNotRunKey] += resC[0]
- # return
- return ret
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getHighestPrioJobStat : %s %s" % (type, value))
- return {}
-
-
- # get highest prio jobs per process group
- def getHighestPrioJobStatPerPG(self,useMorePG=False):
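- # same as getHighestPrioJobStat but keyed by process group; with useMorePG the cloud
- # and processingType are remapped via ProcessGroups.converCPTforEPG (extension level 1 or 2)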
- comment = ' /* DBProxy.getHighestPrioJobStatPerPG */'
- _logger.debug("getHighestPrioJobStatPerPG()")
- if useMorePG == False:
- sql0 = "SELECT cloud,max(currentPriority),processingType FROM %s WHERE "
- sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY cloud,processingType"
- sqlC = "SELECT COUNT(*) FROM %s WHERE "
- sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND "
- sqlC += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType"
- else:
- sql0 = "SELECT cloud,max(currentPriority),processingType,coreCount,workingGroup FROM %s WHERE "
- sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) "
- sql0 += "GROUP BY cloud,processingType,coreCount,workingGroup"
- sqlC = "SELECT COUNT(*) FROM %s WHERE "
- sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND "
- sqlC += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType AND "
- sqlC += "coreCount=:coreCount AND workingGroup=:workingGroup"
- sqlCN = "SELECT COUNT(*) FROM %s WHERE "
- sqlCN += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND "
- sqlCN += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType AND "
- sqlCN += "coreCount IS NULL AND workingGroup=:workingGroup"
- tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']
- ret = {}
- try:
- for table in tables:
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':prodSourceLabel'] = 'managed'
- if table == 'ATLAS_PANDA.jobsActive4':
- varMap[':jobStatus1'] = 'activated'
- varMap[':jobStatus2'] = 'dummy'
- else:
- varMap[':jobStatus1'] = 'defined'
- varMap[':jobStatus2'] = 'assigned'
- self.cur.arraysize = 100
- _logger.debug((sql0+comment) % table+str(varMap))
- self.cur.execute((sql0+comment) % table, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for tmpItem in res:
- if useMorePG == False:
- cloud,maxPriority,processingType = tmpItem
- origCloud = cloud
- origProcessingType = processingType
- else:
- origCloud,maxPriority,origProcessingType,coreCount,workingGroup = tmpItem
- # convert cloud and processingType for extended process group
- if useMorePG == ProcessGroups.extensionLevel_1:
- # extension level 1
- cloud,processingType = ProcessGroups.converCPTforEPG(origCloud,origProcessingType,
- coreCount)
- else:
- # extension level 2
- cloud,processingType = ProcessGroups.converCPTforEPG(origCloud,origProcessingType,
- coreCount,workingGroup)
- # add cloud
- if not ret.has_key(cloud):
- ret[cloud] = {}
- # get process group
- processGroup = ProcessGroups.getProcessGroup(processingType)
- # add process group
- if not ret[cloud].has_key(processGroup):
- ret[cloud][processGroup] = {}
- # add max priority
- prioKey = 'highestPrio'
- nNotRunKey = 'nNotRun'
- getNumber = False
- if not ret[cloud][processGroup].has_key(prioKey):
- ret[cloud][processGroup][prioKey] = maxPriority
- ret[cloud][processGroup][nNotRunKey] = 0
- getNumber = True
- else:
- # use highest one
- if ret[cloud][processGroup][prioKey] < maxPriority:
- ret[cloud][processGroup][prioKey] = maxPriority
- # reset
- ret[cloud][processGroup][nNotRunKey] = 0
- getNumber = True
- elif ret[cloud][processGroup][prioKey] == maxPriority:
- getNumber = True
- # get number of jobs with highest prio
- if getNumber:
- varMap[':cloud'] = origCloud
- varMap[':currentPriority'] = maxPriority
- varMap[':processingType'] = origProcessingType
- if useMorePG != False:
- varMap[':workingGroup'] = workingGroup
- if coreCount != None:
- varMap[':coreCount'] = coreCount
- self.cur.arraysize = 10
- if useMorePG == False or coreCount != None:
- _logger.debug((sqlC+comment) % table+str(varMap))
- self.cur.execute((sqlC+comment) % table, varMap)
- else:
- # coreCount is unset, so use the coreCount IS NULL variant
- _logger.debug((sqlCN+comment) % table+str(varMap))
- self.cur.execute((sqlCN+comment) % table, varMap)
- resC = self.cur.fetchone()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- ret[cloud][processGroup][nNotRunKey] += resC[0]
- # return
- _logger.debug("getHighestPrioJobStatPerPG -> %s" % ret)
- return ret
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getHighestPrioJobStatPerPG : %s %s" % (type, value))
- return {}
-
-
- # get queued analysis jobs at a site
- def getQueuedAnalJobs(self,site,dn):
- comment = ' /* DBProxy.getQueuedAnalJobs */'
- _logger.debug("getQueuedAnalJobs(%s,%s)" % (site,dn))
- sql0 = "SELECT COUNT(*),jobStatus FROM %s WHERE "
- sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) "
- sql0 += "AND computingSite=:computingSite AND prodUserName != :prodUserName "
- sql0 += "GROUP BY jobStatus "
- tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']
- try:
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- nQueued = 0
- nRunning = 0
- # loop over all tables
- for table in tables:
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':prodSourceLabel'] = 'user'
- varMap[':computingSite'] = site
- varMap[':prodUserName'] = compactDN
- if table == 'ATLAS_PANDA.jobsActive4':
- varMap[':jobStatus1'] = 'activated'
- varMap[':jobStatus2'] = 'running'
- else:
- varMap[':jobStatus1'] = 'defined'
- varMap[':jobStatus2'] = 'assigned'
- self.cur.arraysize = 10
- self.cur.execute((sql0+comment) % table, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # sum
- for cnt,jobStatus in res:
- if jobStatus == 'running':
- nRunning += cnt
- else:
- nQueued += cnt
- # return
- return {'queued':nQueued, 'running':nRunning}
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getQueuedAnalJobs : %s %s" % (errType,errValue))
- return {}
-
-
- # get computingSite and destinationSE for a dataset
- def getDestSE(self,dsname,fromArch=False):
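- # returns the (computingSite,destinationSE) pair of one job writing to the dataset,
- # or (None,None) if nothing is found; with fromArch=True jobsArchived4 and the
- # 30-day ATLAS_PANDAARCH archive are also checked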
- comment = ' /* DBProxy.getDestSE */'
- _logger.debug("getDestSE(%s,%s)" % (dsname,fromArch))
- sql0 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock "
- if not fromArch:
- sql0 += "AND status=:status "
- sql0 += "AND rownum=1"
- sql1 = "SELECT computingSite,destinationSE FROM %s WHERE PandaID=:PandaID"
- actTableList = ['ATLAS_PANDA.jobsActive4']
- if fromArch:
- actTableList.append("ATLAS_PANDA.jobsArchived4")
- try:
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- if not fromArch:
- varMap[':status'] = 'transferring'
- varMap[':destinationDBlock'] = dsname
- self.cur.arraysize = 10
- self.cur.execute(sql0+comment, varMap)
- res = self.cur.fetchall()
- # get PandaID
- pandaID = None
- if len(res) != 0:
- pandaID = res[0][0]
- # get computingSite and destinationSE
- destSE = None,None
- if pandaID != None:
- varMap = {}
- varMap[':PandaID'] = pandaID
- # loop over all active tables
- foundInActive = False
- for actTable in actTableList:
- self.cur.execute((sql1 % actTable)+comment, varMap)
- res = self.cur.fetchall()
- if len(res) != 0:
- destSE = res[0]
- foundInActive = True
- break
- # look into ARCH table
- if not foundInActive:
- if fromArch:
- sqlA = "SELECT computingSite,destinationSE FROM ATLAS_PANDAARCH.jobsArchived WHERE PandaID=:PandaID "
- sqlA += "AND modificationTime>(CURRENT_DATE-30) "
- self.cur.execute(sqlA+comment, varMap)
- res = self.cur.fetchall()
- if len(res) != 0:
- destSE = res[0]
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- _logger.debug("getDestSE(%s) : %s" % (dsname,str(destSE)))
- return destSE
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getDestSE : %s %s" % (type, value))
- return None,None
-
-
- # get destinationDBlockToken for a dataset
- def getDestTokens(self,dsname):
- comment = ' /* DBProxy.getDestTokens */'
- _logger.debug("getDestTokens(%s)" % dsname)
- sql0 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ destinationDBlockToken FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND rownum=1"
- try:
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':destinationDBlock'] = dsname
- self.cur.arraysize = 10
- self.cur.execute(sql0+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- retToken = None
- if len(res) != 0:
- retToken = res[0][0]
- # convert None to NULL
- if retToken == None:
- retToken = 'NULL'
- # return
- _logger.debug("getDestTokens(%s) : %s" % (dsname,retToken))
- return retToken
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getDestTokens : %s %s" % (type, value))
- return None
-
-
- # get the number of job for a user
- def getNumberJobsUser(self,dn,workingGroup=None):
- comment = ' /* DBProxy.getNumberJobsUser */'
- _logger.debug("getNumberJobsUsers(%s,%s)" % (dn,workingGroup))
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- if workingGroup != None:
- sql0 = "SELECT COUNT(*) FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel AND workingGroup=:workingGroup"
- else:
- sql0 = "SELECT COUNT(*) FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel AND workingGroup IS NULL"
- nTry = 1
- nJob = 0
- for iTry in range(nTry):
- try:
- for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'):
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- varMap[':prodUserName'] = compactDN
- varMap[':prodSourceLabel'] = 'user'
- if workingGroup != None:
- varMap[':workingGroup'] = workingGroup
- self.cur.arraysize = 10
- self.cur.execute((sql0+comment) % table, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- if len(res) != 0:
- nJob += res[0][0]
- # return
- _logger.debug("getNumberJobsUsers(%s) : %s" % (dn,nJob))
- return nJob
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- time.sleep(2)
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("getNumberJobsUsers : %s %s" % (type, value))
- return 0
-
-
- # get job statistics for ExtIF
- def getJobStatisticsForExtIF(self,sourcetype=None):
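- # per-cloud job counts for the external interface; legacy 'NULL' clouds are folded
- # into 'US' and archived jobs are limited to the last 12 hours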
- comment = ' /* DBProxy.getJobStatisticsForExtIF */'
- _logger.debug("getJobStatisticsForExtIF()")
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
- if sourcetype == 'analysis':
- sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) GROUP BY jobStatus,cloud"
- sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ jobStatus,COUNT(*),cloud FROM %s tab WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) "
- else:
- sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) GROUP BY jobStatus,cloud"
- sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ jobStatus,COUNT(*),cloud FROM %s tab WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) "
- sqlA+= "AND modificationTime>:modificationTime GROUP BY jobStatus,cloud"
- # sql for materialized view
- sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0)
- sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV)
- ret = {}
- try:
- for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsDefined4'):
- # start transaction
- self.conn.begin()
- # select
- varMap = {}
- if sourcetype == 'analysis':
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- else:
- varMap[':prodSourceLabel1'] = 'managed'
- varMap[':prodSourceLabel2'] = 'rc_test'
- if table != 'ATLAS_PANDA.jobsArchived4':
- self.cur.arraysize = 10000
- if table == 'ATLAS_PANDA.jobsActive4':
- self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap)
- else:
- self.cur.execute((sql0+comment) % table, varMap)
- else:
- varMap[':modificationTime'] = timeLimit
- self.cur.arraysize = 10000
- self.cur.execute((sqlA+comment) % table, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # change NULL to US for old jobs
- newRes = []
- usMap = {}
- for jobStatus,count,cloud in res:
- if not cloud in ['US','NULL']:
- # append since no conversion is required
- newRes.append((jobStatus,count,cloud))
- else:
- # sum
- if not usMap.has_key(jobStatus):
- usMap[jobStatus] = 0
- usMap[jobStatus] += count
- # append US counts
- for jobStatus,count in usMap.iteritems():
- newRes.append((jobStatus,count,'US'))
- # create map
- for item in newRes:
- # add cloud
- if not ret.has_key(item[2]):
- ret[item[2]] = {}
- # this is needed for auto_increment of InnoDB
- if not ret[item[2]].has_key(item[0]):
- ret[item[2]][item[0]] = item[1]
- # return
- _logger.debug("getJobStatisticsForExtIF -> %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobStatisticsForExtIF : %s %s" % (type, value))
- return {}
-
-
- # get job statistics per processingType
- def getJobStatisticsPerProcessingType(self,useMorePG=False):
- comment = ' /* DBProxy.getJobStatisticsPerProcessingType */'
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
- _logger.debug("getJobStatisticsPerProcessingType()")
- if useMorePG == False:
- sqlN = "SELECT jobStatus,COUNT(*),cloud,processingType FROM %s "
- sqlN += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) GROUP BY jobStatus,cloud,processingType"
- sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ jobStatus,COUNT(*),cloud,processingType FROM %s tab "
- sqlA += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>:modificationTime GROUP BY jobStatus,cloud,processingType"
- else:
- sqlN = "SELECT jobStatus,COUNT(*),cloud,processingType,coreCount,workingGroup FROM %s "
- sqlN += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) "
- sqlN += "GROUP BY jobStatus,cloud,processingType,coreCount,workingGroup"
- sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ "
- sqlA += "jobStatus,COUNT(*),cloud,processingType,coreCount,workingGroup FROM %s tab "
- sqlA += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>:modificationTime "
- sqlA += "GROUP BY jobStatus,cloud,processingType,coreCount,workingGroup"
- # sql for materialized view
- sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sqlN)
- sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV)
- ret = {}
- try:
- for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsDefined4'):
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- # select
- varMap = {}
- varMap[':prodSourceLabel1'] = 'managed'
- varMap[':prodSourceLabel2'] = 'rc_test'
- if table == 'ATLAS_PANDA.jobsArchived4':
- varMap[':modificationTime'] = timeLimit
- self.cur.execute((sqlA+comment) % table, varMap)
- else:
- if table == 'ATLAS_PANDA.jobsActive4' and useMorePG == False:
- self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap)
- else:
- # use real table since coreCount is unavailable in MatView
- self.cur.execute((sqlN+comment) % table, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for tmpItem in res:
- if useMorePG == False:
- jobStatus,count,cloud,processingType = tmpItem
- else:
- jobStatus,count,cloud,processingType,coreCount,workingGroup = tmpItem
- # convert cloud and processingType for extended process group
- if useMorePG == ProcessGroups.extensionLevel_1:
- # extension level 1
- cloud,processingType = ProcessGroups.converCPTforEPG(cloud,processingType,
- coreCount)
- else:
- # extension level 2
- cloud,processingType = ProcessGroups.converCPTforEPG(cloud,processingType,
- coreCount,workingGroup)
-
- # add cloud
- if not ret.has_key(cloud):
- ret[cloud] = {}
- # add processingType
- if not ret[cloud].has_key(processingType):
- ret[cloud][processingType] = {}
- # add status
- if not ret[cloud][processingType].has_key(jobStatus):
- ret[cloud][processingType][jobStatus] = 0
- ret[cloud][processingType][jobStatus] += count
- # return
- _logger.debug("getJobStatisticsPerProcessingType -> %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobStatisticsPerProcessingType : %s %s" % (type, value))
- return {}
-
-
- # get the number of waiting jobs per site and user
- def getJobStatisticsPerUserSite(self):
- comment = ' /* DBProxy.getJobStatisticsPerUserSite */'
- _logger.debug("getJobStatisticsPerUserSite()")
- sqlN = "SELECT COUNT(*),prodUserID,computingSite FROM %s "
- sqlN += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus=:jobStatus GROUP BY prodUserID,computingSite"
- ret = {}
- try:
- for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'):
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 100000
- # select
- if table == 'ATLAS_PANDA.jobsActive4':
- jobStatus = 'activated'
- else:
- jobStatus = 'assigned'
- varMap = {}
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- varMap[':jobStatus'] = jobStatus
- self.cur.execute((sqlN+comment) % table, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for cnt,prodUserName,computingSite in res:
- # add site
- if not ret.has_key(computingSite):
- ret[computingSite] = {}
- # add user
- if not ret[computingSite].has_key(prodUserName):
- ret[computingSite][prodUserName] = {'assigned':0,'activated':0}
- # add info
- ret[computingSite][prodUserName][jobStatus] = cnt
- # return
- _logger.debug("getJobStatisticsPerUserSite -> %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- # error
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getJobStatisticsPerUserSite : %s %s" % (errtype,errvalue))
- return {}
-
-
- # get number of analysis jobs per user
- def getNUserJobs(self,siteName,nJobs):
- comment = ' /* DBProxy.getNUserJobs */'
- _logger.debug("getNUserJobs(%s)" % siteName)
- sql0 = "SELECT * FROM (SELECT prodUserID FROM ATLAS_PANDA.jobsActive4 "
- sql0 += "WHERE jobStatus=:jobStatus AND prodSourceLabel in (:prodSourceLabel1,:prodSourceLabel2) "
- sql0 += "AND computingSite=:computingSite ORDER BY currentPriority DESC) WHERE rownum<=:nJobs"
- varMap = {}
- varMap[':computingSite'] = siteName
- varMap[':nJobs'] = nJobs
- varMap[':jobStatus'] = 'activated'
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- ret = {}
- try:
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- self.cur.execute(sql0+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for prodUserID, in res:
- if not ret.has_key(prodUserID):
- ret[prodUserID] = 0
- ret[prodUserID] += 1
- # return
- _logger.debug("getNUserJobs() : %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getNUserJobs : %s %s" % (type, value))
- return {}
-
-
- # get number of activated analysis jobs
- def getNAnalysisJobs(self,nProcesses):
- comment = ' /* DBProxy.getNAnalysisJobs */'
- _logger.debug("getNAnalysisJobs(%s)" % nProcesses)
- sql0 = "SELECT computingSite,COUNT(*) FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus "
- sql0 += "AND (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2) GROUP BY computingSite"
- varMap = {}
- varMap[':jobStatus'] = 'activated'
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- ret = {}
- try:
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- self.cur.execute(sql0+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # create map
- for item in res:
- ret[item[0]] = float(item[1])/nProcesses
- # return
- _logger.debug("getNAnalysisJobs() : %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getNAnalysisJobs : %s %s" % (type, value))
- return {}
-
-
- # generate pilot token
- def genPilotToken(self,schedulerhost,scheduleruser,schedulerid):
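- # create a pilot token (uuidgen) valid for 4 days and return it as a
- # "token=...,created=...,expires=..." string, or None on error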
- comment = ' /* DBProxy.genPilotToken */'
- try:
- _logger.debug("genPilotToken(%s,%s,%s)" % (schedulerhost,scheduleruser,schedulerid))
- token = commands.getoutput('uuidgen')
- timeNow = datetime.datetime.utcnow()
- timeExp = timeNow + datetime.timedelta(days=4)
- sql = "INSERT INTO ATLAS_PANDA.pilottoken (token,schedulerhost,scheduleruser,schedulerid,created,expires) "
- sql += "VALUES (:token,:schedulerhost,:scheduleruser,:schedulerid,:created,:expires)"
- # start transaction
- self.conn.begin()
- # execute
- varMap = {':token':token,':schedulerhost':schedulerhost,':scheduleruser':scheduleruser,
- ':schedulerid':schedulerid,':created':timeNow,':expires':timeExp}
- self.cur.execute(sql+comment,varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- retVal = "token=%s,created=%s,expires=%s" % (token,timeNow.strftime('%Y-%m-%d %H:%M:%S'),
- timeExp.strftime('%Y-%m-%d %H:%M:%S'))
- _logger.debug("genPilotToken -> %s" % retVal)
- return retVal
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("genPilotToken : %s %s" % (type, value))
- return None
-
-
- # get list of scheduler users
- def getListSchedUsers(self):
- comment = ' /* DBProxy.getListSchedUsers */'
- try:
- _logger.debug("getListSchedUsers")
- sql = "SELECT token,scheduleruser FROM ATLAS_PANDA.pilottoken WHERE expires>CURRENT_DATE"
- # start transaction
- self.conn.begin()
- # execute
- self.cur.arraysize = 100
- self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- retVal = {}
- for token,scheduleruser in res:
- retVal[token] = scheduleruser
- _logger.debug("getListSchedUsers->%s" % str(retVal))
- return retVal
- except:
- # roll back
- self._rollback()
- # error
- type, value, traceBack = sys.exc_info()
- _logger.error("getListSchedUsers : %s %s" % (type, value))
- return {}
-
-
- ###########################################################################
- #
- # LogDBProxy stuff
-
- # update site data
- def updateSiteData(self,hostID,pilotRequests):
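- # pilotRequests maps a site name to per-host counters whose optional 'getJob' and
- # 'updateJob' entries are sequences of requests; a per-host row (FLAG=hostID) is
- # upserted first, then an averaged summary row with FLAG='production' (or 'analysis'
- # for ANALY_ sites) is refreshed for HOURS=3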
- comment = ' /* DBProxy.updateSiteData */'
- _logger.debug("updateSiteData start")
- sqlDel = "DELETE FROM ATLAS_PANDAMETA.SiteData WHERE HOURS=:HOURS AND LASTMOD<:LASTMOD"
- sqlCh = "SELECT count(*) FROM ATLAS_PANDAMETA.SiteData WHERE FLAG=:FLAG AND HOURS=:HOURS AND SITE=:SITE"
- sqlIn = "INSERT INTO ATLAS_PANDAMETA.SiteData (SITE,FLAG,HOURS,GETJOB,UPDATEJOB,LASTMOD,"
- sqlIn += "NSTART,FINISHED,FAILED,DEFINED,ASSIGNED,WAITING,ACTIVATED,HOLDING,RUNNING,TRANSFERRING) "
- sqlIn += "VALUES (:SITE,:FLAG,:HOURS,:GETJOB,:UPDATEJOB,CURRENT_DATE,"
- sqlIn += "0,0,0,0,0,0,0,0,0,0)"
- sqlUp = "UPDATE ATLAS_PANDAMETA.SiteData SET GETJOB=:GETJOB,UPDATEJOB=:UPDATEJOB,LASTMOD=CURRENT_DATE "
- sqlUp += "WHERE FLAG=:FLAG AND HOURS=:HOURS AND SITE=:SITE"
- sqlAll = "SELECT getJob,updateJob,FLAG FROM ATLAS_PANDAMETA.SiteData WHERE HOURS=:HOURS AND SITE=:SITE"
- try:
- # delete old records
- varMap = {}
- varMap[':HOURS'] = 3
- varMap[':LASTMOD'] = datetime.datetime.utcnow()-datetime.timedelta(hours=varMap[':HOURS'])
- self.conn.begin()
- self.cur.execute(sqlDel+comment,varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # shuffle the site order to avoid contention when multiple hosts update concurrently
- tmpSiteList = pilotRequests.keys()
- random.shuffle(tmpSiteList)
- # loop over all sites
- for tmpSite in tmpSiteList:
- tmpVal = pilotRequests[tmpSite]
- # start transaction
- self.conn.begin()
- # check individual host info first
- varMap = {}
- varMap[':FLAG'] = hostID
- varMap[':SITE'] = tmpSite
- varMap[':HOURS'] = 3
- self.cur.arraysize = 10
- self.cur.execute(sqlCh+comment,varMap)
- res = self.cur.fetchone()
- # row exists or not
- if res[0] == 0:
- sql = sqlIn
- else:
- sql = sqlUp
- if tmpVal.has_key('getJob'):
- varMap[':GETJOB'] = len(tmpVal['getJob'])
- else:
- varMap[':GETJOB'] = 0
- if tmpVal.has_key('updateJob'):
- varMap[':UPDATEJOB'] = len(tmpVal['updateJob'])
- else:
- varMap[':UPDATEJOB'] = 0
- # update
- self.cur.execute(sql+comment,varMap)
- # get all info
- sumExist = False
- varMap = {}
- varMap[':SITE'] = tmpSite
- varMap[':HOURS'] = 3
- self.cur.arraysize = 100
- self.cur.execute(sqlAll+comment,varMap)
- res = self.cur.fetchall()
- # get total getJob/updateJob
- varMap[':GETJOB'] = 0
- varMap[':UPDATEJOB'] = 0
- nCol = 0
- for tmpGetJob,tmpUpdateJob,tmpFlag in res:
- # don't use summed info
- if tmpFlag == 'production':
- sumExist = True
- continue
- if tmpFlag == 'analysis':
- if tmpSite.startswith('ANALY_'):
- sumExist = True
- continue
- if tmpFlag in ['test']:
- continue
- # sum
- varMap[':GETJOB'] += tmpGetJob
- varMap[':UPDATEJOB'] += tmpUpdateJob
- nCol += 1
- # get average
- if nCol != 0:
- if varMap[':GETJOB'] >= nCol:
- varMap[':GETJOB'] /= nCol
- if varMap[':UPDATEJOB'] >= nCol:
- varMap[':UPDATEJOB'] /= nCol
- if tmpSite.startswith('ANALY_'):
- varMap[':FLAG'] = 'analysis'
- else:
- varMap[':FLAG'] = 'production'
- # row exists or not
- if sumExist:
- sql = sqlUp
- else:
- sql = sqlIn
- # update
- self.cur.execute(sql+comment,varMap)
- _logger.debug('updateSiteData : %s getJob=%s updateJob=%s' % \
- (tmpSite,varMap[':GETJOB'],varMap[':UPDATEJOB']))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("updateSiteData done")
- return True
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("updateSiteData : %s %s" % (type,value))
- return False
-
-
- # get site data
- def getCurrentSiteData(self):
- comment = ' /* DBProxy.getCurrentSiteData */'
- _logger.debug("getCurrentSiteData")
- sql = "SELECT SITE,getJob,updateJob,FLAG FROM ATLAS_PANDAMETA.SiteData WHERE FLAG IN (:FLAG1,:FLAG2) and HOURS=3"
- varMap = {}
- varMap[':FLAG1'] = 'production'
- varMap[':FLAG2'] = 'analysis'
- try:
- # set autocommit on
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- ret = {}
- for site,getJob,updateJob,flag in res:
- if site.startswith('ANALY_'):
- if flag != 'analysis':
- continue
- else:
- if flag != 'production':
- continue
- ret[site] = {'getJob':getJob,'updateJob':updateJob}
- return ret
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getCurrentSiteData : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # insert nRunning in site data
- def insertnRunningInSiteData(self):
- comment = ' /* DBProxy.insertnRunningInSiteData */'
- _logger.debug("insertnRunningInSiteData start")
- sqlDel = "DELETE FROM ATLAS_PANDAMETA.SiteData WHERE FLAG IN (:FLAG1,:FLAG2) AND LASTMOD= nSiteRow:
- continue
- tmpIdx += 1
- if usingGroup:
- workingGroup = tmpItem[tmpIdx]
- tmpIdx += 1
- else:
- workingGroup = None
- if usingType:
- processingType = tmpItem[tmpIdx]
- tmpIdx += 1
- # get process group
- processGroup = ProcessGroups.getProcessGroup(processingType)
- else:
- processingType = None
- processGroup = None
- if usingPrio:
- currentPriority = tmpItem[tmpIdx]
- tmpIdx += 1
- else:
- currentPriority = None
- cnt = tmpItem[tmpIdx]
- tmpIdx += 1
- maxPriority = tmpItem[tmpIdx]
- # append processingType list
- if not processGroupInQueueMap.has_key(processGroup):
- processGroupInQueueMap[processGroup] = []
- if not processingType in processGroupInQueueMap[processGroup]:
- processGroupInQueueMap[processGroup].append(processingType)
- # count the number of jobs for each policy
- for tmpShareDef in shareDefList:
- policyName = tmpShareDef['policy']['name']
- # use different list based on usage of priority
- if tmpShareDef['policy']['priority'] == None:
- groupInDefList = self.faresharePolicy[siteName]['groupList']
- typeInDefList = self.faresharePolicy[siteName]['typeList'][tmpShareDef['policy']['group']]
- else:
- groupInDefList = self.faresharePolicy[siteName]['groupListWithPrio']
- typeInDefList = self.faresharePolicy[siteName]['typeListWithPrio'][tmpShareDef['policy']['group']]
- # check working group
- if usingGroup:
- if tmpShareDef['policy']['group'] == None:
- # catchall doesn't contain WGs used by other policies
- if workingGroup != None and workingGroup in groupInDefList:
- continue
- # check for wildcard
- toBeSkippedFlag = False
- for tmpPattern in groupInDefList:
- if '*' in tmpPattern:
- tmpPattern = '^' + tmpPattern.replace('*','.*') + '$'
- # don't use WG if it is included in other policies
- if re.search(tmpPattern,workingGroup) != None:
- toBeSkippedFlag = True
- break
- if toBeSkippedFlag:
- continue
- else:
- # needs to be matched if it is specified in the policy
- if '*' in tmpShareDef['policy']['group']:
- # using wild card
- tmpPattern = '^' + tmpShareDef['policy']['group'].replace('*','.*') + '$'
- if re.search(tmpPattern,workingGroup) == None:
- continue
- else:
- if tmpShareDef['policy']['group'] != workingGroup:
- continue
- # collect real WGs per defined WG mainly for wildcard
- if not workingGroupInQueueMap.has_key(tmpShareDef['policy']['group']):
- workingGroupInQueueMap[tmpShareDef['policy']['group']] = []
- if not workingGroup in workingGroupInQueueMap[tmpShareDef['policy']['group']]:
- workingGroupInQueueMap[tmpShareDef['policy']['group']].append(workingGroup)
- # check processingType
- if usingType:
- if tmpShareDef['policy']['type'] == None:
- # catchall doesn't contain processGroups used by other policies
- if processGroup != None and processGroup in typeInDefList:
- continue
- else:
- # needs to be matched if it is specified in the policy
- if tmpShareDef['policy']['type'] != processGroup:
- continue
- # check priority
- if usingPrio:
- if currentPriority != None and tmpShareDef['policy']['priority'] != None:
- if tmpShareDef['policy']['prioCondition'] == '>':
- if currentPriority <= tmpShareDef['policy']['priority']:
- continue
- elif tmpShareDef['policy']['prioCondition'] == '>=':
- if currentPriority < tmpShareDef['policy']['priority']:
- continue
- elif tmpShareDef['policy']['prioCondition'] == '<=':
- if currentPriority > tmpShareDef['policy']['priority']:
- continue
- elif tmpShareDef['policy']['prioCondition'] == '<':
- if currentPriority >= tmpShareDef['policy']['priority']:
- continue
- # append job status
- if not tmpShareDef['count'].has_key(jobStatus):
- tmpShareDef['count'][jobStatus] = 0
- # sum
- tmpShareDef['count'][jobStatus] += cnt
- # max priority
- if not tmpShareDef['maxprio'].has_key(jobStatus):
- tmpShareDef['maxprio'][jobStatus] = maxPriority
- elif tmpShareDef['maxprio'][jobStatus] < maxPriority:
- tmpShareDef['maxprio'][jobStatus] = maxPriority
- # loop over all policies to calculate the total number of running jobs and the total share
- totalRunning = 0
- shareMap = {}
- msgShare = 'share->'
- msgShareMap = {}
- totalShareNonGP = 0
- totalRunningNonGP = 0
- totalActiveShareNonGP = 0
- for tmpShareDef in shareDefList:
- tmpNumMap = tmpShareDef['count']
- policyName = tmpShareDef['policy']['name']
- # policies with priorities are used only to limit the number of jobs
- if tmpShareDef['policy']['priority'] != None:
- continue
- # the number of activated jobs
- if not tmpNumMap.has_key('activated') or tmpNumMap['activated'] == 0:
- tmpNumActivated = 0
- else:
- tmpNumActivated = tmpNumMap['activated']
- # get share, removing %
- tmpShareValue = tmpShareDef['policy']['share'][:-1]
- tmpShareValue = int(tmpShareValue)
- # get the number of running jobs
- if not tmpNumMap.has_key('running'):
- tmpNumRunning = 0
- else:
- tmpNumRunning = tmpNumMap['running']
- # debug message for share
- msgShareMap[policyName] = '%s:activated=%s:running=%s' % (policyName,tmpNumActivated,tmpNumRunning)
- # get total share and total number of running jobs for non-GP
- if tmpShareDef['policy']['group'] == None:
- totalShareNonGP += tmpShareValue
- totalRunningNonGP += tmpNumRunning
- # get total share for active non-GP
- if tmpNumActivated != 0:
- totalActiveShareNonGP += tmpShareValue
- # sum
- totalRunning += tmpNumRunning
- # don't use the policy if there are no activated jobs
- if tmpNumActivated == 0:
- continue
- # max priority
- maxPriority = 0
- if tmpShareDef['maxprio'].has_key('activated'):
- maxPriority = tmpShareDef['maxprio']['activated']
- # append
- shareMap[policyName] = {
- 'share':tmpShareValue,
- 'running':tmpNumRunning,
- 'policy':tmpShareDef['policy'],
- 'maxprio':maxPriority,
- }
- # re-normalize when some non-GP policies are inactive
- if totalShareNonGP != totalActiveShareNonGP and totalActiveShareNonGP != 0:
- for policyName,tmpVarMap in shareMap.iteritems():
- # essentially non-GP share is multiplied by totalShareNonGP/totalActiveShareNonGP
- if tmpVarMap['policy']['group'] == None:
- tmpVarMap['share'] *= totalShareNonGP
- else:
- tmpVarMap['share'] *= totalActiveShareNonGP
- # make message with share info
- for policyName in msgShareMap.keys():
- if shareMap.has_key(policyName):
- msgShare += '%s:share=%s,' % (msgShareMap[policyName],shareMap[policyName]['share'])
- else:
- msgShare += '%s:share=0,' % msgShareMap[policyName]
- # get total share
- totalShare = 0
- for policyName,tmpVarMap in shareMap.iteritems():
- totalShare += tmpVarMap['share']
- msgShare = msgShare[:-1]
- # loop over all policies to check if the priority constraint should be activated
- prioToBeImposed = []
- msgPrio = ''
- if usingPrio:
- msgPrio += 'prio->'
- for tmpShareDef in shareDefList:
- tmpNumMap = tmpShareDef['count']
- policyName = tmpShareDef['policy']['name']
- # only policies with priorities are used to limit the number of jobs
- if tmpShareDef['policy']['priority'] == None:
- continue
- # get the number of running jobs
- if not tmpNumMap.has_key('running'):
- tmpNumRunning = 0
- else:
- tmpNumRunning = tmpNumMap['running']
- # the number of activated jobs
- if not tmpNumMap.has_key('activated') or tmpNumMap['activated'] == 0:
- tmpNumActivated = 0
- else:
- tmpNumActivated = tmpNumMap['activated']
- # get limit
- tmpLimitValue = tmpShareDef['policy']['share']
- # check if more jobs are running than the limit
- toBeImposed = False
- if tmpLimitValue.endswith('%'):
- # percentage based
- tmpLimitValue = tmpLimitValue[:-1]
- if float(tmpNumRunning) > float(totalRunning) * float(tmpLimitValue) / 100.0:
- toBeImposed = True
- # debug message for prio
- msgPrio += '%s:total=%s:running=%s:impose=%s,' % (policyName,totalRunning,tmpNumRunning,toBeImposed)
- else:
- # number based
- if tmpNumRunning > int(tmpLimitValue):
- toBeImposed = True
- # debug message for prio
- msgPrio += '%s:running=%s:impose=%s,' % (policyName,tmpNumRunning,toBeImposed)
- # append
- if toBeImposed:
- prioToBeImposed.append(tmpShareDef['policy'])
- msgPrio = msgPrio[:-1]
- # no activated
- if shareMap == {}:
- _logger.debug("getCriteriaForProdShare %s : ret=None - no activated" % siteName)
- return retForNone
- # no running
- if totalRunning == 0:
- _logger.debug("getCriteriaForProdShare %s : ret=None - no running" % siteName)
- return retForNone
- # zero share
- if totalShare == 0:
- _logger.debug("getCriteriaForProdShare %s : ret=None - zero share" % siteName)
- return retForNone
- # select the group where share most diverges from the definition
- lowestShareRatio = None
- lowestSharePolicy = None
- for policyName,tmpVarMap in shareMap.iteritems():
- # ignore zero share
- if tmpVarMap['share'] == 0:
- continue
- tmpShareDef = float(tmpVarMap['share']) / float(totalShare)
- tmpShareNow = float(tmpVarMap['running']) / float(totalRunning)
- tmpShareRatio = tmpShareNow / tmpShareDef
- # take max priority into account for cloud share
- if usingCloud != '':
- # skip over share
- if tmpShareNow > tmpShareDef:
- continue
- tmpShareRatio /= float(1000 + tmpVarMap['maxprio'])
- if lowestShareRatio == None or lowestShareRatio > tmpShareRatio:
- lowestShareRatio = tmpShareRatio
- lowestSharePolicy = policyName
- # make criteria
- retVarMap = {}
- retStr = ''
- if lowestSharePolicy != None:
- tmpShareDef = shareMap[lowestSharePolicy]['policy']
- # working group
- if tmpShareDef['group'] == None:
- groupInDefList = self.faresharePolicy[siteName]['groupList']
- # catch all except WGs used by other policies
- if groupInDefList != []:
- groupUsedInClause = []
- tmpIdx = 0
- # use real name of workingGroup
- for tmpGroupIdx in groupInDefList:
- if not workingGroupInQueueMap.has_key(tmpGroupIdx):
- continue
- for tmpGroup in workingGroupInQueueMap[tmpGroupIdx]:
- if tmpGroup in groupUsedInClause:
- continue
- # add AND at the first WG
- if groupUsedInClause == []:
- retStr += 'AND workingGroup NOT IN ('
- # add WG
- tmpKey = ':shareWG%s' % tmpIdx
- retVarMap[tmpKey] = tmpGroup
- retStr += '%s,' % tmpKey
- tmpIdx += 1
- # append
- groupUsedInClause.append(tmpGroup)
- if groupUsedInClause != []:
- retStr = retStr[:-1]
- retStr += ') '
- else:
- # match with one WG
- if workingGroupInQueueMap.has_key(tmpShareDef['group']):
- groupUsedInClause = []
- tmpIdx = 0
- # use real name of workingGroup
- for tmpGroup in workingGroupInQueueMap[tmpShareDef['group']]:
- if tmpGroup in groupUsedInClause:
- continue
- # add AND at the first WG
- if groupUsedInClause == []:
- retStr += 'AND workingGroup IN ('
- # add WG
- tmpKey = ':shareWG%s' % tmpIdx
- retVarMap[tmpKey] = tmpGroup
- retStr += '%s,' % tmpKey
- tmpIdx += 1
- # append
- groupUsedInClause.append(tmpGroup)
- if groupUsedInClause != []:
- retStr = retStr[:-1]
- retStr += ') '
- # processing type
- if tmpShareDef['type'] == None:
- typeInDefList = self.faresharePolicy[siteName]['typeList'][tmpShareDef['group']]
- # catch all except WGs used by other policies
- if typeInDefList != []:
- # get the list of processingTypes from the list of processGroups
- retVarMapP = {}
- retStrP = 'AND processingType NOT IN ('
- tmpIdx = 0
- for tmpTypeGroup in typeInDefList:
- if processGroupInQueueMap.has_key(tmpTypeGroup):
- for tmpType in processGroupInQueueMap[tmpTypeGroup]:
- tmpKey = ':sharePT%s' % tmpIdx
- retVarMapP[tmpKey] = tmpType
- retStrP += '%s,' % tmpKey
- tmpIdx += 1
- retStrP = retStrP[:-1]
- retStrP += ') '
- # copy
- if retVarMapP != {}:
- retStr += retStrP
- for tmpKey,tmpType in retVarMapP.iteritems():
- retVarMap[tmpKey] = tmpType
- else:
- # match with one processingGroup
- if processGroupInQueueMap.has_key(tmpShareDef['type']) and processGroupInQueueMap[tmpShareDef['type']] != []:
- retStr += 'AND processingType IN ('
- tmpIdx = 0
- for tmpType in processGroupInQueueMap[tmpShareDef['type']]:
- tmpKey = ':sharePT%s' % tmpIdx
- retVarMap[tmpKey] = tmpType
- retStr += '%s,' % tmpKey
- tmpIdx += 1
- retStr = retStr[:-1]
- retStr += ') '
- # priority
- tmpIdx = 0
- for tmpDefItem in prioToBeImposed:
- if tmpDefItem['group'] in [None,tmpShareDef['group']] and \
- tmpDefItem['type'] in [None,tmpShareDef['type']]:
- if tmpDefItem['prioCondition'] == '>':
- retStrP = '<='
- elif tmpDefItem['prioCondition'] == '>=':
- retStrP = '<'
- elif tmpDefItem['prioCondition'] == '<=':
- retStrP = '>'
- elif tmpDefItem['prioCondition'] == '<':
- retStrP = '>='
- else:
- continue
- tmpKey = ':sharePrio%s' % tmpIdx
- retVarMap[tmpKey] = tmpDefItem['priority']
- retStr += ('AND currentPriority%s%s' % (retStrP,tmpKey))
- tmpIdx += 1
- _logger.debug("getCriteriaForProdShare %s : sql='%s' var=%s cloud=%s %s %s" % \
- (siteName,retStr,str(retVarMap),usingCloud,msgShare,msgPrio))
- # append criteria for test jobs
- if retStr != '':
- retVarMap[':shareLabel1'] = 'managed'
- retVarMap[':shareLabel2'] = 'test'
- retVarMap[':shareLabel3'] = 'prod_test'
- retVarMap[':shareLabel4'] = 'install'
- retStr = 'AND (prodSourceLabel IN (:shareLabel2,:shareLabel3,:shareLabel4) OR (prodSourceLabel=:shareLabel1 ' + retStr + '))'
- return retStr,retVarMap
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getCriteriaForProdShare %s : %s %s" % (siteName,errtype,errvalue))
- # roll back
- self._rollback()
- return retForNone
-
-
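For reference, the share-selection step above reduces to picking the policy whose running fraction falls shortest of its defined share. A minimal standalone sketch of just that arithmetic (names are illustrative, and the cloud/max-priority adjustment applied when usingCloud is set is omitted):

def pick_most_starved(share_map, total_share, total_running):
    # share_map: policy name -> {'share': defined share, 'running': running jobs}
    lowest_ratio = None
    lowest_policy = None
    for name, info in share_map.items():
        if info['share'] == 0:
            continue
        share_def = float(info['share']) / float(total_share)
        share_now = float(info['running']) / float(total_running)
        ratio = share_now / share_def
        if lowest_ratio is None or ratio < lowest_ratio:
            lowest_ratio = ratio
            lowest_policy = name
    return lowest_policy

# policy A is entitled to 70% but holds only 50% of the running jobs, so it is picked
print(pick_most_starved({'A': {'share': 70, 'running': 50},
                         'B': {'share': 30, 'running': 50}}, 100, 100))  # -> A
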
- # get beyond pledge resource ratio
- def getPledgeResourceRatio(self):
- comment = ' /* DBProxy.getPledgeResourceRatio */'
- # check utime
- if self.updateTimeForPledgeRatio != None and (datetime.datetime.utcnow()-self.updateTimeForPledgeRatio) < datetime.timedelta(hours=3):
- return
- # update utime
- self.updateTimeForPledgeRatio = datetime.datetime.utcnow()
- _logger.debug("getPledgeResourceRatio")
- try:
- # set autocommit on
- self.conn.begin()
- # select
- sql = "SELECT siteid,countryGroup,availableCPU,availableStorage,pledgedCPU,pledgedStorage "
- sql += "FROM ATLAS_PANDAMETA.schedconfig WHERE countryGroup IS NOT NULL AND siteid LIKE 'ANALY_%' "
- self.cur.arraysize = 100000
- self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # update ratio
- self.beyondPledgeRatio = {}
- if res != None and len(res) != 0:
- for siteid,countryGroup,tmp_availableCPU,tmp_availableStorage,tmp_pledgedCPU,tmp_pledgedStorage in res:
- # ignore when countryGroup is undefined
- if countryGroup in ['',None]:
- continue
- # append
- self.beyondPledgeRatio[siteid] = {}
- self.beyondPledgeRatio[siteid]['countryGroup'] = countryGroup
- # convert to float
- try:
- availableCPU = float(tmp_availableCPU)
- except:
- availableCPU = 0
- try:
- pledgedCPU = float(tmp_pledgedCPU)
- except:
- pledgedCPU = 0
- # calculate ratio
- if availableCPU == 0 or pledgedCPU == 0:
- # set 0% when CPU ratio is undefined
- self.beyondPledgeRatio[siteid]['ratio'] = 0
- else:
- # ratio = (availableCPU-pledgedCPU)/availableCPU (the (1-storageTerm) factor is not applied here)
- self.beyondPledgeRatio[siteid]['ratio'] = (availableCPU-pledgedCPU)/availableCPU
- _logger.debug("getPledgeResourceRatio -> %s" % str(self.beyondPledgeRatio))
- return
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getPledgeResourceRatio : %s %s" % (errtype,errvalue))
- # roll back
- self._rollback()
- return
-
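The beyond-pledge ratio computed above is simply (availableCPU - pledgedCPU) / availableCPU, falling back to 0 when either value is missing or zero. A worked example with made-up numbers:

# made-up values: 1000 advertised CPUs, 800 of them pledged
availableCPU, pledgedCPU = 1000.0, 800.0
if availableCPU == 0 or pledgedCPU == 0:
    ratio = 0
else:
    ratio = (availableCPU - pledgedCPU) / availableCPU
print(ratio)  # 0.2 -> 20% of the capacity is beyond pledge
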
-
- # get fareshare policy
- def getFaresharePolicy(self,getNewMap=False):
- comment = ' /* DBProxy.getFaresharePolicy */'
- # check utime
- if not getNewMap and self.updateTimeForFaresharePolicy != None and \
- (datetime.datetime.utcnow()-self.updateTimeForFaresharePolicy) < datetime.timedelta(hours=3):
- return
- if not getNewMap:
- # update utime
- self.updateTimeForFaresharePolicy = datetime.datetime.utcnow()
- _logger.debug("getFaresharePolicy")
- try:
- # set autocommit on
- self.conn.begin()
- # get default share
- cloudShareMap = {}
- cloudTier1Map = {}
- sqlD = "SELECT name,fairshare,tier1 FROM ATLAS_PANDAMETA.cloudconfig"
- self.cur.arraysize = 100000
- self.cur.execute(sqlD+comment)
- res = self.cur.fetchall()
- for cloudName,cloudShare,cloudTier1 in res:
- try:
- cloudTier1Map[cloudName] = cloudTier1.split(',')
- except:
- pass
- if not cloudShare in ['',None]:
- cloudShareMap[cloudName] = cloudShare
- # get share per site
- sql = "SELECT siteid,fairsharePolicy,cloud "
- sql += "FROM ATLAS_PANDAMETA.schedconfig WHERE NOT siteid LIKE 'ANALY_%' GROUP BY siteid,fairsharePolicy,cloud"
- self.cur.execute(sql+comment)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # update policy
- faresharePolicy = {}
- for siteid,faresharePolicyStr,cloudName in res:
- try:
- # share is undefined
- usingCloudShare = ''
- if faresharePolicyStr in ['',None]:
- # skip if share is not defined at site or cloud
- if not cloudShareMap.has_key(cloudName):
- continue
- # skip if T1 doesn't define share
- if cloudTier1Map.has_key(cloudName) and siteid in cloudTier1Map[cloudName]:
- continue
- # use cloud share
- faresharePolicyStr = cloudShareMap[cloudName]
- usingCloudShare = cloudName
- # decompose
- hasNonPrioPolicy = False
- for tmpItem in faresharePolicyStr.split(','):
- # skip empty
- tmpItem = tmpItem.strip()
- if tmpItem == '':
- continue
- # keep name
- tmpPolicy = {'name':tmpItem}
- # group
- tmpPolicy['group'] = None
- tmpMatch = re.search('group=([^:]+)',tmpItem)
- if tmpMatch != None:
- if tmpMatch.group(1) in ['','central','*','any']:
- # use None for catchall
- pass
- else:
- tmpPolicy['group'] = tmpMatch.group(1)
- # type
- tmpPolicy['type'] = None
- tmpMatch = re.search('type=([^:]+)',tmpItem)
- if tmpMatch != None:
- if tmpMatch.group(1) in ['*','any']:
- # use None for catchall
- pass
- else:
- tmpPolicy['type'] = tmpMatch.group(1)
- # priority
- tmpPolicy['priority'] = None
- tmpPolicy['prioCondition'] = None
- tmpMatch = re.search('priority([=<>]+)(\d+)',tmpItem)
- if tmpMatch != None:
- tmpPolicy['priority'] = int(tmpMatch.group(2))
- tmpPolicy['prioCondition'] = tmpMatch.group(1)
- else:
- hasNonPrioPolicy = True
- # share
- tmpPolicy['share'] = tmpItem.split(':')[-1]
- # append
- if not faresharePolicy.has_key(siteid):
- faresharePolicy[siteid] = {'policyList':[]}
- faresharePolicy[siteid]['policyList'].append(tmpPolicy)
- # add any:any if only priority policies
- if not hasNonPrioPolicy:
- tmpPolicy = {'name' : 'type=any',
- 'group' : None,
- 'type' : None,
- 'priority' : None,
- 'prioCondition' : None,
- 'share' : '100%'}
- faresharePolicy[siteid]['policyList'].append(tmpPolicy)
- # some translation
- faresharePolicy[siteid]['usingGroup'] = False
- faresharePolicy[siteid]['usingType'] = False
- faresharePolicy[siteid]['usingPrio'] = False
- faresharePolicy[siteid]['usingCloud'] = usingCloudShare
- faresharePolicy[siteid]['groupList'] = []
- faresharePolicy[siteid]['typeList'] = {}
- faresharePolicy[siteid]['groupListWithPrio'] = []
- faresharePolicy[siteid]['typeListWithPrio'] = {}
- for tmpDefItem in faresharePolicy[siteid]['policyList']:
- # using WG
- if tmpDefItem['group'] != None:
- faresharePolicy[siteid]['usingGroup'] = True
- # using PG
- if tmpDefItem['type'] != None:
- faresharePolicy[siteid]['usingType'] = True
- # using prio
- if tmpDefItem['priority'] != None:
- faresharePolicy[siteid]['usingPrio'] = True
- # get list of WG and PG with/without priority
- if tmpDefItem['priority'] == None:
- # get list of workingGroups
- if tmpDefItem['group'] != None and not tmpDefItem['group'] in faresharePolicy[siteid]['groupList']:
- faresharePolicy[siteid]['groupList'].append(tmpDefItem['group'])
- # get list of processingGroups
- if not faresharePolicy[siteid]['typeList'].has_key(tmpDefItem['group']):
- faresharePolicy[siteid]['typeList'][tmpDefItem['group']] = []
- if tmpDefItem['type'] != None and not tmpDefItem['type'] in faresharePolicy[siteid]['typeList'][tmpDefItem['group']]:
- faresharePolicy[siteid]['typeList'][tmpDefItem['group']].append(tmpDefItem['type'])
- else:
- # get list of workingGroups
- if tmpDefItem['group'] != None and not tmpDefItem['group'] in faresharePolicy[siteid]['groupListWithPrio']:
- faresharePolicy[siteid]['groupListWithPrio'].append(tmpDefItem['group'])
- # get list of processingGroups
- if not faresharePolicy[siteid]['typeListWithPrio'].has_key(tmpDefItem['group']):
- faresharePolicy[siteid]['typeListWithPrio'][tmpDefItem['group']] = []
- if tmpDefItem['type'] != None and not tmpDefItem['type'] in faresharePolicy[siteid]['typeListWithPrio'][tmpDefItem['group']]:
- faresharePolicy[siteid]['typeListWithPrio'][tmpDefItem['group']].append(tmpDefItem['type'])
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.warning("getFaresharePolicy : wrong definition '%s' for %s : %s %s" % (faresharePolicyStr,siteid,errtype,errvalue))
- _logger.debug("getFaresharePolicy -> %s" % str(faresharePolicy))
- if not getNewMap:
- self.faresharePolicy = faresharePolicy
- return
- else:
- return faresharePolicy
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getFaresharePolicy : %s %s" % (errtype,errvalue))
- # roll back
- self._rollback()
- if not getNewMap:
- return
- else:
- return {}
-
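Each fairsharePolicy string parsed above is a comma-separated list of policies of the form group=...:type=...:priority>N:share, where the last colon-separated token is the share and '*'/'any' (plus 'central' for the group field) are treated as catch-alls. A reduced standalone sketch of that decomposition (it skips the per-site bookkeeping and the implicit any:any policy):

import re

def parse_policy(policy_str):
    policies = []
    for item in policy_str.split(','):
        item = item.strip()
        if item == '':
            continue
        policy = {'name': item, 'group': None, 'type': None,
                  'priority': None, 'prioCondition': None}
        m = re.search(r'group=([^:]+)', item)
        if m is not None and m.group(1) not in ['', 'central', '*', 'any']:
            policy['group'] = m.group(1)
        m = re.search(r'type=([^:]+)', item)
        if m is not None and m.group(1) not in ['*', 'any']:
            policy['type'] = m.group(1)
        m = re.search(r'priority([=<>]+)(\d+)', item)
        if m is not None:
            policy['priority'] = int(m.group(2))
            policy['prioCondition'] = m.group(1)
        policy['share'] = item.split(':')[-1]
        policies.append(policy)
    return policies

# illustrative policy string, not taken from a real schedconfig row
print(parse_policy('group=GP_TOP:60%,type=evgensimul:40%,priority>400:10%'))
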
-
- # get cloud list
- def getCloudList(self):
- comment = ' /* DBProxy.getCloudList */'
- _logger.debug("getCloudList start")
- try:
- # set autocommit on
- self.conn.begin()
- # select
- sql = "SELECT name,tier1,tier1SE,relocation,weight,server,status,transtimelo,"
- sql += "transtimehi,waittime,validation,mcshare,countries,fasttrack,nprestage,"
- sql += "pilotowners "
- sql+= "FROM ATLAS_PANDAMETA.cloudconfig"
- self.cur.arraysize = 10000
- self.cur.execute(sql+comment)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- ret = {}
- if resList != None and len(resList) != 0:
- for res in resList:
- # change None to ''
- resTmp = []
- for tmpItem in res:
- if tmpItem == None:
- tmpItem = ''
- resTmp.append(tmpItem)
- name,tier1,tier1SE,relocation,weight,server,status,transtimelo,transtimehi,\
- waittime,validation,mcshare,countries,fasttrack,nprestage,pilotowners = resTmp
- # instantiate CloudSpec
- tmpC = CloudSpec.CloudSpec()
- tmpC.name = name
- tmpC.tier1 = tier1
- tmpC.tier1SE = re.sub(' ','',tier1SE).split(',')
- tmpC.relocation = relocation
- tmpC.weight = weight
- tmpC.server = server
- tmpC.status = status
- tmpC.transtimelo = transtimelo
- tmpC.transtimehi = transtimehi
- tmpC.waittime = waittime
- tmpC.validation = validation
- tmpC.mcshare = mcshare
- tmpC.countries = countries
- tmpC.fasttrack = fasttrack
- tmpC.nprestage = nprestage
- tmpC.pilotowners = pilotowners
- # append
- ret[name] = tmpC
- _logger.debug("getCloudList done")
- return ret
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getCloudList : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # check sites with release/cache
- def checkSitesWithRelease(self,sites,releases,caches,cmtConfig=None):
- comment = ' /* DBProxy.checkSitesWithRelease */'
- try:
- relStr = releases
- if releases != None:
- relStr = releases.replace('\n',' ')
- caStr = caches
- if caches != None:
- caStr = caches.replace('\n',' ')
- _logger.debug("checkSitesWithRelease(%s,%s,%s,%s)" % (sites,relStr,caStr,cmtConfig))
- # select
- sql = "SELECT distinct siteid FROM ATLAS_PANDAMETA.InstalledSW WHERE "
- loopKey2 = None
- loopValues2 = []
- if not caches in ['','NULL',None]:
- loopKey = ':cache'
- loopValues = caches.split('\n')
- sql += "cache=:cache "
- if not releases in ['','NULL',None]:
- loopKey2 = ':release'
- loopValues2 = releases.split('\n')
- sql += "AND release=:release "
- elif not releases in ['','NULL',None]:
- loopKey = ':release'
- loopValues = releases.split('\n')
- sql += "release=:release AND cache='None' "
- else:
- # don't check
- return sites
- checkCMT = False
- if not cmtConfig in ['','NULL',None]:
- sql += "AND cmtConfig=:cmtConfig "
- checkCMT = True
- sql += "AND siteid IN ("
- # start transaction
- self.conn.begin()
- self.cur.arraysize = 1000
- # loop over all releases/caches
- for loopIdx,loopVal in enumerate(loopValues):
- # remove Atlas-
- loopVal = re.sub('^Atlas-','',loopVal)
- sqlSite = sql
- varMap = {}
- varMap[loopKey] = loopVal
- if loopKey2 != None:
- loopVal2 = loopValues2[loopIdx]
- loopVal2 = re.sub('^Atlas-','',loopVal2)
- varMap[loopKey2] = loopVal2
- if checkCMT:
- varMap[':cmtConfig'] = cmtConfig
- tmpRetSites = []
- # loop over sites
- nSites = 10
- iSite = 0
- for siteIndex,site in enumerate(sites):
- iSite += 1
- tmpSiteKey = ':siteid%s' % iSite
- varMap[tmpSiteKey] = site
- sqlSite += '%s,' % tmpSiteKey
- if iSite == nSites or (siteIndex+1) == len(sites):
- iSite = 0
- # close bracket in SQL
- sqlSite = sqlSite[:-1]
- sqlSite += ')'
- # execute
- _logger.debug(sqlSite+comment+str(varMap))
- self.cur.execute(sqlSite+comment, varMap)
- resList = self.cur.fetchall()
- # collect candidates
- if len(resList) > 0:
- for tmpSite, in resList:
- # append
- tmpRetSites.append(tmpSite)
- # reset
- sqlSite = sql
- varMap = {}
- varMap[loopKey] = loopVal
- if loopKey2 != None:
- varMap[loopKey2] = loopVal2
- if checkCMT:
- varMap[':cmtConfig'] = cmtConfig
- # set
- sites = tmpRetSites
- # escape
- if sites == []:
- break
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("checkSitesWithRelease -> %s" % sites)
- return sites
- except:
- # roll back
- self._rollback()
- type,value,traceBack = sys.exc_info()
- _logger.error("checkSitesWithRelease : %s %s" % (type,value))
- return []
-
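The site list above is folded into the IN clause at most ten bind variables at a time, executing one query per chunk instead of building a single huge statement. The same chunking pattern in isolation (the table and column names are only illustrative):

def chunked_site_queries(base_sql, sites, n_chunk=10):
    # build (sql, bind-variable map) pairs, n_chunk sites per query
    queries = []
    for i in range(0, len(sites), n_chunk):
        chunk = sites[i:i + n_chunk]
        var_map = {}
        keys = []
        for j, site in enumerate(chunk):
            key = ':siteid%s' % j
            var_map[key] = site
            keys.append(key)
        queries.append((base_sql + '(' + ','.join(keys) + ')', var_map))
    return queries

base = "SELECT distinct siteid FROM InstalledSW WHERE cache=:cache AND siteid IN "
for sql, var_map in chunked_site_queries(base, ['SITE_%s' % i for i in range(23)]):
    print('%s -> %s sites' % (sql, len(var_map)))
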
-
- # get sites with release/cache in cloud
- def getSitesWithReleaseInCloud(self,cloud,releases,caches,validation):
- comment = ' /* DBProxy.getSitesWithReleaseInCloud */'
- try:
- relStr = releases
- if releases != None:
- relStr = releases.replace('\n',' ')
- caStr = caches
- if caches != None:
- caStr = caches.replace('\n',' ')
- _logger.debug("getSitesWithReleaseInCloud(%s,%s,%s,%s)" % (cloud,relStr,caStr,validation))
- # select
- sql = "SELECT distinct siteid FROM ATLAS_PANDAMETA.InstalledSW WHERE cloud=:cloud AND "
- varMap = {}
- varMap[':cloud'] = cloud
- if not caches in ['','NULL',None]:
- loopKey = ':cache'
- loopValues = caches.split('\n')
- sql += "cache=:cache "
- else:
- loopKey = ':release'
- loopValues = releases.split('\n')
- sql += "release=:release AND cache='None' "
- # validation
- if validation:
- sql += "AND validation=:validation "
- varMap[':validation'] = 'validated'
- # start transaction
- self.conn.begin()
- self.cur.arraysize = 100
- # loop over all releases/caches
- retSites = None
- for loopVal in loopValues:
- # remove Atlas-
- loopVal = re.sub('^Atlas-','',loopVal)
- varMap[loopKey] = loopVal
- # execute
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # append
- tmpRetSites = []
- for tmpItem, in resList:
- if retSites == None or (tmpItem in retSites):
- tmpRetSites.append(tmpItem)
- # set
- retSites = tmpRetSites
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # retSites already holds the sites common to all requested releases/caches
- _logger.debug("getSitesWithReleaseInCloud -> %s" % retSites)
- return retSites
- except:
- # roll back
- self._rollback()
- type,value,traceBack = sys.exc_info()
- _logger.error("getSitesWithReleaseInCloud : %s %s" % (type,value))
- return []
-
-
- # get list of cache prefix
- def getCachePrefixes(self):
- comment = ' /* DBProxy.getCachePrefixes */'
- try:
- _logger.debug("getCachePrefixes")
- # select
- sql = "SELECT distinct cache FROM ATLAS_PANDAMETA.installedSW WHERE cache IS NOT NULL"
- # start transaction
- self.conn.begin()
- self.cur.arraysize = 10000
- # execute
- self.cur.execute(sql+comment, {})
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- tmpList = []
- for tmpItem, in resList:
- match = re.search('^([^-]+)-',tmpItem)
- if match != None:
- tmpPrefix = match.group(1)
- if not tmpPrefix in tmpList:
- tmpList.append(tmpPrefix)
- _logger.debug("getCachePrefixes -> %s" % tmpList)
- return tmpList
- except:
- # roll back
- self._rollback()
- type,value,traceBack = sys.exc_info()
- _logger.error("getCachePrefixes : %s %s" % (type,value))
- return []
-
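The cache prefix collected above is whatever precedes the first hyphen in the cache name, de-duplicated across rows. For example (the cache names are made up):

import re

prefixes = []
for cache in ['AtlasProduction-17.2.1.4', 'AtlasTier0-17.2.2.1', 'AtlasProduction-17.2.2.2']:
    m = re.search(r'^([^-]+)-', cache)
    if m is not None and m.group(1) not in prefixes:
        prefixes.append(m.group(1))
print(prefixes)  # ['AtlasProduction', 'AtlasTier0']
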
-
- # get pilot owners
- def getPilotOwners(self):
- comment = ' /* DBProxy.getPilotOwners */'
- _logger.debug("getPilotOwners")
- try:
- # set autocommit on
- self.conn.begin()
- # select
- sql = "SELECT pilotowners FROM ATLAS_PANDAMETA.cloudconfig"
- self.cur.arraysize = 100
- self.cur.execute(sql+comment)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- ret = []
- for tmpItem, in resList:
- if tmpItem != None:
- for tmpOwner in tmpItem.split('|'):
- if tmpOwner != '':
- ret.append(tmpOwner)
- _logger.debug("getPilotOwners -> %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- type,value,traceBack = sys.exc_info()
- _logger.error("getPilotOwners : %s %s" % (type,value))
- return []
-
-
- # get allowed nodes
- def getAllowedNodes(self):
- comment = ' /* DBProxy.getAllowedNodes */'
- _logger.debug("getAllowedNodes")
- try:
- # set autocommit on
- self.conn.begin()
- # select
- sql = "SELECT siteid,allowedNode FROM ATLAS_PANDAMETA.schedconfig "
- sql += "WHERE siteid IS NOT NULL AND allowedNode IS NOT NULL"
- self.cur.arraysize = 1000
- self.cur.execute(sql+comment)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- ret = {}
- for tmpSiteID,tmpAllowedNode in resList:
- if not ret.has_key(tmpSiteID):
- ret[tmpSiteID] = tmpAllowedNode.split(',')
- _logger.debug("getAllowedNodes -> %s" % str(ret))
- return ret
- except:
- # roll back
- self._rollback()
- tmpType,tmpValue = sys.exc_info()[:2]
- _logger.error("getAllowedNodes : %s %s" % (tmpType,tmpValue))
- return {}
-
-
- # extract name from DN
- def cleanUserID(self, id):
- try:
- up = re.compile('/(DC|O|OU|C|L)=[^\/]+')
- username = up.sub('', id)
- up2 = re.compile('/CN=[0-9]+')
- username = up2.sub('', username)
- up3 = re.compile(' [0-9]+')
- username = up3.sub('', username)
- up4 = re.compile('_[0-9]+')
- username = up4.sub('', username)
- username = username.replace('/CN=proxy','')
- username = username.replace('/CN=limited proxy','')
- username = username.replace('limited proxy','')
- username = re.sub('/CN=Robot:[^/]+','',username)
- pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)')
- mat = pat.match(username)
- if mat:
- username = mat.group(2)
- else:
- username = username.replace('/CN=','')
- if username.lower().find('/email') > 0:
- username = username[:username.lower().find('/email')]
- pat = re.compile('.*(limited.*proxy).*')
- mat = pat.match(username)
- if mat:
- username = mat.group(1)
- username = username.replace('(','')
- username = username.replace(')','')
- username = username.replace("'",'')
- return username
- except:
- return id
-
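cleanUserID reduces a full certificate DN to a bare user name: the DC/OU fields and numeric CNs are stripped and, when two CN fields remain, the second one is kept. A usage illustration with a made-up DN (this assumes an instantiated DBProxy named proxy):

# made-up DN; 'proxy' stands for a DBProxy instance
dn = '/DC=ch/DC=cern/OU=Organic Units/OU=Users/CN=jdoe/CN=123456/CN=John Doe'
print(proxy.cleanUserID(dn))  # -> 'John Doe' (numeric CN dropped, second CN kept)
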
-
- # extract scope from dataset name
- def extractScope(self,name):
- try:
- if name.lower().startswith('user') or \
- name.lower().startswith('group'):
- # return None if there are not enough fields
- if len(name.split('.')) < 2:
- return None
- return name.lower().split('.')[0] + '.' + name.lower().split('.')[1]
- return name.split('.')[0]
- except:
- return None
-
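extractScope returns the first two dotted fields (lower-cased) for user.* and group.* dataset names, the first field for everything else, and None when a user/group name has fewer than two fields. For example (again assuming a DBProxy instance named proxy):

print(proxy.extractScope('user.jdoe.123456.test'))   # -> 'user.jdoe'
print(proxy.extractScope('mc12_8TeV.12345.sample'))  # -> 'mc12_8TeV'
print(proxy.extractScope('user'))                    # -> None (too few fields)
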
-
- # check quota
- def checkQuota(self,dn):
- comment = ' /* DBProxy.checkQuota */'
- _logger.debug("checkQuota %s" % dn)
- try:
- # set autocommit on
- self.conn.begin()
- # select
- name = self.cleanUserID(dn)
- sql = "SELECT cpua1,cpua7,cpua30,quotaa1,quotaa7,quotaa30 FROM ATLAS_PANDAMETA.users WHERE name=:name"
- varMap = {}
- varMap[':name'] = name
- self.cur.arraysize = 10
- self.cur.execute(sql+comment,varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- weight = 0.0
- if res != None and len(res) != 0:
- item = res[0]
- # cpu and quota
- cpu1 = item[0]
- cpu7 = item[1]
- cpu30 = item[2]
- if item[3] in [0,None]:
- quota1 = 0
- else:
- quota1 = item[3] * 3600
- if item[4] in [0,None]:
- quota7 = 0
- else:
- quota7 = item[4] * 3600
- if item[5] in [0,None]:
- quota30 = 0
- else:
- quota30 = item[5] * 3600
- # CPU usage
- if cpu1 == None:
- cpu1 = 0.0
- # weight
- if quota1 > 0:
- weight = float(cpu1) / float(quota1)
- # not exceeded the limit
- if weight < 1.0:
- weight = 0.0
- _logger.debug("checkQuota %s Weight:%s Quota:%s CPU:%s" % (dn,weight,quota1,cpu1))
- else:
- _logger.debug("checkQuota cannot find %s" % dn)
- return weight
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("checkQuota : %s %s" % (type,value))
- # roll back
- self._rollback()
- return 0.0
-
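The weight returned by checkQuota is the one-day CPU usage divided by the one-day quota, with the stored quota multiplied by 3600 (hours to seconds, by the look of it) and anything below 1.0 collapsed to zero, so only users over quota get a non-zero weight. A worked example with made-up numbers:

quota1 = 10 * 3600           # stored quota of 10, converted as above -> 36000
cpu1 = 54000.0               # CPU used in the last day
weight = float(cpu1) / float(quota1) if quota1 > 0 else 0.0
if weight < 1.0:             # still under quota -> no penalty
    weight = 0.0
print(weight)                # 1.5
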
-
- # get serialize JobID and status
- def getUserParameter(self,dn,jobID,jobsetID):
- comment = ' /* DBProxy.getUserParameter */'
- _logger.debug("getUserParameter %s JobID=%s JobsetID=%s" % (dn,jobID,jobsetID))
- try:
- # set initial values
- retStatus = True
- if jobsetID == -1:
- # generate new jobsetID
- retJobsetID = jobID
- # new jobID = 1 + new jobsetID
- retJobID = retJobsetID + 1
- elif jobsetID in ['NULL',None,0]:
- # no jobsetID
- retJobsetID = None
- retJobID = jobID
- else:
- # user specified jobsetID
- retJobsetID = jobsetID
- retJobID = jobID
- # set autocommit on
- self.conn.begin()
- # select
- name = self.cleanUserID(dn)
- sql = "SELECT jobid,status FROM ATLAS_PANDAMETA.users WHERE name=:name "
- sql += "FOR UPDATE "
- sqlAdd = "INSERT INTO ATLAS_PANDAMETA.users "
- sqlAdd += "(ID,NAME,LASTMOD,FIRSTJOB,LATESTJOB,CACHETIME,NCURRENT,JOBID) "
- sqlAdd += "VALUES(ATLAS_PANDAMETA.USERS_ID_SEQ.nextval,:name,"
- sqlAdd += "CURRENT_DATE,CURRENT_DATE,CURRENT_DATE,CURRENT_DATE,0,1) "
- varMap = {}
- varMap[':name'] = name
- self.cur.execute(sql+comment,varMap)
- self.cur.arraysize = 10
- res = self.cur.fetchall()
- # insert if no record
- if res == None or len(res) == 0:
- try:
- self.cur.execute(sqlAdd+comment,varMap)
- retI = self.cur.rowcount
- _logger.debug("getUserParameter %s inserted new row with %s" % (dn,retI))
- # emulate DB response
- res = [[1,'']]
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getUserParameter %s failed to insert new row with %s:%s" % (dn,errType,errValue))
- if res != None and len(res) != 0:
- item = res[0]
- # JobID in DB
- dbJobID = item[0]
- # check status
- if item[1] in ['disabled']:
- retStatus = False
- # use larger JobID
- if dbJobID >= int(retJobID) or (jobsetID == -1 and dbJobID >= int(retJobsetID)):
- if jobsetID == -1:
- # generate new jobsetID = 1 + existing jobID
- retJobsetID = dbJobID+1
- # new jobID = 1 + new jobsetID
- retJobID = retJobsetID + 1
- else:
- # new jobID = 1 + existing jobID
- retJobID = dbJobID+1
- # update DB
- varMap = {}
- varMap[':name'] = name
- varMap[':jobid'] = retJobID
- sql = "UPDATE ATLAS_PANDAMETA.users SET jobid=:jobid WHERE name=:name"
- self.cur.execute(sql+comment,varMap)
- _logger.debug("getUserParameter set JobID=%s for %s" % (retJobID,dn))
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("getUserParameter %s return JobID=%s JobsetID=%s Status=%s" % (dn,retJobID,retJobsetID,retStatus))
- return retJobID,retJobsetID,retStatus
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getUserParameter : %s %s" % (type,value))
- # roll back
- self._rollback()
- return retJobID,retJobsetID,retStatus
-
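The JobID/JobsetID bookkeeping above can be read independently of the DB access: jobsetID=-1 opens a new jobset (the jobset takes the requested jobID and the new jobID is jobsetID+1), and whatever counter is already stored for the user wins if it is ahead. A condensed sketch of that arithmetic, where db_jobid stands for the stored jobid column:

def next_ids(db_jobid, req_jobid, req_jobsetid):
    if req_jobsetid == -1:
        jobset_id = req_jobid            # open a new jobset
        job_id = jobset_id + 1
    elif req_jobsetid in ['NULL', None, 0]:
        jobset_id = None                 # no jobset
        job_id = req_jobid
    else:
        jobset_id = req_jobsetid         # caller-specified jobset
        job_id = req_jobid
    # the stored counter wins if it is already ahead
    if db_jobid >= job_id or (req_jobsetid == -1 and db_jobid >= jobset_id):
        if req_jobsetid == -1:
            jobset_id = db_jobid + 1
            job_id = jobset_id + 1
        else:
            job_id = db_jobid + 1
    return job_id, jobset_id

print(next_ids(57, 10, -1))  # -> (59, 58): new jobset opened after the stored 57
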
-
- # get JobID for user
- def getJobIdUser(self,dn):
- comment = ' /* DBProxy.getJobIdUser */'
- _logger.debug("getJobIdUser %s" % dn)
- jobID = 0
- try:
- # set autocommit on
- self.conn.begin()
- # select
- name = self.cleanUserID(dn)
- sql = "SELECT jobid FROM ATLAS_PANDAMETA.users WHERE name=:name"
- varMap = {}
- varMap[':name'] = name
- self.cur.arraysize = 10
- self.cur.execute(sql+comment,varMap)
- res = self.cur.fetchone()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if res != None:
- jobID, = res
- _logger.debug("getJobIdUser %s -> %s" % (name,jobID))
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getJobIdUser : %s %s" % (errType,errValue))
- # roll back
- self._rollback()
- return jobID
-
-
- # check ban user
- def checkBanUser(self,dn,sourceLabel):
- comment = ' /* DBProxy.checkBanUser */'
- _logger.debug("checkBanUser %s %s" % (dn,sourceLabel))
- try:
- # set initial values
- retStatus = True
- # set autocommit on
- self.conn.begin()
- # select
- name = self.cleanUserID(dn)
- sql = "SELECT status FROM ATLAS_PANDAMETA.users WHERE name=:name"
- varMap = {}
- varMap[':name'] = name
- self.cur.execute(sql+comment,varMap)
- self.cur.arraysize = 10
- res = self.cur.fetchone()
- if res != None:
- # check status
- tmpStatus, = res
- if tmpStatus in ['disabled']:
- retStatus = False
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("checkBanUser %s %s Status=%s" % (dn,sourceLabel,retStatus))
- return retStatus
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("checkBanUser %s %s : %s %s" % (dn,sourceLabel,errType,errValue))
- # roll back
- self._rollback()
- return retStatus
-
-
- # get email address for a user
- def getEmailAddr(self,name):
- comment = ' /* DBProxy.getEmailAddr */'
- _logger.debug("get email for %s" % name)
- try:
- # set autocommit on
- self.conn.begin()
- # select
- sql = "SELECT email FROM ATLAS_PANDAMETA.users WHERE name=:name"
- varMap = {}
- varMap[':name'] = name
- self.cur.execute(sql+comment,varMap)
- self.cur.arraysize = 10
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if res != None and len(res) != 0:
- return res[0][0]
- # return empty string
- return ""
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getEmailAddr : %s %s" % (type,value))
- # roll back
- self._rollback()
- return ""
-
-
- # get client version
- def getPandaClientVer(self):
- comment = ' /* DBProxy.getPandaClientVer */'
- _logger.debug("getPandaClientVer")
- try:
- # set autocommit on
- self.conn.begin()
- # select
- sql = "SELECT pathena FROM ATLAS_PANDAMETA.pandaconfig WHERE name=:name"
- varMap = {}
- varMap[':name'] = 'current'
- self.cur.execute(sql+comment,varMap)
- self.cur.arraysize = 10
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retStr = ''
- if res != None and len(res) != 0:
- retStr = res[0][0]
- _logger.debug("getPandaClientVer -> %s" % retStr)
- return retStr
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandaClientVer : %s %s" % (type,value))
- return ""
-
-
- # add files to memcached
- def addFilesToMemcached(self,site,node,files):
- _logger.debug("addFilesToMemcached start %s %s" % (site,node))
- # memcached is unused
- if not panda_config.memcached_enable:
- _logger.debug("addFilesToMemcached skip %s %s" % (site,node))
- return True
- try:
- # initialize memcache if needed
- if self.memcache == None:
- from MemProxy import MemProxy
- self.memcache = MemProxy()
- # convert string to list
- fileList = files.split(',')
- # remove ''
- try:
- fileList.remove('')
- except:
- pass
- # empty list
- if len(fileList) == 0:
- _logger.debug("addFilesToMemcached skipped for empty list")
- return True
- # list of siteIDs
- siteIDs = site.split(',')
- # loop over all siteIDs
- for tmpSite in siteIDs:
- # add
- iFiles = 0
- nFiles = 100
- retS = True
- while iFiles < len(fileList):
- tmpRetS = self.memcache.setFiles(None,tmpSite,node,fileList[iFiles:iFiles+nFiles])
- if not tmpRetS:
- retS = False
- iFiles += nFiles
- _logger.debug("addFilesToMemcached done %s %s with %s" % (site,node,retS))
- return retS
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("addFilesToMemcached : %s %s" % (errType,errValue))
- return False
-
-
- # delete files from memcached
- def deleteFilesFromMemcached(self,site,node,files):
- _logger.debug("deleteFilesFromMemcached start %s %s" % (site,node))
- # memcached is unused
- if not panda_config.memcached_enable:
- _logger.debug("deleteFilesFromMemcached skip %s %s" % (site,node))
- return True
- try:
- # initialize memcache if needed
- if self.memcache == None:
- from MemProxy import MemProxy
- self.memcache = MemProxy()
- # list of siteIDs
- siteIDs = site.split(',')
- # loop over all siteIDs
- for tmpSite in siteIDs:
- # delete
- self.memcache.deleteFiles(tmpSite,node,files)
- _logger.debug("deleteFilesFromMemcached done %s %s" % (site,node))
- return True
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("deleteFilesFromMemcached : %s %s" % (errType,errValue))
- return False
-
-
- # flush memcached
- def flushMemcached(self,site,node):
- _logger.debug("flushMemcached start %s %s" % (site,node))
- # memcached is unused
- if not panda_config.memcached_enable:
- _logger.debug("flushMemcached skip %s %s" % (site,node))
- return True
- try:
- # initialize memcache if needed
- if self.memcache == None:
- from MemProxy import MemProxy
- self.memcache = MemProxy()
- # list of siteIDs
- siteIDs = site.split(',')
- # loop over all siteIDs
- for tmpSite in siteIDs:
- # flush
- self.memcache.flushFiles(tmpSite,node)
- _logger.debug("flushMemcached done %s %s" % (site,node))
- return True
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("flushMemcached : %s %s" % (errType,errValue))
- return False
-
-
- # check files with memcached
- def checkFilesWithMemcached(self,site,node,files):
- _logger.debug("checkFilesWithMemcached start %s %s" % (site,node))
- # convert string to list
- fileList = files.split(',')
- # remove ''
- try:
- fileList.remove('')
- except:
- pass
- # memcached is unused
- if not panda_config.memcached_enable:
- _logger.debug("checkFilesWithMemcached skip %s %s" % (site,node))
- # return 0
- retStr = ''
- for tmpF in fileList:
- retStr += '0,'
- retStr = retStr[:-1]
- return retStr
- try:
- # initialize memcache if needed
- if self.memcache == None:
- from MemProxy import MemProxy
- self.memcache = MemProxy()
- # empty list
- if len(fileList) == 0:
- _logger.debug("checkFilesWithMemcached skipped for empty list")
- return ''
- # check
- iFiles = 0
- nFiles = 100
- retS = ''
- while iFiles < len(fileList):
- retS += self.memcache.checkFiles(None,fileList[iFiles:iFiles+nFiles],site,node,getDetail=True)
- retS += ','
- iFiles += nFiles
- retS = retS[:-1]
- _logger.debug("checkFilesWithMemcached done %s %s with %s" % (site,node,retS))
- return retS
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("checkFilesWithMemcached : %s %s" % (errType,errValue))
- return False
-
-
- # register proxy key
- def registerProxyKey(self,params):
- comment = ' /* DBProxy.registerProxyKey */'
- _logger.debug("register ProxyKey %s" % str(params))
- try:
- # set autocommit on
- self.conn.begin()
- # construct SQL
- sql0 = 'INSERT INTO ATLAS_PANDAMETA.proxykey (id,'
- sql1 = 'VALUES (ATLAS_PANDAMETA.PROXYKEY_ID_SEQ.nextval,'
- vals = {}
- for key,val in params.iteritems():
- sql0 += '%s,' % key
- sql1 += ':%s,' % key
- vals[':%s' % key] = val
- sql0 = sql0[:-1]
- sql1 = sql1[:-1]
- sql = sql0 + ') ' + sql1 + ') '
- # insert
- self.cur.execute(sql+comment,vals)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return True
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("registerProxyKey : %s %s" % (type,value))
- # roll back
- self._rollback()
- return ""
-
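The INSERT above is assembled dynamically from the params dictionary, one column and one bind variable per key, with the id taken from the sequence. A standalone sketch of that construction (the params shown are illustrative, not a real proxy record):

def build_insert(params, table='ATLAS_PANDAMETA.proxykey',
                 seq='ATLAS_PANDAMETA.PROXYKEY_ID_SEQ'):
    cols = ['id']
    binds = ['%s.nextval' % seq]
    var_map = {}
    for key, val in params.items():
        cols.append(key)
        binds.append(':%s' % key)
        var_map[':%s' % key] = val
    sql = 'INSERT INTO %s (%s) VALUES (%s) ' % (table, ','.join(cols), ','.join(binds))
    return sql, var_map

sql, var_map = build_insert({'dn': '/DC=ch/CN=jdoe', 'credname': 'jdoe'})
print(sql)
print(var_map)
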
-
- # get proxy key
- def getProxyKey(self,dn):
- comment = ' /* DBProxy.getProxyKey */'
- _logger.debug("get ProxyKey %s" % dn)
- try:
- # set autocommit on
- self.conn.begin()
- # construct SQL
- sql = 'SELECT credname,expires,origin,myproxy FROM ATLAS_PANDAMETA.proxykey WHERE dn=:dn ORDER BY expires DESC'
- varMap = {}
- varMap[':dn'] = dn
- # select
- self.cur.execute(sql+comment,varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- retMap = {}
- if res != None and len(res) != 0:
- credname,expires,origin,myproxy = res[0]
- retMap['credname'] = credname
- retMap['expires'] = expires
- retMap['origin'] = origin
- retMap['myproxy'] = myproxy
- _logger.debug(retMap)
- return retMap
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getProxyKey : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # check site access
- def checkSiteAccess(self,siteid,longDN):
- comment = ' /* DBProxy.checkSiteAccess */'
- _logger.debug("checkSiteAccess %s:%s" % (siteid,longDN))
- try:
- # use compact DN
- dn = self.cleanUserID(longDN)
- # construct SQL
- sql = 'SELECT poffset,rights,status,workingGroups FROM ATLAS_PANDAMETA.siteaccess WHERE dn=:dn AND pandasite=:pandasite'
- varMap = {}
- varMap[':dn'] = dn
- varMap[':pandasite'] = siteid
- # set autocommit on
- self.conn.begin()
- # select
- self.cur.execute(sql+comment,varMap)
- self.cur.arraysize = 10
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- retMap = {}
- if res != None and len(res) != 0:
- poffset,rights,status,workingGroups = res[0]
- retMap['poffset'] = poffset
- retMap['rights'] = rights
- retMap['status'] = status
- if workingGroups in ['',None]:
- workingGroups = []
- else:
- workingGroups = workingGroups.split(',')
- retMap['workingGroups'] = workingGroups
- _logger.debug(retMap)
- return retMap
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("checkSiteAccess : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # add account to siteaccess
- def addSiteAccess(self,siteID,longDN):
- comment = ' /* DBProxy.addSiteAccess */'
- _logger.debug("addSiteAccess : %s %s" % (siteID,longDN))
- try:
- # use compact DN
- dn = self.cleanUserID(longDN)
- # set autocommit on
- self.conn.begin()
- # select
- sql = 'SELECT status FROM ATLAS_PANDAMETA.siteaccess WHERE dn=:dn AND pandasite=:pandasite'
- varMap = {}
- varMap[':dn'] = dn
- varMap[':pandasite'] = siteID
- self.cur.execute(sql+comment,varMap)
- self.cur.arraysize = 10
- res = self.cur.fetchone()
- if res != None:
- _logger.debug("account already exists with status=%s" % res[0])
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return res[0]
- # add
- sql = 'INSERT INTO ATLAS_PANDAMETA.siteaccess (id,dn,pandasite,status,created) VALUES (ATLAS_PANDAMETA.SITEACCESS_ID_SEQ.nextval,:dn,:pandasite,:status,CURRENT_DATE)'
- varMap = {}
- varMap[':dn'] = dn
- varMap[':pandasite'] = siteID
- varMap[':status'] = 'requested'
- self.cur.execute(sql+comment,varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("account was added")
- return 0
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("addSiteAccess : %s %s" % (type,value))
- # return None
- return -1
-
-
- # list site access
- def listSiteAccess(self,siteid=None,dn=None,longFormat=False):
- comment = ' /* DBProxy.listSiteAccess */'
- _logger.debug("listSiteAccess %s:%s" % (siteid,dn))
- try:
- if siteid==None and dn==None:
- return []
- longAttributes = 'status,poffset,rights,workingGroups,created'
- # set autocommit on
- self.conn.begin()
- # construct SQL
- if siteid != None:
- varMap = {':pandasite':siteid}
- if not longFormat:
- sql = 'SELECT dn,status FROM ATLAS_PANDAMETA.siteaccess WHERE pandasite=:pandasite ORDER BY dn'
- else:
- sql = 'SELECT dn,%s FROM ATLAS_PANDAMETA.siteaccess ' % longAttributes
- sql += 'WHERE pandasite=:pandasite ORDER BY dn'
- else:
- shortDN = self.cleanUserID(dn)
- varMap = {':dn':shortDN}
- if not longFormat:
- sql = 'SELECT pandasite,status FROM ATLAS_PANDAMETA.siteaccess WHERE dn=:dn ORDER BY pandasite'
- else:
- sql = 'SELECT pandasite,%s FROM ATLAS_PANDAMETA.siteaccess ' % longAttributes
- sql += 'WHERE dn=:dn ORDER BY pandasite'
- # select
- self.cur.execute(sql+comment,varMap)
- self.cur.arraysize = 1000
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- ret = []
- if res != None and len(res) != 0:
- for tmpRes in res:
- if not longFormat:
- ret.append(tmpRes)
- else:
- # create map for long format
- tmpRetMap = {}
- # use first value as a primary key
- tmpRetMap['primKey'] = tmpRes[0]
- idxVal = 1
- for tmpKey in longAttributes.split(','):
- tmpRetMap[tmpKey] = tmpRes[idxVal]
- idxVal += 1
- ret.append(tmpRetMap)
- _logger.debug(ret)
- return ret
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("listSiteAccess : %s %s" % (type,value))
- return []
-
-
- # update site access
- def updateSiteAccess(self,method,siteid,requesterDN,userName,attrValue):
- comment = ' /* DBProxy.updateSiteAccess */'
- _logger.debug("updateSiteAccess %s:%s:%s:%s:%s" % (method,siteid,requesterDN,userName,attrValue))
- try:
- # set autocommit on
- self.conn.begin()
- # check existence
- varMap = {}
- varMap[':pandasite'] = siteid
- varMap[':dn'] = userName
- sql = 'SELECT count(*) FROM ATLAS_PANDAMETA.siteaccess WHERE pandasite=:pandasite AND dn=:dn'
- self.cur.execute(sql+comment,varMap)
- self.cur.arraysize = 10
- res = self.cur.fetchall()
- if res == None or res[0][0] == 0:
- _logger.error("updateSiteAccess : No request for %s" % varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return 'No request for %s:%s' % (siteid,userName)
- # get cloud
- varMap = {':pandasite':siteid}
- sql = 'SELECT cloud,dn FROM ATLAS_PANDAMETA.schedconfig WHERE siteid=:pandasite AND rownum<=1'
- self.cur.execute(sql+comment,varMap)
- res = self.cur.fetchall()
- if res == None or len(res) == 0:
- _logger.error("updateSiteAccess : No cloud in schedconfig for %s" % siteid)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return "No cloud in schedconfig for %s" % siteid
- cloud = res[0][0]
- siteContact = res[0][1]
- # get cloud responsible
- varMap = {':cloud':cloud}
- sql = 'SELECT dn FROM ATLAS_PANDAMETA.cloudconfig WHERE name=:cloud'
- self.cur.execute(sql+comment,varMap)
- res = self.cur.fetchall()
- if res == None or len(res) == 0:
- _logger.error("updateSiteAccess : No contact in cloudconfig for %s" % cloud)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- return "No contact in cloudconfig for %s" % cloud
- contactNames = res[0][0]
- if contactNames in [None,'']:
- contactNames = []
- else:
- contactNames = contactNames.split(',')
- # get site responsible
- if not siteContact in [None,'']:
- contactNames += siteContact.split(',')
- # check privilege
- if not self.cleanUserID(requesterDN) in contactNames:
- _logger.error("updateSiteAccess : %s is not one of contacts %s" % (requesterDN,str(contactNames)))
- # return
- return "Insufficient privilege"
- # update
- varMap = {}
- varMap[':pandasite'] = siteid
- varMap[':dn'] = userName
- if method in ['approve','reject']:
- # update status
- sql = 'UPDATE ATLAS_PANDAMETA.siteaccess SET status=:newStatus WHERE pandasite=:pandasite AND dn=:dn'
- if method == 'approve':
- varMap[':newStatus'] = 'tobeapproved'
- else:
- varMap[':newStatus'] = 'toberejected'
- elif method == 'delete':
- # delete
- sql = 'DELETE FROM ATLAS_PANDAMETA.siteaccess WHERE pandasite=:pandasite AND dn=:dn'
- elif method == 'set':
- # check value
- if re.search('^[a-z,A-Z]+:[a-z,A-Z,0-9,\,_\-]+$',attrValue) == None:
- errStr = "Invalid argument for set : %s. Must be key:value" % attrValue
- _logger.error("updateSiteAccess : %s" % errStr)
- # return
- return errStr
- # decompose to key and value
- tmpKey = attrValue.split(':')[0].lower()
- tmpVal = attrValue.split(':')[-1]
- # check key
- changeableKeys = ['poffset','workinggroups','rights']
- if not tmpKey in changeableKeys:
- errStr = "%s cannot be set. Only %s are allowed" % (tmpKey,str(changeableKeys))
- _logger.error("updateSiteAccess : %s" % errStr)
- # return
- return errStr
- # set value map
- varMap[':%s' % tmpKey] = tmpVal
- sql = 'UPDATE ATLAS_PANDAMETA.siteaccess SET %s=:%s WHERE pandasite=:pandasite AND dn=:dn' % (tmpKey,tmpKey)
- else:
- _logger.error("updateSiteAccess : Unknown method '%s'" % method)
- # return
- return "Unknown method '%s'" % method
- # execute
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("updateSiteAccess : completed")
- return True
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("updateSiteAccess : %s %s" % (type,value))
- return 'DB error %s %s' % (type,value)
-
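The 'set' branch above only accepts an argument of the form key:value and restricts the key to poffset, workinggroups and rights. A small validation sketch using the same regex and whitelist:

import re

CHANGEABLE_KEYS = ['poffset', 'workinggroups', 'rights']

def parse_set_arg(attr_value):
    # returns ((key, value), None) on success or (None, error message)
    if re.search(r'^[a-z,A-Z]+:[a-z,A-Z,0-9,\,_\-]+$', attr_value) is None:
        return None, 'Invalid argument for set : %s. Must be key:value' % attr_value
    key = attr_value.split(':')[0].lower()
    val = attr_value.split(':')[-1]
    if key not in CHANGEABLE_KEYS:
        return None, '%s cannot be set. Only %s are allowed' % (key, CHANGEABLE_KEYS)
    return (key, val), None

print(parse_set_arg('rights:any'))        # -> (('rights', 'any'), None)
print(parse_set_arg('status:approved'))   # -> (None, 'status cannot be set. ...')
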
-
- # get list of archived tables
- def getArchiveTables(self):
- # return
- return ['ATLAS_PANDAARCH.jobsArchived']
-
-
- # get JobIDs in a time range
- def getJobIDsInTimeRangeLog(self,dn,timeRange,retJobIDs):
- comment = ' /* DBProxy.getJobIDsInTimeRangeLog */'
- _logger.debug("getJobIDsInTimeRangeLog : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))
- try:
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- # get list of archived tables
- tables = self.getArchiveTables()
- # select
- for table in tables:
- # make sql
- sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODSOURCELABEL_IDX JOBS_PRODUSERNAME_IDX) */ "
- sql += "jobDefinitionID FROM %s tab " % table
- sql += "WHERE prodUserName=:prodUserName AND modificationTime>:modificationTime "
- sql += "AND prodSourceLabel=:prodSourceLabel GROUP BY jobDefinitionID"
- varMap = {}
- varMap[':prodUserName'] = compactDN
- varMap[':prodSourceLabel'] = 'user'
- varMap[':modificationTime'] = timeRange
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID, in resList:
- if not tmpID in retJobIDs:
- retJobIDs.append(tmpID)
- _logger.debug("getJobIDsInTimeRangeLog : %s" % str(retJobIDs))
- return retJobIDs
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobIDsInTimeRangeLog : %s %s" % (type,value))
- # return empty list
- return retJobIDs
-
-
- # get PandaIDs for a JobID
- def getPandIDsWithJobIDLog(self,dn,jobID,idStatus,nJobs,buildJobID=None):
- comment = ' /* Proxy.getPandIDsWithJobIDLog */'
- _logger.debug("getPandIDsWithJobIDLog : %s %s" % (dn,jobID))
- try:
- # get compact DN
- compactDN = self.cleanUserID(dn)
- if compactDN in ['','NULL',None]:
- compactDN = dn
- # get list of archived tables
- tables = self.getArchiveTables()
- # select
- for table in tables:
- # skip if all jobs have already been retrieved
- if nJobs > 0 and len(idStatus) >= nJobs:
- continue
- # make sql
- sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ "
- sql += "PandaID,jobStatus,commandToPilot,prodSourceLabel,taskBufferErrorCode FROM %s tab " % table
- sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>(CURRENT_DATE-30) "
- varMap = {}
- varMap[':prodUserName'] = compactDN
- varMap[':jobDefinitionID'] = jobID
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- # select
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID,tmpStatus,tmpCommand,tmpProdSourceLabel,tmpTaskBufferErrorCode in resList:
- # ignore jobs retried by pilot since they have new PandaIDs with the same jobsetID/jobdefID
- if tmpTaskBufferErrorCode in [ErrorCode.EC_PilotRetried]:
- continue
- # ignore old buildJob which was replaced by rebrokerage
- if tmpProdSourceLabel == 'panda':
- if buildJobID == None:
- # first buildJob
- buildJobID = tmpID
- elif buildJobID >= tmpID:
- # don't append old one
- continue
- else:
- # delete old one
- del idStatus[buildJobID]
- buildJobID = tmpID
- # append
- if not idStatus.has_key(tmpID):
- idStatus[tmpID] = (tmpStatus,tmpCommand)
- _logger.debug("getPandIDsWithJobIDLog : %s" % str(idStatus))
- return idStatus
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandIDsWithJobIDLog : %s %s" % (type,value))
- # return empty list
- return {}
-
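The filtering above keeps only the newest 'panda'-labelled (build) PandaID, since older build jobs were superseded by rebrokerage, and drops jobs retried by the pilot. A condensed version of that loop (EC_PILOT_RETRIED is a stand-in value, not the real ErrorCode constant):

EC_PILOT_RETRIED = 105   # stand-in for ErrorCode.EC_PilotRetried

def collect_ids(rows, id_status=None, build_job_id=None):
    # rows: (PandaID, jobStatus, commandToPilot, prodSourceLabel, taskBufferErrorCode)
    id_status = {} if id_status is None else id_status
    for panda_id, status, command, label, tb_error in rows:
        if tb_error == EC_PILOT_RETRIED:
            continue                          # retried by pilot -> a newer PandaID exists
        if label == 'panda':
            if build_job_id is None:
                build_job_id = panda_id       # first build job seen
            elif build_job_id >= panda_id:
                continue                      # older build job, ignore
            else:
                del id_status[build_job_id]   # superseded build job
                build_job_id = panda_id
        id_status.setdefault(panda_id, (status, command))
    return id_status, build_job_id

rows = [(100, 'finished', None, 'panda', None),
        (101, 'finished', None, 'user', None),
        (110, 'running', None, 'panda', None)]
print(collect_ids(rows))   # -> ({101: ..., 110: ...}, 110)
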
-
- # get PandaIDs for a JobsetID or JobdefID in jobsArchived
- def getPandIDsWithIdInArch(self,prodUserName,id,isJobset):
- comment = ' /* Proxy.getPandIDsWithIdInArch */'
- _logger.debug("getPandIDsWithIdInArch : %s %s %s" % (prodUserName,id,isJobset))
- try:
- # make sql
- if isJobset:
- sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBSETID_IDX) */ "
- else:
- sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ "
- sql += "PandaID FROM ATLAS_PANDAARCH.jobsArchived tab "
- sql += "WHERE prodUserName=:prodUserName "
- sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>(CURRENT_DATE-30) "
- if isJobset:
- sql += "AND jobsetID=:jobID "
- else:
- sql += "AND jobDefinitionID=:jobID "
- varMap = {}
- varMap[':prodUserName'] = prodUserName
- varMap[':jobID'] = id
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 1000000
- # select
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- pandaIDs = []
- for tmpID, in resList:
- pandaIDs.append(tmpID)
- _logger.debug("getPandIDsWithIdInArch : %s %s -> %s" % (prodUserName,id,str(pandaIDs)))
- return pandaIDs
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getPandIDsWithIdInArch : %s %s" % (errType,errValue))
- # return empty list
- return []
-
-
- # peek at job
- def peekJobLog(self,pandaID):
- comment = ' /* DBProxy.peekJobLog */'
- _logger.debug("peekJobLog : %s" % pandaID)
- # return None for NULL PandaID
- if pandaID in ['NULL','','None',None]:
- return None
- sql1_0 = "SELECT %s FROM %s "
- sql1_1 = "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30) "
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- nTry=3
- for iTry in range(nTry):
- try:
- # get list of archived tables
- tables = self.getArchiveTables()
- # select
- for table in tables:
- # start transaction
- self.conn.begin()
- # select
- sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1
- self.cur.arraysize = 10
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if len(res) != 0:
- # Job
- job = JobSpec()
- job.pack(res[0])
- # Files
- # start transaction
- self.conn.begin()
- # select
- fileTableName = re.sub('jobsArchived','filesTable_ARCH',table)
- sqlFile = "SELECT /*+ INDEX(tab FILES_ARCH_PANDAID_IDX)*/ %s " % FileSpec.columnNames()
- sqlFile+= "FROM %s tab " % fileTableName
- # put constraint on modificationTime to avoid full table scan
- sqlFile+= "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-60)"
- self.cur.arraysize = 10000
- self.cur.execute(sqlFile+comment, varMap)
- resFs = self.cur.fetchall()
- # metadata
- job.metadata = None
- metaTableName = re.sub('jobsArchived','metaTable_ARCH',table)
- sqlMeta = "SELECT metaData FROM %s WHERE PandaID=:PandaID" % metaTableName
- self.cur.execute(sqlMeta+comment, varMap)
- for clobMeta, in self.cur:
- if clobMeta != None:
- job.metadata = clobMeta.read()
- break
- # job parameters
- job.jobParameters = None
- jobParamTableName = re.sub('jobsArchived','jobParamsTable_ARCH',table)
- sqlJobP = "SELECT jobParameters FROM %s WHERE PandaID=:PandaID" % jobParamTableName
- varMap = {}
- varMap[':PandaID'] = job.PandaID
- self.cur.execute(sqlJobP+comment, varMap)
- for clobJobP, in self.cur:
- if clobJobP != None:
- job.jobParameters = clobJobP.read()
- break
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # set files
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- # remove redundant white spaces
- try:
- file.md5sum = file.md5sum.strip()
- except:
- pass
- try:
- file.checksum = file.checksum.strip()
- except:
- pass
- job.addFile(file)
- return job
- _logger.debug("peekJobLog() : PandaID %s not found" % pandaID)
- return None
- except:
- # roll back
- self._rollback()
- if iTry+1 < nTry:
- _logger.error("peekJobLog : %s" % pandaID)
- time.sleep(random.randint(10,20))
- continue
- type, value, traceBack = sys.exc_info()
- _logger.error("peekJobLog : %s %s" % (type,value))
- # return None
- return None
-
-
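
For reference, peekJobLog reaches the file, metadata and job-parameter side tables by rewriting the jobsArchived table name, so the *_ARCH tables always stay paired with the job partition returned by getArchiveTables. A minimal sketch of that substitution; the table name below is only an illustration:

import re

table = 'ATLAS_PANDAARCH.jobsArchived'   # illustrative name, not a real partition
print(re.sub('jobsArchived', 'filesTable_ARCH', table))      # ATLAS_PANDAARCH.filesTable_ARCH
print(re.sub('jobsArchived', 'metaTable_ARCH', table))       # ATLAS_PANDAARCH.metaTable_ARCH
print(re.sub('jobsArchived', 'jobParamsTable_ARCH', table))  # ATLAS_PANDAARCH.jobParamsTable_ARCH
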
- # get user subscriptions
- def getUserSubscriptions(self,datasetName,timeRange):
- comment = ' /* DBProxy.getUserSubscriptions */'
- _logger.debug("getUserSubscriptions(%s,%s)" % (datasetName,timeRange))
- sql0 = "SELECT site FROM ATLAS_PANDAMETA.UserSubs "
- sql0 += "WHERE datasetName=:datasetName and modificationDate>CURRENT_DATE-:timeRange"
- varMap = {}
- varMap[':datasetName'] = datasetName
- varMap[':timeRange'] = timeRange
- try:
- # start transaction
- self.conn.begin()
- # select
- self.cur.execute(sql0+comment, varMap)
- resSs = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retList = []
- for tmpSite, in resSs:
- retList.append(tmpSite)
- return retList
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getUserSubscriptions : %s %s" % (errType,errValue))
- return []
-
-
- # get the number of user subscriptions
- def getNumUserSubscriptions(self):
- comment = ' /* DBProxy.getNumUserSubscriptions */'
- _logger.debug("getNumUserSubscriptions")
- sql0 = "SELECT site,COUNT(*) FROM ATLAS_PANDAMETA.UserSubs "
- sql0 += "WHERE creationDate>CURRENT_DATE-2 GROUP BY site"
- try:
- # start transaction
- self.conn.begin()
- # select
- self.cur.execute(sql0+comment,{})
- resSs = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retList = {}
- for tmpSite,countNum in resSs:
- retList[tmpSite] = countNum
- return retList
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getNumUserSubscriptions : %s %s" % (errType,errValue))
- return {}
-
-
- # add user subscriptions
- def addUserSubscription(self,datasetName,dq2IDs):
- comment = ' /* DBProxy.addUserSubscription */'
- _logger.debug("addUserSubscription(%s,%s)" % (datasetName,dq2IDs))
- sql0 = "INSERT INTO ATLAS_PANDAMETA.UserSubs "
- sql0 += "(datasetName,site,creationDate,modificationDate,nUsed) "
- sql0 += "VALUES (:datasetName,:site,CURRENT_DATE,CURRENT_DATE,:nUsed)"
- try:
- # start transaction
- self.conn.begin()
- for site in dq2IDs:
- varMap = {}
- varMap[':datasetName'] = datasetName
- varMap[':site'] = site
- varMap[':nUsed'] = 0
- # insert
- self.cur.execute(sql0+comment, varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return True
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("addUserSubscription : %s %s" % (errType,errValue))
- return False
-
-
- # increment counter for subscription
- def incrementUsedCounterSubscription(self,datasetName):
- comment = ' /* DBProxy.incrementUsedCounterSubscription */'
- _logger.debug("incrementUsedCounterSubscription(%s)" % datasetName)
- sql0 = "UPDATE ATLAS_PANDAMETA.UserSubs SET nUsed=nUsed+1 "
- sql0 += "WHERE datasetName=:datasetName AND nUsed IS NOT NULL"
- sqlU = "SELECT MAX(nUsed) FROM ATLAS_PANDAMETA.UserSubs "
- sqlU += "WHERE datasetName=:datasetName"
- try:
- # start transaction
- self.conn.begin()
- varMap = {}
- varMap[':datasetName'] = datasetName
- # update
- self.cur.execute(sql0+comment,varMap)
- # get nUsed
- nUsed = 0
- retU = self.cur.rowcount
- if retU > 0:
- # get nUsed
- self.cur.execute(sqlU+comment,varMap)
- self.cur.arraysize = 10
- res = self.cur.fetchone()
- if res != None:
- nUsed = res[0]
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return nUsed
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("incrementUsedCounterSubscription : %s %s" % (errType,errValue))
- return -1
-
-
- # get active datasets
- def getActiveDatasets(self,computingSite,prodSourceLabel):
- comment = ' /* DBProxy.getActiveDatasets */'
- _logger.debug("getActiveDatasets(%s,%s)" % (computingSite,prodSourceLabel))
- varMap = {}
- varMap[':computingSite'] = computingSite
- varMap[':jobStatus1'] = 'assigned'
- varMap[':jobStatus2'] = 'activated'
- varMap[':jobStatus3'] = 'waiting'
- varMap[':prodSourceLabel'] = prodSourceLabel
- try:
- retList = []
- for table in ['jobsActive4','jobsDefined4','jobsWaiting4']:
- sql0 = "SELECT distinct prodDBlock FROM ATLAS_PANDA.%s " % table
- sql0 += "WHERE computingSite=:computingSite AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3) "
- sql0 += "AND prodSourceLabel=:prodSourceLabel"
- # start transaction
- self.conn.begin()
- # select
- self.cur.execute(sql0+comment, varMap)
- resSs = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for prodDBlock, in resSs:
- if not prodDBlock in retList:
- retList.append(prodDBlock)
- # make string
- retStr = ''
- for tmpItem in retList:
- retStr += '%s,' % tmpItem
- retStr = retStr[:-1]
- return retStr
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getActiveDatasets : %s %s" % (errType,errValue))
- return ""
-
-
- # check status of all sub datasets to trigger Notifier
- def checkDatasetStatusForNotifier(self,jobsetID,jobDefinitionID,prodUserName):
- comment = ' /* DBProxy.checkDatasetStatusForNotifier */'
- _logger.debug("checkDatasetStatusForNotifier(%s,%s,%s)" % (jobsetID,jobDefinitionID,prodUserName))
- try:
- # get PandaIDs to get all associated destinationDBlocks
- varMap = {}
- varMap[':jobsetID'] = jobsetID
- varMap[':prodUserName'] = prodUserName
- sql = "SELECT MAX(PandaID),jobDefinitionID FROM %s WHERE prodUserName=:prodUserName AND jobsetID=:jobsetID GROUP BY jobDefinitionID"
- pandaIDs = {}
- for table in ['ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsWaiting4']:
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 1000
- self.cur.execute((sql % table)+comment, varMap)
- resSs = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # get PandaIDs
- for tmpPandaID,tmpJobDefID in resSs:
- if (not pandaIDs.has_key(tmpJobDefID)) or tmpPandaID > pandaIDs[tmpJobDefID]:
- pandaIDs[tmpJobDefID] = tmpPandaID
- # get all destinationDBlocks
- varMap = {}
- varMap[':type1'] = 'log'
- varMap[':type2'] = 'output'
- sql = 'SELECT DISTINCT destinationDBlock FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type IN (:type1,:type2)'
- datasetMap = {}
- # start transaction
- self.conn.begin()
- self.cur.arraysize = 1000
- for tmpJobDefID,tmpPandaID in pandaIDs.iteritems():
- varMap[':PandaID'] = tmpPandaID
- # select
- self.cur.execute(sql+comment, varMap)
- resSs = self.cur.fetchall()
- # get destinationDBlock
- for tmpDestDBlock, in resSs:
- if not datasetMap.has_key(tmpJobDefID):
- datasetMap[tmpJobDefID] = []
- if not tmpDestDBlock in datasetMap[tmpJobDefID]:
- datasetMap[tmpJobDefID].append(tmpDestDBlock)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # check dataset status
- allClosed = True
- retInfo = {}
- latestUpdate = None
- latestJobDefID = None
- varMap = {}
- varMap[':type1'] = 'log'
- varMap[':type2'] = 'output'
- sql = 'SELECT status,modificationDate FROM ATLAS_PANDA.Datasets WHERE name=:name AND type IN (:type1,:type2)'
- sqlJ = "SELECT MAX(modificationTime) FROM ATLAS_PANDA.jobsArchived4 "
- sqlJ += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID"
- # start transaction
- self.conn.begin()
- self.cur.arraysize = 1000
- for tmpJobDefID,tmpDatasets in datasetMap.iteritems():
- retInfo[tmpJobDefID] = []
- for tmpDataset in tmpDatasets:
- if not tmpDataset in retInfo[tmpJobDefID]:
- retInfo[tmpJobDefID].append(tmpDataset)
- varMap[':name'] = tmpDataset
- # select
- self.cur.execute(sql+comment, varMap)
- resSs = self.cur.fetchall()
- # check status and mod time
- for tmpStatus,tmpModificationDate in resSs:
- _logger.debug("checkDatasetStatusForNotifier(%s,%s) %s has %s with %s at %s" % \
- (jobsetID,jobDefinitionID,tmpJobDefID,tmpDataset,tmpStatus,tmpModificationDate))
- if not tmpStatus in ['closed','tobeclosed','completed']:
- # some datasets are still active
- allClosed = False
- _logger.debug("checkDatasetStatusForNotifier(%s,%s) wait due to %s %s %s" % \
- (jobsetID,jobDefinitionID,tmpJobDefID,tmpDataset,tmpStatus))
- break
- elif tmpStatus == 'tobeclosed':
- # select latest modificationTime in job table
- varMapJ = {}
- varMapJ[':prodUserName'] = prodUserName
- varMapJ[':jobDefinitionID'] = tmpJobDefID
- self.cur.execute(sqlJ+comment, varMapJ)
- resJ = self.cur.fetchone()
- if resJ == None:
- # error
- allClosed = False
- _logger.error("checkDatasetStatusForNotifier(%s,%s) %s cannot find job" % \
- (jobsetID,jobDefinitionID,tmpJobDefID))
- break
- tmpModificationTime, = resJ
- _logger.debug("checkDatasetStatusForNotifier(%s,%s) %s modtime:%s" % \
- (jobsetID,jobDefinitionID,tmpJobDefID,tmpModificationTime))
- if latestUpdate == None or latestUpdate < tmpModificationTime:
- # use the latest updated jobDefID
- latestUpdate = tmpModificationTime
- latestJobDefID = tmpJobDefID
- elif latestUpdate == tmpModificationTime and latestJobDefID < tmpJobDefID:
- # use larger jobDefID when datasets are closed at the same time
- latestJobDefID = tmpJobDefID
- # escape
- if not allClosed:
- break
- # escape
- if not allClosed:
- break
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- _logger.debug("checkDatasetStatusForNotifier(%s,%s) -> all:%s %s latest:%s" % \
- (jobsetID,jobDefinitionID,allClosed,latestJobDefID,
- jobDefinitionID == latestJobDefID))
- # return
- if not allClosed or jobDefinitionID != latestJobDefID:
- return False,{}
- return True,retInfo
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("checkDatasetStatusForNotifier : %s %s" % (errType,errValue))
- return False,{}
-
-
- # get MoU share for T2 PD2P
- def getMouShareForT2PD2P(self):
- comment = ' /* DBProxy.getMouShareForT2PD2P */'
- _logger.debug("getMouShareForT2PD2P start")
- sqlG = "SELECT gid,ntup_share FROM ATLAS_GRISLI.t_tier2_groups "
- sqlT = "SELECT tier2,t2group,status FROM ATLAS_GRISLI.t_m4regions_replication"
- try:
- # start transaction
- self.conn.begin()
- self.cur.arraysize = 100000
- # get weight for each group
- self.cur.execute(sqlG+comment)
- resG = self.cur.fetchall()
- gidShareMap = {}
- for gid,ntup_share in resG:
- gidShareMap[gid] = {'ntup_share':ntup_share,'nSites':0}
- # get group for each site
- self.cur.execute(sqlT+comment)
- resT = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- siteGroupMap = {}
- # loop over all sites
- for tier2,t2group,t2status in resT:
- # unknown group
- if not gidShareMap.has_key(t2group):
- _logger.error("getMouShareForT2PD2P unknown group %s for %s" % (t2group,tier2))
- continue
- # use only DATADISK
- if not tier2.endswith('_DATADISK'):
- continue
- # count the number of ready sites per group
- if t2status in ['ready']:
- gidShareMap[t2group]['nSites'] += 1
- # append
- siteGroupMap[tier2] = {'group':t2group,'status':t2status}
- # normalize
- _logger.debug("getMouShareForT2PD2P normalize factor = %s" % str(gidShareMap))
- weightsMap = {}
- for tier2,t2Val in siteGroupMap.iteritems():
- t2group = t2Val['group']
- t2status = t2Val['status']
- if gidShareMap[t2group]['ntup_share'] == 0:
- # set 0 to be skipped in the brokerage
- tmpWeight = 0
- elif gidShareMap[t2group]['nSites'] > 0:
- # normalize
- tmpWeight = float(gidShareMap[t2group]['ntup_share']) / float(gidShareMap[t2group]['nSites'])
- else:
- # no site is ready in this group
- tmpWeight = 0
- weightsMap[tier2] = {'weight':tmpWeight,'status':t2status}
- _logger.debug("getMouShareForT2PD2P -> %s" % str(weightsMap))
- return weightsMap
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getMouShareForT2PD2P : %s %s" % (errType,errValue))
- return {}
-
-
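
The weight normalization in getMouShareForT2PD2P above splits each group's ntup_share evenly over the group's ready DATADISK endpoints; a zero share, or a group with no ready site, yields weight 0 so the brokerage skips it. A standalone sketch with made-up sample data (the real rows come from the ATLAS_GRISLI tables):

gidShareMap = {1: {'ntup_share': 10, 'nSites': 0},
               2: {'ntup_share': 0,  'nSites': 0}}
siteRows = [('SITEA_DATADISK', 1, 'ready'),     # fictitious endpoints
            ('SITEB_DATADISK', 1, 'ready'),
            ('SITEC_DATADISK', 2, 'ready')]

# count ready DATADISK endpoints per group
for tier2, gid, status in siteRows:
    if tier2.endswith('_DATADISK') and status == 'ready':
        gidShareMap[gid]['nSites'] += 1

# normalize: each group's share divided by its number of ready sites
weightsMap = {}
for tier2, gid, status in siteRows:
    share = gidShareMap[gid]['ntup_share']
    nSites = gidShareMap[gid]['nSites']
    weight = float(share) / nSites if share and nSites else 0
    weightsMap[tier2] = {'weight': weight, 'status': status}

print(weightsMap)   # SITEA and SITEB get 5.0 each, SITEC gets 0
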
- # record status change
- def recordStatusChange(self,pandaID,jobStatus,jobInfo=None,infoMap={}):
- comment = ' /* DBProxy.recordStatusChange */'
- # check config
- if not hasattr(panda_config,'record_statuschange') or panda_config.record_statuschange != True:
- return
- # get job info
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':jobStatus'] = jobStatus
- varMap[':modificationHost'] = self.myHostName
- if jobInfo != None:
- varMap[':computingSite'] = jobInfo.computingSite
- varMap[':cloud'] = jobInfo.cloud
- varMap[':prodSourceLabel'] = jobInfo.prodSourceLabel
- elif infoMap:
- varMap[':computingSite'] = infoMap['computingSite']
- varMap[':cloud'] = infoMap['cloud']
- varMap[':prodSourceLabel'] = infoMap['prodSourceLabel']
- else:
- # no info
- return
- # convert NULL to None
- for tmpKey in varMap.keys():
- if varMap[tmpKey] == 'NULL':
- varMap[tmpKey] = None
- # insert
- sql = "INSERT INTO ATLAS_PANDA.jobs_StatusLog "
- sql += "(PandaID,modificationTime,jobStatus,prodSourceLabel,cloud,computingSite,modificationHost) "
- sql += "VALUES (:PandaID,CURRENT_DATE,:jobStatus,:prodSourceLabel,:cloud,:computingSite,:modificationHost) "
- try:
- # start transaction
- self.conn.begin()
- self.cur.execute(sql+comment,varMap)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- except:
- # roll back
- self._rollback()
- errType,errValue = sys.exc_info()[:2]
- _logger.error("recordStatusChange %s %s: %s %s" % (pandaID,jobStatus,errType,errValue))
- return
-
-
- # wake up connection
- def wakeUp(self):
- for iTry in range(5):
- try:
- # check if the connection is working
- self.conn.ping()
- return
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("wakeUp %d : %s %s" % (iTry,type,value))
- # wait for reconnection
- time.sleep(1)
- self.connect(reconnect=True)
-
-
- # commit
- def _commit(self):
- try:
- self.conn.commit()
- return True
- except:
- _logger.error("commit error")
- return False
-
-
- # rollback
- def _rollback(self,useOtherError=False):
- retVal = True
- # rollback
- _logger.debug("rollback")
- try:
- self.conn.rollback()
- except:
- _logger.error("rollback error")
- retVal = False
- # reconnect if needed
- try:
- # get ORA ErrorCode
- errType,errValue = sys.exc_info()[:2]
- oraErrCode = str(errValue).split()[0]
- oraErrCode = oraErrCode[:-1]
- _logger.debug("rollback EC:%s %s" % (oraErrCode,errValue))
- # error codes for connection error
- error_Codes = ['ORA-01012','ORA-01033','ORA-01034','ORA-01089',
- 'ORA-03113','ORA-03114','ORA-12203','ORA-12500',
- 'ORA-12571','ORA-03135','ORA-25402']
- # other errors are apparently given when the connection has lost contact
- if useOtherError:
- error_Codes += ['ORA-01861','ORA-01008']
- if oraErrCode in error_Codes:
- # reconnect
- retFlag = self.connect(reconnect=True)
- _logger.debug("rollback reconnected %s" % retFlag)
- except:
- pass
- # return
- return retVal
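
The reconnect decision in _rollback above boils down to pulling the leading ORA-NNNNN code out of the cx_Oracle error text and checking it against the list of connection-loss codes. A minimal standalone sketch of that check; the sample messages are made up:

import re

RECONNECT_CODES = ['ORA-01012', 'ORA-01033', 'ORA-01034', 'ORA-01089',
                   'ORA-03113', 'ORA-03114', 'ORA-12203', 'ORA-12500',
                   'ORA-12571', 'ORA-03135', 'ORA-25402']

def needs_reconnect(err_text):
    # take the leading 'ORA-NNNNN' token, if any, and look it up
    match = re.match(r'(ORA-\d+)', str(err_text))
    return match is not None and match.group(1) in RECONNECT_CODES

print(needs_reconnect('ORA-03113: end-of-file on communication channel'))  # True
print(needs_reconnect('ORA-00001: unique constraint violated'))            # False
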
diff --git a/current/pandaserver/taskbuffer/OraLogDBProxy.py b/current/pandaserver/taskbuffer/OraLogDBProxy.py
deleted file mode 100755
index 8f397db40..000000000
--- a/current/pandaserver/taskbuffer/OraLogDBProxy.py
+++ /dev/null
@@ -1,727 +0,0 @@
-"""
-proxy for log database connection
-
-"""
-
-import re
-import sys
-import time
-import datetime
-
-import cx_Oracle
-
-from pandalogger.PandaLogger import PandaLogger
-from config import panda_config
-
-import SiteSpec
-import CloudSpec
-
-from JobSpec import JobSpec
-from FileSpec import FileSpec
-
-# logger
-_logger = PandaLogger().getLogger('LogDBProxy')
-
-# proxy
-class LogDBProxy:
-
- # constructor
- def __init__(self):
- # connection object
- self.conn = None
- # cursor object
- self.cur = None
-
- # connect to DB
- def connect(self,dbhost=panda_config.logdbhost,dbpasswd=panda_config.logdbpasswd,
- dbuser=panda_config.logdbuser,dbname=panda_config.logdbname,reconnect=False):
- # keep parameters for reconnect
- if not reconnect:
- self.dbhost = dbhost
- self.dbpasswd = dbpasswd
- self.dbuser = dbuser
- self.dbname = dbname
- # connect
- try:
- self.conn = cx_Oracle.connect(dsn=self.dbhost,user=self.dbuser,
- password=self.dbpasswd,threaded=True)
- self.cur=self.conn.cursor()
- # set TZ
- self.cur.execute("ALTER SESSION SET TIME_ZONE='UTC'")
- # set DATE format
- self.cur.execute("ALTER SESSION SET NLS_DATE_FORMAT='YYYY/MM/DD HH24:MI:SS'")
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("connect : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
-
- # query an SQL
- def querySQL(self,sql,arraySize=1000):
- try:
- # begin transaction
- self.conn.begin()
- self.cur.arraysize = arraySize
- self.cur.execute(sql)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- return res
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("querySQL : %s %s" % (type,value))
- return None
-
-
- # get site data
- def getCurrentSiteData(self):
- _logger.debug("getCurrentSiteData")
- sql = "SELECT SITE,getJob,updateJob FROM SiteData WHERE FLAG='production' and HOURS=3"
- try:
- # set autocommit on
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- self.cur.execute(sql)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- ret = {}
- for item in res:
- ret[item[0]] = {'getJob':item[1],'updateJob':item[2]}
- _logger.debug(ret)
- return ret
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getCurrentSiteData : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # get list of site
- def getSiteList(self):
- _logger.debug("getSiteList start")
- try:
- # set autocommit on
- self.conn.begin()
- # select
- sql = "SELECT siteid,nickname FROM schedconfig WHERE siteid IS NOT NULL"
- self.cur.arraysize = 10000
- self.cur.execute(sql)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retMap = {}
- if res != None and len(res) != 0:
- for siteid,nickname in res:
- # skip invalid siteid
- if siteid in [None,'']:
- continue
- # append
- if not retMap.has_key(siteid):
- retMap[siteid] = []
- retMap[siteid].append(nickname)
- _logger.debug(retMap)
- _logger.debug("getSiteList done")
- return retMap
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getSiteList : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # get site info
- def getSiteInfo(self):
- _logger.debug("getSiteInfo start")
- try:
- # set autocommit on
- self.conn.begin()
- # select
- sql = "SELECT nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory,"
- sql+= "maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec,"
- sql+= "priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue "
- sql+= "FROM schedconfig WHERE siteid IS NOT NULL"
- self.cur.arraysize = 10000
- self.cur.execute(sql)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retList = {}
- if resList != None:
- # loop over all results
- for res in resList:
- # change None to ''
- resTmp = []
- for tmpItem in res:
- if tmpItem == None:
- tmpItem = ''
- resTmp.append(tmpItem)
- nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory,\
- maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec,\
- priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue \
- = resTmp
- # skip invalid siteid
- if siteid in [None,'']:
- continue
- # instantiate SiteSpec
- ret = SiteSpec.SiteSpec()
- ret.sitename = siteid
- ret.nickname = nickname
- ret.dq2url = dq2url
- ret.cloud = cloud
- ret.ddm = ddm.split(',')[0]
- ret.lfchost = lfchost
- ret.se = se
- ret.gatekeeper = gatekeeper
- ret.memory = memory
- ret.maxtime = maxtime
- ret.status = status
- ret.space = space
- ret.glexec = glexec
- ret.queue = queue
- ret.localqueue = localqueue
- # job recovery
- ret.retry = True
- if retry == 'FALSE':
- ret.retry = False
- # convert releases to list
- ret.releases = []
- for tmpRel in releases.split('|'):
- # remove white space
- tmpRel = tmpRel.strip()
- if tmpRel != '':
- ret.releases.append(tmpRel)
- # cmtconfig
- # add slc3 if the column is empty
- ret.cmtconfig = ['i686-slc3-gcc323-opt']
- if cmtconfig != '':
- ret.cmtconfig.append(cmtconfig)
- # map between token and DQ2 ID
- ret.setokens = {}
- tmpTokens = setokens.split(',')
- for idxToken,tmpddmID in enumerate(ddm.split(',')):
- if idxToken < len(tmpTokens):
- ret.setokens[tmpTokens[idxToken]] = tmpddmID
- # expand [] in se path
- match = re.search('([^\[]*)\[([^\]]+)\](.*)',seprodpath)
- if match != None and len(match.groups()) == 3:
- seprodpath = ''
- for tmpBody in match.group(2).split(','):
- seprodpath += '%s%s%s,' % (match.group(1),tmpBody,match.group(3))
- seprodpath = seprodpath[:-1]
- # map between token and se path
- ret.seprodpath = {}
- tmpTokens = setokens.split(',')
- for idxToken,tmpSePath in enumerate(seprodpath.split(',')):
- if idxToken < len(tmpTokens):
- ret.seprodpath[tmpTokens[idxToken]] = tmpSePath
- # VO related params
- ret.priorityoffset = priorityoffset
- ret.allowedgroups = allowedgroups
- ret.defaulttoken = defaulttoken
- # append
- retList[ret.nickname] = ret
- _logger.debug("getSiteInfo done")
- return retList
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getSiteInfo : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # get cloud list
- def getCloudList(self):
- _logger.debug("getCloudList start")
- try:
- # set autocommit on
- self.conn.begin()
- # select
- sql = "SELECT name,tier1,tier1SE,relocation,weight,server,status,transtimelo,"
- sql += "transtimehi,waittime,validation,mcshare,countries,fasttrack "
- sql+= "FROM cloudconfig"
- self.cur.arraysize = 10000
- self.cur.execute(sql)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- ret = {}
- if resList != None and len(resList) != 0:
- for res in resList:
- # change None to ''
- resTmp = []
- for tmpItem in res:
- if tmpItem == None:
- tmpItem = ''
- resTmp.append(tmpItem)
- name,tier1,tier1SE,relocation,weight,server,status,transtimelo,transtimehi,\
- waittime,validation,mcshare,countries,fasttrack = resTmp
- # instantiate CloudSpec
- tmpC = CloudSpec.CloudSpec()
- tmpC.name = name
- tmpC.tier1 = tier1
- tmpC.tier1SE = re.sub(' ','',tier1SE).split(',')
- tmpC.relocation = relocation
- tmpC.weight = weight
- tmpC.server = server
- tmpC.status = status
- tmpC.transtimelo = transtimelo
- tmpC.transtimehi = transtimehi
- tmpC.waittime = waittime
- tmpC.validation = validation
- tmpC.mcshare = mcshare
- tmpC.countries = countries
- tmpC.fasttrack = fasttrack
- # append
- ret[name] = tmpC
- _logger.debug("getCloudList done")
- return ret
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getCloudList : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # extract name from DN
- def cleanUserID(self, id):
- try:
- up = re.compile('/(DC|O|OU|C|L)=[^\/]+')
- username = up.sub('', id)
- up2 = re.compile('/CN=[0-9]+')
- username = up2.sub('', username)
- up3 = re.compile(' [0-9]+')
- username = up3.sub('', username)
- up4 = re.compile('_[0-9]+')
- username = up4.sub('', username)
- username = username.replace('/CN=proxy','')
- username = username.replace('/CN=limited proxy','')
- username = username.replace('limited proxy','')
- pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)')
- mat = pat.match(username)
- if mat:
- username = mat.group(2)
- else:
- username = username.replace('/CN=','')
- if username.lower().find('/email') > 0:
- username = username[:username.lower().find('/email')]
- pat = re.compile('.*(limited.*proxy).*')
- mat = pat.match(username)
- if mat:
- username = mat.group(1)
- username = username.replace('(','')
- username = username.replace(')','')
- return username
- except:
- return id
-
-
- # check quota
- def checkQuota(self,dn):
- _logger.debug("checkQuota %s" % dn)
- try:
- # set autocommit on
- self.conn.begin()
- # select
- name = self.cleanUserID(dn)
- sql = "SELECT cpua1,cpua7,cpua30,quotaa1,quotaa7,quotaa30 FROM users WHERE name = :name"
- varMap = {}
- varMap[':name'] = name
- self.cur.arraysize = 10
- self.cur.execute(sql,varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- weight = 0.0
- if res != None and len(res) != 0:
- item = res[0]
- # cpu and quota
- cpu1 = item[0]
- cpu7 = item[1]
- cpu30 = item[2]
- quota1 = item[3] * 3600
- quota7 = item[4] * 3600
- quota30 = item[5] * 3600
- # CPU usage
- if cpu1 == None:
- cpu1 = 0.0
- # weight
- weight = float(cpu1) / float(quota1)
- # not exceeded the limit
- if weight < 1.0:
- weight = 0.0
- _logger.debug("checkQuota %s Weight:%s Quota:%s CPU:%s" % (dn,weight,quota1,cpu1))
- else:
- _logger.debug("checkQuota cannot find %s" % dn)
- return weight
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("checkQuota : %s %s" % (type,value))
- # roll back
- self._rollback()
- return 0.0
-
-
- # get serialize JobID and status
- def getUserParameter(self,dn,jobID):
- _logger.debug("getUserParameter %s %s" % (dn,jobID))
- try:
- # set autocommit on
- self.conn.begin()
- # select
- name = self.cleanUserID(dn)
- sql = "SELECT jobid,status FROM users WHERE name = :name"
- varMap = {}
- varMap[':name'] = name
- self.cur.execute(sql,varMap)
- self.cur.arraysize = 10
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- retJobID = jobID
- retStatus = True
- if res != None and len(res) != 0:
- item = res[0]
- # JobID in DB
- dbJobID = item[0]
- # check status
- if item[1] in ['disabled']:
- retStatus = False
- # use larger JobID
- if dbJobID >= int(retJobID):
- retJobID = dbJobID+1
- # update DB
- sql = "UPDATE users SET jobid=:jobid WHERE name=:name"
- varMap = {':jobid':retJobID,':name':name}
- self.cur.execute(sql,varMap)
- _logger.debug("getUserParameter set JobID=%s for %s" % (retJobID,dn))
- return retJobID,retStatus
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getUserParameter : %s %s" % (type,value))
- # roll back
- self._rollback()
- return jobID,True
-
-
- # get email address for a user
- def getEmailAddr(self,name):
- _logger.debug("get email for %s" % name)
- try:
- # set autocommit on
- self.conn.begin()
- # select
- sql = "SELECT email FROM users WHERE name=:name"
- varMap = {}
- varMap[':name'] = name
- self.cur.execute(sql,varMap)
- self.cur.arraysize = 10
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if res != None and len(res) != 0:
- return res[0][0]
- # return empty string
- return ""
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getEmailAddr : %s %s" % (type,value))
- # roll back
- self._rollback()
- return ""
-
-
- # register proxy key
- def registerProxyKey(self,params):
- _logger.debug("register ProxyKey %s" % str(params))
- try:
- # set autocommit on
- self.conn.begin()
- # construct SQL
- sql0 = 'INSERT INTO proxykey ('
- sql1 = 'VALUES ('
- vals = {}
- for key,val in params.iteritems():
- sql0 += '%s,' % key
- sql1 += ':%s,' % key
- vals[':%s' % key] = val
- sql0 = sql0[:-1]
- sql1 = sql1[:-1]
- sql = sql0 + ') ' + sql1 + ') '
- # insert
- self.cur.execute(sql,vals)
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return True
- return True
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("registerProxyKey : %s %s" % (type,value))
- # roll back
- self._rollback()
- return False
-
-
- # get proxy key
- def getProxyKey(self,dn):
- _logger.debug("get ProxyKey %s" % dn)
- try:
- # set autocommit on
- self.conn.begin()
- # construct SQL
- sql = 'SELECT credname,expires,origin,myproxy FROM proxykey WHERE dn=:dn ORDER BY expires DESC'
- varMap = {}
- varMap[':dn'] = dn
- # select
- self.cur.execute(sql,varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # return
- retMap = {}
- if res != None and len(res) != 0:
- credname,expires,origin,myproxy = res[0]
- retMap['credname'] = credname
- retMap['expires'] = expires
- retMap['origin'] = origin
- retMap['myproxy'] = myproxy
- _logger.debug(retMap)
- return retMap
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getProxyKey : %s %s" % (type,value))
- # roll back
- self._rollback()
- return {}
-
-
- # get list of archived tables
- def getArchiveTables(self):
- tables = []
- cdate = datetime.datetime.utcnow()
- for iCycle in range(2): # 2 = (1 month + 2 just in case)/2
- if cdate.month==1:
- cdate = cdate.replace(year = (cdate.year-1))
- cdate = cdate.replace(month = 12, day = 1)
- else:
- cdate = cdate.replace(month = (cdate.month/2)*2, day = 1)
- tableName = "jobsArchived_%s%s" % (cdate.strftime('%b'),cdate.year)
- if not tableName in tables:
- tables.append(tableName)
- # one older table
- if cdate.month > 2:
- cdate = cdate.replace(month = (cdate.month-2))
- else:
- cdate = cdate.replace(year = (cdate.year-1), month = 12)
- # return
- return tables
-
-
- # get JobIDs in a time range
- def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs):
- comment = ' /* LogDBProxy.getJobIDsInTimeRange */'
- _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))
- try:
- # get list of archived tables
- tables = self.getArchiveTables()
- # select
- for table in tables:
- # make sql
- sql = "SELECT jobDefinitionID FROM %s " % table
- sql += "WHERE prodUserID=:prodUserID AND modificationTime>:modificationTime "
- sql += "AND prodSourceLabel='user' GROUP BY jobDefinitionID"
- varMap = {}
- varMap[':prodUserID'] = dn
- varMap[':modificationTime'] = timeRange
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 10000
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID, in resList:
- if not tmpID in retJobIDs:
- retJobIDs.append(tmpID)
- _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs))
- return retJobIDs
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobIDsInTimeRange : %s %s" % (type,value))
- # return empty list
- return retJobIDs
-
-
- # get PandaIDs for a JobID
- def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs):
- comment = ' /* LogProxy.getPandIDsWithJobID */'
- _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID))
- try:
- # get list of archived tables
- tables = self.getArchiveTables()
- # select
- for table in tables:
- # skip if all jobs have already been retrieved
- if nJobs > 0 and len(idStatus) >= nJobs:
- continue
- # make sql
- sql = "SELECT PandaID,jobStatus,commandToPilot FROM %s " % table
- sql += "WHERE prodUserID=:prodUserID AND jobDefinitionID=:jobDefinitionID "
- sql += "AND prodSourceLabel in ('user','panda') "
- varMap = {}
- varMap[':prodUserID'] = dn
- varMap[':jobDefinitionID'] = jobID
- # start transaction
- self.conn.begin()
- # select
- self.cur.arraysize = 5000
- # select
- _logger.debug(sql+comment+str(varMap))
- self.cur.execute(sql+comment, varMap)
- resList = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # append
- for tmpID,tmpStatus,tmpCommand in resList:
- if not idStatus.has_key(tmpID):
- idStatus[tmpID] = (tmpStatus,tmpCommand)
- _logger.debug("getPandIDsWithJobID : %s" % str(idStatus))
- return idStatus
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("getPandIDsWithJobID : %s %s" % (type,value))
- # return empty list
- return {}
-
-
- # peek at job
- def peekJob(self,pandaID):
- comment = ' /* LogDBProxy.peekJob */'
- _logger.debug("peekJob : %s" % pandaID)
- # return None for NULL PandaID
- if pandaID in ['NULL','','None',None]:
- return None
- sql1_0 = "SELECT %s FROM %s "
- sql1_1 = "WHERE PandaID=:PandaID"
- # select
- varMap = {}
- varMap[':PandaID'] = pandaID
- try:
- # get list of archived tables
- tables = self.getArchiveTables()
- # select
- for table in tables:
- # start transaction
- self.conn.begin()
- # select
- sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1
- self.cur.arraysize = 10
- self.cur.execute(sql+comment, varMap)
- res = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- if len(res) != 0:
- # Job
- job = JobSpec()
- job.pack(res[0])
- # Files
- # start transaction
- self.conn.begin()
- # select
- fileTableName = re.sub('jobsArchived','filesTable',table)
- sqlFile = "SELECT %s " % FileSpec.columnNames()
- sqlFile+= "FROM %s " % fileTableName
- sqlFile+= "WHERE PandaID=:PandaID"
- self.cur.arraysize = 10000
- self.cur.execute(sqlFile+comment, varMap)
- resFs = self.cur.fetchall()
- # commit
- if not self._commit():
- raise RuntimeError, 'Commit error'
- # set files
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- job.addFile(file)
- return job
- _logger.debug("peekJob() : PandaID %s not found" % pandaID)
- return None
- except:
- # roll back
- self._rollback()
- type, value, traceBack = sys.exc_info()
- _logger.error("peekJob : %s %s" % (type,value))
- # return None
- return None
-
-
- # wake up connection
- def wakeUp(self):
- for iTry in range(5):
- try:
- # check if the connection is working
- self.cur.execute("select user from dual")
- return
- except:
- type, value, traceBack = sys.exc_info()
- _logger.debug("wakeUp %d : %s %s" % (iTry,type,value))
- # wait for reconnection
- time.sleep(1)
- self.connect(reconnect=True)
-
-
- # close
- def close(self):
- try:
- self.cur.close()
- self.conn.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("close : %s %s" % (type,value))
-
-
- # commit
- def _commit(self):
- try:
- self.conn.commit()
- return True
- except:
- _logger.error("commit error")
- return False
-
-
- # rollback
- def _rollback(self):
- try:
- self.conn.rollback()
- return True
- except:
- _logger.error("rollback error")
- return False
-
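
The DN handling in LogDBProxy.cleanUserID above is easiest to follow on a concrete value. A trimmed-down standalone rendition ('limited proxy' handling omitted); the DN is fictitious:

import re

def clean_user_id(dn):
    name = re.sub('/(DC|O|OU|C|L)=[^/]+', '', dn)     # drop DC/O/OU/C/L components
    name = re.sub('/CN=[0-9]+', '', name)             # drop numeric CNs
    name = re.sub(' [0-9]+', '', name)
    name = re.sub('_[0-9]+', '', name)
    name = name.replace('/CN=proxy', '')
    mat = re.match('.*/CN=([^/]+)/CN=([^/]+)', name)
    name = mat.group(2) if mat else name.replace('/CN=', '')
    if '/email' in name.lower():
        name = name[:name.lower().find('/email')]
    return name

print(clean_user_id('/DC=ch/DC=cern/OU=Users/CN=jdoe/CN=123456/CN=John Doe'))   # John Doe
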
diff --git a/current/pandaserver/taskbuffer/PrioUtil.py b/current/pandaserver/taskbuffer/PrioUtil.py
deleted file mode 100644
index ac8d99d5f..000000000
--- a/current/pandaserver/taskbuffer/PrioUtil.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# calculate priority for user jobs
-def calculatePriority(priorityOffset,serNum,weight):
- priority = 1000 + priorityOffset - (serNum / 5) - int(100 * weight)
- return priority
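
A quick worked example of the formula above, assuming the module is still importable under its original name: with no offset, 250 jobs already counted for the user and a quota weight of 0.5, the priority is 1000 - 250/5 - int(100*0.5) = 900.

from PrioUtil import calculatePriority

print(calculatePriority(0, 250, 0.5))   # 900 (serNum/5 is integer division under Python 2)
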
diff --git a/current/pandaserver/taskbuffer/ProcessGroups.py b/current/pandaserver/taskbuffer/ProcessGroups.py
deleted file mode 100644
index 1318ca0d1..000000000
--- a/current/pandaserver/taskbuffer/ProcessGroups.py
+++ /dev/null
@@ -1,101 +0,0 @@
-processGroups = [('others', []),
- ('evgensimul', ['evgen','simul']),
- ('reprocessing', ['reprocessing']),
- ('test', ['prod_test','rc_test','validation']),
- ('mcore', ['mcore']),
- ('group', ['group']),
- ]
-
-# source labels used for panda internal purpose
-internalSourceLabels = ['ddm']
-
-# maximum number of debug jobs per user
-maxDebugJobs = 3
-
-# maximum number of debug jobs for prod role
-maxDebugProdJobs = 30
-
-# extension level for GP
-extensionLevel_1 = 1
-
-
-# get corresponding group
-def getProcessGroup(valGroup):
- tmpGroup = None
- for tmpKey,tmpList in processGroups:
- # set default
- if tmpGroup == None:
- tmpGroup = tmpKey
- continue
- if valGroup in tmpList:
- tmpGroup = tmpKey
- break
- # return
- return tmpGroup
-
-
-# convert cloud and processingType for extended PG
-def converCPTforEPG(cloud,processingType,coreCount,workingGroup=None):
- if coreCount in [0,1,None]:
- # use group queue for GP jobs
- if workingGroup != None and workingGroup.startswith('GP_'):
- return cloud,'group'
- return cloud,processingType
- else:
- # use the MCORE queue for multi-core jobs in all clouds
- return "ALL","mcore"
-
-
-# count the number of jobs per group
-def countJobsPerGroup(valMap):
- ret = {}
- # loop over all clouds
- for cloud,cloudVal in valMap.iteritems():
- # add cloud
- if not ret.has_key(cloud):
- ret[cloud] = {}
- # loop over all sites
- for site,siteVal in cloudVal.iteritems():
- # add site
- if not ret[cloud].has_key(site):
- ret[cloud][site] = {}
- # loop over all types
- for pType,typeVal in siteVal.iteritems():
- # get process group
- tmpGroup = getProcessGroup(pType)
- # add group
- if not ret[cloud][site].has_key(tmpGroup):
- ret[cloud][site][tmpGroup] = {}
- # loop over all status
- for jobStatus,statVal in typeVal.iteritems():
- if not ret[cloud][site][tmpGroup].has_key(jobStatus):
- ret[cloud][site][tmpGroup][jobStatus] = 0
- # add
- ret[cloud][site][tmpGroup][jobStatus] += statVal
- # return
- return ret
-
-
-# count the number of jobs per group for analysis
-def countJobsPerGroupForAnal(valMap):
- ret = {}
- # loop over all sites
- for site,siteVal in valMap.iteritems():
- # add site
- if not ret.has_key(site):
- ret[site] = {}
- # loop over all types
- for pType,typeVal in siteVal.iteritems():
- # get process group
- tmpGroup = getProcessGroup(pType)
- # add group
- if not ret[site].has_key(tmpGroup):
- ret[site][tmpGroup] = {}
- # loop over all status
- for jobStatus,statVal in typeVal.iteritems():
- if not ret[site][tmpGroup].has_key(jobStatus):
- ret[site][tmpGroup][jobStatus] = 0
- # add
- ret[site][tmpGroup][jobStatus] += statVal
- # return
- return ret
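
Usage sketch for the helpers above, assuming the module is importable as in the server code; the processingType values are samples:

import ProcessGroups

print(ProcessGroups.getProcessGroup('simul'))            # 'evgensimul'
print(ProcessGroups.getProcessGroup('reprocessing'))     # 'reprocessing'
print(ProcessGroups.getProcessGroup('pile'))             # unknown types fall back to 'others'
print(ProcessGroups.converCPTforEPG('US', 'simul', 8))   # ('ALL', 'mcore') for multi-core jobs
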
diff --git a/current/pandaserver/taskbuffer/SQLDumper.py b/current/pandaserver/taskbuffer/SQLDumper.py
deleted file mode 100644
index 16240d1be..000000000
--- a/current/pandaserver/taskbuffer/SQLDumper.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('SQLDumper')
-
-class SQLDumper(object):
- def __init__(self,cur):
- self.cursor = cur
- def __iter__(self):
- return self
- def next(self):
- return self.cursor.next()
- def my_execute(self,sql,var={}):
- _logger.debug('SQL=%s var=%s' % (sql,str(var)))
- return self.cursor.execute(sql,var)
- def __getattribute__(self,name):
- if name == 'execute':
- return object.__getattribute__(self,'my_execute')
- elif name in ['cursor','__iter__','next']:
- return object.__getattribute__(self,name)
- else:
- return getattr(self.cursor,name)
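
SQLDumper is a thin wrapper that logs every execute() before delegating to the real cursor, while all other attribute access falls through untouched. A usage sketch with a stand-in cursor so it runs without a database (it still assumes the pandaserver environment so that pandalogger is configured):

from SQLDumper import SQLDumper

class FakeCursor(object):
    # minimal stand-in for a cx_Oracle cursor
    def execute(self, sql, var={}):
        return None
    def fetchall(self):
        return []

cur = SQLDumper(FakeCursor())
cur.execute("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE rownum<=:n", {':n': 5})
print(cur.fetchall())   # []  (the SQL and bind variables were logged by my_execute first)
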
diff --git a/current/pandaserver/taskbuffer/SiteSpec.py b/current/pandaserver/taskbuffer/SiteSpec.py
deleted file mode 100644
index e261e08d0..000000000
--- a/current/pandaserver/taskbuffer/SiteSpec.py
+++ /dev/null
@@ -1,31 +0,0 @@
-"""
-site specification
-
-"""
-
-class SiteSpec(object):
- # attributes
- _attributes = ('sitename','nickname','dq2url','cloud','ddm','lfchost','se','type','gatekeeper',
- 'releases','memory','maxtime','status','space','retry','cmtconfig','setokens',
- 'seprodpath','glexec','priorityoffset','allowedgroups','defaulttoken','queue',
- 'localqueue','validatedreleases','accesscontrol','copysetup','maxinputsize',
- 'cachedse','allowdirectaccess','comment','cloudlist','statusmodtime','lfcregister',
- 'countryGroup','availableCPU','pledgedCPU','coreCount','reliabilityLevel',
- 'iscvmfs','transferringlimit')
-
- # constructor
- def __init__(self):
- # install attributes
- for attr in self._attributes:
- setattr(self,attr,None)
-
- # serialize
- def __str__(self):
- str = ''
- for attr in self._attributes:
- str += '%s:%s ' % (attr,getattr(self,attr))
- return str
-
-
-
-
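
Usage sketch for SiteSpec, assuming the module is importable; the site values are made up. The constructor installs every attribute as None and __str__ serializes them as 'name:value ' pairs:

from SiteSpec import SiteSpec

site = SiteSpec()
site.sitename = 'ANALY_EXAMPLE'   # fictitious site
site.cloud = 'US'
site.status = 'online'
print(site)
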
diff --git a/current/pandaserver/taskbuffer/TaskBuffer.py b/current/pandaserver/taskbuffer/TaskBuffer.py
deleted file mode 100755
index 9c03a1b35..000000000
--- a/current/pandaserver/taskbuffer/TaskBuffer.py
+++ /dev/null
@@ -1,2294 +0,0 @@
-import re
-import sys
-import types
-import shlex
-import datetime
-import ProcessGroups
-from threading import Lock
-from DBProxyPool import DBProxyPool
-from brokerage.SiteMapper import SiteMapper
-from dataservice.Setupper import Setupper
-from dataservice.Closer import Closer
-from dataservice.TaLauncher import TaLauncher
-from dataservice.ProcessLimiter import ProcessLimiter
-
-# logger
-from pandalogger.PandaLogger import PandaLogger
-_logger = PandaLogger().getLogger('TaskBuffer')
-
-
-class TaskBuffer:
- """
- task queue
-
- """
-
- # constructor
- def __init__(self):
- self.proxyPool = None
- self.lock = Lock()
- self.processLimiter = None
-
-
- # initialize
- def init(self,dbname,dbpass,nDBConnection=10,useTimeout=False):
- # lock
- self.lock.acquire()
- # create Proxy Pool
- if self.proxyPool == None:
- self.proxyPool = DBProxyPool(dbname,dbpass,nDBConnection,useTimeout)
- # create process limiter
- if self.processLimiter == None:
- self.processLimiter = ProcessLimiter()
- # release
- self.lock.release()
-
-
- # check production role
- def checkProdRole(self,fqans):
- for fqan in fqans:
- # check production role
- match = re.search('/([^/]+)/Role=production',fqan)
- if match != None:
- return True,match.group(1)
- return False,None
-
-
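
The FQAN parsing in checkProdRole above, as a standalone sketch with made-up FQAN strings:

import re

fqans = ['/atlas/usatlas/Role=NULL/Capability=NULL',
         '/atlas/Role=production/Capability=NULL']

prod_role, group = False, None
for fqan in fqans:
    match = re.search('/([^/]+)/Role=production', fqan)
    if match is not None:
        prod_role, group = True, match.group(1)
        break
print('%s %s' % (prod_role, group))   # True atlas
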
- # get priority parameters for user
- def getPrioParameters(self,jobs,user,fqans,userDefinedWG,validWorkingGroup):
- withProdRole = False
- workingGroup = None
- priorityOffset = 0
- serNum = 0
- weight = None
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # check production role
- withProdRole,workingGroup = self.checkProdRole(fqans)
- if withProdRole:
- # check dataset name
- for tmpFile in jobs[-1].Files:
- if tmpFile.type in ['output','log'] and not tmpFile.lfn.startswith('group'):
- # reset
- withProdRole,workingGroup = False,None
- break
- # set high priority for production role
- """
- if withProdRole:
- serNum = 0
- weight = 0.0
- priorityOffset = 2000
- """
- # reset nJob/weight for HC
- if jobs[0].processingType in ['hammercloud','gangarobot'] \
- or jobs[0].processingType.startswith('gangarobot-'):
- serNum = 0
- weight = 0.0
- if jobs[0].processingType in ['gangarobot','gangarobot-pft']:
- priorityOffset = 3000
- # check quota
- if weight == None:
- weight = proxy.checkQuota(user)
- # get nJob
- if userDefinedWG and validWorkingGroup:
- serNum = proxy.getNumberJobsUser(user,workingGroup=jobs[0].workingGroup)
- else:
- serNum = proxy.getNumberJobsUser(user,workingGroup=None)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return withProdRole,workingGroup,priorityOffset,serNum,weight
-
-
- # store Jobs into DB
- def storeJobs(self,jobs,user,joinThr=False,forkSetupper=False,fqans=[],hostname='',resetLocInSetupper=False,
- checkSpecialHandling=True,toPending=False):
- try:
- _logger.debug("storeJobs : start for %s nJobs=%s" % (user,len(jobs)))
- # check quota for priority calculation
- weight = 0.0
- userJobID = -1
- userJobsetID = -1
- userStatus = True
- priorityOffset = 0
- userVO = 'atlas'
- userCountry = None
- useExpress = False
- nExpressJobs = 0
- useDebugMode = False
- # check if the user is banned, except for internally generated jobs
- if len(jobs) > 0 and not jobs[0].prodSourceLabel in ProcessGroups.internalSourceLabels:
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # check user status
- tmpStatus = proxy.checkBanUser(user,jobs[0].prodSourceLabel)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return if DN is blocked
- if not tmpStatus:
- _logger.debug("storeJobs : end for %s DN is blocked 1" % user)
- return []
- # set parameters for user jobs
- if len(jobs) > 0 and (jobs[0].prodSourceLabel in ['user','panda','ptest','rc_test','ssc']) \
- and (not jobs[0].processingType in ['merge','unmerge']):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # get JobID and status
- userJobID,userJobsetID,userStatus = proxy.getUserParameter(user,jobs[0].jobDefinitionID,jobs[0].jobsetID)
- # get site access
- userSiteAccess = proxy.checkSiteAccess(jobs[0].computingSite,user)
- # check quota for express jobs
- if 'express' in jobs[0].specialHandling:
- expressQuota = proxy.getExpressJobs(user)
- if expressQuota != None and expressQuota['status'] and expressQuota['quota'] > 0:
- nExpressJobs = expressQuota['quota']
- if nExpressJobs > 0:
- useExpress = True
- # debug mode
- if 'debug' in jobs[0].specialHandling:
- debugJobList = proxy.getActiveDebugJobs(user)
- if len(debugJobList) < ProcessGroups.maxDebugJobs:
- useDebugMode = True
- # release proxy
- self.proxyPool.putProxy(proxy)
- # get site spec
- siteMapper = SiteMapper(self)
- tmpSiteSpec = siteMapper.getSite(jobs[0].computingSite)
- # check allowed groups
- if userStatus and hasattr(tmpSiteSpec,'allowedgroups') and (not tmpSiteSpec.allowedgroups in ['',None]):
- # set status to False when allowedgroups is defined
- userStatus = False
- # loop over all groups
- for tmpGroup in tmpSiteSpec.allowedgroups.split(','):
- if tmpGroup == '':
- continue
- # loop over all FQANs
- for tmpFQAN in fqans:
- if re.search('^%s' % tmpGroup,tmpFQAN) != None:
- userStatus = True
- break
- # escape
- if userStatus:
- break
- # get priority offset
- if hasattr(tmpSiteSpec,'priorityoffset') and (not tmpSiteSpec.priorityoffset in ['',None]):
- # loop over all groups
- for tmpGP in tmpSiteSpec.priorityoffset.split(','):
- if tmpGP == '':
- continue
- # get group and offset
- tmpGroup = tmpGP.split(':')[0]
- try:
- tmpOffset = int(tmpGP.split(':')[-1])
- except:
- tmpOffset = 0
- # loop over all FQANs
- for tmpFQAN in fqans:
- _logger.debug(tmpFQAN)
- if re.search('^%s/' % tmpGroup,tmpFQAN) != None or \
- re.search('%s$' % tmpGroup,tmpFQAN) != None:
- # use the largest offset
- if tmpOffset > priorityOffset:
- priorityOffset = tmpOffset
- break
- # check site access
- if hasattr(tmpSiteSpec,'accesscontrol') and tmpSiteSpec.accesscontrol == 'grouplist':
- if userSiteAccess == {} or userSiteAccess['status'] != 'approved':
- # user is not allowed
- userStatus = False
- # set priority offset
- if userStatus:
- if userSiteAccess.has_key('poffset') and userSiteAccess['poffset'] > priorityOffset:
- priorityOffset = userSiteAccess['poffset']
- # extract country group
- for tmpFQAN in fqans:
- match = re.search('^/atlas/([^/]+)/',tmpFQAN)
- if match != None:
- tmpCountry = match.group(1)
- # use country code or usatlas
- if len(tmpCountry) == 2:
- userCountry = tmpCountry
- break
- # usatlas
- if tmpCountry in ['usatlas']:
- userCountry = 'us'
- break
- # return if DN is blocked
- if not userStatus:
- _logger.debug("storeJobs : end for %s DN is blocked 2" % user)
- return []
- # extract VO
- for tmpFQAN in fqans:
- match = re.search('^/([^/]+)/',tmpFQAN)
- if match != None:
- userVO = match.group(1)
- break
- # get number of jobs currently in PandaDB
- serNum = 0
- userDefinedWG = False
- validWorkingGroup = False
- usingBuild = False
- withProdRole = False
- workingGroup = None
- if len(jobs) > 0 and (jobs[0].prodSourceLabel in ['user','panda']) \
- and (not jobs[0].processingType in ['merge','unmerge']):
- # check workingGroup
- if not jobs[0].workingGroup in ['',None,'NULL']:
- userDefinedWG = True
- if userSiteAccess != {}:
- if userSiteAccess['status'] == 'approved' and jobs[0].workingGroup in userSiteAccess['workingGroups']:
- # valid workingGroup
- validWorkingGroup = True
- # using build for analysis
- if jobs[0].prodSourceLabel == 'panda':
- usingBuild = True
- # get priority parameters for user
- withProdRole,workingGroup,priorityOffset,serNum,weight = self.getPrioParameters(jobs,user,fqans,userDefinedWG,
- validWorkingGroup)
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # get group job serial number
- groupJobSerialNum = 0
- if len(jobs) > 0 and (jobs[0].prodSourceLabel in ['user','panda']) \
- and (not jobs[0].processingType in ['merge','unmerge']):
- for tmpFile in jobs[-1].Files:
- if tmpFile.type in ['output','log'] and '$GROUPJOBSN' in tmpFile.lfn:
- tmpSnRet = proxy.getSerialNumberForGroupJob(user)
- if tmpSnRet['status']:
- groupJobSerialNum = tmpSnRet['sn']
- break
- # loop over all jobs
- ret =[]
- newJobs=[]
- usePandaDDM = False
- firstLiveLog = True
- nRunJob = 0
- for job in jobs:
- # set JobID. keep original JobID when retry
- if userJobID != -1 and job.prodSourceLabel in ['user','panda'] \
- and (job.attemptNr in [0,'0','NULL'] or (not job.jobExecutionID in [0,'0','NULL'])) \
- and (not jobs[0].processingType in ['merge','unmerge']):
- job.jobDefinitionID = userJobID
- # set jobsetID
- if job.prodSourceLabel in ['user','panda','ptest','rc_test']:
- job.jobsetID = userJobsetID
- # set specialHandling
- if job.prodSourceLabel in ['user','panda']:
- if checkSpecialHandling:
- specialHandling = ''
- # debug mode
- if useDebugMode and nRunJob == 0 and job.prodSourceLabel == 'user':
- specialHandling += 'debug,'
- # express mode
- if useExpress and (nRunJob < nExpressJobs or job.prodSourceLabel == 'panda'):
- specialHandling += 'express,'
- # reset specialHandling
- specialHandling = specialHandling[:-1]
- job.specialHandling = specialHandling
- if job.prodSourceLabel != 'panda':
- nRunJob += 1
- # set relocation flag
- if job.computingSite != 'NULL':
- job.relocationFlag = 1
- # protection against empty jobParameters
- if job.jobParameters in ['',None,'NULL']:
- job.jobParameters = ' '
- # set country group and nJobs (=taskID)
- if job.prodSourceLabel in ['user','panda']:
- job.countryGroup = userCountry
- # set workingGroup
- if not validWorkingGroup:
- if withProdRole:
- # set country group if submitted with production role
- job.workingGroup = workingGroup
- else:
- if userDefinedWG:
- # reset invalid working group
- job.workingGroup = None
- # set nJobs (=taskID)
- if usingBuild:
- tmpNumBuild = 1
- tmpNumRun = len(jobs) - 1
- else:
- tmpNumBuild = 0
- tmpNumRun = len(jobs)
- # encode
- job.taskID = tmpNumBuild + (tmpNumRun << 1)
- # change TRF URL just in case
- if job.transformation.startswith('http://www.usatlas.bnl.gov/svn/panda/pathena/trf'):
- job.transformation = re.sub('^http://www.usatlas.bnl.gov/svn/panda/pathena/trf/',
- 'http://pandaserver.cern.ch:25080/trf/user/',
- job.transformation)
- # set hostname
- if hostname != '':
- job.creationHost = hostname
- # insert job to DB
- if not proxy.insertNewJob(job,user,serNum,weight,priorityOffset,userVO,groupJobSerialNum,
- toPending):
- # reset if failed
- job.PandaID = None
- else:
- # live log
- if job.prodSourceLabel in ['user','panda']:
- if ' --liveLog ' in job.jobParameters:
- # enable liveLog only for the first one
- if firstLiveLog:
- # set file name
- repPatt = ' --liveLog stdout.%s ' % job.PandaID
- else:
- # remove the option
- repPatt = ' '
- job.jobParameters = re.sub(' --liveLog ',repPatt,job.jobParameters)
- firstLiveLog = False
- # append
- newJobs.append(job)
- if job.prodSourceLabel in ['user','panda','ptest','rc_test']:
- ret.append((job.PandaID,job.jobDefinitionID,{'jobsetID':job.jobsetID}))
- else:
- ret.append((job.PandaID,job.jobDefinitionID,job.jobName))
- serNum += 1
- # release DB proxy
- self.proxyPool.putProxy(proxy)
- # set up dataset
- if not toPending:
- if joinThr:
- thr = Setupper(self,newJobs,pandaDDM=usePandaDDM,forkRun=forkSetupper,resetLocation=resetLocInSetupper)
- thr.start()
- thr.join()
- else:
- # cannot use 'thr =' because it may trigger garbage collector
- Setupper(self,newJobs,pandaDDM=usePandaDDM,forkRun=forkSetupper,resetLocation=resetLocInSetupper).start()
- # return jobIDs
- _logger.debug("storeJobs : end for %s succeeded" % user)
- return ret
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("storeJobs : %s %s" % (errType,errValue))
- return "ERROR: ServerError with storeJobs"
-
-
- # lock jobs for reassign
- def lockJobsForReassign(self,tableName,timeLimit,statList,labels,processTypes,sites,clouds):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # exec
- res = proxy.lockJobsForReassign(tableName,timeLimit,statList,labels,processTypes,sites,clouds)
- # release DB proxy
- self.proxyPool.putProxy(proxy)
- # return
- return res
-
-
- # lock jobs for finisher
- def lockJobsForFinisher(self,timeNow,rownum,highPrio):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # exec
- res = proxy.lockJobsForFinisher(timeNow,rownum,highPrio)
- # release DB proxy
- self.proxyPool.putProxy(proxy)
- # return
- return res
-
-
- # get number of activated/defined jobs with output datasets
- def getNumWaitingJobsWithOutDS(self,outputDSs):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # exec
- res = proxy.getNumWaitingJobsWithOutDS(outputDSs)
- # release DB proxy
- self.proxyPool.putProxy(proxy)
- # return
- return res
-
-
- # resubmit jobs
- def resubmitJobs(self,jobIDs):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- jobs=[]
- # get jobs
- for jobID in jobIDs:
- res = proxy.peekJob(jobID,True,False,False,False)
- if res:
- jobs.append(res)
- # release DB proxy
- self.proxyPool.putProxy(proxy)
- # set up dataset
- if len(jobs) > 0:
- Setupper(self,jobs).start()
- # return jobIDs
- return True
-
-
- # update overall job information
- def updateJobs(self,jobs,inJobsDefined):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # loop over all jobs
- returns = []
- ddmIDs = []
- ddmAttempt = 0
- newMover = None
- for job in jobs:
- # update DB
- tmpddmIDs = []
- if job.jobStatus == 'failed' and job.prodSourceLabel == 'user' and not inJobsDefined:
- # keep failed analy jobs in Active4
- ret = proxy.updateJob(job,inJobsDefined)
- elif job.jobStatus in ['finished','failed','cancelled']:
- ret,tmpddmIDs,ddmAttempt,newMover = proxy.archiveJob(job,inJobsDefined)
- else:
- ret = proxy.updateJob(job,inJobsDefined)
- returns.append(ret)
- # collect IDs for reassign
- if ret:
- ddmIDs += tmpddmIDs
- # release proxy
- self.proxyPool.putProxy(proxy)
- # retry mover
- if newMover != None:
- self.storeJobs([newMover],None,joinThr=True)
- # reassign jobs when ddm failed
- if ddmIDs != []:
- self.reassignJobs(ddmIDs,ddmAttempt,joinThr=True)
- # return
- return returns
-
-
- # update job jobStatus only
- def updateJobStatus(self,jobID,jobStatus,param,updateStateChange=False,attemptNr=None):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # update DB and buffer
- if re.match('^finished$',jobStatus,re.I) or re.match('^failed$',jobStatus,re.I):
- ret = proxy.archiveJobLite(jobID,jobStatus,param)
- else:
- ret = proxy.updateJobStatus(jobID,jobStatus,param,updateStateChange,attemptNr)
- # release proxy
- self.proxyPool.putProxy(proxy)
- return ret
-
-
- # finalize pending analysis jobs
- def finalizePendingJobs(self,prodUserName,jobDefinitionID):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # update DB
- ret = proxy.finalizePendingJobs(prodUserName,jobDefinitionID)
- # release proxy
- self.proxyPool.putProxy(proxy)
- return ret
-
-
- # retry job
- def retryJob(self,jobID,param,failedInActive=False,changeJobInMem=False,inMemJob=None,
- getNewPandaID=False,attemptNr=None):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # update DB
- ret = proxy.retryJob(jobID,param,failedInActive,changeJobInMem,inMemJob,
- getNewPandaID,attemptNr)
- # release proxy
- self.proxyPool.putProxy(proxy)
- return ret
-
-
- # retry failed analysis jobs in Active4
- def retryJobsInActive(self,prodUserName,jobDefinitionID):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # update DB
- ret = proxy.retryJobsInActive(prodUserName,jobDefinitionID)
- # release proxy
- self.proxyPool.putProxy(proxy)
- return ret
-
-
- # activate jobs
- def activateJobs(self,jobs):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # loop over all jobs
- returns = []
- for job in jobs:
- # update DB
- ret = proxy.activateJob(job)
- returns.append(ret)
- # release proxy
- self.proxyPool.putProxy(proxy)
- return returns
-
-
- # send jobs to jobsWaiting
- def keepJobs(self,jobs):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # loop over all jobs
- returns = []
- for job in jobs:
- # update DB
- ret = proxy.keepJob(job)
- returns.append(ret)
- # release proxy
- self.proxyPool.putProxy(proxy)
- return returns
-
-
- # delete stalled jobs
- def deleteStalledJobs(self,libFileName):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # execute
- ret = proxy.deleteStalledJobs(libFileName)
- # release proxy
- self.proxyPool.putProxy(proxy)
- return ret
-
-
- # set debug mode
- def setDebugMode(self,dn,pandaID,prodManager,modeOn):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # check the number of debug jobs
- if modeOn == True:
- jobList = proxy.getActiveDebugJobs(dn)
- else:
- jobList = []
- if (not prodManager and len(jobList) >= ProcessGroups.maxDebugJobs) or \
- (prodManager and len(jobList) >= ProcessGroups.maxDebugProdJobs):
- # exceeded
-            retStr = 'You have already reached the maximum number of debug subjobs per '
- if not prodManager:
- retStr += 'user (%s). ' % ProcessGroups.maxDebugJobs
- else:
- retStr += 'prod user (%s). ' % ProcessGroups.maxDebugProdJobs
-            retStr += 'Please set debug mode off for one of the following PandaIDs: '
- for tmpID in jobList:
- retStr += '%s,' % tmpID
- retStr = retStr[:-1]
- else:
- # execute
- retStr = proxy.setDebugMode(dn,pandaID,prodManager,modeOn)
- # release proxy
- self.proxyPool.putProxy(proxy)
- return retStr
-
-
- # get jobs
- def getJobs(self,nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement,
- atlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup,allowOtherCountry):
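-        # note: the return value is the list of job objects with nSent and the proxyKey map
-        # appended as the last two elements (see the return statement below)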
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get waiting jobs
- jobs,nSent = proxy.getJobs(nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement,
- atlasRelease,prodUserID,countryGroup,workingGroup,allowOtherCountry)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # get Proxy Key
- proxyKey = {}
- if getProxyKey and len(jobs) > 0:
- # get MetaDB proxy
- proxy = self.proxyPool.getProxy()
- # get Proxy Key
- proxyKey = proxy.getProxyKey(jobs[0].prodUserID)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return jobs+[nSent,proxyKey]
-
-
- # run task assignment
- def runTaskAssignment(self,jobs):
- # get DB proxy
- proxy = self.proxyPool.getProxy()
- # loop over all jobs
- retList =[]
- newJobs =[]
- for job in jobs:
- ret = None
- if not job.taskID in ['NULL',0,'']:
- # get cloud
- cloudTask = proxy.getCloudTask(job.taskID)
- if cloudTask != None and cloudTask.status == 'assigned':
- ret = cloudTask.cloud
- if ret == None:
- # append for TA
- newJobs.append(job)
- retList.append(ret)
- # release DB proxy
- self.proxyPool.putProxy(proxy)
- # run setupper
- if newJobs != []:
- TaLauncher(self,newJobs).start()
- # return clouds
- return retList
-
-
- # reset modification time of a task to shorten retry interval
- def resetTmodCloudTask(self,tid):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # run
- res = proxy.resetTmodCloudTask(tid)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return res
-
-
- # get assigning task
- def getAssigningTask(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # run
- res = proxy.getAssigningTask()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return res
-
-
- # get fareshare policy
- def getFaresharePolicy(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # run
- res = proxy.getFaresharePolicy(True)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return res
-
-
- # check merge job generation status
- def checkMergeGenerationStatus(self,dn,jobID):
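-        # returns {'status': <state>, 'mergeIDs': [...]} where <state> is one of
-        # 'NA', 'standby', 'generating', 'generated' or 'aborted' (see the logic below)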
- # return for NA
- retNA = {'status':'NA','mergeIDs':[]}
- try:
- # get at most 2 PandaIDs
- idStatus = self.getPandIDsWithJobID(dn,jobID,2)
- if idStatus == {}:
- return retNA
- # use larger PandaID which corresponds to runXYZ
- tmpKeys = idStatus.keys()
- tmpKeys.sort()
- pandaID = tmpKeys[-1]
- # get job
- tmpJobs = self.getFullJobStatus([pandaID])
- if tmpJobs == [] or tmpJobs[0] == None:
- return retNA
- pandaJob = tmpJobs[0]
- # non-merge job
- if not '--mergeOutput' in pandaJob.jobParameters:
- return retNA
- # loop over all sub datasets
- subDsList = []
- mergeStatus = None
- mergeIDs = []
- for tmpFile in pandaJob.Files:
- if tmpFile.type in ['output','log']:
- if not tmpFile.destinationDBlock in subDsList:
- subDsList.append(tmpFile.destinationDBlock)
- # get dataset
- tmpDsSpec = self.queryDatasetWithMap({'name':tmpFile.destinationDBlock})
- if tmpDsSpec != None:
- if tmpDsSpec.status in ['tobemerged']:
- # going to be merged
- mergeStatus = 'generating'
- mergeIDs = []
- elif tmpDsSpec.status in ['tobeclosed','closed','completed']:
- # another dataset from --individualOutDS is waiting for Merger
- if mergeStatus == 'generating':
- continue
- # set status
- mergeStatus = 'generated'
- # collect JobIDs of merge jobs
- tmpMergeID = tmpDsSpec.MoverID
- if not tmpMergeID in [0,None,'NULL']+mergeIDs:
- mergeIDs.append(tmpMergeID)
- # no merger most likely because jobs were killed
- if mergeStatus == 'generated' and mergeIDs == []:
- mergeStatus = 'aborted'
-            # jobs are still running
- if mergeStatus == None:
- mergeStatus = 'standby'
- # return
- return {'status':mergeStatus,'mergeIDs':mergeIDs}
- except:
- return retNA
-
-
- # get job status
- def getJobStatus(self,jobIDs,fromDefined=True,fromActive=True,fromArchived=True,fromWaiting=True):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- retStatus = []
- # peek at job
- for jobID in jobIDs:
- res = proxy.peekJob(jobID,fromDefined,fromActive,fromArchived,fromWaiting)
- if res:
- retStatus.append(res.jobStatus)
- else:
- retStatus.append(None)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retStatus
-
-
- # peek at jobs
- def peekJobs(self,jobIDs,fromDefined=True,fromActive=True,fromArchived=True,fromWaiting=True,forAnal=False):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- retJobs = []
- # peek at job
- for jobID in jobIDs:
- res = proxy.peekJob(jobID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal)
- if res:
- retJobs.append(res)
- else:
- retJobs.append(None)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retJobs
-
-
- # get PandaID with jobexeID
- def getPandaIDwithJobExeID(self,jobexeIDs):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- retJobs = []
- # peek at job
- for jobexeID in jobexeIDs:
- res = proxy.getPandaIDwithJobExeID(jobexeID)
- retJobs.append(res)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retJobs
-
-
- # get slimmed file info with PandaIDs
- def getSlimmedFileInfoPandaIDs(self,pandaIDs):
- iPandaID = 0
- nPandaID = 100
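-        # query in chunks of nPandaID PandaIDs so that no single DB query gets too large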
- retInfo = {}
- while iPandaID < len(pandaIDs):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- tmpRetInfo = proxy.getSlimmedFileInfoPandaIDs(pandaIDs[iPandaID:iPandaID+nPandaID])
- # release proxy
- self.proxyPool.putProxy(proxy)
- iPandaID += nPandaID
- if retInfo == {}:
- retInfo = tmpRetInfo
- else:
- for outKey in tmpRetInfo.keys():
- if not retInfo.has_key(outKey):
- retInfo[outKey] = []
- # append
- for tmpItemRetInfo in tmpRetInfo[outKey]:
- if not tmpItemRetInfo in retInfo[outKey]:
- retInfo[outKey].append(tmpItemRetInfo)
- # return
- return retInfo
-
-
- # get JobIDs in a time range
- def getJobIDsInTimeRange(self,dn,timeRangeStr):
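-        # timeRangeStr is expected as 'YYYY-MM-DD HH:MM:SS', e.g. '2011-05-01 00:00:00' (illustrative value)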
- # check DN
- if dn in ['NULL','','None',None]:
- return []
- # check timeRange
- match = re.match('^(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)$',timeRangeStr)
- if match == None:
- return []
- timeRange = datetime.datetime(year = int(match.group(1)),
- month = int(match.group(2)),
- day = int(match.group(3)),
- hour = int(match.group(4)),
- minute = int(match.group(5)),
- second = int(match.group(6)))
-        # max range is 30 days
- maxRange = datetime.datetime.utcnow() - datetime.timedelta(days=30)
- if timeRange < maxRange:
- timeRange = maxRange
- retJobIDs = []
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get JobIDs
- retJobIDs = proxy.getJobIDsInTimeRange(dn,timeRange,retJobIDs)
- # release proxy
- self.proxyPool.putProxy(proxy)
-        # read ARCH when the time window is more than 3 days (- 3 hours as a margin)
- if timeRange < datetime.datetime.utcnow() - datetime.timedelta(days=2,hours=21) :
- # get ArchiveDBproxy
- proxy = self.proxyPool.getProxy()
- # get JobIDs
- retJobIDs = proxy.getJobIDsInTimeRangeLog(dn,timeRange,retJobIDs)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retJobIDs
-
-
- # get PandaIDs for a JobID
- def getPandIDsWithJobID(self,dn,jobID,nJobs):
- idStatus = {}
- # check DN
- if dn in ['NULL','','None',None]:
- return idStatus
- # check JobID
- try:
- jobID = long(jobID)
- nJobs = long(nJobs)
- except:
- return idStatus
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get IDs
- idStatus,buildJobID = proxy.getPandIDsWithJobID(dn,jobID,idStatus,nJobs)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # get ArchiveDBproxy
- proxy = self.proxyPool.getProxy()
- # get IDs
- idStatus = proxy.getPandIDsWithJobIDLog(dn,jobID,idStatus,nJobs,buildJobID)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return idStatus
-
-
- # get PandaIDs for a JobsetID or JobdefID in jobsArchived
- def getPandIDsWithIdInArch(self,prodUserName,id,isJobset):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.getPandIDsWithIdInArch(prodUserName,id,isJobset)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get beyond pledge resource ratio
- # ! this method is not thread-safe
- def getPledgeResourceRatio(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # get the ratio and read it before releasing the proxy
-        proxy.getPledgeResourceRatio()
-        ret = proxy.beyondPledgeRatio
-        # release proxy
-        self.proxyPool.putProxy(proxy)
-        # return
-        return ret
-
-
- # get the number of waiting jobs with a dataset
- def getNumWaitingJobsForPD2P(self,datasetName):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- nJobs = proxy.getNumWaitingJobsForPD2P(datasetName)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return nJobs
-
-
- # get the number of waiting jobsets with a dataset
- def getNumWaitingJobsetsForPD2P(self,datasetName):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- nJobs = proxy.getNumWaitingJobsetsForPD2P(datasetName)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return nJobs
-
-
- # lock job for re-brokerage
- def lockJobForReBrokerage(self,dn,jobID,simulation,forceOpt,forFailed=False):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get IDs
- ret = proxy.lockJobForReBrokerage(dn,jobID,simulation,forceOpt,forFailed)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # reset buildJob for re-brokerage
- def resetBuildJobForReBrokerage(self,pandaID):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get IDs
- ret = proxy.resetBuildJobForReBrokerage(pandaID)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get PandaIDs using libDS for re-brokerage
- def getPandaIDsForReBrokerage(self,userName,jobID,fromActive,forFailed=False):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get IDs
- ret = proxy.getPandaIDsForReBrokerage(userName,jobID,fromActive,forFailed)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
-    # get input datasets for re-brokerage
- def getInDatasetsForReBrokerage(self,jobID,userName):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get IDs
- ret = proxy.getInDatasetsForReBrokerage(jobID,userName)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get outDSs with userName/jobID
- def getOutDSsForReBrokerage(self,userName,jobID):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get IDs
- ret = proxy.getOutDSsForReBrokerage(userName,jobID)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get full job status
- def getFullJobStatus(self,jobIDs,fromDefined=True,fromActive=True,fromArchived=True,fromWaiting=True,forAnal=True):
- retJobMap = {}
- # peek at job
- for jobID in jobIDs:
- # get DBproxy for each job to avoid occupying connection for long time
- proxy = self.proxyPool.getProxy()
- # peek job
- res = proxy.peekJob(jobID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal)
- retJobMap[jobID] = res
- # release proxy
- self.proxyPool.putProxy(proxy)
- # get IDs
- for jobID in jobIDs:
- if retJobMap[jobID] == None:
- # get ArchiveDBproxy
- proxy = self.proxyPool.getProxy()
- # peek job
- res = proxy.peekJobLog(jobID)
- retJobMap[jobID] = res
- # release proxy
- self.proxyPool.putProxy(proxy)
- # sort
- retJobs = []
- for jobID in jobIDs:
- retJobs.append(retJobMap[jobID])
- # return
- return retJobs
-
-
- # get script for offline running
- def getScriptOfflineRunning(self,pandaID):
- try:
- # get job
- tmpJobs = self.getFullJobStatus([pandaID])
- if tmpJobs == [] or tmpJobs[0] == None:
- return "ERROR: Cannot get PandaID=%s in DB for the last 30 days" % pandaID
- tmpJob = tmpJobs[0]
- # check prodSourceLabel
- if not tmpJob.prodSourceLabel in ['managed','test']:
- return "ERROR: Non production job : prodSourceLabel=%s. This method is only for production jobs" % tmpJob.prodSourceLabel
- # release and trf
- tmpRels = tmpJob.homepackage.split("\n")
- tmpPars = tmpJob.jobParameters.split("\n")
- tmpTrfs = tmpJob.transformation.split("\n")
- if not (len(tmpRels) == len(tmpPars) == len(tmpTrfs)):
-                return "ERROR: The numbers of releases, parameters and trfs are inconsistent with each other"
- # construct script
- scrStr = "#retrieve inputs\n\n"
- # collect inputs
- dsFileMap = {}
- for tmpFile in tmpJob.Files:
- if tmpFile.type=='input':
- if not dsFileMap.has_key(tmpFile.dataset):
- dsFileMap[tmpFile.dataset] = []
- if not tmpFile.lfn in dsFileMap[tmpFile.dataset]:
- dsFileMap[tmpFile.dataset].append(tmpFile.lfn)
- # dq2
- for tmpDS,tmpFileList in dsFileMap.iteritems():
- scrStr += "dq2-get --files "
- for tmpLFN in tmpFileList:
- scrStr += "%s," % tmpLFN
- scrStr = scrStr[:-1]
- scrStr += " %s\n" % tmpDS
- # ln
- for tmpLFN in tmpFileList:
- scrStr += "ln -fs %s*/%s ./%s\n" % (tmpDS.rstrip("/"),tmpLFN,tmpLFN)
- scrStr += "\n#transform commands\n\n"
- bitNum = '32'
- if 'x86_64' in tmpJob.cmtConfig:
- bitNum = '64'
- for tmpIdx,tmpRel in enumerate(tmpRels):
- # asetup
- scrStr += "asetup %s,%s,%s\n" % tuple(tmpRel.split("/")+[bitNum])
- # athenaMP
- if not tmpJob.coreCount in ['NULL',None] and tmpJob.coreCount > 1:
- scrStr += "export ATHENA_PROC_NUMBER=%s\n" % tmpJob.coreCount
- # add double quotes for zsh
- tmpParamStr = tmpPars[tmpIdx]
- tmpSplitter = shlex.shlex(tmpParamStr, posix=True)
- tmpSplitter.whitespace = ' '
- tmpSplitter.whitespace_split = True
- # loop for params
- for tmpItem in tmpSplitter:
- tmpMatch = re.search('^([^=]+=)(.+)$',tmpItem)
- if tmpMatch != None:
- tmpArgName = tmpMatch.group(1)
- tmpArgVal = tmpMatch.group(2)
- tmpArgIdx = tmpParamStr.find(tmpArgName) + len(tmpArgName)
- # add "
- if tmpParamStr[tmpArgIdx] != '"':
- tmpParamStr = tmpParamStr.replace(tmpMatch.group(0),
- tmpArgName+'"'+tmpArgVal+'"')
- # run trf
- scrStr += "%s %s\n\n" % (tmpTrfs[tmpIdx],tmpParamStr)
- return scrStr
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("getScriptOfflineRunning : %s %s" % (errType,errValue))
- return "ERROR: ServerError with getScriptOfflineRunning"
-
-
- # kill jobs
- def killJobs(self,ids,user,code,prodManager,wgProdRole=[]):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- rets = []
- # kill jobs
- pandaIDforCloserMap = {}
- for id in ids:
- ret,userInfo = proxy.killJob(id,user,code,prodManager,True,wgProdRole)
- rets.append(ret)
- if ret and userInfo['prodSourceLabel'] in ['user','managed','test']:
- jobIDKey = (userInfo['prodUserID'],userInfo['jobDefinitionID'],userInfo['jobsetID'])
- if not pandaIDforCloserMap.has_key(jobIDKey):
- pandaIDforCloserMap[jobIDKey] = id
- # release proxy
- self.proxyPool.putProxy(proxy)
- # run Closer
- try:
- if pandaIDforCloserMap != {}:
- for pandaIDforCloser in pandaIDforCloserMap.values():
- tmpJobs = self.peekJobs([pandaIDforCloser])
- tmpJob = tmpJobs[0]
- if tmpJob != None:
- tmpDestDBlocks = []
- # get destDBlock
- for tmpFile in tmpJob.Files:
- if tmpFile.type in ['output','log']:
- if not tmpFile.destinationDBlock in tmpDestDBlocks:
- tmpDestDBlocks.append(tmpFile.destinationDBlock)
- # run
- cThr = Closer(self,tmpDestDBlocks,tmpJob)
- cThr.start()
- cThr.join()
- except:
- pass
- # return
- return rets
-
-
- # reassign jobs
- def reassignJobs(self,ids,attempt=0,joinThr=False,forkSetupper=False,forPending=False):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- jobs = []
- oldSubMap = {}
- # keep old assignment
- keepSiteFlag = False
- if (attempt % 2) != 0:
- keepSiteFlag = True
- # reset jobs
- for id in ids:
- try:
- # try to reset active job
- if not forPending:
- tmpRet = proxy.resetJob(id,keepSite=keepSiteFlag,getOldSubs=True)
- if isinstance(tmpRet,types.TupleType):
- ret,tmpOldSubList = tmpRet
- else:
- ret,tmpOldSubList = tmpRet,[]
- if ret != None:
- jobs.append(ret)
- for tmpOldSub in tmpOldSubList:
- if not oldSubMap.has_key(tmpOldSub):
- oldSubMap[tmpOldSub] = ret
- continue
- # try to reset waiting job
- tmpRet = proxy.resetJob(id,False,keepSite=keepSiteFlag,getOldSubs=False,forPending=forPending)
- if isinstance(tmpRet,types.TupleType):
- ret,tmpOldSubList = tmpRet
- else:
- ret,tmpOldSubList = tmpRet,[]
- if ret != None:
- jobs.append(ret)
- # waiting jobs don't create sub or dis
- continue
- # try to reset defined job
- if not forPending:
- tmpRet = proxy.resetDefinedJob(id,keepSite=keepSiteFlag,getOldSubs=True)
- if isinstance(tmpRet,types.TupleType):
- ret,tmpOldSubList = tmpRet
- else:
- ret,tmpOldSubList = tmpRet,[]
- if ret != None:
- jobs.append(ret)
- for tmpOldSub in tmpOldSubList:
- if not oldSubMap.has_key(tmpOldSub):
- oldSubMap[tmpOldSub] = ret
- continue
- except:
- pass
- # release DB proxy
- self.proxyPool.putProxy(proxy)
- # run Closer for old sub datasets
- if not forPending:
- for tmpOldSub,tmpJob in oldSubMap.iteritems():
- cThr = Closer(self,[tmpOldSub],tmpJob)
- cThr.start()
- cThr.join()
- # setup dataset
- if jobs != []:
- if joinThr:
- thr = Setupper(self,jobs,resubmit=True,ddmAttempt=attempt,forkRun=forkSetupper)
- thr.start()
- thr.join()
- else:
- # cannot use 'thr =' because it may trigger garbage collector
- Setupper(self,jobs,resubmit=True,ddmAttempt=attempt,forkRun=forkSetupper).start()
- # return
- return True
-
-
- # awake jobs in jobsWaiting
- def awakeJobs(self,ids):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- jobs = []
- # reset jobs
- for id in ids:
- # try to reset waiting job
- ret = proxy.resetJob(id,False)
- if ret != None:
- jobs.append(ret)
- # release DB proxy
- self.proxyPool.putProxy(proxy)
- # setup dataset
- Setupper(self,jobs).start()
- # return
- return True
-
-
- # query PandaIDs
- def queryPandaIDs(self,jobDefIDs):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- pandaIDs = []
- # query PandaID
- for jobDefID in jobDefIDs:
- id = proxy.queryPandaID(jobDefID)
- pandaIDs.append(id)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return pandaIDs
-
-
- # query job info per cloud
- def queryJobInfoPerCloud(self,cloud,schedulerID=None):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query job info
- ret = proxy.queryJobInfoPerCloud(cloud,schedulerID)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get PandaIDs to be updated in prodDB
- def getPandaIDsForProdDB(self,limit,lockedby):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query PandaID
- ret = proxy.getPandaIDsForProdDB(limit,lockedby)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # update prodDBUpdateTime
- def updateProdDBUpdateTimes(self,paramList):
- retList = []
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # update
- for param in paramList:
- ret = proxy.updateProdDBUpdateTime(param)
- retList.append(ret)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # get PandaIDs at Site
- def getPandaIDsSite(self,site,status,limit):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query PandaID
- ids = proxy.getPandaIDsSite(site,status,limit)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ids
-
-
-    # get input files currently in use for analysis
- def getFilesInUseForAnal(self,outDataset):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- retList = []
- # query LFNs
- retList = proxy.getFilesInUseForAnal(outDataset)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # get list of dis dataset to get input files in shadow
- def getDisInUseForAnal(self,outDataset):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query dis
- retList = proxy.getDisInUseForAnal(outDataset)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # get input LFNs currently in use for analysis with shadow dis
- def getLFNsInUseForAnal(self,inputDisList):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query dis
- retList = proxy.getLFNsInUseForAnal(inputDisList)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # update input files and return corresponding PandaIDs
- def updateInFilesReturnPandaIDs(self,dataset,status,fileLFN=''):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- retList = []
- # query PandaID
- retList = proxy.updateInFilesReturnPandaIDs(dataset,status,fileLFN)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # update file status in dispatch dataset
- def updateFileStatusInDisp(self,dataset,fileStatusMap):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query PandaID
- retVal = proxy.updateFileStatusInDisp(dataset,fileStatusMap)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retVal
-
-
- # update output files and return corresponding PandaIDs
- def updateOutFilesReturnPandaIDs(self,dataset,fileLFN=''):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- retList = []
- # query PandaID
- retList = proxy.updateOutFilesReturnPandaIDs(dataset,fileLFN)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # get datasets associated with file
-    def getDatasetWithFile(self,lfn,jobPriority=0):
-        # get DBproxy
-        proxy = self.proxyPool.getProxy()
-        # query datasets
-        retList = proxy.getDatasetWithFile(lfn,jobPriority)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # get _dis datasets associated to _sub
- def getAssociatedDisDatasets(self,subDsName):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- retList = []
- # query
- retList = proxy.getAssociatedDisDatasets(subDsName)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # insert sandbox file info
- def insertSandboxFileInfo(self,userName,hostName,fileName,fileSize,checkSum):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # exec
- ret= proxy.insertSandboxFileInfo(userName,hostName,fileName,fileSize,checkSum)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # check duplicated sandbox file
- def checkSandboxFile(self,userName,fileSize,checkSum):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # exec
- ret= proxy.checkSandboxFile(userName,fileSize,checkSum)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # insert datasets
- def insertDatasets(self,datasets):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- retList = []
- # insert
- for dataset in datasets:
- ret= proxy.insertDataset(dataset)
- retList.append(ret)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # query Dataset
- def queryDatasetWithMap(self,map):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query Dataset
- ret = proxy.queryDatasetWithMap(map)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # query last files in a dataset
- def queryLastFilesInDataset(self,datasets):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query files
- ret = proxy.queryLastFilesInDataset(datasets)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # set GUIDs
- def setGUIDs(self,files):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # set GUIDs
- ret = proxy.setGUIDs(files)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # query PandaID with dataset
- def queryPandaIDwithDataset(self,datasets):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query Dataset
- ret = proxy.queryPandaIDwithDataset(datasets)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # query PandaID with filenames
- def queryPandaIDwithLFN(self,lfns):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query Dataset
- ret = proxy.queryPandaIDwithLFN(lfns)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # update dataset
- def updateDatasets(self,datasets,withLock=False,withCriteria="",criteriaMap={}):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # update Dataset
- retList = proxy.updateDataset(datasets,withLock,withCriteria,criteriaMap)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # delete dataset
- def deleteDatasets(self,datasets):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- retList = []
- # query Dataset
- for dataset in datasets:
- ret = proxy.deleteDataset(dataset)
- retList.append(ret)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # query files with map
- def queryFilesWithMap(self,map):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query files
- ret = proxy.queryFilesWithMap(map)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # count the number of files with map
- def countFilesWithMap(self,map):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # query files
- ret = proxy.countFilesWithMap(map)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # count the number of pending files
- def countPendingFiles(self,pandaID,forInput=True):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # count files
- ret = proxy.countPendingFiles(pandaID,forInput)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get serial number for dataset
- def getSerialNumber(self,datasetname,definedFreshFlag=None):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get serial number
- ret = proxy.getSerialNumber(datasetname,definedFreshFlag)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get serial number for group job
- def getSerialNumberForGroupJob(self,name):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get serial number
- ret = proxy.getSerialNumberForGroupJob(name)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # add metadata
- def addMetadata(self,ids,metadataList):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # add metadata
- index = 0
- retList = []
- for id in ids:
- ret = proxy.addMetadata(id,metadataList[index])
- retList.append(ret)
- index += 1
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return retList
-
-
- # add stdout
- def addStdOut(self,id,stdout):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # add
- ret = proxy.addStdOut(id,stdout)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # extract name from DN
- def cleanUserID(self,id):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.cleanUserID(id)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # extract scope from dataset name
- def extractScope(self,name):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.extractScope(name)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # change job priorities
- def changeJobPriorities(self,newPrioMap):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.changeJobPriorities(newPrioMap)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get destinationDBlockToken for a dataset
- def getDestTokens(self,dsname):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get token
- ret = proxy.getDestTokens(dsname)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get destinationSE for a dataset
- def getDestSE(self,dsname,fromArch=False):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get token
- ret = proxy.getDestSE(dsname,fromArch)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get job statistics
- def getJobStatistics(self,archived=False,predefined=False,workingGroup='',countryGroup='',jobType='',forAnal=None,minPriority=None):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # get job statistics
- ret = proxy.getJobStatistics(archived,predefined,workingGroup,countryGroup,jobType,forAnal,minPriority)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get job statistics with label
- def getJobStatisticsWithLabel(self,siteStr=''):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # get job statistics
- ret = proxy.getJobStatisticsWithLabel(siteStr)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get job statistics for brokerage
- def getJobStatisticsBrokerage(self,minPrio=None):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get stat
- ret = proxy.getJobStatisticsBrokerage(minPrio)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # convert
- conRet = ProcessGroups.countJobsPerGroup(ret)
- # return
- return conRet
-
-
- # get job statistics for analysis brokerage
- def getJobStatisticsAnalBrokerage(self,minPriority=None):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get stat
- ret = proxy.getJobStatisticsAnalBrokerage(minPriority=minPriority)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # convert
- conRet = ProcessGroups.countJobsPerGroupForAnal(ret)
- # return
- return conRet
-
-
- # get the number of waiting jobs per site and user
- def getJobStatisticsPerUserSite(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get stat
- ret = proxy.getJobStatisticsPerUserSite()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get highest prio jobs
- def getHighestPrioJobStat(self,perPG=False,useMorePG=False):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get stat
- if not perPG:
- ret = proxy.getHighestPrioJobStat()
- else:
- ret = proxy.getHighestPrioJobStatPerPG(useMorePG)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get queued analysis jobs at a site
- def getQueuedAnalJobs(self,site,dn):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get stat
- ret = proxy.getQueuedAnalJobs(site,dn)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get job statistics for ExtIF
- def getJobStatisticsForExtIF(self,sourcetype=None):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # get job statistics
- ret = proxy.getJobStatisticsForExtIF(sourcetype)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get job statistics for Bamboo
- def getJobStatisticsForBamboo(self,useMorePG=False):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # get job statistics
- ret = proxy.getJobStatisticsPerProcessingType(useMorePG)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get number of analysis jobs per user
- def getNUserJobs(self,siteName,nJobs):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get number of analysis jobs per user
- tmpRet = proxy.getNUserJobs(siteName,nJobs)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # get log proxy
- proxy = self.proxyPool.getProxy()
- # get Proxy Key
- ret = {}
- for userID,nJobs in tmpRet.iteritems():
- proxyKey = proxy.getProxyKey(userID)
- if proxyKey != {}:
- # add nJobs
- proxyKey['nJobs'] = nJobs
- # append
- ret[userID] = proxyKey
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get number of activated analysis jobs
- def getNAnalysisJobs(self,nProcesses):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # count
- ret = proxy.getNAnalysisJobs(nProcesses)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # update transfer status for a dataset
- def updateTransferStatus(self,datasetname,bitMap):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # update
- ret = proxy.updateTransferStatus(datasetname,bitMap)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get CloudTask
- def getCloudTask(self,tid):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # get CloudTask
- ret = proxy.getCloudTask(tid)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # set cloud to CloudTask
- def setCloudTask(self,cloudTask):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # set CloudTask
- ret = proxy.setCloudTask(cloudTask)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # see CloudTask
- def seeCloudTask(self,tid):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # see CloudTask
- ret = proxy.seeCloudTask(tid)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # set cloud to CloudTask by user
- def setCloudTaskByUser(self,user,tid,cloud,status):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # set cloud for CloudTask
- ret = proxy.setCloudTaskByUser(user,tid,cloud,status)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # update site data
- def updateSiteData(self,hostID,pilotRequests):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # update site data
- ret = proxy.updateSiteData(hostID,pilotRequests)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get current site data
- def getCurrentSiteData(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # get current site data
- ret = proxy.getCurrentSiteData()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # insert nRunning in site data
- def insertnRunningInSiteData(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # insert nRunning
- ret = proxy.insertnRunningInSiteData()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get nRunning in site data
- def getnRunningInSiteData(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
-        # get nRunning
- ret = proxy.getnRunningInSiteData()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get site list
- def getSiteList(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get site info
- ret = proxy.getSiteList()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get site info
- def getSiteInfo(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get site info
- ret = proxy.getSiteInfo()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get cloud list
- def getCloudList(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get cloud list
- ret = proxy.getCloudList()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # check sites with release/cache
- def checkSitesWithRelease(self,sites,releases=None,caches=None,cmtConfig=None):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # check
- ret = proxy.checkSitesWithRelease(sites,releases,caches,cmtConfig)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get sites with release/cache in cloud
- def getSitesWithReleaseInCloud(self,cloud,releases=None,caches=None,validation=False):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # check
- ret = proxy.getSitesWithReleaseInCloud(cloud,releases,caches,validation)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get list of cache prefix
- def getCachePrefixes(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # check
- ret = proxy.getCachePrefixes()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get pilot owners
- def getPilotOwners(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get pilot owners
- ret = proxy.getPilotOwners()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get allowed nodes
- def getAllowedNodes(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.getAllowedNodes()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get email address
- def getEmailAddr(self,name):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.getEmailAddr(name)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get client version
- def getPandaClientVer(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.getPandaClientVer()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # register proxy key
- def registerProxyKey(self,params):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # register proxy key
- ret = proxy.registerProxyKey(params)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get proxy key
- def getProxyKey(self,dn):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get proxy key
- ret = proxy.getProxyKey(dn)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # add account to siteaccess
- def addSiteAccess(self,siteID,dn):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # add account to siteaccess
- ret = proxy.addSiteAccess(siteID,dn)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # list site access
- def listSiteAccess(self,siteid,dn,longFormat=False):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # list site access
- ret = proxy.listSiteAccess(siteid,dn,longFormat)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # update site access
- def updateSiteAccess(self,method,siteid,requesterDN,userName,attrValue):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # update site access
- ret = proxy.updateSiteAccess(method,siteid,requesterDN,userName,attrValue)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # generate pilot token
- def genPilotToken(self,schedulerhost,scheduleruser,schedulerid):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.genPilotToken(schedulerhost,scheduleruser,schedulerid)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # add files to memcached
- def addFilesToMemcached(self,site,node,files):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.addFilesToMemcached(site,node,files)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # delete files from memcached
- def deleteFilesFromMemcached(self,site,node,files):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.deleteFilesFromMemcached(site,node,files)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # flush memcached
- def flushMemcached(self,site,node):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.flushMemcached(site,node)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
- # check files with memcached
- def checkFilesWithMemcached(self,site,node,files):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.checkFilesWithMemcached(site,node,files)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get list of scheduler users
- def getListSchedUsers(self):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.getListSchedUsers()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # query an SQL return Status
- def querySQLS(self,sql,varMap,arraySize=1000):
- # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.querySQLS(sql,varMap,arraySize)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # check quota
- def checkQuota(self,dn):
-        # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.checkQuota(dn)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get JobID for user
- def getJobIdUser(self,dn):
-        # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.getJobIdUser(dn)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get user subscriptions
- def getUserSubscriptions(self,datasetName,timeRange):
-        # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.getUserSubscriptions(datasetName,timeRange)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get the number of user subscriptions
- def getNumUserSubscriptions(self):
-        # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.getNumUserSubscriptions()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # add user subscriptions
- def addUserSubscription(self,datasetName,dq2IDs):
-        # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.addUserSubscription(datasetName,dq2IDs)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # increment counter for subscription
- def incrementUsedCounterSubscription(self,datasetName):
-        # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.incrementUsedCounterSubscription(datasetName)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get active datasets
- def getActiveDatasets(self,computingSite,prodSourceLabel):
-        # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.getActiveDatasets(computingSite,prodSourceLabel)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # check status of all sub datasets to trigger Notifier
- def checkDatasetStatusForNotifier(self,jobsetID,jobDefinitionID,prodUserName):
-        # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.checkDatasetStatusForNotifier(jobsetID,jobDefinitionID,prodUserName)
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
- # get MoU share for T2 PD2P
- def getMouShareForT2PD2P(self):
-        # get DBproxy
- proxy = self.proxyPool.getProxy()
- # get
- ret = proxy.getMouShareForT2PD2P()
- # release proxy
- self.proxyPool.putProxy(proxy)
- # return
- return ret
-
-
-# Singleton
-taskBuffer = TaskBuffer()
-
diff --git a/current/pandaserver/taskbuffer/Utils.py b/current/pandaserver/taskbuffer/Utils.py
deleted file mode 100755
index e3ad1efe9..000000000
--- a/current/pandaserver/taskbuffer/Utils.py
+++ /dev/null
@@ -1,512 +0,0 @@
-"""
-utility service
-
-"""
-import os
-import re
-import sys
-import zlib
-import uuid
-import time
-import socket
-import struct
-import datetime
-import jobdispatcher.Protocol as Protocol
-import ErrorCode
-from userinterface import Client
-from config import panda_config
-
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('Utils')
-
-# check if server is alive
-def isAlive(req):
- return "alive=yes"
-
-
-# extract name from DN
-def cleanUserID(id):
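-    # strips grid DN components down to the bare user name, e.g. (illustrative)
-    # '/DC=ch/DC=cern/OU=Users/CN=jdoe/CN=123456/CN=John Doe' -> 'John Doe'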
- try:
- up = re.compile('/(DC|O|OU|C|L)=[^\/]+')
- username = up.sub('', id)
- up2 = re.compile('/CN=[0-9]+')
- username = up2.sub('', username)
- up3 = re.compile(' [0-9]+')
- username = up3.sub('', username)
- up4 = re.compile('_[0-9]+')
- username = up4.sub('', username)
- username = username.replace('/CN=proxy','')
- username = username.replace('/CN=limited proxy','')
- username = username.replace('limited proxy','')
- username = re.sub('/CN=Robot:[^/]+','',username)
- pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)')
- mat = pat.match(username)
- if mat:
- username = mat.group(2)
- else:
- username = username.replace('/CN=','')
- if username.lower().find('/email') > 0:
- username = username[:username.lower().find('/email')]
- pat = re.compile('.*(limited.*proxy).*')
- mat = pat.match(username)
- if mat:
- username = mat.group(1)
- username = username.replace('(','')
- username = username.replace(')','')
- username = username.replace("'",'')
- return username
- except:
- return id
-
-
-# insert with retry
-def insertWithRetryCassa(familyName,keyName,valMap,msgStr,nTry=3):
-    # pycassa is only imported inside putFile/getFile below, so import it here as well
-    # so that the except clause can resolve MaximumRetryException
-    import pycassa
-    for iTry in range(nTry):
- try:
- familyName.insert(keyName,valMap)
- except pycassa.MaximumRetryException,tmpE:
- if iTry+1 < nTry:
- _logger.debug("%s sleep %s/%s" % (msgStr,iTry,nTry))
- time.sleep(30)
- else:
- raise pycassa.MaximumRetryException,tmpE.value
- else:
- break
-
-
-# touch in Cassandra
-def touchFileCassa(filefamily,fileKeyName,timeNow):
- try:
- # get old timestamp
- oldFileInfo = filefamily.get(fileKeyName)
- except:
- _logger.warning('cannot get old fileinfo for %s from Cassandra' % fileKeyName)
- return False
- try:
- # update time in fileTable
- for splitIdx in range(oldFileInfo['nSplit']):
- tmpFileKeyName = fileKeyName
- if splitIdx != 0:
- tmpFileKeyName += '_%s' % splitIdx
- insertWithRetryCassa(filefamily,tmpFileKeyName,
- {'year' : timeNow.year,
- 'month' : timeNow.month,
- 'day' : timeNow.day,
- 'hour' : timeNow.hour,
- 'minute' : timeNow.minute,
- 'second' : timeNow.second},
- 'touchFileCassa : %s' % fileKeyName
- )
- return True
- except:
- errType,errValue = sys.exc_info()[:2]
- errStr = "cannot touch %s due to %s %s" % (fileKeyName,errType,errValue)
- _logger.error(errStr)
- return False
-
-
-# upload file
-def putFile(req,file):
- if not Protocol.isSecure(req):
- return False
- if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
- return False
- _logger.debug("putFile : start %s %s" % (req.subprocess_env['SSL_CLIENT_S_DN'],file.filename))
- # size check
- fullSizeLimit = 768*1024*1024
- if not file.filename.startswith('sources.'):
- noBuild = True
- sizeLimit = 10*1024*1024
- else:
- noBuild = False
- sizeLimit = fullSizeLimit
- # get file size
- contentLength = 0
- try:
- contentLength = long(req.headers_in["content-length"])
- except:
- if req.headers_in.has_key("content-length"):
- _logger.error("cannot get CL : %s" % req.headers_in["content-length"])
- else:
- _logger.error("no CL")
- _logger.debug("size %s" % contentLength)
- if contentLength > sizeLimit:
- errStr = "ERROR : Upload failure. Exceeded size limit %s>%s." % (contentLength,sizeLimit)
- if noBuild:
- errStr += " Please submit the job without --noBuild/--libDS since those options impose a tighter size limit"
- else:
- errStr += " Please remove redundant files from your workarea"
- _logger.error(errStr)
- _logger.debug("putFile : end")
- return errStr
- try:
- fileFullPath = '%s/%s' % (panda_config.cache_dir,file.filename.split('/')[-1])
- # avoid overwriting
- if os.path.exists(fileFullPath):
- # touch
- os.utime(fileFullPath,None)
- # send error message
- errStr = "ERROR : Cannot overwrite file"
- _logger.debug('putFile : cannot overwrite file %s' % file.filename)
- _logger.debug("putFile : end")
- return errStr
- # write
- fo = open(fileFullPath,'wb')
- fileContent = file.file.read()
- fo.write(fileContent)
- fo.close()
- except:
- errStr = "ERROR : Cannot write file"
- _logger.error(errStr)
- _logger.debug("putFile : end")
- return errStr
- # checksum
- try:
- # decode Footer
- footer = fileContent[-8:]
- checkSum,isize = struct.unpack("II",footer)
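-        # per RFC 1952 the last 8 bytes of a gzip stream are CRC32 and ISIZE, both
-        # little-endian; "II" uses native byte order, "<II" would make that explicit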
- _logger.debug("CRC from gzip Footer %s" % checkSum)
- except:
- # calculate on the fly
- """
- import zlib
- checkSum = zlib.adler32(fileContent) & 0xFFFFFFFF
- """
- # use None to avoid delay for now
- checkSum = None
- _logger.debug("CRC calculated %s" % checkSum)
- # file size
- fileSize = len(fileContent)
- # user name
- username = cleanUserID(req.subprocess_env['SSL_CLIENT_S_DN'])
- _logger.debug("putFile : written dn=%s file=%s size=%s crc=%s" % \
- (username,file.filename,fileSize,checkSum))
- # put file info to DB
- statClient,outClient = Client.insertSandboxFileInfo(username,file.filename,
- fileSize,checkSum)
- if statClient != 0 or outClient.startswith("ERROR"):
- _logger.error("putFile : failed to put sandbox to DB with %s %s" % (statClient,outClient))
- #_logger.debug("putFile : end")
- #return "ERROR : Cannot insert sandbox to DB"
- else:
- _logger.debug("putFile : inserted sandbox to DB with %s" % outClient)
- # store to cassandra
- if hasattr(panda_config,'cacheUseCassandra') and panda_config.cacheUseCassandra == True:
- try:
- # time-stamp
- timeNow = datetime.datetime.utcnow()
- creationTime = timeNow.strftime('%Y-%m-%d %H:%M:%S')
- # user name
- username = req.subprocess_env['SSL_CLIENT_S_DN']
- username = username.replace('/CN=proxy','')
- username = username.replace('/CN=limited proxy','')
- # file size
- fileSize = len(fileContent)
- # key
- fileKeyName = file.filename.split('/')[-1]
- sizeCheckSum = '%s:%s' % (fileSize,checkSum)
- # insert to cassandra
- import pycassa
- pool = pycassa.ConnectionPool(panda_config.cacheKeySpace)
- filefamily = pycassa.ColumnFamily(pool,panda_config.cacheFileTable)
- # avoid overwriting
- gotoNextCassa = True
- if filefamily.get_count(fileKeyName) > 0:
- # touch
- touchFlag = touchFileCassa(filefamily,fileKeyName,timeNow)
- if touchFlag:
- gotoNextCassa = False
- # send error message
- errStr = "ERROR : Cannot overwrite file in Cassandra"
- _logger.error(errStr)
- if not panda_config.cacheIgnoreCassandraError:
- _logger.debug("putFile : end")
- return errStr
- # check uniqueness with size and checksum
- if gotoNextCassa:
- try:
- uniqExp = pycassa.index.create_index_expression('uniqID',sizeCheckSum)
- userExp = pycassa.index.create_index_expression('user',username)
- tmpClause = pycassa.index.create_index_clause([uniqExp,userExp])
- tmpResults = filefamily.get_indexed_slices(tmpClause,columns=['creationTime'])
- for oldFileKeyName,tmpDict in tmpResults:
- _logger.debug('The same size and chksum %s found in old:%s and new:%s' % \
- (sizeCheckSum,oldFileKeyName,fileKeyName))
- # touch
- touchFlag = touchFileCassa(filefamily,oldFileKeyName,timeNow)
- if touchFlag:
- # make alias
- _logger.debug('Making alias %s->%s' % (fileKeyName,oldFileKeyName))
- insertWithRetryCassa(filefamily,fileKeyName,
- {'alias':oldFileKeyName,
- 'creationTime':creationTime,
- 'nSplit':0,
- },
- 'putFile : make alias for %s' % file.filename
- )
- # set time
- touchFileCassa(filefamily,fileKeyName,timeNow)
- _logger.debug("putFile : end")
- return True
- except:
- gotoNextCassa = False
- errType,errValue = sys.exc_info()[:2]
- errStr = "cannot make alias for %s due to %s %s" % (fileKeyName,errType,errValue)
- _logger.error(errStr)
- if not panda_config.cacheIgnoreCassandraError:
- _logger.debug("putFile : end")
- return errStr
- # insert new record
- if gotoNextCassa:
- splitIdx = 0
- splitSize = 5 * 1024 * 1024
- nSplit,tmpMod = divmod(len(fileContent),splitSize)
- if tmpMod != 0:
- nSplit += 1
- _logger.debug('Inserting %s with %s blocks' % (fileKeyName,nSplit))
- for splitIdx in range(nSplit):
- # split to small chunks since cassandra is not good at large files
- tmpFileContent = fileContent[splitSize*splitIdx:splitSize*(splitIdx+1)]
- tmpFileKeyName = fileKeyName
- tmpAttMap = {'file':tmpFileContent,
- 'user':username,
- 'creationTime':creationTime,
- }
- if splitIdx == 0:
- tmpAttMap['size'] = fileSize
- tmpAttMap['nSplit'] = nSplit
- tmpAttMap['uniqID'] = sizeCheckSum
- tmpAttMap['checkSum'] = str(checkSum)
- else:
- tmpFileKeyName += '_%s' % splitIdx
- tmpAttMap['size'] = 0
- tmpAttMap['nSplit'] = 0
- # insert with retry
- insertWithRetryCassa(filefamily,tmpFileKeyName,tmpAttMap,
- 'putFile : insert %s' % file.filename)
- # set time
- touchFileCassa(filefamily,fileKeyName,timeNow)
- except:
- errType,errValue = sys.exc_info()[:2]
- errStr = "cannot put %s into Cassandra due to %s %s" % (fileKeyName,errType,errValue)
- _logger.error(errStr)
- # send error message
- errStr = "ERROR : " + errStr
- if not panda_config.cacheIgnoreCassandraError:
- _logger.debug("putFile : end")
- return errStr
- _logger.debug("putFile : %s end" % file.filename)
- return True
-
-
-# get file
-def getFile(req,fileName):
- _logger.debug("getFile : %s start" % fileName)
- try:
- # look into cassandra
- import pycassa
- pool = pycassa.ConnectionPool(panda_config.cacheKeySpace)
- filefamily = pycassa.ColumnFamily(pool,panda_config.cacheFileTable)
- fileInfo = filefamily.get(fileName)
- # check alias
- if fileInfo.has_key('alias') and fileInfo['alias'] != '':
- realFileName = fileInfo['alias']
- fileInfo = filefamily.get(realFileName)
- _logger.debug("getFile : %s use alias=%s" % (fileName,realFileName))
- else:
- realFileName = fileName
- # check cached file
- hostKey = socket.gethostname() + '_cache'
- if fileInfo.has_key(hostKey) and fileInfo[hostKey] != '':
- _logger.debug("getFile : %s found cache=%s" % (fileName,fileInfo[hostKey]))
- try:
- fileFullPath = '%s%s' % (panda_config.cache_dir,fileInfo[hostKey])
- # touch
- os.utime(fileFullPath,None)
- _logger.debug("getFile : %s end" % fileName)
- # return
- return ErrorCode.EC_Redirect('/cache%s' % fileInfo[hostKey])
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.debug("getFile : %s failed to touch %s due to %s:%s" % (fileName,fileFullPath,errtype,errvalue))
- # write to cache file
- fileRelPath = '/cassacache/%s' % str(uuid.uuid4())
- fileFullPath = '%s%s' % (panda_config.cache_dir,fileRelPath)
- _logger.debug("getFile : %s write cache to %s" % (fileName,fileFullPath))
- fo = open(fileFullPath,'wb')
- fo.write(fileInfo['file'])
- if fileInfo['nSplit'] > 1:
- for splitIdx in range(fileInfo['nSplit']):
- if splitIdx == 0:
- continue
- fileInfo = filefamily.get(realFileName+'_%s' % splitIdx)
- fo.write(fileInfo['file'])
- fo.close()
- # set cache name in DB
- insertWithRetryCassa(filefamily,realFileName,{hostKey:fileRelPath},
- 'getFile : set cache for %s' % fileName)
- _logger.debug("getFile : %s end" % fileName)
- # return
- return ErrorCode.EC_Redirect('/cache%s' % fileRelPath)
- except pycassa.NotFoundException:
- _logger.error("getFile : %s not found" % fileName)
- return ErrorCode.EC_NotFound
- except:
- errtype,errvalue = sys.exc_info()[:2]
- errStr = "getFile : %s %s for %s" % (errtype,errvalue,fileName)
- _logger.error(errStr)
- raise RuntimeError,errStr
-
-
-# put event picking request
-def putEventPickingRequest(req,runEventList='',eventPickDataType='',eventPickStreamName='',
- eventPickDS='',eventPickAmiTag='',userDatasetName='',lockedBy='',
- params='',inputFileList=''):
- if not Protocol.isSecure(req):
- return "ERROR : no HTTPS"
- userName = req.subprocess_env['SSL_CLIENT_S_DN']
- creationTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
- _logger.debug("putEventPickingRequest : %s start" % userName)
- # size check
- sizeLimit = 10*1024*1024
- # get total size
- try:
- contentLength = long(req.headers_in["content-length"])
- except:
- errStr = "cannot get content-length from HTTP request."
- _logger.error("putEventPickingRequest : " + errStr + " " + userName)
- _logger.debug("putEventPickingRequest : %s end" % userName)
- return "ERROR : " + errStr
- _logger.debug("size %s" % contentLength)
- if contentLength > sizeLimit:
- errStr = "Too large run/event list. Exceeded size limit %s>%s." % (contentLength,sizeLimit)
- _logger.error("putEventPickingRequest : " + errStr + " " + userName)
- _logger.debug("putEventPickingRequest : %s end" % userName)
- return "ERROR : " + errStr
- try:
- # make filename
- evpFileName = '%s/evp.%s' % (panda_config.cache_dir,str(uuid.uuid4()))
- _logger.debug("putEventPickingRequest : %s -> %s" % (userName,evpFileName))
- # write
- fo = open(evpFileName,'wb')
- fo.write("userName=%s\n" % userName)
- fo.write("creationTime=%s\n" % creationTime)
- fo.write("eventPickDataType=%s\n" % eventPickDataType)
- fo.write("eventPickStreamName=%s\n" % eventPickStreamName)
- fo.write("eventPickDS=%s\n" % eventPickDS)
- fo.write("eventPickAmiTag=%s\n" % eventPickAmiTag)
- fo.write("userDatasetName=%s\n" % userDatasetName)
- fo.write("lockedBy=%s\n" % lockedBy)
- fo.write("params=%s\n" % params)
- fo.write("inputFileList=%s\n" % inputFileList)
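-        # each line of runEventList is expected to hold a run number and an event number
-        # separated by whitespace, e.g. '167776 1382396' (illustrative values)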
- for tmpLine in runEventList.split('\n'):
- tmpItems = tmpLine.split()
- if len(tmpItems) != 2:
- continue
- fo.write("runEvent=%s,%s\n" % tuple(tmpItems))
- fo.close()
- except:
- errType,errValue = sys.exc_info()[:2]
- errStr = "cannot put request due to %s %s" % (errType,errValue)
- _logger.error("putEventPickingRequest : " + errStr + " " + userName)
- return "ERROR : " + errStr
- _logger.debug("putEventPickingRequest : %s end" % userName)
- return True
-
-
-# delete file
-def deleteFile(req,file):
- if not Protocol.isSecure(req):
- return 'False'
- try:
- # may be reused for rebrokerage
- #os.remove('%s/%s' % (panda_config.cache_dir,file.split('/')[-1]))
- return 'True'
- except:
- return 'False'
-
-
-# touch file
-def touchFile(req,filename):
- if not Protocol.isSecure(req):
- return 'False'
- try:
- os.utime('%s/%s' % (panda_config.cache_dir,filename.split('/')[-1]),None)
- return 'True'
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("touchFile : %s %s" % (errtype,errvalue))
- return 'False'
-
-
-# get server name:port for SSL
-def getServer(req):
- return "%s:%s" % (panda_config.pserverhost,panda_config.pserverport)
-
-
-# update stdout
-def updateLog(req,file):
- _logger.debug("updateLog : %s start" % file.filename)
- # write to file
- try:
- # expand
- extStr = zlib.decompress(file.file.read())
- # stdout name
- logName = '%s/%s' % (panda_config.cache_dir,file.filename.split('/')[-1])
- # append
- ft = open(logName,'a')
- ft.write(extStr)
- ft.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("updateLog : %s %s" % (type,value))
- _logger.debug("updateLog : %s end" % file.filename)
- return True
-
-
-# fetch stdout
-def fetchLog(req,logName,offset=0):
- _logger.debug("fetchLog : %s start offset=%s" % (logName,offset))
- # put dummy char to avoid Internal Server Error
- retStr = ' '
- try:
- # stdout name
- fullLogName = '%s/%s' % (panda_config.cache_dir,logName.split('/')[-1])
- # read
- ft = open(fullLogName,'r')
- ft.seek(long(offset))
- retStr += ft.read()
- ft.close()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("fetchLog : %s %s" % (type,value))
- _logger.debug("fetchLog : %s end read=%s" % (logName,len(retStr)))
- return retStr
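
Note that fetchLog always prepends one dummy character to the payload, so a caller that keeps its own offset has to discount it. A minimal client-side sketch that drains whatever is currently available (tail_log and the fetch callable are illustrative; in practice the call goes through the server's HTTPS interface):

# illustrative polling loop; 'fetch' mimics fetchLog(logName, offset)
def tail_log(fetch, logName):
    offset = 0
    collected = ''
    while True:
        chunk = fetch(logName, offset)
        newData = chunk[1:]            # drop the dummy leading character
        if not newData:
            break
        collected += newData
        offset += len(newData)         # next read starts where this one ended
    return collected
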
-
-
-# get VOMS attributes
-def getVomsAttr(req):
- vomsAttrs = []
- for tmpKey,tmpVal in req.subprocess_env.iteritems():
- # compact credentials
- if tmpKey.startswith('GRST_CRED_'):
- vomsAttrs.append('%s : %s\n' % (tmpKey,tmpVal))
- vomsAttrs.sort()
- retStr = ''
- for tmpStr in vomsAttrs:
- retStr += tmpStr
- return retStr
-
-
-# get all attributes
-def getAttr(req):
- allAttrs = []
- for tmpKey,tmpVal in req.subprocess_env.iteritems():
- allAttrs.append('%s : %s\n' % (tmpKey,tmpVal))
- allAttrs.sort()
- retStr = ''
- for tmpStr in allAttrs:
- retStr += tmpStr
- return retStr
diff --git a/current/pandaserver/taskbuffer/WrappedPickle.py b/current/pandaserver/taskbuffer/WrappedPickle.py
deleted file mode 100644
index a3e1fa12f..000000000
--- a/current/pandaserver/taskbuffer/WrappedPickle.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import sys
-import StringIO
-import cPickle as pickle
-
-# wrapper to avoid de-serializing unsafe objects
-class WrappedPickle(object):
- # allowed modules and classes
- allowedModClass = {
- 'copy_reg' : ['_reconstructor'],
- '__builtin__' : ['object'],
- 'datetime' : ['datetime'],
- 'taskbuffer.JobSpec' : ['JobSpec'],
- 'taskbuffer.FileSpec' : ['FileSpec'],
- }
-
- # check module and class
- @classmethod
- def find_class(cls,module,name):
- # check module
- if not cls.allowedModClass.has_key(module):
- raise pickle.UnpicklingError,'Attempting to import disallowed module %s' % module
- # import module
- __import__(module)
- mod = sys.modules[module]
- # check class
- if not name in cls.allowedModClass[module]:
- raise pickle.UnpicklingError,'Attempting to get disallowed class %s in %s' % (name,module)
- klass = getattr(mod,name)
- return klass
-
- # loads
- @classmethod
- def loads(cls,pickle_string):
- pickle_obj = pickle.Unpickler(StringIO.StringIO(pickle_string))
- pickle_obj.find_global = cls.find_class
- return pickle_obj.load()
-
-
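A minimal usage sketch for the wrapper above (the safeLoads helper and its None fallback are illustrative): WrappedPickle.loads reconstructs only the whitelisted modules and classes, and raises UnpicklingError for anything else.

# illustrative caller for the restricted unpickler
import cPickle as pickle
from taskbuffer.WrappedPickle import WrappedPickle

def safeLoads(pickledString):
    try:
        # only the whitelisted modules/classes are reconstructed
        return WrappedPickle.loads(pickledString)
    except pickle.UnpicklingError:
        # the stream referenced a disallowed module or class
        return None
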
diff --git a/current/pandaserver/taskbuffer/__init__.py b/current/pandaserver/taskbuffer/__init__.py
deleted file mode 100755
index e69de29bb..000000000
diff --git a/current/pandaserver/test/XrdAna.py b/current/pandaserver/test/XrdAna.py
deleted file mode 100755
index 37cea8021..000000000
--- a/current/pandaserver/test/XrdAna.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import os
-import re
-import sys
-import commands
-
-tarList = []
-realTime = []
-timeStamps = {}
-for item in os.listdir('.'):
- if item.endswith('log.tgz'):
- commands.getoutput('tar xvfz %s' % item)
- for dirItem in os.listdir('.'):
- if os.path.isdir(dirItem):
- foundTime = False
- file = open('%s/pilot_child.stdout' % dirItem)
- event = -1
- for line in file:
- line = re.sub('\n','',line)
- if line.startswith('AthenaEventLoopMgr INFO ===>>> start of event') \
- or line.startswith('Init Time :') or line.startswith('Wake Time :'):
- #event = line.split()[-2]
- event += 1
- match = re.search('Wake Time : \d{4}-\d{2}-\d{2} (\d{2}:\d{2}:\d{2}\.\d{3})',line)
- if line.startswith('Exec Time :') or line.startswith('Init Time :') \
- or match != None:
- if match != None:
- timeVal = match.group(1)
- else:
- timeVal = line.split()[-1]
- if not (int(event) < 10 or int(event) % 10 == 0):
- continue
- if not timeStamps.has_key(event):
- timeStamps[event] = []
- timeStamps[event].append(timeVal)
- if line.startswith('real'):
- rT = re.sub('m',':',line.split()[-1])
- rT = re.sub('s','',rT)
- realTime.append(rT)
- file.close()
- commands.getoutput('rm -rf %s' % dirItem)
-outReal = open('real.txt','w')
-for rT in realTime:
- outReal.write('%s\n' % rT)
-outReal.close()
-nStamp = 0
-events = timeStamps.keys()
-events.sort()
-outStamp = open('stamp.txt','w')
-for event in events:
- stamps = timeStamps[event]
- if nStamp == 0:
- nStamp = len(stamps)
- if nStamp != len(stamps):
- print "ERROR : invalid nStamp %s %s" % (nStamp,len(stamps))
- str = '%s' % event
- for s in stamps:
- str += ',%s' % s
- outStamp.write(str+'\n')
-outStamp.close()
diff --git a/current/pandaserver/test/XrdTest.py b/current/pandaserver/test/XrdTest.py
deleted file mode 100755
index f377f46de..000000000
--- a/current/pandaserver/test/XrdTest.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = "ANALY_BNL_ATLAS_1"
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = 'BNL_SE'
-
-jobDefinitionID = int(time.time()) % 10000
-
-jobList = []
-
-for i in range(2):
- job = JobSpec()
- job.jobDefinitionID = jobDefinitionID
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i)
- job.AtlasRelease = 'Atlas-12.0.6'
- job.homepackage = 'AnalysisTransforms'
- job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/runAthenaXrd'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 3000
- job.assignedPriority = 3000
- job.prodSourceLabel = 'user'
- job.computingSite = site
-
- file = FileSpec()
- file.lfn = "%s.AANT._%05d.root" % (job.jobName,i)
- file.destinationDBlock = job.destinationDBlock
- file.destinationSE = job.destinationSE
- file.dataset = job.destinationDBlock
- file.type = 'output'
- job.addFile(file)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- fileL = FileSpec()
- fileL.dataset = 'user.TadashiMaeno.acas0003.lib._000134'
- fileL.prodDBlock = fileL.dataset
- fileL.lfn = 'user.TadashiMaeno.acas0003.lib._000134.lib.tgz'
- fileL.type = 'input'
- fileL.status = 'ready'
- job.addFile(fileL)
-
- job.jobParameters=("-l %s " % fileL.lfn) + """-r run/ -j "%20AnalysisSkeleton_topOptions.py" -i "[]" -m "[]" -n "[]" -o "{'AANT': [('AANTupleStream', 'AANT', """ + ("""'%s')]}" -c""" % file.lfn)
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/activateBNL.py b/current/pandaserver/test/activateBNL.py
deleted file mode 100755
index 55be46f85..000000000
--- a/current/pandaserver/test/activateBNL.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import sys
-import time
-from dataservice.DDM import ddm
-from taskbuffer.DBProxy import DBProxy
-import userinterface.Client as Client
-import urllib2,urllib,datetime,time
-import jobscheduler.siteinfo
-import jobscheduler.Site
-import brokerage.broker_util
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# instantiate DB proxies
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-# get PandaIDs from jobsDefined
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
-sql = "SELECT dispatchDBlock from jobsDefined4 WHERE jobStatus='assigned' AND prodSourceLabel='managed' "
-sql += "AND (computingSite='BNL_ATLAS_1' OR computingSite='BNL_ATLAS_2') AND modificationTime<'%s' "
-sql += "GROUP BY dispatchDBlock"
-
-res = proxyS.querySQL(sql % timeLimit.strftime('%Y-%m-%d %H:%M:%S'))
-
-# emulate DDM callbacks
-for dispatchDBlock, in res:
- print dispatchDBlock
- time.sleep(5)
- # get file list
- status,out = ddm.dq2.main(['listFilesInDataset',dispatchDBlock])
- if status != 0 or out.startswith('Error'):
- print out
- continue
- # make LFN list
- lfns = []
- for line in out.split('\n'):
- items = line.split()
- if len(items) == 2:
- lfns.append(items[1])
- # skip empty datasets
- if len(lfns) == 0:
- print "empty dataset"
- continue
- # get missing file
- missLFNs = brokerage.broker_util.getMissLFNsFromLRC(lfns,jobscheduler.Site.KnownSite('BNL_ATLAS_2').getDQ2URL())
- if len(missLFNs) != 0:
- print "some files are missing"
- continue
- # get VUID and creationdate
- resvuid = proxyS.querySQL("SELECT vuid from Datasets WHERE name='%s'" % dispatchDBlock)
- if len(resvuid) == 1:
- vuid, = resvuid[0]
- # make HTTP request
- node={'vuid':vuid}
- url=Client.baseURLSSL+'/datasetCompleted'
- rdata=urllib.urlencode(node)
- req=urllib2.Request(url)
- # invoke callback
- fd=urllib2.urlopen(req,rdata)
-
diff --git a/current/pandaserver/test/activateDefJobs.py b/current/pandaserver/test/activateDefJobs.py
deleted file mode 100755
index d2d826c55..000000000
--- a/current/pandaserver/test/activateDefJobs.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from taskbuffer.DBProxy import DBProxy
-import userinterface.Client as Client
-import urllib2,urllib,datetime,time
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# time limit
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
-
-# instantiate DB proxies
-proxyS = DBProxy()
-proxyS.connect('adbpro.usatlas.bnl.gov',passwd,'panda-developer','PandaDevDB')
-
-# get PandaIDs from jobsDefined
-res = proxyS.querySQL("SELECT dispatchDBlock from jobsDefined4 GROUP BY dispatchDBlock")
-
-# emulate DDM callbacks
-jobs=[]
-for dispatchDBlock, in res:
- # get VUID and creationdate
- resvuid = proxyS.querySQL("SELECT vuid,creationdate from Datasets WHERE name='%s'" % dispatchDBlock)
- if len(resvuid) == 1:
- vuid,creationdate = resvuid[0]
- # convert creationdate to datetime
- creation_datetime = datetime.datetime(*time.strptime(creationdate,'%Y-%m-%d %H:%M:%S')[:6])
- if creation_datetime < timeLimit:
- # make HTTP request
- node={'vuid':vuid}
- url=Client.baseURLSSL+'/datasetCompleted'
- rdata=urllib.urlencode(node)
- req=urllib2.Request(url)
- # invoke callback
- fd=urllib2.urlopen(req,rdata)
-
diff --git a/current/pandaserver/test/activateDefJobs.sh b/current/pandaserver/test/activateDefJobs.sh
deleted file mode 100755
index b2c1bc6bf..000000000
--- a/current/pandaserver/test/activateDefJobs.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-BASEPATH=/usatlas/u/sm/prod
-BINPATH=/usatlas/u/sm/latest
-
-# for python
-export PATH=$BINPATH/python/bin:$PATH
-export PYTHONPATH=$BASEPATH/panda:$PYTHONPATH
-
-python $BASEPATH/panda/test/activateDefJobs.py
diff --git a/current/pandaserver/test/activateJobs.py b/current/pandaserver/test/activateJobs.py
deleted file mode 100755
index b33769d45..000000000
--- a/current/pandaserver/test/activateJobs.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import sys
-
-from taskbuffer.DBProxy import DBProxy
-import userinterface.Client as Client
-import urllib2,urllib,datetime,time
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-if len(sys.argv) == 2:
- startID = int(sys.argv[1])
- endID = startID
-else:
- startID = int(sys.argv[1])
- endID = int(sys.argv[2])
- if startID > endID:
- print '%d is less than %d' % (endID,startID)
- sys.exit(1)
-
-# instantiate DB proxies
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-# get PandaIDs from jobsDefined
-res = proxyS.querySQL("SELECT dispatchDBlock from jobsDefined4 WHERE PandaID>=%s AND PandaID<=%s GROUP BY dispatchDBlock" % (startID,endID))
-
-# emulate DDM callbacks
-for dispatchDBlock, in res:
- # get VUID and creationdate
- resvuid = proxyS.querySQL("SELECT vuid from Datasets WHERE name='%s'" % dispatchDBlock)
- if len(resvuid) == 1:
- vuid, = resvuid[0]
- # make HTTP request
- node={'vuid':vuid}
- url=Client.baseURLSSL+'/datasetCompleted'
- rdata=urllib.urlencode(node)
- req=urllib2.Request(url)
- # invoke callback
- fd=urllib2.urlopen(req,rdata)
-
diff --git a/current/pandaserver/test/activator.py b/current/pandaserver/test/activator.py
deleted file mode 100755
index 8ad5292de..000000000
--- a/current/pandaserver/test/activator.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import os
-import re
-import sys
-import time
-import datetime
-import commands
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-from dataservice.Activator import Activator
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-if len(sys.argv) != 2:
- print "datasetname is required"
- sys.exit(1)
-
-dataset = taskBuffer.queryDatasetWithMap({'name':sys.argv[1]})
-thr = Activator(taskBuffer,dataset)
-thr.start()
-thr.join()
diff --git a/current/pandaserver/test/add.py b/current/pandaserver/test/add.py
deleted file mode 100755
index a3e1437e5..000000000
--- a/current/pandaserver/test/add.py
+++ /dev/null
@@ -1,434 +0,0 @@
-import os
-import re
-import sys
-import time
-import glob
-import fcntl
-import random
-import datetime
-import commands
-import threading
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-from dataservice.Adder2 import Adder
-from brokerage.SiteMapper import SiteMapper
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# logger
-_logger = PandaLogger().getLogger('add')
-
-_logger.debug("===================== start =====================")
-
-# overall timeout value
-overallTimeout = 20
-
-# current minute
-currentMinute = datetime.datetime.utcnow().minute
-
-# kill old process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName)
- for line in out.split('\n'):
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill process : %s %s" % (type,value))
-
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-# instantiate sitemapper
-aSiteMapper = SiteMapper(taskBuffer)
-
-# delete
-_logger.debug("Del session")
-status,retSel = taskBuffer.querySQLS("SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4",{})
-if retSel != None:
- try:
- maxID = retSel[0][0]
- _logger.debug("maxID : %s" % maxID)
- if maxID != None:
- varMap = {}
- varMap[':maxID'] = maxID
- varMap[':jobStatus1'] = 'activated'
- varMap[':jobStatus2'] = 'waiting'
- varMap[':jobStatus3'] = 'failed'
- varMap[':jobStatus4'] = 'cancelled'
- status,retDel = taskBuffer.querySQLS("DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",varMap)
- except:
- pass
-
-# count # of getJob/updateJob in dispatcher's log
-try:
- # don't update when logrotate is running
- timeNow = datetime.datetime.utcnow()
- logRotateTime = timeNow.replace(hour=3,minute=2,second=0,microsecond=0)
- if (timeNow > logRotateTime and (timeNow-logRotateTime) < datetime.timedelta(minutes=5)) or \
- (logRotateTime > timeNow and (logRotateTime-timeNow) < datetime.timedelta(minutes=5)):
- _logger.debug("skip pilotCounts session for logrotate")
- else:
- # log filename
- dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
- # check if tgz is required
- com = 'head -1 %s' % dispLogName
- lostat,loout = commands.getstatusoutput(com)
- useLogTgz = True
- if lostat == 0:
- match = re.search('^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',loout)
- if match != None:
- startTime = datetime.datetime(*time.strptime(match.group(0),'%Y-%m-%d %H:%M:%S')[:6])
- # current log contains all info
- if startTime datetime.timedelta(minutes=1) and \
- (timeNow - modTime) < datetime.timedelta(hours=1):
- cSt,cOut = commands.getstatusoutput('ps aux | grep fork | grep -v PYTH')
- # if no process is running for the file
- if cSt == 0 and not tmpName in cOut:
- nThr += 1
- thr = ForkThr(tmpName)
- thr.start()
- forkThrList.append(thr)
- if nThr > maxThr:
- break
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s %s" % (errType,errValue))
-
-
-# thread pool
-class ThreadPool:
- def __init__(self):
- self.lock = threading.Lock()
- self.list = []
-
- def add(self,obj):
- self.lock.acquire()
- self.list.append(obj)
- self.lock.release()
-
- def remove(self,obj):
- self.lock.acquire()
- self.list.remove(obj)
- self.lock.release()
-
- def join(self):
- self.lock.acquire()
- thrlist = tuple(self.list)
- self.lock.release()
- for thr in thrlist:
- thr.join()
-
-# thread to adder
-class AdderThr (threading.Thread):
- def __init__(self,lock,pool,taskBuffer,aSiteMapper,pandaID,jobStatus,fileName,ignoreError=True):
- threading.Thread.__init__(self)
- self.lock = lock
- self.pool = pool
- self.pool.add(self)
- self.adder = Adder(taskBuffer,pandaID,"",jobStatus,xmlFile=fileName,
- ignoreDDMError=ignoreError,joinCloser=True,addOutput=True,
- siteMapper=aSiteMapper)
-
- def run(self):
- self.lock.acquire()
- self.adder.start()
- self.adder.join()
- self.pool.remove(self)
- self.lock.release()
-
-
-# get buildJobs in the holding state
-holdingAna = []
-varMap = {}
-varMap[':prodSourceLabel'] = 'panda'
-varMap[':jobStatus'] = 'holding'
-status,res = taskBuffer.querySQLS("SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus",varMap)
-if res != None:
- for id, in res:
- holdingAna.append(id)
-_logger.debug("holding Ana %s " % holdingAna)
-
-# add files
-_logger.debug("Adder session")
-timeNow = datetime.datetime.utcnow()
-timeInt = datetime.datetime.utcnow()
-dirName = panda_config.logdir
-fileList = os.listdir(dirName)
-fileList.sort()
-# remove duplicated files
-tmpList = []
-uMap = {}
-for file in fileList:
- match = re.search('^(\d+)_([^_]+)_.{36}(_\d+)*$',file)
- if match != None:
- fileName = '%s/%s' % (dirName,file)
- id = match.group(1)
- if uMap.has_key(id):
- try:
- os.remove(fileName)
- except:
- pass
- else:
- uMap[id] = fileName
- if long(id) in holdingAna:
- # give priority to buildJobs
- tmpList.insert(0,file)
- else:
- tmpList.append(file)
-nFixed = 50
-randTmp = tmpList[nFixed:]
-random.shuffle(randTmp)
-fileList = tmpList[:nFixed] + randTmp
-
-# create thread pool and semaphore
-adderLock = threading.Semaphore(3)
-adderThreadPool = ThreadPool()
-
-# add
-while len(fileList) != 0:
- # time limit to avoid too many copyArchive processes running at the same time
- if (datetime.datetime.utcnow() - timeNow) > datetime.timedelta(minutes=overallTimeout):
- _logger.debug("time over in Adder session")
- break
- # try to get Semaphore
- adderLock.acquire()
- # get fileList
- if (datetime.datetime.utcnow() - timeInt) > datetime.timedelta(minutes=15):
- timeInt = datetime.datetime.utcnow()
- # get file
- fileList = os.listdir(dirName)
- fileList.sort()
- # remove duplicated files
- tmpList = []
- uMap = {}
- for file in fileList:
- match = re.search('^(\d+)_([^_]+)_.{36}(_\d+)*$',file)
- if match != None:
- fileName = '%s/%s' % (dirName,file)
- id = match.group(1)
- if uMap.has_key(id):
- try:
- os.remove(fileName)
- except:
- pass
- else:
- uMap[id] = fileName
- if long(id) in holdingAna:
- # give priority to buildJobs
- tmpList.insert(0,file)
- else:
- tmpList.append(file)
- fileList = tmpList
- # choose a file
- file = fileList.pop(0)
- # release lock
- adderLock.release()
- # check format
- match = re.search('^(\d+)_([^_]+)_.{36}(_\d+)*$',file)
- if match != None:
- fileName = '%s/%s' % (dirName,file)
- if not os.path.exists(fileName):
- continue
- try:
- modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(fileName))[:6]))
- if (timeNow - modTime) > datetime.timedelta(hours=24):
- # last chance
- _logger.debug("Last Add File : %s" % fileName)
- thr = AdderThr(adderLock,adderThreadPool,taskBuffer,aSiteMapper,match.group(1),
- match.group(2),fileName,False)
- thr.start()
- elif (timeInt - modTime) > datetime.timedelta(minutes=3):
- # add
- _logger.debug("Add File : %s" % fileName)
- thr = AdderThr(adderLock,adderThreadPool,taskBuffer,aSiteMapper,match.group(1),
- match.group(2),fileName)
- thr.start()
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s %s" % (type,value))
-
-# join all threads
-adderThreadPool.join()
-
-# join sender
-mailSender.join()
-
-# join fork threads
-for thr in forkThrList:
- thr.join()
-
-_logger.debug("===================== end =====================")
diff --git a/current/pandaserver/test/add.sh b/current/pandaserver/test/add.sh
deleted file mode 100755
index fed990df6..000000000
--- a/current/pandaserver/test/add.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# Panda home
-export PANDA_HOME=/home/sm/prod
-
-# for python
-export PYTHONPATH=$PANDA_HOME/panda:$PYTHONPATH
-
-python $PANDA_HOME/panda/test/add.py
diff --git a/current/pandaserver/test/aho.xml b/current/pandaserver/test/aho.xml
deleted file mode 100755
index 8bfd17333..000000000
--- a/current/pandaserver/test/aho.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/current/pandaserver/test/analysis.py b/current/pandaserver/test/analysis.py
deleted file mode 100755
index 91f498431..000000000
--- a/current/pandaserver/test/analysis.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-jobList = []
-for i in range(2):
- datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
- destName = 'ANALY_BNL_ATLAS_1'
-
- job = JobSpec()
- job.jobDefinitionID = 1
- job.jobName = commands.getoutput('uuidgen')
- job.AtlasRelease = 'Atlas-12.0.2'
- job.homepackage = 'AnalysisTransforms'
- job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/runAthena2'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 3000
- job.prodSourceLabel = 'user'
- job.computingSite = site
- job.prodDBlock = 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103'
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen')
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- fileOZ = FileSpec()
- fileOZ.lfn = "AANT.%s.root" % commands.getoutput('uuidgen')
- fileOZ.destinationDBlock = job.destinationDBlock
- fileOZ.destinationSE = job.destinationSE
- fileOZ.dataset = job.destinationDBlock
- fileOZ.type = 'output'
- job.addFile(fileOZ)
-
- files = [
- 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00001.pool.root.1',
- 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00002.pool.root.1',
- 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00003.pool.root.1',
- ]
- for lfn in files:
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- fileI.status = 'ready'
- job.addFile(fileI)
-
- fileL = FileSpec()
- fileL.dataset = 'user.TadashiMaeno.lib._000157'
- fileL.prodDBlock = 'user.TadashiMaeno.lib._000157'
- fileL.lfn = 'user.TadashiMaeno.lib._000157.lib.tgz'
- fileL.type = 'input'
- fileL.status = 'ready'
- job.addFile(fileL)
-
- job.jobParameters=""" -l user.TadashiMaeno.lib._000157.lib.tgz -r run/ -j " AnalysisSkeleton_jobOptions.py" -i "['testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00001.pool.root.1', 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00002.pool.root.1', 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00003.pool.root.1']" -o "{'AANT': [('AANTupleStream', 'AANT', '%s')]}" """ % fileOZ.lfn
-
- jobList.append(job)
-
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/analyzeLog.py b/current/pandaserver/test/analyzeLog.py
deleted file mode 100755
index 8b9314e5c..000000000
--- a/current/pandaserver/test/analyzeLog.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import re
-from config import panda_config
-
-# analyze Setupper log
-logSetupper = open('%s/panda-Setupper.log' % panda_config.logdir)
-# extract subscriptions
-mapSub = {}
-mapDataset = {}
-for line in logSetupper:
- items = re.findall("'registerDatasetSubscription', '(.+_dis\d+)', '([^']+)'",line)
- if len(items) != 0:
- dataset = items[0][0]
- siteID = items[0][1]
- date = '%s %s' % tuple(re.split(' |,',line)[:2])
- if not mapSub.has_key(siteID):
- mapSub[siteID] = []
- # append
- mapSub[siteID].append(dataset)
- mapDataset[dataset] = (date,False)
-logSetupper.close()
-
-# analyze Activator log
-logActivator = open('%s/panda-Activator.log' % panda_config.logdir)
-# extract callbacks
-for line in logActivator:
- items = re.findall("start: (\S+_dis\d+)$",line)
- if len(items) != 0:
- dataset = items[0]
- if dataset in mapDataset.keys():
- mapDataset[dataset] = mapDataset[dataset][:-1]+(True,)
-logActivator.close()
-
-# print
-for siteID in mapSub.keys():
- print "ID : %s" % siteID
- nSucceed = 0
- failedSubs = []
- for dataset in mapSub[siteID]:
- # succeeded
- if mapDataset[dataset][-1:][0]:
- nSucceed += 1
- # failed
- else:
- failedSubs.append((mapDataset[dataset][0],dataset))
- # statistics
- print " Total:%d Succeeded:%d" % (len(mapSub[siteID]),nSucceed)
- # not completed subscriptions
- print " Not completed"
- for item in failedSubs:
- print " %s" % item[0]
- print " %s" % item[1]
- print
-
-
-
diff --git a/current/pandaserver/test/archivelogs.py b/current/pandaserver/test/archivelogs.py
deleted file mode 100644
index 86d81d8ab..000000000
--- a/current/pandaserver/test/archivelogs.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import re
-import os
-import glob
-import stat
-import commands
-
-from config import panda_config
-
-srcDir = panda_config.logdir
-dstDir = '/tmp/logbackup' + srcDir
-
-logFiles = glob.glob(srcDir+'/*log.1.gz')
-
-# check time stamp
-for logFile in logFiles:
- baseName = logFile.split('/')[-1]
- print "log name : %s" % baseName
- targetFile = "%s/%s" % (dstDir,baseName)
- # already exists
- if os.path.exists(targetFile) and \
- os.stat(logFile)[stat.ST_SIZE] == os.stat(targetFile)[stat.ST_SIZE]:
- com = 'cmp %s %s' % (logFile,targetFile)
- cmpSt,cmpOut = commands.getstatusoutput(com)
- if cmpSt == 0:
- print " -> skip : already exists"
- continue
- # increment
- maxIndex = 60
- if os.path.exists(targetFile):
- templateName = re.sub('1\.gz$','%s.gz',baseName)
- for tmpIdx in range(1,maxIndex):
- renameSrc = dstDir + '/' + (templateName % (maxIndex-tmpIdx))
- renameDst = dstDir + '/' + (templateName % (maxIndex-tmpIdx+1))
- if os.path.exists(renameSrc):
- com = 'mv -f %s %s' % (renameSrc,renameDst)
- print com
- print commands.getoutput(com)
- # copy
- com = 'cp -fp %s %s' % (logFile,dstDir)
- print com
- print commands.getoutput(com)
-
-# touch to avoid tmpwatch
-com = 'touch %s/*' % dstDir
-print commands.getoutput(com)
diff --git a/current/pandaserver/test/backupJobArch.py b/current/pandaserver/test/backupJobArch.py
deleted file mode 100755
index 6ebc8dac2..000000000
--- a/current/pandaserver/test/backupJobArch.py
+++ /dev/null
@@ -1,176 +0,0 @@
-import os
-import re
-import sys
-import time
-import fcntl
-import types
-import shelve
-import random
-import datetime
-import commands
-import threading
-import userinterface.Client as Client
-from dataservice.DDM import ddm
-from dataservice.DDM import dashBorad
-from taskbuffer.OraDBProxy import DBProxy
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-from jobdispatcher.Watcher import Watcher
-from brokerage.SiteMapper import SiteMapper
-from dataservice.Adder import Adder
-from dataservice.Finisher import Finisher
-from dataservice.MailUtils import MailUtils
-from taskbuffer import ProcessGroups
-import brokerage.broker_util
-import brokerage.broker
-import taskbuffer.ErrorCode
-import dataservice.DDM
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# logger
-_logger = PandaLogger().getLogger('backupJobArch')
-
-_logger.debug("===================== start =====================")
-
-# memory checker
-def _memoryCheck(str):
- try:
- proc_status = '/proc/%d/status' % os.getpid()
- procfile = open(proc_status)
- name = ""
- vmSize = ""
- vmRSS = ""
- # extract Name,VmSize,VmRSS
- for line in procfile:
- if line.startswith("Name:"):
- name = line.split()[-1]
- continue
- if line.startswith("VmSize:"):
- vmSize = ""
- for item in line.split()[1:]:
- vmSize += item
- continue
- if line.startswith("VmRSS:"):
- vmRSS = ""
- for item in line.split()[1:]:
- vmRSS += item
- continue
- procfile.close()
- _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str))
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("memoryCheck() : %s %s" % (type,value))
- _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str))
- return
-
-_memoryCheck("start")
-
-# kill old dq2 process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=30)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('ps axo user,pid,lstart,args | grep dq2.clientapi | grep -v PYTHONPATH | grep -v grep')
- for line in out.split('\n'):
- if line == '':
- continue
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old dq2 process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill dq2 process : %s %s" % (type,value))
-
-
-# kill old process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName)
- for line in out.split('\n'):
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill process : %s %s" % (type,value))
-
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-# instantiate sitemapper
-siteMapper = SiteMapper(taskBuffer)
-
-
-# table names
-jobATableName = "ATLAS_PANDAARCH.jobsArchived"
-filesATableName = "ATLAS_PANDAARCH.filesTable_ARCH"
-paramATableName = "ATLAS_PANDAARCH.jobParamsTable_ARCH"
-metaATableName = "ATLAS_PANDAARCH.metaTable_ARCH"
-
-# time limit
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=3)
-
-# copy
-_logger.debug("get PandaIDs for Archive")
-varMap = {}
-varMap[':archivedFlag'] = 0
-status,res = taskBuffer.querySQLS("SELECT PandaID,modificationTime FROM ATLAS_PANDA.jobsArchived4 WHERE archivedFlag=:archivedFlag ORDER BY PandaID",
- varMap,arraySize=1000000)
-if res == None:
- _logger.debug("total %s " % res)
-else:
- _logger.debug("total %s " % len(res))
- # copy
- tmpIndex = 0
- tmpTotal = len(res)
- random.shuffle(res)
- for (id,srcEndTime) in res:
- tmpIndex += 1
- try:
- # copy
- proxyS = taskBuffer.proxyPool.getProxy()
- proxyS.insertJobSimpleUnread(id,srcEndTime)
- taskBuffer.proxyPool.putProxy(proxyS)
- _logger.debug("INSERT %s" % id)
- if tmpIndex % 100 == 1:
- _logger.debug(" copied %s/%s" % (tmpIndex,tmpTotal))
- except:
- pass
-
-_logger.debug("===================== end =====================")
diff --git a/current/pandaserver/test/banUser.py b/current/pandaserver/test/banUser.py
deleted file mode 100644
index 6217a058c..000000000
--- a/current/pandaserver/test/banUser.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import sys
-import time
-import datetime
-import optparse
-
-from taskbuffer.OraDBProxy import DBProxy
-# password
-from config import panda_config
-
-optP = optparse.OptionParser(conflict_handler="resolve")
-optP.add_option('--user', action='store',dest='user', default=None,help='prodUserName')
-optP.add_option('--unban',action='store_const',const=True,dest='unban',default=False,help='unban the user')
-
-options,args = optP.parse_args()
-
-if options.user == None:
- print "--user= is required"
- sys.exit(1)
-
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-prodUserName = sys.argv[1]
-import userinterface.Client as Client
-
-varMap = {}
-varMap[':name'] = options.user
-if options.unban:
- varMap[':status'] = None
-else:
- varMap[':status'] = 'disabled'
-
-sql = "UPDATE ATLAS_PANDAMETA.users SET status=:status WHERE name=:name"
-
-status,res = proxyS.querySQLS(sql,varMap)
-if res == None:
- print "Failed with database error"
-else:
- print "%s rows updated" % res
-
-
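For reference, the script is driven entirely by the two options above; typical invocations look like this (the user name is illustrative):

$ python banUser.py --user=SomeUser           # set status to 'disabled'
$ python banUser.py --user=SomeUser --unban   # clear the status again
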
diff --git a/current/pandaserver/test/boostPrio.py b/current/pandaserver/test/boostPrio.py
deleted file mode 100755
index 4bc13fda6..000000000
--- a/current/pandaserver/test/boostPrio.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import time
-import sys
-
-from taskbuffer.OraDBProxy import DBProxy
-
-# password
-from config import panda_config
-
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-varMap = {}
-varMap[':prodSourceLabel'] = 'managed'
-varMap[':taskID'] = sys.argv[1]
-varMap[':prio'] = sys.argv[2]
-sql = "UPDATE %s SET currentPriority=currentPriority+:prio WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID"
-for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']:
- status,res = proxyS.querySQLS(sql % table,varMap)
-
-
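The script takes the task ID and the priority offset as positional arguments; the offset is added to currentPriority for all managed jobs of that task in the defined, waiting and active tables. A typical invocation (the numbers are illustrative):

$ python boostPrio.py 123456 100
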
diff --git a/current/pandaserver/test/boostUser.py b/current/pandaserver/test/boostUser.py
deleted file mode 100755
index 17f6c1483..000000000
--- a/current/pandaserver/test/boostUser.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import sys
-from config import panda_config
-
-# initialize cx_Oracle using dummy connection
-from taskbuffer.Initializer import initializer
-initializer.init()
-
-from dataservice.Merger import Merger
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-
-
-# logger
-_logger = PandaLogger().getLogger('boostUser')
-_logger.debug("================= start ==================")
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-user = sys.stdin.read()
-user = user[:-1]
-
-sql = "UPDATE atlas_panda.%s set currentPriority=:prio where prodUserName=:uname and prodSourceLabel IN (:label1,:label2) and currentPriority<:prio"
-varMap = {}
-varMap[':prio'] = 4000
-varMap[':uname'] = user
-varMap[':label1'] = 'user'
-varMap[':label2'] = 'panda'
-for table in ('jobsactive4','jobsdefined4'):
- _logger.debug((sql % table) + str(varMap))
- ret = taskBuffer.querySQLS(sql % table,varMap)
- _logger.debug('ret -> %s' % str(ret))
-
-_logger.debug("================= end ==================")
diff --git a/current/pandaserver/test/callbackDDM.py b/current/pandaserver/test/callbackDDM.py
deleted file mode 100755
index 8564b272e..000000000
--- a/current/pandaserver/test/callbackDDM.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import sys
-import urllib2,urllib
-
-node={}
-node['vuid']=sys.argv[1]
-url='https://gridui01.usatlas.bnl.gov:25443/server/panda/datasetCompleted'
-rdata=urllib.urlencode(node)
-req=urllib2.Request(url)
-fd=urllib2.urlopen(req,rdata)
-data = fd.read()
-
-print data
diff --git a/current/pandaserver/test/checkGetJob.py b/current/pandaserver/test/checkGetJob.py
deleted file mode 100644
index 79d1a0ecf..000000000
--- a/current/pandaserver/test/checkGetJob.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import sys
-import re
-import time
-import datetime
-timeLimit = datetime.timedelta(seconds=10)
-f = open("../../httpd/logs/panda-DBProxy.log")
-for line in f:
- match = re.search('unlock',line)
- if match:
- timeM = re.search('^(\d+-\d+-\d+ \d+:\d+:\d+),(\d+)',line)
- endTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%d %H:%M:%S')[:6])
- endTime = endTime.replace(microsecond = 1000*int(timeM.group(2)))
- timeM = re.search('getJobs : (\d+-\d+-\d+T\d+:\d+:\d+)\.(\d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%dT%H:%M:%S')[:6])
- startTime = startTime.replace(microsecond = int(timeM.group(2)))
- if (endTime-startTime) > timeLimit:
- print '%s %s' % (startTime,endTime-startTime)
-f.close()
diff --git a/current/pandaserver/test/checkSetupper.py b/current/pandaserver/test/checkSetupper.py
deleted file mode 100644
index 1f1dbfdd6..000000000
--- a/current/pandaserver/test/checkSetupper.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import re
-import time
-import datetime
-f = open("../../httpd/logs/panda-Setupper.log")
-session = []
-timeList = {}
-for line in f:
- match = re.search('DEBUG (.*) startRun',line)
- if match:
- stamp = match.group(1)
- stamp = stamp.strip()
- session.append(stamp)
- timeM = re.search('^(\d+-\d+-\d+ \d+:\d+:\d+),',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%d %H:%M:%S')[:6])
- timeList[stamp] = startTime
- continue
- match = re.search('DEBUG (.*) endRun',line)
- if match:
- stamp = match.group(1)
- stamp = stamp.strip()
- session.remove(stamp)
- timeM = re.search('^(\d+-\d+-\d+ \d+:\d+:\d+),',line)
- endTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%d %H:%M:%S')[:6])
- if timeList.has_key(stamp):
- delta = endTime - timeList[stamp]
- if delta > datetime.timedelta(minutes = 10):
- print "Start : %s " % stamp
- print " took -> %02d:%02d:%02d" % (delta.seconds/(60*60),(delta.seconds%(60*60))/60,delta.seconds%60)
- continue
-
-print session
diff --git a/current/pandaserver/test/cl_testEvgen.py b/current/pandaserver/test/cl_testEvgen.py
deleted file mode 100644
index 137c496bd..000000000
--- a/current/pandaserver/test/cl_testEvgen.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#
-# eg. python cl_testEvgen.py SACLAY FR
-#
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)==2:
- site = sys.argv[1]
- cloud='CA'
-elif len(sys.argv)==3:
- site = sys.argv[1]
- cloud=sys.argv[2]
-else:
- site = None
- cloud = None
-
-datasetName = 'panda.destDB.%s_tid999991' % commands.getoutput('uuidgen')
-taskid = 999989
-
-jobList = []
-
-for i in range(1):
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i)
-# job.AtlasRelease = 'Atlas-12.0.6'
-# job.homepackage = 'AtlasProduction/12.0.6.5'
- job.AtlasRelease = 'Atlas-12.0.7'
- job.homepackage = 'AtlasProduction/12.0.7.1'
-
- job.transformation = 'csc_evgen_trf.py'
- job.destinationDBlock = datasetName
-# job.destinationSE = destName
-# job.cloud = 'CA'
- job.cloud = cloud
- job.taskID = taskid
- job.currentPriority = 1000
- job.prodSourceLabel = 'test'
-# job.prodSourceLabel = 'cloudtest'
- job.computingSite = site
-
- file = FileSpec()
- file.lfn = "%s.evgen.pool.root" % job.jobName
- file.destinationDBlock = job.destinationDBlock
- file.destinationSE = job.destinationSE
- file.dataset = job.destinationDBlock
- file.type = 'output'
- job.addFile(file)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="8072 0 5000 1 DC3.008072.JimmyPhotonJet1.py %s NONE NONE NONE" % file.lfn
- jobList.append(job)
-
-for i in range(1):
- s,o = Client.submitJobs(jobList)
- print "---------------------"
- print s
- for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/cl_testG4sim.py b/current/pandaserver/test/cl_testG4sim.py
deleted file mode 100644
index ed1db41ab..000000000
--- a/current/pandaserver/test/cl_testG4sim.py
+++ /dev/null
@@ -1,120 +0,0 @@
-#
-# eg. python cl_testG4sim.py SACLAY FR
-#
-
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)==2:
- site = sys.argv[1]
- cloud='CA'
-elif len(sys.argv)==3:
- site = sys.argv[1]
- cloud=sys.argv[2]
-else:
- site = None
- cloud = None
-
-datasetName = 'panda.rod2.%s_tid999990' % commands.getoutput('uuidgen')
-#destName = 'BNL_SE'
-
-if cloud=='UK':
- files = {
- 'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01035.pool.root.1':'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541',
- }
-# or mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01174.pool.root.1, mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01035.pool.root.1
-elif cloud=='CA':
- files={'EVNT.012303._00901.pool.root.1':'mc12.005001.pythia_minbias.evgen.EVNT.v12000701_tid012303',}
-elif cloud=='FR':
- files={'EVNT.010822._00007.pool.root.1':'mc12.006873.PythiaWH140lnugamgam.evgen.EVNT.v12000701_tid010822',}
-elif cloud in ['ES']:
- files={'EVNT.016869._00187.pool.root.1':'mc12.005001.pythia_minbias.evgen.EVNT.v12000601_tid016869',}
-elif cloud in ['DE']:
- files={'EVNT.016869._00177.pool.root.2':'mc12.005001.pythia_minbias.evgen.EVNT.v12000601_tid016869',}
-else:
- print 'Cloud not known: %s'%cloud
- cloud = None
- files={'EVNT.012303._00545.pool.root.1':'rod.cloudtest1'}
-
-# UK
-#'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01035.pool.root.1':'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541',
-# CA
-# 'EVNT.012303._00901.pool.root.1':'mc12.005001.pythia_minbias.evgen.EVNT.v12000701_tid012303',
-
-
-
-jobList = []
-
-for i in range(1):
- for lfn in files.keys():
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = commands.getoutput('uuidgen')
- job.AtlasRelease = 'Atlas-12.0.7'
- job.homepackage = 'AtlasProduction/12.0.7.1'
-# Need different args too
-# job.AtlasRelease = 'Atlas-13.0.30'
-# job.homepackage = 'AtlasProduction/13.0.30.2'
- job.transformation = 'csc_simul_trf.py'
- job.destinationDBlock = datasetName
- job.cloud = cloud
- job.computingSite = site
-# job.prodDBlock = 'mc12.005001.pythia_minbias.evgen.EVNT.v12000701_tid012303'
- job.prodDBlock = files[lfn]
- job.prodSourceLabel = 'test'
-# job.prodSourceLabel = 'cloudtest'
- job.currentPriority = 1001
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileD = FileSpec()
- fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v030101'
- fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101'
- fileD.lfn = 'DBRelease-3.1.1.tar.gz'
- fileD.type = 'input'
- job.addFile(fileD)
-
-
- fileOE = FileSpec()
- fileOE.lfn = "%s.HITS.pool.root" % job.jobName
- fileOE.destinationDBlock = job.destinationDBlock
- fileOE.destinationSE = job.destinationSE
- fileOE.dataset = job.destinationDBlock
- fileOE.type = 'output'
- job.addFile(fileOE)
-
- fileOA = FileSpec()
- fileOA.lfn = "%s.RDO.pool.root" % job.jobName
- fileOA.destinationDBlock = job.destinationDBlock
- fileOA.destinationSE = job.destinationSE
- fileOA.dataset = job.destinationDBlock
- fileOA.type = 'output'
- job.addFile(fileOA)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="%s %s %s 1 4000 153781 ATLAS-CSC-01-02-00 NONE %s" % (fileI.lfn,fileOE.lfn,fileOA.lfn,fileD.lfn)
-
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/cl_testMXreco.py b/current/pandaserver/test/cl_testMXreco.py
deleted file mode 100644
index 1fb770bee..000000000
--- a/current/pandaserver/test/cl_testMXreco.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#
-# eg. python cl_testMXreco.py SACLAY FR
-#
-
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)==2:
- site = sys.argv[1]
- cloud='CA'
-elif len(sys.argv)==3:
- site = sys.argv[1]
- cloud=sys.argv[2]
-else:
- site = None
- cloud = None
-
-datasetName = 'panda.rod2.%s_tid999990' % commands.getoutput('uuidgen')
-#destName = 'BNL_SE'
-
-files={'daq.m5_combined.0028997.Default.L1TT-b00000110.LB0000.SFO-1._0001.data':'M5.0028997.Default.L1TT-b00000110.RAW.v010803',}
-
-if cloud=='IT':
- files={'daq.m5_combined.0029118.Default.L1TT-b00000010.LB0000.SFO-1._0001.data':'M5.0029118.Default.L1TT-b00000010.RAW.v010803'}
-
-
-jobList = []
-
-for i in range(1):
- for lfn in files.keys():
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = commands.getoutput('uuidgen')
- job.AtlasRelease = 'Atlas-13.0.35'
- job.homepackage = 'AtlasPoint1/13.0.35.1'
- job.transformation = 'csc_cosmics_trf.py'
- job.destinationDBlock = datasetName
- job.cloud = cloud
- job.computingSite = site
- job.prodDBlock = files[lfn]
- job.prodSourceLabel = 'test'
- job.currentPriority = 1001
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileD = FileSpec()
- fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v030101'
- fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101'
- fileD.lfn = 'DBRelease-3.1.1.tar.gz'
- fileD.type = 'input'
-# job.addFile(fileD)
-
-
- fileO1 = FileSpec()
- fileO1.lfn = "%s.ESD.pool.root" % job.jobName
- fileO1.destinationDBlock = job.destinationDBlock
- fileO1.destinationSE = job.destinationSE
- fileO1.dataset = job.destinationDBlock
- fileO1.type = 'output'
- job.addFile(fileO1)
-
- fileO2 = FileSpec()
- fileO2.lfn = "%s.ESDF.pool.root" % job.jobName
- fileO2.destinationDBlock = job.destinationDBlock
- fileO2.destinationSE = job.destinationSE
- fileO2.dataset = job.destinationDBlock
- fileO2.type = 'output'
-# job.addFile(fileO2)
-
- fileO3 = FileSpec()
- fileO3.lfn = "%s.NTUP.pool.root" % job.jobName
- fileO3.destinationDBlock = job.destinationDBlock
- fileO3.destinationSE = job.destinationSE
- fileO3.dataset = job.destinationDBlock
- fileO3.type = 'output'
- job.addFile(fileO3)
-
- fileO4 = FileSpec()
- fileO4.lfn = "%s.HIST.pool.root" % job.jobName
- fileO4.destinationDBlock = job.destinationDBlock
- fileO4.destinationSE = job.destinationSE
- fileO4.dataset = job.destinationDBlock
- fileO4.type = 'output'
- job.addFile(fileO4)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="%s LAR_TILE_MUONS_LVL1C 10 %s NONE %s %s COMCOND-002-00 NONE" % (fileI.lfn,fileO1.lfn,fileO3.lfn,fileO4.lfn)
-
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/cleanup.py b/current/pandaserver/test/cleanup.py
deleted file mode 100644
index a1b170d11..000000000
--- a/current/pandaserver/test/cleanup.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import commands
-
-for patt in ['dq2.clientapi.cli.cliutil.getDQ2','forkSetupper.py','LFCclient.py']:
- out = commands.getoutput('ps aux | grep python | grep %s' % patt)
- for line in out.split('\n'):
- items = line.split()
- print items[1], items[8]
- if items[8] in ['Sep04','Sep05']:
- commands.getoutput('kill -9 %s' % items[1])
-
diff --git a/current/pandaserver/test/closeDS.py b/current/pandaserver/test/closeDS.py
deleted file mode 100755
index 4aeface4f..000000000
--- a/current/pandaserver/test/closeDS.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-import time
-import datetime
-import commands
-import jobscheduler.Site
-import userinterface.Client as Client
-from dataservice.DDM import ddm
-from taskbuffer.DBProxy import DBProxy
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-from jobdispatcher.Watcher import Watcher
-
-# logger
-_logger = PandaLogger().getLogger('closeDS')
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# instantiate DB proxies
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-# time limit for dataset closing
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7)
-
-# close datasets
-while True:
- sql = "SELECT vuid,name,modificationdate FROM Datasets " + \
- "WHERE type='output' AND (status='running' OR status='created' OR status='defined') " + \
- "AND modificationdate<'%s' AND name REGEXP '_sub[[:digit:]]+$'"
- ret,res = proxyS.querySQLS(sql % timeLimit.strftime('%Y-%m-%d %H:%M:%S'))
- _logger.debug("# of dataset : %s" % len(res))
- if len(res) == 0:
- break
- for (vuid,name,modDate) in res:
- _logger.debug("start %s %s" % (modDate,name))
- retF,resF = proxyS.querySQLS("SELECT lfn FROM filesTable4 WHERE destinationDBlock='%s'" % name)
- if retF<0 or retF == None or retF!=len(resF):
- _logger.error("SQL error")
- else:
- # no files in filesTable
- if len(resF) == 0:
- _logger.debug("freeze %s " % name)
- status,out = ddm.dq2.main(['freezeDataset',name])
- if status != 0 or (out.find('Error') != -1 and out.find('DQ2 unknown dataset exception') == -1 \
- and out.find('DQ2 security exception') == -1):
- _logger.error(out)
- else:
- proxyS.querySQL("UPDATE Datasets SET status='completed',modificationdate=UTC_TIMESTAMP() WHERE vuid='%s'" % vuid)
- else:
- _logger.debug("wait %s " % name)
- proxyS.querySQL("UPDATE Datasets SET modificationdate=UTC_TIMESTAMP() WHERE vuid='%s'" % vuid)
- _logger.debug("end %s " % name)
- time.sleep(1)
diff --git a/current/pandaserver/test/copyArchive.py b/current/pandaserver/test/copyArchive.py
deleted file mode 100755
index 486e28673..000000000
--- a/current/pandaserver/test/copyArchive.py
+++ /dev/null
@@ -1,1653 +0,0 @@
-import os
-import re
-import sys
-import time
-import fcntl
-import types
-import shelve
-import random
-import datetime
-import commands
-import threading
-import userinterface.Client as Client
-from dataservice.DDM import ddm
-from dataservice.DDM import dashBorad
-from taskbuffer.OraDBProxy import DBProxy
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-from jobdispatcher.Watcher import Watcher
-from brokerage.SiteMapper import SiteMapper
-from dataservice.Adder import Adder
-from dataservice.Finisher import Finisher
-from dataservice.MailUtils import MailUtils
-from taskbuffer import ProcessGroups
-import brokerage.broker_util
-import brokerage.broker
-import taskbuffer.ErrorCode
-import dataservice.DDM
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# logger
-_logger = PandaLogger().getLogger('copyArchive')
-
-_logger.debug("===================== start =====================")
-
-# memory checker
-def _memoryCheck(str):
- try:
- proc_status = '/proc/%d/status' % os.getpid()
- procfile = open(proc_status)
- name = ""
- vmSize = ""
- vmRSS = ""
- # extract Name,VmSize,VmRSS
- for line in procfile:
- if line.startswith("Name:"):
- name = line.split()[-1]
- continue
- if line.startswith("VmSize:"):
- vmSize = ""
- for item in line.split()[1:]:
- vmSize += item
- continue
- if line.startswith("VmRSS:"):
- vmRSS = ""
- for item in line.split()[1:]:
- vmRSS += item
- continue
- procfile.close()
- _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str))
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("memoryCheck() : %s %s" % (type,value))
- _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str))
- return
-
-_memoryCheck("start")
-
-# kill old dq2 process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('ps axo user,pid,lstart,args | grep dq2.clientapi | grep -v PYTHONPATH | grep -v grep')
- for line in out.split('\n'):
- if line == '':
- continue
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old dq2 process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill dq2 process : %s %s" % (type,value))
-
-
-# kill old process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=7)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName)
- for line in out.split('\n'):
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill process : %s %s" % (type,value))
-
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-# instantiate sitemapper
-siteMapper = SiteMapper(taskBuffer)
-
-
-
-# send email for access requests
-_logger.debug("Site Access")
-try:
- # get contact
- contactAddr = {}
- siteContactAddr = {}
- sql = "SELECT name,email FROM ATLAS_PANDAMETA.cloudconfig"
- status,res = taskBuffer.querySQLS(sql,{})
- for cloudName,cloudEmail in res:
- contactAddr[cloudName] = cloudEmail
- # get requests
- sql = "SELECT pandaSite,status,dn FROM ATLAS_PANDAMETA.siteaccess WHERE status IN (:status1,:status2,:status3) "
- sql += "ORDER BY pandaSite,status "
- varMap = {}
- varMap[':status1'] = 'requested'
- varMap[':status2'] = 'tobeapproved'
- varMap[':status3'] = 'toberejected'
- status,res = taskBuffer.querySQLS(sql,varMap)
- requestsInCloud = {}
- mailUtils = MailUtils()
- # loop over all requests
- for pandaSite,reqStatus,userName in res:
- cloud = siteMapper.getSite(pandaSite).cloud
- _logger.debug("request : '%s' site=%s status=%s cloud=%s" % (userName,pandaSite,reqStatus,cloud))
- # send emails to user
- if reqStatus in ['tobeapproved','toberejected']:
- # set status
- if reqStatus == 'tobeapproved':
- newStatus = 'approved'
- else:
- newStatus = 'rejected'
- # get mail address for user
- userMailAddr = ''
- sqlUM = "SELECT email FROM ATLAS_PANDAMETA.users WHERE name=:userName"
- varMap = {}
- varMap[':userName'] = userName
- stUM,resUM = taskBuffer.querySQLS(sqlUM,varMap)
- if resUM == None or len(resUM) == 0:
- _logger.error("email address is unavailable for '%s'" % userName)
- else:
- userMailAddr = resUM[0][0]
- # send
- if not userMailAddr in ['',None,'None','notsend']:
- _logger.debug("send update to %s" % userMailAddr)
- retMail = mailUtils.sendSiteAccessUpdate(userMailAddr,newStatus,pandaSite)
- _logger.debug(retMail)
- # update database
- sqlUp = "UPDATE ATLAS_PANDAMETA.siteaccess SET status=:newStatus "
- sqlUp += "WHERE pandaSite=:pandaSite AND dn=:userName"
- varMap = {}
- varMap[':userName'] = userName
- varMap[':newStatus'] = newStatus
- varMap[':pandaSite'] = pandaSite
- stUp,resUp = taskBuffer.querySQLS(sqlUp,varMap)
- else:
- # append cloud
- if not requestsInCloud.has_key(cloud):
- requestsInCloud[cloud] = {}
- # append site
- if not requestsInCloud[cloud].has_key(pandaSite):
- requestsInCloud[cloud][pandaSite] = []
- # append user
- requestsInCloud[cloud][pandaSite].append(userName)
- # send requests to the cloud responsible
- for cloud,requestsMap in requestsInCloud.iteritems():
- _logger.debug("requests for approval : cloud=%s" % cloud)
- # send
- if contactAddr.has_key(cloud) and (not contactAddr[cloud] in ['',None,'None']):
- # get site contact
- for pandaSite,userNames in requestsMap.iteritems():
- if not siteContactAddr.has_key(pandaSite):
- varMap = {}
- varMap[':siteid'] = pandaSite
- sqlSite = "SELECT email FROM ATLAS_PANDAMETA.schedconfig WHERE siteid=:siteid AND rownum<=1"
- status,res = taskBuffer.querySQLS(sqlSite,varMap)
- siteContactAddr[pandaSite] = res[0][0]
- # append
- if not siteContactAddr[pandaSite] in ['',None,'None']:
- contactAddr[cloud] += ',%s' % siteContactAddr[pandaSite]
- # send
- _logger.debug("send request to %s" % contactAddr[cloud])
- retMail = mailUtils.sendSiteAccessRequest(contactAddr[cloud],requestsMap,cloud)
- _logger.debug(retMail)
- # update database
- if retMail:
- sqlUp = "UPDATE ATLAS_PANDAMETA.siteaccess SET status=:newStatus "
- sqlUp += "WHERE pandaSite=:pandaSite AND dn=:userName"
- for pandaSite,userNames in requestsMap.iteritems():
- for userName in userNames:
- varMap = {}
- varMap[':userName'] = userName
- varMap[':newStatus'] = 'inprocess'
- varMap[':pandaSite'] = pandaSite
- stUp,resUp = taskBuffer.querySQLS(sqlUp,varMap)
- else:
- _logger.error("contact email address is unavailable for %s" % cloud)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("Failed with %s %s" % (type,value))
-_logger.debug("Site Access : done")
-
-
-# finalize failed jobs
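-# (for each prodUserName/jobDefinitionID that has failed user jobs and no non-failed jobs left
-#  in jobsActive4, taskBuffer.finalizePendingJobs is called for that user/jobDefinitionID)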
-_logger.debug("AnalFinalizer session")
-try:
- # get min PandaID for failed jobs in Active table
- sql = "SELECT MIN(PandaID),prodUserName,jobDefinitionID FROM ATLAS_PANDA.jobsActive4 "
- sql += "WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus "
- sql += "GROUP BY prodUserName,jobDefinitionID "
- varMap = {}
- varMap[':jobStatus'] = 'failed'
- varMap[':prodSourceLabel'] = 'user'
- status,res = taskBuffer.querySQLS(sql,varMap)
- if res != None:
- # loop over all user/jobdefID
- for pandaID,prodUserName,jobDefinitionID in res:
- # check
- _logger.debug("check finalization for %s %s" % (prodUserName,jobDefinitionID))
- sqlC = "SELECT COUNT(*) FROM ATLAS_PANDA.jobsActive4 "
- sqlC += "WHERE prodSourceLabel=:prodSourceLabel AND prodUserName=:prodUserName "
- sqlC += "AND jobDefinitionID=:jobDefinitionID AND jobStatus<>:jobStatus "
- varMap = {}
- varMap[':jobStatus'] = 'failed'
- varMap[':prodSourceLabel'] = 'user'
- varMap[':jobDefinitionID'] = jobDefinitionID
- varMap[':prodUserName'] = prodUserName
- statC,resC = taskBuffer.querySQLS(sqlC,varMap)
-            # finalize if there are no non-failed jobs
- if resC != None:
- _logger.debug("n of non-failed jobs : %s" % resC[0][0])
- if resC[0][0] == 0:
- _logger.debug("finalize %s %s" % (prodUserName,jobDefinitionID))
- taskBuffer.finalizePendingJobs(prodUserName,jobDefinitionID)
- else:
- _logger.debug("n of non-failed jobs : None")
-except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("AnalFinalizer failed with %s %s" % (errType,errValue))
-
-
-_memoryCheck("watcher")
-
-_logger.debug("Watcher session")
-# check heartbeat for analysis jobs
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
-varMap = {}
-varMap[':modificationTime'] = timeLimit
-varMap[':prodSourceLabel1'] = 'panda'
-varMap[':prodSourceLabel2'] = 'user'
-varMap[':jobStatus1'] = 'running'
-varMap[':jobStatus2'] = 'starting'
-varMap[':jobStatus3'] = 'stagein'
-varMap[':jobStatus4'] = 'stageout'
-sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2) "
-sql += "AND (jobStatus=:jobStatus1 OR jobStatus=:jobStatus2 OR jobStatus=:jobStatus3 OR jobStatus=:jobStatus4) AND modificationTime<:modificationTime"
-status,res = taskBuffer.querySQLS(sql,varMap)
-if res == None:
- _logger.debug("# of Anal Watcher : %s" % res)
-else:
- _logger.debug("# of Anal Watcher : %s" % len(res))
- for (id,) in res:
- _logger.debug("Anal Watcher %s" % id)
- thr = Watcher(taskBuffer,id,single=True,sleepTime=60,sitemapper=siteMapper)
- thr.start()
- thr.join()
- time.sleep(1)
-
-# check heartbeat for sent jobs
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=30)
-varMap = {}
-varMap[':jobStatus'] = 'sent'
-varMap[':modificationTime'] = timeLimit
-status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime",
- varMap)
-if res == None:
- _logger.debug("# of Sent Watcher : %s" % res)
-else:
- _logger.debug("# of Sent Watcher : %s" % len(res))
- for (id,) in res:
- _logger.debug("Sent Watcher %s" % id)
- thr = Watcher(taskBuffer,id,single=True,sleepTime=30,sitemapper=siteMapper)
- thr.start()
- thr.join()
- time.sleep(1)
-
-# check heartbeat for 'holding' analysis/ddm jobs
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
-# get XMLs
-xmlIDs = []
-xmlFiles = os.listdir(panda_config.logdir)
-for file in xmlFiles:
- match = re.search('^(\d+)_([^_]+)_.{36}$',file)
- if match != None:
- id = match.group(1)
- xmlIDs.append(int(id))
-sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND (modificationTime<:modificationTime OR (endTime IS NOT NULL AND endTime<:endTime)) AND (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2 OR prodSourceLabel=:prodSourceLabel3) AND stateChangeTime != modificationTime"
-varMap = {}
-varMap[':modificationTime'] = timeLimit
-varMap[':endTime'] = timeLimit
-varMap[':jobStatus'] = 'holding'
-varMap[':prodSourceLabel1'] = 'panda'
-varMap[':prodSourceLabel2'] = 'user'
-varMap[':prodSourceLabel3'] = 'ddm'
-status,res = taskBuffer.querySQLS(sql,varMap)
-if res == None:
- _logger.debug("# of Holding Anal/DDM Watcher : %s" % res)
-else:
- _logger.debug("# of Holding Anal/DDM Watcher : %s - XMLs : %s" % (len(res),len(xmlIDs)))
- for (id,) in res:
- _logger.debug("Holding Anal/DDM Watcher %s" % id)
- if int(id) in xmlIDs:
- _logger.debug(" found XML -> skip %s" % id)
- continue
- thr = Watcher(taskBuffer,id,single=True,sleepTime=180,sitemapper=siteMapper)
- thr.start()
- thr.join()
- time.sleep(1)
-
-# check heartbeat for production jobs
-timeOutVal = 48
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=timeOutVal)
-sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND (modificationTime<:modificationTime OR (endTime IS NOT NULL AND endTime<:endTime))"
-varMap = {}
-varMap[':modificationTime'] = timeLimit
-varMap[':endTime'] = timeLimit
-varMap[':jobStatus'] = 'holding'
-status,res = taskBuffer.querySQLS(sql,varMap)
-if res == None:
- _logger.debug("# of Holding Watcher : %s" % res)
-else:
- _logger.debug("# of Holding Watcher : %s" % len(res))
- for (id,) in res:
- _logger.debug("Holding Watcher %s" % id)
- thr = Watcher(taskBuffer,id,single=True,sleepTime=60*timeOutVal,sitemapper=siteMapper)
- thr.start()
- thr.join()
- time.sleep(1)
-
-# check heartbeat for ddm jobs
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
-varMap = {}
-varMap[':modificationTime'] = timeLimit
-varMap[':jobStatus1'] = 'running'
-varMap[':jobStatus2'] = 'starting'
-varMap[':jobStatus3'] = 'stagein'
-varMap[':jobStatus4'] = 'stageout'
-varMap[':prodSourceLabel'] = 'ddm'
-status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (jobStatus=:jobStatus1 OR jobStatus=:jobStatus2 OR jobStatus=:jobStatus3 OR jobStatus=:jobStatus4) AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel",
- varMap)
-if res == None:
- _logger.debug("# of DDM Watcher : %s" % res)
-else:
- _logger.debug("# of DDM Watcher : %s" % len(res))
- for (id,) in res:
- _logger.debug("DDM Watcher %s" % id)
- thr = Watcher(taskBuffer,id,single=True,sleepTime=120,sitemapper=siteMapper)
- thr.start()
- thr.join()
- time.sleep(1)
-
-# check heartbeat for production jobs
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=6)
-varMap = {}
-varMap[':modificationTime'] = timeLimit
-varMap[':jobStatus1'] = 'running'
-varMap[':jobStatus2'] = 'starting'
-varMap[':jobStatus3'] = 'stagein'
-varMap[':jobStatus4'] = 'stageout'
-status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (jobStatus=:jobStatus1 OR jobStatus=:jobStatus2 OR jobStatus=:jobStatus3 OR jobStatus=:jobStatus4) AND modificationTime<:modificationTime",
- varMap)
-if res == None:
- _logger.debug("# of General Watcher : %s" % res)
-else:
- _logger.debug("# of General Watcher : %s" % len(res))
- for (id,) in res:
- _logger.debug("General Watcher %s" % id)
- thr = Watcher(taskBuffer,id,single=True,sitemapper=siteMapper)
- thr.start()
- thr.join()
- time.sleep(1)
-
-_memoryCheck("reassign")
-
-# kill long-waiting jobs in defined table
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7)
-status,res = taskBuffer.querySQLS("SELECT PandaID,cloud,prodSourceLabel FROM ATLAS_PANDA.jobsDefined4 WHERE creationTime<:creationTime",
- {':creationTime':timeLimit})
-jobs=[]
-dashFileMap = {}
-if res != None:
- for pandaID,cloud,prodSourceLabel in res:
- # collect PandaIDs
- jobs.append(pandaID)
- try:
- if cloud in ['US']:
- # skip US since file info is not available in dashboard
- continue
- # check file status for production
- if not prodSourceLabel in ['managed']:
- pass
- else:
- # get T1 site
- tmpT1siteID = siteMapper.getCloud(cloud)['source']
- t1Site = siteMapper.getSite(tmpT1siteID)
- # get pending input files
- sqlF = "SELECT lfn,GUID,dispatchDBlock FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID "
- sqlF += "AND type=:type AND status=:status"
- varMap = {}
- varMap[':type'] = 'input'
- varMap[':status'] = 'pending'
- varMap[':PandaID'] = pandaID
- stFile,resFile = taskBuffer.querySQLS(sqlF,varMap)
- if resFile != None:
- # loop over all files
- for tmpLFN,tmpGUID,tmpDispDBlock in resFile:
- # get file events
- tmpDQ2IDs = t1Site.setokens.values()
- tmpKey = (tuple(tmpDQ2IDs),tmpLFN)
- if not dashFileMap.has_key(tmpKey):
- _logger.debug('getting fileEvents for %s:%s' % tmpKey)
- tmpStat,tmpOut = dashBorad.listFileEvents(tmpDQ2IDs,tmpGUID)
- _logger.debug(tmpStat)
- _logger.debug(tmpOut)
- if tmpStat != 0:
- # failed
- continue
- # convert to list
- try:
- exec "tmpEvens = %s" % tmpOut
- if not isinstance(tmpEvens,types.ListType):
- raise TypeError,"%s is not a list" % type(tmpEvens)
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error(tmpOut)
- _logger.error("invalid dashboard response %s %s" % (errType,errValue))
- continue
- dashFileMap[tmpKey] = None
- # look for latest events
- tmpLastTime = ''
- for tmpEvt in tmpEvens:
- # pickup only DQ2 events
- if not tmpEvt['tool_id'] in ['DQ2',None]:
- continue
- # pickup first one or newer
- if tmpLastTime == '' or tmpLastTime < tmpEvt['modified_time']:
- tmpLastTime = tmpEvt['modified_time']
- dashFileMap[tmpKey] = tmpEvt['state']
- _logger.debug('got status=%s' % dashFileMap[tmpKey])
- # update failed files
- if dashFileMap[tmpKey] in ['FAILED_TRANSFER','BAD']:
- sqlUpF = "UPDATE ATLAS_PANDA.filesTable4 SET status=:newStatus "
- sqlUpF += "WHERE PandaID=:PandaID AND lfn=:lfn"
- varMap = {}
- varMap[':PandaID'] = pandaID
- varMap[':lfn'] = tmpLFN
- varMap[':newStatus'] = dashFileMap[tmpKey].lower()
- taskBuffer.querySQLS(sqlUpF,varMap)
- _logger.debug('set status=%s to %s:%s' % (dashFileMap[tmpKey],pandaID,tmpLFN))
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("dashboard access failed with %s %s" % (errType,errValue))
-if len(jobs):
- _logger.debug("killJobs for Defined (%s)" % str(jobs))
- Client.killJobs(jobs,2)
-
-# kill long-waiting jobs in active table
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7)
-varMap = {}
-varMap[':jobStatus'] = 'activated'
-varMap[':creationTime'] = timeLimit
-status,res = taskBuffer.querySQLS("SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND creationTime<:creationTime",
- varMap)
-jobs=[]
-if res != None:
- for (id,) in res:
- jobs.append(id)
-if len(jobs):
- _logger.debug("killJobs for Active (%s)" % str(jobs))
- Client.killJobs(jobs,2)
-
-
-# kill long-waiting ddm jobs for dispatch
-_logger.debug("kill PandaMovers")
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
-sql = "SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND transferType=:transferType AND creationTime<:creationTime"
-varMap = {}
-varMap[':creationTime'] = timeLimit
-varMap[':prodSourceLabel'] = 'ddm'
-varMap[':transferType'] = 'dis'
-_logger.debug(sql+str(varMap))
-status,res = taskBuffer.querySQLS(sql,varMap)
-_logger.debug(res)
-jobs=[]
-if res != None:
- for (id,) in res:
- jobs.append(id)
-if len(jobs):
- _logger.debug("kill DDM Jobs (%s)" % str(jobs))
- Client.killJobs(jobs,2)
-
-# kill hang-up movers
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
-sql = "SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND transferType=:transferType AND jobStatus=:jobStatus AND startTime<:startTime"
-varMap = {}
-varMap[':startTime'] = timeLimit
-varMap[':prodSourceLabel'] = 'ddm'
-varMap[':transferType'] = 'dis'
-varMap[':jobStatus'] = 'running'
-_logger.debug(sql+str(varMap))
-status,res = taskBuffer.querySQLS(sql,varMap)
-_logger.debug(res)
-jobs = []
-movers = []
-if res != None:
- for id, in res:
- movers.append(id)
- # get dispatch dataset
- sql = 'SELECT name FROM ATLAS_PANDA.Datasets WHERE MoverID=:MoverID'
- stDS,resDS = taskBuffer.querySQLS(sql,{':MoverID':id})
- if resDS != None:
- disDS = resDS[0][0]
- # get PandaIDs associated to the dis dataset
- sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND dispatchDBlock=:dispatchDBlock"
- varMap = {}
- varMap[':jobStatus'] = 'assigned'
- varMap[':dispatchDBlock'] = disDS
- stP,resP = taskBuffer.querySQLS(sql,varMap)
- if resP != None:
- for pandaID, in resP:
- jobs.append(pandaID)
-# kill movers
-if len(movers):
- _logger.debug("kill hangup DDM Jobs (%s)" % str(movers))
- Client.killJobs(movers,2)
-# reassign jobs
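-# (as in the other reassignment blocks below, PandaIDs are processed in chunks of 100
-#  so that each reassignJobs call stays reasonably small)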
-if len(jobs):
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- _logger.debug('reassignJobs for hangup movers (%s)' % jobs[iJob:iJob+nJob])
- taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True)
- iJob += nJob
-
-# reassign defined jobs in defined table
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=4)
-# get PandaIDs
-status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsDefined4",timeLimit,['defined'],['managed'],[],[],[])
-jobs=[]
-if res != None:
- for (id,) in res:
- jobs.append(id)
-# reassign
-_logger.debug('reassignJobs for defined jobs -> #%s' % len(jobs))
-if len(jobs) > 0:
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- _logger.debug('reassignJobs for defined jobs (%s)' % jobs[iJob:iJob+nJob])
- taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True)
- _logger.debug('reassignJobs for defined jobs done %s' % jobs[iJob])
- iJob += nJob
-
-
-# reassign when ratio of running/notrunning is too unbalanced
-"""
-_logger.debug("reassign Unbalanced")
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=4)
-jobStat = {}
-rangeValues = ['all','limit']
-for rangeVal in rangeValues:
- for jobStatus in ['running','activated','assigned']:
- table = 'ATLAS_PANDA.jobsDefined4'
- if jobStatus in ['running','activated']:
- table = 'ATLAS_PANDA.jobsActive4'
- varMap = {}
- varMap[':prodSourceLabel'] = 'managed'
- varMap[':jobStatus'] = jobStatus
- if rangeVal == 'all':
- sql = "SELECT computingSite,cloud,processingType,count(*) FROM %s WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus GROUP BY computingSite,cloud,processingType" \
- % table
- else:
- sql = "SELECT computingSite,cloud,processingType,count(*) FROM %s WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus AND modificationTime<:modificationTime GROUP BY computingSite,cloud,processingType" \
- % table
- varMap[':modificationTime'] = timeLimit
- # execute
- status,res = taskBuffer.querySQLS(sql,varMap)
- if res != None:
- for computingSite,cloud,processingType,nJobs in res:
- # add cloud
- if not jobStat.has_key(cloud):
- jobStat[cloud] = {}
- # add site
- if not jobStat[cloud].has_key(computingSite):
- jobStat[cloud][computingSite] = {}
- # add range
- if not jobStat[cloud][computingSite].has_key(rangeVal):
- jobStat[cloud][computingSite][rangeVal] = {}
- # add process group
- tmpProGroup = ProcessGroups.getProcessGroup(processingType)
- if not jobStat[cloud][computingSite][rangeVal].has_key(tmpProGroup):
- jobStat[cloud][computingSite][rangeVal][tmpProGroup] = {}
- # set status
- tmpStatus = jobStatus
- if jobStatus != 'running':
- tmpStatus = 'notrunning'
- # add status
- if not jobStat[cloud][computingSite][rangeVal][tmpProGroup].has_key(tmpStatus):
- jobStat[cloud][computingSite][rangeVal][tmpProGroup][tmpStatus] = 0
- # add
- jobStat[cloud][computingSite][rangeVal][tmpProGroup][tmpStatus] += nJobs
-# look for unbalanced site
-for cloud,siteVal in jobStat.iteritems():
- jobsCloud = {}
- ngSites = {}
- t1Site = siteMapper.getCloud(cloud)['source']
- _logger.debug("Cloud:%s" % cloud)
- for computingSite,jobVal in siteVal.iteritems():
- # set 0
- for rangeVal in rangeValues:
- for pgType,pgList in ProcessGroups.processGroups:
- # add range
- if not jobVal.has_key(rangeVal):
- jobVal[rangeVal] = {}
- # add process group
- if not jobVal[rangeVal].has_key(pgType):
- jobVal[rangeVal][pgType] = {}
- # number of jobs
- if not jobVal[rangeVal][pgType].has_key('running'):
- jobVal[rangeVal][pgType]['running'] = 0
- if not jobVal[rangeVal][pgType].has_key('notrunning'):
- jobVal[rangeVal][pgType]['notrunning'] = 0
- # check ratio
- for pgType,pgList in ProcessGroups.processGroups:
- # add process group to map
- if not jobsCloud.has_key(pgType):
- jobsCloud[pgType] = {'notrunning':0,'running':0,'notfull':False}
- if not ngSites.has_key(pgType):
- ngSites[pgType] = []
- # get ratio
- checkRatio = jobVal['limit'][pgType]['notrunning'] > jobVal['all'][pgType]['running']*4
- jobsCloud[pgType]['running'] += jobVal['all'][pgType]['running']
- jobsCloud[pgType]['notrunning'] += jobVal['all'][pgType]['notrunning']
- # check ratio
- if computingSite in [t1Site,'NULL']:
- # skip T1
- statStr = '--'
- else:
- if checkRatio:
- statStr = 'NG'
- ngSites[pgType].append(computingSite)
- else:
- statStr = '--'
- # not full
- if jobVal['all'][pgType]['notrunning'] < jobVal['all'][pgType]['running']*2:
- jobsCloud[pgType]['notfull'] = True
- _logger.debug("%20s : %14s %s n:%-5s r:%-5s" % (computingSite,pgType,statStr,jobVal['limit'][pgType]['notrunning'],
- jobVal['all'][pgType]['running']))
- # reassign
- for pgType,pgList in ProcessGroups.processGroups:
- _logger.debug(" %14s : n:%-5s r:%-5s %s" % (pgType,jobsCloud[pgType]['notrunning'],
- jobsCloud[pgType]['running'],jobsCloud[pgType]['notfull']))
- if jobsCloud[pgType]['notrunning'] > jobsCloud[pgType]['running']*2 and ngSites[pgType] != [] and jobsCloud[pgType]['notfull']:
- # reassign except reprocessing
- if pgType in ['reprocessing']:
- continue
- # get PandaIDs
- jobs = []
- for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']:
- varMap = {}
- varMap[':prodSourceLabel'] = 'managed'
- varMap[':jobStatus1'] = 'activated'
- varMap[':jobStatus2'] = 'assigned'
- sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND computingSite IN (" % table
- idxSite = 1
- for ngSite in ngSites[pgType]:
- tmpSiteKey = ':computingSite%s' % idxSite
- sql += "%s," % tmpSiteKey
- varMap[tmpSiteKey] = ngSite
- idxSite += 1
- sql = sql[:-1]
- if pgList != []:
- sql += ") AND processingType IN ("
- tmpPgList = pgList
- else:
- sql += ") AND processingType NOT IN ("
- # get types to be excluded
- tmpPgList = []
- for tmpExPgType,tmpExPgList in ProcessGroups.processGroups:
- if tmpExPgType != pgType:
- tmpPgList += tmpExPgList
- idxPro = 1
- for pgItem in tmpPgList:
- tmpProKey = ':processingType%s' % idxPro
- sql += "%s," % tmpProKey
- varMap[tmpProKey] = pgItem
- idxPro += 1
- sql = sql[:-1]
- sql += ") AND modificationTime<:modificationTime ORDER BY PandaID"
- varMap[':modificationTime'] = timeLimit
- # execute
- _logger.debug(sql+str(varMap))
- status,res = taskBuffer.querySQLS(sql,varMap)
- if res != None:
- # get IDs
- for id, in res:
- jobs.append(id)
- # reassign
- if jobs != []:
- if len(jobs):
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- #_logger.debug('reassignJobs for Unbalanced (%s)' % jobs[iJob:iJob+nJob])
- #Client.reassignJobs(jobs[iJob:iJob+nJob])
- iJob += nJob
- #time.sleep(60)
-"""
-
-
-# reassign long-waiting jobs in defined table
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
-status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsDefined4",timeLimit,[],['managed'],[],[],[])
-jobs=[]
-if res != None:
- for (id,) in res:
- jobs.append(id)
-# reassign
-_logger.debug('reassignJobs for long in defined table -> #%s' % len(jobs))
-if len(jobs) > 0:
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- _logger.debug('reassignJobs for long in defined table (%s)' % jobs[iJob:iJob+nJob])
- taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True)
- iJob += nJob
-
-
-# reassign too long-standing evgen/simul jobs with active state at T1
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=6)
-for tmpCloud in siteMapper.getCloudList():
- # ignore special clouds
- if tmpCloud in ['CERN','OSG']:
- continue
- status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsActive4",timeLimit,['activated'],['managed'],
- ['evgen','simul'],[siteMapper.getCloud(tmpCloud)['tier1']],[])
- jobs = []
- if res != None:
- for (id,) in res:
- jobs.append(id)
- _logger.debug('reassignJobs for Active T1 evgensimul in %s -> #%s' % (tmpCloud,len(jobs)))
- if len(jobs) != 0:
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- _logger.debug('reassignJobs for Active T1 evgensimul (%s)' % jobs[iJob:iJob+nJob])
- taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True)
- iJob += nJob
-
-# reassign too long-standing evgen/simul jobs with active state at T2
-try:
- _logger.debug('looking for stuck T2s to reassign evgensimul')
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=6)
- varMap = {}
- varMap[':jobStatus1'] = 'activated'
- varMap[':jobStatus2'] = 'running'
- varMap[':prodSourceLabel'] = 'managed'
- varMap[':processingType1'] = 'evgen'
- varMap[':processingType2'] = 'simul'
- status,res = taskBuffer.querySQLS("SELECT cloud,computingSite,jobStatus,COUNT(*) FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus IN (:jobStatus1,:jobStatus2) AND prodSourceLabel=:prodSourceLabel AND processingType IN (:processingType1,:processingType2) GROUP BY cloud,computingSite,jobStatus",
- varMap)
- if res != None:
- # get ratio of activated/running
- siteStatData = {}
- for tmpCloud,tmpComputingSite,tmpJobStatus,tmpCount in res:
- # skip T1
- if tmpComputingSite == siteMapper.getCloud(tmpCloud)['tier1']:
- continue
- # add cloud/site
- tmpKey = (tmpCloud,tmpComputingSite)
- if not siteStatData.has_key(tmpKey):
- siteStatData[tmpKey] = {'activated':0,'running':0}
- # add the number of jobs
- if siteStatData[tmpKey].has_key(tmpJobStatus):
- siteStatData[tmpKey][tmpJobStatus] += tmpCount
- # look for stuck site
- stuckThr = 10
- stuckSites = []
- for tmpKey,tmpStatData in siteStatData.iteritems():
- if tmpStatData['running'] == 0 or \
- float(tmpStatData['activated'])/float(tmpStatData['running']) > stuckThr:
- tmpCloud,tmpComputingSite = tmpKey
- _logger.debug(' %s:%s %s/%s > %s' % (tmpCloud,tmpComputingSite,tmpStatData['activated'],tmpStatData['running'],stuckThr))
- # get stuck jobs
- status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsActive4",timeLimit,['activated'],['managed'],
- ['evgen','simul'],[tmpComputingSite],[tmpCloud])
- jobs = []
- if res != None:
- for (id,) in res:
- jobs.append(id)
- _logger.debug('reassignJobs for Active T2 evgensimul %s:%s -> #%s' % (tmpCloud,tmpComputingSite,len(jobs)))
- if len(jobs) > 0:
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- _logger.debug('reassignJobs for Active T2 evgensimul (%s)' % jobs[iJob:iJob+nJob])
- taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True)
- iJob += nJob
-except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("failed to reassign T2 evgensimul with %s:%s" % (errType,errValue))
-
-# reassign too long-standing jobs in active table
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=2)
-status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsActive4",timeLimit,['activated'],['managed'],[],[],[])
-jobs = []
-if res != None:
- for (id,) in res:
- jobs.append(id)
-_logger.debug('reassignJobs for long in active table -> #%s' % len(jobs))
-if len(jobs) != 0:
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- _logger.debug('reassignJobs for long in active table (%s)' % jobs[iJob:iJob+nJob])
- taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True)
- iJob += nJob
-
-
-# kill too long-standing analysis jobs in active table
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7)
-varMap = {}
-varMap[':prodSourceLabel1'] = 'test'
-varMap[':prodSourceLabel2'] = 'panda'
-varMap[':prodSourceLabel3'] = 'user'
-varMap[':modificationTime'] = timeLimit
-status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2 OR prodSourceLabel=:prodSourceLabel3) AND modificationTime<:modificationTime ORDER BY PandaID",
- varMap)
-jobs = []
-if res != None:
- for (id,) in res:
- jobs.append(id)
-# kill
-if len(jobs):
- Client.killJobs(jobs,2)
- _logger.debug("killJobs for Anal Active (%s)" % str(jobs))
-
-
-# kill too long pending jobs
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
-varMap = {}
-varMap[':jobStatus'] = 'pending'
-varMap[':creationTime'] = timeLimit
-status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsWaiting4 WHERE jobStatus=:jobStatus AND creationTime<:creationTime",
- varMap)
-jobs = []
-if res != None:
- for (id,) in res:
- jobs.append(id)
-# kill
-if len(jobs):
- Client.killJobs(jobs,4)
- _logger.debug("killJobs for Pending (%s)" % str(jobs))
-
-# kill too long waiting jobs
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=1)
-status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsWaiting4 WHERE creationTime<:creationTime",
- {':creationTime':timeLimit})
-jobs = []
-if res != None:
- for (id,) in res:
- jobs.append(id)
-# kill
-if len(jobs):
- Client.killJobs(jobs,4)
- _logger.debug("killJobs for Waiting (%s)" % str(jobs))
-
-
-# reassign long waiting jobs
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=30)
-status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsWaiting4",timeLimit,['waiting'],['managed'],[],[],[])
-jobs = []
-if res != None:
- for (id,) in res:
- jobs.append(id)
-_logger.debug('reassignJobs for Waiting -> #%s' % len(jobs))
-if len(jobs):
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- _logger.debug('reassignJobs for Waiting (%s)' % jobs[iJob:iJob+nJob])
- taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True)
- iJob += nJob
-
-# kill too long running jobs
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=21)
-status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE creationTime<:creationTime",
- {':creationTime':timeLimit})
-jobs = []
-if res != None:
- for (id,) in res:
- jobs.append(id)
-# kill
-if len(jobs):
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- # set tobekill
- _logger.debug('killJobs for Running (%s)' % jobs[iJob:iJob+nJob])
- Client.killJobs(jobs[iJob:iJob+nJob],2)
- # run watcher
- for id in jobs[iJob:iJob+nJob]:
- thr = Watcher(taskBuffer,id,single=True,sitemapper=siteMapper,sleepTime=60*24*21)
- thr.start()
- thr.join()
- time.sleep(1)
- iJob += nJob
- time.sleep(10)
-
-# kill too long waiting ddm jobs
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=5)
-varMap = {}
-varMap[':prodSourceLabel'] = 'ddm'
-varMap[':creationTime'] = timeLimit
-status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND creationTime<:creationTime",
- varMap)
-jobs = []
-if res != None:
- for (id,) in res:
- jobs.append(id)
-# kill
-if len(jobs):
- Client.killJobs(jobs,2)
- _logger.debug("killJobs for DDM (%s)" % str(jobs))
-
-_memoryCheck("closing")
-
-
-# delete old datasets
-"""
-timeLimitDnS = datetime.datetime.utcnow() - datetime.timedelta(days=60)
-timeLimitTop = datetime.datetime.utcnow() - datetime.timedelta(days=90)
-nDelDS = 1000
-for dsType,dsPrefix in [('','top'),]:
- sql = 'DELETE FROM ATLAS_PANDA.Datasets '
- if dsType != '':
- # dis or sub
- sql += 'WHERE type=:type AND modificationdate<:modificationdate '
- sql += 'AND REGEXP_LIKE(name,:pattern) AND rownum <= %s' % nDelDS
- varMap = {}
- varMap[':modificationdate'] = timeLimitDnS
- varMap[':type'] = dsType
- varMap[':pattern'] = '_%s[[:digit:]]+$' % dsPrefix
- else:
- # top level datasets
- sql+= 'WHERE modificationdate<:modificationdate AND rownum <= %s' % nDelDS
- varMap = {}
- varMap[':modificationdate'] = timeLimitTop
- for i in range(100):
- # del datasets
- ret,res = taskBuffer.querySQLS(sql, varMap)
- _logger.debug('# of %s datasets deleted: %s' % (dsPrefix,res))
- # no more datasets
- if res != nDelDS:
- break
-"""
-
-# thread pool
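-# (minimal lock-protected registry of worker threads; join() waits for every registered thread,
-#  so the main script can block until all Closer/Freezer/T2Cleaner threads finish)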
-class ThreadPool:
- def __init__(self):
- self.lock = threading.Lock()
- self.list = []
-
- def add(self,obj):
- self.lock.acquire()
- self.list.append(obj)
- self.lock.release()
-
- def remove(self,obj):
- self.lock.acquire()
- self.list.remove(obj)
- self.lock.release()
-
- def join(self):
- self.lock.acquire()
- thrlist = tuple(self.list)
- self.lock.release()
- for thr in thrlist:
- thr.join()
-
-
-# thread to close dataset
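-# (freezes each dataset passed in, unless its name starts with 'pandaddm_', marks the Datasets
-#  row completed, and erases the dataset from DQ2 if it contains no files; the driver loop that
-#  feeds it 'tobeclosed' datasets is commented out below)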
-class CloserThr (threading.Thread):
- def __init__(self,lock,proxyLock,datasets,pool):
- threading.Thread.__init__(self)
- self.datasets = datasets
- self.lock = lock
- self.proxyLock = proxyLock
- self.pool = pool
- self.pool.add(self)
-
- def run(self):
- self.lock.acquire()
- try:
- # loop over all datasets
- for vuid,name,modDate in self.datasets:
- _logger.debug("Close %s %s" % (modDate,name))
- if not name.startswith('pandaddm_'):
- status,out = ddm.DQ2.main('freezeDataset',name)
- else:
- status,out = 0,''
- if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
- _logger.error(out)
- else:
- self.proxyLock.acquire()
- varMap = {}
- varMap[':vuid'] = vuid
- varMap[':status'] = 'completed'
- taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
- varMap)
- self.proxyLock.release()
- if name.startswith('pandaddm_'):
- continue
- # count # of files
- status,out = ddm.DQ2.main('getNumberOfFiles',name)
- _logger.debug(out)
- if status != 0:
- _logger.error(out)
- else:
- try:
- nFile = int(out)
- _logger.debug(nFile)
- if nFile == 0:
- # erase dataset
- _logger.debug('erase %s' % name)
- status,out = ddm.DQ2.main('eraseDataset',name)
- _logger.debug(out)
- except:
- pass
- except:
- pass
- self.pool.remove(self)
- self.lock.release()
-
-# close datasets
-"""
-timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30)
-timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3)
-closeLock = threading.Semaphore(5)
-closeProxyLock = threading.Lock()
-closeThreadPool = ThreadPool()
-while True:
- # lock
- closeLock.acquire()
- # get datasets
- closeProxyLock.acquire()
- varMap = {}
- varMap[':modificationdateU'] = timeLimitU
- varMap[':modificationdateL'] = timeLimitL
- varMap[':type'] = 'output'
- varMap[':status'] = 'tobeclosed'
- sqlQuery = 'type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= 500'
- proxyS = taskBuffer.proxyPool.getProxy()
- res = proxyS.getLockDatasets(sqlQuery,varMap)
- taskBuffer.proxyPool.putProxy(proxyS)
- if res == None:
- _logger.debug('# of datasets to be closed: %s' % res)
- else:
- _logger.debug('# of datasets to be closed: %s' % len(res))
- if res==None or len(res)==0:
- closeProxyLock.release()
- closeLock.release()
- break
- # release
- closeProxyLock.release()
- closeLock.release()
- # run thread
- closerThr = CloserThr(closeLock,closeProxyLock,res,closeThreadPool)
- closerThr.start()
-
-closeThreadPool.join()
-"""
-
-# thread to freeze dataset
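-# (freezes a dataset in DQ2 once no rows in filesTable4 still reference it as destinationDBlock,
-#  marks the Datasets row completed and erases datasets that contain no files; otherwise it only
-#  refreshes modificationdate so the dataset is rechecked later; the driver loop below is commented out)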
-class Freezer (threading.Thread):
- def __init__(self,lock,proxyLock,datasets,pool):
- threading.Thread.__init__(self)
- self.datasets = datasets
- self.lock = lock
- self.proxyLock = proxyLock
- self.pool = pool
- self.pool.add(self)
-
- def run(self):
- self.lock.acquire()
- try:
- for vuid,name,modDate in self.datasets:
- _logger.debug("start %s %s" % (modDate,name))
- self.proxyLock.acquire()
- retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ lfn FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock",
- {':destinationDBlock':name})
- self.proxyLock.release()
- if retF<0:
- _logger.error("SQL error")
- else:
- # no files in filesTable
- if len(resF) == 0:
- _logger.debug("freeze %s " % name)
- if not name.startswith('pandaddm_'):
- status,out = ddm.DQ2.main('freezeDataset',name)
- else:
- status,out = 0,''
- if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
- _logger.error(out)
- else:
- self.proxyLock.acquire()
- varMap = {}
- varMap[':vuid'] = vuid
- varMap[':status'] = 'completed'
- taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
- varMap)
- self.proxyLock.release()
- if name.startswith('pandaddm_'):
- continue
- # count # of files
- status,out = ddm.DQ2.main('getNumberOfFiles',name)
- _logger.debug(out)
- if status != 0:
- _logger.error(out)
- else:
- try:
- nFile = int(out)
- _logger.debug(nFile)
- if nFile == 0:
- # erase dataset
- _logger.debug('erase %s' % name)
- status,out = ddm.DQ2.main('eraseDataset',name)
- _logger.debug(out)
- except:
- pass
- else:
- _logger.debug("wait %s " % name)
- self.proxyLock.acquire()
- taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid})
- self.proxyLock.release()
- _logger.debug("end %s " % name)
- except:
- pass
- self.pool.remove(self)
- self.lock.release()
-
-# freeze dataset
-"""
-timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(days=4)
-timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=14)
-freezeLock = threading.Semaphore(5)
-freezeProxyLock = threading.Lock()
-freezeThreadPool = ThreadPool()
-while True:
- # lock
- freezeLock.acquire()
- # get datasets
- sqlQuery = 'type=:type AND status IN (:status1,:status2,:status3) ' + \
- 'AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND REGEXP_LIKE(name,:pattern) AND rownum <= 500'
- varMap = {}
- varMap[':modificationdateU'] = timeLimitU
- varMap[':modificationdateL'] = timeLimitL
- varMap[':type'] = 'output'
- varMap[':status1'] = 'running'
- varMap[':status2'] = 'created'
- varMap[':status3'] = 'defined'
- varMap[':pattern'] = '_sub[[:digit:]]+$'
- freezeProxyLock.acquire()
- proxyS = taskBuffer.proxyPool.getProxy()
- res = proxyS.getLockDatasets(sqlQuery,varMap)
- taskBuffer.proxyPool.putProxy(proxyS)
- if res == None:
- _logger.debug('# of datasets to be frozen: %s' % res)
- else:
- _logger.debug('# of datasets to be frozen: %s' % len(res))
- if res==None or len(res)==0:
- freezeProxyLock.release()
- freezeLock.release()
- break
- freezeProxyLock.release()
- # release
- freezeLock.release()
- # run freezer
- freezer = Freezer(freezeLock,freezeProxyLock,res,freezeThreadPool)
- freezer.start()
-
-freezeThreadPool.join()
-"""
-
-# thread to delete dataset replica from T2
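-# (determines the hosting cloud from the T1 replica, deletes _PRODDISK replicas of _sub
-#  datasets from the cloud's T2s, and marks the Datasets row completed; the driver loop
-#  below is currently commented out)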
-class T2Cleaner (threading.Thread):
- def __init__(self,lock,proxyLock,datasets,pool):
- threading.Thread.__init__(self)
- self.datasets = datasets
- self.lock = lock
- self.proxyLock = proxyLock
- self.pool = pool
- self.pool.add(self)
-
- def run(self):
- self.lock.acquire()
- try:
- for vuid,name,modDate in self.datasets:
- _logger.debug("cleanT2 %s" % name)
- # get list of replicas
- status,out = ddm.DQ2.main('listDatasetReplicas',name,0,None,False)
- if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
- _logger.error(out)
- continue
- else:
- try:
- # convert res to map
- exec "tmpRepSites = %s" % out
- except:
- tmpRepSites = {}
- _logger.error("cannot convert to replica map")
- _logger.error(out)
- continue
- # check cloud
- cloudName = None
- for tmpCloudName in siteMapper.getCloudList():
- t1SiteName = siteMapper.getCloud(tmpCloudName)['source']
- t1SiteDDMs = siteMapper.getSite(t1SiteName).setokens.values()
- for tmpDDM in t1SiteDDMs:
- if tmpRepSites.has_key(tmpDDM):
- cloudName = tmpCloudName
- break
- # cloud is not found
- if cloudName == None:
- _logger.error("cannot find cloud for %s : %s" % (name,str(tmpRepSites)))
- elif not cloudName in ['DE','CA','ES','FR','IT','NL','UK','TW','RU']:
- # FIXME : test only EGEE for now
- pass
- else:
- # look for T2 IDs
- t2DDMs = []
- for tmpDDM in tmpRepSites.keys():
- if not tmpDDM in t1SiteDDMs and tmpDDM.endswith('_PRODDISK'):
- t2DDMs.append(tmpDDM)
- # delete replica for sub
- if re.search('_sub\d+$',name) != None and t2DDMs != []:
- _logger.debug(('deleteDatasetReplicas',name,t2DDMs))
- status,out = ddm.DQ2.main('deleteDatasetReplicas',name,t2DDMs)
- if status != 0:
- _logger.error(out)
- if out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \
- out.find("No replica found") == -1:
- continue
- # update
- self.proxyLock.acquire()
- varMap = {}
- varMap[':vuid'] = vuid
- varMap[':status'] = 'completed'
- taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
- varMap)
- self.proxyLock.release()
- _logger.debug("end %s " % name)
- except:
- pass
- self.pool.remove(self)
- self.lock.release()
-
-# delete dataset replica from T2
-"""
-timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30)
-timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3)
-t2cleanLock = threading.Semaphore(5)
-t2cleanProxyLock = threading.Lock()
-t2cleanThreadPool = ThreadPool()
-while True:
- # lock
- t2cleanLock.acquire()
- # get datasets
- varMap = {}
- varMap[':modificationdateU'] = timeLimitU
- varMap[':modificationdateL'] = timeLimitL
- varMap[':type'] = 'output'
- varMap[':status'] = 'cleanup'
- sqlQuery = 'type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= 500'
- t2cleanProxyLock.acquire()
- proxyS = taskBuffer.proxyPool.getProxy()
- res = proxyS.getLockDatasets(sqlQuery,varMap)
- taskBuffer.proxyPool.putProxy(proxyS)
- if res == None:
- _logger.debug('# of datasets to be deleted from T2: %s' % res)
- else:
- _logger.debug('# of datasets to be deleted from T2: %s' % len(res))
- if res==None or len(res)==0:
- t2cleanProxyLock.release()
- t2cleanLock.release()
- break
- t2cleanProxyLock.release()
- # release
- t2cleanLock.release()
-    # run t2cleaner
-    t2cleaner = T2Cleaner(t2cleanLock,t2cleanProxyLock,res,t2cleanThreadPool)
-    t2cleaner.start()
-
-t2cleanThreadPool.join()
-"""
-
-
-_memoryCheck("delete XML")
-
-# delete old files in DA cache
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7)
-files = os.listdir(panda_config.cache_dir)
-for file in files:
- # skip special test file
- if file == 'sources.72c48dc5-f055-43e5-a86e-4ae9f8ea3497.tar.gz':
- continue
- if file == 'sources.090f3f51-fc81-4e80-9749-a5e4b2bd58de.tar.gz':
- continue
- try:
- # get timestamp
- timestamp = datetime.datetime.fromtimestamp(os.stat('%s/%s' % (panda_config.cache_dir,file)).st_mtime)
- # delete
- if timestamp < timeLimit:
- _logger.debug("delete %s " % file)
- os.remove('%s/%s' % (panda_config.cache_dir,file))
- except:
- pass
-
-
-_memoryCheck("delete core")
-
-# delete core
-dirName = '%s/..' % panda_config.logdir
-for file in os.listdir(dirName):
- if file.startswith('core.'):
- _logger.debug("delete %s " % file)
- try:
- os.remove('%s/%s' % (dirName,file))
- except:
- pass
-
-
-_memoryCheck("finisher")
-
-# finish transferring jobs
-"""
-timeNow = datetime.datetime.utcnow()
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
-sql = 'SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime AND rownum<=20'
-for ii in range(1000):
- varMap = {}
- varMap[':jobStatus'] = 'transferring'
- varMap[':modificationTime'] = timeLimit
- ret,res = taskBuffer.querySQLS(sql, varMap)
- if res == None:
- _logger.debug('# of jobs to be finished : %s' % res)
- break
- else:
- _logger.debug('# of jobs to be finished : %s' % len(res))
- if len(res) == 0:
- break
- # get jobs from DB
- ids = []
- for (id,) in res:
- ids.append(id)
- jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
- # update modificationTime to lock jobs
- for job in jobs:
- if job != None and job.jobStatus != 'unknown':
- taskBuffer.updateJobStatus(job.PandaID,job.jobStatus,{})
- upJobs = []
- finJobs = []
- for job in jobs:
- if job == None or job.jobStatus == 'unknown':
- continue
- # use BNL by default
- dq2URL = siteMapper.getSite('BNL_ATLAS_1').dq2url
- dq2SE = []
- # get LFC and SEs
- if job.prodSourceLabel == 'user' and not siteMapper.siteSpecList.has_key(job.destinationSE):
- # using --destSE for analysis job to transfer output
- try:
- dq2URL = dataservice.DDM.toa.getLocalCatalog(job.destinationSE)[-1]
- match = re.search('.+://([^:/]+):*\d*/*',dataservice.DDM.toa.getSiteProperty(job.destinationSE,'srm')[-1])
- if match != None:
- dq2SE.append(match.group(1))
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error('Failed to get DQ2/SE for %s with %s %s' % (job.PandaID,type,value))
- continue
- elif siteMapper.checkCloud(job.cloud):
- # normal production jobs
- tmpDstID = siteMapper.getCloud(job.cloud)['dest']
- tmpDstSite = siteMapper.getSite(tmpDstID)
- if not tmpDstSite.lfchost in [None,'']:
- # LFC
- dq2URL = 'lfc://'+tmpDstSite.lfchost+':/grid/atlas/'
- if tmpDstSite.se != None:
- for tmpDstSiteSE in tmpDstSite.se.split(','):
- match = re.search('.+://([^:/]+):*\d*/*',tmpDstSiteSE)
- if match != None:
- dq2SE.append(match.group(1))
- else:
- # LRC
- dq2URL = tmpDstSite.dq2url
- dq2SE = []
- # get LFN list
- lfns = []
- guids = []
- nTokens = 0
- for file in job.Files:
- # only output files are checked
- if file.type == 'output' or file.type == 'log':
- lfns.append(file.lfn)
- guids.append(file.GUID)
- nTokens += len(file.destinationDBlockToken.split(','))
- # get files in LRC
- _logger.debug('Cloud:%s DQ2URL:%s' % (job.cloud,dq2URL))
- okFiles = brokerage.broker_util.getFilesFromLRC(lfns,dq2URL,guids,dq2SE,getPFN=True)
- # count files
- nOkTokens = 0
- for okLFN,okPFNs in okFiles.iteritems():
- nOkTokens += len(okPFNs)
- # check all files are ready
- _logger.debug(' nToken:%s nOkToken:%s' % (nTokens,nOkTokens))
- if nTokens <= nOkTokens:
- _logger.debug('Finisher : Finish %s' % job.PandaID)
- for file in job.Files:
- if file.type == 'output' or file.type == 'log':
- file.status = 'ready'
- # append to run Finisher
- finJobs.append(job)
- else:
- endTime = job.endTime
- if endTime == 'NULL':
- endTime = job.startTime
- # priority-dependent timeout
- tmpCloudSpec = siteMapper.getCloud(job.cloud)
- if job.currentPriority >= 900 and (not job.prodSourceLabel in ['user']):
- if tmpCloudSpec.has_key('transtimehi'):
- timeOutValue = tmpCloudSpec['transtimehi']
- else:
- timeOutValue = 1
- else:
- if tmpCloudSpec.has_key('transtimelo'):
- timeOutValue = tmpCloudSpec['transtimelo']
- else:
- timeOutValue = 2
- # protection
- if timeOutValue < 1:
- timeOutValue = 1
- timeOut = timeNow - datetime.timedelta(days=timeOutValue)
- _logger.debug(' Priority:%s Limit:%s End:%s' % (job.currentPriority,str(timeOut),str(endTime)))
- if endTime < timeOut:
- # timeout
- _logger.debug('Finisher : Kill %s' % job.PandaID)
- strMiss = ''
- for lfn in lfns:
- if not lfn in okFiles:
- strMiss += ' %s' % lfn
- job.jobStatus = 'failed'
- job.taskBufferErrorCode = taskbuffer.ErrorCode.EC_Transfer
- job.taskBufferErrorDiag = 'transfer timeout for '+strMiss
- guidMap = {}
- for file in job.Files:
- # set file status
- if file.status == 'transferring':
- file.status = 'failed'
- # collect GUIDs to delete files from _tid datasets
- if file.type == 'output' or file.type == 'log':
- if not guidMap.has_key(file.destinationDBlock):
- guidMap[file.destinationDBlock] = []
- guidMap[file.destinationDBlock].append(file.GUID)
- else:
- # wait
- _logger.debug('Finisher : Wait %s' % job.PandaID)
- for lfn in lfns:
- if not lfn in okFiles:
- _logger.debug(' -> %s' % lfn)
- upJobs.append(job)
- # update
- _logger.debug('updating ...')
- taskBuffer.updateJobs(upJobs,False)
- # run Finisher
- for job in finJobs:
- fThr = Finisher(taskBuffer,None,job)
- fThr.start()
- fThr.join()
- _logger.debug('done')
- time.sleep(random.randint(1,10))
-"""
-
-# update email DB
-_memoryCheck("email")
-_logger.debug("Update emails")
-
-# lock file
-_lockGetMail = open(panda_config.lockfile_getMail, 'w')
-# lock email DB
-fcntl.flock(_lockGetMail.fileno(), fcntl.LOCK_EX)
-# open email DB
-pDB = shelve.open(panda_config.emailDB)
-# read
-mailMap = {}
-for name,addr in pDB.iteritems():
- mailMap[name] = addr
-# close DB
-pDB.close()
-# release file lock
-fcntl.flock(_lockGetMail.fileno(), fcntl.LOCK_UN)
-# set email address
-for name,addr in mailMap.iteritems():
- # remove _
- name = re.sub('_$','',name)
- status,res = taskBuffer.querySQLS("SELECT email FROM ATLAS_PANDAMETA.users WHERE name=:name",{':name':name})
- # failed or not found
- if status == -1 or len(res) == 0:
- _logger.error("%s not found in user DB" % name)
- continue
- # already set
- if not res[0][0] in ['','None',None]:
- continue
- # update email
- _logger.debug("set '%s' to %s" % (name,addr))
- status,res = taskBuffer.querySQLS("UPDATE ATLAS_PANDAMETA.users SET email=:addr WHERE name=:name",{':addr':addr,':name':name})
-
-# reassign reprocessing jobs in defined table
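-# (assigned reprocessing jobs that have been idle for 8h and are older than 24h are reassigned
-#  in bunches of 20; modificationTime is bumped first to lock each bunch, and at most 3
-#  reassignment threads run concurrently via the semaphore)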
-_memoryCheck("repro")
-class ReassignRepro (threading.Thread):
- def __init__(self,taskBuffer,lock,jobs):
- threading.Thread.__init__(self)
- self.jobs = jobs
- self.lock = lock
- self.taskBuffer = taskBuffer
-
- def run(self):
- self.lock.acquire()
- try:
- if len(self.jobs):
- nJob = 100
- iJob = 0
- while iJob < len(self.jobs):
- # reassign jobs one by one to break dis dataset formation
- for job in self.jobs[iJob:iJob+nJob]:
-                        _logger.debug('reassignJobs in Repro (%s)' % [job])
- self.taskBuffer.reassignJobs([job],joinThr=True)
- iJob += nJob
- except:
- pass
- self.lock.release()
-
-reproLock = threading.Semaphore(3)
-
-nBunch = 20
-iBunch = 0
-timeLimitMod = datetime.datetime.utcnow() - datetime.timedelta(hours=8)
-timeLimitCre = datetime.datetime.utcnow() - datetime.timedelta(hours=24)
-firstFlag = True
-while True:
- # lock
- reproLock.acquire()
- # get jobs
- varMap = {}
- varMap[':jobStatus'] = 'assigned'
- varMap[':prodSourceLabel'] = 'managed'
- varMap[':modificationTime'] = timeLimitMod
- varMap[':creationTime'] = timeLimitCre
- varMap[':processingType'] = 'reprocessing'
- if firstFlag:
- firstFlag = False
- status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND prodSourceLabel=:prodSourceLabel AND modificationTime<:modificationTime AND creationTime<:creationTime AND processingType=:processingType ORDER BY PandaID",
- varMap)
- if res != None:
- _logger.debug('total Repro for reassignJobs : %s' % len(res))
- # get a bunch
- status,res = taskBuffer.querySQLS("SELECT * FROM (SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND prodSourceLabel=:prodSourceLabel AND modificationTime<:modificationTime AND creationTime<:creationTime AND processingType=:processingType ORDER BY PandaID) WHERE rownum<=%s" % nBunch,
- varMap)
- # escape
- if res == None or len(res) == 0:
- reproLock.release()
- break
-
- # get IDs
- jobs=[]
- for id, in res:
- jobs.append(id)
-
- # reassign
-    _logger.debug('reassignJobs for Repro %s' % (iBunch*nBunch))
- # lock
- currentTime = datetime.datetime.utcnow()
- for jobID in jobs:
- varMap = {}
- varMap[':PandaID'] = jobID
- varMap[':modificationTime'] = currentTime
- status,res = taskBuffer.querySQLS("UPDATE ATLAS_PANDA.jobsDefined4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID",
- varMap)
- reproLock.release()
-    # run reassignment thread
-    reproThr = ReassignRepro(taskBuffer,reproLock,jobs)
- reproThr.start()
- iBunch += 1
-
-_memoryCheck("end")
-
-_logger.debug("===================== end =====================")
diff --git a/current/pandaserver/test/copyArchive.sh b/current/pandaserver/test/copyArchive.sh
deleted file mode 100755
index 220f01ee2..000000000
--- a/current/pandaserver/test/copyArchive.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# Panda home
-export PANDA_HOME=/home/sm/prod
-
-# for python
-export PYTHONPATH=$PANDA_HOME/panda:$PYTHONPATH
-
-python $PANDA_HOME/panda/test/copyArchive.py
diff --git a/current/pandaserver/test/copyROOT.py b/current/pandaserver/test/copyROOT.py
deleted file mode 100644
index aeca74801..000000000
--- a/current/pandaserver/test/copyROOT.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import os
-import re
-import sys
-from ftplib import FTP
-from pandalogger.PandaLogger import PandaLogger
-
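-# copyROOT.py mirrors ROOT binary tarballs for the supported architectures from the
-# root.cern.ch FTP area into destDir, skipping release candidates and files that are
-# already present with the same size, then rewrites the 'applist' index file.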
-# supported architectures
-targetArchs = ['Linux-slc5-gcc4.3.tar.gz','Linux-slc5_amd64-gcc4.3.tar.gz']
-
-# destination dir
-destDir = '/data/atlpan/srv/var/appdir'
-
-# logger
-_logger = PandaLogger().getLogger('copyROOT')
-
-_logger.debug("===================== start =====================")
-
-try:
- # login to root repository
- ftp = FTP('root.cern.ch')
- output = ftp.login()
- _logger.debug(output)
- output = ftp.cwd('root')
- _logger.debug(output)
- # get list
- flist = ftp.nlst()
- # loop over all files
- for tmpFile in flist:
- # skip RC
- if re.search('-rc\d\.',tmpFile) != None:
- continue
- # check arch
- supportedFlag = False
- for tmpArch in targetArchs:
- if tmpFile.endswith(tmpArch):
- supportedFlag = True
- break
- # copy
- if supportedFlag:
- _logger.debug('start %s' % tmpFile)
- dstFileName = '%s/%s' % (destDir,tmpFile)
- # check local
- if os.path.exists(dstFileName):
- # get remote size
- rsize = ftp.size(tmpFile)
- if rsize == None:
- _logger.debug(' cannot get remote size for %s' % tmpFile)
- else:
- # local size
- lsize = os.path.getsize(dstFileName)
- if lsize == rsize:
-                        _logger.debug('skip since already there %s' % tmpFile)
- continue
- # copy
- _logger.debug('copy %s' % tmpFile)
- outFile = open(dstFileName,'wb')
- ftp.retrbinary('RETR %s' % tmpFile,outFile.write)
- outFile.close()
- _logger.debug('end %s' % tmpFile)
- # quit
- output = ftp.quit()
- _logger.debug(output)
- # make list
- listFileName = 'applist'
- listFilePath = '%s/%s' % (destDir,listFileName)
- listFile = open(listFilePath,'w')
- for tmpFile in os.listdir(destDir):
- # skip hidden files
- if tmpFile.startswith('.'):
- continue
- # skip applist
- if tmpFile == listFileName:
- continue
- listFile.write('%s\n' % tmpFile)
- listFile.close()
-except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("Failed with %s %s" % (errType,errValue))
-
-
-_logger.debug("===================== end =====================")
diff --git a/current/pandaserver/test/createPandaSiteIDs.py b/current/pandaserver/test/createPandaSiteIDs.py
deleted file mode 100644
index 34f8ef816..000000000
--- a/current/pandaserver/test/createPandaSiteIDs.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import re
-from jobscheduler import siteinfo
-
-from taskbuffer.DBProxy import DBProxy
-
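-# createPandaSiteIDs.py prints a PandaSiteIDs dictionary by matching site names from
-# jobscheduler.siteinfo against schedconfig nicknames (with a few hand-coded aliases);
-# sites without a match fall back to 'BNL_ATLAS_1-condor'.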
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-proxyN = DBProxy()
-proxyN.connect(panda_config.logdbhost,panda_config.logdbpasswd,panda_config.logdbuser,'PandaMetaDB')
-
-status,res = proxyN.querySQLS("SELECT nickname from schedconfig")
-
-nicknames = []
-for (nickname,) in res:
- nicknames.append(nickname)
-
-
-print "PandaSiteIDs = {"
-sites = siteinfo.sites.keys()
-sites.sort()
-for site in sites:
- vals = siteinfo.sites[site]
- okFlag = vals[10]
- fName = ''
- sitePat = site
- sitePat = re.sub('_PAUL','',sitePat)
- sitePat = re.sub('_TEST$','',sitePat)
- sitePat = re.sub('_test$','',sitePat)
- sitePat = re.sub('^ANALY_LONG_','',sitePat)
- sitePat = re.sub('^ANALY_','',sitePat)
- if site == 'SLACXRD':
- sitePat = 'slac'
- if site == 'UVIC':
- sitePat = 'VICTORIA'
- if sitePat == 'LYON':
- sitePat = 'IN2P3-CC-T2'
- if sitePat == 'Purdue-ITB':
- sitePat = 'Purdue'
- if sitePat == "BNL":
- sitePat = "BNL_ATLAS"
- if sitePat == "RAL":
- sitePat = "RAL-LCG2"
- if sitePat == "SACLAY":
- sitePat = "GRIF-DAPNIA"
- for nickname in nicknames:
- if re.search(sitePat,nickname,re.I) != None:
- fName = nickname
- if fName == '':
- #print site, sitePat
- fName = 'BNL_ATLAS_1-condor'
- print " %-22s : {'nickname':'%s','status':'%s'}," % ("'"+site+"'",fName,okFlag)
-print "}"
diff --git a/current/pandaserver/test/datasetManager.py b/current/pandaserver/test/datasetManager.py
deleted file mode 100644
index b5f8b7189..000000000
--- a/current/pandaserver/test/datasetManager.py
+++ /dev/null
@@ -1,924 +0,0 @@
-import os
-import re
-import sys
-import time
-import fcntl
-import types
-import shelve
-import random
-import datetime
-import commands
-import threading
-import userinterface.Client as Client
-from dataservice.DDM import ddm
-from dataservice.DDM import dashBorad
-from taskbuffer.OraDBProxy import DBProxy
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-from jobdispatcher.Watcher import Watcher
-from brokerage.SiteMapper import SiteMapper
-from dataservice.Adder import Adder
-from dataservice.Finisher import Finisher
-from dataservice.MailUtils import MailUtils
-from taskbuffer import ProcessGroups
-import brokerage.broker_util
-import brokerage.broker
-import taskbuffer.ErrorCode
-import dataservice.DDM
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# logger
-_logger = PandaLogger().getLogger('datasetManager')
-
-_logger.debug("===================== start =====================")
-
-# use native DQ2
-ddm.useDirectDQ2()
-
-# memory checker
-def _memoryCheck(str):
- try:
- proc_status = '/proc/%d/status' % os.getpid()
- procfile = open(proc_status)
- name = ""
- vmSize = ""
- vmRSS = ""
- # extract Name,VmSize,VmRSS
- for line in procfile:
- if line.startswith("Name:"):
- name = line.split()[-1]
- continue
- if line.startswith("VmSize:"):
- vmSize = ""
- for item in line.split()[1:]:
- vmSize += item
- continue
- if line.startswith("VmRSS:"):
- vmRSS = ""
- for item in line.split()[1:]:
- vmRSS += item
- continue
- procfile.close()
- _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str))
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("memoryCheck() : %s %s" % (type,value))
- _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str))
- return
-
-_memoryCheck("start")
-
-# kill old dq2 process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('ps axo user,pid,lstart,args | grep dq2.clientapi | grep -v PYTHONPATH | grep -v grep')
- for line in out.split('\n'):
- if line == '':
- continue
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old dq2 process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill dq2 process : %s %s" % (type,value))
-
-
-# kill old process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=7)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName)
- for line in out.split('\n'):
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill process : %s %s" % (type,value))
-
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-# instantiate sitemapper
-siteMapper = SiteMapper(taskBuffer)
-
-
-# list with lock
-class ListWithLock:
- def __init__(self):
- self.lock = threading.Lock()
- self.list = []
-
- def __contains__(self,item):
- self.lock.acquire()
- ret = self.list.__contains__(item)
- self.lock.release()
- return ret
-
- def append(self,item):
- appended = False
- self.lock.acquire()
- if not item in self.list:
- self.list.append(item)
- appended = True
- self.lock.release()
- return appended
-
-
-# list of dis datasets to be deleted
-deletedDisList = ListWithLock()
-
-
-# set tobedeleted to dis dataset
-def setTobeDeletedToDis(subDsName):
- try:
- # only production sub datasets
- if subDsName.startswith('user') or subDsName.startswith('group') or \
- subDsName.startswith('pandaddm_') or re.search('_sub\d+$',subDsName)==None:
- return
- # get _dis names with _sub
- disNameList = taskBuffer.getAssociatedDisDatasets(subDsName)
- _logger.debug("setTobeDeletedToDis : sub:%s has dis:%s" % (subDsName,str(disNameList)))
- # loop over all _dis datasets
- for tmpDisName in disNameList:
- # try to append to locked list
- if not deletedDisList.append(tmpDisName):
- # another thread already took care of the _dis
- continue
- # get dataset
- _logger.debug("setTobeDeletedToDis : try to get %s in DB" % tmpDisName)
- tmpDS = taskBuffer.queryDatasetWithMap({'name':tmpDisName})
- if tmpDS == None:
- _logger.error("setTobeDeletedToDis : cannot get %s in DB" % tmpDisName)
- continue
- # check status
- if tmpDS.status in ['tobedeleted','deleted']:
- _logger.debug("setTobeDeletedToDis : skip %s since status=%s" % (tmpDisName,tmpDS.status))
- continue
- # check the number of failed jobs associated to the _dis
- if tmpDS.currentfiles == 0:
- # all succeeded
- tmpDS.status = 'deleting'
- excStatus = 'deleted'
- else:
- # some files failed, so shorten the lifetime instead of deleting
- tmpDS.status = 'shortening'
- excStatus = 'shortened'
- # update dataset
- retU = taskBuffer.updateDatasets([tmpDS],withLock=True,withCriteria="status<>:crStatus",
- criteriaMap={':crStatus':excStatus})
- _logger.debug("setTobeDeletedToDis : set %s to %s with %s" % (tmpDS.status,tmpDisName,str(retU)))
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("setTobeDeletedToDis : %s %s %s" % (subDsName,errType,errValue))
-
-
-# thread pool
-class ThreadPool:
- def __init__(self):
- self.lock = threading.Lock()
- self.list = []
-
- def add(self,obj):
- self.lock.acquire()
- self.list.append(obj)
- self.lock.release()
-
- def remove(self,obj):
- self.lock.acquire()
- self.list.remove(obj)
- self.lock.release()
-
- def join(self):
- self.lock.acquire()
- thrlist = tuple(self.list)
- self.lock.release()
- for thr in thrlist:
- thr.join()
-
-
-# thread to close dataset
-class CloserThr (threading.Thread):
- def __init__(self,lock,proxyLock,datasets,pool):
- threading.Thread.__init__(self)
- self.datasets = datasets
- self.lock = lock
- self.proxyLock = proxyLock
- self.pool = pool
- self.pool.add(self)
-
- def run(self):
- self.lock.acquire()
- try:
- # loop over all datasets
- for vuid,name,modDate in self.datasets:
- _logger.debug("Close %s %s" % (modDate,name))
- if not name.startswith('pandaddm_'):
- status,out = ddm.DQ2.main('freezeDataset',name)
- else:
- status,out = 0,''
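- # the DQ2 exceptions checked below are treated as benign (dataset already
- # frozen/deleted/unknown), so the local dataset status is still updated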
- if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
- _logger.error(out)
- else:
- self.proxyLock.acquire()
- varMap = {}
- varMap[':vuid'] = vuid
- varMap[':newstatus'] = 'completed'
- varMap[':oldstatus'] = 'tobeclosed'
- taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:newstatus,modificationdate=CURRENT_DATE WHERE vuid=:vuid AND status=:oldstatus",
- varMap)
- self.proxyLock.release()
- if name.startswith('pandaddm_'):
- continue
- # set tobedeleted to dis
- setTobeDeletedToDis(name)
- # count # of files
- status,out = ddm.DQ2.main('getNumberOfFiles',name)
- _logger.debug(out)
- if status != 0:
- _logger.error(out)
- else:
- try:
- nFile = int(out)
- _logger.debug(nFile)
- if nFile == 0:
- # erase dataset
- _logger.debug('erase %s' % name)
- status,out = ddm.DQ2.main('eraseDataset',name)
- _logger.debug('OK with %s' % name)
- except:
- pass
- except:
- pass
- self.pool.remove(self)
- self.lock.release()
-
-# close datasets
-_logger.debug("==== close datasets ====")
-timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30)
-timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3)
-closeLock = threading.Semaphore(5)
-closeProxyLock = threading.Lock()
-closeThreadPool = ThreadPool()
-maxRows = 100000
-while True:
- # lock
- closeLock.acquire()
- # get datasets
- closeProxyLock.acquire()
- varMap = {}
- varMap[':modificationdateU'] = timeLimitU
- varMap[':modificationdateL'] = timeLimitL
- varMap[':type'] = 'output'
- varMap[':status'] = 'tobeclosed'
- sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows
- proxyS = taskBuffer.proxyPool.getProxy()
- res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60')
- taskBuffer.proxyPool.putProxy(proxyS)
- if res == None:
- _logger.debug("# of datasets to be closed: %s" % res)
- else:
- _logger.debug("# of datasets to be closed: %s" % len(res))
- if res==None or len(res)==0:
- closeProxyLock.release()
- closeLock.release()
- break
- # release
- closeProxyLock.release()
- closeLock.release()
- # run thread
- iRows = 0
- nRows = 500
- while iRows < len(res):
- closerThr = CloserThr(closeLock,closeProxyLock,res[iRows:iRows+nRows],closeThreadPool)
- closerThr.start()
- iRows += nRows
- closeThreadPool.join()
- if len(res) < maxRows:
- break
-
-
-# thread to freeze dataset
-class Freezer (threading.Thread):
- def __init__(self,lock,proxyLock,datasets,pool):
- threading.Thread.__init__(self)
- self.datasets = datasets
- self.lock = lock
- self.proxyLock = proxyLock
- self.pool = pool
- self.pool.add(self)
-
- def run(self):
- self.lock.acquire()
- try:
- for vuid,name,modDate in self.datasets:
- _logger.debug("start %s %s" % (modDate,name))
- self.proxyLock.acquire()
- retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ lfn FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND NOT status IN (:status1,:status2,:status3)",
- {':destinationDBlock':name,':status1':'ready',':status2':'failed',':status3':'skipped'})
- self.proxyLock.release()
- if retF<0:
- _logger.error("SQL error")
- else:
- # no pending files left in filesTable
- if len(resF) == 0:
- _logger.debug("freeze %s " % name)
- if not name.startswith('pandaddm_'):
- status,out = ddm.DQ2.main('freezeDataset',name)
- else:
- status,out = 0,''
- if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
- _logger.error(out)
- else:
- self.proxyLock.acquire()
- varMap = {}
- varMap[':vuid'] = vuid
- varMap[':status'] = 'completed'
- taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
- varMap)
- self.proxyLock.release()
- if name.startswith('pandaddm_'):
- continue
- # set tobedeleted to dis
- setTobeDeletedToDis(name)
- # count # of files
- status,out = ddm.DQ2.main('getNumberOfFiles',name)
- _logger.debug(out)
- if status != 0:
- _logger.error(out)
- else:
- try:
- nFile = int(out)
- _logger.debug(nFile)
- if nFile == 0:
- # erase dataset
- _logger.debug('erase %s' % name)
- status,out = ddm.DQ2.main('eraseDataset',name)
- _logger.debug('OK with %s' % name)
- except:
- pass
- else:
- _logger.debug("wait %s " % name)
- self.proxyLock.acquire()
- taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid})
- self.proxyLock.release()
- _logger.debug("end %s " % name)
- except:
- pass
- self.pool.remove(self)
- self.lock.release()
-
-# freeze dataset
-_logger.debug("==== freeze datasets ====")
-timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(days=4)
-timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=14)
-freezeLock = threading.Semaphore(5)
-freezeProxyLock = threading.Lock()
-freezeThreadPool = ThreadPool()
-maxRows = 100000
-while True:
- # lock
- freezeLock.acquire()
- # get datasets
- sqlQuery = "type=:type AND status IN (:status1,:status2,:status3,:status4) " + \
- "AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND subType=:subType AND rownum <= %s" % maxRows
- varMap = {}
- varMap[':modificationdateU'] = timeLimitU
- varMap[':modificationdateL'] = timeLimitL
- varMap[':type'] = 'output'
- varMap[':status1'] = 'running'
- varMap[':status2'] = 'created'
- varMap[':status3'] = 'defined'
- varMap[':status4'] = 'locked'
- varMap[':subType'] = 'sub'
- freezeProxyLock.acquire()
- proxyS = taskBuffer.proxyPool.getProxy()
- res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60')
- taskBuffer.proxyPool.putProxy(proxyS)
- if res == None:
- _logger.debug("# of datasets to be frozen: %s" % res)
- else:
- _logger.debug("# of datasets to be frozen: %s" % len(res))
- if res==None or len(res)==0:
- freezeProxyLock.release()
- freezeLock.release()
- break
- freezeProxyLock.release()
- # release
- freezeLock.release()
- # run freezer
- iRows = 0
- nRows = 500
- while iRows < len(res):
- freezer = Freezer(freezeLock,freezeProxyLock,res[iRows:iRows+nRows],freezeThreadPool)
- freezer.start()
- iRows += nRows
- freezeThreadPool.join()
- if len(res) < maxRows:
- break
-
-
-# thread to delete dataset replica from T2
-class T2Cleaner (threading.Thread):
- def __init__(self,lock,proxyLock,datasets,pool):
- threading.Thread.__init__(self)
- self.datasets = datasets
- self.lock = lock
- self.proxyLock = proxyLock
- self.pool = pool
- self.pool.add(self)
-
- def run(self):
- self.lock.acquire()
- try:
- for vuid,name,modDate in self.datasets:
- _logger.debug("cleanT2 %s" % name)
- # get list of replicas
- status,out = ddm.DQ2.main('listDatasetReplicas',name,0,None,False)
- if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
- _logger.error(out)
- continue
- else:
- if out.find("DQUnknownDatasetException") == -1 and out.find("DQDeletedDatasetException") == -1:
- listOut = out
- try:
- # convert res to map
- exec "tmpRepSites = %s" % out
- except:
- tmpRepSites = {}
- _logger.error("cannot convert to replica map")
- _logger.error(out)
- continue
- # check if there is active subscription
- _logger.debug('listSubscriptions %s' % name)
- subStat,subOut = ddm.DQ2.main('listSubscriptions',name)
- if subStat != 0:
- _logger.error("cannot get subscriptions for %s" % name)
- _logger.error(subOut)
- _logger.debug('subscriptions for %s = %s' % (name,subOut))
- # active subscriptions
- if subOut != '[]':
- _logger.debug("wait %s due to active subscription" % name)
- continue
- # check cloud
- self.proxyLock.acquire()
- proxyS = taskBuffer.proxyPool.getProxy()
- destSE = proxyS.getDestSEwithDestDBlock(name)
- taskBuffer.proxyPool.putProxy(proxyS)
- self.proxyLock.release()
- cloudName = None
- if siteMapper.checkSite(destSE):
- cloudName = siteMapper.getSite(destSE).cloud
- # cloud is not found
- if cloudName == None:
- _logger.error("cannot find cloud for %s : %s" % (name,str(tmpRepSites)))
- else:
- _logger.debug('cloud=%s for %s' % (cloudName,name))
- t1SiteDDMs = siteMapper.getSite(destSE).setokens.values()
- # look for T2 IDs
- t2DDMs = []
- for tmpDDM in tmpRepSites.keys():
- if not tmpDDM in t1SiteDDMs:
- # check home cloud
- notDeleteFlag = False
- for tmpT2siteID,tmpT2siteSpec in siteMapper.siteSpecList.iteritems():
- if tmpT2siteSpec.ddm == tmpDDM:
- # do not delete if src and dest are in the US; OSG is regarded as US due to the tier1
- if tmpT2siteSpec.cloud in ['US'] and cloudName in ['US','OSG']:
- notDeleteFlag = True
- if not notDeleteFlag:
- t2DDMs.append(tmpDDM)
- # delete replica for sub
- if re.search('_sub\d+$',name) != None and t2DDMs != []:
- setMetaFlag = True
- for tmpT2DDM in t2DDMs:
- _logger.debug('setReplicaMetaDataAttribute %s %s' % (name,tmpT2DDM))
- status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',name,tmpT2DDM,'pin_lifetime','')
- if status != 0:
- _logger.error(out)
- if out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \
- out.find("No replica found") == -1:
- setMetaFlag = False
- if not setMetaFlag:
- continue
- _logger.debug(('deleteDatasetReplicas',name,t2DDMs))
- status,out = ddm.DQ2.main('deleteDatasetReplicas',name,t2DDMs,0,False,False,False,False,False,'00:00:00')
- if status != 0:
- _logger.error(out)
- if out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \
- out.find("No replica found") == -1:
- continue
- else:
- _logger.debug('no delete for %s due to empty target in %s' % (name,listOut))
- # update
- self.proxyLock.acquire()
- varMap = {}
- varMap[':vuid'] = vuid
- varMap[':status'] = 'completed'
- taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
- varMap)
- self.proxyLock.release()
- _logger.debug("end %s " % name)
- except:
- pass
- self.pool.remove(self)
- self.lock.release()
-
-# delete dataset replica from T2
-_logger.debug("==== delete datasets from T2 ====")
-timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30)
-timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3)
-t2cleanLock = threading.Semaphore(5)
-t2cleanProxyLock = threading.Lock()
-t2cleanThreadPool = ThreadPool()
-maxRows = 100000
-while True:
- # lock
- t2cleanLock.acquire()
- # get datasets
- varMap = {}
- varMap[':modificationdateU'] = timeLimitU
- varMap[':modificationdateL'] = timeLimitL
- varMap[':type'] = 'output'
- varMap[':status'] = 'cleanup'
- sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows
- t2cleanProxyLock.acquire()
- proxyS = taskBuffer.proxyPool.getProxy()
- res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60')
- taskBuffer.proxyPool.putProxy(proxyS)
- if res == None:
- _logger.debug("# of datasets to be deleted from T2: %s" % res)
- else:
- _logger.debug("# of datasets to be deleted from T2: %s" % len(res))
- if res==None or len(res)==0:
- t2cleanProxyLock.release()
- t2cleanLock.release()
- break
- t2cleanProxyLock.release()
- # release
- t2cleanLock.release()
- # run t2cleanr
- iRows = 0
- nRows = 500
- while iRows < len(res):
- t2cleanr = T2Cleaner(t2cleanLock,t2cleanProxyLock,res[iRows:iRows+nRows],t2cleanThreadPool)
- t2cleanr.start()
- iRows += nRows
- t2cleanThreadPool.join()
- if len(res) < maxRows:
- break
-
-
-# delete dis datasets
-class EraserThr (threading.Thread):
- def __init__(self,lock,proxyLock,datasets,pool,operationType):
- threading.Thread.__init__(self)
- self.datasets = datasets
- self.lock = lock
- self.proxyLock = proxyLock
- self.pool = pool
- self.pool.add(self)
- self.operationType = operationType
-
- def run(self):
- self.lock.acquire()
- try:
- # loop over all datasets
- for vuid,name,modDate in self.datasets:
- # only dis datasets
- if re.search('_dis\d+$',name) == None:
- _logger.error("Eraser : non disDS %s" % name)
- continue
- # delete
- _logger.debug("Eraser %s dis %s %s" % (self.operationType,modDate,name))
- # delete or shorten
- if self.operationType == 'deleting':
- # erase
- endStatus = 'deleted'
- status,out = ddm.DQ2.main('eraseDataset',name)
- if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
- _logger.error(out)
- continue
- else:
- # change replica lifetime
- endStatus = 'shortened'
- # get list of replicas
- status,out = ddm.DQ2.main('listDatasetReplicas',name,0,None,False)
- if status != 0 and out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1:
- _logger.error(out)
- continue
- if out.find("DQUnknownDatasetException") == -1 and out.find("DQDeletedDatasetException") == -1:
- try:
- # convert res to map
- exec "tmpRepSites = %s" % out
- except:
- tmpRepSites = {}
- _logger.error("cannot convert to replica map")
- _logger.error(out)
- continue
- # set replica lifetime
- setMetaFlag = True
- for tmpDDM in tmpRepSites.keys():
- _logger.debug('setReplicaMetaDataAttribute %s %s' % (name,tmpDDM))
- status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',name,tmpDDM,'lifetime','1 days')
- if status != 0:
- _logger.error(out)
- if out.find('DQFrozenDatasetException') == -1 and \
- out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \
- out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \
- out.find("No replica found") == -1:
- setMetaFlag = False
- if not setMetaFlag:
- continue
- _logger.debug('OK with %s' % name)
- # update
- self.proxyLock.acquire()
- varMap = {}
- varMap[':vuid'] = vuid
- varMap[':status'] = endStatus
- taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
- varMap)
- self.proxyLock.release()
- except:
- pass
- self.pool.remove(self)
- self.lock.release()
-
-# delete dis datasets
-_logger.debug("==== delete dis datasets ====")
-timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30)
-timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3)
-disEraseLock = threading.Semaphore(5)
-disEraseProxyLock = threading.Lock()
-disEraseThreadPool = ThreadPool()
-maxRows = 100000
-for targetStatus in ['deleting','shortening']:
- # lock
- disEraseLock.acquire()
- # get datasets
- varMap = {}
- varMap[':modificationdateU'] = timeLimitU
- varMap[':modificationdateL'] = timeLimitL
- varMap[':type'] = 'dispatch'
- varMap[':status'] = targetStatus
- sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows
- disEraseProxyLock.acquire()
- proxyS = taskBuffer.proxyPool.getProxy()
- res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60')
- taskBuffer.proxyPool.putProxy(proxyS)
- if res == None:
- _logger.debug("# of dis datasets for %s: None" % targetStatus)
- else:
- _logger.debug("# of dis datasets for %s: %s" % (targetStatus,len(res)))
- if res==None or len(res)==0:
- disEraseProxyLock.release()
- disEraseLock.release()
- break
- disEraseProxyLock.release()
- # release
- disEraseLock.release()
- # run disEraser
- iRows = 0
- nRows = 500
- while iRows < len(res):
- disEraser = EraserThr(disEraseLock,disEraseProxyLock,res[iRows:iRows+nRows],
- disEraseThreadPool,targetStatus)
- disEraser.start()
- iRows += nRows
- disEraseThreadPool.join()
-
-
-_memoryCheck("finisher")
-
-# finisher thread
-class FinisherThr (threading.Thread):
- def __init__(self,lock,proxyLock,ids,pool,timeNow):
- threading.Thread.__init__(self)
- self.ids = ids
- self.lock = lock
- self.proxyLock = proxyLock
- self.pool = pool
- self.timeNow = timeNow
- self.pool.add(self)
-
- def run(self):
- self.lock.acquire()
- try:
- # get jobs from DB
- ids = self.ids
- self.proxyLock.acquire()
- jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False)
- self.proxyLock.release()
- upJobs = []
- finJobs = []
- for job in jobs:
- if job == None or job.jobStatus == 'unknown':
- continue
- # use BNL by default
- dq2URL = siteMapper.getSite('BNL_ATLAS_1').dq2url
- dq2SE = []
- # get LFC and SEs
- if job.prodSourceLabel == 'user' and not siteMapper.siteSpecList.has_key(job.destinationSE):
- # using --destSE for analysis job to transfer output
- try:
- dq2URL = dataservice.DDM.toa.getLocalCatalog(job.destinationSE)[-1]
- match = re.search('.+://([^:/]+):*\d*/*',dataservice.DDM.toa.getSiteProperty(job.destinationSE,'srm')[-1])
- if match != None:
- dq2SE.append(match.group(1))
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("%s Failed to get DQ2/SE with %s %s" % (job.PandaID,type,value))
- continue
- elif siteMapper.checkCloud(job.cloud):
- # normal production jobs
- tmpDstID = siteMapper.getCloud(job.cloud)['dest']
- tmpDstSite = siteMapper.getSite(tmpDstID)
- if not tmpDstSite.lfchost in [None,'']:
- # LFC
- dq2URL = 'lfc://'+tmpDstSite.lfchost+':/grid/atlas/'
- if tmpDstSite.se != None:
- for tmpDstSiteSE in tmpDstSite.se.split(','):
- match = re.search('.+://([^:/]+):*\d*/*',tmpDstSiteSE)
- if match != None:
- dq2SE.append(match.group(1))
- else:
- # LRC
- dq2URL = tmpDstSite.dq2url
- dq2SE = []
- # get LFN list
- lfns = []
- guids = []
- nTokens = 0
- for file in job.Files:
- # only output files are checked
- if file.type == 'output' or file.type == 'log':
- lfns.append(file.lfn)
- guids.append(file.GUID)
- nTokens += len(file.destinationDBlockToken.split(','))
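- # each output file may target multiple destination tokens; the job is finished only
- # when at least as many PFNs are found (nOkTokens) as tokens expected (nTokens)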
- # get files in LRC
- _logger.debug("%s Cloud:%s DQ2URL:%s" % (job.PandaID,job.cloud,dq2URL))
- okFiles = brokerage.broker_util.getFilesFromLRC(lfns,dq2URL,guids,dq2SE,getPFN=True)
- # count files
- nOkTokens = 0
- for okLFN,okPFNs in okFiles.iteritems():
- nOkTokens += len(okPFNs)
- # check all files are ready
- _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID,nTokens,nOkTokens))
- if nTokens <= nOkTokens:
- _logger.debug("%s Finisher : Finish" % job.PandaID)
- for file in job.Files:
- if file.type == 'output' or file.type == 'log':
- file.status = 'ready'
- # append to run Finisher
- finJobs.append(job)
- else:
- endTime = job.endTime
- if endTime == 'NULL':
- endTime = job.startTime
- # priority-dependent timeout (in days)
- tmpCloudSpec = siteMapper.getCloud(job.cloud)
- if job.currentPriority >= 800 and (not job.prodSourceLabel in ['user']):
- if tmpCloudSpec.has_key('transtimehi'):
- timeOutValue = tmpCloudSpec['transtimehi']
- else:
- timeOutValue = 1
- else:
- if tmpCloudSpec.has_key('transtimelo'):
- timeOutValue = tmpCloudSpec['transtimelo']
- else:
- timeOutValue = 2
- # protection
- if timeOutValue < 1:
- timeOutValue = 1
- timeOut = self.timeNow - datetime.timedelta(days=timeOutValue)
- _logger.debug("%s Priority:%s Limit:%s End:%s" % (job.PandaID,job.currentPriority,str(timeOut),str(endTime)))
- if endTime < timeOut:
- # timeout
- _logger.debug("%s Finisher : Kill" % job.PandaID)
- strMiss = ''
- for lfn in lfns:
- if not lfn in okFiles:
- strMiss += ' %s' % lfn
- job.jobStatus = 'failed'
- job.taskBufferErrorCode = taskbuffer.ErrorCode.EC_Transfer
- job.taskBufferErrorDiag = 'transfer timeout for '+strMiss
- guidMap = {}
- for file in job.Files:
- # set file status
- if file.status == 'transferring':
- file.status = 'failed'
- # collect GUIDs to delete files from _tid datasets
- if file.type == 'output' or file.type == 'log':
- if not guidMap.has_key(file.destinationDBlock):
- guidMap[file.destinationDBlock] = []
- guidMap[file.destinationDBlock].append(file.GUID)
- else:
- # wait
- _logger.debug("%s Finisher : Wait" % job.PandaID)
- for lfn in lfns:
- if not lfn in okFiles:
- _logger.debug("%s -> %s" % (job.PandaID,lfn))
- upJobs.append(job)
- # update
- _logger.debug("updating ...")
- self.proxyLock.acquire()
- taskBuffer.updateJobs(upJobs,False)
- self.proxyLock.release()
- # run Finisher
- for job in finJobs:
- fThr = Finisher(taskBuffer,None,job)
- fThr.start()
- fThr.join()
- _logger.debug("done")
- time.sleep(1)
- except:
- pass
- self.pool.remove(self)
- self.lock.release()
-
-# finish transferring jobs
-_logger.debug("==== finish transferring jobs ====")
-finisherLock = threading.Semaphore(3)
-finisherProxyLock = threading.Lock()
-finisherThreadPool = ThreadPool()
-for loopIdx in ['low','high']:
- timeNow = datetime.datetime.utcnow()
- if loopIdx == 'high':
- highPrioFlag = True
- else:
- highPrioFlag = False
- # get jobs
- for ii in range(1000):
- # lock
- finisherLock.acquire()
- finisherProxyLock.acquire()
- ret,res = taskBuffer.lockJobsForFinisher(timeNow,200,highPrioFlag)
- finisherProxyLock.release()
- finisherLock.release()
- if res == None:
- _logger.debug("# of jobs to be finished for %s : %s" % (loopIdx,res))
- else:
- _logger.debug("# of jobs to be finished for %s : %s" % (loopIdx,len(res)))
- if res == None or len(res) == 0:
- break
- # run thread
- finThr = FinisherThr(finisherLock,finisherProxyLock,res,finisherThreadPool,timeNow)
- finThr.start()
- # wait
- finisherThreadPool.join()
-
-
-_memoryCheck("end")
-
-_logger.debug("===================== end =====================")
diff --git a/current/pandaserver/test/deleteJobs.py b/current/pandaserver/test/deleteJobs.py
deleted file mode 100755
index 18195c27c..000000000
--- a/current/pandaserver/test/deleteJobs.py
+++ /dev/null
@@ -1,175 +0,0 @@
-import os
-import re
-import sys
-import time
-import fcntl
-import types
-import shelve
-import random
-import datetime
-import commands
-import threading
-import userinterface.Client as Client
-from dataservice.DDM import ddm
-from dataservice.DDM import dashBorad
-from taskbuffer.OraDBProxy import DBProxy
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-from jobdispatcher.Watcher import Watcher
-from brokerage.SiteMapper import SiteMapper
-from dataservice.Adder import Adder
-from dataservice.Finisher import Finisher
-from dataservice.MailUtils import MailUtils
-from taskbuffer import ProcessGroups
-import brokerage.broker_util
-import brokerage.broker
-import taskbuffer.ErrorCode
-import dataservice.DDM
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# logger
-_logger = PandaLogger().getLogger('deleteJobs')
-
-_logger.debug("===================== start =====================")
-
-# memory checker
-def _memoryCheck(str):
- try:
- proc_status = '/proc/%d/status' % os.getpid()
- procfile = open(proc_status)
- name = ""
- vmSize = ""
- vmRSS = ""
- # extract Name,VmSize,VmRSS
- for line in procfile:
- if line.startswith("Name:"):
- name = line.split()[-1]
- continue
- if line.startswith("VmSize:"):
- vmSize = ""
- for item in line.split()[1:]:
- vmSize += item
- continue
- if line.startswith("VmRSS:"):
- vmRSS = ""
- for item in line.split()[1:]:
- vmRSS += item
- continue
- procfile.close()
- _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str))
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("memoryCheck() : %s %s" % (type,value))
- _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str))
- return
-
-_memoryCheck("start")
-
-# kill old process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName)
- for line in out.split('\n'):
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill process : %s %s" % (type,value))
-
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-# instantiate sitemapper
-siteMapper = SiteMapper(taskBuffer)
-
-
-# table names
-jobATableName = "ATLAS_PANDAARCH.jobsArchived"
-filesATableName = "ATLAS_PANDAARCH.filesTable_ARCH"
-paramATableName = "ATLAS_PANDAARCH.jobParamsTable_ARCH"
-metaATableName = "ATLAS_PANDAARCH.metaTable_ARCH"
-
-# time limit
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=3)
-
-# delete
-_logger.debug("get PandaIDs for Delete")
-sql = "SELECT COUNT(*) FROM ATLAS_PANDA.jobsArchived4 WHERE modificationTime<:modificationTime"
-varMap = {}
-varMap[':modificationTime'] = timeLimit
-status,res = taskBuffer.querySQLS(sql,varMap)
-if res != None:
- tmpTotal = res[0][0]
-else:
- tmpTotal = None
-maxBunch = 1000
-nBunch = 500
-tmpIndex = 0
-while True:
- sql = "SELECT PandaID,modificationTime FROM ATLAS_PANDA.jobsArchived4 "
- sql += "WHERE modificationTime<:modificationTime AND archivedFlag=:archivedFlag AND rownum<=:rowRange"
- varMap = {}
- varMap[':modificationTime'] = timeLimit
- varMap[':archivedFlag'] = 1
- varMap[':rowRange'] = maxBunch
- status,res = taskBuffer.querySQLS(sql,varMap)
- if res == None:
- _logger.error("failed to get PandaIDs to be deleted")
- break
- else:
- _logger.debug("got %s for deletion" % len(res))
- if len(res) == 0:
- _logger.debug("no jobs left for for deletion")
- break
- else:
- maxBunch = len(res)
- random.shuffle(res)
- res = res[:nBunch]
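- # (shuffling and taking a random subset presumably reduces contention when several instances run in parallel)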
- # loop over all jobs
- for (id,srcEndTime) in res:
- tmpIndex += 1
- try:
- # check
- sql = "SELECT PandaID from %s WHERE PandaID=:PandaID" % jobATableName
- varMap = {}
- varMap[':PandaID'] = id
- status,check = taskBuffer.querySQLS(sql,varMap)
- if check == None or len(check) == 0:
- # no record in ArchivedDB
- _logger.error("No backup for %s" % id)
- else:
- # delete
- _logger.debug("DEL %s : endTime %s" % (id,srcEndTime))
- proxyS = taskBuffer.proxyPool.getProxy()
- proxyS.deleteJobSimple(id)
- taskBuffer.proxyPool.putProxy(proxyS)
- if tmpIndex % 1000 == 1:
- _logger.debug(" deleted %s/%s" % (tmpIndex,tmpTotal))
- except:
- pass
- # terminate when the last query returned fewer than nBunch rows (no more backlog)
- if maxBunch < nBunch:
- break
-_logger.debug("===================== end =====================")
diff --git a/current/pandaserver/test/directSubmit.py b/current/pandaserver/test/directSubmit.py
deleted file mode 100755
index 81c96e953..000000000
--- a/current/pandaserver/test/directSubmit.py
+++ /dev/null
@@ -1,163 +0,0 @@
-import re
-import sys
-import time
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv) != 2:
- print "task file is missing"
- sys.exit(0)
-
-# open task file
-taskFile = open(sys.argv[1])
-
-# read common parameters
-line = taskFile.readline()
-items = line.split()
-
-# common parameters
-taskID = items[0]
-inTaskName = items[1]
-taskName = items[2]
-formats = items[3].split('.')
-lparams = items[4].split(',')
-vparams = items[5].split(',')
-trf = items[7]
-trfVer = items[8]
-grid = items[10]
-priority = items[11]
-totalJob = items[14]
-cpu = items[15]
-memory = items[16]
-
-
-# input dataset
-iDataset = 'NULL'
-m = re.search('(.+)\.([^\.]+)\.([^\.]+)$',inTaskName)
-if m != None:
- step = m.group(2)
- if step == 'evgen':
- format = 'EVENT'
- elif step == 'digit':
- format = 'RDO'
- else:
- format = 'AOD'
- #### FIXME : _tidXXXX is missing
- iDataset = '%s.%s.%s.%s' % (m.group(1),step,format,m.group(3))
-
-
-# output datasets
-m = re.search('(.+)\.([^\.]+)\.([^\.]+)$',taskName)
-oDatasets = []
-for format in formats:
- step = m.group(2)
- if format=='HITS':
- step = 'simul'
- # append
- oDatasets.append('%s.%s.%s.%s_tid%06d' % (m.group(1),step,format,m.group(3),int(taskID)))
-
-# log dataset
-lDataset = '%s.%s.%s.%s_tid%06d' % (m.group(1),m.group(2),'log',m.group(3),int(taskID))
-
-
-# instantiate JobSpecs
-iJob = 0
-jobList = []
-for line in taskFile:
- iJob += 1
- job = JobSpec()
- # job ID ###### FIXME
- job.jobDefinitionID = int(time.time()) % 10000
- # job name
- job.jobName = "%s_%05d.job" % (taskName,iJob)
- # AtlasRelease
- if len(re.findall('\.',trfVer)) > 2:
- match = re.search('^(\d+\.\d+\.\d+)',trfVer)
- job.AtlasRelease = 'Atlas-%s' % match.group(1)
- else:
- job.AtlasRelease = 'Atlas-%s' % trfVer
- # homepackage
- vers = trfVer.split('.')
- if int(vers[0]) <= 11:
- job.homepackage = 'JobTransforms'
- for ver in vers:
- job.homepackage += "-%02d" % int(ver)
- else:
- job.homepackage = 'AtlasProduction/%s' % trfVer
- # trf
- job.transformation = trf
- job.destinationDBlock = oDatasets[0]
- # prod DBlock
- job.prodDBlock = iDataset
- # source label
- job.prodSeriesLabel = 'pandatest'
- job.prodSourceLabel = 'managed'
- # priority
- job.assignedPriority = priority
- job.currentPriority = priority
- # CPU, memory,disk ### FIXME
-
- # attempt number ### FIXME
-
- # input files
- if iDataset != 'NULL':
- # remove _tidXXX
- pat = re.sub('_tid\d+$','',iDataset)
- # search
- m = re.search('('+pat+'\S+)',line)
- if m != None:
- file = FileSpec()
- file.lfn = m.group(1)
- file.type = 'input'
- file.dataset = iDataset
- file.prodDBlock = iDataset
- job.addFile(file)
- # DB release
- for i,lpar in enumerate(lparams):
- if lpar == 'DBRelease':
- file = FileSpec()
- file.lfn = "%s-%s.tgz" % (lpar,vparams[i])
- file.type = 'input'
- file.dataset = iDataset
- file.prodDBlock = iDataset
- job.addFile(file)
- break
- # output files
- for oDataset in oDatasets:
- # remove _tidXXX
- pat = re.sub('_tid\d+$','',oDataset)
- # search
- m = re.search('('+pat+'\S+)',line)
- if m != None:
- file = FileSpec()
- file.lfn = m.group(1)
- file.type = 'output'
- file.dataset = oDataset
- file.destinationDBlock = oDataset
- job.addFile(file)
- # log
- file = FileSpec()
- file.lfn = "%s._%05d.log.tgz" % (lDataset,iJob)
- file.type = 'log'
- file.dataset = lDataset
- file.destinationDBlock = lDataset
- job.addFile(file)
-
- # job par
- job.jobParameters = line[:-1]
-
- """
- print job.values()
- for file in job.Files:
- print file.values()
- sys.exit(0)
- """
- jobList.append(job)
-
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/distributeDefJobs.py b/current/pandaserver/test/distributeDefJobs.py
deleted file mode 100755
index c1cee20a2..000000000
--- a/current/pandaserver/test/distributeDefJobs.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import datetime
-from taskbuffer.DBProxy import DBProxy
-import userinterface.Client as Client
-import jobscheduler.Site
-import random
-import time
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# time limit
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
-
-# instantiate DB proxies
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-# get PandaIDs from jobsDefined
-res = proxyS.querySQL("SELECT PandaID,modificationTime from jobsDefined4 ORDER BY modificationTime")
-
-# list of known sites
-tmpSites = jobscheduler.Site.KnownSite.getAllSitesID()
-allSites = []
-for site in tmpSites:
- # _allSites may contain NULL after sort()
- if site == 'NULL':
- continue
- # ignore test sites
- if site.endswith('test') or site.endswith('Test'):
- continue
- # append
- allSites.append(site)
-
-# reassign jobs
-jobs=[]
-for (id,modTime) in res:
- if modTime < timeLimit:
- jobs.append(id)
-
-# reassign
-if len(jobs):
- nJob = 20
- iJob = 0
- while iJob < len(jobs):
- print 'reassignJobs(%s)' % jobs[iJob:iJob+nJob]
- index = random.randint(1,len(allSites))
- site = allSites[int(index)-1]
- print 'site=%s' % site
- Client.reassignJobs(jobs[iJob:iJob+nJob],site)
- iJob += nJob
- time.sleep(10)
-
diff --git a/current/pandaserver/test/dq2cr.py b/current/pandaserver/test/dq2cr.py
deleted file mode 100755
index b28b0ccea..000000000
--- a/current/pandaserver/test/dq2cr.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = 'BNL_SE'
-
-jobList = []
-
-for i in range(1):
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i)
- job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/run_dq2_cr'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 100000
- #job.prodSourceLabel = 'test'
- job.prodSourceLabel = 'user'
- job.computingSite = site
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="8072 0 5000 1 DC3.008072.JimmyPhotonJet1.py NONE NONE NONE"
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/emailfix.py b/current/pandaserver/test/emailfix.py
deleted file mode 100755
index a39bd3bc4..000000000
--- a/current/pandaserver/test/emailfix.py
+++ /dev/null
@@ -1,16 +0,0 @@
-'''
-notifier
-
-'''
-
-import shelve
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# open DB
-pDB = shelve.open(panda_config.emailDB)
-
-
-
-
diff --git a/current/pandaserver/test/evpPD2P.py b/current/pandaserver/test/evpPD2P.py
deleted file mode 100644
index 27cb721f8..000000000
--- a/current/pandaserver/test/evpPD2P.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import re
-import sys
-import glob
-import time
-import os.path
-import commands
-import datetime
-import threading
-from config import panda_config
-from taskbuffer.TaskBuffer import taskBuffer
-from brokerage import SiteMapper
-from dataservice.EventPicker import EventPicker
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('evpPD2P')
-
-_logger.debug("===================== start =====================")
-
-# overall timeout value
-overallTimeout = 60
-# prefix of evp files
-prefixEVP = 'evp.'
-# file pattern of evp files
-evpFilePatt = panda_config.cache_dir + '/' + prefixEVP + '*'
-
-# kill old process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName)
- for line in out.split('\n'):
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill process : %s %s" % (type,value))
-
-# instantiate PD2P
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-siteMapper = SiteMapper.SiteMapper(taskBuffer)
-
-
-# thread pool
-class ThreadPool:
- def __init__(self):
- self.lock = threading.Lock()
- self.list = []
-
- def add(self,obj):
- self.lock.acquire()
- self.list.append(obj)
- self.lock.release()
-
- def remove(self,obj):
- self.lock.acquire()
- self.list.remove(obj)
- self.lock.release()
-
- def join(self):
- self.lock.acquire()
- thrlist = tuple(self.list)
- self.lock.release()
- for thr in thrlist:
- thr.join()
-
-
-# thread to ev-pd2p
-class EvpThr (threading.Thread):
- def __init__(self,lock,pool,aTaskBuffer,aSiteMapper,fileName,ignoreError):
- threading.Thread.__init__(self)
- self.lock = lock
- self.pool = pool
- self.fileName = fileName
- self.evp = EventPicker(aTaskBuffer,aSiteMapper,fileName,ignoreError)
- self.pool.add(self)
-
- def run(self):
- self.lock.acquire()
- retRun = self.evp.run()
- _logger.debug("%s : %s" % (retRun,self.fileName))
- self.pool.remove(self)
- self.lock.release()
-
-
-# get files
-_logger.debug("EVP session")
-timeNow = datetime.datetime.utcnow()
-timeInt = datetime.datetime.utcnow()
-fileList = glob.glob(evpFilePatt)
-fileList.sort()
-
-# create thread pool and semaphore
-adderLock = threading.Semaphore(3)
-adderThreadPool = ThreadPool()
-
-# add
-while len(fileList) != 0:
- # time limit to avoid too many copyArchive processes running at the same time
- if (datetime.datetime.utcnow() - timeNow) > datetime.timedelta(minutes=overallTimeout):
- _logger.debug("time over in EVP session")
- break
- # try to get Semaphore
- adderLock.acquire()
- # get fileList
- if (datetime.datetime.utcnow() - timeInt) > datetime.timedelta(minutes=15):
- timeInt = datetime.datetime.utcnow()
- # get file
- fileList = glob.glob(evpFilePatt)
- fileList.sort()
- # choose a file
- fileName = fileList.pop(0)
- # release lock
- adderLock.release()
- if not os.path.exists(fileName):
- continue
- try:
- modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(fileName))[:6]))
- if (timeNow - modTime) > datetime.timedelta(hours=24):
- # last chance
- _logger.debug("Last event picking : %s" % fileName)
- thr = EvpThr(adderLock,adderThreadPool,taskBuffer,siteMapper,fileName,False)
- thr.start()
- elif (timeInt - modTime) > datetime.timedelta(minutes=1):
- # try
- _logger.debug("event picking : %s" % fileName)
- thr = EvpThr(adderLock,adderThreadPool,taskBuffer,siteMapper,fileName,True)
- thr.start()
- else:
- _logger.debug("%s : %s" % ((timeInt - modTime),fileName))
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("%s %s" % (errType,errValue))
-
-# join all threads
-adderThreadPool.join()
-
-_logger.debug("===================== end =====================")
-
diff --git a/current/pandaserver/test/execute.py b/current/pandaserver/test/execute.py
deleted file mode 100755
index 8cc2f2429..000000000
--- a/current/pandaserver/test/execute.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = 'BNL_ATLAS_2'
-
-jobList = []
-for i in range(20):
-
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = commands.getoutput('uuidgen')
- job.AtlasRelease = 'Atlas-11.0.41'
- #job.AtlasRelease = 'Atlas-11.0.3'
- job.homepackage = 'AnalysisTransforms'
- job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/runAthena'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 100
- job.prodSourceLabel = 'user'
- job.computingSite = site
- #job.prodDBlock = "pandatest.b1599dfa-cd36-4fc5-92f6-495781a94c66"
- job.prodDBlock = "pandatest.f228b051-077b-4f81-90bf-496340644379"
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = "lib.f228b051-077b-4f81-90bf-496340644379.tgz"
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen')
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- fileOZ = FileSpec()
- fileOZ.lfn = "%s.pool.root" % commands.getoutput('uuidgen')
- fileOZ.destinationDBlock = job.destinationDBlock
- fileOZ.destinationSE = job.destinationSE
- fileOZ.dataset = job.destinationDBlock
- fileOZ.type = 'output'
- job.addFile(fileOZ)
-
- job.jobParameters="""-l %s -r PhysicsAnalysis/AnalysisCommon/UserAnalysis/UserAnalysis-00-05-11/run -j " jobOptions.pythia.py" -i "[]" -o "{'Stream1': '%s'}" """ % (fileI.lfn,fileOZ.lfn)
-
- jobList.append(job)
-
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/fileCallbackListener.py b/current/pandaserver/test/fileCallbackListener.py
deleted file mode 100644
index bad0c76cd..000000000
--- a/current/pandaserver/test/fileCallbackListener.py
+++ /dev/null
@@ -1,253 +0,0 @@
-import os
-import re
-import sys
-import time
-import signal
-import socket
-import commands
-import optparse
-import datetime
-import cPickle as pickle
-
-from dq2.common import log as logging
-from dq2.common import stomp
-from config import panda_config
-from brokerage.SiteMapper import SiteMapper
-from dataservice.Finisher import Finisher
-
-# logger
-from pandalogger.PandaLogger import PandaLogger
-_logger = PandaLogger().getLogger('fileCallbackListener')
-
-# keep PID
-pidFile = '%s/file_callback_listener.pid' % panda_config.logdir
-
-# overall timeout value
-overallTimeout = 60 * 59
-
-# expiration time
-expirationTime = datetime.datetime.utcnow() + datetime.timedelta(minutes=overallTimeout)
-
-
-# kill whole process
-def catch_sig(sig, frame):
- try:
- os.remove(pidFile)
- except:
- pass
- # kill
- _logger.debug('terminating ...')
- commands.getoutput('kill -9 -- -%s' % os.getpgrp())
- # exit
- sys.exit(0)
-
-
-# callback listener
-class FileCallbackListener(stomp.ConnectionListener):
-
- def __init__(self,conn,tb,sm):
- # connection
- self.conn = conn
- # task buffer
- self.taskBuffer = tb
- # site mapper
- self.siteMapper = sm
-
-
- def on_error(self,headers,body):
- _logger.error("on_error : %s" % headers['message'])
-
-
- def on_disconnected(self,headers,body):
- _logger.error("on_disconnected : %s" % headers['message'])
-
-
- def on_message(self, headers, message):
- try:
- lfn = 'UNKNOWN'
- # send ack
- id = headers['message-id']
- self.conn.ack({'message-id':id})
- # check message type
- messageType = headers['cbtype']
- if not messageType in ['FileDoneMessage']:
- _logger.debug('%s skip' % messageType)
- return
- _logger.debug('%s start' % messageType)
- # re-construct message
- messageObj = pickle.loads(message)
- evtTime = datetime.datetime.utcfromtimestamp(messageObj.getItem('eventTime'))
- lfn = messageObj.getItem('lfn')
- guid = messageObj.getItem('guid')
- ddmSite = messageObj.getItem('site')
- _logger.debug('%s site=%s type=%s time=%s' % \
- (lfn,ddmSite,messageType,evtTime.strftime('%Y-%m-%d %H:%M:%S')))
- # ignore non production files
- flagNgPrefix = False
- for ngPrefix in ['user','step']:
- if lfn.startswith(ngPrefix):
- flagNgPrefix = True
- break
- if flagNgPrefix:
- _logger.debug('%s skip' % lfn)
- return
- # get datasets associated with the file only for high priority jobs
- dsNameMap = self.taskBuffer.getDatasetWithFile(lfn,800)
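- # (800 appears to be the currentPriority threshold used to define 'high priority' here)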
- _logger.debug('%s ds=%s' % (lfn,str(dsNameMap)))
- # loop over all datasets
- for dsName,dsData in dsNameMap.iteritems():
- pandaSite,dsToken = dsData
- # skip multiple destinations since each file doesn't have
- # transferStatus
- if not dsToken in ['',None] and ',' in dsToken:
- _logger.debug('%s ignore ds=%s token=%s' % (lfn,dsName,dsToken))
- continue
- # check site
- tmpSiteSpec = self.siteMapper.getSite(pandaSite)
- if tmpSiteSpec.setokens.has_key(dsToken):
- pandaSiteDdmID = tmpSiteSpec.setokens[dsToken]
- else:
- pandaSiteDdmID = tmpSiteSpec.ddm
- if pandaSiteDdmID != ddmSite:
- _logger.debug('%s ignore ds=%s site=%s:%s <> %s' % \
- (lfn,dsName,pandaSite,pandaSiteDdmID,ddmSite))
- continue
- # update file
- forInput = None
- if re.search('_dis\d+$',dsName) != None:
- # dispatch datasets
- forInput = True
- ids = self.taskBuffer.updateInFilesReturnPandaIDs(dsName,'ready',lfn)
- elif re.search('_sub\d+$',dsName) != None:
- # sub datasets
- forInput = False
- ids = self.taskBuffer.updateOutFilesReturnPandaIDs(dsName,lfn)
- _logger.debug('%s ds=%s ids=%s' % (lfn,dsName,str(ids)))
- # loop over all PandaIDs
- if forInput != None and len(ids) != 0:
- # remove None and unknown
- targetIDs = []
- for tmpID in ids:
- # count the number of pending files
- nPending = self.taskBuffer.countPendingFiles(tmpID,forInput)
- _logger.debug('%s PandaID=%s nPen=%s' % (lfn,tmpID,nPending))
- if nPending != 0:
- continue
- targetIDs.append(tmpID)
- # get jobs
- targetJobs = []
- if targetIDs != []:
- if forInput:
- jobs = self.taskBuffer.peekJobs(targetIDs,fromActive=False,fromArchived=False,
- fromWaiting=False)
- else:
- jobs = self.taskBuffer.peekJobs(targetIDs,fromDefined=False,fromArchived=False,
- fromWaiting=False)
- for tmpJob in jobs:
- if tmpJob == None or tmpJob.jobStatus == 'unknown':
- continue
- targetJobs.append(tmpJob)
- # trigger subsequent processes
- if targetJobs == []:
- _logger.debug('%s no jobs to be triggered for subsequent processes' % lfn)
- else:
- if forInput:
- # activate
- _logger.debug('%s activate %s' % (lfn,str(targetIDs)))
- self.taskBuffer.activateJobs(targetJobs)
- else:
- # finish
- _logger.debug('%s finish %s' % (lfn,str(targetIDs)))
- for tmpJob in targetJobs:
- fThr = Finisher(self.taskBuffer,None,tmpJob)
- fThr.start()
- fThr.join()
- _logger.debug('%s done' % lfn)
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("on_message : %s %s %s" % (lfn,errtype,errvalue))
-
-
-# main
-def main(backGround=False):
- _logger.debug('starting ...')
- # register signal handler
- signal.signal(signal.SIGINT, catch_sig)
- signal.signal(signal.SIGHUP, catch_sig)
- signal.signal(signal.SIGTERM,catch_sig)
- signal.signal(signal.SIGALRM,catch_sig)
- signal.alarm(overallTimeout)
- # forking
- pid = os.fork()
- if pid != 0:
- # watch child process
- os.wait()
- time.sleep(1)
- else:
- # main loop
- from taskbuffer.TaskBuffer import taskBuffer
- # initialize cx_Oracle using dummy connection
- from taskbuffer.Initializer import initializer
- initializer.init()
- # instantiate TB
- taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
- # instantiate sitemapper
- siteMapper = SiteMapper(taskBuffer)
- # ActiveMQ params
- clientid = 'PANDA-' + socket.getfqdn()
- queue = '/queue/Consumer.PANDA.atlas.ddm.siteservices'
- ssl_opts = {'use_ssl' : True,
- 'ssl_cert_file' : '/data/atlpan/pandasv1_usercert.pem',
- 'ssl_key_file' : '/data/atlpan/pandasv1_userkey.pem'}
- # resolve multiple brokers
- brokerList = socket.gethostbyname_ex('atlasddm-mb.cern.ch')[-1]
- # set listener
- for tmpBroker in brokerList:
- try:
- _logger.debug('setting listener on %s' % tmpBroker)
- conn = stomp.Connection(host_and_ports = [(tmpBroker, 6162)], **ssl_opts)
- conn.set_listener('FileCallbackListener', FileCallbackListener(conn,taskBuffer,siteMapper))
- conn.start()
- conn.connect(headers = {'client-id': clientid})
- conn.subscribe(destination=queue, ack='client-individual')
- #,headers = {'selector':"cbtype='FileDoneMessage'"})
- if not conn.is_connected():
- _logger.error("connection failure to %s" % tmpBroker)
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("failed to set listener on %s : %s %s" % (tmpBroker,errtype,errvalue))
- catch_sig(None,None)
-
-# entry
-if __name__ == "__main__":
- optP = optparse.OptionParser(conflict_handler="resolve")
- options,args = optP.parse_args()
- try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(seconds=overallTimeout-180)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName)
- for line in out.split('\n'):
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("kill process : %s %s" % (errtype,errvalue))
- # main loop
- main()
diff --git a/current/pandaserver/test/fileClean.py b/current/pandaserver/test/fileClean.py
deleted file mode 100755
index edef84ea5..000000000
--- a/current/pandaserver/test/fileClean.py
+++ /dev/null
@@ -1,145 +0,0 @@
-import re
-import sys
-import datetime
-from taskbuffer.DBProxy import DBProxy
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# table names
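- # archive tables appear to be partitioned in two-month blocks named by the even month
- # (suffixes like '_Feb2012'); derive the current and previous suffixes accordingly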
-cdate = datetime.datetime.utcnow()
-if cdate.month==1:
- cdate = cdate.replace(year = (cdate.year-1))
- cdate = cdate.replace(month = 12, day = 1)
-else:
- cdate = cdate.replace(month = (cdate.month/2)*2, day = 1)
-currentSuffix = "_%s%s" % (cdate.strftime('%b'),cdate.year)
-if cdate.month > 2:
- odate = cdate.replace(month = (cdate.month-2))
-else:
- odate = cdate.replace(year = (cdate.year-1), month = 12)
-previousSuffix = "_%s%s" % (odate.strftime('%b'),odate.year)
-
-# instantiate DB proxies
-proxyS = DBProxy()
-proxyN = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-proxyN.connect(panda_config.logdbhost,panda_config.logdbpasswd,panda_config.logdbuser,'PandaArchiveDB')
-
-# get tables
-fileTables = []
-jobsTables = {}
-status,res = proxyN.querySQLS("show tables")
-if res != None:
- for table, in res:
- if table.startswith('filesTable'):
- fileTables.append(table)
- if table.startswith('jobsArchived'):
- # get MAX PandaID
- statusJ,resJ = proxyN.querySQLS("SELECT MAX(PandaID) FROM %s" % table)
- jobsTables[table] = resJ[0][0]
-
-# for the cumulative tables
-cumulativeSuffix = '4_current'
-cumulativePandaID = jobsTables['jobsArchived%s' % cumulativeSuffix]
-
-# create a map between MAX PandaID and suffix
-suffixMap = {}
-for table,maxPandaID in jobsTables.iteritems():
- # get suffix
- match = re.search('(\d??_.+)$',table)
- suffix = match.group(1)
- # special treatment is required for the cumulative tables
- if suffix == cumulativeSuffix:
- continue
- # name of corresponding file table
- name = "filesTable%s" % suffix
- if not name in fileTables:
- print "%s is not found" % name
- sys.exit(0)
- # check duplication
- if suffixMap.has_key(maxPandaID):
- print "%s is already used by %s" % (maxPandaID,suffixMap[maxPandaID])
- sys.exit(0)
- # append
- suffixMap[maxPandaID] = suffix
-
-# print the cumulative
-print "%8d %s" % (cumulativePandaID,cumulativeSuffix)
-# sort by max PandaID
-suffixKeys = suffixMap.keys()
-suffixKeys.sort()
-for key in suffixKeys:
- print "%8d %s" % (key,suffixMap[key])
-
-# get files
-minPandaID = -1
-sql = "SELECT PandaID FROM filesTable4 WHERE PandaID > %s GROUP BY PandaID ORDER BY PandaID LIMIT 100"
-#while True:
-for i in range(5):
- status,res = proxyS.querySQLS(sql % minPandaID)
-    # no more jobs
- if len(res) == 0:
- break
- # set min
- minPandaID = res[-1][0]
- # loop over all PandaIDs
- for id, in res:
- # look for corresponding table
- tableSuffix = ''
- if id < cumulativePandaID:
- # use the cumulative
- tableSuffix = cumulativeSuffix
- else:
- for key in suffixKeys:
- if id < key:
- tableSuffix = suffixMap[key]
- break
- # check suffix
- if tableSuffix in ['',currentSuffix,previousSuffix]:
- print "Terminated since fresh PandID=%s found for '%s'" % (id,tableSuffix)
- sys.exit(0)
- print "PandaID:%s Suffix:%s" % (id,tableSuffix)
- # get FileSpec
- sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames()
- sqlFile+= "WHERE PandaID=%s" % id
- statusF,resFs = proxyS.querySQLS(sqlFile)
- for resF in resFs:
- file = FileSpec()
- file.pack(resF)
- # create a dummy Job to set PandaID
- job = JobSpec()
- job.PandaID = id
- job.addFile(file)
- # file table
- fileTable = 'filesTable%s' % tableSuffix
- # check
- sqlFileCheck = "SELECT PandaID FROM %s WHERE rowID=%s" % (fileTable,file.rowID)
- statusC,resC = proxyN.querySQLS(sqlFileCheck)
- if len(resC) != 0:
- if resC[0][0] != id:
- print "PandaID mismatch PandaArchive:%s PandaDB:%s for rowID=%s" % \
- (resC[0][0],id,file.rowID)
- else:
- print "rowID=%s not found" % file.rowID
- """
- # construct SQL
- sqlFileIn = "INSERT INTO %s " % fileTable
- sqlFileIn+= "(%s) " % FileSpec.columnNames()
- sqlFileIn+= FileSpec.valuesExpression()
- try:
- proxyN.cur.execute("SET AUTOCOMMIT=1")
- ret = proxyN.cur.execute(sqlFileIn,file.values())
- res = proxyN.cur.fetchall()
- # commit
- if not proxyN._commit():
- raise RuntimeError, 'Commit error'
- except:
- type, value, traceBack = sys.exc_info()
- print "insert error : %s %s" % (type,value)
- # roll back
- proxyN._rollback()
- """
diff --git a/current/pandaserver/test/finishJob.py b/current/pandaserver/test/finishJob.py
deleted file mode 100755
index 559bd61c3..000000000
--- a/current/pandaserver/test/finishJob.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import os
-import re
-import sys
-import urllib2,urllib
-
-import userinterface.Client as Client
-from userinterface.Client import baseURLSSL
-
-import httplib
-import commands
-
-id = sys.argv[1]
-s,o = Client.getJobStatus([id])
-
-if s != 0:
- print "failed to get job with:%s" % s
- sys.exit(0)
-
-job = o[0]
-
-if job == None:
- print "got None"
- sys.exit(0)
-
-xml = """
-
-
-
-"""
-
-for file in job.Files:
- if file.type in ['output','log']:
- xml += """
-
-
-
-
-
-
-
- """ % (commands.getoutput('uuidgen'),file.lfn,file.lfn)
-
-xml += """
-
-"""
-
-node={}
-node['jobId']=id
-node['state']='finished'
-node['metaData']='finished'
-#node['state']='failed'
-#node['pilotErrorCode']=1200
-node['siteName']='BNL_ATLAS_test'
-
-node['xml']=xml
-url='%s/updateJob' % baseURLSSL
-
-match = re.search('[^:/]+://([^/]+)(/.+)',url)
-host = match.group(1)
-path = match.group(2)
-
-if os.environ.has_key('X509_USER_PROXY'):
- certKey = os.environ['X509_USER_PROXY']
-else:
- certKey = '/tmp/x509up_u%s' % os.getuid()
-
-rdata=urllib.urlencode(node)
-
-conn = httplib.HTTPSConnection(host,key_file=certKey,cert_file=certKey)
-conn.request('POST',path,rdata)
-resp = conn.getresponse()
-data = resp.read()
-
-print data
diff --git a/current/pandaserver/test/getJobs.py b/current/pandaserver/test/getJobs.py
deleted file mode 100755
index 10fd553eb..000000000
--- a/current/pandaserver/test/getJobs.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import sys
-import time
-import datetime
-import commands
-import threading
-import urllib2,urllib
-
-import httplib
-
-import re
-import os
-
-from userinterface.Client import baseURLSSL
-
-node={}
-node['siteName']=sys.argv[1]
-node['mem']=1000
-node['node']=commands.getoutput('hostname -f')
-#node['prodSourceLabel']='user'
-url='%s/getJob' % baseURLSSL
-
-match = re.search('[^:/]+://([^/]+)(/.+)',url)
-host = match.group(1)
-path = match.group(2)
-
-if os.environ.has_key('X509_USER_PROXY'):
- certKey = os.environ['X509_USER_PROXY']
-else:
- certKey = '/tmp/x509up_u%s' % os.getuid()
-
-rdata=urllib.urlencode(node)
-
-class Thr(threading.Thread):
- def __init__(self):
- threading.Thread.__init__(self)
-
- def run(self):
- print datetime.datetime.utcnow().isoformat(' ')
- conn = httplib.HTTPSConnection(host,key_file=certKey,cert_file=certKey)
- conn.request('POST',path,rdata)
- resp = conn.getresponse()
- data = resp.read()
- conn.close()
- print datetime.datetime.utcnow().isoformat(' ')
- import cgi
- print cgi.parse_qs(data)
-
-nThr = 1
-thrs = []
-for i in range(nThr):
- thrs.append(Thr())
-
-for thr in thrs:
- thr.start()
diff --git a/current/pandaserver/test/input.data b/current/pandaserver/test/input.data
deleted file mode 100755
index 08272e947..000000000
--- a/current/pandaserver/test/input.data
+++ /dev/null
@@ -1,2 +0,0 @@
-pandatest.000003.dd.input:pandatest.000003.dd.input._00047.junk
-pandatest.000003.dd.input:pandatest.000003.dd.input._00001.junk
diff --git a/current/pandaserver/test/installSW.py b/current/pandaserver/test/installSW.py
deleted file mode 100755
index 1dbb349bf..000000000
--- a/current/pandaserver/test/installSW.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-# extract pacball and site
-argStr = ""
-pacball = None
-pacFlag = False
-siteName = None
-siteFlag = False
-for arg in sys.argv[1:]:
- if arg == '--pacball':
- pacFlag = True
- continue
- if pacFlag:
- pacball = arg
- pacFlag = False
- continue
- if arg == '--sitename':
- siteFlag = True
- continue
- if siteFlag:
- siteName = arg
- siteFlag = False
- continue
- argStr += "%s " % arg
-
-# check site
-if siteName == None:
- print "ERROR : --sitename needs to be specified"
- sys.exit(1)
-# append sitename
-argStr += "--sitename %s " % siteName
-
-# check pacball format
-if pacball != None and pacball.find(':') != -1:
- pacDS = pacball.split(':')[0]
- pacFile = pacball.split(':')[-1]
-else:
- pacDS = None
- pacFile = pacball
-
-# append pacball to arg
-if pacFile != None:
- argStr += "--pacball %s " % pacFile
-
-job = JobSpec()
-job.jobDefinitionID = int(time.time()) % 10000
-job.jobName = "%s_%s" % (siteName,commands.getoutput('uuidgen'))
-job.transformation = 'http://www.usatlas.bnl.gov/svn/panda/apps/sw/installAtlasSW'
-job.destinationDBlock = 'panda.%s' % job.jobName
-job.currentPriority = 10000
-job.prodSourceLabel = 'software'
-job.computingSite = siteName
-job.cloud = 'US'
-
-fileOL = FileSpec()
-fileOL.lfn = "%s.job.log.tgz" % job.jobName
-fileOL.destinationDBlock = job.destinationDBlock
-fileOL.dataset = job.destinationDBlock
-fileOL.type = 'log'
-job.addFile(fileOL)
-
-# pacball
-if pacDS != None:
- job.prodDBlock = pacDS
- fileP = FileSpec()
- fileP.dataset = pacDS
- fileP.prodDBlock = pacDS
- fileP.lfn = pacFile
- fileP.type = 'input'
- job.addFile(fileP)
-
-job.jobParameters = argStr
-
-s,o = Client.submitJobs([job])
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/killDefJobs.py b/current/pandaserver/test/killDefJobs.py
deleted file mode 100755
index a646ea202..000000000
--- a/current/pandaserver/test/killDefJobs.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import datetime
-from taskbuffer.DBProxy import DBProxy
-import userinterface.Client as Client
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# time limit
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=1)
-
-# instantiate DB proxies
-proxyS = DBProxy()
-proxyS.connect('adbpro.usatlas.bnl.gov',passwd,'panda-developer','PandaDevDB')
-
-# get PandaIDs from jobsDefined
-res = proxyS.querySQL("SELECT PandaID,modificationTime from jobsDefined4 ORDER BY modificationTime")
-
-# kill old jobs
-jobs=[]
-for (id,modTime) in res:
- if modTime < timeLimit:
- jobs.append(id)
-
-Client.killJobs(jobs)
-
diff --git a/current/pandaserver/test/killJob.py b/current/pandaserver/test/killJob.py
deleted file mode 100755
index 0238f2e79..000000000
--- a/current/pandaserver/test/killJob.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import sys
-import optparse
-import userinterface.Client as Client
-
-optP = optparse.OptionParser(conflict_handler="resolve")
-optP.add_option('-9',action='store_const',const=True,dest='forceKill',
- default=False,help='kill jobs before next heartbeat is coming')
-optP.add_option('--killOwnProdJobs',action='store_const',const=True,dest='killOwnProdJobs',
- default=False,help='kill own production jobs without a production role')
-optP.add_option('--killUserJobs',action='store_const',const=True,dest='killUserJobs',
- default=False,help='kill user jobs using a production role')
-options,args = optP.parse_args()
-
-
-aSrvID = None
-
-codeV = None
-useMailAsIDV = False
-
-if options.forceKill:
- codeV = 9
-elif options.killUserJobs:
- codeV = 91
-if options.killOwnProdJobs:
- useMailAsIDV = True
-
-if len(args) == 1:
- Client.killJobs([args[0]],code=codeV,useMailAsID=useMailAsIDV)
-else:
- startID = int(args[0])
- endID = int(args[1])
- if startID > endID:
- print '%d is less than %d' % (endID,startID)
- sys.exit(1)
- Client.killJobs(range(startID,endID+1),code=codeV,useMailAsID=useMailAsIDV)
-
diff --git a/current/pandaserver/test/killJobLowPrio.py b/current/pandaserver/test/killJobLowPrio.py
deleted file mode 100755
index 347da336a..000000000
--- a/current/pandaserver/test/killJobLowPrio.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import time
-import sys
-import optparse
-
-import userinterface.Client as Client
-
-aSrvID = None
-
-from taskbuffer.OraDBProxy import DBProxy
-# password
-from config import panda_config
-
-usageStr = """%prog [options]
-
-Description: kill jobs with priorities below a given value"""
-optP = optparse.OptionParser(conflict_handler="resolve",usage=usageStr)
-optP.add_option('-9',action='store_const',const=True,dest='forceKill',
- default=False,help='kill jobs before next heartbeat is coming')
-optP.add_option('--running',action='store_const',const=True,dest='killRunning',
- default=False,help='kill running jobs to free up CPU slots. jobs will be killed regardless of job status if omitted')
-optP.add_option('--site',action='store',dest='site',default=None,help='computingSite')
-optP.add_option('--cloud',action='store',dest='cloud',default=None,help='cloud')
-optP.add_option('--maxJobs',action='store',dest='maxJobs',default=None,help='max number of jobs to be killed')
-options,args = optP.parse_args()
-
-if options.cloud == None and options.site == None:
- optP.error("--site= and/or --cloud= is required")
-
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-jobsMap = {}
-
-if len(args) == 0:
- optP.error('priority is required')
-
-varMap = {}
-varMap[':prodSourceLabel'] = 'managed'
-varMap[':currentPriority'] = args[0]
-sql = "SELECT PandaID,currentPriority FROM %s WHERE prodSourceLabel=:prodSourceLabel AND currentPriority<:currentPriority "
-if options.killRunning:
- sql += "AND jobStatus=:jobStatus "
- varMap[':jobStatus'] = 'running'
-if options.cloud != None:
- sql += "AND cloud=:cloud "
- varMap[':cloud'] = options.cloud
-if options.site != None:
- sql += "AND computingSite=:site "
- varMap[':site'] = options.site
-for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']:
- status,res = proxyS.querySQLS(sql % table,varMap)
- if res != None:
- for id,prio in res:
- if not jobsMap.has_key(prio):
- jobsMap[prio] = []
- if not id in jobsMap[prio]:
- jobsMap[prio].append(id)
-
-# order by PandaID and currentPriority
-jobs = []
-prioList = jobsMap.keys()
-prioList.sort()
-for prio in prioList:
- # reverse order by PandaID to kill newer jobs
- ids = jobsMap[prio]
- ids.sort()
- ids.reverse()
- jobs += ids
-
-if options.maxJobs != None:
- jobs = jobs[:int(options.maxJobs)]
-
-print 'The number of jobs with priorities below %s : %s' % (args[0],len(jobs))
-if len(jobs):
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- print 'kill %s' % str(jobs[iJob:iJob+nJob])
- if options.forceKill:
- Client.killJobs(jobs[iJob:iJob+nJob],9)
- else:
- Client.killJobs(jobs[iJob:iJob+nJob])
- iJob += nJob
- time.sleep(1)
-
-
diff --git a/current/pandaserver/test/killJobsInTask.py b/current/pandaserver/test/killJobsInTask.py
deleted file mode 100755
index 26c9ddb16..000000000
--- a/current/pandaserver/test/killJobsInTask.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import time
-import sys
-import optparse
-
-import userinterface.Client as Client
-
-aSrvID = None
-
-from taskbuffer.OraDBProxy import DBProxy
-# password
-from config import panda_config
-
-optP = optparse.OptionParser(conflict_handler="resolve")
-optP.add_option('-9',action='store_const',const=True,dest='forceKill',
-                default=False,help='kill jobs before next heartbeat is coming')
-optP.add_option('--killOwnProdJobs',action='store_const',const=True,dest='killOwnProdJobs',
-                default=False,help='kill own production jobs without a production role')
-options,args = optP.parse_args()
-
-useMailAsIDV = False
-if options.killOwnProdJobs:
- useMailAsIDV = True
-
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-jobs = []
-
-varMap = {}
-varMap[':prodSourceLabel'] = 'managed'
-varMap[':taskID'] = args[0]
-varMap[':pandaIDl'] = args[1]
-varMap[':pandaIDu'] = args[2]
-sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND PandaID BETWEEN :pandaIDl AND :pandaIDu ORDER BY PandaID"
-for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']:
- status,res = proxyS.querySQLS(sql % table,varMap)
- if res != None:
- for id, in res:
- if not id in jobs:
- jobs.append(id)
-
-print 'The number of jobs to be killed : %s' % len(jobs)
-if len(jobs):
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- print 'kill %s' % str(jobs[iJob:iJob+nJob])
- if options.forceKill:
- Client.killJobs(jobs[iJob:iJob+nJob],9,useMailAsID=useMailAsIDV)
- else:
- Client.killJobs(jobs[iJob:iJob+nJob],useMailAsID=useMailAsIDV)
- iJob += nJob
- time.sleep(1)
-
-
diff --git a/current/pandaserver/test/killProdJobs.py b/current/pandaserver/test/killProdJobs.py
deleted file mode 100755
index 85e8113ea..000000000
--- a/current/pandaserver/test/killProdJobs.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import sys
-
-import userinterface.Client as Client
-
-if len(sys.argv) == 2:
- jobDefIDs = [sys.argv[1]]
-else:
- startID = int(sys.argv[1])
- endID = int(sys.argv[2])
- if startID > endID:
- print '%d is less than %d' % (endID,startID)
- sys.exit(1)
- jobDefIDs = range(startID,endID+1)
-
-# query PandaIDs
-status, ids = Client.queryPandaIDs(jobDefIDs)
-
-if status != 0:
- sys.exit(0)
-
-# remove None
-while True:
- if not None in ids:
- break
- ids.remove(None)
-
-# kill
-if len(ids) != 0:
- Client.killJobs(ids)
-
diff --git a/current/pandaserver/test/killTask.py b/current/pandaserver/test/killTask.py
deleted file mode 100755
index 0784a18b9..000000000
--- a/current/pandaserver/test/killTask.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import time
-import sys
-import optparse
-
-import userinterface.Client as Client
-
-aSrvID = None
-
-from taskbuffer.OraDBProxy import DBProxy
-# password
-from config import panda_config
-
-optP = optparse.OptionParser(conflict_handler="resolve")
-optP.add_option('-9',action='store_const',const=True,dest='forceKill',
- default=False,help='kill jobs even if they are still running')
-optP.add_option('--noRunning',action='store_const',const=True,dest='noRunning',
- default=False,help='kill only activated/assigned/waiting jobs')
-options,args = optP.parse_args()
-
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-jobs = []
-
-varMap = {}
-varMap[':prodSourceLabel'] = 'managed'
-varMap[':taskID'] = args[0]
-if not options.noRunning:
- sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID ORDER BY PandaID"
-else:
- sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND jobStatus<>:jobStatus ORDER BY PandaID"
- varMap[':jobStatus'] = 'running'
-for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']:
- status,res = proxyS.querySQLS(sql % table,varMap)
- if res != None:
- for id, in res:
- if not id in jobs:
- jobs.append(id)
-
-print 'The number of jobs to be killed : %s' % len(jobs)
-if len(jobs):
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- print 'kill %s' % str(jobs[iJob:iJob+nJob])
- if options.forceKill:
- Client.killJobs(jobs[iJob:iJob+nJob],9)
- else:
- Client.killJobs(jobs[iJob:iJob+nJob])
- iJob += nJob
- time.sleep(1)
-
-
diff --git a/current/pandaserver/test/killUser.py b/current/pandaserver/test/killUser.py
deleted file mode 100644
index 4e3bbaa19..000000000
--- a/current/pandaserver/test/killUser.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import sys
-import time
-import datetime
-import optparse
-
-from taskbuffer.OraDBProxy import DBProxy
-# password
-from config import panda_config
-
-optP = optparse.OptionParser(conflict_handler="resolve")
-optP.add_option('--user', action='store',dest='user', default=None,help='prodUserName')
-optP.add_option('--jobID',action='store',dest='jobID',default=None,help='jobDefinitionID')
-optP.add_option('--jobsetID',action='store',dest='jobsetID',default=None,help="jobsetID, or 'all' to kill all jobs")
-optP.add_option('--prodSourceLabel',action='store',dest='prodSourceLabel',default=None,help='additional prodSourceLabel')
-
-
-options,args = optP.parse_args()
-
-if options.user == None:
- print "--user= is required"
- sys.exit(1)
-if options.jobID == None and options.jobsetID == None:
- print "--jobID= or --jobsetID= is required"
- sys.exit(1)
-
-
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-prodUserName = sys.argv[1]
-import userinterface.Client as Client
-
-varMap = {}
-varMap[':src1'] = 'user'
-varMap[':src2'] = 'panda'
-varMap[':prodUserName'] = options.user
-srcSQL = '(:src1,:src2'
-if options.jobID != None:
- varMap[':jobDefinitionID'] = options.jobID
-if not options.jobsetID in (None,'all'):
- varMap[':jobsetID'] = options.jobsetID
-if options.prodSourceLabel != None:
- varMap[':src3'] = options.prodSourceLabel
- srcSQL += ',:src3'
-srcSQL += ')'
-
-jobs = []
-tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']
-for table in tables:
- sql = "SELECT PandaID FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel IN %s " % (table,srcSQL)
- if options.jobID != None:
- sql += "AND jobDefinitionID=:jobDefinitionID "
- if not options.jobsetID in (None,'all'):
- sql += "AND jobsetID=:jobsetID "
- sql += "ORDER BY PandaID "
- status,res = proxyS.querySQLS(sql,varMap)
- if res != None:
- for id, in res:
- if not id in jobs:
- jobs.append(id)
-if len(jobs):
- iJob = 0
- nJob = 1000
- while iJob < len(jobs):
- subJobs = jobs[iJob:iJob+nJob]
- print "kill %s %s/%s" % (str(subJobs),iJob,len(jobs))
- Client.killJobs(subJobs,code=9)
- iJob += nJob
-else:
- print "no job was killed"
-
diff --git a/current/pandaserver/test/killWaiting.py b/current/pandaserver/test/killWaiting.py
deleted file mode 100755
index fe76014a8..000000000
--- a/current/pandaserver/test/killWaiting.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import sys
-import time
-import datetime
-from taskbuffer.DBProxy import DBProxy
-import userinterface.Client as Client
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-cloud = sys.argv[1]
-
-# instantiate DB proxies
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-while True:
- # get PandaIDs
- res = proxyS.querySQL("SELECT PandaID FROM jobsWaiting4 WHERE cloud='%s' ORDER BY PandaID" % cloud)
- # escape
- if len(res) == 0:
- break
- # convert to list
- jobs = []
- for id, in res:
- jobs.append(id)
-    # kill
- nJob = 300
- iJob = 0
- while iJob < len(jobs):
- print 'killJobs(%s)' % jobs[iJob:iJob+nJob]
- Client.killJobs(jobs[iJob:iJob+nJob])
- iJob += nJob
- time.sleep(60)
-
diff --git a/current/pandaserver/test/logrotate.sh b/current/pandaserver/test/logrotate.sh
deleted file mode 100755
index 51db686c0..000000000
--- a/current/pandaserver/test/logrotate.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-/usr/sbin/logrotate /usatlas/u/sm/prod/panda/config/logrotate.conf -s /usatlas/u/sm/logrotate.status
diff --git a/current/pandaserver/test/missing.py b/current/pandaserver/test/missing.py
deleted file mode 100755
index b77eaeecf..000000000
--- a/current/pandaserver/test/missing.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import re
-import commands
-
-stMap = []
-tmpMap = {}
-nLog = 30
-for i in range(0,nLog):
- if i == 0:
- out = commands.getoutput('cat /data/sm/prod/httpd/logs/panda-Adder.log')
- else:
- out = commands.getoutput('zcat /data/sm/prod/httpd/logs/panda-Adder.log.%s.gz' % (nLog-i))
- for line in out.split('\n'):
- stStr = re.search('start: finished',line)
- idsStr = re.search('ids = .*$',line)
- mapStr = re.search('idMap = .*$',line)
- if stStr == None and idsStr == None and mapStr == None:
- continue
- items = line.split()
- try:
- pandaID = int(items[4])
- except:
- continue
- if stStr != None:
- stMap.append(pandaID)
- if idsStr != None:
- exec idsStr.group(0)
- tmpMap[pandaID] = ids
- if mapStr != None:
- exec mapStr.group(0)
- if (pandaID in stMap) and idMap == {} and tmpMap[pandaID] != ([], []):
- print pandaID
- print tmpMap[pandaID]
- try:
- del tmpMap[pandaID]
- except:
- pass
- try:
- stMap.remove(pandaID)
- except:
- pass
-if tmpMap != {}:
- print tmpMap
-
diff --git a/current/pandaserver/test/pandadb.sql b/current/pandaserver/test/pandadb.sql
deleted file mode 100644
index 5bc00b59d..000000000
--- a/current/pandaserver/test/pandadb.sql
+++ /dev/null
@@ -1,430 +0,0 @@
-DROP TABLE jobsDefined4;
-DROP TABLE jobsActive4;
-DROP TABLE jobsArchived4;
-DROP TABLE jobsWaiting4;
-DROP TABLE filesTable4;
-DROP TABLE Datasets;
-DROP TABLE metaTable;
-DROP TABLE subCounter;
-
-
-CREATE TABLE jobsDefined4
-(
- PandaID NUMBER(11) default 0 primary key,
- jobDefinitionID NUMBER(11) default 0,
- schedulerID VARCHAR(128),
- pilotID VARCHAR(128),
- creationTime DATE,
- creationHost VARCHAR(128),
- modificationTime DATE,
- modificationHost VARCHAR(128),
- AtlasRelease VARCHAR(64),
- transformation VARCHAR(250),
- homepackage VARCHAR(64),
- prodSeriesLabel VARCHAR(20) default 'pandatest',
- prodSourceLabel VARCHAR(20) default 'managed',
- prodUserID VARCHAR(250),
- assignedPriority NUMBER(9) default 0,
- currentPriority NUMBER(9) default 0,
- attemptNr NUMBER(2) default 0,
- maxAttempt NUMBER(2) default 0,
- jobStatus VARCHAR(15) default 'defined',
- jobName VARCHAR(128),
- maxCpuCount NUMBER(9) default 0,
- maxCpuUnit VARCHAR(32),
- maxDiskCount NUMBER(9) default 0,
- maxDiskUnit CHAR(2),
- ipConnectivity CHAR(3),
- minRamCount NUMBER(9) default 0,
- minRamUnit CHAR(2),
- startTime DATE,
- endTime DATE,
- cpuConsumptionTime NUMBER(20) default 0,
- cpuConsumptionUnit VARCHAR(128),
- commandToPilot VARCHAR(250),
- transExitCode VARCHAR(128),
- pilotErrorCode NUMBER(6) default 0,
- pilotErrorDiag VARCHAR(250),
- exeErrorCode NUMBER(6) default 0,
- exeErrorDiag VARCHAR(250),
- supErrorCode NUMBER(6) default 0,
- supErrorDiag VARCHAR(250) default NULL,
- ddmErrorCode NUMBER(6) default 0,
- ddmErrorDiag VARCHAR(250) default NULL,
- brokerageErrorCode NUMBER(6) default 0,
- brokerageErrorDiag VARCHAR(250) default NULL,
- jobDispatcherErrorCode NUMBER(6) default 0,
- jobDispatcherErrorDiag VARCHAR(250) default NULL,
- taskBufferErrorCode NUMBER(6) default 0,
- taskBufferErrorDiag VARCHAR(250) default NULL,
- computingSite VARCHAR(128),
- computingElement VARCHAR(128),
- jobParameters VARCHAR(4000) default NULL,
- metadata VARCHAR(32) default NULL,
- prodDBlock VARCHAR(250),
- dispatchDBlock VARCHAR(250),
- destinationDBlock VARCHAR(250),
- destinationSE VARCHAR(250),
- nEvents NUMBER(9) default 0,
- grid VARCHAR(32),
- cloud VARCHAR(32),
- cpuConversion NUMBER(9,4) default NULL,
- sourceSite VARCHAR(36),
- destinationSite VARCHAR(36),
- transferType VARCHAR(10),
- taskID NUMBER(9) default NULL,
- cmtConfig VARCHAR(250),
- stateChangeTime DATE,
- prodDBUpdateTime DATE,
- lockedby VARCHAR(128),
- relocationFlag NUMBER(1) default 0,
- jobExecutionID NUMBER(11) default 0,
- VO VARCHAR(16),
- pilotTiming VARCHAR(100),
- workingGroup VARCHAR(20)
-);
-
-
-CREATE TABLE jobsActive4
-(
- PandaID NUMBER(11) default 0 primary key,
- jobDefinitionID NUMBER(11) default 0,
- schedulerID VARCHAR(128),
- pilotID VARCHAR(128),
- creationTime DATE,
- creationHost VARCHAR(128),
- modificationTime DATE,
- modificationHost VARCHAR(128),
- AtlasRelease VARCHAR(64),
- transformation VARCHAR(250),
- homepackage VARCHAR(64),
- prodSeriesLabel VARCHAR(20) default 'pandatest',
- prodSourceLabel VARCHAR(20) default 'managed',
- prodUserID VARCHAR(250),
- assignedPriority NUMBER(9) default 0,
- currentPriority NUMBER(9) default 0,
- attemptNr NUMBER(2) default 0,
- maxAttempt NUMBER(2) default 0,
- jobStatus VARCHAR(15) default 'activated',
- jobName VARCHAR(128),
- maxCpuCount NUMBER(9) default 0,
- maxCpuUnit VARCHAR(32),
- maxDiskCount NUMBER(9) default 0,
- maxDiskUnit CHAR(2),
- ipConnectivity CHAR(3),
- minRamCount NUMBER(9) default 0,
- minRamUnit CHAR(2),
- startTime DATE,
- endTime DATE,
- cpuConsumptionTime NUMBER(20) default 0,
- cpuConsumptionUnit VARCHAR(128),
- commandToPilot VARCHAR(250),
- transExitCode VARCHAR(128),
- pilotErrorCode NUMBER(6) default 0,
- pilotErrorDiag VARCHAR(250),
- exeErrorCode NUMBER(6) default 0,
- exeErrorDiag VARCHAR(250),
- supErrorCode NUMBER(6) default 0,
- supErrorDiag VARCHAR(250) default NULL,
- ddmErrorCode NUMBER(6) default 0,
- ddmErrorDiag VARCHAR(250) default NULL,
- brokerageErrorCode NUMBER(6) default 0,
- brokerageErrorDiag VARCHAR(250) default NULL,
- jobDispatcherErrorCode NUMBER(6) default 0,
- jobDispatcherErrorDiag VARCHAR(250) default NULL,
- taskBufferErrorCode NUMBER(6) default 0,
- taskBufferErrorDiag VARCHAR(250) default NULL,
- computingSite VARCHAR(128),
- computingElement VARCHAR(128),
- jobParameters VARCHAR(4000) default NULL,
- metadata VARCHAR(32) default NULL,
- prodDBlock VARCHAR(250),
- dispatchDBlock VARCHAR(250),
- destinationDBlock VARCHAR(250),
- destinationSE VARCHAR(250),
- nEvents NUMBER(9) default 0,
- grid VARCHAR(32),
- cloud VARCHAR(32),
- cpuConversion NUMBER(9,4) default NULL,
- sourceSite VARCHAR(36),
- destinationSite VARCHAR(36),
- transferType VARCHAR(10),
- taskID NUMBER(9) default NULL,
- cmtConfig VARCHAR(250),
- stateChangeTime DATE,
- prodDBUpdateTime DATE,
- lockedby VARCHAR(128),
- relocationFlag NUMBER(1) default 0,
- jobExecutionID NUMBER(11) default 0,
- VO VARCHAR(16),
- pilotTiming VARCHAR(100),
- workingGroup VARCHAR(20)
-);
-
-CREATE TABLE jobsWaiting4
-(
- PandaID NUMBER(11) default 0 primary key,
- jobDefinitionID NUMBER(11) default 0,
- schedulerID VARCHAR(128),
- pilotID VARCHAR(128),
- creationTime DATE,
- creationHost VARCHAR(128),
- modificationTime DATE,
- modificationHost VARCHAR(128),
- AtlasRelease VARCHAR(64),
- transformation VARCHAR(250),
- homepackage VARCHAR(64),
- prodSeriesLabel VARCHAR(20) default 'pandatest',
- prodSourceLabel VARCHAR(20) default 'managed',
- prodUserID VARCHAR(250),
- assignedPriority NUMBER(9) default 0,
- currentPriority NUMBER(9) default 0,
- attemptNr NUMBER(2) default 0,
- maxAttempt NUMBER(2) default 0,
- jobStatus VARCHAR(15) default 'activated',
- jobName VARCHAR(128),
- maxCpuCount NUMBER(9) default 0,
- maxCpuUnit VARCHAR(32),
- maxDiskCount NUMBER(9) default 0,
- maxDiskUnit CHAR(2),
- ipConnectivity CHAR(3),
- minRamCount NUMBER(9) default 0,
- minRamUnit CHAR(2),
- startTime DATE,
- endTime DATE,
- cpuConsumptionTime NUMBER(20) default 0,
- cpuConsumptionUnit VARCHAR(128),
- commandToPilot VARCHAR(250),
- transExitCode VARCHAR(128),
- pilotErrorCode NUMBER(6) default 0,
- pilotErrorDiag VARCHAR(250),
- exeErrorCode NUMBER(6) default 0,
- exeErrorDiag VARCHAR(250),
- supErrorCode NUMBER(6) default 0,
- supErrorDiag VARCHAR(250) default NULL,
- ddmErrorCode NUMBER(6) default 0,
- ddmErrorDiag VARCHAR(250) default NULL,
- brokerageErrorCode NUMBER(6) default 0,
- brokerageErrorDiag VARCHAR(250) default NULL,
- jobDispatcherErrorCode NUMBER(6) default 0,
- jobDispatcherErrorDiag VARCHAR(250) default NULL,
- taskBufferErrorCode NUMBER(6) default 0,
- taskBufferErrorDiag VARCHAR(250) default NULL,
- computingSite VARCHAR(128),
- computingElement VARCHAR(128),
- jobParameters VARCHAR(4000) default NULL,
- metadata VARCHAR(32) default NULL,
- prodDBlock VARCHAR(250),
- dispatchDBlock VARCHAR(250),
- destinationDBlock VARCHAR(250),
- destinationSE VARCHAR(250),
- nEvents NUMBER(9) default 0,
- grid VARCHAR(32),
- cloud VARCHAR(32),
- cpuConversion NUMBER(9,4) default NULL,
- sourceSite VARCHAR(36),
- destinationSite VARCHAR(36),
- transferType VARCHAR(10),
- taskID NUMBER(9) default NULL,
- cmtConfig VARCHAR(250),
- stateChangeTime DATE,
- prodDBUpdateTime DATE,
- lockedby VARCHAR(128),
- relocationFlag NUMBER(1) default 0,
- jobExecutionID NUMBER(11) default 0,
- VO VARCHAR(16),
- pilotTiming VARCHAR(100),
- workingGroup VARCHAR(20)
-);
-
-CREATE TABLE jobsArchived4
-(
- PandaID NUMBER(11) default 0 primary key,
- jobDefinitionID NUMBER(11) default 0,
- schedulerID VARCHAR(128),
- pilotID VARCHAR(128),
- creationTime DATE,
- creationHost VARCHAR(128),
- modificationTime DATE,
- modificationHost VARCHAR(128),
- AtlasRelease VARCHAR(64),
- transformation VARCHAR(250),
- homepackage VARCHAR(64),
- prodSeriesLabel VARCHAR(20) default 'pandatest',
- prodSourceLabel VARCHAR(20) default 'managed',
- prodUserID VARCHAR(250),
- assignedPriority NUMBER(9) default 0,
- currentPriority NUMBER(9) default 0,
- attemptNr NUMBER(2) default 0,
- maxAttempt NUMBER(2) default 0,
- jobStatus VARCHAR(15) default 'activated',
- jobName VARCHAR(128),
- maxCpuCount NUMBER(9) default 0,
- maxCpuUnit VARCHAR(32),
- maxDiskCount NUMBER(9) default 0,
- maxDiskUnit CHAR(2),
- ipConnectivity CHAR(3),
- minRamCount NUMBER(9) default 0,
- minRamUnit CHAR(2),
- startTime DATE,
- endTime DATE,
- cpuConsumptionTime NUMBER(20) default 0,
- cpuConsumptionUnit VARCHAR(128),
- commandToPilot VARCHAR(250),
- transExitCode VARCHAR(128),
- pilotErrorCode NUMBER(6) default 0,
- pilotErrorDiag VARCHAR(250),
- exeErrorCode NUMBER(6) default 0,
- exeErrorDiag VARCHAR(250),
- supErrorCode NUMBER(6) default 0,
- supErrorDiag VARCHAR(250) default NULL,
- ddmErrorCode NUMBER(6) default 0,
- ddmErrorDiag VARCHAR(250) default NULL,
- brokerageErrorCode NUMBER(6) default 0,
- brokerageErrorDiag VARCHAR(250) default NULL,
- jobDispatcherErrorCode NUMBER(6) default 0,
- jobDispatcherErrorDiag VARCHAR(250) default NULL,
- taskBufferErrorCode NUMBER(6) default 0,
- taskBufferErrorDiag VARCHAR(250) default NULL,
- computingSite VARCHAR(128),
- computingElement VARCHAR(128),
- jobParameters VARCHAR(4000) default NULL,
- metadata VARCHAR(32) default NULL,
- prodDBlock VARCHAR(250),
- dispatchDBlock VARCHAR(250),
- destinationDBlock VARCHAR(250),
- destinationSE VARCHAR(250),
- nEvents NUMBER(9) default 0,
- grid VARCHAR(32),
- cloud VARCHAR(32),
- cpuConversion NUMBER(9,4) default NULL,
- sourceSite VARCHAR(36),
- destinationSite VARCHAR(36),
- transferType VARCHAR(10),
- taskID NUMBER(9) default NULL,
- cmtConfig VARCHAR(250),
- stateChangeTime DATE,
- prodDBUpdateTime DATE,
- lockedby VARCHAR(128),
- relocationFlag NUMBER(1) default 0,
- jobExecutionID NUMBER(11) default 0,
- VO VARCHAR(16),
- pilotTiming VARCHAR(100),
- workingGroup VARCHAR(20)
-);
-
-
-CREATE TABLE filesTable4
-(
- row_ID NUMBER(11) default 0 primary key,
- PandaID NUMBER(11) default 0,
- GUID VARCHAR(64),
- lfn VARCHAR(256),
- type VARCHAR(20),
- dataset VARCHAR(128),
- status VARCHAR(64),
- prodDBlock VARCHAR(250),
- prodDBlockToken VARCHAR(250),
- dispatchDBlock VARCHAR(250),
- dispatchDBlockToken VARCHAR(250),
- destinationDBlock VARCHAR(250),
- destinationDBlockToken VARCHAR(250),
- destinationSE VARCHAR(250),
- fsize NUMBER(10) default 0,
- md5sum CHAR(36),
- checksum CHAR(36)
-);
-
-
-CREATE TABLE Datasets
-(
- vuid VARCHAR(40) default '' primary key,
- name VARCHAR(250),
- version VARCHAR(10) default NULL,
- type VARCHAR(20) default NULL,
- status VARCHAR(10) default NULL,
- numberfiles NUMBER(9) default NULL,
- currentfiles NUMBER(9) default NULL,
- creationdate DATE,
- modificationdate DATE,
- MoverID NUMBER(11) default 0,
- transferStatus NUMBER(2) default 0
-);
-
-
-CREATE TABLE metaTable
-(
- PandaID NUMBER(11) default 0 primary key,
- metaData VARCHAR(4000) default NULL
-);
-
-
-CREATE TABLE subCounter
-(
- subID NUMBER(11) default 0
-);
-
-
-
-CREATE INDEX jobsA4_currentPriority_IDX ON jobsActive4 (currentPriority);
-CREATE INDEX jobsA4_jobStatus_IDX ON jobsActive4 (jobStatus);
-CREATE INDEX jobsA4_computingSite_IDX ON jobsActive4 (computingSite);
-
-CREATE INDEX file4_PandaID_IDX ON filesTable4 (PandaID);
-CREATE INDEX file4_status_IDX ON filesTable4 (status);
-CREATE INDEX file4_dispDBlock_IDX ON filesTable4 (dispatchDBlock);
-CREATE INDEX file4_destDBlock_IDX ON filesTable4 (destinationDBlock);
-
-CREATE INDEX Datasets_name_IDX ON Datasets (name);
-
-DROP SEQUENCE PandaID_SEQ;
-DROP SEQUENCE rowID_SEQ;
-DROP SEQUENCE subID_SEQ;
-
-
-CREATE SEQUENCE PandaID_SEQ;
-CREATE SEQUENCE rowID_SEQ;
-CREATE SEQUENCE subID_SEQ;
-
-
-CREATE OR REPLACE TRIGGER PandaID_TRIGGER
-BEFORE INSERT ON jobsDefined4
-FOR EACH ROW
-BEGIN
- IF (:NEW.PandaID IS NULL) THEN
- SELECT PandaID_SEQ.NEXTVAL INTO :NEW.PandaID FROM DUAL ;
- END IF;
-END;
-/
-
-
-CREATE OR REPLACE TRIGGER rowID_TRIGGER
-BEFORE INSERT ON filesTable4
-FOR EACH ROW
-BEGIN
- SELECT rowID_SEQ.NEXTVAL INTO :NEW.row_ID FROM DUAL ;
-END;
-/
-
-
-CREATE OR REPLACE TRIGGER subID_TRIGGER
-BEFORE INSERT ON subCounter
-FOR EACH ROW
-BEGIN
- SELECT subID_SEQ.NEXTVAL INTO :NEW.subID FROM DUAL ;
-END;
-/
-
-
-CREATE OR REPLACE FUNCTION BITOR( P_BITS1 IN NATURAL, P_BITS2 IN NATURAL )
-RETURN NATURAL
-IS
-BEGIN
- RETURN UTL_RAW.CAST_TO_BINARY_INTEGER(
- UTL_RAW.BIT_OR(
- UTL_RAW.CAST_FROM_BINARY_INTEGER(P_BITS1),
- UTL_RAW.CAST_FROM_BINARY_INTEGER(P_BITS2)
- )
- );
-END;
-/
diff --git a/current/pandaserver/test/pandameta.sql b/current/pandaserver/test/pandameta.sql
deleted file mode 100644
index ed234a5d2..000000000
--- a/current/pandaserver/test/pandameta.sql
+++ /dev/null
@@ -1,97 +0,0 @@
-DROP TABLE cloudconfig;
-DROP TABLE schedconfig;
-
-
-CREATE TABLE cloudconfig
-(
- name VARCHAR(20) primary key,
- description VARCHAR(50),
- tier1 VARCHAR(20),
- tier1SE VARCHAR(400),
- relocation VARCHAR(10),
- weight NUMBER(11) default 0,
- server VARCHAR(100),
- status VARCHAR(20),
- transtimelo NUMBER(11) default 0,
- transtimehi NUMBER(11) default 0,
- waittime NUMBER(11) default 0,
- cloudcomment VARCHAR(200),
- space NUMBER(11) default 0,
- moduser VARCHAR(30),
- modtime DATE default CURRENT_DATE,
- validation VARCHAR(20),
- mcshare NUMBER(11) default 0,
- countries VARCHAR(80)
-);
-
-
-CREATE TABLE schedconfig
-(
- name VARCHAR(60) default 'default',
- nickname VARCHAR(60) primary key,
- queue VARCHAR(60),
- localqueue VARCHAR(20),
- system VARCHAR(60),
- sysconfig VARCHAR(20),
- environ VARCHAR(250),
- gatekeeper VARCHAR(40),
- jobmanager VARCHAR(80),
- se VARCHAR(250),
- ddm VARCHAR(80),
- jdladd CLOB default NULL,
- globusadd VARCHAR(100),
- jdl VARCHAR(60),
- jdltxt CLOB default NULL,
- version VARCHAR(60),
- site VARCHAR(60),
- region VARCHAR(60),
- gstat VARCHAR(60),
- tags VARCHAR(200),
- cmd VARCHAR(200),
- lastmod TIMESTAMP default CURRENT_TIMESTAMP,
- errinfo VARCHAR(80),
- nqueue NUMBER(11) default 0,
- queuecomment CLOB default NULL,
- appdir VARCHAR(80),
- datadir VARCHAR(80),
- tmpdir VARCHAR(80),
- wntmpdir VARCHAR(80),
- dq2url VARCHAR(80),
- special_par VARCHAR(80),
- python_path VARCHAR(80),
- nodes NUMBER(11) default 0,
- status VARCHAR(10),
- copytool VARCHAR(80),
- copysetup VARCHAR(200),
- releases VARCHAR(500),
- sepath VARCHAR(80),
- envsetup VARCHAR(200),
- copyprefix VARCHAR(160),
- lfcpath VARCHAR(80),
- seopt VARCHAR(60),
- sein VARCHAR(60),
- seinopt VARCHAR(60),
- lfchost VARCHAR(80),
- cloud VARCHAR(60),
- siteid VARCHAR(60),
- proxy VARCHAR(80),
- retry VARCHAR(10),
- queuehours NUMBER(9) default 0,
- envsetupin VARCHAR(200),
- copytoolin VARCHAR(180),
- copysetupin VARCHAR(200),
- seprodpath VARCHAR(200),
- lfcprodpath VARCHAR(80),
- copyprefixin VARCHAR(80),
- recoverdir VARCHAR(80),
- memory NUMBER(11) default 0,
- maxtime NUMBER(11) default 0,
- space NUMBER(11) default 0,
- tspace TIMESTAMP default TO_DATE('0001-01-01 00:00:00','YYYY-MM-DD HH24:MI:SS'),
- cmtconfig VARCHAR(250),
- setokens VARCHAR(80),
- glexec VARCHAR(10),
- priorityoffset VARCHAR(60),
- allowedgroups VARCHAR(100),
- defaulttoken VARCHAR(100)
-);
diff --git a/current/pandaserver/test/pcron.sh b/current/pandaserver/test/pcron.sh
deleted file mode 100755
index 4cb8f3653..000000000
--- a/current/pandaserver/test/pcron.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-"exec" "python" "$0" "$@"
-
-import os
-import sys
-import time
-import commands
-
-_python = "/direct/usatlas+u/gfg/python-latest/python-2.4.1/python-2.4.1/bin/python"
-
-class Worker:
- # constructor
- def __init__(self):
- pass
-
- # main
- def run(self):
- os.chdir('/direct/usatlas+u/sm/panda/pilot2')
- com = "python pilot.py -a /usatlas/projects/OSG -d /tmp -l /usatlas/prodjob/share/ -q http://dms02.usatlas.bnl.gov:8000/dq2/ -s BNL_ATLAS_DDM"
- os.spawnv(os.P_NOWAIT,_python,com.split())
-
-# count # of processes
-out = commands.getoutput('ps auxww | grep pilot.py | grep -v auxww | grep -v "sh -c" | grep -v grep' )
-if out == '':
- nPilot = 0
-else:
- nPilot = len(out.split('\n'))
-maxPilot = 10
-print nPilot
-if nPilot >= maxPilot:
- sys.exit(0)
-
-for i in range(maxPilot-nPilot):
-    thr = Worker()
- thr.run()
- time.sleep(5)
diff --git a/current/pandaserver/test/pdq2_cr b/current/pandaserver/test/pdq2_cr
deleted file mode 100755
index 538a6c5a6..000000000
--- a/current/pandaserver/test/pdq2_cr
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/bin/bash
-
-"exec" "python" "$0" "$@"
-
-
-def _usage():
- print \
-"""
-NAME
- pdq2_cr - copy and register DQ2 dataset via PANDA
-
-SYNOPSIS
-
- pdq2_cr [ -h | --help]
- [ -p | --parallel n ]
- [ -t | --timeout n ]
- [ -d | --destination destination ]
- [ -r | --remote remoteSite ]
- [ -s | --source sourceSite ]
- datasetname
- [lfn1 [lfn2 [...]]]
-DESCRIPTION
-
-    pdq2_cr copies and registers a DQ2 dataset. It scans the LRC to find missing or corrupted
- files in a dataset, copies the files to the local SE using 3rd-party transfers, and
- registers the files to the LRC.
-
-OPTIONS
-
- -h | --help Print this message
-
- -p | --parallel Number of copy threads (default:3)
-
-    -t | --timeout       Timeout limit in seconds for each file transfer (default:1800)
-
- -d | --destination Directory in the storage element where files will be put.
-
- -r | --remote Specify remote site to which files get copied
-
- -s | --source Specify source site from which files get copied
-
-"""
-
-# error codes
-EC_Configuration = 20
-EC_VUID = 30
-EC_QueryFiles = 40
-EC_Location = 50
-EC_Copy = 60
-EC_Main = 70
-EC_PFNfromLFC = 80
-EC_INVALIDSIZE = 90
-EC_RegisterLRC = 100
-EC_LS = 110
-
-####################################################################
-# main
-def main():
- import sys
- import getopt
-
- # option class
- class _options:
- def __init__(self):
- pass
- options = _options()
- del _options
- # set default values
- options.source = ''
- options.destination = ''
- options.remote = ''
- # get command-line parameters
- try:
- opts, args = getopt.getopt(sys.argv[1:],"hvn:cd:p:t:s:r:l:u",
- ["help","verbose","ntry=","choose",
- "destination=","parallel=","timeout=",
- "source=","remote=","location=","uber",
- "noSleep","uberHost=","gsiHost=","srmHost=",
- "guids=","lfns=","debug",
- ])
- except:
- _usage()
- print "ERROR : Invalid options"
- sys.exit(EC_Main)
- # set options
- for o, a in opts:
- if o in ("-h","--help"):
- _usage()
- sys.exit()
- if o in ("-s","--source"):
- options.source = a
- if o in ("-r","--remote"):
- options.remote = a
- if o in ("-d","--destination"):
- options.destination = a
- # datasetname
- if len(args) == 0:
- print "ERROR : no datasetname"
- sys.exit(EC_Main)
- # source
- if options.source == "":
- print "ERROR : no source. use -s"
- sys.exit(EC_Main)
- # destination
- if options.destination == "":
- print "ERROR : no destination. use -d"
- sys.exit(EC_Main)
- # remote
- if options.remote == "":
- print "ERROR : no remote. use -r"
- sys.exit(EC_Main)
-
- # submit
- import time
- import commands
- import userinterface.Client as Client
- from taskbuffer.JobSpec import JobSpec
- from taskbuffer.FileSpec import FileSpec
-
- site = "BNL_ATLAS_DDM"
-
- datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
- destName = 'BNL_SE'
-
- jobList = []
-
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s" % commands.getoutput('uuidgen')
- job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/run_dq2_cr'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 100000
- job.prodSourceLabel = 'test'
- job.computingSite = site
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- argStr = ""
- for arg in sys.argv[1:]:
- argStr += "%s " % arg
- job.jobParameters = argStr
-
- jobList.append(job)
-
- s,o = Client.submitJobs(jobList)
- print "---------------------"
- print s
- for x in o:
- print "PandaID=%s" % x[0]
-
-if __name__ == "__main__":
- main()
diff --git a/current/pandaserver/test/plot.py b/current/pandaserver/test/plot.py
deleted file mode 100755
index 9d37de977..000000000
--- a/current/pandaserver/test/plot.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import re
-import time
-import datetime
-import pylab
-file = open('panda-DBProxy.log')
-datesMap = {}
-valuesMap = {}
-for line in file:
- items = re.findall('countPilotRequests[^\']+\'([^\']+)\': (\d+)',line)
- if len(items) != 0:
- # statistics
- site = items[0][0]
- count = float(items[0][1])
- # date
- items = re.split(' |,',line)
- if len(items) >= 2:
- strDate = '%s %s' % tuple(items[:2])
- datetimeTime = datetime.datetime(*time.strptime(strDate,'%Y-%m-%d %H:%M:%S')[:6])
- # assign
- if not datesMap.has_key(site):
- datesMap[site] = []
- valuesMap[site] = []
- datesMap[site].append(pylab.date2num(datetimeTime))
- valuesMap[site].append(count)
-# close file
-file.close()
-# plot
-nRow = 1 #len(datesMap.keys())
-nCol = 1
-nFig = 1
-tFig = 1
-sites = datesMap.keys()
-sites.sort()
-for site in sites:
- if nFig == (nRow*nCol+1):
- pylab.savefig('pilot%d.png' % tFig)
- tFig += 1
- pylab.figure(tFig)
- nFig = 1
- pylab.subplot(int('%d%d%d' % (nRow,nCol,nFig)))
- pylab.title('Number of pilots @%s' % site)
- pylab.plot_date(datesMap[site],valuesMap[site])
- nFig += 1
-# save the last figure
-pylab.savefig('pilot%d.png' % tFig)
-# show
-#pylab.show()
-
-
-
-
diff --git a/current/pandaserver/test/prioryMassage.py b/current/pandaserver/test/prioryMassage.py
deleted file mode 100644
index 887bca19f..000000000
--- a/current/pandaserver/test/prioryMassage.py
+++ /dev/null
@@ -1,364 +0,0 @@
-import os
-import re
-import sys
-import datetime
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# logger
-_logger = PandaLogger().getLogger('prioryMassage')
-
-_logger.debug("================= start ==================")
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-# get usage breakdown
-usageBreakDownPerUser = {}
-usageBreakDownPerSite = {}
-workingGroupList = []
-for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']:
- varMap = {}
- varMap[':prodSourceLabel'] = 'user'
- if table == 'ATLAS_PANDA.jobsActive4':
- sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
- else:
- # with time range for archived table
- varMap[':modificationTime'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
- sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND modificationTime>:modificationTime GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
- # exec
- status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10000)
- if res == None:
- _logger.debug("total %s " % res)
- else:
- _logger.debug("total %s " % len(res))
- # make map
- for cnt,prodUserName,jobStatus,workingGroup,computingSite in res:
- # use workingGroup name as prodUserName
- if workingGroup != None:
- if not workingGroup in workingGroupList:
- workingGroupList.append(workingGroup)
- prodUserName = workingGroup
- workingGroup = None
- # append to PerUser map
- if not usageBreakDownPerUser.has_key(prodUserName):
- usageBreakDownPerUser[prodUserName] = {}
- if not usageBreakDownPerUser[prodUserName].has_key(workingGroup):
- usageBreakDownPerUser[prodUserName][workingGroup] = {}
- if not usageBreakDownPerUser[prodUserName][workingGroup].has_key(computingSite):
- usageBreakDownPerUser[prodUserName][workingGroup][computingSite] = {'rundone':0,'activated':0}
- # append to PerSite map
- if not usageBreakDownPerSite.has_key(computingSite):
- usageBreakDownPerSite[computingSite] = {}
- if not usageBreakDownPerSite[computingSite].has_key(prodUserName):
- usageBreakDownPerSite[computingSite][prodUserName] = {}
- if not usageBreakDownPerSite[computingSite][prodUserName].has_key(workingGroup):
- usageBreakDownPerSite[computingSite][prodUserName][workingGroup] = {'rundone':0,'activated':0}
- # count # of running/done and activated
- if jobStatus in ['activated']:
- usageBreakDownPerUser[prodUserName][workingGroup][computingSite]['activated'] += cnt
- usageBreakDownPerSite[computingSite][prodUserName][workingGroup]['activated'] += cnt
- elif jobStatus in ['cancelled','holding']:
- pass
- else:
- usageBreakDownPerUser[prodUserName][workingGroup][computingSite]['rundone'] += cnt
- usageBreakDownPerSite[computingSite][prodUserName][workingGroup]['rundone'] += cnt
-
-# get total number of users and running/done jobs
-totalUsers = 0
-totalRunDone = 0
-for prodUserName,wgValMap in usageBreakDownPerUser.iteritems():
- for workingGroup,siteValMap in wgValMap.iteritems():
- # ignore group production
- if workingGroup != None:
- continue
- totalUsers += 1
- for computingSite,statValMap in siteValMap.iteritems():
- totalRunDone += statValMap['rundone']
-
-_logger.debug("total users : %s" % totalUsers)
-_logger.debug("total RunDone : %s" % totalRunDone)
-_logger.debug("")
-
-if totalUsers == 0:
- sys.exit(0)
-
-# global average
-globalAverageRunDone = float(totalRunDone)/float(totalUsers)
-
-_logger.debug("global average : %s" % globalAverageRunDone)
-
-# count the number of users and run/done jobs for each site
-siteRunDone = {}
-siteUsers = {}
-for computingSite,userValMap in usageBreakDownPerSite.iteritems():
- for prodUserName,wgValMap in userValMap.iteritems():
- for workingGroup,statValMap in wgValMap.iteritems():
- # ignore group production
- if workingGroup != None:
- continue
- # count the number of users and running/done jobs
- if not siteUsers.has_key(computingSite):
- siteUsers[computingSite] = 0
- siteUsers[computingSite] += 1
- if not siteRunDone.has_key(computingSite):
- siteRunDone[computingSite] = 0
- siteRunDone[computingSite] += statValMap['rundone']
-
-# get site average
-_logger.debug("site average")
-siteAverageRunDone = {}
-for computingSite,nRunDone in siteRunDone.iteritems():
- siteAverageRunDone[computingSite] = float(nRunDone)/float(siteUsers[computingSite])
- _logger.debug(" %-25s : %s" % (computingSite,siteAverageRunDone[computingSite]))
-
-# check if the number of user's jobs is lower than the average
-for prodUserName,wgValMap in usageBreakDownPerUser.iteritems():
- _logger.debug("---> %s" % prodUserName)
- # no private jobs
- if not wgValMap.has_key(None):
- _logger.debug("no private jobs")
- continue
- # count the number of running/done jobs
- userTotalRunDone = 0
- for workingGroup,siteValMap in wgValMap.iteritems():
- if workingGroup != None:
- continue
- for computingSite,statValMap in siteValMap.iteritems():
- userTotalRunDone += statValMap['rundone']
- # no priority boost when the number of jobs is higher than the average
- if userTotalRunDone >= globalAverageRunDone:
- _logger.debug("enough running %s > %s (global average)" % (userTotalRunDone,globalAverageRunDone))
- continue
- _logger.debug("user total:%s global average:%s" % (userTotalRunDone,globalAverageRunDone))
- # check with site average
- toBeBoostedSites = []
- for computingSite,statValMap in wgValMap[None].iteritems():
- # the number of running/done jobs is lower than the average and activated jobs are waiting
- if statValMap['rundone'] >= siteAverageRunDone[computingSite]:
- _logger.debug("enough running %s > %s (site average) at %s" % \
- (statValMap['rundone'],siteAverageRunDone[computingSite],computingSite))
- elif statValMap['activated'] == 0:
- _logger.debug("no activated jobs at %s" % computingSite)
- else:
- toBeBoostedSites.append(computingSite)
- # no boost is required
- if toBeBoostedSites == []:
- _logger.debug("no sites to be boosted")
- continue
- # check special prioritized site
- siteAccessForUser = {}
- varMap = {}
- varMap[':dn'] = prodUserName
- sql = "SELECT pandaSite,pOffset,status,workingGroups FROM ATLAS_PANDAMETA.siteAccess WHERE dn=:dn"
- status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10000)
- if res != None:
- for pandaSite,pOffset,pStatus,workingGroups in res:
- # ignore special working group for now
- if not workingGroups in ['',None]:
- continue
- # only approved sites
- if pStatus != 'approved':
- continue
- # no priority boost
- if pOffset == 0:
- continue
- # append
- siteAccessForUser[pandaSite] = pOffset
- # set weight
- totalW = 0
- defaultW = 100
- for computingSite in toBeBoostedSites:
- totalW += defaultW
- if siteAccessForUser.has_key(computingSite):
- totalW += siteAccessForUser[computingSite]
- totalW = float(totalW)
- # the total number of jobs to be boosted
- numBoostedJobs = globalAverageRunDone - float(userTotalRunDone)
- # get quota
- quotaFactor = 1.0 + taskBuffer.checkQuota(prodUserName)
- _logger.debug("quota factor:%s" % quotaFactor)
- # make priority boost
- nJobsPerPrioUnit = 5
- highestPrio = 1000
- for computingSite in toBeBoostedSites:
- weight = float(defaultW)
- if siteAccessForUser.has_key(computingSite):
- weight += float(siteAccessForUser[computingSite])
- weight /= totalW
- # the number of boosted jobs at the site
- numBoostedJobsSite = int(numBoostedJobs * weight / quotaFactor)
- _logger.debug("nSite:%s nAll:%s W:%s Q:%s at %s" % (numBoostedJobsSite,numBoostedJobs,weight,quotaFactor,computingSite))
- if numBoostedJobsSite/nJobsPerPrioUnit == 0:
- _logger.debug("too small number of jobs %s to be boosted at %s" % (numBoostedJobsSite,computingSite))
- continue
- # get the highest prio of activated jobs at the site
- varMap = {}
- varMap[':jobStatus'] = 'activated'
- varMap[':prodSourceLabel'] = 'user'
- varMap[':prodUserName'] = prodUserName
- varMap[':computingSite'] = computingSite
- sql = "SELECT MAX(currentPriority) FROM ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND prodUserName=:prodUserName AND workingGroup IS NULL AND jobStatus=:jobStatus AND computingSite=:computingSite"
- status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10)
- maxPrio = None
- if res != None:
- try:
- maxPrio = res[0][0]
- except:
- pass
- if maxPrio == None:
- _logger.debug("cannot get the highest prio at %s" % computingSite)
- continue
- # delta for priority boost
- prioDelta = highestPrio - maxPrio
- # already boosted
- if prioDelta <= 0:
- _logger.debug("already boosted (prio=%s) at %s" % (maxPrio,computingSite))
- continue
- # lower limit
- minPrio = maxPrio - numBoostedJobsSite/nJobsPerPrioUnit
- # SQL for priority boost
- varMap = {}
- varMap[':jobStatus'] = 'activated'
- varMap[':prodSourceLabel'] = 'user'
- varMap[':prodUserName'] = prodUserName
- varMap[':computingSite'] = computingSite
- varMap[':prioDelta'] = prioDelta
- varMap[':maxPrio'] = maxPrio
- varMap[':minPrio'] = minPrio
- varMap[':rlimit'] = numBoostedJobsSite
- sql = "UPDATE ATLAS_PANDA.jobsActive4 SET currentPriority=currentPriority+:prioDelta "
- sql += "WHERE prodSourceLabel=:prodSourceLabel "
- if prodUserName in workingGroupList:
- sql += "AND workingGroup=:prodUserName "
- else:
- sql += "AND prodUserName=:prodUserName AND workingGroup IS NULL "
- sql += "AND jobStatus=:jobStatus AND computingSite=:computingSite AND currentPriority>:minPrio "
- sql += "AND currentPriority<=:maxPrio AND rownum<=:rlimit"
- _logger.debug("boost %s" % str(varMap))
- status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10)
- _logger.debug(" database return : %s" % res)
-
-
-# redo stalled analysis jobs
-_logger.debug("=== redo stalled jobs")
-try:
- varMap = {}
- varMap[':prodSourceLabel'] = 'user'
- sqlJ = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsDefined4 "
- sqlJ += "WHERE prodSourceLabel=:prodSourceLabel AND modificationTime delete downstream jobs")
- # FIXME
- #taskBuffer.deleteStalledJobs(libLFN)
- else:
- # activate
- if useLib and libStatus == 'ready' and (not libGUID in [None,'']) and (not libDSName in [None,'']):
- # update GUID
- _logger.debug(" set GUID:%s for %s" % (libGUID,libLFN))
- #retG = taskBuffer.setGUIDs([{'lfn':libLFN,'guid':libGUID}])
- # FIXME
- retG = True
- if not retG:
- _logger.error(" failed to update GUID for %s" % libLFN)
- else:
- # get PandaID with lib.tgz
- #ids = taskBuffer.updateInFilesReturnPandaIDs(libDSName,'ready')
- ids = []
- # get jobs
- jobs = taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False)
- # remove None and unknown
- acJobs = []
- for job in jobs:
- if job == None or job.jobStatus == 'unknown':
- continue
- acJobs.append(job)
- # activate
- _logger.debug(" -> activate downstream jobs")
- #taskBuffer.activateJobs(acJobs)
- else:
- # wait
- _logger.debug(" -> wait")
- varMap = {}
- varMap[':prodSourceLabel'] = 'user'
- varMap[':jobDefinitionID'] = jobDefinitionID
- varMap[':prodUserName'] = prodUserName
- # FIXME
- #stU,resU = taskBuffer.querySQLS(sqlU,varMap)
-except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("failed to redo stalled jobs with %s %s" % (errtype,errvalue))
-
-_logger.debug("-------------- end")
diff --git a/current/pandaserver/test/proxy.sh b/current/pandaserver/test/proxy.sh
deleted file mode 100755
index 674e1d248..000000000
--- a/current/pandaserver/test/proxy.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash -l
-
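-# refresh the ATLAS production VOMS proxy from the local novoms proxy file and show proxy details before and after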
-echo '************** start'
-date
-source /afs/cern.ch/project/gd/LCG-share/current/external/etc/profile.d/grid-env.sh
-echo '************** check proxy'
-voms-proxy-info -all
-echo '************** check novoms'
-voms-proxy-info -all -file /tmp/x509up_u`id -u`_novoms
-echo '************** voms-proxy-init'
-voms-proxy-init -voms atlas:/atlas/usatlas/Role=production -valid 100000:0 -noregen -debug -cert /tmp/x509up_u`id -u`_novoms
-echo '************** check new proxy'
-voms-proxy-info -all
-echo '************** end'
-echo
diff --git a/current/pandaserver/test/reassignDefJobs.py b/current/pandaserver/test/reassignDefJobs.py
deleted file mode 100755
index 3aecd1374..000000000
--- a/current/pandaserver/test/reassignDefJobs.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import sys
-import time
-import datetime
-from taskbuffer.OraDBProxy import DBProxy
-import userinterface.Client as Client
-from dataservice.DDM import ddm
-
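-# reassign 'managed' jobs stuck in jobsDefined4 longer than the time limit in minutes (first argument, default 60), in chunks of 100, until none are left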
-timeL = 60
-if len(sys.argv) == 2:
- timeL = int(sys.argv[1])
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# erase datasets
-def eraseDispDatasets(ids):
- datasets = []
- # get jobs
- status,jobs = Client.getJobStatus(ids)
- if status != 0:
- return
-    # gather dispatchDBlocks
- for job in jobs:
- for file in job.Files:
- if not file.dispatchDBlock in datasets:
- datasets.append(file.dispatchDBlock)
- # erase
- for dataset in datasets:
-        ddm.DQ2.main(['eraseDataset',dataset])
-
-# time limit
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=int(timeL))
-
-# instantiate DB proxies
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-while True:
- # get PandaIDs
- varMap = {}
- varMap[':jobStatus'] = 'defined'
- varMap[':modificationTime'] = timeLimit
- varMap[':prodSourceLabel'] = 'managed'
- sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID"
- status,res = proxyS.querySQLS(sql,varMap)
- # escape
- if len(res) == 0:
- break
- # convert to list
- jobs = []
- for id, in res:
- jobs.append(id)
- # reassign
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- print 'reassignJobs(%s)' % jobs[iJob:iJob+nJob]
- Client.reassignJobs(jobs[iJob:iJob+nJob])
- iJob += nJob
- time.sleep(120)
-
-
diff --git a/current/pandaserver/test/reassignJobs.py b/current/pandaserver/test/reassignJobs.py
deleted file mode 100755
index ab17c5b42..000000000
--- a/current/pandaserver/test/reassignJobs.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import sys
-
-import userinterface.Client as Client
-
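-# reassign a single PandaID when one argument is given, or an inclusive PandaID range when two are given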
-if len(sys.argv) == 2:
- Client.reassignJobs([sys.argv[1]])
-else:
- startID = int(sys.argv[1])
- endID = int(sys.argv[2])
- if startID > endID:
- print '%d is less than %d' % (endID,startID)
- sys.exit(1)
- Client.reassignJobs(range(startID,endID+1))
-
diff --git a/current/pandaserver/test/reassignSite.py b/current/pandaserver/test/reassignSite.py
deleted file mode 100644
index 2d80aaa36..000000000
--- a/current/pandaserver/test/reassignSite.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import re
-import sys
-import time
-import datetime
-
-from taskbuffer.OraDBProxy import DBProxy
-# password
-from config import panda_config
-
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-site = sys.argv[1]
-import userinterface.Client as Client
-from dataservice.DDM import ddm
-
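-# reassign 'managed' jobs that have stayed activated at the given site for more than 4 hours, erasing their dispatch datasets first (skipped for the US cloud where dispatch blocks are not DQ2 datasets)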
-# erase dispatch datasets
-def eraseDispDatasets(ids):
- print "eraseDispDatasets"
- datasets = []
- # get jobs
- status,jobs = Client.getJobStatus(ids)
- if status != 0:
- return
-    # gather dispatchDBlocks
- for job in jobs:
- # dispatchDS is not a DQ2 dataset in US
- if job.cloud == 'US':
- continue
- # erase disp datasets for production jobs only
- if job.prodSourceLabel != 'managed':
- continue
- for file in job.Files:
- if file.dispatchDBlock == 'NULL':
- continue
- if (not file.dispatchDBlock in datasets) and \
- re.search('_dis\d+$',file.dispatchDBlock) != None:
- datasets.append(file.dispatchDBlock)
- # erase
- for dataset in datasets:
- print 'erase %s' % dataset
- status,out = ddm.DQ2.main('eraseDataset',dataset)
- print out
-
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=4)
-varMap = {}
-varMap[':jobStatus'] = 'activated'
-varMap[':modificationTime'] = timeLimit
-varMap[':prodSourceLabel'] = 'managed'
-varMap[':computingSite'] = site
-sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND computingSite=:computingSite AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID"
-status,res = proxyS.querySQLS(sql,varMap)
-
-jobs = []
-if res != None:
- for (id,) in res:
- jobs.append(id)
-if len(jobs):
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- print 'reassign %s' % str(jobs[iJob:iJob+nJob])
- eraseDispDatasets(jobs[iJob:iJob+nJob])
- Client.reassignJobs(jobs[iJob:iJob+nJob])
- iJob += nJob
- time.sleep(10)
-
diff --git a/current/pandaserver/test/reassignTask.py b/current/pandaserver/test/reassignTask.py
deleted file mode 100644
index 475975aeb..000000000
--- a/current/pandaserver/test/reassignTask.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import re
-import sys
-import time
-import datetime
-
-from taskbuffer.OraDBProxy import DBProxy
-# password
-from config import panda_config
-
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-taskid = sys.argv[1]
-import userinterface.Client as Client
-
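-# reassign 'managed' jobs of the given task that have not been modified for an hour: first those still in jobsDefined4, then activated ones in jobsActive4, in chunks of 100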
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
-varMap = {}
-varMap[':modificationTime'] = timeLimit
-varMap[':prodSourceLabel'] = 'managed'
-varMap[':taskID'] = taskid
-sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE taskID=:taskID AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID"
-status,res = proxyS.querySQLS(sql,varMap)
-
-jobs = []
-if res != None:
- for (id,) in res:
- jobs.append(id)
-if len(jobs):
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- print 'reassign %s' % str(jobs[iJob:iJob+nJob])
- Client.reassignJobs(jobs[iJob:iJob+nJob])
- iJob += nJob
- time.sleep(10)
-
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
-varMap = {}
-varMap[':jobStatus'] = 'activated'
-varMap[':modificationTime'] = timeLimit
-varMap[':prodSourceLabel'] = 'managed'
-varMap[':taskID'] = taskid
-sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND taskID=:taskID AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID"
-status,res = proxyS.querySQLS(sql,varMap)
-
-jobs = []
-if res != None:
- for (id,) in res:
- jobs.append(id)
-if len(jobs):
- nJob = 100
- iJob = 0
- while iJob < len(jobs):
- print 'reassign %s' % str(jobs[iJob:iJob+nJob])
- Client.reassignJobs(jobs[iJob:iJob+nJob])
- iJob += nJob
- time.sleep(10)
-
-
-
diff --git a/current/pandaserver/test/reassignWaiting.py b/current/pandaserver/test/reassignWaiting.py
deleted file mode 100755
index 24c8a232f..000000000
--- a/current/pandaserver/test/reassignWaiting.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import time
-import datetime
-from taskbuffer.OraDBProxy import DBProxy
-import userinterface.Client as Client
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# time limit
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
-
-# instantiate DB proxies
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
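-# keep reassigning jobs that have sat in jobsWaiting4 for more than an hour, in chunks of 300, until no such jobs remain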
-while True:
- # get PandaIDs
- varMap = {}
- varMap[':modificationTime'] = timeLimit
- sql = "SELECT PandaID FROM ATLAS_PANDA.jobsWaiting4 WHERE modificationTime<:modificationTime ORDER BY PandaID"
- status,res = proxyS.querySQLS(sql,varMap)
-
- # escape
- if len(res) == 0:
- break
- # convert to list
- jobs = []
- for id, in res:
- jobs.append(id)
- # reassign
- nJob = 300
- iJob = 0
- while iJob < len(jobs):
- print 'reassignJobs(%s)' % jobs[iJob:iJob+nJob]
- Client.reassignJobs(jobs[iJob:iJob+nJob])
- iJob += nJob
- time.sleep(60)
-
diff --git a/current/pandaserver/test/redirectLog.py b/current/pandaserver/test/redirectLog.py
deleted file mode 100755
index 351d4a192..000000000
--- a/current/pandaserver/test/redirectLog.py
+++ /dev/null
@@ -1,40 +0,0 @@
-
-"""
-redirect apache log to the logging server
-
-"""
-
-import re
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_loggerMap = {}
-pandaLogger = PandaLogger()
-
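-# read Apache access-log lines from stdin, extract host, request and response code, and send one record per request to the PanDA logging server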
-while True:
- # read line
- line = raw_input()
- # extract host, request and response
- items = re.findall('(\S+) - - \[[^\]]+\] ("[^"]+") (\d+)',line)
- if len(items) == 1:
- # host
- host = items[0][0]
- # request
- request = items[0][1].split()[1].split('/')[-1]
- if request == 'isAlive':
- # somehow isAlive is not recorded
- request = 'IsAlive'
- # set logtype
- if request.startswith('datasetCompleted'):
- logtype = 'datasetCompleted'
- else:
- logtype = request
- # response
- response = items[0][2]
- # make message
- message = '%s - %s %s' % (host,request,response)
- # get logger
- pandaLogger.setParam('Type',logtype)
- logger = pandaLogger.getHttpLogger('prod')
- # add message
- logger.info(message)
diff --git a/current/pandaserver/test/redirectLog.sh b/current/pandaserver/test/redirectLog.sh
deleted file mode 100755
index c60e9ff27..000000000
--- a/current/pandaserver/test/redirectLog.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
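-# tail the Apache access log and pipe it into redirectLog.py, which forwards each request to the PanDA logging server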
-BASEPATH=/usatlas/u/sm/prod
-BINPATH=/usatlas/u/sm/latest
-LOG=$BASEPATH/httpd/logs/access_log
-
-# for python
-export PATH=$BINPATH/python/bin:$PATH
-export PYTHONPATH=$BASEPATH/panda:$PYTHONPATH
-
-tail -F $LOG | python $BASEPATH/panda/test/redirectLog.py
diff --git a/current/pandaserver/test/resubmitJobs.py b/current/pandaserver/test/resubmitJobs.py
deleted file mode 100755
index 7272d19ca..000000000
--- a/current/pandaserver/test/resubmitJobs.py
+++ /dev/null
@@ -1,14 +0,0 @@
-import sys
-
-import userinterface.Client as Client
-
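-# resubmit a single PandaID when one argument is given, or an inclusive PandaID range when two are given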
-if len(sys.argv) == 2:
- Client.resubmitJobs([sys.argv[1]])
-else:
- startID = int(sys.argv[1])
- endID = int(sys.argv[2])
- if startID > endID:
- print '%d is less than %d' % (endID,startID)
- sys.exit(1)
- Client.resubmitJobs(range(startID,endID+1))
-
diff --git a/current/pandaserver/test/runMerger.py b/current/pandaserver/test/runMerger.py
deleted file mode 100644
index ba765b16f..000000000
--- a/current/pandaserver/test/runMerger.py
+++ /dev/null
@@ -1,219 +0,0 @@
-import os
-import re
-import sys
-import time
-import datetime
-import commands
-import threading
-
-from config import panda_config
-
-# initialize cx_Oracle using dummy connection
-from taskbuffer.Initializer import initializer
-initializer.init()
-
-from dataservice.Merger import Merger
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-
-
-# logger
-_logger = PandaLogger().getLogger('runMerger')
-
-_logger.debug("================= start ==================")
-
-# overall timeout value
-overallTimeout = 60
-
-# kill old process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName)
- for line in out.split('\n'):
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill process : %s %s" % (type,value))
-
-# time limit
-timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=5)
-timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(hours=12)
-timeLimitX = datetime.datetime.utcnow() - datetime.timedelta(hours=6)
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-# thread pool
-class ThreadPool:
- def __init__(self):
- self.lock = threading.Lock()
- self.list = []
-
- def add(self,obj):
- self.lock.acquire()
- self.list.append(obj)
- self.lock.release()
-
- def remove(self,obj):
- self.lock.acquire()
- self.list.remove(obj)
- self.lock.release()
-
- def join(self):
- self.lock.acquire()
- thrlist = tuple(self.list)
- self.lock.release()
- for thr in thrlist:
- thr.join()
-
-
-# thread to merge dataset
-class MergerThr (threading.Thread):
- def __init__(self,lock,proxyLock,datasets,pool):
- threading.Thread.__init__(self)
- self.datasets = datasets
- self.lock = lock
- self.proxyLock = proxyLock
- self.pool = pool
- self.maxTry = 3
- self.pool.add(self)
-
- def run(self):
- self.lock.acquire()
- try:
- # loop over all datasets
- for vuid,name,modDate,verNum in self.datasets:
- try:
- try:
- verNum = int(verNum)
- except:
- verNum = 0
- _logger.debug("Merge %s %s %s" % (modDate,name,verNum))
- toBeClosed = False
- # close old datasets anyway
- if modDate < timeLimitX or verNum >= self.maxTry:
- toBeClosed = True
- # check version
- dsSpec = taskBuffer.queryDatasetWithMap({'vuid':vuid})
- if dsSpec == None:
- _logger.error("failed to get dataset spec for %s:%s" % (name,vuid))
- continue
- try:
- if int(dsSpec.version) != verNum+1:
- _logger.debug("skip %s due to version mismatch %s != %s+1" % (name,dsSpec.version,verNum))
- continue
- except:
- _logger.error("failed to convert version='%s' to int for %s" % (dsSpec.version,name))
- continue
- # get PandaID
- self.proxyLock.acquire()
- proxyS = taskBuffer.proxyPool.getProxy()
- pandaID = proxyS.getPandaIDwithDestDBlock(name)
- taskBuffer.proxyPool.putProxy(proxyS)
- self.proxyLock.release()
- if pandaID == None:
- _logger.error("failed to find PandaID for %s" % name)
- toBeClosed = True
- else:
- # get job
- self.proxyLock.acquire()
- pandaJob = taskBuffer.peekJobs([pandaID])[0]
- self.proxyLock.release()
- if pandaJob == None:
- _logger.error("failed to get job for %s PandaID=%s" % (name,pandaID))
- toBeClosed = True
- else:
- # run merger
- _logger.debug("run merger for %s" % name)
- merger = Merger(taskBuffer,pandaJob)
- mRet = merger.run()
- if mRet == None:
- _logger.debug("got unrecoverable for %s" % name)
- toBeClosed = True
- elif mRet == True:
- _logger.debug("succeeded for %s" % name)
- toBeClosed = True
- else:
- _logger.debug("failed for %s" % name)
- # close dataset
- if toBeClosed:
- _logger.debug("close %s" % name)
- self.proxyLock.acquire()
- varMap = {}
- varMap[':vuid'] = vuid
- varMap[':status'] = 'tobeclosed'
- taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid",
- varMap)
- self.proxyLock.release()
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("Failed %s with %s:%s" % (name,errType,errValue))
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("MergerThr failed with %s:%s" % (errType,errValue))
- self.pool.remove(self)
- self.lock.release()
-
-
-# start merger
-mergeLock = threading.Semaphore(3)
-mergeProxyLock = threading.Lock()
-mergeThreadPool = ThreadPool()
-maxRows = 10000
-sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows
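-# poll for output datasets in 'tobemerged' status within the time window, lock them, and hand them to MergerThr workers in chunks of 100; stop once fewer than maxRows rows are returned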
-while True:
- # lock
- mergeLock.acquire()
- # get datasets
- mergeProxyLock.acquire()
- varMap = {}
- varMap[':modificationdateU'] = timeLimitU
- varMap[':modificationdateL'] = timeLimitL
- varMap[':type'] = 'output'
- varMap[':status'] = 'tobemerged'
- proxyS = taskBuffer.proxyPool.getProxy()
- res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60',getVersion=True)
- taskBuffer.proxyPool.putProxy(proxyS)
- if res == None:
- _logger.debug("# of datasets to be merged: %s" % res)
- else:
- _logger.debug("# of datasets to be merged: %s" % len(res))
- if res==None or len(res)==0:
- mergeProxyLock.release()
- mergeLock.release()
- break
- # release
- mergeProxyLock.release()
- mergeLock.release()
- # run thread
- iRows = 0
- nRows = 100
- while iRows < len(res):
- mergerThr = MergerThr(mergeLock,mergeProxyLock,res[iRows:iRows+nRows],mergeThreadPool)
- mergerThr.start()
- iRows += nRows
- mergeThreadPool.join()
- if len(res) < maxRows:
- break
-
-
-_logger.debug("================= end ==================")
diff --git a/current/pandaserver/test/runRebro.py b/current/pandaserver/test/runRebro.py
deleted file mode 100755
index 494a0798d..000000000
--- a/current/pandaserver/test/runRebro.py
+++ /dev/null
@@ -1,198 +0,0 @@
-import os
-import re
-import sys
-import pytz
-import time
-import fcntl
-import types
-import shelve
-import random
-import datetime
-import commands
-import threading
-import userinterface.Client as Client
-from dataservice.DDM import ddm
-from dataservice.DDM import dashBorad
-from taskbuffer.OraDBProxy import DBProxy
-from taskbuffer.TaskBuffer import taskBuffer
-from pandalogger.PandaLogger import PandaLogger
-from jobdispatcher.Watcher import Watcher
-from brokerage.SiteMapper import SiteMapper
-from dataservice.Adder import Adder
-from dataservice.Finisher import Finisher
-from dataservice.MailUtils import MailUtils
-from taskbuffer import ProcessGroups
-import brokerage.broker_util
-import brokerage.broker
-import taskbuffer.ErrorCode
-import dataservice.DDM
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# logger
-_logger = PandaLogger().getLogger('runRebro')
-
-_logger.debug("===================== start =====================")
-
-# memory checker
-def _memoryCheck(str):
- try:
- proc_status = '/proc/%d/status' % os.getpid()
- procfile = open(proc_status)
- name = ""
- vmSize = ""
- vmRSS = ""
- # extract Name,VmSize,VmRSS
- for line in procfile:
- if line.startswith("Name:"):
- name = line.split()[-1]
- continue
- if line.startswith("VmSize:"):
- vmSize = ""
- for item in line.split()[1:]:
- vmSize += item
- continue
- if line.startswith("VmRSS:"):
- vmRSS = ""
- for item in line.split()[1:]:
- vmRSS += item
- continue
- procfile.close()
- _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str))
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("memoryCheck() : %s %s" % (type,value))
- _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str))
- return
-
-_memoryCheck("start")
-
-# kill old process
-try:
- # time limit
- timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=7)
- # get process list
- scriptName = sys.argv[0]
- out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName)
- for line in out.split('\n'):
- items = line.split()
- # owned process
- if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron
- continue
- # look for python
- if re.search('python',line) == None:
- continue
- # PID
- pid = items[1]
- # start time
- timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line)
- startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6])
- # kill old process
- if startTime < timeLimit:
- _logger.debug("old process : %s %s" % (pid,startTime))
- _logger.debug(line)
- commands.getoutput('kill -9 %s' % pid)
-except:
- type, value, traceBack = sys.exc_info()
- _logger.error("kill process : %s %s" % (type,value))
-
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-# instantiate sitemapper
-siteMapper = SiteMapper(taskBuffer)
-
-_memoryCheck("rebroker")
-
-# rebrokerage
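-# pick up user/panda analysis jobs (pathena/prun) that have stayed activated beyond the time limits and re-run brokerage for them via ReBroker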
-_logger.debug("Rebrokerage start")
-try:
- normalTimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=24)
- sortTimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
- sql = "SELECT jobDefinitionID,prodUserName,prodUserID,computingSite,MAX(modificationTime) FROM ATLAS_PANDA.jobsActive4 "
- sql += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus=:jobStatus "
- sql += "AND modificationTime<:modificationTime "
- sql += "AND jobsetID IS NOT NULL "
- sql += "AND processingType IN (:processingType1,:processingType2) "
- sql += "GROUP BY jobDefinitionID,prodUserName,prodUserID,computingSite "
- varMap = {}
- varMap[':prodSourceLabel1'] = 'user'
- varMap[':prodSourceLabel2'] = 'panda'
- varMap[':modificationTime'] = sortTimeLimit
- varMap[':processingType1'] = 'pathena'
- varMap[':processingType2'] = 'prun'
- varMap[':jobStatus'] = 'activated'
- # get jobs older than threshold
- ret,res = taskBuffer.querySQLS(sql, varMap)
- sql = "SELECT PandaID,modificationTime FROM %s WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
- sql += "AND modificationTime>:modificationTime AND rownum <= 1"
- if res != None:
- from userinterface.ReBroker import ReBroker
- recentRuntimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
- # loop over all user/jobID combinations
- iComb = 0
- nComb = len(res)
- _logger.debug("total combinations = %s" % nComb)
- for jobDefinitionID,prodUserName,prodUserID,computingSite,maxModificationTime in res:
- # check time if it is closed to log-rotate
- timeNow = datetime.datetime.now(pytz.timezone('Europe/Zurich'))
- timeCron = timeNow.replace(hour=4,minute=0,second=0,microsecond=0)
- if (timeNow-timeCron) < datetime.timedelta(seconds=60*10) and \
- (timeCron-timeNow) < datetime.timedelta(seconds=60*30):
- _logger.debug("terminate since close to log-rotate time")
- break
- # check if jobs with the jobID have run recently
- varMap = {}
- varMap[':prodUserName'] = prodUserName
- varMap[':jobDefinitionID'] = jobDefinitionID
- varMap[':modificationTime'] = recentRuntimeLimit
- _logger.debug(" rebro:%s/%s:ID=%s:%s" % (iComb,nComb,jobDefinitionID,prodUserName))
- iComb += 1
- hasRecentJobs = False
- # check site
- if not siteMapper.checkSite(computingSite):
- _logger.debug(" -> skip unknown site=%s" % computingSite)
- continue
- # check site status
- tmpSiteStatus = siteMapper.getSite(computingSite).status
-            if not tmpSiteStatus in ['offline','test']:
-                # use normal time limit for normal site status
-                if maxModificationTime > normalTimeLimit:
-                    _logger.debug("  -> skip since last modification %s is newer than normal time limit %s" % (maxModificationTime,normalTimeLimit))
-                    break
-            else:
-                _logger.debug("  -> immediate rebro due to site status=%s" % tmpSiteStatus)
- if hasRecentJobs:
- # skip since some jobs have run recently
- continue
- else:
- reBroker = ReBroker(taskBuffer)
- # try to lock
- rebRet,rebOut = reBroker.lockJob(prodUserID,jobDefinitionID)
- if not rebRet:
- # failed to lock
- _logger.debug(" -> failed to lock : %s" % rebOut)
- continue
- else:
- # start
- _logger.debug(" -> start")
- reBroker.start()
- reBroker.join()
-except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("rebrokerage failed with %s:%s" % (errType,errValue))
-
-_logger.debug("===================== end =====================")
diff --git a/current/pandaserver/test/setPriority.py b/current/pandaserver/test/setPriority.py
deleted file mode 100755
index 7dab5b3c2..000000000
--- a/current/pandaserver/test/setPriority.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import time
-import sys
-import optparse
-
-
-from taskbuffer.OraDBProxy import DBProxy
-
-# password
-from config import panda_config
-
-usage = """%prog <taskID> <priority>
-
-    Set a priority for jobs in a task"""
-
-optP = optparse.OptionParser(usage=usage,conflict_handler="resolve")
-options,args = optP.parse_args()
-
-
-proxyS = DBProxy()
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
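-# usage: setPriority.py <taskID> <priority>; updates currentPriority for 'managed' jobs of the task in the active, waiting, and defined job tables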
-varMap = {}
-varMap[':prodSourceLabel'] = 'managed'
-varMap[':taskID'] = sys.argv[1]
-varMap[':prio'] = sys.argv[2]
-sql = "UPDATE %s SET currentPriority=:prio WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID"
-for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']:
- status,res = proxyS.querySQLS(sql % table,varMap)
-
-
diff --git a/current/pandaserver/test/testDB.py b/current/pandaserver/test/testDB.py
deleted file mode 100755
index 752bf3f77..000000000
--- a/current/pandaserver/test/testDB.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/python
-
-"""
-test DB access
-
-"""
-
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-from taskbuffer.DatasetSpec import DatasetSpec
-from taskbuffer.DBProxyPool import DBProxyPool
-
-import getpass
-passwd = getpass.getpass()
-
-pool = DBProxyPool('adbpro.usatlas.bnl.gov',passwd,2)
-
-proxy = pool.getProxy()
-
-import sys
-import commands
-
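-# exercise the basic DBProxy workflow: insert two jobs, activate and fetch them, update their status, archive them, then round-trip a dataset record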
-job1 = JobSpec()
-job1.PandaID='NULL'
-job1.jobStatus='unknown'
-job1.computingSite="aaa"
-f11 = FileSpec()
-f11.lfn = 'in1.pool.root'
-f11.type = 'input'
-job1.addFile(f11)
-f12 = FileSpec()
-f12.lfn = 'out1.pool.root'
-f12.type = 'output'
-job1.addFile(f12)
-
-job2 = JobSpec()
-job2.PandaID='NULL'
-job2.jobStatus='unknown'
-job2.computingSite="bbb"
-f21 = FileSpec()
-f21.lfn = 'in2.pool.root'
-f21.type = 'input'
-job2.addFile(f21)
-f22 = FileSpec()
-f22.lfn = 'out2.pool.root'
-f22.type = 'output'
-job2.addFile(f22)
-
-proxy.insertNewJob(job1)
-proxy.insertNewJob(job2)
-print "Inserted %d %d" % (job1.PandaID,job2.PandaID)
-proxy.activateJob(job1)
-proxy.activateJob(job2)
-print "activated"
-ret = proxy.getJobs(1,"aaa")
-print "Got Jobs"
-for j in ret:
- print j.PandaID
-print proxy.peekJob(job1.PandaID).jobStatus
-proxy.updateJobStatus(job1.PandaID,"unknown")
-print " ->" ,proxy.peekJob(job1.PandaID).jobStatus
-
-print proxy.peekJob(job2.PandaID).jobStatus
-job2.jobStatus = "running"
-proxy.updateJob(job2,False)
-print " ->" ,proxy.peekJob(job2.PandaID).jobStatus
-print "Updated"
-proxy.archiveJob(job1,False)
-proxy.archiveJobLite(job2.PandaID,job2.jobStatus)
-print "Archived"
-proxy.querySQL("DELETE FROM jobsArchived3 WHERE PandaID=%d" % job1.PandaID)
-proxy.querySQL("DELETE FROM jobsArchived3 WHERE PandaID=%d" % job2.PandaID)
-print "job Deleted"
-
-print "dataset"
-dataset = DatasetSpec()
-dataset.vuid = commands.getoutput('/usr/bin/uuidgen')
-dataset.name = 'test.%s' % dataset.vuid
-
-proxy.insertDataset(dataset)
-print dataset.vuid
-dataset2 = proxy.queryDataset(dataset.vuid)
-print dataset2.values()
-dataset2.type = 'test'
-proxy.updateDataset(dataset2)
-dataset3 = proxy.queryDataset(dataset.vuid)
-print dataset3.values()
-proxy.querySQL("DELETE FROM Datasets WHERE vuid='%s'" % dataset.vuid)
diff --git a/current/pandaserver/test/testDQ.py b/current/pandaserver/test/testDQ.py
deleted file mode 100755
index 381cdece8..000000000
--- a/current/pandaserver/test/testDQ.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import commands
-from dataservice.DDM import ddm
-
-#print ddm.DQ2ProductionClient.generateUUID()
-#print ddm.DQ2.getFilesFromCatalog('aho.xml')
-#print ddm.DQ2ProductionClient.dq2_makeblocks('input.data')
-
-ids=['pandatest.000003.dd.input._00047.junk','09801b0a-9fd0-4237-8caf-a37932c26e39',
- 'pandatest.000003.dd.input._00050.junk','6dd3d367-4aa3-4e1a-9ac3-9ad14b7311f4',
- 'pandatest.000003.dd.input._00037.junk','817c2c92-467b-4a1b-9482-f2ec8468cf2e',
- 'pandatest.000003.dd.input._00021.junk','7720527f-817e-40c7-9e29-ce237f59edfa',
- 'pandatest.000003.dd.input._00023.junk','5f1f9982-85a3-4d1a-9ee9-f1de22c02544',
- 'pandatest.000003.dd.input._00042.junk','610cc91a-c731-4bce-ac7a-ff5133e7d18b',
- 'pandatest.000003.dd.input._00027.junk','bd987478-3c59-4551-b12b-2853bac25613',
- 'pandatest.000003.dd.input._00032.junk','9d0424f3-7552-4282-92f2-dfe74e9a6c12',
- 'pandatest.000003.dd.input._00009.junk','dce33d4a-4569-49ee-95c5-b619b161c777',
- 'pandatest.000003.dd.input._00036.junk','2fc9836b-82d6-41b0-b966-a5c37662172d',
- 'pandatest.000003.dd.input._00031.junk','65b957e0-5ecc-44bb-a1f9-cccb61ca2d16',
- 'pandatest.000003.dd.input._00025.junk','be29fe82-17e2-4122-b4c8-f49a0b76c81f',
- 'pandatest.000003.dd.input._00029.junk','afa4322f-409b-4327-9169-229d8d48ad5a',
- 'pandatest.000003.dd.input._00013.junk','cf236d3b-45fd-4b58-bdfb-59abc983c886',
- 'pandatest.000003.dd.input._00020.junk','b02f98da-0138-4b58-89ba-a88f37214a89',
- 'pandatest.000003.dd.input._00001.junk','12ab5bb9-944e-4e75-bb90-b64c462d4cd8',
- 'pandatest.000003.dd.input._00001.junk','12ab5bb9-944e-4e75-bb90-b64c462d4cd8',
- 'pandatest.000003.dd.input._00006.junk','c0a422ad-e9f1-44bb-9539-cfef7e739da2',
- 'pandatest.000003.dd.input._00034.junk','da670db3-3638-4f06-b650-a9315eb2bd63',
- 'pandatest.000003.dd.input._00046.junk','2fcef270-2e41-472d-83c0-53749b401b74',
- 'pandatest.000003.dd.input._00012.junk','5e212fa1-201f-494d-a2b2-420b229b08fc',
- 'pandatest.000003.dd.input._00044.junk','87c8ebcc-a637-4204-b77b-8219e68b98d7',
- 'pandatest.000003.dd.input._00030.junk','87ad811f-7d39-43d9-8a13-e117079bb208',
- 'pandatest.000003.dd.input._00022.junk','6b902506-1ee1-46b1-a105-1521a8c0dbca',
- 'pandatest.000003.dd.input._00017.junk','2bbed213-943c-41be-b9d7-7d86a309b0b2',
- 'pandatest.000003.dd.input._00049.junk','8366e269-f9ae-4b9c-bd98-df4027c992c7',
- 'pandatest.000003.dd.input._00015.junk','f3c5f37c-b4c2-4933-9633-467ba3a7c364',
- 'pandatest.000003.dd.input._00004.junk','35d66be2-9d21-44a3-96f7-903a7abf4a87',
- 'pandatest.000003.dd.input._00010.junk','2279ea3e-ebbb-4b19-9a69-9868f0cce694',
- 'pandatest.000003.dd.input._00040.junk','a847dbbb-4f98-4b5b-b353-e29e3e3b3fd5',
- 'pandatest.000003.dd.input._00007.junk','abfef002-62ca-4d84-9813-6329764e38bd',
- 'pandatest.000003.dd.input._00048.junk','52854023-67d8-4a0f-99ac-bb1f0bd1dc98',
- 'pandatest.000003.dd.input._00016.junk','bddf7441-6ac9-4087-bafe-32e47448cdc1',
- 'pandatest.000003.dd.input._00041.junk','c76999ba-4cdf-49e9-bfa5-ff3525fbf1ab',
- 'pandatest.000003.dd.input._00003.junk','4865119e-367f-4dd8-bdff-505bd878dfde',
- 'pandatest.000003.dd.input._00019.junk','b9fce1fd-8d4c-4fc4-932f-12b13263ca0c',
- 'pandatest.000003.dd.input._00011.junk','f93a4e08-fd4f-45fc-b324-91ff59555b1c',
- 'pandatest.000003.dd.input._00018.junk','e4894561-9589-40d8-871b-b57d70564384',
- 'pandatest.000003.dd.input._00002.junk','58934980-5ab3-4a66-b3da-55f86d4b54bd',
- 'pandatest.000003.dd.input._00005.junk','5993fe60-bc8c-4fd8-aac1-dfd55700c9c3',
- 'pandatest.000003.dd.input._00028.junk','6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27',
- 'pandatest.000003.dd.input._00033.junk','98f79ba1-1793-4253-aac7-bdf90a51d1ee',
- 'pandatest.000003.dd.input._00039.junk','33660dd5-7cef-422a-a7fc-6c24cb10deb1',
- 'pandatest.000003.dd.input._00014.junk','5c0e9ed8-05a6-41c4-8c07-39b2be33ebc1',
- 'pandatest.000003.dd.input._00008.junk','b0c184d1-5f5e-45a6-9cc8-8b0f20a85463',
- 'pandatest.000003.dd.input._00038.junk','b9171997-4d2b-4075-b154-579ebe9438fa',
- 'pandatest.000003.dd.input._00026.junk','89e5bdf1-15de-44ae-a388-06c1e7d7e2fc',
- 'pandatest.000003.dd.input._00024.junk','c77b77a2-e6d1-4360-8751-19d9fb77e1f1',
- 'pandatest.000003.dd.input._00043.junk','cc6ac2a1-4616-4551-80a7-d96f79252b64',
- 'pandatest.000003.dd.input._00045.junk','ddbed17a-6d65-4e8d-890a-21e1eaa3e9d6',
- 'pandatest.000003.dd.input._00035.junk','8ed1875a-eb90-4906-8fc4-0449d300ddfe'
- ]
-
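-# register a test dataset with two of the files above, add its location at the BNL LRC, and subscribe it to the BU DQ2 endpoint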
-for i in range(1):
- datasetName='testDQ.%s' % commands.getoutput('/usr/bin/uuidgen')
- print datasetName
-
- #['pandatest.000003.dd.input._00004.junk','35d66be2-9d21-44a3-96f7-903a7abf4a87']
- #'pandatest.000003.dd.input._00028.junk','6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27',
- # 'pandatest.000003.dd.input._00033.junk','98f79ba1-1793-4253-aac7-bdf90a51d1ee']
- print (['registerNewDataset','-c',datasetName]+ids[i*2:i*2+2])
- ddm.DQ2.main(['registerNewDataset','-c',datasetName]+ids[i*2:i*2+2])
- '''
- status,out = ddm.RepositoryClient.main(['queryDatasetByName',datasetName])
- exec "vuids = %s" % out.split('\n')[0]
- if vuids.has_key(datasetName):
- vuid = vuids[datasetName]
- print vuid
- status,out = ddm.RepositoryClient.main(['resolveVUID',vuid])
- status,out = ddm.DQ2.getFilesFromCatalog('baka.xml')
- exec "rets = %s" % out.split('\n')[0]
- print rets[0]
- exec "ids = %s" % out
- print ddm.DQ2.main(['addFilesToDataset',datasetName]+ids)
- status,out = ddm.DQ2.main(['listFilesInDataset',datasetName])
- print out
- '''
- print (['registerDatasetLocations','-c',datasetName,'http://dms02.usatlas.bnl.gov/sites/bnl/lrc'])
- ddm.DQ2.main(['registerDatasetLocations','-c',datasetName,
- 'http://dms02.usatlas.bnl.gov/sites/bnl/lrc'])
- print (['registerDatasetSubscription',datasetName,'http://doe-dhcp241.bu.edu:8000/dq2/'])
- ddm.DQ2.main(['registerDatasetSubscription',datasetName,'http://doe-dhcp241.bu.edu:8000/dq2/'])
-#print ddm.DQ2.main(['eraseDataset',datasetName])
-
-#print ddm.DQ2.main(['eraseDataset',datasetName])
-#print ddm.DQ2ProductionClient.dq2_create_dataset(datasetName)
-#status,out = ddm.DQ2ProductionClient.dq2_assign_destination(datasetName,'BNL_SE')
-#print out
-#print ddm.DQ2.main(['eraseDataset',datasetName])
-#status,out = ddm.DQ2.main(['listFilesInDataset','panda.destDB.11aed982-8079-4db9-964c-37a284b8597a'])
-#print out
-
-ddm.DQ2_iter.listFileReplicasBySites('mc11_7TeV.151900.madgraph_SM_SG_SS_direct_1200_600_395.merge.AOD.e1095_a131_s1353_a145_r2993_tid723983_00',
- 0,['SARA-MATRIX_DATADISK'],
- 0,300)
diff --git a/current/pandaserver/test/testEvgen.py b/current/pandaserver/test/testEvgen.py
deleted file mode 100755
index db636a439..000000000
--- a/current/pandaserver/test/testEvgen.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = None
-
-jobList = []
-
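-# submit a single event-generation test job (release 14.1.0) to the given site in the US cloud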
-for i in range(1):
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i)
- job.AtlasRelease = 'Atlas-14.1.0'
- job.homepackage = 'AtlasProduction/14.1.0.3'
- job.transformation = 'csc_evgen_trf.py'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 100
- job.prodSourceLabel = 'test'
- job.computingSite = site
- job.cloud = 'US'
- job.cmtConfig = 'i686-slc4-gcc34-opt'
-
- file = FileSpec()
- file.lfn = "%s.evgen.pool.root" % job.jobName
- file.destinationDBlock = job.destinationDBlock
- file.destinationSE = job.destinationSE
- file.dataset = job.destinationDBlock
- file.destinationDBlockToken = 'ATLASDATADISK'
- file.type = 'output'
- job.addFile(file)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="5144 1 5000 1 CSC.005144.PythiaZee.py %s NONE NONE NONE" % file.lfn
- jobList.append(job)
-
-for i in range(1):
- s,o = Client.submitJobs(jobList)
- print "---------------------"
- print s
- for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testEvgen14.py b/current/pandaserver/test/testEvgen14.py
deleted file mode 100755
index af53c0e95..000000000
--- a/current/pandaserver/test/testEvgen14.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = 'BNL_SE'
-
-jobList = []
-
-for i in range(1):
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i)
- job.AtlasRelease = 'Atlas-14.1.0'
- job.homepackage = 'AtlasProduction/14.1.0.3'
- job.transformation = 'csc_evgen_trf.py'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 1000
- job.prodSourceLabel = 'test'
- job.computingSite = site
- job.processingType = 'test'
- job.cmtConfig = 'i686-slc4-gcc34-opt'
-
- file = FileSpec()
- file.lfn = "%s.evgen.pool.root" % job.jobName
- file.destinationDBlock = job.destinationDBlock
- file.destinationSE = job.destinationSE
- file.dataset = job.destinationDBlock
- file.destinationDBlockToken = 'ATLASDATADISK'
- file.type = 'output'
- job.addFile(file)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="5144 1 5000 1 CSC.005144.PythiaZee.py %s NONE NONE NONE" % file.lfn
- jobList.append(job)
-
-for i in range(1):
- s,o = Client.submitJobs(jobList)
- print "---------------------"
- print s
- for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testEvgen15.py b/current/pandaserver/test/testEvgen15.py
deleted file mode 100755
index 0753e3329..000000000
--- a/current/pandaserver/test/testEvgen15.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-site = sys.argv[1]
-cloud = sys.argv[2]
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = None
-
-jobList = []
-
-for i in range(1):
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i)
- job.AtlasRelease = 'Atlas-15.6.10'
- job.homepackage = 'AtlasProduction/15.6.10.1'
- job.transformation = 'Evgen_trf.py'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 10000
- job.prodSourceLabel = 'test'
- job.computingSite = site
- job.cloud = cloud
- job.cmtConfig = 'i686-slc5-gcc43-opt'
-
- file = FileSpec()
- file.lfn = "%s.evgen.pool.root" % job.jobName
- file.destinationDBlock = job.destinationDBlock
- file.destinationSE = job.destinationSE
- file.dataset = job.destinationDBlock
- file.destinationDBlockToken = 'ATLASDATADISK'
- file.type = 'output'
- job.addFile(file)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="10000 105815 12330001 5000 12467 MC9.105815.JF140_pythia_jet_filter.py %s NONE NONE NONE MC09JobOpts-00-01-88.tar.gz" % file.lfn
- jobList.append(job)
-
-for i in range(1):
- s,o = Client.submitJobs(jobList)
- print "---------------------"
- print s
- for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testEvgen16.py b/current/pandaserver/test/testEvgen16.py
deleted file mode 100755
index 0c0cc67f4..000000000
--- a/current/pandaserver/test/testEvgen16.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-site = sys.argv[1]
-cloud = sys.argv[2]
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = None
-
-jobList = []
-
-for i in range(1):
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i)
- job.AtlasRelease = 'Atlas-16.6.2'
- job.homepackage = 'AtlasProduction/16.6.2.1'
- job.transformation = 'Evgen_trf.py'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 10000
- job.prodSourceLabel = 'test'
- job.computingSite = site
- job.cloud = cloud
- job.cmtConfig = 'i686-slc5-gcc43-opt'
-
- file = FileSpec()
- file.lfn = "%s.evgen.pool.root" % job.jobName
- file.destinationDBlock = job.destinationDBlock
- file.destinationSE = job.destinationSE
- file.dataset = job.destinationDBlock
- file.destinationDBlockToken = 'ATLASDATADISK'
- file.type = 'output'
- job.addFile(file)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="2760 105048 19901 101 200 MC10.105048.PythiaB_ccmu3mu1X.py %s NONE NONE NONE MC10JobOpts-latest-test.tar.gz" % file.lfn
- jobList.append(job)
-
-for i in range(1):
- s,o = Client.submitJobs(jobList)
- print "---------------------"
- print s
- for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testEvgen17.py b/current/pandaserver/test/testEvgen17.py
deleted file mode 100755
index ce808e4e6..000000000
--- a/current/pandaserver/test/testEvgen17.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-site = sys.argv[1]
-cloud = sys.argv[2]
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = None
-
-jobList = []
-
-for i in range(1):
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i)
- job.AtlasRelease = 'Atlas-17.0.5'
- job.homepackage = 'AtlasProduction/17.0.5.6'
- job.transformation = 'Evgen_trf.py'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 10000
- job.prodSourceLabel = 'test'
- job.computingSite = site
- job.cloud = cloud
- job.cmtConfig = 'i686-slc5-gcc43-opt'
-
- file = FileSpec()
- file.lfn = "%s.evgen.pool.root" % job.jobName
- file.destinationDBlock = job.destinationDBlock
- file.destinationSE = job.destinationSE
- file.dataset = job.destinationDBlock
- file.destinationDBlockToken = 'ATLASDATADISK'
- file.type = 'output'
- job.addFile(file)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="7000 108316 1 5000 1 MC11.108316.Pythia8_minbias_ND.py %s" % file.lfn
-
- jobList.append(job)
-
-for i in range(1):
- s,o = Client.submitJobs(jobList)
- print "---------------------"
- print s
- for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testFinder.py b/current/pandaserver/test/testFinder.py
deleted file mode 100644
index 09bb9574d..000000000
--- a/current/pandaserver/test/testFinder.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import sys
-from taskbuffer.OraDBProxy import DBProxy
-
-from dataservice import AddressFinder
-
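-# cross-check user email addresses in ATLAS_PANDAMETA.users against the phonebook and xwho lookups and write UPDATE statements for mismatches to newemail.sql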
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# instantiate DB proxies
-proxyS = DBProxy(True)
-proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname)
-
-# get DN and address
-status,res = proxyS.querySQLS("SELECT dn,email,name FROM ATLAS_PANDAMETA.users",{},arraySize=1000000)
-if res == None:
- print "SQL error"
- sys.exit(0)
-
-# to upper chrs
-def toUpper(emails):
- retA = []
- for email in emails:
- retA.append(email.upper())
- return retA
-
-outF = open('newemail.sql','w')
-
-for dn,origEmail,name in res:
- if dn == None:
- dn = name
- if dn == None:
- continue
- emailsP = AddressFinder.getEmailPhonebook(dn)
- emailsX = AddressFinder.getEmailXwho(dn)
- if toUpper(emailsP) != toUpper(emailsX) and len(emailsP) != 0:
- print dn
- print "ERROR : xwho != phone"
- print "phone : %s" % str(emailsP)
- print "xwho : %s" % str(emailsX)
- print "DB : %s" % origEmail
- print
- elif len(emailsP) == 0:
- print dn
- print "ERROR : not found"
- print "DB : %s" % origEmail
- print
- elif len(emailsP) > 1:
- print dn
- print "ERROR : non-unique %s" % str(emailsP)
- print "DB : %s" % origEmail
- print
- elif origEmail == None or origEmail.upper() != emailsP[0].upper() and origEmail != 'notsend':
- print dn
- print "phone : %s" % str(emailsP)
- print "xwho : %s" % str(emailsX)
- print "ERROR : %-40s new: %s\n" % (origEmail,emailsP[0])
- outF.write("/* %-40s new: %s */\n" % (origEmail,emailsP[0]))
- outF.write("UPDATE atlas_pandameta.users SET email='%s' WHERE name='%s';\n" % (emailsP[0],name))
- pass
- else:
- pass
- #print dn
- #print "OK"
-
-outF.write('COMMIT;')
-outF.close()
-
-
diff --git a/current/pandaserver/test/testG4sim.py b/current/pandaserver/test/testG4sim.py
deleted file mode 100755
index b2f8f2f9a..000000000
--- a/current/pandaserver/test/testG4sim.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = 'BNL_ATLAS_2'
-#destName = 'BU_ATLAS_Tier2'
-
-files = {
- 'mc11.007204.singlepart_mu4.evgen.EVNT.v11000302._00037.pool.root.1':None,
- 'mc11.007204.singlepart_mu4.evgen.EVNT.v11000302._00038.pool.root.1':None,
- }
-
-jobList = []
-
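-# submit one simulation test job per input EVNT file (release 11.0.3, csc.simul.trf) with HITS, RDO and log outputs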
-for lfn in files.keys():
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = commands.getoutput('uuidgen')
- job.AtlasRelease = 'Atlas-11.0.3'
- job.homepackage = 'JobTransforms-11-00-03-02'
- job.transformation = 'share/csc.simul.trf'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.computingSite = site
- job.prodDBlock = 'mc11.007204.singlepart_mu4.evgen.EVNT.v11000302'
- job.cmtConfig = 'i686-slc4-gcc34-opt'
-
- job.prodSourceLabel = 'test'
- job.currentPriority = 1000
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileOE = FileSpec()
- fileOE.lfn = "%s.HITS.pool.root" % commands.getoutput('uuidgen')
- fileOE.destinationDBlock = job.destinationDBlock
- fileOE.destinationSE = job.destinationSE
- fileOE.dataset = job.destinationDBlock
- fileOE.destinationDBlockToken = 'ATLASDATADISK'
- fileOE.type = 'output'
- job.addFile(fileOE)
-
- fileOA = FileSpec()
- fileOA.lfn = "%s.RDO.pool.root" % commands.getoutput('uuidgen')
- fileOA.destinationDBlock = job.destinationDBlock
- fileOA.destinationSE = job.destinationSE
- fileOA.dataset = job.destinationDBlock
- fileOA.destinationDBlockToken = 'ATLASDATADISK'
- fileOA.type = 'output'
- job.addFile(fileOA)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen')
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="%s %s %s 100 700 2158" % (fileI.lfn,fileOE.lfn,fileOA.lfn)
-
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testG4sim15.py b/current/pandaserver/test/testG4sim15.py
deleted file mode 100644
index 19b8d4e4b..000000000
--- a/current/pandaserver/test/testG4sim15.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-site = sys.argv[1]
-cloud = sys.argv[2]
-
-prodDBlock = 'mc09_10TeV.105807.JF35_pythia_jet_filter.evgen.EVNT.e469_tid095268'
-inputFile = 'EVNT.095268._000110.pool.root.1'
-
-if len(sys.argv)==5:
- site = sys.argv[1]
- cloud = sys.argv[2]
- prodDBlock = sys.argv[3]
- inputFile = sys.argv[4]
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-
-files = {
- inputFile:None,
- }
-
-jobList = []
-
-index = 0
-for lfn in files.keys():
- index += 1
- job = JobSpec()
-    job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index)
- job.AtlasRelease = 'Atlas-15.3.1'
- job.homepackage = 'AtlasProduction/15.3.1.5'
- job.transformation = 'csc_atlasG4_trf.py'
- job.destinationDBlock = datasetName
- job.computingSite = site
- job.prodDBlock = prodDBlock
-
- job.prodSourceLabel = 'test'
- job.processingType = 'test'
- job.currentPriority = 10000
- job.cloud = cloud
- job.cmtConfig = 'i686-slc4-gcc34-opt'
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileD = FileSpec()
- fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v070302'
- fileD.prodDBlock = fileD.dataset
- fileD.lfn = 'DBRelease-7.3.2.tar.gz'
- fileD.type = 'input'
- job.addFile(fileD)
-
- fileOA = FileSpec()
- fileOA.lfn = "%s.HITS.pool.root" % job.jobName
- fileOA.destinationDBlock = job.destinationDBlock
- fileOA.destinationSE = job.destinationSE
- fileOA.dataset = job.destinationDBlock
- fileOA.destinationDBlockToken = 'ATLASDATADISK'
- fileOA.type = 'output'
- job.addFile(fileOA)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="%s %s 5 1850 8738 ATLAS-GEO-08-00-01 QGSP_BERT VertexPos.py %s OFLCOND-SIM-01-00-00 False s595" % \
- (fileI.lfn,fileOA.lfn,fileD.lfn)
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testG4sim16.py b/current/pandaserver/test/testG4sim16.py
deleted file mode 100644
index c540c4cba..000000000
--- a/current/pandaserver/test/testG4sim16.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-site = sys.argv[1]
-cloud = sys.argv[2]
-
-prodDBlock = 'mc10_7TeV.105001.pythia_minbias.evgen.EVNT.e574_tid153937_00'
-inputFile = 'EVNT.153937._000184.pool.root.1'
-
-if len(sys.argv)==5:
- site = sys.argv[1]
- cloud = sys.argv[2]
- prodDBlock = sys.argv[3]
- inputFile = sys.argv[4]
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-
-files = {
- inputFile:None,
- }
-
-jobList = []
-
-index = 0
-for lfn in files.keys():
- index += 1
- job = JobSpec()
-    job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index)
- job.AtlasRelease = 'Atlas-16.6.2'
- job.homepackage = 'AtlasProduction/16.6.2.1'
- job.transformation = 'AtlasG4_trf.py'
- job.destinationDBlock = datasetName
- job.computingSite = site
- job.prodDBlock = prodDBlock
-
- job.prodSourceLabel = 'test'
- job.processingType = 'test'
- job.currentPriority = 10000
- job.cloud = cloud
- job.cmtConfig = 'i686-slc5-gcc43-opt'
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileD = FileSpec()
- fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v140201'
- fileD.prodDBlock = fileD.dataset
- fileD.lfn = 'DBRelease-14.2.1.tar.gz'
- fileD.type = 'input'
- job.addFile(fileD)
-
- fileOA = FileSpec()
- fileOA.lfn = "%s.HITS.pool.root" % job.jobName
- fileOA.destinationDBlock = job.destinationDBlock
- fileOA.destinationSE = job.destinationSE
- fileOA.dataset = job.destinationDBlock
- fileOA.destinationDBlockToken = 'ATLASDATADISK'
- fileOA.type = 'output'
- job.addFile(fileOA)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters='inputEvgenFile=%s outputHitsFile=%s maxEvents=3 skipEvents=1700 DBRelease=%s preInclude=SimuJobTransforms/VertexFromCondDB.py postExec="from InDetBeamSpotService.InDetBeamSpotServiceConf import BeamCondSvc;ServiceMgr+=BeamCondSvc();ServiceMgr.BeamCondSvc.useDB=False;ServiceMgr.BeamCondSvc.posX=0.1352;ServiceMgr.BeamCondSvc.posY=1.1621;ServiceMgr.BeamCondSvc.posZ=2.87;ServiceMgr.BeamCondSvc.sigmaX=0;ServiceMgr.BeamCondSvc.sigmaY=0;ServiceMgr.BeamCondSvc.sigmaZ=0" geometryVersion=ATLAS-GEO-16-00-00 conditionsTag=OFLCOND-SDR-BS7T-02 AMITag=s1019 randomSeed=568 physicsList=QGSP_BERT firstEvent=1701 RunNumber=106047' % \
- (fileI.lfn,fileOA.lfn,fileD.lfn)
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testG4sim17.py b/current/pandaserver/test/testG4sim17.py
deleted file mode 100644
index 0b53acb0d..000000000
--- a/current/pandaserver/test/testG4sim17.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-site = sys.argv[1]
-cloud = sys.argv[2]
-
-prodDBlock = 'mc10_7TeV.105001.pythia_minbias.evgen.EVNT.e574_tid153937_00'
-inputFile = 'EVNT.153937._000184.pool.root.1'
-
-if len(sys.argv)==5:
- site = sys.argv[1]
- cloud = sys.argv[2]
- prodDBlock = sys.argv[3]
- inputFile = sys.argv[4]
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-
-files = {
- inputFile:None,
- }
-
-jobList = []
-
-index = 0
-for lfn in files.keys():
- index += 1
- job = JobSpec()
-    job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index)
- job.AtlasRelease = 'Atlas-17.0.5'
- job.homepackage = 'AtlasProduction/17.0.5.6'
- job.transformation = 'AtlasG4_trf.py'
- job.destinationDBlock = datasetName
- job.computingSite = site
- job.prodDBlock = prodDBlock
-
- job.prodSourceLabel = 'test'
- job.processingType = 'test'
- job.currentPriority = 10000
- job.cloud = cloud
- job.cmtConfig = 'i686-slc5-gcc43-opt'
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileD = FileSpec()
- fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v170602'
- fileD.prodDBlock = fileD.dataset
- fileD.lfn = 'DBRelease-17.6.2.tar.gz'
- fileD.type = 'input'
- job.addFile(fileD)
-
- fileOA = FileSpec()
- fileOA.lfn = "%s.HITS.pool.root" % job.jobName
- fileOA.destinationDBlock = job.destinationDBlock
- fileOA.destinationSE = job.destinationSE
- fileOA.dataset = job.destinationDBlock
- fileOA.destinationDBlockToken = 'ATLASDATADISK'
- fileOA.type = 'output'
- job.addFile(fileOA)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters='inputEvgenFile=%s outputHitsFile=%s maxEvents=3 skipEvents=0 DBRelease=%s geometryVersion=ATLAS-GEO-18-01-03_VALIDATION conditionsTag=OFLCOND-SDR-BS7T-05-14 randomSeed=1 physicsList=QGSP_BERT RunNumber=116870 firstEvent=1' % (fileI.lfn,fileOA.lfn,fileD.lfn)
-
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testGetJobStatus.py b/current/pandaserver/test/testGetJobStatus.py
deleted file mode 100755
index 4e47c2547..000000000
--- a/current/pandaserver/test/testGetJobStatus.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-
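-# print the status and files of a single PandaID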
-id = sys.argv[1]
-
-s,o = Client.getJobStatus([id])
-print s
-if s == 0:
- for job in o:
- if job == None:
- continue
- print job.PandaID
- for file in job.Files:
- print file.lfn,file.type
-
diff --git a/current/pandaserver/test/testMultiTRF.py b/current/pandaserver/test/testMultiTRF.py
deleted file mode 100755
index c9fcd9853..000000000
--- a/current/pandaserver/test/testMultiTRF.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-
-index = 0
-
-job = JobSpec()
-job.jobDefinitionID = int(time.time()) % 10000
-job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index)
-job.AtlasRelease = 'Atlas-14.1.0\nAtlas-14.1.0'
-job.homepackage = 'AtlasProduction/14.1.0.3\nAtlasProduction/14.1.0.3'
-job.transformation = 'csc_digi_trf.py\ncsc_reco_trf.py'
-job.destinationDBlock = datasetName
-
-job.computingSite = site
-
-job.prodDBlock = 'valid1.005200.T1_McAtNlo_Jimmy.simul.HITS.e322_s429_tid022081'
-
-job.prodSourceLabel = 'test'
-job.currentPriority = 10000
-job.cloud = 'US'
-
-for lfn in ['HITS.022081._00001.pool.root','HITS.022081._00002.pool.root']:
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
-fileD1 = FileSpec()
-fileD1.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v050001'
-fileD1.prodDBlock = fileD1.dataset
-fileD1.lfn = 'DBRelease-5.0.1.tar.gz'
-fileD1.type = 'input'
-job.addFile(fileD1)
-
-fileD2 = FileSpec()
-fileD2.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v050101'
-fileD2.prodDBlock = fileD2.dataset
-fileD2.lfn = 'DBRelease-5.1.1.tar.gz'
-fileD2.type = 'input'
-job.addFile(fileD2)
-
-fileOE = FileSpec()
-fileOE.lfn = "%s.ESD.pool.root" % job.jobName
-fileOE.destinationDBlock = job.destinationDBlock
-fileOE.destinationSE = job.destinationSE
-fileOE.dataset = job.destinationDBlock
-fileOE.type = 'output'
-job.addFile(fileOE)
-
-fileOA = FileSpec()
-fileOA.lfn = "%s.AOD.pool.root" % job.jobName
-fileOA.destinationDBlock = job.destinationDBlock
-fileOA.destinationSE = job.destinationSE
-fileOA.dataset = job.destinationDBlock
-fileOA.type = 'output'
-job.addFile(fileOA)
-
-fileOC = FileSpec()
-fileOC.lfn = "%s.NTUP.root" % job.jobName
-fileOC.destinationDBlock = job.destinationDBlock
-fileOC.destinationSE = job.destinationSE
-fileOC.dataset = job.destinationDBlock
-fileOC.type = 'output'
-job.addFile(fileOC)
-
-fileOL = FileSpec()
-fileOL.lfn = "%s.job.log.tgz" % job.jobName
-fileOL.destinationDBlock = job.destinationDBlock
-fileOL.destinationSE = job.destinationSE
-fileOL.dataset = job.destinationDBlock
-fileOL.type = 'log'
-job.addFile(fileOL)
-
-job.jobParameters="HITS.022081._[00001,00002].pool.root RDO.TMP._00001_tmp.pool.root 250 0 ATLAS-CSC-05-00-00 1 1 NONE NONE None %s AtRndmGenSvc QGSP_EMV DEFAULT NONE NONE NONE NONE NONE\n RDO.TMP._00001_tmp.pool.root %s %s %s 250 0 ATLAS-CSC-05-00-00 DEFAULT None %s NONE" % \
- (fileD1.lfn,fileOE.lfn,fileOA.lfn,fileOC.lfn,fileD2.lfn)
-
-s,o = Client.submitJobs([job])
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testReco.py b/current/pandaserver/test/testReco.py
deleted file mode 100755
index 0eb597e45..000000000
--- a/current/pandaserver/test/testReco.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = None
-
-files = {
- 'misal1_mc12.005802.JF17_pythia_jet_filter.digit.RDO.v12000601_tid008610._11615.pool.root.1':None,
- #'misal1_mc12.005802.JF17_pythia_jet_filter.digit.RDO.v12000601_tid008610._11639.pool.root.1':None,
- #'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554._03634.pool.root.1':None,
- #'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554._03248.pool.root.1':None,
- #'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554._03634.pool.root.1':None,
- }
-
-jobList = []
-
-index = 0
-for lfn in files.keys():
- index += 1
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index)
- job.AtlasRelease = 'Atlas-12.0.6'
- job.homepackage = 'AtlasProduction/12.0.6.4'
- job.transformation = 'csc_reco_trf.py'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.computingSite = site
- #job.prodDBlock = 'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554'
- job.prodDBlock = 'misal1_mc12.005802.JF17_pythia_jet_filter.digit.RDO.v12000601_tid008610'
- job.cloud = 'US'
-
- job.prodSourceLabel = 'test'
- job.currentPriority = 10000
- job.cmtConfig = 'i686-slc4-gcc34-opt'
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileD = FileSpec()
- fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v030101'
- fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101'
- fileD.lfn = 'DBRelease-3.1.1.tar.gz'
- fileD.type = 'input'
- job.addFile(fileD)
-
- fileOE = FileSpec()
- fileOE.lfn = "%s.ESD.pool.root" % job.jobName
- fileOE.destinationDBlock = job.destinationDBlock
- fileOE.destinationSE = job.destinationSE
- fileOE.dataset = job.destinationDBlock
- fileOE.destinationDBlockToken = 'ATLASDATADISK'
- fileOE.type = 'output'
- job.addFile(fileOE)
-
- fileOA = FileSpec()
- fileOA.lfn = "%s.AOD.pool.root" % job.jobName
- fileOA.destinationDBlock = job.destinationDBlock
- fileOA.destinationSE = job.destinationSE
- fileOA.dataset = job.destinationDBlock
- fileOA.destinationDBlockToken = 'ATLASDATADISK'
- fileOA.type = 'output'
- job.addFile(fileOA)
-
- fileOC = FileSpec()
- fileOC.lfn = "%s.NTUP.root" % job.jobName
- fileOC.destinationDBlock = job.destinationDBlock
- fileOC.destinationSE = job.destinationSE
- fileOC.dataset = job.destinationDBlock
- fileOC.destinationDBlockToken = 'ATLASDATADISK'
- fileOC.type = 'output'
- job.addFile(fileOC)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="%s %s %s %s 250 0 ATLAS-CSC-01-02-00 CSC-06 NoRestrictedESDRecConfig.py %s" % \
- (fileI.lfn,fileOE.lfn,fileOA.lfn,fileOC.lfn,fileD.lfn)
-
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testRepro.py b/current/pandaserver/test/testRepro.py
deleted file mode 100755
index 9b0b7f679..000000000
--- a/current/pandaserver/test/testRepro.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import re
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-cloud = sys.argv[1]
-if len(sys.argv)>2:
- site = sys.argv[2]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = None
-
-files = {
- 'daq.ATLAS.0092045.physics.RPCwBeam.LB0016.SFO-2._0009.data':None,
- }
-
-jobList = []
-
-index = 0
-for lfn in files.keys():
- index += 1
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index)
- job.AtlasRelease = 'Atlas-14.4.0'
- job.homepackage = 'AtlasTier0/14.4.0.2'
- job.transformation = 'Reco_trf.py'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.computingSite = site
- job.prodDBlock = 'data08_cos.00092045.physics_RPCwBeam.daq.RAW.o4_T1224560091'
-
- job.prodSourceLabel = 'test'
- job.processingType = 'reprocessing'
- job.currentPriority = 10000
- job.cloud = cloud
- job.cmtConfig = 'i686-slc4-gcc34-opt'
-
- origParams = """inputBSFile=daq.ATLAS.0092045.physics.RPCwBeam.LB0016.SFO-2._0009.data maxEvents=5 skipEvents=0 autoConfiguration=FieldAndGeo preInclude=RecExCommission/RecExCommission.py,RecExCommission/MinimalCommissioningSetup.py,RecJobTransforms/UseOracle.py preExec="jetFlags.Enabled.set_Value_and_Lock(False)" DBRelease=DBRelease-6.2.1.5.tar.gz conditionsTag=COMCOND-ES1C-000-00 RunNumber=92045 beamType=cosmics AMITag=r595 projectName=data08_cos trigStream=physics_RPCwBeam outputTypes=DPDCOMM outputESDFile=ESD.029868._01110.pool.root outputTAGComm=TAG_COMM.029868._01110.pool.root outputAODFile=AOD.029868._01110.pool.root outputMergedDQMonitorFile=DQM_MERGED.029868._01110.root DPD_PIXELCOMM=DPD_PIXELCOMM.029868._01110.pool.root DPD_SCTCOMM=DPD_SCTCOMM.029868._01110.pool.root DPD_IDCOMM=DPD_IDCOMM.029868._01110.pool.root DPD_IDPROJCOMM=DPD_IDPROJCOMM.029868._01110.pool.root DPD_CALOCOMM=DPD_CALOCOMM.029868._01110.pool.root DPD_TILECOMM=DPD_TILECOMM.029868._01110.pool.root DPD_EMCLUSTCOMM=DPD_EMCLUSTCOMM.029868._01110.pool.root DPD_EGAMMACOMM=DPD_EGAMMACOMM.029868._01110.pool.root DPD_RPCCOMM=DPD_RPCCOMM.029868._01110.pool.root DPD_TGCCOMM=DPD_TGCCOMM.029868._01110.pool.root --ignoreunknown"""
-
- match = re.findall("([^\s]+=[^\s]+)",origParams)
- outMap = {}
- for item in match:
- arg = item.split('=')[0]
- var = item.split('=')[-1]
- # output
- if arg.startswith('output') or arg.startswith('DPD_'):
- # skip some keys
- if arg in ['outputTypes']:
- continue
- prefix = var.split('.')[0]
- sumatch = re.search('(\.[^\.]+\.[^\.]+)(\.\d+)*$',var)
- suffix = sumatch.group(1)
- newName = '%s.%s%s' % (job.jobName,prefix,suffix)
- outMap[arg] = (var,newName)
- # DBRelease
- elif arg == 'DBRelease':
- dbrMap = (arg,var)
- # input
- elif arg.startswith('input') and arg.endswith('File'):
- inputMap = (arg,var)
-
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileD = FileSpec()
- fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v06020105'
- fileD.prodDBlock = fileD.dataset
- fileD.lfn = 'DBRelease-6.2.1.5.tar.gz'
- fileD.type = 'input'
- job.addFile(fileD)
-
- newParams = origParams
- newParams = newParams.replace(dbrMap[0]+'='+dbrMap[1],dbrMap[0]+'='+fileD.lfn)
- newParams = newParams.replace(inputMap[0]+'='+inputMap[1],inputMap[0]+'='+fileI.lfn)
-
- for arg,vars in outMap.iteritems():
- fileO = FileSpec()
- fileO.lfn = vars[1]
- fileO.destinationDBlock = job.destinationDBlock
- fileO.destinationSE = job.destinationSE
- fileO.dataset = job.destinationDBlock
- fileO.destinationDBlockToken = 'ATLASDATADISK'
- fileO.type = 'output'
- job.addFile(fileO)
- newParams = newParams.replace(arg+'='+vars[0],arg+'='+fileO.lfn)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters=newParams
-
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testScript.py b/current/pandaserver/test/testScript.py
deleted file mode 100755
index 2299a441d..000000000
--- a/current/pandaserver/test/testScript.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-aSrvID = None
-
-for idx,argv in enumerate(sys.argv):
- if argv == '-s':
- aSrvID = sys.argv[idx+1]
- sys.argv = sys.argv[:idx]
- break
-
-site = sys.argv[1]
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = None
-
-job = JobSpec()
-job.jobDefinitionID = int(time.time()) % 10000
-job.jobName = "%s" % commands.getoutput('uuidgen')
-job.transformation = 'https://atlpan.web.cern.ch/atlpan/test.sh'
-job.destinationDBlock = datasetName
-job.destinationSE = destName
-job.currentPriority = 1000
-job.prodSourceLabel = 'test'
-job.computingSite = site
-
-job.jobParameters="aaaaa"
-
-fileOL = FileSpec()
-fileOL.lfn = "%s.job.log.tgz" % job.jobName
-fileOL.destinationDBlock = job.destinationDBlock
-fileOL.destinationSE = job.destinationSE
-fileOL.dataset = job.destinationDBlock
-fileOL.type = 'log'
-job.addFile(fileOL)
-
-
-s,o = Client.submitJobs([job],srvID=aSrvID)
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testSimul13.py b/current/pandaserver/test/testSimul13.py
deleted file mode 100644
index 4b8ef5247..000000000
--- a/current/pandaserver/test/testSimul13.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = 'BNL_ATLAS_2'
-
-files = {
- 'EVNT.019128._00011.pool.root.1':None,
- }
-
-jobList = []
-
-index = 0
-for lfn in files.keys():
- index += 1
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index)
- job.AtlasRelease = 'Atlas-13.0.40'
- job.homepackage = 'AtlasProduction/13.0.40.3'
- job.transformation = 'csc_simul_trf.py'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.computingSite = site
- job.prodDBlock = 'valid1.005001.pythia_minbias.evgen.EVNT.e306_tid019128'
-
- job.prodSourceLabel = 'test'
- job.currentPriority = 10000
- job.cloud = 'IT'
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileD = FileSpec()
- fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v040701'
- fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101'
- fileD.lfn = 'DBRelease-4.7.1.tar.gz'
- fileD.type = 'input'
- job.addFile(fileD)
-
- fileOE = FileSpec()
- fileOE.lfn = "%s.HITS.pool.root" % job.jobName
- fileOE.destinationDBlock = job.destinationDBlock
- fileOE.destinationSE = job.destinationSE
- fileOE.dataset = job.destinationDBlock
- fileOE.destinationDBlockToken = 'ATLASDATADISK'
- fileOE.type = 'output'
- job.addFile(fileOE)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="%s %s NONE 1 3250 55866 ATLAS-CSC-02-01-00 55866 55866 QGSP_EMV None %s DEFAULT" % \
- (fileI.lfn,fileOE.lfn,fileD.lfn)
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testSimulReco14.py b/current/pandaserver/test/testSimulReco14.py
deleted file mode 100644
index 41c78c68d..000000000
--- a/current/pandaserver/test/testSimulReco14.py
+++ /dev/null
@@ -1,101 +0,0 @@
-import sys
-import time
-import random
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
- cloud = None
-else:
- site = None
- cloud = 'US'
-
-
-
-#cloud = 'TW'
-#Recent changes (BNL migration to LFC?) force the cloud to be specified
-cloud = 'US'
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = 'BNL_ATLAS_2'
-
-files = {
- 'EVNT.023986._00001.pool.root.1':None,
- #'EVNT.023989._00001.pool.root.1':None,
- }
-
-jobList = []
-
-index = 0
-for lfn in files.keys():
- index += 1
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index)
- job.AtlasRelease = 'Atlas-14.2.20'
- job.homepackage = 'AtlasProduction/14.2.20.1'
- job.transformation = 'csc_simul_reco_trf.py'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.computingSite = site
- job.prodDBlock = 'mc08.105031.Jimmy_jetsJ2.evgen.EVNT.e347_tid023986'
- #job.prodDBlock = 'mc08.105034.Jimmy_jetsJ5.evgen.EVNT.e347_tid023989'
-
- job.prodSourceLabel = 'test'
- job.processingType = 'test'
- job.currentPriority = 10000
- job.cloud = cloud
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileD = FileSpec()
- fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v050601'
- fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v050601'
- fileD.lfn = 'DBRelease-5.6.1.tar.gz'
- fileD.type = 'input'
- job.addFile(fileD)
-
- fileOA = FileSpec()
- fileOA.lfn = "%s.AOD.pool.root" % job.jobName
- fileOA.destinationDBlock = job.destinationDBlock
- fileOA.destinationSE = job.destinationSE
- fileOA.dataset = job.destinationDBlock
- fileOA.destinationDBlockToken = 'ATLASDATADISK'
- fileOA.type = 'output'
- job.addFile(fileOA)
-
- fileOE = FileSpec()
- fileOE.lfn = "%s.ESD.pool.root" % job.jobName
- fileOE.destinationDBlock = job.destinationDBlock
- fileOE.destinationSE = job.destinationSE
- fileOE.dataset = job.destinationDBlock
- fileOE.destinationDBlockToken = 'ATLASDATADISK'
- fileOE.type = 'output'
- job.addFile(fileOE)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.destinationDBlockToken = 'ATLASDATADISK'
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="%s %s 30 500 3 ATLAS-GEO-02-01-00 3 3 QGSP_BERT jobConfig.VertexPosFastIDKiller.py FastSimulationJobTransforms/FastCaloSimAddCellsRecConfig.py,NoTrackSlimming.py %s OFF NONE NONE %s NONE" % (fileI.lfn, fileOA.lfn, fileD.lfn, fileOE.lfn)
-
- jobList.append(job)
-
-s,o = Client.submitJobs(jobList)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
diff --git a/current/pandaserver/test/testSiteMap.py b/current/pandaserver/test/testSiteMap.py
deleted file mode 100755
index f11053958..000000000
--- a/current/pandaserver/test/testSiteMap.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import os
-import re
-import sys
-import time
-import random
-import datetime
-import commands
-from taskbuffer.TaskBuffer import taskBuffer
-from brokerage import SiteMapper
-
-# password
-from config import panda_config
-passwd = panda_config.dbpasswd
-
-# instantiate TB
-taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
-
-siteMapper = SiteMapper.SiteMapper(taskBuffer)
-
-#x = siteMapper.getSite('BNL_ATLAS_1')
-#print x
-
-
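The commented-out lines show the intended check. A short sketch of what an interactive lookup might look like once taskBuffer is initialised as above; that getSite returns a SiteSpec-like object is an assumption based on the commented code, so the dir() call is used rather than guessing attribute names:

x = siteMapper.getSite('BNL_ATLAS_1')
print(x)
# inspect the returned object to see which attributes are actually available
print([a for a in dir(x) if not a.startswith('_')])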
diff --git a/current/pandaserver/test/testTB.py b/current/pandaserver/test/testTB.py
deleted file mode 100755
index d94e06560..000000000
--- a/current/pandaserver/test/testTB.py
+++ /dev/null
@@ -1,145 +0,0 @@
-"""
-test TaskBuffer and JobDispatcher on local PC
-
-$ python -i testTB.py
->>> testGetJobs(10)
->>> testGetJobStatus(1)
->>> testUpdateJob(1,'running')
->>> testGetJobStatus(1)
->>> testUpdateJob(1,'finished')
->>> testGetJobStatus(1)
->>> taskBuffer.peekJobs([1,])
->>> taskBuffer.queryPandaIDs([0,])
-
-
-"""
-
-
-import time
-import commands
-import threading
-
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-class TestThread (threading.Thread):
- def __init__(self,tb,i,n,siteName):
- threading.Thread.__init__(self)
- self.taskbuffer = tb
- self.interval = i
- self.jobDefinitionID = n
- self.siteName = siteName
-
- def run(self):
- for i in range(1):
- prodDBlock = 'rome.004201.evgen.ZeeJimmy'
- destinationDBlock = 'pandatest.000123.test.simul'
- destinationSE = 'BNL_SE'
- jobs = []
- #for i in range(self.interval):
- for i in range(2):
- job = JobSpec()
- job.jobDefinitionID=self.jobDefinitionID
- job.AtlasRelease='Atlas-11.0.1'
- job.prodDBlock=prodDBlock
- job.destinationDBlock=destinationDBlock
- job.destinationSE=destinationSE
- job.currentPriority=i
-
- lfnI = 'rome.004201.evgen.ZeeJimmy._00001.pool.root'
- file = FileSpec()
- file.lfn = lfnI
- file.dataset = 'rome.004201.evgen.ZeeJimmy'
- file.type = 'input'
- file.prodDBlock = prodDBlock
- file.dataset = prodDBlock
- job.addFile(file)
-
- lfnO ='%s.pool.root.1' % commands.getoutput('uuidgen')
- file = FileSpec()
- file.lfn = lfnO
- file.type = 'output'
- file.destinationDBlock = destinationDBlock
- file.dataset = destinationDBlock
- file.destinationSE = destinationSE
- job.addFile(file)
-
- job.homepackage='JobTransforms-11-00-01-01'
- job.transformation='share/rome.g4sim.standard.trf'
- job.jobParameters='%s %s 1 2 14268' % (lfnI,lfnO)
- jobs.append(job)
- self.taskbuffer.storeJobs(jobs,None)
- time.sleep(self.interval)
-
-from taskbuffer.TaskBuffer import taskBuffer
-from jobdispatcher.JobDispatcher import jobDispatcher
-from userinterface.UserIF import userIF
-
-import getpass
-passwd = getpass.getpass()
-
-taskBuffer.init('adbpro.usatlas.bnl.gov',passwd,nDBConnection=3)
-
-jobDispatcher.init(taskBuffer)
-userIF.init(taskBuffer)
-
-jobDefID = int(time.time()) % 10000
-thr1 = TestThread(taskBuffer,4,jobDefID,"myhost")
-thr2 = TestThread(taskBuffer,3,jobDefID+1,"testsite")
-
-thr1.start()
-#thr2.start()
-
-from jobdispatcher.JobDispatcher import getJob,updateJob
-from userinterface.UserIF import submitJobs,getJobStatus,queryPandaIDs
-
-
-### emulate HTTP requests
-
-class Request:
- def __init__(self):
- self.subprocess_env = {}
- self.subprocess_env['SSL_CLIENT_S_DN'] = "aaa"
- self.subprocess_env['HTTPS'] = "on"
-
-req = Request()
-
-def testGetJob():
- print getJob(req,"BNL_ATLAS_2")
-
-def testGetJobStatus(arg):
- print getJobStatus(req,arg)
-
-def testSubmitJobs(arg):
- print submitJobs(req,arg)
-
-def testUpdateJob(arg0,arg1):
- print updateJob(req,arg0,arg1)
-
-def testQueryPandaIDs(arg):
- print queryPandaIDs(req,arg)
-
-"""
-
-import cPickle as pickle
-ids=[3023,3414]
-testGetJobStatus(pickle.dumps(ids))
-
-job = JobSpec()
-job.jobDefinitionID='user.%s' % commands.getoutput('/usr/bin/uuidgen')
-ids = {'pandatest.000003.dd.input._00028.junk':'6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27',
- 'pandatest.000003.dd.input._00033.junk':'98f79ba1-1793-4253-aac7-bdf90a51d1ee',
- 'pandatest.000003.dd.input._00039.junk':'33660dd5-7cef-422a-a7fc-6c24cb10deb1'}
-for lfn in ids.keys():
- file = FileSpec()
- file.lfn = lfn
- file.GUID = ids[file.lfn]
- file.dataset = 'pandatest.000003.dd.input'
- file.type = 'input'
- job.addFile(file)
-
-testSubmitJobs(pickle.dumps([job]))
-
-testQueryPandaIDs(pickle.dumps([10]))
-
-"""
diff --git a/current/pandaserver/test/testTaskA2.py b/current/pandaserver/test/testTaskA2.py
deleted file mode 100755
index e54e3948f..000000000
--- a/current/pandaserver/test/testTaskA2.py
+++ /dev/null
@@ -1,64 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-#destName = 'BNL_SE'
-
-jobList = []
-
-for i in [999905,999906,999907]:
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i)
- job.AtlasRelease = 'Atlas-14.1.0'
- job.homepackage = 'AtlasProduction/12.0.6.2'
- job.transformation = 'csc_evgen_trf.py'
- job.destinationDBlock = datasetName
- #job.destinationSE = destName
- job.currentPriority = 1000
- job.prodSourceLabel = 'managed'
- #job.prodSourceLabel = 'test'
- #job.computingSite = site
- job.cmtConfig = 'i686-slc4-gcc34-opt'
- job.metadata = 'evgen;%s;%s;%s' % (str({'FR': 46, 'NL': 45, 'NDGF': 300, 'CERN': 19, 'TW': 44110, 'CA': 2922, 'DE': 9903, 'IT': 1168, 'US': 6226, 'UK': 1026, 'ES': 26619}),str({999907:100,999906:200,999905:300}),str({999905:100,999906:910,999907:500}))
- #job.metadata = 'evgen;%s' % str({'FR': 46, 'NL': 45, 'NDGF': 300, 'CERN': 19, 'TW': 44110, 'CA': 2922, 'DE': 9903, 'IT': 1168, 'US': 6226, 'UK': 1026, 'ES': 26619})
-
- #job.cloud = "UK"
- job.taskID = i
-
- file = FileSpec()
- file.lfn = "%s.evgen.pool.root" % job.jobName
- file.destinationDBlock = job.destinationDBlock
- file.destinationSE = job.destinationSE
- file.dataset = job.destinationDBlock
- #file.destinationDBlockToken = 'ATLASDATADISK'
- file.type = 'output'
- job.addFile(file)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % job.jobName
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="7087 0 500000 1 DC3.007087.singlepart_fwdgamma_etaplus_E500.py %s NONE NONE NONE" % file.lfn
- jobList.append(job)
-
-for i in range(1):
- #s,o = Client.submitJobs(jobList)
- s,outS = Client.runTaskAssignment(jobList)
- print "---------------------"
- print s
- for tmpOut in outS:
- print tmpOut
diff --git a/current/pandaserver/test/testUser.py b/current/pandaserver/test/testUser.py
deleted file mode 100755
index fd51cd1af..000000000
--- a/current/pandaserver/test/testUser.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-job = JobSpec()
-job.jobDefinitionID = int(time.time()) % 10000
-job.jobName = commands.getoutput('/usr/bin/uuidgen')
-job.AtlasRelease = 'Atlas-9.0.4'
-job.prodDBlock = 'pandatest.000003.dd.input'
-job.destinationDBlock = 'panda.destDB.%s' % commands.getoutput('/usr/bin/uuidgen')
-job.destinationSE = 'BNL_SE'
-
-ids = {'pandatest.000003.dd.input._00028.junk':'6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27',
- 'pandatest.000003.dd.input._00033.junk':'98f79ba1-1793-4253-aac7-bdf90a51d1ee',
- 'pandatest.000003.dd.input._00039.junk':'33660dd5-7cef-422a-a7fc-6c24cb10deb1'}
-for lfn in ids.keys():
- file = FileSpec()
- file.lfn = lfn
- file.GUID = ids[file.lfn]
- file.dataset = 'pandatest.000003.dd.input'
- file.type = 'input'
- job.addFile(file)
-
-s,o = Client.submitJobs([job])
-print "---------------------"
-print s
-print o
-print "---------------------"
-s,o = Client.getJobStatus([4934, 4766, 4767, 4768, 4769])
-print s
-if s == 0:
- for job in o:
- if job == None:
- continue
- print job.PandaID
- for file in job.Files:
- print file.lfn,file.type
-print "---------------------"
-s,o = Client.queryPandaIDs([0])
-print s
-print o
-
diff --git a/current/pandaserver/test/testWait.py b/current/pandaserver/test/testWait.py
deleted file mode 100755
index adbd9c246..000000000
--- a/current/pandaserver/test/testWait.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import sys
-import time
-import commands
-import userinterface.Client as Client
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.FileSpec import FileSpec
-
-if len(sys.argv)>1:
- site = sys.argv[1]
-else:
- site = None
-
-datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen')
-destName = 'BNL_SE'
-
-jobListE = []
-lfnListE = []
-
-for i in range(2):
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = commands.getoutput('uuidgen')
- job.AtlasRelease = 'Atlas-11.0.3'
- job.homepackage = 'JobTransforms-11-00-03-03'
- job.transformation = 'share/csc.evgen.trf'
- job.destinationDBlock = datasetName
- job.destinationSE = destName
- job.currentPriority = 1000
- job.prodSourceLabel = 'test'
- job.computingSite = site
-
- file = FileSpec()
- file.lfn = "%s.evgen.pool.root" % commands.getoutput('uuidgen')
- lfnListE.append(file.lfn)
- file.lfn += ('.%d' % (i+1))
- file.destinationDBlock = job.destinationDBlock
- file.destinationSE = job.destinationSE
- file.dataset = job.destinationDBlock
- file.type = 'output'
- job.addFile(file)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen')
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="5056 %s NONE 81000 9000 10 DC3.005056.PythiaPhotonJet2.py NONE" % file.lfn
- jobListE.append(job)
-
-s,o = Client.submitJobs(jobListE)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
-
-time.sleep(20)
-
-datasetNameS = 'panda.simu.%s' % commands.getoutput('uuidgen')
-
-jobListS = []
-
-for lfn in lfnListE:
- job = JobSpec()
- job.jobDefinitionID = int(time.time()) % 10000
- job.jobName = commands.getoutput('uuidgen')
- job.AtlasRelease = 'Atlas-11.0.3'
- job.homepackage = 'JobTransforms-11-00-03-04'
- job.transformation = 'share/csc.simul.trf'
- job.destinationDBlock = datasetNameS
- job.destinationSE = destName
- job.prodDBlock = datasetName
-
- job.prodSourceLabel = 'test'
- job.currentPriority = 1000
-
- fileI = FileSpec()
- fileI.dataset = job.prodDBlock
- fileI.prodDBlock = job.prodDBlock
- fileI.lfn = lfn
- fileI.type = 'input'
- job.addFile(fileI)
-
- fileOE = FileSpec()
- fileOE.lfn = "%s.HITS.pool.root" % commands.getoutput('uuidgen')
- fileOE.destinationDBlock = job.destinationDBlock
- fileOE.destinationSE = job.destinationSE
- fileOE.dataset = job.destinationDBlock
- fileOE.type = 'output'
- job.addFile(fileOE)
-
- fileOA = FileSpec()
- fileOA.lfn = "%s.RDO.pool.root" % commands.getoutput('uuidgen')
- fileOA.destinationDBlock = job.destinationDBlock
- fileOA.destinationSE = job.destinationSE
- fileOA.dataset = job.destinationDBlock
- fileOA.type = 'output'
- job.addFile(fileOA)
-
- fileOL = FileSpec()
- fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen')
- fileOL.destinationDBlock = job.destinationDBlock
- fileOL.destinationSE = job.destinationSE
- fileOL.dataset = job.destinationDBlock
- fileOL.type = 'log'
- job.addFile(fileOL)
-
- job.jobParameters="%s %s %s 100 4900 400" % (fileI.lfn,fileOE.lfn,fileOA.lfn)
-
- jobListS.append(job)
-
-s,o = Client.submitJobs(jobListS)
-print "---------------------"
-print s
-for x in o:
- print "PandaID=%s" % x[0]
-
diff --git a/current/pandaserver/test/tmpwatch.py b/current/pandaserver/test/tmpwatch.py
deleted file mode 100644
index ee75d2720..000000000
--- a/current/pandaserver/test/tmpwatch.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import os
-import sys  # needed by the exception handler at the bottom of this script
-import glob
-import optparse
-import datetime
-
-# options
-optP = optparse.OptionParser(conflict_handler="resolve")
-optP.add_option('-t',action='store_const',const=True,dest='test',default=False,
- help='test mode')
-optP.add_option('-h',action='store',type='int',dest='limit',default=12,
- help='time limit in hours')
-options,args = optP.parse_args()
-
-# patterns of tmp files
-tmpPatts = ['/tmp/tmp*','/tmp/atlpan/tmp*']
-
-# limit
-timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=options.limit)
-
-# loop over all pattern
-for tmpPatt in tmpPatts:
- tmpFiles = glob.glob(tmpPatt)
- # loop over all files
- for tmpFile in tmpFiles:
- try:
- print 'INFO: tmpfile -> %s' % tmpFile
- # only file
- if not os.path.isfile(tmpFile):
- continue
- # not symlink
- if os.path.islink(tmpFile):
- continue
- # writable
- if not os.access(tmpFile,os.W_OK):
- continue
- # check time stamp
- timeStamp = os.path.getmtime(tmpFile)
- timeStamp = datetime.datetime.fromtimestamp(timeStamp)
- if timeStamp > timeLimit:
- continue
- # remove
- print 'INFO: remove %s' % tmpFile
- if not options.test:
- os.remove(tmpFile)
- except:
- errType,errValue = sys.exc_info()[:2]
- print 'ERROR: failed with %s:%s' % (errType,errValue)
diff --git a/current/pandaserver/test/update.sh b/current/pandaserver/test/update.sh
deleted file mode 100755
index c1edbf515..000000000
--- a/current/pandaserver/test/update.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/python
-
-import os
-import sys
-
-os.chdir('..')
-
-option = ''
-if len(sys.argv) > 1 and sys.argv[1] == '-n':
- option = ' -n'
-
-packages = ['liveconfigparser','pandalogger','taskbuffer',
- 'brokerage','jobdispatcher','userinterface',
- 'dataservice','test','server'] #,'config']
-
-for pack in packages:
- com = 'cvs%s update %s' % (option,pack)
- print com
- os.system(com)
diff --git a/current/pandaserver/test/valConf.py b/current/pandaserver/test/valConf.py
deleted file mode 100644
index 69ec8688c..000000000
--- a/current/pandaserver/test/valConf.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from config import panda_config
-from config import panda_config_new
-
-for item in dir(panda_config):
- if item.startswith('__'):
- continue
- old = getattr(panda_config,item)
- if not hasattr(panda_config_new,item):
- print "NG : %s not found" % item
- continue
- new = getattr(panda_config_new,item)
- if old != new:
- print "NG : %s missmatch" % item
- print " old:%s" % old
- print " new:%s" % new
diff --git a/current/pandaserver/userinterface/Client.py b/current/pandaserver/userinterface/Client.py
deleted file mode 100755
index 529e2d11c..000000000
--- a/current/pandaserver/userinterface/Client.py
+++ /dev/null
@@ -1,880 +0,0 @@
-'''
-client methods
-
-'''
-
-import os
-import re
-import sys
-import urllib
-import commands
-import cPickle as pickle
-
-
-# configuration
-try:
- baseURL = os.environ['PANDA_URL']
-except:
- baseURL = 'http://pandaserver.cern.ch:25080/server/panda'
-try:
- baseURLSSL = os.environ['PANDA_URL_SSL']
-except:
- baseURLSSL = 'https://pandaserver.cern.ch:25443/server/panda'
-
-
-# exit code
-EC_Failed = 255
-
-
-# panda server URLs
-if os.environ.has_key('PANDA_URL_MAP'):
- serverURLs = {'default' : {'URL' : baseURL,
- 'URLSSL' : baseURLSSL},
- }
- # decode envvar to map
- try:
- for tmpCompStr in os.environ['PANDA_URL_MAP'].split('|'):
- tmpKey,tmpURL,tmpURLSSL = tmpCompStr.split(',')
- # append
- serverURLs[tmpKey] = {'URL' : tmpURL,
- 'URLSSL' : tmpURLSSL}
- except:
- pass
-else:
- # default
- serverURLs = {'default' : {'URL' : baseURL,
- 'URLSSL' : baseURLSSL},
- 'CERN' : {'URL' : 'http://pandaserver.cern.ch:25080/server/panda',
- 'URLSSL' : 'https://pandaserver.cern.ch:25443/server/panda'},
- }
-
-# bamboo
-baseURLBAMBOO = 'http://pandabamboo.cern.ch:25070/bamboo/bamboo'
-
-
-# get URL
-def _getURL(type,srvID=None):
- if serverURLs.has_key(srvID):
- urls = serverURLs[srvID]
- else:
- urls = serverURLs['default']
- return urls[type]
-
-
-# get Panda srvIDs
-def getPandas():
- srvs = serverURLs.keys()
- # remove 'default'
- try:
- srvs.remove('default')
- except:
- pass
- return srvs
-
-
-# look for a grid proxy certificate
-def _x509():
- # see X509_USER_PROXY
- try:
- return os.environ['X509_USER_PROXY']
- except:
- pass
- # see the default place
- x509 = '/tmp/x509up_u%s' % os.getuid()
- if os.access(x509,os.R_OK):
- return x509
- # no valid proxy certificate
- # FIXME
- print "No valid grid proxy certificate found"
- return ''
-
-
-# curl class
-class _Curl:
- # constructor
- def __init__(self):
- # path to curl
- self.path = 'curl'
- # verification of the host certificate
- self.verifyHost = False
- # request a compressed response
- self.compress = True
- # SSL cert/key
- self.sslCert = ''
- self.sslKey = ''
- # verbose
- self.verbose = False
-
-
- # GET method
- def get(self,url,data):
- # make command
- com = '%s --silent --get' % self.path
- if not self.verifyHost:
- com += ' --insecure'
- if self.compress:
- com += ' --compressed'
- if self.sslCert != '':
- com += ' --cert %s' % self.sslCert
- if self.sslKey != '':
- com += ' --key %s' % self.sslKey
- # timeout
- com += ' -m 600'
- # data
- strData = ''
- for key in data.keys():
- strData += 'data="%s"\n' % urllib.urlencode({key:data[key]})
- # write data to temporary config file
- try:
- tmpName = os.environ['PANDA_TMP']
- except:
- tmpName = '/tmp'
- tmpName += '/%s_%s' % (commands.getoutput('whoami'),commands.getoutput('uuidgen'))
- tmpFile = open(tmpName,'w')
- tmpFile.write(strData)
- tmpFile.close()
- com += ' --config %s' % tmpName
- com += ' %s' % url
- # execute
- if self.verbose:
- print com
- print commands.getoutput('cat %s' % tmpName)
- ret = commands.getstatusoutput(com)
- # remove temporary file
- os.remove(tmpName)
- if ret[0] != 0:
- ret = (ret[0]%255,ret[1])
- if self.verbose:
- print ret
- return ret
-
-
- # POST method
- def post(self,url,data):
- # make command
- com = '%s --silent' % self.path
- if not self.verifyHost:
- com += ' --insecure'
- if self.compress:
- com += ' --compressed'
- if self.sslCert != '':
- com += ' --cert %s' % self.sslCert
- if self.sslKey != '':
- com += ' --key %s' % self.sslKey
- # timeout
- com += ' -m 600'
- # data
- strData = ''
- for key in data.keys():
- strData += 'data="%s"\n' % urllib.urlencode({key:data[key]})
- # write data to temporary config file
- try:
- tmpName = os.environ['PANDA_TMP']
- except:
- tmpName = '/tmp'
- tmpName += '/%s_%s' % (commands.getoutput('whoami'),commands.getoutput('uuidgen'))
- tmpFile = open(tmpName,'w')
- tmpFile.write(strData)
- tmpFile.close()
- com += ' --config %s' % tmpName
- com += ' %s' % url
- # execute
- if self.verbose:
- print com
- print commands.getoutput('cat %s' % tmpName)
- ret = commands.getstatusoutput(com)
- # remove temporary file
- os.remove(tmpName)
- if ret[0] != 0:
- ret = (ret[0]%255,ret[1])
- if self.verbose:
- print ret
- return ret
-
-
- # PUT method
- def put(self,url,data):
- # make command
- com = '%s --silent' % self.path
- if not self.verifyHost:
- com += ' --insecure'
- if self.compress:
- com += ' --compressed'
- if self.sslCert != '':
- com += ' --cert %s' % self.sslCert
- if self.sslKey != '':
- com += ' --key %s' % self.sslKey
- # emulate PUT
- for key in data.keys():
- com += ' -F "%s=@%s"' % (key,data[key])
- com += ' %s' % url
- # execute
- if self.verbose:
- print com
- ret = commands.getstatusoutput(com)
- if ret[0] != 0:
- ret = (ret[0]%255,ret[1])
- if self.verbose:
- print ret
- return ret
-
-
-'''
-public methods
-
-'''
-
-# use web cache
-def useWebCache():
- global baseURL
- baseURL = 'http://pandaserver.cern.ch:25085/server/panda'
- global serverURLs
- for tmpKey,tmpVal in serverURLs.iteritems():
- tmpVal['URL'] = baseURL
-
-
-# submit jobs
-def submitJobs(jobs,srvID=None,toPending=False):
- # set hostname
- hostname = commands.getoutput('hostname')
- for job in jobs:
- job.creationHost = hostname
- # serialize
- strJobs = pickle.dumps(jobs)
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- # execute
- url = _getURL('URLSSL',srvID) + '/submitJobs'
- data = {'jobs':strJobs}
- if toPending:
- data['toPending'] = True
- status,output = curl.post(url,data)
- if status!=0:
- print output
- return status,output
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR submitJobs : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# run task assignment
-def runTaskAssignment(jobs):
- # set hostname
- hostname = commands.getoutput('hostname')
- for job in jobs:
- job.creationHost = hostname
- # serialize
- strJobs = pickle.dumps(jobs)
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- # execute
- url = baseURLSSL + '/runTaskAssignment'
- data = {'jobs':strJobs}
- status,output = curl.post(url,data)
- if status!=0:
- print output
- return status,output
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR runTaskAssignment : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get job status
-def getJobStatus(ids,srvID=None):
- # serialize
- strIDs = pickle.dumps(ids)
- # instantiate curl
- curl = _Curl()
- # execute
- url = _getURL('URL',srvID) + '/getJobStatus'
- data = {'ids':strIDs}
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getJobStatus : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get PandaID with jobexeID
-def getPandaIDwithJobExeID(ids):
- # serialize
- strIDs = pickle.dumps(ids)
- # instantiate curl
- curl = _Curl()
- # execute
- url = _getURL('URL') + '/getPandaIDwithJobExeID'
- data = {'ids':strIDs}
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getPandaIDwithJobExeID : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get assigning task
-def getAssigningTask():
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/getAssigningTask'
- status,output = curl.get(url,{})
- try:
- return status,pickle.loads(output)
- except:
- print output
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getAssigningTask : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get assigned cloud for tasks
-def seeCloudTask(ids):
- # serialize
- strIDs = pickle.dumps(ids)
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/seeCloudTask'
- data = {'ids':strIDs}
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR seeCloudTask : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# kill jobs
-def killJobs(ids,code=None,verbose=False,srvID=None,useMailAsID=False):
- # serialize
- strIDs = pickle.dumps(ids)
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- curl.verbose = verbose
- # execute
- url = _getURL('URLSSL',srvID) + '/killJobs'
- data = {'ids':strIDs,'code':code,'useMailAsID':useMailAsID}
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR killJobs : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# reassign jobs
-def reassignJobs(ids,forPending=False):
- # serialize
- strIDs = pickle.dumps(ids)
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- # execute
- url = baseURLSSL + '/reassignJobs'
- data = {'ids':strIDs}
- if forPending:
- data['forPending'] = True
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR reassignJobs : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# query PandaIDs
-def queryPandaIDs(ids):
- # serialize
- strIDs = pickle.dumps(ids)
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/queryPandaIDs'
- data = {'ids':strIDs}
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR queryPandaIDs : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# query job info per cloud
-def queryJobInfoPerCloud(cloud,schedulerID=None):
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/queryJobInfoPerCloud'
- data = {'cloud':cloud}
- if schedulerID != None:
- data['schedulerID'] = schedulerID
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR queryJobInfoPerCloud : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get job statistics
-def getJobStatistics(sourcetype=None):
- # instantiate curl
- curl = _Curl()
- # execute
- ret = {}
- for srvID in getPandas():
- url = _getURL('URL',srvID) + '/getJobStatistics'
- data = {}
- if sourcetype != None:
- data['sourcetype'] = sourcetype
- status,output = curl.get(url,data)
- try:
- tmpRet = status,pickle.loads(output)
- if status != 0:
- return tmpRet
- except:
- print output
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getJobStatistics : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
- # gather
- for tmpCloud,tmpVal in tmpRet[1].iteritems():
- if not ret.has_key(tmpCloud):
- # append cloud values
- ret[tmpCloud] = tmpVal
- else:
- # sum statistics
- for tmpStatus,tmpCount in tmpVal.iteritems():
- if ret[tmpCloud].has_key(tmpStatus):
- ret[tmpCloud][tmpStatus] += tmpCount
- else:
- ret[tmpCloud][tmpStatus] = tmpCount
- return 0,ret
-
-
-# get job statistics for Bamboo
-def getJobStatisticsForBamboo(useMorePG=False):
- # instantiate curl
- curl = _Curl()
- # execute
- ret = {}
- for srvID in getPandas():
- url = _getURL('URL',srvID) + '/getJobStatisticsForBamboo'
- data = {}
- if useMorePG != False:
- data['useMorePG'] = useMorePG
- status,output = curl.get(url,data)
- try:
- tmpRet = status,pickle.loads(output)
- if status != 0:
- return tmpRet
- except:
- print output
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getJobStatisticsForBamboo : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
- # gather
- for tmpCloud,tmpMap in tmpRet[1].iteritems():
- if not ret.has_key(tmpCloud):
- # append cloud values
- ret[tmpCloud] = tmpMap
- else:
- # sum statistics
- for tmpPType,tmpVal in tmpMap.iteritems():
- if not ret[tmpCloud].has_key(tmpPType):
- ret[tmpCloud][tmpPType] = tmpVal
- else:
- for tmpStatus,tmpCount in tmpVal.iteritems():
- if ret[tmpCloud][tmpPType].has_key(tmpStatus):
- ret[tmpCloud][tmpPType][tmpStatus] += tmpCount
- else:
- ret[tmpCloud][tmpPType][tmpStatus] = tmpCount
- return 0,ret
-
-
-# get highest prio jobs
-def getHighestPrioJobStat(perPG=False,useMorePG=False):
- # instantiate curl
- curl = _Curl()
- # execute
- ret = {}
- url = baseURL + '/getHighestPrioJobStat'
- data = {'perPG':perPG}
- if useMorePG != False:
- data['useMorePG'] = useMorePG
- status,output = curl.get(url,data)
- try:
- return status,pickle.loads(output)
- except:
- print output
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getHighestPrioJobStat : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get jobs updated recently
-def getJobsToBeUpdated(limit=5000,lockedby='',srvID=None):
- # instantiate curl
- curl = _Curl()
- # execute
- url = _getURL('URL',srvID) + '/getJobsToBeUpdated'
- status,output = curl.get(url,{'limit':limit,'lockedby':lockedby})
- try:
- return status,pickle.loads(output)
- except:
- print output
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getJobsToBeUpdated : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# update prodDBUpdateTimes
-def updateProdDBUpdateTimes(params,verbose=False,srvID=None):
- # serialize
- strPar = pickle.dumps(params)
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- curl.verbose = verbose
- # execute
- url = _getURL('URLSSL',srvID) + '/updateProdDBUpdateTimes'
- data = {'params':strPar}
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR updateProdDBUpdateTimes : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get PandaID at site
-def getPandaIDsSite(site,status,limit=500):
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/getPandaIDsSite'
- status,output = curl.get(url,{'site':site,'status':status,'limit':limit})
- try:
- return status,pickle.loads(output)
- except:
- print output
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getPandaIDsSite : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get job statistics per site
-def getJobStatisticsPerSite(predefined=False,workingGroup='',countryGroup='',jobType='',minPriority=None,
- readArchived=None):
- # instantiate curl
- curl = _Curl()
- # execute
- ret = {}
- for srvID in getPandas():
- url = _getURL('URL',srvID) + '/getJobStatisticsPerSite'
- data = {'predefined':predefined}
- if not workingGroup in ['',None]:
- data['workingGroup'] = workingGroup
- if not countryGroup in ['',None]:
- data['countryGroup'] = countryGroup
- if not jobType in ['',None]:
- data['jobType'] = jobType
- if not minPriority in ['',None]:
- data['minPriority'] = minPriority
- if not readArchived in ['',None]:
- data['readArchived'] = readArchived
- status,output = curl.get(url,data)
- try:
- tmpRet = status,pickle.loads(output)
- if status != 0:
- return tmpRet
- except:
- print output
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getJobStatisticsPerSite : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
- # gather
- for tmpSite,tmpVal in tmpRet[1].iteritems():
- if not ret.has_key(tmpSite):
- # append site values
- ret[tmpSite] = tmpVal
- else:
- # sum statistics
- for tmpStatus,tmpCount in tmpVal.iteritems():
- if ret[tmpSite].has_key(tmpStatus):
- ret[tmpSite][tmpStatus] += tmpCount
- else:
- ret[tmpSite][tmpStatus] = tmpCount
- return 0,ret
-
-
-# get job statistics per site with label
-def getJobStatisticsWithLabel(site=''):
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/getJobStatisticsWithLabel'
- data = {}
- if not site in ['',None]:
- data['site'] = site
- status,output = curl.get(url,data)
- try:
- return status,pickle.loads(output)
- except:
- print output
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getJobStatisticsWithLabel : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get the number of waiting jobs per site and user
-def getJobStatisticsPerUserSite():
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/getJobStatisticsPerUserSite'
- data = {}
- status,output = curl.get(url,data)
- try:
- return status,pickle.loads(output)
- except:
- print output
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getJobStatisticsPerUserSite : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# query last files in datasets
-def queryLastFilesInDataset(datasets):
- # serialize
- strDSs = pickle.dumps(datasets)
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/queryLastFilesInDataset'
- data = {'datasets':strDSs}
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- print "ERROR queryLastFilesInDataset : %s %s" % (type,value)
- return EC_Failed,None
-
-
-# insert sandbox file info
-def insertSandboxFileInfo(userName,fileName,fileSize,checkSum,verbose=False):
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- curl.verbose = verbose
- # execute
- url = baseURLSSL + '/insertSandboxFileInfo'
- data = {'userName':userName,'fileName':fileName,'fileSize':fileSize,'checkSum':checkSum}
- return curl.post(url,data)
-
-
-# put file
-def putFile(file):
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- # execute
- url = baseURLSSL + '/putFile'
- data = {'file':file}
- return curl.put(url,data)
-
-
-# delete file
-def deleteFile(file):
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- # execute
- url = baseURLSSL + '/deleteFile'
- data = {'file':file}
- return curl.post(url,data)
-
-
-# touch file
-def touchFile(sourceURL,filename):
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- # execute
- url = sourceURL + '/server/panda/touchFile'
- data = {'filename':filename}
- return curl.post(url,data)
-
-
-# resubmit jobs
-def resubmitJobs(ids):
- # serialize
- strIDs = pickle.dumps(ids)
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- # execute
- url = baseURLSSL + '/resubmitJobs'
- data = {'ids':strIDs}
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- print "ERROR resubmitJobs : %s %s" % (type,value)
- return EC_Failed,None
-
-
-# get site specs
-def getSiteSpecs(siteType=None):
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/getSiteSpecs'
- data = {}
- if siteType != None:
- data = {'siteType':siteType}
- status,output = curl.get(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getSiteSpecs : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get cloud specs
-def getCloudSpecs():
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/getCloudSpecs'
- status,output = curl.get(url,{})
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getCloudSpecs : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# get nPilots
-def getNumPilots():
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/getNumPilots'
- status,output = curl.get(url,{})
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getNumPilots : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# run brokerage
-def runBrokerage(sites,atlasRelease,cmtConfig=None):
- # serialize
- strSites = pickle.dumps(sites)
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURL + '/runBrokerage'
- data = {'sites':strSites,
- 'atlasRelease':atlasRelease}
- if cmtConfig != None:
- data['cmtConfig'] = cmtConfig
- return curl.get(url,data)
-
-
-# get RW
-def getRW(priority=0):
- # instantiate curl
- curl = _Curl()
- # execute
- url = baseURLBAMBOO + '/getRW'
- # get RWs for high priority tasks
- data = {'priority':priority}
- status,output = curl.get(url,data)
- try:
- return status,pickle.loads(output)
- except:
- type, value, traceBack = sys.exc_info()
- errStr = "ERROR getRW : %s %s" % (type,value)
- print errStr
- return EC_Failed,output+'\n'+errStr
-
-
-# change job priorities
-def changeJobPriorities(newPrioMap):
- # serialize
- newPrioMapStr = pickle.dumps(newPrioMap)
- # instantiate curl
- curl = _Curl()
- curl.sslCert = _x509()
- curl.sslKey = _x509()
- # execute
- url = baseURLSSL + '/changeJobPriorities'
- data = {'newPrioMap':newPrioMapStr}
- status,output = curl.post(url,data)
- try:
- return status,pickle.loads(output)
- except:
- errtype,errvalue = sys.exc_info()[:2]
- errStr = "ERROR changeJobPriorities : %s %s" % (errtype,errvalue)
- return EC_Failed,output+'\n'+errStr
-
-
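The server-map logic near the top of this module expects PANDA_URL_MAP to hold '|'-separated entries, each of the form name,httpURL,httpsURL, and decodes it at import time. A short sketch of how a caller might exercise it; the BACKUP host below is a placeholder, not a real endpoint:

import os
os.environ['PANDA_URL_MAP'] = (
    'CERN,http://pandaserver.cern.ch:25080/server/panda,'
    'https://pandaserver.cern.ch:25443/server/panda'
    '|BACKUP,http://panda-backup.example.org:25080/server/panda,'   # placeholder host
    'https://panda-backup.example.org:25443/server/panda'
)

import userinterface.Client as Client    # the map is decoded when the module is imported
print(Client.getPandas())                # e.g. ['CERN', 'BACKUP'] ('default' is removed)
print(Client._getURL('URLSSL', 'CERN'))  # unknown srvIDs fall back to the 'default' entry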
diff --git a/current/pandaserver/userinterface/RbLauncher.py b/current/pandaserver/userinterface/RbLauncher.py
deleted file mode 100755
index a23a6fbcf..000000000
--- a/current/pandaserver/userinterface/RbLauncher.py
+++ /dev/null
@@ -1,52 +0,0 @@
-'''
-launcher for ReBroker
-
-'''
-
-import sys
-import time
-import commands
-import threading
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('RbLauncher')
-
-
-class RbLauncher (threading.Thread):
- # constructor
- def __init__(self,dn,jobID,cloud=None,excludedSite=None):
- threading.Thread.__init__(self)
- self.dn = dn
- self.jobID = jobID
- self.cloud = cloud
- self.excludedSite = excludedSite
- # time stamp
- self.timestamp = time.asctime()
-
-
- # main
- def run(self):
- try:
- _logger.debug('%s startRun' % self.timestamp)
- # run
- com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd)
- com += 'source %s; ' % panda_config.glite_source
- com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/userinterface/runReBroker.py ' % \
- (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
- panda_config.pandaPython_dir)
- com += '-j %s -d "%s" ' % (self.jobID,self.dn)
- if self.cloud != None:
- com += '-c %s ' % self.cloud
- if self.excludedSite != None:
- com += '-e %s ' % self.excludedSite
- # execute
- _logger.debug('%s com=%s' % (self.timestamp,com))
- status,output = commands.getstatusoutput(com)
- _logger.debug("%s Ret from another process: %s %s" % (self.timestamp,status,output))
- _logger.debug('%s endRun' % self.timestamp)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("run() : %s %s" % (type,value))
diff --git a/current/pandaserver/userinterface/ReBroker.py b/current/pandaserver/userinterface/ReBroker.py
deleted file mode 100644
index 205b375ee..000000000
--- a/current/pandaserver/userinterface/ReBroker.py
+++ /dev/null
@@ -1,1022 +0,0 @@
-'''
-find another candidate site for analysis
-
-'''
-
-import re
-import sys
-import time
-import random
-import datetime
-import threading
-
-from dataservice.DDM import ddm
-from dataservice.DDM import dq2Common
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.OraDBProxy import DBProxy
-from dataservice.Setupper import Setupper
-from brokerage.SiteMapper import SiteMapper
-import brokerage.broker
-
-from config import panda_config
-from pandalogger.PandaLogger import PandaLogger
-
-# logger
-_logger = PandaLogger().getLogger('ReBroker')
-
-def initLogger(pLogger):
- # redirect logging to parent as it doesn't work in nested threads
- global _logger
- _logger = pLogger
-
-
-class ReBroker (threading.Thread):
-
- # constructor
- def __init__(self,taskBuffer,cloud=None,excludedSite=None,overrideSite=True,
- simulation=False,forceOpt=False,userRequest=False,forFailed=False,
- avoidSameSite=False):
- threading.Thread.__init__(self)
- self.job = None
- self.jobID = None
- self.pandaID = None
- self.cloud = cloud
- self.pandaJobList = []
- self.buildStatus = None
- self.taskBuffer = taskBuffer
- self.token = None
- self.newDatasetMap = {}
- self.simulation = simulation
- self.forceOpt = forceOpt
- self.excludedSite = excludedSite
- self.overrideSite = overrideSite
- self.maxPandaIDlibDS = None
- self.userRequest = userRequest
- self.forFailed = forFailed
- self.revNum = 0
- self.avoidSameSite = avoidSameSite
- self.brokerageInfo = []
-
-
- # main
- def run(self):
- try:
- # get job
- tmpJobs = self.taskBuffer.getFullJobStatus([self.rPandaID])
- if tmpJobs == [] or tmpJobs[0] == None:
- _logger.debug("cannot find job for PandaID=%s" % self.rPandaID)
- return
- self.job = tmpJobs[0]
- _logger.debug("%s start %s:%s:%s" % (self.token,self.job.jobDefinitionID,self.job.prodUserName,self.job.computingSite))
- # using output container
- if not self.job.destinationDBlock.endswith('/'):
- _logger.debug("%s ouput dataset container is required" % self.token)
- _logger.debug("%s end" % self.token)
- return
- # FIXME : don't touch group jobs for now
- if self.job.destinationDBlock.startswith('group') and (not self.userRequest):
- _logger.debug("%s skip group jobs" % self.token)
- _logger.debug("%s end" % self.token)
- return
- # check processingType
- typesForRebro = ['pathena','prun','ganga','ganga-rbtest']
- if not self.job.processingType in typesForRebro:
- _logger.debug("%s skip processingType=%s not in %s" % \
- (self.token,self.job.processingType,str(typesForRebro)))
- _logger.debug("%s end" % self.token)
- return
- # check jobsetID
- if self.job.jobsetID in [0,'NULL',None]:
- _logger.debug("%s jobsetID is undefined" % self.token)
- _logger.debug("%s end" % self.token)
- return
- # check metadata
- if self.job.metadata in [None,'NULL']:
- _logger.debug("%s metadata is unavailable" % self.token)
- _logger.debug("%s end" % self.token)
- return
- # check --disableRebrokerage
- match = re.search("--disableRebrokerage",self.job.metadata)
- if match != None and (not self.simulation) and (not self.forceOpt) \
- and (not self.userRequest):
- _logger.debug("%s diabled rebrokerage" % self.token)
- _logger.debug("%s end" % self.token)
- return
- # check --site
- match = re.search("--site",self.job.metadata)
- if match != None and (not self.simulation) and (not self.forceOpt) \
- and (not self.userRequest):
- _logger.debug("%s --site is used" % self.token)
- _logger.debug("%s end" % self.token)
- return
- # check --libDS
- match = re.search("--libDS",self.job.metadata)
- if match != None:
- _logger.debug("%s --libDS is used" % self.token)
- _logger.debug("%s end" % self.token)
- return
- # check --workingGroup since it is site-specific
- match = re.search("--workingGroup",self.job.metadata)
- if match != None:
- _logger.debug("%s workingGroup is specified" % self.token)
- _logger.debug("%s end" % self.token)
- return
- # avoid too many rebrokerages
- if not self.checkRev():
- _logger.debug("%s avoid too many rebrokerage" % self.token)
- _logger.debug("%s end" % self.token)
- return
- # check if multiple JobIDs use the same libDS
- if self.bPandaID != None and self.buildStatus not in ['finished','failed']:
- if self.minPandaIDlibDS == None or self.maxPandaIDlibDS == None:
- _logger.debug("%s max/min PandaIDs are unavailable for the libDS" % self.token)
- _logger.debug("%s end" % self.token)
- return
- tmpPandaIDsForLibDS = self.taskBuffer.getFullJobStatus([self.minPandaIDlibDS,self.maxPandaIDlibDS])
- if len(tmpPandaIDsForLibDS) != 2 or tmpPandaIDsForLibDS[0] == None or tmpPandaIDsForLibDS[1] == None:
- _logger.debug("%s failed to get max/min PandaIDs for the libDS" % self.token)
- _logger.debug("%s end" % self.token)
- return
- # check
- if tmpPandaIDsForLibDS[0].jobDefinitionID != tmpPandaIDsForLibDS[1].jobDefinitionID:
- _logger.debug("%s multiple JobIDs use the libDS %s:%s %s:%s" % (self.token,tmpPandaIDsForLibDS[0].jobDefinitionID,
- self.minPandaIDlibDS,tmpPandaIDsForLibDS[1].jobDefinitionID,
- self.maxPandaIDlibDS))
- _logger.debug("%s end" % self.token)
- return
- # check excludedSite
- if self.excludedSite == None:
- self.excludedSite = []
- match = re.search("--excludedSite( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata)
- if match != None:
- self.excludedSite = match.group(3).split(',')
- # remove empty
- try:
- self.excludedSite.remove('')
- except:
- pass
- _logger.debug("%s excludedSite=%s" % (self.token,str(self.excludedSite)))
- # check cloud
- if self.cloud == None:
- match = re.search("--cloud( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata)
- if match != None:
- self.cloud = match.group(3)
- _logger.debug("%s cloud=%s" % (self.token,self.cloud))
- # get inDS/LFNs
- status,tmpMapInDS,maxFileSize = self.taskBuffer.getInDatasetsForReBrokerage(self.jobID,self.userName)
- if not status:
- # failed
- _logger.error("%s failed to get inDS/LFN from DB" % self.token)
- return
- status,inputDS = self.getListDatasetsUsedByJob(tmpMapInDS)
- if not status:
- # failed
- _logger.error("%s failed" % self.token)
- return
- # get replicas
- replicaMap = {}
- unknownSites = {}
- for tmpDS in inputDS:
- if tmpDS.endswith('/'):
- # container
- status,tmpRepMaps = self.getListDatasetReplicasInContainer(tmpDS)
- else:
- # normal dataset
- status,tmpRepMap = self.getListDatasetReplicas(tmpDS)
- tmpRepMaps = {tmpDS:tmpRepMap}
- if not status:
- # failed
- _logger.debug("%s failed" % self.token)
- return
- # make map per site
- for tmpDS,tmpRepMap in tmpRepMaps.iteritems():
- for tmpSite,tmpStat in tmpRepMap.iteritems():
- # ignore special sites
- if tmpSite in ['CERN-PROD_TZERO','CERN-PROD_DAQ','CERN-PROD_TMPDISK']:
- continue
- # ignore tape sites
- if tmpSite.endswith('TAPE'):
- continue
- # keep sites with unknown replica info
- if tmpStat[-1]['found'] == None:
- if not unknownSites.has_key(tmpDS):
- unknownSites[tmpDS] = []
- unknownSites[tmpDS].append(tmpSite)
- # ignore ToBeDeleted
- if tmpStat[-1]['archived'] in ['ToBeDeleted',]:
- continue
- # change EOS
- if tmpSite.startswith('CERN-PROD_EOS'):
- tmpSite = 'CERN-PROD_EOS'
- # change EOS TMP
- if tmpSite.startswith('CERN-PROD_TMP'):
- tmpSite = 'CERN-PROD_TMP'
- # change DISK to SCRATCHDISK
- tmpSite = re.sub('_[^_-]+DISK$','',tmpSite)
- # change PERF-XYZ to SCRATCHDISK
- tmpSite = re.sub('_PERF-[^_-]+$','',tmpSite)
- # change PHYS-XYZ to SCRATCHDISK
- tmpSite = re.sub('_PHYS-[^_-]+$','',tmpSite)
- # patch for BNLPANDA
- if tmpSite in ['BNLPANDA']:
- tmpSite = 'BNL-OSG2'
- # add to map
- if not replicaMap.has_key(tmpSite):
- replicaMap[tmpSite] = {}
- replicaMap[tmpSite][tmpDS] = tmpStat[-1]
- _logger.debug("%s replica map -> %s" % (self.token,str(replicaMap)))
- # refresh replica info if needed
- self.refreshReplicaInfo(unknownSites)
- # instantiate SiteMapper
- siteMapper = SiteMapper(self.taskBuffer)
- # get original DDM
- origSiteDDM = self.getAggName(siteMapper.getSite(self.job.computingSite).ddm)
- # check all datasets
- maxDQ2Sites = []
- if inputDS != []:
- # loop over all sites
- for tmpSite,tmpDsVal in replicaMap.iteritems():
- # loop over all datasets
- appendFlag = True
- for tmpOrigDS in inputDS:
- # check completeness
- if tmpDsVal.has_key(tmpOrigDS) and tmpDsVal[tmpOrigDS]['found'] != None and \
- tmpDsVal[tmpOrigDS]['total'] == tmpDsVal[tmpOrigDS]['found']:
- pass
- else:
- appendFlag = False
- # append
- if appendFlag:
- if not tmpSite in maxDQ2Sites:
- maxDQ2Sites.append(tmpSite)
- _logger.debug("%s candidate DQ2s -> %s" % (self.token,str(maxDQ2Sites)))
- if inputDS != [] and maxDQ2Sites == []:
- _logger.debug("%s no DQ2 candidate" % self.token)
- else:
- maxPandaSites = []
- # original maxinputsize
- origMaxInputSize = siteMapper.getSite(self.job.computingSite).maxinputsize
- # look for Panda siteIDs
- for tmpSiteID,tmpSiteSpec in siteMapper.siteSpecList.iteritems():
- # use ANALY_ only
- if not tmpSiteID.startswith('ANALY_'):
- continue
- # remove test and local
- if re.search('_test',tmpSiteID,re.I) != None:
- continue
- if re.search('_local',tmpSiteID,re.I) != None:
- continue
- # avoid same site
- if self.avoidSameSite and self.getAggName(tmpSiteSpec.ddm) == origSiteDDM:
- continue
- # check DQ2 ID
- if self.cloud in [None,tmpSiteSpec.cloud] \
- and (self.getAggName(tmpSiteSpec.ddm) in maxDQ2Sites or inputDS == []):
- # excluded sites
- excludedFlag = False
- for tmpExcSite in self.excludedSite:
- if re.search(tmpExcSite,tmpSiteID) != None:
- excludedFlag = True
- break
- if excludedFlag:
- _logger.debug("%s skip %s since excluded" % (self.token,tmpSiteID))
- continue
- # use online only
- if tmpSiteSpec.status != 'online':
- _logger.debug("%s skip %s status=%s" % (self.token,tmpSiteID,tmpSiteSpec.status))
- continue
- # check maxinputsize
- if (maxFileSize == None and origMaxInputSize > siteMapper.getSite(tmpSiteID).maxinputsize) or \
- maxFileSize > siteMapper.getSite(tmpSiteID).maxinputsize:
- _logger.debug("%s skip %s due to maxinputsize" % (self.token,tmpSiteID))
- continue
- # append
- if not tmpSiteID in maxPandaSites:
- maxPandaSites.append(tmpSiteID)
- # choose at most 20 sites randomly to avoid too many lookups
- random.shuffle(maxPandaSites)
- maxPandaSites = maxPandaSites[:20]
- _logger.debug("%s candidate PandaSites -> %s" % (self.token,str(maxPandaSites)))
- # no Panda siteIDs
- if maxPandaSites == []:
- _logger.debug("%s no Panda site candidate" % self.token)
- else:
- # set AtlasRelease and cmtConfig to dummy job
- tmpJobForBrokerage = JobSpec()
- if self.job.AtlasRelease in ['NULL',None]:
- tmpJobForBrokerage.AtlasRelease = ''
- else:
- tmpJobForBrokerage.AtlasRelease = self.job.AtlasRelease
- # use nightlies
- matchNight = re.search('^AnalysisTransforms-.*_(rel_\d+)$',self.job.homepackage)
- if matchNight != None:
- tmpJobForBrokerage.AtlasRelease += ':%s' % matchNight.group(1)
- # use cache
- else:
- matchCache = re.search('^AnalysisTransforms-([^/]+)',self.job.homepackage)
- if matchCache != None:
- tmpJobForBrokerage.AtlasRelease = matchCache.group(1).replace('_','-')
- if not self.job.cmtConfig in ['NULL',None]:
- tmpJobForBrokerage.cmtConfig = self.job.cmtConfig
- # memory size
- if not self.job.minRamCount in ['NULL',None,0]:
- tmpJobForBrokerage.minRamCount = self.job.minRamCount
- # CPU count
- if not self.job.maxCpuCount in ['NULL',None,0]:
- tmpJobForBrokerage.maxCpuCount = self.job.maxCpuCount
- # run brokerage
- brokerage.broker.schedule([tmpJobForBrokerage],self.taskBuffer,siteMapper,forAnalysis=True,
- setScanSiteList=maxPandaSites,trustIS=True,reportLog=True)
- newSiteID = tmpJobForBrokerage.computingSite
- self.brokerageInfo += tmpJobForBrokerage.brokerageErrorDiag
- _logger.debug("%s runBrokerage - > %s" % (self.token,newSiteID))
- # unknown site
- if not siteMapper.checkSite(newSiteID):
- _logger.error("%s unknown site" % self.token)
- _logger.debug("%s failed" % self.token)
- return
- # get new site spec
- newSiteSpec = siteMapper.getSite(newSiteID)
- # avoid repetition
- if self.getAggName(newSiteSpec.ddm) == origSiteDDM:
- _logger.debug("%s assigned to the same site %s " % (self.token,newSiteID))
- _logger.debug("%s end" % self.token)
- return
- # simulation mode
- if self.simulation:
- _logger.debug("%s end simulation" % self.token)
- return
- # prepare jobs
- status = self.prepareJob(newSiteID,newSiteSpec.cloud)
- if status:
- # run SetUpper
- statusSetUp = self.runSetUpper()
- if not statusSetUp:
- _logger.debug("%s runSetUpper failed" % self.token)
- else:
- _logger.debug("%s successfully assigned to %s" % (self.token,newSiteID))
- _logger.debug("%s end" % self.token)
- except:
- errType,errValue,errTraceBack = sys.exc_info()
- _logger.error("%s run() : %s %s" % (self.token,errType,errValue))
-
-
- # get aggregated DQ2 ID
- def getAggName(self,origName):
- if origName.startswith('CERN-PROD_EOS'):
- return 'CERN-PROD_EOS'
- if origName.startswith('CERN-PROD_TMP'):
- return 'CERN-PROD_TMP'
- return re.sub('_[^_-]+DISK$','',origName)
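# The normalization done by getAggName() above, shown standalone for reference;
# the endpoint names are illustrative.
import re
for name in ['CERN-PROD_EOSDATADISK', 'SITEX_SCRATCHDISK', 'SITEX_MCTAPE']:
    if name.startswith('CERN-PROD_EOS'):
        agg = 'CERN-PROD_EOS'
    elif name.startswith('CERN-PROD_TMP'):
        agg = 'CERN-PROD_TMP'
    else:
        agg = re.sub('_[^_-]+DISK$', '', name)
    print('%s -> %s' % (name, agg))
# CERN-PROD_EOSDATADISK -> CERN-PROD_EOS
# SITEX_SCRATCHDISK     -> SITEX
# SITEX_MCTAPE          -> SITEX_MCTAPE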
-
-
- # lock job to prevent multiple brokers from running in parallel
- def lockJob(self,dn,jobID):
- # make token
- tmpProxy = DBProxy()
- self.token = "%s:%s:" % (tmpProxy.cleanUserID(dn),jobID)
- _logger.debug("%s lockJob" % self.token)
- # lock
- resST,resVal = self.taskBuffer.lockJobForReBrokerage(dn,jobID,self.simulation,self.forceOpt,
- forFailed=self.forFailed)
- # failed
- if not resST:
- _logger.debug("%s lockJob failed since %s" % (self.token,resVal['err']))
- return False,resVal['err']
- # keep jobID
- self.jobID = jobID
- # set PandaID,buildStatus,userName
- self.rPandaID = resVal['rPandaID']
- self.bPandaID = resVal['bPandaID']
- self.userName = resVal['userName']
- self.buildStatus = resVal['bStatus']
- self.buildJobID = resVal['bJobID']
- self.minPandaIDlibDS = resVal['minPandaIDlibDS']
- self.maxPandaIDlibDS = resVal['maxPandaIDlibDS']
- # use JobID as rev num
- self.revNum = self.taskBuffer.getJobIdUser(dn)
- _logger.debug("%s run PandaID=%s / build PandaID=%s Status=%s JobID=%s rev=%s" % \
- (self.token,self.rPandaID,self.bPandaID,self.buildStatus,
- self.buildJobID,self.revNum))
- # return
- return True,''
-
-
- # move build job to jobsDefined4
- def prepareJob(self,site,cloud):
- _logger.debug("%s prepareJob" % self.token)
- # reuse buildJob + all runJobs
- if self.jobID == self.buildJobID and self.buildStatus in ['defined','activated']:
- if self.buildStatus == 'activated':
- # move build job to jobsDefined4
- ret = self.taskBuffer.resetBuildJobForReBrokerage(self.bPandaID)
- if not ret:
- _logger.error("%s failed to move build job %s to jobsDefined" % (self.token,self.bPandaID))
- return False
- # get PandaIDs from jobsDefined4
- tmpPandaIDs = self.taskBuffer.getPandaIDsForReBrokerage(self.userName,self.jobID,False,
- forFailed=self.forFailed)
- if tmpPandaIDs == []:
- _logger.error("%s cannot find PandaDSs" % self.token)
- return False
- # get jobSpecs
- iBunchJobs = 0
- nBunchJobs = 500
- tmpJobsMap = {}
- while iBunchJobs < len(tmpPandaIDs):
- # get IDs
- tmpJobs = self.taskBuffer.peekJobs(tmpPandaIDs[iBunchJobs:iBunchJobs+nBunchJobs],True,False,False,False)
- for tmpJob in tmpJobs:
- if tmpJob != None and tmpJob.jobStatus in ['defined','assigned']:
- # remove _sub suffix
- for tmpFile in tmpJob.Files:
- if tmpFile.type != 'input':
- tmpFile.destinationDBlock = re.sub('_sub\d+$','',tmpFile.destinationDBlock)
- self.pandaJobList.append(tmpJob)
- # increment index
- iBunchJobs += nBunchJobs
- # make new bunch
- else:
- # make new buildJob
- if self.bPandaID != None:
- tmpJobs = self.taskBuffer.getFullJobStatus([self.bPandaID])
- if tmpJobs == [] or tmpJobs[0] == None:
- _logger.debug("cannot find build job for PandaID=%s" % self.bPandaID)
- return False
- # make
- tmpBuildJob,oldLibDS,newLibDS = self.makeNewBuildJobForRebrokerage(tmpJobs[0])
- # set parameters
- tmpBuildJob.jobExecutionID = self.jobID
- tmpBuildJob.jobsetID = -1
- tmpBuildJob.sourceSite = self.job.jobsetID
- # register
- status = self.registerNewDataset(newLibDS)
- if not status:
- _logger.debug("%s failed to register new libDS" % self.token)
- return False
- # append
- self.pandaJobList = [tmpBuildJob]
- # prepare outputDS
- status = self.prepareDS()
- if not status:
- _logger.error("%s failed to prepare outputDS" % self.token)
- return False
- # get PandaIDs
- if self.buildStatus in ['finished',None]:
- # from jobsActivated when buildJob already finished or noBuild
- tmpPandaIDs = self.taskBuffer.getPandaIDsForReBrokerage(self.userName,self.jobID,True,
- forFailed=self.forFailed)
- else:
- # from jobsDefined
- tmpPandaIDs = self.taskBuffer.getPandaIDsForReBrokerage(self.userName,self.jobID,False,
- forFailed=self.forFailed)
- if tmpPandaIDs == []:
- _logger.error("%s cannot find PandaDSs" % self.token)
- return False
- # get jobSpecs
- iBunchJobs = 0
- nBunchJobs = 500
- tmpJobsMap = {}
- while iBunchJobs < len(tmpPandaIDs):
- # get jobs
- tmpJobs = self.taskBuffer.peekJobs(tmpPandaIDs[iBunchJobs:iBunchJobs+nBunchJobs],True,True,False,False,True)
- for tmpJob in tmpJobs:
- # reset parameters for retry
- if self.forFailed and tmpJob != None:
- self.taskBuffer.retryJob(tmpJob.PandaID,{},failedInActive=True,
- changeJobInMem=True,inMemJob=tmpJob)
- # set holding to be compatible with rebro jobs
- tmpJob.jobStatus = 'holding'
- # check job status. activated jobs were changed to holding by getPandaIDsForReBrokerage
- if tmpJob != None and tmpJob.jobStatus in ['defined','assigned','holding']:
- # reset parameter
- tmpJob.parentID = tmpJob.PandaID
- tmpJob.PandaID = None
- tmpJob.jobExecutionID = tmpJob.jobDefinitionID
- tmpJob.jobsetID = -1
- tmpJob.sourceSite = self.job.jobsetID
- if self.bPandaID != None:
- tmpJob.jobParameters = re.sub(oldLibDS,newLibDS,tmpJob.jobParameters)
- for tmpFile in tmpJob.Files:
- tmpFile.row_ID = None
- tmpFile.PandaID = None
- if tmpFile.type == 'input':
- if self.bPandaID != None and tmpFile.dataset == oldLibDS:
- tmpFile.status = 'unknown'
- tmpFile.GUID = None
- tmpFile.dataset = newLibDS
- tmpFile.dispatchDBlock = newLibDS
- tmpFile.lfn = re.sub(oldLibDS,newLibDS,tmpFile.lfn)
- else:
- # use new dataset
- tmpFile.destinationDBlock = re.sub('_sub\d+$','',tmpFile.destinationDBlock)
- if not self.newDatasetMap.has_key(tmpFile.destinationDBlock):
- _logger.error("%s cannot find new dataset for %s:%s" % (self.token,tmpFile.PandaID,tmpFile.destinationDBlock))
- return False
- tmpFile.destinationDBlock = self.newDatasetMap[tmpFile.destinationDBlock]
- # append
- self.pandaJobList.append(tmpJob)
- # increment index
- iBunchJobs += nBunchJobs
- # no jobs
- if self.pandaJobList == []:
- _logger.error("%s no jobs" % self.token)
- return False
- # set cloud, site, and specialHandling
- for tmpJob in self.pandaJobList:
- # set specialHandling
- if tmpJob.specialHandling in [None,'NULL','']:
- if not self.forFailed:
- tmpJob.specialHandling = 'rebro'
- else:
- tmpJob.specialHandling = 'sretry'
- else:
- if not self.forFailed:
- tmpJob.specialHandling += ',rebro'
- else:
- tmpJob.specialHandling += ',sretry'
- # check if --destSE is used
- oldComputingSite = tmpJob.computingSite
- if tmpJob.destinationSE == oldComputingSite:
- tmpJob.destinationSE = site
- # set site and cloud
- tmpJob.computingSite = site
- tmpJob.cloud = cloud
- # reset destinationDBlock
- for tmpFile in tmpJob.Files:
- if tmpFile.type in ['output','log']:
- # set destSE
- if tmpFile.destinationSE == oldComputingSite:
- tmpFile.destinationSE = site
- # set the same specialHandling since new build may have different specialHandling
- self.pandaJobList[0].specialHandling = self.pandaJobList[-1].specialHandling
- # return
- return True
-
-
- # prepare new output datasets
- def prepareDS(self):
- _logger.debug("%s prepareDS" % self.token)
- # get all outDSs
- shadowDsName = None
- for tmpFile in self.job.Files:
- if tmpFile.type in ['output','log']:
- tmpDS = re.sub('_sub\d+$','',tmpFile.destinationDBlock)
- # append new rev number
- match = re.search('_rev(\d+)$',tmpDS)
- if match == None:
- newDS = tmpDS + '_rev%s' % self.revNum
- else:
- newDS = re.sub('_rev(\d+)$','_rev%s' % self.revNum,tmpDS)
- # add shadow
- """
- if shadowDsName == None and tmpFile.type == 'log':
- shadowDsName = "%s_shadow" % newDS
- status = self.registerNewDataset(shadowDsName)
- if not status:
- _logger.debug("%s prepareDS failed for shadow" % self.token)
- return False
- """
- # add datasets
- if not tmpDS in self.newDatasetMap:
- # register
- status = self.registerNewDataset(newDS,tmpFile.dataset)
- if not status:
- _logger.debug("%s prepareDS failed" % self.token)
- return False
- # append
- self.newDatasetMap[tmpDS] = newDS
- return True
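# How prepareDS() derives the rev-suffixed output dataset name, shown standalone;
# the dataset name and revision number are illustrative.
import re
revNum = 3
tmpDS = 'user.someuser.0101.myAnalysis_outDS'
match = re.search('_rev(\d+)$', tmpDS)
if match is None:
    newDS = tmpDS + '_rev%s' % revNum                       # first rebrokerage -> ..._rev3
else:
    newDS = re.sub('_rev(\d+)$', '_rev%s' % revNum, tmpDS)  # bump an existing _revN suffix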
-
-
- # run SetUpper
- def runSetUpper(self):
- # reuse buildJob + all runJobs
- reuseFlag = False
- if self.jobID == self.buildJobID and self.buildStatus in ['defined','activated']:
- reuseFlag = True
- _logger.debug("%s start Setupper for JobID=%s" % (self.token,self.jobID))
- thr = Setupper(self.taskBuffer,self.pandaJobList,resetLocation=True)
- thr.start()
- thr.join()
- # new bunch
- else:
- # fake FQANs
- fqans = []
- if not self.job.countryGroup in ['','NULL',None]:
- fqans.append('/atlas/%s/Role=NULL' % self.job.countryGroup)
- if self.job.destinationDBlock.startswith('group') and not self.job.workingGroup in ['','NULL',None]:
- fqans.append('/atlas/%s/Role=production' % self.job.workingGroup)
- # insert jobs
- _logger.debug("%s start storeJobs for JobID=%s" % (self.token,self.jobID))
- ret = self.taskBuffer.storeJobs(self.pandaJobList,self.job.prodUserID,True,False,fqans,
- self.job.creationHost,True,checkSpecialHandling=False)
- if ret == []:
- _logger.error("%s storeJobs failed with [] for JobID=%s" % (self.token,self.jobID))
- return False
- # get PandaIDs to be killed
- pandaIDsTobeKilled = []
- newJobDefinitionID = None
- newJobsetID = None
- strNewIDsList = []
- for tmpIndex,tmpItem in enumerate(ret):
- if not tmpItem[0] in ['NULL',None]:
- tmpJob = self.pandaJobList[tmpIndex]
- if not tmpJob.parentID in [0,None,'NULL']:
- pandaIDsTobeKilled.append(tmpJob.parentID)
- if newJobDefinitionID == None:
- newJobDefinitionID = tmpItem[1]
- if newJobsetID == None:
- newJobsetID = tmpItem[2]['jobsetID']
- strNewIDs = 'PandaID=%s JobsetID=%s JobID=%s' % (tmpItem[0],newJobsetID,newJobDefinitionID)
- strNewIDsList.append(strNewIDs)
- if pandaIDsTobeKilled != []:
- strNewJobIDs = "JobsetID=%s JobID=%s" % (newJobsetID,newJobDefinitionID)
- _logger.debug("%s kill jobs for JobID=%s -> new %s : %s" % \
- (self.token,self.jobID,strNewJobIDs,str(pandaIDsTobeKilled)))
- for tmpIdx,tmpPandaID in enumerate(pandaIDsTobeKilled):
- if not self.forFailed:
- self.taskBuffer.killJobs([tmpPandaID],strNewIDsList[tmpIdx],'8',True)
- else:
- self.taskBuffer.killJobs([tmpPandaID],strNewIDsList[tmpIdx],'7',True)
- # send brokerage info
- if not self.forFailed:
- tmpMsg = 'action=rebrokerage ntry=%s ' % self.pandaJobList[0].specialHandling.split(',').count('rebro')
- else:
- tmpMsg = 'action=serverretry ntry=%s ' % self.pandaJobList[0].specialHandling.split(',').count('sretry')
- tmpMsg += 'old_jobset=%s old_jobdef=%s old_site=%s' % (self.job.jobsetID,self.jobID,self.job.computingSite)
- self.brokerageInfo.append(tmpMsg)
- brokerage.broker.sendMsgToLoggerHTTP(self.brokerageInfo,self.pandaJobList[0])
- # succeeded
- _logger.debug("%s completed for JobID=%s" % (self.token,self.jobID))
- return True
-
-
- # check DDM response
- def isDQ2ok(self,out):
- if out.find("DQ2 internal server exception") != -1 \
- or out.find("An error occurred on the central catalogs") != -1 \
- or out.find("MySQL server has gone away") != -1 \
- or out == '()':
- return False
- return True
-
-
- # get list of datasets
- def getListDatasets(self,dataset):
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug("%s %s/%s listDatasets %s" % (self.token,iDDMTry,nTry,dataset))
- status,out = ddm.DQ2.main('listDatasets',dataset,0,True)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s bad DQ2 response for %s' % (self.token,dataset))
- return False,{}
- try:
- # convert res to map
- exec "tmpDatasets = %s" % out
- # remove _sub/_dis
- resList = []
- for tmpDS in tmpDatasets.keys():
- if re.search('(_sub|_dis)\d+$',tmpDS) == None and re.search('_shadow$',tmpDS) == None:
- resList.append(tmpDS)
- _logger.debug('%s getListDatasets->%s' % (self.token,str(resList)))
- return True,resList
- except:
- _logger.error(self.token+' '+out)
- _logger.error('%s could not convert HTTP-res to datasets for %s' % (self.token,dataset))
- return False,{}
-
-
- # get list of replicas for a dataset
- def getListDatasetReplicas(self,dataset):
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug("%s %s/%s listDatasetReplicas %s" % (self.token,iDDMTry,nTry,dataset))
- status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s bad DQ2 response for %s' % (self.token,dataset))
- return False,{}
- try:
- # convert res to map
- exec "tmpRepSites = %s" % out
- _logger.debug('%s getListDatasetReplicas->%s' % (self.token,str(tmpRepSites)))
- return True,tmpRepSites
- except:
- _logger.error(self.token+' '+out)
- _logger.error('%s could not convert HTTP-res to replica map for %s' % (self.token,dataset))
- return False,{}
-
-
- # get replicas for a container
- def getListDatasetReplicasInContainer(self,container):
- # response for failure
- resForFailure = False,{}
- # get datasets in container
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug('%s %s/%s listDatasetsInContainer %s' % (self.token,iDDMTry,nTry,container))
- status,out = ddm.DQ2.main('listDatasetsInContainer',container)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s bad DQ2 response for %s' % (self.token,container))
- return resForFailure
- datasets = []
- try:
- # convert to list
- exec "datasets = %s" % out
- except:
- _logger.error('%s could not convert HTTP-res to dataset list for %s' % (self.token,container))
- return resForFailure
- # loop over all datasets
- allRepMap = {}
- for dataset in datasets:
- # get replicas
- status,tmpRepSites = self.getListDatasetReplicas(dataset)
- if not status:
- return resForFailure
- # append
- allRepMap[dataset] = tmpRepSites
- # return
- _logger.debug('%s getListDatasetReplicasInContainer done' % self.token)
- return True,allRepMap
-
-
- # delete original locations
- def deleteDatasetReplicas(self,datasets):
- # loop over all datasets
- for dataset in datasets:
- # get locations
- status,tmpRepSites = self.getListDatasetReplicas(dataset)
- if not status:
- return False
- # no replicas
- if len(tmpRepSites.keys()) == 0:
- continue
- # delete
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug("%s %s/%s deleteDatasetReplicas %s" % (self.token,iDDMTry,nTry,dataset))
- status,out = ddm.DQ2.main('deleteDatasetReplicas',dataset,tmpRepSites.keys())
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s bad DQ2 response for %s' % (self.token,dataset))
- return False
- _logger.debug(self.token+' '+out)
- # return
- _logger.debug('%s deleted replicas for %s' % (self.token,str(datasets)))
- return True
-
-
- # check if datasets are empty
- def checkDatasetContents(self,datasets):
- # loop over all datasets
- for dataset in datasets:
- # check
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug("%s %s/%s getNumberOfFiles %s" % (self.token,iDDMTry,nTry,dataset))
- status,out = ddm.DQ2.main('getNumberOfFiles',dataset)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s bad DQ2 response for %s' % (self.token,dataset))
- return False
- # convert to int
- _logger.debug(self.token+' '+out)
- try:
- nFile = int(out)
- # not empty
- if nFile != 0:
- _logger.error('%s %s is not empty' % (self.token,dataset))
- return False
- except:
- _logger.error("%s could not convert HTTP-res to nFiles" % (self.token,dataset))
- return False
- # all OK
- return True
-
-
- # register dataset
- def registerNewDataset(self,dataset,container=''):
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug("%s %s/%s registerNewDataset %s" % (self.token,iDDMTry,nTry,dataset))
- status,out = ddm.DQ2.main('registerNewDataset',dataset)
- if out.find('DQDatasetExistsException') != -1:
- break
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if out.find('DQDatasetExistsException') != -1:
- # ignore DQDatasetExistsException
- pass
- elif status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s failed to register new dataset %s' % (self.token,dataset))
- return False
- # remove /CN=proxy and /CN=limited from DN
- tmpRealDN = self.job.prodUserID
- tmpRealDN = re.sub('/CN=limited proxy','',tmpRealDN)
- tmpRealDN = re.sub('/CN=proxy','',tmpRealDN)
- status,out = dq2Common.parse_dn(tmpRealDN)
- if status != 0:
- _logger.error(self.token+' '+out)
- _logger.error('%s failed to truncate DN:%s' % (self.token,self.job.prodUserID))
- return False
- tmpRealDN = out
- # set owner
- for iDDMTry in range(nTry):
- _logger.debug("%s %s/%s setMetaDataAttribute %s %s" % (self.token,iDDMTry,nTry,dataset,tmpRealDN))
- status,out = ddm.DQ2.main('setMetaDataAttribute',dataset,'owner',tmpRealDN)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s failed to set owner to dataset %s' % (self.token,dataset))
- return False
- # add to container
- if container != '' and container.endswith('/'):
- for iDDMTry in range(nTry):
- _logger.debug("%s %s/%s registerDatasetsInContainer %s to %s" % (self.token,iDDMTry,nTry,dataset,container))
- status,out = ddm.DQ2.main('registerDatasetsInContainer',container,[dataset])
- if out.find('DQContainerAlreadyHasDataset') != -1:
- break
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if out.find('DQContainerAlreadyHasDataset') != -1:
- # ignore DQContainerAlreadyHasDataset
- pass
- elif status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s failed to add %s to container:%s' % (self.token,dataset,container))
- return False
- # return
- return True
-
-
- # get list of dataset used by the job
- def getListDatasetsUsedByJob(self,mapDsLFN):
- # response for failure
- resForFailure = False,[]
- # loop over all datasets
- retList = []
- for tmpDsContainer,tmpLFNs in mapDsLFN.iteritems():
- # not a container
- if not tmpDsContainer.endswith('/'):
- if not tmpDsContainer in retList:
- retList.append(tmpDsContainer)
- continue
- # get datasets in container
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug('%s %s/%s listDatasetsInContainer %s' % (self.token,iDDMTry,nTry,tmpDsContainer))
- status,out = ddm.DQ2.main('listDatasetsInContainer',tmpDsContainer)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s bad DQ2 response for %s' % (self.token,tmpDsContainer))
- return resForFailure
- tmpDatasets = []
- try:
- # convert to list
- exec "tmpDatasets = %s" % out
- except:
- _logger.error('%s could not convert HTTP-res to dataset list for %s' % (self.token,tmpDsContainer))
- return resForFailure
- # get files in dataset
- for tmpDS in tmpDatasets:
- if tmpDS in retList:
- continue
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug('%s %s/%s listFilesInDataset %s' % (self.token,iDDMTry,nTry,tmpDS))
- status,out = ddm.DQ2.main('listFilesInDataset',tmpDS)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- if status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s bad DQ2 response for %s' % (self.token,tmpDS))
- return resForFailure
- # get LFN map
- tmpMapDQ2 = {}
- try:
- # convert to list
- exec "tmpMapDQ2 = %s[0]" % out
- for tmpGUID,tmpVal in tmpMapDQ2.iteritems():
- # check if a file in DS is used by the job
- if tmpVal['lfn'] in tmpLFNs:
- # append
- if not tmpDS in retList:
- retList.append(tmpDS)
- break
- except:
- _logger.error('%s could not convert HTTP-res to LFN map for %s' % (self.token,tmpDS))
- return resForFailure
- # return
- _logger.debug('%s getListDatasetsUsedByJob done %s' % (self.token,str(retList)))
- return True,retList
-
-
- # refresh replica info if needed
- def refreshReplicaInfo(self,unknownSites):
- for tmpDS,sites in unknownSites.iteritems():
- nTry = 3
- for iDDMTry in range(nTry):
- _logger.debug("%s %s/%s listFileReplicasBySites %s %s" % (self.token,iDDMTry,nTry,tmpDS,str(sites)))
- status,out = ddm.DQ2_iter.listFileReplicasBySites(tmpDS,0,sites,0,300)
- if status != 0 or (not self.isDQ2ok(out)):
- time.sleep(60)
- else:
- break
- # result
- if status != 0 or out.startswith('Error'):
- _logger.error(self.token+' '+out)
- _logger.error('%s bad DQ2 response for %s' % (self.token,tmpDS))
- # return
- return True
-
-
- # check rev to avoid too many rebrokerages
- def checkRev(self):
- # check specialHandling
- if self.job.specialHandling in [None,'NULL','']:
- revNum = 0
- else:
- revNum = self.job.specialHandling.split(',').count('rebro')
- revNum += self.job.specialHandling.split(',').count('sretry')
- # check with limit
- if revNum < 5:
- return True
- return False
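# How checkRev() counts earlier attempts, shown standalone; the specialHandling
# string is illustrative.
specialHandling = 'rebro,sretry,rebro'
revNum = specialHandling.split(',').count('rebro') + specialHandling.split(',').count('sretry')
# revNum == 3, still below the limit of 5, so another rebrokerage would be allowed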
-
-
- # make buildJob for re-brokerage
- def makeNewBuildJobForRebrokerage(self,buildJob):
- # new libDS
- oldLibDS = buildJob.destinationDBlock
- match = re.search('_rev(\d+)$',oldLibDS)
- if match == None:
- newLibDS = oldLibDS + '__id%s_rev%s' % (self.job.jobDefinitionID,self.revNum)
- else:
- newLibDS = re.sub('_rev(\d+)$','_rev%s' % self.revNum,oldLibDS)
- # reset parameters
- buildJob.PandaID = None
- buildJob.jobStatus = None
- buildJob.commandToPilot = None
- buildJob.schedulerID = None
- buildJob.pilotID = None
- for attr in buildJob._attributes:
- if attr.endswith('ErrorCode') or attr.endswith('ErrorDiag'):
- setattr(buildJob,attr,None)
- buildJob.transExitCode = None
- buildJob.creationTime = datetime.datetime.utcnow()
- buildJob.modificationTime = buildJob.creationTime
- buildJob.startTime = None
- buildJob.endTime = None
- buildJob.destinationDBlock = newLibDS
- buildJob.jobParameters = re.sub(oldLibDS,newLibDS,buildJob.jobParameters)
- for tmpFile in buildJob.Files:
- tmpFile.row_ID = None
- tmpFile.GUID = None
- tmpFile.status = 'unknown'
- tmpFile.PandaID = None
- tmpFile.dataset = newLibDS
- tmpFile.destinationDBlock = tmpFile.dataset
- tmpFile.lfn = re.sub(oldLibDS,newLibDS,tmpFile.lfn)
- return buildJob,oldLibDS,newLibDS
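# How makeNewBuildJobForRebrokerage() renames the lib dataset, shown standalone;
# the dataset name, JobID and revision number are illustrative.
import re
oldLibDS = 'user.someuser.0101.lib._000123'
jobDefinitionID, revNum = 42, 2
if re.search('_rev(\d+)$', oldLibDS) is None:
    newLibDS = oldLibDS + '__id%s_rev%s' % (jobDefinitionID, revNum)   # first rebrokerage
else:
    newLibDS = re.sub('_rev(\d+)$', '_rev%s' % revNum, oldLibDS)       # later rebrokerages
# newLibDS == 'user.someuser.0101.lib._000123__id42_rev2'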
diff --git a/current/pandaserver/userinterface/UserIF.py b/current/pandaserver/userinterface/UserIF.py
deleted file mode 100755
index 31aa5cc0c..000000000
--- a/current/pandaserver/userinterface/UserIF.py
+++ /dev/null
@@ -1,1570 +0,0 @@
-'''
-provide web interface to users
-
-'''
-
-import re
-import sys
-import time
-import types
-import cPickle as pickle
-import jobdispatcher.Protocol as Protocol
-import brokerage.broker
-import taskbuffer.ProcessGroups
-from config import panda_config
-from taskbuffer.JobSpec import JobSpec
-from taskbuffer.WrappedPickle import WrappedPickle
-from brokerage.SiteMapper import SiteMapper
-from pandalogger.PandaLogger import PandaLogger
-from RbLauncher import RbLauncher
-from ReBroker import ReBroker
-from taskbuffer import PrioUtil
-from dataservice.DDM import dq2Info
-
-# logger
-_logger = PandaLogger().getLogger('UserIF')
-
-
-# main class
-class UserIF:
- # constructor
- def __init__(self):
- self.taskBuffer = None
-
-
- # initialize
- def init(self,taskBuffer):
- self.taskBuffer = taskBuffer
-
-
- # submit jobs
- def submitJobs(self,jobsStr,user,host,userFQANs,prodRole=False,toPending=False):
- try:
- # deserialize jobspecs
- jobs = WrappedPickle.loads(jobsStr)
- _logger.debug("submitJobs %s len:%s FQAN:%s" % (user,len(jobs),str(userFQANs)))
- maxJobs = 5000
- if len(jobs) > maxJobs:
- _logger.error("too may jobs more than %s" % maxJobs)
- jobs = jobs[:maxJobs]
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("submitJobs : %s %s" % (type,value))
- jobs = []
- # check prodSourceLabel
- try:
- goodProdSourceLabel = True
- for tmpJob in jobs:
- # prevent internal jobs from being submitted from outside
- if tmpJob.prodSourceLabel in taskbuffer.ProcessGroups.internalSourceLabels:
- _logger.error("submitJobs %s wrong prodSourceLabel=%s" % (user,tmpJob.prodSourceLabel))
- goodProdSourceLabel = False
- break
- # check production role
- if tmpJob.prodSourceLabel in ['managed']:
- if not prodRole:
- _logger.error("submitJobs %s missing prod-role for prodSourceLabel=%s" % (user,tmpJob.prodSourceLabel))
- goodProdSourceLabel = False
- break
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("submitJobs : checking goodProdSourceLabel %s %s" % (errType,errValue))
- goodProdSourceLabel = False
- # reject injection for bad prodSourceLabel
- if not goodProdSourceLabel:
- return "ERROR: production role is required for production jobs"
- # store jobs
- ret = self.taskBuffer.storeJobs(jobs,user,forkSetupper=True,fqans=userFQANs,
- hostname=host,toPending=toPending)
- _logger.debug("submitJobs %s ->:%s" % (user,len(ret)))
- # serialize
- return pickle.dumps(ret)
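# A sketch of what a caller passes to submitJobs() above: a pickled list of JobSpec
# objects (at most 5000 per call), using the module-level userIF singleton defined at
# the bottom of this file. The attribute values, DN, host and taskBuffer are
# assumptions for illustration only.
import cPickle as pickle
from taskbuffer.JobSpec import JobSpec
userIF.init(taskBuffer)                         # taskBuffer: an initialised TaskBuffer (assumed)
job = JobSpec()
job.prodSourceLabel = 'user'
job.computingSite = 'ANALY_SOMESITE'
retStr = userIF.submitJobs(pickle.dumps([job]), 'some DN', 'some.host', [])
ret = pickle.loads(retStr)                      # one entry per submitted job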
-
-
- # logger interface
- def sendLogInfo(self,user,msgType,msgListStr):
- try:
- # deserialize message
- msgList = WrappedPickle.loads(msgListStr)
- # short user name
- cUID = self.taskBuffer.cleanUserID(user)
- # logging
- iMsg = 0
- for msgBody in msgList:
- # make message
- message = "dn='%s' %s" % (cUID,msgBody)
- # send message to logger
- if msgType in ['analy_brokerage']:
- brokerage.broker.sendMsgToLogger(message)
- # get logger
- _pandaLogger = PandaLogger()
- _pandaLogger.lock()
- _pandaLogger.setParams({'Type':msgType})
- logger = _pandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- logger.info(message)
- # release HTTP handler
- _pandaLogger.release()
- # sleep
- iMsg += 1
- if iMsg % 5 == 0:
- time.sleep(1)
- except:
- pass
- # return
- return True
-
-
- # run task assignment
- def runTaskAssignment(self,jobsStr):
- try:
- # deserialize jobspecs
- jobs = WrappedPickle.loads(jobsStr)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("runTaskAssignment : %s %s" % (type,value))
- jobs = []
- # run
- ret = self.taskBuffer.runTaskAssignment(jobs)
- # serialize
- return pickle.dumps(ret)
-
-
- # get serial number for group job
- def getSerialNumberForGroupJob(self,name):
- # get
- ret = self.taskBuffer.getSerialNumberForGroupJob(name)
- # serialize
- return pickle.dumps(ret)
-
-
- # change job priorities
- def changeJobPriorities(self,user,prodRole,newPrioMapStr):
- # check production role
- if not prodRole:
- return False,"production role is required"
- try:
- # deserialize map
- newPrioMap = WrappedPickle.loads(newPrioMapStr)
- _logger.debug("changeJobPriorities %s : %s" % (user,str(newPrioMap)))
- # change
- ret = self.taskBuffer.changeJobPriorities(newPrioMap)
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("changeJobPriorities : %s %s" % (errType,errValue))
- return False,'internal server error'
- # serialize
- return ret
-
-
- # run rebrokerage
- def runReBrokerage(self,dn,jobID,cloud,excludedSite,forceRebro):
- returnVal = "True"
- try:
- # lock job in simulation mode to check
- checker = ReBroker(self.taskBuffer,simulation=True,userRequest=True)
- stLock,retLock = checker.lockJob(dn,jobID)
- # failed
- if not stLock:
- returnVal = "ERROR: "+retLock
- return returnVal
- # continue to run rebrokerage in background
- if excludedSite in [None,'']:
- # use None for empty excludedSite
- excludedSite = None
- _logger.debug("runReBrokerage %s JobID:%s cloud=%s ex=%s forceOpt=%s" % (dn,jobID,cloud,str(excludedSite),forceRebro))
- # instantiate RbLauncher
- thr = RbLauncher(dn,jobID,cloud,excludedSite)
- # start ReBroker
- thr.start()
- except:
- errType,errValue,errTraceBack = sys.exc_info()
- _logger.error("runReBrokerage: %s %s" % (errType,errValue))
- returnVal = "ERROR: runReBrokerage crashed"
- # return
- return returnVal
-
-
- # retry failed subjobs in running job
- def retryFailedJobsInActive(self,dn,jobID):
- returnVal = False
- try:
- _logger.debug("retryFailedJobsInActive %s JobID:%s" % (dn,jobID))
- cUID = self.taskBuffer.cleanUserID(dn)
- # retry failed subjobs in the active job
- tmpRet = self.taskBuffer.retryJobsInActive(cUID,jobID)
- returnVal = True
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("retryFailedJobsInActive: %s %s" % (errType,errValue))
- returnVal = "ERROR: server side crash"
- # return
- return returnVal
-
-
- # set debug mode
- def setDebugMode(self,dn,pandaID,prodManager,modeOn):
- ret = self.taskBuffer.setDebugMode(dn,pandaID,prodManager,modeOn)
- # return
- return ret
-
-
- # insert sandbox file info
- def insertSandboxFileInfo(self,userName,hostName,fileName,fileSize,checkSum):
- ret = self.taskBuffer.insertSandboxFileInfo(userName,hostName,fileName,fileSize,checkSum)
- # return
- return ret
-
-
- # check duplicated sandbox file
- def checkSandboxFile(self,userName,fileSize,checkSum):
- ret = self.taskBuffer.checkSandboxFile(userName,fileSize,checkSum)
- # return
- return ret
-
-
- # get job status
- def getJobStatus(self,idsStr):
- try:
- # deserialize jobspecs
- ids = WrappedPickle.loads(idsStr)
- _logger.debug("getJobStatus len : %s" % len(ids))
- maxIDs = 5500
- if len(ids) > maxIDs:
- _logger.error("too long ID list more than %s" % maxIDs)
- ids = ids[:maxIDs]
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getJobStatus : %s %s" % (type,value))
- ids = []
- _logger.debug("getJobStatus start : %s" % ids)
- # peek jobs
- ret = self.taskBuffer.peekJobs(ids)
- _logger.debug("getJobStatus end")
- # serialize
- return pickle.dumps(ret)
-
-
- # get PandaID with jobexeID
- def getPandaIDwithJobExeID(self,idsStr):
- try:
- # deserialize jobspecs
- ids = WrappedPickle.loads(idsStr)
- _logger.debug("getPandaIDwithJobExeID len : %s" % len(ids))
- maxIDs = 5500
- if len(ids) > maxIDs:
- _logger.error("too long ID list more than %s" % maxIDs)
- ids = ids[:maxIDs]
- except:
- errtype,errvalue = sys.exc_info()[:2]
- _logger.error("getPandaIDwithJobExeID : %s %s" % (errtype,errvalue))
- ids = []
- _logger.debug("getPandaIDwithJobExeID start : %s" % ids)
- # peek jobs
- ret = self.taskBuffer.getPandaIDwithJobExeID(ids)
- _logger.debug("getPandaIDwithJobExeID end")
- # serialize
- return pickle.dumps(ret)
-
-
- # get assigned cloud for tasks
- def seeCloudTask(self,idsStr):
- try:
- # deserialize jobspecs
- ids = WrappedPickle.loads(idsStr)
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("seeCloudTask : %s %s" % (type,value))
- ids = []
- _logger.debug("seeCloudTask start : %s" % ids)
- # peek jobs
- ret = {}
- for id in ids:
- tmpRet = self.taskBuffer.seeCloudTask(id)
- ret[id] = tmpRet
- _logger.debug("seeCloudTask end")
- # serialize
- return pickle.dumps(ret)
-
-
- # get active datasets
- def getActiveDatasets(self,computingSite,prodSourceLabel):
- # run
- ret = self.taskBuffer.getActiveDatasets(computingSite,prodSourceLabel)
- # return
- return ret
-
-
- # get assigning task
- def getAssigningTask(self):
- # run
- ret = self.taskBuffer.getAssigningTask()
- # serialize
- return pickle.dumps(ret)
-
-
- # set task by user
- def setCloudTaskByUser(self,user,tid,cloud,status):
- # run
- ret = self.taskBuffer.setCloudTaskByUser(user,tid,cloud,status)
- return ret
-
-
- # add files to memcached
- def addFilesToMemcached(self,site,node,files):
- # add
- ret = self.taskBuffer.addFilesToMemcached(site,node,files)
- # return
- return ret
-
-
- # delete files from memcached
- def deleteFilesFromMemcached(self,site,node,files):
- # delete
- ret = self.taskBuffer.deleteFilesFromMemcached(site,node,files)
- # return
- return ret
-
-
- # flush memcached
- def flushMemcached(self,site,node):
- # flush
- ret = self.taskBuffer.flushMemcached(site,node)
- # return
- return ret
-
-
- # check files with memcached
- def checkFilesWithMemcached(self,site,node,files):
- # check
- ret = self.taskBuffer.checkFilesWithMemcached(site,node,files)
- # return
- return ret
-
-
- # get job statistics
- def getJobStatistics(self,sourcetype=None):
- # get job statistics
- ret = self.taskBuffer.getJobStatisticsForExtIF(sourcetype)
- # serialize
- return pickle.dumps(ret)
-
-
- # get highest prio jobs
- def getHighestPrioJobStat(self,perPG=False,useMorePG=False):
- # get job statistics
- ret = self.taskBuffer.getHighestPrioJobStat(perPG,useMorePG)
- # serialize
- return pickle.dumps(ret)
-
-
- # get queued analysis jobs at a site
- def getQueuedAnalJobs(self,site,dn):
- # get job statistics
- ret = self.taskBuffer.getQueuedAnalJobs(site,dn)
- # serialize
- return pickle.dumps(ret)
-
-
- # get job statistics for Bamboo
- def getJobStatisticsForBamboo(self,useMorePG=False):
- # get job statistics
- ret = self.taskBuffer.getJobStatisticsForBamboo(useMorePG)
- # serialize
- return pickle.dumps(ret)
-
-
- # get job statistics per site
- def getJobStatisticsPerSite(self,predefined=False,workingGroup='',countryGroup='',jobType='',
- minPriority=None,readArchived=True):
- # get job statistics
- ret = self.taskBuffer.getJobStatistics(readArchived,predefined,workingGroup,countryGroup,jobType,
- minPriority=minPriority)
- # serialize
- return pickle.dumps(ret)
-
-
- # get the number of waiting jobs per site and user
- def getJobStatisticsPerUserSite(self):
- # get job statistics
- ret = self.taskBuffer.getJobStatisticsPerUserSite()
- # serialize
- return pickle.dumps(ret)
-
-
- # get job statistics per site with label
- def getJobStatisticsWithLabel(self,site):
- # get job statistics
- ret = self.taskBuffer.getJobStatisticsWithLabel(site)
- # serialize
- return pickle.dumps(ret)
-
-
- # query PandaIDs
- def queryPandaIDs(self,idsStr):
- # deserialize IDs
- ids = WrappedPickle.loads(idsStr)
- # query PandaIDs
- ret = self.taskBuffer.queryPandaIDs(ids)
- # serialize
- return pickle.dumps(ret)
-
-
- # get number of analysis jobs per user
- def getNUserJobs(self,siteName,nJobs):
- # get
- ret = self.taskBuffer.getNUserJobs(siteName,nJobs)
- # serialize
- return pickle.dumps(ret)
-
-
- # query job info per cloud
- def queryJobInfoPerCloud(self,cloud,schedulerID):
- # query PandaIDs
- ret = self.taskBuffer.queryJobInfoPerCloud(cloud,schedulerID)
- # serialize
- return pickle.dumps(ret)
-
-
- # query PandaIDs at site
- def getPandaIDsSite(self,site,status,limit):
- # query PandaIDs
- ret = self.taskBuffer.getPandaIDsSite(site,status,limit)
- # serialize
- return pickle.dumps(ret)
-
-
- # get PandaIDs to be updated in prodDB
- def getJobsToBeUpdated(self,limit,lockedby):
- # query PandaIDs
- ret = self.taskBuffer.getPandaIDsForProdDB(limit,lockedby)
- # serialize
- return pickle.dumps(ret)
-
-
- # update prodDBUpdateTimes
- def updateProdDBUpdateTimes(self,paramsStr):
- # deserialize IDs
- params = WrappedPickle.loads(paramsStr)
- # get jobs
- ret = self.taskBuffer.updateProdDBUpdateTimes(params)
- # serialize
- return pickle.dumps(True)
-
-
- # query last files in datasets
- def queryLastFilesInDataset(self,datasetStr):
- # deserialize names
- datasets = WrappedPickle.loads(datasetStr)
- # get files
- ret = self.taskBuffer.queryLastFilesInDataset(datasets)
- # serialize
- return pickle.dumps(ret)
-
-
- # get input files currently in used for analysis
- def getFilesInUseForAnal(self,outDataset):
- # get files
- ret = self.taskBuffer.getFilesInUseForAnal(outDataset)
- # serialize
- return pickle.dumps(ret)
-
-
- # get list of dis dataset to get input files in shadow
- def getDisInUseForAnal(self,outDataset):
- # get files
- ret = self.taskBuffer.getDisInUseForAnal(outDataset)
- # serialize
- return pickle.dumps(ret)
-
-
- # get input LFNs currently in use for analysis with shadow dis
- def getLFNsInUseForAnal(self,inputDisListStr):
- # deserialize IDs
- inputDisList = WrappedPickle.loads(inputDisListStr)
- # get files
- ret = self.taskBuffer.getLFNsInUseForAnal(inputDisList)
- # serialize
- return pickle.dumps(ret)
-
-
- # kill jobs
- def killJobs(self,idsStr,user,host,code,prodManager,useMailAsID,fqans):
- # deserialize IDs
- ids = WrappedPickle.loads(idsStr)
- if not isinstance(ids,types.ListType):
- ids = [ids]
- _logger.debug("killJob : %s %s %s %s %s" % (user,code,prodManager,fqans,ids))
- try:
- if useMailAsID:
- _logger.debug("killJob : getting mail address for %s" % user)
- realDN = re.sub('/CN=limited proxy','',user)
- realDN = re.sub('(/CN=proxy)+','',realDN)
- nTry = 3
- for iDDMTry in range(nTry):
- status,out = dq2Info.finger(realDN)
- if status == 0:
- exec "userInfo=%s" % out
- _logger.debug("killJob : %s is converted to %s" % (user,userInfo['email']))
- user = userInfo['email']
- break
- time.sleep(1)
- except:
- errType,errValue = sys.exc_info()[:2]
- _logger.error("killJob : failed to convert email address %s : %s %s" % (user,errType,errValue))
- # get working groups with prod role
- wgProdRole = []
- for fqan in fqans:
- tmpMatch = re.search('/atlas/([^/]+)/Role=production',fqan)
- if tmpMatch != None:
- # ignore usatlas since it is used as atlas prod role
- tmpWG = tmpMatch.group(1)
- if not tmpWG in ['','usatlas']+wgProdRole:
- wgProdRole.append(tmpWG)
- # group production
- wgProdRole.append('gr_%s' % tmpWG)
- # kill jobs
- ret = self.taskBuffer.killJobs(ids,user,code,prodManager,wgProdRole)
- # logging
- try:
- # make message
- message = '%s - PandaID =' % host
- maxID = 10
- for id in ids[:maxID]:
- message += ' %s' % id
- if len(ids) > maxID:
- message += ' ...'
- # get logger
- _pandaLogger = PandaLogger()
- _pandaLogger.lock()
- _pandaLogger.setParams({'Type':'killJobs','User':user})
- logger = _pandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- logger.info(message)
- # release HTTP handler
- _pandaLogger.release()
- except:
- pass
- # serialize
- return pickle.dumps(ret)
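# How killJobs() above extracts working groups with production role from FQANs,
# shown standalone; the FQAN strings are illustrative.
import re
fqans = ['/atlas/de/Role=production/Capability=NULL',
         '/atlas/usatlas/Role=production/Capability=NULL']
wgProdRole = []
for fqan in fqans:
    tmpMatch = re.search('/atlas/([^/]+)/Role=production', fqan)
    if tmpMatch is not None:
        tmpWG = tmpMatch.group(1)
        if tmpWG not in ['', 'usatlas'] + wgProdRole:
            wgProdRole.append(tmpWG)
            wgProdRole.append('gr_%s' % tmpWG)
# wgProdRole == ['de', 'gr_de']; 'usatlas' is skipped since it stands for the ATLAS-wide prod role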
-
-
- # reassign jobs
- def reassignJobs(self,idsStr,user,host,forPending):
- # deserialize IDs
- ids = WrappedPickle.loads(idsStr)
- # reassign jobs
- ret = self.taskBuffer.reassignJobs(ids,forkSetupper=True,forPending=forPending)
- # logging
- try:
- # make message
- message = '%s - PandaID =' % host
- maxID = 10
- for id in ids[:maxID]:
- message += ' %s' % id
- if len(ids) > maxID:
- message += ' ...'
- # get logger
- _pandaLogger = PandaLogger()
- _pandaLogger.lock()
- _pandaLogger.setParams({'Type':'reassignJobs','User':user})
- logger = _pandaLogger.getHttpLogger(panda_config.loggername)
- # add message
- logger.info(message)
- # release HTTP handler
- _pandaLogger.release()
- except:
- pass
- # serialize
- return pickle.dumps(ret)
-
-
- # resubmit jobs
- def resubmitJobs(self,idsStr):
- # deserialize IDs
- ids = WrappedPickle.loads(idsStr)
- # kill jobs
- ret = self.taskBuffer.resubmitJobs(ids)
- # serialize
- return pickle.dumps(ret)
-
-
- # get list of site spec
- def getSiteSpecs(self,siteType='analysis'):
- # get analysis site list
- specList = {}
- siteMapper = SiteMapper(self.taskBuffer)
- for id,spec in siteMapper.siteSpecList.iteritems():
- if siteType == 'all' or spec.type == siteType:
- # convert to map
- tmpSpec = {}
- for attr in spec._attributes:
- tmpSpec[attr] = getattr(spec,attr)
- specList[id] = tmpSpec
- # serialize
- return pickle.dumps(specList)
-
-
- # get list of cloud spec
- def getCloudSpecs(self):
- # get cloud list
- siteMapper = SiteMapper(self.taskBuffer)
- # serialize
- return pickle.dumps(siteMapper.cloudSpec)
-
-
- # get list of cache prefix
- def getCachePrefixes(self):
- # get
- ret = self.taskBuffer.getCachePrefixes()
- # serialize
- return pickle.dumps(ret)
-
-
- # get nPilots
- def getNumPilots(self):
- # get nPilots
- ret = self.taskBuffer.getCurrentSiteData()
- numMap = {}
- for siteID,siteNumMap in ret.iteritems():
- nPilots = 0
- # nPilots = getJob+updateJob
- if siteNumMap.has_key('getJob'):
- nPilots += siteNumMap['getJob']
- if siteNumMap.has_key('updateJob'):
- nPilots += siteNumMap['updateJob']
- # append
- numMap[siteID] = {'nPilots':nPilots}
- # serialize
- return pickle.dumps(numMap)
-
-
- # run brokerage
- def runBrokerage(self,sitesStr,cmtConfig,atlasRelease,trustIS=False,processingType=None,
- dn=None,loggingFlag=False,memorySize=None,workingGroup=None,fqans=[],
- nJobs=None,preferHomeCountry=False,siteReliability=None,maxCpuCount=None):
- if not loggingFlag:
- ret = 'NULL'
- else:
- ret = {'site':'NULL','logInfo':[]}
- try:
- # deserialize sites
- sites = WrappedPickle.loads(sitesStr)
- # instantiate siteMapper
- siteMapper = SiteMapper(self.taskBuffer)
- # instantiate job
- job = JobSpec()
- job.AtlasRelease = atlasRelease
- job.cmtConfig = cmtConfig
- if processingType != None:
- job.processingType = processingType
- if memorySize != None:
- job.minRamCount = memorySize
- if workingGroup != None:
- userDefinedWG = True
- validWorkingGroup = True
- job.workingGroup = workingGroup
- else:
- userDefinedWG = False
- validWorkingGroup = False
- if maxCpuCount != None:
- job.maxCpuCount = maxCpuCount
- # get parameters related to priority
- withProdRole,workingGroup,priorityOffset,serNum,weight = self.taskBuffer.getPrioParameters([job],dn,fqans,
- userDefinedWG,
- validWorkingGroup)
- # get min priority using nJobs
- try:
- nJobs = long(nJobs)
- except:
- # use 200 as a default # of jobs
- nJobs = 200
- minPrio = PrioUtil.calculatePriority(priorityOffset,serNum+nJobs,weight)
- # get countryGroup
- prefCountries = []
- if preferHomeCountry:
- for tmpFQAN in fqans:
- match = re.search('^/atlas/([^/]+)/',tmpFQAN)
- if match != None:
- tmpCountry = match.group(1)
- # use country code or usatlas
- if len(tmpCountry) == 2:
- prefCountries.append(tmpCountry)
- break
- # usatlas
- if tmpCountry in ['usatlas']:
- prefCountries.append('us')
- break
- # run brokerage
- _logger.debug("runBrokerage for dn=%s FQAN=%s minPrio=%s preferred:%s:%s" % (dn,str(fqans),minPrio,
- preferHomeCountry,
- str(prefCountries)))
- brokerage.broker.schedule([job],self.taskBuffer,siteMapper,True,sites,trustIS,dn,
- reportLog=loggingFlag,minPriority=minPrio,preferredCountries=prefCountries,
- siteReliability=siteReliability)
- # get computingSite
- if not loggingFlag:
- ret = job.computingSite
- else:
- ret = pickle.dumps({'site':job.computingSite,'logInfo':job.brokerageErrorDiag})
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("runBrokerage : %s %s" % (type,value))
- return ret
-
-
- # get script for offline running
- def getScriptOfflineRunning(self,pandaID):
- # get script
- ret = self.taskBuffer.getScriptOfflineRunning(pandaID)
- # return
- return ret
-
-
- # register proxy key
- def registerProxyKey(self,params):
- # register
- ret = self.taskBuffer.registerProxyKey(params)
- # return
- return ret
-
-
- # get client version
- def getPandaClientVer(self):
- # get
- ret = self.taskBuffer.getPandaClientVer()
- # return
- return ret
-
-
- # get proxy key
- def getProxyKey(self,dn):
- # get proxy key
- ret = self.taskBuffer.getProxyKey(dn)
- # serialize
- return pickle.dumps(ret)
-
-
- # get slimmed file info with PandaIDs
- def getSlimmedFileInfoPandaIDs(self,pandaIDsStr,dn):
- try:
- # deserialize IDs
- pandaIDs = WrappedPickle.loads(pandaIDsStr)
- # truncate
- maxIDs = 5500
- if len(pandaIDs) > maxIDs:
- _logger.error("too long ID list more than %s" % maxIDs)
- pandaIDs = pandaIDs[:maxIDs]
- # get
- _logger.debug("getSlimmedFileInfoPandaIDs start : %s %s" % (dn,len(pandaIDs)))
- ret = self.taskBuffer.getSlimmedFileInfoPandaIDs(pandaIDs)
- _logger.debug("getSlimmedFileInfoPandaIDs end")
- except:
- ret = {}
- # serialize
- return pickle.dumps(ret)
-
-
- # get JobIDs in a time range
- def getJobIDsInTimeRange(self,dn,timeRange):
- # get IDs
- ret = self.taskBuffer.getJobIDsInTimeRange(dn,timeRange)
- # serialize
- return pickle.dumps(ret)
-
-
- # get PandaIDs for a JobID
- def getPandIDsWithJobID(self,dn,jobID,nJobs):
- # get IDs
- ret = self.taskBuffer.getPandIDsWithJobID(dn,jobID,nJobs)
- # serialize
- return pickle.dumps(ret)
-
-
- # check merge job generation status
- def checkMergeGenerationStatus(self,dn,jobID):
- # check
- ret = self.taskBuffer.checkMergeGenerationStatus(dn,jobID)
- # serialize
- return pickle.dumps(ret)
-
-
- # get full job status
- def getFullJobStatus(self,idsStr,dn):
- try:
- # deserialize jobspecs
- ids = WrappedPickle.loads(idsStr)
- # truncate
- maxIDs = 5500
- if len(ids) > maxIDs:
- _logger.error("too long ID list more than %s" % maxIDs)
- ids = ids[:maxIDs]
- except:
- type, value, traceBack = sys.exc_info()
- _logger.error("getFullJobStatus : %s %s" % (type,value))
- ids = []
- _logger.debug("getFullJobStatus start : %s %s" % (dn,str(ids)))
- # peek jobs
- ret = self.taskBuffer.getFullJobStatus(ids)
- _logger.debug("getFullJobStatus end")
- # serialize
- return pickle.dumps(ret)
-
-
- # add account to siteaccess
- def addSiteAccess(self,siteID,dn):
- # add
- ret = self.taskBuffer.addSiteAccess(siteID,dn)
- # serialize
- return pickle.dumps(ret)
-
-
- # list site access
- def listSiteAccess(self,siteID,dn,longFormat=False):
- # list
- ret = self.taskBuffer.listSiteAccess(siteID,dn,longFormat)
- # serialize
- return pickle.dumps(ret)
-
-
- # update site access
- def updateSiteAccess(self,method,siteid,requesterDN,userName,attrValue):
- # update
- ret = self.taskBuffer.updateSiteAccess(method,siteid,requesterDN,userName,attrValue)
- # serialize
- return str(ret)
-
-
-# Singleton
-userIF = UserIF()
-del UserIF
-
-
-# get FQANs
-def _getFQAN(req):
- fqans = []
- for tmpKey,tmpVal in req.subprocess_env.iteritems():
- # compact credentials
- if tmpKey.startswith('GRST_CRED_'):
- # VOMS attribute
- if tmpVal.startswith('VOMS'):
- # FQAN
- fqan = tmpVal.split()[-1]
- # append
- fqans.append(fqan)
- # old style
- elif tmpKey.startswith('GRST_CONN_'):
- tmpItems = tmpVal.split(':')
- # FQAN
- if len(tmpItems)==2 and tmpItems[0]=='fqan':
- fqans.append(tmpItems[-1])
- # return
- return fqans
-
-
-# get DN
-def _getDN(req):
- realDN = ''
- if req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- realDN = req.subprocess_env['SSL_CLIENT_S_DN']
- # remove redundant CN
- realDN = re.sub('/CN=limited proxy','',realDN)
- realDN = re.sub('/CN=proxy(/CN=proxy)+','/CN=proxy',realDN)
- return realDN
-
-
-# check role
-def _isProdRoleATLAS(req):
- # check role
- prodManager = False
- # get FQANs
- fqans = _getFQAN(req)
- # loop over all FQANs
- for fqan in fqans:
- # check production role
- for rolePat in ['/atlas/usatlas/Role=production','/atlas/Role=production']:
- if fqan.startswith(rolePat):
- return True
- return False
-
-
-
-"""
-web service interface
-
-"""
-
-# security check
-def isSecure(req):
- # check security
- if not Protocol.isSecure(req):
- return False
- # disable limited proxy
- if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
- _logger.warning("access via limited proxy : %s" % req.subprocess_env['SSL_CLIENT_S_DN'])
- return False
- return True
-
-
-# submit jobs
-def submitJobs(req,jobs,toPending=None):
- # check security
- if not isSecure(req):
- return False
- # get DN
- user = None
- if req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- user = _getDN(req)
- # get FQAN
- fqans = _getFQAN(req)
- # hostname
- host = req.get_remote_host()
- # production Role
- prodRole = _isProdRoleATLAS(req)
- # to pending
- if toPending == 'True':
- toPending = True
- else:
- toPending = False
- return userIF.submitJobs(jobs,user,host,fqans,prodRole,toPending)
-
-
-# run task assignment
-def runTaskAssignment(req,jobs):
- # check security
- if not isSecure(req):
- return "False"
- return userIF.runTaskAssignment(jobs)
-
-
-# get job status
-def getJobStatus(req,ids):
- return userIF.getJobStatus(ids)
-
-
-# get PandaID with jobexeID
-def getPandaIDwithJobExeID(req,ids):
- return userIF.getPandaIDwithJobExeID(ids)
-
-
-# get queued analysis jobs at a site
-def getQueuedAnalJobs(req,site):
- # check security
- if not isSecure(req):
- return "ERROR: SSL is required"
- # get DN
- user = None
- if req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- user = _getDN(req)
- return userIF.getQueuedAnalJobs(site,user)
-
-
-# get active datasets
-def getActiveDatasets(req,computingSite,prodSourceLabel='managed'):
- return userIF.getActiveDatasets(computingSite,prodSourceLabel)
-
-
-# get assigning task
-def getAssigningTask(req):
- return userIF.getAssigningTask()
-
-
-# get assigned cloud for tasks
-def seeCloudTask(req,ids):
- return userIF.seeCloudTask(ids)
-
-
-# set task by user
-def setCloudTaskByUser(req,tid,cloud='',status=''):
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "ERROR: SSL connection is required"
- user = _getDN(req)
- # check role
- if not _isProdRoleATLAS(req):
- return "ERROR: production role is required"
- return userIF.setCloudTaskByUser(user,tid,cloud,status)
-
-
-# set debug mode
-def setDebugMode(req,pandaID,modeOn):
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "ERROR: SSL connection is required"
- user = _getDN(req)
- # check role
- prodManager = _isProdRoleATLAS(req)
- # mode
- if modeOn == 'True':
- modeOn = True
- else:
- modeOn = False
- # exec
- return userIF.setDebugMode(user,pandaID,prodManager,modeOn)
-
-
-# insert sandbox file info
-def insertSandboxFileInfo(req,userName,fileName,fileSize,checkSum):
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "ERROR: SSL connection is required"
- user = _getDN(req)
- # check role
- prodManager = _isProdRoleATLAS(req)
- if not prodManager:
- return "ERROR: missing role"
- # hostname
- hostName = req.get_remote_host()
- # exec
- return userIF.insertSandboxFileInfo(userName,hostName,fileName,fileSize,checkSum)
-
-
-# check duplicated sandbox file
-def checkSandboxFile(req,fileSize,checkSum):
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "ERROR: SSL connection is required"
- user = _getDN(req)
- # exec
- return userIF.checkSandboxFile(user,fileSize,checkSum)
-
-
-# add files to memcached
-def addFilesToCacheDB(req,site,node,guids='',lfns=''):
- # exec
- return userIF.addFilesToMemcached(site,node,lfns)
-
-
-# delete files from memcached
-def deleteFilesFromCacheDB(req,site,node,guids='',lfns=''):
- # exec
- return userIF.deleteFilesFromMemcached(site,node,lfns)
-
-
-# flush memcached
-def flushCacheDB(req,site,node):
- # exec
- return userIF.flushMemcached(site,node)
-
-
-# check files with memcached
-def checkFilesWithCacheDB(req,site,node,guids='',lfns=''):
- # exec
- return userIF.checkFilesWithMemcached(site,node,lfns)
-
-
-# query PandaIDs
-def queryPandaIDs(req,ids):
- return userIF.queryPandaIDs(ids)
-
-
-# query job info per cloud
-def queryJobInfoPerCloud(req,cloud,schedulerID=None):
- return userIF.queryJobInfoPerCloud(cloud,schedulerID)
-
-
-# get PandaIDs at site
-def getPandaIDsSite(req,site,status,limit=500):
- return userIF.getPandaIDsSite(site,status,limit)
-
-
-# get PandaIDs to be updated in prodDB
-def getJobsToBeUpdated(req,limit=5000,lockedby=''):
- limit = int(limit)
- return userIF.getJobsToBeUpdated(limit,lockedby)
-
-
-# update prodDBUpdateTimes
-def updateProdDBUpdateTimes(req,params):
- # check security
- if not isSecure(req):
- return False
- return userIF.updateProdDBUpdateTimes(params)
-
-
-# get job statistics
-def getJobStatistics(req,sourcetype=None):
- return userIF.getJobStatistics(sourcetype)
-
-
-# get highest prio jobs
-def getHighestPrioJobStat(req,perPG=None,useMorePG=None):
- if perPG == 'True':
- perPG = True
- else:
- perPG = False
- if useMorePG == 'True':
- useMorePG = taskbuffer.ProcessGroups.extensionLevel_1
- elif useMorePG in ['False',None]:
- useMorePG = False
- else:
- try:
- useMorePG = int(useMorePG)
- except:
- useMorePG = False
- return userIF.getHighestPrioJobStat(perPG,useMorePG)
-
-
-# get job statistics for Bamboo
-def getJobStatisticsForBamboo(req,useMorePG=None):
- if useMorePG == 'True':
- useMorePG = taskbuffer.ProcessGroups.extensionLevel_1
- elif useMorePG in ['False',None]:
- useMorePG = False
- else:
- try:
- useMorePG = int(useMorePG)
- except:
- useMorePG = False
- return userIF.getJobStatisticsForBamboo(useMorePG)
-
-
-# get the number of waiting jobs per site and user
-def getJobStatisticsPerUserSite(req):
- return userIF.getJobStatisticsPerUserSite()
-
-
-# get job statistics per site
-def getJobStatisticsPerSite(req,predefined='False',workingGroup='',countryGroup='',jobType='',
- minPriority=None,readArchived=None):
- if predefined=='True':
- predefined=True
- else:
- predefined=False
- if minPriority != None:
- try:
- minPriority = int(minPriority)
- except:
- minPriority = None
- if readArchived=='True':
- readArchived = True
- elif readArchived=='False':
- readArchived = False
- else:
- host = req.get_remote_host()
- # read jobsArchived for panglia
- if re.search('panglia.*\.triumf\.ca$',host) != None or host in ['gridweb.triumf.ca']:
- readArchived = True
- else:
- readArchived = False
- return userIF.getJobStatisticsPerSite(predefined,workingGroup,countryGroup,jobType,
- minPriority,readArchived)
-
-
-# get job statistics per site with label
-def getJobStatisticsWithLabel(req,site=''):
- return userIF.getJobStatisticsWithLabel(site)
-
-
-# query last files in datasets
-def queryLastFilesInDataset(req,datasets):
- return userIF.queryLastFilesInDataset(datasets)
-
-
-# get input files currently in use for analysis
-def getFilesInUseForAnal(req,outDataset):
- return userIF.getFilesInUseForAnal(outDataset)
-
-
-# get list of dis dataset to get input files in shadow
-def getDisInUseForAnal(req,outDataset):
- return userIF.getDisInUseForAnal(outDataset)
-
-
-# get input LFNs currently in use for analysis with shadow dis
-def getLFNsInUseForAnal(req,inputDisList):
- return userIF.getLFNsInUseForAnal(inputDisList)
-
-
-# kill jobs
-def killJobs(req,ids,code=None,useMailAsID=None):
- # check security
- if not isSecure(req):
- return False
- # get DN
- user = None
- if req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- user = _getDN(req)
- # check role
- prodManager = False
- # get FQANs
- fqans = _getFQAN(req)
- # loop over all FQANs
- for fqan in fqans:
- # check production role
- for rolePat in ['/atlas/usatlas/Role=production','/atlas/Role=production']:
- if fqan.startswith(rolePat):
- prodManager = True
- break
- # escape
- if prodManager:
- break
- # use email address as ID
- if useMailAsID == 'True':
- useMailAsID = True
- else:
- useMailAsID = False
- # hostname
- host = req.get_remote_host()
- return userIF.killJobs(ids,user,host,code,prodManager,useMailAsID,fqans)
-
-
-# reassign jobs
-def reassignJobs(req,ids,forPending=None):
- # check security
- if not isSecure(req):
- return False
- # get DN
- user = None
- if req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- user = _getDN(req)
- # hostname
- host = req.get_remote_host()
- # for pending
- if forPending == 'True':
- forPending = True
- else:
- forPending = False
- return userIF.reassignJobs(ids,user,host,forPending)
-
-
-# resubmit jobs
-def resubmitJobs(req,ids):
- # check security
- if not isSecure(req):
- return False
- return userIF.resubmitJobs(ids)
-
-
-# change job priorities
-def changeJobPriorities(req,newPrioMap=None):
- # check security
- if not isSecure(req):
- return pickle.dumps((False,'secure connection is required'))
- # get DN
- user = None
- if req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- user = _getDN(req)
- # check role
- prodRole = _isProdRoleATLAS(req)
- ret = userIF.changeJobPriorities(user,prodRole,newPrioMap)
- return pickle.dumps(ret)
-
-
-# get list of site spec
-def getSiteSpecs(req,siteType=None):
- if siteType != None:
- return userIF.getSiteSpecs(siteType)
- else:
- return userIF.getSiteSpecs()
-
-# get list of cloud spec
-def getCloudSpecs(req):
- return userIF.getCloudSpecs()
-
-# get list of cache prefix
-def getCachePrefixes(req):
- return userIF.getCachePrefixes()
-
-# get client version
-def getPandaClientVer(req):
- return userIF.getPandaClientVer()
-
-# get nPilots
-def getNumPilots(req):
- return userIF.getNumPilots()
-
-# run brokerage
-def runBrokerage(req,sites,cmtConfig=None,atlasRelease=None,trustIS=False,processingType=None,
- loggingFlag=False,memorySize=None,workingGroup=None,nJobs=None,
- siteGroup=None,maxCpuCount=None):
- if trustIS=='True':
- trustIS = True
- else:
- trustIS = False
- if loggingFlag=='True':
- loggingFlag = True
- else:
- loggingFlag = False
- if memorySize != None:
- try:
- memorySize = long(memorySize)
- except:
- pass
- if siteGroup != None:
- try:
- siteGroup = int(siteGroup)
- except:
- siteGroup = None
- if maxCpuCount != None:
- try:
- maxCpuCount = int(maxCpuCount)
- except:
- maxCpuCount = None
- preferHomeCountry = True
- dn = _getDN(req)
- fqans = _getFQAN(req)
- return userIF.runBrokerage(sites,cmtConfig,atlasRelease,trustIS,processingType,dn,
- loggingFlag,memorySize,workingGroup,fqans,nJobs,preferHomeCountry,
- siteGroup,maxCpuCount)
-
-# run rebrokerage
-def runReBrokerage(req,jobID,libDS='',cloud=None,excludedSite=None,forceOpt=None):
- # check SSL
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "ERROR: SSL connection is required"
- # get DN
- dn = _getDN(req)
- if dn == '':
- return "ERROR: could not get DN"
- # convert jobID to long
- try:
- jobID = long(jobID)
- except:
- return "ERROR: jobID is not an integer"
- # force option
- if forceOpt == 'True':
- forceOpt = True
- else:
- forceOpt = False
- return userIF.runReBrokerage(dn,jobID,cloud,excludedSite,forceOpt)
-
-
-# retry failed subjobs in running job
-def retryFailedJobsInActive(req,jobID):
- # check SSL
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "ERROR: SSL connection is required"
- # get DN
- dn = _getDN(req)
- if dn == '':
- return "ERROR: could not get DN"
- # convert jobID to long
- try:
- jobID = long(jobID)
- except:
- return "ERROR: jobID is not an integer"
- return userIF.retryFailedJobsInActive(dn,jobID)
-
-
-# logger interface
-def sendLogInfo(req,msgType,msgList):
- # check SSL
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "ERROR: SSL connection is required"
- # get DN
- dn = _getDN(req)
- if dn == '':
- return "ERROR: could not get DN"
- return userIF.sendLogInfo(dn,msgType,msgList)
-
-
-# get serial number for group job
-def getSerialNumberForGroupJob(req):
- # check SSL
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "ERROR: SSL connection is required"
- # get DN
- dn = _getDN(req)
- if dn == '':
- return "ERROR: could not get DN"
- return userIF.getSerialNumberForGroupJob(dn)
-
-
-# get script for offline running
-def getScriptOfflineRunning(req,pandaID):
- return userIF.getScriptOfflineRunning(pandaID)
-
-
-# register proxy key
-def registerProxyKey(req,credname,origin,myproxy):
- # check security
- if not isSecure(req):
- return False
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return False
- # get expiration date
- if not req.subprocess_env.has_key('SSL_CLIENT_V_END'):
- return False
- params = {}
- params['dn'] = _getDN(req)
- # set parameters
- params['credname'] = credname
- params['origin'] = origin
- params['myproxy'] = myproxy
- # convert SSL_CLIENT_V_END
- try:
- expTime = req.subprocess_env['SSL_CLIENT_V_END']
- # remove redundant white spaces
- expTime = re.sub('\s+',' ',expTime)
- # convert to timestamp
- expTime = time.strptime(expTime,'%b %d %H:%M:%S %Y %Z')
- params['expires'] = time.strftime('%Y-%m-%d %H:%M:%S',expTime)
- except:
- _logger.error("registerProxyKey : failed to convert %s" % \
- req.subprocess_env['SSL_CLIENT_V_END'])
- # execute
- return userIF.registerProxyKey(params)
-
-
-# get proxy key
-def getProxyKey(req):
- # check security
- if not isSecure(req):
- return False
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return False
- dn = _getDN(req)
- # execute
- return userIF.getProxyKey(dn)
-
-
-# get JobIDs in a time range
-def getJobIDsInTimeRange(req,timeRange,dn=None):
- # check security
- if not isSecure(req):
- return False
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return False
- if dn == None:
- dn = _getDN(req)
- _logger.debug("getJobIDsInTimeRange %s %s" % (dn,timeRange))
- # execute
- return userIF.getJobIDsInTimeRange(dn,timeRange)
-
-
-# get PandaIDs for a JobID
-def getPandIDsWithJobID(req,jobID,nJobs,dn=None):
- # check security
- if not isSecure(req):
- return False
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return False
- if dn == None:
- dn = _getDN(req)
- _logger.debug("getPandIDsWithJobID %s JobID=%s nJobs=%s" % (dn,jobID,nJobs))
- # execute
- return userIF.getPandIDsWithJobID(dn,jobID,nJobs)
-
-
-# check merge job generation status
-def checkMergeGenerationStatus(req,jobID,dn=None):
- # check security
- if not isSecure(req):
- return False
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return False
- if dn == None:
- dn = _getDN(req)
- _logger.debug("checkMergeGenerationStatus %s JobID=%s" % (dn,jobID))
- # execute
- return userIF.checkMergeGenerationStatus(dn,jobID)
-
-
-# get slimmed file info with PandaIDs
-def getSlimmedFileInfoPandaIDs(req,ids):
- # check security
- if not isSecure(req):
- return False
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return False
- dn = _getDN(req)
- return userIF.getSlimmedFileInfoPandaIDs(ids,dn)
-
-
-# get full job status
-def getFullJobStatus(req,ids):
- # check security
- if not isSecure(req):
- return False
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return False
- dn = _getDN(req)
- return userIF.getFullJobStatus(ids,dn)
-
-
-# get number of analysis jobs per user
-def getNUserJobs(req,siteName,nJobs=100):
- # check security
- prodManager = False
- if not isSecure(req):
- return "Failed : HTTPS connection is required"
- # get FQANs
- fqans = _getFQAN(req)
- # loop over all FQANs
- for fqan in fqans:
- # check production role
- for rolePat in ['/atlas/usatlas/Role=production',
- '/atlas/Role=production',
- '/atlas/usatlas/Role=pilot',
- '/atlas/Role=pilot',
- ]:
- if fqan.startswith(rolePat):
- prodManager = True
- break
- # escape
- if prodManager:
- break
- # only prod managers can use this method
- if not prodManager:
- return "Failed : VOMS authorization failure"
- # convert nJobs to int
- try:
- nJobs = int(nJobs)
- except:
- nJobs = 100
- # execute
- return userIF.getNUserJobs(siteName,nJobs)
-
-
-# add account to siteaccess
-def addSiteAccess(req,siteID):
- # check security
- if not isSecure(req):
- return "False"
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "False"
- dn = req.subprocess_env['SSL_CLIENT_S_DN']
- return userIF.addSiteAccess(siteID,dn)
-
-
-# list site access
-def listSiteAccess(req,siteID=None,longFormat=False):
- # check security
- if not isSecure(req):
- return "False"
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "False"
- # set DN if siteID is none
- dn = None
- if siteID==None:
- dn = req.subprocess_env['SSL_CLIENT_S_DN']
- # convert longFormat option
- if longFormat == 'True':
- longFormat = True
- else:
- longFormat = False
- return userIF.listSiteAccess(siteID,dn,longFormat)
-
-
-# update site access
-def updateSiteAccess(req,method,siteid,userName,attrValue=''):
- # check security
- if not isSecure(req):
- return "non HTTPS"
- # get DN
- if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'):
- return "invalid DN"
- # set requester's DN
- requesterDN = req.subprocess_env['SSL_CLIENT_S_DN']
- # update
- return userIF.updateSiteAccess(method,siteid,requesterDN,userName,attrValue)
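
Every boolean option in the wrappers above (toPending, modeOn, forceOpt, predefined, longFormat, and so on) arrives from the HTTP layer as a string, and each wrapper only treats the exact string 'True' as true. A minimal Python 2 sketch of that convention; the to_bool helper is illustrative and not part of the module:

# hedged sketch: how the wrappers above turn a query-string flag into a bool;
# anything other than the exact string 'True' counts as False
def to_bool(flag):
    return flag == 'True'

print to_bool('True')    # True
print to_bool('true')    # False, the comparison is case-sensitive
print to_bool(None)      # False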
diff --git a/current/pandaserver/userinterface/__init__.py b/current/pandaserver/userinterface/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/current/pandaserver/userinterface/runReBroker.py b/current/pandaserver/userinterface/runReBroker.py
deleted file mode 100755
index e20e6d595..000000000
--- a/current/pandaserver/userinterface/runReBroker.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# exec
-def run(dn,jobID,cloud=None,excludedSite=None):
- # check parameters
- if dn == '':
- return False
- if jobID < 0:
- return False
- # password
- from config import panda_config
- passwd = panda_config.dbpasswd
- # initialize cx_Oracle using dummy connection
- from taskbuffer.Initializer import initializer
- initializer.init()
- # instantiate TB
- from taskbuffer.TaskBuffer import taskBuffer
- taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
- # run ReBroker
- from userinterface.ReBroker import ReBroker
- reThr = ReBroker(taskBuffer,cloud,excludedSite,userRequest=True)
- # lock
- stLock,retLock = reThr.lockJob(dn,jobID)
- # failed
- if not stLock:
- return False
- # start
- reThr.start()
- reThr.join()
- return True
-
-
-####################################################################
-# main
-def main():
- import sys
- import getopt
- # option class
- class _options:
- def __init__(self):
- pass
- options = _options()
- del _options
- # set default values
- options.jobID = -1
- options.dn = ''
- options.cloud = None
- options.excludedSite = None
- # get command-line parameters
- try:
- opts, args = getopt.getopt(sys.argv[1:],"j:d:c:e:")
- # set options
- for o, a in opts:
- if o in ("-j",):
- options.jobID = long(a)
- if o in ("-d",):
- options.dn = a
- if o in ("-c",):
- options.cloud = a
- if o in ("-e",):
- options.excludedSite = a.split(',')
- except:
- print("ERROR : Invalid options")
- sys.exit(1)
- # run
- run(options.dn,options.jobID,options.cloud,options.excludedSite)
- # return
- sys.exit(0)
-
-
-if __name__ == "__main__":
- main()
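
For reference, run() above can also be invoked directly from Python on the server host, which is what the command-line mode (python runReBroker.py -j <jobID> -d <DN> -c <cloud> -e <site,site>) ultimately does. A minimal sketch, assuming the server's Python path and database configuration are in place; the DN, job ID and site names below are placeholders:

# hedged sketch: re-broker one job for a user (all values are placeholders)
from userinterface.runReBroker import run

ok = run('/DC=ch/DC=cern/OU=Users/CN=someuser',   # user DN
         12345678,                                # PanDA JobID
         cloud=None,                              # keep the current cloud
         excludedSite=['SITE_A', 'SITE_B'])       # sites to avoid
print ok    # True only when the job could be locked and re-brokered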
diff --git a/current/setup.cfg b/current/setup.cfg
deleted file mode 100644
index 74c606520..000000000
--- a/current/setup.cfg
+++ /dev/null
@@ -1,7 +0,0 @@
-[global]
-
-[bdist_rpm]
-provides = panda-server
-release = 1
-packager = Panda Team
-requires = python, panda-common
diff --git a/current/setup.py b/current/setup.py
deleted file mode 100755
index c88adbd41..000000000
--- a/current/setup.py
+++ /dev/null
@@ -1,203 +0,0 @@
-#!/usr/bin/env python
-#
-# Setup prog for Panda Server
-#
-#
-release_version='0.0.5'
-
-import re
-import sys
-import commands
-from distutils.core import setup
-from distutils.command.install import install as install_org
-from distutils.command.install_data import install_data as install_data_org
-
-# get panda specific params
-optPanda = {}
-newArgv = []
-idx = 0
-while idx < len(sys.argv):
- tmpArg = sys.argv[idx]
- if tmpArg.startswith('--panda_'):
- # panda params
- idx += 1
- if len(tmpArg.split('=')) == 2:
- # split to par and val if = is contained
- tmpVal = tmpArg.split('=')[-1]
- tmpArg = tmpArg.split('=')[0]
- elif len(tmpArg.split('=')) == 1:
- tmpVal = sys.argv[idx]
- idx += 1
- else:
- raise RuntimeError,"invalid panda option : %s" % tmpArg
- # get key
- tmpKey = re.sub('--panda_','',tmpArg)
- # set params
- optPanda[tmpKey] = tmpVal
- else:
- # normal opts
- idx += 1
- newArgv.append(tmpArg)
-# set new argv
-sys.argv = newArgv
-
-
-# set overall prefix for bdist_rpm
-class install_panda(install_org):
- def initialize_options (self):
- install_org.initialize_options(self)
- self.prefix = '/data/atlpan/srv'
-
-
-# generates files using templates and installs them
-class install_data_panda (install_data_org):
-
- def initialize_options (self):
- install_data_org.initialize_options (self)
- self.install_purelib = None
-
- def finalize_options (self):
- # set install_purelib
- self.set_undefined_options('install',
- ('install_purelib','install_purelib'))
- # set remaining params
- install_data_org.finalize_options(self)
- # set hostname
- if optPanda.has_key('hostname') and optPanda['hostname'] != '':
- self.hostname = optPanda['hostname']
- else:
- self.hostname = commands.getoutput('hostname -f')
- # set user and group
- if optPanda.has_key('username') and optPanda['username'] != '':
- self.username = optPanda['username']
- else:
- self.username = commands.getoutput('id -un')
- if optPanda.has_key('usergroup') and optPanda['usergroup'] != '':
- self.usergroup = optPanda['usergroup']
- else:
- self.usergroup = commands.getoutput('id -gn')
-
-
- def run (self):
- # remove /usr for bdist/bdist_rpm
- match = re.search('(build/[^/]+/dumb)/usr',self.install_dir)
- if match != None:
- self.install_dir = re.sub(match.group(0),match.group(1),self.install_dir)
- # remove /var/tmp/*-buildroot for bdist_rpm
- match = re.search('(/var/tmp/.*-buildroot)/usr',self.install_dir)
- if match != None:
- self.install_dir = re.sub(match.group(0),match.group(1),self.install_dir)
- # create tmp area
- tmpDir = 'build/tmp'
- self.mkpath(tmpDir)
- new_data_files = []
- for destDir,dataFiles in self.data_files:
- newFilesList = []
- for srcFile in dataFiles:
- # check extension
- if not srcFile.endswith('.template'):
- raise RuntimeError,"%s doesn't have the .template extension" % srcFile
- # dest filename
- destFile = re.sub('(\.exe)*\.template$','',srcFile)
- destFile = destFile.split('/')[-1]
- destFile = '%s/%s' % (tmpDir,destFile)
- # open src
- inFile = open(srcFile)
- # read
- filedata=inFile.read()
- # close
- inFile.close()
- # replace patterns
- for item in re.findall('@@([^@]+)@@',filedata):
- if not hasattr(self,item):
- raise RuntimeError,'unknown pattern %s in %s' % (item,srcFile)
- # get pattern
- patt = getattr(self,item)
- # remove build/*/dumb for bdist
- patt = re.sub('build/[^/]+/dumb','',patt)
- # remove /var/tmp/*-buildroot for bdist_rpm
- patt = re.sub('/var/tmp/.*-buildroot','',patt)
- # replace
- filedata = filedata.replace('@@%s@@' % item, patt)
- # write to dest
- oFile = open(destFile,'w')
- oFile.write(filedata)
- oFile.close()
- # chmod for exe
- if srcFile.endswith('.exe.template'):
- commands.getoutput('chmod +x %s' % destFile)
- # append
- newFilesList.append(destFile)
- # replace dataFiles so that the generated files get installed
- new_data_files.append((destDir,newFilesList))
- # install
- self.data_files = new_data_files
- install_data_org.run(self)
-
-
-# setup for distutils
-setup(
- name="panda-server",
- version=release_version,
- description=' PanDA Server Package',
- long_description='''This package contains PanDA Server Components''',
- license='GPL',
- author='Panda Team',
- author_email='hn-atlas-panda-pathena@cern.ch',
- url='https://twiki.cern.ch/twiki/bin/view/Atlas/PanDA',
- packages=[ 'pandaserver',
- 'pandaserver.brokerage',
- 'pandaserver.config',
- 'pandaserver.dataservice',
- 'pandaserver.jobdispatcher',
- 'pandaserver.server',
- 'pandaserver.taskbuffer',
- 'pandaserver.test',
- 'pandaserver.userinterface',
- ],
- data_files=[
- # config files
- ('etc/panda', ['templates/panda_server-httpd.conf.rpmnew.template',
- 'templates/panda_server-httpd-FastCGI.conf.rpmnew.template',
- 'templates/panda_server.cfg.rpmnew.template',
- 'templates/panda_server-grid-env.sh.template',
- ]
- ),
- # sysconfig
- ('etc/sysconfig', ['templates/panda_server-sysconfig.rpmnew.template',
- ]
- ),
- # logrotate
- ('etc/logrotate.d', ['templates/panda_server-logrotate.template',
- ]
- ),
- # init script
- ('etc/init.d', ['templates/panda_server-ctl.exe.template',
- ]
- ),
- # crons
- ('usr/bin', ['templates/panda_server-add.sh.exe.template',
- 'templates/panda_server-priority.sh.exe.template',
- 'templates/panda_server-copyArchive.sh.exe.template',
- 'templates/panda_server-copyROOT.sh.exe.template',
- 'templates/panda_server-vomsrenew.sh.exe.template',
- 'templates/panda_server-archivelog.sh.exe.template',
- 'templates/panda_server-tmpwatch.sh.exe.template',
- 'templates/panda_server-backupJobArch.sh.exe.template',
- 'templates/panda_server-deleteJobs.sh.exe.template',
- 'templates/panda_server-merge.sh.exe.template',
- 'templates/panda_server-datasetManager.sh.exe.template',
- 'templates/panda_server-evpPD2P.sh.exe.template',
- 'templates/panda_server-callback.sh.exe.template',
- 'templates/panda_server-makeSlsXml.exe.template',
- 'templates/panda_server-boostUser.sh.exe.template',
- 'templates/panda_server-runRebro.sh.exe.template',
- ]
- ),
- # var dirs
- #('var/log/panda', []),
- #('var/cache/pandaserver', []),
- ],
- cmdclass={'install': install_panda,
- 'install_data': install_data_panda}
-)
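
The --panda_* switches parsed at the top of this setup.py never reach distutils: they are collected into optPanda and removed from sys.argv before setup() runs. A minimal standalone Python 2 sketch of that behaviour on a sample command line; the hostname and user name are placeholders:

# hedged sketch: equivalent of the --panda_ option splitting above
import re

argv = ['setup.py', 'install', '--prefix=INSTALLDIR',
        '--panda_hostname=voatlas99.cern.ch', '--panda_username', 'atlpan']
optPanda, newArgv, idx = {}, [], 0
while idx < len(argv):
    arg = argv[idx]
    idx += 1
    if arg.startswith('--panda_'):
        if '=' in arg:
            key, val = arg.split('=', 1)
        else:
            key, val = arg, argv[idx]   # value given as the next argument
            idx += 1
        optPanda[re.sub('--panda_', '', key)] = val
    else:
        newArgv.append(arg)
print optPanda   # hostname and username were picked up
print newArgv    # ['setup.py', 'install', '--prefix=INSTALLDIR']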
diff --git a/current/templates/panda_server-add.sh.exe.template b/current/templates/panda_server-add.sh.exe.template
deleted file mode 100755
index cce611988..000000000
--- a/current/templates/panda_server-add.sh.exe.template
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-# setup grid stuff
-source /opt/glite/etc/profile.d/grid-env.sh
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-# set PYTHONPATH for LFC.py
-export PYTHONPATH=/opt/lcg/lib64/python2.5/site-packages:$PYTHONPATH
-
-python2.5 @@install_purelib@@/pandaserver/test/add.py
diff --git a/current/templates/panda_server-archivelog.sh.exe.template b/current/templates/panda_server-archivelog.sh.exe.template
deleted file mode 100755
index 8a0a2c5ab..000000000
--- a/current/templates/panda_server-archivelog.sh.exe.template
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python @@install_purelib@@/pandaserver/test/archivelogs.py
diff --git a/current/templates/panda_server-backupJobArch.sh.exe.template b/current/templates/panda_server-backupJobArch.sh.exe.template
deleted file mode 100644
index bc896d843..000000000
--- a/current/templates/panda_server-backupJobArch.sh.exe.template
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python @@install_purelib@@/pandaserver/test/backupJobArch.py
diff --git a/current/templates/panda_server-boostUser.sh.exe.template b/current/templates/panda_server-boostUser.sh.exe.template
deleted file mode 100755
index f1541998e..000000000
--- a/current/templates/panda_server-boostUser.sh.exe.template
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-echo $1 | python2.5 @@install_purelib@@/pandaserver/test/boostUser.py
diff --git a/current/templates/panda_server-callback.sh.exe.template b/current/templates/panda_server-callback.sh.exe.template
deleted file mode 100755
index da833c70c..000000000
--- a/current/templates/panda_server-callback.sh.exe.template
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# setup grid stuff
-source /opt/glite/etc/profile.d/grid-env.sh
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python2.5 @@install_purelib@@/pandaserver/test/fileCallbackListener.py
diff --git a/current/templates/panda_server-copyArchive.sh.exe.template b/current/templates/panda_server-copyArchive.sh.exe.template
deleted file mode 100755
index 8005b4d3e..000000000
--- a/current/templates/panda_server-copyArchive.sh.exe.template
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# setup grid stuff
-source /opt/glite/etc/profile.d/grid-env.sh
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python2.5 @@install_purelib@@/pandaserver/test/copyArchive.py
diff --git a/current/templates/panda_server-copyROOT.sh.exe.template b/current/templates/panda_server-copyROOT.sh.exe.template
deleted file mode 100755
index efbd483be..000000000
--- a/current/templates/panda_server-copyROOT.sh.exe.template
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python @@install_purelib@@/pandaserver/test/copyROOT.py
diff --git a/current/templates/panda_server-ctl.exe.template b/current/templates/panda_server-ctl.exe.template
deleted file mode 100755
index 70a849b9c..000000000
--- a/current/templates/panda_server-ctl.exe.template
+++ /dev/null
@@ -1,139 +0,0 @@
-#!/bin/sh
-#
-# Copyright 2000-2004 The Apache Software Foundation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-#
-# Apache control script designed to allow an easy command line interface
-# to controlling Apache. Written by Marc Slemko, 1997/08/23
-#
-# The exit codes returned are:
-# XXX this doc is no longer correct now that the interesting
-# XXX functions are handled by httpd
-# 0 - operation completed successfully
-# 1 -
-# 2 - usage error
-# 3 - httpd could not be started
-# 4 - httpd could not be stopped
-# 5 - httpd could not be started during a restart
-# 6 - httpd could not be restarted during a restart
-# 7 - httpd could not be restarted during a graceful restart
-# 8 - configuration syntax error
-#
-# When multiple arguments are given, only the error from the _last_
-# one is reported. Run "apachectl help" for usage info
-#
-ARGV="$@"
-#
-# |||||||||||||||||||| START CONFIGURATION SECTION ||||||||||||||||||||
-# -------------------- --------------------
-#
-# the path to your httpd binary, including options if necessary
-HTTPD='/usr/sbin/httpd.worker'
-
-#
-# a command that outputs a formatted text version of the HTML at the
-# url given on the command line. Designed for lynx, however other
-# programs may work.
-if [ -x /usr/bin/links ]; then
- LYNX="links -dump"
-elif [ -x /usr/bin/lynx ]; then
- LYNX="lynx -dump"
-else
- LYNX="none"
-fi
-
-#
-# the URL to your server's mod_status status page. If you do not
-# have one, then status and fullstatus will not work.
-STATUSURL="http://localhost:80/server-status"
-
-# Source /etc/sysconfig/httpd for $HTTPD setting, etc.
-if [ -r @@install_dir@@/etc/sysconfig/panda_server-sysconfig ]; then
- . @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-fi
-
-ERROR=0
-if [ "x$ARGV" = "x" ] ; then
- ARGV="-h"
-fi
-
-function check13() {
-# check for 1.3 configuration
-GONE="(ServerType|BindAddress|Port|AddModule|ClearModuleList|"
-GONE="${GONE}AgentLog|RefererLog|RefererIgnore|FancyIndexing|"
-GONE="${GONE}AccessConfig|ResourceConfig)"
-if grep -Eiq "^[[:space:]]*($GONE)" /etc/httpd/conf/httpd.conf; then
- echo "$0: Apache 1.3 configuration directives found"
- echo "$0: please read /usr/share/doc/httpd-2.0.52/migration.html"
- exit 2
-fi
-}
-
-function checklynx() {
-if [ "$LYNX" = "none" ]; then
- echo "The 'links' package is required for this functionality."
- exit 8
-fi
-}
-
-function testconfig() {
-# httpd is denied terminal access in SELinux, so run in the
-# current context to get stdout from $HTTPD -t.
-if test -x /usr/sbin/selinuxenabled && /usr/sbin/selinuxenabled; then
- runcon -- `id -Z` $HTTPD $OPTIONS -t
-else
- $HTTPD $OPTIONS -t
-fi
-ERROR=$?
-}
-
-case $ARGV in
-restart|graceful)
- if $HTTPD -t >&/dev/null; then
- $HTTPD $OPTIONS -k $ARGV
- ERROR=$?
- else
- echo "apachectl: Configuration syntax error, will not run \"$ARGV\":"
- testconfig
- fi
- ;;
-start|stop)
- check13
- $HTTPD $OPTIONS -k $ARGV
- ERROR=$?
- ;;
-startssl|sslstart|start-SSL)
- check13
- $HTTPD $OPTIONS -DSSL -k start
- ERROR=$?
- ;;
-configtest)
- testconfig
- ;;
-status)
- checklynx
- $LYNX $STATUSURL | awk ' /process$/ { print; exit } { print } '
- ;;
-fullstatus)
- checklynx
- $LYNX $STATUSURL
- ;;
-*)
- $HTTPD $OPTIONS $ARGV
- ERROR=$?
-esac
-
-exit $ERROR
-
diff --git a/current/templates/panda_server-datasetManager.sh.exe.template b/current/templates/panda_server-datasetManager.sh.exe.template
deleted file mode 100644
index 32abd2976..000000000
--- a/current/templates/panda_server-datasetManager.sh.exe.template
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# setup grid stuff
-source /opt/glite/etc/profile.d/grid-env.sh
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python2.5 @@install_purelib@@/pandaserver/test/datasetManager.py
diff --git a/current/templates/panda_server-deleteJobs.sh.exe.template b/current/templates/panda_server-deleteJobs.sh.exe.template
deleted file mode 100644
index fd48e9e7e..000000000
--- a/current/templates/panda_server-deleteJobs.sh.exe.template
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python @@install_purelib@@/pandaserver/test/deleteJobs.py
diff --git a/current/templates/panda_server-evpPD2P.sh.exe.template b/current/templates/panda_server-evpPD2P.sh.exe.template
deleted file mode 100755
index 8786da667..000000000
--- a/current/templates/panda_server-evpPD2P.sh.exe.template
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# setup grid stuff
-source /opt/glite/etc/profile.d/grid-env.sh
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python2.5 @@install_purelib@@/pandaserver/test/evpPD2P.py
diff --git a/current/templates/panda_server-grid-env.sh.template b/current/templates/panda_server-grid-env.sh.template
deleted file mode 100644
index c1e0d3321..000000000
--- a/current/templates/panda_server-grid-env.sh.template
+++ /dev/null
@@ -1,3 +0,0 @@
-export LD_LIBRARY_PATH=/opt/glite/lib64:/opt/globus/lib:/opt/lcg/lib64:$LD_LIBRARY_PATH
-export PYTHONPATH=/opt/glite/lib64/python:/opt/lcg/lib64/python:$PYTHONPATH
-export PATH=/opt/edg/bin:/opt/glite/bin:/opt/globus/bin:/opt/lcg/bin:$PATH
diff --git a/current/templates/panda_server-httpd-FastCGI.conf.rpmnew.template b/current/templates/panda_server-httpd-FastCGI.conf.rpmnew.template
deleted file mode 100644
index 0148c1eb0..000000000
--- a/current/templates/panda_server-httpd-FastCGI.conf.rpmnew.template
+++ /dev/null
@@ -1,177 +0,0 @@
-LoadModule access_module modules/mod_access.so
-LoadModule alias_module modules/mod_alias.so
-LoadModule rewrite_module modules/mod_rewrite.so
-LoadModule mime_magic_module modules/mod_mime_magic.so
-LoadModule mime_module modules/mod_mime.so
-LoadModule include_module modules/mod_include.so
-LoadModule log_config_module modules/mod_log_config.so
-LoadModule env_module modules/mod_env.so
-LoadModule deflate_module modules/mod_deflate.so
-LoadModule setenvif_module modules/mod_setenvif.so
-LoadModule dir_module modules/mod_dir.so
-LoadModule ssl_module modules/mod_ssl.so
-LoadModule headers_module modules/mod_headers.so
-LoadModule gridsite_module modules/mod_gridsite.so
-
-# FastCGI/WSGI
-#LoadModule fastcgi_module modules/mod_fastcgi.so
-LoadModule wsgi_module modules/mod_wsgi.so
-
-
-User atlpan
-Group zp
-
-
-StartServers 25
-MinSpareServers 25
-ServerLimit 512
-MaxSpareServers 512
-MaxClients 512
-MaxRequestsPerChild 2000
-
-
-ServerName pandaserver.cern.ch
-
-DocumentRoot "@@install_purelib@@/pandaserver"
-
-
- Order allow,deny
- Deny from all
-
-
-RedirectMatch 403 "/panda.py$"
-
-
- Options FollowSymLinks
- AllowOverride None
- Order allow,deny
- Allow from all
- Deny from 192.203.218.14
-
-
-Alias /trf/ "@@install_dir@@/var/trf/"
-Alias /cache/ "@@install_dir@@/var/cache/pandaserver/"
-Alias /appdir/ "@@install_dir@@/var/appdir/"
-
-
- Options FollowSymLinks
- AllowOverride None
- Order allow,deny
- Allow from all
- Deny from 192.203.218.14
-
-
-
- FastCgiIpcDir @@install_dir@@/var/log/panda/fastsocks
- FastCgiServer @@install_purelib@@/pandaserver/server/panda.py \
- -processes 25 -idle-timeout 300 -listen-queue-depth 1 -flush \
- -initial-env PYTHONPATH \
- -initial-env TZ \
- -initial-env HOME \
- -initial-env PANDA_HOME \
- -initial-env X509_CERT_DIR \
- -initial-env X509_USER_PROXY \
- -initial-env PANDA_URL \
- -initial-env PANDA_URL_SSL
- ScriptAliasMatch ^/server/panda/(.+)$ @@install_purelib@@/pandaserver/server/panda.py
-
-
-
- WSGIDaemonProcess pandasrv_daemon processes=25 threads=2 home=/home/atlpan
- WSGIProcessGroup pandasrv_daemon
- WSGIApplicationGroup %{GLOBAL}
- WSGIScriptAliasMatch ^/server/panda/(.+)$ @@install_purelib@@/pandaserver/server/panda.py
- WSGISocketPrefix @@install_dir@@/var/log/panda/wsgisocks/wsgi
-
-
-
-Listen 25080
-
-
-RewriteEngine on
-RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK)
-RewriteRule .* - [F]
-# use Cassandra for cache
-RewriteRule ^/cscache/(.*)$ /server/panda/getFile?fileName=$1 [PT,L]
-
-
-
-
- Order allow,deny
- Allow from all
- Deny from 192.203.218.14
-
-
- # allow .py
-
- Order allow,deny
- Allow from all
-
-
- # enable CGI for FastCGI/WSGI
- Options FollowSymLinks +ExecCGI
-
- # mod_gridsite
- GridSiteIndexes on
- GridSiteAuth on
- GridSiteDNlists /etc/grid-security/dn-lists/
- GridSiteEnvs on
-
-
-
-
-
-Listen 25443
-
-
-RewriteEngine on
-RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK)
-RewriteRule .* - [F]
-# use Cassandra for cache
-RewriteRule ^/cscache/(.*)$ /server/panda/getFile?fileName=$1 [PT,L]
-
-# CERN security recommendation to only allow the seven strongest ssl ciphers
-SSLProtocol -all +TLSv1 +SSLv3
-SSLCipherSuite HIGH:MEDIUM:+SSLv3
-
-SSLEngine on
-SSLCertificateFile /etc/grid-security/hostcert.pem
-SSLCertificateKeyFile /etc/grid-security/hostkey.pem
-SSLCACertificatePath /etc/grid-security/certificates
-SSLVerifyClient optional
-SSLVerifyDepth 10
-SSLOptions +ExportCertData +StdEnvVars
-
-
-
- # allow .py
-
- Order allow,deny
- Allow from all
-
-
- # enable CGI for FastCGI/WSGI
- Options FollowSymLinks +ExecCGI
-
- # mod_gridsite
- GridSiteIndexes on
- GridSiteAuth on
- GridSiteDNlists /etc/grid-security/dn-lists/
- GridSiteGSIProxyLimit 1
- GridSiteEnvs on
-
-
-
-
-LogLevel info
-
-LogFormat "%t %h \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
-LogFormat "%t %h \"%r\" %>s %b" common
-LogFormat "%{Referer}i -> %U" referer
-LogFormat "%{User-agent}i" agent
-CustomLog @@install_dir@@/var/log/panda/panda_server_access_log common
-ErrorLog @@install_dir@@/var/log/panda/panda_server_error_log
-
-PidFile @@install_dir@@/var/log/panda/panda_server_httpd.pid
-
-TypesConfig /etc/mime.types
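
As a worked example of the cache rewrite above: a request for /cscache/some.file on either port is rewritten to /server/panda/getFile?fileName=some.file, and the [PT] flag hands the rewritten URI back to the alias mapping so it ends up at the panda.py script alias shown earlier in this template.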
diff --git a/current/templates/panda_server-httpd.conf.rpmnew.template b/current/templates/panda_server-httpd.conf.rpmnew.template
deleted file mode 100644
index 6057f0cc4..000000000
--- a/current/templates/panda_server-httpd.conf.rpmnew.template
+++ /dev/null
@@ -1,141 +0,0 @@
-LoadModule access_module modules/mod_access.so
-LoadModule alias_module modules/mod_alias.so
-LoadModule rewrite_module modules/mod_rewrite.so
-LoadModule mime_magic_module modules/mod_mime_magic.so
-LoadModule mime_module modules/mod_mime.so
-LoadModule include_module modules/mod_include.so
-LoadModule log_config_module modules/mod_log_config.so
-LoadModule env_module modules/mod_env.so
-LoadModule deflate_module modules/mod_deflate.so
-LoadModule setenvif_module modules/mod_setenvif.so
-LoadModule dir_module modules/mod_dir.so
-LoadModule ssl_module modules/mod_ssl.so
-LoadModule python_module modules/mod_python.so
-LoadModule gridsite_module modules/mod_gridsite.so
-
-User atlpan
-Group zp
-
-
-StartServers 50
-MinSpareServers 50
-MaxSpareServers 50
-MaxClients 50
-MaxRequestsPerChild 0
-
-
-
-ServerLimit 10
-StartServers 10
-MaxClients 50
-MinSpareThreads 50
-MaxSpareThreads 50
-ThreadsPerChild 5
-MaxRequestsPerChild 0
-
-
-ServerName pandaserver.cern.ch
-
-DocumentRoot "@@install_purelib@@/pandaserver"
-
-
- Order allow,deny
- Deny from all
-
-
-
- Options FollowSymLinks
- AllowOverride None
- Order allow,deny
- Allow from all
- Deny from 192.203.218.14
-
-
-Alias /cache/ "@@install_dir@@/var/cache/pandaserver/"
-
-
- Options FollowSymLinks
- AllowOverride None
- Order allow,deny
- Allow from all
- Deny from 192.203.218.14
-
-
-Listen 25080
-
-
-RewriteEngine on
-RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK)
-RewriteRule .* - [F]
-
-
-
-
- Order allow,deny
- Allow from all
- Deny from 192.203.218.14
-
-
- # mod_python
- SetHandler python-program
- PythonHandler mod_python.publisher
- PythonDebug On
-
- # mod_gridsite
- GridSiteIndexes on
- GridSiteAuth on
- GridSiteDNlists /etc/grid-security/dn-lists/
- GridSiteEnvs on
-
-
-
-
-
-Listen 25443
-
-
-RewriteEngine on
-RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK)
-RewriteRule .* - [F]
-
-# CERN security recommendation to only allow the seven strongest ssl ciphers
-SSLProtocol -all +TLSv1 +SSLv3
-SSLCipherSuite HIGH:MEDIUM:+SSLv3
-
-SSLEngine on
-SSLCertificateFile /etc/grid-security/hostcert.pem
-SSLCertificateKeyFile /etc/grid-security/hostkey.pem
-SSLCACertificatePath /etc/grid-security/certificates
-SSLVerifyClient optional
-SSLVerifyDepth 10
-SSLOptions +ExportCertData +StdEnvVars
-
-
-
- # mod_python
- SetHandler python-program
- PythonHandler mod_python.publisher
- PythonDebug On
-
- # mod_gridsite
- GridSiteIndexes on
- GridSiteAuth on
- GridSiteDNlists /etc/grid-security/dn-lists/
- GridSiteGSIProxyLimit 1
- GridSiteEnvs on
-
-
-
-
-LogLevel info
-
-LogFormat "%t %h \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
-LogFormat "%t %h \"%r\" %>s %b" common
-LogFormat "%{Referer}i -> %U" referer
-LogFormat "%{User-agent}i" agent
-CustomLog @@install_dir@@/var/log/panda/panda_server_access_log common
-ErrorLog @@install_dir@@/var/log/panda/panda_server_error_log
-
-PidFile @@install_dir@@/var/log/panda/panda_server_httpd.pid
-
-TypesConfig /etc/mime.types
diff --git a/current/templates/panda_server-logrotate.template b/current/templates/panda_server-logrotate.template
deleted file mode 100644
index a474741f6..000000000
--- a/current/templates/panda_server-logrotate.template
+++ /dev/null
@@ -1,14 +0,0 @@
-@@install_dir@@/var/log/panda/*log {
- rotate 180
- daily
- compress
- missingok
- notifempty
- sharedscripts
- daily
- postrotate
- killall -u atlpan python || true
- killall -u atlpan python2.5 || true
- /sbin/service httpd-pandasrv restart > /dev/null 2>/dev/null || true
- endscript
-}
diff --git a/current/templates/panda_server-makeSlsXml.exe.template b/current/templates/panda_server-makeSlsXml.exe.template
deleted file mode 100755
index 23e23b3d3..000000000
--- a/current/templates/panda_server-makeSlsXml.exe.template
+++ /dev/null
@@ -1,334 +0,0 @@
-#!/usr/bin/python2.5
-
-import SLSxml
-import socket
-import subprocess
-import re
-import sys
-import optparse
-
-###########################################
-## define options
-###########################################
-parser = optparse.OptionParser()
-parser.add_option( "-u", "--use", dest="use", type="string",
- help="Use of xml, allowed values: 'mon', 'server' or 'bamboo'" )
-parser.add_option( "--host", dest="host", type="string",
- help="Hostname of server to check, default is current machine hostname" )
-parser.add_option( "-d", "--dir", dest="dir", type="string",
- help="Directory for the xml file output. Default is " +
- "/data/atlpan/oracle/panda/monitoring" )
-parser.add_option( "--debug", action="store_true", dest="debug",
- default=False, help="Print out debug statements." )
-
-( options, args ) = parser.parse_args()
-
-def __main__() :
-
- if( options.host ) :
- host = options.host
- else :
- host = socket.gethostname()
- host = re.sub( r'^(\w+).*', r'\1', host )
-
- if( options.use == 'mon' ) :
- tmp_xml = make_monitor( host )
- file_part = 'PandaMon'
- elif( options.use == 'server' ) :
- tmp_xml = make_server( host )
- file_part = 'PandaServer'
- elif( options.use == 'bamboo' ) :
- tmp_xml = make_bamboo( host )
- file_part = 'PandaBamboo'
- else :
- print "Err: please choose a use, 'mon', 'server' or 'bamboo'."
- return
-
- if( options.dir ) :
- file_dir = options.dir
- else :
- file_dir = '/data/atlpan/oracle/panda/monitoring'
-
- file_name = '%s/%s_%s.xml' % ( file_dir, file_part, host )
- tmp_file = open( file_name, 'w' )
- tmp_file.write( tmp_xml )
- tmp_file.close()
-
-def make_server( host ) :
-
- if( options.debug ) : print "Creating the server monitoring xml"
-
- server_avail = server_availability( host )
- add_processes = count_add_processes()
- num_holdings = count_holdings()
- data_used = volume_use( 'data' )
- var_used = volume_use( 'var' )
- ave_regtime = registration_time()
- ave_regtimeDQ2 = registration_time(onlyDQ2=True)
-
- sls_xml = SLSxml.xml_doc()
- sls_xml.set_id( 'PandaServer_%s' % ( host ) )
- sls_xml.set_shortname( 'PandaServer monitoring service at %s' % ( host ) )
- sls_xml.set_fullname( 'PandaServer monitoring service at %s' % ( host ) )
- sls_xml.set_availability( str( server_avail ) )
-
- sls_xml.add_data( "AddProcesses", "Number of processes for DQ2+LFC registration",
- str( add_processes ) )
- sls_xml.add_data( "HoldingJobs", "Number of holding jobs to be registered",
- str( num_holdings ) )
- sls_xml.add_data( "RegistrationTime", "Average time for DQ2+LFC registration in second",
- str( ave_regtime ) )
- sls_xml.add_data( "RegistrationTimeDQ2", "Average time for DQ2 registration in second",
- str( ave_regtimeDQ2 ) )
- sls_xml.add_data( "DataVolumeUse", "Percent use of the local /data volume",
- str( data_used ) )
- sls_xml.add_data( "VarVolumeUse", "Percent use of the local /var volume",
- str( var_used ) )
-
- return sls_xml.print_xml()
-
-def make_bamboo( host ) :
-
- if( options.debug ) : print "Creating the bamboo monitoring xml"
-
- server_avail = bamboo_availability( host )
-
- sls_xml = SLSxml.xml_doc()
- sls_xml.set_id( 'PandaBamboo_%s' % ( host ) )
- sls_xml.set_shortname( 'PandaBamboo monitoring service at %s' % ( host ) )
- sls_xml.set_fullname( 'PandaBamboo monitoring service at %s' % ( host ) )
- sls_xml.set_availability( str( server_avail ) )
- return sls_xml.print_xml()
-
-def make_monitor( host ) :
-
- if( options.debug ) : print "Creating the monitor monitoring xml"
-
- errormes = False
- messagetext = ''
-
- http_avail = httpd_availability( host )
- if( http_avail == 0 ) :
- errormes = True
- messagetext += "Error: web server on %s not working\n" % ( host )
-
- squid_avail = squid_availability()
- if( squid_avail == 0 ) :
- errormes = True
- messagetext += "Error: squid server on %s not working\n" % ( host )
-
- panda_avail = panda_availability( host )
- if( panda_avail == 0 ) :
- errormes = True
- messagetext += "Error: panda monitor on %s not working\n" % ( host )
-
- http_processes = count_processes()
-
- data_used = volume_use( 'data' )
- var_used = volume_use( 'var' )
-
- if( errormes ) :
- error_mail( host, messagetext )
-
- if( options.debug ) :
- print 'web - %s, squid - %s, panda - %s' % ( http_avail, squid_avail,
- panda_avail )
-
- sls_xml = SLSxml.xml_doc()
- sls_xml.set_id( 'PandaMon_%s' % ( host ) )
- sls_xml.set_shortname( 'PandaMonitor monitoring service at %s' % ( host ) )
- sls_xml.set_fullname( 'PandaMonitor monitoring service at %s' % ( host ) )
- sls_xml.set_availability( str( panda_avail ) )
-
- #adding intervention by hand here
- #sls_xml.add_intervention( "2011-01-16T20:00:00", "PT36H",
- # "Panda services will be out for over a day due to database server changes." )
-
- sls_xml.add_data( "HttpdAvailability", "Availability of the httpd server",
- str( http_avail ) )
- sls_xml.add_data( "SquidAvailability", "Availability of the squid server",
- str( squid_avail ) )
- sls_xml.add_data( "PandaAvailability", "Availability of the panda monitor",
- str( panda_avail ) )
- sls_xml.add_data( "HttpProcesses", "Number of processes for the panda monitor",
- str( http_processes ) )
- sls_xml.add_data( "DataVolumeUse", "Percent use of the local /data volume",
- str( data_used ) )
- sls_xml.add_data( "VarVolumeUse", "Percent use of the local /var volume",
- str( var_used ) )
- return sls_xml.print_xml()
-
-def httpd_availability( host ) :
- url = 'http://%s.cern.ch/robots.txt' % ( host )
- return check_url( url, "go away" )
-
-def squid_availability() :
- command = '/usr/bin/squidclient -p 25980 cache_object://localhost/info'
- return check_command( command, 'OK' )
-
-def panda_availability( host ) :
-
- port = '25980'
- baseurl = 'http://' + host + ':' + port + '/server/pandamon/query?'
-
- reply = check_url( baseurl + 'isAlive', 'yes' )
- if( reply != '100' ) : return '0'
-
- return '100'
-
- #The above is a simpler test of the python code for now, until the
- #panda monitor migration is more stable and all network tweaks are
- #in Quattor, so things stay stable across reboots/upgrades. Once that
- #is true, the tests below should be put back.
-
- reply = check_url( baseurl + 'dash=prod', 'CERN:OK' )
- if( reply != '100' ) : return '0'
-
- reply = check_url( baseurl + 'dash=clouds', 'Cloud status' )
- if( reply != '100' ) : return '0'
-
- reply = check_url( baseurl + 'overview=incidents', 'Recorded incidents' )
- if( reply != '100' ) : return '0'
-
- reply = check_url( baseurl + 'dash=ddm', 'Space available' )
- if( reply != '100' ) : return '0'
-
- return '100'
-
-def server_availability( host ) :
-
- tmp_url = '--no-check-certificate https://%s:25443/server/panda/isAlive' % ( host )
- reply = check_url( tmp_url, 'alive=yes' )
- if( reply != '100' ) : return '0'
-
- return '100'
-
-def bamboo_availability( host ) :
-
- tmp_url = 'http://%s:25070/bamboo/bamboo/isAlive' % ( host )
- reply = check_url( tmp_url, 'alive=yes' )
- if( reply != '100' ) : return '0'
-
- return '100'
-
-def check_url( url, check_string ) :
- command = "wget -q -O - " + url
- return check_command( command, check_string )
-
-def check_command( command, check_string ) :
-
- if( options.debug ) :
- print "Checking command : %s" % ( command )
- print "For string : %s" % ( check_string )
-
- tmp_array = command.split()
- output = subprocess.Popen( tmp_array, stdout=subprocess.PIPE ).communicate()[0]
-
- if( re.search( check_string, output ) ) :
- if( options.debug ) : print "Found the string, return 100"
- return '100'
- else :
- if( options.debug ) : print "String not found, return 0"
- return '0'
-
-def count_processes() :
- output = subprocess.Popen( ['ps', 'aux'], stdout=subprocess.PIPE ).communicate()[0]
- count = 0
- for line in output.split( '\n' ) :
- if( re.match( 'atlpan', line ) ) :
- if( re.search( 'http', line ) ) :
- count += 1
- return count
-
-def count_add_processes() :
- output = subprocess.Popen( "pgrep -f add.py",
- stdout=subprocess.PIPE,shell=True).communicate()[0]
- count = 0
- for line in output.split( '\n' ) :
- line = line.strip()
- if line == '':
- continue
- count += 1
- return count
-
-def count_holdings() :
- output = subprocess.Popen("ls /data/atlpan/srv/var/log/panda/ | egrep '(finished|failed)'",
- stdout=subprocess.PIPE,shell=True).communicate()[0]
- count =0
- for line in output.split( '\n' ) :
- line = line.strip()
- if line == '':
- continue
- count += 1
- return count
-
-def registration_time(timeSlice=False,onlyDQ2=False) :
- aveRegTime = '0.0'
- try:
- if onlyDQ2:
- com = "grep registraion /data/atlpan/srv/var/log/panda/panda-Adder.log | grep DQ2 | grep -v LFC"
- else:
- com = "grep 'LFC+DQ2' /data/atlpan/srv/var/log/panda/panda-Adder.log"
- if not timeSlice:
- com += ' | tail -1000'
- output = subprocess.Popen(com,stdout=subprocess.PIPE,shell=True).communicate()[0]
- regtimeMap = {}
- for line in output.split('\n'):
- try:
- items = line.split()
- timestamp = items[1][:2]
- regtime = float(items[-2])
- if not regtimeMap.has_key(timestamp):
- regtimeMap[timestamp] = {'totalTime':0.,'totalReg':0}
- regtimeMap[timestamp]['totalTime'] += regtime
- regtimeMap[timestamp]['totalReg'] += 1
- except:
- pass
- timestamps = regtimeMap.keys()
- if timeSlice:
- timestamps.sort()
- for timestamp in timestamps:
- print "%s %4.1fsec" % (timestamp,regtimeMap[timestamp]['totalTime']/float(regtimeMap[timestamp]['totalReg']))
- else:
- totalTime = 0.
- totalReg = 0
- for timestamp in timestamps:
- totalTime += regtimeMap[timestamp]['totalTime']
- totalReg += regtimeMap[timestamp]['totalReg']
- if totalReg > 0:
- aveRegTime = '%4.1f' % (totalTime/float(totalReg))
- except:
- errtype,ervalue = sys.exc_info()[:2]
- print "ERROR : %s:%s in registration_time" % (errtype,ervalue)
- return aveRegTime
-
-def volume_use( volume_name ) :
- command = "df -Pkh /" + volume_name
-
- tmp_array = command.split()
- output = subprocess.Popen( tmp_array, stdout=subprocess.PIPE ).communicate()[0]
-
- for line in output.split( '\n' ) :
- if( re.search( volume_name, line ) ) :
- used_amount = re.search( r"(\d+)\%", line ).group(1)
-
- return used_amount
-
-def error_mail( host, message ) :
-
- mail_cmd = []
- mail_cmd.append( 'mail' )
- mail_cmd.append( '-s' )
- mail_cmd.append( 'Problems with %s' % ( host ) )
- mail_cmd.append( 'douglas@cern.ch' )
-
- text = "Problems with %s :\n\n" % ( host )
- text += message
-
- p = subprocess.Popen( mail_cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE )
- p.stdin.write( text )
- p.stdin.close()
-
-
-#run program
-__main__()
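
All of the availability probes above follow one convention: run a command (wget for the URL-based checks), search its output for an expected string, and return the string '100' when it is found or '0' otherwise. A minimal Python 2 sketch of that convention, using echo as a stand-in for the real probe command:

# hedged sketch: the check_command() convention with a dummy probe command
import re
import subprocess

output = subprocess.Popen(['echo', 'alive=yes'],
                          stdout=subprocess.PIPE).communicate()[0]
print '100' if re.search('alive=yes', output) else '0'   # prints 100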
diff --git a/current/templates/panda_server-merge.sh.exe.template b/current/templates/panda_server-merge.sh.exe.template
deleted file mode 100755
index 6acf67c5f..000000000
--- a/current/templates/panda_server-merge.sh.exe.template
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# setup grid stuff
-source /opt/glite/etc/profile.d/grid-env.sh
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python2.5 @@install_purelib@@/pandaserver/test/runMerger.py
diff --git a/current/templates/panda_server-priority.sh.exe.template b/current/templates/panda_server-priority.sh.exe.template
deleted file mode 100755
index 70363d85b..000000000
--- a/current/templates/panda_server-priority.sh.exe.template
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python2.5 @@install_purelib@@/pandaserver/test/prioryMassage.py
diff --git a/current/templates/panda_server-runRebro.sh.exe.template b/current/templates/panda_server-runRebro.sh.exe.template
deleted file mode 100755
index 24dfc91c7..000000000
--- a/current/templates/panda_server-runRebro.sh.exe.template
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# setup grid stuff
-source /opt/glite/etc/profile.d/grid-env.sh
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python2.5 @@install_purelib@@/pandaserver/test/runRebro.py
diff --git a/current/templates/panda_server-sysconfig.rpmnew.template b/current/templates/panda_server-sysconfig.rpmnew.template
deleted file mode 100644
index 7d4d5f482..000000000
--- a/current/templates/panda_server-sysconfig.rpmnew.template
+++ /dev/null
@@ -1,31 +0,0 @@
-# Configuration file for the httpd service.
-
-OPTIONS="-f @@install_dir@@/etc/panda/panda_server-httpd.conf"
-
-# for FastCGI/WSGI
-#OPTIONS="-f @@install_dir@@/etc/panda/panda_server-httpd-FastCGI.conf"
-#HTTPD='/usr/sbin/httpd'
-
-# for DQ2
-export X509_CERT_DIR=/etc/grid-security/certificates
-export RUCIO_ACCOUNT=panda
-export RUCIO_APPID=pandasrv
-
-# panda home
-export PANDA_HOME=@@install_dir@@
-
-# timezone
-export TZ=UTC
-
-# import panda modules
-export PYTHONPATH=@@install_purelib@@/pandacommon:@@install_purelib@@/pandaserver
-
-# avoid using AFS
-export HOME=/home/atlpan
-
-# set user's proxy
-export X509_USER_PROXY=FIXME
-
-# panda server URLs
-export PANDA_URL='http://localhost:25080/server/panda'
-export PANDA_URL_SSL='https://localhost:25443/server/panda'
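
Before starting the service it can be useful to confirm that this sysconfig has actually been edited and exported, in particular that X509_USER_PROXY no longer says FIXME. A minimal, illustrative check using only the variable names from the template above:

# Illustrative sanity check for the environment exported by
# panda_server-sysconfig; the variable names come from the template above,
# the check itself is not part of the server.
import os
import sys

REQUIRED = ["PANDA_HOME", "X509_USER_PROXY", "X509_CERT_DIR", "PYTHONPATH"]

def check_sysconfig_env():
    missing = [name for name in REQUIRED if not os.environ.get(name)]
    if missing:
        sys.exit("missing environment variables: %s" % ", ".join(missing))
    proxy = os.environ["X509_USER_PROXY"]
    if proxy == "FIXME" or not os.path.exists(proxy):
        sys.exit("X509_USER_PROXY does not point to an existing proxy file: %s" % proxy)

if __name__ == "__main__":
    check_sysconfig_env()
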
diff --git a/current/templates/panda_server-tmpwatch.sh.exe.template b/current/templates/panda_server-tmpwatch.sh.exe.template
deleted file mode 100644
index 40fbd2711..000000000
--- a/current/templates/panda_server-tmpwatch.sh.exe.template
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-# import env vars from sysconfig
-source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
-
-python @@install_purelib@@/pandaserver/test/tmpwatch.py
diff --git a/current/templates/panda_server-vomsrenew.sh.exe.template b/current/templates/panda_server-vomsrenew.sh.exe.template
deleted file mode 100755
index c4771655e..000000000
--- a/current/templates/panda_server-vomsrenew.sh.exe.template
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-source /etc/profile.d/grid-env.sh
-
-NOVOMS=/data/atlpan/x509up_u25606_novoms
-
-voms-proxy-init -voms atlas:/atlas/Role=production -out /data/atlpan/x509up_u25606 -valid 96:00 -cert=$NOVOMS
-
-# check lifetime of certificate
-grid-proxy-info -e -h 504 -f $NOVOMS
-if [ $? -ne 0 ]; then
- echo $NOVOMS expires in 3 weeks on `hostname` | mail -s "WARNING : Grid certificate expires soon on panda server" atlas-adc-panda-support@cern.ch
-fi
-
-# check lifetime of proxy
-voms-proxy-info -exists -hours 72 -file /data/atlpan/x509up_u25606
-if [ $? -ne 0 ]; then
- echo /data/atlpan/x509up_u25606 expires in 3 days on `hostname` | mail -s "WARNING : Grid proxy expires soon on panda server" atlas-adc-panda-support@cern.ch,atlas-adc-expert@cern.ch
-fi
-
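Both checks above rely only on the exit codes of grid-proxy-info and voms-proxy-info. The same test can be driven from Python if needed; the sketch below shells out to the same command with the same options and is illustrative only.

# Sketch: the proxy-lifetime check from the template above, driven from Python.
# It calls the same voms-proxy-info command and only inspects the exit code,
# so it assumes the grid tools are already on PATH.
import subprocess

def proxy_valid_for(proxy_file, hours):
    # exit code 0 means the proxy in proxy_file is still valid for <hours> hours
    ret = subprocess.call(
        ["voms-proxy-info", "-exists", "-hours", str(hours), "-file", proxy_file]
    )
    return ret == 0

# if not proxy_valid_for("/data/atlpan/x509up_u25606", 72):
#     send the warning mail, as the template does above
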
diff --git a/current/templates/panda_server.cfg.rpmnew.template b/current/templates/panda_server.cfg.rpmnew.template
deleted file mode 100644
index f3274cec3..000000000
--- a/current/templates/panda_server.cfg.rpmnew.template
+++ /dev/null
@@ -1,258 +0,0 @@
-[server]
-
-
-##########################
-#
-# Logger parameters
-#
-
-# log directory
-logdir=@@install_dir@@/var/log/panda
-
-# logger name
-loggername = prod
-
-
-
-##########################
-#
-# Transaction parameters
-#
-
-# lock file for getJobs
-lockfile_getJobs = %(logdir)s/getJobs.lock
-
-# lock file for getSerialNumber
-lockfile_getSN = %(logdir)s/getSN.lock
-
-# lock file for accessing email DB
-lockfile_getMail = %(logdir)s/getMail.lock
-
-# lock file for updateDatasetStatus
-lockfile_setDS = %(logdir)s/setDS.lock
-
-# lock file for getCloudTask
-lockfile_getCT = %(logdir)s/getCT.lock
-
-# lock file for uuidgen
-lockfile_getUU = %(logdir)s/getUU.lock
-
-
-
-##########################
-#
-# DA parameters
-#
-
-# cache space
-cache_dir = @@install_dir@@/var/cache/pandaserver
-
-
-
-##########################
-#
-# DDM parameters
-#
-
-# dq2 dir
-dq2_dir = /opt/dq2
-
-# globus dir
-globus_dir = /opt/globus
-
-# path to native python
-native_python = /data/atlpan/bin
-
-# path to python for lfc client (/data/atlpan/bin/python cannot be used due to lack of libpythonX.Y.so)
-native_python32 = /usr/bin
-
-# glite source file
-glite_source = /opt/glite/etc/profile.d/grid-env.sh
-
-# location for Panda common
-pandaCommon_dir = @@install_purelib@@/pandacommon
-
-# location for Panda server
-pandaPython_dir = @@install_purelib@@/pandaserver
-
-# location for LFCclient
-lfcClient_dir = %(pandaPython_dir)s/brokerage
-
-# home dir to change CWD
-home_dir_cwd = /home/atlpan
-
-
-
-##########################
-#
-# Database parameters
-#
-
-# host
-dbhost = ADCR_PANDA
-
-# user
-dbuser = ATLAS_PANDA_WRITER
-
-# password
-dbpasswd = FIXME
-
-# database
-dbname = PandaDB
-
-# number of connections
-nDBConnection = 2
-
-# number of connections for FastCGI/WSGI
-nDBConForFastCGIWSGI = 1
-
-# use timeout
-usedbtimeout = True
-
-# timeout value
-dbtimeout = 300
-
-# verbose in bridge
-dbbridgeverbose = False
-
-# SQL dumper
-dump_sql = False
-
-
-
-##########################
-#
-# Panda server parameters
-#
-
-# port
-pserverport = 25443
-
-
-
-##########################
-#
-# proxy parameters
-#
-
-# http
-httpProxy = ""
-
-
-
-##########################
-#
-# E-mail DB parameters
-#
-
-# database name for local caching
-emailDB = %(logdir)s/email_db
-
-# SMTP server
-emailSMTPsrv = cernmx.cern.ch
-
-# sender address for notification
-emailSender = atlpan@cern.ch
-
-# login name for SMTP
-emailLogin = atlpan
-
-# login password for SMTP
-emailPass = FIXME
-
-
-
-##########################
-#
-# parameters for dynamic task assignment
-#
-
-# enable dynamic task assignment
-enableDynamicTA = True
-
-
-
-##########################
-#
-# parameters for redirection service
-#
-
-# enable redirection service
-enableRedirection = False
-
-
-
-##########################
-#
-# parameters for FastCGI/WSGI
-#
-
-# use FastCGI with flup
-useFastCGI = False
-
-# use WSGI without flup
-useWSGI = True
-
-# verbose in entry point
-entryVerbose = False
-
-
-
-##########################
-#
-# parameters for memcached
-#
-
-# use memcached
-memcached_enable = True
-
-# memcached servers
-memcached_srvs = voatlas248.cern.ch:11211,voatlas249.cern.ch:11211,voatlas250.cern.ch:11211,voatlas251.cern.ch:11211,voatlas252.cern.ch:11211,voatlas253.cern.ch:11211
-
-# expiration time in memcached
-memcached_exptime = 86400
-
-
-
-##########################
-#
-# nRunning parameters
-#
-
-# interval
-nrun_interval = 5
-
-# the number of hosts
-nrun_hosts = 3
-
-# serial number
-nrun_snum = 999
-
-
-
-##########################
-#
-# Cassandra
-#
-
-# use Cassandra for PandaCache
-cacheUseCassandra = False
-
-# ignore Cassandra error
-cacheIgnoreCassandraError = True
-
-# keyspace for PandaCache
-cacheKeySpace = PandaCacheKeySpace
-
-# column family for files
-cacheFileTable = FileTable
-
-
-
-##########################
-#
-# Job Status Monitor
-#
-
-# enable job status change monitoring
-record_statuschange = False
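
Entries such as lockfile_getJobs = %(logdir)s/getJobs.lock use INI-style interpolation: %(logdir)s expands to the logdir value of the same [server] section once the @@...@@ placeholders have been filled in at install time. The server has its own config loader, but the interpolation can be illustrated with the standard library; the path below is a placeholder for the rendered file.

# Illustrative sketch: read a rendered panda_server.cfg with the standard
# library. BasicInterpolation resolves %(logdir)s-style references against
# other keys in the same [server] section. The path is a placeholder.
import configparser

cfg = configparser.ConfigParser(interpolation=configparser.BasicInterpolation())
cfg.read("INSTALLDIR/etc/panda/panda_server.cfg")

server = cfg["server"]
print(server["logdir"])             # .../var/log/panda
print(server["lockfile_getJobs"])   # %(logdir)s expands to .../getJobs.lock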