From 37ae8ecd091879066ef9ae184efc1ad1c7a88fa9 Mon Sep 17 00:00:00 2001
From: tmaeno
Date: Tue, 2 Jul 2013 11:51:19 +0000
Subject: [PATCH] 0.0.17

---
 current/INSTALL.txt | 137 +
 current/MANIFEST.in | 2 +
 current/README.txt | 967 ++
 current/pandaserver/__init__.py | 0
 current/pandaserver/brokerage/ErrorCode.py | 9 +
 current/pandaserver/brokerage/LFCclient.py | 152 +
 current/pandaserver/brokerage/PandaSiteIDs.py | 198 +
 current/pandaserver/brokerage/SiteMapper.py | 205 +
 current/pandaserver/brokerage/VomsResolver.py | 56 +
 current/pandaserver/brokerage/__init__.py | 0
 current/pandaserver/brokerage/broker.py | 1684 +++
 current/pandaserver/brokerage/broker_util.py | 399 +
 current/pandaserver/config/__init__.py | 0
 current/pandaserver/config/panda_config.py | 33 +
 current/pandaserver/dataservice/Activator.py | 47 +
 current/pandaserver/dataservice/Adder.py | 742 ++
 current/pandaserver/dataservice/Adder2.py | 1014 ++
 .../pandaserver/dataservice/AddressFinder.py | 308 +
 current/pandaserver/dataservice/Closer.py | 290 +
 current/pandaserver/dataservice/DDM.py | 344 +
 current/pandaserver/dataservice/DDMHandler.py | 48 +
 .../pandaserver/dataservice/DataService.py | 99 +
 .../dataservice/DataServiceUtils.py | 281 +
 .../dataservice/DynDataDistributer.py | 1657 +++
 current/pandaserver/dataservice/ErrorCode.py | 16 +
 .../pandaserver/dataservice/EventPicker.py | 288 +
 current/pandaserver/dataservice/Finisher.py | 178 +
 current/pandaserver/dataservice/MailUtils.py | 103 +
 current/pandaserver/dataservice/Merger.py | 692 +
 current/pandaserver/dataservice/Notifier.py | 396 +
 .../pandaserver/dataservice/ProcessLimiter.py | 54 +
 current/pandaserver/dataservice/RetryMaker.py | 125 +
 current/pandaserver/dataservice/Setupper.py | 2420 ++++
 current/pandaserver/dataservice/TaLauncher.py | 55 +
 .../pandaserver/dataservice/TaskAssigner.py | 1180 ++
 current/pandaserver/dataservice/Waker.py | 55 +
 current/pandaserver/dataservice/__init__.py | 0
 .../dataservice/countGuidsClient.py | 72 +
 .../pandaserver/dataservice/datriHandler.py | 207 +
 .../dataservice/eventLookupClient.py | 201 +
 .../pandaserver/dataservice/forkSetupper.py | 74 +
 .../pandaserver/jobdispatcher/ErrorCode.py | 11 +
 .../jobdispatcher/JobDispatcher.py | 541 +
 current/pandaserver/jobdispatcher/Protocol.py | 212 +
 current/pandaserver/jobdispatcher/Watcher.py | 172 +
 current/pandaserver/jobdispatcher/__init__.py | 0
 current/pandaserver/server/panda.py | 180 +
 .../taskbuffer/ArchiveDBProxyPool.py | 55 +
 current/pandaserver/taskbuffer/CloudSpec.py | 27 +
 .../pandaserver/taskbuffer/CloudTaskSpec.py | 99 +
 current/pandaserver/taskbuffer/CloudURLMap.py | 36 +
 current/pandaserver/taskbuffer/ConBridge.py | 502 +
 current/pandaserver/taskbuffer/DBProxy.py | 3066 +++++
 current/pandaserver/taskbuffer/DBProxyPool.py | 88 +
 current/pandaserver/taskbuffer/DatasetSpec.py | 118 +
 current/pandaserver/taskbuffer/ErrorCode.py | 37 +
 current/pandaserver/taskbuffer/FileSpec.py | 213 +
 current/pandaserver/taskbuffer/Initializer.py | 46 +
 current/pandaserver/taskbuffer/JobSpec.py | 239 +
 current/pandaserver/taskbuffer/LogDBProxy.py | 790 ++
 .../pandaserver/taskbuffer/LogDBProxyPool.py | 52 +
 current/pandaserver/taskbuffer/MemProxy.py | 205 +
 current/pandaserver/taskbuffer/OraDBProxy.py | 10739 ++++++++++++++++
 .../pandaserver/taskbuffer/OraLogDBProxy.py | 727 ++
 current/pandaserver/taskbuffer/PrioUtil.py | 4 +
 .../pandaserver/taskbuffer/ProcessGroups.py | 101 +
 current/pandaserver/taskbuffer/SQLDumper.py | 22 +
 current/pandaserver/taskbuffer/SiteSpec.py | 31 +
current/pandaserver/taskbuffer/TaskBuffer.py | 2294 ++++ current/pandaserver/taskbuffer/Utils.py | 512 + .../pandaserver/taskbuffer/WrappedPickle.py | 38 + current/pandaserver/taskbuffer/__init__.py | 0 current/pandaserver/test/XrdAna.py | 59 + current/pandaserver/test/XrdTest.py | 65 + current/pandaserver/test/activateBNL.py | 63 + current/pandaserver/test/activateDefJobs.py | 36 + current/pandaserver/test/activateDefJobs.sh | 10 + current/pandaserver/test/activateJobs.py | 41 + current/pandaserver/test/activator.py | 24 + current/pandaserver/test/add.py | 434 + current/pandaserver/test/add.sh | 9 + current/pandaserver/test/aho.xml | 21 + current/pandaserver/test/analysis.py | 78 + current/pandaserver/test/analyzeLog.py | 55 + current/pandaserver/test/archivelogs.py | 45 + current/pandaserver/test/backupJobArch.py | 176 + current/pandaserver/test/banUser.py | 41 + current/pandaserver/test/boostPrio.py | 20 + current/pandaserver/test/boostUser.py | 34 + current/pandaserver/test/callbackDDM.py | 12 + current/pandaserver/test/checkGetJob.py | 18 + current/pandaserver/test/checkSetupper.py | 31 + current/pandaserver/test/cl_testEvgen.py | 70 + current/pandaserver/test/cl_testG4sim.py | 120 + current/pandaserver/test/cl_testMXreco.py | 112 + current/pandaserver/test/cleanup.py | 10 + current/pandaserver/test/closeDS.py | 55 + current/pandaserver/test/copyArchive.py | 1653 +++ current/pandaserver/test/copyArchive.sh | 9 + current/pandaserver/test/copyROOT.py | 81 + .../pandaserver/test/createPandaSiteIDs.py | 54 + current/pandaserver/test/datasetManager.py | 924 ++ current/pandaserver/test/deleteJobs.py | 175 + current/pandaserver/test/directSubmit.py | 163 + current/pandaserver/test/distributeDefJobs.py | 53 + current/pandaserver/test/dq2cr.py | 45 + current/pandaserver/test/emailfix.py | 16 + current/pandaserver/test/evpPD2P.py | 156 + current/pandaserver/test/execute.py | 66 + .../pandaserver/test/fileCallbackListener.py | 253 + current/pandaserver/test/fileClean.py | 145 + current/pandaserver/test/finishJob.py | 74 + current/pandaserver/test/getJobs.py | 54 + current/pandaserver/test/input.data | 2 + current/pandaserver/test/installSW.py | 83 + current/pandaserver/test/killDefJobs.py | 26 + current/pandaserver/test/killJob.py | 36 + current/pandaserver/test/killJobLowPrio.py | 86 + current/pandaserver/test/killJobsInTask.py | 53 + current/pandaserver/test/killProdJobs.py | 30 + current/pandaserver/test/killTask.py | 53 + current/pandaserver/test/killUser.py | 71 + current/pandaserver/test/killWaiting.py | 35 + current/pandaserver/test/logrotate.sh | 3 + current/pandaserver/test/missing.py | 43 + current/pandaserver/test/pandadb.sql | 430 + current/pandaserver/test/pandameta.sql | 97 + current/pandaserver/test/pcron.sh | 37 + current/pandaserver/test/pdq2_cr | 159 + current/pandaserver/test/plot.py | 51 + current/pandaserver/test/prioryMassage.py | 364 + current/pandaserver/test/proxy.sh | 15 + current/pandaserver/test/reassignDefJobs.py | 63 + current/pandaserver/test/reassignJobs.py | 14 + current/pandaserver/test/reassignSite.py | 64 + current/pandaserver/test/reassignTask.py | 60 + current/pandaserver/test/reassignWaiting.py | 39 + current/pandaserver/test/redirectLog.py | 40 + current/pandaserver/test/redirectLog.sh | 11 + current/pandaserver/test/resubmitJobs.py | 14 + current/pandaserver/test/runMerger.py | 219 + current/pandaserver/test/runRebro.py | 198 + current/pandaserver/test/setPriority.py | 30 + current/pandaserver/test/testDB.py | 88 + current/pandaserver/test/testDQ.py | 102 
+ current/pandaserver/test/testEvgen.py | 59 + current/pandaserver/test/testEvgen14.py | 59 + current/pandaserver/test/testEvgen15.py | 57 + current/pandaserver/test/testEvgen16.py | 57 + current/pandaserver/test/testEvgen17.py | 58 + current/pandaserver/test/testFinder.py | 69 + current/pandaserver/test/testG4sim.py | 83 + current/pandaserver/test/testG4sim15.py | 88 + current/pandaserver/test/testG4sim16.py | 88 + current/pandaserver/test/testG4sim17.py | 88 + current/pandaserver/test/testGetJobStatus.py | 17 + current/pandaserver/test/testMultiTRF.py | 95 + current/pandaserver/test/testReco.py | 106 + current/pandaserver/test/testRepro.py | 116 + current/pandaserver/test/testScript.py | 45 + current/pandaserver/test/testSimul13.py | 81 + current/pandaserver/test/testSimulReco14.py | 101 + current/pandaserver/test/testSiteMap.py | 23 + current/pandaserver/test/testTB.py | 145 + current/pandaserver/test/testTaskA2.py | 64 + current/pandaserver/test/testUser.py | 44 + current/pandaserver/test/testWait.py | 119 + current/pandaserver/test/tmpwatch.py | 47 + current/pandaserver/test/update.sh | 19 + current/pandaserver/test/valConf.py | 15 + current/pandaserver/userinterface/Client.py | 880 ++ .../pandaserver/userinterface/RbLauncher.py | 52 + current/pandaserver/userinterface/ReBroker.py | 1022 ++ current/pandaserver/userinterface/UserIF.py | 1570 +++ current/pandaserver/userinterface/__init__.py | 0 .../pandaserver/userinterface/runReBroker.py | 70 + current/setup.cfg | 7 + current/setup.py | 203 + .../panda_server-add.sh.exe.template | 12 + .../panda_server-archivelog.sh.exe.template | 6 + ...panda_server-backupJobArch.sh.exe.template | 6 + .../panda_server-boostUser.sh.exe.template | 6 + .../panda_server-callback.sh.exe.template | 9 + .../panda_server-copyArchive.sh.exe.template | 9 + .../panda_server-copyROOT.sh.exe.template | 6 + .../templates/panda_server-ctl.exe.template | 139 + ...anda_server-datasetManager.sh.exe.template | 9 + .../panda_server-deleteJobs.sh.exe.template | 6 + .../panda_server-evpPD2P.sh.exe.template | 9 + .../panda_server-grid-env.sh.template | 3 + ..._server-httpd-FastCGI.conf.rpmnew.template | 177 + .../panda_server-httpd.conf.rpmnew.template | 141 + .../templates/panda_server-logrotate.template | 14 + .../panda_server-makeSlsXml.exe.template | 334 + .../panda_server-merge.sh.exe.template | 9 + .../panda_server-priority.sh.exe.template | 6 + .../panda_server-runRebro.sh.exe.template | 9 + .../panda_server-sysconfig.rpmnew.template | 31 + .../panda_server-tmpwatch.sh.exe.template | 6 + .../panda_server-vomsrenew.sh.exe.template | 20 + .../panda_server.cfg.rpmnew.template | 258 + 201 files changed, 50893 insertions(+) create mode 100644 current/INSTALL.txt create mode 100644 current/MANIFEST.in create mode 100644 current/README.txt create mode 100644 current/pandaserver/__init__.py create mode 100644 current/pandaserver/brokerage/ErrorCode.py create mode 100755 current/pandaserver/brokerage/LFCclient.py create mode 100644 current/pandaserver/brokerage/PandaSiteIDs.py create mode 100644 current/pandaserver/brokerage/SiteMapper.py create mode 100644 current/pandaserver/brokerage/VomsResolver.py create mode 100755 current/pandaserver/brokerage/__init__.py create mode 100755 current/pandaserver/brokerage/broker.py create mode 100755 current/pandaserver/brokerage/broker_util.py create mode 100644 current/pandaserver/config/__init__.py create mode 100755 current/pandaserver/config/panda_config.py create mode 100755 current/pandaserver/dataservice/Activator.py create mode 
100755 current/pandaserver/dataservice/Adder.py create mode 100644 current/pandaserver/dataservice/Adder2.py create mode 100644 current/pandaserver/dataservice/AddressFinder.py create mode 100755 current/pandaserver/dataservice/Closer.py create mode 100755 current/pandaserver/dataservice/DDM.py create mode 100755 current/pandaserver/dataservice/DDMHandler.py create mode 100755 current/pandaserver/dataservice/DataService.py create mode 100644 current/pandaserver/dataservice/DataServiceUtils.py create mode 100644 current/pandaserver/dataservice/DynDataDistributer.py create mode 100755 current/pandaserver/dataservice/ErrorCode.py create mode 100644 current/pandaserver/dataservice/EventPicker.py create mode 100755 current/pandaserver/dataservice/Finisher.py create mode 100755 current/pandaserver/dataservice/MailUtils.py create mode 100644 current/pandaserver/dataservice/Merger.py create mode 100755 current/pandaserver/dataservice/Notifier.py create mode 100644 current/pandaserver/dataservice/ProcessLimiter.py create mode 100755 current/pandaserver/dataservice/RetryMaker.py create mode 100755 current/pandaserver/dataservice/Setupper.py create mode 100755 current/pandaserver/dataservice/TaLauncher.py create mode 100644 current/pandaserver/dataservice/TaskAssigner.py create mode 100755 current/pandaserver/dataservice/Waker.py create mode 100755 current/pandaserver/dataservice/__init__.py create mode 100644 current/pandaserver/dataservice/countGuidsClient.py create mode 100644 current/pandaserver/dataservice/datriHandler.py create mode 100644 current/pandaserver/dataservice/eventLookupClient.py create mode 100755 current/pandaserver/dataservice/forkSetupper.py create mode 100755 current/pandaserver/jobdispatcher/ErrorCode.py create mode 100755 current/pandaserver/jobdispatcher/JobDispatcher.py create mode 100755 current/pandaserver/jobdispatcher/Protocol.py create mode 100755 current/pandaserver/jobdispatcher/Watcher.py create mode 100755 current/pandaserver/jobdispatcher/__init__.py create mode 100755 current/pandaserver/server/panda.py create mode 100644 current/pandaserver/taskbuffer/ArchiveDBProxyPool.py create mode 100644 current/pandaserver/taskbuffer/CloudSpec.py create mode 100644 current/pandaserver/taskbuffer/CloudTaskSpec.py create mode 100644 current/pandaserver/taskbuffer/CloudURLMap.py create mode 100644 current/pandaserver/taskbuffer/ConBridge.py create mode 100755 current/pandaserver/taskbuffer/DBProxy.py create mode 100755 current/pandaserver/taskbuffer/DBProxyPool.py create mode 100755 current/pandaserver/taskbuffer/DatasetSpec.py create mode 100755 current/pandaserver/taskbuffer/ErrorCode.py create mode 100755 current/pandaserver/taskbuffer/FileSpec.py create mode 100644 current/pandaserver/taskbuffer/Initializer.py create mode 100755 current/pandaserver/taskbuffer/JobSpec.py create mode 100755 current/pandaserver/taskbuffer/LogDBProxy.py create mode 100755 current/pandaserver/taskbuffer/LogDBProxyPool.py create mode 100644 current/pandaserver/taskbuffer/MemProxy.py create mode 100755 current/pandaserver/taskbuffer/OraDBProxy.py create mode 100755 current/pandaserver/taskbuffer/OraLogDBProxy.py create mode 100644 current/pandaserver/taskbuffer/PrioUtil.py create mode 100644 current/pandaserver/taskbuffer/ProcessGroups.py create mode 100644 current/pandaserver/taskbuffer/SQLDumper.py create mode 100644 current/pandaserver/taskbuffer/SiteSpec.py create mode 100755 current/pandaserver/taskbuffer/TaskBuffer.py create mode 100755 current/pandaserver/taskbuffer/Utils.py create mode 
100644 current/pandaserver/taskbuffer/WrappedPickle.py create mode 100755 current/pandaserver/taskbuffer/__init__.py create mode 100755 current/pandaserver/test/XrdAna.py create mode 100755 current/pandaserver/test/XrdTest.py create mode 100755 current/pandaserver/test/activateBNL.py create mode 100755 current/pandaserver/test/activateDefJobs.py create mode 100755 current/pandaserver/test/activateDefJobs.sh create mode 100755 current/pandaserver/test/activateJobs.py create mode 100755 current/pandaserver/test/activator.py create mode 100755 current/pandaserver/test/add.py create mode 100755 current/pandaserver/test/add.sh create mode 100755 current/pandaserver/test/aho.xml create mode 100755 current/pandaserver/test/analysis.py create mode 100755 current/pandaserver/test/analyzeLog.py create mode 100644 current/pandaserver/test/archivelogs.py create mode 100755 current/pandaserver/test/backupJobArch.py create mode 100644 current/pandaserver/test/banUser.py create mode 100755 current/pandaserver/test/boostPrio.py create mode 100755 current/pandaserver/test/boostUser.py create mode 100755 current/pandaserver/test/callbackDDM.py create mode 100644 current/pandaserver/test/checkGetJob.py create mode 100644 current/pandaserver/test/checkSetupper.py create mode 100644 current/pandaserver/test/cl_testEvgen.py create mode 100644 current/pandaserver/test/cl_testG4sim.py create mode 100644 current/pandaserver/test/cl_testMXreco.py create mode 100644 current/pandaserver/test/cleanup.py create mode 100755 current/pandaserver/test/closeDS.py create mode 100755 current/pandaserver/test/copyArchive.py create mode 100755 current/pandaserver/test/copyArchive.sh create mode 100644 current/pandaserver/test/copyROOT.py create mode 100644 current/pandaserver/test/createPandaSiteIDs.py create mode 100644 current/pandaserver/test/datasetManager.py create mode 100755 current/pandaserver/test/deleteJobs.py create mode 100755 current/pandaserver/test/directSubmit.py create mode 100755 current/pandaserver/test/distributeDefJobs.py create mode 100755 current/pandaserver/test/dq2cr.py create mode 100755 current/pandaserver/test/emailfix.py create mode 100644 current/pandaserver/test/evpPD2P.py create mode 100755 current/pandaserver/test/execute.py create mode 100644 current/pandaserver/test/fileCallbackListener.py create mode 100755 current/pandaserver/test/fileClean.py create mode 100755 current/pandaserver/test/finishJob.py create mode 100755 current/pandaserver/test/getJobs.py create mode 100755 current/pandaserver/test/input.data create mode 100755 current/pandaserver/test/installSW.py create mode 100755 current/pandaserver/test/killDefJobs.py create mode 100755 current/pandaserver/test/killJob.py create mode 100755 current/pandaserver/test/killJobLowPrio.py create mode 100755 current/pandaserver/test/killJobsInTask.py create mode 100755 current/pandaserver/test/killProdJobs.py create mode 100755 current/pandaserver/test/killTask.py create mode 100644 current/pandaserver/test/killUser.py create mode 100755 current/pandaserver/test/killWaiting.py create mode 100755 current/pandaserver/test/logrotate.sh create mode 100755 current/pandaserver/test/missing.py create mode 100644 current/pandaserver/test/pandadb.sql create mode 100644 current/pandaserver/test/pandameta.sql create mode 100755 current/pandaserver/test/pcron.sh create mode 100755 current/pandaserver/test/pdq2_cr create mode 100755 current/pandaserver/test/plot.py create mode 100644 current/pandaserver/test/prioryMassage.py create mode 100755 
current/pandaserver/test/proxy.sh create mode 100755 current/pandaserver/test/reassignDefJobs.py create mode 100755 current/pandaserver/test/reassignJobs.py create mode 100644 current/pandaserver/test/reassignSite.py create mode 100644 current/pandaserver/test/reassignTask.py create mode 100755 current/pandaserver/test/reassignWaiting.py create mode 100755 current/pandaserver/test/redirectLog.py create mode 100755 current/pandaserver/test/redirectLog.sh create mode 100755 current/pandaserver/test/resubmitJobs.py create mode 100644 current/pandaserver/test/runMerger.py create mode 100755 current/pandaserver/test/runRebro.py create mode 100755 current/pandaserver/test/setPriority.py create mode 100755 current/pandaserver/test/testDB.py create mode 100755 current/pandaserver/test/testDQ.py create mode 100755 current/pandaserver/test/testEvgen.py create mode 100755 current/pandaserver/test/testEvgen14.py create mode 100755 current/pandaserver/test/testEvgen15.py create mode 100755 current/pandaserver/test/testEvgen16.py create mode 100755 current/pandaserver/test/testEvgen17.py create mode 100644 current/pandaserver/test/testFinder.py create mode 100755 current/pandaserver/test/testG4sim.py create mode 100644 current/pandaserver/test/testG4sim15.py create mode 100644 current/pandaserver/test/testG4sim16.py create mode 100644 current/pandaserver/test/testG4sim17.py create mode 100755 current/pandaserver/test/testGetJobStatus.py create mode 100755 current/pandaserver/test/testMultiTRF.py create mode 100755 current/pandaserver/test/testReco.py create mode 100755 current/pandaserver/test/testRepro.py create mode 100755 current/pandaserver/test/testScript.py create mode 100644 current/pandaserver/test/testSimul13.py create mode 100644 current/pandaserver/test/testSimulReco14.py create mode 100755 current/pandaserver/test/testSiteMap.py create mode 100755 current/pandaserver/test/testTB.py create mode 100755 current/pandaserver/test/testTaskA2.py create mode 100755 current/pandaserver/test/testUser.py create mode 100755 current/pandaserver/test/testWait.py create mode 100644 current/pandaserver/test/tmpwatch.py create mode 100755 current/pandaserver/test/update.sh create mode 100644 current/pandaserver/test/valConf.py create mode 100755 current/pandaserver/userinterface/Client.py create mode 100755 current/pandaserver/userinterface/RbLauncher.py create mode 100644 current/pandaserver/userinterface/ReBroker.py create mode 100755 current/pandaserver/userinterface/UserIF.py create mode 100644 current/pandaserver/userinterface/__init__.py create mode 100755 current/pandaserver/userinterface/runReBroker.py create mode 100644 current/setup.cfg create mode 100755 current/setup.py create mode 100755 current/templates/panda_server-add.sh.exe.template create mode 100755 current/templates/panda_server-archivelog.sh.exe.template create mode 100644 current/templates/panda_server-backupJobArch.sh.exe.template create mode 100755 current/templates/panda_server-boostUser.sh.exe.template create mode 100755 current/templates/panda_server-callback.sh.exe.template create mode 100755 current/templates/panda_server-copyArchive.sh.exe.template create mode 100755 current/templates/panda_server-copyROOT.sh.exe.template create mode 100755 current/templates/panda_server-ctl.exe.template create mode 100644 current/templates/panda_server-datasetManager.sh.exe.template create mode 100644 current/templates/panda_server-deleteJobs.sh.exe.template create mode 100755 current/templates/panda_server-evpPD2P.sh.exe.template create mode 
100644 current/templates/panda_server-grid-env.sh.template
 create mode 100644 current/templates/panda_server-httpd-FastCGI.conf.rpmnew.template
 create mode 100644 current/templates/panda_server-httpd.conf.rpmnew.template
 create mode 100644 current/templates/panda_server-logrotate.template
 create mode 100755 current/templates/panda_server-makeSlsXml.exe.template
 create mode 100755 current/templates/panda_server-merge.sh.exe.template
 create mode 100755 current/templates/panda_server-priority.sh.exe.template
 create mode 100755 current/templates/panda_server-runRebro.sh.exe.template
 create mode 100644 current/templates/panda_server-sysconfig.rpmnew.template
 create mode 100644 current/templates/panda_server-tmpwatch.sh.exe.template
 create mode 100755 current/templates/panda_server-vomsrenew.sh.exe.template
 create mode 100644 current/templates/panda_server.cfg.rpmnew.template

diff --git a/current/INSTALL.txt b/current/INSTALL.txt
new file mode 100644
index 000000000..e3358f664
--- /dev/null
+++ b/current/INSTALL.txt
@@ -0,0 +1,137 @@
+Installation
+--------------------
+
+1. Checkout panda-common and panda-server.
+
+$ svn co svn+ssh://svn.cern.ch/reps/panda/panda-common/tags/X.Y.Z panda-common
+$ svn co svn+ssh://svn.cern.ch/reps/panda/panda-server/tags/A.B.C panda-server
+
+* For tar-ball installation
+
+$ cd panda-common
+$ python setup.py install --prefix=INSTALLDIR
+$ cd ../panda-server
+$ python setup.py install --prefix=INSTALLDIR
+
+where INSTALLDIR is /data/atlpan/testsrv, for example.
+
+* For RPM installation
+
+$ cd panda-common
+$ python setup.py bdist_rpm
+$ sudo rpm -Uvh dist/panda-common-*.noarch.rpm
+$ cd ../panda-server
+$ python setup.py bdist_rpm
+$ sudo rpm -Uvh dist/panda-server-*.noarch.rpm
+
+INSTALLDIR is set to /data/atlpan/srv automatically for RPMs
+
+
+2. Modify config files
+
+$ cd INSTALLDIR/etc/panda
+$ mv panda_common.cfg.rpmnew panda_common.cfg
+$ mv panda_server.cfg.rpmnew panda_server.cfg
+$ mv panda_server-httpd.conf.rpmnew panda_server-httpd.conf
+$ emacs -nw panda_server.cfg
+
+fix FIXME
+
+dq2_dir = /opt/dq2
+
+->
+
+dq2_dir = /data/atlpan/DQ2Clients/DQ2Clients
+
+$ emacs -nw panda_server-httpd.conf
+
+SSLCertificateFile InstallDir/etc/panda/server.crt
+SSLCertificateKeyFile InstallDir/etc/panda/server.key
+
+->
+
+SSLCertificateFile /etc/httpd/conf/ssl.crt/server.crt
+SSLCertificateKeyFile /etc/httpd/conf/ssl.key/server.key
+
+$ cd INSTALLDIR/etc/sysconfig
+$ mv panda_server-sysconfig.rpmnew panda_server-sysconfig
+$ emacs -nw panda_server-sysconfig
+
+add
+
+export X509_USER_PROXY=/data/atlpan/x509up_u25606
+
+
+3. Add .gacl
+
+$ cd INSTALLDIR/lib/python*/site-packages/pandaserver/server/
+$ emacs -nw .gacl
+
+
+
+
+
+
+
+
+4. Add grid-env.sh if needed
+
+e.g.,
+$ cat INSTALLDIR/etc/grid-env.sh
+export LD_LIBRARY_PATH=/opt/glite/lib64:/opt/globus/lib:/opt/lcg/lib64:$LD_LIBRARY_PATH
+export PYTHONPATH=/opt/glite/lib64/python:/opt/lcg/lib64/python:$PYTHONPATH
+export PATH=/opt/edg/bin:/opt/glite/bin:/opt/globus/bin:/opt/lcg/bin:$PATH
+
+and modify panda_server.cfg
+
+$ emacs -nw INSTALLDIR/etc/panda/panda_server.cfg
+
+glite_source = /opt/glite/etc/profile.d/grid-env.sh
+
+->
+
+glite_source = INSTALLDIR/etc/grid-env.sh
+
+
+5. Make log and cache dirs, and change owner if RPM is used
+
+mkdir -p INSTALLDIR/var/log/panda
+mkdir -p INSTALLDIR/var/log/panda/wsgisocks
+mkdir -p INSTALLDIR/var/cache/pandaserver
+chown atlpan:zp INSTALLDIR/var/log/panda
+chown atlpan:zp INSTALLDIR/var/log/panda/wsgisocks
+chown atlpan:zp INSTALLDIR/var/cache/pandaserver
+
+6. For voatlas
+
+cp ~/devsrv/share/httpd-pandasrv /etc/rc.d/init.d/
+/sbin/chkconfig --add httpd-pandasrv
+cp ~/devsrv/share/panda_server-httpd.conf.VM /data/atlpan/srv/etc/panda/panda_server-httpd.conf
+cp ~/devsrv/share/panda_server.cfg.VM /data/atlpan/srv/etc/panda/panda_server.cfg
+cp ~/devsrv/share/x509up_u25606_novoms /data/atlpan/
+chown atlpan:zp /data/atlpan/x509up_u25606_novoms
+cp ~/devsrv/share/pandasrv /etc/logrotate.d/
+cp ~/devsrv/share/pandasrv.cron /etc/cron.d/
+
+
+Start the server
+--------------------
+
+Add the following to crontab.
+
+0-59/5 * * * * INSTALLDIR/usr/bin/panda_server-add.sh > /dev/null 2>&1
+15 0-21/3 * * * INSTALLDIR/usr/bin/panda_server-copyArchive.sh > /dev/null 2>&1
+
+Run the server.
+
+$ sudo INSTALLDIR/etc/init.d/panda_server-ctl start
+
+Stop the server.
+
+$ sudo INSTALLDIR/etc/init.d/panda_server-ctl stop
+
+
+
+
+
+
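A quick smoke test after "panda_server-ctl start" is to poke one of the server's HTTP entry points from the host itself. This is only a sketch: the 25443 SSL port, the /server/panda alias and the isAlive method are assumptions based on a default panda_server-httpd.conf, so adjust them to the local setup.

$ curl --insecure https://localhost:25443/server/panda/isAlive

A healthy server should answer with a short "alive" message; anything else usually points back at the httpd, proxy or certificate settings from steps 2-4.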
diff --git a/current/MANIFEST.in b/current/MANIFEST.in
new file mode 100644
index 000000000..46666ccfe
--- /dev/null
+++ b/current/MANIFEST.in
@@ -0,0 +1,2 @@
+include *.txt *.py *.cfg
+recursive-include templates *.template
diff --git a/current/README.txt b/current/README.txt
new file mode 100644
index 000000000..7df534fd3
--- /dev/null
+++ b/current/README.txt
@@ -0,0 +1,967 @@
+Release Note
+
+* 0.0.18 (7/2/2013)
+ * tagged for JEDI
+ * fixed datriHandler for SLC6
+ * improved getLFNsInUseForAnal
+ * fixed getScriptOfflineRunning for athenaMP
+ * fixed dispatcher so that install jobs can run on sites with status=test
+ * fixed for ANALY_BNL_SHORT and ANALY_BNL_LONG
+ * included group analysis jobs in priority massage
+ * removed priority boost for group analysis jobs
+ * fixed brokerage to respect preset computingSite even for too many input jobs in cloud with negative t1weight
+
+* 0.0.17 (4/27/2013)
+ * giving a higher prio to install jobs
+ * split runRebro from copyArchived
+ * fixed retryInActive to reset file status
+ * modified dispatcher to send prodSourceLabel for getJob
+ * changed ATLAS_PANDALOG.USERS_ID_SEQ to ATLAS_PANDAMETA.USERS_ID_SEQ
+ * added TaskMonitor link to email notifications
+ * changed getJob() to allow the prod/analy pilot to get installation jobs
+ * fixed retryJobsInActive
+ * fixed datasetManager to delete sub from foreign T1 instead of home T1
+ * improved getDisInUseForAnal
+ * added boostUser
+ * improved fairshare to support per-cloud shares
+ * changed Setupper to register both DATADISK and PRODDISK as locations for sub
+ * changed job/task brokerages not to check DBR with DQ2 at CVMFS sites
+ * changed the brokerage to skip release checks for releases=ANY
+ * fixed for site.priorityoffset
+ * fixed T2 cleanup to check if there is active subscription
+ * fixed brokerage and copyArchive for RU
+ * changed insertNewJob not to insert metadata when it is empty
+ * fixed killUser to kill jobs gradually
+ * fixed Setupper to make dis for pin at MCP sites in ND cloud
+ * fixed Setupper to take cloudconfig.tier1se into account for dis subscriptions
+ * set a limit on G/U in the brokerage
+ * sending more info in PD2P logging
+ * fixed LFC lookup in the brokerage
+ * changed PD2P to be triggered by the second job
+ * removed multiCloudFactor from the brokerage for NL
+ * added a protection to updateJobStatus to prevent holding->transferring
+ * fixed getUserParameter to insert new row if the user is missing
+ * fixed Setupper to trigger prestaging when sites with multi-endpoints use TAPE
+ * put all info to ErrorDiag in the brokerage
+ * added modificationTime constraint to URL sent to the user by
Notifier + * introduced ProcessLimiter + * changed TA to shorten retry interval after refreshing replica info + * skipping file availability check for log datasets in TA + * using cloudconfig.tier1SE to count files at T1 + * setting scope only for ATLAS + * improved the task brokerage to check datasets with fewer replicas first + * set limit on the number of IDs to be sent to the logger for reassign/killJobs + * removed LFC lookup from TA + * changed PD2P to use secondary share + * fixed to use correct DQ2 site ID for pinning at sites with multi-endpoints + * modified to send scopes for output files to the pilot + * added changeJobPriorities + * using DATADISK for MCP T1 input at all T1s except US + * added filespec.scope + * reducing lifetime of dis when corresponding jobs finished and some of them failed + * improved the brokerage to count the number of running jobs per processingType + * using transferringlimit in the brokerage + * fixed the bulk OK file lookup again for unique ddm endpoint sites + * reduced interval of PandaMover reattempts to 15min from 3h + * fixed the bulk OK file lookup in the brokerge for multiple ddm endpoints + * increased the number of PandaMover channels to 15 + * using DATADISK for MCP T1 input at CERN + * using a default fareshare defined per cloud if T2 doesn't define share + * added a protection against overwriting of dataset status by datasetMgr + * implemented a nested fareshare management mechanism + * fixed the brokerage message when release is missing for repro + * fixed TA since replicas at T1 non DATADISK prevented T2 replicas from being used + * using DATADISK for MCP T1 input at ND,ES,DE,NL,TW + * added a patch for MWT2 to associate MWT2_DATADISK in TA + * allowed wildcards in cloudconfig.tier1SE + * fixed Merger for standalone ROOT + * fixed Closer to trigger merging for cancelled jobs + * fixed Setupper to pin DBR as well + * added a protection to Setupper for file lost after job submission + * fixed getHighestPrioJobStatPerPG for group queue + * added group queue to all clouds + * added FOR UPDATE when getting jobdefID for users + * removed hard-coded FZK-LCG2_DATATAPE removal in TA + * set activity=Production to TA subscriptions + * fixed weight reduction in TA for no input tasks + * fixed the brokerage to send message to logger for too many transferring's + * fixed wrong error message in TA when open dataset is incomplete + * updated TA to use a special weight reduction when only TAPE is available + * removed selector from fileCallbackListener + * fixed for TMPDISK + * fixed Setupper to scan T2 LFC per LFC host instead of per SE + * fixed Setupper to use correct location when pinning dis at foreign T1 + * fixed sitemapper to allow multiple DQ2 site IDs to use the same token + * added DQ2 registration time to SLS + * fixed vomsrenew.sh to check certificate and proxy lifetime + * fixed file-check in the brokerage for BNL@non-US + * fixed brokerage not to overwrite file's destSE for destSE=local + * introduced mcore queue in PG + * added iscvmfs to SiteSpec + +* 0.0.16 (8/29/2012) + * changed Setupper to make sub when data is available only at T2 + * changed Setupper to make sub when data is missing at T1 + * change TA to pin input and skip replicas with ToBeDeleted + * using share=secondary for non T2-close-source PD2P + * added useWebCache() to Client + * fixed getJobStatistics not to read archived via http by default + * fixed Adder2 to skip destSE check for ddm=local + * fixed LFCclient to randomly resolve DNS alias for LFC host + * added 
makeSlsXml + * patched smtplib.stderr to send debug info to logger + * added 32/64 to getScriptOfflineRunning + * changed JOBSARCHIVED4_MODTIME_IDX hint + * enabled maxtime check for analysis brokerage + * fixed to check T2 files when get reassigned + * removed hints related to JOBSACTIVE4_JOBSTATUS_IDX + * fixed setOK to check map + * fixed resetDefinedJob for for recordStatusChange + * fixed updateJobStatus not to reset modificationTime of holding jobs + * fixed file check not to use TAPE replicas when T1 is used as T2 + * disabled release check for CERN-RELEASE + * enabled release check for CERN + * removed EVNT from PD2P + * removed the higher priority to phys-higgs + * added _LONG as a suffix of hospital queue + * fixed queryLastFilesInDataset agains missing jobs which are still in fileDB + * added setPriority.py + * fixed updateJobStatus for endTime + * updated the brokerage log to have timestamp + * updated the brokerage to take maxtime into account + * updated file-level callback + * added Job Status Monitor + * added --killUserJobs to killJob.py + * added reliability-based brokerage for analysis jobs + * fixed getDestSE to look into ARCH for sub datasets for failed log files + * fixed rebrokerage when orig replica is set to ToBeDeleted + * temporally gave a higher priority to phys-higgs for ICHEP2012 + * added code=91 to allow prod role to kill user jobs gracefully + * check LFC every hour for high prio transferring jobs + * fixed datasetManager for T2 cleanup by recognizing T1 PRODDISK correctly + * delete sub from PRODDISK except US clous + * added protection to ReBroker against redundant comma in excludedSite + * added fatal errors for datri in Adder2 + * fixed Adder2 for missing src in schedconfig for analysis with destSE + * changed brokeage to make a chunk for each diskCount/memory + * added RbLauncher to run ReBroker in grid env + * added more message to Finisher + * fixed Adder2 for failed jobs to add files to sub + * reduced the number of add.py + * modified getHighestPrioJobStat to calculate per PG + * added --noRunning to killTask + * fixed insertSandboxInfo to use real file size + * added checkSandboxFile + * fixed brokerage for nightlies + * extracting crc from input sandbox in putFile + * added changes for debug mode + * setting prestage sites with PandaMover dynamically + * removed BNL_ATLAS_1 from SiteMapper + * removed FILESTABLE4_DATASET_IDX + * added more info to putFile + * optimized getDisInUseForAnal in TB + * fixed TA to ignore non-DATADISK replicas at T1 + * fixed brokerage for preassigned repro jobs + * fixed dataset update timing check in Notifier + * rixed zero suppression with wildcard in brokerage + * fixed rebro to set the same specialHandling to build since new build may have different specialHandling + * removed old hints + * fixed DataServiceUtils to return an empty map when DQ2Map is set + * using FOR UPDATE in lockJobForReBrokerage + * added more debug INFO to Setupper + * fixed DBProxy not to freeze top datasets for HC when build failed + * fixed anal brokerage to take # of defined jobs into account + * setting RUCIO_ACCOUNT and RUCIO_APPID + * pin dis for foreign T2s in US cloud + * removed special treatment for BNL from Adder + * fixed the brokerage to get hospital queues automatically + * updated brokerage to use coreCount + * fixed Closer not to freeze any HC datasets + * fixed Adder since Register2 gives DatasetExist error when it got deleted + * enabled cap based on priority for CERN + * not reset retried jobs in Watcher + * check 
attemprNr in retryJob + * added double quotas to all params in getScriptOfflineRunning + * added jobMetrics + * added a protection against non-integer PandaID in peekJob + * changed to update only changed attributes in job tables + * fixed runMerge not to be stopped due to a single dataset error + * added debug message for execution time of DQ2(+LFC) registration + * fixed storeJob to reset changed attribute list + * disabled beyond-pledge for HC jobs + * changed to update only changed attributes in filesTable4 + * added nOutputDataFiles and outputFileBytes to job tables + * modified getScriptOfflineRunning to use parallel transfers + * removed shadow lookup in Adder + * disabled sub for computingSite=destinationSE + * added getScriptOfflineRunning + * added retry to Cassandra operations + * changed killing with group prod role not to be case-sensitive + * added getDis/LFNsInUseForAnal + * added getPledgeResourceRatio to TB + * added Cassandra file cache + * added TAG support in EventPicker + * added countGuidsClient + * using SCRIPT_NAME in panda.py + * removed _shadow creation in ReBroker + * fixed queryLastFilesInDataset for the fileTable change + * remove deleting datasets from the Datasets table + * sending error log to the logger when TA cannot find dataset in DQ2 + * sending fsize and checksum to the pilot + * added modificationTime<=CURRENT in getFilesInUseForAnal + * added hint when deleting rows from Datasets + * making larger subs by sorting jobs by site + * instantiating dq2api in each thread + * added hint to use 11g cashing + * removed constraint in TA to consider T1 and T2 equally + * increased the lifetime of the proxy to 96h + * fixed TA to select candidate T2s correctly + * getting shadow info from filesTable + * added vomsrenew.sh + * fixed TA to count the number of files at US T2 + * check attmptNr + * fixed for non-MC/DATA space at split T1 + * fixed TA to check completeness at T2 + * use correct locations for GEN dis when jobs directly go to T2 + * added protection to Adder2 against sites disappearance from schedconfig + * added preferential analysis brokerage based on countryGroup + * added more verbose message in Adder + * Mikhail Titov updated datriHandler + * fixed cloudlist to skip None + * added getJobStatisticsPerUserSite + * added 64bit in copyROOT + * avoid priority reduction for merge jobs + * use <= for maxDiskCount in getJob + * fixed rebrokerage for --destSE + * updated rebrokerage to be triggered 3 hours after the site is blacklisted + * set maxAttempt to allow users to disable auto retry + * changed global file map to local in brokerage + * fixed Adder2 to use proper destination for token=TAPE when running at T1 as T2 + * updated killJob to take group prod role into account + * updated brokerage to take priorities into account for prod jobs + * using native DQ2 call in ToA + * modified brokerage to do bulk LFC lookup per site + * fixed brokerage_util to do LFC lookup per 1000 files instead of 100 files + * fixed brokerageErrorDiag for repro + missingRel + * fixed port of pandamon in email notification + * fixed brokerageErrorDiag for useT2 + repro + * set replica pin lifetime before deleting from T2 + * improved brokerage error diag + * cleaned the brokerage for hospital queues + * use 0 when memory=0 in one of online sites with the same siteID + * fixed the brokerage to use RAL-LCG2_H​IME as UK T1 + * touch input sandbox when tried to be overwritten + * permit overwriting of input sandbox + * reject limited proxy + * added priority boost for 
gangarobot-pft + * fixed getCriteria for aggregated sites + * fixed brokerage for group=any:0% + * fixed brokerage more for type=any:0% + * fixed brokerage to take zero shares into account + * fixed getCriteriaForProdShare for zero shares + * added minPriority to Client.getJobStatisticsPerSite + * using MV in getJobStatisticsWithLabel + * added fairshare to getJob + * fixed retryJob not to change the name of lib.tgz for ptest + * fixed retryJob not to retry buildJob to keep the PandaID order + * fixed TB to give higher prio to buildJob with prodRole + * fixed Merger to use the largest SN for merged files + * fixed queryLastFilesInDataset to ignore merged files + * fixed brokerageErrorDiag for non missing release errors + * added tmpwatch.py + * changed hint in getJobs + * fixed updateProdDBUpdateTime for pending jobs + * fixed brokerage to accept test sites for prod_test jobs + * changed getJobs for test pilots to get gangarobot jobs + * setup glite in TaLuncher + * added lock in lockDatasets + * added version check in Merger to avoid duplicating merge jobs + * changed Merger to fail when container name is too long + * use lockJobsForReassign for reassign in copyArchive + * use native DQ2 in copyArchive and datasetMgr + * use python2.5 for copyArchive and prio-mgr + * use native DQ2 in Setupper + * fixed guid generation for user's log + * introduced 2 staged submission for prod jobs + * using T2 in TA + * using materialized view get getJobStatistics family + * updated Merger to put log files of merge jobs to a separate container + * fixed Merger for --transferredDS + * enabled rebrokerage for processingType=ganga + * updated Adder for unique constraint error + * added copyROOT + * updated Adder to immediately go to failed when subscription failures + * disabled prio boost for gangarobot derivatives + * added protection to TA against undefined maxinputsize + * updated TA and brokerage to use T2 datasets in prod + * updated for DQ2 client 0.1.37 + +* 0.0.15 (11/07/2011) + * removed redundant freshness checks in getSN + * changed hint in getSerialNumber + * randomized job order in adder + * decreased the number of adder processes + * added more tight constraint to getJobStatistics family + * reduced prio by 10 for pilot-retry jobs + * increased the factor of the RW limit to 8000 + * updated Merger for --mexec + * modified rebroekrage to send brokerage log + * modified brokerage to send user's countryGroup and nJobs to logger + * added a protection to httpd.conf for interesting panda.py + * not attach attemptNr to lib.tgz for rc_test+buildJob + * fixed parentID for retryJob with new PandaID + * randomized the order of site check in analysis brokerage + * added --killOwnProdJobs to killJob.py and killJobsInTask.py + * fixed brokerage to require cache=None for release check + * pinning input datasets + * added limitation of exe/pilotErrorDiags in JD + * fixed short->long mapping in retryJob + * generates new PandaID for pilot-retried job + * using negative errorcode for pilot-retry + * added invalid character check to DDM + * fixed the brokerage for --transferredDS + +* 0.0.14 (10/11/2011) + * fixed TaskAssigner for MCshare=0 + * updated brokerage to consider priorities for analysis jobs + * fixed brokerage for BNL_CVMFS_1 + * modified managed pilots to get prod_test as well + * call addShadow even if DaTRI failed + * fixed the error message of location registration in Setupper + * modified ReBroker for server-side retry + * reverted the brokerage change + * changed brokerage to skip sites with 
memory=0 for analysis with memory + * increaded MaxClients + * use DQ2 for foreign T2 in US cloud + * use IN2P3-CC and IN2P3-CC_SGE_VL as FR T1 in brokerage + * unset commandToPilot for jobs reassigned by rebrokerage + * added retryJobsInActive + * added --maxJobs and --running to killJobLowPrio.py + * added killJobLowPrio.py + * fixed killJob + * simplified anal_finalizer + * added SiteSpec.lfcregister + * added getAttr + * keep failed analysis jobs in Active until all jobs finished + +* 0.0.13 (8/30/2011) + * fixed Adder2.removeUnmerged to catch DQ2 errors correctly + * using subType in datasetManager + * filling datasets.subtype + * added protection against too large inputFileBytes + * removed CN=Robot: from DN + * added hint to DBProxy.getLockDatasets + * reduced the number of table scan in datasetMgr and runMerge + * fixed brokerage not to count jobs for usermerge or pandamover + * changed brokerage to use ANALY_CERN_XROOTD and not to use ANALY_CERN + * added Forker to add.py + * updated dispatcher to send taskID + * using schedconfig.multicloud + * fixed brokerage for test sites + * fixed brokerage not to count jobs for HC + * fixed rebrokerage for CERN TMP + * updated the brokerage to stop assigning prod jobs to sites which have many transferring + * added jobdefID to libDS in ReBrokerage + * disabled short -> long for HC + * fixed SiteMapper to respect online even if another queue is not online + * put attempt number to output file name in Merger + * changed = to == in redundant messages + * job-chaining for ptest+prun + * added initLogger to Notifier + * removed redundant suffix from DN for DaTRI request in EventPicker + * added more message in EventPicker for DaTRI request + * changed Notifier to non-thread + * fixed Notifier to take into account old jobs in Arch + * implemented new PD2P scheme using MoU and close sites + * increased the number of concurrent Mergers + * incrementing Datasets.currentfile only for the first failed job + * fixed Watcher to append attemptNr when sent->activated + * fixed resetDefJob + * limited the number of jobs with the same GEN dis + * fixed EventPicker to take input files into account + * fixed Merger to use .tgz for text merging + * added EventPicker + * added statusmodtime to SiteSpec + * updated Merger for runDir + * updated rebrokerage to take --cloud into account + * added tags into PD2P logging + * updated Merger for mergeScript + * fixed getFilesInUseForAnal to skip NULL dis datasets + * updated analy_brokerage to use memory size + * added cmtconfig to broker logging + * enabled cross-cloud for US in PD2P + * enabled banUser in storeJobs + * enabled role-check in submitJobs + * added WrappedPickle to avoid deserializing insecure objects + * added banUser to storeJob + * added prodSourceLabel check to UserIF + +* 0.0.12 (6/13/2011) + * fixed Merger for --useContElement + * fixed inputFileProject extraction for wildcard-uses + * using basename in Utils methods + * fixed fetchLog to disallow chdir + * fixed panda.py to disallow unexpected methods + * added getVomsAttr + * updated getJob to decompose CERN-XYZ to CERN-PROD+processingType + * updated the brokerage to use installedsw.cmtConfig + * use MoU share for T1 PD2P + * added getNumPilots + * added prodSourceLabel=ssc as user's label + * added --prodSourceLabel to killUser + * fixed archiveJob for failed jobs with multiple dis + * fixed Setupper to store GEN dis + * disabled release check in the brokerage for x86_64-slc5-gcc43 + * implemented aggressive cleaning for PRODDISK + * added 
priority boost for gangarobot + * updated T2 cleanup to use grace_period='00:00:00' + * cleanup copyArchive + * changed analysis brokerage to use nRunning(max in last 24h) + * increased # of active subscriptions to 2 in PD2P + * added nRunning calculator to add.py + * disabled priority reduction for merge jods + * sending analysis brokerage info to logger + * updated PD2P not to check provenance since group datasets have mc*/data* + * disabled PD2P to CERN-PROD_EOSDATADISK + * added checkMergeGenerationStatus + * enforce LFN-lookup to trigger getting replica map when reassigned + * fixed brokerge for test jobs at test sites + * use release matching for T2s in CERN cloud + * skip release check for CERN and ND + * set correct info to brokerageErrorDiag + * send jobs to waiting when release/cache is missing + * remove '' for |pilotOwners| + * put cloud-boundary back to US + * use SourcesPolicy.ALL_SOURCES for PD2P subscriptions + * improved PD2P logger + * included CERN to trigger PD2P + * fixed typo in PD2P skip message + * fixed zero-division in PD2P + * enabled T1-T1 in PD2P + +* 0.0.11 (4/18/2011) + * fixed getExpressJobs + * use c-t-s for all files in merge jobs + * modified runMerger to kill old process + * disable Initializer when nDBConnection is 0 + * increased max attempt for rebrokerage to 5 + * changed the rebrokerage interval to 24h + * skip init for jobDispather,dataService,userIF when nCon=0 + * added parameters in email notification + * ignore LOCALGROUPDISK in PD2P + * fixed auto type detection of Merger for THIST + * use IN2P3-CC_VL for too many input or high prio jobs + * gave T1 weight to IN2P3-CC_VL + * added protection to Adder2 against DQ2 failure for jumbo datasets + * updated Adder2 to avoid making DaTRI request for unmerged files + * added protection against generating multiple Mergers for --individualOutDS + * updated brokerage to give T1 weight to NIKHEF for repro jobs + * fixed Merger for lib.tgz + * added automatic merge type detection to Merger + * updated Closer to redirect logging to parent as it doesn't work in nested threads + * changed parameter convention for Merger + * added merge job generation + * set secondary for TA subscription + * use TAIWAN-LCG2_HOTDISK for TW HOTDISK + * disabled PD2P for ESD + * set file.dispDBlock even if they are already available at the site + * send jobDefID and cloud to the pilot + * updated Setupper/Adder2 for T1 used as T2 + * set destDBlockToken to DATADISK + * using home cloud to skip release check in the brokerage + * reassign stuck T2 evgensimul more frequently + * enabled release/cache check for US + * using nRunning(cloud) in brokerage for multi-cloud + * added fileGUID to updateInFilesReturnPandaIDs for file-level callback + * set source to _subs for all clouds + * using DQ2 API directly in Adder + * added nInputDataFiles,inputFileType,inputFileProject,inputFileBytes + * add hacks again to TA and Setupper for split T1 + * added EventLookup to PD2P + * updated SiteMapper for multi-cloud + * removed hacks from TA and Setupper for split T1 + * added forceOpt to runReBrokerage + * fixed PD2P not to make sub when dataset is being deleted + * changed PD2P not to send ESD to EOS + * added a hint to getPandaIDsForProdDB to enforce function index + * added comment_ to SiteSpec + * put hacks back to TA and Setupper for split T1 which uses NIKHEF as src + * set hidden metadata to _dis and _sub + * removed REGEXP from Datasets cleanup + * enabled rebrokerage for ganga-rbtest + * fixed ReBroker for EOS + * fixed ReBroker to add 
_shadow + * use DATADISK for all PD2P subscriptions + * close user datasets in container + * set lifetime for dis and sub datasets + * added --jobsetID to killUser.py + * added protection against missing argument for jobID/jobsetID to killUser.py + * trigger PD2P for EOS when nUsed >= 3 + * updated brokerage to take transferType into account + * update modificationTime when going to Archived4 + * disabled extra replica making in PD2P + * trigger PD2P for EOS when nUsed >= 2 + * added testG4sim16.py and testEvgen16.py + * use diskThr=max(5%,3TB)-diskSize in PD2P + * added killJobsInTask + * set disk threshold in PD2P to 5GB + * updated PD2P so that any analysis job using data makes subscriptions to CERN EOS + * set specialHandling=rebro when reassigned by rebrokerage + * fixed DQ2 ID conversion in PD2P for EOS + * check free disk size in PD2P using DQ2.queryStorageUsage + * use function index in getPandaIDsForProdDB + * reduced the number of rotated logs + * use cernmx.cern.ch + * added getLockDatasets + * added the number of succeeded jobs to the subject of Notification + * added pd2p logging + * added deleteJobs.py + * split arch procedure to another cron + * call taskbuffer.Initializer in forkSetupper.py to acquire Oracle environment handle correctly + * use truncated DN when setting dataset owner + * reassign evgen/simul with active state at T1 more aggressively + * made SQLDumper iterable + * added SQLDumper + * added reassignTask + * use getFullJobStatus in Notifier since some jobs can go to ARCH before notification + * seprate retry for Notifier + * added retry to Notifier when failing to send notifications + * express jobs + * make new dis datasets even if files are already available at T2 + * short/long mapping for ANALY_LYON-T2 + * updated PD2P to use a negative weight based on the number of subs + * ignore hidden datasets in PD2P + * don't use modTime index on jobs_ARCH + * set/increment nUsed in PD2P + * use LFN for WN-level matchmaking + * ignore datasets with provenance=GP for PD2P + * don't reuse the same site in a single PD2P cycle + * fixed brokerage to send warning when cache is missing + * removed redundant holding for prod jobs in Watcher + * more fix to SetUpper for rc_test + * not reset holding analysis jobs when stateChangeTime=modTime + * set stateChangeTime when job goes to holding for finished/failed + * job chain for rc_test + gangarobot-rctest + * added archivelogs + * set tobeclosed to sub datasets of failed downstream jobs + * rctest -> rc_test + * reduced time interval to reassign waiting jobs to 30min + * enabled user-triggered rebrokerage + * send currentPriority in dispatcher + * set localpool to specialHandling when beyond-pledge pilot got the job + * fixed makeSub in TA for getAva change + * added random sleep for Finisher in copyArchive + * improved del in copyArchive to avoid redundant deletion + * increased timelimit for copyArchive + * added auto rebrokerage to copyArchive + * report new PandaID to taskBufferErrorDiag when rebrokered + * check procesingType in rebrokerage + * added code=8 to killJob for rebrokerage + * first implementation of auto rebrokerage + * added getCachePrefixes + * removed apostrophes from prodUserName + * fixed useNiotifier in Closer for completed sub datasets + * changed queryLastFilesInDataset to use MAX(lfn) + * improved the space shortage message in TA + * don't check missing files with LFC when site is already set + * added -9 to killTask + * added forceKill for prod jobs + * changed the brokerage to use 
CERN-PROD_EOSDATADISK as the dest for CERN-EOS jobs + * added enforce to Activator + * changes for merge/unmerge jobs + * rctest + * deleteStalledJobs + * removed hacks for last_insert_id of InnoDB + * allowOtherCountry + * updated datriHandler to prevent false http-requests + * added a hint to getJobIDsInTimeRange against jobsActive4 + * added a hint to getJobIDsInTimeRange against jobsArchived4 + * changed hint in DBProxy.updateTransferStatus + * changing TRF URL from BNL to CERN on the server side + * fixed error message in brokerage for sites with status!=brokeroff + * fixed brokerage for release check when schedconfig.rel != '' + * changed countryGroup=ustlas to us + * ignore gangarobot family in PD2P + * disabled priority decreasing for HC jobs + * use installedSW for base-release matching for analysis + * $GROUPJOBSN + * added getSerialNumberForGroupJob + * use jobsetID in Notifier + * use max memory/inputsize for each site + * set jobsetID for ptest + * changes for output container and short LFN for analysis + +* 0.0.10 (8/2/2010) + * tagged for output container and short LFN for analysis + * added setCloudTaskByUser + * get list of PD2P clouds dynamically + * send transferType to the pilot + * imposed a size limit on uploaded files by users + * fixed the task brokerage to take maxDiskCount into account + * added a protection againt empty jobParameters only for new jobs + * fixed PD2P to remove the cloud boundary when counting nSites + * disable brokerage for gangarobot + * ignore HC and group jobs in PD2P + * fixed PD2P to take non-PD2P sites into account when checking comp/incomp + * fixed AtlasRelese for PD2P + * enabled WN brokerage for ANALY_GLASGOW + * updated Adder for --destSE=multiSites + * use Data Brokering fr PD2P + * change MWT2_UC_DATADISK to MWT2_DATADISK in PD2P + * delete replicas from T2 when locations != [] + * protection against meta/para=None in peekJob + * kill ITB_INTEGRATION jobs in sent status + * batchID + * ignore dis/sub in PD2P + * dispatchDBlockTokenForOut + * added banUser.py and made --jobID optional in killUser.py + * set activity='Data Consolidation' and acl_alias='secondary' to PD2P subscriptions + * check replica at T1 in PD2P + * added getActiveDatasets + * don't move RAW,HITS,RDO by PD2P + * allow prod proxy to kill anal jobs with 2 or 4 + * added PD2P + * regard found=None as an incomplete replica + * invoke listFileReplicasBySites only for incomplete sites in TA + * fixed re-brokerage + * fixed used file check for cancelled jobs + * increased wait interval for reconnection in connection pool + * updated ConBridge to kill child when connection failure + * changed URL of panda mover trf + * added a protection against method execution failure in panda.py + * set dataset status for DaTRI requests + * ignore DaTRI failure for duplicated requests + * use DQ2 for email extraction + * added -9 to killJob.py + * added killUser.py + * added alias to httpd.conf for trf URL + * changed reading order in getPandIDsWithJobID to avoid missing jobs + * set taskBufferErrorDiag when running jobs are killed + * prevent prod proxy from killing analysis jobs + * added priority massager + * added NG words to Notifier + * avoid sending DaTRI requests for failed jobs + * fixed replica registration for --destSE + * set type in datriHandler for analysis system + * testpanda -> panda + * introduced datriHandler + * delete sub datasets from EGEE T2 when callback is received + * set REMOTE_HOST to creationHost + * increased priority boost for activated jobs + * delete 
cancelled from jobsDefined4 + * added boostPrio.py + * added cvs,svn,grid,librarian to NG words + * True/False for schedconfig.validation + * added admin to NG words for Notifier + * added cancelled state + +* 0.0.9 (4/13/2010) + * increased the subscription limit to 600 in TA + * protection against reassigning analysis jobs + * enabled cache-matching brokerage for all EGEE clouds + * enabled cache-matching brokerage for NL/FR + * added a protection for containers composed of multiple datasets + * added processingType to runBrokerage for HC + * doesn't check release matching for CERN + * cache-matching in the brokerage for DE + * added getHighestPrioJobStat + * changed weight for the task brokerage to use RW instead of fullRW + * fixed getFilesInUseForAnal for --individualOutDS + * added getQueuedAnalJobs + * updated brokerage to assign one prod_test job to a site + * disable prod role for non-group activity + * use maxinputsize in the brokerage + * added schedconfig stuff to template + * removed cx_Oracle from FileSpec + * removed MySQLdb from broker_utils + * added maxinputsize + * modified xyzCacheDB to take a list of siteIDs + * suppressed warning messages in dashboard + +* 0.0.8 (2/2/2010) + * tagging for SLC5 migration + * added hostname matching for T3 pilots + * use listFileReplicasBySites in TA + * added checkFilesWithCacheDB + * changed the default cmtconfig to SL4 for analysis in brokerage + * updated the brokerage to allow slc4 jobs on slc5 sites + * added killTask.py + * added addFilesToCacheDB and flushCacheDB + * modified dispatcher to accept service proxy + * added WN-level file matching to getJob + * added MemProxy + * fixed brokerage to skip release/cache matching for ND + * use all source locations for dis + * use long hint for queryDatasetWithMap + * added /Engage/LBNE/Role=pilot to acceptable roles + * added analy_test to getJob for test pilots + * use poffset regardless of accesscontrol + * removed / from FQAN check in allowedgroups + * limit the max number of files in sub dataset + * use fasttrack only for evgen/simul + * added cleanup in updateSiteData + * added chdir to LFC + * added chdir for dq2 and fork + * removed logging updateJob/getJob from dispatcher + * use averaged updateJob/getJob + * ignore test when summing SiteData + * don't update SiteData when logrotate is running + * randomized the order of sites in updateSiteData to avoid concatenation + * fixed checkSitesWithCache + * multi-threads in adder.py + * count number of updateJob/getJob in add.py + * use taskBuffer in add.py for all DB access + * use fasttrack for all tasktypes and prio>=700 + * use taskBuffer for reassignment in copyArchived + * cleanup old PandaSiteIDs for UK + * set the number of treads to 2 in wsgi daemon + * set MaxRequestsPerChild + * enabled KeepAlive for proxy sites + * check filename FieldStorage when a param is treated as file + * not delete dis datasets when jobs are reassigned + * check useFastCGI before importing flup + * introduced nDBConForFastCGIWSGI + * fixed Setupper to re-register location at next attempt when previous was failed + * changed logLevel in httpd + * added flag to control verbosity of entry point + * added FastCGI stuff + +* 0.0.7 (11/20/2009) + * removed verbose message from DBProxyPool + * more verbose info to DBProxyPool + * fixed ReBrokerage to require the same distribution pattern of input datasets + * set encoded nJobs to taskID for analysis jobs + * fixed ReBrokerage + * propagate bad state from dashboard + * removed threading in dispatcher and 
dataservice + * fixed typo in dashboard access + * fixed CloudTaskSpec for serialization + * close non-DQ2 destinationDBlock in Closer + * use infinite loop in ProxyPool.__init__ + * add random sleep to ConBridge.connect + * use TaskBuffer instead of DBProxy in copyArchive + * added querySQLS to DBProxy + * use ping for wakeUp + * degrade message level of child termination in ConBridge + * added ConBridge for database timeout + * re-implemented rebrokerage to allow the case where build finished + +* 0.0.6 (11/13/2009) + * destinationSE=local + * propage failed_transfer from dashboard + * added activity to subscriptions + * added cleanup for Datasets table + * added workaround for x86_64-slc5-gcc43 + * removed TO_DATE for Datasets.modificationdate + * set priority of buildJob back to 2000 + * renamed testpanda.ddm to pandaddm_ + * added /osg/Role=pilot + * added lower limit for TO_DATE against Datasets table + * added protection in JobDispatch against non-proxy pilots + * added ReBroker + * removed UAT stuff + * use long queue in brokerage in addition + * increased max subjobs in UserIF to 5000 + * send log message from brokerage when disk shortage + * use ANALY_LONG_BNL_ATLAS for UAT + * added temporary priority boost for UAT + * added YY.MM.DD to destinationDBlock of PandaMover + * skipped release check in brokerage when weight is negative + * removed T1 constaint on high prio jobs in brokerage only for i686-slc5-gcc43-opt + * limit matching of cmtconfig=i686-slc5-gcc43-opt to i686-slc5-gcc43-opt jobs only + * changed brokerage to use only T1 for many input jobs when weight is negative + * removed computingElement matching in getJob for test jobs + * use transtimelo for timeout of analysis transfers + * fixed for site->siteid in installedSW + * added protection to _checkRole() + * use cache version matching for analysis + * added 'user' to NG words in Notifier + * take '_' into account in Closer for new naming convention + * use onlyNames in dq2.listDatasets + * changes for destSE + * changed cmtconfig for slc5 to match to slc4 and slc5 + * set pandamover priorities using original job priorities + * added HOTDISK to Setupper + * added PandaMonURL to email notification + * send email notification to site contact in addition to cloud contact + * use schedconfig.DN for privilege check in addition to cloudconfig + * ptest for analy tests + * use SARA-MATRIX for all T1 sources + * more NG words in address finding + * skip VUID lookup for analysis jobs + * added getSlimmedFileInfoPandaIDs + * added a hint for filesTable_ARCH + * limited modificationTime on filesTable_ARCH queries + * allowed the pilot to set status for failed input files + * make subscription for ptest + * use /atlas for auth of updateFileStatusInDisp + * added updateFileStatusInDisp to flag lost files + * removed double counting of jobs in Notifier + * updated template + * changed LogFormat for SLS + * send prodDBlockToken to the pilot + * modified Adder to take DQUnknownDatasetException into account + * make subscriptions for rc_test + * flagged all missing files in Setupper + * added jobType to Client.getJobStatisticsPerSite + * use stage-priority for prestaging + * updated the brokerage to take input size into account + * use cleanUserID in Notifier + * add copysetup to SiteSpec + * fixed getCurrentSiteData for analysis + * use pilotowners for checkRole in dispatcher + * ignore DBRelease when adding shadow + * support getJobStatisticsPerSite(countryGroup=None,workingGroup=None) + * added two more filed to dis datasetname + 
* calculate priority for each workingGroup + * added finder for email address using phonebook + * reverted the change in Setupper + * register location for _sub even when src=dest + * workingGroup/countryGroup in getJobStatisticsPerSite + * added getPandaClientVer + * fixed MailUtils for multiple recipients + * reuse unknown input files when build failed + * use T1 in brokerage when too many inputs are required + * added a timeout to Client + * set sources of dis for all clouds + * use MCTAPE for subscriptions + * added trustIS to runBrokerage + * added longFormat to listSiteAccess + * added set to updateSiteAccess + * verify workingGroup + * send email update/request for site access + * kill old dq2 processes + * addded updateSiteAccess + * workingGroup + * added MailUtils + * prestaging for MCTAPE + * set processingType for mover + * get proxy for each job in getFullJobStatus + * fixed address-check to trigger xwho + * introduced NG words in email-adder finding + * put size limit in putFile + * set higher priority for installation mover + * skip files used by failed/finished jobs in getFilesInUseForAnal + * removed BNL and old bamboo stuff from Client.py + * added a hint to updateInFilesReturnPandaIDs + * added getFilesInUseForAnal + * set sources for ES + * added a hint to getJobIDsInTimeRangeLog + * removed write spaces from md5sum/checksum in peekJobLog + +* 0.0.5 (5/15/2009) + * subtract N*250M from available space in brokerage + * use tasktype2 for RW recalculation + * allow transferring in updateJob + * use job stat per process group in brokerage + * added prodUserName + * added validation to test + * fixed TA + * use prodUserName for users + * added nEvents to JD + * added pilotowners + * added rc_test + * added a hint for Datasets.name + * enabled validatedReleases for all clouds + * set high priority for production role + * added realDatasetsIn + * get empty list of LFNs for empty dataset + * set modificationTime to ARCH tables + * fixed getUserParameter + * added nInputFiles for HC + * added countryGroup for country share + * use a hint for filesTable4.dataset + * fixed lookup for mail addr + * use PandaMover for US + * give higher priorities to /atlas/xyz/Role=production + * set workingGroup when jobs are submitted with prod role + * fixed peekJobLog + * replica location lookup for containers + * fixed broker_util to use proper python + * use jobParamsTable + * fixed python path to use 64bit glite + * fixed for ArchivedDB + * fixed FQAN extraction for GRST_CONN + * dispatchDBlockToken + * converted datetime to str for stateChangeTime + * use 12hr limit in getJobStatisticsForBamboo + * use CERN-PROD_DAQ for prestaging when _DATATAPE is not a location + * ignore token=ATLASDATATAPE when no tape copy + * pandasrv -> pandaserver + * set old=False for listDatasetReplicas + * fixed copyArchived for ArchiveDB + * added _zStr/_nullAttrs in JobSpec + * fixed getJobStatisticsForExtIF() + * fixed for schedID/pilotID + * removed redundant debug message + * fixed for Notification + * input token for mover + * set NULL for creationHost,AtlasRelease,transformation,homepackage + * use sequences directly for PandaID and row_ID + * use SUBCOUNTER_SUBID_SEQ directly + * added a hint to countFilesWithMap + * fixed getNUserJobs + * removed log/cache dirs making + * put alias to filesTable4 in countFilesWithMap + * introduced PANDA_URL_MAP + * suppressed meta in JobSpec + * error handling in Adder + * fixed enddate in Notifier + * use CURRENT_DATE in copyArch + * added nprestage + * added 
startTime/endTime in updateJob + * validatedreleases and accesscontrol + * 3 -> 1hour for movers (discarded) + * added 'IS NULL' to copyArch + * added bulk reading for PandaID to copyArch to avoid redundant lookup + * added a hint to updateOutFilesReturnPandaIDs + * use Null instead of 'NULL' + * don't reset jobParameters when reassigned + * added a hint to all fileTable4+destinationDBlock + * use JOBSARCHIVED4_MODTIME_IDX + * addSiteAccess and listSiteAccess + * hours=1 -> 3 for movers + * retry in peekJob + * reconnection in rollback + * added hint to queryDatasetWithMap + * use bind-variables for all queries + * fixed freezeDS + * fixed a duplicated variable in Closer + * truncate ddmErrorDiag + * hint to freezeDS + * removed deleteFiles in copyArchived + * not update modTime in copyArchived when peekJob failed + * container-aware + * validatedreleases and space check in brokerage + * added deleteJobSimple + * use validatedreleases for FR too + * fixed reassignXYZ + * use archivedFlag for copy/delete + * fine lock for reassignRepro + * threading for reassignRepro + * improved expiration messages + * failed when input dataset is not found in DQ2 + * debug messages in Setupper + * added other error codes in rollback + +* 0.0.4 (2/23/2009) + * GSI authentication for pilots + * tag-based security mechanism for scheduler-pilot-server chain + * fixed test/add.py to use Oracle instead of MySQL + * fixed querySQLS for DELETE + * added panda_server-grid-env.sh + * merged DB proxies to reduce the number of connections + * added lock for worker MPM + * use common write account + +* 0.0.3 (2/16/2009) + * sync to production version + +* 0.0.2 (12/18/2008) + * adjustments for CERN + +* 0.0.1 (12/4/2008) + * first import + + LocalWords: ConBridge diff --git a/current/pandaserver/__init__.py b/current/pandaserver/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/current/pandaserver/brokerage/ErrorCode.py b/current/pandaserver/brokerage/ErrorCode.py new file mode 100644 index 000000000..ea80122e4 --- /dev/null +++ b/current/pandaserver/brokerage/ErrorCode.py @@ -0,0 +1,9 @@ +############## errror code + +# release is not found +EC_Release = 100 + +# voms authentication failure +EC_Voms = 101 + + diff --git a/current/pandaserver/brokerage/LFCclient.py b/current/pandaserver/brokerage/LFCclient.py new file mode 100755 index 000000000..06a857e8e --- /dev/null +++ b/current/pandaserver/brokerage/LFCclient.py @@ -0,0 +1,152 @@ +import re +import os +import sys +import socket +import random + +# error codes +EC_Main = 70 +EC_LFC = 80 + +# import lfc api +try: + import lfc +except: + print "ERROR : could not import lfc" + sys.exit(EC_LFC) + + +# get files from LFC +def _getFilesLFC(files,lfcHost,storages,verbose=False): + # randomly resolve DNS alias + if lfcHost in ['prod-lfc-atlas.cern.ch']: + lfcHost = random.choice(socket.gethostbyname_ex(lfcHost)[2]) + # set LFC HOST + os.environ['LFC_HOST'] = lfcHost + # timeout + os.environ['LFC_CONNTIMEOUT'] = '60' + os.environ['LFC_CONRETRY'] = '2' + os.environ['LFC_CONRETRYINT'] = '6' + # get PFN + iGUID = 0 + nGUID = 1000 + pfnMap = {} + listGUID = [] + for guid in files.keys(): + if verbose: + sys.stdout.write('.') + sys.stdout.flush() + iGUID += 1 + listGUID.append(guid) + if iGUID % nGUID == 0 or iGUID == len(files): + # get replica + ret,resList = lfc.lfc_getreplicas(listGUID,'') + if ret == 0: + for fr in resList: + if fr != None and ((not hasattr(fr,'errcode')) or \ + (hasattr(fr,'errcode') and fr.errcode == 0)): + # get host + match = 
re.search('^[^:]+://([^:/]+):*\d*/',fr.sfn) + if match==None: + continue + # check host + host = match.group(1) + if storages != [] and (not host in storages): + continue + # append + if not pfnMap.has_key(fr.guid): + pfnMap[fr.guid] = [] + pfnMap[fr.guid].append(fr.sfn) + else: + print "ERROR : %s" % lfc.sstrerror(lfc.cvar.serrno) + sys.exit(EC_LFC) + # reset + listGUID = [] + # collect LFNs + retLFNs = {} + for guid,lfn in files.iteritems(): + if guid in pfnMap.keys(): + retLFNs[lfn] = pfnMap[guid] + # return + return retLFNs + + + +#################################################################### +# main +def main(): + import sys + import getopt + # option class + class _options: + def __init__(self): + pass + options = _options() + del _options + # set default values + options.verbose = False + options.lfns = [] + options.guids = [] + options.lfchost = '' + options.storages = [] + options.infile = None + options.outfile = None + # get command-line parameters + try: + opts, args = getopt.getopt(sys.argv[1:],"s:i:g:vl:o:f:") + except: + _usage() + print "ERROR : Invalid options" + sys.exit(EC_Main) + # set options + for o, a in opts: + if o in ("-v",): + options.verbose = True + if o in ("-s",): + options.storages = a.split(',') + if o in ("-i",): + options.lfns = a.split(',') + if o in ("-g",): + options.guids = a.split(',') + if o in ("-l",): + options.lfchost = a + if o in ("-f",): + options.infile = a + if o in ("-o",): + options.outfile = a + # read GUID/LFN + files = {} + if options.infile == None: + for idx in range(len(options.guids)): + guid = options.guids[idx] + lfn = options.lfns[idx] + if guid != 'NULL': + files[guid] = lfn + else: + try: + # read from file + ifile = open(options.infile) + for line in ifile: + items = line.split() + if len(items) == 2: + guid = items[1] + lfn = items[0] + if guid != 'NULL': + files[guid] = lfn + # close and delete + ifile.close() + os.remove(options.infile) + except: + errType,errValue = sys.exc_info()[:2] + print "ERROR: %s:%s" % (errType,errValue) + sys.exit(1) + # get files + retFiles = _getFilesLFC(files,options.lfchost,options.storages,options.verbose) + print "LFCRet : %s " % retFiles + # return + sys.exit(0) + + +if __name__ == "__main__": + main() + diff --git a/current/pandaserver/brokerage/PandaSiteIDs.py b/current/pandaserver/brokerage/PandaSiteIDs.py new file mode 100644 index 000000000..5819cc4c0 --- /dev/null +++ b/current/pandaserver/brokerage/PandaSiteIDs.py @@ -0,0 +1,198 @@ +# !!!!!!! This file is OBSOLETE. Its content has been absorbed into pilotController.py in the autopilot repository. +# !!!!!!! Questions to Torre Wenaus. 
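+# (Editor's sketch, not part of the original patch.) The PandaSiteIDs table below maps
+# each Panda siteID to its scheduler nickname and a pilot-submission status flag. The
+# hypothetical helper here only illustrates how such a table is typically consumed;
+# elsewhere in this patch (e.g. SiteMapper) any status other than the exact string 'OK'
+# marks the entry as a test/unusable site.
+def _usableSiteIDs(siteMap=None):
+    # fall back to the module-level table defined below (looked up at call time)
+    if siteMap is None:
+        siteMap = PandaSiteIDs
+    # keep only siteIDs whose pilot-submission status is exactly 'OK'
+    return sorted(siteID for siteID, spec in siteMap.items() if spec.get('status') == 'OK')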
+PandaSiteIDs = { + 'AGLT2' : {'nickname':'AGLT2-condor','status':'OK'}, + 'ALBERTA-LCG2' : {'nickname':'ALBERTA-LCG2-lcgce01-atlas-lcgpbs','status':'OK'}, + 'ANALY_AGLT2' : {'nickname':'ANALY_AGLT2-condor','status':'OK'}, + 'ANALY_ALBERTA' : {'nickname':'ALBERTA-LCG2-lcgce01-atlas-lcgpbs','status':'OK'}, + 'ANALY_BEIJING' : {'nickname':'BEIJING-LCG2-lcg002-atlas-lcgpbs','status':'OK'}, + 'ANALY_BNL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, + 'ANALY_BNL_ATLAS_1' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, + 'ANALY_BNL_ATLAS_2' : {'nickname':'BNL_ATLAS_2-condor','status':'OK'}, + #'ANALY_BNL_LOCAL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, + 'ANALY_BNL_test' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, + 'ANALY_BNL_test2' : {'nickname':'ANALY_BNL_ATLAS_1-condor','status':'OK'}, + 'ANALY_BNL_test3' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, + 'ANALY_BRUNEL' : {'nickname':'UKI-LT2-Brunel-dgc-grid-44-atlas-lcgpbs','status':'notOK'}, + 'ANALY_CERN' : {'nickname':'CERN-PROD-ce123-grid_atlas-lcglsf','status':'notOK'}, + 'ANALY_CNAF' : {'nickname':'INFN-CNAF-gridit-ce-001-lcg-lcgpbs','status':'notOK'}, + 'ANALY_CPPM' : {'nickname':'IN2P3-CPPM-marce01-atlas-pbs','status':'OK'}, + 'ANALY_FZK' : {'nickname':'FZK-LCG2-ce-5-fzk-atlasXS-pbspro','status':'OK'}, + 'ANALY_GLASGOW' : {'nickname':'UKI-SCOTGRID-GLASGOW-svr021-q3d-lcgpbs','status':'OK'}, + 'ANALY_GLOW-ATLAS' : {'nickname':'GLOW-ATLAS-condor','status':'OK'}, + 'ANALY_GRIF-IRFU' : {'nickname':'GRIF-IRFU-node07-atlas-lcgpbs','status':'OK'}, + 'ANALY_GRIF-LAL' : {'nickname':'GRIF-LAL-grid10-atlasana-pbs','status':'notOK'}, + 'ANALY_GRIF-LPNHE' : {'nickname':'GRIF-LPNHE-lpnce-atlas-pbs','status':'notOK'}, + 'ANALY_HU_ATLAS_Tier2' : {'nickname':'ANALY_HU_ATLAS_Tier2-lsf','status':'OK'}, + 'ANALY_LANCS' : {'nickname':'UKI-NORTHGRID-LANCS-HEP-fal-pygrid-18-atlas-lcgpbs','status':'notOK'}, + 'ANALY_LAPP' : {'nickname':'IN2P3-LAPP-lapp-ce01-atlas-pbs','status':'notOK'}, + 'ANALY_LIV' : {'nickname':'UKI-NORTHGRID-LIV-HEP-hepgrid2-atlas-lcgpbs','status':'notOK'}, + 'ANALY_LONG_BNL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, + 'ANALY_LONG_BNL_ATLAS' : {'nickname':'BNL_ATLAS_2-condor','status':'OK'}, + 'ANALY_LONG_BNL_LOCAL' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, + 'ANALY_LONG_LYON' : {'nickname':'IN2P3-CC-T2-cclcgceli05-long-bqs','status':'OK'}, + 'ANALY_LPC' : {'nickname':'IN2P3-LPC-clrlcgce03-atlas-lcgpbs','status':'notOK'}, + 'ANALY_LPSC' : {'nickname':'IN2P3-LPSC-lpsc-ce-atlas-pbs','status':'OK'}, + 'ANALY_LYON' : {'nickname':'IN2P3-CC-T2-cclcgceli05-medium-bqs','status':'OK'}, + 'ANALY_MANC' : {'nickname':'UKI-NORTHGRID-MAN-HEP-ce01-atlas-lcgpbs','status':'OK'}, + 'ANALY_MCGILL' : {'nickname':'MCGILL-LCG2-atlas-ce-atlas-lcgpbs','status':'OK'}, + 'ANALY_MWT2' : {'nickname':'ANALY_MWT2-condor','status':'notOK'}, + 'ANALY_MWT2_SHORT' : {'nickname':'ANALY_MWT2_SHORT-pbs','status':'notOK'}, + 'ANALY_NET2' : {'nickname':'ANALY_NET2-pbs','status':'OK'}, + 'ANALY_OU_OCHEP_SWT2' : {'nickname':'ANALY_OU_OCHEP_SWT2-condor','status':'notOK'}, + 'ANALY_PIC' : {'nickname':'pic-ce07-gshort-lcgpbs','status':'OK'}, + 'ANALY_RAL' : {'nickname':'RAL-LCG2-lcgce01-atlasL-lcgpbs','status':'OK'}, + 'ANALY_ROMANIA02' : {'nickname':'RO-02-NIPNE-tbat01-atlas-lcgpbs','status':'notOK'}, + 'ANALY_ROMANIA07' : {'nickname':'RO-07-NIPNE-tbit01-atlas-lcgpbs','status':'notOK'}, + 'ANALY_SARA' : {'nickname':'SARA-MATRIX-mu6-short-pbs','status':'notOK'}, + 'ANALY_SFU' : 
{'nickname':'SFU-LCG2-snowpatch-hep-atlas-lcgpbs','status':'notOK'}, + 'ANALY_SHEF' : {'nickname':'UKI-NORTHGRID-SHEF-HEP-lcgce0-atlas-lcgpbs','status':'OK'}, + 'ANALY_SLAC' : {'nickname':'ANALY_SLAC-lsf','status':'OK'}, + 'ANALY_SWT2_CPB' : {'nickname':'ANALY_SWT2_CPB-pbs','status':'OK'}, + 'ANALY_TAIWAN' : {'nickname':'Taiwan-LCG2-w-ce01-atlas-lcgpbs','status':'OK'}, + 'ANALY_TEST' : {'nickname':'ANALY_TEST','status':'OK'}, + 'ANALY_TORONTO' : {'nickname':'TORONTO-LCG2-bigmac-lcg-ce2-atlas-pbs','status':'OK'}, + 'ANALY_TOKYO' : {'nickname':'TOKYO-LCG2-lcg-ce01-atlas-lcgpbs','status':'OK'}, + 'ANALY_TRIUMF' : {'nickname':'TRIUMF-LCG2-ce1-atlas-lcgpbs','status':'OK'}, + 'ANALY_UBC' : {'nickname':'UBC-pbs','status':'OK'}, + 'ANALY_UIUC-HEP' : {'nickname':'ANALY_UIUC-HEP-condor','status':'OK'}, + 'ANALY_UTA' : {'nickname':'UTA-DPCC-pbs','status':'OK'}, + 'ANALY_UTA-DPCC' : {'nickname':'UTA-DPCC-test-pbs','status':'notOK'}, + 'ANALY_VICTORIA' : {'nickname':'VICTORIA-LCG2-lcg-ce-general-lcgpbs','status':'OK'}, + 'AUVERGRID' : {'nickname':'AUVERGRID-iut15auvergridce01-atlas-lcgpbs','status':'notOK'}, + 'ASGC' : {'nickname':'Taiwan-LCG2-w-ce01-atlas-lcgpbs','status':'OK'}, + 'ASGC_REPRO' : {'nickname':'ASGC_REPRO','status':'notOK'}, + 'Australia-ATLAS' : {'nickname':'Australia-ATLAS-agh2-atlas-lcgpbs','status':'OK'}, + 'BARNETT_TEST' : {'nickname':'BARNETT_TEST','status':'notOK'}, + 'BEIJING' : {'nickname':'BEIJING-LCG2-lcg002-atlas-lcgpbs','status':'OK'}, + 'BNLPROD' : {'nickname':'BNL_ATLAS_1-condor','status':'notOK'}, + 'BNL_ATLAS_1' : {'nickname':'BNL_ATLAS_1-condor','status':'OK'}, + 'BNL_ATLAS_2' : {'nickname':'BNL_ATLAS_2-condor','status':'OK'}, + 'BNL_ATLAS_DDM' : {'nickname':'BNL_DDM-condor','status':'notOK'}, + 'BNL_ATLAS_test' : {'nickname':'BNL_ATLAS_2-condor','status':'notOK'}, + 'BU_ATLAS_Tier2' : {'nickname':'BU_ATLAS_Tier2-pbs','status':'OK'}, + 'BU_ATLAS_Tier2o' : {'nickname':'BU_ATLAS_Tier2o-pbs','status':'OK'}, + 'BU_ATLAS_test' : {'nickname':'BU_ATLAS_Tier2-pbs','status':'NOTOK'}, + 'HU_ATLAS_Tier2' : {'nickname':'HU_ATLAS_Tier2-lsf','status':'OK'}, + 'CERN-BUILDS' : {'nickname':'CERN-BUILDS','status':'notOK'}, + 'CERN-RELEASE' : {'nickname':'CERN-RELEASE','status':'notOK'}, + 'CERN-UNVALID' : {'nickname':'CERN-UNVALID','status':'notOK'}, + 'CGG' : {'nickname':'CGG-LCG2-ce1-atlas-lcgpbs','status':'notOK'}, + 'CHARMM' : {'nickname':'CHARMM','status':'notOK'}, + 'CNR-ILC-PISA' : {'nickname':'CNR-ILC-PISA-gridce-atlas-lcgpbs','status':'notOK'}, + 'CPPM' : {'nickname':'IN2P3-CPPM-marce01-atlas-pbs','status':'OK'}, + 'CSCS-LCG2' : {'nickname':'CSCS-LCG2-ce01-egee48h-lcgpbs','status':'OK'}, + 'csTCDie' : {'nickname':'csTCDie-gridgate-himem-pbs','status':'OK'}, + 'CYF' : {'nickname':'CYFRONET-LCG2-ce-atlas-pbs','status':'OK'}, + 'DESY-HH' : {'nickname':'DESY-HH-grid-ce3-default-lcgpbs','status':'OK'}, + 'DESY-ZN' : {'nickname':'DESY-ZN-lcg-ce0-atlas-lcgpbs','status':'OK'}, + 'EFDA-JET' : {'nickname':'EFDA-JET-grid002-atlas-lcgpbs','status':'notok'}, + 'FZK-LCG2' : {'nickname':'FZK-LCG2-ce-1-fzk-atlasXL-pbspro','status':'OK'}, + 'FZK_REPRO' : {'nickname':'FZK_REPRO','status':'notOK'}, + 'FZU' : {'nickname':'praguelcg2-golias25-lcgatlas-lcgpbs','status':'OK'}, + 'GLOW' : {'nickname':'GLOW-CMS-cmsgrid02-atlas-condor','status':'notOK'}, + 'GLOW-ATLAS' : {'nickname':'GLOW-ATLAS-condor','status':'OK'}, + 'GoeGrid' : {'nickname':'GoeGrid-ce-goegrid-atlas-lcgpbs','status':'OK'}, + 'GRIF-IRFU' : {'nickname':'GRIF-IRFU-node07-atlas-lcgpbs','status':'OK'}, + 'GRIF-LAL' : 
{'nickname':'GRIF-LAL-grid10-atlas-pbs','status':'OK'}, + 'GRIF-LPNHE' : {'nickname':'GRIF-LPNHE-lpnce-atlas-pbs','status':'OK'}, + 'HEPHY-UIBK' : {'nickname':'HEPHY-UIBK-hepx4-atlas-lcgpbs','status':'OK'}, + 'IFAE' : {'nickname':'ifae-ifaece01-ifae-lcgpbs','status':'OK'}, + 'IFIC' : {'nickname':'IFIC-LCG2-ce01-atlas-pbs','status':'OK'}, + 'IHEP' : {'nickname':'BEIJING-LCG2-lcg002-atlas-lcgpbs','status':'OK'}, + 'ITEP' : {'nickname':'ITEP-ceglite-atlas-lcgpbs','status':'OK'}, + 'IN2P3-LPSC' : {'nickname':'IN2P3-LPSC-lpsc-ce-atlas-pbs','status':'OK'}, + 'JINR-LCG2' : {'nickname':'JINR-LCG2-lcgce01-atlas-lcgpbs', 'status':'OK'}, + 'LAPP' : {'nickname':'IN2P3-LAPP-lapp-ce01-atlas-pbs','status':'OK'}, + 'LIP-COIMBRA' : {'nickname':'LIP-Coimbra-grid006-atlas-lcgpbs','status':'OK'}, + 'LIP-LISBON' : {'nickname':'LIP-Lisbon-ce02-atlasgrid-lcgsge','status':'OK'}, + 'LLR' : {'nickname':'GRIF-LLR-polgrid1-atlas-pbs','status':'notOK'}, + 'LPC' : {'nickname':'IN2P3-LPC-clrlcgce03-atlas-lcgpbs','status':'OK'}, + 'LRZ' : {'nickname':'LRZ-LMU-lcg-lrz-ce-atlas-sge','status':'OK'}, + 'LYON' : {'nickname':'IN2P3-CC-cclcgceli02-long-bqs','status':'OK'}, + 'LYON_REPRO' : {'nickname':'LYON_REPRO','status':'notOK'}, + 'Lyon-T2' : {'nickname':'IN2P3-CC-T2-cclcgceli05-long-bqs','status':'OK'}, + 'LTU_CCT' : {'nickname':'LTU_CCT-pbs','status':'OK'}, + 'MANC' : {'nickname':'UKI-NORTHGRID-MAN-HEP-ce02-atlas-lcgpbs','status':'OK'}, + 'MCGILL-LCG2' : {'nickname':'MCGILL-LCG2-atlas-ce-atlas-pbs','status':'OK'}, + 'MONTREAL' : {'nickname':'Umontreal-LCG2-lcg-ce-atlas-lcgpbs','status':'notOK'}, + 'MPP' : {'nickname':'MPPMU-grid-ce-long-sge','status':'OK'}, + 'MWT2_IU' : {'nickname':'MWT2_IU-pbs','status':'OK'}, + 'MWT2_UC' : {'nickname':'MWT2_UC-pbs','status':'OK'}, + 'NDGF' : {'nickname':'NDGF-condor','status':'OK'}, + 'NIKHEF-ELPROD' : {'nickname':'NIKHEF-ELPROD-gazon-atlas-pbs','status':'OK'}, + 'NIKHEF_REPRO' : {'nickname':'NIKHEF_REPRO','status':'notOK'}, + 'OUHEP_ITB' : {'nickname':'OUHEP_ITB-condor','status':'notOK'}, + 'OU_PAUL_TEST' : {'nickname':'OU_OCHEP_SWT2-condor','status':'notOK'}, + 'OU_OCHEP_SWT2' : {'nickname':'OU_OCHEP_SWT2-condor','status':'OK'}, + 'OU_OSCER_ATLAS' : {'nickname':'OU_OSCER_ATLAS-lsf','status':'OK'}, + 'OU_OSCER_ATLASdeb' : {'nickname':'OU_OSCER_ATLASdeb-lsf','status':'notOK'}, + 'PSNC' : {'nickname':'PSNC-ce-atlas-pbs','status':'OK'}, + 'PIC' : {'nickname':'pic-ce05-glong-lcgpbs','status':'OK'}, + 'PIC_REPRO' : {'nickname':'PIC_REPRO','status':'notOK'}, + 'prague_cesnet_lcg2' : {'nickname':'prague_cesnet_lcg2-skurut17-egee_atlas-lcgpbs','status':'notOK'}, + 'RAL' : {'nickname':'RAL-LCG2-lcgce02-grid1000M-lcgpbs','status':'OK'}, + 'RAL_REPRO' : {'nickname':'RAL_REPRO','status':'notOK'}, + 'ru-Moscow-SINP-LCG2' : {'nickname':'ru-Moscow-SINP-LCG2-lcg02-atlas-lcgpbs','status':'OK'}, + 'ru-PNPI' : {'nickname':'ru-PNPI-cluster-atlas-pbs','status':'OK'}, + 'RDIGTEST' : {'nickname':'RDIGTEST','status':'notOK'}, + 'ROMANIA02' : {'nickname':'RO-02-NIPNE-tbat01-atlas-lcgpbs','status':'OK'}, + 'ROMANIA07' : {'nickname':'RO-07-NIPNE-tbit01-atlas-lcgpbs','status':'OK'}, + 'RRC-KI' : {'nickname':'RRC-KI-gate-atlas-lcgpbs','status':'OK'}, + 'RU-Protvino-IHEP' : {'nickname':'RU-Protvino-IHEP-ce0003-atlas-lcgpbs','status':'OK'}, + 'SARA_REPRO' : {'nickname':'SARA_REPRO','status':'notOK'}, + 'SFU-LCG2' : {'nickname':'SFU-LCG2-snowpatch-atlas-lcgpbs','status':'OK'}, + 'SLACXRD' : {'nickname':'SLACXRD-lsf','status':'OK'}, + 'SLAC_PAUL_TEST' : {'nickname':'SLACXRD-lsf','status':'notOK'}, + 'SNS-PISA' : 
{'nickname':'SNS-PISA-gridce-atlas-lcgpbs','status':'notOK'}, + 'SPACI-CS-IA64' : {'nickname':'SPACI-CS-IA64-square-atlas-lsf','status':'notOK'}, + 'SWT2_CPB' : {'nickname':'SWT2_CPB-pbs','status':'OK'}, + 'Taiwan-IPAS-LCG2' : {'nickname':'Taiwan-IPAS-LCG2-atlasce-atlas-lcgcondor','status':'notOK'}, + 'TEST1' : {'nickname':'TEST1','status':'notOK'}, + 'TEST2' : {'nickname':'TEST2','status':'notOK'}, + 'TEST3' : {'nickname':'TEST3','status':'notOK'}, + 'TEST4' : {'nickname':'TEST4','status':'notOK'}, + 'TESTCHARMM' : {'nickname':'TESTCHARMM','status':'notOK'}, + 'TESTGLIDE' : {'nickname':'TESTGLIDE','status':'notOK'}, + 'TOKYO' : {'nickname':'TOKYO-LCG2-lcg-ce01-atlas-lcgpbs','status':'OK'}, + 'TORONTO-LCG2' : {'nickname':'TORONTO-LCG2-bigmac-lcg-ce2-atlas-pbs','status':'OK'}, + 'TPATHENA' : {'nickname':'TPATHENA','status':'notOK'}, + 'TPPROD' : {'nickname':'TPPROD','status':'notOK'}, + 'TRIUMF' : {'nickname':'TRIUMF-LCG2-ce1-atlas-lcgpbs','status':'OK'}, + 'TRIUMF_DDM' : {'nickname':'TRIUMF_DDM','status':'notOK'}, + 'TRIUMF_REPRO' : {'nickname':'TRIUMF_REPRO','status':'notOK'}, + 'TW-FTT' : {'nickname':'TW-FTT-f-ce01-atlas-lcgpbs','status':'OK'}, + 'TWTEST' : {'nickname':'TWTEST','status':'notOK'}, + 'TestPilot' : {'nickname':'TestPilot','status':'notOK'}, + 'UAM-LCG2' : {'nickname':'UAM-LCG2-grid003-atlas-lcgpbs','status':'OK'}, + 'UBC' : {'nickname':'UBC-pbs','status':'OK'}, + 'UBC_PAUL_TEST' : {'nickname':'UBC-pbs','status':'notOK'}, + 'UIUC-HEP' : {'nickname':'UIUC-HEP-condor','status':'OK'}, + 'UCITB_EDGE7' : {'nickname':'UCITB_EDGE7-pbs','status':'OK'}, + 'UC_ATLAS_MWT2' : {'nickname':'UC_ATLAS_MWT2-condor','status':'OK'}, + 'UC_ATLAS_test' : {'nickname':'UC_ATLAS_MWT2-condor','status':'OK'}, + 'UC_Teraport' : {'nickname':'UC_Teraport-pbs','status':'notOK'}, + 'UMESHTEST' : {'nickname':'UMESHTEST','status':'notOK'}, + 'UNI-FREIBURG' : {'nickname':'UNI-FREIBURG-ce-atlas-pbs','status':'OK'}, + 'UTA-DPCC' : {'nickname':'UTA-DPCC-pbs','status':'OK'}, + 'UTA-DPCC-test' : {'nickname':'UTA-DPCC-test-pbs','status':'OK'}, + 'UTA_PAUL_TEST' : {'nickname':'UTA-SWT2-pbs','status':'notOK'}, + 'UTA_SWT2' : {'nickname':'UTA-SWT2-pbs','status':'OK'}, + 'UTD-HEP' : {'nickname':'UTD-HEP-pbs','status':'OK'}, + 'VICTORIA-LCG2' : {'nickname':'VICTORIA-LCG2-lcg-ce-general-lcgpbs','status':'OK'}, + 'Wuppertal' : {'nickname':'wuppertalprod-grid-ce-dg_long-lcgpbs','status':'OK'}, +} + + +# cloud-MoverID mapping +PandaMoverIDs = { + 'US' : 'BNL_ATLAS_DDM', + 'CA' : 'TRIUMF_DDM', + 'FR' : 'TRIUMF_DDM', + 'IT' : 'TRIUMF_DDM', + 'NL' : 'TRIUMF_DDM', + 'DE' : 'TRIUMF_DDM', + 'TW' : 'TRIUMF_DDM', + 'UK' : 'TRIUMF_DDM', + 'ES' : 'TRIUMF_DDM', + } diff --git a/current/pandaserver/brokerage/SiteMapper.py b/current/pandaserver/brokerage/SiteMapper.py new file mode 100644 index 000000000..a0ad2c0a6 --- /dev/null +++ b/current/pandaserver/brokerage/SiteMapper.py @@ -0,0 +1,205 @@ +import re +import sys + +# logger +from pandalogger.PandaLogger import PandaLogger +_logger = PandaLogger().getLogger('SiteMapper') + +# PandaIDs +from PandaSiteIDs import PandaSiteIDs + +# default site +from taskbuffer.SiteSpec import SiteSpec +defSite = SiteSpec() +defSite.sitename = 'BNL_ATLAS_1' +defSite.nickname = 'BNL_ATLAS_1-condor' +defSite.dq2url = 'http://dms02.usatlas.bnl.gov:8000/dq2/' +defSite.ddm = 'PANDA_UNDEFINED' +defSite.type = 'production' +defSite.gatekeeper = 'gridgk01.racf.bnl.gov' +defSite.status = 'online' +defSite.setokens = {} + + +######################################################################## + +class 
SiteMapper: + + # constructor + def __init__(self,taskBuffer,verbose=False): + _logger.debug('__init__ SiteMapper') + try: + # site list + self.siteSpecList = {} + + # sites not belonging to a cloud + self.defCloudSites = [] + + # cloud specification + self.cloudSpec = {} + + # create CloudSpec list + tmpCloudListDB = taskBuffer.getCloudList() + for tmpName,tmpCloudSpec in tmpCloudListDB.iteritems(): + self.cloudSpec[tmpName] = {} + # copy attributes from CloudSepc + for tmpAttr in tmpCloudSpec._attributes: + self.cloudSpec[tmpName][tmpAttr] = getattr(tmpCloudSpec,tmpAttr) + # append additional attributes + # source : Panda siteID for source + # dest : Panda siteID for dest + # sites : Panda siteIDs in the cloud + self.cloudSpec[tmpName]['source'] = self.cloudSpec[tmpName]['tier1'] + self.cloudSpec[tmpName]['dest'] = self.cloudSpec[tmpName]['tier1'] + self.cloudSpec[tmpName]['sites'] = [] + _logger.debug('Cloud->%s %s' % (tmpName,str(self.cloudSpec[tmpName]))) + # get list of PandaIDs + siteIDsList = taskBuffer.getSiteList() + firstDefault = True + # read full list from DB + siteFullList = taskBuffer.getSiteInfo() + # read DB to produce paramters in siteinfo dynamically + for tmpID,tmpNicknameList in siteIDsList.iteritems(): + for tmpNickname in tmpNicknameList: + # invalid nickname + if not siteFullList.has_key(tmpNickname): + continue + # get full spec + ret = siteFullList[tmpNickname] + # append + if ret == None: + _logger.error('Could not read site info for %s:%s' % (tmpID,tmpNickname)) + elif (firstDefault and tmpID == defSite.sitename) or (not self.siteSpecList.has_key(tmpID)) \ + or (self.siteSpecList.has_key(tmpID) and self.siteSpecList[tmpID].status in ['offline','']): + # overwrite default or remove existing offline + if firstDefault and tmpID == defSite.sitename: + del self.siteSpecList[tmpID] + firstDefault = False + elif self.siteSpecList.has_key(tmpID) and self.siteSpecList[tmpID].status in ['offline','']: + del self.siteSpecList[tmpID] + # append + if not self.siteSpecList.has_key(tmpID): + # determine type following a convention + tmpType = 'production' + if tmpID.startswith('ANALY_'): + tmpType = 'analysis' + elif re.search('test',tmpID,re.I) or \ + (PandaSiteIDs.has_key(tmpID) and PandaSiteIDs[tmpID]['status']!='OK'): + tmpType = 'test' + # set type + ret.sitename = tmpID + ret.type = tmpType + # don't use site for production when cloud is undefined + if ret.type == 'production' and ret.cloud == '': + _logger.error('Empty cloud for %s:%s' % (tmpID,tmpNickname)) + else: + self.siteSpecList[tmpID] = ret + else: + # overwrite status + if not ret.status in ['offline','']: + if self.siteSpecList[tmpID].status != 'online': + self.siteSpecList[tmpID].status = ret.status + # use larger maxinputsize and memory + try: + if ret.status in ['online']: + if self.siteSpecList[tmpID].maxinputsize < ret.maxinputsize or \ + ret.maxinputsize == 0: + self.siteSpecList[tmpID].maxinputsize = ret.maxinputsize + if (self.siteSpecList[tmpID].memory != 0 and self.siteSpecList[tmpID].memory < ret.memory) or \ + ret.memory == 0: + self.siteSpecList[tmpID].memory = ret.memory + except: + errtype, errvalue = sys.exc_info()[:2] + _logger.error("%s memory/inputsize failuer : %s %s" % (tmpID,errtype,errvalue)) + # make cloudSpec + for siteSpec in self.siteSpecList.values(): + # choose only prod sites + if siteSpec.type != 'production': + continue + # append prod site in cloud + for tmpCloud in siteSpec.cloudlist: + if self.cloudSpec.has_key(tmpCloud): + if not siteSpec.sitename in 
self.cloudSpec[tmpCloud]['sites']: + # append + self.cloudSpec[tmpCloud]['sites'].append(siteSpec.sitename) + else: + # append to the default cloud + if not siteSpec.sitename in self.defCloudSites: + # append + self.defCloudSites.append(siteSpec.sitename) + # set defCloudSites for backward compatibility + if self.cloudSpec.has_key('US'): + # use US sites + self.defCloudSites = self.cloudSpec['US']['sites'] + else: + # add def site as a protection if defCloudSites is empty + self.defCloudSites.append(defSite.sitename) + # dump sites + if verbose: + _logger.debug('========= dump =========') + for tmpSite,tmpSiteSpec in self.siteSpecList.iteritems(): + _logger.debug('Site->%s' % str(tmpSiteSpec)) + # check + for tmpCloud,tmpVals in self.cloudSpec.iteritems(): + # set T1 + try: + tmpVals['sites'].remove(tmpVals['dest']) + except: + pass + tmpVals['sites'].insert(0,tmpVals['dest']) + # dump + _logger.debug('Cloud:%s has %s' % (tmpCloud,tmpVals['sites'])) + for tmpSite in tmpVals['sites']: + if not self.siteSpecList.has_key(tmpSite): + _logger.debug(" '%s' doesn't exist" % tmpSite) + continue + tmpSiteSpec = self.siteSpecList[tmpSite] + if tmpSiteSpec.status in ['offline']: + _logger.debug(' %s:%s' % (tmpSite,tmpSiteSpec.status)) + _logger.debug('Cloud:XX has %s' % self.defCloudSites) + except: + type, value, traceBack = sys.exc_info() + _logger.error("__init__ SiteMapper : %s %s" % (type,value)) + _logger.debug('__init__ SiteMapper done') + + + # accessor for site + def getSite(self,site): + if self.siteSpecList.has_key(site): + return self.siteSpecList[site] + else: + # return default site + return defSite + + + # check if site exists + def checkSite(self,site): + return self.siteSpecList.has_key(site) + + + # accessor for cloud + def getCloud(self,cloud): + if self.cloudSpec.has_key(cloud): + return self.cloudSpec[cloud] + else: + # return sites in default cloud + ret = { 'source' : 'default', + 'dest' : 'default', + 'sites' : self.defCloudSites, + 'transtimelo' : 2, + 'transtimehi' : 1, + } + return ret + + + # check if cloud exists + def checkCloud(self,cloud): + if self.cloudSpec.has_key(cloud): + return True + else: + return False + + + # accessor for cloud list + def getCloudList(self): + return self.cloudSpec.keys() diff --git a/current/pandaserver/brokerage/VomsResolver.py b/current/pandaserver/brokerage/VomsResolver.py new file mode 100644 index 000000000..7bc432002 --- /dev/null +++ b/current/pandaserver/brokerage/VomsResolver.py @@ -0,0 +1,56 @@ +import re +import sys + +# logger +from pandalogger.PandaLogger import PandaLogger +_logger = PandaLogger().getLogger('VomsResolver') + + +######################################################################## + +class VomsResolver: + + # constructor + def __init__(self): + self.vomsUserMap = {} + try: + # read grid-mapfile + mapFile = open('/home/sm/grid-mapfile') + vo = None + for line in mapFile: + if line.startswith("#----"): + # get vo name + vo = line.split()[-2] + _logger.debug('get VO:%s' % vo) + self.vomsUserMap[vo] = [] + else: + # get DN + match = re.search('^"([^"]+)"',line) + if match != None: + # append + self.vomsUserMap[vo].append(match.group(1)) + # close grid-mapfile + mapFile.close() + except: + type, value, traceBack = sys.exc_info() + _logger.error("init : %s %s" % (type,value)) + + + # check the user is on VO + def checkUser(self,voms,dn): + _logger.debug('checkUser VO:%s DN:%s' % (voms,dn)) + if not self.vomsUserMap.has_key(voms): + _logger.debug(' NG - VO:%s is unsupported' % voms) + return False + # look for DN + for tmpDN 
in self.vomsUserMap[voms]: + if dn.startswith(tmpDN): + _logger.debug(' OK - DN:%s' % dn) + return True + _logger.debug(' NG - DN:%s is not found' % dn) + return False + + + # check voms is supported + def checkVoms(self,voms): + return self.vomsUserMap.has_key(voms) diff --git a/current/pandaserver/brokerage/__init__.py b/current/pandaserver/brokerage/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/current/pandaserver/brokerage/broker.py b/current/pandaserver/brokerage/broker.py new file mode 100755 index 000000000..0cfc98dee --- /dev/null +++ b/current/pandaserver/brokerage/broker.py @@ -0,0 +1,1684 @@ +import re +import sys +import time +import types +import fcntl +import random +import datetime +import commands +import ErrorCode +import broker_util +import PandaSiteIDs +from taskbuffer import ProcessGroups +from dataservice import DataServiceUtils +from config import panda_config + +from pandalogger.PandaLogger import PandaLogger +_log = PandaLogger().getLogger('broker') + +# all known sites +_allSites = PandaSiteIDs.PandaSiteIDs.keys() + +# sites for prestaging +#prestageSites = ['BNL_ATLAS_test','BNL_ATLAS_1','BNL_ATLAS_2'] + +# non LRC checking +_disableLRCcheck = [] + +# lock for uuidgen +_lockGetUU = open(panda_config.lockfile_getUU, 'w') + +# short-long mapping +shortLongMap = {'ANALY_BNL_ATLAS_1':'ANALY_LONG_BNL_ATLAS', + 'ANALY_LYON-T2' :'ANALY_LONG_LYON-T2', + 'ANALY_LYON_DCACHE':'ANALY_LONG_LYON_DCACHE', + 'ANALY_BNL_SHORT' :'ANALY_BNL_LONG', + } + +# processingType to skip brokerage +skipBrokerageProTypes = ['prod_test'] + +# comparison function for sort +def _compFunc(jobA,jobB): + # append site if not in list + if not jobA.computingSite in _allSites: + _allSites.append(jobA.computingSite) + if not jobB.computingSite in _allSites: + _allSites.append(jobB.computingSite) + # compare + indexA = _allSites.index(jobA.computingSite) + indexB = _allSites.index(jobB.computingSite) + if indexA > indexB: + return 1 + elif indexA < indexB: + return -1 + else: + return 0 + + +# release checker +def _checkRelease(jobRels,siteRels): + # all on/off + if "True" in siteRels: + return True + if "False" in siteRels: + return False + # loop over all releases + for tmpRel in jobRels.split('\n'): + relVer = re.sub('^Atlas-','',tmpRel) + # not available releases + if not relVer in siteRels: + return False + return True + + +# get list of files which already exist at the site +def _getOkFiles(v_ce,v_files,v_guids,allLFNs,allGUIDs,allOkFilesMap,tmpLog=None): + # DQ2 URL + dq2URL = v_ce.dq2url + dq2IDs = v_ce.setokens.values() + try: + dq2IDs.remove('') + except: + pass + dq2IDs.sort() + if dq2IDs == []: + dq2ID = v_ce.ddm + else: + dq2ID = '' + for tmpID in dq2IDs: + dq2ID += '%s,' % tmpID + dq2ID = dq2ID[:-1] + # set LFC and SE name + tmpSE = [] + if not v_ce.lfchost in [None,'']: + dq2URL = 'lfc://'+v_ce.lfchost+':/grid/atlas/' + tmpSE = broker_util.getSEfromSched(v_ce.se) + if tmpLog != None: + tmpLog.debug('getOkFiles for %s with dq2ID:%s,LFC:%s,SE:%s' % (v_ce.sitename,dq2ID,dq2URL,str(tmpSE))) + # use bulk lookup + if allLFNs != []: + # get bulk lookup data + if not allOkFilesMap.has_key(dq2ID): + # get files from LRC + allOkFilesMap[dq2ID] = broker_util.getFilesFromLRC(allLFNs,dq2URL,guids=allGUIDs, + storageName=tmpSE,getPFN=True) + # make return map + retMap = {} + for tmpLFN in v_files: + if allOkFilesMap[dq2ID].has_key(tmpLFN): + retMap[tmpLFN] = allOkFilesMap[dq2ID][tmpLFN] + # return + return retMap + else: + # old style + return 
broker_util.getFilesFromLRC(v_files,dq2URL,guids=v_guids, + storageName=tmpSE,getPFN=True) + + +# check reprocessing or not +def _isReproJob(tmpJob): + if tmpJob != None: + if tmpJob.processingType in ['reprocessing']: + return True + if tmpJob.transformation in ['csc_cosmics_trf.py','csc_BSreco_trf.py','BStoESDAODDPD_trf.py']: + return True + return False + + +# set 'ready' if files are already there +def _setReadyToFiles(tmpJob,okFiles,siteMapper,tmpLog): + allOK = True + tmpSiteSpec = siteMapper.getSite(tmpJob.computingSite) + tmpSrcSpec = siteMapper.getSite(siteMapper.getCloud(tmpJob.cloud)['source']) + # direct usage of remote SE + if tmpSiteSpec.ddm != tmpSrcSpec.ddm and tmpSrcSpec.ddm in tmpSiteSpec.setokens.values(): + tmpSiteSpec = tmpSrcSpec + tmpLog.debug('%s uses remote SiteSpec of %s for %s' % (tmpJob.PandaID,tmpSrcSpec.sitename,tmpJob.computingSite)) + tmpLog.debug('%s %s' % (tmpJob.PandaID,str(tmpSiteSpec.seprodpath))) + prestageSites = getPrestageSites(siteMapper) + for tmpFile in tmpJob.Files: + if tmpFile.type == 'input': + if DataServiceUtils.isCachedFile(tmpFile.dataset,tmpSiteSpec): + # cached file + tmpFile.status = 'cached' + tmpFile.dispatchDBlock = 'NULL' + elif (tmpJob.computingSite.endswith('_REPRO') or tmpJob.computingSite == siteMapper.getCloud(tmpJob.cloud)['source'] \ + or tmpSiteSpec.ddm == tmpSrcSpec.ddm) \ + and (not tmpJob.computingSite in prestageSites): + # EGEE T1. use DQ2 prestage only for on-tape files + if tmpSiteSpec.seprodpath.has_key('ATLASDATATAPE') and tmpSiteSpec.seprodpath.has_key('ATLASMCTAPE') and \ + okFiles.has_key(tmpFile.lfn): + tapeOnly = True + tapeCopy = False + for okPFN in okFiles[tmpFile.lfn]: + if re.search(tmpSiteSpec.seprodpath['ATLASDATATAPE'],okPFN) == None and \ + re.search(tmpSiteSpec.seprodpath['ATLASMCTAPE'],okPFN) == None: + # there is a disk copy + if tmpJob.cloud == 'US': + # check for BNLPANDA + if (tmpSiteSpec.seprodpath.has_key('ATLASMCDISK') and \ + re.search(tmpSiteSpec.seprodpath['ATLASMCDISK'],okPFN) != None) or \ + (tmpSiteSpec.seprodpath.has_key('ATLASDATADISK') and + re.search(tmpSiteSpec.seprodpath['ATLASDATADISK'],okPFN) != None): + tapeOnly = False + else: + tapeOnly = False + else: + # there is a tape copy + tapeCopy = True + # trigger prestage when disk copy doesn't exist or token is TAPE + if tapeOnly or (tapeCopy and tmpFile.dispatchDBlockToken in ['ATLASDATATAPE','ATLASMCTAPE']): + allOK = False + else: + # set ready + tmpFile.status = 'ready' + tmpFile.dispatchDBlock = 'NULL' + else: + # set ready anyway even if LFC is down. i.e. 
okFiles doesn't contain the file + tmpFile.status = 'ready' + tmpFile.dispatchDBlock = 'NULL' + elif (((tmpFile.lfn in okFiles) or (tmpJob.computingSite == tmpJob.destinationSE)) \ + and (not tmpJob.computingSite in prestageSites or \ + (tmpJob.computingSite in prestageSites and not tmpJob.cloud in ['US']))) \ + or tmpFile.status == 'missing': + # don't use TAPE replicas when T1 is used as T2 + if okFiles.has_key(tmpFile.lfn) and \ + tmpSiteSpec.seprodpath.has_key('ATLASDATATAPE') and len(okFiles[tmpFile.lfn]) == 1 and \ + re.search(tmpSiteSpec.seprodpath['ATLASDATATAPE'],okFiles[tmpFile.lfn][0]) != None: + allOK = False + else: + # set ready if the file exists and the site doesn't use prestage + tmpFile.status = 'ready' + tmpFile.dispatchDBlock = 'NULL' + else: + # prestage with PandaMover + allOK = False + # unset disp dataset + if allOK: + tmpJob.dispatchDBlock = 'NULL' + + + +# check number/size of inputs +def _isTooManyInput(nFilesPerJob,inputSizePerJob): + # the number of inputs is larger than 5 or + # size of inputs is larger than 500MB + if nFilesPerJob > 5 or inputSizePerJob > 500*1024*1024: + return True + return False + + +# send analysis brokerage info +def sendAnalyBrokeageInfo(results,prevRelease,diskThreshold,chosenSite,prevCmtConfig, + siteReliability): + # send log messages + messageList = [] + for resultType,resultList in results.iteritems(): + for resultItem in resultList: + if resultType == 'rel': + if prevCmtConfig in ['','NULL',None]: + msgBody = 'action=skip site=%s reason=missingapp - app=%s is missing' % (resultItem,prevRelease) + else: + msgBody = 'action=skip site=%s reason=missingapp - app=%s/%s is missing' % (resultItem,prevRelease,prevCmtConfig) + elif resultType == 'pilot': + msgBody = 'action=skip site=%s reason=nopilot - no pilots for last 3 hours' % resultItem + elif resultType == 'disk': + msgBody = 'action=skip site=%s reason=diskshortage - disk shortage < %sGB' % (resultItem,diskThreshold) + elif resultType == 'memory': + msgBody = 'action=skip site=%s reason=ramshortage - RAM shortage' % resultItem + elif resultType == 'maxtime': + msgBody = 'action=skip site=%s reason=maxtime - shorter walltime limit' % resultItem + elif resultType == 'status': + msgBody = 'action=skip site=%s reason=sitestatus - not online' % resultItem + elif resultType == 'reliability': + msgBody = 'action=skip site=%s reason=reliability - insufficient>%s' % (resultItem ,siteReliability) + elif resultType == 'weight': + tmpSite,tmpWeight = resultItem + if tmpSite == chosenSite: + msgBody = 'action=choose site=%s reason=maxweight - max weight=%s' % (tmpSite,tmpWeight) + else: + msgBody = 'action=skip site=%s reason=notmaxweight - weight=%s' % (tmpSite,tmpWeight) + elif resultType == 'prefcountry': + tmpSite,tmpCountry = resultItem + if tmpSite == chosenSite: + msgBody = 'action=prefer country=%s reason=countrygroup - preferential brokerage for beyond-pledge' % tmpCountry + else: + continue + else: + continue + messageList.append(msgBody) + # return + return messageList + + +# send analysis brokerage info to logger +def sendMsgToLogger(message): + _log.debug(message) + + +# send analysis brokerage info to logger with HTTP +def sendMsgToLoggerHTTP(msgList,job): + try: + # logging + iMsg = 0 + # message type + msgType = 'analy_brokerage' + # make header + if not job.jobsetID in [None,'NULL']: + msgHead = "dn='%s' : jobset=%s jobdef=%s" % (job.prodUserName,job.jobsetID,job.jobDefinitionID) + else: + msgHead = "dn='%s' : jobdef=%s" % (job.prodUserName,job.jobDefinitionID) + for msgBody 
in msgList: + # make message + message = msgHead + ' : ' + msgBody + # dump locally + _log.debug(message) + # get logger + _pandaLogger = PandaLogger() + _pandaLogger.lock() + _pandaLogger.setParams({'Type':msgType}) + logger = _pandaLogger.getHttpLogger(panda_config.loggername) + # add message + logger.info(message) + # release HTTP handler + _pandaLogger.release() + # sleep + iMsg += 1 + if iMsg % 5 == 0: + time.sleep(1) + except: + errType,errValue = sys.exc_info()[:2] + _log.error("sendMsgToLoggerHTTP : %s %s" % (errType,errValue)) + + +# get T2 candidates when files are missing at T2 +def getT2CandList(tmpJob,siteMapper,t2FilesMap): + if tmpJob == None: + return [] + # no cloud info + if not t2FilesMap.has_key(tmpJob.cloud): + return [] + # loop over all files + tmpCandT2s = None + for tmpFile in tmpJob.Files: + if tmpFile.type == 'input' and tmpFile.status == 'missing': + # no dataset info + if not t2FilesMap[tmpJob.cloud].has_key(tmpFile.dataset): + return [] + # initial candidates + if tmpCandT2s == None: + tmpCandT2s = t2FilesMap[tmpJob.cloud][tmpFile.dataset]['sites'] + # check all candidates + newCandT2s = [] + for tmpCandT2 in tmpCandT2s: + # site doesn't have the dataset + if not t2FilesMap[tmpJob.cloud][tmpFile.dataset]['sites'].has_key(tmpCandT2): + continue + # site has the file + if tmpFile.lfn in t2FilesMap[tmpJob.cloud][tmpFile.dataset]['sites'][tmpCandT2]: + if not tmpCandT2 in newCandT2s: + newCandT2s.append(tmpCandT2) + # set new candidates + tmpCandT2s = newCandT2s + if tmpCandT2s == []: + break + # return [] if no missing files + if tmpCandT2s == None: + return [] + # return + tmpCandT2s.sort() + return tmpCandT2s + + +# get hospital queues +def getHospitalQueues(siteMapper): + retMap = {} + # hospital words + goodWordList = ['CORE$','VL$','MEM$','MP\d+$','LONG$'] + # loop over all clouds + for tmpCloudName in siteMapper.getCloudList(): + # get cloud + tmpCloudSpec = siteMapper.getCloud(tmpCloudName) + # get T1 + tmpT1Name = tmpCloudSpec['source'] + tmpT1Spec = siteMapper.getSite(tmpT1Name) + # skip if DDM is undefined + if tmpT1Spec.ddm == []: + continue + # loop over all sites + for tmpSiteName in tmpCloudSpec['sites']: + # skip T1 defined in cloudconfig + if tmpSiteName == tmpT1Name: + continue + # check hospital words + checkHospWord = False + for tmpGoodWord in goodWordList: + if re.search(tmpGoodWord,tmpSiteName) != None: + checkHospWord = True + break + if not checkHospWord: + continue + # check site + if not siteMapper.checkSite(tmpSiteName): + continue + tmpSiteSpec = siteMapper.getSite(tmpSiteName) + # check DDM + if tmpT1Spec.ddm == tmpSiteSpec.ddm: + # append + if not retMap.has_key(tmpCloudName): + retMap[tmpCloudName] = [] + if not tmpSiteName in retMap[tmpCloudName]: + retMap[tmpCloudName].append(tmpSiteName) + _log.debug('hospital queues : %s' % str(retMap)) + # return + return retMap + + +# get prestage sites +def getPrestageSites(siteMapper): + retList = [] + # get cloud + tmpCloudSpec = siteMapper.getCloud('US') + # get T1 + tmpT1Name = tmpCloudSpec['source'] + tmpT1Spec = siteMapper.getSite(tmpT1Name) + # loop over all sites + for tmpSiteName in tmpCloudSpec['sites']: + # check site + if not siteMapper.checkSite(tmpSiteName): + continue + # get spec + tmpSiteSpec = siteMapper.getSite(tmpSiteName) + # add if DDM is the same as T1 + if tmpT1Spec.ddm == tmpSiteSpec.ddm and not tmpSiteName in retList: + retList.append(tmpSiteName) + _log.debug('US prestage sites : %s' % str(retList)) + # return + return retList + + +# make compact dialog message +def 
makeCompactDiagMessage(header,results): + # limit + maxSiteList = 5 + # types for compact format + compactTypeList = ['status','cpucore'] + # message mapping + messageMap = {'rel' : 'missing rel/cache', + 'pilot' : 'no pilot', + 'status' : 'not online', + 'disk' : 'SE full', + 'memory' : 'RAM shortage', + 'transferring' : 'many transferring', + 'share' : 'zero share', + 'maxtime' : 'short walltime', + 'cpucore' : 'CPU core mismatch', + 'scratch' : 'small scratch disk' + } + # put header + if header in ['',None]: + retStr = 'No candidate - ' + else: + retStr = 'special brokerage for %s - ' % header + # count number of sites per type + numTypeMap = {} + for resultType,resultList in results.iteritems(): + # ignore empty + if len(resultList) == 0: + continue + # add + nSites = len(resultList) + if not numTypeMap.has_key(nSites): + numTypeMap[nSites] = [] + numTypeMap[nSites].append(resultType) + # sort + numTypeKeys = numTypeMap.keys() + numTypeKeys.sort() + # use compact format for largest one + largeTypes = None + if len(numTypeKeys) > 0: + largeTypes = numTypeMap[numTypeKeys[-1]] + # loop over all types + for numTypeKey in numTypeKeys: + for resultType in numTypeMap[numTypeKey]: + # label + if messageMap.has_key(resultType): + retStr += '%s at ' % messageMap[resultType] + else: + retStr += '%s at' % resultType + # use comact format or not + if (resultType in compactTypeList+largeTypes \ + or len(results[resultType]) >= maxSiteList) \ + and header in ['',None,'reprocessing'] : + if len(results[resultType]) == 1: + retStr += '%s site' % len(results[resultType]) + else: + retStr += '%s sites' % len(results[resultType]) + else: + for tmpSite in results[resultType]: + retStr += '%s,' % tmpSite + retStr = retStr[:-1] + retStr += '. ' + retStr = retStr[:-2] + # return + return retStr + + +# message class +class MsgWrapper: + def __init__(self): + self.timestamp = datetime.datetime.utcnow().isoformat('/') + + def info(self,msg): + _log.info(self.timestamp + ' ' + msg) + + def debug(self,msg): + _log.debug(self.timestamp + ' ' + msg) + + def error(self,msg): + _log.error(self.timestamp + ' ' + msg) + + def warning(self,msg): + _log.warning(self.timestamp + ' ' + msg) + + + +# schedule +def schedule(jobs,taskBuffer,siteMapper,forAnalysis=False,setScanSiteList=[],trustIS=False, + distinguishedName=None,specialWeight={},getWeight=False,sizeMapForCheck={}, + datasetSize=0,replicaMap={},pd2pT1=False,reportLog=False,minPriority=None, + t2FilesMap={},preferredCountries=[],siteReliability=None): + # make a message instance + tmpLog = MsgWrapper() + try: + tmpLog.debug('start %s %s %s %s minPrio=%s pref=%s siteRel=%s' % (forAnalysis,str(setScanSiteList),trustIS, + distinguishedName,minPriority, + str(preferredCountries), + siteReliability)) + if specialWeight != {}: + tmpLog.debug('PD2P weight : %s' % str(specialWeight)) + tmpLog.debug('replicaMap : %s' % str(replicaMap)) + # no jobs + if len(jobs) == 0: + tmpLog.debug('finished : no jobs') + return + allOkFilesMap = {} + # use ANALY_CERN_XROOTD and not ANALY_CERN for EOS migration + if forAnalysis: + if 'ANALY_CERN_XROOTD' in setScanSiteList and 'ANALY_CERN' in setScanSiteList: + setScanSiteList.remove('ANALY_CERN') + tmpLog.debug('remove ANALY_CERN since ANALY_CERN_XROOTD is also a candidate') + nJob = 20 + iJob = 0 + nFile = 20 + fileList = [] + guidList = [] + okFiles = {} + totalNumInputs = 0 + totalInputSize = 0 + chosen_ce = None + prodDBlock = None + computingSite = None + dispatchDBlock = None + previousCloud = None + prevRelease = None + prevMemory 
= None + prevCmtConfig = None + prevProType = None + prevSourceLabel= None + prevDiskCount = None + prevHomePkg = None + prevDirectAcc = None + prevCoreCount = None + prevBrokergageSiteList = None + prevManualPreset = None + prevGoToT2Flag = None + prevWorkingGroup = None + prevMaxCpuCount = None + prevBrokerageNote = None + prevPriority = None + + nWNmap = {} + indexJob = 0 + vomsOK = None + + diskThreshold = 200 + diskThresholdPD2P = 1024 * 3 + manyInputsThr = 20 + weightUsedByBrokerage = {} + + prestageSites = getPrestageSites(siteMapper) + + # get statistics + faresharePolicy = {} + newJobStatWithPrio = {} + jobStatBrokerCloudsWithPrio = {} + if len(jobs) > 0 and (jobs[0].processingType.startswith('gangarobot') or \ + jobs[0].processingType.startswith('hammercloud') or \ + jobs[0].processingType in ['pandamover','usermerge']): + # disable redundant counting for HC + jobStatistics = {} + jobStatBroker = {} + jobStatBrokerClouds = {} + nRunningMap = {} + hospitalQueueMap = {} + else: + jobStatistics = taskBuffer.getJobStatistics(forAnal=forAnalysis) + if not forAnalysis: + jobStatBroker = {} + jobStatBrokerClouds = taskBuffer.getJobStatisticsBrokerage() + faresharePolicy = taskBuffer.getFaresharePolicy() + else: + if minPriority == None: + jobStatBroker = taskBuffer.getJobStatisticsAnalBrokerage() + else: + jobStatBroker = taskBuffer.getJobStatisticsAnalBrokerage(minPriority=minPriority) + nRunningMap = taskBuffer.getnRunningInSiteData() + hospitalQueueMap = getHospitalQueues(siteMapper) + # sort jobs by siteID. Some jobs may already define computingSite + jobs.sort(_compFunc) + # brokerage for analysis + candidateForAnal = True + relCloudMap = {} + loggerMessages = [] + # get all input files for bulk LFC lookup + allLFNs = [] + allGUIDs = [] + for tmpJob in jobs: + if tmpJob.prodSourceLabel in ('test','managed'): + for tmpFile in tmpJob.Files: + if tmpFile.type == 'input' and not tmpFile.lfn in allLFNs: + allLFNs.append(tmpFile.lfn) + allGUIDs.append(tmpFile.GUID) + # loop over all jobs + terminator(None) + for job in jobs+[None]: + indexJob += 1 + # ignore failed jobs + if job == None: + pass + elif job.jobStatus == 'failed': + continue + # list of sites for special brokerage + specialBrokergageSiteList = [] + # note for brokerage + brokerageNote = '' + # send jobs to T2 when files are missing at T1 + goToT2Flag = False + if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \ + and specialBrokergageSiteList == []: + currentT2CandList = getT2CandList(job,siteMapper,t2FilesMap) + if currentT2CandList != []: + goToT2Flag = True + specialBrokergageSiteList = currentT2CandList + tmpLog.debug('PandaID:%s -> set SiteList=%s to use T2 for missing files at T1' % (job.PandaID,specialBrokergageSiteList)) + brokerageNote = 'useT2' + # hack for split T1 + if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \ + and job.cloud == 'NL' and specialBrokergageSiteList == []: + # loop over all input datasets + tmpCheckedDS = [] + useSplitT1 = None + for tmpFile in job.Files: + if tmpFile.type == 'input' and (not tmpFile.dataset.startswith('ddo')) \ + and (not tmpFile.dataset in tmpCheckedDS): + # init + if useSplitT1 == None: + useSplitT1 = True + # no replica map + if not replicaMap.has_key(tmpFile.dataset): + # not set + useSplitT1 = False + break + # check if input datasets are available only at NIKHEF + tmpRepMap = replicaMap[tmpFile.dataset] + splitT1HasDS = False + for tmpSplitT1Key in tmpRepMap.keys(): + if 
tmpSplitT1Key.startswith('NIKHEF-ELPROD'): + splitT1HasDS = True + break + if splitT1HasDS \ + and not tmpRepMap.has_key('SARA-MATRIX_MCDISK') \ + and not tmpRepMap.has_key('SARA-MATRIX_DATADISK') \ + and not tmpRepMap.has_key('SARA-MATRIX_MCTAPE') \ + and not tmpRepMap.has_key('SARA-MATRIX_DATATAPE'): + pass + else: + # not set + useSplitT1 = False + break + # set + if useSplitT1 == True: + specialBrokergageSiteList = ['NIKHEF-ELPROD'] + tmpLog.debug('PandaID:%s -> set SiteList=%s for split T1' % (job.PandaID,specialBrokergageSiteList)) + brokerageNote = 'useSplitNLT1' + # set computingSite to T1 for high priority jobs + if job != None and job.currentPriority >= 950 and job.computingSite == 'NULL' \ + and job.prodSourceLabel in ('test','managed') and specialBrokergageSiteList == []: + specialBrokergageSiteList = [siteMapper.getCloud(job.cloud)['source']] + # set site list to use T1 and T1_VL + if hospitalQueueMap.has_key(job.cloud): + specialBrokergageSiteList += hospitalQueueMap[job.cloud] + tmpLog.debug('PandaID:%s -> set SiteList=%s for high prio' % (job.PandaID,specialBrokergageSiteList)) + brokerageNote = 'highPrio' + # set computingSite to T1 when too many inputs are required + if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \ + and specialBrokergageSiteList == []: + # counts # of inputs + tmpTotalInput = 0 + for tmpFile in job.Files: + if tmpFile.type == 'input': + tmpTotalInput += 1 + if tmpTotalInput >= manyInputsThr: + specialBrokergageSiteList = [siteMapper.getCloud(job.cloud)['source']] + # set site list to use T1 and T1_VL + if hospitalQueueMap.has_key(job.cloud): + specialBrokergageSiteList += hospitalQueueMap[job.cloud] + tmpLog.debug('PandaID:%s -> set SiteList=%s for too many inputs' % (job.PandaID,specialBrokergageSiteList)) + brokerageNote = 'manyInput' + # use limited sites for reprocessing + if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \ + and job.processingType in ['reprocessing'] and specialBrokergageSiteList == []: + for tmpSiteName in siteMapper.getCloud(job.cloud)['sites']: + if siteMapper.checkSite(tmpSiteName): + tmpSiteSpec = siteMapper.getSite(tmpSiteName) + if _checkRelease(job.AtlasRelease,tmpSiteSpec.validatedreleases): + specialBrokergageSiteList.append(tmpSiteName) + tmpLog.debug('PandaID:%s -> set SiteList=%s for processingType=%s' % (job.PandaID,specialBrokergageSiteList,job.processingType)) + brokerageNote = '%s' % job.processingType + # use limited sites for MP jobs + if job != None and job.computingSite == 'NULL' and job.prodSourceLabel in ('test','managed') \ + and not job.coreCount in [None,'NULL'] and job.coreCount > 1 and specialBrokergageSiteList == []: + for tmpSiteName in siteMapper.getCloud(job.cloud)['sites']: + if siteMapper.checkSite(tmpSiteName): + tmpSiteSpec = siteMapper.getSite(tmpSiteName) + if tmpSiteSpec.coreCount > 1: + specialBrokergageSiteList.append(tmpSiteName) + tmpLog.debug('PandaID:%s -> set SiteList=%s for MP=%scores' % (job.PandaID,specialBrokergageSiteList,job.coreCount)) + brokerageNote = 'MP=%score' % job.coreCount + # manually set site + manualPreset = False + if job != None and job.computingSite != 'NULL' and job.prodSourceLabel in ('test','managed') \ + and specialBrokergageSiteList == []: + specialBrokergageSiteList = [job.computingSite] + manualPreset = True + brokerageNote = 'presetSite' + overwriteSite = False + # new bunch or terminator + if job == None or len(fileList) >= nFile \ + or (dispatchDBlock == None and 
job.homepackage.startswith('AnalysisTransforms')) \ + or prodDBlock != job.prodDBlock or job.computingSite != computingSite or iJob > nJob \ + or previousCloud != job.cloud or prevRelease != job.AtlasRelease \ + or prevCmtConfig != job.cmtConfig \ + or (computingSite in ['RAL_REPRO','INFN-T1_REPRO'] and len(fileList)>=2) \ + or (prevProType in skipBrokerageProTypes and iJob > 0) \ + or prevDirectAcc != job.transferType \ + or prevMemory != job.minRamCount \ + or prevDiskCount != job.maxDiskCount \ + or prevCoreCount != job.coreCount \ + or prevWorkingGroup != job.workingGroup \ + or prevProType != job.processingType \ + or prevMaxCpuCount != job.maxCpuCount \ + or prevBrokergageSiteList != specialBrokergageSiteList: + if indexJob > 1: + tmpLog.debug('new bunch') + tmpLog.debug(' iJob %s' % iJob) + tmpLog.debug(' cloud %s' % previousCloud) + tmpLog.debug(' rel %s' % prevRelease) + tmpLog.debug(' sourceLabel %s' % prevSourceLabel) + tmpLog.debug(' cmtConfig %s' % prevCmtConfig) + tmpLog.debug(' memory %s' % prevMemory) + tmpLog.debug(' priority %s' % prevPriority) + tmpLog.debug(' prodDBlock %s' % prodDBlock) + tmpLog.debug(' computingSite %s' % computingSite) + tmpLog.debug(' processingType %s' % prevProType) + tmpLog.debug(' workingGroup %s' % prevWorkingGroup) + tmpLog.debug(' coreCount %s' % prevCoreCount) + tmpLog.debug(' maxCpuCount %s' % prevMaxCpuCount) + tmpLog.debug(' transferType %s' % prevDirectAcc) + tmpLog.debug(' goToT2 %s' % prevGoToT2Flag) + # brokerage decisions + resultsForAnal = {'rel':[],'pilot':[],'disk':[],'status':[],'weight':[],'memory':[], + 'share':[],'transferring':[],'prefcountry':[],'cpucore':[], + 'reliability':[],'maxtime':[],'scratch':[]} + # determine site + if (iJob == 0 or chosen_ce != 'TOBEDONE') and prevBrokergageSiteList in [None,[]]: + # file scan for pre-assigned jobs + jobsInBunch = jobs[indexJob-iJob-1:indexJob-1] + if jobsInBunch != [] and fileList != [] and (not computingSite in prestageSites) \ + and (jobsInBunch[0].prodSourceLabel in ['managed','software'] or \ + re.search('test',jobsInBunch[0].prodSourceLabel) != None): + # get site spec + tmp_chosen_ce = siteMapper.getSite(computingSite) + # get files from LRC + okFiles = _getOkFiles(tmp_chosen_ce,fileList,guidList,allLFNs,allGUIDs,allOkFilesMap,tmpLog) + # loop over all jobs + for tmpJob in jobsInBunch: + # set 'ready' if files are already there + _setReadyToFiles(tmpJob,okFiles,siteMapper,tmpLog) + else: + # load balancing + minSites = {} + nMinSites = 2 + if prevBrokergageSiteList != []: + # special brokerage + scanSiteList = prevBrokergageSiteList + elif setScanSiteList == []: + if siteMapper.checkCloud(previousCloud): + # use cloud sites + scanSiteList = siteMapper.getCloud(previousCloud)['sites'] + else: + # use default sites + scanSiteList = siteMapper.getCloud('default')['sites'] + else: + # use given sites + scanSiteList = setScanSiteList + # add long queue + for tmpShortQueue,tmpLongQueue in shortLongMap.iteritems(): + if tmpShortQueue in scanSiteList: + if not tmpLongQueue in scanSiteList: + scanSiteList.append(tmpLongQueue) + # the number/size of inputs per job + nFilesPerJob = float(totalNumInputs)/float(iJob) + inputSizePerJob = float(totalInputSize)/float(iJob) + # use T1 for jobs with many inputs when weight is negative + if (not forAnalysis) and _isTooManyInput(nFilesPerJob,inputSizePerJob) and \ + siteMapper.getCloud(previousCloud)['weight'] < 0 and prevManualPreset == False: + scanSiteList = [siteMapper.getCloud(previousCloud)['source']] + # set site list to use T1 and T1_VL 
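The per-bunch averages computed just above (nFilesPerJob, inputSizePerJob) feed the _isTooManyInput() check, whose definition lives elsewhere in broker.py and is not shown here. The sketch below only illustrates how such a check can be wired up; the thresholds (5 files, 500 MB per job) are assumptions, not the values used by the server.

# illustrative sketch only -- thresholds are assumed, not taken from broker.py
ASSUMED_MAX_FILES_PER_JOB = 5
ASSUMED_MAX_INPUT_SIZE_PER_JOB = 500 * 1024 * 1024   # 500 MB in bytes

def _isTooManyInputSketch(nFilesPerJob, inputSizePerJob):
    # a bunch is "input heavy" when either the average number of input files
    # or the average input size per job exceeds its threshold
    return nFilesPerJob > ASSUMED_MAX_FILES_PER_JOB or \
           inputSizePerJob > ASSUMED_MAX_INPUT_SIZE_PER_JOB

# example: 30 inputs totalling 12 GB spread over 4 jobs
nFilesPerJob = float(30) / float(4)
inputSizePerJob = float(12 * 1024 * 1024 * 1024) / float(4)
assert _isTooManyInputSketch(nFilesPerJob, inputSizePerJob)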
+ if hospitalQueueMap.has_key(previousCloud): + scanSiteList += hospitalQueueMap[previousCloud] + # get availabe sites with cache + useCacheVersion = False + siteListWithCache = [] + if forAnalysis: + if re.search('-\d+\.\d+\.\d+\.\d+',prevRelease) != None: + useCacheVersion = True + siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList,caches=prevRelease,cmtConfig=prevCmtConfig) + tmpLog.debug(' using installSW for cache %s' % prevRelease) + elif re.search('-\d+\.\d+\.\d+$',prevRelease) != None: + useCacheVersion = True + siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList,releases=prevRelease,cmtConfig=prevCmtConfig) + tmpLog.debug(' using installSW for release %s' % prevRelease) + elif re.search(':rel_\d+$$',prevRelease) != None: + useCacheVersion = True + iteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList, + releases=prevRelease.split(':')[0], + caches=prevRelease.split(':')[1], + cmtConfig=prevCmtConfig) + tmpLog.debug(' using installSW for release:cache %s' % prevRelease) + elif previousCloud in ['DE','NL','FR','CA','ES','IT','TW','UK','US','ND','CERN','RU']: + useCacheVersion = True + # change / to - + convedPrevHomePkg = prevHomePkg.replace('/','-') + if re.search('rel_\d+(\n|$)',prevHomePkg) == None: + # only cache is used for normal jobs + siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList,caches=convedPrevHomePkg, + cmtConfig=prevCmtConfig) + else: + # both AtlasRelease and homepackage are used for nightlies + siteListWithCache = taskBuffer.checkSitesWithRelease(scanSiteList, + releases=prevRelease, + caches=convedPrevHomePkg, + cmtConfig=prevCmtConfig) + tmpLog.debug(' cache %s' % prevHomePkg) + if useCacheVersion: + tmpLog.debug(' cache/relSites %s' % str(siteListWithCache)) + # release/cmtconfig check + foundRelease = False + # found candidate + foundOneCandidate = False + # randomize the order + if forAnalysis: + random.shuffle(scanSiteList) + # get cnadidates + if True: + # loop over all sites + for site in scanSiteList: + tmpLog.debug('calculate weight for site:%s' % site) + # _allSites may conain NULL after sort() + if site == 'NULL': + continue + # ignore test sites + if (prevManualPreset == False) and (site.endswith('test') or \ + site.endswith('Test') or site.startswith('Test')): + continue + # ignore analysis queues + if (not forAnalysis) and site.startswith('ANALY'): + continue + # get SiteSpec + if siteMapper.checkSite(site): + tmpSiteSpec = siteMapper.getSite(site) + else: + tmpLog.debug(" skip: %s doesn't exist in DB" % site) + continue + # check status + if tmpSiteSpec.status in ['offline','brokeroff'] and computingSite in ['NULL',None,'']: + if forAnalysis and tmpSiteSpec.status == 'brokeroff' and tmpSiteSpec.accesscontrol == 'grouplist': + # ignore brokeroff for grouplist site + pass + elif forAnalysis and prevProType in ['hammercloud','gangarobot','gangarobot-squid']: + # ignore site status for HC + pass + else: + tmpLog.debug(' skip: status %s' % tmpSiteSpec.status) + resultsForAnal['status'].append(site) + continue + if tmpSiteSpec.status == 'test' and (not prevProType in ['prod_test','hammercloud','gangarobot','gangarobot-squid']) \ + and not prevSourceLabel in ['test','prod_test']: + tmpLog.debug(' skip: status %s for %s' % (tmpSiteSpec.status,prevProType)) + resultsForAnal['status'].append(site) + continue + tmpLog.debug(' status=%s' % tmpSiteSpec.status) + # check core count + if tmpSiteSpec.coreCount > 1: + # use multi-core queue for MP jobs + if not prevCoreCount in [None,'NULL'] and prevCoreCount 
> 1: + pass + else: + tmpLog.debug(' skip: MP site (%s core) for job.coreCount=%s' % (tmpSiteSpec.coreCount, + prevCoreCount)) + resultsForAnal['cpucore'].append(site) + continue + else: + # use single core for non-MP jobs + if not prevCoreCount in [None,'NULL'] and prevCoreCount > 1: + tmpLog.debug(' skip: single core site (%s core) for job.coreCount=%s' % (tmpSiteSpec.coreCount, + prevCoreCount)) + resultsForAnal['cpucore'].append(site) + continue + # check memory + if tmpSiteSpec.memory != 0 and not prevMemory in [None,0,'NULL']: + try: + if int(tmpSiteSpec.memory) < int(prevMemory): + tmpLog.debug(' skip: memory shortage %s<%s' % (tmpSiteSpec.memory,prevMemory)) + resultsForAnal['memory'].append(site) + continue + except: + errtype,errvalue = sys.exc_info()[:2] + tmpLog.error("memory check : %s %s" % (errtype,errvalue)) + # check maxcpucount + if tmpSiteSpec.maxtime != 0 and not prevMaxCpuCount in [None,0,'NULL']: + try: + if int(tmpSiteSpec.maxtime) < int(prevMaxCpuCount): + tmpLog.debug(' skip: insufficient maxtime %s<%s' % (tmpSiteSpec.maxtime,prevMaxCpuCount)) + resultsForAnal['maxtime'].append(site) + continue + except: + errtype,errvalue = sys.exc_info()[:2] + tmpLog.error("maxtime check : %s %s" % (errtype,errvalue)) + # check max input size + if tmpSiteSpec.maxinputsize != 0 and (not prevDiskCount in [None,0,'NULL']): + try: + if int(tmpSiteSpec.maxinputsize) < int(prevDiskCount): + tmpLog.debug(' skip: not enough disk %s<%s' % (tmpSiteSpec.maxinputsize,prevDiskCount)) + resultsForAnal['scratch'].append(site) + continue + except: + errtype,errvalue = sys.exc_info()[:2] + tmpLog.error("disk check : %s %s" % (errtype,errvalue)) + tmpLog.debug(' maxinput=%s' % tmpSiteSpec.maxinputsize) + # reliability + if forAnalysis and isinstance(siteReliability,types.IntType): + if tmpSiteSpec.reliabilityLevel != None and tmpSiteSpec.reliabilityLevel > siteReliability: + tmpLog.debug(' skip: insufficient reliability %s > %s' % (tmpSiteSpec.reliabilityLevel,siteReliability)) + resultsForAnal['reliability'].append(site) + continue + # change NULL cmtconfig to slc3/4 + if prevCmtConfig in ['NULL','',None]: + if forAnalysis: + tmpCmtConfig = 'i686-slc4-gcc34-opt' + else: + tmpCmtConfig = 'i686-slc3-gcc323-opt' + else: + tmpCmtConfig = prevCmtConfig + # set release + releases = tmpSiteSpec.releases + origReleases = releases + if prevProType in ['reprocessing']: + # use validated releases for reprocessing + releases = tmpSiteSpec.validatedreleases + if not useCacheVersion: + tmpLog.debug(' %s' % str(releases)) + if origReleases == ['ANY']: + # doesn't check releases for catch all + tmpLog.debug(' no release check due to releases=%s' % origReleases) + foundRelease = True + elif forAnalysis and (tmpSiteSpec.cloud in ['ND'] or prevRelease==''): + # doesn't check releases for analysis + tmpLog.debug(' no release check') + pass + elif forAnalysis and useCacheVersion: + # cache matching + if not site in siteListWithCache: + tmpLog.debug(' skip: cache %s/%s not found' % (prevRelease.replace('\n',' '),prevCmtConfig)) + if trustIS: + resultsForAnal['rel'].append(site) + continue + elif prevRelease != None and \ + (useCacheVersion and not tmpSiteSpec.cloud in ['ND'] and not site in ['CERN-RELEASE']) and \ + (not prevProType in ['reprocessing']) and \ + (not site in siteListWithCache): + tmpLog.debug(' skip: cache %s/%s not found' % (prevHomePkg.replace('\n',' '),prevCmtConfig)) + # send message to logger + try: + if prevSourceLabel in ['managed','test']: + resultsForAnal['rel'].append(site) + # make message + 
message = '%s - cache %s/%s not found' % (site,prevHomePkg.replace('\n',' '),prevCmtConfig) + if not message in loggerMessages: + loggerMessages.append(message) + except: + pass + continue + elif prevRelease != None and \ + ((not useCacheVersion and releases != [] and not tmpSiteSpec.cloud in ['ND'] and not site in ['CERN-RELEASE']) or prevProType in ['reprocessing']) and \ + (((not _checkRelease(prevRelease,releases) and prevManualPreset == False) or not site in siteListWithCache) and not tmpSiteSpec.cloud in ['ND'] and not site in ['CERN-RELEASE']): + # release matching + if not useCacheVersion: + tmpLog.debug(' skip: release %s/%s not found' % (prevRelease.replace('\n',' '),prevCmtConfig)) + else: + tmpLog.debug(' skip: repro cache %s/%s not found' % (prevHomePkg.replace('\n',' '),prevCmtConfig)) + resultsForAnal['rel'].append(site) + continue + elif not foundRelease: + # found at least one site has the release + foundRelease = True + # direct access + if prevDirectAcc == 'direct' and not tmpSiteSpec.allowdirectaccess: + tmpLog.debug(' skip: no direct access support') + continue + # get pilot statistics + nPilotsGet = 0 + nPilotsUpdate = 0 + if nWNmap == {}: + nWNmap = taskBuffer.getCurrentSiteData() + if nWNmap.has_key(site): + nPilots = nWNmap[site]['getJob'] + nWNmap[site]['updateJob'] + nPilotsGet = nWNmap[site]['getJob'] + nPilotsUpdate = nWNmap[site]['updateJob'] + else: + nPilots = 0 + tmpLog.debug(' original nPilots:%s get:%s update:%s' % (nPilots,nPilotsGet,nPilotsUpdate)) + # limit on (G+1)/(U+1) + limitOnGUmax = 2.0 + limitOnGUmin = 0.5 + guRatio = float(1+nPilotsGet)/float(1+nPilotsUpdate) + if guRatio > limitOnGUmax: + nPilotsGet = limitOnGUmax * float(1+nPilotsUpdate) - 1.0 + elif guRatio < limitOnGUmin: + nPilotsGet = limitOnGUmin * float(1+nPilotsUpdate) - 1.0 + tmpLog.debug(' limited nPilots:%s get:%s update:%s' % (nPilots,nPilotsGet,nPilotsUpdate)) + # if no pilots + if nPilots == 0 and nWNmap != {}: + tmpLog.debug(" skip: %s no pilot" % site) + resultsForAnal['pilot'].append(site) + continue + # if no jobs in jobsActive/jobsDefined + if not jobStatistics.has_key(site): + jobStatistics[site] = {'assigned':0,'activated':0,'running':0,'transferring':0} + # set nRunning + if forAnalysis: + if not nRunningMap.has_key(site): + nRunningMap[site] = 0 + # check space + if specialWeight != {}: + # for PD2P + if sizeMapForCheck.has_key(site): + # threshold for PD2P max(5%,3TB) + thrForThisSite = long(sizeMapForCheck[site]['total'] * 5 / 100) + if thrForThisSite < diskThresholdPD2P: + thrForThisSite = diskThresholdPD2P + remSpace = sizeMapForCheck[site]['total'] - sizeMapForCheck[site]['used'] + tmpLog.debug(' space available=%s remain=%s thr=%s' % (sizeMapForCheck[site]['total'], + remSpace,thrForThisSite)) + if remSpace-datasetSize < thrForThisSite: + tmpLog.debug(' skip: disk shortage %s-%s< %s' % (remSpace,datasetSize,thrForThisSite)) + if getWeight: + weightUsedByBrokerage[site] = "NA : disk shortage" + continue + elif site != siteMapper.getCloud(previousCloud)['source']: + # for T2 + if tmpSiteSpec.space != 0: + nRemJobs = jobStatistics[site]['assigned']+jobStatistics[site]['activated']+jobStatistics[site]['running'] + if not forAnalysis: + # take assigned/activated/running jobs into account for production + remSpace = tmpSiteSpec.space - 0.250*nRemJobs + else: + remSpace = tmpSiteSpec.space + tmpLog.debug(' space available=%s remain=%s' % (tmpSiteSpec.space,remSpace)) + if remSpace < diskThreshold: + tmpLog.debug(' skip: disk shortage < %s' % diskThreshold) + 
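The T2 disk check above reserves a nominal amount of scratch for every assigned, activated and running production job before comparing the remainder against diskThreshold. A standalone sketch of that bookkeeping follows; it assumes, as the code above suggests, that site space and the 200 GB threshold are expressed in GB, with 0.25 GB held per queued or running job.

# sketch of the production-side space check above; example numbers are invented
DISK_THRESHOLD_GB = 200          # same role as diskThreshold above
RESERVED_GB_PER_JOB = 0.250      # space assumed to be held by each queued/running job

def has_enough_scratch(site_space_gb, n_assigned, n_activated, n_running):
    n_rem_jobs = n_assigned + n_activated + n_running
    rem_space = site_space_gb - RESERVED_GB_PER_JOB * n_rem_jobs
    return rem_space >= DISK_THRESHOLD_GB

# a site reporting 1000 GB with 3500 queued+running jobs drops below the threshold
assert not has_enough_scratch(1000, 1500, 1000, 1000)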
resultsForAnal['disk'].append(site) + # keep message to logger + try: + if prevSourceLabel in ['managed','test']: + # make message + message = '%s - disk %s < %s' % (site,remSpace,diskThreshold) + if not message in loggerMessages: + loggerMessages.append(message) + except: + pass + continue + # get the process group + tmpProGroup = ProcessGroups.getProcessGroup(prevProType) + if prevProType in skipBrokerageProTypes: + # use original processingType since prod_test is in the test category and thus is interfered by validations + tmpProGroup = prevProType + # production share + skipDueToShare = False + try: + if not forAnalysis and prevSourceLabel in ['managed'] and faresharePolicy.has_key(site): + for tmpPolicy in faresharePolicy[site]['policyList']: + # ignore priority policy + if tmpPolicy['priority'] != None: + continue + # only zero share + if tmpPolicy['share'] != '0%': + continue + # check group + if tmpPolicy['group'] != None: + if '*' in tmpPolicy['group']: + # wildcard + tmpPatt = '^' + tmpPolicy['group'].replace('*','.*') + '$' + if re.search(tmpPatt,prevWorkingGroup) == None: + continue + else: + # normal definition + if prevWorkingGroup != tmpPolicy['group']: + continue + else: + # catch all except WGs used by other policies + groupInDefList = faresharePolicy[site]['groupList'] + usedByAnother = False + # loop over all groups + for groupInDefItem in groupInDefList: + if '*' in groupInDefItem: + # wildcard + tmpPatt = '^' + groupInDefItem.replace('*','.*') + '$' + if re.search(tmpPatt,prevWorkingGroup) != None: + usedByAnother = True + break + else: + # normal definition + if prevWorkingGroup == groupInDefItem: + usedByAnother = True + break + if usedByAnother: + continue + # check type + if tmpPolicy['type'] != None: + if tmpPolicy['type'] == tmpProGroup: + skipDueToShare = True + break + else: + # catch all except PGs used by other policies + typeInDefList = faresharePolicy[site]['typeList'][tmpPolicy['group']] + usedByAnother = False + for typeInDefItem in typeInDefList: + if typeInDefItem == tmpProGroup: + usedByAnother = True + break + if not usedByAnother: + skipDueToShare = True + break + # skip + if skipDueToShare: + tmpLog.debug(" skip: %s zero share" % site) + resultsForAnal['share'].append(site) + continue + except: + errtype,errvalue = sys.exc_info()[:2] + tmpLog.error("share check : %s %s" % (errtype,errvalue)) + # the number of assigned and activated + if not forAnalysis: + if not jobStatBrokerClouds.has_key(previousCloud): + jobStatBrokerClouds[previousCloud] = {} + # use number of jobs in the cloud + jobStatBroker = jobStatBrokerClouds[previousCloud] + if not jobStatBroker.has_key(site): + jobStatBroker[site] = {} + if not jobStatBroker[site].has_key(tmpProGroup): + jobStatBroker[site][tmpProGroup] = {'assigned':0,'activated':0,'running':0,'transferring':0} + # count # of assigned and activated jobs for prod by taking priorities in to account + nRunJobsPerGroup = None + if not forAnalysis and prevSourceLabel in ['managed','test']: + if not jobStatBrokerCloudsWithPrio.has_key(prevPriority): + jobStatBrokerCloudsWithPrio[prevPriority] = taskBuffer.getJobStatisticsBrokerage(prevPriority) + if not jobStatBrokerCloudsWithPrio[prevPriority].has_key(previousCloud): + jobStatBrokerCloudsWithPrio[prevPriority][previousCloud] = {} + if not jobStatBrokerCloudsWithPrio[prevPriority][previousCloud].has_key(site): + jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site] = {} + if not jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site].has_key(tmpProGroup): + 
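The fareshare policy check above converts wildcard group patterns into anchored regular expressions ('*' becomes '.*') before matching the job's working group. The conversion in isolation, with invented group names:

# sketch of the wildcard-to-regex matching used in the fareshare check above
import re

def group_matches(policyGroup, workingGroup):
    if '*' in policyGroup:
        tmpPatt = '^' + policyGroup.replace('*', '.*') + '$'
        return re.search(tmpPatt, workingGroup) is not None
    return workingGroup == policyGroup

assert group_matches('AP_*', 'AP_HIGGS')      # invented group names
assert not group_matches('AP_*', 'GP_TOP')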
jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup] = {'assigned':0,'activated':0,'running':0,'transferring':0} + nAssJobs = jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup]['assigned'] + nActJobs = jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup]['activated'] + nRunJobsPerGroup = jobStatBrokerCloudsWithPrio[prevPriority][previousCloud][site][tmpProGroup]['running'] + # add newly assigned jobs + for tmpNewPriority in newJobStatWithPrio.keys(): + if tmpNewPriority < prevPriority: + continue + if not newJobStatWithPrio[tmpNewPriority].has_key(previousCloud): + continue + if not newJobStatWithPrio[tmpNewPriority][previousCloud].has_key(site): + continue + if not newJobStatWithPrio[tmpNewPriority][previousCloud][site].has_key(tmpProGroup): + continue + nAssJobs += newJobStatWithPrio[tmpNewPriority][previousCloud][site][tmpProGroup] + else: + nAssJobs = jobStatBroker[site][tmpProGroup]['assigned'] + if forAnalysis and jobStatBroker[site][tmpProGroup].has_key('defined'): + nAssJobs += jobStatBroker[site][tmpProGroup]['defined'] + nActJobs = jobStatBroker[site][tmpProGroup]['activated'] + # number of jobs per node + if not nWNmap.has_key(site): + nJobsPerNode = 1 + elif jobStatistics[site]['running']==0 or nWNmap[site]['updateJob']==0: + nJobsPerNode = 1 + else: + if nRunJobsPerGroup == None: + nJobsPerNode = float(jobStatistics[site]['running'])/float(nWNmap[site]['updateJob']) + else: + if nRunJobsPerGroup == 0: + nJobsPerNode = 1.0/float(nWNmap[site]['updateJob']) + else: + nJobsPerNode = float(nRunJobsPerGroup)/float(nWNmap[site]['updateJob']) + # limit of the number of transferring jobs + if tmpSiteSpec.transferringlimit == 0: + maxTransferring = 2000 + else: + maxTransferring = tmpSiteSpec.transferringlimit + # get ration of transferring to running + if not forAnalysis and not tmpSiteSpec.cloud in ['ND']: + nTraJobs = 0 + nRunJobs = 0 + for tmpGroupForTra,tmpCountsForTra in jobStatBroker[site].iteritems(): + if tmpCountsForTra.has_key('running'): + nRunJobs += tmpCountsForTra['running'] + if tmpCountsForTra.has_key('transferring'): + nTraJobs += tmpCountsForTra['transferring'] + tmpLog.debug(' running=%s transferring=%s max=%s' % (nRunJobs,nTraJobs,maxTransferring)) + if max(maxTransferring,2*nRunJobs) < nTraJobs: + tmpLog.debug(" skip: %s many transferring=%s > max(%s,2*running=%s)" % (site,nTraJobs,maxTransferring,nRunJobs)) + resultsForAnal['transferring'].append(site) + if prevSourceLabel in ['managed','test']: + # make message + message = '%s - too many transferring' % site + if not message in loggerMessages: + loggerMessages.append(message) + continue + # get ratio of running jobs = run(cloud)/run(all) for multi cloud + multiCloudFactor = 1 + if not forAnalysis and not previousCloud in ['NL']: + tmpTotalRunningMulti = 0 + tmpNCloudMulti = 0 + for tmpCloudMulti,tmpCloudValMulti in jobStatBrokerClouds.iteritems(): + if tmpCloudValMulti.has_key(site): + if tmpCloudValMulti[site].has_key(tmpProGroup): + tmpNCloudMulti += 1 + if tmpCloudValMulti[site][tmpProGroup].has_key('running'): + tmpTotalRunningMulti += tmpCloudValMulti[site][tmpProGroup]['running'] + # no running + if tmpTotalRunningMulti == 0: + if tmpNCloudMulti != 0: + multiCloudFactor = tmpNCloudMulti + else: + multiCloudFactor = float(tmpTotalRunningMulti+1)/float(jobStatBroker[site][tmpProGroup]['running']+1) + tmpLog.debug(' totalRun:%s cloudRun:%s multiCloud:%s' % (tmpTotalRunningMulti, + jobStatBroker[site][tmpProGroup]['running'], + 
multiCloudFactor)) + # country preference + preferredCountryWeight = 1.0 + preferredCountryWeightStr = '' + if forAnalysis: + if preferredCountries != [] and tmpSiteSpec.countryGroup != []: + for tmpCountry in preferredCountries: + if tmpCountry in tmpSiteSpec.countryGroup: + # avoid negative weight or zero-divide + if tmpSiteSpec.availableCPU >= tmpSiteSpec.pledgedCPU and tmpSiteSpec.pledgedCPU > 0: + preferredCountryWeight = float(tmpSiteSpec.availableCPU) / float(tmpSiteSpec.pledgedCPU) + preferredCountryWeightStr = "*(%s/%s)" % (tmpSiteSpec.availableCPU,tmpSiteSpec.pledgedCPU) + resultsForAnal['prefcountry'].append((site,tmpCountry)) + break + tmpLog.debug(' country preference=%s' % preferredCountryWeightStr[1:]) + # calculate weight + if specialWeight != {}: + if not pd2pT1: + # weight for T2 PD2P + nSubs = 1 + if specialWeight.has_key(site): + nSubs = specialWeight[site] + tmpLog.debug(' %s nSubs:%s assigned:%s activated:%s running:%s nWNsG:%s nWNsU:%s' % \ + (site,nSubs,nAssJobs,nActJobs,nRunningMap[site],nPilotsGet,nPilotsUpdate)) + winv = float(nSubs) * float(nAssJobs+nActJobs) / float(1+nRunningMap[site]) / (1.0+float(nPilotsGet)/float(1+nPilotsUpdate)) + if getWeight: + weightUsedByBrokerage[site] = "(1+%s/%s)*%s/%s/%s" % (nPilotsGet,1+nPilotsUpdate,1+nRunningMap[site],nAssJobs+nActJobs,nSubs) + else: + # weight for T1 PD2P + tmpLog.debug(' %s MoU:%s' % (site,specialWeight[site])) + winv = 1.0 / float(specialWeight[site]) + if getWeight: + weightUsedByBrokerage[site] = "%s" % specialWeight[site] + else: + if not forAnalysis: + if nRunJobsPerGroup == None: + tmpLog.debug(' %s assigned:%s activated:%s running:%s nPilotsGet:%s nPilotsUpdate:%s multiCloud:%s' % + (site,nAssJobs,nActJobs,jobStatistics[site]['running'],nPilotsGet,nPilotsUpdate,multiCloudFactor)) + else: + tmpLog.debug(' %s assigned:%s activated:%s runningGroup:%s nPilotsGet:%s nPilotsUpdate:%s multiCloud:%s' % + (site,nAssJobs,nActJobs,nRunJobsPerGroup,nPilotsGet,nPilotsUpdate,multiCloudFactor)) + else: + tmpLog.debug(' %s assigned:%s activated:%s running:%s nWNsG:%s nWNsU:%s' % + (site,nAssJobs,nActJobs,nRunningMap[site],nPilotsGet,nPilotsUpdate)) + if forAnalysis: + winv = float(nAssJobs+nActJobs) / float(1+nRunningMap[site]) / (1.0+float(nPilotsGet)/float(1+nPilotsUpdate)) + else: + if nRunJobsPerGroup == None: + winv = float(nAssJobs+nActJobs) / float(1+jobStatistics[site]['running']) / (float(1+nPilotsGet)/float(1+nPilotsUpdate)) + else: + winv = float(nAssJobs+nActJobs) / float(1+nRunJobsPerGroup) / (float(1+nPilotsGet)/float(1+nPilotsUpdate)) + winv *= float(multiCloudFactor) + # send jobs to T1 when they require many or large inputs + if _isTooManyInput(nFilesPerJob,inputSizePerJob): + if site == siteMapper.getCloud(previousCloud)['source'] or \ + (site=='NIKHEF-ELPROD' and previousCloud=='NL' and prevProType=='reprocessing') or \ + (hospitalQueueMap.has_key(previousCloud) and site in hospitalQueueMap[previousCloud]): + cloudT1Weight = 2.0 + # use weight in cloudconfig + try: + tmpCloudT1Weight = float(siteMapper.getCloud(previousCloud)['weight']) + if tmpCloudT1Weight != 0.0: + cloudT1Weight = tmpCloudT1Weight + except: + pass + winv /= cloudT1Weight + tmpLog.debug(' special weight for %s : nInputs/Job=%s inputSize/Job=%s weight=%s' % + (site,nFilesPerJob,inputSizePerJob,cloudT1Weight)) + # found at least one candidate + foundOneCandidate = True + tmpLog.debug('Site:%s 1/Weight:%s' % (site,winv)) + if forAnalysis and trustIS and reportLog: + resultsForAnal['weight'].append((site,'(1+%s/%s)*%s/%s%s' % 
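The quantity winv computed above is an inverse weight: queued work (assigned plus activated) divided by running jobs and by the pilot get/update ratio, so a smaller value marks a more attractive site, and the candidates with the smallest winv are the ones kept below. A self-contained sketch of the analysis-style formula, using made-up counts:

# inverse brokerage weight, mirroring the analysis branch above (illustrative numbers)
def inverse_weight(n_assigned, n_activated, n_running, n_pilots_get, n_pilots_update):
    queued = float(n_assigned + n_activated)
    running = float(1 + n_running)                       # +1 avoids division by zero
    pilot_ratio = 1.0 + float(n_pilots_get) / float(1 + n_pilots_update)
    return queued / running / pilot_ratio

site_a = inverse_weight(n_assigned=50, n_activated=30, n_running=200,
                        n_pilots_get=40, n_pilots_update=20)
site_b = inverse_weight(n_assigned=50, n_activated=30, n_running=20,
                        n_pilots_get=5, n_pilots_update=20)
# site_a has far more running jobs and active pilots, so its inverse weight is smaller (better)
assert site_a < site_b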
(nPilotsGet,1+nPilotsUpdate,1+nRunningMap[site], + nAssJobs+nActJobs,preferredCountryWeightStr))) + # choose largest nMinSites weights + minSites[site] = winv + if len(minSites) > nMinSites: + maxSite = site + maxWinv = winv + for tmpSite,tmpWinv in minSites.iteritems(): + if tmpWinv > maxWinv: + maxSite = tmpSite + maxWinv = tmpWinv + # delte max one + del minSites[maxSite] + # remove too different weights + if len(minSites) >= 2: + # look for minimum + minSite = minSites.keys()[0] + minWinv = minSites[minSite] + for tmpSite,tmpWinv in minSites.iteritems(): + if tmpWinv < minWinv: + minSite = tmpSite + minWinv = tmpWinv + # look for too different weights + difference = 2 + removeSites = [] + for tmpSite,tmpWinv in minSites.iteritems(): + if tmpWinv > minWinv*difference: + removeSites.append(tmpSite) + # remove + for tmpSite in removeSites: + del minSites[tmpSite] + # set default + if len(minSites) == 0: + # cloud's list + if forAnalysis or siteMapper.checkCloud(previousCloud): + minSites[scanSiteList[0]] = 0 + else: + minSites['BNL_ATLAS_1'] = 0 + # release not found + if forAnalysis and trustIS: + candidateForAnal = False + # use only one site for prod_test to skip LFC scan + if prevProType in skipBrokerageProTypes: + if len(minSites) > 1: + minSites = {minSites.keys()[0]:0} + # choose site + tmpLog.debug('Min Sites:%s' % minSites) + if len(fileList) ==0: + # choose min 1/weight + minSite = minSites.keys()[0] + minWinv = minSites[minSite] + for tmpSite,tmpWinv in minSites.iteritems(): + if tmpWinv < minWinv: + minSite = tmpSite + minWinv = tmpWinv + chosenCE = siteMapper.getSite(minSite) + else: + # compare # of files in LRC + maxNfiles = -1 + for site in minSites: + tmp_chosen_ce = siteMapper.getSite(site) + # search LRC + if site in _disableLRCcheck: + tmpOKFiles = {} + else: + # get files from LRC + tmpOKFiles = _getOkFiles(tmp_chosen_ce,fileList,guidList,allLFNs,allGUIDs,allOkFilesMap,tmpLog) + nFiles = len(tmpOKFiles) + tmpLog.debug('site:%s - nFiles:%s/%s %s' % (site,nFiles,len(fileList),str(tmpOKFiles))) + # choose site holding max # of files + if nFiles > maxNfiles: + chosenCE = tmp_chosen_ce + maxNfiles = nFiles + okFiles = tmpOKFiles + # set job spec + tmpLog.debug('indexJob : %s' % indexJob) + tmpLog.debug('nInputs/Job : %s' % nFilesPerJob) + tmpLog.debug('inputSize/Job : %s' % inputSizePerJob) + for tmpJob in jobs[indexJob-iJob-1:indexJob-1]: + # set computingSite + if (not candidateForAnal) and forAnalysis and trustIS: + resultsForAnalStr = 'ERROR : No candidate. ' + if resultsForAnal['rel'] != []: + if prevCmtConfig in ['','NULL',None]: + resultsForAnalStr += 'Release:%s was not found at %s. ' % (prevRelease,str(resultsForAnal['rel'])) + else: + resultsForAnalStr += 'Release:%s/%s was not found at %s. ' % (prevRelease,prevCmtConfig,str(resultsForAnal['rel'])) + if resultsForAnal['pilot'] != []: + resultsForAnalStr += '%s are inactive (no pilots for last 3 hours). ' % str(resultsForAnal['pilot']) + if resultsForAnal['disk'] != []: + resultsForAnalStr += 'Disk shortage < %sGB at %s. ' % (diskThreshold,str(resultsForAnal['disk'])) + if resultsForAnal['memory'] != []: + resultsForAnalStr += 'Insufficient RAM at %s. 
' % str(resultsForAnal['memory']) + if resultsForAnal['maxtime'] != []: + resultsForAnalStr += 'Shorter walltime limit than maxCpuCount:%s at ' % prevMaxCpuCount + for tmpItem in resultsForAnal['maxtime']: + if siteMapper.checkSite(tmpItem): + resultsForAnalStr += '%s:%s,' % (tmpItem,siteMapper.getSite(tmpItem).maxtime) + resultsForAnalStr = resultsForAnalStr[:-1] + resultsForAnalStr += '. ' + if resultsForAnal['status'] != []: + resultsForAnalStr += '%s are not online. ' % str(resultsForAnal['status']) + if resultsForAnal['reliability'] != []: + resultsForAnalStr += 'Insufficient reliability at %s. ' % str(resultsForAnal['reliability']) + resultsForAnalStr = resultsForAnalStr[:-1] + tmpJob.computingSite = resultsForAnalStr + else: + tmpJob.computingSite = chosenCE.sitename + # send log + if forAnalysis and trustIS and reportLog: + # put logging info to ErrorDiag just to give it back to the caller + tmpJob.brokerageErrorDiag = sendAnalyBrokeageInfo(resultsForAnal,prevRelease,diskThreshold, + tmpJob.computingSite,prevCmtConfig, + siteReliability) + tmpLog.debug('PandaID:%s -> site:%s' % (tmpJob.PandaID,tmpJob.computingSite)) + if tmpJob.computingElement == 'NULL': + if tmpJob.prodSourceLabel == 'ddm': + # use nickname for ddm jobs + tmpJob.computingElement = chosenCE.nickname + else: + tmpJob.computingElement = chosenCE.gatekeeper + # fail jobs if no sites have the release + if (not foundRelease or (tmpJob.relocationFlag != 1 and not foundOneCandidate)) and (tmpJob.prodSourceLabel in ['managed','test']): + # reset + if tmpJob.relocationFlag != 1: + tmpJob.computingSite = None + tmpJob.computingElement = None + # go to waiting + tmpJob.jobStatus = 'waiting' + tmpJob.brokerageErrorCode = ErrorCode.EC_Release + if tmpJob.relocationFlag == 1: + try: + if resultsForAnal['pilot'] != []: + tmpJob.brokerageErrorDiag = '%s no pilots' % tmpJob.computingSite + elif resultsForAnal['disk'] != []: + tmpJob.brokerageErrorDiag = 'SE full at %s' % tmpJob.computingSite + elif resultsForAnal['memory'] != []: + tmpJob.brokerageErrorDiag = 'RAM shortage at %s' % tmpJob.computingSite + elif resultsForAnal['status'] != []: + tmpJob.brokerageErrorDiag = '%s not online' % tmpJob.computingSite + elif resultsForAnal['share'] != []: + tmpJob.brokerageErrorDiag = '%s zero share' % tmpJob.computingSite + elif resultsForAnal['cpucore'] != []: + tmpJob.brokerageErrorDiag = "CPU core mismatch at %s" % tmpJob.computingSite + elif resultsForAnal['maxtime'] != []: + tmpJob.brokerageErrorDiag = "short walltime at %s" % tmpJob.computingSite + elif resultsForAnal['transferring'] != []: + tmpJob.brokerageErrorDiag = 'too many transferring at %s' % tmpJob.computingSite + elif resultsForAnal['scratch'] != []: + tmpJob.brokerageErrorDiag = 'small scratch disk at %s' % tmpJob.computingSite + elif useCacheVersion: + tmpJob.brokerageErrorDiag = '%s/%s not found at %s' % (tmpJob.homepackage,tmpJob.cmtConfig,tmpJob.computingSite) + else: + tmpJob.brokerageErrorDiag = '%s/%s not found at %s' % (tmpJob.AtlasRelease,tmpJob.cmtConfig,tmpJob.computingSite) + except: + errtype,errvalue = sys.exc_info()[:2] + tmpLog.error("failed to set diag for %s: %s %s" % (tmpJob.PandaID,errtype,errvalue)) + tmpJob.brokerageErrorDiag = 'failed to set diag. 
see brokerage log in the panda server' + elif not prevBrokergageSiteList in [[],None]: + try: + # make message + tmpJob.brokerageErrorDiag = makeCompactDiagMessage(prevBrokerageNote,resultsForAnal) + except: + errtype,errvalue = sys.exc_info()[:2] + tmpLog.error("failed to set special diag for %s: %s %s" % (tmpJob.PandaID,errtype,errvalue)) + tmpJob.brokerageErrorDiag = 'failed to set diag. see brokerage log in the panda server' + elif prevProType in ['reprocessing']: + tmpJob.brokerageErrorDiag = '%s/%s not found at reprocessing sites' % (tmpJob.homepackage,tmpJob.cmtConfig) + elif not useCacheVersion: + tmpJob.brokerageErrorDiag = '%s/%s not found at online sites with enough memory and disk' % \ + (tmpJob.AtlasRelease,tmpJob.cmtConfig) + else: + try: + tmpJob.brokerageErrorDiag = makeCompactDiagMessage('',resultsForAnal) + except: + errtype,errvalue = sys.exc_info()[:2] + tmpLog.error("failed to set compact diag for %s: %s %s" % (tmpJob.PandaID,errtype,errvalue)) + tmpJob.brokerageErrorDiag = 'failed to set diag. see brokerage log in the panda server' + tmpLog.debug('PandaID:%s %s' % (tmpJob.PandaID,tmpJob.brokerageErrorDiag)) + continue + # set ready if files are already there + _setReadyToFiles(tmpJob,okFiles,siteMapper,tmpLog) + # update statistics + tmpProGroup = ProcessGroups.getProcessGroup(tmpJob.processingType) + if tmpJob.processingType in skipBrokerageProTypes: + # use original processingType since prod_test is in the test category and thus is interfered by validations + tmpProGroup = tmpJob.processingType + if not jobStatistics.has_key(tmpJob.computingSite): + jobStatistics[tmpJob.computingSite] = {'assigned':0,'activated':0,'running':0} + if not jobStatBroker.has_key(tmpJob.computingSite): + jobStatBroker[tmpJob.computingSite] = {} + if not jobStatBroker[tmpJob.computingSite].has_key(tmpProGroup): + jobStatBroker[tmpJob.computingSite][tmpProGroup] = {'assigned':0,'activated':0,'running':0} + jobStatistics[tmpJob.computingSite]['assigned'] += 1 + jobStatBroker[tmpJob.computingSite][tmpProGroup]['assigned'] += 1 + # update statistics by taking priorities into account + if not forAnalysis and prevSourceLabel in ['managed','test']: + if not newJobStatWithPrio.has_key(prevPriority): + newJobStatWithPrio[prevPriority] = {} + if not newJobStatWithPrio[prevPriority].has_key(tmpJob.cloud): + newJobStatWithPrio[prevPriority][tmpJob.cloud] = {} + if not newJobStatWithPrio[prevPriority][tmpJob.cloud].has_key(tmpJob.computingSite): + newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite] = {} + if not newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite].has_key(tmpProGroup): + newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite][tmpProGroup] = 0 + newJobStatWithPrio[prevPriority][tmpJob.cloud][tmpJob.computingSite][tmpProGroup] += 1 + # terminate + if job == None: + break + # reset iJob + iJob = 0 + # reset file list + fileList = [] + guidList = [] + okFiles = {} + totalNumInputs = 0 + totalInputSize = 0 + # create new dispDBlock + if job.prodDBlock != 'NULL': + # get datatype + try: + tmpDataType = job.prodDBlock.split('.')[-2] + except: + # default + tmpDataType = 'GEN' + if len(tmpDataType) > 20: + # avoid too long name + tmpDataType = 'GEN' + dispatchDBlock = "panda.%s.%s.%s.%s_dis%s" % (job.taskID,time.strftime('%m.%d'),tmpDataType, + commands.getoutput('uuidgen'),job.PandaID) + tmpLog.debug('New dispatchDBlock: %s' % dispatchDBlock) + prodDBlock = job.prodDBlock + # already define computingSite + if job.computingSite != 'NULL': + # 
instantiate KnownSite + chosen_ce = siteMapper.getSite(job.computingSite) + # if site doesn't exist, use ANALY_BNL_ATLAS_1 + if job.homepackage.startswith('AnalysisTransforms'): + if chosen_ce.sitename == 'BNL_ATLAS_1': + chosen_ce = siteMapper.getSite('ANALY_BNL_ATLAS_1') + overwriteSite = True + else: + # default for Analysis jobs + if job.homepackage.startswith('AnalysisTransforms'): + chosen_ce = siteMapper.getSite('ANALY_BNL_ATLAS_1') + overwriteSite = True + else: + # set chosen_ce + chosen_ce = 'TOBEDONE' + # increment iJob + iJob += 1 + # reserve computingSite and cloud + computingSite = job.computingSite + previousCloud = job.cloud + prevRelease = job.AtlasRelease + prevMemory = job.minRamCount + prevCmtConfig = job.cmtConfig + prevProType = job.processingType + prevSourceLabel = job.prodSourceLabel + prevDiskCount = job.maxDiskCount + prevHomePkg = job.homepackage + prevDirectAcc = job.transferType + prevCoreCount = job.coreCount + prevMaxCpuCount = job.maxCpuCount + prevBrokergageSiteList = specialBrokergageSiteList + prevManualPreset = manualPreset + prevGoToT2Flag = goToT2Flag + prevWorkingGroup = job.workingGroup + prevBrokerageNote = brokerageNote + # truncate prio to avoid too many lookups + if not job.currentPriority in [None,'NULL']: + prevPriority = (job.currentPriority / 50) * 50 + # assign site + if chosen_ce != 'TOBEDONE': + job.computingSite = chosen_ce.sitename + if job.computingElement == 'NULL': + if job.prodSourceLabel == 'ddm': + # use nickname for ddm jobs + job.computingElement = chosen_ce.nickname + else: + job.computingElement = chosen_ce.gatekeeper + # update statistics + if not jobStatistics.has_key(job.computingSite): + jobStatistics[job.computingSite] = {'assigned':0,'activated':0,'running':0} + jobStatistics[job.computingSite]['assigned'] += 1 + tmpLog.debug('PandaID:%s -> preset site:%s' % (job.PandaID,chosen_ce.sitename)) + # set cloud + if job.cloud in ['NULL',None,'']: + job.cloud = chosen_ce.cloud + # set destinationSE + destSE = job.destinationSE + if siteMapper.checkCloud(job.cloud): + # use cloud dest for non-exsiting sites + if job.prodSourceLabel != 'user' and (not job.destinationSE in siteMapper.siteSpecList.keys()) \ + and job.destinationSE != 'local': + destSE = siteMapper.getCloud(job.cloud)['dest'] + job.destinationSE = destSE + # use CERN-PROD_EOSDATADISK for CERN-EOS jobs + if job.computingSite in ['CERN-EOS']: + overwriteSite = True + if overwriteSite: + # overwrite SE for analysis jobs which set non-existing sites + destSE = job.computingSite + job.destinationSE = destSE + # set dispatchDBlock and destinationSE + first = True + for file in job.Files: + # dispatchDBlock. 
Set dispDB for prestaging jobs too + if file.type == 'input' and file.dispatchDBlock == 'NULL' and \ + ((not file.status in ['ready','missing']) or job.computingSite in prestageSites): + if first: + first = False + job.dispatchDBlock = dispatchDBlock + file.dispatchDBlock = dispatchDBlock + file.status = 'pending' + if not file.lfn in fileList: + fileList.append(file.lfn) + guidList.append(file.GUID) + try: + # get total number/size of inputs except DBRelease + # tgz inputs for evgen may be negligible + if re.search('\.tar\.gz',file.lfn) == None: + totalNumInputs += 1 + totalInputSize += file.fsize + except: + pass + # destinationSE + if file.type in ['output','log'] and destSE != '': + if job.prodSourceLabel == 'user' and job.computingSite == file.destinationSE: + pass + elif destSE == 'local': + pass + else: + file.destinationSE = destSE + # pre-assign GUID to log + if file.type == 'log': + # get lock + fcntl.flock(_lockGetUU.fileno(), fcntl.LOCK_EX) + # generate GUID + file.GUID = commands.getoutput('uuidgen') + # release lock + fcntl.flock(_lockGetUU.fileno(), fcntl.LOCK_UN) + # send log messages + try: + for message in loggerMessages: + # get logger + _pandaLogger = PandaLogger() + _pandaLogger.lock() + _pandaLogger.setParams({'Type':'brokerage'}) + logger = _pandaLogger.getHttpLogger(panda_config.loggername) + # add message + logger.warning(message) + # release HTTP handler + _pandaLogger.release() + time.sleep(1) + except: + pass + # send analysis brokerage info when jobs are submitted + if len(jobs) > 0 and jobs[0] != None and not forAnalysis and not pd2pT1 and specialWeight=={}: + # for analysis job. FIXME once ganga is updated to send analy brokerage info + if jobs[0].prodSourceLabel in ['user','panda'] and jobs[0].processingType in ['pathena','prun']: + # send countryGroup + tmpMsgList = [] + tmpNumJobs = len(jobs) + if jobs[0].prodSourceLabel == 'panda': + tmpNumJobs -= 1 + tmpMsg = 'nJobs=%s ' % tmpNumJobs + if jobs[0].countryGroup in ['NULL','',None]: + tmpMsg += 'countryGroup=None' + else: + tmpMsg += 'countryGroup=%s' % jobs[0].countryGroup + tmpMsgList.append(tmpMsg) + # send log + sendMsgToLoggerHTTP(tmpMsgList,jobs[0]) + # finished + tmpLog.debug('finished') + if getWeight: + return weightUsedByBrokerage + except: + type, value, traceBack = sys.exc_info() + tmpLog.error("schedule : %s %s" % (type,value)) + if getWeight: + return {} + diff --git a/current/pandaserver/brokerage/broker_util.py b/current/pandaserver/brokerage/broker_util.py new file mode 100755 index 000000000..ca8564a91 --- /dev/null +++ b/current/pandaserver/brokerage/broker_util.py @@ -0,0 +1,399 @@ +import re +import urllib +import time +import sys +import types +import commands +import xml.dom.minidom + + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger +_log = PandaLogger().getLogger('broker_util') + +# curl class +class _Curl: + # constructor + def __init__(self,useProxy=False): + # path to curl + self.path = 'curl --user-agent "dqcurl" -m 180' + # verification of the host certificate + self.verifyHost = False + # use proxy + if useProxy and panda_config.httpProxy != '': + self.path = 'env http_proxy=%s %s' % (panda_config.httpProxy,self.path) + + # GET method + def get(self,url,data={}): + # make command + com = '%s --silent --get' % self.path + if not self.verifyHost: + com += ' --insecure' + # data + for key,value in data.iteritems(): + com += ' --data "%s"' % urllib.urlencode({key:value}) + com += ' %s' % url + # execute + _log.debug(com) + ret = 
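The _Curl.get() method above shells out to the curl binary, urlencoding each key/value pair into its own --data argument; with --get, curl folds those pairs into the query string of the request. A rough reconstruction of the command string it produces, with a hypothetical URL and parameter:

# sketch of the command line assembled by _Curl.get(); the URL and data are made up
import urllib

path = 'curl --user-agent "dqcurl" -m 180'
com = '%s --silent --get' % path
com += ' --insecure'                                  # verifyHost is False by default above
for key, value in {'lfns': 'file1 file2'}.items():
    com += ' --data "%s"' % urllib.urlencode({key: value})
com += ' %s' % 'http://dq2.example.org/lrc/PoolFileCatalog'
# com is then handed to commands.getstatusoutput(com), as in the method above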
commands.getstatusoutput(com) + _log.debug(ret) + return ret + + +# get default storage +def _getDefaultStorage(baseURL,sePath=None,seProdPath={}): + _log.debug('_getDefaultStorage (%s %s %s)' % (baseURL,sePath,seProdPath)) + # use se+seprodpath when baseURL='' + if baseURL=='': + # get token + match = re.search('^token:([^:]+):',sePath) + if match == None: + _log.error("could not get token from %s" % sePath) + return "" + token = match.group(1) + # get corresponding path + if not seProdPath.has_key(token): + _log.error("could not find path for % in %s" % (token,seProdPath)) + return "" + # set se+seprodpath + out = sePath+seProdPath[token] + # append / + if not out.endswith('/'): + out += '/' + _log.debug(out) + else: + # check port to set proxy + useProxy = False + if panda_config.httpProxy != '': + pMatch = re.search('http://[^:/]+:*(\d+)/',baseURL) + if pMatch == None: + # default port + useProxy = True + elif pMatch.group(1) == '80': + # standard port + useProxy = True + # instantiate curl + curl = _Curl(useProxy) + # get default storage + url = baseURL + 'storages/default' + status,out = curl.get(url) + _log.debug(out) + if status != 0: + _log.error("could not get default storage from %s:%s" % (baseURL,status)) + return "" + # parse + match = re.search('^[^/]+://[^/]+(/.+)$',out) + if match == None: + _log.error("could not parse string : %s" % out) + return "" + return match.group(1) + + +# get PoolFileCatalog +def _getPoolFileCatalog(lfns,dq2url): + _log.debug('_getPoolFileCatalog') + # check port to set proxy + useProxy = False + if panda_config.httpProxy != '': + pMatch = re.search('http://[^:/]+:*(\d+)/',dq2url) + if pMatch == None: + # default port + useProxy = True + elif pMatch.group(1) == '80': + # standard port + useProxy = True + # instantiate curl + curl = _Curl(useProxy) + # get PoolFileCatalog + iLFN = 0 + outXML ='' + strLFNs = '' + if not dq2url.endswith('_'): + url = dq2url + '/lrc/PoolFileCatalog' + else: + # NDGF LRC + url = dq2url + 'lrc/PoolFileCatalog' + for lfn in lfns: + iLFN += 1 + # make argument + strLFNs += '%s ' % lfn + if iLFN % 40 == 0 or iLFN == len(lfns): + # get PoolFileCatalog + strLFNs = strLFNs.rstrip() + data = {'lfns':strLFNs} + # avoid too long argument + strLFNs = '' + # execute + status,out = curl.get(url,data) + _log.debug(status) + # sleep + time.sleep(2) + if status != 0: + _log.error("_getPoolFileCatalog : %s %s %s" % (dq2url,status,out)) + return status + if status != 0 or out.startswith('Error'): + continue + if not out.startswith('<\?xml version="1.0" encoding="UTF-8" standalone="no" \?> + + + +""" + outXML = re.sub(th,'',outXML) + outXML = re.sub("""\s*""",'',outXML) + outXML = re.sub("""\s*""",'',outXML) + outXML = re.sub("""\s*""",'',outXML) + outXML = re.sub("""\s*""",'',outXML) + outXML = re.sub("""\s*""",'',outXML) + outXML = re.sub("""\s*""",'',outXML) + + # return XML + return outXML + + +# get files from MySQL +def _getPFNFromMySQL(lfns,dq2url): + _log.debug('_getPFNFromMySQL') + import MySQLdb + comment = ' /* broker_util._getPFNFromMySQL */' + outStr = '' + # parse connection string + match = re.search('^mysql://([^:]+):([^@]+)@([^/:]+):(\d+)/(.+)$',dq2url) + if match == None: + return outStr + # parameters for DB connection + connStr = "mysql -h %s -u %s -p%s -P %s %s" + dbhost = match.group(3) + dbuser = match.group(1) + dbpswd = match.group(2) + dbport = int(match.group(4)) + dbname = match.group(5) + connStr = "mysql -h %s -u %s -p%s -P %s %s" % (dbhost,dbuser,dbpswd,dbport,dbname) + try: + _log.debug(connStr) + # connect 
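The mysql:// DSN handled by _getPFNFromMySQL() is pulled apart with the regular expression above; its five capture groups map to user, password, host, port and database name. A tiny sketch with a fabricated DSN:

# illustrative parse of a made-up DSN using the same pattern as above
import re

dq2url = 'mysql://reader:secret@lrc.example.org:3306/localreplicas'
match = re.search('^mysql://([^:]+):([^@]+)@([^/:]+):(\d+)/(.+)$', dq2url)
dbuser, dbpswd, dbhost = match.group(1), match.group(2), match.group(3)
dbport, dbname = int(match.group(4)), match.group(5)
assert (dbuser, dbhost, dbport, dbname) == ('reader', 'lrc.example.org', 3306, 'localreplicas')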
+ dbConn = MySQLdb.connect(db=dbname,host=dbhost,port=dbport,user=dbuser,passwd=dbpswd) + # make cursor + dbCur = dbConn.cursor() + # query files + iLFN = 0 + strLFNs = '' + for lfn in lfns: + iLFN += 1 + # make argument + strLFNs += " lfname='%s' OR " % lfn + if iLFN % 40 == 0 or iLFN == len(lfns): + # get PoolFileCatalog + strLFNs = strLFNs[:-3] + # construct SQL + sql = 'SELECT lfname FROM t_lfn WHERE %s' % strLFNs + # reset + strLFNs = '' + # execute + _log.debug(sql) + dbCur.execute(sql+comment) + res = dbCur.fetchall() + _log.debug(res) + # append LFNs + if res != None and len(res) != 0: + for resLFN in res: + outStr += '%s ' % resLFN + # close cursor + dbCur.close() + # close connection + dbConn.close() + except: + type, value, traceBack = sys.exc_info() + _log.error("_getPFNFromMySQL : %s %s %s" % (dq2url,type,value)) + return -1 + # return + return outStr + + +# get files from LFC +def _getPFNFromLFC(lfns,dq2url,guids,storageName): + _log.debug('_getPFNFromLFC') + outStr = '' + # check paramter + if guids == [] or storageName == [] or (len(lfns) != len(guids)): + return outStr + # extract LFC host + lfcHost = re.sub('[/:]',' ',dq2url).split()[1] + # loop over all LFNs + iLFN = 0 + nLFN = 1000 + strFiles = '' + outStr = '' + for iLFN in range(len(lfns)): + strFiles += '%s %s\n' % (lfns[iLFN],guids[iLFN]) + # bulk operation + if (iLFN+1) % nLFN == 0 or (iLFN+1) >= len(lfns): + # write to file + inFileName = '%s/lfcin.%s' % (panda_config.logdir,commands.getoutput('uuidgen')) + ifile = open(inFileName,'w') + ifile.write(strFiles) + ifile.close() + # construct commands + strStorage = '' + for storage in storageName: + strStorage += '%s,' % storage + strStorage = strStorage[:-1] + com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) + com+= 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; ' + com+= 'source %s; %s/python -Wignore %s/LFCclient.py -f %s -l %s -s %s' % \ + (panda_config.glite_source,panda_config.native_python32,panda_config.lfcClient_dir, + inFileName,lfcHost,strStorage) + _log.debug(com) + # exeute + status,output = commands.getstatusoutput(com) + _log.debug(status) + if status == 0: + outStr += output + else: + _log.error("_getPFNFromLFC : %s %s %s" % (dq2url,status,output)) + # send message to logger + try: + # make message + message = 'LFC access : %s %s %s' % (dq2url,status,output) + # get logger + _pandaLogger = PandaLogger() + _pandaLogger.lock() + _pandaLogger.setParams({'Type':'broker_util'}) + logger = _pandaLogger.getHttpLogger(panda_config.loggername) + # add message + logger.error(message) + # release HTTP handler + _pandaLogger.release() + except: + pass + return status + # reset + strFiles = '' + # return + return outStr + + +# get files from LRC +def getFilesFromLRC(files,url,guids=[],storageName=[],terminateWhenFailed=False,getPFN=False): + _log.debug('getFilesFromLRC "%s" %s' % (url,str(storageName))) + # get PFC + outSTR = '' + if url.startswith('mysql://'): + # from MySQL + outSTR = _getPFNFromMySQL(files,url) + # get PFN + if getPFN: + outPFN = {} + # FIXME + _log.debug('RetPFN:%s ' % str(outPFN)) + return outPFN + elif url.startswith('http://'): + # from HTTP I/F + outSTR = _getPoolFileCatalog(files,url) + # get PFN + if getPFN: + outPFN = {} + try: + if not outSTR in ['',None]: + root = xml.dom.minidom.parseString(outSTR) + fileNodes = root.getElementsByTagName('File') + for file in fileNodes: + # get PFN and LFN nodes + physical = 
file.getElementsByTagName('physical')[0] + pfnNode = physical.getElementsByTagName('pfn')[0] + logical = file.getElementsByTagName('logical')[0] + lfnNode = logical.getElementsByTagName('lfn')[0] + # convert UTF8 to Raw + pfn = str(pfnNode.getAttribute('name')) + lfn = str(lfnNode.getAttribute('name')) + # assign + if not outPFN.has_key(lfn): + outPFN[lfn] = [] + outPFN[lfn].append(pfn) + except: + type, value, traceBack = sys.exc_info() + _log.error(outSTR) + _log.error("could not parse XML - %s %s" % (type, value)) + _log.debug('RetPFN:%s ' % str(outPFN)) + return outPFN + elif url.startswith('lfc://'): + # from LFC + outSTR = _getPFNFromLFC(files,url,guids,storageName) + # get PFN + if getPFN: + outPFN = {} + try: + if not outSTR in ['',None]: + tmpItems = outSTR.split('LFCRet :') + tmpItems.remove('') + # loop over all returns + for tmpItem in tmpItems: + exec "tmpLFNmap = %s" % tmpItem + for tmpLFN,tmpPFN in tmpLFNmap.iteritems(): + outPFN[tmpLFN] = tmpPFN + except: + type, value, traceBack = sys.exc_info() + _log.error(outSTR) + _log.error("could not parse LFC ret - %s %s" % (type, value)) + _log.debug('RetPFN:%s ' % str(outPFN)) + return outPFN + # check return + if not isinstance(outSTR,types.StringType): + if terminateWhenFailed: + return None + # set empty string + outSTR = '' + # collect OK Files + okFiles = [] + for file in files: + if re.search(file,outSTR) != None: + okFiles.append(file) + _log.debug('Ret:%s ' % str(okFiles)) + return okFiles + + +# get # of files from LRC +def getNFilesFromLRC(files,url): + _log.debug('getNFilesFromLRC') + # get okFiles + okFiles = getFilesFromLRC(files,url) + nFiles = len(okFiles) + _log.debug('Ret:%s ' % nFiles) + return nFiles + + +# get list of missing LFNs from LRC +def getMissLFNsFromLRC(files,url,guids=[],storageName=[]): + _log.debug('getMissLFNsFromLRC') + # get OF files + okFiles = getFilesFromLRC(files,url,guids,storageName) + # collect missing files + missFiles = [] + for file in files: + if not file in okFiles: + missFiles.append(file) + _log.debug('Ret:%s ' % str(missFiles)) + return missFiles + + +# extract list of se hosts from schedconfig +def getSEfromSched(seStr): + tmpSE = [] + if seStr != None: + for tmpSrcSiteSE in seStr.split(','): + # extract host + match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE) + if match != None: + tmpSE.append(match.group(1)) + # sort + tmpSE.sort() + # return + return tmpSE + + diff --git a/current/pandaserver/config/__init__.py b/current/pandaserver/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/current/pandaserver/config/panda_config.py b/current/pandaserver/config/panda_config.py new file mode 100755 index 000000000..68034b586 --- /dev/null +++ b/current/pandaserver/config/panda_config.py @@ -0,0 +1,33 @@ +import re +import sys +import commands +from liveconfigparser.LiveConfigParser import LiveConfigParser + +# get ConfigParser +tmpConf = LiveConfigParser() + +# read +tmpConf.read('panda_server.cfg') + +# get server section +tmpDict = tmpConf.server + +# expand all values +tmpSelf = sys.modules[ __name__ ] +for tmpKey,tmpVal in tmpDict.iteritems(): + # convert string to bool/int + if tmpVal == 'True': + tmpVal = True + elif tmpVal == 'False': + tmpVal = False + elif re.match('^\d+$',tmpVal): + tmpVal = int(tmpVal) + # update dict + tmpSelf.__dict__[tmpKey] = tmpVal + +# set hostname +tmpSelf.__dict__['pserverhost'] = commands.getoutput('hostname -f') + +# change the number of database connections for FastCGI/WSGI +if tmpSelf.__dict__['useFastCGI'] or 
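panda_config.py above promotes every key of the server section to a module-level attribute, coercing 'True'/'False' and all-digit strings along the way. The same coercion in isolation, with sample values assumed instead of LiveConfigParser:

# stand-alone sketch of the string-to-bool/int coercion used by panda_config.py
import re
import sys

sampleSection = {'useFastCGI': 'True', 'nDBConnection': '8', 'loggername': 'prod'}  # assumed values
tmpSelf = sys.modules[__name__]
for tmpKey, tmpVal in sampleSection.items():
    if tmpVal == 'True':
        tmpVal = True
    elif tmpVal == 'False':
        tmpVal = False
    elif re.match('^\d+$', tmpVal):
        tmpVal = int(tmpVal)
    tmpSelf.__dict__[tmpKey] = tmpVal
assert useFastCGI is True and nDBConnection == 8 and loggername == 'prod'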
tmpSelf.__dict__['useWSGI']: + tmpSelf.__dict__['nDBConnection'] = tmpSelf.__dict__['nDBConForFastCGIWSGI'] diff --git a/current/pandaserver/dataservice/Activator.py b/current/pandaserver/dataservice/Activator.py new file mode 100755 index 000000000..af3909050 --- /dev/null +++ b/current/pandaserver/dataservice/Activator.py @@ -0,0 +1,47 @@ +''' +activate job + +''' + +import threading + +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('Activator') + + +class Activator (threading.Thread): + # constructor + def __init__(self,taskBuffer,dataset,enforce=False): + threading.Thread.__init__(self) + self.dataset = dataset + self.taskBuffer = taskBuffer + self.enforce = enforce + + + # main + def run(self): + _logger.debug("start: %s" % self.dataset.name) + if self.dataset.status in ['completed','deleting','deleted'] and not self.enforce: + _logger.debug(" skip: %s" % self.dataset.name) + else: + # update input files + ids = self.taskBuffer.updateInFilesReturnPandaIDs(self.dataset.name,'ready') + _logger.debug("IDs: %s" % ids) + if len(ids) != 0: + # get job + jobs = self.taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False) + # remove None and unknown + acJobs = [] + for job in jobs: + if job == None or job.jobStatus == 'unknown': + continue + acJobs.append(job) + # activate + self.taskBuffer.activateJobs(acJobs) + # update dataset in DB + if self.dataset.type == 'dispatch': + self.dataset.status = 'completed' + self.taskBuffer.updateDatasets([self.dataset]) + _logger.debug("end: %s" % self.dataset.name) diff --git a/current/pandaserver/dataservice/Adder.py b/current/pandaserver/dataservice/Adder.py new file mode 100755 index 000000000..7209704e5 --- /dev/null +++ b/current/pandaserver/dataservice/Adder.py @@ -0,0 +1,742 @@ +''' +add data to dataset + +''' + +import os +import re +import sys +import time +import fcntl +import commands +import threading +import xml.dom.minidom +import ErrorCode +import brokerage.broker_util +from DDM import ddm +from Closer import Closer + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('Adder') + + +class Adder (threading.Thread): + # constructor + def __init__(self,taskBuffer,jobID,fileCatalog,jobStatus,xmlFile='',ignoreDDMError=True,joinCloser=False, + addOutput=False,pandaDDM=False,siteMapper=None,attemptNr=None): + threading.Thread.__init__(self) + self.job = None + self.jobID = jobID + self.jobStatus = jobStatus + self.taskBuffer = taskBuffer + self.ignoreDDMError = ignoreDDMError + self.joinCloser = joinCloser + self.addOutput = addOutput + self.pandaDDM = pandaDDM + self.lockXML = None + self.datasetMap = {} + self.siteMapper = siteMapper + self.addToTopOnly = False + self.goToTransferring = False + self.subscriptionMap = {} + self.attemptNr = attemptNr + # dump Catalog into file + if xmlFile=='': + if attemptNr == None: + self.xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,jobID,jobStatus, + commands.getoutput('uuidgen')) + else: + self.xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,jobID,jobStatus, + commands.getoutput('uuidgen'),attemptNr) + file = open(self.xmlFile,'w') + file.write(fileCatalog) + file.close() + else: + self.xmlFile = xmlFile + + + # main + def run(self): + try: + _logger.debug("%s new start: %s" % (self.jobID,self.jobStatus)) + # lock XML except last trial + if self.addOutput and self.ignoreDDMError: + self.lockXML = open(self.xmlFile) + try: + fcntl.flock(self.lockXML.fileno(), 
fcntl.LOCK_EX|fcntl.LOCK_NB) + except: + _logger.debug("%s cannot get lock : %s" % (self.jobID,self.xmlFile)) + self.lockXML.close() + return + # query job + self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False, + fromArchived=False, + fromWaiting=False)[0] + # check if job has finished + if self.job == None: + _logger.debug('%s : not found' % self.jobID) + elif self.job.jobStatus in ['finished','failed','unknown','cancelled']: + _logger.error('%s : invalid state -> %s' % (self.jobID,self.job.jobStatus)) + else: + # add files only to top-level datasets for transferring jobs + if self.job.jobStatus == 'transferring': + self.addToTopOnly = True + _logger.debug("%s adder for transferring" % self.jobID) + # use PandaDDM for ddm jobs + if self.job.prodSourceLabel == 'ddm': + self.pandaDDM = True + # set job status + self.job.jobStatus = self.jobStatus + # add outputs. Cannot add self.pandaDDM here since minidom.parse() produces seg-fault + if self.addOutput: + # check if the job should go to trasnferring + tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm + tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se) + destSEwasSet = False + if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(self.job.destinationSE): + # DQ2 ID was set by using --destSE for analysis job to transfer output + destSEwasSet = True + tmpDstDDM = self.job.destinationSE + tmpDstSEs = self.job.destinationSE + else: + tmpDstDDM = self.siteMapper.getSite(self.job.destinationSE).ddm + tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.destinationSE).se) + if re.search('^ANALY_',self.job.computingSite) != None: + # analysis site + pass + elif (re.search('BNL', self.job.computingSite) != None or self.job.computingSite == "TPATHENA"): + # BNL + pass + elif self.job.computingSite == self.job.destinationSE: + # same site ID for computingSite and destinationSE + pass + elif tmpSrcDDM == tmpDstDDM: + # same DQ2ID for src/dest + pass + elif tmpSrcSEs == tmpDstSEs: + # same SEs + pass + elif self.job.computingSite.endswith("_REPRO"): + # reprocessing sites + pass + elif self.addToTopOnly: + # already in transferring + pass + elif self.job.jobStatus == 'failed': + # failed jobs + pass + else: + self.goToTransferring = True + self._updateOutputs() + else: + _logger.debug('%s : not added' % self.jobID) + _logger.debug('%s escape' % self.jobID) + return + _logger.debug('%s updated outputs' % self.jobID) + # ignore DDMError + if self.ignoreDDMError and \ + (re.search('could not add files',self.job.ddmErrorDiag) != None or \ + re.search('could not register subscription',self.job.ddmErrorDiag) != None) and \ + re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None and \ + re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None and \ + re.search('DQUnknownDatasetException',self.job.ddmErrorDiag) == None and \ + re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None and \ + re.search('KeyError',self.job.ddmErrorDiag) == None: + _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag)) + _logger.debug('%s escape' % self.jobID) + # unlock XML + try: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + self.lockXML.close() + except: + type, value, traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s cannot unlock XML" % self.jobID) + return + # update shadow dataset + if self.job.prodSourceLabel == 'user' and 
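At the top of Adder.run() above, the dumped XML catalog is guarded with an exclusive, non-blocking flock so that only one Adder instance processes a given file; a second instance fails to acquire the lock and returns immediately. The pattern in isolation, with a hypothetical file path:

# minimal sketch of the non-blocking lock pattern used at the top of Adder.run()
import fcntl

lockFile = open('/tmp/example_adder.xml', 'a')   # hypothetical path
try:
    fcntl.flock(lockFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
except IOError:
    # somebody else is already adding this catalog; give up quietly
    lockFile.close()
else:
    try:
        pass   # ... process the catalog ...
    finally:
        fcntl.flock(lockFile.fileno(), fcntl.LOCK_UN)
        lockFile.close()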
self.jobStatus == 'finished' and self.job.ddmErrorDiag == 'NULL' \ + and not self.goToTransferring: + self._updateShadow() + # ignore DDMError + if self.ignoreDDMError and re.search('could not add files',self.job.ddmErrorDiag) != None \ + and re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None \ + and re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None \ + and re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None \ + and re.search('KeyError',self.job.ddmErrorDiag) == None: + _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag)) + _logger.debug('%s escape' % self.jobID) + # unlock XML + try: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + self.lockXML.close() + except: + type, value, traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s cannot unlock XML" % self.jobID) + return + # set file status + if self.job.jobStatus == 'failed': + for file in self.job.Files: + if file.type == 'output' or file.type == 'log': + file.status = 'failed' + else: + # reset errors + self.job.jobDispatcherErrorCode = 0 + self.job.jobDispatcherErrorDiag = 'NULL' + # set job status + hasOutput = False + if self.goToTransferring or self.subscriptionMap != {}: + # set status to transferring + for file in self.job.Files: + if file.type == 'output' or file.type == 'log' or \ + self.subscriptionMap.has_key(file.destinationDBlock): + file.status = 'transferring' + hasOutput = True + if hasOutput: + self.job.jobStatus = 'transferring' + # propagate transition to prodDB + self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + # endtime + if self.job.endTime=='NULL': + self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + # set cancelled state + if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed': + self.job.jobStatus = 'cancelled' + # update job + retU = self.taskBuffer.updateJobs([self.job],False) + _logger.debug("%s retU: %s" % (self.jobID,retU)) + # failed + if not retU[0]: + _logger.error('failed to update DB for %s' % self.jobID) + # unlock XML + try: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + self.lockXML.close() + except: + type, value, traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s cannot unlock XML" % self.jobID) + return + # setup for closer + destDBList = [] + guidList = [] + for file in self.job.Files: + # ignore inputs + if file.type == 'input': + continue + # start closer for output/log datasets + if not file.destinationDBlock in destDBList: + destDBList.append(file.destinationDBlock) + # collect GUIDs + if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test'] and \ + self.job.processingType in ['pathena','gangarobot-rctest'])) \ + and file.type == 'output': + guidList.append({'lfn':file.lfn, 'guid':file.GUID, 'type':file.type}) + if guidList != []: + retG = self.taskBuffer.setGUIDs(guidList) + if destDBList != []: + # start Closer + cThr = Closer(self.taskBuffer,destDBList,self.job,pandaDDM=self.pandaDDM, + datasetMap=self.datasetMap) + _logger.debug("%s start Closer" % self.jobID) + cThr.start() + if self.joinCloser: + cThr.join() + _logger.debug("%s end Closer" % self.jobID) + _logger.debug("%s end" % self.jobID) + try: + # remove Catalog + os.remove(self.xmlFile) + except: + pass + # unlock XML + if self.lockXML != None: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + self.lockXML.close() + except: + type, value, 
traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s except" % self.jobID) + # unlock XML just in case + try: + if self.lockXML != None: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + except: + type, value, traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s cannot unlock XML" % self.jobID) + + + # update output files + def _updateOutputs(self): + # get LFN and GUID + _logger.debug("%s %s" % (self.jobID,self.xmlFile)) + # no outputs + if self.job.Files == []: + _logger.debug("%s has no outputs" % self.jobID) + _logger.debug("%s addFiles end" % self.jobID) + return + # get input files + inputLFNs = [] + for file in self.job.Files: + if file.type == 'input': + inputLFNs.append(file.lfn) + # parse XML + lfns = [] + guids = [] + fsizes = [] + md5sums = [] + chksums = [] + try: + root = xml.dom.minidom.parse(self.xmlFile) + files = root.getElementsByTagName('File') + for file in files: + # get GUID + guid = str(file.getAttribute('ID')) + _logger.debug(guid) + # get PFN and LFN nodes + logical = file.getElementsByTagName('logical')[0] + lfnNode = logical.getElementsByTagName('lfn')[0] + # convert UTF8 to Raw + lfn = str(lfnNode.getAttribute('name')) + # get metadata + fsize = None + md5sum = None + adler32 = None + for meta in file.getElementsByTagName('metadata'): + # get fsize + name = str(meta.getAttribute('att_name')) + if name == 'fsize': + fsize = long(meta.getAttribute('att_value')) + elif name == 'md5sum': + md5sum = str(meta.getAttribute('att_value')) + # check + if re.search("^[a-fA-F0-9]{32}$",md5sum) == None: + md5sum = None + elif name == 'adler32': + adler32 = str(meta.getAttribute('att_value')) + # error check + if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None)): + raise RuntimeError, 'fsize/md5sum/adler32=None' + # append + lfns.append(lfn) + guids.append(guid) + fsizes.append(fsize) + md5sums.append(md5sum) + if adler32 != None: + # use adler32 if available + chksums.append("ad:%s" % adler32) + else: + chksums.append("md5:%s" % md5sum) + except: + # check if file exists + if os.path.exists(self.xmlFile): + type, value, traceBack = sys.exc_info() + _logger.error("%s : %s %s" % (self.jobID,type,value)) + # set failed anyway + self.job.jobStatus = 'failed' + # XML error happens when pilot got killed due to wall-time limit or failures in wrapper + if (self.job.pilotErrorCode in [0,'0','NULL']) and \ + (self.job.transExitCode in [0,'0','NULL']): + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "Adder._updateOutputs() could not get GUID/LFN/MD5/FSIZE" + return + else: + # XML was deleted + self.job.ddmErrorDiag = "Adder._updateOutputs() could not add files" + self.ignoreDDMError = True + return + # check files + idMap = {} + fileList = [] + subMap = {} + for file in self.job.Files: + if file.type == 'input': + if file.lfn in lfns: + if self.job.prodSourceLabel in ['user','panda']: + # skipped file + file.status = 'skipped' + elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']: + # failed by pilot + file.status = 'failed' + elif file.type == 'output' or file.type == 'log': + # append to fileList + fileList.append(file.lfn) + # add only log file for failed jobs + if self.jobStatus == 'failed' and file.type != 'log': + continue + # add only log file for unmerge jobs + if self.job.prodSourceLabel == 'panda' and self.job.processingType in ['unmerge'] \ + and file.type != 'log': + continue + # look for GUID 
with LFN + try: + i = lfns.index(file.lfn) + file.GUID = guids[i] + file.fsize = fsizes[i] + file.md5sum = md5sums[i] + file.checksum = chksums[i] + # status + file.status = 'ready' + # fsize + fsize = None + if not file.fsize in ['NULL','',0]: + try: + fsize = long(file.fsize) + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s : %s %s" % (self.jobID,type,value)) + # append to map + if not idMap.has_key(file.destinationDBlock): + idMap[file.destinationDBlock] = [] + idMap[file.destinationDBlock].append({'guid' : file.GUID, + 'lfn' : lfns[i], + 'size' : fsize, + 'checksum' : file.checksum}) + # for subscription + if self.job.prodSourceLabel in ['managed','test','software','rc_test','ptest','user'] and \ + re.search('_sub\d+$',file.destinationDBlock) != None and (not self.addToTopOnly) and \ + self.job.destinationSE != 'local': + if self.siteMapper == None: + _logger.error("%s : SiteMapper==None" % self.jobID) + else: + # get dataset spec + if not self.datasetMap.has_key(file.destinationDBlock): + tmpDS = self.taskBuffer.queryDatasetWithMap({'name':file.destinationDBlock}) + self.datasetMap[file.destinationDBlock] = tmpDS + # check if valid dataset + if self.datasetMap[file.destinationDBlock] == None: + _logger.error("%s : cannot find %s in DB" % (self.jobID,file.destinationDBlock)) + else: + if not self.datasetMap[file.destinationDBlock].status in ['defined']: + # not a fresh dataset + _logger.debug("%s : subscription was already made for %s:%s" % \ + (self.jobID,self.datasetMap[file.destinationDBlock].status, + file.destinationDBlock)) + else: + # get DQ2 IDs + tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm + tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se) + if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE): + # DQ2 ID was set by using --destSE for analysis job to transfer output + tmpDstDDM = file.destinationSE + tmpDstSEs = file.destinationSE + else: + tmpDstDDM = self.siteMapper.getSite(file.destinationSE).ddm + tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(file.destinationSE).se) + # if src != dest or multi-token + if (tmpSrcDDM != tmpDstDDM and tmpSrcSEs != tmpDstSEs) or \ + (tmpSrcDDM == tmpDstDDM and file.destinationDBlockToken.count(',') != 0): + optSub = {'DATASET_COMPLETE_EVENT' : ['https://%s:%s/server/panda/datasetCompleted' % \ + (panda_config.pserverhost,panda_config.pserverport)]} + # append + if not subMap.has_key(file.destinationDBlock): + subMap[file.destinationDBlock] = [] + # sources + optSource = {} + # set sources for NL/FR/ES to handle T2s in another cloud + if self.job.cloud in ['NL','FR','ES']: + if file.destinationDBlockToken in ['NULL','']: + # use default DQ2 ID as source + optSource[tmpSrcDDM] = {'policy' : 0} + else: + # convert token to DQ2 ID + dq2ID = tmpSrcDDM + # use the first token's location as source for T1D1 + tmpSrcToken = file.destinationDBlockToken.split(',')[0] + if self.siteMapper.getSite(self.job.computingSite).setokens.has_key(tmpSrcToken): + dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens[tmpSrcToken] + optSource[dq2ID] = {'policy' : 0} + # use another location when token is set + if not file.destinationDBlockToken in ['NULL','']: + tmpDQ2IDList = [] + tmpDstTokens = file.destinationDBlockToken.split(',') + # remove the first one because it is already used as a location + if tmpSrcDDM == tmpDstDDM: + tmpDstTokens = tmpDstTokens[1:] + # loop over all tokens + 
for idxToken,tmpDstToken in enumerate(tmpDstTokens): + dq2ID = tmpDstDDM + if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpDstToken): + dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpDstToken] + # keep the fist destination for multi-hop + if idxToken == 0: + firstDestDDM = dq2ID + else: + # use the fist destination as source for T1D1 + optSource = {} + optSource[firstDestDDM] = {'policy' : 0} + # remove looping subscription + if dq2ID == tmpSrcDDM: + continue + # avoid duplication + if not dq2ID in tmpDQ2IDList: + subMap[file.destinationDBlock].append((dq2ID,optSub,optSource)) + else: + # use default DDM + for dq2ID in tmpDstDDM.split(','): + subMap[file.destinationDBlock].append((dq2ID,optSub,optSource)) + except: + # status + file.status = 'failed' + type, value, traceBack = sys.exc_info() + _logger.error("%s : %s %s" % (self.jobID,type,value)) + # cleanup submap + tmpKeys = subMap.keys() + for tmpKey in tmpKeys: + if subMap[tmpKey] == []: + del subMap[tmpKey] + # check consistency between XML and filesTable + for lfn in lfns: + if (not lfn in fileList) and (not lfn in inputLFNs): + _logger.error("%s %s is not found in filesTable" % (self.jobID,lfn)) + self.job.jobStatus = 'failed' + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "Adder._updateOutputs() XML is inconsistent with filesTable" + return + # return if PandaDDM is used or non-DQ2 + if self.pandaDDM or self.job.destinationSE == 'local': + return + # add data to original dataset + for destinationDBlock in idMap.keys(): + match = re.findall('(.+)_sub\d+$',destinationDBlock) + if len(match): + # add files to top-level datasets + if not self.goToTransferring: + origDBlock = match[0] + idMap[origDBlock] = idMap[destinationDBlock] + # add files to top-level datasets only + if self.addToTopOnly: + del idMap[destinationDBlock] + # print idMap + _logger.debug("%s idMap = %s" % (self.jobID,idMap)) + # add data + _logger.debug("%s addFiles start" % self.jobID) + # number of retry + nTry = 3 + for iTry in range(nTry): + # empty + if idMap == {}: + break + # add data to datasets + time.sleep(1) + _logger.debug((self.jobID, 'registerFilesInDatasets',idMap)) + status,out = ddm.DQ2.main('registerFilesInDatasets',idMap) + isFailed = False + if status != 0 and out.find('DQFileExistsInDatasetException') == -1 \ + and (out.find('The file LFN or GUID is already registered') == -1 or \ + out.find('already registered in vuid') == -1): + isFailed = True + if not isFailed: + _logger.debug('%s %s' % (self.jobID,out)) + # failed + if isFailed: + _logger.error('%s %s' % (self.jobID,out)) + if (iTry+1) == nTry or out.find('DQClosedDatasetException') != 0 or \ + out.find('DQFrozenDatasetException') != 0 or \ + out.find('DQUnknownDatasetException') != 0 or \ + out.find('DQFileMetaDataMismatchException') != 0: + self.job.jobStatus = 'failed' + self.job.ddmErrorCode = ErrorCode.EC_Adder + errMsg = "Adder._updateOutputs() could not add files to %s\n" % idMap.keys() + self.job.ddmErrorDiag = errMsg + out.split('\n')[-1] + return + _logger.error("%s Try:%s" % (self.jobID,iTry)) + # sleep + time.sleep(120) + else: + break + # register dataset subscription + subActivity = 'Production' + if not self.job.prodSourceLabel in ['user']: + # make DQ2 subscription for prod jobs + for tmpName,tmpVal in subMap.iteritems(): + for dq2ID,optSub,optSource in tmpVal: + _logger.debug((self.jobID,'registerDatasetSubscription',tmpName,dq2ID,0,0,optSub, + optSource,001000 | 010000,0,None,0,"production",None,subActivity,None,"14 
days")) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('registerDatasetSubscription',tmpName,dq2ID,0,0,optSub, + optSource,001000 | 010000,0,None,0,"production",None,subActivity,None,"14 days") + if (status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1) and \ + out.find('DQSubscriptionExistsException') == -1: + time.sleep(60) + else: + break + if status != 0 and (out != 'None' and out.find('DQSubscriptionExistsException') == -1): + _logger.error('%s %s' % (self.jobID,out)) + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "Adder._updateOutputs() could not register subscription : %s" % tmpName + return + _logger.debug('%s %s' % (self.jobID,out)) + # set dataset status + self.datasetMap[tmpName].status = 'running' + # keep subscriptions + self.subscriptionMap = subMap + else: + # send request to DaTRI + tmpTopDatasets = {} + # collect top-level datasets + for tmpName,tmpVal in subMap.iteritems(): + for dq2ID,optSub,optSource in tmpVal: + tmpTopName = re.sub('_sub\d+','',tmpName) + # append + if not tmpTopDatasets.has_key(tmpTopName): + tmpTopDatasets[tmpTopName] = [] + if not dq2ID in tmpTopDatasets[tmpTopName]: + tmpTopDatasets[tmpTopName].append(dq2ID) + # remove redundant CN from DN + tmpDN = self.job.prodUserID + tmpDN = re.sub('/CN=limited proxy','',tmpDN) + tmpDN = re.sub('(/CN=proxy)+$','',tmpDN) + # send request + if tmpTopDatasets != {} and self.jobStatus == 'finished': + try: + from datriHandler import datriHandler + if self.job.lockedby.startswith('Ganga'): + tmpHandler = datriHandler(type='ganga') + else: + tmpHandler = datriHandler(type='pathena') + # loop over all output datasets + for tmpDsName,dq2IDlist in tmpTopDatasets.iteritems(): + for tmpDQ2ID in dq2IDlist: + tmpMsg = "%s %s ds=%s site=%s id=%s" % (self.jobID,'datriHandler.sendRequest', + tmpDsName,tmpDQ2ID,tmpDN) + _logger.debug(tmpMsg) + tmpHandler.setParameters(data_pattern=tmpDsName, + site=tmpDQ2ID, + userid=tmpDN) + # number of retry + nTry = 3 + for iTry in range(nTry): + dhStatus,dhOut = tmpHandler.sendRequest() + # succeeded + if dhStatus == 0 or "such request is exist" in dhOut: + _logger.debug("%s %s %s" % (self.jobID,dhStatus,dhOut)) + break + if iTry+1 < nTry: + # sleep + time.sleep(60) + else: + # final attempt failed + tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,dhStatus,dhOut) + _logger.error(tmpMsg) + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "DaTRI failed for %s with %s %s" % (tmpDsName,dhStatus,dhOut) + return + # set dataset status + for tmpName,tmpVal in subMap.iteritems(): + self.datasetMap[tmpName].status = 'running' + except: + errType,errValue = sys.exc_info()[:2] + tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,errType,errValue) + _logger.error(tmpMsg) + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "DaTRI failed with %s %s" % (errType,errValue) + return + # properly finished + _logger.debug("%s addFiles end" % self.jobID) + + + # update shadow dataset + def _updateShadow(self): + # return if PandaDDM is used or non-DQ2 + if self.pandaDDM or self.job.destinationSE == 'local': + return + _logger.debug("%s updateShadow" % self.jobID) + # get shadow DS and contents + shadowList = [] + shadowFiles = [] + for file in self.job.Files: + if file.type == 'output' or file.type == 'log': + # get shadow name + shadowDS = re.sub('_sub\d+$','',file.destinationDBlock) + '_shadow' 
+ if not shadowDS in shadowList: + shadowList.append(shadowDS) + elif file.type == 'input': + # remove skipped files + if file.status in ['skipped']: + continue + # ignore lib.tgz + if re.search('lib\.tgz\.*\d*',file.lfn) != None: + continue + # ignore DBRelease + if re.search('DBRelease',file.lfn) != None: + continue + # ignore when noshadow is set + if file.destinationDBlockToken == 'noshadow': + continue + # fsize + fsize = None + if not file.fsize in ['NULL','',0]: + try: + fsize = long(file.fsize) + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s : %s %s" % (self.jobID,type,value)) + # append + if len(str(file.GUID))==36: + shadowFiles.append({'guid' : file.GUID, + 'lfn' : file.lfn, + 'size' : fsize, + 'checksum' : None}) + # create idMap + idMap = {} + for shadowDS in shadowList: + nTry = 3 + findFlag = False + for iTry in range(nTry): + # check if shadow dataset exists + _logger.debug((self.jobID, 'listDatasets',shadowDS,0,True)) + status,out = ddm.DQ2.main('listDatasets',shadowDS,0,True) + if status == 0: + if (out.find(shadowDS) == -1): + _logger.debug("%s shadow %s doesn't exist" % (self.jobID,shadowDS)) + else: + findFlag = True + break + # sleep + time.sleep(120) + # append + if findFlag and shadowFiles != []: + idMap[shadowDS] = shadowFiles + # add data + _logger.debug("%s shadow idMap = %s" % (self.jobID,idMap)) + if idMap == {}: + return + _logger.debug("%s addFilesToShadow start" % self.jobID) + # number of retry + nTry = 3 + for iTry in range(nTry): + # add data to datasets + time.sleep(1) + _logger.debug((self.jobID, 'registerFilesInDatasets',idMap)) + status,out = ddm.DQ2.main('registerFilesInDatasets',idMap) + isFailed = False + if status != 0 and out.find('DQFileExistsInDatasetException') == -1 \ + and (out.find('The file LFN or GUID is already registered') == -1 or \ + out.find('already registered in vuid') == -1): + isFailed = True + if not isFailed: + _logger.debug('%s %s' % (self.jobID,out)) + # failed + if isFailed: + _logger.error('%s %s' % (self.jobID,out)) + if (iTry+1) == nTry or out.find('DQClosedDatasetException') != 0 or \ + out.find('DQFrozenDatasetException') != 0 or \ + out.find('DQFileMetaDataMismatchException') != 0: + self.job.jobStatus = 'failed' + self.job.ddmErrorCode = ErrorCode.EC_Adder + errMsg = "Adder._updateOutputs() could not add files to %s\n" % idMap.keys() + self.job.ddmErrorDiag = errMsg + out.split('\n')[-1] + return + _logger.error("%s shadow Try:%s" % (self.jobID,iTry)) + # sleep + time.sleep(120) + else: + break + _logger.debug("%s addFilesToShadow end" % self.jobID) diff --git a/current/pandaserver/dataservice/Adder2.py b/current/pandaserver/dataservice/Adder2.py new file mode 100644 index 000000000..521526d7b --- /dev/null +++ b/current/pandaserver/dataservice/Adder2.py @@ -0,0 +1,1014 @@ +''' +add data to dataset + +''' + +import os +import re +import sys +import time +import fcntl +import datetime +import commands +import threading +import xml.dom.minidom +import ErrorCode +from dq2.clientapi import DQ2 +try: + from dq2.clientapi.cli import Register2 +except: + pass + +import brokerage.broker_util +import Closer + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('Adder') +Closer.initLogger(_logger) + + +class Adder (threading.Thread): + # constructor + def __init__(self,taskBuffer,jobID,fileCatalog,jobStatus,xmlFile='',ignoreDDMError=True,joinCloser=False, + addOutput=False,pandaDDM=False,siteMapper=None,attemptNr=None): + 
threading.Thread.__init__(self) + self.job = None + self.jobID = jobID + self.jobStatus = jobStatus + self.taskBuffer = taskBuffer + self.ignoreDDMError = ignoreDDMError + self.joinCloser = joinCloser + self.addOutput = addOutput + self.pandaDDM = pandaDDM + self.lockXML = None + self.datasetMap = {} + self.siteMapper = siteMapper + self.addToTopOnly = False + self.goToTransferring = False + self.logTransferring = False + self.subscriptionMap = {} + self.dq2api = None + self.attemptNr = attemptNr + # dump Catalog into file + if xmlFile=='': + if attemptNr == None: + self.xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,jobID,jobStatus, + commands.getoutput('uuidgen')) + else: + self.xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,jobID,jobStatus, + commands.getoutput('uuidgen'),attemptNr) + file = open(self.xmlFile,'w') + file.write(fileCatalog) + file.close() + else: + self.xmlFile = xmlFile + # exstract attemptNr + try: + tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1] + if re.search('^\d+$',tmpAttemptNr) != None: + self.attemptNr = int(tmpAttemptNr) + except: + pass + # main + def run(self): + try: + _logger.debug("%s new start: %s attemptNr=%s" % (self.jobID,self.jobStatus,self.attemptNr)) + # instantiate DQ2 + self.dq2api = DQ2.DQ2() + # lock XML except last trial + if self.addOutput and self.ignoreDDMError: + self.lockXML = open(self.xmlFile) + try: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB) + except: + _logger.debug("%s cannot get lock : %s" % (self.jobID,self.xmlFile)) + self.lockXML.close() + return + # query job + self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False, + fromArchived=False, + fromWaiting=False)[0] + # check if job has finished + if self.job == None: + _logger.debug('%s : not found' % self.jobID) + elif self.job.jobStatus in ['finished','failed','unknown','cancelled']: + _logger.error('%s : invalid state -> %s' % (self.jobID,self.job.jobStatus)) + elif self.attemptNr != None and self.job.attemptNr != self.attemptNr: + _logger.error('%s : wrong attemptNr -> job=%s <> %s' % (self.jobID,self.job.attemptNr,self.attemptNr)) + else: + # add files only to top-level datasets for transferring jobs + if self.job.jobStatus == 'transferring': + self.addToTopOnly = True + _logger.debug("%s adder for transferring" % self.jobID) + # use PandaDDM for ddm jobs + if self.job.prodSourceLabel == 'ddm': + self.pandaDDM = True + # set job status + self.job.jobStatus = self.jobStatus + # add outputs. 
Cannot add self.pandaDDM here since minidom.parse() produces seg-fault + if self.addOutput: + # check if the job should go to trasnferring + tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm + tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se) + destSEwasSet = False + brokenSched = False + if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(self.job.destinationSE): + # DQ2 ID was set by using --destSE for analysis job to transfer output + destSEwasSet = True + tmpDstDDM = self.job.destinationSE + tmpDstSEs = self.job.destinationSE + else: + tmpDstDDM = self.siteMapper.getSite(self.job.destinationSE).ddm + tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.destinationSE).se) + # protection against disappearance of dest from schedconfig + if not self.siteMapper.checkSite(self.job.destinationSE) and self.job.destinationSE != 'local': + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "destinaitonSE %s is unknown in schedconfig" % self.job.destinationSE + self.job.jobStatus = 'failed' + self.jobStatus = 'failed' + _logger.error("%s %s" % (self.jobID,self.job.ddmErrorDiag)) + brokenSched = True + # protection against disappearance of src from schedconfig + if not self.siteMapper.checkSite(self.job.computingSite): + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "computingSite %s is unknown in schedconfig" % self.job.computingSite + self.job.jobStatus = 'failed' + self.jobStatus = 'failed' + _logger.error("%s %s" % (self.jobID,self.job.ddmErrorDiag)) + brokenSched = True + _logger.debug('%s DDM src:%s dst:%s' % (self.jobID,tmpSrcDDM,tmpDstDDM)) + _logger.debug('%s SE src:%s dst:%s' % (self.jobID,tmpSrcSEs,tmpDstSEs)) + if re.search('^ANALY_',self.job.computingSite) != None: + # analysis site + pass + elif self.job.computingSite == self.job.destinationSE: + # same site ID for computingSite and destinationSE + pass + elif tmpSrcDDM == tmpDstDDM: + # same DQ2ID for src/dest + pass + elif tmpSrcSEs == tmpDstSEs: + # same SEs + pass + elif self.addToTopOnly: + # already in transferring + pass + elif self.job.jobStatus == 'failed': + # failed jobs + if self.job.prodSourceLabel in ['managed','test']: + self.logTransferring = True + pass + else: + self.goToTransferring = True + _logger.debug('%s goToTransferring=%s' % (self.jobID,self.goToTransferring)) + _logger.debug('%s logTransferring=%s' % (self.jobID,self.logTransferring)) + if not brokenSched: + self._updateOutputs() + else: + _logger.debug('%s : not added' % self.jobID) + _logger.debug('%s escape' % self.jobID) + return + _logger.debug('%s updated outputs' % self.jobID) + # ignore DDMError + if self.ignoreDDMError and \ + (re.search('could not add files',self.job.ddmErrorDiag) != None or \ + re.search('could not register subscription',self.job.ddmErrorDiag) != None) and \ + re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None and \ + re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None and \ + re.search('DQUnknownDatasetException',self.job.ddmErrorDiag) == None and \ + re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None and \ + re.search('DQDatasetExistsException',self.job.ddmErrorDiag) == None and \ + re.search('Exceeded the maximum number of files',self.job.ddmErrorDiag) == None and \ + re.search('KeyError',self.job.ddmErrorDiag) == None and \ + not self.job.ddmErrorCode in [ErrorCode.EC_Subscription]: + _logger.debug('%s : 
ignore %s ' % (self.jobID,self.job.ddmErrorDiag)) + _logger.debug('%s escape' % self.jobID) + # unlock XML + try: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + self.lockXML.close() + except: + type, value, traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s cannot unlock XML" % self.jobID) + return + # update shadow dataset + """ + if self.job.prodSourceLabel == 'user' and self.jobStatus == 'finished' and \ + (self.job.ddmErrorDiag == 'NULL' or re.search('DaTRI failed',self.job.ddmErrorDiag) != None) and \ + not self.goToTransferring: + self._updateShadow() + # ignore DDMError + if self.ignoreDDMError and re.search('could not add files',self.job.ddmErrorDiag) != None \ + and re.search('DQClosedDatasetException',self.job.ddmErrorDiag) == None \ + and re.search('DQFrozenDatasetException',self.job.ddmErrorDiag) == None \ + and re.search('DQFileMetaDataMismatchException',self.job.ddmErrorDiag) == None \ + and re.search('Exceeded the maximum number of files',self.job.ddmErrorDiag) == None \ + and re.search('KeyError',self.job.ddmErrorDiag) == None: + _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag)) + _logger.debug('%s escape' % self.jobID) + # unlock XML + try: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + self.lockXML.close() + except: + type, value, traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s cannot unlock XML" % self.jobID) + return + """ + # remove unmerged + if self.job.processingType == 'usermerge' and self.job.prodSourceLabel == 'user' and \ + self.jobStatus == 'finished' and self.job.ddmErrorDiag == 'NULL': + retMerge = self._removeUnmerged() + # ignore DDMError + if self.ignoreDDMError and retMerge == None: + _logger.debug('%s : ignore %s ' % (self.jobID,self.job.ddmErrorDiag)) + _logger.debug('%s escape' % self.jobID) + # unlock XML + try: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + self.lockXML.close() + except: + type, value, traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s cannot unlock XML" % self.jobID) + return + # set file status + if self.job.jobStatus == 'failed': + for file in self.job.Files: + if file.type == 'output' or file.type == 'log': + file.status = 'failed' + else: + # reset errors + self.job.jobDispatcherErrorCode = 0 + self.job.jobDispatcherErrorDiag = 'NULL' + # set job status + hasOutput = False + if self.goToTransferring or self.subscriptionMap != {}: + # set status to transferring + for file in self.job.Files: + if file.type == 'output' or file.type == 'log' or \ + self.subscriptionMap.has_key(file.destinationDBlock): + file.status = 'transferring' + hasOutput = True + if hasOutput: + self.job.jobStatus = 'transferring' + # propagate transition to prodDB + self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + # endtime + if self.job.endTime=='NULL': + self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + # output size and # of outputs + self.job.nOutputDataFiles = 0 + self.job.outputFileBytes = 0 + for tmpFile in self.job.Files: + if tmpFile.type == 'output': + self.job.nOutputDataFiles += 1 + try: + self.job.outputFileBytes += tmpFile.fsize + except: + pass + # protection + maxOutputFileBytes = 99999999999 + if self.job.outputFileBytes > maxOutputFileBytes: + self.job.outputFileBytes = maxOutputFileBytes + # set cancelled state + if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed': + 
self.job.jobStatus = 'cancelled' + # update job + retU = self.taskBuffer.updateJobs([self.job],False) + _logger.debug("%s retU: %s" % (self.jobID,retU)) + # failed + if not retU[0]: + _logger.error('failed to update DB for %s' % self.jobID) + # unlock XML + try: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + self.lockXML.close() + except: + type, value, traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s cannot unlock XML" % self.jobID) + return + # setup for closer + destDBList = [] + guidList = [] + for file in self.job.Files: + # ignore inputs + if file.type == 'input': + continue + # start closer for output/log datasets + if not file.destinationDBlock in destDBList: + destDBList.append(file.destinationDBlock) + # collect GUIDs + if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test'] and \ + self.job.processingType in ['pathena','prun','gangarobot-rctest'])) \ + and file.type == 'output': + guidList.append({'lfn':file.lfn,'guid':file.GUID,'type':file.type, + 'checksum':file.checksum,'md5sum':file.md5sum, + 'fsize':file.fsize,'scope':file.scope}) + if guidList != []: + retG = self.taskBuffer.setGUIDs(guidList) + if destDBList != []: + # start Closer + cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,pandaDDM=self.pandaDDM, + datasetMap=self.datasetMap) + _logger.debug("%s start Closer" % self.jobID) + cThr.start() + if self.joinCloser: + cThr.join() + _logger.debug("%s end Closer" % self.jobID) + _logger.debug("%s end" % self.jobID) + try: + # remove Catalog + os.remove(self.xmlFile) + except: + pass + # unlock XML + if self.lockXML != None: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + self.lockXML.close() + except: + type, value, traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s except" % self.jobID) + # unlock XML just in case + try: + if self.lockXML != None: + fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN) + except: + type, value, traceBack = sys.exc_info() + _logger.debug("%s : %s %s" % (self.jobID,type,value)) + _logger.debug("%s cannot unlock XML" % self.jobID) + + + # update output files + def _updateOutputs(self): + # get LFN and GUID + _logger.debug("%s %s" % (self.jobID,self.xmlFile)) + # no outputs + if self.job.Files == []: + _logger.debug("%s has no outputs" % self.jobID) + _logger.debug("%s addFiles end" % self.jobID) + return + # get input files + inputLFNs = [] + for file in self.job.Files: + if file.type == 'input': + inputLFNs.append(file.lfn) + # parse XML + lfns = [] + guids = [] + fsizes = [] + md5sums = [] + chksums = [] + surls = [] + try: + root = xml.dom.minidom.parse(self.xmlFile) + files = root.getElementsByTagName('File') + for file in files: + # get GUID + guid = str(file.getAttribute('ID')) + _logger.debug(guid) + # get PFN and LFN nodes + logical = file.getElementsByTagName('logical')[0] + lfnNode = logical.getElementsByTagName('lfn')[0] + # convert UTF8 to Raw + lfn = str(lfnNode.getAttribute('name')) + # get metadata + fsize = None + md5sum = None + adler32 = None + surl = None + for meta in file.getElementsByTagName('metadata'): + # get fsize + name = str(meta.getAttribute('att_name')) + if name == 'fsize': + fsize = long(meta.getAttribute('att_value')) + elif name == 'md5sum': + md5sum = str(meta.getAttribute('att_value')) + # check + if re.search("^[a-fA-F0-9]{32}$",md5sum) == None: + md5sum = None + elif name == 'adler32': + adler32 = str(meta.getAttribute('att_value')) + elif name == 
'surl': + surl = str(meta.getAttribute('att_value')) + # error check + if (not lfn in inputLFNs) and (fsize == None or (md5sum == None and adler32 == None) \ + or (self.useCentralLFC() and surl == None)): + raise RuntimeError, 'fsize/md5sum/adler32/surl=None' + # append + lfns.append(lfn) + guids.append(guid) + fsizes.append(fsize) + md5sums.append(md5sum) + surls.append(surl) + if adler32 != None: + # use adler32 if available + chksums.append("ad:%s" % adler32) + else: + chksums.append("md5:%s" % md5sum) + except: + # check if file exists + if os.path.exists(self.xmlFile): + type, value, traceBack = sys.exc_info() + _logger.error("%s : %s %s" % (self.jobID,type,value)) + # set failed anyway + self.job.jobStatus = 'failed' + # XML error happens when pilot got killed due to wall-time limit or failures in wrapper + if (self.job.pilotErrorCode in [0,'0','NULL']) and \ + (self.job.transExitCode in [0,'0','NULL']): + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "Adder._updateOutputs() could not get GUID/LFN/MD5/FSIZE/SURL" + return + else: + # XML was deleted + self.job.ddmErrorDiag = "Adder._updateOutputs() could not add files" + self.ignoreDDMError = True + return + # check files + idMap = {} + fileList = [] + subMap = {} + for file in self.job.Files: + if file.type == 'input': + if file.lfn in lfns: + if self.job.prodSourceLabel in ['user','panda']: + # skipped file + file.status = 'skipped' + elif self.job.prodSourceLabel in ['managed','test','rc_test','ptest']: + # failed by pilot + file.status = 'failed' + elif file.type == 'output' or file.type == 'log': + # append to fileList + fileList.append(file.lfn) + # add only log file for failed jobs + if self.jobStatus == 'failed' and file.type != 'log': + continue + # add only log file for unmerge jobs + if self.job.prodSourceLabel == 'panda' and self.job.processingType in ['unmerge'] \ + and file.type != 'log': + continue + # look for GUID with LFN + try: + i = lfns.index(file.lfn) + file.GUID = guids[i] + file.fsize = fsizes[i] + file.md5sum = md5sums[i] + file.checksum = chksums[i] + surl = surls[i] + # status + file.status = 'ready' + # fsize + fsize = None + if not file.fsize in ['NULL','',0]: + try: + fsize = long(file.fsize) + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s : %s %s" % (self.jobID,type,value)) + # append to map + if not idMap.has_key(file.destinationDBlock): + idMap[file.destinationDBlock] = [] + fileAttrs = {'guid' : file.GUID, + 'lfn' : lfns[i], + 'size' : fsize, + 'checksum' : file.checksum} + # add SURLs if LFC registration is required + if self.useCentralLFC(): + fileAttrs['surl'] = surl + idMap[file.destinationDBlock].append(fileAttrs) + # for subscription + if self.job.prodSourceLabel in ['managed','test','software','rc_test','ptest','user'] and \ + re.search('_sub\d+$',file.destinationDBlock) != None and (not self.addToTopOnly) and \ + self.job.destinationSE != 'local': + if self.siteMapper == None: + _logger.error("%s : SiteMapper==None" % self.jobID) + else: + # get dataset spec + if not self.datasetMap.has_key(file.destinationDBlock): + tmpDS = self.taskBuffer.queryDatasetWithMap({'name':file.destinationDBlock}) + self.datasetMap[file.destinationDBlock] = tmpDS + # check if valid dataset + if self.datasetMap[file.destinationDBlock] == None: + _logger.error("%s : cannot find %s in DB" % (self.jobID,file.destinationDBlock)) + else: + if not self.datasetMap[file.destinationDBlock].status in ['defined']: + # not a fresh dataset + _logger.debug("%s : subscription was 
already made for %s:%s" % \ + (self.jobID,self.datasetMap[file.destinationDBlock].status, + file.destinationDBlock)) + else: + # get DQ2 IDs + tmpSrcDDM = self.siteMapper.getSite(self.job.computingSite).ddm + tmpSrcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(self.job.computingSite).se) + if self.job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE): + # DQ2 ID was set by using --destSE for analysis job to transfer output + tmpDstDDM = file.destinationSE + tmpDstSEs = file.destinationSE + else: + tmpDstDDM = self.siteMapper.getSite(file.destinationSE).ddm + tmpDstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(file.destinationSE).se) + # if src != dest or multi-token + if (tmpSrcDDM != tmpDstDDM and tmpSrcSEs != tmpDstSEs) or \ + (tmpSrcDDM == tmpDstDDM and file.destinationDBlockToken.count(',') != 0): + optSub = {'DATASET_COMPLETE_EVENT' : ['https://%s:%s/server/panda/datasetCompleted' % \ + (panda_config.pserverhost,panda_config.pserverport)]} + # append + if not subMap.has_key(file.destinationDBlock): + subMap[file.destinationDBlock] = [] + # sources + optSource = {} + # set sources + if file.destinationDBlockToken in ['NULL','']: + # use default DQ2 ID as source + optSource[tmpSrcDDM] = {'policy' : 0} + else: + # convert token to DQ2 ID + dq2ID = tmpSrcDDM + # use the first token's location as source for T1D1 + tmpSrcToken = file.destinationDBlockToken.split(',')[0] + if self.siteMapper.getSite(self.job.computingSite).setokens.has_key(tmpSrcToken): + dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens[tmpSrcToken] + optSource[dq2ID] = {'policy' : 0} + # T1 used as T2 + if self.siteMapper.getSite(self.job.computingSite).cloud != self.job.cloud and \ + (not tmpSrcDDM.endswith('PRODDISK')) and \ + (not self.job.prodSourceLabel in ['user','panda']): + # register both DATADISK and PRODDISK as source locations + if self.siteMapper.getSite(self.job.computingSite).setokens.has_key('ATLASPRODDISK'): + dq2ID = self.siteMapper.getSite(self.job.computingSite).setokens['ATLASPRODDISK'] + optSource[dq2ID] = {'policy' : 0} + if not optSource.has_key(tmpSrcDDM): + optSource[tmpSrcDDM] = {'policy' : 0} + # use another location when token is set + if not file.destinationDBlockToken in ['NULL','']: + tmpDQ2IDList = [] + tmpDstTokens = file.destinationDBlockToken.split(',') + # remove the first one because it is already used as a location + if tmpSrcDDM == tmpDstDDM: + tmpDstTokens = tmpDstTokens[1:] + # loop over all tokens + for idxToken,tmpDstToken in enumerate(tmpDstTokens): + dq2ID = tmpDstDDM + if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpDstToken): + dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpDstToken] + # keep the fist destination for multi-hop + if idxToken == 0: + firstDestDDM = dq2ID + else: + # use the fist destination as source for T1D1 + optSource = {} + optSource[firstDestDDM] = {'policy' : 0} + # remove looping subscription + if dq2ID == tmpSrcDDM: + continue + # avoid duplication + if not dq2ID in tmpDQ2IDList: + subMap[file.destinationDBlock].append((dq2ID,optSub,optSource)) + else: + # use default DDM + for dq2ID in tmpDstDDM.split(','): + subMap[file.destinationDBlock].append((dq2ID,optSub,optSource)) + except: + # status + file.status = 'failed' + type, value, traceBack = sys.exc_info() + _logger.error("%s : %s %s" % (self.jobID,type,value)) + # cleanup submap + tmpKeys = subMap.keys() + for tmpKey in tmpKeys: + if subMap[tmpKey] == []: + del 
subMap[tmpKey] + # check consistency between XML and filesTable + for lfn in lfns: + if (not lfn in fileList) and (not lfn in inputLFNs): + _logger.error("%s %s is not found in filesTable" % (self.jobID,lfn)) + self.job.jobStatus = 'failed' + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "Adder._updateOutputs() XML is inconsistent with filesTable" + return + # return if PandaDDM is used or non-DQ2 + if self.pandaDDM or self.job.destinationSE == 'local': + return + # add data to original dataset + for destinationDBlock in idMap.keys(): + origDBlock = None + match = re.search('^(.+)_sub\d+$',destinationDBlock) + if match != None: + # add files to top-level datasets + origDBlock = match.group(1) + if not self.goToTransferring: + idMap[origDBlock] = idMap[destinationDBlock] + # add files to top-level datasets only + if self.addToTopOnly: + del idMap[destinationDBlock] + # skip sub unless getting transferred + if origDBlock != None: + if not self.goToTransferring and not self.logTransferring \ + and idMap.has_key(destinationDBlock): + del idMap[destinationDBlock] + # print idMap + _logger.debug("%s idMap = %s" % (self.jobID,idMap)) + _logger.debug("%s subMap = %s" % (self.jobID,subMap)) + # add data + _logger.debug("%s addFiles start" % self.jobID) + # count the number of files + regNumFiles = 0 + regFileList = [] + for tmpRegDS,tmpRegList in idMap.iteritems(): + for tmpRegItem in tmpRegList: + if not tmpRegItem['lfn'] in regFileList: + regNumFiles += 1 + regFileList.append(tmpRegItem['lfn']) + # number of retry + nTry = 3 + for iTry in range(nTry): + # empty + if idMap == {}: + break + # add data to datasets + time.sleep(1) + isFailed = False + isFatal = False + setErrorDiag = False + out = 'OK' + fatalErrStrs = ['[ORA-00001] unique constraint (ATLAS_DQ2.UQ_01_FILES_GUID) violated'] + regStart = datetime.datetime.utcnow() + try: + if not self.useCentralLFC(): + regMsgStr = "DQ2 registraion for %s files " % regNumFiles + _logger.debug('%s %s %s' % (self.jobID,'registerFilesInDatasets',str(idMap))) + self.dq2api.registerFilesInDatasets(idMap) + else: + regMsgStr = "LFC+DQ2 registraion for %s files " % regNumFiles + _logger.debug('%s %s %s' % (self.jobID,'Register.registerFilesInDatasets',str(idMap))) + registerAPI = Register2.Register(self.siteMapper.getSite(self.job.computingSite).ddm) + out = registerAPI.registerFilesInDatasets(idMap) + except DQ2.DQFileExistsInDatasetException: + # hamless error + errType,errValue = sys.exc_info()[:2] + out = '%s : %s' % (errType,errValue) + except (DQ2.DQClosedDatasetException, + DQ2.DQFrozenDatasetException, + DQ2.DQUnknownDatasetException, + DQ2.DQFileMetaDataMismatchException): + # fatal errors + errType,errValue = sys.exc_info()[:2] + out = '%s : %s' % (errType,errValue) + isFatal = True + except: + # unknown errors + errType,errValue = sys.exc_info()[:2] + out = '%s : %s' % (errType,errValue) + for tmpFatalErrStr in fatalErrStrs: + if tmpFatalErrStr in str(errValue): + self.job.ddmErrorDiag = 'failed to add files : ' + tmpFatalErrStr + setErrorDiag = True + break + isFatal = True + regTime = datetime.datetime.utcnow() - regStart + _logger.debug('%s ' % self.jobID + regMsgStr + \ + 'took %s.%03d sec' % (regTime.seconds,regTime.microseconds/1000)) + # failed + if isFailed or isFatal: + _logger.error('%s %s' % (self.jobID,out)) + if (iTry+1) == nTry or isFatal: + self.job.jobStatus = 'failed' + self.job.ddmErrorCode = ErrorCode.EC_Adder + if not setErrorDiag: + errMsg = "Adder._updateOutputs() could not add files : " + 
self.job.ddmErrorDiag = errMsg + out.split('\n')[-1] + return + _logger.error("%s Try:%s" % (self.jobID,iTry)) + # sleep + time.sleep(120) + else: + _logger.debug('%s %s' % (self.jobID,out)) + break + # register dataset subscription + subActivity = 'Production' + if not self.job.prodSourceLabel in ['user']: + # make DQ2 subscription for prod jobs + for tmpName,tmpVal in subMap.iteritems(): + for dq2ID,optSub,optSource in tmpVal: + _logger.debug("%s %s %s %s" % (self.jobID,'registerDatasetSubscription', + (tmpName,dq2ID), + {'version':0,'archived':0,'callbacks':optSub, + 'sources':optSource,'sources_policy':(001000 | 010000), + 'wait_for_sources':0,'destination':None,'query_more_sources':0, + 'sshare':"production",'group':None,'activity':subActivity, + 'acl_alias':None,'replica_lifetime':"14 days"})) + for iDDMTry in range(3): + out = 'OK' + isFailed = False + try: + self.dq2api.registerDatasetSubscription(tmpName,dq2ID,version=0,archived=0,callbacks=optSub, + sources=optSource,sources_policy=(001000 | 010000), + wait_for_sources=0,destination=None,query_more_sources=0, + sshare="production",group=None,activity=subActivity, + acl_alias=None,replica_lifetime="14 days") + except DQ2.DQSubscriptionExistsException: + # harmless error + errType,errValue = sys.exc_info()[:2] + out = '%s : %s' % (errType,errValue) + except: + # unknown errors + errType,errValue = sys.exc_info()[:2] + out = '%s : %s' % (errType,errValue) + isFailed = True + if 'is not a Tiers of Atlas Destination' in str(errValue) or \ + 'is not in Tiers of Atlas' in str(errValue): + # fatal error + self.job.ddmErrorCode = ErrorCode.EC_Subscription + else: + # retry for temporary errors + time.sleep(60) + else: + break + if isFailed: + _logger.error('%s %s' % (self.jobID,out)) + if self.job.ddmErrorCode == ErrorCode.EC_Subscription: + # fatal error + self.job.ddmErrorDiag = "subscription failure with %s" % out + self.job.jobStatus = 'failed' + else: + # temoprary errors + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "Adder._updateOutputs() could not register subscription : %s" % tmpName + return + _logger.debug('%s %s' % (self.jobID,out)) + # set dataset status + self.datasetMap[tmpName].status = 'running' + # keep subscriptions + self.subscriptionMap = subMap + elif not "--mergeOutput" in self.job.jobParameters: + # send request to DaTRI unless files will be merged + tmpTopDatasets = {} + # collect top-level datasets + for tmpName,tmpVal in subMap.iteritems(): + for dq2ID,optSub,optSource in tmpVal: + tmpTopName = re.sub('_sub\d+','',tmpName) + # append + if not tmpTopDatasets.has_key(tmpTopName): + tmpTopDatasets[tmpTopName] = [] + if not dq2ID in tmpTopDatasets[tmpTopName]: + tmpTopDatasets[tmpTopName].append(dq2ID) + # remove redundant CN from DN + tmpDN = self.job.prodUserID + tmpDN = re.sub('/CN=limited proxy','',tmpDN) + tmpDN = re.sub('(/CN=proxy)+$','',tmpDN) + # send request + if tmpTopDatasets != {} and self.jobStatus == 'finished': + try: + from datriHandler import datriHandler + if self.job.lockedby.startswith('Ganga'): + tmpHandler = datriHandler(type='ganga') + else: + tmpHandler = datriHandler(type='pathena') + # loop over all output datasets + for tmpDsName,dq2IDlist in tmpTopDatasets.iteritems(): + for tmpDQ2ID in dq2IDlist: + tmpMsg = "%s %s ds=%s site=%s id=%s" % (self.jobID,'datriHandler.sendRequest', + tmpDsName,tmpDQ2ID,tmpDN) + _logger.debug(tmpMsg) + tmpHandler.setParameters(data_pattern=tmpDsName, + site=tmpDQ2ID, + userid=tmpDN) + # number of retry + nTry = 3 + for iTry in 
range(nTry): + dhStatus,dhOut = tmpHandler.sendRequest() + # succeeded + if dhStatus == 0 or "such request is exist" in dhOut: + _logger.debug("%s %s %s" % (self.jobID,dhStatus,dhOut)) + break + # faital errors + if "No input data or input data is incorrect" in dhOut: + tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,dhStatus,dhOut) + _logger.error(tmpMsg) + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "DaTRI failed for %s with %s %s" % (tmpDsName,dhStatus,dhOut) + return + # retry + if iTry+1 < nTry: + # sleep + time.sleep(60) + else: + # final attempt failed + tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,dhStatus,dhOut) + _logger.error(tmpMsg) + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "DaTRI failed for %s with %s %s" % (tmpDsName,dhStatus,dhOut) + return + # set dataset status + for tmpName,tmpVal in subMap.iteritems(): + self.datasetMap[tmpName].status = 'running' + except: + errType,errValue = sys.exc_info()[:2] + tmpMsg = "%s datriHandler failed with %s %s" % (self.jobID,errType,errValue) + _logger.error(tmpMsg) + self.job.ddmErrorCode = ErrorCode.EC_Adder + self.job.ddmErrorDiag = "DaTRI failed with %s %s" % (errType,errValue) + return + # properly finished + _logger.debug("%s addFiles end" % self.jobID) + + + # update shadow dataset + def _updateShadow(self): + # return if PandaDDM is used or non-DQ2 + if self.pandaDDM or self.job.destinationSE == 'local': + return + _logger.debug("%s updateShadow" % self.jobID) + # get shadow DS and contents + shadowList = [] + shadowFiles = [] + for file in self.job.Files: + if file.type == 'output' or file.type == 'log': + # get shadow name + shadowDS = re.sub('_sub\d+$','',file.destinationDBlock) + '_shadow' + if not shadowDS in shadowList: + shadowList.append(shadowDS) + elif file.type == 'input': + # remove skipped files + if file.status in ['skipped']: + continue + # ignore lib.tgz + if re.search('lib\.tgz\.*\d*',file.lfn) != None: + continue + # ignore DBRelease + if re.search('DBRelease',file.lfn) != None: + continue + # ignore when noshadow is set + if file.destinationDBlockToken == 'noshadow': + continue + # fsize + fsize = None + if not file.fsize in ['NULL','',0]: + try: + fsize = long(file.fsize) + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s : %s %s" % (self.jobID,type,value)) + # append + if len(str(file.GUID))==36: + shadowFiles.append({'guid' : file.GUID, + 'lfn' : file.lfn, + 'size' : fsize, + 'checksum' : None}) + # create idMap + idMap = {} + for shadowDS in shadowList: + nTry = 3 + findFlag = False + for iTry in range(nTry): + # check if shadow dataset exists + _logger.debug((self.jobID, 'listDatasets',shadowDS,0,True)) + try: + out = self.dq2api.listDatasets(shadowDS,0,True) + if not out.has_key(shadowDS): + _logger.debug("%s shadow %s doesn't exist" % (self.jobID,shadowDS)) + else: + findFlag = True + break + except: + # sleep + time.sleep(120) + # append + if findFlag and shadowFiles != []: + idMap[shadowDS] = shadowFiles + # add data + _logger.debug("%s shadow idMap = %s" % (self.jobID,idMap)) + if idMap == {}: + return + _logger.debug("%s addFilesToShadow start" % self.jobID) + # number of retry + nTry = 3 + for iTry in range(nTry): + # add data to datasets + _logger.debug((self.jobID, 'registerFilesInDatasets',idMap)) + isFailed = False + isFatal = False + out = 'OK' + try: + self.dq2api.registerFilesInDatasets(idMap) + except DQ2.DQFileExistsInDatasetException: + # hamless error + errType,errValue = sys.exc_info()[:2] + 
out = '%s : %s' % (errType,errValue) + except (DQ2.DQClosedDatasetException, + DQ2.DQFrozenDatasetException, + DQ2.DQUnknownDatasetException, + DQ2.DQFileMetaDataMismatchException): + # fatal errors + errType,errValue = sys.exc_info()[:2] + out = '%s : %s' % (errType,errValue) + isFatal = True + except: + # unknown errors + errType,errValue = sys.exc_info()[:2] + out = '%s : %s' % (errType,errValue) + isFatal = True + # failed + if isFailed or isFatal: + _logger.error('%s %s' % (self.jobID,out)) + if (iTry+1) == nTry or isFatal: + self.job.jobStatus = 'failed' + self.job.ddmErrorCode = ErrorCode.EC_Adder + errMsg = "Adder._updateOutputs() could not add files : " + self.job.ddmErrorDiag = errMsg + out.split('\n')[-1] + return + _logger.error("%s shadow Try:%s" % (self.jobID,iTry)) + # sleep + time.sleep(120) + else: + _logger.debug('%s %s' % (self.jobID,out)) + break + _logger.debug("%s addFilesToShadow end" % self.jobID) + + + # use cerntral LFC + def useCentralLFC(self): + tmpSiteSpec = self.siteMapper.getSite(self.job.computingSite) + if not self.addToTopOnly and tmpSiteSpec.lfcregister in ['server']: + return True + return False + + + # remove unmerged files + def _removeUnmerged(self): + _logger.debug("%s removeUnmerged" % self.jobID) + # get input files + inputFileGUIDs = [] + inputFileStr = '' + for file in self.job.Files: + if file.type == 'input': + # remove skipped files + if file.status in ['skipped']: + continue + # ignore lib.tgz + if re.search('lib\.tgz\.*\d*',file.lfn) != None: + continue + # ignore DBRelease + if re.search('DBRelease',file.lfn) != None: + continue + # append + inputFileGUIDs.append(file.GUID) + inputFileStr += '%s,' % file.lfn + # extract parent dataset name + tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters) + # failed + if tmpMatch == None: + _logger.error("%s failed to extract parentDS from params=%s" % (self.jobID,self.job.jobParameters)) + return False + parentDS = tmpMatch.group(1) + # delete + _logger.debug("%s deleteFilesFromDataset %s %s" % (self.jobID,parentDS,inputFileStr[:-1])) + nTry = 3 + for iTry in range(nTry): + # add data to datasets + isFailed = False + isFatal = False + out = 'OK' + try: + self.dq2api.deleteFilesFromDataset(parentDS,inputFileGUIDs) + except (DQ2.DQClosedDatasetException, + DQ2.DQFrozenDatasetException, + DQ2.DQUnknownDatasetException, + DQ2.DQFileMetaDataMismatchException): + # fatal errors + errType,errValue = sys.exc_info()[:2] + out = '%s : %s' % (errType,errValue) + isFatal = True + except: + # unknown errors + errType,errValue = sys.exc_info()[:2] + out = '%s : %s' % (errType,errValue) + isFailed = True + # failed + if isFailed or isFatal: + _logger.error('%s %s' % (self.jobID,out)) + if (iTry+1) == nTry or isFatal: + self.job.jobStatus = 'failed' + self.job.ddmErrorCode = ErrorCode.EC_Adder + errMsg = "failed to remove unmerged files : " + self.job.ddmErrorDiag = errMsg + out.split('\n')[-1] + if not isFatal: + # retrun None to retry later + return None + return False + _logger.error("%s removeUnmerged Try:%s" % (self.jobID,iTry)) + # sleep + time.sleep(120) + else: + _logger.debug('%s %s' % (self.jobID,out)) + break + # succeeded + _logger.debug("%s removeUnmerged end" % self.jobID) + return True diff --git a/current/pandaserver/dataservice/AddressFinder.py b/current/pandaserver/dataservice/AddressFinder.py new file mode 100644 index 000000000..c96099bff --- /dev/null +++ b/current/pandaserver/dataservice/AddressFinder.py @@ -0,0 +1,308 @@ +import re +import sys +import urllib +import commands + 
+from config import panda_config
+from taskbuffer.OraDBProxy import DBProxy
+from pandalogger.PandaLogger import PandaLogger
+
+# logger
+_logger = PandaLogger().getLogger('AddressFinder')
+
+# NG words in email address
+_ngWordsInMailAddr = ['support','system','stuff','service','secretariat','club','user']
+
+
+# insert *
+def insertWC(str):
+    retStr = ".*"
+    for item in str:
+        retStr += item
+        retStr += ".*"
+    return retStr
+
+
+# clean name
+def cleanName(dn):
+    # extract First Last from DN
+    dbProxy = DBProxy()
+    extractedDN = dbProxy.cleanUserID(dn)
+    # replace -.
+    extractedDN = re.sub('-|\.',' ',extractedDN)
+    # change to lower
+    extractedDN = extractedDN.lower()
+    # remove ATLAS
+    extractedDN = re.sub('\(*atlas\)*','',extractedDN)
+    # remove numbers
+    extractedDN = re.sub('\d*','',extractedDN)
+    # remove Jr
+    extractedDN = re.sub(' jr( |$)',' ',extractedDN)
+    # remove whitespaces
+    extractedDN = re.sub(' +',' ',extractedDN)
+    extractedDN = extractedDN.strip()
+    # return
+    return extractedDN
+
+
+# get email address using phonebook
+def getEmailPhonebook(dn):
+    _logger.debug('Getting email via phonebook for %s' % dn)
+    # clean DN
+    extractedDN = cleanName(dn)
+    # dump
+    _logger.debug(extractedDN)
+    # construct command
+    for sTry in ['full','full_rev','fullwc','fullwc_rev,',
+                 'suronly', 'firstonly','suronly_rev','firstonly_rev',
+                 'email']:
+        if sTry == 'full':
+            # try full name
+            com = '~atlpan/phonebook --firstname "%s" --surname "%s" --all' \
+                  % (extractedDN.split()[0],extractedDN.split()[-1])
+        if sTry == 'full_rev':
+            # try full name
+            com = '~atlpan/phonebook --firstname "%s" --surname "%s" --all' \
+                  % (extractedDN.split()[-1],extractedDN.split()[0])
+        elif sTry == 'fullwc':
+            # try full name with wildcard
+            com = '~atlpan/phonebook --firstname "*%s*" --surname "*%s*" --all' \
+                  % (extractedDN.split()[0],extractedDN.split()[-1])
+        elif sTry == 'fullwc_rev':
+            # try full name with wildcard
+            com = '~atlpan/phonebook --firstname "*%s*" --surname "*%s*" --all' \
+                  % (extractedDN.split()[-1],extractedDN.split()[0])
+        elif sTry == 'suronly':
+            if len(extractedDN.split()) == 2:
+                # try surname only
+                com = '~atlpan/phonebook --surname "%s" --all' \
+                      % extractedDN.split()[-1]
+            else:
+                # try surname with wildcard
+                com = '~atlpan/phonebook --surname "*%s*" --all' \
+                      % extractedDN.split()[-1]
+        elif sTry == 'suronly_rev':
+            if len(extractedDN.split()) == 2:
+                # try surname only
+                com = '~atlpan/phonebook --surname "%s" --all' \
+                      % extractedDN.split()[0]
+            else:
+                # try surname with wildcard
+                com = '~atlpan/phonebook --surname "*%s*" --all' \
+                      % extractedDN.split()[0]
+        elif sTry == 'firstonly':
+            if len(extractedDN.split()) == 2:
+                # try firstname only
+                com = '~atlpan/phonebook --firstname "%s" --all' \
+                      % extractedDN.split()[0]
+            else:
+                # try firstname with wildcard
+                com = '~atlpan/phonebook --firstname "*%s*" --all' \
+                      % extractedDN.split()[0]
+        elif sTry == 'firstonly_rev':
+            if len(extractedDN.split()) == 2:
+                # try firstname only
+                com = '~atlpan/phonebook --firstname "%s" --all' \
+                      % extractedDN.split()[-1]
+            else:
+                # try firstname with wildcard
+                com = '~atlpan/phonebook --firstname "*%s*" --all' \
+                      % extractedDN.split()[-1]
+        elif sTry == 'email':
+            # try email
+            mailPatt = re.sub(' +','*',extractedDN)
+            com = '~atlpan/phonebook --email "*%s*" --all' \
+                  % mailPatt
+        _logger.debug(com)
+        # execute
+        sStat,sOut = commands.getstatusoutput(com)
+        _logger.debug(sOut)
+        # failed
+        if sStat != 0:
+            _logger.debug('phonebook failed with %s' % sStat)
+            return []
+        # extract email
+        emails = []
+        groups = []
+        dnames = []
+        for line in sOut.split('\n'):
+            if line.startswith('E-mail:'):
+                # append
+                tmpStr = line.split()[-1]
+                emails.append(tmpStr)
+            elif line.startswith('Group:'):
+                # append
+                tmpStr = line.split()[-1]
+                groups.append(tmpStr)
+            elif line.startswith('Display Name:'):
+                # append
+                tmpStr = re.sub('^[^:]+:','',line).strip()
+                dnames.append(tmpStr)
+        # check groups
+        newGroups = []
+        newEmails = []
+        newDNames = []
+        for idx,group in enumerate(groups):
+            if group.startswith('A') or group in ['UAT','GS','-']:
+                newGroups.append(group)
+                newEmails.append(emails[idx])
+                newDNames.append(dnames[idx])
+        # replace
+        groups = newGroups
+        emails = newEmails
+        dnames = newDNames
+        # check dname
+        if len(emails) > 1 and len(emails) == len(dnames):
+            newGroups = []
+            newEmails = []
+            newDNames = []
+            newGroupsWC = []
+            newEmailsWC = []
+            newDNamesWC = []
+            for idx,dname in enumerate(dnames):
+                # check fragments
+                nameItems = extractedDN.split()
+                nMatch = 0
+                nMatchWC = 0
+                for nameItem in nameItems:
+                    # check w/o wildcard
+                    if re.search(nameItem,dname,re.I) != None:
+                        nMatch += 1
+                    # check with wildcard
+                    if re.search(insertWC(nameItem),dname,re.I) != None:
+                        nMatchWC += 1
+                # append if totally matched or partially matched ignoring middle-name etc
+                if len(nameItems) == nMatch or (len(nameItems) > 2 and (len(nameItems)-nMatch) < 2):
+                    newGroups.append(groups[idx])
+                    newEmails.append(emails[idx])
+                    newDNames.append(dname)
+                # append if matched with wildcard
+                if len(nameItems) == nMatchWC or (len(nameItems) > 2 and (len(nameItems)-nMatchWC) < 2):
+                    newGroupsWC.append(groups[idx])
+                    newEmailsWC.append(emails[idx])
+                    newDNamesWC.append(dname)
+            # replace
+            if len(newGroups)>0:
+                # use strict matching
+                groups = newGroups
+                emails = newEmails
+                dnames = newDNames
+            else:
+                # use loose matching
+                groups = newGroupsWC
+                emails = newEmailsWC
+                dnames = newDNamesWC
+        _logger.debug('emails=%s' % str(emails))
+        # return
+        if len(emails) == 1:
+            _logger.debug('Succeeded %s %s' % (groups[0],emails[0]))
+            return emails
+    # failed
+    _logger.error('Failed for %s' % dn)
+    return []
+
+
+# get email address using xwho
+def getEmailXwho(dn):
+    # get email from CERN/xwho
+    _logger.debug('Getting email via xwho for %s' % dn)
+    for sTry in ['full','firstlastonly']:
+        try:
+            # remove middle name
+            encodedDN = cleanName(dn)
+            encodedDN = re.sub(' . ',' ',encodedDN)
+            # remove _
+            encodedDN = encodedDN.replace('_',' ')
+            # use first and last names only
+            if sTry == 'firstlastonly':
+                newEncodedDN = '%s %s' % (encodedDN.split()[0],encodedDN.split()[-1])
+                # skip if it was already tried
+                if encodedDN == newEncodedDN:
+                    continue
+                encodedDN = newEncodedDN
+            # URL encode
+            encodedDN = encodedDN.replace(' ','%20')
+            url = 'http://consult.cern.ch/xwho?'+encodedDN
+            if panda_config.httpProxy != '':
+                proxies = proxies={'http': panda_config.httpProxy}
+            else:
+                proxies = proxies={}
+            opener = urllib.FancyURLopener(proxies)
+            fd=opener.open(url)
+            data = fd.read()
+            if re.search(' not found',data,re.I) == None:
+                break
+        except:
+            type, value, traceBack = sys.exc_info()
+            _logger.error("xwho failure with %s %s" % (type,value))
+            return []
+    # parse HTML
+    emails = []
+    headerItem = ["Family Name","First Name","Phone","Dep"]
+    findTable = False
+    _logger.debug(data)
+    for line in data.split('\n'):
+        # look for table
+        if not findTable:
+            # look for header
+            tmpFlag = True
+            for item in headerItem:
+                if re.search(item,line) == None:
+                    tmpFlag = False
+                    break
+            findTable = tmpFlag
+            continue
+        else:
+            # end of table
+            if re.search(item,"") != None:
+                findTable = False
+                continue
+            # look for link to individual page
+            match = re.search('href="(/xwho/people/\d+)"',line)
+            if match == None:
+                continue
+            link = match.group(1)
+            try:
+                url = 'http://consult.cern.ch'+link
+                if panda_config.httpProxy != '':
+                    proxies = proxies={'http': panda_config.httpProxy}
+                else:
+                    proxies = proxies={}
+                opener = urllib.FancyURLopener(proxies)
+                fd=opener.open(url)
+                data = fd.read()
+                _logger.debug(data)
+            except:
+                type, value, traceBack = sys.exc_info()
+                _logger.error("xwho failure with %s %s" % (type,value))
+                return []
+            # get mail address
+            match = re.search("mailto:([^@]+@[^>]+)>",data)
+            if match != None:
+                adder = match.group(1)
+                # check NG words
+                okAddr = True
+                for ngWord in _ngWordsInMailAddr:
+                    if re.search(ngWord,adder,re.I):
+                        _logger.error("%s has NG word:%s" % (adder,ngWord))
+                        okAddr = False
+                        break
+                if okAddr and (not adder in emails):
+                    emails.append(adder)
+    _logger.debug("emails from xwho : '%s'" % emails)
+    # return
+    if len(emails) == 1:
+        _logger.debug('Succeeded : %s %s' % (str(emails),dn))
+        return emails
+    # multiple candidates
+    if len(emails) > 1:
+        _logger.error("non unique address : %s for %s" % (str(emails),dn))
+        return []
+    # failed
+    _logger.error('Failed to find address for %s' % dn)
+    return []
+
+
+
+
+
diff --git a/current/pandaserver/dataservice/Closer.py b/current/pandaserver/dataservice/Closer.py
new file mode 100755
index 000000000..8301945d3
--- /dev/null
+++ b/current/pandaserver/dataservice/Closer.py
@@ -0,0 +1,290 @@
+'''
+update dataset DB, and then close dataset and start Activator if needed
+
+'''
+
+import re
+import sys
+import time
+import urllib
+import commands
+import threading
+from DDM import ddm
+import Notifier
+import RetryMaker
+from Activator import Activator
+from pandalogger.PandaLogger import PandaLogger
+from taskbuffer.JobSpec import JobSpec
+from taskbuffer.FileSpec import FileSpec
+from taskbuffer.DatasetSpec import DatasetSpec
+from brokerage.SiteMapper import SiteMapper
+from config import panda_config
+import brokerage.broker_util
+
+# logger
+_logger = PandaLogger().getLogger('Closer')
+
+def initLogger(pLogger):
+    # redirect logging to parent as it doesn't work in nested threads
+    global _logger
+    _logger = pLogger
+    Notifier.initLogger(_logger)
+    RetryMaker.initLogger(_logger)
+
+
+class 
Closer (threading.Thread): + # constructor + def __init__(self,taskBuffer,destinationDBlocks,job,pandaDDM=False,datasetMap={}): + threading.Thread.__init__(self) + self.taskBuffer = taskBuffer + self.destinationDBlocks = destinationDBlocks + self.job = job + self.pandaID = job.PandaID + self.pandaDDM = pandaDDM + self.siteMapper = None + self.datasetMap = datasetMap + + + # main + def run(self): + try: + _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus)) + flagComplete = True + ddmJobs = [] + topUserDsList = [] + usingMerger = False + disableNotifier = False + firstIndvDS = True + for destinationDBlock in self.destinationDBlocks: + dsList = [] + _logger.debug('%s start %s' % (self.pandaID,destinationDBlock)) + # ignore tid datasets + if re.search('_tid[\d_]+$',destinationDBlock): + _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock)) + continue + # query dataset + if self.datasetMap.has_key(destinationDBlock): + dataset = self.datasetMap[destinationDBlock] + else: + dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock}) + if dataset == None: + _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock)) + flagComplete = False + continue + # skip tobedeleted/tobeclosed + if dataset.status in ['cleanup','tobeclosed','completed']: + _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status)) + continue + dsList.append(dataset) + # sort + dsList.sort() + # count number of completed files + notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock, + 'status':'unknown'}) + if notFinish < 0: + _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish)) + flagComplete = False + continue + # check if completed + _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish)) + if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']: + # close non-DQ2 destinationDBlock immediately + finalStatus = 'closed' + elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \ + and self.job.processingType != 'usermerge': + # merge output files + if firstIndvDS: + # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS + finalStatus = 'tobemerged' + firstIndvDS = False + else: + finalStatus = 'tobeclosed' + # set merging to top dataset + usingMerger = True + # disable Notifier + disableNotifier = True + else: + # set status to 'tobeclosed' to trigger DQ2 closing + finalStatus = 'tobeclosed' + if notFinish==0: + _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock)) + # set status + dataset.status = finalStatus + # update dataset in DB + retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ", + criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'}) + if len(retT) > 0 and retT[0]==1: + # close user datasets + if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \ + and (dataset.name.startswith('user') or dataset.name.startswith('group')): + # get top-level user dataset + topUserDsName = re.sub('_sub\d+$','',dataset.name) + # update if it is the first attempt + if topUserDsName != dataset.name and not topUserDsName in topUserDsList: + topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName}) + if topUserDs != None: + # check status + if topUserDs.status in ['completed','cleanup','tobeclosed', + 'tobemerged','merging']: + _logger.debug('%s skip %s due 
to status=%s' % (self.pandaID,topUserDsName,topUserDs.status)) + else: + # set status + if self.job.processingType.startswith('gangarobot') or \ + self.job.processingType.startswith('hammercloud'): + # not trigger freezing for HC datasets so that files can be appended + topUserDs.status = 'completed' + elif not usingMerger: + topUserDs.status = finalStatus + else: + topUserDs.status = 'merging' + # append to avoid repetition + topUserDsList.append(topUserDsName) + # update DB + retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus", + criteriaMap={':crStatus':topUserDs.status}) + if len(retTopT) > 0 and retTopT[0]==1: + _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName)) + else: + _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName)) + # get parent dataset for merge job + if self.job.processingType == 'usermerge': + tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters) + if tmpMatch == None: + _logger.error('%s failed to extract parentDS' % self.pandaID) + else: + unmergedDsName = tmpMatch.group(1) + # update if it is the first attempt + if not unmergedDsName in topUserDsList: + unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName}) + if unmergedDs == None: + _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName)) + else: + # check status + if unmergedDs.status in ['completed','cleanup','tobeclosed']: + _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status)) + else: + # set status + unmergedDs.status = finalStatus + # append to avoid repetition + topUserDsList.append(unmergedDsName) + # update DB + retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus", + criteriaMap={':crStatus':unmergedDs.status}) + if len(retTopT) > 0 and retTopT[0]==1: + _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName)) + else: + _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName)) + if self.pandaDDM and self.job.prodSourceLabel=='managed': + # instantiate SiteMapper + if self.siteMapper == None: + self.siteMapper = SiteMapper(self.taskBuffer) + # get file list for PandaDDM + retList = self.taskBuffer.queryFilesWithMap({'destinationDBlock':destinationDBlock}) + lfnsStr = '' + guidStr = '' + for tmpFile in retList: + if tmpFile.type in ['log','output']: + lfnsStr += '%s,' % tmpFile.lfn + guidStr += '%s,' % tmpFile.GUID + if lfnsStr != '': + guidStr = guidStr[:-1] + lfnsStr = lfnsStr[:-1] + # create a DDM job + ddmjob = JobSpec() + ddmjob.jobDefinitionID = int(time.time()) % 10000 + ddmjob.jobName = "%s" % commands.getoutput('uuidgen') + ddmjob.transformation = 'http://pandaserver.cern.ch:25080/trf/mover/run_dq2_cr' + ddmjob.destinationDBlock = 'testpanda.%s' % ddmjob.jobName + ddmjob.computingSite = "BNL_ATLAS_DDM" + ddmjob.destinationSE = ddmjob.computingSite + ddmjob.currentPriority = 200000 + ddmjob.prodSourceLabel = 'ddm' + ddmjob.transferType = 'sub' + # append log file + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % ddmjob.jobName + fileOL.destinationDBlock = ddmjob.destinationDBlock + fileOL.destinationSE = ddmjob.destinationSE + fileOL.dataset = ddmjob.destinationDBlock + fileOL.type = 'log' + ddmjob.addFile(fileOL) + # make arguments + dstDQ2ID = 'BNLPANDA' + srcDQ2ID = self.siteMapper.getSite(self.job.computingSite).ddm + callBackURL = 
'https://%s:%s/server/panda/datasetCompleted?vuid=%s&site=%s' % \ + (panda_config.pserverhost,panda_config.pserverport, + dataset.vuid,dstDQ2ID) + _logger.debug(callBackURL) + # set src/dest + ddmjob.sourceSite = srcDQ2ID + ddmjob.destinationSite = dstDQ2ID + # if src==dst, send callback without ddm job + if dstDQ2ID == srcDQ2ID: + comout = commands.getoutput('curl -k %s' % callBackURL) + _logger.debug(comout) + else: + # run dq2_cr + callBackURL = urllib.quote(callBackURL) + # get destination dir + destDir = brokerage.broker_util._getDefaultStorage(self.siteMapper.getSite(self.job.computingSite).dq2url) + argStr = "-s %s -r %s --guids %s --lfns %s --callBack %s -d %s/%s %s" % \ + (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,callBackURL,destDir, + destinationDBlock,destinationDBlock) + # set job parameters + ddmjob.jobParameters = argStr + _logger.debug('%s pdq2_cr %s' % (self.pandaID,ddmjob.jobParameters)) + ddmJobs.append(ddmjob) + # start Activator + if re.search('_sub\d+$',dataset.name) == None: + if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']: + # don't trigger Activator for merge jobs + pass + else: + if self.job.jobStatus == 'finished': + aThr = Activator(self.taskBuffer,dataset) + aThr.start() + aThr.join() + else: + # unset flag since another thread already updated + flagComplete = False + else: + # update dataset in DB + self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ", + criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'}) + # unset flag + flagComplete = False + # end + _logger.debug('%s end %s' % (self.pandaID,destinationDBlock)) + # start DDM jobs + if ddmJobs != []: + self.taskBuffer.storeJobs(ddmJobs,self.job.prodUserID,joinThr=True) + # change pending jobs to failed + if flagComplete and self.job.prodSourceLabel=='user': + #_logger.debug('%s call RetryMaker for %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID)) + #retryMaker = RetryMaker.RetryMaker(self.taskBuffer,self.job) + #retryMaker.run() + _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID)) + self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID) + # start notifier + _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete)) + if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \ + (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')): + # don't send email for merge jobs + if (not disableNotifier) and not self.job.processingType in ['merge','unmerge']: + useNotifier = True + summaryInfo = {} + # check all jobDefIDs in jobsetID + if not self.job.jobsetID in [0,None,'NULL']: + useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID, + self.job.prodUserName) + _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier)) + if useNotifier: + _logger.debug('%s start Notifier' % self.pandaID) + nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo) + nThr.run() + _logger.debug('%s end Notifier' % self.pandaID) + _logger.debug('%s End' % self.pandaID) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s %s" % (errType,errValue)) + diff --git a/current/pandaserver/dataservice/DDM.py b/current/pandaserver/dataservice/DDM.py new file mode 100755 index 000000000..5888a36b3 --- /dev/null +++ 
b/current/pandaserver/dataservice/DDM.py @@ -0,0 +1,344 @@ +""" +provide primitive methods for DDM + +""" + +import sys +import types +import commands +from config import panda_config + + +# change cwd +_cwd = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) + +# environment variables +_env = 'PATH=%s:%s:$PATH ' % (panda_config.native_python,panda_config.globus_dir+'/bin') +_env+= 'LD_LIBRARY_PATH=%s ' % (panda_config.globus_dir+'/lib') +_env+= 'DQ2_HOME=%s/opt/dq2 ' % panda_config.dq2_dir +_env+= 'http_proxy=%s ' % panda_config.httpProxy +_env+= 'https_proxy=%s ' % panda_config.httpProxy + +_env+= 'PYTHONPATH=%s/usr/lib/python2.3/site-packages:$PYTHONPATH' \ + % panda_config.dq2_dir + +# method object wrapping DQ2 method +class _DQMethod: + # constructor + def __init__(self,moduleName,methodName): + self.moduleName = moduleName + self.methodName = methodName + + # method emulation + def __call__(self,*args,**kwargs): + # main method has disappeared since 0.3 + args = list(args) + if self.methodName == 'main': + self.methodName = args[0] + args.pop(0) + # build command + com = 'import dq2.clientapi.cli.cliutil; ' + #com += 'import sys; sys.tracebacklimit=0; ' + com += 'dq2api = dq2.clientapi.cli.cliutil.getDQ2(None); ' + if self.moduleName == 'DQ2': + # DQ2 is top-level module + com += 'print dq2api.%s(' % self.methodName + elif self.moduleName == 'DQ2_iter': + # iterator + com += 'iter = dq2api.%s(' % self.methodName + else: + com += 'print dq2api.%s.%s(' % (self.moduleName,self.methodName) + # expand args + for i in range(len(args)): + arg = args[i] + if isinstance(arg,types.StringType): + # check invalid characters + for invCh in ['"',"'",'(',')',';']: + if invCh in arg: + return -1,"invalid character %s in %s" % (invCh,arg) + com = "%s'%s'," % (com,arg) + else: + com = '%s%s,' % (com,str(arg)) + for tmpK,tmpV in kwargs.iteritems(): + if isinstance(tmpV,types.StringType): + com += "%s='%s'," % (tmpK,tmpV) + else: + com += "%s=%s," % (tmpK,tmpV) + com = com[:-1] + com += ")" + # loop over iterator + if self.moduleName == 'DQ2_iter': + com += ";exec 'for item in iter:print item'" + # execute + return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com)) + + +# DQ module class +class _DQModule: + # constructor + def __init__(self,moduleName): + self.moduleName = moduleName + + # factory method + def __getattr__(self,methodName): + return _DQMethod(self.moduleName,methodName) + + +# native DQ2 method class +class NativeDQ2Method: + # constructor + def __init__(self): + self.moduleName = None + self.methodName = None + # set module and method name + def setNames(self,moduleName,methodName): + self.moduleName = moduleName + self.methodName = methodName + # method emulation + def __call__(self,*args,**kwargs): + try: + # make dq2api locally since global dq2 object is not thread-safe + import dq2.clientapi.cli.cliutil + dq2api = dq2.clientapi.cli.cliutil.getDQ2(None) + # main method has disappeared since 0.3 + args = list(args) + if self.methodName == 'main': + self.methodName = args[0] + args.pop(0) + # get method object + if self.moduleName in ['DQ2','DQ2_iter']: + methodObj = getattr(dq2api,self.methodName) + else: + methodObj = getattr(getattr(dq2api,self.moduleName),self.methodName) + # execute + retVal = apply(methodObj,args,kwargs) + # loop over for iterator + if self.moduleName == 'DQ2_iter': + strRet = '' + for item in retVal: + strRet += str(item) + else: + strRet = str(retVal) + # return + return 0,strRet + except: + 
errType,errVale = sys.exc_info()[:2] + return 1,'%s %s' % (errType,errVale) + + + +# native DQ2 module class +class NativeDQ2Module: + # constructor + def __init__(self): + self.moduleName = None + # set module name + def setModName(self,moduleName): + self.moduleName = moduleName + # getter + def __getattr__(self,methodName): + # set method name + api = NativeDQ2Method() + api.setNames(self.moduleName,methodName) + return api + + +# factory class +class DDM: + # constructor + def __init__(self): + self.usingNativeDQ2 = False + # switch to use DQ2 in the same session + def useDirectDQ2(self): + self.usingNativeDQ2 = True + # getter + def __getattr__(self,moduleName): + if not self.usingNativeDQ2: + # run dq2 comamnd in another session + return _DQModule(moduleName) + else: + # run dq2 command in the same session + nativeDQ2 = NativeDQ2Module() + nativeDQ2.setModName(moduleName) + return nativeDQ2 + +# instantiate +ddm = DDM() +del DDM + + +# method object wrapping TOA method +class _TOAMethod: + # constructor + def __init__(self,methodName): + self.methodName = methodName + + # method emulation + def __call__(self,*args): + args = list(args) + # build command + com = 'from dq2.info import TiersOfATLAS; ' + com += 'print TiersOfATLAS.%s(' % self.methodName + # expand args + for i in range(len(args)): + arg = args[i] + if isinstance(arg,types.StringType): + com += "'%s'," % arg + else: + com = '%s,' % arg + com = com[:-1] + com += ")" + # execute + return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com)) + + +# native ToA method class +class NativeTOAMethod: + # constructor + def __init__(self): + self.methodName = None + from dq2.info import TiersOfATLAS + self.api = TiersOfATLAS + # set method name + def setName(self,methodName): + self.methodName = methodName + # method emulation + def __call__(self,*args,**kwargs): + try: + methodObj = getattr(self.api,self.methodName) + # execute + retVal = apply(methodObj,args,kwargs) + strRet = str(retVal) + # return + return 0,strRet + except: + errType,errVale = sys.exc_info()[:2] + return 1,'%s %s' % (errType,errVale) + + +# TOA module class +class TOA: + # constructor + def __init__(self): + self.usingNativeDQ2 = False + self.nativeTOA = None + # getter + def __getattr__(self,methodName): + if not ddm.usingNativeDQ2: + # run dq2 comamnd in another session + return _TOAMethod(methodName) + else: + # make method object + if self.nativeTOA == None: + self.nativeTOA = NativeTOAMethod() + # run dq2 command in the same session + self.nativeTOA.setName(methodName) + return self.nativeTOA + + + +# instantiate +toa = TOA() +del TOA + + +# method object wrapping Dashboard method +class _DashBoradMethod: + # constructor + def __init__(self,methodName): + self.methodName = methodName + + # method emulation + def __call__(self,*args): + args = list(args) + # build command + com = "import sys;sys.stderr=open('/dev/null','w');" + com += "import datetime;from dashboard.api.data.DataQuery import DataQuery;" + com += "sys.stderr=sys.__stderr__;" + com += "dash=DataQuery('dashb-atlas-data.cern.ch', 80);" + com += "print dash.%s(%s,'%s'," % (self.methodName,args[0],args[1]) + com += "startDate=datetime.datetime.utcnow()-datetime.timedelta(hours=24))" + # execute + return commands.getstatusoutput('%s python -c "%s"' % (_cwd,com)) + + +# TOA module class +class DashBorad: + def __getattr__(self,methodName): + return _DashBoradMethod(methodName) + +# instantiate +dashBorad = DashBorad() +del DashBorad + + +# method object wrapping DQ2Info method 
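+# (as with the DQ2, TOA and Dashboard wrappers above, any call dispatched
+#  through the dq2Info instance defined below returns a (status, output-string)
+#  pair in the commands.getstatusoutput style; a minimal illustrative call is
+#      status, out = dq2Info.someInfoMethod('arg')
+#  where someInfoMethod and 'arg' are placeholders resolved via __getattr__)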
+class _DQ2InfoMethod: + # constructor + def __init__(self,methodName): + self.methodName = methodName + + # method emulation + def __call__(self,*args): + args = list(args) + # build command + com = 'from dq2.info.client.infoClient import infoClient; ' + com += 'print infoClient().%s(' % self.methodName + # expand args + for i in range(len(args)): + arg = args[i] + if isinstance(arg,types.StringType): + com += "'%s'," % arg + else: + com = '%s,' % arg + com = com[:-1] + com += ")" + # execute + return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com)) + + +# TOA module class +class DQ2Info: + def __getattr__(self,methodName): + return _DQ2InfoMethod(methodName) + + +# instantiate +dq2Info = DQ2Info() +del DQ2Info + + +# method object wrapping dq2 common +class _DQ2CommonMethod: + # constructor + def __init__(self,methodName): + self.methodName = methodName + + # method emulation + def __call__(self,*args): + args = list(args) + # build command + com = 'from dq2.common import %s; ' % self.methodName + com += 'print %s(' % self.methodName + # expand args + for i in range(len(args)): + arg = args[i] + if isinstance(arg,types.StringType): + com += "'%s'," % arg + else: + com = '%s,' % arg + com = com[:-1] + com += ")" + # execute + return commands.getstatusoutput('%s env %s python -c "%s"' % (_cwd,_env,com)) + + +# TOA module class +class DQ2Common: + def __getattr__(self,methodName): + return _DQ2CommonMethod(methodName) + + +# instantiate +dq2Common = DQ2Common() +del DQ2Common diff --git a/current/pandaserver/dataservice/DDMHandler.py b/current/pandaserver/dataservice/DDMHandler.py new file mode 100755 index 000000000..165738c8e --- /dev/null +++ b/current/pandaserver/dataservice/DDMHandler.py @@ -0,0 +1,48 @@ +''' +master hander for DDM + +''' + +import re +import threading + +from Waker import Waker +from Finisher import Finisher +from Activator import Activator + +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('DDMHandler') + + +class DDMHandler (threading.Thread): + # constructor + def __init__(self,taskBuffer,vuid,site=None): + threading.Thread.__init__(self) + self.vuid = vuid + self.taskBuffer = taskBuffer + self.site = site + + + # main + def run(self): + # query dataset + _logger.debug("start: %s %s" % (self.vuid,self.site)) + dataset = self.taskBuffer.queryDatasetWithMap({'vuid':self.vuid}) + if dataset == None: + _logger.error("Not found : %s" % self.vuid) + _logger.debug("end: %s" % self.vuid) + return + _logger.debug("vuid:%s type:%s name:%s" % (self.vuid,dataset.type,dataset.name)) + if dataset.type == 'dispatch': + # activate jobs in jobsDefined + Activator(self.taskBuffer,dataset).start() + if dataset.type == 'output': + if dataset.name != None and re.search('^panda\..*_zip$',dataset.name) != None: + # start unmerge jobs + Activator(self.taskBuffer,dataset,enforce=True).start() + else: + # finish transferring jobs + Finisher(self.taskBuffer,dataset,site=self.site).start() + _logger.debug("end: %s" % self.vuid) diff --git a/current/pandaserver/dataservice/DataService.py b/current/pandaserver/dataservice/DataService.py new file mode 100755 index 000000000..540987e1a --- /dev/null +++ b/current/pandaserver/dataservice/DataService.py @@ -0,0 +1,99 @@ +""" +provide web service for DDM + +""" + +import re +import sys +import cPickle as pickle +from config import panda_config +from taskbuffer.WrappedPickle import WrappedPickle +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = 
PandaLogger().getLogger('DataService') + + +class DataService: + # constructor + def __init__(self): + self.taskBuffer = None + + # set taskbuffer + def init(self,taskBuffer): + self.taskBuffer = taskBuffer + +# Singleton +dataService = DataService() +del DataService + + +''' +web interface + +''' + +from DDMHandler import DDMHandler + + +# callback for dataset verification +def datasetCompleted(req,vuid,site=None): + thr = DDMHandler(dataService.taskBuffer,vuid,site) + thr.start() + thr.join() + return True + + +# get FQANs +def _getFQAN(req): + fqans = [] + for tmpKey,tmpVal in req.subprocess_env.iteritems(): + # compact credentials + if tmpKey.startswith('GRST_CRED_'): + # VOMS attribute + if tmpVal.startswith('VOMS'): + # FQAN + fqan = tmpVal.split()[-1] + # append + fqans.append(fqan) + # old style + elif tmpKey.startswith('GRST_CONN_'): + tmpItems = tmpVal.split(':') + # FQAN + if len(tmpItems)==2 and tmpItems[0]=='fqan': + fqans.append(tmpItems[-1]) + # return + return fqans + + +# set file status +def updateFileStatusInDisp(req,dataset,fileStatus): + try: + # get FQAN + fqans = _getFQAN(req) + roleOK = False + # loop over all FQANs + for fqan in fqans: + # check production role + for rolePat in ['/atlas/usatlas/Role=production', + '/atlas/Role=production', + # use /atlas since delegation proxy doesn't inherit roles + '/atlas/']: + if fqan.startswith(rolePat): + roleOK = True + break + if not roleOK: + _logger.error('updateFileStatusInDisp : invalid proxy %s' % fqans) + return "False" + # deserialize fileStatus + fileStatusMap = WrappedPickle.loads(fileStatus) + _logger.debug('updateFileStatusInDisp : start %s - %s' % (dataset,fileStatusMap)) + # update status + dataService.taskBuffer.updateFileStatusInDisp(dataset,fileStatusMap) + _logger.debug('updateFileStatusInDisp : done') + return "True" + except: + type,value,traceBack = sys.exc_info() + _logger.error("updateFileStatusInDisp : %s %s" % (type,value)) + return "False" + diff --git a/current/pandaserver/dataservice/DataServiceUtils.py b/current/pandaserver/dataservice/DataServiceUtils.py new file mode 100644 index 000000000..0e4093cbb --- /dev/null +++ b/current/pandaserver/dataservice/DataServiceUtils.py @@ -0,0 +1,281 @@ +import re +import sys + +# get prefix for DQ2 +def getDQ2Prefix(dq2SiteID): + # prefix of DQ2 ID + tmpDQ2IDPrefix = re.sub('_[A-Z,0-9]+DISK$','',dq2SiteID) + # remove whitespace + tmpDQ2IDPrefix = tmpDQ2IDPrefix.strip() + # patchfor MWT2 + if tmpDQ2IDPrefix == 'MWT2_UC': + tmpDQ2IDPrefix = 'MWT2' + return tmpDQ2IDPrefix + + +# check if the file is cached +def isCachedFile(datasetName,siteSpec): + # using CVMFS + if siteSpec.iscvmfs != True: + return False + # FIXME + if not siteSpec.cloud in ['IT']: + return False + # look for DBR + if not datasetName.startswith('ddo.'): + return False + # look for three digits + if re.search('v\d{6}$',datasetName) == None: + return False + return True + + +# get the list of sites where dataset is available +def getSitesWithDataset(tmpDsName,siteMapper,replicaMap,cloudKey,useHomeCloud=False,getDQ2ID=False, + useOnlineSite=False,includeT1=False): + retList = [] + retDQ2Map = {} + # no replica map + if not replicaMap.has_key(tmpDsName): + if getDQ2ID: + return retDQ2Map + return retList + # use valid cloud + if not siteMapper.checkCloud(cloudKey): + if getDQ2ID: + return retDQ2Map + return retList + # check sites in the cloud + for tmpSiteName in siteMapper.getCloud(cloudKey)['sites']: + # skip T1 + if not includeT1: + # T1 + if tmpSiteName == 
siteMapper.getCloud(cloudKey)['source']: + continue + # hospital queue + if siteMapper.getSite(tmpSiteName).ddm == siteMapper.getSite(siteMapper.getCloud(cloudKey)['source']).ddm: + continue + # use home cloud + if useHomeCloud: + if siteMapper.getSite(tmpSiteName).cloud != cloudKey: + continue + # online + if siteMapper.getSite(tmpSiteName).status != 'online': + continue + # check all associated DQ2 IDs + tmpFoundFlag = False + tmpSiteSpec = siteMapper.getSite(tmpSiteName) + for tmpSiteDQ2ID in [tmpSiteSpec.ddm]+tmpSiteSpec.setokens.values(): + # prefix of DQ2 ID + tmpDQ2IDPrefix = getDQ2Prefix(tmpSiteDQ2ID) + # ignore empty + if tmpDQ2IDPrefix == '': + continue + # loop over all replica DQ2 IDs + for tmpDQ2ID in replicaMap[tmpDsName].keys(): + # use DATADISK or GROUPDISK + if '_SCRATCHDISK' in tmpDQ2ID or \ + '_USERDISK' in tmpDQ2ID or \ + '_PRODDISK' in tmpDQ2ID or \ + '_LOCALGROUPDISK' in tmpDQ2ID or \ + 'TAPE' in tmpDQ2ID or \ + '_DAQ' in tmpDQ2ID or \ + '_TMPDISK' in tmpDQ2ID or \ + '_TZERO' in tmpDQ2ID: + continue + # check DQ2 prefix + if tmpDQ2ID.startswith(tmpDQ2IDPrefix): + tmpFoundFlag = True + if not getDQ2ID: + break + # append map + if not retDQ2Map.has_key(tmpSiteName): + retDQ2Map[tmpSiteName] = [] + if not tmpDQ2ID in retDQ2Map[tmpSiteName]: + retDQ2Map[tmpSiteName].append(tmpDQ2ID) + # append + if tmpFoundFlag: + retList.append(tmpSiteName) + # return map + if getDQ2ID: + return retDQ2Map + # retrun + return retList + + +# get the number of files available at the site +def getNumAvailableFilesSite(siteName,siteMapper,replicaMap,badMetaMap,additionalSEs=[], + noCheck=[],fileCounts=None): + try: + # get DQ2 endpoints + tmpSiteSpec = siteMapper.getSite(siteName) + prefixList = [] + for tmpSiteDQ2ID in [tmpSiteSpec.ddm]+tmpSiteSpec.setokens.values(): + # prefix of DQ2 ID + tmpDQ2IDPrefix = getDQ2Prefix(tmpSiteDQ2ID) + # ignore empty + if tmpDQ2IDPrefix != '': + prefixList.append(tmpDQ2IDPrefix) + # loop over datasets + totalNum = 0 + for tmpDsName,tmpSitesData in replicaMap.iteritems(): + # cached files + if isCachedFile(tmpDsName,tmpSiteSpec) and fileCounts != None and \ + fileCounts.has_key(tmpDsName): + # add with no check + totalNum += fileCounts[tmpDsName] + continue + # dataset type + datasetType = getDatasetType(tmpDsName) + # use total num to effectively skip file availability check + if datasetType in noCheck: + columnName = 'total' + else: + columnName = 'found' + # get num of files + maxNumFile = 0 + # for T1 or T2 + if additionalSEs != []: + # check T1 endpoints + for tmpSePat in additionalSEs: + # ignore empty + if tmpSePat == '': + continue + # make regexp pattern + if '*' in tmpSePat: + tmpSePat = tmpSePat.replace('*','.*') + tmpSePat = '^' + tmpSePat +'$' + # loop over all sites + for tmpSE in tmpSitesData.keys(): + # skip bad metadata + if badMetaMap.has_key(tmpDsName) and tmpSE in badMetaMap[tmpDsName]: + continue + # check match + if re.search(tmpSePat,tmpSE) == None: + continue + # get max num of files + tmpN = tmpSitesData[tmpSE][0][columnName] + if tmpN != None and tmpN > maxNumFile: + maxNumFile = tmpN + else: + # check explicit endpoint name + for tmpSiteDQ2ID in [tmpSiteSpec.ddm]+tmpSiteSpec.setokens.values(): + # skip bad metadata + if badMetaMap.has_key(tmpDsName) and tmpSiteDQ2ID in badMetaMap[tmpDsName]: + continue + # ignore empty + if tmpSiteDQ2ID == '': + continue + # get max num of files + if tmpSitesData.has_key(tmpSiteDQ2ID): + tmpN = tmpSitesData[tmpSiteDQ2ID][0][columnName] + if tmpN != None and tmpN > maxNumFile: + maxNumFile = tmpN + # 
check prefix + for tmpDQ2IDPrefix in prefixList: + for tmpDQ2ID,tmpStat in tmpSitesData.iteritems(): + # skip bad metadata + if badMetaMap.has_key(tmpDsName) and tmpDQ2ID in badMetaMap[tmpDsName]: + continue + # ignore NG + if '_SCRATCHDISK' in tmpDQ2ID or \ + '_USERDISK' in tmpDQ2ID or \ + '_PRODDISK' in tmpDQ2ID or \ + '_LOCALGROUPDISK' in tmpDQ2ID or \ + '_DAQ' in tmpDQ2ID or \ + '_TMPDISK' in tmpDQ2ID or \ + '_TZERO' in tmpDQ2ID: + continue + # check prefix + if tmpDQ2ID.startswith(tmpDQ2IDPrefix): + tmpN = tmpSitesData[tmpDQ2ID][0][columnName] + if tmpN != None and tmpN > maxNumFile: + maxNumFile = tmpN + # sum + totalNum += maxNumFile + # return + return True,totalNum + except: + errtype,errvalue = sys.exc_info()[:2] + return False,'%s:%s' % (errtype,errvalue) + + +# get the list of sites where dataset is available +def getEndpointsAtT1(tmpRepMap,siteMapper,cloudName): + retList = [] + # get cloud SEs + tmpCloud = siteMapper.getCloud(cloudName) + cloudSEs = tmpCloud['tier1SE'] + # check T1 endpoints + for tmpSePat in cloudSEs: + # ignore empty + if tmpSePat == '': + continue + # make regexp pattern + if '*' in tmpSePat: + tmpSePat = tmpSePat.replace('*','.*') + tmpSePat = '^' + tmpSePat +'$' + # loop over all sites + for tmpSE in tmpRepMap.keys(): + # check match + if re.search(tmpSePat,tmpSE) == None: + continue + # append + if not tmpSE in retList: + retList.append(tmpSE) + # return + return retList + + +# check DDM response +def isDQ2ok(out): + if out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1 \ + or out == '()': + return False + return True + + +# check if DBR +def isDBR(datasetName): + if datasetName.startswith('ddo'): + return True + return False + + +# get the list of sites in a cloud which cache a dataset +def getSitesWithCacheDS(cloudKey,excludedSites,siteMapper,datasetName): + retList = [] + # check sites in the cloud + for tmpSiteName in siteMapper.getCloud(cloudKey)['sites']: + # excluded + if tmpSiteName in excludedSites: + continue + # skip T1 + if tmpSiteName == siteMapper.getCloud(cloudKey)['source']: + continue + # hospital queue + if siteMapper.getSite(tmpSiteName).ddm == siteMapper.getSite(siteMapper.getCloud(cloudKey)['source']).ddm: + continue + # not home cloud + if siteMapper.getSite(tmpSiteName).cloud != cloudKey: + continue + # online + if siteMapper.getSite(tmpSiteName).status != 'online': + continue + # check CVMFS + if isCachedFile(datasetName,siteMapper.getSite(tmpSiteName)): + retList.append(tmpSiteName) + # return + return retList + + +# get dataset type +def getDatasetType(dataset): + datasetType = None + try: + datasetType = dataset.split('.')[4] + except: + pass + return datasetType diff --git a/current/pandaserver/dataservice/DynDataDistributer.py b/current/pandaserver/dataservice/DynDataDistributer.py new file mode 100644 index 000000000..8a808a54c --- /dev/null +++ b/current/pandaserver/dataservice/DynDataDistributer.py @@ -0,0 +1,1657 @@ +''' +find candidate site to distribute input datasets + +''' + +import re +import sys +import time +import math +import types +import random +import datetime + +from dataservice.DDM import ddm +from dataservice.DDM import toa +from taskbuffer.JobSpec import JobSpec +import brokerage.broker + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('DynDataDistributer') + +def initLogger(pLogger): + # redirect logging to 
parent + global _logger + _logger = pLogger + + +# NG datasets +ngDataTypes = ['RAW','HITS','RDO','ESD','EVNT'] + +# excluded provenance +ngProvenance = [] + +# protection for max number of replicas +protectionMaxNumReplicas = 10 + +# max number of waiting jobs +maxWaitingJobs = 200 + +# max number of waiting jobsets +maxWaitingJobsets = 2 + +# clouds with small T1 to make replica at T2 +cloudsWithSmallT1 = ['IT'] + +# files in datasets +g_filesInDsMap = {} + + +class DynDataDistributer: + + # constructor + def __init__(self,jobs,taskBuffer,siteMapper,simul=False,token=None): + self.jobs = jobs + self.taskBuffer = taskBuffer + self.siteMapper = siteMapper + if token == None: + self.token = datetime.datetime.utcnow().isoformat(' ') + else: + self.token = token + # use a fixed list since some clouds don't have active T2s + self.pd2pClouds = ['CA','DE','ES','FR','IT','ND','NL','TW','UK','US'] + self.simul = simul + self.lastMessage = '' + self.cachedSizeMap = {} + self.shareMoUForT2 = None + self.mapTAGandParentGUIDs = {} + self.tagParentInfo = {} + self.parentLfnToTagMap = {} + + + # main + def run(self): + try: + self.putLog("start for %s" % self.jobs[0].PandaID) + # check cloud + if not self.jobs[0].cloud in self.pd2pClouds+['CERN',]: + self.putLog("skip cloud=%s not one of PD2P clouds %s" % (self.jobs[0].cloud,str(self.pd2pClouds))) + self.putLog("end for %s" % self.jobs[0].PandaID) + return + # ignore HC and group production + if self.jobs[0].processingType in ['hammercloud','gangarobot'] or self.jobs[0].processingType.startswith('gangarobot'): + self.putLog("skip due to processingType=%s" % self.jobs[0].processingType) + self.putLog("end for %s" % self.jobs[0].PandaID) + return + # ignore HC and group production + if not self.jobs[0].workingGroup in ['NULL',None,'']: + self.putLog("skip due to workingGroup=%s" % self.jobs[0].workingGroup) + self.putLog("end for %s" % self.jobs[0].PandaID) + return + # get input datasets + inputDatasets = [] + for tmpJob in self.jobs: + if tmpJob.prodSourceLabel == 'user': + for tmpFile in tmpJob.Files: + if tmpFile.type == 'input' and not tmpFile.lfn.endswith('.lib.tgz'): + if not tmpFile.dataset in inputDatasets: + inputDatasets.append(tmpFile.dataset) + # loop over all input datasets + for inputDS in inputDatasets: + # only mc/data datasets + moveFlag = False + for projectName in ['mc','data']: + if inputDS.startswith(projectName): + moveFlag = True + if not moveFlag: + self.putLog("skip non official dataset %s" % inputDS) + continue + if re.search('_sub\d+$',inputDS) != None or re.search('_dis\d+$',inputDS) != None: + self.putLog("skip dis/sub dataset %s" % inputDS) + continue + # check type + tmpItems = inputDS.split('.') + if len(tmpItems) < 5: + self.putLog("cannot get type from %s" % inputDS) + continue + if tmpItems[4] in ngDataTypes: + self.putLog("don't move %s : %s" % (tmpItems[4],inputDS)) + continue + # get candidate sites + self.putLog("get candidates for %s" % inputDS) + status,sitesMaps = self.getCandidates(inputDS,useCloseSites=True) + if not status: + self.putLog("failed to get candidates") + continue + # get size of input container + totalInputSize = 0 + if inputDS.endswith('/'): + status,totalInputSize = self.getDatasetSize(inputDS) + if not status: + self.putLog("failed to get size of %s" % inputDS) + continue + # get number of waiting jobs and jobsets + nWaitingJobsAll = self.taskBuffer.getNumWaitingJobsForPD2P(inputDS) + nWaitingJobsets = self.taskBuffer.getNumWaitingJobsetsForPD2P(inputDS) + # loop over all datasets + usedSites = 
[] + for tmpDS,tmpVal in sitesMaps.iteritems(): + self.putLog("triggered for %s" % tmpDS,sendLog=True) + # increment used counter + if not self.simul: + nUsed = self.taskBuffer.incrementUsedCounterSubscription(tmpDS) + else: + nUsed = 5 + # insert dummy for new dataset which is used to keep track of usage even if subscription is not made + if nUsed == 0: + retAddUserSub = self.taskBuffer.addUserSubscription(tmpDS,['DUMMY']) + if not retAddUserSub: + self.putLog("failed to add dummy subscription to database for %s " % tmpDS,type='error',sendLog=True) + continue + # collect candidates + allCandidates = [] + totalUserSub = 0 + allCompPd2pSites = [] + allOKClouds = [] + totalSecReplicas = 0 + allT1Candidates = [] + totalT1Sub = 0 + cloudCandMap = {} + nReplicasInCloud = {} + allCandidatesMoU = [] + nTier1Copies = 0 + for tmpCloud,(candSites,sitesComDS,sitesPd2pDS,nUserSub,t1HasReplica,t1HasPrimary,nSecReplicas,nT1Sub,candForMoU) in tmpVal.iteritems(): + self.putLog("%s sites with comp DS:%s compPD2P:%s candidates:%s nSub:%s T1:%s Pri:%s nSec:%s nT1Sub:%s candMoU:%s" % \ + (tmpCloud,str(sitesComDS),str(sitesPd2pDS),str(candSites),nUserSub,t1HasReplica,t1HasPrimary, + nSecReplicas,nT1Sub,str(candForMoU))) + # add + totalUserSub += nUserSub + totalT1Sub += nT1Sub + allCompPd2pSites += sitesPd2pDS + totalSecReplicas += nSecReplicas + cloudCandMap[tmpCloud] = candSites + nReplicasInCloud[tmpCloud] = len(sitesComDS) + len(sitesPd2pDS) + # cloud is candidate for T1-T1 when T1 doesn't have primary or secondary replicas or old subscriptions + if not t1HasPrimary and nSecReplicas == 0 and nT1Sub == 0: + allT1Candidates.append(tmpCloud) + # the number of T1s with replica + if t1HasPrimary or nSecReplicas > 0: + nTier1Copies += 1 + # add candidates + for tmpCandSite in candSites: + if not tmpCandSite in usedSites: + allCandidates.append(tmpCandSite) + # add candidates for MoU + for tmpCandSite in candForMoU: + if not tmpCandSite in usedSites: + allCandidatesMoU.append(tmpCandSite) + # add clouds + if not tmpCloud in allOKClouds: + allOKClouds.append(tmpCloud) + self.putLog("PD2P sites with comp replicas : %s" % str(allCompPd2pSites)) + self.putLog("PD2P T2 candidates : %s" % str(allCandidates)) + self.putLog("PD2P T2 MoU candidates : %s" % str(allCandidatesMoU)) + self.putLog("PD2P # of T2 subscriptions : %s" % totalUserSub) + self.putLog("PD2P # of T1 secondaries : %s" % totalSecReplicas) + self.putLog("PD2P # of T1 subscriptions : %s" % nT1Sub) + self.putLog("PD2P # of T1 replicas : %s" % nTier1Copies) + self.putLog("PD2P T1 candidates : %s" % str(allT1Candidates)) + self.putLog("PD2P nUsed : %s" % nUsed) + # get dataset size + retDsSize,dsSize = self.getDatasetSize(tmpDS) + if not retDsSize: + self.putLog("failed to get dataset size of %s" % tmpDS,type='error',sendLog=True) + continue + self.putLog("PD2P nWaitingJobsets : %s" % nWaitingJobsets) + if totalInputSize != 0: + self.putLog("PD2P nWaitingJobs : %s = %s(all)*%s(dsSize)/%s(contSize)" % \ + (int((float(nWaitingJobsAll * dsSize) / float(totalInputSize))), + nWaitingJobsAll,dsSize,totalInputSize)) + else: + self.putLog("PD2P nWaitingJobs : %s = %s(all)" % \ + (nWaitingJobsAll,nWaitingJobsAll)) + # make T1-T1 + triggeredT1PD2P = False + if nUsed > 0: + # extract integer part. 
log10(nUsed) and log10(nUsed)+1 are used to avoid round-off error + intLog10nUsed = int(math.log10(nUsed)) + if self.simul or (int(math.log10(nUsed)) > totalSecReplicas and \ + (nUsed == 10**intLog10nUsed or nUsed == 10**(intLog10nUsed+1)) and \ + nT1Sub == 0 and allT1Candidates != []): + self.putLog("making T1-T1",sendLog=True) + # make subscription + retT1Sub,useSmallT1 = self.makeT1Subscription(allT1Candidates,tmpDS,dsSize,nUsed) + self.putLog("done for T1-T1") + triggeredT1PD2P = True + # make a T2 copy when T1 PD2P was triggered + if triggeredT1PD2P: + # TODO + retT2MoU,selectedSite = self.makeT2SubscriptionMoU(allCandidatesMoU,tmpDS,dsSize,'T1MOU',nUsed) + if retT2MoU and selectedSite != None: + # remove from candidate list + if selectedSite in allCandidates: + allCandidates.remove(selectedSite) + if selectedSite in allCandidatesMoU: + allCandidatesMoU.remove(selectedSite) + # increment the number of T2 subscriptions + totalUserSub += 1 + # set the number of T2 PD2P replicas + maxSitesHaveDS = 1 + # additional replicas + if nWaitingJobsets > maxWaitingJobsets: + # the number of waiting jobs for this dataset + if totalInputSize != 0: + # dataset in container + tmpN = float(nWaitingJobsAll * dsSize) / float(totalInputSize) + else: + # dataset + tmpN = float(nWaitingJobsAll) + tmpN = int(math.log10(tmpN/float(maxWaitingJobs))) + nTier1Copies + maxSitesHaveDS = max(maxSitesHaveDS,tmpN) + # protection against too many replications + maxSitesHaveDS = min(maxSitesHaveDS,protectionMaxNumReplicas) + self.putLog("PD2P maxSitesHaveDS : %s" % maxSitesHaveDS) + # ignore the first job + if nUsed == 0: + self.putLog("skip the first job", + sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'FIRSTJOB','dataset':tmpDS}) + if not self.simul: + continue + # check number of replicas + if len(allCompPd2pSites) >= maxSitesHaveDS and nUsed != 1: + self.putLog("skip since many T2 PD2P sites (%s>=%s) have the replica" % (len(allCompPd2pSites),maxSitesHaveDS), + sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'TOO_MANY_T2_REPLICAS','dataset':tmpDS}) + if not self.simul: + continue + # check the number of subscriptions + maxNumSubInAllCloud = max(0,maxSitesHaveDS-len(allCompPd2pSites)) + maxNumSubInAllCloud = min(2,maxNumSubInAllCloud) + self.putLog("PD2P maxNumSubInAllCloud : %s" % maxNumSubInAllCloud) + if totalUserSub >= maxNumSubInAllCloud: + self.putLog("skip since enough subscriptions (%s>=%s) were already made for T2 PD2P" % \ + (totalUserSub,maxNumSubInAllCloud), + sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'TOO_MANY_T2_SUBSCRIPTIONS','dataset':tmpDS}) + if not self.simul: + continue + # no candidates + if len(allCandidates) == 0: + self.putLog("skip since no candidates",sendLog=True,actionTag='SKIPPED',tagsMap={'reason':'NO_T2_CANDIDATE','dataset':tmpDS}) + continue + # get inverse weight for brokerage + weightForBrokerage = self.getWeightForBrokerage(allCandidates,tmpDS,nReplicasInCloud) + self.putLog("inverse weight %s" % str(weightForBrokerage)) + # get free disk size + self.putLog("getting free disk size for T2 PD2P") + retFreeSizeMap,freeSizeMap = self.getFreeDiskSize(tmpDS,allCandidates) + if not retFreeSizeMap: + self.putLog("failed to get free disk size",type='error',sendLog=True) + continue + # run brokerage + tmpJob = JobSpec() + tmpJob.AtlasRelease = '' + self.putLog("run brokerage for %s" % tmpDS) + usedWeight = brokerage.broker.schedule([tmpJob],self.taskBuffer,self.siteMapper,True,allCandidates, + True,specialWeight=weightForBrokerage,getWeight=True, + 
sizeMapForCheck=freeSizeMap,datasetSize=dsSize) + selectedSite = tmpJob.computingSite + for tmpWeightSite,tmpWeightStr in usedWeight.iteritems(): + tmpTagsMap = {'site':tmpWeightSite,'weight':tmpWeightStr,'dataset':tmpDS} + if tmpWeightSite == selectedSite: + if nUsed == 1: + tmpActionTag = 'SELECTEDT2_JOB' + elif len(allCompPd2pSites) == 0: + tmpActionTag = 'SELECTEDT2_NOREP' + else: + tmpActionTag = 'SELECTEDT2_WAIT' + tmpTagsMap['nused'] = nUsed + tmpTagsMap['nwaitingjobs'] = nWaitingJobsAll + tmpTagsMap['nwaitingjobsets'] = nWaitingJobsets + tmpTagsMap['nsiteshaveds'] = len(allCompPd2pSites) + else: + tmpActionTag = 'UNSELECTEDT2' + self.putLog("weight %s %s" % (tmpWeightSite,tmpWeightStr),sendLog=True, + actionTag=tmpActionTag,tagsMap=tmpTagsMap) + self.putLog("site for T2 PD2P -> %s" % selectedSite) + # remove from candidate list + if selectedSite in allCandidates: + allCandidates.remove(selectedSite) + if selectedSite in allCandidatesMoU: + allCandidatesMoU.remove(selectedSite) + # make subscription + if not self.simul: + subRet,dq2ID = self.makeSubscription(tmpDS,selectedSite,ddmShare='secondary') + self.putLog("made subscription to %s:%s" % (selectedSite,dq2ID),sendLog=True) + usedSites.append(selectedSite) + # update database + if subRet: + self.taskBuffer.addUserSubscription(tmpDS,[dq2ID]) + # additional T2 copy with MoU share when it is the second submission + if nUsed == 1 or self.simul: + retT2MoU,selectedSite = self.makeT2SubscriptionMoU(allCandidatesMoU,tmpDS,dsSize,'T2MOU',nUsed) + self.putLog("end for %s" % self.jobs[0].PandaID) + except: + errType,errValue = sys.exc_info()[:2] + self.putLog("%s %s" % (errType,errValue),'error') + + + # get candidate sites for subscription + def getCandidates(self,inputDS,checkUsedFile=True,useHidden=False,useCloseSites=False): + # return for failure + failedRet = False,{'':{'':([],[],[],0,False,False,0,0,[])}} + # get replica locations + if inputDS.endswith('/'): + # container + status,tmpRepMaps = self.getListDatasetReplicasInContainer(inputDS) + # get used datasets + if status and checkUsedFile: + status,tmpUsedDsList = self.getUsedDatasets(tmpRepMaps) + # remove unused datasets + newRepMaps = {} + for tmpKey,tmpVal in tmpRepMaps.iteritems(): + if tmpKey in tmpUsedDsList: + newRepMaps[tmpKey] = tmpVal + tmpRepMaps = newRepMaps + else: + # normal dataset + status,tmpRepMap = self.getListDatasetReplicas(inputDS) + tmpRepMaps = {inputDS:tmpRepMap} + if not status: + # failed + self.putLog("failed to get replica locations for %s" % inputDS,'error') + return failedRet + # get close sites + closeSitesMap = {} + for tmpDS,tmpRepMap in tmpRepMaps.iteritems(): + # loop over all DQ2 IDs + for tmpDQ2ID in tmpRepMap.keys(): + if not closeSitesMap.has_key(tmpDQ2ID): + status,tmpCloseSiteList = toa.getCloseSites(tmpDQ2ID) + exec "tmpCloseSiteList = %s" % tmpCloseSiteList + closeSitesMap[tmpDQ2ID] = [] + # select only DATADISK + for tmpCloseSite in tmpCloseSiteList: + if tmpCloseSite.endswith('_DATADISK'): + closeSitesMap[tmpDQ2ID].append(tmpCloseSite) + # get all sites + allSiteMap = {} + for tmpSiteName,tmpSiteSpec in self.siteMapper.siteSpecList.iteritems(): + # check cloud + if not tmpSiteSpec.cloud in self.pd2pClouds: + continue + # ignore test sites + if 'test' in tmpSiteName.lower(): + continue + # analysis only + if not tmpSiteName.startswith('ANALY'): + continue + # online + if not tmpSiteSpec.status in ['online']: + self.putLog("skip %s due to status=%s" % (tmpSiteName,tmpSiteSpec.status)) + continue + if not 
allSiteMap.has_key(tmpSiteSpec.cloud): + allSiteMap[tmpSiteSpec.cloud] = [] + allSiteMap[tmpSiteSpec.cloud].append(tmpSiteSpec) + # NG DQ2 IDs + ngDQ2SuffixList = ['LOCALGROUPDISK'] + # loop over all clouds + returnMap = {} + checkedMetaMap = {} + userSubscriptionsMap = {} + for cloud in self.pd2pClouds: + # DQ2 prefix of T1 + tmpT1SiteID = self.siteMapper.getCloud(cloud)['source'] + tmpT1DQ2ID = self.siteMapper.getSite(tmpT1SiteID).ddm + prefixDQ2T1 = re.sub('[^_]+DISK$','',tmpT1DQ2ID) + # loop over all datasets + for tmpDS,tmpRepMap in tmpRepMaps.iteritems(): + candSites = [] + sitesComDS = [] + sitesCompPD2P = [] + # check metadata + if not checkedMetaMap.has_key(tmpDS): + checkedMetaMap[tmpDS] = self.getDatasetMetadata(tmpDS) + retMeta,tmpMetadata = checkedMetaMap[tmpDS] + if not retMeta: + self.putLog("failed to get metadata for %s" % tmpDS,'error') + return failedRet + if tmpMetadata['provenance'] in ngProvenance: + self.putLog("provenance=%s of %s is excluded" % (tmpMetadata['provenance'],tmpDS)) + continue + if tmpMetadata['hidden'] in [True,'True'] and not useHidden: + self.putLog("%s is hidden" % tmpDS) + continue + # check T1 has a replica and get close sites + t1HasReplica = False + t1HasPrimary = False + nSecReplicas = 0 + closeSiteList = [] + candForMoU = [] + for tmpDQ2ID,tmpStatMap in tmpRepMap.iteritems(): + # check NG suffix + ngSuffixFlag = False + for tmpNGSuffix in ngDQ2SuffixList: + if tmpDQ2ID.endswith(tmpNGSuffix): + ngSuffixFlag = True + break + if ngSuffixFlag: + continue + # get close sites + if closeSitesMap.has_key(tmpDQ2ID): + for tmpCloseSiteID in closeSitesMap[tmpDQ2ID]: + if not tmpCloseSiteID in closeSiteList: + closeSiteList.append(tmpCloseSiteID) + # checks for T1 + if tmpDQ2ID.startswith(prefixDQ2T1): + if tmpStatMap[0]['total'] == tmpStatMap[0]['found']: + t1HasReplica = True + # check replica metadata to get archived info + retRepMeta,tmpRepMetadata = self.getReplicaMetadata(tmpDS,tmpDQ2ID) + if not retRepMeta: + self.putLog("failed to get replica metadata for %s:%s" % \ + (tmpDS,tmpDQ2ID),'error') + return failedRet + # check archived field + if isinstance(tmpRepMetadata,types.DictType) and tmpRepMetadata.has_key('archived') and \ + tmpRepMetadata['archived'] == 'primary': + # primary + t1HasPrimary = True + break + elif isinstance(tmpRepMetadata,types.DictType) and tmpRepMetadata.has_key('archived') and \ + tmpRepMetadata['archived'] == 'secondary': + # secondary + nSecReplicas += 1 + break + self.putLog("close sites : %s" % str(closeSiteList)) + # get on-going subscriptions + timeRangeSub = 7 + if not userSubscriptionsMap.has_key(tmpDS): + userSubscriptionsMap[tmpDS] = self.taskBuffer.getUserSubscriptions(tmpDS,timeRangeSub) + userSubscriptions = userSubscriptionsMap[tmpDS] + # unused cloud + if not allSiteMap.has_key(cloud): + continue + # count the number of T1 subscriptions + nT1Sub = 0 + for tmpUserSub in userSubscriptions: + if tmpUserSub.startswith(prefixDQ2T1): + nT1Sub += 1 + # check sites + nUserSub = 0 + for tmpSiteSpec in allSiteMap[cloud]: + # check cloud + if tmpSiteSpec.cloud != cloud: + continue + # prefix of DQ2 ID + prefixDQ2 = re.sub('[^_]+DISK$','',tmpSiteSpec.ddm) + # skip T1 + if prefixDQ2 == prefixDQ2T1: + continue + # check if corresponding DQ2 ID is a replica location + hasReplica = False + for tmpDQ2ID,tmpStatMap in tmpRepMap.iteritems(): + # check NG suffix + ngSuffixFlag = False + for tmpNGSuffix in ngDQ2SuffixList: + if tmpDQ2ID.endswith(tmpNGSuffix): + ngSuffixFlag = True + break + if ngSuffixFlag: + continue + if 
tmpDQ2ID.startswith(prefixDQ2): + if tmpStatMap[0]['total'] == tmpStatMap[0]['found']: + # complete + sitesComDS.append(tmpSiteSpec.sitename) + if tmpSiteSpec.cachedse == 1: + sitesCompPD2P.append(tmpSiteSpec.sitename) + hasReplica = True + break + # site doesn't have a replica + if (not hasReplica) and tmpSiteSpec.cachedse == 1: + candForMoU.append(tmpSiteSpec.sitename) + if not useCloseSites: + candSites.append(tmpSiteSpec.sitename) + else: + # use close sites only + if self.getDQ2ID(tmpSiteSpec.sitename,tmpDS) in closeSiteList: + candSites.append(tmpSiteSpec.sitename) + # the number of subscriptions + for tmpUserSub in userSubscriptions: + if tmpUserSub.startswith(prefixDQ2): + nUserSub += 1 + break + # append + if not returnMap.has_key(tmpDS): + returnMap[tmpDS] = {} + returnMap[tmpDS][cloud] = (candSites,sitesComDS,sitesCompPD2P,nUserSub,t1HasReplica,t1HasPrimary, + nSecReplicas,nT1Sub,candForMoU) + # return + return True,returnMap + + + # check DDM response + def isDQ2ok(self,out): + if out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1 \ + or out == '()': + return False + return True + + + # get map of DQ2 IDs + def getDQ2ID(self,sitename,dataset): + # get DQ2 ID + if not self.siteMapper.checkSite(sitename): + self.putLog("cannot find SiteSpec for %s" % sitename) + return '' + dq2ID = self.siteMapper.getSite(sitename).ddm + if True: + # data + matchEOS = re.search('_EOS[^_]+DISK$',dq2ID) + if matchEOS != None: + dq2ID = re.sub('_EOS[^_]+DISK','_EOSDATADISK',dq2ID) + else: + dq2ID = re.sub('_[^_]+DISK','_DATADISK',dq2ID) + else: + # unsupported prefix for subscription + self.putLog('%s has unsupported prefix for subscription' % dataset,'error') + return '' + # patch for MWT2_UC + if dq2ID == 'MWT2_UC_DATADISK': + dq2ID = 'MWT2_DATADISK' + # return + return dq2ID + + + # get list of datasets + def makeSubscription(self,dataset,sitename,givenDQ2ID=None,ddmShare='secondary'): + # return for failuer + retFailed = False,'' + # get DQ2 IDs + if givenDQ2ID == None: + dq2ID = self.getDQ2ID(sitename,dataset) + else: + dq2ID = givenDQ2ID + if dq2ID == '': + self.putLog("cannot find DQ2 ID for %s:%s" % (sitename,dataset)) + return retFailed + # make subscription + optSrcPolicy = 000001 + nTry = 3 + for iDDMTry in range(nTry): + # register subscription + self.putLog('%s/%s registerDatasetSubscription %s %s' % (iDDMTry,nTry,dataset,dq2ID)) + status,out = ddm.DQ2.main('registerDatasetSubscription',dataset,dq2ID,version=0,archived=0, + callbacks={},sources={},sources_policy=optSrcPolicy, + wait_for_sources=0,destination=None,query_more_sources=0, + sshare=ddmShare,group=None,activity='Data Brokering',acl_alias='secondary') + if out.find('DQSubscriptionExistsException') != -1: + break + elif out.find('DQLocationExistsException') != -1: + break + elif status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if out.find('DQSubscriptionExistsException') != -1: + pass + elif status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response for %s' % dataset,'error') + return retFailed + # update + self.putLog('%s %s' % (status,out)) + return True,dq2ID + + + # get weight for brokerage + def getWeightForBrokerage(self,sitenames,dataset,nReplicasInCloud): + # return for failuer + retFailed = False,{} + retMap = {} + # get the number of subscriptions for last 24 hours + numUserSubs = self.taskBuffer.getNumUserSubscriptions() + # 
loop over all sites + for sitename in sitenames: + # get DQ2 ID + dq2ID = self.getDQ2ID(sitename,dataset) + if dq2ID == '': + self.putLog("cannot find DQ2 ID for %s:%s" % (sitename,dataset)) + return retFailed + # append + if numUserSubs.has_key(dq2ID): + retMap[sitename] = 1 + numUserSubs[dq2ID] + else: + retMap[sitename] = 1 + # negative weight if a cloud already has replicas + tmpCloud = self.siteMapper.getSite(sitename).cloud + retMap[sitename] *= (1 + nReplicasInCloud[tmpCloud]) + # return + return retMap + + + # get free disk size + def getFreeDiskSize(self,dataset,siteList): + # return for failuer + retFailed = False,{} + # loop over all sites + sizeMap = {} + for sitename in siteList: + # reuse cached value + if self.cachedSizeMap.has_key(sitename): + sizeMap[sitename] = self.cachedSizeMap[sitename] + continue + # get DQ2 IDs + dq2ID = self.getDQ2ID(sitename,dataset) + if dq2ID == '': + self.putLog("cannot find DQ2 ID for %s:%s" % (sitename,dataset)) + return retFailed + for valueItem in ['used','total']: + nTry = 3 + for iDDMTry in range(nTry): + status,out = ddm.DQ2.main('queryStorageUsage','srm',valueItem,dq2ID) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if status != 0 or out.startswith('Error'): + self.putLog("%s/%s queryStorageUsage key=%s value=%s site=%s" % (iDDMTry,nTry,'srm',valueItem,dq2ID)) + self.putLog(out,'error') + self.putLog('bad DQ2 response for %s:%s' % (dq2ID,valueItem), 'error') + return retFailed + try: + # convert res to map + exec "tmpGigaVal = %s[0]['giga']" % out + if not sizeMap.has_key(sitename): + sizeMap[sitename] = {} + # append + sizeMap[sitename][valueItem] = tmpGigaVal + # cache + self.cachedSizeMap[sitename] = sizeMap[sitename] + except: + self.putLog("%s/%s queryStorageUsage key=%s value=%s site=%s" % (iDDMTry,nTry,'srm',valueItem,dq2ID)) + self.putLog(out,'error') + self.putLog('could not convert HTTP-res to free size map for %s%s' % (dq2ID,valueItem), 'error') + return retFailed + # return + self.putLog('getFreeDiskSize done->%s' % str(sizeMap)) + return True,sizeMap + + + + # get list of replicas for a dataset + def getListDatasetReplicas(self,dataset): + nTry = 3 + for iDDMTry in range(nTry): + self.putLog("%s/%s listDatasetReplicas %s" % (iDDMTry,nTry,dataset)) + status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response for %s' % dataset, 'error') + return False,{} + try: + # convert res to map + exec "tmpRepSites = %s" % out + self.putLog('getListDatasetReplicas->%s' % str(tmpRepSites)) + return True,tmpRepSites + except: + self.putLog(out,'error') + self.putLog('could not convert HTTP-res to replica map for %s' % dataset, 'error') + return False,{} + + + # get replicas for a container + def getListDatasetReplicasInContainer(self,container): + # response for failure + resForFailure = False,{} + # get datasets in container + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s listDatasetsInContainer %s' % (iDDMTry,nTry,container)) + status,out = ddm.DQ2.main('listDatasetsInContainer',container) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response for %s' % container, 'error') + return resForFailure + datasets = [] + try: + # convert to list + exec "datasets = 
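# Illustrative sketch, not part of the original code: the per-site weight computed by
# getWeightForBrokerage() above, as a pure function. The inputs below are made up; per
# the inline comment in the code, a larger value acts as a penalty-style weight for sites
# whose cloud already holds replicas or which already received many user subscriptions.
def brokerage_weight(n_recent_subs_to_site, n_replicas_in_cloud):
    # base weight grows with the number of subscriptions recently made to the site
    base = 1 + n_recent_subs_to_site
    # scaled up again if the site's cloud already has replicas of the dataset
    return base * (1 + n_replicas_in_cloud)

assert brokerage_weight(0, 0) == 1      # fresh site in a cloud with no replica
assert brokerage_weight(2, 0) == 3      # two recent subscriptions
assert brokerage_weight(2, 1) == 6      # ... and one replica already in the cloud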
%s" % out + except: + self.putLog('could not convert HTTP-res to dataset list for %s' % container, 'error') + return resForFailure + # loop over all datasets + allRepMap = {} + for dataset in datasets: + # get replicas + status,tmpRepSites = self.getListDatasetReplicas(dataset) + if not status: + return resForFailure + # append + allRepMap[dataset] = tmpRepSites + # return + self.putLog('getListDatasetReplicasInContainer done') + return True,allRepMap + + + # get dataset metadata + def getDatasetMetadata(self,datasetName): + # response for failure + resForFailure = False,{} + metaDataAttrs = ['provenance','hidden'] + # get datasets in container + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s getMetaDataAttribute %s' % (iDDMTry,nTry,datasetName)) + status,out = ddm.DQ2.main('getMetaDataAttribute',datasetName,metaDataAttrs) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response for %s' % datasetName, 'error') + return resForFailure + metadata = {} + try: + # convert to map + exec "metadata = %s" % out + except: + self.putLog('could not convert HTTP-res to metadata for %s' % datasetName, 'error') + return resForFailure + # check whether all attributes are available + for tmpAttr in metaDataAttrs: + if not metadata.has_key(tmpAttr): + self.putLog('%s is missing in %s' % (tmpAttr,str(metadata)), 'error') + return resForFailure + # return + self.putLog('getDatasetMetadata -> %s' % str(metadata)) + return True,metadata + + + # get replica metadata + def getReplicaMetadata(self,datasetName,locationName): + # response for failure + resForFailure = False,{} + # get metadata + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s listMetaDataReplica %s %s' % (iDDMTry,nTry,datasetName,locationName)) + status,out = ddm.DQ2.main('listMetaDataReplica',locationName,datasetName) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response for %s' % datasetName, 'error') + return resForFailure + metadata = {} + try: + # convert to map + exec "metadata = %s" % out + except: + self.putLog('could not convert HTTP-res to replica metadata for %s:%s' % \ + (datasetName,locationName), 'error') + return resForFailure + # return + self.putLog('getReplicaMetadata -> %s' % str(metadata)) + return True,metadata + + + # check subscription info + def checkSubscriptionInfo(self,destDQ2ID,datasetName): + resForFailure = (False,False) + # get datasets in container + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s listSubscriptionInfo %s %s' % (iDDMTry,nTry,destDQ2ID,datasetName)) + status,out = ddm.DQ2.main('listSubscriptionInfo',datasetName,destDQ2ID,0) + if status != 0: + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response for %s' % datasetName, 'error') + return resForFailure + self.putLog(out) + if out == '()': + # no subscription + retVal = False + else: + # already exists + retVal = True + self.putLog('checkSubscriptionInfo -> %s' % retVal) + return True,retVal + + + # get size of dataset + def getDatasetSize(self,datasetName): + self.putLog("get size of %s" % datasetName) + resForFailure = (False,0) + # get size of datasets + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s listFilesInDataset %s' % (iDDMTry,nTry,datasetName)) + status,out = 
ddm.DQ2.listFilesInDataset(datasetName) + if status != 0: + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response to get size of %s' % datasetName, 'error') + return resForFailure + self.putLog("OK") + # get total size + dsSize = 0 + try: + exec "outList = %s" % out + for guid,vals in outList[0].iteritems(): + dsSize += long(vals['filesize']) + except: + self.putLog('failed to get size from DQ2 response for %s' % datasetName, 'error') + return resForFailure + # GB + dsSize /= (1024*1024*1024) + self.putLog("dataset size = %s" % dsSize) + return True,dsSize + + + # get datasets used by jobs + def getUsedDatasets(self,datasetMap): + resForFailure = (False,[]) + # loop over all datasets + usedDsList = [] + for datasetName in datasetMap.keys(): + # get file list + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s listFilesInDataset %s' % (iDDMTry,nTry,datasetName)) + status,out = ddm.DQ2.listFilesInDataset(datasetName) + if status != 0: + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response to get size of %s' % datasetName, 'error') + return resForFailure + # convert to map + try: + tmpLfnList = [] + exec "outList = %s" % out + for guid,vals in outList[0].iteritems(): + tmpLfnList.append(vals['lfn']) + except: + self.putLog('failed to get file list from DQ2 response for %s' % datasetName, 'error') + return resForFailure + # check if jobs use the dataset + usedFlag = False + for tmpJob in self.jobs: + for tmpFile in tmpJob.Files: + if tmpFile.type == 'input' and tmpFile.lfn in tmpLfnList: + usedFlag = True + break + # escape + if usedFlag: + break + # used + if usedFlag: + usedDsList.append(datasetName) + # return + self.putLog("used datasets = %s" % str(usedDsList)) + return True,usedDsList + + + # get file from dataset + def getFileFromDataset(self,datasetName,guid,randomMode=False,nSamples=1): + resForFailure = (False,None) + # get files in datasets + global g_filesInDsMap + if not g_filesInDsMap.has_key(datasetName): + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s listFilesInDataset %s' % (iDDMTry,nTry,datasetName)) + status,out = ddm.DQ2.listFilesInDataset(datasetName) + if status != 0: + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response to get size of %s' % datasetName, 'error') + return resForFailure + # get file + try: + exec "outList = %s" % out + # append + g_filesInDsMap[datasetName] = outList[0] + except: + self.putLog('failed to get file list from DQ2 response for %s' % datasetName, 'error') + return resForFailure + # random mode + if randomMode: + tmpList = g_filesInDsMap[datasetName].keys() + random.shuffle(tmpList) + retList = [] + for iSamples in range(nSamples): + if iSamples < len(tmpList): + guid = tmpList[iSamples] + retMap = g_filesInDsMap[datasetName][guid] + retMap['guid'] = guid + retMap['dataset'] = datasetName + retList.append(retMap) + return True,retList + # return + if g_filesInDsMap[datasetName].has_key(guid): + retMap = g_filesInDsMap[datasetName][guid] + retMap['guid'] = guid + retMap['dataset'] = datasetName + return True,retMap + return resForFailure + + + # make subscriptions to EOS + def makeSubscriptionToEOS(self,datasetName): + self.putLog("start making EOS subscription for %s" % datasetName) + destDQ2IDs = ['CERN-PROD_EOSDATADISK'] + # get dataset replica locations + if 
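# Illustrative sketch, not part of the original code: how getDatasetSize() above derives the
# dataset size in GB from a DQ2-style file map, i.e. {guid: {'lfn': ..., 'filesize': ...}}.
# The sample map is made up; integer division replaces the Python 2 long arithmetic.
def dataset_size_gb(files_in_dataset):
    total_bytes = sum(int(vals['filesize']) for vals in files_in_dataset.values())
    return total_bytes // (1024 * 1024 * 1024)

sample = {
    'guid-1': {'lfn': 'file1.root', 'filesize': 3 * 1024**3},
    'guid-2': {'lfn': 'file2.root', 'filesize': 512 * 1024**2},
}
print(dataset_size_gb(sample))   # -> 3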
datasetName.endswith('/'): + statRep,replicaMaps = self.getListDatasetReplicasInContainer(datasetName) + else: + statRep,replicaMap = self.getListDatasetReplicas(datasetName) + replicaMaps = {datasetName:replicaMap} + if not statRep: + self.putLog("failed to get replica map for EOS",type='error') + return False + # loop over all datasets + for tmpDsName,replicaMap in replicaMaps.iteritems(): + # check if replica is already there + for destDQ2ID in destDQ2IDs: + if replicaMap.has_key(destDQ2ID): + self.putLog("skip EOS sub for %s:%s since replica is already there" % (destDQ2ID,tmpDsName)) + else: + statSubEx,subExist = self.checkSubscriptionInfo(destDQ2ID,tmpDsName) + if not statSubEx: + self.putLog("failed to check subscription for %s:%s" % (destDQ2ID,tmpDsName),type='error') + continue + # make subscription + if subExist: + self.putLog("skip EOS sub for %s:%s since subscription is already there" % (destDQ2ID,tmpDsName)) + else: + statMkSub,retMkSub = self.makeSubscription(tmpDsName,'',destDQ2ID) + if statMkSub: + self.putLog("made subscription to %s for %s" % (destDQ2ID,tmpDsName)) + else: + self.putLog("failed to make subscription to %s for %s" % (destDQ2ID,tmpDsName),type='error') + # return + self.putLog("end making EOS subscription for %s" % datasetName) + return True + + + # register new dataset container with datasets + def registerDatasetContainerWithDatasets(self,containerName,files,replicaMap): + # sort by locations + filesMap = {} + for tmpFile in files: + tmpLocations = replicaMap[tmpFile['dataset']] + tmpLocations.sort() + tmpKey = tuple(tmpLocations) + if not filesMap.has_key(tmpKey): + filesMap[tmpKey] = [] + # append file + filesMap[tmpKey].append(tmpFile) + # register new datasets + datasetNames = [] + tmpIndex = 1 + for tmpLocations,tmpFiles in filesMap.iteritems(): + tmpDsName = containerName[:-1] + '_%04d' % tmpIndex + tmpRet = self.registerDatasetWithLocation(tmpDsName,tmpFiles,tmpLocations) + # failed + if not tmpRet: + self.putLog('failed to register %s' % tmpDsName, 'error') + return False + # append dataset + datasetNames.append(tmpDsName) + tmpIndex += 1 + # register container + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s registerContainer %s' % (iDDMTry,nTry,containerName)) + status,out = ddm.DQ2.main('registerContainer',containerName,datasetNames) + if status != 0 and out.find('DQDatasetExistsException') == -1: + time.sleep(60) + else: + break + if out.find('DQDatasetExistsException') != -1: + pass + elif status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response to register %s' % containerName, 'error') + return False + # return + return True + + + + # register new dataset with locations + def registerDatasetWithLocation(self,datasetName,files,locations): + resForFailure = False + # get file info + guids = [] + lfns = [] + fsizes = [] + chksums = [] + for tmpFile in files: + guids.append(tmpFile['guid']) + lfns.append(tmpFile['lfn']) + fsizes.append(None) + chksums.append(None) + # register new dataset + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s registerNewDataset %s' % (iDDMTry,nTry,datasetName)) + status,out = ddm.DQ2.main('registerNewDataset',datasetName,lfns,guids,fsizes,chksums, + None,None,None,True) + if status != 0 and out.find('DQDatasetExistsException') == -1: + time.sleep(60) + else: + break + if out.find('DQDatasetExistsException') != -1: + pass + elif status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response to register %s' % 
datasetName, 'error') + return resForFailure + # freeze dataset + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s freezeDataset %s' % (iDDMTry,nTry,datasetName)) + status,out = ddm.DQ2.main('freezeDataset',datasetName) + if status != 0 and out.find('DQFrozenDatasetException') == -1: + time.sleep(60) + else: + break + if out.find('DQFrozenDatasetException') != -1: + pass + elif status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response to freeze %s' % datasetName, 'error') + return resForFailure + # register locations + for tmpLocation in locations: + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s registerDatasetLocation %s %s' % (iDDMTry,nTry,datasetName,tmpLocation)) + status,out = ddm.DQ2.main('registerDatasetLocation',datasetName,tmpLocation,0,1,None,None,None,"14 days") + if status != 0 and out.find('DQLocationExistsException') == -1: + time.sleep(60) + else: + break + if out.find('DQLocationExistsException') != -1: + pass + elif status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response to freeze %s' % datasetName, 'error') + return resForFailure + return True + + + # list datasets by file GUIDs + def listDatasetsByGUIDs(self,guids,dsFilters): + resForFailure = (False,{}) + # get size of datasets + nTry = 3 + for iDDMTry in range(nTry): + self.putLog('%s/%s listDatasetsByGUIDs' % (iDDMTry,nTry)) + status,out = ddm.DQ2.listDatasetsByGUIDs(guids) + if status != 0: + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + self.putLog(out,'error') + self.putLog('bad DQ2 response to list datasets by GUIDs','error') + return resForFailure + self.putLog(out) + # get map + retMap = {} + try: + exec "outMap = %s" % out + for guid in guids: + tmpDsNames = [] + # GUID not found + if not outMap.has_key(guid): + self.putLog('GUID=%s not found' % guid,'error') + return resForFailure + # ignore junk datasets + for tmpDsName in outMap[guid]: + if tmpDsName.startswith('panda') or \ + tmpDsName.startswith('user') or \ + tmpDsName.startswith('group') or \ + re.search('_sub\d+$',tmpDsName) != None or \ + re.search('_dis\d+$',tmpDsName) != None or \ + re.search('_shadow$',tmpDsName) != None: + continue + # check with filters + if dsFilters != []: + flagMatch = False + for tmpFilter in dsFilters: + if re.search(tmpFilter,tmpDsName) != None: + flagMatch = True + break + # not match + if not flagMatch: + continue + # append + tmpDsNames.append(tmpDsName) + # empty + if tmpDsNames == []: + self.putLog('no datasets found for GUID=%s' % guid) + continue + # duplicated + if len(tmpDsNames) != 1: + self.putLog('there are multiple datasets %s for GUID:%s' % (str(tmpDsNames),guid),'error') + return resForFailure + # append + retMap[guid] = tmpDsNames[0] + except: + self.putLog('failed to list datasets by GUIDs','error') + return resForFailure + return True,retMap + + + # conver event/run list to datasets + def convertEvtRunToDatasets(self,runEvtList,dsType,streamName,dsFilters,amiTag): + self.putLog('convertEvtRunToDatasets type=%s stream=%s dsPatt=%s amitag=%s' % \ + (dsType,streamName,str(dsFilters),amiTag)) + # check data type + failedRet = False,{},[] + if dsType == 'AOD': + streamRef = 'StreamAOD_ref' + elif dsType == 'ESD': + streamRef = 'StreamESD_ref' + elif dsType == 'RAW': + streamRef = 'StreamRAW_ref' + else: + self.putLog("invalid data type %s for EventRun conversion" % dsType,type='error') + return failedRet + # import event lookup client + from eventLookupClient import 
eventLookupClient + elssiIF = eventLookupClient() + # loop over all events + runEvtGuidMap = {} + nEventsPerLoop = 500 + iEventsTotal = 0 + while iEventsTotal < len(runEvtList): + tmpRunEvtList = runEvtList[iEventsTotal:iEventsTotal+nEventsPerLoop] + iEventsTotal += nEventsPerLoop + if streamName == '': + guidListELSSI = elssiIF.doLookup(tmpRunEvtList,tokens=streamRef, + amitag=amiTag,extract=True) + else: + guidListELSSI = elssiIF.doLookup(tmpRunEvtList,stream=streamName,tokens=streamRef, + amitag=amiTag,extract=True) + # failed + if guidListELSSI == None or len(guidListELSSI) == 0: + errStr = '' + for tmpLine in elssiIF.output: + errStr += tmpLine + self.putLog(errStr,type='error') + self.putLog("invalid retrun from EventLookup",type='error') + return failedRet + # check attribute + attrNames, attrVals = guidListELSSI + def getAttributeIndex(attr): + for tmpIdx,tmpAttrName in enumerate(attrNames): + if tmpAttrName.strip() == attr: + return tmpIdx + return None + # get index + indexEvt = getAttributeIndex('EventNumber') + indexRun = getAttributeIndex('RunNumber') + indexTag = getAttributeIndex(streamRef) + if indexEvt == None or indexRun == None or indexTag == None: + self.putLog("failed to get attribute index from %s" % str(attrNames),type='error') + return failedRet + # check events + for runNr,evtNr in tmpRunEvtList: + paramStr = 'Run:%s Evt:%s Stream:%s' % (runNr,evtNr,streamName) + self.putLog(paramStr) + # collect GUIDs + tmpguids = [] + for attrVal in attrVals: + if runNr == attrVal[indexRun] and evtNr == attrVal[indexEvt]: + tmpGuid = attrVal[indexTag] + # check non existing + if tmpGuid == 'NOATTRIB': + continue + if not tmpGuid in tmpguids: + tmpguids.append(tmpGuid) + # not found + if tmpguids == []: + errStr = "no GUIDs were found in Event Lookup service for %s" % paramStr + self.putLog(errStr,type='error') + return failedRet + # append + runEvtGuidMap[(runNr,evtNr)] = tmpguids + # convert to datasets + allDatasets = [] + allFiles = [] + allLocations = {} + for tmpIdx,tmpguids in runEvtGuidMap.iteritems(): + runNr,evtNr = tmpIdx + tmpDsRet,tmpDsMap = self.listDatasetsByGUIDs(tmpguids,dsFilters) + # failed + if not tmpDsRet: + self.putLog("failed to convert GUIDs to datasets",type='error') + return failedRet + # empty + if tmpDsMap == {}: + self.putLog("there is no dataset for Run:%s Evt:%s" % (runNr,evtNr),type='error') + return failedRet + if len(tmpDsMap) != 1: + self.putLog("there are multiple datasets %s for Run:%s Evt:%s" % (str(tmpDsMap),runNr,evtNr), + type='error') + return failedRet + # append + for tmpGUID,tmpDsName in tmpDsMap.iteritems(): + # collect dataset names + if not tmpDsName in allDatasets: + allDatasets.append(tmpDsName) + # get location + statRep,replicaMap = self.getListDatasetReplicas(tmpDsName) + # failed + if not statRep: + self.putLog("failed to get locations for DS:%s" % tmpDsName,type='error') + return failedRet + # collect locations + tmpLocationList = [] + for tmpLocation in replicaMap.keys(): + if not tmpLocation in tmpLocationList: + tmpLocationList.append(tmpLocation) + allLocations[tmpDsName] = tmpLocationList + # get file info + tmpFileRet,tmpFileInfo = self.getFileFromDataset(tmpDsName,tmpGUID) + # failed + if not tmpFileRet: + self.putLog("failed to get fileinfo for GUID:%s DS:%s" % (tmpGUID,tmpDsName),type='error') + return failedRet + # collect files + allFiles.append(tmpFileInfo) + # return + self.putLog('converted to %s, %s, %s' % (str(allDatasets),str(allLocations),str(allFiles))) + return True,allLocations,allFiles + + + # get 
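# Illustrative sketch, not part of the original code: the chunking and attribute-index logic
# used by convertEvtRunToDatasets() above. `lookup` is a fake stand-in for the event lookup
# client; all dataset names, runs, events and GUIDs below are made up.
def iter_chunks(items, size=500):
    for start in range(0, len(items), size):
        yield items[start:start + size]

def attribute_index(attr_names, wanted):
    # return the column index of `wanted`, or None if the service did not report it
    for idx, name in enumerate(attr_names):
        if name.strip() == wanted:
            return idx
    return None

run_evt_list = [(358541, 17), (358541, 42), (364292, 7)]
def lookup(chunk):
    return (['RunNumber', 'EventNumber', 'StreamAOD_ref'],
            [[run, evt, 'GUID-%s-%s' % (run, evt)] for run, evt in chunk])

for chunk in iter_chunks(run_evt_list, size=2):
    names, rows = lookup(chunk)
    i_run, i_evt, i_ref = (attribute_index(names, a) for a in ('RunNumber', 'EventNumber', 'StreamAOD_ref'))
    for row in rows:
        print(row[i_run], row[i_evt], row[i_ref])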
mapping between TAG and parent GUIDs + def getMapTAGandParentGUIDs(self,dsName,tagQuery,streamRef): + # remove _tidXYZ + dsNameForLookUp = re.sub('_tid\d+(_\d+)*$','',dsName) + # reuse + if self.mapTAGandParentGUIDs.has_key(dsNameForLookUp): + return self.mapTAGandParentGUIDs[dsNameForLookUp] + # set + from countGuidsClient import countGuidsClient + tagIF = countGuidsClient() + tagResults = tagIF.countGuids(dsNameForLookUp,tagQuery,streamRef+',StreamTAG_ref') + if tagResults == None: + errStr = '' + for tmpLine in tagIF.output: + if tmpLine == '\n': + continue + errStr += tmpLine + self.putLog(errStr,type='error') + errStr2 = "invalid return from Event Lookup service. " + if "No collection in the catalog matches the dataset name" in errStr: + errStr2 += "Note that only merged TAG is uploaded to the TAG DB, " + errStr2 += "so you need to use merged TAG datasets (or container) for inDS. " + errStr2 += "If this is already the case please contact atlas-event-metadata@cern.ch" + self.putLog(errStr2,type='error') + return None + # empty + if not tagResults[0]: + errStr = "No GUIDs found for %s" % dsName + self.putLog(errStr,type='error') + return None + # collect + retMap = {} + for guidCount,guids in tagResults[1]: + self.putLog('%s %s' % (guidCount,guids)) + parentGUID,tagGUID = guids + # append TAG GUID + if not retMap.has_key(tagGUID): + retMap[tagGUID] = {} + # append parent GUID and the number of selected events + if retMap[tagGUID].has_key(parentGUID): + errStr = "GUIDs=%s is duplicated" % parentGUID + self.putLog(errStr,type='error') + return None + retMap[tagGUID][parentGUID] = long(guidCount) + # keep to avoid redundant lookup + self.mapTAGandParentGUIDs[dsNameForLookUp] = retMap + # return + return retMap + + + # get TAG files and parent DS/files using TAG query + def getTagParentInfoUsingTagQuery(self,tagDsList,tagQuery,streamRef): + # return code for failure + failedRet = False,{},[] + allDatasets = [] + allFiles = [] + allLocations = {} + # set empty if Query is undefined + if tagQuery == False: + tagQuery = '' + # loop over all tags + self.putLog('getting parent dataset names and LFNs from TAG DB using EventSelector.Query="%s"' % tagQuery) + for tagDS in tagDsList: + if tagDS.endswith('/'): + # get elements in container + tmpStat,elementMap = self.getListDatasetReplicasInContainer(tagDS) + else: + tmpStat,elementMap = self.getListDatasetReplicas(tagDS) + # loop over all elemets + for dsName in elementMap.keys(): + self.putLog("DS=%s Query=%s Ref:%s" % (dsName,tagQuery,streamRef)) + guidMap = self.getMapTAGandParentGUIDs(dsName,tagQuery,streamRef) + # failed + if guidMap == None: + self.putLog("failed to get mappping between TAG and parent GUIDs",type='error') + return failedRet + # convert TAG GUIDs to LFNs + tmpTagRet,tmpTagDsMap = self.listDatasetsByGUIDs(guidMap.keys(),[]) + # failed + if not tmpTagRet: + self.putLog("failed to convert GUIDs to datasets",type='error') + return failedRet + # empty + if tmpTagDsMap == {}: + self.putLog("there is no dataset for DS=%s Query=%s Ref:%s" % (dsName,tagQuery,streamRef),type='error') + return failedRet + # convert parent GUIDs for each TAG file + for tagGUID in guidMap.keys(): + # not found + if not tmpTagDsMap.has_key(tagGUID): + errStr = 'TAG GUID=%s not found in DQ2' % tagGUID + self.putLog(errStr,type='error') + return failedRet + # get TAG file info + tagElementDS = tmpTagDsMap[tagGUID] + tmpFileRet,tmpTagFileInfo = self.getFileFromDataset(tmpTagDsMap[tagGUID],tagGUID) + # failed + if not tmpFileRet: + self.putLog("failed to get 
fileinfo for GUID:%s DS:%s" % (tagGUID,tmpTagDsMap[tagGUID]),type='error') + return failedRet + # convert parent GUIDs to DS/LFNs + tmpParentRet,tmpParentDsMap = self.listDatasetsByGUIDs(guidMap[tagGUID].keys(),[]) + # failed + if not tmpParentRet: + self.putLog("failed to convert GUIDs:%s to parent datasets" % str(guidMap[tagGUID].keys()),type='error') + return failedRet + # empty + if tmpParentDsMap == {}: + self.putLog("there is no parent dataset for GUIDs:%s" % str(guidMap[tagGUID].keys()),type='error') + return failedRet + # loop over all parent GUIDs + for parentGUID in guidMap[tagGUID].keys(): + # not found + if not tmpParentDsMap.has_key(parentGUID): + errStr = '%s GUID=%s not found in DQ2' % (re.sub('_ref$','',streamRef),parentGUID) + self.putLog(errStr,type='error') + return failedRet + # get parent file info + tmpParentDS = tmpParentDsMap[parentGUID] + tmpFileRet,tmpParentFileInfo = self.getFileFromDataset(tmpParentDS,parentGUID) + # failed + if not tmpFileRet: + self.putLog("failed to get parent fileinfo for GUID:%s DS:%s" % (parentGUID,tmpParentDS), + type='error') + return failedRet + # collect files + allFiles.append(tmpParentFileInfo) + # get location + if not tmpParentDS in allDatasets: + allDatasets.append(tmpParentDS) + # get location + statRep,replicaMap = self.getListDatasetReplicas(tmpParentDS) + # failed + if not statRep: + self.putLog("failed to get locations for DS:%s" % tmpParentDS,type='error') + return failedRet + # collect locations + tmpLocationList = [] + for tmpLocation in replicaMap.keys(): + if not tmpLocation in tmpLocationList: + tmpLocationList.append(tmpLocation) + allLocations[tmpParentDS] = tmpLocationList + # return + self.putLog('converted to %s, %s, %s' % (str(allDatasets),str(allLocations),str(allFiles))) + return True,allLocations,allFiles + + + # put log + def putLog(self,msg,type='debug',sendLog=False,actionTag='',tagsMap={}): + tmpMsg = self.token+' '+msg + if type == 'error': + _logger.error(tmpMsg) + # keep last error message + self.lastMessage = tmpMsg + else: + _logger.debug(tmpMsg) + # send to logger + if sendLog: + tmpMsg = self.token + ' - ' + if actionTag != '': + tmpMsg += 'action=%s ' % actionTag + for tmpTag,tmpTagVal in tagsMap.iteritems(): + tmpMsg += '%s=%s ' % (tmpTag,tmpTagVal) + tmpMsg += '- ' + msg + tmpPandaLogger = PandaLogger() + tmpPandaLogger.lock() + tmpPandaLogger.setParams({'Type':'pd2p'}) + tmpLog = tmpPandaLogger.getHttpLogger(panda_config.loggername) + # add message + if type == 'error': + tmpLog.error(tmpMsg) + else: + tmpLog.info(tmpMsg) + # release HTTP handler + tmpPandaLogger.release() + time.sleep(1) + + + # peek log + def peekLog(self): + return self.lastMessage + + + # make T1 subscription + def makeT1Subscription(self,allCloudCandidates,tmpDS,dsSize, + nUsed=None,nWaitingJobs=None,nWaitingJobsets=None): + useSmallT1 = None + # no candidate + if allCloudCandidates == []: + return True,useSmallT1 + # convert to siteIDs + t1Candidates = [] + t1Weights = {} + siteToCloud = {} + for tmpCloud in allCloudCandidates: + tmpCloudSpec = self.siteMapper.getCloud(tmpCloud) + tmpT1SiteID = tmpCloudSpec['source'] + t1Candidates.append(tmpT1SiteID) + # use MoU share + t1Weights[tmpT1SiteID] = tmpCloudSpec['mcshare'] + # reverse lookup + siteToCloud[tmpT1SiteID] = tmpCloud + # get free disk size + self.putLog("getting free disk size for T1 PD2P") + retFreeSizeMap,freeSizeMap = self.getFreeDiskSize(tmpDS,t1Candidates) + if not retFreeSizeMap: + self.putLog("failed to get free disk size",type='error',sendLog=True) + return 
False,useSmallT1 + # run brokerage + tmpJob = JobSpec() + tmpJob.AtlasRelease = '' + self.putLog("run brokerage for T1-T1 for %s" % tmpDS) + selectedSite = self.chooseSite(t1Weights,freeSizeMap,dsSize) + self.putLog("site for T1 PD2P -> %s" % selectedSite) + # simulation + if self.simul: + return True,useSmallT1 + # no candidate + if selectedSite == None: + self.putLog("no candidate for T1-T1") + return False,useSmallT1 + # make subscription + tmpJob.computingSite = selectedSite + subRet,dq2ID = self.makeSubscription(tmpDS,tmpJob.computingSite) + tmpTagsMap = {'site':tmpJob.computingSite,'dataset':tmpDS} + if nUsed != None: + tmpTagsMap['nused'] = nUsed + if nWaitingJobs != None: + tmpTagsMap['nwaitingjobs'] = nWaitingJobs + if nWaitingJobsets != None: + tmpTagsMap['nwaitingjobsets'] = nWaitingJobsets + self.putLog("made subscription for T1-T1 to %s:%s" % (tmpJob.computingSite,dq2ID),sendLog=True, + actionTag='SELECTEDT1',tagsMap=tmpTagsMap) + # check if small cloud is used + if siteToCloud[tmpJob.computingSite] in cloudsWithSmallT1: + useSmallT1 = siteToCloud[tmpJob.computingSite] + # update database + if subRet: + self.taskBuffer.addUserSubscription(tmpDS,[dq2ID]) + return True,useSmallT1 + else: + return False,useSmallT1 + + + # make T2 subscription with MoU share + def makeT2SubscriptionMoU(self,allCandidates,tmpDS,dsSize,pd2pType, + nUsed=None,nWaitingJobs=None,nWaitingJobsets=None): + # no candidate + if allCandidates == []: + return True,None + # get MoU share + if self.shareMoUForT2 == None: + self.shareMoUForT2 = self.taskBuffer.getMouShareForT2PD2P() + # convert to DQ2 ID + t2Candidates = [] + t2Weights = {} + dq2List = [] + for tmpCandidate in allCandidates: + tmpDQ2ID = self.getDQ2ID(tmpCandidate,tmpDS) + if not tmpDQ2ID in dq2List: + # append + dq2List.append(tmpDQ2ID) + # get MoU share + if not self.shareMoUForT2.has_key(tmpDQ2ID): + # site is undefined in t_regions_replication + self.putLog("%s is not in MoU table" % tmpDQ2ID,type='error') + continue + if not self.shareMoUForT2[tmpDQ2ID]['status'] in ['ready']: + # site is not ready + self.putLog("%s is not ready in MoU table" % tmpDQ2ID) + continue + tmpWeight = self.shareMoUForT2[tmpDQ2ID]['weight'] + # skip if the weight is 0 + if tmpWeight == 0: + self.putLog("%s has 0 weight in MoU table" % tmpDQ2ID) + continue + # collect siteIDs and weights for brokerage + t2Candidates.append(tmpCandidate) + t2Weights[tmpCandidate] = tmpWeight + # sort for reproducibility + t2Candidates.sort() + # get free disk size + self.putLog("getting free disk size for T2 %s PD2P" % pd2pType) + retFreeSizeMap,freeSizeMap = self.getFreeDiskSize(tmpDS,t2Candidates) + if not retFreeSizeMap: + self.putLog("failed to get free disk size",type='error',sendLog=True) + return False,None + # run brokerage + tmpJob = JobSpec() + tmpJob.AtlasRelease = '' + self.putLog("run brokerage for T2 with %s for %s" % (pd2pType,tmpDS)) + selectedSite = self.chooseSite(t2Weights,freeSizeMap,dsSize) + self.putLog("site for T2 %s PD2P -> %s" % (pd2pType,selectedSite)) + # simulation + if self.simul: + return True,selectedSite + # no candidate + if selectedSite == None: + self.putLog("no candidate for T2 with %s" % pd2pType) + return False,None + # make subscription + subRet,dq2ID = self.makeSubscription(tmpDS,selectedSite) + tmpTagsMap = {'site':selectedSite,'dataset':tmpDS} + if nUsed != None: + tmpTagsMap['nused'] = nUsed + if nWaitingJobs != None: + tmpTagsMap['nwaitingjobs'] = nWaitingJobs + if nWaitingJobsets != None: + tmpTagsMap['nwaitingjobsets'] = nWaitingJobsets 
+ self.putLog("made subscription for T2 with %s to %s:%s" % (pd2pType,selectedSite,dq2ID),sendLog=True, + actionTag='SELECTEDT2_%s' % pd2pType,tagsMap=tmpTagsMap) + # update database + if subRet: + self.taskBuffer.addUserSubscription(tmpDS,[dq2ID]) + return True,selectedSite + else: + return False,None + + + # choose site + def chooseSite(self,canWeights,freeSizeMap,datasetSize): + # loop over all candidates + totalW = 0 + allCandidates = [] + for tmpCan,tmpW in canWeights.iteritems(): + # size check + if freeSizeMap.has_key(tmpCan): + # disk threshold for PD2P max(5%,3TB) + diskThresholdPD2P = 1024 * 3 + thrForThisSite = long(freeSizeMap[tmpCan]['total'] * 5 / 100) + if thrForThisSite < diskThresholdPD2P: + thrForThisSite = diskThresholdPD2P + remSpace = freeSizeMap[tmpCan]['total'] - freeSizeMap[tmpCan]['used'] + if remSpace-datasetSize < thrForThisSite: + self.putLog(' skip: disk shortage %s-%s< %s' % (remSpace,datasetSize,thrForThisSite)) + continue + self.putLog('weight %s %s' % (tmpCan,tmpW)) + # get total weight + totalW += tmpW + # append candidate + allCandidates.append(tmpCan) + # no candidate + if allCandidates == []: + return None + # sort for reproducibility + allCandidates.sort() + # choose site + rNumber = random.random() * totalW + for tmpCan in allCandidates: + rNumber -= canWeights[tmpCan] + if rNumber <= 0: + return tmpCan + return allCandidates[-1] + + diff --git a/current/pandaserver/dataservice/ErrorCode.py b/current/pandaserver/dataservice/ErrorCode.py new file mode 100755 index 000000000..91faf46e1 --- /dev/null +++ b/current/pandaserver/dataservice/ErrorCode.py @@ -0,0 +1,16 @@ +############## errror code + +# Setupper +EC_Setupper = 100 + +# Setupper +EC_GUID = 101 + +# Adder +EC_Adder = 200 + +# Subscription failures +EC_Subscription = 201 + +# lost file (=taskbuffer.ErrorCode.EC_LostFile) +EC_LostFile = 110 diff --git a/current/pandaserver/dataservice/EventPicker.py b/current/pandaserver/dataservice/EventPicker.py new file mode 100644 index 000000000..977be5be5 --- /dev/null +++ b/current/pandaserver/dataservice/EventPicker.py @@ -0,0 +1,288 @@ +''' +add data to dataset + +''' + +import os +import re +import sys +import time +import fcntl +import datetime +import commands +import brokerage.broker +from dataservice import DynDataDistributer +from dataservice.MailUtils import MailUtils +from dataservice.Notifier import Notifier +from taskbuffer.JobSpec import JobSpec +from dataservice.datriHandler import datriHandler + + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('EventPicker') +DynDataDistributer.initLogger(_logger) + + +class EventPicker: + # constructor + def __init__(self,taskBuffer,siteMapper,evpFileName,ignoreError): + self.taskBuffer = taskBuffer + self.siteMapper = siteMapper + self.ignoreError = ignoreError + self.evpFileName = evpFileName + self.token = datetime.datetime.utcnow().isoformat(' ') + self.pd2p = DynDataDistributer.DynDataDistributer([],self.taskBuffer,self.siteMapper, + token=self.token) + self.userDatasetName = '' + self.creationTime = '' + self.params = '' + self.lockedBy = '' + self.evpFile = None + + # main + def run(self): + try: + self.putLog('start %s' % self.evpFileName) + # lock evp file + self.evpFile = open(self.evpFileName) + try: + fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_EX|fcntl.LOCK_NB) + except: + # relase + self.putLog("cannot lock %s" % self.evpFileName) + self.evpFile.close() + return True + # options + runEvtList = [] + 
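# Illustrative sketch, not part of the original code: the site selection done by chooseSite()
# above, i.e. drop candidates whose remaining space after hosting the dataset would fall below
# max(5% of total, 3 TB in GB), then pick one at random with probability proportional to its
# weight. Sizes are in GB and the candidate data below is made up.
import random

def choose_site(weights, free_size_map, dataset_size_gb):
    threshold_floor = 1024 * 3                     # 3 TB expressed in GB
    candidates, total_w = [], 0
    for site in sorted(weights):                   # sort for reproducibility
        total, used = free_size_map[site]['total'], free_size_map[site]['used']
        threshold = max(total * 5 // 100, threshold_floor)
        if (total - used) - dataset_size_gb < threshold:
            continue                               # disk shortage: skip this site
        candidates.append(site)
        total_w += weights[site]
    if not candidates:
        return None
    r = random.random() * total_w
    for site in candidates:
        r -= weights[site]
        if r <= 0:
            return site
    return candidates[-1]

sites = {'SITE_A': {'total': 200000, 'used': 50000}, 'SITE_B': {'total': 80000, 'used': 79000}}
print(choose_site({'SITE_A': 3, 'SITE_B': 1}, sites, dataset_size_gb=500))   # -> 'SITE_A'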
eventPickDataType = '' + eventPickStreamName = '' + eventPickDS = [] + eventPickAmiTag = '' + inputFileList = [] + tagDsList = [] + tagQuery = '' + tagStreamRef = '' + # read evp file + for tmpLine in self.evpFile: + tmpMatch = re.search('^([^=]+)=(.+)$',tmpLine) + # check format + if tmpMatch == None: + continue + tmpItems = tmpMatch.groups() + if tmpItems[0] == 'runEvent': + # get run and event number + tmpRunEvt = tmpItems[1].split(',') + if len(tmpRunEvt) == 2: + runEvtList.append(tmpRunEvt) + elif tmpItems[0] == 'eventPickDataType': + # data type + eventPickDataType = tmpItems[1] + elif tmpItems[0] == 'eventPickStreamName': + # stream name + eventPickStreamName = tmpItems[1] + elif tmpItems[0] == 'eventPickDS': + # dataset pattern + eventPickDS = tmpItems[1].split(',') + elif tmpItems[0] == 'eventPickAmiTag': + # AMI tag + eventPickAmiTag = tmpItems[1] + elif tmpItems[0] == 'userName': + # user name + self.userDN = tmpItems[1] + self.putLog("user=%s" % self.userDN) + elif tmpItems[0] == 'userDatasetName': + # user dataset name + self.userDatasetName = tmpItems[1] + elif tmpItems[0] == 'lockedBy': + # client name + self.lockedBy = tmpItems[1] + elif tmpItems[0] == 'creationTime': + # creation time + self.creationTime = tmpItems[1] + elif tmpItems[0] == 'params': + # parameters + self.params = tmpItems[1] + elif tmpItems[0] == 'inputFileList': + # input file list + inputFileList = tmpItems[1].split(',') + try: + inputFileList.remove('') + except: + pass + elif tmpItems[0] == 'tagDS': + # TAG dataset + tagDsList = tmpItems[1].split(',') + elif tmpItems[0] == 'tagQuery': + # query for TAG + tagQuery = tmpItems[1] + elif tmpItems[0] == 'tagStreamRef': + # StreamRef for TAG + tagStreamRef = tmpItems[1] + if not tagStreamRef.endswith('_ref'): + tagStreamRef += '_ref' + # convert + if tagDsList == [] or tagQuery == '': + # convert run/event list to dataset/file list + tmpRet,locationMap,allFiles = self.pd2p.convertEvtRunToDatasets(runEvtList, + eventPickDataType, + eventPickStreamName, + eventPickDS, + eventPickAmiTag) + if not tmpRet: + self.endWithError('Failed to convert the run/event list to a dataset/file list') + return False + else: + # get parent dataset/files with TAG + tmpRet,locationMap,allFiles = self.pd2p.getTagParentInfoUsingTagQuery(tagDsList,tagQuery,tagStreamRef) + if not tmpRet: + self.endWithError('Failed to get parent dataset/file list with TAG') + return False + # use only files in the list + if inputFileList != []: + tmpAllFiles = [] + for tmpFile in allFiles: + if tmpFile['lfn'] in inputFileList: + tmpAllFiles.append(tmpFile) + allFiles = tmpAllFiles + # make dataset container + tmpRet = self.pd2p.registerDatasetContainerWithDatasets(self.userDatasetName,allFiles,locationMap) + if not tmpRet: + self.endWithError('Failed to make a dataset container %s' % self.userDatasetName) + return False + # get candidates + tmpRet,candidateMaps = self.pd2p.getCandidates(self.userDatasetName,checkUsedFile=False, + useHidden=True) + if not tmpRet: + self.endWithError('Failed to find candidate for destination') + return False + # collect all candidates + allCandidates = [] + for tmpDS,tmpDsVal in candidateMaps.iteritems(): + for tmpCloud,tmpCloudVal in tmpDsVal.iteritems(): + for tmpSiteName in tmpCloudVal[0]: + if not tmpSiteName in allCandidates: + allCandidates.append(tmpSiteName) + if allCandidates == []: + self.endWithError('No candidate for destination') + return False + # get size of dataset container + tmpRet,totalInputSize = self.pd2p.getDatasetSize(self.userDatasetName) + if 
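# Illustrative sketch, not part of the original code: the 'key=value' parsing that
# EventPicker.run() above applies to each line of the evp request file. The sample
# lines and option names below are made up.
import re

def parse_evp_lines(lines):
    options = {'runEvent': [], 'eventPickDS': []}
    for line in lines:
        m = re.search(r'^([^=]+)=(.+)$', line)
        if m is None:
            continue                       # skip lines that are not key=value
        key, value = m.groups()
        if key == 'runEvent':
            run_evt = value.split(',')
            if len(run_evt) == 2:
                options['runEvent'].append(tuple(run_evt))
        elif key == 'eventPickDS':
            options['eventPickDS'] = value.split(',')
        else:
            options[key] = value
    return options

sample = ['userName=Some User', 'runEvent=358541,17', 'runEvent=358541,42',
          'eventPickDS=data18_13TeV.%.AOD.%']
print(parse_evp_lines(sample))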
not tmpRet: + self.endWithError('Failed to get the size of %s' % self.userDatasetName) + return False + # run brokerage + tmpJob = JobSpec() + tmpJob.AtlasRelease = '' + self.putLog("run brokerage for %s" % tmpDS) + brokerage.broker.schedule([tmpJob],self.taskBuffer,self.siteMapper,True,allCandidates, + True,datasetSize=totalInputSize) + if tmpJob.computingSite.startswith('ERROR'): + self.endWithError('brokerage failed with %s' % tmpJob.computingSite) + return False + self.putLog("site -> %s" % tmpJob.computingSite) + # send request to DaTRI + if self.lockedBy.startswith('ganga'): + tmpHandler = datriHandler(type='ganga') + else: + tmpHandler = datriHandler(type='pathena') + # remove redundant CN from DN + tmpDN = self.userDN + tmpDN = re.sub('/CN=limited proxy','',tmpDN) + tmpDN = re.sub('(/CN=proxy)+$','',tmpDN) + tmpMsg = "%s ds=%s site=%s id=%s" % ('datriHandler.sendRequest', + self.userDatasetName, + self.siteMapper.getSite(tmpJob.computingSite).ddm, + tmpDN) + self.putLog(tmpMsg) + tmpHandler.setParameters(data_pattern=self.userDatasetName, + site=self.siteMapper.getSite(tmpJob.computingSite).ddm, + userid=tmpDN) + nTry = 3 + for iTry in range(nTry): + dhStatus,dhOut = tmpHandler.sendRequest() + # succeeded + if dhStatus == 0 or "such request is exist" in dhOut: + self.putLog("%s %s" % (dhStatus,dhOut)) + break + if iTry+1 < nTry: + # sleep + time.sleep(60) + else: + # final attempt failed + self.endWithError('Failed to send request to DaTRI : %s %s' % (dhStatus,dhOut)) + return False + # send email notification for success + tmpMsg = 'A transfer request was successfully sent to DaTRI.\n' + tmpMsg += 'You will receive a notification from DaTRI when it completed.' + self.sendEmail(True,tmpMsg) + try: + # unlock and delete evp file + fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN) + self.evpFile.close() + os.remove(self.evpFileName) + except: + pass + # successfully terminated + self.putLog("end %s" % self.evpFileName) + return True + except: + errType,errValue = sys.exc_info()[:2] + self.endWithError('Got exception %s:%s' % (errType,errValue)) + return False + + + # end with error + def endWithError(self,message): + self.putLog(message,'error') + # unlock evp file + try: + fcntl.flock(self.evpFile.fileno(),fcntl.LOCK_UN) + self.evpFile.close() + if not self.ignoreError: + # remove evp file + os.remove(self.evpFileName) + # send email notification + self.sendEmail(False,message) + except: + pass + self.putLog('end %s' % self.evpFileName) + + + # put log + def putLog(self,msg,type='debug'): + tmpMsg = self.token+' '+msg + if type == 'error': + _logger.error(tmpMsg) + else: + _logger.debug(tmpMsg) + + + # send email notification + def sendEmail(self,isSucceeded,message): + # mail address + toAdder = Notifier(self.taskBuffer,None,[]).getEmail(self.userDN) + if toAdder == '': + self.putLog('cannot find email address for %s' % self.userDN,'error') + return + # subject + mailSubject = "PANDA notification for Event-Picking Request" + # message + mailBody = "Hello,\n\nHere is your request status for event picking\n\n" + if isSucceeded: + mailBody += "Status : Passed to DaTRI\n" + else: + mailBody += "Status : Failed\n" + mailBody += "Created : %s\n" % self.creationTime + mailBody += "Ended : %s\n" % datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') + mailBody += "Dataset : %s\n" % self.userDatasetName + mailBody += "\n" + mailBody += "Parameters : %s %s\n" % (self.lockedBy,self.params) + mailBody += "\n" + mailBody += "%s\n" % message + # send + retVal = 
MailUtils().send(toAdder,mailSubject,mailBody) + # return + return diff --git a/current/pandaserver/dataservice/Finisher.py b/current/pandaserver/dataservice/Finisher.py new file mode 100755 index 000000000..64d5c30be --- /dev/null +++ b/current/pandaserver/dataservice/Finisher.py @@ -0,0 +1,178 @@ +''' +finish transferring jobs + +''' + +import re +import sys +import commands +import threading +from DDM import ddm +from config import panda_config + +from brokerage.SiteMapper import SiteMapper + +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('Finisher') + + +class Finisher (threading.Thread): + # constructor + def __init__(self,taskBuffer,dataset,job=None,site=None): + threading.Thread.__init__(self) + self.dataset = dataset + self.taskBuffer = taskBuffer + self.job = job + self.site = site + + + # main + def run(self): + # start + try: + if self.job == None: + _logger.debug("start: %s" % self.dataset.name) + _logger.debug("callback from %s" % self.site) + # FIXME when callback from BNLPANDA disappeared + if self.site == 'BNLPANDA': + self.site = 'BNL-OSG2_ATLASMCDISK' + # instantiate site mapper + siteMapper = SiteMapper(self.taskBuffer) + # get computingSite/destinationSE + computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name) + if destinationSE == None: + # try to get computingSite/destinationSE from ARCH to delete sub + # even if no active jobs left + computingSite,destinationSE = self.taskBuffer.getDestSE(self.dataset.name,True) + if destinationSE == None: + _logger.error("cannot get source/destination for %s" % self.dataset.name) + _logger.debug("end: %s" % self.dataset.name) + return + _logger.debug("src: %s" % computingSite) + _logger.debug("dst: %s" % destinationSE) + # get corresponding token + tmpSrcSiteSpec = siteMapper.getSite(computingSite) + tmpDstSiteSpec = siteMapper.getSite(destinationSE) + _logger.debug(tmpDstSiteSpec.setokens) + destToken = None + for tmpToken,tmpDdmId in tmpDstSiteSpec.setokens.iteritems(): + if self.site == tmpDdmId: + destToken = tmpToken + break + _logger.debug("use Token=%s" % destToken) + # get required tokens + reqTokens = self.taskBuffer.getDestTokens(self.dataset.name) + if reqTokens == None: + _logger.error("cannot get required token for %s" % self.dataset.name) + _logger.debug("end: %s" % self.dataset.name) + return + _logger.debug("req Token=%s" % reqTokens) + # make bitmap for the token + bitMap = 1 + if len(reqTokens.split(','))>1: + for tmpReqToken in reqTokens.split(','): + if tmpReqToken == destToken: + break + # shift one bit + bitMap <<= 1 + # completed bitmap + compBitMap = (1 << len(reqTokens.split(',')))-1 + # ignore the lowest bit for T1, file on DISK is already there + if tmpSrcSiteSpec.ddm == tmpDstSiteSpec.ddm: + compBitMap = compBitMap & 0xFFFE + # update bitmap in DB + updatedBitMap = self.taskBuffer.updateTransferStatus(self.dataset.name,bitMap) + _logger.debug("transfer status:%s - comp:%s - bit:%s" % (hex(updatedBitMap),hex(compBitMap),hex(bitMap))) + # update output files + if (updatedBitMap & compBitMap) == compBitMap: + ids = self.taskBuffer.updateOutFilesReturnPandaIDs(self.dataset.name) + # set flag for T2 cleanup + self.dataset.status = 'cleanup' + self.taskBuffer.updateDatasets([self.dataset]) + else: + _logger.debug("end: %s" % self.dataset.name) + return + else: + _logger.debug("start: %s" % self.job.PandaID) + # update input files + ids = [self.job.PandaID] + _logger.debug("IDs: %s" % ids) + if len(ids) != 0: + # get job + if self.job == None: 
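# Illustrative sketch, not part of the original code: the completion bitmap used by
# Finisher.run() above. Each required space token gets one bit; a callback sets the bit of
# its token, and the dataset is complete once all required bits are set. Token names are made up.
def token_bit(req_tokens, dest_token):
    bit = 1
    for token in req_tokens:
        if token == dest_token:
            break
        bit <<= 1                         # shift one bit per preceding token
    return bit

def completion_mask(req_tokens, same_ddm_endpoint=False):
    mask = (1 << len(req_tokens)) - 1
    if same_ddm_endpoint:
        mask &= 0xFFFE                    # lowest bit: the file is already on the local DISK
    return mask

req = ['ATLASDATADISK', 'ATLASDATATAPE']
status = 0
status |= token_bit(req, 'ATLASDATADISK')                        # callback for the DISK copy
print(status & completion_mask(req) == completion_mask(req))     # -> False, TAPE still missing
status |= token_bit(req, 'ATLASDATATAPE')
print(status & completion_mask(req) == completion_mask(req))     # -> True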
+ jobs = self.taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False) + else: + jobs = [self.job] + # loop over all jobs + for job in jobs: + if job == None: + continue + _logger.debug("Job: %s" % job.PandaID) + if job.jobStatus == 'transferring': + jobReady = True + # check file status + for file in job.Files: + if file.type == 'output' or file.type == 'log': + if file.status != 'ready': + _logger.debug("Job: %s file:%s %s != ready" % (job.PandaID,file.lfn,file.status)) + jobReady = False + break + # finish job + if jobReady: + _logger.debug("Job: %s all files ready" % job.PandaID) + # create XML + try: + import xml.dom.minidom + dom = xml.dom.minidom.getDOMImplementation() + doc = dom.createDocument(None,'xml',None) + topNode = doc.createElement("POOLFILECATALOG") + for file in job.Files: + if file.type in ['output','log']: + # File + fileNode = doc.createElement("File") + fileNode.setAttribute("ID",file.GUID) + # LFN + logNode = doc.createElement("logical") + lfnNode = doc.createElement("lfn") + lfnNode.setAttribute('name',file.lfn) + # metadata + fsizeNode = doc.createElement("metadata") + fsizeNode.setAttribute("att_name","fsize") + fsizeNode.setAttribute("att_value",str(file.fsize)) + # checksum + if file.checksum.startswith('ad:'): + # adler32 + chksumNode = doc.createElement("metadata") + chksumNode.setAttribute("att_name","adler32") + chksumNode.setAttribute("att_value",re.sub('^ad:','',file.checksum)) + else: + # md5sum + chksumNode = doc.createElement("metadata") + chksumNode.setAttribute("att_name","md5sum") + chksumNode.setAttribute("att_value",re.sub('^md5:','',file.checksum)) + # append nodes + logNode.appendChild(lfnNode) + fileNode.appendChild(logNode) + fileNode.appendChild(fsizeNode) + fileNode.appendChild(chksumNode) + topNode.appendChild(fileNode) + # write to file + xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,job.PandaID,'finished',commands.getoutput('uuidgen')) + oXML = open(xmlFile,"w") + oXML.write(topNode.toxml()) + oXML.close() + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s : %s %s" % (job.PandaID,type,value)) + _logger.debug("Job: %s status: %s" % (job.PandaID,job.jobStatus)) + # end + if self.job == None: + _logger.debug("end: %s" % self.dataset.name) + else: + _logger.debug("end: %s" % self.job.PandaID) + except: + type, value, traceBack = sys.exc_info() + _logger.error("run() : %s %s" % (type,value)) + diff --git a/current/pandaserver/dataservice/MailUtils.py b/current/pandaserver/dataservice/MailUtils.py new file mode 100755 index 000000000..9a8dfd290 --- /dev/null +++ b/current/pandaserver/dataservice/MailUtils.py @@ -0,0 +1,103 @@ +''' +email utilities +''' + +import sys +import smtplib + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('MailUtils') + +class MailUtils: + # constructor + def __init__(self): + pass + + # main + def send(self,toAddr,mailSubject,mailBody): + _logger.debug("start SEND session") + try: + # remove duplicated address + listToAddr = [] + newToAddr = '' + for tmpToAddr in toAddr.split(','): + if not tmpToAddr in listToAddr: + listToAddr.append(tmpToAddr) + newToAddr += '%s,' % tmpToAddr + toAddr = newToAddr[:-1] + # make message + fromAdd = panda_config.emailSender + message = \ +"""Subject: %s +From: %s +To: %s + +%s +""" % (mailSubject,fromAdd,toAddr,mailBody) + message = self.addTailer(message) + # send mail + _logger.debug("send to %s\n%s" % (toAddr,message)) + server = 
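# Illustrative sketch, not part of the original code: the POOLFILECATALOG XML that the code
# above builds with xml.dom.minidom for each finished job, reduced here to a single made-up
# output file with an adler32 checksum (the original strips the 'ad:'/'md5:' prefix first).
import xml.dom.minidom

def pool_file_catalog(guid, lfn, fsize, adler32):
    doc = xml.dom.minidom.getDOMImplementation().createDocument(None, 'xml', None)
    top = doc.createElement('POOLFILECATALOG')
    file_node = doc.createElement('File')
    file_node.setAttribute('ID', guid)
    logical = doc.createElement('logical')
    lfn_node = doc.createElement('lfn')
    lfn_node.setAttribute('name', lfn)
    logical.appendChild(lfn_node)
    file_node.appendChild(logical)
    for att_name, att_value in (('fsize', str(fsize)), ('adler32', adler32)):
        meta = doc.createElement('metadata')
        meta.setAttribute('att_name', att_name)
        meta.setAttribute('att_value', att_value)
        file_node.appendChild(meta)
    top.appendChild(file_node)
    return top.toxml()

print(pool_file_catalog('a1b2c3d4-guid', 'user.some.dataset._00001.log.tgz', 123456, '0f3c9a1b'))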
smtplib.SMTP(panda_config.emailSMTPsrv) + server.set_debuglevel(1) + server.ehlo() + server.starttls() + #server.login(panda_config.emailLogin,panda_config.emailPass) + out = server.sendmail(fromAdd,listToAddr,message) + _logger.debug(out) + server.quit() + retVal = True + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s %s" % (type,value)) + retVal = False + _logger.debug("end SEND session") + return retVal + + + # send update notification to user + def sendSiteAccessUpdate(self,toAddr,newStatus,pandaSite): + # subject + mailSubject = "PANDA Update on Access Request for %s" % pandaSite + # message + mailBody = "Hello,\n\nYour access request for %s has been %s \n" % (pandaSite,newStatus.upper()) + # send + retVal = self.send(toAddr,mailSubject,mailBody) + # return + return retVal + + + # send requests to cloud responsible + def sendSiteAccessRequest(self,toAddr,requestsMap,cloud): + # subject + mailSubject = "PANDA Access Requests in %s" % cloud + # message + mailBody = "Hello,\n\nThere are access requests to be approved or rejected.\n\n" + for pandaSite,userNames in requestsMap.iteritems(): + mailBody += " %s\n" % pandaSite + userStr = '' + for userName in userNames: + userStr += ' %s,' % userName + userStr = userStr[:-1] + mailBody += " %s\n\n" % userStr + # send + retVal = self.send(toAddr,mailSubject,mailBody) + # return + return retVal + + + # add tailer + def addTailer(self,msg): + msg += """ +Report Panda problems of any sort to + + the eGroup for help request + hn-atlas-dist-analysis-help@cern.ch + + the Savannah for software bug + https://savannah.cern.ch/projects/panda/ +""" + return msg + diff --git a/current/pandaserver/dataservice/Merger.py b/current/pandaserver/dataservice/Merger.py new file mode 100644 index 000000000..b8e1d60e5 --- /dev/null +++ b/current/pandaserver/dataservice/Merger.py @@ -0,0 +1,692 @@ +''' +merge files in dataset + +''' + +import re +import sys +import time +import commands + +import dq2.common +from dq2.clientapi import DQ2 +import dq2.container.exceptions + +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('Merger') + + +class Merger: + + # constructor + def __init__(self,taskBuffer,job,simulFlag=False,noSubmit=False): + self.taskBuffer = taskBuffer + self.job = job + self.mergeType = "" + self.mergeScript = "" + self.runDir = "." + self.mergeTypeMap = {} + self.supportedMergeType = ['hist','ntuple','pool','user','log','text'] + self.simulFlag = simulFlag + self.noSubmit = noSubmit + self.dsContMergeLog = "" + self.fileDestSeMap = {} + + + # parse jobParameters and get mergeType specified by the client + def getMergeType(self): + type = "" + try: + paramList = re.split('\W+',self.job.jobParameters.strip()) + type = paramList[ paramList.index('mergeType') + 1 ] + except: + _logger.debug("%s cannot find --mergeType parameter from parent job" % self.job.PandaID) + return type + + + # parse jobParameters and get mergeScript specified by the client + def getUserMergeScript(self): + script = "" + try: + match = re.search("--mergeScript\s(([^\'\"\s]+)|(\"[^\"]+\")|(\'[^\']+\'))",self.job.jobParameters) + if match != None: + script = match.group(1) + except: + _logger.debug("%s cannot find --mergeScript parameter from parent job" % self.job.PandaID) + return script + + # parse jobParameters and get rundir specified by the client + def getRunDir(self): + rundir = "." 
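# Illustrative sketch, not part of the original code: how getMergeType() and
# getUserMergeScript() above pull their values out of a job parameter string.
# The parameter string below is made up.
import re

def get_merge_type(job_parameters):
    # token after '--mergeType'; re.split('\W+') also strips the leading dashes
    tokens = re.split(r'\W+', job_parameters.strip())
    try:
        return tokens[tokens.index('mergeType') + 1]
    except (ValueError, IndexError):
        return ''

def get_merge_script(job_parameters):
    # bare word, or a single- or double-quoted string after '--mergeScript'
    m = re.search(r'--mergeScript\s(([^\'"\s]+)|("[^"]+")|(\'[^\']+\'))', job_parameters)
    return m.group(1) if m else ''

params = '--mergeOutput --mergeType ntuple --mergeScript "hadd out.root *.root" -r rundir/'
print(get_merge_type(params))     # -> 'ntuple'
print(get_merge_script(params))   # -> '"hadd out.root *.root"'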
+ try: + m = re.match(r'.*\-r\s+(\S+)\s+.*', self.job.jobParameters.strip()) + if m: + rundir = re.sub(r'[\'"]','',m.group(1)) + except: + _logger.debug("%s cannot find -r parameter from parent job" % self.job.PandaID) + return rundir + + # parse jobParameters and get ROOT version + def getRootVer(self): + ver = "" + try: + m = re.match(r'.*\--rootVer\s+(\S+)\s+.*', self.job.jobParameters.strip()) + if m: + ver = m.group(1) + except: + _logger.debug("%s cannot find --rootVer parameter from parent job" % self.job.PandaID) + return ver + + # get file type + def getFileType(self,tmpLFN): + tmpLFN = re.sub('\.\d+$','',tmpLFN) + tmpMatch = re.search('^(.+)\._\d+\.(.+)$',tmpLFN) + if tmpMatch != None: + return (tmpMatch.group(1),tmpMatch.group(2)) + return None + + + # parse jobSpec to get merge type automatically + def getMergeTypeAuto(self): + # look for outmap + try: + tmpMatch = re.search('-o \"([^\"]+)\"',self.job.jobParameters) + outMapStr = tmpMatch.group(1) + exec "outMap="+outMapStr + except: + errType,errValue = sys.exc_info()[:2] + _logger.debug("%s cannot extract outMap from jobParameters=%s %s:%s" % \ + (self.job.PandaID,self.job.jobParameters,errType,errValue)) + return False + # convert output type to merge type + if '/runGen-' in self.job.transformation: + # loop over all output files for runGen + for oldName,newName in outMap.iteritems(): + # get file type + tmpKey = self.getFileType(newName) + if tmpKey != None: + # check extension + if re.search('\.pool\.root(\.\d+)*$',newName) != None: + # POOL + tmpType = 'pool' + elif re.search('\.root(\.\d+)*$',newName) != None: + # map all root files to ntuple + tmpType = 'ntuple' + else: + # catch all using zip + tmpType = 'text' + # append + self.mergeTypeMap[tmpKey] = tmpType + else: + # hist + if outMap.has_key('hist'): + tmpType = 'hist' + tmpKey = self.getFileType(outMap['hist']) + if tmpKey != None: + # append + self.mergeTypeMap[tmpKey] = tmpType + # ntuple + if outMap.has_key('ntuple'): + tmpType = 'ntuple' + for sName,fName in outMap['ntuple']: + tmpKey = self.getFileType(fName) + if tmpKey != None: + # append + self.mergeTypeMap[tmpKey] = tmpType + # AANT + if outMap.has_key('AANT'): + # map AANT to ntuple for now + tmpType = 'ntuple' + for aName,sName,fName in outMap['AANT']: + tmpKey = self.getFileType(fName) + if tmpKey != None: + # append + self.mergeTypeMap[tmpKey] = tmpType + # THIST + if outMap.has_key('THIST'): + tmpType = 'ntuple' + for aName,fName in outMap['THIST']: + tmpKey = self.getFileType(fName) + if tmpKey != None: + # append only when the stream is not used by AANT + if not self.mergeTypeMap.has_key(tmpKey): + self.mergeTypeMap[tmpKey] = tmpType + # POOL + for tmpOutType,tmpOutVal in outMap.iteritems(): + # TAG is mapped to POOL for now + if tmpOutType in ['RDO','ESD','AOD','TAG','Stream1','Stream2']: + tmpType = 'pool' + tmpKey = self.getFileType(tmpOutVal) + if tmpKey != None: + # append + self.mergeTypeMap[tmpKey] = tmpType + # general POOL stream + if outMap.has_key('StreamG'): + tmpType = 'pool' + for sName,fName in outMap['StreamG']: + tmpKey = self.getFileType(fName) + if tmpKey != None: + # append + self.mergeTypeMap[tmpKey] = tmpType + # meta + if outMap.has_key('Meta'): + tmpType = 'pool' + for sName,fName in outMap['Meta']: + tmpKey = self.getFileType(fName) + if tmpKey != None: + # append only when the stream is not used by another + if not self.mergeTypeMap.has_key(tmpKey): + self.mergeTypeMap[tmpKey] = tmpType + # UserData + if outMap.has_key('UserData'): + tmpType = 'pool' + for fName in 
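# Illustrative sketch, not part of the original code: the LFN decomposition done by
# getFileType() above, which strips a trailing attempt number and then splits
# '<prefix>._NNNNN.<suffix>'. The LFNs below are made up.
import re

def file_type(lfn):
    lfn = re.sub(r'\.\d+$', '', lfn)                  # drop the attempt number, e.g. '.2'
    m = re.search(r'^(.+)\._\d+\.(.+)$', lfn)
    return (m.group(1), m.group(2)) if m else None

print(file_type('user.alice.mytask.NTUP._00007.root.2'))
# -> ('user.alice.mytask.NTUP', 'root')
print(file_type('no_serial_number.root'))             # -> None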
outMap['UserData']: + tmpKey = self.getFileType(fName) + if tmpKey != None: + # append + self.mergeTypeMap[tmpKey] = tmpType + # BS + if outMap.has_key('BS'): + # ByteStream is mapped to text to use zip for now + tmpType = 'text' + tmpKey = self.getFileType(outMap['BS']) + if tmpKey != None: + # append + self.mergeTypeMap[tmpKey] = tmpType + # extra outputs + if outMap.has_key('IROOT'): + for oldName,newName in outMap['IROOT']: + tmpKey = self.getFileType(newName) + if tmpKey != None: + # check extension + if re.search('\.pool\.root(\.\d+)*$',newName) != None: + # POOL + tmpType = 'pool' + elif re.search('\.root(\.\d+)*$',newName) != None: + # map all root files to ntuple + tmpType = 'ntuple' + else: + # catch all using zip + tmpType = 'text' + # append + self.mergeTypeMap[tmpKey] = tmpType + # dump + _logger.debug("%s automatic merge type mapping -> %s" % (self.job.PandaID,str(self.mergeTypeMap))) + return True + + + # detect merge type with LFN prefix and suffix + def detectMergeTypeWithLFN(self,filePrefix,fileSuffix): + tmpKey = (filePrefix,fileSuffix) + if self.mergeTypeMap.has_key(tmpKey): + return self.mergeTypeMap[tmpKey] + # look for matching fileSuffix mainly for --useContElement which has differed prefix + for tmpKey in self.mergeTypeMap.keys(): + tmpFilePrefix,tmpFileSuffix = tmpKey + if tmpFileSuffix == fileSuffix: + _logger.debug("%s updated merge type mapping for %s:%s -> %s" % (self.job.PandaID,filePrefix,fileSuffix,str(self.mergeTypeMap))) + self.mergeTypeMap[(filePrefix,fileSuffix)] = self.mergeTypeMap[tmpKey] + return self.mergeTypeMap[tmpKey] + raise RuntimeError,'cannot find merge type for %s %s' % (filePrefix,fileSuffix) + + + # main returns None for unrecoverable + def run(self): + try: + _logger.debug("%s start" % self.job.PandaID) + # check source label + if not self.job.prodSourceLabel in ['user',]: + _logger.debug("%s do nothing for non-user job" % self.job.PandaID) + _logger.debug("%s end" % self.job.PandaID) + return None + # check command-line parameter + if not self.simulFlag and not "--mergeOutput" in self.job.jobParameters: + _logger.debug("%s skip no-merge" % self.job.PandaID) + _logger.debug("%s end" % self.job.PandaID) + return None + # get mergeType from jobParams + self.mergeType = self.getMergeType() + self.mergeScript = self.getUserMergeScript() + + # if mergeScript is given by user, it's equivalent to user mode mergeType + if self.mergeScript: + self.mergeType = 'user' + + if self.mergeType != '': + # check if the merging type is given and is supported + if self.mergeType not in self.supportedMergeType: + _logger.error("%s skip not supported merging type \"%s\"" % (self.job.PandaID, self.mergeType)) + _logger.debug("%s end" % self.job.PandaID) + return None + elif self.mergeType in ['user']: + self.runDir = self.getRunDir() + if not self.mergeScript: + _logger.error("%s skip: no merging command specified for merging type \"%s\"" % (self.job.PandaID, self.mergeType)) + _logger.debug("%s end" % self.job.PandaID) + return None + else: + # automatic merge type detection + tmpRet = self.getMergeTypeAuto() + if not tmpRet: + _logger.error("%s failed to detect merge type automatically" % self.job.PandaID) + _logger.debug("%s end" % self.job.PandaID) + return None + # instantiate DQ2 + self.dq2api = DQ2.DQ2() + # get list of datasets + dsList = [] + dsSubDsMap = {} + for tmpFile in self.job.Files: + # use output/log + if not tmpFile.type in ['log','output']: + continue + tmpContName = tmpFile.dataset + # extend logfile container name with ".merge.log" for 
storing logs of the merging operation + if tmpFile.type == 'log' and not self.dsContMergeLog: + self.dsContMergeLog = re.sub('/$','.merge.log/',tmpFile.dataset) + tmpSubDsName = tmpFile.destinationDBlock + # remove _sub + tmpDsName = re.sub('_sub\d+$','',tmpSubDsName) + tmpKey = (tmpContName,tmpDsName) + if not tmpKey in dsList: + dsList.append(tmpKey) + dsSubDsMap[tmpDsName] = tmpSubDsName + # get type + tmpMatch = self.getFileType(tmpFile.lfn) + if tmpMatch != None: + self.fileDestSeMap[tmpMatch] = tmpFile.destinationSE + # loop over all datasets + mergeJobList = {} + for tmpContName,tmpDsName in dsList: + # check prefix + if (not tmpDsName.startswith('user')) and (not tmpDsName.startswith('group')): + _logger.debug("%s ignore non-user/group DS %s" % (self.job.PandaID,tmpDsName)) + continue + # get list of files + _logger.debug("%s listFilesInDataset %s" % (self.job.PandaID,tmpDsName)) + tmpAllFileMap = {} + nTry = 3 + for iTry in range(nTry): + try: + tmpRetTimeStamp = self.dq2api.listFilesInDataset(tmpDsName) + except DQ2.DQUnknownDatasetException: + _logger.error("%s DQ2 doesn't know %s" % (self.job.PandaID,tmpDsName)) + _logger.debug("%s end" % self.job.PandaID) + return None + except: + if (iTry+1) == nTry: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s DQ2 failed with %s:%s to get file list for %s" % (self.job.PandaID,errType,errValue,tmpDsName)) + _logger.debug("%s end" % self.job.PandaID) + return False + # sleep + time.sleep(60) + # empty + if tmpRetTimeStamp == (): + # close dataset + varMap = {} + varMap[':name'] = tmpDsName + varMap[':status'] = 'tobeclosed' + uSQL = "UPDATE /*+ INDEX(tab DATASETS_NAME_IDX)*/ ATLAS_PANDA.Datasets " + uSQL += "SET status=:status,modificationdate=CURRENT_DATE WHERE name=:name " + self.taskBuffer.querySQLS(uSQL,varMap) + _logger.debug("%s %s is empty" % (self.job.PandaID,tmpDsName)) + continue + # loop over all GUIDs + tmpRet,tmpTimeStamp = tmpRetTimeStamp + for tmpGUID,tmpVal in tmpRet.iteritems(): + # set GUID + tmpVal['guid'] = tmpGUID + # get type + tmpMatch = self.getFileType(tmpVal['lfn']) + if tmpMatch == None: + _logger.error("%s cannot get type for %s" % (self.job.PandaID,tmpVal['lfn'])) + _logger.debug("%s end" % self.job.PandaID) + return None + tmpType = (tmpMatch[0],tmpMatch[1],tmpContName,tmpDsName) + # append + if not tmpAllFileMap.has_key(tmpType): + tmpAllFileMap[tmpType] = {} + tmpAllFileMap[tmpType][tmpVal['lfn']] = tmpVal + # max size of merged file + maxMergedFileSize = 5 * 1024 * 1024 * 1024 + # max number of files to be merged + maxNumToBeMerged = 200 + # loop over all types + for tmpType,tmpFileMap in tmpAllFileMap.iteritems(): + # sort LFNs + tmpFileList = tmpFileMap.keys() + tmpFileList.sort() + # split by size + subTotalSize = 0 + subFileList = [] + for tmpFileName in tmpFileList: + if (subTotalSize+tmpFileMap[tmpFileName]['filesize'] > maxMergedFileSize and subFileList != []) \ + or len(subFileList) >= maxNumToBeMerged: + # instantiate job + tmpMergeJob = self.makeMergeJob(subFileList,tmpFileMap,tmpType) + # append + if not mergeJobList.has_key(tmpDsName): + mergeJobList[tmpDsName] = [] + mergeJobList[tmpDsName].append(tmpMergeJob) + # reset + subTotalSize = 0 + subFileList = [] + # append + subTotalSize += tmpFileMap[tmpFileName]['filesize'] + subFileList.append(tmpFileName) + # remaining + if subFileList != []: + # instantiate job + tmpMergeJob = self.makeMergeJob(subFileList,tmpFileMap,tmpType) + # append + if not mergeJobList.has_key(tmpDsName): + mergeJobList[tmpDsName] = [] + 
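# --- Editor's sketch (not part of the patch): the grouping above is a greedy split of
# --- the sorted LFNs into merge jobs bounded by total size and file count; the limits
# --- mirror maxMergedFileSize / maxNumToBeMerged, and the sizes dict is hypothetical.
def split_for_merge(lfns, sizes, max_size=5 * 1024 ** 3, max_files=200):
    chunks, chunk, total = [], [], 0
    for lfn in sorted(lfns):
        if chunk and (total + sizes[lfn] > max_size or len(chunk) >= max_files):
            chunks.append(chunk)            # close the current merge job
            chunk, total = [], 0
        chunk.append(lfn)
        total += sizes[lfn]
    if chunk:
        chunks.append(chunk)                # remaining files
    return chunks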
mergeJobList[tmpDsName].append(tmpMergeJob) + # terminate simulation + if self.simulFlag and not self.noSubmit: + _logger.debug("%s end simulation" % self.job.PandaID) + return True + # get list of new datasets + newDatasetMap = {} + for tmpDsName,tmpJobList in mergeJobList.iteritems(): + # loop over all files + for tmpFile in tmpJobList[0].Files: + # ignore inputs + if not tmpFile.type in ['output','log']: + continue + # append + if not newDatasetMap.has_key(tmpFile.dataset): + newDatasetMap[tmpFile.dataset] = [] + if not tmpFile.destinationDBlock in newDatasetMap[tmpFile.dataset]: + newDatasetMap[tmpFile.dataset].append(tmpFile.destinationDBlock) + # remove /CN=proxy and /CN=limited from DN + tmpRealDN = self.job.prodUserID + tmpRealDN = re.sub('/CN=limited proxy','',tmpRealDN) + tmpRealDN = re.sub('/CN=proxy','',tmpRealDN) + tmpRealDN = dq2.common.parse_dn(tmpRealDN) + # register container for merge log files + if self.dsContMergeLog: + # register new container for the logs of merging operation + _logger.debug("%s registerContainer %s" % (self.job.PandaID, self.dsContMergeLog)) + nTry = 3 + unRecoverable = False + for iTry in range(nTry): + try: + self.dq2api.registerContainer(self.dsContMergeLog) + break + except DQ2.DQDatasetExistsException: + break + except: + errType,errValue = sys.exc_info()[:2] + if 'exceeds the maximum length' in str(errValue): + unRecoverable = True + if unRecoverable or (iTry+1) == nTry: + _logger.error("%s DQ2 failed with %s:%s to register new container %s" % (self.job.PandaID,errType,errValue,self.dsContMergeLog)) + _logger.debug("%s end" % self.job.PandaID) + if unRecoverable: + return None + return False + # sleep + time.sleep(60) + # set container owner + _logger.debug("%s setMetaDataAttribute %s %s" % (self.job.PandaID, self.dsContMergeLog, tmpRealDN)) + nTry = 3 + for iTry in range(nTry): + try: + self.dq2api.setMetaDataAttribute(self.dsContMergeLog, 'owner', tmpRealDN) + except: + if (iTry+1) == nTry: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s DQ2 failed with %s:%s to set owner for %s" % (self.job.PandaID,errType,errValue,self.dsContMergeLog)) + _logger.debug("%s end" % self.job.PandaID) + return False + # sleep + time.sleep(60) + # register datasets + for tmpDsContainer,tmpNewDatasets in newDatasetMap.iteritems(): + # loop over all datasets + for tmpNewDS in tmpNewDatasets: + # register + _logger.debug("%s registerNewDataset %s" % (self.job.PandaID,tmpNewDS)) + nTry = 3 + for iTry in range(nTry): + try: + self.dq2api.registerNewDataset(tmpNewDS) + except DQ2.DQDatasetExistsException: + pass + except: + if (iTry+1) == nTry: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s DQ2 failed with %s:%s to register %s" % (self.job.PandaID,errType,errValue,tmpNewDS)) + _logger.debug("%s end" % self.job.PandaID) + return False + # sleep + time.sleep(60) + # set owner + _logger.debug("%s setMetaDataAttribute %s %s" % (self.job.PandaID,tmpNewDS,tmpRealDN)) + nTry = 3 + for iTry in range(nTry): + try: + self.dq2api.setMetaDataAttribute(tmpNewDS,'owner',tmpRealDN) + except: + if (iTry+1) == nTry: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s DQ2 failed with %s:%s to set owner for %s" % (self.job.PandaID,errType,errValue,tmpNewDS)) + _logger.debug("%s end" % self.job.PandaID) + return False + # sleep + time.sleep(60) + # add to container + if tmpDsContainer.endswith('/'): + # add + _logger.debug("%s registerDatasetsInContainer %s %s" % (self.job.PandaID,tmpDsContainer,str(tmpNewDatasets))) + nTry = 3 + for iTry in range(nTry): + 
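# --- Editor's sketch (not part of the patch): the DQ2 registration calls above all
# --- follow one pattern -- up to nTry attempts, a fixed sleep between failures, and
# --- "already exists" treated as success. retry() and the benign tuple are assumptions.
import time

def retry(call, *args, n_try=3, sleep=60, benign=()):
    for attempt in range(n_try):
        try:
            return call(*args)
        except benign:                      # e.g. dataset or container already exists
            return None
        except Exception:
            if attempt + 1 == n_try:
                raise
            time.sleep(sleep)

# hypothetical usage: retry(dq2api.registerNewDataset, dsName,
#                           benign=(DQ2.DQDatasetExistsException,))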
try: + self.dq2api.registerDatasetsInContainer(tmpDsContainer,tmpNewDatasets) + break + except dq2.container.exceptions.DQContainerAlreadyHasDataset: + break + except: + if (iTry+1) == nTry: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s DQ2 failed with %s:%s to add datasets to %s" % (self.job.PandaID,errType,errValue,tmpDsContainer)) + _logger.debug("%s end" % self.job.PandaID) + return False + # sleep + time.sleep(60) + # no submission + if self.noSubmit: + _logger.debug("%s end with no submission" % self.job.PandaID) + return True + # submit new jobs + _logger.debug("%s submit jobs" % self.job.PandaID) + # fake FQANs + fqans = [] + if not self.job.countryGroup in ['','NULL',None]: + fqans.append('/atlas/%s/Role=NULL' % self.job.countryGroup) + if self.job.destinationDBlock.startswith('group') and not self.job.workingGroup in ['','NULL',None]: + fqans.append('/atlas/%s/Role=production' % self.job.workingGroup) + # insert jobs + for tmpDsName,tmpJobList in mergeJobList.iteritems(): + ret = self.taskBuffer.storeJobs(tmpJobList,self.job.prodUserID,True,False,fqans, + self.job.creationHost,True,checkSpecialHandling=False) + if ret == []: + _logger.error("%s storeJobs failed with [] for %s" % (self.job.PandaID,tmpDsName)) + _logger.debug("%s end" % self.job.PandaID) + return False + else: + # set jobDefID + tmpJobDefID = ret[0][1] + if not tmpJobDefID in ['NULL','',None,-1]: + varMap = {} + varMap[':name'] = dsSubDsMap[tmpDsName] + varMap[':moverID'] = tmpJobDefID + uSQL = "UPDATE /*+ INDEX(tab DATASETS_NAME_IDX)*/ ATLAS_PANDA.Datasets " + uSQL += "SET moverID=:moverID WHERE name=:name " + self.taskBuffer.querySQLS(uSQL,varMap) + # dump + strPandaIDs = '' + for tmpItem in ret: + strPandaIDs += '%s,' % tmpItem[0] + _logger.debug("%s jobDefID=%s mergeJobs=%s" % (self.job.PandaID,tmpJobDefID,strPandaIDs[:-1])) + # return + _logger.debug("%s end" % self.job.PandaID) + return True + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s failed with %s:%s" % (self.job.PandaID,errType,errValue)) + _logger.debug("%s end" % self.job.PandaID) + return None + + + # make merge job + def makeMergeJob(self,fileList,fileMap,fileType): + # make job spec + tmpJob = JobSpec() + # set release and cache + if not self.job.AtlasRelease in ['','NULL',None]: + tmpJob.AtlasRelease = self.job.AtlasRelease + if not self.job.homepackage in ['','NULL',None]: + tmpJob.homepackage = self.job.homepackage + tmpJob.prodSourceLabel = 'user' + tmpJob.prodUserID = self.job.prodUserID + tmpJob.assignedPriority = 5000 + tmpJob.jobName = 'usermerge.%s' % commands.getoutput('uuidgen') + tmpJob.computingSite = self.job.computingSite + tmpJob.metadata = self.job.metadata + tmpJob.prodDBlock = self.job.prodDBlock + tmpJob.destinationDBlock = self.job.destinationDBlock + tmpJob.destinationSE = self.job.destinationSE + tmpJob.cloud = self.job.cloud + tmpJob.cmtConfig = self.job.cmtConfig + tmpJob.lockedby = self.job.lockedby + tmpJob.processingType = 'usermerge' + tmpJob.jobsetID = self.job.jobsetID + tmpJob.jobDefinitionID = 0 + tmpJob.transformation = "http://pandaserver.cern.ch:25080/trf/user/runMerge-00-00-01" + # decompose fileType + filePrefix,fileSuffix,containerName,datasetName = fileType + fileTypeKey = (filePrefix,fileSuffix) + # output dataset name + outDsName = datasetName+'.merge' + # job parameter + params = '--parentDS %s --parentContainer %s --outDS %s' % (datasetName,containerName,outDsName) + # look for lib.tgz + for tmpLibFile in self.job.Files: + if tmpLibFile.type == 'input' and 
tmpLibFile.lfn.endswith('.lib.tgz'): + tmpFile = FileSpec() + tmpFile.lfn = tmpLibFile.lfn + tmpFile.GUID = tmpLibFile.GUID + tmpFile.fsize = tmpLibFile.fsize + tmpFile.md5sum = tmpLibFile.md5sum + tmpFile.checksum = tmpLibFile.checksum + tmpFile.dataset = tmpLibFile.dataset + tmpFile.prodDBlock = tmpLibFile.prodDBlock + tmpFile.type = 'input' + tmpFile.status = 'ready' + tmpFile.prodDBlockToken = 'local' + tmpJob.addFile(tmpFile) + params += " --libTgz %s" % tmpFile.lfn + break + # reverse sort to use the largest SN in merged LFN, which is required to find SN offset when outDS is reused + fileList.reverse() + # input + serNum = None + attNum = None + for tmpFileName in fileList: + # extract serial number + if serNum == None: + tmpMatch = re.search('^'+filePrefix+'\.(_\d+)\.'+fileSuffix,tmpFileName) + if tmpMatch == None: + raise RuntimeError,'cannot extract SN from %s' % tmpFileName + serNum = tmpMatch.group(1) + # extract attempt number + tmpMatch = re.search('\.(\d+)$',tmpFileName) + if tmpMatch != None: + attNum = tmpMatch.group(1) + # make file spec + tmpFile = FileSpec() + vals = fileMap[tmpFileName] + tmpFile.lfn = tmpFileName + tmpFile.GUID = vals['guid'] + tmpFile.fsize = vals['filesize'] + tmpFile.md5sum = vals['checksum'] + tmpFile.checksum = vals['checksum'] + tmpFile.dataset = containerName + tmpFile.prodDBlock = tmpFile.dataset + tmpFile.type = 'input' + tmpFile.status = 'ready' + tmpFile.prodDBlockToken = 'local' + tmpJob.addFile(tmpFile) + + # merge type determination + if fileSuffix.endswith('log.tgz'): + # log + usedMergeType = 'log' + elif self.mergeType != '': + # user specified merging type + usedMergeType = self.mergeType + else: + # auto detection + usedMergeType = self.detectMergeTypeWithLFN(filePrefix,fileSuffix) + + if usedMergeType in ['user']: + ## run user mode merging given the merging script + params += ' -j %s -r %s' % (self.mergeScript, self.runDir) + + params += " -t %s" % usedMergeType + params += " -i \"%s\"" % repr(fileList) + + if self.getRootVer(): + params += " --rootVer %s" % self.getRootVer() + + if self.job.jobParameters.find('--useRootCore') >= 0: + params += " --useRootCore" + + # output + tmpFile = FileSpec() + if attNum == None: + tmpFile.lfn = "%s.%s.merge.%s" % (filePrefix,serNum,fileSuffix) + else: + tmpFile.lfn = "%s.%s.%s.merge.%s" % (filePrefix,serNum,attNum,fileSuffix) + + if usedMergeType == 'text' and \ + not tmpFile.lfn.endswith('.tgz') and \ + not tmpFile.lfn.endswith('.tar.gz'): + tmpFile.lfn += '.tgz' + tmpFile.destinationDBlock = outDsName + if self.fileDestSeMap.has_key(fileTypeKey): + tmpFile.destinationSE = self.fileDestSeMap[fileTypeKey] + else: + tmpFile.destinationSE = self.job.destinationSE + tmpFile.dataset = containerName + tmpFile.type = 'output' + tmpJob.addFile(tmpFile) + params += ' -o "%s"' % tmpFile.lfn + # log + tmpItems = filePrefix.split('.') + if len(tmpItems) > 3: + logPrefix = "%s.%s.%s" % tuple(tmpItems[:3]) + else: + logPrefix = filePrefix + tmpFile = FileSpec() + tmpFile.lfn = '%s._$PANDAID.log.tgz' % logPrefix + tmpFile.destinationDBlock = outDsName + ".log" + tmpFile.destinationSE = tmpJob.computingSite + tmpFile.dataset = self.dsContMergeLog + tmpFile.type = 'log' + tmpJob.addFile(tmpFile) + # set job parameter + tmpJob.jobParameters = params + if self.simulFlag: + _logger.debug("%s prams %s" % (self.job.PandaID,tmpJob.jobParameters)) + # return + return tmpJob diff --git a/current/pandaserver/dataservice/Notifier.py b/current/pandaserver/dataservice/Notifier.py new file mode 100755 index 
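# --- Editor's sketch (not part of the patch): makeMergeJob() keeps the largest serial
# --- number among its inputs and inserts ".merge." before the suffix; LFNs are invented.
import re

def merged_lfn(input_lfns, prefix, suffix):
    first = sorted(input_lfns, reverse=True)[0]   # inputs are reverse-sorted, largest SN first
    sn = re.search('^' + re.escape(prefix) + r'\.(_\d+)\.' + re.escape(suffix), first).group(1)
    return "%s.%s.merge.%s" % (prefix, sn, suffix)

# merged_lfn(["user.bob.EXT0._00003.NTUP.root", "user.bob.EXT0._00017.NTUP.root"],
#            "user.bob.EXT0", "NTUP.root") -> "user.bob.EXT0._00017.merge.NTUP.root"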
000000000..44aa7cdcf --- /dev/null +++ b/current/pandaserver/dataservice/Notifier.py @@ -0,0 +1,396 @@ +''' +notifier + +''' + +import re +import sys +import fcntl +import commands +import threading +import urllib +import shelve +import smtplib +import datetime +import time + +from config import panda_config +from taskbuffer.OraDBProxy import DBProxy +from pandalogger.PandaLogger import PandaLogger +from dataservice.DDM import dq2Info +import taskbuffer.ErrorCode + +# logger +_logger = PandaLogger().getLogger('Notifier') + +# lock file +_lockGetMail = open(panda_config.lockfile_getMail, 'w') + +# ignored DN +_ignoreList = [ + 'Nurcan Ozturk', + 'Xin Zhao', + 'Dietrich Liko', + ] + +# NG words in email address +_ngWordsInMailAddr = ['support','system','stuff','service','secretariat','club','user','admin', + 'cvs','grid','librarian','svn','atlas','cms','lhcb','alice','alaelp'] + +# port for SMTP server +smtpPortList = [25,587] + +def initLogger(pLogger): + # redirect logging to parent as it doesn't work in nested threads + global _logger + _logger = pLogger + + +# wrapper to patch smtplib.stderr to send debug info to logger +class StderrLogger(object): + def __init__(self,token): + self.token = token + def write(self,message): + message = message.strip() + if message != '': + _logger.debug('%s %s' % (self.token,message)) + + +class Notifier: + # constructor + def __init__(self,taskBuffer,job,datasets,summary={},mailFile=None,mailFileName=''): + self.job = job + self.datasets = datasets + self.taskBuffer = taskBuffer + self.summary = summary + self.mailFile = mailFile + self.mailFileName = mailFileName + + # main + def run(self): + if self.mailFile == None: + _logger.debug("%s start" % self.job.PandaID) + try: + # check job type + if self.job.prodSourceLabel != 'user' and self.job.prodSourceLabel != 'panda': + _logger.error("Invalid job type : %s" % self.job.prodSourceLabel) + _logger.debug("%s end" % self.job.PandaID) + return + # ignore some DNs to avoid mail storm + for igName in _ignoreList: + if re.search(igName,self.job.prodUserID) != None: + _logger.debug("Ignore DN : %s" % self.job.prodUserID) + _logger.debug("%s end" % self.job.PandaID) + return + # get e-mail address + mailAddr = self.getEmail(self.job.prodUserID) + if mailAddr == '': + _logger.error("could not find email address for %s" % self.job.prodUserID) + _logger.debug("%s end" % self.job.PandaID) + return + # not send + if mailAddr in ['notsend','',None]: + _logger.debug("not send to %s" % self.job.prodUserID) + _logger.debug("%s end" % self.job.PandaID) + return + # use all datasets + if self.summary != {}: + self.datasets = [] + for tmpJobID,tmpDsList in self.summary.iteritems(): + if tmpDsList == []: + continue + self.datasets += tmpDsList + # get full jobSpec including metadata + self.job = self.taskBuffer.peekJobs([self.job.PandaID],fromDefined=False, + fromActive=False,fromWaiting=False)[0] + if self.job == None: + _logger.error('%s : not found in DB' % self.job.PandaID) + _logger.debug("%s end" % self.job.PandaID) + return + # get IDs + ids = [] + # from active tables + tmpIDs = self.taskBuffer.queryPandaIDwithDataset(self.datasets) + for tmpID in tmpIDs: + if not tmpID in ids: + ids.append(tmpID) + # from archived table + if self.job.jobsetID in [0,'NULL',None]: + tmpIDs = self.taskBuffer.getPandIDsWithIdInArch(self.job.prodUserName,self.job.jobDefinitionID,False) + else: + tmpIDs = self.taskBuffer.getPandIDsWithIdInArch(self.job.prodUserName,self.job.jobsetID,True) + for tmpID in tmpIDs: + if not tmpID in ids: + 
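# --- Editor's sketch (not part of the patch): StderrLogger above is a minimal
# --- file-like shim -- anything exposing write() can stand in for a stream so that a
# --- library's debug chatter ends up in the application logger. Names are illustrative.
import logging

class LoggerWriter(object):
    def __init__(self, logger, token):
        self.logger = logger
        self.token = token
    def write(self, message):
        message = message.strip()
        if message:
            self.logger.debug('%s %s', self.token, message)

# hypothetical usage: debug_stream = LoggerWriter(logging.getLogger('Notifier'), panda_id)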
ids.append(tmpID) + _logger.debug("%s IDs: %s" % (self.job.PandaID,ids)) + if len(ids) != 0: + # get jobs + jobs = self.taskBuffer.getFullJobStatus(ids,fromDefined=False,fromActive=False, + fromWaiting=False,forAnal=False) + # statistics + nTotal = 0 + nSucceeded = 0 + nFailed = 0 + nPartial = 0 + nCancel = 0 + # time info + creationTime = self.job.creationTime + endTime = self.job.modificationTime + if isinstance(endTime,datetime.datetime): + endTime = endTime.strftime('%Y-%m-%d %H:%M:%S') + # datasets + iDSList = [] + oDSList = [] + siteMap = {} + logDS = None + for tmpJob in jobs: + if not siteMap.has_key(tmpJob.jobDefinitionID): + siteMap[tmpJob.jobDefinitionID] = tmpJob.computingSite + for file in tmpJob.Files: + if file.type == 'input': + if not file.dataset in iDSList: + iDSList.append(file.dataset) + else: + if not file.dataset in oDSList: + oDSList.append(file.dataset) + if file.type == 'log': + logDS = file.dataset + # job/jobset IDs and site + if self.summary == {}: + jobIDsite = "%s/%s" % (self.job.jobDefinitionID,self.job.computingSite) + jobsetID = self.job.jobDefinitionID + jobDefIDList = [self.job.jobDefinitionID] + else: + jobDefIDList = self.summary.keys() + jobDefIDList.sort() + jobIDsite = '' + tmpIndent = " " + for tmpJobID in jobDefIDList: + jobIDsite += '%s/%s\n%s' % (tmpJobID,siteMap[tmpJobID],tmpIndent) + remCount = len(tmpIndent) + 1 + jobIDsite = jobIDsite[:-remCount] + jobsetID = self.job.jobsetID + # count + for job in jobs: + if job == None: + continue + # ignore pilot-retried job + if job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]: + continue + # total + nTotal += 1 + # count per job status + if job.jobStatus == 'finished': + # check all files were used + allUses = True + for file in job.Files: + if file.type == 'input' and file.status in ['skipped']: + allUses = False + break + if allUses: + nSucceeded += 1 + else: + nPartial += 1 + elif job.jobStatus == 'failed': + nFailed += 1 + elif job.jobStatus == 'cancelled': + nCancel += 1 + # make message + if nSucceeded == nTotal: + finalStatInSub = "(All Succeeded)" + else: + finalStatInSub = "(%s/%s Succeeded)" % (nSucceeded,nTotal) + fromadd = panda_config.emailSender + if self.job.jobsetID in [0,'NULL',None]: + message = \ +"""Subject: PANDA notification for JobID : %s %s +From: %s +To: %s + +Summary of JobID : %s + +Site : %s""" % (self.job.jobDefinitionID,finalStatInSub,fromadd,mailAddr,self.job.jobDefinitionID,self.job.computingSite) + else: + message = \ +"""Subject: PANDA notification for JobsetID : %s %s +From: %s +To: %s + +Summary of JobsetID : %s + +JobID/Site : %s""" % (jobsetID,finalStatInSub,fromadd,mailAddr,jobsetID,jobIDsite) + message += \ +""" + +Created : %s (UTC) +Ended : %s (UTC) + +Total Number of Jobs : %s + Succeeded : %s + Partial : %s + Failed : %s + Cancelled : %s +""" % (creationTime,endTime,nTotal,nSucceeded,nPartial,nFailed,nCancel) + # input datasets + for iDS in iDSList: + message += \ +""" +In : %s""" % iDS + # output datasets + for oDS in oDSList: + message += \ +""" +Out : %s""" % oDS + # command + if not self.job.metadata in ['','NULL',None]: + message += \ +""" + +Parameters : %s""" % self.job.metadata + # URLs to PandaMon + if self.job.jobsetID in [0,'NULL',None]: + for tmpIdx,tmpJobID in enumerate(jobDefIDList): + urlData = {} + urlData['job'] = '*' + urlData['jobDefinitionID'] = tmpJobID + urlData['user'] = self.job.prodUserName + urlData['at'] = (str(creationTime)).split()[0] + if tmpIdx == 0: + message += \ +""" + +PandaMonURL : 
http://panda.cern.ch/server/pandamon/query?%s""" % urllib.urlencode(urlData) + else: + message += \ +""" + http://panda.cern.ch/server/pandamon/query?%s""" % urllib.urlencode(urlData) + else: + urlData = {} + urlData['job'] = '*' + urlData['jobsetID'] = self.job.jobsetID + urlData['user'] = self.job.prodUserName + urlData['at'] = (str(creationTime)).split()[0] + message += \ +""" + +PandaMonURL : http://panda.cern.ch/server/pandamon/query?%s""" % urllib.urlencode(urlData) + if logDS != None: + message += \ +""" +TaskMonitorURL : https://dashb-atlas-task.cern.ch/templates/task-analysis/#task=%s""" % logDS + + # tailer + message += \ +""" + + +Report Panda problems of any sort to + + the eGroup for help request + hn-atlas-dist-analysis-help@cern.ch + + the Savannah for software bug + https://savannah.cern.ch/projects/panda/ +""" + + # send mail + self.sendMail(self.job.PandaID,fromadd,mailAddr,message,1,True) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s %s %s" % (self.job.PandaID,errType,errValue)) + _logger.debug("%s end" % self.job.PandaID) + else: + try: + _logger.debug("start recovery for %s" % self.mailFileName) + # read from file + pandaID = self.mailFile.readline()[:-1] + fromadd = self.mailFile.readline()[:-1] + mailAddr = self.mailFile.readline()[:-1] + message = self.mailFile.read() + _logger.debug("%s start recovery" % pandaID) + if message != '': + self.sendMail(pandaID,fromadd,mailAddr,message,5,False) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s %s %s" % (self.mailFileName,errType,errValue)) + _logger.debug("end recovery for %s" % self.mailFileName) + + + # send mail + def sendMail(self,pandaID,fromadd,mailAddr,message,nTry,fileBackUp): + _logger.debug("%s send to %s\n%s" % (pandaID,mailAddr,message)) + for iTry in range(nTry): + try: + org_smtpstderr = smtplib.stderr + smtplib.stderr = StderrLogger(pandaID) + smtpPort = smtpPortList[iTry % len(smtpPortList)] + server = smtplib.SMTP(panda_config.emailSMTPsrv,smtpPort) + server.set_debuglevel(1) + server.ehlo() + server.starttls() + #server.login(panda_config.emailLogin,panda_config.emailPass) + out = server.sendmail(fromadd,mailAddr,message) + _logger.debug('%s %s' % (pandaID,str(out))) + server.quit() + break + except: + errType,errValue = sys.exc_info()[:2] + if iTry+1 < nTry: + # sleep for retry + _logger.debug("%s sleep %s due to %s %s" % (pandaID,iTry,errType,errValue)) + time.sleep(30) + else: + _logger.error("%s %s %s" % (pandaID,errType,errValue)) + if fileBackUp: + # write to file which is processed in add.py + mailFile = '%s/mail_%s_%s' % (panda_config.logdir,self.job.PandaID,commands.getoutput('uuidgen')) + oMail = open(mailFile,"w") + oMail.write(str(self.job.PandaID)+'\n'+fromadd+'\n'+mailAddr+'\n'+message) + oMail.close() + try: + smtplib.stderr = org_smtpstderr + except: + pass + + + + # get email + def getEmail(self,dn): + # get DN + _logger.debug("getDN for %s" % dn) + dbProxy = DBProxy() + distinguishedName = dbProxy.cleanUserID(dn) + _logger.debug("DN = %s" % distinguishedName) + if distinguishedName == "": + _logger.error("cannot get DN for %s" % dn) + return "" + # get email from MetaDB + mailAddr = self.taskBuffer.getEmailAddr(distinguishedName) + if mailAddr == 'notsend': + _logger.debug("email from MetaDB : '%s'" % mailAddr) + return mailAddr + # get email from DQ2 + realDN = re.sub('/CN=limited proxy','',dn) + realDN = re.sub('(/CN=proxy)+','',realDN) + try: + _logger.debug("dq2Info.finger(%s)" % realDN) + for iDDMTry in range(3): + status,out = 
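# --- Editor's sketch (not part of the patch): sendMail() above retries over the
# --- configured SMTP ports with STARTTLS; host and addresses here are placeholders.
import smtplib
import time

def send_with_retry(host, ports, from_addr, to_addr, message, n_try=2, sleep=30):
    for attempt in range(n_try):
        try:
            server = smtplib.SMTP(host, ports[attempt % len(ports)])
            try:
                server.ehlo()
                server.starttls()
                server.sendmail(from_addr, to_addr, message)
            finally:
                server.quit()
            return True
        except Exception:
            if attempt + 1 == n_try:
                return False
            time.sleep(sleep)

# send_with_retry('smtp.example.org', [25, 587], 'panda@example.org',
#                 'someone@example.org', 'Subject: test\n\nhello')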
dq2Info.finger(realDN) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(10) + else: + break + _logger.debug(out) + exec "userInfo=%s" % out + mailAddr = userInfo['email'] + _logger.debug("email from DQ2 : '%s'" % mailAddr) + return mailAddr + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s %s" % (errType,errValue)) + return "" + + + diff --git a/current/pandaserver/dataservice/ProcessLimiter.py b/current/pandaserver/dataservice/ProcessLimiter.py new file mode 100644 index 000000000..580fe9c39 --- /dev/null +++ b/current/pandaserver/dataservice/ProcessLimiter.py @@ -0,0 +1,54 @@ +import datetime +import commands +import threading + +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('ProcessLimiter') + + +# limit the number of processes +class ProcessLimiter: + # constructor + def __init__(self,maxProcess=3): + self.processLock = threading.Semaphore(maxProcess) + self.dataLock = threading.Lock() + self.summary = {'nQueued':0,'nRunning':0} + + + # update summary + def updateSummary(self,dataName,change): + # lock + self.dataLock.acquire() + # update + if self.summary.has_key(dataName): + self.summary[dataName] += change + # release + self.dataLock.release() + _logger.debug('Summary : %s' % str(self.summary)) + + + # execute command + def getstatusoutput(self,commandStr): + # time stamp + timestamp = datetime.datetime.utcnow().isoformat(' ') + _logger.debug('%s start for "%s"' % (timestamp,commandStr)) + self.updateSummary('nQueued',1) + _logger.debug('%s getting lock' % timestamp) + # get semaphore + self.processLock.acquire() + _logger.debug('%s got lock' % timestamp) + # execute + self.updateSummary('nRunning',1) + status,output = commands.getstatusoutput(commandStr) + _logger.debug('%s executed' % timestamp) + self.updateSummary('nRunning',-1) + # release queue + self.processLock.release() + _logger.debug('%s end' % timestamp) + self.updateSummary('nQueued',-1) + # return + return status,output + + diff --git a/current/pandaserver/dataservice/RetryMaker.py b/current/pandaserver/dataservice/RetryMaker.py new file mode 100755 index 000000000..e6b69a6ce --- /dev/null +++ b/current/pandaserver/dataservice/RetryMaker.py @@ -0,0 +1,125 @@ +''' +notifier + +''' + +import re +import sys +import commands +import urllib +import datetime +import time + +from config import panda_config +from userinterface import ReBroker +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('RetryMaker') + + +def initLogger(pLogger): + # redirect logging to parent as it doesn't work in nested threads + global _logger + _logger = pLogger + ReBroker.initLogger(_logger) + + +class RetryMaker: + # constructor + def __init__(self,taskBuffer,job): + self.job = job + self.taskBuffer = taskBuffer + + # main + def run(self): + _logger.debug("%s start" % self.job.PandaID) + try: + # check the number of server retry + nRetry = self.job.specialHandling.split(',').count('sretry') + _logger.debug("%s nRetry=%s" % (self.job.PandaID,nRetry)) + # too many reattempts + maxRetry = 2 + if nRetry >= maxRetry: + _logger.debug("%s end : too many reattempts %s>=%s" % (self.job.PandaID,nRetry,maxRetry)) + return True + # get all job status in Active + idStatus,buildID = self.taskBuffer.getPandIDsWithJobID(self.job.prodUserName, + self.job.jobDefinitionID, + {},0) + # count # of failed in 
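# --- Editor's sketch (not part of the patch): ProcessLimiter above serialises shell
# --- commands behind a counting semaphore so that at most maxProcess run concurrently;
# --- this modernised variant uses subprocess instead of the commands module.
import subprocess
import threading

class CommandLimiter(object):
    def __init__(self, max_process=3):
        self._sem = threading.Semaphore(max_process)
    def getstatusoutput(self, command):
        with self._sem:                     # blocks while max_process commands are running
            proc = subprocess.run(command, shell=True, capture_output=True, text=True)
            return proc.returncode, proc.stdout + proc.stderr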
active + nFailed = 0 + for tmpID,tmpVar in idStatus.iteritems(): + # ignore buildJob + if tmpID == buildID: + continue + # count + tmpStatus,tmpCommand = tmpVar + if tmpStatus == 'failed': + nFailed += 1 + elif tmpStatus == 'cancelled' or tmpCommand == 'tobekilled': + # killed + _logger.debug("%s end : cancelled" % self.job.PandaID) + return True + _logger.debug("%s : nFailed=%s in Active" % (self.job.PandaID,nFailed)) + # no failed + if nFailed == 0: + _logger.debug("%s end : no failed jobs" % self.job.PandaID) + return True + # get all job status including Archived + idStatus,buildID = self.taskBuffer.getPandIDsWithJobIDLog(self.job.prodUserName, + self.job.jobDefinitionID, + idStatus,0,buildID) + # count # of failed and others in archived + nFailed = 0 + nOthers = 0 + for tmpID,tmpVar in idStatus.iteritems(): + # ignore buildJob + if tmpID == buildID: + continue + # count + tmpStatus,tmpCommand = tmpVar + if tmpStatus == 'failed': + nFailed += 1 + elif tmpStatus == 'cancelled' or tmpCommand == 'tobekilled': + # killed + _logger.debug("%s end : cancelled" % self.job.PandaID) + return True + else: + nOthers += 1 + _logger.debug("%s : nFailed=%s nOthers=%s in Active+Archived" % (self.job.PandaID,nFailed,nOthers)) + # no successful jobs + if nOthers == 0: + _logger.debug("%s end : no successful jobs" % self.job.PandaID) + return True + # no failed jobs just in case + if nFailed == 0: + _logger.debug("%s end : no failed jobs" % self.job.PandaID) + return True + # check ratio + maxFailedRatio = 0.8 + failedRatio = float(nFailed) / float(nOthers+nFailed) + if failedRatio > maxFailedRatio: + _logger.debug("%s end : too many failed jobs %s/%s>%s" % (self.job.PandaID, + nFailed, + nOthers+nFailed, + maxFailedRatio)) + return True + # instantiate rebrokerage since server-side retry relies on that + rebro = ReBroker.ReBroker(self.taskBuffer,forFailed=True,avoidSameSite=True) + # lock job for retry + reSt,reVal = rebro.lockJob(self.job.prodUserID,self.job.jobDefinitionID) + if not reSt: + _logger.debug("%s end : failed to lock jobs with %s" % (self.job.PandaID,eVal)) + return False + # execute + _logger.debug("%s : execute ReBroker" % self.job.PandaID) + rebro.start() + rebro.join() + _logger.debug("%s end : successfully" % self.job.PandaID) + return True + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s %s %s" % (self.job.PandaID,errType,errValue)) + _logger.debug("%s end : failed" % self.job.PandaID) + return False diff --git a/current/pandaserver/dataservice/Setupper.py b/current/pandaserver/dataservice/Setupper.py new file mode 100755 index 000000000..6b2103fea --- /dev/null +++ b/current/pandaserver/dataservice/Setupper.py @@ -0,0 +1,2420 @@ +''' +setup dataset + +''' + +import re +import sys +import time +import types +import urllib +import datetime +import commands +import threading +import traceback +import ErrorCode +import TaskAssigner +from DDM import ddm +from dataservice.DDM import dq2Common +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec +from taskbuffer.DatasetSpec import DatasetSpec +from brokerage.SiteMapper import SiteMapper +from brokerage.PandaSiteIDs import PandaMoverIDs +import brokerage.broker +import brokerage.broker_util +import DataServiceUtils + + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('Setupper') + + +# temporary +PandaDDMSource = ['BNLPANDA','BNL-OSG2_MCDISK','BNL-OSG2_DATADISK','BNL-OSG2_MCTAPE','BNL-OSG2_DATATAPE'] + + +class 
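# --- Editor's sketch (not part of the patch): the decision above reduces to a ratio
# --- test over the non-build jobs of the request; 0.8 mirrors maxFailedRatio.
def should_retry(n_failed, n_others, max_failed_ratio=0.8):
    if n_failed == 0 or n_others == 0:      # nothing failed, or nothing succeeded yet
        return False
    return float(n_failed) / float(n_others + n_failed) <= max_failed_ratio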
Setupper (threading.Thread): + # constructor + def __init__(self,taskBuffer,jobs,resubmit=False,pandaDDM=False,ddmAttempt=0,forkRun=False,onlyTA=False, + resetLocation=False,useNativeDQ2=True): + threading.Thread.__init__(self) + self.jobs = jobs + self.taskBuffer = taskBuffer + # VUIDs of dispatchDBlocks + self.vuidMap = {} + # resubmission or not + self.resubmit = resubmit + # time stamp + self.timestamp = datetime.datetime.utcnow().isoformat(' ') + # use PandaDDM + self.pandaDDM = pandaDDM + # file list for dispDS for PandaDDM + self.dispFileList = {} + # priority for ddm job + self.ddmAttempt = ddmAttempt + # site mapper + self.siteMapper = None + # fork another process because python doesn't release memory + self.forkRun = forkRun + # run task assignment only + self.onlyTA = onlyTA + # location map + self.replicaMap = {} + # all replica locations + self.allReplicaMap = {} + # reset locations + self.resetLocation = resetLocation + # replica map for special brokerage + self.replicaMapForBroker = {} + # available files at T2 + self.availableLFNsInT2 = {} + # use DQ2 in the same process + self.useNativeDQ2 = useNativeDQ2 + # list of missing datasets + self.missingDatasetList = {} + # lfn ds map + self.lfnDatasetMap = {} + + + # main + def run(self): + try: + _logger.debug('%s startRun' % self.timestamp) + self._memoryCheck() + # run main procedure in the same process + if not self.forkRun: + if self.jobs != None and len(self.jobs) > 0: + _logger.debug('%s PandaID:%s type:%s taskID:%s' % (self.timestamp, + self.jobs[0].PandaID, + self.jobs[0].prodSourceLabel, + self.jobs[0].taskID)) + # instantiate site mapper + self.siteMapper = SiteMapper(self.taskBuffer) + # use native DQ2 + if self.useNativeDQ2: + ddm.useDirectDQ2() + # correctLFN + self._correctLFN() + # run full Setupper + if not self.onlyTA: + # invoke brokerage + _logger.debug('%s brokerSchedule' % self.timestamp) + brokerage.broker.schedule(self.jobs,self.taskBuffer,self.siteMapper, + replicaMap=self.replicaMapForBroker, + t2FilesMap=self.availableLFNsInT2) + # remove waiting jobs + self.removeWaitingJobs() + # setup dispatch dataset + _logger.debug('%s setupSource' % self.timestamp) + self._setupSource() + # sort by site so that larger subs are created in the next step + if self.jobs != [] and self.jobs[0].prodSourceLabel in ['managed','test']: + tmpJobMap = {} + for tmpJob in self.jobs: + # add site + if not tmpJobMap.has_key(tmpJob.computingSite): + tmpJobMap[tmpJob.computingSite] = [] + # add job + tmpJobMap[tmpJob.computingSite].append(tmpJob) + # make new list + tmpJobList = [] + for tmpSiteKey in tmpJobMap.keys(): + tmpJobList += tmpJobMap[tmpSiteKey] + # set new list + self.jobs = tmpJobList + # create dataset for outputs and assign destination + if self.jobs != [] and self.jobs[0].prodSourceLabel in ['managed','test'] and self.jobs[0].cloud in ['DE']: + # count the number of jobs per _dis + iBunch = 0 + prevDisDsName = None + nJobsPerDisList = [] + for tmpJob in self.jobs: + if prevDisDsName != None and prevDisDsName != tmpJob.dispatchDBlock: + nJobsPerDisList.append(iBunch) + iBunch = 0 + # increment + iBunch += 1 + # set _dis name + prevDisDsName = tmpJob.dispatchDBlock + # remaining + if iBunch != 0: + nJobsPerDisList.append(iBunch) + # split sub datasets + iBunch = 0 + nBunchMax = 50 + tmpIndexJob = 0 + for nJobsPerDis in nJobsPerDisList: + # check _dis boundary so that the same _dis doesn't contribute to many _subs + if iBunch+nJobsPerDis > nBunchMax: + if iBunch != 0: + 
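# --- Editor's sketch (not part of the patch): the loop above walks the job list in
# --- dispatch-block order and hands _setupDestination() bunches of at most nBunchMax
# --- jobs without splitting one dispatch block across two bunches.
from itertools import groupby

def dis_bunches(jobs, n_bunch_max=50):
    sizes = [len(list(g)) for _, g in groupby(jobs, key=lambda j: j.dispatchDBlock)]
    start = bunch = 0
    for n in sizes:
        if bunch + n > n_bunch_max and bunch != 0:
            yield start, bunch              # (startIdx, nJobsInLoop)
            start += bunch
            bunch = 0
        bunch += n
    if bunch:
        yield start, bunch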
self._setupDestination(startIdx=tmpIndexJob,nJobsInLoop=iBunch) + tmpIndexJob += iBunch + iBunch = 0 + # increment + iBunch += nJobsPerDis + # remaining + if iBunch != 0: + self._setupDestination(startIdx=tmpIndexJob,nJobsInLoop=iBunch) + else: + # at a burst + self._setupDestination() + # make dis datasets for existing files + self._makeDisDatasetsForExistingfiles() + # update jobs + _logger.debug('%s updateJobs' % self.timestamp) + self._updateJobs() + # then subscribe sites distpatchDBlocks. this must be the last method + _logger.debug('%s subscribeDistpatchDB' % self.timestamp) + self._subscribeDistpatchDB() + # dynamic data placement for analysis jobs + self._dynamicDataPlacement() + # pin input datasets + self._pinInputDatasets() + # make subscription for missing + self._makeSubscriptionForMissing() + else: + # write jobs to file + import os + import cPickle as pickle + outFileName = '%s/set.%s_%s' % (panda_config.logdir,self.jobs[0].PandaID,commands.getoutput('uuidgen')) + outFile = open(outFileName,'w') + pickle.dump(self.jobs,outFile) + outFile.close() + # run main procedure in another process because python doesn't release memory + com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) + com += 'source /opt/glite/etc/profile.d/grid-env.sh; ' + com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \ + (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python, + panda_config.pandaPython_dir,outFileName) + if self.onlyTA: + com += " -t" + _logger.debug(com) + # exeute + status,output = self.taskBuffer.processLimiter.getstatusoutput(com) + _logger.debug("Ret from another process: %s %s" % (status,output)) + self._memoryCheck() + _logger.debug('%s endRun' % self.timestamp) + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s run() : %s %s" % (self.timestamp,type,value)) + + + # make dipatchDBlocks, insert prod/dispatchDBlock to database + def _setupSource(self): + fileList = {} + prodList = [] + prodError = {} + dispSiteMap = {} + dispError = {} + # extract prodDBlock + for job in self.jobs: + # ignore failed jobs + if job.jobStatus in ['failed','cancelled']: + continue + # production datablock + if job.prodDBlock != 'NULL' and (not self.pandaDDM) and (not job.prodSourceLabel in ['user','panda']): + # get VUID and record prodDBlock into DB + if not prodError.has_key(job.prodDBlock): + time.sleep(1) + _logger.debug((self.timestamp,'queryDatasetByName',job.prodDBlock)) + prodError[job.prodDBlock] = '' + for iDDMTry in range(3): + status,out = ddm.repositoryClient.main('queryDatasetByName',job.prodDBlock) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + _logger.debug("%s %s" % (self.timestamp,out)) + if status != 0 or out.find('Error') != -1: + prodError[job.prodDBlock] = "Setupper._setupSource() could not get VUID of prodDBlock" + _logger.error(out) + else: + try: + exec "vuids = %s['%s']['vuids']" % (out.split('\n')[0],job.prodDBlock) + nfiles = 0 + # dataset spec + ds = DatasetSpec() + ds.vuid = vuids[0] + ds.name = job.prodDBlock + ds.type = 'input' + ds.status = 'completed' + ds.numberfiles = nfiles + ds.currentfiles = nfiles + prodList.append(ds) + except: + type, value, traceBack = sys.exc_info() + _logger.error("_setupSource() : %s %s" % (type,value)) + prodError[job.prodDBlock] = 
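# --- Editor's sketch (not part of the patch): the forkRun branch above pickles the
# --- job list and re-runs the setup in a fresh interpreter so the parent process does
# --- not accumulate memory; the script path and options here are placeholders.
import pickle
import subprocess
import tempfile

def run_setup_in_fork(jobs, script='forkSetupper.py', only_ta=False):
    with tempfile.NamedTemporaryFile(mode='wb', suffix='.pickle', delete=False) as f:
        pickle.dump(jobs, f)
        in_file = f.name
    cmd = ['python', '-Wignore', script, '-i', in_file]
    if only_ta:
        cmd.append('-t')                    # task assignment only
    proc = subprocess.run(cmd, capture_output=True, text=True)
    return proc.returncode, proc.stdout + proc.stderr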
"Setupper._setupSource() could not decode VUID of prodDBlock" + # error + if prodError[job.prodDBlock] != '': + job.jobStatus = 'failed' + job.ddmErrorCode = ErrorCode.EC_Setupper + job.ddmErrorDiag = prodError[job.prodDBlock] + continue + # dispatch datablock + if job.dispatchDBlock != 'NULL': + # src/dst sites + tmpSrcID = 'BNL_ATLAS_1' + if self.siteMapper.checkCloud(job.cloud): + # use cloud's source + tmpSrcID = self.siteMapper.getCloud(job.cloud)['source'] + srcDQ2ID = self.siteMapper.getSite(tmpSrcID).ddm + # use srcDQ2ID as dstDQ2ID when dst SE is same as src SE + srcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpSrcID).se) + dstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(job.computingSite).se) + if srcSEs == dstSEs: + dstDQ2ID = srcDQ2ID + else: + dstDQ2ID = self.siteMapper.getSite(job.computingSite).ddm + dispSiteMap[job.dispatchDBlock] = {'src':srcDQ2ID,'dst':dstDQ2ID,'site':job.computingSite} + # filelist + if not fileList.has_key(job.dispatchDBlock): + fileList[job.dispatchDBlock] = {'lfns':[],'guids':[],'fsizes':[],'md5sums':[],'chksums':[]} + # collect LFN and GUID + for file in job.Files: + if file.type == 'input' and file.status == 'pending': + if not file.lfn in fileList[job.dispatchDBlock]['lfns']: + fileList[job.dispatchDBlock]['lfns'].append(file.lfn) + fileList[job.dispatchDBlock]['guids'].append(file.GUID) + if file.fsize in ['NULL',0]: + fileList[job.dispatchDBlock]['fsizes'].append(None) + else: + fileList[job.dispatchDBlock]['fsizes'].append(long(file.fsize)) + if file.md5sum in ['NULL','']: + fileList[job.dispatchDBlock]['md5sums'].append(None) + elif file.md5sum.startswith("md5:"): + fileList[job.dispatchDBlock]['md5sums'].append(file.md5sum) + else: + fileList[job.dispatchDBlock]['md5sums'].append("md5:%s" % file.md5sum) + if file.checksum in ['NULL','']: + fileList[job.dispatchDBlock]['chksums'].append(None) + else: + fileList[job.dispatchDBlock]['chksums'].append(file.checksum) + # get replica locations + if not self.replicaMap.has_key(job.dispatchDBlock): + self.replicaMap[job.dispatchDBlock] = {} + if not self.allReplicaMap.has_key(file.dataset): + if file.dataset.endswith('/'): + status,out = self.getListDatasetReplicasInContainer(file.dataset) + else: + for iDDMTry in range(3): + _logger.debug((self.timestamp,'listDatasetReplicas',file.dataset)) + status,out = ddm.DQ2.main('listDatasetReplicas',file.dataset,0,None,False) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1 \ + or out == '()': + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error("%s %s" % (self.timestamp,out)) + dispError[job.dispatchDBlock] = 'could not get locations for %s' % file.dataset + _logger.error(dispError[job.dispatchDBlock]) + else: + _logger.debug("%s %s" % (self.timestamp,out)) + tmpRepSites = {} + try: + # convert res to map + exec "tmpRepSites = %s" % out + self.allReplicaMap[file.dataset] = tmpRepSites + except: + dispError[job.dispatchDBlock] = 'could not convert HTTP-res to replica map for %s' % file.dataset + _logger.error(dispError[job.dispatchDBlock]) + _logger.error(out) + if self.allReplicaMap.has_key(file.dataset): + self.replicaMap[job.dispatchDBlock][file.dataset] = self.allReplicaMap[file.dataset] + # register dispatch dataset + dispList = [] + for dispatchDBlock in fileList.keys(): + # ignore empty dataset + if 
len(fileList[dispatchDBlock]['lfns']) == 0: + continue + # use DQ2 + if (not self.pandaDDM) and (not dispSiteMap[dispatchDBlock]['src'] in PandaDDMSource or \ + self.siteMapper.getSite(dispSiteMap[dispatchDBlock]['site']).cloud != 'US') \ + and (job.prodSourceLabel != 'ddm') and (not dispSiteMap[dispatchDBlock]['site'].endswith("_REPRO")): + # register dispatch dataset + disFiles = fileList[dispatchDBlock] + _logger.debug((self.timestamp,'registerNewDataset',dispatchDBlock,disFiles['lfns'],disFiles['guids'], + disFiles['fsizes'],disFiles['chksums'],None,None,None,True)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('registerNewDataset',dispatchDBlock,disFiles['lfns'],disFiles['guids'], + disFiles['fsizes'],disFiles['chksums'],None,None,None,True) + if status != 0 and out.find('DQDatasetExistsException') != -1: + break + elif status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + _logger.debug("%s sleep %s for %s" % (self.timestamp,iDDMTry,dispatchDBlock)) + _logger.debug(status) + _logger.debug(out) + _logger.debug("-------------") + time.sleep(60) + else: + break + if status != 0 or out.find('Error') != -1: + _logger.error("%s %s" % (self.timestamp,out)) + dispError[dispatchDBlock] = "Setupper._setupSource() could not register dispatchDBlock" + continue + _logger.debug("%s %s" % (self.timestamp,out)) + vuidStr = out + # freezeDataset dispatch dataset + time.sleep(1) + _logger.debug((self.timestamp,'freezeDataset',dispatchDBlock)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('freezeDataset',dispatchDBlock) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + if status != 0 or (out.find('Error') != -1 and out.find("is frozen") == -1): + _logger.error("%s %s" % (self.timestamp,out)) + dispError[dispatchDBlock] = "Setupper._setupSource() could not freeze dispatchDBlock" + continue + _logger.debug("%s %s" % (self.timestamp,out)) + else: + # use PandaDDM + self.dispFileList[dispatchDBlock] = fileList[dispatchDBlock] + # create a fake vuidStr for PandaDDM + tmpMap = {'vuid':commands.getoutput('uuidgen')} + vuidStr = "%s" % tmpMap + # get VUID + try: + exec "vuid = %s['vuid']" % vuidStr + # dataset spec. 
currentfiles is used to count the number of failed jobs + ds = DatasetSpec() + ds.vuid = vuid + ds.name = dispatchDBlock + ds.type = 'dispatch' + ds.status = 'defined' + ds.numberfiles = len(fileList[dispatchDBlock])/2 + ds.currentfiles = 0 + dispList.append(ds) + self.vuidMap[ds.name] = ds.vuid + except: + type, value, traceBack = sys.exc_info() + _logger.error("_setupSource() : %s %s" % (type,value)) + dispError[dispatchDBlock] = "Setupper._setupSource() could not decode VUID dispatchDBlock" + # insert datasets to DB + self.taskBuffer.insertDatasets(prodList+dispList) + # job status + for job in self.jobs: + if dispError.has_key(job.dispatchDBlock) and dispError[job.dispatchDBlock] != '': + job.jobStatus = 'failed' + job.ddmErrorCode = ErrorCode.EC_Setupper + job.ddmErrorDiag = dispError[job.dispatchDBlock] + # delete explicitly some huge variables + del fileList + del prodList + del prodError + del dispSiteMap + + + # create dataset for outputs in the repository and assign destination + def _setupDestination(self,startIdx=-1,nJobsInLoop=50): + _logger.debug('%s setupDestination idx:%s n:%s' % (self.timestamp,startIdx,nJobsInLoop)) + destError = {} + datasetList = {} + newnameList = {} + snGottenDS = [] + if startIdx == -1: + jobsList = self.jobs + else: + jobsList = self.jobs[startIdx:startIdx+nJobsInLoop] + for job in jobsList: + # ignore failed jobs + if job.jobStatus in ['failed','cancelled']: + continue + for file in job.Files: + # ignore input files + if file.type == 'input': + continue + # don't touch with outDS for unmerge jobs + if job.prodSourceLabel == 'panda' and job.processingType == 'unmerge' and file.type != 'log': + continue + # extract destinationDBlock, destinationSE and computingSite + dest = (file.destinationDBlock,file.destinationSE,job.computingSite,file.destinationDBlockToken) + if not destError.has_key(dest): + destError[dest] = '' + originalName = '' + if (job.prodSourceLabel == 'panda') or (job.prodSourceLabel in ['ptest','rc_test'] and \ + job.processingType in ['pathena','prun','gangarobot-rctest']): + # keep original name + nameList = [file.destinationDBlock] + else: + # set freshness to avoid redundant DB lookup + definedFreshFlag = None + if file.destinationDBlock in snGottenDS: + # already checked + definedFreshFlag = False + elif job.prodSourceLabel in ['user','test','prod_test']: + # user or test datasets are always fresh in DB + definedFreshFlag = True + # get serial number + sn,freshFlag = self.taskBuffer.getSerialNumber(file.destinationDBlock,definedFreshFlag) + if sn == -1: + destError[dest] = "Setupper._setupDestination() could not get serial num for %s" % file.destinationDBlock + continue + if not file.destinationDBlock in snGottenDS: + snGottenDS.append(file.destinationDBlock) + # new dataset name + newnameList[dest] = "%s_sub0%s" % (file.destinationDBlock,sn) + if freshFlag or self.resetLocation: + # register original dataset and new dataset + nameList = [file.destinationDBlock,newnameList[dest]] + originalName = file.destinationDBlock + else: + # register new dataset only + nameList = [newnameList[dest]] + # create dataset + for name in nameList: + computingSite = job.computingSite + if name == originalName: + # for original dataset + computingSite = file.destinationSE + # use DQ2 + if (not self.pandaDDM) and (job.prodSourceLabel != 'ddm') and (job.destinationSE != 'local'): + # get src and dest DDM conversion is needed for unknown sites + if job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(computingSite): + # DQ2 ID 
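# --- Editor's sketch (not part of the patch): per-site sub-datasets are named from
# --- the destination block plus a serial number and are recognised elsewhere by the
# --- trailing "_sub<digits>"; the names below are invented.
import re

def sub_dataset_name(destination_dblock, serial_number):
    return "%s_sub0%s" % (destination_dblock, serial_number)

def is_sub_dataset(name):
    return re.search(r'_sub\d+$', name) is not None

# sub_dataset_name("user.carol.test.AOD", 42) -> "user.carol.test.AOD_sub042"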
was set by using --destSE for analysis job to transfer output + tmpSrcDDM = self.siteMapper.getSite(job.computingSite).ddm + else: + tmpSrcDDM = self.siteMapper.getSite(computingSite).ddm + if job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(file.destinationSE): + # DQ2 ID was set by using --destSE for analysis job to transfer output + tmpDstDDM = tmpSrcDDM + else: + tmpDstDDM = self.siteMapper.getSite(file.destinationSE).ddm + # skip registration for _sub when src=dest + if tmpSrcDDM == tmpDstDDM and name != originalName and re.search('_sub\d+$',name) != None: + # create a fake vuidStr + vuidStr = 'vuid="%s"' % commands.getoutput('uuidgen') + else: + # register dataset + time.sleep(1) + # set hidden flag for _sub + tmpHiddenFlag = False + if name != originalName and re.search('_sub\d+$',name) != None: + tmpHiddenFlag = True + _logger.debug((self.timestamp,'registerNewDataset',name,[],[],[],[], + None,None,None,tmpHiddenFlag)) + atFailed = 0 + for iDDMTry in range(3): + status,out = ddm.DQ2.main('registerNewDataset',name,[],[],[],[], + None,None,None,tmpHiddenFlag) + if status != 0 and out.find('DQDatasetExistsException') != -1: + atFailed = iDDMTry + break + elif status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + _logger.debug("%s sleep %s for %s" % (self.timestamp,iDDMTry,name)) + _logger.debug(status) + _logger.debug(out) + _logger.debug("-------------") + time.sleep(60) + else: + break + if status != 0 or out.find('Error') != -1: + # unset vuidStr + vuidStr = "" + # ignore 'already exists' ERROR because original dataset may be registered by upstream. + # atFailed > 0 is for the case in which the first attempt succeeded but report failure + if (job.prodSourceLabel == 'panda' or (job.prodSourceLabel in ['ptest','rc_test'] and \ + job.processingType in ['pathena','prun','gangarobot-rctest']) \ + or name == originalName or atFailed > 0) and \ + out.find('DQDatasetExistsException') != -1: + _logger.debug('%s ignored DQDatasetExistsException' % self.timestamp) + else: + destError[dest] = "Setupper._setupDestination() could not register : %s" % name + _logger.error("%s %s" % (self.timestamp,out)) + continue + else: + _logger.debug("%s %s" % (self.timestamp,out)) + vuidStr = "vuid = %s['vuid']" % out + # get list of tokens + tmpTokenList = file.destinationDBlockToken.split(',') + # register datasetsets + if name == originalName or tmpSrcDDM != tmpDstDDM or \ + job.prodSourceLabel == 'panda' or (job.prodSourceLabel in ['ptest','rc_test'] and \ + job.processingType in ['pathena','prun','gangarobot-rctest']) \ + or len(tmpTokenList) > 1: + time.sleep(1) + # register location + usingT1asT2 = False + if job.prodSourceLabel == 'user' and not self.siteMapper.siteSpecList.has_key(computingSite): + dq2IDList = [self.siteMapper.getSite(job.computingSite).ddm] + else: + if self.siteMapper.getSite(computingSite).cloud != job.cloud and \ + re.search('_sub\d+$',name) != None and \ + (not job.prodSourceLabel in ['user','panda']) and \ + (not self.siteMapper.getSite(computingSite).ddm.endswith('PRODDISK')): + # T1 used as T2. 
Use both DATADISK and PRODDISK as locations while T1 PRODDISK is phasing out + dq2IDList = [self.siteMapper.getSite(computingSite).ddm] + if self.siteMapper.getSite(computingSite).setokens.has_key('ATLASPRODDISK'): + dq2IDList += [self.siteMapper.getSite(computingSite).setokens['ATLASPRODDISK']] + usingT1asT2 = True + else: + dq2IDList = [self.siteMapper.getSite(computingSite).ddm] + # use another location when token is set + if (not usingT1asT2) and (not file.destinationDBlockToken in ['NULL','']): + dq2IDList = [] + for tmpToken in tmpTokenList: + # set default + dq2ID = self.siteMapper.getSite(computingSite).ddm + # convert token to DQ2ID + if self.siteMapper.getSite(computingSite).setokens.has_key(tmpToken): + dq2ID = self.siteMapper.getSite(computingSite).setokens[tmpToken] + # replace or append + if len(tmpTokenList) <= 1 or name != originalName: + # use location consistent with token + dq2IDList = [dq2ID] + break + else: + # use multiple locations for _tid + if not dq2ID in dq2IDList: + dq2IDList.append(dq2ID) + # loop over all locations + repLifeTime = None + if name != originalName and re.search('_sub\d+$',name) != None: + repLifeTime = "14 days" + for dq2ID in dq2IDList: + _logger.debug((self.timestamp,'registerDatasetLocation',name,dq2ID,0,0,None,None,None,repLifeTime)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('registerDatasetLocation',name,dq2ID,0,0,None,None,None,repLifeTime) + if status != 0 and out.find('DQLocationExistsException') != -1: + break + elif status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + # ignore "already exists at location XYZ" + if out.find('DQLocationExistsException') != -1: + _logger.debug('%s ignored DQLocationExistsException' % self.timestamp) + status,out = 0,'' + else: + _logger.debug("%s %s" % (self.timestamp,out)) + if status == 0 and out.find('Error') == -1: + # change replica ownership for user datasets + if self.resetLocation and ((name == originalName and job.prodSourceLabel == 'user') or \ + job.prodSourceLabel=='panda'): + # remove /CN=proxy and /CN=limited from DN + tmpRealDN = job.prodUserID + tmpRealDN = re.sub('/CN=limited proxy','',tmpRealDN) + tmpRealDN = re.sub('/CN=proxy','',tmpRealDN) + status,out = dq2Common.parse_dn(tmpRealDN) + if status != 0: + _logger.error("%s %s" % (self.timestamp,out)) + status,out = 1,'failed to truncate DN:%s' % job.prodUserID + else: + tmpRealDN = out + _logger.debug((self.timestamp,'setReplicaMetaDataAttribute',name,dq2ID,'owner',tmpRealDN)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',name,dq2ID,'owner',tmpRealDN) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + # failed + if status != 0 or out.find('Error') != -1: + _logger.error("%s %s" % (self.timestamp,out)) + break + # delete old replicas + tmpDelStat = self.deleteDatasetReplicas([name],[dq2ID]) + if not tmpDelStat: + status,out = 1,'failed to delete old replicas for %s' % name + break + # failed + if status != 0 or out.find('Error') != -1: + _logger.error("%s %s" % (self.timestamp,out)) + break + else: + # skip registerDatasetLocations + status,out = 0,'' + if status != 0 or out.find('Error') != -1: + destError[dest] = "Could not register location : %s %s" % 
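# --- Editor's sketch (not part of the patch): a simplified view of how the location
# --- list above is built -- each space token on the file maps to a site endpoint via
# --- setokens, with the site's default DDM endpoint as fallback. Names are examples.
def endpoints_for_tokens(site_spec, token_string):
    endpoints = []
    for token in token_string.split(','):
        ep = site_spec.setokens.get(token, site_spec.ddm)   # fallback to default endpoint
        if ep not in endpoints:
            endpoints.append(ep)
    return endpoints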
(name,out.split('\n')[-1]) + elif job.prodSourceLabel == 'panda' or (job.prodSourceLabel in ['ptest','rc_test'] and \ + job.processingType in ['pathena','prun','gangarobot-rctest']): + # do nothing for "panda" job + pass + elif name == originalName and job.prodSourceLabel in ['managed','test','rc_test','ptest']: + # set metadata + time.sleep(1) + dq2ID = self.siteMapper.getSite(file.destinationSE).ddm + # use another location when token is set + if not file.destinationDBlockToken in ['NULL','']: + # register only the first token becasue it is used as the location + tmpFirstToken = file.destinationDBlockToken.split(',')[0] + if self.siteMapper.getSite(file.destinationSE).setokens.has_key(tmpFirstToken): + dq2ID = self.siteMapper.getSite(file.destinationSE).setokens[tmpFirstToken] + _logger.debug((self.timestamp,'setMetaDataAttribute',name,'origin',dq2ID)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('setMetaDataAttribute',name,'origin',dq2ID) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + _logger.debug("%s %s" % (self.timestamp,out)) + if status != 0 or (out != 'None' and out.find('already exists') == -1): + _logger.error(out) + destError[dest] = "Setupper._setupDestination() could not set metadata : %s" % name + # use PandaDDM or non-DQ2 + else: + # create a fake vuidStr + vuidStr = 'vuid="%s"' % commands.getoutput('uuidgen') + # already failed + if destError[dest] != '' and name == originalName: + break + # get vuid + if vuidStr == '': + _logger.debug((self.timestamp,'queryDatasetByName',name)) + for iDDMTry in range(3): + status,out = ddm.repositoryClient.main('queryDatasetByName',name) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + _logger.debug("%s %s" % (self.timestamp,out)) + if status != 0 or out.find('Error') != -1: + _logger.error(out) + vuidStr = "vuid = %s['%s']['vuids'][0]" % (out.split('\n')[0],name) + try: + exec vuidStr + # dataset spec + ds = DatasetSpec() + ds.vuid = vuid + ds.name = name + ds.type = 'output' + ds.numberfiles = 0 + ds.currentfiles = 0 + ds.status = 'defined' + # append + datasetList[(name,file.destinationSE,computingSite)] = ds + except: + # set status + type, value, traceBack = sys.exc_info() + _logger.error("_setupDestination() : %s %s" % (type,value)) + destError[dest] = "Setupper._setupDestination() could not get VUID : %s" % name + # set new destDBlock + if newnameList.has_key(dest): + file.destinationDBlock = newnameList[dest] + # update job status if failed + if destError[dest] != '': + job.jobStatus = 'failed' + job.ddmErrorCode = ErrorCode.EC_Setupper + job.ddmErrorDiag = destError[dest] + else: + newdest = (file.destinationDBlock,file.destinationSE,job.computingSite) + # increment number of files + datasetList[newdest].numberfiles = datasetList[newdest].numberfiles + 1 + # dump + for tmpDsKey in datasetList.keys(): + if re.search('_sub\d+$',tmpDsKey[0]) != None: + _logger.debug('%s made sub:%s for nFiles=%s' % (self.timestamp,tmpDsKey[0],datasetList[tmpDsKey].numberfiles)) + # insert datasets to DB + return self.taskBuffer.insertDatasets(datasetList.values()) + + + # subscribe sites to distpatchDBlocks + def _subscribeDistpatchDB(self): + dispError = {} + failedJobs = [] + ddmJobs = [] + 
ddmUser = 'NULL' + for job in self.jobs: + # ignore failed jobs + if job.jobStatus in ['failed','cancelled']: + continue + # ignore no dispatch jobs + if job.dispatchDBlock=='NULL' or job.computingSite=='NULL': + continue + # extract dispatchDBlock and computingSite + disp = (job.dispatchDBlock,job.computingSite) + if dispError.has_key(disp) == 0: + dispError[disp] = '' + # DQ2 IDs + tmpSrcID = 'BNL_ATLAS_1' + if self.siteMapper.checkCloud(job.cloud): + # use cloud's source + tmpSrcID = self.siteMapper.getCloud(job.cloud)['source'] + srcDQ2ID = self.siteMapper.getSite(tmpSrcID).ddm + # destination + tmpDstID = job.computingSite + if srcDQ2ID != self.siteMapper.getSite(job.computingSite).ddm and \ + srcDQ2ID in self.siteMapper.getSite(job.computingSite).setokens.values(): + # direct usage of remote SE. Mainly for prestaging + tmpDstID = tmpSrcID + _logger.debug('%s use remote SiteSpec of %s for %s' % (self.timestamp,tmpDstID,job.computingSite)) + # use srcDQ2ID as dstDQ2ID when dst SE is same as src SE + srcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpSrcID).se) + dstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpDstID).se) + if srcSEs == dstSEs or job.computingSite.endswith("_REPRO"): + dstDQ2ID = srcDQ2ID + else: + dstDQ2ID = self.siteMapper.getSite(job.computingSite).ddm + # use DQ2 + if (not self.pandaDDM) and (not srcDQ2ID in PandaDDMSource or self.siteMapper.getSite(tmpDstID).cloud != 'US') \ + and (job.prodSourceLabel != 'ddm') and (not job.computingSite.endswith("_REPRO")): + # look for replica + dq2ID = srcDQ2ID + dq2IDList = [] + # register replica + if dq2ID != dstDQ2ID: + # make list + if self.replicaMap.has_key(job.dispatchDBlock): + # set DQ2 ID for DISK + if not srcDQ2ID.endswith('_DATADISK'): + hotID = re.sub('_MCDISK','_HOTDISK', srcDQ2ID) + diskID = re.sub('_MCDISK','_DATADISK',srcDQ2ID) + tapeID = re.sub('_MCDISK','_DATATAPE',srcDQ2ID) + mctapeID = re.sub('_MCDISK','_MCTAPE',srcDQ2ID) + else: + hotID = re.sub('_DATADISK','_HOTDISK', srcDQ2ID) + diskID = re.sub('_DATADISK','_DATADISK',srcDQ2ID) + tapeID = re.sub('_DATADISK','_DATATAPE',srcDQ2ID) + mctapeID = re.sub('_DATADISK','_MCTAPE',srcDQ2ID) + # DQ2 ID is mixed with TAIWAN-LCG2 and TW-FTT + if job.cloud in ['TW',]: + tmpSiteSpec = self.siteMapper.getSite(tmpSrcID) + if tmpSiteSpec.setokens.has_key('ATLASDATADISK'): + diskID = tmpSiteSpec.setokens['ATLASDATADISK'] + if tmpSiteSpec.setokens.has_key('ATLASDATATAPE'): + tapeID = tmpSiteSpec.setokens['ATLASDATATAPE'] + if tmpSiteSpec.setokens.has_key('ATLASMCTAPE'): + mctapeID = tmpSiteSpec.setokens['ATLASMCTAPE'] + hotID = 'TAIWAN-LCG2_HOTDISK' + for tmpDataset,tmpRepMap in self.replicaMap[job.dispatchDBlock].iteritems(): + if tmpRepMap.has_key(hotID): + # HOTDISK + if not hotID in dq2IDList: + dq2IDList.append(hotID) + if tmpRepMap.has_key(srcDQ2ID): + # MCDISK + if not srcDQ2ID in dq2IDList: + dq2IDList.append(srcDQ2ID) + if tmpRepMap.has_key(diskID): + # DATADISK + if not diskID in dq2IDList: + dq2IDList.append(diskID) + if job.cloud == 'US' and tmpRepMap.has_key('BNLPANDA'): + # BNLPANDA + if not 'BNLPANDA' in dq2IDList: + dq2IDList.append('BNLPANDA') + if tmpRepMap.has_key(tapeID): + # DATATAPE + if not tapeID in dq2IDList: + dq2IDList.append(tapeID) + if tmpRepMap.has_key(mctapeID): + # MCTAPE + if not mctapeID in dq2IDList: + dq2IDList.append(mctapeID) + # hack for split T1 + splitT1IDsHaveDS = [] + for tmpSplitT1Key in tmpRepMap.keys(): + if tmpSplitT1Key.startswith('NIKHEF-ELPROD'): + 
splitT1IDsHaveDS.append(tmpSplitT1Key) + if job.cloud == 'NL' and splitT1IDsHaveDS != [] \ + and not tmpRepMap.has_key('SARA-MATRIX_MCDISK') \ + and not tmpRepMap.has_key('SARA-MATRIX_DATADISK') \ + and not tmpRepMap.has_key('SARA-MATRIX_MCTAPE') \ + and not tmpRepMap.has_key('SARA-MATRIX_DATATAPE'): + for tmpSplitT1Key in splitT1IDsHaveDS: + if not tmpSplitT1Key in dq2IDList: + dq2IDList.append(tmpSplitT1Key) + # consider cloudconfig.tier1se + tmpCloudSEs = DataServiceUtils.getEndpointsAtT1(tmpRepMap,self.siteMapper,job.cloud) + useCloudSEs = [] + for tmpCloudSE in tmpCloudSEs: + if not tmpCloudSE in dq2IDList: + useCloudSEs.append(tmpCloudSE) + if useCloudSEs != []: + dq2IDList += useCloudSEs + _logger.debug('%s use additional endpoints %s from cloudconfig' % (self.timestamp,str(useCloudSEs))) + # use default location if empty + if dq2IDList == []: + dq2IDList = [dq2ID] + for dq2ID in dq2IDList: + time.sleep(1) + _logger.debug((self.timestamp,'registerDatasetLocation',job.dispatchDBlock,dq2ID,0,1,None,None,None,"7 days")) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('registerDatasetLocation',job.dispatchDBlock,dq2ID,0,1,None,None,None,"7 days") + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + _logger.debug("%s %s" % (self.timestamp,out)) + # failure + if status != 0 or out.find('Error') != -1: + break + else: + # skip registerDatasetLocations + status,out = 0,'' + if status != 0 or out.find('Error') != -1: + _logger.error(out) + dispError[disp] = "Setupper._subscribeDistpatchDB() could not register location" + else: + # assign destination + time.sleep(1) + optSub = {'DATASET_COMPLETE_EVENT' : ['https://%s:%s/server/panda/datasetCompleted' % \ + (panda_config.pserverhost,panda_config.pserverport)]} + optSource = {} + optSrcPolicy = 001000 | 010000 + dq2ID = dstDQ2ID + # prestaging + if srcDQ2ID == dstDQ2ID: + # stage-in callback + optSub['DATASET_STAGED_EVENT'] = ['https://%s:%s/server/panda/datasetCompleted' % \ + (panda_config.pserverhost,panda_config.pserverport)] + # use ATLAS*TAPE + seTokens = self.siteMapper.getSite(tmpDstID).setokens + if seTokens.has_key('ATLASDATATAPE') and seTokens.has_key('ATLASMCTAPE'): + dq2ID = seTokens['ATLASDATATAPE'] + # use MCDISK if needed + for tmpDataset,tmpRepMap in self.replicaMap[job.dispatchDBlock].iteritems(): + if (not tmpRepMap.has_key(dq2ID)) and tmpRepMap.has_key(seTokens['ATLASMCTAPE']): + dq2ID = seTokens['ATLASMCTAPE'] + break + # for CERN and BNL + if job.cloud in ['CERN','US'] and self.replicaMap.has_key(job.dispatchDBlock): + setNewIDflag = False + if job.cloud == 'CERN': + otherIDs = ['CERN-PROD_DAQ','CERN-PROD_TZERO','CERN-PROD_TMPDISK'] + else: + otherIDs = ['BNLPANDA'] + for tmpDataset,tmpRepMap in self.replicaMap[job.dispatchDBlock].iteritems(): + if not tmpRepMap.has_key(dq2ID): + # look for another id + for cernID in otherIDs: + if tmpRepMap.has_key(cernID): + dq2ID = cernID + setNewIDflag = True + break + # break + if setNewIDflag: + break + optSrcPolicy = 000010 + optSource[dq2ID] = {'policy' : 0} + else: + # set sources to handle T2s in another cloud and to transfer dis datasets being split in multiple sites + for tmpDQ2ID in dq2IDList: + optSource[tmpDQ2ID] = {'policy' : 0} + # T1 used as T2 + if job.cloud != self.siteMapper.getSite(tmpDstID).cloud and \ + (not dstDQ2ID.endswith('PRODDISK')) and \ + (not job.prodSourceLabel in ['user','panda']) 
and \ + self.siteMapper.getSite(tmpDstID).cloud in ['US']: + seTokens = self.siteMapper.getSite(tmpDstID).setokens + # use T1_PRODDISK + if seTokens.has_key('ATLASPRODDISK'): + dq2ID = seTokens['ATLASPRODDISK'] + # register subscription + _logger.debug('%s %s %s %s' % (self.timestamp,'registerDatasetSubscription', + (job.dispatchDBlock,dq2ID), + {'version':0,'archived':0,'callbacks':optSub,'sources':optSource,'sources_policy':optSrcPolicy, + 'wait_for_sources':0,'destination':None,'query_more_sources':0,'sshare':"production",'group':None, + 'activity':"Production",'acl_alias':None,'replica_lifetime':"7 days"})) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('registerDatasetSubscription',job.dispatchDBlock,dq2ID,version=0,archived=0,callbacks=optSub, + sources=optSource,sources_policy=optSrcPolicy,wait_for_sources=0,destination=None, + query_more_sources=0,sshare="production",group=None,activity="Production", + acl_alias=None,replica_lifetime="7 days") + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + _logger.debug("%s %s" % (self.timestamp,out)) + if status != 0 or (out != 'None' and len(out) != 35): + _logger.error(out) + dispError[disp] = "Setupper._subscribeDistpatchDB() could not register subscription" + # logging + try: + # make message + dq2ID = dstDQ2ID + message = '%s - siteID:%s type:dispatch vuid:%s' % (commands.getoutput('hostname'),dq2ID, + self.vuidMap[job.dispatchDBlock]) + # get logger + _pandaLogger = PandaLogger() + _pandaLogger.lock() + _pandaLogger.setParams({'Type':'registerSubscription'}) + logger = _pandaLogger.getHttpLogger(panda_config.loggername) + # add message + logger.info(message) + # release HTTP handler + _pandaLogger.release() + except: + pass + # use PandaDDM + else: + # set DDM user DN + if ddmUser == 'NULL': + ddmUser = job.prodUserID + # create a DDM job + ddmjob = JobSpec() + ddmjob.jobDefinitionID = int(time.time()) % 10000 + ddmjob.jobName = "%s" % commands.getoutput('uuidgen') + ddmjob.transformation = 'http://pandaserver.cern.ch:25080/trf/mover/run_dq2_cr' + ddmjob.destinationDBlock = 'pandaddm_%s.%s' % (time.strftime('%y.%m.%d'),ddmjob.jobName) + if job.cloud == 'NULL': + ddmjob.cloud = 'US' + else: + ddmjob.cloud = job.cloud + if not PandaMoverIDs.has_key(job.cloud): + ddmjob.computingSite = "BNL_ATLAS_DDM" + else: + ddmjob.computingSite = PandaMoverIDs[job.cloud] + ddmjob.destinationSE = ddmjob.computingSite + ddmjob.assignedPriority = 200000 + if job.prodSourceLabel in ['software']: + # set higher priority for installation jobs + ddmjob.assignedPriority += 1000 + else: + ddmjob.assignedPriority += job.currentPriority + ddmjob.currentPriority = ddmjob.assignedPriority + if self.ddmAttempt != 0: + # keep count of attemptNr + ddmjob.attemptNr = self.ddmAttempt + 1 + else: + ddmjob.attemptNr = 1 + # check attemptNr to avoid endless loop + if ddmjob.attemptNr > 10: + err = "Too many attempts %s for %s" % (ddmjob.attemptNr,job.dispatchDBlock) + _logger.error(err) + dispError[disp] = err + continue + ddmjob.prodSourceLabel = 'ddm' + ddmjob.transferType = 'dis' + ddmjob.processingType = 'pandamover' + # append log file + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz.%s" % (ddmjob.destinationDBlock,ddmjob.attemptNr) + fileOL.destinationDBlock = ddmjob.destinationDBlock + fileOL.destinationSE = ddmjob.destinationSE + fileOL.dataset = ddmjob.destinationDBlock + fileOL.type 
= 'log' + ddmjob.addFile(fileOL) + # make arguments + callBackURL = 'https://%s:%s/server/panda/datasetCompleted?vuid=%s&site=%s' % \ + (panda_config.pserverhost,panda_config.pserverport, + self.vuidMap[job.dispatchDBlock],dstDQ2ID) + callBackURL = urllib.quote(callBackURL) + lfnsStr = '' + for tmpLFN in self.dispFileList[job.dispatchDBlock]['lfns']: + lfnsStr += '%s,' % tmpLFN + guidStr = '' + for tmpGUID in self.dispFileList[job.dispatchDBlock]['guids']: + guidStr += '%s,' % tmpGUID + guidStr = guidStr[:-1] + lfnsStr = lfnsStr[:-1] + # check input token + moverUseTape = False + for tmpFile in job.Files: + if tmpFile.type == 'input' and tmpFile.dispatchDBlockToken in ['ATLASDATATAPE']: + moverUseTape = True + break + if srcDQ2ID != dstDQ2ID: + # get destination dir + tmpSpec = self.siteMapper.getSite(job.computingSite) + destDir = brokerage.broker_util._getDefaultStorage(tmpSpec.dq2url,tmpSpec.se,tmpSpec.seprodpath) + if destDir == '': + err = "could not get default storage for %s" % job.computingSite + _logger.error(err) + dispError[disp] = err + continue + # normal jobs + argStr = "" + if moverUseTape: + argStr += "--useTape " + argStr += "-t 7200 -n 3 -s %s -r %s --guids %s --lfns %s --tapePriority %s --callBack %s -d %spanda/dis/%s%s %s" % \ + (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,job.currentPriority,callBackURL,destDir, + time.strftime('%y/%m/%d/'),job.dispatchDBlock,job.dispatchDBlock) + else: + # prestaging jobs + argStr = "" + if moverUseTape: + argStr += "--useTape " + argStr += "-t 540 -n 2 -s %s -r %s --guids %s --lfns %s --tapePriority %s --callBack %s --prestage --cloud %s %s" % \ + (srcDQ2ID,dstDQ2ID,guidStr,lfnsStr,job.currentPriority,callBackURL,job.cloud,job.dispatchDBlock) + # set job parameters + ddmjob.jobParameters = argStr + _logger.debug('%s pdq2_cr %s' % (self.timestamp,ddmjob.jobParameters)) + # set src/dest + ddmjob.sourceSite = srcDQ2ID + ddmjob.destinationSite = dstDQ2ID + ddmJobs.append(ddmjob) + # failed jobs + if dispError[disp] != '': + job.jobStatus = 'failed' + job.ddmErrorCode = ErrorCode.EC_Setupper + job.ddmErrorDiag = dispError[disp] + failedJobs.append(job) + # update failed jobs only. 
succeeded jobs should be activate by DDM callback + self.taskBuffer.updateJobs(failedJobs,True) + # submit ddm jobs + if ddmJobs != []: + ddmRet = self.taskBuffer.storeJobs(ddmJobs,ddmUser,joinThr=True) + # update datasets + ddmIndex = 0 + ddmDsList = [] + for ddmPandaID,ddmJobDef,ddmJobName in ddmRet: + # invalid PandaID + if ddmPandaID in ['NULL',None]: + continue + # get dispatch dataset + dsName = ddmJobs[ddmIndex].jobParameters.split()[-1] + ddmIndex += 1 + tmpDS = self.taskBuffer.queryDatasetWithMap({'name':dsName}) + if tmpDS != None: + # set MoverID + tmpDS.MoverID = ddmPandaID + ddmDsList.append(tmpDS) + # update + if ddmDsList != []: + self.taskBuffer.updateDatasets(ddmDsList) + + + # update jobs + def _updateJobs(self): + updateJobs = [] + failedJobs = [] + activateJobs = [] + # sort out jobs + for job in self.jobs: + # failed jobs + if job.jobStatus in ['failed','cancelled']: + failedJobs.append(job) + # no input jobs + elif job.dispatchDBlock=='NULL': + activateJobs.append(job) + # normal jobs + else: + # change status + job.jobStatus = "assigned" + updateJobs.append(job) + # update DB + self.taskBuffer.activateJobs(activateJobs) + self.taskBuffer.updateJobs(updateJobs,True) + self.taskBuffer.updateJobs(failedJobs,True) + # delete local values + del updateJobs + del failedJobs + del activateJobs + + + # correct LFN for attemptNr + def _correctLFN(self): + lfnMap = {} + valMap = {} + prodError = {} + missingDS = {} + jobsWaiting = [] + jobsFailed = [] + jobsProcessed = [] + allLFNs = {} + allGUIDs = {} + cloudMap = {} + lfnDsMap = {} + replicaMap = {} + _logger.debug('%s go into LFN correction' % self.timestamp) + for job in self.jobs: + if self.onlyTA: + _logger.debug("%s start TA session %s" % (self.timestamp,job.taskID)) + # check if sitename is known + if job.computingSite != 'NULL' and (not job.computingSite in self.siteMapper.siteSpecList.keys()): + job.jobStatus = 'failed' + job.ddmErrorCode = ErrorCode.EC_Setupper + job.ddmErrorDiag = "computingSite:%s is unknown" % job.computingSite + # append job for downstream process + jobsProcessed.append(job) + # error message for TA + if self.onlyTA: + _logger.error("%s %s" % (self.timestamp,job.ddmErrorDiag)) + continue + # ignore no prodDBlock jobs or container dataset + if job.prodDBlock == 'NULL': + # set cloud + if panda_config.enableDynamicTA and job.prodSourceLabel in ['managed','validation'] \ + and job.cloud in ['NULL',''] and (not job.taskID in [None,'NULL',0]): + # look into map to check if it is already gotten + if not cloudMap.has_key(job.taskID): + # instantiate TaskAssigner + cloudResolver = TaskAssigner.TaskAssigner(self.taskBuffer,self.siteMapper, + job.taskID,job.prodSourceLabel,job) + # check cloud + _logger.debug("%s check cloud for %s" % (self.timestamp,job.taskID)) + retCloud = cloudResolver.checkCloud() + _logger.debug("%s checkCloud() -> %s" % (self.timestamp,retCloud)) + # failed + if retCloud == None: + _logger.error("failed to check cloud for %s" % job.taskID) + # append job to waiting list + jobsWaiting.append(job) + continue + # to be set + elif retCloud == "": + # collect LFN/GUID + tmpLFNs = [] + tmpGUIDs = [] + # set cloud + _logger.debug("%s set cloud for %s" % (self.timestamp,job.taskID)) + retCloud = cloudResolver.setCloud(tmpLFNs,tmpGUIDs,metadata=job.metadata) + _logger.debug("%s setCloud() -> %s" % (self.timestamp,retCloud)) + if retCloud == None: + _logger.debug("failed to set cloud for %s" % job.taskID) + # append job to waiting list + jobsWaiting.append(job) + continue + # append to map + 
cloudMap[job.taskID] = retCloud + # set cloud + job.cloud = cloudMap[job.taskID] + # message for TA + if self.onlyTA: + _logger.debug("%s set %s:%s" % (self.timestamp,job.taskID,job.cloud)) + # append job to processed list + jobsProcessed.append(job) + continue + # collect datasets + datasets = [] + for file in job.Files: + if file.type == 'input' and file.dispatchDBlock == 'NULL' \ + and (file.GUID == 'NULL' or job.prodSourceLabel in ['managed','test','ptest']): + if not file.dataset in datasets: + datasets.append(file.dataset) + # get LFN list + for dataset in datasets: + if not dataset in lfnMap.keys(): + prodError[dataset] = '' + lfnMap[dataset] = {} + # get LFNs + time.sleep(1) + for iDDMTry in range(3): + _logger.debug((self.timestamp,'listFilesInDataset',dataset)) + status,out = ddm.DQ2.main('listFilesInDataset',dataset) + if out.find("DQUnknownDatasetException") != -1: + break + elif status == -1: + break + elif status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error(out) + prodError[dataset] = 'could not get file list of prodDBlock %s' % dataset + _logger.error(prodError[dataset]) + # doesn't exist in DQ2 + if out.find('DQUnknownDatasetException') != -1: + missingDS[dataset] = "DS:%s not found in DQ2" % dataset + elif status == -1: + missingDS[dataset] = out + else: + # make map (key: LFN w/o attemptNr, value: LFN with attemptNr) + items = {} + try: + # protection for empty dataset + if out != '()': + exec "items = %s[0]" % out + # keep values to avoid redundant lookup + self.lfnDatasetMap[dataset] = items + # loop over all files + for guid,vals in items.iteritems(): + valMap[vals['lfn']] = {'guid' : guid, 'fsize' : vals['filesize'], + 'md5sum' : vals['checksum'], + 'chksum' : vals['checksum'], + 'scope' : vals['scope']} + genLFN = re.sub('\.\d+$','',vals['lfn']) + if lfnMap[dataset].has_key(genLFN): + # get attemptNr + newAttNr = 0 + newMat = re.search('\.(\d+)$',vals['lfn']) + if newMat != None: + newAttNr = int(newMat.group(1)) + oldAttNr = 0 + oldMat = re.search('\.(\d+)$',lfnMap[dataset][genLFN]) + if oldMat != None: + oldAttNr = int(oldMat.group(1)) + # compare + if newAttNr > oldAttNr: + lfnMap[dataset][genLFN] = vals['lfn'] + else: + lfnMap[dataset][genLFN] = vals['lfn'] + # mapping from LFN to DS + lfnDsMap[lfnMap[dataset][genLFN]] = dataset + except: + prodError[dataset] = 'could not convert HTTP-res to map for prodDBlock %s' % dataset + _logger.error(prodError[dataset]) + _logger.error(out) + # get replica locations + if (self.onlyTA or job.prodSourceLabel in ['managed','test']) \ + and prodError[dataset] == '' and (not replicaMap.has_key(dataset)): + if dataset.endswith('/'): + status,out = self.getListDatasetReplicasInContainer(dataset) + else: + for iDDMTry in range(3): + _logger.debug((self.timestamp,'listDatasetReplicas',dataset)) + status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1 \ + or out == '()': + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + prodError[dataset] = 'could not get locations for %s' % dataset + _logger.error(prodError[dataset]) + _logger.error(out) + else: + tmpRepSites = {} + try: + # convert 
res to map + exec "tmpRepSites = %s" % out + replicaMap[dataset] = tmpRepSites + except: + prodError[dataset] = 'could not convert HTTP-res to replica map for %s' % dataset + _logger.error(prodError[dataset]) + _logger.error(out) + # append except DBR + if not dataset.startswith('ddo'): + self.replicaMapForBroker[dataset] = tmpRepSites + # error + isFailed = False + # check for failed + for dataset in datasets: + if missingDS.has_key(dataset): + job.jobStatus = 'failed' + job.ddmErrorCode = ErrorCode.EC_GUID + job.ddmErrorDiag = missingDS[dataset] + # set missing + for tmpFile in job.Files: + if tmpFile.dataset == dataset: + tmpFile.status = 'missing' + # append + jobsFailed.append(job) + isFailed = True + # message for TA + if self.onlyTA: + _logger.error("%s %s" % (self.timestamp,missingDS[dataset])) + self.sendTaMesg("%s %s" % (job.taskID,missingDS[dataset]),msgType='error') + else: + _logger.debug("%s %s failed with %s" % (self.timestamp,job.PandaID,missingDS[dataset])) + break + if isFailed: + continue + # check for waiting + for dataset in datasets: + if prodError[dataset] != '': + # append job to waiting list + jobsWaiting.append(job) + isFailed = True + # message for TA + if self.onlyTA: + _logger.error("%s %s" % (self.timestamp,prodError[dataset])) + break + if isFailed: + continue + # set cloud + if panda_config.enableDynamicTA and job.prodSourceLabel in ['managed','validation'] \ + and job.cloud in ['NULL',''] and (not job.taskID in [None,'NULL',0]): + # look into map to check if it is already gotten + if not cloudMap.has_key(job.taskID): + # instantiate TaskAssigner + cloudResolver = TaskAssigner.TaskAssigner(self.taskBuffer,self.siteMapper, + job.taskID,job.prodSourceLabel,job) + # check cloud + _logger.debug("%s check cloud for %s" % (self.timestamp,job.taskID)) + retCloud = cloudResolver.checkCloud() + _logger.debug("%s checkCloud() -> %s" % (self.timestamp,retCloud)) + # failed + if retCloud == None: + _logger.error("failed to check cloud for %s" % job.taskID) + # append job to waiting list + jobsWaiting.append(job) + continue + # to be set + elif retCloud == "": + # collect LFN/GUID + tmpLFNs = [] + tmpGUIDs = [] + tmpReLoc = {} + tmpCountMap = {} + for dataset in datasets: + # get LFNs + eachDSLFNs = lfnMap[dataset].values() + tmpLFNs += eachDSLFNs + # get GUIDs + for oneLFN in eachDSLFNs: + tmpGUIDs.append(valMap[oneLFN]['guid']) + # locations + tmpReLoc[dataset] = replicaMap[dataset] + # file counts + tmpCountMap[dataset] = len(eachDSLFNs) + # set cloud + _logger.debug("%s set cloud for %s" % (self.timestamp,job.taskID)) + retCloud = cloudResolver.setCloud(tmpLFNs,tmpGUIDs,tmpReLoc,metadata=job.metadata, + fileCounts=tmpCountMap) + _logger.debug("%s setCloud() -> %s" % (self.timestamp,retCloud)) + if retCloud == None: + _logger.debug("failed to set cloud for %s" % job.taskID) + # append job to waiting list + jobsWaiting.append(job) + continue + # append to map + cloudMap[job.taskID] = retCloud + # set cloud + job.cloud = cloudMap[job.taskID] + # message for TA + if self.onlyTA: + _logger.debug("%s set %s:%s" % (self.timestamp,job.taskID,job.cloud)) + _logger.debug('%s replacing generic LFNs' % self.timestamp) + # replace generic LFN with real LFN + replaceList = [] + isFailed = False + for file in job.Files: + if file.type == 'input' and file.dispatchDBlock == 'NULL': + addToLfnMap = True + if file.GUID == 'NULL': + # get LFN w/o attemptNr + basename = re.sub('\.\d+$','',file.lfn) + if basename == file.lfn: + # replace + if basename in lfnMap[file.dataset].keys(): + 
file.lfn = lfnMap[file.dataset][basename] + replaceList.append((basename,file.lfn)) + # set GUID + if file.lfn in valMap: + file.GUID = valMap[file.lfn]['guid'] + file.fsize = valMap[file.lfn]['fsize'] + file.md5sum = valMap[file.lfn]['md5sum'] + file.checksum = valMap[file.lfn]['chksum'] + file.scope = valMap[file.lfn]['scope'] + # remove white space + if file.md5sum != None: + file.md5sum = file.md5sum.strip() + if file.checksum != None: + file.checksum = file.checksum.strip() + else: + if not job.prodSourceLabel in ['managed','test']: + addToLfnMap = False + # check missing file + if file.GUID == 'NULL' or job.prodSourceLabel in ['managed','test']: + if not file.lfn in valMap: + # append job to waiting list + errMsg = "GUID for %s not found in DQ2" % file.lfn + _logger.debug("%s %s" % (self.timestamp,errMsg)) + file.status = 'missing' + if not job in jobsFailed: + job.jobStatus = 'failed' + job.ddmErrorCode = ErrorCode.EC_GUID + job.ddmErrorDiag = errMsg + jobsFailed.append(job) + isFailed = True + continue + # add to allLFNs/allGUIDs + if addToLfnMap: + if not allLFNs.has_key(job.cloud): + allLFNs[job.cloud] = [] + if not allGUIDs.has_key(job.cloud): + allGUIDs[job.cloud] = [] + allLFNs[job.cloud].append(file.lfn) + allGUIDs[job.cloud].append(file.GUID) + # modify jobParameters + if not isFailed: + for patt,repl in replaceList: + job.jobParameters = re.sub('%s ' % patt, '%s ' % repl, job.jobParameters) + # append job to processed list + jobsProcessed.append(job) + # return if TA only + if self.onlyTA: + _logger.debug("%s end TA sessions" % self.timestamp) + return + _logger.debug('%s checking missing files at T1' % self.timestamp) + # get missing LFNs from source LRC/LFC + missLFNs = {} + for cloudKey in allLFNs.keys(): + # use BNL by default + dq2URL = self.siteMapper.getSite('BNL_ATLAS_1').dq2url + dq2SE = [] + # use cloud's source + if self.siteMapper.checkCloud(cloudKey): + tmpSrcID = self.siteMapper.getCloud(cloudKey)['source'] + tmpSrcSite = self.siteMapper.getSite(tmpSrcID) + # get LRC/LFC URL + if not tmpSrcSite.lfchost in [None,'']: + # LFC + dq2URL = 'lfc://'+tmpSrcSite.lfchost+':/grid/atlas/' + if tmpSrcSite.se != None: + for tmpSrcSiteSE in tmpSrcSite.se.split(','): + match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE) + if match != None: + dq2SE.append(match.group(1)) + # hack for split T1 + if cloudKey == 'NL': + tmpSplitSite = self.siteMapper.getSite('NIKHEF-ELPROD') + if tmpSplitSite.se != None: + for tmpSrcSiteSE in tmpSplitSite.se.split(','): + match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE) + if match != None: + dq2SE.append(match.group(1)) + else: + # LRC + dq2URL = tmpSrcSite.dq2url + dq2SE = [] + # get missing files + tmpMissLFNs = brokerage.broker_util.getMissLFNsFromLRC(allLFNs[cloudKey],dq2URL,allGUIDs[cloudKey],dq2SE) + # append + if not missLFNs.has_key(cloudKey): + missLFNs[cloudKey] = [] + missLFNs[cloudKey] += tmpMissLFNs + _logger.debug('%s checking T2 LFC' % self.timestamp) + # check availability of files at T2 + for cloudKey,tmpAllLFNs in allLFNs.iteritems(): + if len(self.jobs) > 0 and (self.jobs[0].prodSourceLabel in ['user','panda','ddm'] or \ + self.jobs[0].processingType.startswith('gangarobot') or \ + self.jobs[0].processingType.startswith('hammercloud')): + continue + # add cloud + if not self.availableLFNsInT2.has_key(cloudKey): + self.availableLFNsInT2[cloudKey] = {} + # loop over all files to find datasets + for tmpCheckLFN in tmpAllLFNs: + # add dataset + if not lfnDsMap.has_key(tmpCheckLFN): + continue + tmpDsName = 
lfnDsMap[tmpCheckLFN] + if not self.availableLFNsInT2[cloudKey].has_key(tmpDsName): + # collect sites + tmpSiteNameDQ2Map = DataServiceUtils.getSitesWithDataset(tmpDsName,self.siteMapper,replicaMap,cloudKey,getDQ2ID=True) + if tmpSiteNameDQ2Map == {}: + continue + self.availableLFNsInT2[cloudKey][tmpDsName] = {'allfiles':[],'allguids':[],'sites':{}} + for tmpSiteName in tmpSiteNameDQ2Map.keys(): + self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName] = [] + self.availableLFNsInT2[cloudKey][tmpDsName]['siteDQ2IDs'] = tmpSiteNameDQ2Map + # add files + if not tmpCheckLFN in self.availableLFNsInT2[cloudKey][tmpDsName]: + self.availableLFNsInT2[cloudKey][tmpDsName]['allfiles'].append(tmpCheckLFN) + self.availableLFNsInT2[cloudKey][tmpDsName]['allguids'].append(allGUIDs[cloudKey][allLFNs[cloudKey].index(tmpCheckLFN)]) + # get available files at each T2 + for tmpDsName in self.availableLFNsInT2[cloudKey].keys(): + checkedDq2SiteMap = {} + checkLfcSeMap = {} + for tmpSiteName in self.availableLFNsInT2[cloudKey][tmpDsName]['sites'].keys(): + tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) + # add LFC + if not checkLfcSeMap.has_key(tmpSiteSpec.lfchost): + checkLfcSeMap[tmpSiteSpec.lfchost] = {} + # add site + if not checkLfcSeMap[tmpSiteSpec.lfchost].has_key(tmpSiteName): + checkLfcSeMap[tmpSiteSpec.lfchost][tmpSiteName] = [] + # add SE + if tmpSiteSpec.se != None: + for tmpSrcSiteSE in tmpSiteSpec.se.split(','): + match = re.search('.+://([^:/]+):*\d*/*',tmpSrcSiteSE) + if match != None: + checkLfcSeMap[tmpSiteSpec.lfchost][tmpSiteName].append(match.group(1)) + # LFC lookup + for tmpLfcHost in checkLfcSeMap.keys(): + # get SEs + tmpSEList = [] + for tmpSiteName in checkLfcSeMap[tmpLfcHost].keys(): + tmpSEList += checkLfcSeMap[tmpLfcHost][tmpSiteName] + # get available file list + _logger.debug('%s checking T2 LFC=%s for %s' % (self.timestamp,tmpLfcHost,tmpSEList)) + bulkAvFiles = brokerage.broker_util.getFilesFromLRC(self.availableLFNsInT2[cloudKey][tmpDsName]['allfiles'], + 'lfc://'+tmpLfcHost+':/grid/atlas/', + self.availableLFNsInT2[cloudKey][tmpDsName]['allguids'], + storageName=tmpSEList,getPFN=True) + # check each site + for tmpSiteName in checkLfcSeMap[tmpLfcHost].keys(): + self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName] = [] + for tmpLFNck,tmpPFNlistck in bulkAvFiles.iteritems(): + siteHasFileFlag = False + for tmpPFNck in tmpPFNlistck: + # check se + for tmpSE in checkLfcSeMap[tmpLfcHost][tmpSiteName]: + if '://'+tmpSE in tmpPFNck: + siteHasFileFlag = True + break + # escape + if siteHasFileFlag: + break + # append + if siteHasFileFlag: + self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName].append(tmpLFNck) + _logger.debug('%s available %s files at %s T2=%s for %s' % \ + (self.timestamp, + len(self.availableLFNsInT2[cloudKey][tmpDsName]['sites'][tmpSiteName]), + cloudKey,tmpSiteName,tmpDsName)) + _logger.debug('%s missLFNs at T1 %s' % (self.timestamp,missLFNs)) + # check if files in source LRC/LFC + tmpJobList = tuple(jobsProcessed) + for job in tmpJobList: + # check only production/test jobs + if not job.prodSourceLabel in ['managed','test','software','rc_test','ptest']: + continue + # don't check if site is already set + if job.prodSourceLabel in ['managed','test'] and not job.computingSite in ['NULL','',None]: + continue + missingFlag = False + for file in job.Files: + if file.type == 'input': + if missLFNs.has_key(job.cloud) and file.lfn in missLFNs[job.cloud]: + # set file status + file.status = 'missing' + missingFlag = True + # check if 
missing files are available at T2s + goToT2 = None + if missingFlag: + tmpCandT2s = None + for tmpFile in job.Files: + if tmpFile.type == 'input' and tmpFile.status == 'missing': + # no cloud info + if not self.availableLFNsInT2.has_key(job.cloud): + goToT2 = False + break + # no dataset info + if not self.availableLFNsInT2[job.cloud].has_key(tmpFile.dataset): + goToT2 = False + break + # initial candidates + if tmpCandT2s == None: + tmpCandT2s = self.availableLFNsInT2[job.cloud][tmpFile.dataset]['sites'] + # check all candidates + newCandT2s = [] + for tmpCandT2 in tmpCandT2s: + # site doesn't have the dataset + if not self.availableLFNsInT2[job.cloud][tmpFile.dataset]['sites'].has_key(tmpCandT2): + continue + # site has the file + if tmpFile.lfn in self.availableLFNsInT2[job.cloud][tmpFile.dataset]['sites'][tmpCandT2]: + if not tmpCandT2 in newCandT2s: + newCandT2s.append(tmpCandT2) + # set new candidates + tmpCandT2s = newCandT2s + # no candidates left + if tmpCandT2s == []: + goToT2 = False + break + # go to T2 + if goToT2 == None: + goToT2 = True + # remove job not to process further + if missingFlag and goToT2 != True: + jobsProcessed.remove(job) + # revert + for oJob in self.jobs: + if oJob.PandaID == job.PandaID: + jobsWaiting.append(oJob) + break + # get missing datasets + if missingFlag: + if job.processingType.startswith('gangarobot') or \ + job.processingType.startswith('hammercloud'): + pass + elif not job.prodSourceLabel in ['managed']: + pass + else: + for tmpFile in job.Files: + if tmpFile.type == 'input' and tmpFile.status == 'missing' and \ + not tmpFile.dataset.startswith('ddo'): + # append + if not self.missingDatasetList.has_key(job.cloud): + self.missingDatasetList[job.cloud] = {} + if not self.missingDatasetList[job.cloud].has_key(tmpFile.dataset): + self.missingDatasetList[job.cloud][tmpFile.dataset] = [] + if not tmpFile.GUID in self.missingDatasetList[job.cloud][tmpFile.dataset]: + self.missingDatasetList[job.cloud][tmpFile.dataset].append(tmpFile.GUID) + # set data summary fields + for tmpJob in self.jobs: + try: + # set only for production/analysis/test + if not tmpJob.prodSourceLabel in ['managed','test','rc_test','ptest','user']: + continue + # loop over all files + tmpJob.nInputDataFiles = 0 + tmpJob.inputFileBytes = 0 + tmpInputFileProject = None + tmpInputFileType = None + for tmpFile in tmpJob.Files: + # use input files and ignore DBR/lib.tgz + if tmpFile.type == 'input' and (not tmpFile.dataset.startswith('ddo')) \ + and not tmpFile.lfn.endswith('.lib.tgz'): + tmpJob.nInputDataFiles += 1 + if not tmpFile.fsize in ['NULL',None,0,'0']: + tmpJob.inputFileBytes += tmpFile.fsize + # get input type and project + if tmpInputFileProject == None: + tmpInputItems = tmpFile.dataset.split('.') + # input project + tmpInputFileProject = tmpInputItems[0] + # input type. 
ignore user/group/groupXY + if len(tmpInputItems) > 4 and (not tmpInputItems[0] in ['','NULL','user','group']) \ + and (not tmpInputItems[0].startswith('group')): + tmpInputFileType = tmpInputItems[4] + # set input type and project + if not tmpJob.prodDBlock in ['',None,'NULL']: + # input project + if tmpInputFileProject != None: + tmpJob.inputFileProject = tmpInputFileProject + # input type + if tmpInputFileType != None: + tmpJob.inputFileType = tmpInputFileType + # protection + maxInputFileBytes = 99999999999 + if tmpJob.inputFileBytes > maxInputFileBytes: + tmpJob.inputFileBytes = maxInputFileBytes + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("failed to set data summary fields for PandaID=%s: %s %s" % (tmpJob.PandaID,errType,errValue)) + # send jobs to jobsWaiting + self.taskBuffer.keepJobs(jobsWaiting) + # update failed job + self.taskBuffer.updateJobs(jobsFailed,True) + # remove waiting/failed jobs + self.jobs = jobsProcessed + # delete huge variables + del lfnMap + del valMap + del prodError + del jobsWaiting + del jobsProcessed + del allLFNs + del allGUIDs + del cloudMap + del missLFNs + + + # remove waiting jobs + def removeWaitingJobs(self): + jobsWaiting = [] + jobsProcessed = [] + for tmpJob in self.jobs: + if tmpJob.jobStatus == 'waiting': + jobsWaiting.append(tmpJob) + else: + jobsProcessed.append(tmpJob) + # send jobs to jobsWaiting + self.taskBuffer.keepJobs(jobsWaiting) + # remove waiting/failed jobs + self.jobs = jobsProcessed + + + # memory checker + def _memoryCheck(self): + try: + import os + proc_status = '/proc/%d/status' % os.getpid() + procfile = open(proc_status) + name = "" + vmSize = "" + vmRSS = "" + # extract Name,VmSize,VmRSS + for line in procfile: + if line.startswith("Name:"): + name = line.split()[-1] + continue + if line.startswith("VmSize:"): + vmSize = "" + for item in line.split()[1:]: + vmSize += item + continue + if line.startswith("VmRSS:"): + vmRSS = "" + for item in line.split()[1:]: + vmRSS += item + continue + procfile.close() + _logger.debug('%s MemCheck PID=%s Name=%s VSZ=%s RSS=%s' % (self.timestamp,os.getpid(),name,vmSize,vmRSS)) + except: + type, value, traceBack = sys.exc_info() + _logger.error("memoryCheck() : %s %s" % (type,value)) + _logger.debug('%s MemCheck PID=%s unknown' % (self.timestamp,os.getpid())) + return + + + # check DDM response + def isDQ2ok(self,out): + if out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1 \ + or out == '()': + return False + return True + + + # get list of files in dataset + def getListFilesInDataset(self,dataset): + # use cache data + if self.lfnDatasetMap.has_key(dataset): + return True,self.lfnDatasetMap[dataset] + for iDDMTry in range(3): + _logger.debug((self.timestamp,'listFilesInDataset',dataset)) + status,out = ddm.DQ2.main('listFilesInDataset',dataset) + if out.find("DQUnknownDatasetException") != -1: + break + elif status == -1: + break + elif status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error("%s %s" % (self.timestamp,out)) + return False,{} + # convert + items = {} + try: + exec "items = %s[0]" % out + except: + return False,{} + return True,items + + + # get list of datasets in container + def getListDatasetInContainer(self,container): + # get datasets in container + _logger.debug((self.timestamp,'listDatasetsInContainer',container)) + for iDDMTry in range(3): + status,out = 
ddm.DQ2.main('listDatasetsInContainer',container) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + _logger.debug('%s %s' % (self.timestamp,out)) + if status != 0 or out.startswith('Error'): + return False,out + datasets = [] + try: + # convert to list + exec "datasets = %s" % out + except: + return False,out + return True,datasets + + + def getListDatasetReplicasInContainer(self,container,getMap=False): + # get datasets in container + _logger.debug((self.timestamp,'listDatasetsInContainer',container)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('listDatasetsInContainer',container) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1 \ + or out == '()': + time.sleep(60) + else: + break + _logger.debug('%s %s' % (self.timestamp,out)) + if status != 0 or out.startswith('Error'): + return status,out + datasets = [] + try: + # convert to list + exec "datasets = %s" % out + except: + return status,out + # loop over all datasets + allRepMap = {} + for dataset in datasets: + _logger.debug((self.timestamp,'listDatasetReplicas',dataset)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1 \ + or out == '()': + time.sleep(60) + else: + break + _logger.debug('%s %s' % (self.timestamp,out)) + if status != 0 or out.startswith('Error'): + return status,out + tmpRepSites = {} + try: + # convert res to map + exec "tmpRepSites = %s" % out + except: + return status,out + # get map + if getMap: + allRepMap[dataset] = tmpRepSites + continue + # otherwise get sum + for siteId,statList in tmpRepSites.iteritems(): + if not allRepMap.has_key(siteId): + # append + allRepMap[siteId] = [statList[-1],] + else: + # add + newStMap = {} + for stName,stNum in allRepMap[siteId][0].iteritems(): + if statList[-1].has_key(stName): + # try mainly for archived=None + try: + newStMap[stName] = stNum + statList[-1][stName] + except: + newStMap[stName] = stNum + else: + newStMap[stName] = stNum + allRepMap[siteId] = [newStMap,] + # return + _logger.debug('%s %s' % (self.timestamp,str(allRepMap))) + if not getMap: + return 0,str(allRepMap) + else: + return 0,allRepMap + + + # get list of replicas for a dataset + def getListDatasetReplicas(self,dataset): + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug("%s %s/%s listDatasetReplicas %s" % (self.timestamp,iDDMTry,nTry,dataset)) + status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if status != 0 or out.startswith('Error'): + _logger.error(self.timestamp+' '+out) + _logger.error('%s bad DQ2 response for %s' % (self.timestamp,dataset)) + return False,{} + try: + # convert res to map + exec "tmpRepSites = %s" % out + _logger.debug('%s getListDatasetReplicas->%s' % (self.timestamp,str(tmpRepSites))) + return True,tmpRepSites + except: + _logger.error(self.timestamp+' '+out) + _logger.error('%s could not convert HTTP-res to replica map for %s' % (self.timestamp,dataset)) + return False,{} + + + # delete original locations + def deleteDatasetReplicas(self,datasets,keepSites): + # loop over all datasets + for dataset in datasets: + # get locations + status,tmpRepSites = 
self.getListDatasetReplicas(dataset) + if not status: + return False + # no replicas + if len(tmpRepSites.keys()) == 0: + continue + delSites = [] + for tmpRepSite in tmpRepSites.keys(): + if not tmpRepSite in keepSites: + delSites.append(tmpRepSite) + # no repilicas to be deleted + if delSites == []: + continue + # delete + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug("%s %s/%s deleteDatasetReplicas %s %s" % (self.timestamp,iDDMTry,nTry,dataset,str(delSites))) + status,out = ddm.DQ2.main('deleteDatasetReplicas',dataset,delSites) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if status != 0 or out.startswith('Error'): + _logger.error(self.timestamp+' '+out) + _logger.error('%s bad DQ2 response for %s' % (self.timestamp,dataset)) + return False + _logger.debug(self.timestamp+' '+out) + # return + _logger.debug('%s deleted replicas for %s' % (self.timestamp,str(datasets))) + return True + + + # dynamic data placement for analysis jobs + def _dynamicDataPlacement(self): + # no jobs + if len(self.jobs) == 0: + return + # only successful analysis + if self.jobs[0].jobStatus in ['failed','cancelled'] or (not self.jobs[0].prodSourceLabel in ['user','panda']): + return + # execute + _logger.debug('%s execute PD2P' % self.timestamp) + from DynDataDistributer import DynDataDistributer + ddd = DynDataDistributer(self.jobs,self.taskBuffer,self.siteMapper) + ddd.run() + _logger.debug('%s finished PD2P' % self.timestamp) + return + + + # make dis datasets for existing files to avoid deletion when jobs are queued + def _makeDisDatasetsForExistingfiles(self): + _logger.debug('%s make dis datasets for existing files' % self.timestamp) + # collect existing files + dsFileMap = {} + nMaxJobs = 20 + nJobsMap = {} + for tmpJob in self.jobs: + # use production or test jobs only + if not tmpJob.prodSourceLabel in ['managed','test']: + continue + # ignore inappropriate status + if tmpJob.jobStatus in ['failed','cancelled','waiting']: + continue + # check cloud + if (tmpJob.cloud == 'ND' and self.siteMapper.getSite(tmpJob.computingSite).cloud == 'ND') or \ + (tmpJob.cloud == 'US' and self.siteMapper.getSite(tmpJob.computingSite).cloud == 'US'): + continue + # check SE to use T2 only + tmpSrcID = self.siteMapper.getCloud(tmpJob.cloud)['source'] + srcSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpSrcID).se) + dstSEs = brokerage.broker_util.getSEfromSched(self.siteMapper.getSite(tmpJob.computingSite).se) + if srcSEs == dstSEs: + continue + # look for log _sub dataset to be used as a key + logSubDsName = '' + for tmpFile in tmpJob.Files: + if tmpFile.type == 'log': + logSubDsName = tmpFile.destinationDBlock + break + # append site + destDQ2ID = self.siteMapper.getSite(tmpJob.computingSite).ddm + # T1 used as T2 + if tmpJob.cloud != self.siteMapper.getSite(tmpJob.computingSite).cloud and \ + not destDQ2ID.endswith('PRODDISK') and \ + self.siteMapper.getSite(tmpJob.computingSite).cloud in ['US']: + tmpSeTokens = self.siteMapper.getSite(tmpJob.computingSite).setokens + if tmpSeTokens.has_key('ATLASPRODDISK'): + destDQ2ID = tmpSeTokens['ATLASPRODDISK'] + mapKeyJob = (destDQ2ID,logSubDsName) + # increment the number of jobs per key + if not nJobsMap.has_key(mapKeyJob): + nJobsMap[mapKeyJob] = 0 + mapKey = (destDQ2ID,logSubDsName,nJobsMap[mapKeyJob]/nMaxJobs) + nJobsMap[mapKeyJob] += 1 + if not dsFileMap.has_key(mapKey): + dsFileMap[mapKey] = {} + # add files + for tmpFile in tmpJob.Files: + if tmpFile.type != 'input': + continue + # if files are 
unavailable at the dest site normal dis datasets contain them + # or files are cached + if not tmpFile.status in ['ready']: + continue + # if available at T2 + realDestDQ2ID = (destDQ2ID,) + if self.availableLFNsInT2.has_key(tmpJob.cloud) and self.availableLFNsInT2[tmpJob.cloud].has_key(tmpFile.dataset) \ + and self.availableLFNsInT2[tmpJob.cloud][tmpFile.dataset]['sites'].has_key(tmpJob.computingSite) \ + and tmpFile.lfn in self.availableLFNsInT2[tmpJob.cloud][tmpFile.dataset]['sites'][tmpJob.computingSite]: + realDestDQ2ID = self.availableLFNsInT2[tmpJob.cloud][tmpFile.dataset]['siteDQ2IDs'][tmpJob.computingSite] + realDestDQ2ID = tuple(realDestDQ2ID) + # append + if not dsFileMap[mapKey].has_key(realDestDQ2ID): + dsFileMap[mapKey][realDestDQ2ID] = {'taskID':tmpJob.taskID, + 'PandaID':tmpJob.PandaID, + 'files':{}} + if not dsFileMap[mapKey][realDestDQ2ID]['files'].has_key(tmpFile.lfn): + dsFileMap[mapKey][realDestDQ2ID]['files'][tmpFile.lfn] = {'lfn' :tmpFile.lfn, + 'guid':tmpFile.GUID, + 'fileSpecs':[]} + # add file spec + dsFileMap[mapKey][realDestDQ2ID]['files'][tmpFile.lfn]['fileSpecs'].append(tmpFile) + # loop over all locations + dispList = [] + for tmpMapKey,tmpDumVal in dsFileMap.iteritems(): + tmpDumLocation,tmpLogSubDsName,tmpBunchIdx = tmpMapKey + for tmpLocationList,tmpVal in tmpDumVal.iteritems(): + for tmpLocation in tmpLocationList: + tmpFileList = tmpVal['files'] + if tmpFileList == {}: + continue + nMaxFiles = 500 + iFiles = 0 + iLoop = 0 + while iFiles < len(tmpFileList): + subFileNames = tmpFileList.keys()[iFiles:iFiles+nMaxFiles] + if len(subFileNames) == 0: + break + # dis name + disDBlock = "panda.%s.%s.%s.%s_dis0%s%s" % (tmpVal['taskID'],time.strftime('%m.%d'),'GEN', + commands.getoutput('uuidgen'),iLoop, + tmpVal['PandaID']) + iFiles += nMaxFiles + lfns = [] + guids = [] + fsizes = [] + chksums = [] + for tmpSubFileName in subFileNames: + lfns.append(tmpFileList[tmpSubFileName]['lfn']) + guids.append(tmpFileList[tmpSubFileName]['guid']) + fsizes.append(None) + chksums.append(None) + # set dis name + for tmpFileSpec in tmpFileList[tmpSubFileName]['fileSpecs']: + if tmpFileSpec.status in ['ready'] and tmpFileSpec.dispatchDBlock == 'NULL': + tmpFileSpec.dispatchDBlock = disDBlock + # register datasets + iLoop += 1 + _logger.debug((self.timestamp,'ext registerNewDataset',disDBlock,lfns,guids,fsizes,chksums, + None,None,None,True)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('registerNewDataset',disDBlock,lfns,guids,fsizes,chksums, + None,None,None,True) + if status != 0 and out.find('DQDatasetExistsException') != -1: + break + elif status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + _logger.debug("%s sleep %s for %s" % (self.timestamp,iDDMTry,disDBlock)) + _logger.debug(status) + _logger.debug(out) + time.sleep(60) + else: + break + if status != 0 or out.find('Error') != -1: + _logger.error("%s %s" % (self.timestamp,out)) + continue + _logger.debug("%s %s" % (self.timestamp,out)) + # get VUID + try: + exec "vuid = %s['vuid']" % out + # dataset spec. 
currentfiles is used to count the number of failed jobs + ds = DatasetSpec() + ds.vuid = vuid + ds.name = disDBlock + ds.type = 'dispatch' + ds.status = 'defined' + ds.numberfiles = len(lfns) + ds.currentfiles = 0 + dispList.append(ds) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("ext registerNewDataset : failed to decode VUID for %s - %s %s" % (disDBlock,errType,errValue)) + continue + # freezeDataset dispatch dataset + _logger.debug((self.timestamp,'freezeDataset',disDBlock)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('freezeDataset',disDBlock) + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + if status != 0 or (out.find('Error') != -1 and out.find("is frozen") == -1): + _logger.error("%s %s" % (self.timestamp,out)) + continue + _logger.debug("%s %s" % (self.timestamp,out)) + # register location + _logger.debug((self.timestamp,'registerDatasetLocation',disDBlock,tmpLocation,0,1,None,None,None,"7 days")) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('registerDatasetLocation',disDBlock,tmpLocation,0,1,None,None,None,"7 days") + if status != 0 or out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1: + time.sleep(60) + else: + break + _logger.debug("%s %s" % (self.timestamp,out)) + # failure + if status != 0 or out.find('Error') != -1: + _logger.error("%s %s" % (self.timestamp,out)) + continue + # insert datasets to DB + self.taskBuffer.insertDatasets(dispList) + _logger.debug('%s finished to make dis datasets for existing files' % self.timestamp) + return + + + # pin input dataset + def _pinInputDatasets(self): + _logger.debug('%s pin input datasets' % self.timestamp) + # collect input datasets and locations + doneList = [] + allReplicaMap = {} + for tmpJob in self.jobs: + # ignore HC jobs + if tmpJob.processingType.startswith('gangarobot') or \ + tmpJob.processingType.startswith('hammercloud'): + continue + # use production or test or user jobs only + if not tmpJob.prodSourceLabel in ['managed','test','user']: + continue + # ignore inappropriate status + if tmpJob.jobStatus in ['failed','cancelled','waiting']: + continue + # set lifetime + if tmpJob.prodSourceLabel in ['managed','test']: + pinLifeTime = 7 + else: + pinLifeTime = 7 + # get source + if tmpJob.prodSourceLabel in ['managed','test']: + tmpSrcID = self.siteMapper.getCloud(tmpJob.cloud)['source'] + srcDQ2ID = self.siteMapper.getSite(tmpSrcID).ddm + else: + srcDQ2ID = self.siteMapper.getSite(tmpJob.computingSite).ddm + # prefix of DQ2 ID + srcDQ2IDprefix = re.sub('_[A-Z,0-9]+DISK$','',srcDQ2ID) + # loop over all files + for tmpFile in tmpJob.Files: + # use input files and ignore DBR/lib.tgz + if tmpFile.type == 'input' and \ + not tmpFile.lfn.endswith('.lib.tgz') and \ + not tmpFile.dataset.startswith('ddo') and \ + not tmpFile.dataset.startswith('user') and \ + not tmpFile.dataset.startswith('group'): + # get replica locations + if not allReplicaMap.has_key(tmpFile.dataset): + if tmpFile.dataset.endswith('/'): + status,tmpRepSitesMap = self.getListDatasetReplicasInContainer(tmpFile.dataset,getMap=True) + if status == 0: + status = True + else: + status = False + else: + status,tmpRepSites = self.getListDatasetReplicas(tmpFile.dataset) + tmpRepSitesMap = {} + tmpRepSitesMap[tmpFile.dataset] = tmpRepSites + # 
append + if status: + allReplicaMap[tmpFile.dataset] = tmpRepSitesMap + else: + # set empty to avoid further lookup + allReplicaMap[tmpFile.dataset] = {} + # loop over constituent datasets + _logger.debug('%s pin DQ2 prefix=%s' % (self.timestamp,srcDQ2IDprefix)) + for tmpDsName,tmpRepSitesMap in allReplicaMap[tmpFile.dataset].iteritems(): + # loop over locations + for tmpRepSite in tmpRepSitesMap.keys(): + if tmpRepSite.startswith(srcDQ2IDprefix) \ + and not 'TAPE' in tmpRepSite \ + and not 'SCRATCH' in tmpRepSite: + tmpKey = (tmpDsName,tmpRepSite) + # already done + if tmpKey in doneList: + continue + # append to avoid repetition + doneList.append(tmpKey) + # get metadata + status,tmpMetadata = self.getReplicaMetadata(tmpDsName,tmpRepSite) + if not status: + continue + # check pin lifetime + if tmpMetadata.has_key('pin_expirationdate'): + if isinstance(tmpMetadata['pin_expirationdate'],types.StringType) and tmpMetadata['pin_expirationdate'] != 'None': + # keep original pin lifetime if it is longer + origPinLifetime = datetime.datetime.strptime(tmpMetadata['pin_expirationdate'],'%Y-%m-%d %H:%M:%S') + if origPinLifetime > datetime.datetime.utcnow()+datetime.timedelta(days=pinLifeTime): + _logger.debug('%s skip pinning for %s:%s due to longer lifetime %s' % (self.timestamp, + tmpDsName,tmpRepSite, + tmpMetadata['pin_expirationdate'])) + continue + # set pin lifetime + status = self.setReplicaMetadata(tmpDsName,tmpRepSite,'pin_lifetime','%s days' % pinLifeTime) + # retrun + _logger.debug('%s pin input datasets done' % self.timestamp) + return + + + # make T1 subscription for missing files + def _makeSubscriptionForMissing(self): + _logger.debug('%s make subscriptions for missing files' % self.timestamp) + # collect datasets + missingList = {} + for tmpCloud,tmpMissDatasets in self.missingDatasetList.iteritems(): + # append cloud + if not missingList.has_key(tmpCloud): + missingList[tmpCloud] = [] + # loop over all datasets + for tmpDsName,tmpMissFiles in tmpMissDatasets.iteritems(): + # check if datasets in container are used + if tmpDsName.endswith('/'): + # convert container to datasets + tmpStat,tmpDsList = self.getListDatasetInContainer(tmpDsName) + if not tmpStat: + _logger.error('%s failed to get datasets in container:%s' % (self.timestamp,tmpDsName)) + continue + # check if each dataset is actually used + for tmpConstDsName in tmpDsList: + # skip if already checked + if tmpDsName in missingList[tmpCloud]: + continue + # get files in each dataset + tmpStat,tmpFilesInDs = self.getListFilesInDataset(tmpConstDsName) + if not tmpStat: + _logger.error('%s failed to get files in dataset:%s' % (self.timestamp,tmpConstDsName)) + continue + # loop over all files to check the dataset is used + for tmpGUID in tmpMissFiles: + # append if used + if tmpFilesInDs.has_key(tmpGUID): + missingList[tmpCloud].append(tmpConstDsName) + break + else: + # append dataset w/o checking + if not tmpDsName in missingList[tmpCloud]: + missingList[tmpCloud].append(tmpDsName) + # make subscriptions + for tmpCloud,missDsNameList in missingList.iteritems(): + # get distination + tmpDstID = self.siteMapper.getCloud(tmpCloud)['source'] + dstDQ2ID = self.siteMapper.getSite(tmpDstID).ddm + # register subscription + for missDsName in missDsNameList: + _logger.debug('%s make subscription at %s for missing %s' % (self.timestamp,dstDQ2ID,missDsName)) + self.makeSubscription(missDsName,dstDQ2ID) + # retrun + _logger.debug('%s make subscriptions for missing files done' % self.timestamp) + return + + + # check DDM response + def 
isDQ2ok(self,out): + if out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1 \ + or out == '()': + return False + return True + + + # make subscription + def makeSubscription(self,dataset,dq2ID): + # return for failuer + retFailed = False + # make subscription + optSrcPolicy = 000001 + nTry = 3 + for iDDMTry in range(nTry): + # register subscription + _logger.debug('%s %s/%s registerDatasetSubscription %s %s' % (self.timestamp,iDDMTry,nTry,dataset,dq2ID)) + status,out = ddm.DQ2.main('registerDatasetSubscription',dataset,dq2ID,version=0,archived=0, + callbacks={},sources={},sources_policy=optSrcPolicy, + wait_for_sources=0,destination=None,query_more_sources=0, + sshare="production",group=None,activity='Production',acl_alias='secondary') + status,out = 0,'' + if out.find('DQSubscriptionExistsException') != -1: + break + elif out.find('DQLocationExistsException') != -1: + break + elif status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if out.find('DQSubscriptionExistsException') != -1: + pass + elif status != 0 or out.startswith('Error'): + _logger.error("%s %s" % (self.timestamp,out)) + return retFailed + # update + _logger.debug('%s %s %s' % (self.timestamp,status,out)) + # return + return True + + + # get replica metadata + def getReplicaMetadata(self,datasetName,locationName): + # response for failure + resForFailure = False,{} + # get metadata + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug('%s %s/%s listMetaDataReplica %s %s' % (self.timestamp,iDDMTry,nTry,datasetName,locationName)) + status,out = ddm.DQ2.main('listMetaDataReplica',locationName,datasetName) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error("%s %s" % (self.timestamp,out)) + return resForFailure + metadata = {} + try: + # convert to map + exec "metadata = %s" % out + except: + _logger.error('%s could not convert HTTP-res to replica metadata for %s:%s' % \ + (self.timestamp,datasetName,locationName)) + return resForFailure + # return + _logger.debug('%s getReplicaMetadata -> %s' % (self.timestamp,str(metadata))) + return True,metadata + + + # set replica metadata + def setReplicaMetadata(self,datasetName,locationName,attrname,attrvalue): + # response for failure + resForFailure = False + # get metadata + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug('%s %s/%s setReplicaMetaDataAttribute %s %s %s=%s' % (self.timestamp,iDDMTry,nTry,datasetName, + locationName,attrname,attrvalue)) + status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',datasetName,locationName,attrname,attrvalue) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error("%s %s" % (self.timestamp,out)) + return resForFailure + # return + _logger.debug('%s setReplicaMetadata done' % self.timestamp) + return True + + + # send task brokerage message to logger + def sendTaMesg(self,message,msgType=None): + try: + # get logger + tmpPandaLogger = PandaLogger() + # lock HTTP handler + tmpPandaLogger.lock() + tmpPandaLogger.setParams({'Type':'taskbrokerage'}) + # use bamboo for loggername + if panda_config.loggername == 'prod': + tmpLogger = tmpPandaLogger.getHttpLogger('bamboo') + else: + # for dev + tmpLogger = tmpPandaLogger.getHttpLogger(panda_config.loggername) + # add message + if msgType=='error': + tmpLogger.error(message) + 
elif msgType=='warning': + tmpLogger.warning(message) + elif msgType=='info': + tmpLogger.info(message) + else: + tmpLogger.debug(message) + # release HTTP handler + tmpPandaLogger.release() + except: + pass + time.sleep(1) + diff --git a/current/pandaserver/dataservice/TaLauncher.py b/current/pandaserver/dataservice/TaLauncher.py new file mode 100755 index 000000000..e44a7bc72 --- /dev/null +++ b/current/pandaserver/dataservice/TaLauncher.py @@ -0,0 +1,55 @@ +''' +launcer for TaskAssigner + +''' + +import sys +import time +import commands +import threading +import cPickle as pickle + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('TaLauncher') + + +class TaLauncher (threading.Thread): + # constructor + def __init__(self,taskBuffer,jobs): + threading.Thread.__init__(self) + self.jobs = jobs + self.taskBuffer = taskBuffer + # time stamp + self.timestamp = time.asctime() + + + # main + def run(self): + try: + _logger.debug('%s startRun' % self.timestamp) + # run setupper sequentially + for job in self.jobs: + # write jobs to file + outFileName = '%s/set.%s_%s' % (panda_config.logdir,job.PandaID,commands.getoutput('uuidgen')) + outFile = open(outFileName,'w') + pickle.dump([job],outFile) + outFile.close() + # run main procedure in another process because python doesn't release memory + com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) + com += 'source /opt/glite/etc/profile.d/grid-env.sh; ' + com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \ + (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python, + panda_config.pandaPython_dir,outFileName) + # add option for TA + com += " -t" + _logger.debug('%s taskID:%s %s' % (self.timestamp,job.taskID,com)) + # exeute + status,output = self.taskBuffer.processLimiter.getstatusoutput(com) + _logger.debug("%s Ret from child process: %s %s" % (self.timestamp,status,output)) + _logger.debug('%s endRun' % self.timestamp) + except: + type, value, traceBack = sys.exc_info() + _logger.error("run() : %s %s" % (type,value)) diff --git a/current/pandaserver/dataservice/TaskAssigner.py b/current/pandaserver/dataservice/TaskAssigner.py new file mode 100644 index 000000000..677cd4645 --- /dev/null +++ b/current/pandaserver/dataservice/TaskAssigner.py @@ -0,0 +1,1180 @@ +''' +setup cloud + +''' + +import re +import sys +import time +import types +import random +import commands +import datetime +import brokerage.broker_util +from DDM import ddm +from DDM import dq2Common +from DDM import toa +from config import panda_config +from taskbuffer import ProcessGroups +from pandalogger.PandaLogger import PandaLogger +import DataServiceUtils + + +# logger +_logger = PandaLogger().getLogger('TaskAssigner') + +# cutoff for RW +thr_RW_low = 400 +thr_RW_high = 8000 +thr_RW_sub = 600 + +# cutoff for disk +thr_space_low = (1 * 1024) + +# special reduction for TAPE +reductionForTape = 0.5 + +# task types using MC share +taskTypesMcShare = ['evgen'] + +# task types for subscriptions +taskTypesSub = ['simul'] + +# dataset type to ignore file availability check +datasetTypeToSkipCheck = ['log'] + +class TaskAssigner: + # constructor + def __init__(self,taskBuffer,siteMapper,taskID,prodSourceLabel,job): + self.taskBuffer = taskBuffer + self.siteMapper = siteMapper + self.taskID = taskID + self.cloudTask = None + self.prodSourceLabel = prodSourceLabel + self.cloudForSubs = [] + self.job 
= job + self.metadataMap = {} + self.contDsMap = {} + + + # check cloud + def checkCloud(self): + try: + _logger.info('%s checkCloud' % self.taskID) + # get CloudTask from DB + self.cloudTask = self.taskBuffer.getCloudTask(self.taskID) + if self.cloudTask == None: + _logger.error('%s cannot get CloudTask' % self.taskID) + return None + # if already assigned + if self.cloudTask.status == 'assigned': + _logger.info('%s checked Cloud -> %s' % (self.taskID,self.cloudTask.cloud)) + return self.cloudTask.cloud + # return "" to set cloud later + _logger.info('%s return Cloud=""' % self.taskID) + return "" + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s checkCloud : %s %s" % (self.taskID,type,value)) + return None + + + # set cloud + def setCloud(self,lfns,guids,locations={},metadata=None,fileCounts=None): + try: + _logger.info('%s setCloud' % self.taskID) + _logger.info('%s metadata="%s"' % (self.taskID,metadata)) + _logger.info('%s fileCounts="%s"' % (self.taskID,fileCounts)) + taskType = None + RWs = {} + expRWs = {} + highRWs = {} + prioMap = {} + fullRWs = {} + tt2Map = {} + diskCount = 0 + usingOpenDS = False + try: + # parse metadata + if not metadata in (None,'NULL'): + # task type + taskType = metadata.split(';')[0] + # RWs + exec "RWs = %s" % metadata.split(';')[1] + # expected RWs + exec "expRWs = %s" % metadata.split(';')[2] + # RWs for high priority tasks + exec "prioMap = %s" % metadata.split(';')[3] + # full RWs for space calcuration + exec "fullRWs = %s" % metadata.split(';')[4] + # tasktype2 map + exec "tt2Map = %s" % metadata.split(';')[5] + except: + pass + try: + diskCount = int(self.job.maxDiskCount) + except: + pass + message = '%s taskType==%s prio==%s RW==%s DiskCount==%s' % (self.taskID,taskType,prioMap[self.taskID], + expRWs[self.taskID],diskCount) + _logger.info(message) + self.sendMesg(message) + _logger.info('%s RWs = %s' % (self.taskID,str(RWs))) + _logger.info('%s expRWs = %s' % (self.taskID,str(expRWs))) + _logger.info('%s prioMap = %s' % (self.taskID,str(prioMap))) + _logger.info('%s fullRWs = %s' % (self.taskID,str(fullRWs))) + _logger.info('%s tt2Map = %s' % (self.taskID,str(tt2Map))) + # get cloud list + cloudList = self.siteMapper.getCloudList() + # get pilot statistics + nWNmap = self.taskBuffer.getCurrentSiteData() + # get process group + myTaskGroup = ProcessGroups.getProcessGroup(tt2Map[self.taskID]) + # recalculate RWs + for tmpTaskID,tmpExpRW in expRWs.iteritems(): + # skip myself + if tmpTaskID == self.taskID: + continue + # get cloud from DB + tmpCloudInDB = self.taskBuffer.seeCloudTask(tmpTaskID) + # not assigned + if tmpCloudInDB == '': + continue + # increase full RW + if not fullRWs.has_key(tmpCloudInDB): + fullRWs[tmpCloudInDB] = 0 + fullRWs[tmpCloudInDB] += tmpExpRW + # no priority info + if not prioMap.has_key(tmpTaskID): + continue + # lower priority + if prioMap[tmpTaskID] < prioMap[self.taskID]: + continue + # check tasktype2 + tmpTaskGroup = ProcessGroups.getProcessGroup(tt2Map[tmpTaskID]) + # check tasktype2 + if tmpTaskGroup != myTaskGroup: + continue + # increase RW + if not RWs.has_key(tmpCloudInDB): + RWs[tmpCloudInDB] = 0 + RWs[tmpCloudInDB] += tmpExpRW + _logger.info('%s newRWs =%s' % (self.taskID,str(RWs))) + _logger.info('%s fullRWs =%s' % (self.taskID,str(fullRWs))) + # remove offline clouds and check validation/fasttrack + tmpCloudList = [] + for tmpCloudName in cloudList: + # get cloud + tmpCloud = self.siteMapper.getCloud(tmpCloudName) + # skip offline clouds + if not tmpCloud['status'] in ['online']: + 
message = '%s %s skip : status==%s' % (self.taskID,tmpCloudName,tmpCloud['status']) + _logger.info(message) + self.sendMesg(message) + continue + # skip non-validation cloud if validation + if self.prodSourceLabel in ['validation'] and tmpCloud['validation'] != 'true': + message = "%s %s skip : validation=='%s'" % (self.taskID,tmpCloudName,tmpCloud['validation']) + _logger.info(message) + self.sendMesg(message) + continue + # check fast track + if ((taskType in ['evgen'] and prioMap[self.taskID] >= 700) or + (taskType in ['simul'] and prioMap[self.taskID] >= 800)) and tmpCloud['fasttrack'] != 'true': + message = "%s %s skip : fasttrack=='%s'" % (self.taskID,tmpCloudName,tmpCloud['fasttrack']) + _logger.info(message) + self.sendMesg(message) + continue + # check disk count + if diskCount != 0: + enoughSpace = self.checkDiskCount(diskCount,tmpCloudName) + if not enoughSpace: + message = "%s %s skip : no online sites have enough space for DiskCount==%s" % (self.taskID,tmpCloudName,diskCount) + _logger.info(message) + self.sendMesg(message,msgType='warning') + continue + # append + tmpCloudList.append(tmpCloudName) + self.cloudForSubs.append(tmpCloudName) + cloudList = tmpCloudList + # DQ2 location info + _logger.info('%s DQ2 locations %s' % (self.taskID,str(locations))) + # check immutable datasets + for tmpDataset,tmpSites in locations.iteritems(): + sitesForRefresh = [] + for tmpSite in tmpSites.keys(): + tmpStat = tmpSites[tmpSite][-1] + if tmpStat['total'] == -1 or tmpStat['found'] == None: + sitesForRefresh.append(tmpSite) + elif tmpStat['immutable'] == 0: + # using open datasets + usingOpenDS = True + _logger.info('%s open dataset : %s' % (self.taskID,tmpDataset)) + # refresh replica info + if sitesForRefresh != []: + # invoke listFileReplicasBySites to refresh replica info + _logger.info('%s listFileReplicasBySites %s:%s' % (self.taskID,tmpDataset,str(sitesForRefresh))) + tmpStat,tmpOut = ddm.DQ2_iter.listFileReplicasBySites(tmpDataset,0,sitesForRefresh,0,300) + _logger.info('%s listFileReplicasBySites end with %s:%s' % (self.taskID,tmpStat,tmpOut)) + # reset tmod to shorten retry interval + self.taskBuffer.resetTmodCloudTask(self.taskID) + removedDQ2Map = {} + t2ListForMissing = {} + diskCopyCloud = None + badMetaMap = {} + if locations != {}: + # sort datasets by the number of sites + numSitesDatasetMap = {} + for dataset,sites in locations.iteritems(): + numSites = len(sites) + if not numSitesDatasetMap.has_key(numSites): + numSitesDatasetMap[numSites] = [] + numSitesDatasetMap[numSites].append(dataset) + numSitesList = numSitesDatasetMap.keys() + numSitesList.sort() + sortedDatasetList = [] + for numSites in numSitesList: + sortedDatasetList += numSitesDatasetMap[numSites] + # loop over datasets starting with fewer replicas + removedCloud = [] + for dataset in sortedDatasetList: + sites = locations[dataset] + tmpDiskCopyCloud = [] + removedDQ2Map[dataset] = [] + _logger.info('%s DS:%s' % (self.taskID,dataset)) + datasetType = DataServiceUtils.getDatasetType(dataset) + for tmpCloudName in cloudList: + useCacheT1 = False + tmpCloud = self.siteMapper.getCloud(tmpCloudName) + if DataServiceUtils.isCachedFile(dataset,self.siteMapper.getSite(tmpCloud['source'])): + # use site's endpoint for CVMFS cache + foundSE = self.siteMapper.getSite(tmpCloud['source']).ddm + tmpDiskCopyCloud.append(tmpCloudName) + # using cached files at T1 + useCacheT1 = True + else: + # look for T1 SE which holds the max number of files + minFound = -1 + foundSE = '' + for tmpSePat in tmpCloud['tier1SE']: + # make 
regexp pattern + if '*' in tmpSePat: + tmpSePat = tmpSePat.replace('*','.*') + tmpSePat = '^' + tmpSePat +'$' + for tmpSE in sites.keys(): + # check name with regexp pattern + if re.search(tmpSePat,tmpSE) == None: + continue + # check metadata + metaOK = self.checkMetadata(dataset,tmpSE) + if not metaOK: + if not badMetaMap.has_key(dataset): + badMetaMap[dataset] = [] + badMetaMap[dataset].append(tmpSE) + _logger.info('%s skip %s due to ToBeDeleted' % (self.taskID,tmpSE)) + continue + # check the number of available files + tmpStat = sites[tmpSE][-1] + if tmpStat['found'] == None: + if minFound == -1: + foundSE = tmpSE + elif minFound < tmpStat['found']: + minFound = tmpStat['found'] + foundSE = tmpSE + # check if disk copy is available + tmpStatusSE,tmpRetSE = toa.getSiteProperty(tmpSE,'tape') + if tmpRetSE != 'True': + if tmpStat['found'] != None and tmpStat['found'] == tmpStat['total']: + tmpDiskCopyCloud.append(tmpCloudName) + else: + _logger.info('%s %s is on tape : %s' % (self.taskID,tmpSE,tmpRetSE)) + # get list of T2s where dataset is available + tmpT2List = [] + tmpT2Map = DataServiceUtils.getSitesWithDataset(dataset,self.siteMapper,locations, + tmpCloudName,True,getDQ2ID=True, + useOnlineSite=True) + for tmpT2Name,tmpT2DQ2List in tmpT2Map.iteritems(): + # skip redundant lookup + if t2ListForMissing.has_key(tmpCloudName) and \ + not tmpT2Name in t2ListForMissing[tmpCloudName]: + continue + # loop over all DQ2 IDs + for tmpT2DQ2 in tmpT2DQ2List: + # check metadata + metaOK = self.checkMetadata(dataset,tmpT2DQ2) + if metaOK: + tmpT2List.append(tmpT2Name) + break + else: + if not badMetaMap.has_key(dataset): + badMetaMap[dataset] = [] + badMetaMap[dataset].append(tmpT2DQ2) + _logger.info('%s skip %s due to ToBeDeleted' % (self.taskID,tmpT2DQ2)) + # take CVMFS cache into account + tmpT2CacheList = DataServiceUtils.getSitesWithCacheDS(tmpCloudName,tmpT2List,self.siteMapper,dataset) + tmpT2List += tmpT2CacheList + # remove cloud if T1SE or T2 is not a location + if foundSE == '': + # keep if T2 has the dataset + if tmpT2List == []: + if not tmpCloudName in removedCloud: + _logger.info('%s removed %s' % (self.taskID,tmpCloudName)) + removedCloud.append(tmpCloudName) + # add dataset to map for subscription when T2 has non-cached replica + if (tmpT2List != [] and len(tmpT2CacheList) != len(tmpT2List)) and not tmpCloudName in removedDQ2Map[dataset]: + removedDQ2Map[dataset].append(tmpCloudName) + else: + if not useCacheT1: + # check incomplete or not + tmpStat = sites[foundSE][-1] + if tmpStat['found'] == None or \ + (not datasetType in datasetTypeToSkipCheck and tmpStat['found'] < tmpStat['total']): + # add dataset to map which is subscribed when the task is used due to T2 files + if not tmpCloudName in removedDQ2Map[dataset]: + removedDQ2Map[dataset].append(tmpCloudName) + # aggregate T2 list + if not t2ListForMissing.has_key(tmpCloudName): + t2ListForMissing[tmpCloudName] = tmpT2List + else: + # use sites where all datasets are available + newTmpT2List = [] + for tmpT2 in t2ListForMissing[tmpCloudName]: + if tmpT2 in tmpT2List: + newTmpT2List.append(tmpT2) + t2ListForMissing[tmpCloudName] = newTmpT2List + # disk copy cloud + if diskCopyCloud == None: + diskCopyCloud = tmpDiskCopyCloud + else: + newDiskCopyCloud = [] + for tmpCloudName in diskCopyCloud: + if tmpCloudName in tmpDiskCopyCloud: + newDiskCopyCloud.append(tmpCloudName) + diskCopyCloud = newDiskCopyCloud + # remove clouds + for tmpCloudName in removedCloud: + if tmpCloudName in cloudList: + cloudList.remove(tmpCloudName) + 
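# Illustrative snapshot of the bookkeeping built by the loop above (dataset, cloud and
# site names here are hypothetical placeholders, not taken from real data):
#   removedDQ2Map    = {'mc12.someDataset': ['DE']}   # dataset -> clouds where only T2 or incomplete replicas were found
#   t2ListForMissing = {'DE': ['SOME-T2-SITE']}       # cloud -> T2 sites that hold every dataset checked so far
#   diskCopyCloud    = ['US']                         # clouds with a complete non-TAPE copy of all datasets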
_logger.info('%s new locations after DQ2 filter %s' % (self.taskID,str(cloudList))) + _logger.info('%s clouds where complete disk copies are available %s' % (self.taskID,str(diskCopyCloud))) + _logger.info('%s removed DQ2 map %s' % (self.taskID,str(removedDQ2Map))) + if cloudList == []: + # make subscription to empty cloud + if taskType in taskTypesSub: + _logger.info('%s makeSubscription start' % self.taskID) + retSub = self.makeSubscription(removedDQ2Map,RWs,fullRWs,expRWs) + _logger.info('%s makeSubscription end with %s' % (self.taskID,retSub)) + message = '%s no input data locations' % self.taskID + self.sendMesg(message,msgType='warning') + raise RuntimeError, '%s cloud list is empty after DQ2 filter' % self.taskID + message = '%s input data locations %s' % (self.taskID,str(cloudList)) + _logger.info(message) + self.sendMesg(message) + # calculate # of loops + nFile = 200 + nLoop = len(guids) / nFile + if len(guids) % nFile != 0: + nLoop += 1 + iFileList = [] + for iTmp in range(nLoop): + iFileList.append(iTmp*nFile) + # truncate list to avoid too many lookup + maxLoop = 100 + if len(iFileList) > maxLoop: + random.shuffle(iFileList) + iFileList = iFileList[:maxLoop] + iFileList.sort() + # count the number of files to be lookup + maxNFiles = 0 + if not usingOpenDS: + # if dataset is open, doesn't check nFiles + for iFile in iFileList: + maxNFiles += len(lfns[iFile:iFile+nFile]) + # loop over all cloud + weightParams = {} + foundCandidateWithT1 = [] + candidatesUsingT2 = [] + for tmpCloudName in cloudList: + _logger.info('%s calculate weight for %s' % (self.taskID,tmpCloudName)) + # add missing cloud in RWs + if not RWs.has_key(tmpCloudName): + RWs[tmpCloudName] = 0 + if not fullRWs.has_key(tmpCloudName): + fullRWs[tmpCloudName] = 0 + # get cloud + tmpCloud = self.siteMapper.getCloud(tmpCloudName) + weightParams[tmpCloudName] = {} + # get T1 site + tmpT1Site = self.siteMapper.getSite(tmpCloud['source']) + # get number of running jobs. 
Initially set 1 to avoid zero dividing + nPilot = 1 + for siteName in tmpCloud['sites']: + if nWNmap.has_key(siteName): + nPilot += (nWNmap[siteName]['getJob'] + nWNmap[siteName]['updateJob']) + weightParams[tmpCloudName]['nPilot'] = nPilot + _logger.info('%s # of pilots %s' % (self.taskID,nPilot)) + # available space + weightParams[tmpCloudName]['space'] = tmpT1Site.space + _logger.info('%s T1 space %s' % (self.taskID,tmpT1Site.space)) + # MC share + weightParams[tmpCloudName]['mcshare'] = tmpCloud['mcshare'] + _logger.info('%s MC share %s' % (self.taskID,tmpCloud['mcshare'])) + # calculate available space = totalT1space - ((RW(cloud)+RW(thistask))*GBperSI2kday)) + aveSpace,sizeCloud,sizeThis = self.getAvailableSpace(weightParams[tmpCloudName]['space'], + fullRWs[tmpCloudName], + expRWs[self.taskID]) + # no task is assigned if available space is less than 1TB + if aveSpace < thr_space_low: + message = '%s %s skip : space:%s (total:%s - assigned:%s - this:%s) < %sGB' % \ + (self.taskID,tmpCloudName,aveSpace,weightParams[tmpCloudName]['space'], + sizeCloud,sizeThis,thr_space_low) + _logger.info(message) + self.sendMesg(message,msgType='warning') + del weightParams[tmpCloudName] + continue + else: + _logger.info('%s %s pass : space:%s (total:%s - assigned:%s - this:%s)' % \ + (self.taskID,tmpCloudName,aveSpace,weightParams[tmpCloudName]['space'], + sizeCloud,sizeThis)) + # not assign tasks when RW is too high + if RWs.has_key(tmpCloudName) and RWs[tmpCloudName] > thr_RW_high*weightParams[tmpCloudName]['mcshare']: + message = '%s %s skip : too high RW==%s > %s' % \ + (self.taskID,tmpCloudName,RWs[tmpCloudName],thr_RW_high*weightParams[tmpCloudName]['mcshare']) + _logger.info(message) + self.sendMesg(message,msgType='warning') + del weightParams[tmpCloudName] + continue + # T1 + t1List = [tmpT1Site.sitename] + # hack for split T1 + if tmpCloudName == 'NL': + t1List.append('NIKHEF-ELPROD') + # get files + weightParams[tmpCloudName]['nFiles'] = 0 + # loop + tmpMaxNumFile = 0 + for tmpSiteNameScan in t1List: + tmpScanRet,tmpN = DataServiceUtils.getNumAvailableFilesSite(tmpSiteNameScan, + self.siteMapper, + locations,badMetaMap, + tmpCloud['tier1SE'], + noCheck=datasetTypeToSkipCheck, + fileCounts=fileCounts) + # failed + if not tmpScanRet: + raise RuntimeError, 'failed to get nFiles at %s due to %s' % (tmpSiteNameScan,tmpN) + # max + if tmpMaxNumFile < tmpN: + tmpMaxNumFile = tmpN + # set + weightParams[tmpCloudName]['nFiles'] = tmpMaxNumFile + _logger.info('%s # of files at T1 %s' % (self.taskID,weightParams[tmpCloudName]['nFiles'])) + # found candidate + foundCandidateT1 = False + if weightParams[tmpCloudName]['nFiles'] >= maxNFiles: + foundCandidateT1 = True + # avoid incomplete at T1 + for tmpDS,tmpT2CloudList in removedDQ2Map.iteritems(): + if tmpCloudName in tmpT2CloudList: + foundCandidateT1 = False + # reset nFiles at T1 + weightParams[tmpCloudName]['nFiles'] = 0 + break + if foundCandidateT1: + foundCandidateWithT1.append(tmpCloudName) + # check T2 if files are missing + if (not foundCandidateT1 or weightParams[tmpCloudName]['nFiles'] < maxNFiles) and \ + t2ListForMissing.has_key(tmpCloudName) and t2ListForMissing[tmpCloudName] != []: + _logger.info('%s T2 candidates %s' % (self.taskID,str(t2ListForMissing[tmpCloudName]))) + # loop + tmpMaxNumFile = 0 + for tmpSiteNameScan in t2ListForMissing[tmpCloudName]: + tmpScanRet,tmpN = DataServiceUtils.getNumAvailableFilesSite(tmpSiteNameScan, + self.siteMapper, + locations,badMetaMap, + noCheck=datasetTypeToSkipCheck, + fileCounts=fileCounts) + 
# failed + if not tmpScanRet: + raise RuntimeError, 'failed to get nFiles at %s due to %s' % (tmpSiteNameScan,tmpN) + # use larger value + _logger.info('%s # of files at T2:%s %s' % (self.taskID,tmpSiteNameScan,tmpN)) + if tmpN > weightParams[tmpCloudName]['nFiles']: + weightParams[tmpCloudName]['nFiles'] = tmpN + # found candidate + if weightParams[tmpCloudName]['nFiles'] >= maxNFiles: + candidatesUsingT2.append(tmpCloudName) + break + # compare parameters + definedCloud = "US" + maxClouds = [] + useMcShare = False + # use clouds where T1 have the data + maxClouds += foundCandidateWithT1 + # use clouds where T2 have the data + maxClouds += candidatesUsingT2 + # logging + _logger.info('%s check nFiles' % self.taskID) + for cloudName,params in weightParams.iteritems(): + if not cloudName in maxClouds: + if maxNFiles == 0: + message = '%s %s skip : missing files at DATA/GROUPDISK' % \ + (self.taskID,cloudName) + elif params['nFiles'] != maxNFiles: + message = '%s %s skip : nFiles==%s<%s' % \ + (self.taskID,cloudName,params['nFiles'],maxNFiles) + else: + message = '%s %s skip : no complete replica at DATA/GROUPDISK' % \ + (self.taskID,cloudName) + _logger.info(message) + self.sendMesg(message) + time.sleep(2) + # check RW + _logger.info('%s check RW' % self.taskID) + tmpInfClouds = [] + for cloudName in maxClouds: + # set weight to infinite when RW is too low + if not taskType in taskTypesMcShare: + if RWs[cloudName] < thr_RW_low*weightParams[cloudName]['mcshare']: + message = '%s %s infinite weight : RW==%s < %s' % \ + (self.taskID,cloudName,RWs[cloudName],thr_RW_low*weightParams[cloudName]['mcshare']) + _logger.info(message) + self.sendMesg(message) + tmpInfClouds.append(cloudName) + # use new list + if tmpInfClouds != []: + _logger.info('%s use infinite clouds after RW checking' % self.taskID) + maxClouds = tmpInfClouds + useMcShare = True + elif maxClouds == []: + messageEnd = '%s no candidates left' % self.taskID + self.sendMesg(messageEnd) + # make subscription to empty cloud + if taskType in taskTypesSub: + _logger.info('%s makeSubscription start' % self.taskID) + retSub = self.makeSubscription(removedDQ2Map,RWs,fullRWs,expRWs) + _logger.info('%s makeSubscription end with %s' % (self.taskID,retSub)) + if retSub: + message = '%s made subscription' % self.taskID + self.sendMesg(message,msgType='info') + else: + message = "%s didn't make subscription" % self.taskID + self.sendMesg(message,msgType='warning') + # return + _logger.info(messageEnd) + _logger.info("%s end" % self.taskID) + return None + # choose one + message = '%s candidates %s' % (self.taskID,str(maxClouds)) + _logger.info(message) + self.sendMesg(message) + if len(maxClouds) == 1: + definedCloud = maxClouds[0] + elif len(maxClouds) > 1: + # choose cloud according to weight + nWeightList = [] + totalWeight = 0 + for cloudName in maxClouds: + if (taskType in taskTypesMcShare): + # use MC share for evgen + tmpWeight = float(weightParams[cloudName]['mcshare']) + message = "%s %s weight==%s" % (self.taskID,cloudName,weightParams[cloudName]['mcshare']) + else: + # use nPilot/RW*MCshare + tmpWeight = float(weightParams[cloudName]['nPilot']) / float(1+RWs[cloudName]) + message = "%s %s weight==%s/%s" % (self.taskID,cloudName, + weightParams[cloudName]['nPilot'], + 1+RWs[cloudName]) + # use different weight if DISK is available + if diskCopyCloud != None and diskCopyCloud != [] and cloudName not in diskCopyCloud: + tmpWeight *= float(reductionForTape) + message += '*%s' % reductionForTape + self.sendMesg(message) + 
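# Worked example with hypothetical numbers for the weight computed above: a non-evgen
# task in a cloud with nPilot=301 and RW=2000, where some other candidate cloud holds a
# complete disk copy but this one does not, gets
#   tmpWeight = 301.0 / (1 + 2000) * 0.5   # reductionForTape -> ~0.075
# while an evgen task would simply use the cloud's mcshare as its weight.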
nWeightList.append(tmpWeight) + totalWeight += tmpWeight + # check total weight + if totalWeight == 0: + raise RuntimeError, 'totalWeight=0' + # determin cloud using random number + _logger.info('%s weights %s' % (self.taskID,str(nWeightList))) + rNumber = random.random() * totalWeight + _logger.info('%s totalW %s' % (self.taskID,totalWeight)) + _logger.info('%s rNumber %s' % (self.taskID,rNumber)) + for index,tmpWeight in enumerate(nWeightList): + rNumber -= tmpWeight + _logger.info('%s rNumber %s : Cloud=%s weight=%s' % + (self.taskID,rNumber,maxClouds[index],tmpWeight)) + if rNumber <= 0: + definedCloud = maxClouds[index] + break + # make subscription when T2 candidate is chosen + if definedCloud in candidatesUsingT2: + newT2DQ2Map = {} + for tmpDS,tmpT2CloudList in removedDQ2Map.iteritems(): + if definedCloud in tmpT2CloudList: + newT2DQ2Map[tmpDS] = [definedCloud] + if newT2DQ2Map == {}: + _logger.error('%s no subscription map to use T2 datasets cloud=%s map=%s' % (self.taskID,definedCloud,removedDQ2Map)) + return None + _logger.info('%s makeSubscription to use T2 start' % self.taskID) + retSub = self.makeSubscription(newT2DQ2Map,RWs,fullRWs,expRWs,noEmptyCheck=True,acceptInProcess=True) + if not retSub: + _logger.error('%s makeSubscription to use T2 failed with %s' % (self.taskID,retSub)) + return None + _logger.info('%s makeSubscription to use T2 end with %s' % (self.taskID,retSub)) + # set CloudTask in DB + self.cloudTask.cloud = definedCloud + retCloudTask = self.taskBuffer.setCloudTask(self.cloudTask) + if retCloudTask == None: + _logger.error('%s cannot set CloudTask' % self.taskID) + return None + # pin input dataset + pinSiteList = [] + if definedCloud in candidatesUsingT2: + # pin T2 replicas + if t2ListForMissing.has_key(definedCloud): + pinSiteList = t2ListForMissing[definedCloud] + else: + # pin T1 replica + pinSiteList = [self.siteMapper.getCloud(definedCloud)['tier1']] + if pinSiteList != []: + self.pinDataset(locations,pinSiteList,definedCloud) + message = '%s set Cloud -> %s' % (self.taskID,retCloudTask.cloud) + _logger.info(message) + self.sendMesg(message) + # return + return retCloudTask.cloud + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s setCloud : %s %s" % (self.taskID,type,value)) + return None + + + # send message to logger + def sendMesg(self,message,msgType=None): + try: + # get logger + tmpPandaLogger = PandaLogger() + # lock HTTP handler + tmpPandaLogger.lock() + tmpPandaLogger.setParams({'Type':'taskbrokerage'}) + # use bamboo for loggername + if panda_config.loggername == 'prod': + tmpLogger = tmpPandaLogger.getHttpLogger('bamboo') + else: + # for dev + tmpLogger = tmpPandaLogger.getHttpLogger(panda_config.loggername) + # add message + if msgType=='error': + tmpLogger.error(message) + elif msgType=='warning': + tmpLogger.warning(message) + elif msgType=='info': + tmpLogger.info(message) + else: + tmpLogger.debug(message) + # release HTTP handler + tmpPandaLogger.release() + except: + pass + time.sleep(1) + + + # check disk count + def checkDiskCount(self,diskCount,cloud): + scanSiteList = self.siteMapper.getCloud(cloud)['sites'] + # loop over all sites + for tmpSiteName in scanSiteList: + if 'test' in tmpSiteName.lower(): + continue + # get sitespec + tmpSiteSpec = self.siteMapper.getSite(tmpSiteName) + # use online only + if not tmpSiteSpec.status in ['online']: + continue + # no size limit + if tmpSiteSpec.maxinputsize in [0,None,'']: + return True + # enough space for input + if int(tmpSiteSpec.maxinputsize) >= int(diskCount): 
+ return True + # no sites have enough space + return False + + + # get available space + def getAvailableSpace(self,space,fullRW,expRW): + # calculate available space = totalT1space - ((RW(cloud)+RW(thistask))*GBperSI2kday)) + sizeCloud = fullRW * 0.2 + sizeThis = expRW * 0.2 + aveSpace = space - (sizeCloud + sizeThis) + return aveSpace,sizeCloud,sizeThis + + + # make subscription + def makeSubscription(self,dsCloudMap,RWs,fullRWs,expRWs,noEmptyCheck=False,acceptInProcess=False): + nDDMtry = 3 + cloudList = [] + # collect clouds which don't hold datasets + message = '%s possible clouds : %s' % (self.taskID,str(self.cloudForSubs)) + _logger.info(message) + for tmpDS,tmpClouds in dsCloudMap.iteritems(): + for tmpCloud in tmpClouds: + if (not tmpCloud in cloudList) and tmpCloud in self.cloudForSubs: + cloudList.append(tmpCloud) + message = '%s candidates for subscription : %s' % (self.taskID,str(cloudList)) + _logger.info(message) + self.sendMesg(message) + if cloudList == []: + _logger.info('%s no candidates for subscription' % self.taskID) + return False + # get DN + com = 'unset LD_LIBRARY_PATH; unset PYTHONPATH; export PATH=/usr/local/bin:/bin:/usr/bin; ' + com+= 'source %s; grid-proxy-info -subject' % panda_config.glite_source + status,DN = commands.getstatusoutput(com) + _logger.info('%s %s' % (self.taskID,DN)) + # ignore AC issuer + if re.search('WARNING: Unable to verify signature!',DN) != None: + status = 0 + if status != 0: + _logger.error('%s could not get DN %s:%s' % (self.taskID,status,DN)) + return False + # check if there is in-process subscription + if not acceptInProcess: + # remove /CN=proxy and /CN=limited from DN + DN = DN.split('\n')[-1] + DN = re.sub('(/CN=proxy)+$','',DN) + DN = re.sub('/CN=limited proxy','',DN) + status,out = dq2Common.parse_dn(DN) + if status != 0: + _logger.error('%s could not truncate DN %s:%s' % (self.taskID,status,DN)) + return False + DN = out + # loop over all datasets + runningSub = {} + for tmpDS,tmpClouds in dsCloudMap.iteritems(): + # get running subscriptions + runningSub[tmpDS] = [] + _logger.info('%s listSubscriptions(%s)' % (self.taskID,tmpDS)) + iTry = 0 + while True: + status,outLoc = ddm.DQ2.listSubscriptions(tmpDS) + # succeed + if status == 0: + break + # failed + iTry += 1 + if iTry < nDDMtry: + time.sleep(30) + else: + _logger.error('%s %s' % (self.taskID,outLoc)) + return False + _logger.info('%s %s %s' % (self.taskID,status,outLoc)) + time.sleep(1) + # get subscription metadata + exec "outLoc = %s" % outLoc + for tmpLocation in outLoc: + t1Flag = False + # check T1 or not + for tmpCloudName4T1 in self.siteMapper.getCloudList(): + if tmpLocation in self.siteMapper.getCloud(tmpCloudName4T1)['tier1SE']: + t1Flag = True + break + # skip non-T1 + if not t1Flag: + continue + _logger.info('%s listSubscriptionInfo(%s,%s)' % (self.taskID,tmpDS,tmpLocation)) + iTry = 0 + while True: + status,outMeta = ddm.DQ2.listSubscriptionInfo(tmpDS,tmpLocation,0) + # succeed + if status == 0: + break + # skip non-existing ID + if re.search('not a Tiers of Atlas Destination',outMeta) != None: + _logger.info('%s ignore %s' % (self.taskID,outMeta.split('\n')[-1])) + status = 0 + outMeta = "()" + break + # failed + iTry += 1 + if iTry < nDDMtry: + time.sleep(30) + else: + _logger.error('%s %s' % (self.taskID,outMeta)) + return False + _logger.info('%s %s %s' % (self.taskID,status,outMeta)) + time.sleep(1) + # look for DN in metadata + exec "outMeta = %s" % outMeta + if DN in outMeta: + # get corrosponding cloud + for tmpCloudName in 
self.siteMapper.getCloudList(): + tmpCloudSpec = self.siteMapper.getCloud(tmpCloudName) + if tmpLocation in tmpCloudSpec['tier1SE']: + # append + if not tmpCloudName in runningSub[tmpDS]: + runningSub[tmpDS].append(tmpCloudName) + break + _logger.info('%s runningSub=%s' % (self.taskID,runningSub)) + # doesn't make subscriptions when another subscriptions is in process + subThr = 1 + for tmpDS,tmpClouds in runningSub.iteritems(): + if len(tmpClouds) > 0: + message = '%s subscription:%s to %s in process' % (self.taskID,tmpDS,str(tmpClouds)) + _logger.info(message) + self.sendMesg(message) + return False + # get size of datasets + dsSizeMap = {} + for tmpDS in dsCloudMap.keys(): + _logger.debug('%s listFilesInDataset(%s)' % (self.taskID,tmpDS)) + iTry = 0 + while True: + status,outList = ddm.DQ2.listFilesInDataset(tmpDS) + # succeed + if status == 0: + break + # failed + iTry += 1 + if iTry < nDDMtry: + time.sleep(30) + else: + _logger.error('%s %s %s' % (self.taskID,status,outList)) + return False + # get total size + dsSizeMap[tmpDS] = 0 + exec "outList = %s" % outList + for guid,vals in outList[0].iteritems(): + try: + dsSizeMap[tmpDS] += long(vals['filesize']) + except: + pass + # GB + _logger.info('%s %s %sB' % (self.taskID,tmpDS,dsSizeMap[tmpDS])) + dsSizeMap[tmpDS] /= (1024*1024*1024) + _logger.info('%s dsSize=%s' % (self.taskID,dsSizeMap)) + # check space and RW + minRW = None + minCloud = None + for tmpCloudName in cloudList: + # get cloud spec + tmpCloudSpec = self.siteMapper.getCloud(tmpCloudName) + # get T1 site + tmpT1Site = self.siteMapper.getSite(tmpCloudSpec['source']) + # calculate available space + if not fullRWs.has_key(tmpCloudName): + fullRWs[tmpCloudName] = 0 + aveSpace,sizeCloud,sizeThis = self.getAvailableSpace(tmpT1Site.space, + fullRWs[tmpCloudName], + expRWs[self.taskID]) + # reduce requred space + for tmpDS,tmpClouds in dsCloudMap.iteritems(): + if tmpCloudName in tmpClouds: + aveSpace -= dsSizeMap[tmpDS] + # check space + if aveSpace < thr_space_low: + message = '%s %s skip : space==%s total==%s' % (self.taskID,tmpCloudName,aveSpace, + tmpT1Site.space) + _logger.info(message) + self.sendMesg(message,msgType='warning') + continue + _logger.info('%s %s pass : space==%s total==%s' % (self.taskID,tmpCloudName,aveSpace, + tmpT1Site.space)) + # get cloud spec + tmpCloudSpec = self.siteMapper.getCloud(tmpCloudName) + # check MC share + if tmpCloudSpec['mcshare'] == 0: + message = '%s %s skip : mcshare==%s' % (self.taskID,tmpCloudName,tmpCloudSpec['mcshare']) + _logger.info(message) + continue + # get minimum RW + if not RWs.has_key(tmpCloudName): + RWs[tmpCloudName] = 0 + tmpRwThr = tmpCloudSpec['mcshare']*thr_RW_sub + _logger.info('%s %s RW==%s Thr==%s' % (self.taskID,tmpCloudName,RWs[tmpCloudName], + tmpRwThr)) + tmpRwRatio = float(RWs[tmpCloudName])/float(tmpRwThr) + if minRW == None or minRW > tmpRwRatio: + minRW = tmpRwRatio + minCloud = tmpCloudName + # check RW + if minCloud == None: + message = '%s no candidates left for subscription' % self.taskID + _logger.info(message) + self.sendMesg(message) + return False + # get cloud spec + tmpCloudSpec = self.siteMapper.getCloud(minCloud) + # check threshold + if minRW > 1.0 and not noEmptyCheck: + message = '%s no empty cloud : %s minRW==%s>%s' % \ + (self.taskID,minCloud,RWs[minCloud],thr_RW_sub*tmpCloudSpec['mcshare']) + _logger.info(message) + self.sendMesg(message) + return False + message = '%s %s for subscription : minRW==%s' % (self.taskID,minCloud,minRW) + _logger.info(message) + self.sendMesg(message) + # get 
cloud spec for subscription + tmpCloudSpec = self.siteMapper.getCloud(minCloud) + # get T1 site + tmpT1Site = self.siteMapper.getSite(tmpCloudSpec['source']) + # dest DQ2 ID + dq2ID = tmpT1Site.ddm + # make subscription + for tmpDsName,tmpClouds in dsCloudMap.iteritems(): + # skip if the dataset already exists in the cloud + if not minCloud in tmpClouds: + _logger.info('%s %s already exists in %s' % (self.taskID,tmpDS,minCloud)) + continue + # get constituents + if tmpDsName.endswith('/'): + tmpStat,repMap = self.getListDatasetReplicasInContainer(tmpDsName) + if not tmpStat: + _logger.info('%s failed to get datasets in %s ' % (self.taskID,tmpDsName)) + continue + else: + repMap = {tmpDsName:{dq2ID:[]}} + # loop over all constituents + for tmpDS in repMap.keys(): + # register subscription + optSrcPolicy = 001000 | 010000 + _logger.debug("%s %s %s" % ('registerDatasetSubscription',(tmpDS,dq2ID), + {'version':0,'archived':0,'callbacks':{},'sources':{}, + 'sources_policy':optSrcPolicy,'wait_for_sources':0, + 'destination':None,'query_more_sources':0,'sshare':"secondary", + 'group':None,'activity':"Production",'acl_alias':'secondary'})) + iTry = 0 + while True: + # execute + status,out = ddm.DQ2.main('registerDatasetSubscription',tmpDS,dq2ID,version=0,archived=0,callbacks={}, + sources={},sources_policy=optSrcPolicy,wait_for_sources=0,destination=None, + query_more_sources=0,sshare="secondary",group=None,activity="Production", + acl_alias='secondary') + # succeed + if status == 0 or 'DQSubscriptionExistsException' in out: + break + # failed + iTry += 1 + if iTry < nDDMtry: + time.sleep(30) + else: + _logger.error('%s %s %s' % (self.taskID,status,out)) + return False + if 'DQSubscriptionExistsException' in out: + _logger.info('%s %s %s' % (self.taskID,status,'DQSubscriptionExistsException')) + else: + _logger.info('%s %s %s' % (self.taskID,status,out)) + message = '%s registered subscription %s %s:%s' % (self.taskID,tmpDS,minCloud,dq2ID) + _logger.info(message) + self.sendMesg(message) + time.sleep(1) + # completed + return True + + + # pin dataset + def pinDataset(self,locationMap,siteList,cloudName): + _logger.info('%s start pin input datasets' % self.taskID) + pinLifeTime = 7 + # loop over all datasets + for tmpDsName,tmpDQ2Map in locationMap.iteritems(): + # skip DBR + if DataServiceUtils.isDBR(tmpDsName): + continue + # get DQ2 IDs in the cloud where dataset is available + tmpDq2Map = DataServiceUtils.getSitesWithDataset(tmpDsName,self.siteMapper,locationMap, + cloudName,useHomeCloud=True, + getDQ2ID=True, + useOnlineSite=True, + includeT1=True) + # loop over all sites + for tmpSiteName in siteList: + # pin dataset when the site has replicas + if tmpDq2Map.has_key(tmpSiteName): + # loop over all DQ2 IDs + for tmpRepSite in tmpDq2Map[tmpSiteName]: + # get constituents + if tmpDsName.endswith('/'): + tmpStat,repMap = self.getListDatasetReplicasInContainer(tmpDsName) + if not tmpStat: + _logger.info('%s failed to get datasets in %s ' % (self.taskID,tmpDsName)) + continue + else: + repMap = {tmpDsName:{tmpRepSite:[]}} + # loop over all datasets + for datasetName,locVal in repMap.iteritems(): + # check missing + if not repMap[datasetName].has_key(tmpRepSite): + _logger.info('%s skip pinning for %s at %s due to missing replica' % \ + (self.taskID,datasetName,tmpRepSite)) + continue + # get metadata + status,tmpMetadata = self.getReplicaMetadata(datasetName,tmpRepSite) + if not status: + continue + # check pin lifetime + if tmpMetadata.has_key('pin_expirationdate'): + if 
isinstance(tmpMetadata['pin_expirationdate'],types.StringType) and tmpMetadata['pin_expirationdate'] != 'None': + # keep original pin lifetime if it is longer + origPinLifetime = datetime.datetime.strptime(tmpMetadata['pin_expirationdate'],'%Y-%m-%d %H:%M:%S') + if origPinLifetime > datetime.datetime.utcnow()+datetime.timedelta(days=pinLifeTime): + _logger.info('%s skip pinning for %s:%s due to longer lifetime %s' % (self.taskID, + datasetName,tmpRepSite, + tmpMetadata['pin_expirationdate'])) + continue + # set pin lifetime + status = self.setReplicaMetadata(datasetName,tmpRepSite,'pin_lifetime','%s days' % pinLifeTime) + # return + _logger.info('%s end pin input datasets' % self.taskID) + return + + + # get replica metadata + def getReplicaMetadata(self,datasetName,locationName): + # use cached data + if self.metadataMap.has_key(datasetName) and self.metadataMap[datasetName].has_key(locationName): + return True,self.metadataMap[datasetName][locationName] + # response for failure + resForFailure = False,{} + # get metadata + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug('%s %s/%s listMetaDataReplica %s %s' % (self.taskID,iDDMTry,nTry,datasetName,locationName)) + status,out = ddm.DQ2.main('listMetaDataReplica',locationName,datasetName) + if status != 0 or (not DataServiceUtils.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error("%s %s" % (self.taskID,out)) + return resForFailure + metadata = {} + try: + # convert to map + exec "metadata = %s" % out + except: + _logger.error('%s could not convert HTTP-res to replica metadata for %s:%s' % \ + (self.taskID,datasetName,locationName)) + return resForFailure + # append + if not self.metadataMap.has_key(datasetName): + self.metadataMap[datasetName] = {} + self.metadataMap[datasetName][locationName] = metadata + # return + _logger.debug('%s getReplicaMetadata -> %s' % (self.taskID,str(metadata))) + return True,metadata + + + # check metadata + def checkMetadata(self,datasetName,tmpSE): + try: + # skip checking for DBR + if DataServiceUtils.isDBR(datasetName): + return True + # get constituents + if datasetName.endswith('/'): + tmpStat,repMap = self.getListDatasetReplicasInContainer(datasetName) + if not tmpStat: + raise RuntimeError, 'failed to get datasets in %s when checkMetadata' % datasetName + else: + repMap = {datasetName:{tmpSE:[]}} + # loop over all datasets + for dataset,locVal in repMap.iteritems(): + # check missing + if not locVal.has_key(tmpSE): + _logger.info('%s skip %s at %s due to missing replica when checkMetadata' % (self.taskID,dataset,tmpSE)) + # NG + return False + # get metadata + status,metaItem = self.getReplicaMetadata(dataset,tmpSE) + if not status: + raise RuntimeError, 'failed to get metadata at %s for %s when checkMetadata' % (tmpSE,dataset) + # check + if metaItem.has_key('archived') and isinstance(metaItem['archived'],types.StringType) \ + and metaItem['archived'].lower() in ['tobedeleted',]: + _logger.info('%s skip %s due to ToBeDeleted when checkMetadata' % (self.taskID,tmpSE)) + # NG + return False + except: + errtype,errvalue = sys.exc_info()[:2] + _logger.error("%s checkMetadata : %s %s" % (self.taskID,errtype,errvalue)) + # FIXME + #return False + # OK + return True + + + # set replica metadata + def setReplicaMetadata(self,datasetName,locationName,attrname,attrvalue): + # response for failure + resForFailure = False + # get metadata + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug('%s %s/%s setReplicaMetaDataAttribute %s %s %s=%s' % 
(self.taskID,iDDMTry,nTry,datasetName, + locationName,attrname,attrvalue)) + status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',datasetName,locationName,attrname,attrvalue) + if status != 0 or (not DataServiceUtils.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error("%s %s" % (self.taskID,out)) + return resForFailure + # return + _logger.info('%s setReplicaMetadata done for %s:%s' % (self.taskID,datasetName,locationName)) + return True + + + # get list of replicas in container + def getListDatasetReplicasInContainer(self,container): + # use cache + if self.contDsMap.has_key(container): + return True,self.contDsMap[container] + # get datasets in container + _logger.debug((self.taskID,'listDatasetsInContainer',container)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('listDatasetsInContainer',container) + if status != 0 or (not DataServiceUtils.isDQ2ok(out)): + time.sleep(60) + else: + break + _logger.debug('%s %s' % (self.taskID,out)) + if status != 0 or out.startswith('Error'): + return False,out + datasets = [] + try: + # convert to list + exec "datasets = %s" % out + except: + return False,out + # loop over all datasets + allRepMap = {} + for dataset in datasets: + _logger.debug((self.taskID,'listDatasetReplicas',dataset)) + for iDDMTry in range(3): + status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) + if status != 0 or (not DataServiceUtils.isDQ2ok(out)): + time.sleep(60) + else: + break + _logger.debug('%s %s' % (self.taskID,out)) + if status != 0 or out.startswith('Error'): + return False,out + tmpRepSites = {} + try: + # convert res to map + exec "tmpRepSites = %s" % out + except: + return False,out + # get map + allRepMap[dataset] = tmpRepSites + # return + _logger.debug('%s %s' % (self.taskID,str(allRepMap))) + self.contDsMap[container] = allRepMap + return True,allRepMap + + + diff --git a/current/pandaserver/dataservice/Waker.py b/current/pandaserver/dataservice/Waker.py new file mode 100755 index 000000000..93234bcd7 --- /dev/null +++ b/current/pandaserver/dataservice/Waker.py @@ -0,0 +1,55 @@ +''' +awake jobs in waiting table + +''' + +import time +import threading +from DDM import ddm + +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('Waker') + + +class Waker (threading.Thread): + # constructor + def __init__(self,taskBuffer,dataset): + threading.Thread.__init__(self) + self.dataset = dataset + self.taskBuffer = taskBuffer + + + # main + def run(self): + _logger.debug("start: %s" % self.dataset.name) + # get file list from DDM + for iDDMTry in range(3): + status,out = ddm.DQ2.main('listFilesInDataset',self.dataset.name) + if status != 0 and out.find("DQ2 unknown dataset exception") != -1: + break + elif status != 0 or out.find("DQ2 internal server exception") != -1: + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error(out) + _logger.debug("failed: %s" % self.dataset.name) + return + # parse + lfns = [] + try: + exec "resDQ=%s" % out + for guid,vals in resDQ[0].iteritems(): + lfns.append(vals['lfn']) + except: + _logger.error("could not parse %s" % out) + # get PandaIDs of jobs which use files with LFNs + if len(lfns) != 0: + ids = self.taskBuffer.queryPandaIDwithLFN(lfns) + _logger.debug("IDs: %s" % ids) + if len(ids) != 0: + # awake jobs + self.taskBuffer.awakeJobs(ids) + _logger.debug("finished: %s" % self.dataset.name) diff --git a/current/pandaserver/dataservice/__init__.py 
b/current/pandaserver/dataservice/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/current/pandaserver/dataservice/countGuidsClient.py b/current/pandaserver/dataservice/countGuidsClient.py new file mode 100644 index 000000000..a65489ce2 --- /dev/null +++ b/current/pandaserver/dataservice/countGuidsClient.py @@ -0,0 +1,72 @@ +import urllib, re, string, os, time +from eventLookupClient import eventLookupClient + +# client for countGuids Athenaeum service +# author: Marcin.Nowak@cern.ch + + +class countGuidsClient(eventLookupClient): + + #serverURL = "http://j2eeps.cern.ch/test-Athenaeum/" + serverURL = "http://j2eeps.cern.ch/atlas-project-Athenaeum/" + #serverURL = "http://j2eeps.cern.ch/test-eventPicking/" + servicePage = "CountGuids.jsp" + getPage = "EventLookupGet.jsp" + + def __init__(self): + eventLookupClient.__init__(self) + + def countGuids(self, datasetName, query='', tokens=''): + """ contact the server and return GUIDs count + tokens - token names + """ + query_args = { 'key': self.key, + 'worker': self.workerURL(), + 'cert_proxy': self.certProxy, + 'query': query, + 'dataset': datasetName, + 'tokens': tokens + } + self.talkToServer(self.serverURL + self.servicePage, query_args) + + self.remoteFile = None + for line in self.output: + m = re.search("FILE=(.+)$", line) + if m: + return self.waitForFile( m.group(1) ) + + return self.scanOutputForGuids() + + + def scanOutputForGuids(self): + """ Scan the server output looking for GUIDs + return None in case of errors + """ + self.countedGuids = [] + self.tokens = [] + stage = None + tokpat = re.compile(r'([0-9A-F]{8}-([0-9A-F]{4}-){3}[0-9A-F]{12})') + for line in self.output: + if re.search(self.errorPattern, line, re.I): + #print " -- Error line matched: " + line + return None + if stage == "readGuids": + try: + (count, guidline) = line.split(None,1) + guids = guidline.split() + if tokpat.match(guids[0]): + self.countedGuids.append( (count,guids,) ) + continue + except ValueError: + pass + # end of input, finish + break + if re.search("Event count per distinct GUIDs group:", line): + stage = "readAttribs" + continue + if stage == "readAttribs": + self.tokens = line.split()[1:] + stage = "readGuids" + continue + + return (self.tokens, self.countedGuids) diff --git a/current/pandaserver/dataservice/datriHandler.py b/current/pandaserver/dataservice/datriHandler.py new file mode 100644 index 000000000..de6f8d407 --- /dev/null +++ b/current/pandaserver/dataservice/datriHandler.py @@ -0,0 +1,207 @@ +""" +DaTRI Handler for external applications (curl, python ver. >= 2.4) +CERN, ATLAS Distributed Computing (March 2010) + +@author: Mikhail Titov +@contact: mikhail.titov@cern.ch +@data: June 21, 2013 +@version: 0.97 +""" + +import os +import subprocess +from urllib import urlencode + +HTTPS_PORT = 25943 +PANDAMON_HOST = 'panda.cern.ch' +PANDAMON_URI = '/server/pandamon/query' + +# -s: Silent or quiet mode. Don't show progress meter or error messages. +# -S: When used with -s it makes curl show an error message if it fails. 
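# A minimal usage sketch of the handler defined below (all values are placeholders;
# the dataset pattern, site and DN are illustrative assumptions, not real entries):
#
#   handler = datriHandler(type='pathena')
#   handler.setParameters(data_pattern='mc12_8TeV.123456.SomeDataset/',
#                         site='SOMESITE_DATADISK',
#                         userid='/DC=ch/DC=cern/OU=Users/CN=Some User')
#   status, message = handler.checkData()        # validate the request first
#   if status == 0:
#       status, message = handler.sendRequest()  # 0 means the DaTRI request was created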
+CURL_SILENT_OPTION = '-s' + +PARAMS_LIST = ['mode', 'action', 'dpat', 'site', 'userid'] +PARAMS_LIST_ADDON = ['emails', 'comments'] +MODE = { + 'pathena': 'ddm_pathenareq', + 'ganga': 'ddm_gangareq', + 'group': 'ddm_groupreq'} + +RETRY_NUM = 2 + + +def execute(params): + """Returns tuple (out, err) + + @param params (@type list) + shell command (1st parameter) and its options + """ + try: + p = subprocess.Popen(params, stdout=subprocess.PIPE) + except (OSError, ValueError), e: + return '', 'SubprocessException: %s' % e + else: + return p.communicate() + + +class datriHandler(object): + + """Class datriHandler.""" + + def __init__(self, **kwargs): + """Initialization + + @param kwargs (@type dict) + has "type" with one of the next values: pathena/ganga/group + """ + self.curl = datriCurl() + self.info = {'mode': MODE.get(kwargs.get('type', 'pathena'), '')} + self.err_message = '' + if not self.info['mode']: + self.err_message = 'datriHandler: mode is incorrect' + + def __del__(self): + self.curl = None + self.info.clear() + self.err_message = '' + + def hasParams(self): + """Check that parameters are defined and are not null + + @return (@type bool) + True/False + """ + for p in PARAMS_LIST: + if not self.info.get(p, None): + return False + return True + + def setParameters(self, data_pattern, site, userid, **kwargs): + """Define request parameters + + @param data_pattern (@type str) + dataset | container | pattern + @param site (@type str) + destination site (see AGIS/TiersOfAtlas) + @param userid (@type str) + unique user identification (certificate dn | email) + """ + if data_pattern and site and userid: + self.info.update({'dpat': data_pattern, + 'site': site, + 'userid': userid}) + for p in PARAMS_LIST_ADDON: + if p in kwargs: + self.info[p] = kwargs[p] + else: + self.err_message = 'datriHandler: required data are not defined' + + def checkData(self): + """Check request data (send "Check"-request) + + @return (@type typle: int, str) + returns status code and info (error) message + """ + if not self.err_message: + self.info['action'] = 'Check' + if self.hasParams(): + return self.curl.get(**self.info) + else: + self.err_message = 'datriHandler: required data are not defined' + return 4, self.err_message + + def sendRequest(self): + """Send request to DaTRI (send "Submit"-request) + + @return (@type typle: int, str) + returns status code and info (error) message + """ + if not self.err_message: + self.info['action'] = 'Submit' + if self.hasParams(): + return self.curl.get(**self.info) + else: + self.err_message = 'datriHandler: required data are not defined' + return 4, self.err_message + +# - Class for https-request definition - + +class datriCurl(object): + + """Class datriCurl for curl-command creation.""" + + def __init__(self): + self.err_message = '' + self.cmd_params = ['curl', + '--user-agent', 'datricurl', + '--max-redirs', '5', + '--max-time', '90', + CURL_SILENT_OPTION, + '-G'] + self._user_proxy() + self._ca_path() + # - url definition - + self.url = 'https://%s:%s%s' % (PANDAMON_HOST, HTTPS_PORT, PANDAMON_URI) + + def _user_proxy(self): + cert = os.environ.get('X509_USER_PROXY') + if not cert: + cert = '/tmp/x509up_u%s' % os.getuid() + if not os.access(cert, os.R_OK): + cert = None + if cert: + self.cmd_params.extend(['--cert', cert, '--cacert', cert]) + else: + self.err_message += 'User proxy certificate is not defined; ' + + def _ca_path(self): + if os.environ.get('X509_CERT_DIR'): + self.cmd_params.extend(['--capath', os.environ['X509_CERT_DIR']]) + else: + 
self.err_message += 'CA-path is not defined; ' + + # - method GET - + def get(self, **kwargs): + """Returns status code and response message + + @param kwargs (@type dict) + parameters for DaTRI request definition (see PARAMS_LIST) + @return (@type typle: int, str) + returns status code and info (error) message + """ + if not self.err_message: + if not kwargs: + return 2, 'datriCurl: input parameters are not defined' + o, e = '', ' is not defined' + # - several attempts for cmd execution - begin - + cmd_params = (self.cmd_params + + ['--url', '%s?%s' % (self.url, urlencode(kwargs))]) + for i in range(RETRY_NUM): + o, e = execute(cmd_params) + if o and not e: + return (0, o) if o.startswith('OK.') else (1, o) + # - several attempts for cmd execution - end - + return 3, 'datriCurl: execution error (output=%s, error=%s)' % (o, e) + return 5, 'datriCurl: %s' % self.err_message + + +####################################################################################### +# datriHandler - Status code definition: # +# # +# 0 - DaTRI request - CREATED SUCCESSFULLY # +# # +# 1 - DaTRI request - NOT CREATED [due to incorrect input data] # +# datriHandler - EXECUTED SUCCESSFULLY # +# # +# 2 - DaTRI request - NOT CREATED # +# datriHandler - FAILED [due to lack of input data at datriCurl.get] # +# # +# 3 - DaTRI request - NOT CREATED # +# datriHandler - FAILED [due to failure at datriCurl.get] # +# # +# 4 - DaTRI request - NOT CREATED # +# datriHandler - FAILED [due to lack of input data at datriHandler.setParameters] # +# # +# 5 - DaTRI request - NOT CREATED # +# datriHandler - FAILED [due to failure at datriCurl] # +####################################################################################### diff --git a/current/pandaserver/dataservice/eventLookupClient.py b/current/pandaserver/dataservice/eventLookupClient.py new file mode 100644 index 000000000..b7ae3391a --- /dev/null +++ b/current/pandaserver/dataservice/eventLookupClient.py @@ -0,0 +1,201 @@ +import urllib, re, string, os, time + +# client for eventLookup Athenaeum service +# author: Marcin.Nowak@cern.ch + +class eventLookupClient: + + serverURL = "http://j2eeps.cern.ch/atlas-project-Athenaeum/" + #serverURL = "http://j2eeps.cern.ch/test-Athenaeum/" + #serverURL = "http://j2eeps.cern.ch/test-eventPicking/" + lookupPage = "EventLookup.jsp" + getPage = "EventLookupGet.jsp" + key = "insider" + workerHost = "atlas-tagservices.cern.ch" + #workerHost = "atlddm10.cern.ch" #this is at the moment the real host aliased by atlas-tagservices + #workerHost = "voatlas69.cern.ch" + workerPort = '10004' + connectionRefusedSleep = 20 + errorPattern = "(Exception)|(Error)|(Lookup cannot be run)|(invalid)|(NOT EXISTING)" + + + def __init__(self): + self.output = "" + self.guids = {} + self.guidsLine = "" + self.certProxyFileName = None + self.certProxy = "" + self.debug = None + self.remoteFile = None + try: + self.certProxyFileName = os.environ['X509_USER_PROXY'] + except KeyError: + print 'You do not seem to have a certificate proxy! 
(do voms-proxy-init)' + return + proxy = open(self.certProxyFileName) + try: + for line in proxy: + self.certProxy += line + finally: + proxy.close() + + + def workerURL(self): + if self.workerHost.find(":") > 0: + # port number together with the host name, possibly from commandline option + return "http://" + self.workerHost + else: + return "http://" + self.workerHost + ":" + self.workerPort + + + def doLookup(self, inputEvents, async=None, stream="", tokens="", + amitag="", extract=False): + """ contact the server and return a list of GUIDs + inputEvents - list of run-event pairs + async - request query procesing in a separate process, client will poll for results + stream - stream + tokens - token names + amitag - used to select reprocessing pass (default empty means the latest) + """ + if inputEvents == []: + return [] + + runs_events = "" + runs = set() + sep = "" + for run_ev in inputEvents: + runs_events += sep + run_ev[0] + " " + run_ev[1] + sep = "\n" + runs.add(run_ev[0]); + + if async is None: + if len(runs) > 50 or len(inputEvents) > 1000: + async = True + if async: + asyncStr = "true" + else: + asyncStr = "false" + + query_args = { 'key': self.key, + 'worker': self.workerURL(), + 'runs_events': runs_events, + 'cert_proxy': self.certProxy, + 'async': asyncStr, + 'stream': stream, + 'amitag': amitag, + 'tokens': tokens + } + if extract: + query_args['extract'] = "true" + + self.talkToServer(self.serverURL + self.lookupPage, query_args) + if not async: + for line in self.output: + if re.search("502 Bad Gateway", line): + # usually signifies a timeout on the J2EE server + print "Timeout detected. Retrying in asynchronous mode" + query_args['async'] = "true" + self.talkToServer(self.serverURL + self.lookupPage, query_args) + break + + self.remoteFile = None + for line in self.output: + m = re.search("FILE=(.+)$", line) + if m: + return self.waitForFile( m.group(1) ) + + return self.scanOutputForGuids() + + + def talkToServer(self, url, args): + encoded_args = urllib.urlencode(args) + if self.debug: + print "Contacting URL: " + url + print encoded_args + + for _try in range(1,6): + response = urllib.urlopen(url, encoded_args) + self.output = [] + retry = False + for line in response: + self.output.append(line) + if re.search("Connection refused", line): + retry = True + if retry: + if self.debug: + print "Failed to connect to the server, try " + str(_try) + time.sleep(self.connectionRefusedSleep) + else: + break + + + def scanOutputForGuids(self): + """ Scan the server output looking for a line with GUIDs + return list of GUIDs if line found, put GUIDs in self.guids + return None in case of errors + """ + self.guids = {} + self.tags = [] + self.tagAttributes = None + stage = None + tokpat = re.compile(r'[[]DB=(?P.*?)[]]') + for line in self.output: + if re.search(self.errorPattern, line, re.I): + #print " -- Error line matched: " + line + return None + if stage == "readTags": + if line[0:1] == ":": + # break the line up into attributes, extract GUIDs + values = [] + for attr in string.split(line[1:]): + tok = tokpat.match(attr) + if tok: + attr = tok.group('FID') + # self.guids - TODO - populate the guids dict + values.append(attr) + self.tags.append( values ) + continue + else: + return (self.tagAttributes, self.tags) + if re.match("\{.*\}$", line): + guids = eval(line) + if type(guids).__name__!='dict': + return None + self.guids = guids + return guids + if re.search("TAGs extracted:", line): + stage = "readAttribs" + continue + if stage == "readAttribs": + self.tagAttributes = 
string.split(line.strip(),",") + stage = "readTags" + continue + return None + + + def waitForFile(self, file): + """ Wait for the server to do EventLookup and store results in file + Retrieve the file and scan for GUIDs - return them if found + """ + query_args = { 'key': self.key, + 'worker': self.workerURL(), + 'file' : file, + 'wait_time' : "45" + } + self.remoteFile = file + if self.debug: + print "EventLookup waiting for server. Remote file=" + file + + ready = False + while not ready: + self.talkToServer(self.serverURL + self.getPage, query_args) + ready = True + for line in self.output: + if re.match("NOT READY", line): + if self.debug: + print "received NOT READY" + time.sleep(1) + ready = False + + return self.scanOutputForGuids() + diff --git a/current/pandaserver/dataservice/forkSetupper.py b/current/pandaserver/dataservice/forkSetupper.py new file mode 100755 index 000000000..415995de7 --- /dev/null +++ b/current/pandaserver/dataservice/forkSetupper.py @@ -0,0 +1,74 @@ +import os +import sys +import commands + +# exec +def run(inFile,v_onlyTA): + import cPickle as pickle + try: + # read Jobs from file + f = open(inFile) + jobs = pickle.load(f) + f.close() + except: + type, value, traceBack = sys.exc_info() + print("run() : %s %s" % (type,value)) + return + # password + from config import panda_config + passwd = panda_config.dbpasswd + # initialize cx_Oracle using dummy connection + from taskbuffer.Initializer import initializer + initializer.init() + # instantiate TB + from taskbuffer.TaskBuffer import taskBuffer + taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + # run Setupper + from dataservice.Setupper import Setupper + thr = Setupper(taskBuffer,jobs,onlyTA=v_onlyTA,useNativeDQ2=True) + thr.start() + thr.join() + return + + +# exit action +def _onExit(fname): + commands.getoutput('rm -rf %s' % fname) + + +#################################################################### +# main +def main(): + import getopt + import atexit + # option class + class _options: + def __init__(self): + pass + options = _options() + del _options + # set default values + options.inFile = "" + options.onlyTA = False + # get command-line parameters + try: + opts, args = getopt.getopt(sys.argv[1:],"i:t") + except: + print("ERROR : Invalid options") + sys.exit(1) + # set options + for o, a in opts: + if o in ("-i",): + options.inFile = a + if o in ("-t",): + options.onlyTA = True + # exit action + atexit.register(_onExit,options.inFile) + # run + run(options.inFile,options.onlyTA) + # return + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/current/pandaserver/jobdispatcher/ErrorCode.py b/current/pandaserver/jobdispatcher/ErrorCode.py new file mode 100755 index 000000000..e58b1b444 --- /dev/null +++ b/current/pandaserver/jobdispatcher/ErrorCode.py @@ -0,0 +1,11 @@ +############## errror code + +# Watcher +EC_Watcher = 100 + +# recovery failed +EC_Recovery = 101 + +# send failed +EC_SendError = 102 + diff --git a/current/pandaserver/jobdispatcher/JobDispatcher.py b/current/pandaserver/jobdispatcher/JobDispatcher.py new file mode 100755 index 000000000..86f126921 --- /dev/null +++ b/current/pandaserver/jobdispatcher/JobDispatcher.py @@ -0,0 +1,541 @@ +""" +dispatch jobs + +""" + +import re +import types +import threading +import Protocol +import time +import datetime +import commands +from threading import Lock +from config import panda_config +from dataservice.Adder import Adder +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = 
PandaLogger().getLogger('JobDispatcher') +_pilotReqLogger = PandaLogger().getLogger('PilotRequests') + + +# a wrapper to install timpout into a method +class _TimedMethod: + def __init__(self,method,timeout): + self.method = method + self.timeout = timeout + self.result = Protocol.TimeOutToken + + # method emulation + def __call__(self,*var): + self.result = apply(self.method,var) + + # run + def run(self,*var): + thr = threading.Thread(target=self,args=var) + # run thread + thr.start() + thr.join() #self.timeout) + + +# job dipatcher +class JobDipatcher: + # constructor + def __init__(self): + # taskbuffer + self.taskBuffer = None + # DN/token map + self.tokenDN = None + # datetime of last updated + self.lastUpdated = datetime.datetime.utcnow() + # how frequently update DN/token map + self.timeInterval = datetime.timedelta(seconds=180) + # pilot owners + self.pilotOwners = None + # hostnames for authorization at grid-free sites + self.allowedNodes = None + # lock + self.lock = Lock() + + + # set task buffer + def init(self,taskBuffer): + # lock + self.lock.acquire() + # set TB + if self.taskBuffer == None: + self.taskBuffer = taskBuffer + # update DN/token map + if self.tokenDN == None: + self.tokenDN = self.taskBuffer.getListSchedUsers() + # get pilot owners + if self.pilotOwners == None: + self.pilotOwners = self.taskBuffer.getPilotOwners() + # get allowed nodes + if self.allowedNodes == None: + self.allowedNodes = self.taskBuffer.getAllowedNodes() + # release + self.lock.release() + + + # get job + def getJob(self,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, + atlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup,allowOtherCountry): + jobs = [] + # wrapper function for timeout + tmpWrapper = _TimedMethod(self.taskBuffer.getJobs,timeout) + tmpWrapper.run(1,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, + atlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup,allowOtherCountry) + if isinstance(tmpWrapper.result,types.ListType): + jobs = jobs + tmpWrapper.result + # make response + if len(jobs) > 0: + proxyKey = jobs[-1] + nSent = jobs[-2] + jobs = jobs[:-2] + if len(jobs) != 0: + # succeed + response=Protocol.Response(Protocol.SC_Success) + # append Job + response.appendJob(jobs[0]) + # append nSent + response.appendNode('nSent',nSent) + # set proxy key + if getProxyKey: + response.setProxyKey(proxyKey) + else: + if tmpWrapper.result == Protocol.TimeOutToken: + # timeout + response=Protocol.Response(Protocol.SC_TimeOut) + else: + # no available jobs + response=Protocol.Response(Protocol.SC_NoJobs) + # return + _logger.debug("getJob : %s %s ret -> %s" % (siteName,node,response.encode())) + return response.encode() + + + # update job status + def updateJob(self,jobID,jobStatus,timeout,xml,siteName,param,metadata,attemptNr=None,stdout=''): + # retry failed analysis job and ddm job + if jobStatus=='failed' \ + and ((param.has_key('pilotErrorCode') and (param['pilotErrorCode'] in ['1200','1201'] \ + or param['pilotErrorCode'].startswith('-'))) \ + or (siteName != None and siteName.find('DDM') != -1)): + # retry + if param.has_key('pilotErrorCode') and param['pilotErrorCode'].startswith('-'): + # pilot retry with new PandaID + ret = self.taskBuffer.retryJob(jobID,param,getNewPandaID=True,attemptNr=attemptNr) + else: + # old style + ret = self.taskBuffer.retryJob(jobID,param,attemptNr=attemptNr) + if ret: + # return succeed + response=Protocol.Response(Protocol.SC_Success) + return response.encode() + # add metadata + if 
metadata != '': + self.taskBuffer.addMetadata([jobID],[metadata]) + # add stdout + if stdout != '': + self.taskBuffer.addStdOut(jobID,stdout) + # update + tmpStatus = jobStatus + updateStateChange = False + if jobStatus == 'failed' or jobStatus == 'finished': + tmpStatus = 'holding' + # update stateChangeTime to prevent Watcher from finding this job + updateStateChange = True + if tmpStatus == 'holding': + tmpWrapper = _TimedMethod(self.taskBuffer.updateJobStatus,None) + else: + tmpWrapper = _TimedMethod(self.taskBuffer.updateJobStatus,timeout) + tmpWrapper.run(jobID,tmpStatus,param,updateStateChange,attemptNr) + # make response + if tmpWrapper.result == Protocol.TimeOutToken: + # timeout + response=Protocol.Response(Protocol.SC_TimeOut) + else: + if tmpWrapper.result: + # succeed + response=Protocol.Response(Protocol.SC_Success) + # set command + if isinstance(tmpWrapper.result,types.StringType): + response.appendNode('command',tmpWrapper.result) + else: + response.appendNode('command','NULL') + # add output to dataset + if tmpWrapper.result != "badattemptnr" and (jobStatus == 'failed' or jobStatus == 'finished'): + Adder(self.taskBuffer,jobID,xml,jobStatus,attemptNr=attemptNr).start() + else: + # failed + response=Protocol.Response(Protocol.SC_Failed) + _logger.debug("updateJob : %s ret -> %s" % (jobID,response.encode())) + return response.encode() + + + # get job status + def getStatus(self,strIDs,timeout): + # convert str to list + ids = strIDs.split() + # peek jobs + tmpWrapper = _TimedMethod(self.taskBuffer.peekJobs,timeout) + tmpWrapper.run(ids,False,True,True,False) + # make response + if tmpWrapper.result == Protocol.TimeOutToken: + # timeout + response=Protocol.Response(Protocol.SC_TimeOut) + else: + if isinstance(tmpWrapper.result,types.ListType): + # succeed + response=Protocol.Response(Protocol.SC_Success) + # make return + retStr = '' + attStr = '' + for job in tmpWrapper.result: + if job == None: + retStr += '%s+' % 'notfound' + attStr += '0+' + else: + retStr += '%s+' % job.jobStatus + attStr += '%s+' % job.attemptNr + response.appendNode('status',retStr[:-1]) + response.appendNode('attemptNr',attStr[:-1]) + else: + # failed + response=Protocol.Response(Protocol.SC_Failed) + _logger.debug("getStatus : %s ret -> %s" % (strIDs,response.encode())) + return response.encode() + + + # get DN/token map + def getDnTokenMap(self): + # get current datetime + current = datetime.datetime.utcnow() + # lock + self.lock.acquire() + # update DN map if old + if current-self.lastUpdated > self.timeInterval: + # get new map + self.tokenDN = self.taskBuffer.getListSchedUsers() + # reset + self.lastUpdated = current + # release + self.lock.release() + # return + return self.tokenDN + + + # generate pilot token + def genPilotToken(self,schedulerhost,scheduleruser,schedulerid): + retVal = self.taskBuffer.genPilotToken(schedulerhost,scheduleruser,schedulerid) + # failed + if retVal == None: + return "ERROR : failed to generate token" + return "SUCCEEDED : " + retVal + + +# Singleton +jobDispatcher = JobDipatcher() +del JobDipatcher + + +# get FQANs +def _getFQAN(req): + fqans = [] + for tmpKey,tmpVal in req.subprocess_env.iteritems(): + # compact credentials + if tmpKey.startswith('GRST_CRED_'): + # VOMS attribute + if tmpVal.startswith('VOMS'): + # FQAN + fqan = tmpVal.split()[-1] + # append + fqans.append(fqan) + # old style + elif tmpKey.startswith('GRST_CONN_'): + tmpItems = tmpVal.split(':') + # FQAN + if len(tmpItems)==2 and tmpItems[0]=='fqan': + fqans.append(tmpItems[-1]) + # return + 
return fqans + + +# check role +def _checkRole(fqans,dn,jdCore,withVomsPatch=True,site='',hostname=''): + prodManager = False + try: + # VOMS attributes of production and pilot roles + prodAttrs = ['/atlas/usatlas/Role=production', + '/atlas/usatlas/Role=pilot', + '/atlas/Role=production', + '/atlas/Role=pilot', + '/osg/Role=pilot', + '/Engage/LBNE/Role=pilot', + ] + if withVomsPatch: + # FIXEME once http://savannah.cern.ch/bugs/?47136 is solved + prodAttrs += ['/atlas/'] + prodAttrs += ['/osg/'] + prodAttrs += ['/Engage/LBNE/'] + for fqan in fqans: + # check atlas/usatlas production role + for rolePat in prodAttrs: + if fqan.startswith(rolePat): + prodManager = True + break + # escape + if prodManager: + break + # service proxy for CERNVM + if site in ['CERNVM']: + serviceSubjects = ['/DC=ch/DC=cern/OU=computers/CN=pilot/copilot.cern.ch'] + for tmpSub in serviceSubjects: + if dn.startswith(tmpSub): + prodManager = True + break + # grid-free authorization + if not prodManager: + if hostname != '' and jdCore.allowedNodes.has_key(site): + for tmpPat in jdCore.allowedNodes[site]: + if re.search(tmpPat,hostname) != None: + prodManager = True + break + # check DN with pilotOwners + if (not prodManager) and (not dn in [None]): + for owner in jdCore.pilotOwners: + # check + if re.search(owner,dn) != None: + prodManager = True + break + except: + pass + # return + return prodManager + + +# get DN +def _getDN(req): + realDN = None + if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + realDN = req.subprocess_env['SSL_CLIENT_S_DN'] + # remove redundant CN + realDN = re.sub('/CN=limited proxy','',realDN) + realDN = re.sub('/CN=proxy(/CN=proxy)+','/CN=proxy',realDN) + # return + return realDN + + +# check token +def _checkToken(token,jdCore): + # not check None until all pilots use tokens + if token == None: + return True + # get map + tokenDN = jdCore.getDnTokenMap() + # return + return tokenDN.has_key(token) + + + +""" +web service interface + +""" + +# get job +def getJob(req,siteName,token=None,timeout=60,cpu=None,mem=None,diskSpace=None,prodSourceLabel=None,node=None, + computingElement=None,AtlasRelease=None,prodUserID=None,getProxyKey=None,countryGroup=None, + workingGroup=None,allowOtherCountry=None): + _logger.debug("getJob(%s)" % siteName) + # get DN + realDN = _getDN(req) + # get FQANs + fqans = _getFQAN(req) + # check production role + if getProxyKey == 'True': + # don't use /atlas to prevent normal proxy getting credname + prodManager = _checkRole(fqans,realDN,jobDispatcher,False,site=siteName) + else: + prodManager = _checkRole(fqans,realDN,jobDispatcher,site=siteName, + hostname=req.get_remote_host()) + # check token + validToken = _checkToken(token,jobDispatcher) + # set DN for non-production user + if not prodManager: + prodUserID = realDN + # allow getProxyKey for production role + if getProxyKey == 'True' and prodManager: + getProxyKey = True + else: + getProxyKey = False + # convert mem and diskSpace + try: + mem = int(float(mem)) + if mem < 0: + mem = 0 + except: + mem = 0 + try: + diskSpace = int(float(diskSpace)) + if diskSpace < 0: + diskSpace = 0 + except: + diskSpace = 0 + _logger.debug("getJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s)" \ + % (siteName,cpu,mem,diskSpace,prodSourceLabel,node, + computingElement,AtlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup, + allowOtherCountry,realDN,prodManager,token,validToken,str(fqans))) + _pilotReqLogger.info('method=getJob,site=%s,node=%s,type=%s' % (siteName,node,prodSourceLabel)) + # 
invalid role + if (not prodManager) and (not prodSourceLabel in ['user']): + _logger.warning("getJob(%s) : invalid role" % siteName) + return Protocol.Response(Protocol.SC_Role).encode() + # invalid token + if not validToken: + _logger.warning("getJob(%s) : invalid token" % siteName) + return Protocol.Response(Protocol.SC_Invalid).encode() + # invoke JD + return jobDispatcher.getJob(siteName,prodSourceLabel,cpu,mem,diskSpace,node,int(timeout), + computingElement,AtlasRelease,prodUserID,getProxyKey,countryGroup, + workingGroup,allowOtherCountry) + + +# update job status +def updateJob(req,jobId,state,token=None,transExitCode=None,pilotErrorCode=None,pilotErrorDiag=None,timestamp=None,timeout=60, + xml='',node=None,workdir=None,cpuConsumptionTime=None,cpuConsumptionUnit=None,remainingSpace=None, + schedulerID=None,pilotID=None,siteName=None,messageLevel=None,pilotLog='',metaData='', + cpuConversionFactor=None,exeErrorCode=None,exeErrorDiag=None,pilotTiming=None,computingElement=None, + startTime=None,endTime=None,nEvents=None,nInputFiles=None,batchID=None,attemptNr=None,jobMetrics=None, + stdout=''): + _logger.debug("updateJob(%s)" % jobId) + # get DN + realDN = _getDN(req) + # get FQANs + fqans = _getFQAN(req) + # check production role + prodManager = _checkRole(fqans,realDN,jobDispatcher,site=siteName,hostname=req.get_remote_host()) + # check token + validToken = _checkToken(token,jobDispatcher) + _logger.debug("updateJob(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,attemptNr:%s,DN:%s,role:%s,token:%s,val:%s,FQAN:%s\n==XML==\n%s\n==LOG==\n%s\n==Meta==\n%s\n==Metrics==\n%s\n==stdout==\n%s)" % + (jobId,state,transExitCode,pilotErrorCode,pilotErrorDiag,node,workdir,cpuConsumptionTime, + cpuConsumptionUnit,remainingSpace,schedulerID,pilotID,siteName,messageLevel,nEvents,nInputFiles, + cpuConversionFactor,exeErrorCode,exeErrorDiag,pilotTiming,computingElement,startTime,endTime, + batchID,attemptNr,realDN,prodManager,token,validToken,str(fqans),xml,pilotLog,metaData,jobMetrics, + stdout)) + _pilotReqLogger.info('method=updateJob,site=%s,node=%s,type=None' % (siteName,node)) + # invalid role + if not prodManager: + _logger.warning("updateJob(%s) : invalid role" % jobId) + return Protocol.Response(Protocol.SC_Role).encode() + # invalid token + if not validToken: + _logger.warning("updateJob(%s) : invalid token" % jobId) + return Protocol.Response(Protocol.SC_Invalid).encode() + # aborting message + if jobId=='NULL': + return Protocol.Response(Protocol.SC_Success).encode() + # check status + if not state in ['running','failed','finished','holding','starting','transferring']: + _logger.warning("invalid state=%s for updateJob" % state) + return Protocol.Response(Protocol.SC_Success).encode() + # pilot log + if pilotLog != '': + try: + # make message + message = pilotLog + # get logger + _pandaLogger = PandaLogger() + _pandaLogger.lock() + _pandaLogger.setParams({'Type':'pilotLog','PandaID':int(jobId)}) + logger = _pandaLogger.getHttpLogger(panda_config.loggername) + # add message + logger.info(message) + # release HTTP handler + _pandaLogger.release() + except: + pass + # create parameter map + param = {} + if cpuConsumptionTime != None: + param['cpuConsumptionTime']=cpuConsumptionTime + if cpuConsumptionUnit != None: + param['cpuConsumptionUnit']=cpuConsumptionUnit + if node != None: + param['modificationHost']=node + if transExitCode != None: + param['transExitCode']=transExitCode + if pilotErrorCode != None: + param['pilotErrorCode']=pilotErrorCode + if pilotErrorDiag != 
None: + param['pilotErrorDiag']=pilotErrorDiag[:500] + if jobMetrics != None: + param['jobMetrics']=jobMetrics[:500] + if schedulerID != None: + param['schedulerID']=schedulerID + if pilotID != None: + param['pilotID']=pilotID[:200] + if batchID != None: + param['batchID']=batchID + if exeErrorCode != None: + param['exeErrorCode']=exeErrorCode + if exeErrorDiag != None: + param['exeErrorDiag']=exeErrorDiag[:500] + if cpuConversionFactor != None: + param['cpuConversion']=cpuConversionFactor + if pilotTiming != None: + param['pilotTiming']=pilotTiming + if computingElement != None: + param['computingElement']=computingElement + if nEvents != None: + param['nEvents']=nEvents + if nInputFiles != None: + param['nInputFiles']=nInputFiles + if startTime != None: + try: + param['startTime']=datetime.datetime(*time.strptime(startTime,'%Y-%m-%d %H:%M:%S')[:6]) + except: + pass + if endTime != None: + try: + param['endTime']=datetime.datetime(*time.strptime(endTime,'%Y-%m-%d %H:%M:%S')[:6]) + except: + pass + if attemptNr != None: + try: + attemptNr = int(attemptNr) + except: + attemptNr = None + if stdout != '': + stdout = stdout[:2048] + # invoke JD + return jobDispatcher.updateJob(int(jobId),state,int(timeout),xml,siteName, + param,metaData,attemptNr,stdout) + + +# get job status +def getStatus(req,ids,timeout=60): + _logger.debug("getStatus(%s)" % ids) + return jobDispatcher.getStatus(ids,int(timeout)) + + +# generate pilot token +def genPilotToken(req,schedulerid,host=None): + # get DN + realDN = _getDN(req) + # get FQANs + fqans = _getFQAN(req) + # check production role + prodManager = _checkRole(fqans,realDN,jobDispatcher,False) + if not prodManager: + return "ERROR : production or pilot role is required" + if realDN == None: + return "ERROR : failed to retrive DN" + # hostname + if host == None: + host = req.get_remote_host() + # return + return jobDispatcher.genPilotToken(host,realDN,schedulerid) + diff --git a/current/pandaserver/jobdispatcher/Protocol.py b/current/pandaserver/jobdispatcher/Protocol.py new file mode 100755 index 000000000..42cbd9d4d --- /dev/null +++ b/current/pandaserver/jobdispatcher/Protocol.py @@ -0,0 +1,212 @@ +import urllib + + +# constants +TimeOutToken = "TimeOut" +NoJobsToken = "NoJobs" + +########### status codes +# succeeded +SC_Success = 0 +# timeout +SC_TimeOut = 10 +# no available jobs +SC_NoJobs = 20 +# failed +SC_Failed = 30 +# Not secure connection +SC_NonSecure = 40 +# invalid token +SC_Invalid = 50 +# invalid role +SC_Role = 60 + + +# response +class Response: + # constructor + def __init__(self,statusCode): + # create data object + self.data = {'StatusCode':statusCode} + + + # URL encode + def encode(self): + return urllib.urlencode(self.data) + + + # append Node + def appendNode(self,name,value): + self.data[name]=value + + + # append job + def appendJob(self,job): + # PandaID + self.data['PandaID'] = job.PandaID + # prodSourceLabel + self.data['prodSourceLabel'] = job.prodSourceLabel + # swRelease + self.data['swRelease'] = job.AtlasRelease + # homepackage + self.data['homepackage'] = job.homepackage + # transformation + self.data['transformation'] = job.transformation + # job name + self.data['jobName'] = job.jobName + # job definition ID + self.data['jobDefinitionID'] = job.jobDefinitionID + # cloud + self.data['cloud'] = job.cloud + # files + strIFiles = '' + strOFiles = '' + strDispatch = '' + strDisToken = '' + strDisTokenForOutput = '' + strDestination = '' + strRealDataset = '' + strRealDatasetIn = '' + strDestToken = '' + strProdToken = '' + 
strGUID = '' + strFSize = '' + strCheckSum = '' + strScopeIn = '' + strScopeOut = '' + strScopeLog = '' + logFile = '' + logGUID = '' + for file in job.Files: + if file.type == 'input': + if strIFiles != '': + strIFiles += ',' + strIFiles += file.lfn + if strDispatch != '': + strDispatch += ',' + strDispatch += file.dispatchDBlock + if strDisToken != '': + strDisToken += ',' + strDisToken += file.dispatchDBlockToken + if strProdToken != '': + strProdToken += ',' + strProdToken += file.prodDBlockToken + if strGUID != '': + strGUID += ',' + strGUID += file.GUID + strRealDatasetIn += '%s,' % file.dataset + strFSize += '%s,' % file.fsize + if not file.checksum in ['','NULL',None]: + strCheckSum += '%s,' % file.checksum + else: + strCheckSum += '%s,' % file.md5sum + strScopeIn += '%s,' % file.scope + if file.type == 'output' or file.type == 'log': + if strOFiles != '': + strOFiles += ',' + strOFiles += file.lfn + if strDestination != '': + strDestination += ',' + strDestination += file.destinationDBlock + if strRealDataset != '': + strRealDataset += ',' + strRealDataset += file.dataset + if file.type == 'log': + logFile = file.lfn + logGUID = file.GUID + strScopeLog = file.scope + else: + strScopeOut += '%s,' % file.scope + if strDestToken != '': + strDestToken += ',' + strDestToken += file.destinationDBlockToken.split(',')[0] + strDisTokenForOutput += '%s,' % file.dispatchDBlockToken + # inFiles + self.data['inFiles'] = strIFiles + # dispatch DBlock + self.data['dispatchDblock'] = strDispatch + # dispatch DBlock space token + self.data['dispatchDBlockToken'] = strDisToken + # dispatch DBlock space token for output + self.data['dispatchDBlockTokenForOut'] = strDisTokenForOutput[:-1] + # outFiles + self.data['outFiles'] = strOFiles + # destination DBlock + self.data['destinationDblock'] = strDestination + # destination DBlock space token + self.data['destinationDBlockToken'] = strDestToken + # prod DBlock space token + self.data['prodDBlockToken'] = strProdToken + # real output datasets + self.data['realDatasets'] = strRealDataset + # real output datasets + self.data['realDatasetsIn'] = strRealDatasetIn[:-1] + # log filename + self.data['logFile'] = logFile + # log GUID + self.data['logGUID'] = logGUID + # jobPars + self.data['jobPars'] = job.jobParameters + # attempt number + self.data['attemptNr'] = job.attemptNr + # GUIDs + self.data['GUID'] = strGUID + # checksum + self.data['checksum'] = strCheckSum[:-1] + # fsize + self.data['fsize'] = strFSize[:-1] + # scope + self.data['scopeIn'] = strScopeIn[:-1] + self.data['scopeOut'] = strScopeOut[:-1] + self.data['scopeLog'] = strScopeLog + # destinationSE + self.data['destinationSE'] = job.destinationSE + # user ID + self.data['prodUserID'] = job.prodUserID + # CPU count + self.data['maxCpuCount'] = job.maxCpuCount + # RAM count + self.data['minRamCount'] = job.minRamCount + # disk count + self.data['maxDiskCount'] = job.maxDiskCount + # cmtconfig + self.data['cmtConfig'] = job.cmtConfig + # processingType + self.data['processingType'] = job.processingType + # transferType + self.data['transferType'] = job.transferType + # current priority + self.data['currentPriority'] = job.currentPriority + # taskID + self.data['taskID'] = job.taskID + # debug mode + if job.specialHandling != None and 'debug' in job.specialHandling: + self.data['debug'] = 'True' + + + # set proxy key + def setProxyKey(self,proxyKey): + names = ['credname','myproxy'] + for name in names: + if proxyKey.has_key(name): + self.data[name] = proxyKey[name] + else: + self.data[name] = 
'' + + +# check if secure connection +def isSecure(req): + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return False + return True + + +# get user DN +def getUserDN(req): + try: + return req.subprocess_env['SSL_CLIENT_S_DN'] + except: + return 'None' + + + diff --git a/current/pandaserver/jobdispatcher/Watcher.py b/current/pandaserver/jobdispatcher/Watcher.py new file mode 100755 index 000000000..f07e6d922 --- /dev/null +++ b/current/pandaserver/jobdispatcher/Watcher.py @@ -0,0 +1,172 @@ +''' +watch job + +''' + +import re +import sys +import time +import commands +import datetime +import threading +import ErrorCode + +import taskbuffer.ErrorCode + +from brokerage.PandaSiteIDs import PandaSiteIDs + +from dataservice.Closer import Closer +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('Watcher') + + +class Watcher (threading.Thread): + # constructor + def __init__(self,taskBuffer,pandaID,single=False,sleepTime=360,sitemapper=None): + threading.Thread.__init__(self) + self.pandaID = pandaID + self.taskBuffer = taskBuffer + self.sleepTime = sleepTime + self.single = single + self.siteMapper = sitemapper + + # main + def run(self): + try: + while True: + _logger.debug('%s start' % self.pandaID) + # query job + job = self.taskBuffer.peekJobs([self.pandaID],fromDefined=False, + fromArchived=False,fromWaiting=False)[0] + # check job status + if job == None or (not job.jobStatus in ['running','sent','starting','holding', + 'stagein','stageout']): + _logger.debug('%s escape : %s' % (self.pandaID,job.jobStatus)) + return + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=self.sleepTime) + if job.modificationTime < timeLimit or (job.endTime != 'NULL' and job.endTime < timeLimit): + _logger.debug('%s %s lastmod:%s endtime:%s' % (job.PandaID,job.jobStatus, + str(job.modificationTime), + str(job.endTime))) + destDBList = [] + # retry analysis jobs + if (job.prodSourceLabel in ['user','panda']) and (job.attemptNr<2 or job.jobStatus == 'sent') \ + and job.commandToPilot != 'tobekilled' and (not job.processingType in ['ITB_INTEGRATION']) \ + and not job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_Reassigned, + taskbuffer.ErrorCode.EC_Retried, + taskbuffer.ErrorCode.EC_PilotRetried] \ + and not job.processingType.startswith('gangarobot') \ + and not job.processingType.startswith('hammercloud'): + # reset + _logger.debug(' -> reset %s job with %s : PandaID:%s #%s' % (job.prodSourceLabel,job.jobStatus,job.PandaID,job.attemptNr)) + job.jobStatus = 'activated' + job.startTime = None + job.endTime = None + job.attemptNr = job.attemptNr + 1 + # remove flag regarding to pledge-resource handling + if not job.specialHandling in [None,'NULL','']: + newSpecialHandling = re.sub(',*localpool','',job.specialHandling) + if newSpecialHandling == '': + job.specialHandling = None + else: + job.specialHandling = newSpecialHandling + # TEMPORARY : send it to long queue + oldComputingSite = job.computingSite + if job.jobStatus != 'sent' and job.computingSite.startswith('ANALY') and (not job.computingSite.startswith('ANALY_LONG_')): + tmpLongSiteList = [] + tmpLongSite = re.sub('^ANALY_','ANALY_LONG_',job.computingSite) + tmpLongSite = re.sub('_\d+$','',tmpLongSite) + tmpLongSiteList.append(tmpLongSite) + tmpLongSite = job.computingSite + '_LONG' + tmpLongSiteList.append(tmpLongSite) + tmpLongSite = re.sub('SHORT','LONG',job.computingSite) + if tmpLongSite != job.computingSite: + tmpLongSiteList.append(tmpLongSite) + for longSite in 
tmpLongSiteList: + if self.siteMapper.checkSite(longSite): + tmpSiteSpec = self.siteMapper.getSite(longSite) + if tmpSiteSpec.status == 'online': + job.computingSite = longSite + _logger.debug(' -> sending PandaID:%s to %s' % (job.PandaID,job.computingSite)) + # set destinationSE + if job.destinationSE == oldComputingSite: + job.destinationSE = job.computingSite + break + # modify LFNs and destinationSE + for file in job.Files: + modTypes = ('output','log') + if file.type in modTypes: + # set destinationSE + if file.destinationSE == oldComputingSite: + file.destinationSE = job.computingSite + if job.prodSourceLabel == 'panda': + # doesn't change output for buildJob + modTypes = ('log',) + if file.type in modTypes: + # set new GUID + if file.type == 'log': + file.GUID = commands.getoutput('uuidgen') + # add attempt nr + oldName = file.lfn + file.lfn = re.sub("\.\d+$","",file.lfn) + file.lfn = "%s.%d" % (file.lfn,job.attemptNr) + newName = file.lfn + # modify jobParameters + sepPatt = "(\'|\"|%20)" + oldName + "(\'|\"|%20)" + matches = re.findall(sepPatt,job.jobParameters) + for match in matches: + oldPatt = match[0]+oldName+match[-1] + newPatt = match[0]+newName+match[-1] + job.jobParameters = re.sub(oldPatt,newPatt,job.jobParameters) + else: + if job.jobStatus == 'sent': + # sent job didn't receive reply from pilot within 30 min + job.jobDispatcherErrorCode = ErrorCode.EC_SendError + job.jobDispatcherErrorDiag = "Sent job didn't receive reply from pilot within 30 min" + elif job.exeErrorDiag == 'NULL' and job.pilotErrorDiag == 'NULL': + # lost heartbeat + job.jobDispatcherErrorCode = ErrorCode.EC_Watcher + if job.jobDispatcherErrorDiag == 'NULL': + if job.endTime == 'NULL': + # normal lost heartbeat + job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.modificationTime) + else: + # job recovery failed + job.jobDispatcherErrorDiag = 'lost heartbeat : %s' % str(job.endTime) + else: + # job recovery failed + job.jobDispatcherErrorCode = ErrorCode.EC_Recovery + job.jobDispatcherErrorDiag = 'job recovery failed for %s hours' % (self.sleepTime/60) + # set job status + job.jobStatus = 'failed' + # set endTime for lost heartbeat + if job.endTime == 'NULL': + # normal lost heartbeat + job.endTime = job.modificationTime + # set files status + for file in job.Files: + if file.type == 'output' or file.type == 'log': + file.status = 'failed' + if not file.destinationDBlock in destDBList: + destDBList.append(file.destinationDBlock) + # update job + self.taskBuffer.updateJobs([job],False) + # start closer + if job.jobStatus == 'failed': + cThr = Closer(self.taskBuffer,destDBList,job) + cThr.start() + cThr.join() + _logger.debug('%s end' % job.PandaID) + return + # single action + if self.single: + return + # sleep + time.sleep(60*self.sleepTime) + except: + type, value, traceBack = sys.exc_info() + _logger.error("run() : %s %s" % (type,value)) + return diff --git a/current/pandaserver/jobdispatcher/__init__.py b/current/pandaserver/jobdispatcher/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/current/pandaserver/server/panda.py b/current/pandaserver/server/panda.py new file mode 100755 index 000000000..d8d9d4991 --- /dev/null +++ b/current/pandaserver/server/panda.py @@ -0,0 +1,180 @@ +#!/usr/bin/python2.5 + +""" +entry point + +""" + +# config file +from config import panda_config + +# initialize cx_Oracle using dummy connection +from taskbuffer.Initializer import initializer +initializer.init() + +# initialzie TaskBuffer +from taskbuffer.TaskBuffer import taskBuffer 
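+# NOTE: taskBuffer below is the process-wide singleton shared by JobDispatcher,
+# DataService and UserIF; each of those components is wired to it further down
+# only when panda_config.nDBConnection is non-zero.
+# A rough, hypothetical sketch of the same bootstrap for a standalone script
+# (mirroring dataservice/forkSetupper.py from this patch) would be:
+#
+#   from taskbuffer.Initializer import initializer
+#   from taskbuffer.TaskBuffer import taskBuffer
+#   initializer.init()                                    # dummy cx_Oracle connection
+#   taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)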
+taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,panda_config.nDBConnection,True) + +# initialize JobDispatcher +from jobdispatcher.JobDispatcher import jobDispatcher +if panda_config.nDBConnection != 0: + jobDispatcher.init(taskBuffer) + +# initialize DataService +from dataservice.DataService import dataService +if panda_config.nDBConnection != 0: + dataService.init(taskBuffer) + +# initialize UserIF +from userinterface.UserIF import userIF +if panda_config.nDBConnection != 0: + userIF.init(taskBuffer) + +# import web I/F +allowedMethods = [] + +from taskbuffer.Utils import isAlive,putFile,deleteFile,getServer,updateLog,fetchLog,\ + touchFile,getVomsAttr,putEventPickingRequest,getAttr,getFile +allowedMethods += ['isAlive','putFile','deleteFile','getServer','updateLog','fetchLog', + 'touchFile','getVomsAttr','putEventPickingRequest','getAttr','getFile'] + +from dataservice.DataService import datasetCompleted,updateFileStatusInDisp +allowedMethods += ['datasetCompleted','updateFileStatusInDisp'] + +from jobdispatcher.JobDispatcher import getJob,updateJob,getStatus,genPilotToken +allowedMethods += ['getJob','updateJob','getStatus','genPilotToken'] + +from userinterface.UserIF import submitJobs,getJobStatus,queryPandaIDs,killJobs,reassignJobs,\ + getJobStatistics,getJobStatisticsPerSite,resubmitJobs,queryLastFilesInDataset,getPandaIDsSite,\ + getJobsToBeUpdated,updateProdDBUpdateTimes,runTaskAssignment,getAssigningTask,getSiteSpecs,\ + getCloudSpecs,runBrokerage,seeCloudTask,queryJobInfoPerCloud,registerProxyKey,getProxyKey,\ + getJobIDsInTimeRange,getPandIDsWithJobID,getFullJobStatus,getJobStatisticsForBamboo,\ + getNUserJobs,addSiteAccess,listSiteAccess,getFilesInUseForAnal,updateSiteAccess,\ + getPandaClientVer,getSlimmedFileInfoPandaIDs,runReBrokerage,deleteFilesFromCacheDB,\ + addFilesToCacheDB,flushCacheDB,checkFilesWithCacheDB,getQueuedAnalJobs,getHighestPrioJobStat,\ + getActiveDatasets,setCloudTaskByUser,getSerialNumberForGroupJob,getCachePrefixes,\ + checkMergeGenerationStatus,sendLogInfo,getNumPilots,retryFailedJobsInActive,\ + getJobStatisticsWithLabel,getPandaIDwithJobExeID,getJobStatisticsPerUserSite,\ + getDisInUseForAnal,getLFNsInUseForAnal,getScriptOfflineRunning,setDebugMode,\ + insertSandboxFileInfo,checkSandboxFile,changeJobPriorities +allowedMethods += ['submitJobs','getJobStatus','queryPandaIDs','killJobs','reassignJobs', + 'getJobStatistics','getJobStatisticsPerSite','resubmitJobs','queryLastFilesInDataset','getPandaIDsSite', + 'getJobsToBeUpdated','updateProdDBUpdateTimes','runTaskAssignment','getAssigningTask','getSiteSpecs', + 'getCloudSpecs','runBrokerage','seeCloudTask','queryJobInfoPerCloud','registerProxyKey','getProxyKey', + 'getJobIDsInTimeRange','getPandIDsWithJobID','getFullJobStatus','getJobStatisticsForBamboo', + 'getNUserJobs','addSiteAccess','listSiteAccess','getFilesInUseForAnal','updateSiteAccess', + 'getPandaClientVer','getSlimmedFileInfoPandaIDs','runReBrokerage','deleteFilesFromCacheDB', + 'addFilesToCacheDB','flushCacheDB','checkFilesWithCacheDB','getQueuedAnalJobs','getHighestPrioJobStat', + 'getActiveDatasets','setCloudTaskByUser','getSerialNumberForGroupJob','getCachePrefixes', + 'checkMergeGenerationStatus','sendLogInfo','getNumPilots','retryFailedJobsInActive', + 'getJobStatisticsWithLabel','getPandaIDwithJobExeID','getJobStatisticsPerUserSite', + 'getDisInUseForAnal','getLFNsInUseForAnal','getScriptOfflineRunning','setDebugMode', + 'insertSandboxFileInfo','checkSandboxFile','changeJobPriorities'] + +# import error +import 
taskbuffer.ErrorCode + + +# FastCGI/WSGI entry +if panda_config.useFastCGI or panda_config.useWSGI: + + import os + import cgi + import sys + from pandalogger.PandaLogger import PandaLogger + + # logger + _logger = PandaLogger().getLogger('Entry') + + # dummy request object + class DummyReq: + def __init__(self,env,): + # environ + self.subprocess_env = env + # header + self.headers_in = {} + # content-length + if self.subprocess_env.has_key('CONTENT_LENGTH'): + self.headers_in["content-length"] = self.subprocess_env['CONTENT_LENGTH'] + + # get remote host + def get_remote_host(self): + if self.subprocess_env.has_key('REMOTE_HOST'): + return self.subprocess_env['REMOTE_HOST'] + return "" + + + # application + def application(environ, start_response): + # get method name + methodName = '' + if environ.has_key('SCRIPT_NAME'): + methodName = environ['SCRIPT_NAME'].split('/')[-1] + if panda_config.entryVerbose: + _logger.debug("PID=%s %s in" % (os.getpid(),methodName)) + # check method name + if not methodName in allowedMethods: + _logger.error("PID=%s %s is forbidden" % (os.getpid(),methodName)) + exeRes = "False : %s is forbidden" % methodName + else: + # get method object + tmpMethod = None + try: + exec "tmpMethod = %s" % methodName + except: + pass + # object not found + if tmpMethod == None: + _logger.error("PID=%s %s is undefined" % (os.getpid(),methodName)) + exeRes = "False" + else: + # get params + tmpPars = cgi.FieldStorage(environ['wsgi.input'], environ=environ, + keep_blank_values=1) + # convert to map + params = {} + for tmpKey in tmpPars.keys(): + if tmpPars[tmpKey].file != None and tmpPars[tmpKey].filename != None: + # file + params[tmpKey] = tmpPars[tmpKey] + else: + # string + params[tmpKey] = tmpPars.getfirst(tmpKey) + if panda_config.entryVerbose: + _logger.debug("PID=%s %s with %s" % (os.getpid(),methodName,str(params.keys()))) + # dummy request object + dummyReq = DummyReq(environ) + try: + # exec + exeRes = apply(tmpMethod,[dummyReq],params) + # convert bool to string + if exeRes in [True,False]: + exeRes = str(exeRes) + except: + errType,errValue = sys.exc_info()[:2] + errStr = "" + for tmpKey,tmpVal in environ.iteritems(): + errStr += "%s : %s\n" % (tmpKey,str(tmpVal)) + _logger.error("execution failure : %s %s" % (errType,errValue)) + _logger.error(errStr) + # return internal server error + start_response('500 INTERNAL SERVER ERROR', [('Content-Type', 'text/plain')]) + return ["%s %s" % (errType,errValue)] + if panda_config.entryVerbose: + _logger.debug("PID=%s %s out" % (os.getpid(),methodName)) + # return + if exeRes == taskbuffer.ErrorCode.EC_NotFound: + start_response('404 Not Found', [('Content-Type', 'text/plain')]) + return ['not found'] + elif isinstance(exeRes,taskbuffer.ErrorCode.EC_Redirect): + start_response('302 Redirect', [('Location', exeRes.url)]) + return ['redirect'] + else: + start_response('200 OK', [('Content-Type', 'text/plain')]) + return [exeRes] + + # start server + if panda_config.useFastCGI: + from flup.server.fcgi import WSGIServer + WSGIServer(application,multithreaded=False).run() diff --git a/current/pandaserver/taskbuffer/ArchiveDBProxyPool.py b/current/pandaserver/taskbuffer/ArchiveDBProxyPool.py new file mode 100644 index 000000000..8bfd014b0 --- /dev/null +++ b/current/pandaserver/taskbuffer/ArchiveDBProxyPool.py @@ -0,0 +1,55 @@ +""" +pool for ArchiveDBProxies + +""" + +import time +import Queue +import random +import OraLogDBProxy as LogDBProxy +from config import panda_config + +from pandalogger.PandaLogger import PandaLogger + +# 
logger +_logger = PandaLogger().getLogger('ArchiveDBProxyPool') + +class ArchiveDBProxyPool: + + def __init__(self,nConnection=panda_config.nArchiveDBConnection): + # create Proxies + _logger.debug("init") + self.proxyList = Queue.Queue(nConnection) + for i in range(nConnection): + _logger.debug("connect -> %s " % i) + proxy = LogDBProxy.LogDBProxy() + nTry = 10 + for iTry in range(nTry): + if proxy.connect(dbhost = panda_config.archivedbhost, + dbpasswd = panda_config.archivedbpasswd, + dbuser = panda_config.archivedbuser, + dbname = panda_config.archivedbname): + break + _logger.debug("failed -> %s : try %s" % (i,iTry)) + if iTry+1 == nTry: + raise RuntimeError, 'ArchiveDBProxyPool.__init__ failed' + time.sleep(random.randint(10,20)) + self.proxyList.put(proxy) + time.sleep(1) + _logger.debug("ready") + + # return a free proxy. this method blocks until a proxy is available + def getProxy(self): + # get proxy + proxy = self.proxyList.get() + # wake up connection + proxy.wakeUp() + # return + return proxy + + + # put back a proxy + def putProxy(self,proxy): + # put + self.proxyList.put(proxy) + diff --git a/current/pandaserver/taskbuffer/CloudSpec.py b/current/pandaserver/taskbuffer/CloudSpec.py new file mode 100644 index 000000000..bfb1927d3 --- /dev/null +++ b/current/pandaserver/taskbuffer/CloudSpec.py @@ -0,0 +1,27 @@ +""" +cloud specification + +""" + +class CloudSpec(object): + # attributes + _attributes = ('name','tier1','tier1SE','relocation','weight','server','status','transtimelo', + 'transtimehi','waittime','validation','mcshare','countries','fasttrack','nprestage', + 'pilotowners') + + # constructor + def __init__(self): + # install attributes + for attr in self._attributes: + setattr(self,attr,None) + + # serialize + def __str__(self): + str = '' + for attr in self._attributes: + str += '%s:%s ' % (attr,getattr(self,attr)) + return str + + + + diff --git a/current/pandaserver/taskbuffer/CloudTaskSpec.py b/current/pandaserver/taskbuffer/CloudTaskSpec.py new file mode 100644 index 000000000..8fade3ce1 --- /dev/null +++ b/current/pandaserver/taskbuffer/CloudTaskSpec.py @@ -0,0 +1,99 @@ +""" +cloud/task specification + +""" + +class CloudTaskSpec(object): + # attributes + _attributes = ('id','taskname','taskid','cloud','status','tmod','tenter') + # slots + __slots__ = _attributes + + + # constructor + def __init__(self): + # install attributes + for attr in self._attributes: + setattr(self,attr,None) + + + # override __getattribute__ for SQL and PandaID + def __getattribute__(self,name): + ret = object.__getattribute__(self,name) + if ret == None: + return "NULL" + return ret + + + # return a tuple of values + def values(self): + ret = [] + for attr in self._attributes: + val = getattr(self,attr) + ret.append(val) + return tuple(ret) + + + # pack tuple into CloudTaskSpec + def pack(self,values): + for i in range(len(self._attributes)): + attr= self._attributes[i] + val = values[i] + setattr(self,attr,val) + + + # return state values to be pickled + def __getstate__(self): + state = [] + for attr in self._attributes: + val = getattr(self,attr) + state.append(val) + return state + + + # restore state from the unpickled state values + def __setstate__(self,state): + for i in range(len(self._attributes)): + if i+1 < len(state): + setattr(self,self._attributes[i],state[i]) + else: + setattr(self,self._attributes[i],'NULL') + + + # return column names for INSERT + def columnNames(cls): + ret = "" + for attr in cls._attributes: + if ret != "": + ret += ',' + ret += attr + return ret + 
columnNames = classmethod(columnNames) + + + # return expression of values for INSERT + def valuesExpression(cls): + ret = "VALUES(" + for attr in cls._attributes: + ret += "%s" + if attr != cls._attributes[len(cls._attributes)-1]: + ret += "," + ret += ")" + return ret + valuesExpression = classmethod(valuesExpression) + + + # return an expression for UPDATE + def updateExpression(cls): + ret = "" + for attr in cls._attributes: + ret = ret + attr + "=%s" + if attr != cls._attributes[len(cls._attributes)-1]: + ret += "," + return ret + updateExpression = classmethod(updateExpression) + + + + + diff --git a/current/pandaserver/taskbuffer/CloudURLMap.py b/current/pandaserver/taskbuffer/CloudURLMap.py new file mode 100644 index 000000000..27bdce567 --- /dev/null +++ b/current/pandaserver/taskbuffer/CloudURLMap.py @@ -0,0 +1,36 @@ +# cloud to Panda server's URL mapping +cloudURLMap = { + 'CA' : { + 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', + 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', + }, + 'ES' : { + 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', + 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', + }, + 'FR' : { + 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', + 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', + }, + 'IT' : { + 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', + 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', + }, + 'NL' : { + 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', + 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', + }, + 'TW' : { + 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', + 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', + }, + 'UK' : { + 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', + 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', + }, + 'US' : { + 'http' : 'http://gridui01.usatlas.bnl.gov:25080/server/panda', + 'https' : 'https://gridui01.usatlas.bnl.gov:25443/server/panda', + }, + } + diff --git a/current/pandaserver/taskbuffer/ConBridge.py b/current/pandaserver/taskbuffer/ConBridge.py new file mode 100644 index 000000000..3f4fd1abd --- /dev/null +++ b/current/pandaserver/taskbuffer/ConBridge.py @@ -0,0 +1,502 @@ +import os +import re +import sys +import time +import types +import socket +import signal +import random +import threading +import cPickle as pickle + +import OraDBProxy as DBProxy + +from config import panda_config +from JobSpec import JobSpec +from FileSpec import FileSpec +from DatasetSpec import DatasetSpec +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('ConBridge') + + +# exception for normal termination +class HarmlessEx(Exception): + pass + + +# terminate child process by itself when master has gone +class Terminator (threading.Thread): + + # constructor + def __init__(self,consock): + threading.Thread.__init__(self) + self.consock = consock + + + # main + def run(self): + # watching control socket + try: + rcvSize = self.consock.recv(1) + except: + pass + # get PID + pid = os.getpid() + _logger.debug("child %s received termination" % pid) + # kill + try: + os.kill(pid,signal.SIGTERM) + except: + pass + try: + os.kill(pid,signal.SIGKILL) + except: + pass + + + +# connection bridge with with timeout +class ConBridge (object): + + # constructor + def __init__(self): + self.child_pid = 0 + self.isMaster = False + self.mysock = None + self.consock = None + self.pid 
= os.getpid() + # timeout + if hasattr(panda_config,'dbtimeout'): + self.timeout = int(panda_config.dbtimeout) + else: + self.timeout = 600 + # verbose + if hasattr(panda_config,'dbbridgeverbose'): + self.verbose = panda_config.dbbridgeverbose + else: + self.verbose = False + + + # destructor + def __del__(self): + # kill old child process + self.bridge_killChild() + + + # connect + def connect(self,dbhost=panda_config.dbhost,dbpasswd=panda_config.dbpasswd, + dbuser=panda_config.dbuser,dbname=panda_config.dbname, + dbtimeout=None,reconnect=False): + # kill old child process + self.bridge_killChild() + _logger.debug('master %s connecting' % self.pid) + # reset child PID and sockets + self.child_pid = 0 + self.mysock = None + self.consock = None + # create socket + datpair = socket.socketpair() + conpair = socket.socketpair() + # fork + self.child_pid = os.fork() + if self.child_pid == 0: + # child + self.isMaster = False + self.pid = os.getpid() + # keep socket + self.mysock = datpair[1] + self.consock = conpair[1] + datpair[0].close() + conpair[0].close() + # connect to database + _logger.debug('child %s connecting to database' % self.pid) + self.proxy = DBProxy.DBProxy() + if not self.proxy.connect(dbhost=dbhost,dbpasswd=dbpasswd,dbtimeout=60): + _logger.error('child %s failed to connect' % self.pid) + # send error + self.bridge_sendError((RuntimeError,'child %s connection failed' % self.pid)) + # exit + self.bridge_childExit() + # send OK just for ACK + _logger.debug('child %s connection is ready' % self.pid) + self.bridge_sendResponse(None) + # start terminator + Terminator(self.consock).start() + # go main loop + _logger.debug('child %s going into the main loop' % self.pid) + self.bridge_run() + # exit + self.bridge_childExit(0) + else: + # master + self.isMaster = True + # keep socket + self.mysock = datpair[0] + self.consock = conpair[0] + datpair[1].close() + conpair[1].close() + try: + # get ACK + _logger.debug('master %s waiting ack from child=%s' % (self.pid,self.child_pid)) + self.bridge_getResponse() + _logger.debug('master %s got ready from child=%s' % (self.pid,self.child_pid)) + return True + except: + errType,errValue = sys.exc_info()[:2] + _logger.error('master %s failed to setup child=%s : %s %s' % \ + (self.pid,self.child_pid,errType,errValue)) + # kill child + self.bridge_killChild() + return False + + + + ####################### + # communication methods + + # send packet + def bridge_send(self,val): + try: + # set timeout + if self.isMaster: + self.mysock.settimeout(self.timeout) + # serialize + tmpStr = pickle.dumps(val) + # send size + self.mysock.sendall("%50s" % len(tmpStr)) + # send body + self.mysock.sendall(tmpStr) + # set timeout back + if self.isMaster: + self.mysock.settimeout(None) + except: + errType,errValue = sys.exc_info()[:2] + if self.isMaster: + roleType = 'master' + else: + roleType = 'child ' + _logger.error('%s %s send error : val=%s - %s %s' % \ + (roleType,self.pid,str(val),errType,errValue)) + # terminate child + if not self.isMaster: + self.bridge_childExit() + raise errType,errValue + + + # receive packet + def bridge_recv(self): + try: + # set timeout + if self.isMaster: + self.mysock.settimeout(self.timeout) + # get size + strSize = '' + headSize = 50 + while len(strSize) < headSize: + tmpSize = headSize - len(strSize) + tmpStr = self.mysock.recv(tmpSize) + if tmpStr == '': + if self.isMaster: + raise socket.error,'empty packet' + else: + # master closed socket + raise HarmlessEx,'empty packet' + strSize += tmpStr + # get body + strBody = 
'' + bodySize = long(strSize) + while len(strBody) < bodySize: + tmpSize = bodySize - len(strBody) + tmpStr = self.mysock.recv(tmpSize) + if tmpStr == '': + if self.isMaster: + raise socket.error,'empty packet' + else: + # master closed socket + raise HarmlessEx,'empty packet' + strBody += tmpStr + # set timeout back + if self.isMaster: + self.mysock.settimeout(None) + # deserialize + retVal = pickle.loads(strBody) + return True,retVal + except: + if self.isMaster: + roleType = 'master' + else: + roleType = 'child ' + errType,errValue = sys.exc_info()[:2] + if errType == HarmlessEx: + _logger.debug('%s %s recv harmless ex : %s' % \ + (roleType,self.pid,errValue)) + else: + _logger.error('%s %s recv error : %s %s' % \ + (roleType,self.pid,errType,errValue)) + # terminate child + if not self.isMaster: + self.bridge_childExit() + raise errType,errValue + + + + ####################### + # child's methods + + # send error + def bridge_sendError(self,val): + # send status + self.bridge_send("NG") + # check if pickle-able + try: + pickle.dumps(val) + except: + # use RuntimeError + val = (RuntimeError,str(val[-1])) + # send exceptions + self.bridge_send(val) + + + # send response + def bridge_sendResponse(self,val): + # send status + self.bridge_send("OK") + # send response + self.bridge_send(val) + + + # termination of child + def bridge_childExit(self,exitCode=1): + if not self.isMaster: + _logger.debug("child %s closing sockets" % self.pid) + # close sockets + try: + self.mysock.shutdown(socket.SHUT_RDWR) + except: + pass + try: + self.consock.shutdown(socket.SHUT_RDWR) + except: + pass + # exit + _logger.debug("child %s going to exit" % self.pid) + os._exit(exitCode) + + + # child main + def bridge_run(self): + comStr = '' + while True: + try: + # get command + status,comStr = self.bridge_recv() + if not status: + raise RuntimeError,'invalid command' + # get variables + status,variables = self.bridge_recv() + if not status: + raise RuntimeError,'invalid variables' + except: + errType,errValue = sys.exc_info()[:2] + _logger.error('child %s died : %s %s' % (self.pid,errType,errValue)) + # exit + self.bridge_childExit() + if self.verbose: + _logger.debug('child %s method %s executing' % (self.pid,comStr)) + try: + # execute + method = getattr(self.proxy,comStr) + res = apply(method,variables[0],variables[1]) + # FIXME : modify response since cx_Oracle types cannot be picked + if comStr in ['querySQLS']: + newRes = [True]+res[1:] + res = newRes + if self.verbose: + _logger.debug('child %s method %s completed' % (self.pid,comStr)) + # return + self.bridge_sendResponse((res,variables[0],variables[1])) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error('child %s method %s failed : %s %s' % (self.pid,comStr,errType,errValue)) + if errType in [socket.error,socket.timeout]: + _logger.error('child %s died : %s %s' % (self.pid,errType,errValue)) + # exit + self.bridge_childExit() + # send error + self.bridge_sendError((errType,errValue)) + + + + ####################### + # master's methods + + # kill child + def bridge_killChild(self): + # kill old child process + if self.child_pid != 0: + # close sockets + _logger.debug('master %s closing sockets for child=%s' % (self.pid,self.child_pid)) + try: + if self.mysock != None: + self.mysock.shutdown(socket.SHUT_RDWR) + except: + pass + try: + if self.consock != None: + self.consock.shutdown(socket.SHUT_RDWR) + except: + pass + _logger.debug('master %s killing child=%s' % (self.pid,self.child_pid)) + # send SIGTERM + try: + 
os.kill(self.child_pid,signal.SIGTERM) + except: + pass + time.sleep(2) + # send SIGKILL + try: + os.kill(self.child_pid,signal.SIGKILL) + except: + pass + # wait for completion of child + _logger.debug('master %s waiting child=%s' % (self.pid,self.child_pid)) + try: + os.waitpid(self.child_pid,0) + except: + pass + # sleep to avoid burst reconnection + time.sleep(random.randint(5,15)) + _logger.debug('master %s killed child=%s' % (self.pid,self.child_pid)) + + + # get responce + def bridge_getResponse(self): + # get status + status,strStatus = self.bridge_recv() + if not status: + raise RuntimeError,'master %s got invalid status response from child=%s' % \ + (self.pid,self.child_pid) + if strStatus == 'OK': + # return res + status,ret = self.bridge_recv() + if not status: + raise RuntimeError,'master %s got invalid response body from child=%s' % \ + (self.pid,self.child_pid) + return ret + elif strStatus == 'NG': + # raise error + status,ret = self.bridge_recv() + if not status: + raise RuntimeError,'master %s got invalid response value from child=%s' % \ + (self.pid,self.child_pid) + raise ret[0],ret[1] + else: + raise RuntimeError,'master %s got invalid response from child=%s : %s' % \ + (self.pid,self.child_pid,str(strStatus)) + + + # method wrapper class + class bridge_masterMethod: + + # constructor + def __init__(self,name,parent): + self.name = name + self.parent = parent + self.pid = os.getpid() + + + # copy changes in taskbuff objects to master + def copyTbObjChanges(self,oldPar,newPar): + # check they have the same type + if type(oldPar) != type(newPar): + return False + # copy some Specs since they are passed via ref's + if isinstance(oldPar,JobSpec) or isinstance(oldPar,FileSpec) \ + or isinstance(oldPar,DatasetSpec): + if hasattr(oldPar,'__getstate__'): + tmpStat = newPar.__getstate__() + oldPar.__setstate__(tmpStat) + else: + tmpStat = newPar.values() + oldPar.pack(tmpStat) + return True + # copy Datasets + return False + + + # copy changes in objects to master + def copyChanges(self,oldPar,newPar): + if isinstance(oldPar,types.ListType): + # delete all elements first + while len(oldPar) > 0: + oldPar.pop() + # append + for tmpItem in newPar: + oldPar.append(tmpItem) + elif isinstance(oldPar,types.DictType): + # replace + for tmpKey in newPar.keys(): + oldPar[tmpKey] = newPar[tmpKey] + else: + self.copyTbObjChanges(oldPar,newPar) + + + # method emulation + def __call__(self,*args,**keywords): + while True: + try: + # send command name + self.parent.bridge_send(self.name) + # send variables + self.parent.bridge_send((args,keywords)) + # get response + retVal,newArgs,newKeywords = self.parent.bridge_getResponse() + # propagate child's changes in args to master + for idxArg,tmpArg in enumerate(args): + self.copyChanges(tmpArg,newArgs[idxArg]) + # propagate child's changes in keywords to master + for tmpKey,tmpArg in keywords.iteritems(): + self.copyChanges(tmpArg,newKeywords[tmpKey]) + # return + return retVal + except: + errType,errValue = sys.exc_info()[:2] + _logger.error('master %s method %s failed : %s %s' % \ + (self.pid,self.name,errType,errValue)) + # reconnect when socket has a problem + if not errType in [socket.error,socket.timeout]: + # kill old child process + self.parent.bridge_killChild() + _logger.error('master %s killed child' % self.pid) + #raise errType,errValue + # sleep + time.sleep(5) + # reconnect + try: + _logger.debug('master %s trying to reconnect' % self.pid) + self.parent.connect() + _logger.debug('master %s reconnect completed' % self.pid) + except: 
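+                        # reconnection failed as well; the enclosing while-loop keeps
+                        # the original request and retries it on the next iteration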
+ _logger.error('master %s connect failed' % self.pid) + + + # get atter for cursor attributes + def __getattribute__(self,name): + if object.__getattribute__(self,'isMaster'): + try: + # return origianl attribute + return object.__getattribute__(self,name) + except: + # append methods + if not name.startswith('_') and hasattr(DBProxy.DBProxy,name) and \ + isinstance(getattr(DBProxy.DBProxy,name),types.UnboundMethodType): + # get DBProxy's method wrapper + method = ConBridge.bridge_masterMethod(name,self) + # set method + setattr(self,name,method) + # return + return method + # return origianl attribute for child + return object.__getattribute__(self,name) diff --git a/current/pandaserver/taskbuffer/DBProxy.py b/current/pandaserver/taskbuffer/DBProxy.py new file mode 100755 index 000000000..9d0981e15 --- /dev/null +++ b/current/pandaserver/taskbuffer/DBProxy.py @@ -0,0 +1,3066 @@ +""" +proxy for database connection + +""" + +import re +import os +import sys +import time +import fcntl +import random +import urllib +import MySQLdb +import datetime +import commands +import traceback +import warnings +import ErrorCode +from JobSpec import JobSpec +from FileSpec import FileSpec +from DatasetSpec import DatasetSpec +from CloudTaskSpec import CloudTaskSpec +from pandalogger.PandaLogger import PandaLogger +from config import panda_config +from brokerage.PandaSiteIDs import PandaSiteIDs + +warnings.filterwarnings('ignore') + +# logger +_logger = PandaLogger().getLogger('DBProxy') + +# lock file +_lockGetSN = open(panda_config.lockfile_getSN, 'w') +_lockSetDS = open(panda_config.lockfile_setDS, 'w') +_lockGetCT = open(panda_config.lockfile_getCT, 'w') + + +# proxy +class DBProxy: + + # constructor + def __init__(self): + # connection object + self.conn = None + # cursor object + self.cur = None + # host name + self.hostname = None + # retry count + self.nTry = 5 + + # connect to DB + def connect(self,dbhost=panda_config.dbhost,dbpasswd=panda_config.dbpasswd, + dbuser=panda_config.dbuser,dbname=panda_config.dbname, + dbtimeout=None,reconnect=False): + # keep parameters for reconnect + if not reconnect: + self.dbhost = dbhost + self.dbpasswd = dbpasswd + self.dbuser = dbuser + self.dbname = dbname + self.dbtimeout = dbtimeout + # connect + try: + if self.dbtimeout == None: + self.conn = MySQLdb.connect(host=self.dbhost,user=self.dbuser, + passwd=self.dbpasswd,db=self.dbname) + else: + self.conn = MySQLdb.connect(host=self.dbhost,user=self.dbuser, + passwd=self.dbpasswd,db=self.dbname, + connect_timeout=self.dbtimeout) + self.cur=self.conn.cursor() + # get hostname + self.cur.execute('SELECT USER()') + res = self.cur.fetchone() + match = re.search('^([^@]+)@([^@]+)$',res[0]) + if match != None: + self.hostname = match.group(2) + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("connect : %s %s" % (type,value)) + # roll back + self._rollback() + return False + + + # query an SQL + def querySQL(self,sql): + comment = ' /* DBProxy.querySQL */' + try: + _logger.debug("querySQL : %s " % sql) + # begin transaction + self.cur.execute("START TRANSACTION") + self.cur.execute(sql+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return res + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("querySQL : %s " % sql) + _logger.error("querySQL : %s %s" % (type,value)) + return None + + + # query an SQL return Status + def querySQLS(self,sql): + comment = ' /* DBProxy.querySQLS 
*/' + try: + # begin transaction + self.cur.execute("SET AUTOCOMMIT=1") + ret = self.cur.execute(sql+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return ret,res + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("querySQLS : %s " % sql) + _logger.error("querySQLS : %s %s" % (type,value)) + return -1,None + + + # query an SQL with list return Status + def querySQLwList(self,sql,valList): + comment = ' /* DBProxy.querySQLwList */' + try: + # begin transaction + self.cur.execute("SET AUTOCOMMIT=1") + ret = self.cur.execute(sql+comment,valList) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return ret,res + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("querySQLwList : %s %s" % (sql,str(valList))) + _logger.error("querySQLwList : %s %s" % (type,value)) + return -1,None + + + # insert job to jobsDefined + def insertNewJob(self,job,user,serNum,weight=0.0,priorityOffset=0,userVO=None): + comment = ' /* DBProxy.insertNewJob */' + sql1 = "INSERT INTO jobsDefined4 (%s) " % JobSpec.columnNames() + sql1+= JobSpec.valuesExpression() + # make sure PandaID is NULL + job.PandaID = None + # job status + job.jobStatus='defined' + # host and time information + job.modificationHost = self.hostname + job.creationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + job.modificationTime = job.creationTime + job.stateChangeTime = job.creationTime + # DN + if job.prodUserID == "NULL" or job.prodSourceLabel in ['user','panda']: + job.prodUserID = user + # VO + job.VO = userVO + # priority + if job.assignedPriority != 'NULL': + job.currentPriority = job.assignedPriority + if job.prodSourceLabel == 'user': + job.currentPriority = 1000 + priorityOffset - (serNum / 5) - int(100 * weight) + elif job.prodSourceLabel == 'panda': + job.currentPriority = 2000 + priorityOffset + # usergroup + if job.prodSourceLabel == 'regional': + job.computingSite= "BNLPROD" + try: + # begin transaction + self.cur.execute("START TRANSACTION") + # insert + retI = self.cur.execute(sql1+comment, job.values()) + # set PandaID + job.PandaID = self.conn.insert_id() + # insert files + _logger.debug("insertNewJob : %s Label : %s ret : %s" % (job.PandaID,job.prodSourceLabel,retI)) + sqlFile = "INSERT INTO filesTable4 (%s) " % FileSpec.columnNames() + sqlFile+= FileSpec.valuesExpression() + for file in job.Files: + file.rowID = None + if file.status != 'ready': + file.status='unknown' + # replace $PANDAID with real PandaID + file.lfn = re.sub('\$PANDAID', '%05d' % job.PandaID, file.lfn) + self.cur.execute(sqlFile+comment, file.values()) + # get rowID + file.rowID = self.conn.insert_id() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("insertNewJob : %s File OK" % job.PandaID) + # update job info in MonALISA - Job Defined. 
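The currentPriority assignment above boils down to a small piece of arithmetic; the sketch below restates it as a standalone function, assuming serNum and weight carry the same meaning as the insertNewJob arguments (compute_priority is an illustrative name, not something defined in this patch).

# Illustrative sketch (not part of the patch): the priority arithmetic applied
# above to newly inserted jobs, written as a standalone function.
def compute_priority(prod_source_label, priority_offset=0, ser_num=0, weight=0.0):
    if prod_source_label == 'user':
        # analysis jobs: later jobs of the same user (higher serNum) and
        # heavier users (larger weight) get progressively lower priority
        return 1000 + priority_offset - (ser_num // 5) - int(100 * weight)
    if prod_source_label == 'panda':
        # internal panda service jobs sit above user jobs
        return 2000 + priority_offset
    return None  # other labels keep the assignedPriority set earlier in insertNewJob

# e.g. the 12th job of a user with weight 0.3 -> 1000 - 2 - 30 = 968
assert compute_priority('user', ser_num=12, weight=0.3) == 968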
+ #aThr = apmonInterface(job) + #aThr.start() + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("insertNewJob : %s %s" % (type,value)) + # roll back + self._rollback() + return False + + + # simply insert job to a table + def insertJobSimple(self,job,table,fileTable): + comment = ' /* DBProxy.insertJobSimple */' + _logger.debug("insertJobSimple : %s" % job.PandaID) + sql1 = "INSERT INTO %s (%s) " % (table,JobSpec.columnNames()) + sql1+= JobSpec.valuesExpression() + try: + # begin transaction + self.cur.execute("START TRANSACTION") + # insert + self.cur.execute(sql1+comment, job.values()) + # files + sqlFile = "INSERT INTO %s " % fileTable + sqlFile+= "(%s) " % FileSpec.columnNames() + sqlFile+= FileSpec.valuesExpression() + for file in job.Files: + self.cur.execute(sqlFile+comment, file.values()) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("insertJobSimple : %s %s" % (type,value)) + # roll back + self._rollback() + return False + + + # activate job. move job from jobsDefined to jobsActive + def activateJob(self,job): + comment = ' /* DBProxy.activateJob */' + if job==None: + _logger.debug("activateJob : None") + return True + _logger.debug("activateJob : %s" % job.PandaID) + sql0 = "SELECT rowID FROM filesTable4 WHERE PandaID=%s AND type=%s AND status!=%s" + sql1 = "UPDATE jobsDefined4 SET jobStatus='activated' " + sql1+= "WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined') AND commandToPilot<>'tobekilled'" + sql2 = "INSERT INTO jobsActive4 (%s) " % JobSpec.columnNames() + sql2+= JobSpec.valuesExpression() + # host and time information + job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + # set stateChangeTime for defined->activated but not for assigned->activated + if job.jobStatus in ['defined']: + job.stateChangeTime = job.modificationTime + nTry=3 + for iTry in range(nTry): + try: + # check if all files are ready + allOK = True + for file in job.Files: + if file.type == 'input' and file.status != 'ready': + allOK = False + break + # begin transaction + self.cur.execute("START TRANSACTION") + # check all inputs are ready + self.cur.execute(sql0+comment, (job.PandaID,"input","ready")) + res = self.cur.fetchall() + if len(res) == 0 or allOK: + # change status + job.jobStatus = "activated" + # update. 
Not delete for InnoDB + n = self.cur.execute(sql1+comment, (job.PandaID,)) + if n==0: + # already killed or activated + _logger.debug("activateJob : Not found %s" % job.PandaID) + else: + # insert + self.cur.execute(sql2+comment, job.values()) + # update files + sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" + for file in job.Files: + self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) + else: + # update job + sqlJ = ("UPDATE jobsDefined4 SET %s " % JobSpec.updateExpression()) + \ + "WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')" + n = self.cur.execute(sqlJ+comment, job.values()+(job.PandaID,)) + if n==0: + # already killed or activated + _logger.debug("activateJob : Not found %s" % job.PandaID) + else: + # update files + sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" + for file in job.Files: + self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("activateJob : %s retry : %s" % (job.PandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("activateJob : %s %s" % (type,value)) + return False + + + # send job to jobsWaiting + def keepJob(self,job): + comment = ' /* DBProxy.keepJob */' + _logger.debug("keepJob : %s" % job.PandaID) + sql1 = "UPDATE jobsDefined4 SET jobStatus='waiting' " + sql1+= "WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined') AND commandToPilot<>'tobekilled'" + sql2 = "INSERT INTO jobsWaiting4 (%s) " % JobSpec.columnNames() + sql2+= JobSpec.valuesExpression() + # time information + job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + job.stateChangeTime = job.modificationTime + nTry=3 + for iTry in range(nTry): + try: + # begin transaction + self.cur.execute("START TRANSACTION") + # delete + n = self.cur.execute(sql1+comment, (job.PandaID,)) + if n==0: + # already killed + _logger.debug("keepJob : Not found %s" % job.PandaID) + else: + # set status + job.jobStatus = 'waiting' + # insert + self.cur.execute(sql2+comment, job.values()) + # update files + sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" + for file in job.Files: + self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # update job info in MonALISA - Job sent to waiting state + #aThr = apmonInterface(job) + #aThr.start() + return True + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("keepJob : %s retry : %s" % (job.PandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("keepJob : %s %s" % (type,value)) + return False + + + # archive job to jobArchived and remove the job from jobsActive or jobsDefined + def archiveJob(self,job,fromJobsDefined): + comment = ' /* DBProxy.archiveJob */' + _logger.debug("archiveJob : %s" % job.PandaID) + if fromJobsDefined: + sql1 = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')" + else: + sql1 = "DELETE FROM jobsActive4 WHERE PandaID=%s" + sql2 = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames() + sql2+= JobSpec.valuesExpression() + nTry=3 + for iTry in range(nTry): + try: + # begin transaction + self.cur.execute("START 
TRANSACTION") + # delete + n = self.cur.execute(sql1+comment, (job.PandaID,)) + if n==0: + # already killed + _logger.debug("archiveJob : Not found %s" % job.PandaID) + else: + # insert + job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + job.stateChangeTime = job.modificationTime + if job.endTime == 'NULL': + job.endTime = job.modificationTime + self.cur.execute(sql2+comment, job.values()) + # update files + sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" + for file in job.Files: + self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) + # delete downstream jobs + ddmIDs = [] + newJob = None + ddmAttempt = 0 + if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed': + # look for outputs + upOutputs = [] + for file in job.Files: + if file.type == 'output': + upOutputs.append(file.lfn) + # look for downstream jobs + sqlD = "SELECT PandaID FROM filesTable4 WHERE type='input' AND lfn='%s' GROUP BY PandaID" + sqlDJS = "SELECT %s " % JobSpec.columnNames() + sqlDJS+= "FROM jobsDefined4 WHERE PandaID=%s" + sqlDJD = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s" + sqlDJI = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames() + sqlDJI+= JobSpec.valuesExpression() + for upFile in upOutputs: + _logger.debug("look for downstream jobs for %s" % upFile) + # select PandaID + self.cur.execute((sqlD+comment) % upFile) + res = self.cur.fetchall() + for downID in res: + _logger.debug("delete : %s" % downID) + # select jobs + self.cur.execute((sqlDJS+comment) % downID) + resJob = self.cur.fetchall() + if len(resJob) == 0: + continue + # instantiate JobSpec + dJob = JobSpec() + dJob.pack(resJob[0]) + # delete + retD = self.cur.execute((sqlDJD+comment) % downID) + if retD == 0: + continue + # error code + dJob.jobStatus = 'failed' + dJob.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + dJob.taskBufferErrorCode = ErrorCode.EC_Kill + dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed' + dJob.modificationTime = dJob.endTime + dJob.stateChangeTime = dJob.endTime + # insert + self.cur.execute(sqlDJI+comment, dJob.values()) + elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='dis': + # get corresponding jobs for production movers + vuid = '' + # extract vuid + match = re.search('--callBack (\S+)',job.jobParameters) + if match != None: + try: + callbackUrl = urllib.unquote(match.group(1)) + callbackUrl = re.sub('[&\?]',' ', callbackUrl) + # look for vuid= + for item in callbackUrl.split(): + if item.startswith('vuid='): + vuid = item.split('=')[-1] + break + except: + pass + if vuid == '': + _logger.error("cannot extract vuid from %s" % job.jobParameters) + else: + # get name + self.cur.execute(("SELECT name FROM Datasets WHERE vuid='%s' AND type='dispatch'" % vuid)+comment) + res = self.cur.fetchall() + if len(res) != 0: + disName = res[0] + # get PandaIDs + self.cur.execute(("SELECT PandaID FROM jobsDefined4 WHERE dispatchDBlock='%s' AND jobStatus='assigned'" % disName)+comment) + resDDM = self.cur.fetchall() + for tmpID, in resDDM: + ddmIDs.append(tmpID) + # get offset + ddmAttempt = job.attemptNr + _logger.debug("get PandaID for reassign : %s ddmAttempt=%s" % (str(ddmIDs),ddmAttempt)) + elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='ddm' and job.attemptNr<2 \ + and job.commandToPilot != 'tobekilled': + # instantiate new mover to retry subscription + newJob = JobSpec() + newJob.jobDefinitionID = job.jobDefinitionID 
+ newJob.jobName = job.jobName + newJob.attemptNr = job.attemptNr + 1 + newJob.transformation = job.transformation + newJob.destinationDBlock = job.destinationDBlock + newJob.destinationSE = job.destinationSE + newJob.currentPriority = job.currentPriority + newJob.prodSourceLabel = job.prodSourceLabel + newJob.prodUserID = job.prodUserID + newJob.computingSite = job.computingSite + newJob.transferType = job.transferType + newJob.sourceSite = job.sourceSite + newJob.destinationSite = job.destinationSite + newJob.jobParameters = job.jobParameters + if job.Files != []: + file = job.Files[0] + fileOL = FileSpec() + # add attempt nr + fileOL.lfn = re.sub("\.\d+$","",file.lfn) + fileOL.lfn = "%s.%d" % (fileOL.lfn,job.attemptNr) + fileOL.destinationDBlock = file.destinationDBlock + fileOL.destinationSE = file.destinationSE + fileOL.dataset = file.dataset + fileOL.type = file.type + newJob.addFile(fileOL) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True,ddmIDs,ddmAttempt,newJob + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("archiveJob : %s retry : %s" % (job.PandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("archiveJob : %s" % job.PandaID) + _logger.error("archiveJob : %s %s" % (type,value)) + return False,[],0,None + + + # overload of archiveJob + def archiveJobLite(self,pandaID,jobStatus,param): + comment = ' /* DBProxy.archiveJobLite */' + _logger.debug("archiveJobLite : %s" % pandaID) + sql1 = "SELECT %s FROM jobsActive4 " % JobSpec.columnNames() + sql1+= "WHERE PandaID=%s" + sql2 = "DELETE FROM jobsActive4 WHERE PandaID=%s" + sql3 = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames() + sql3+= JobSpec.valuesExpression() + nTry=3 + for iTry in range(nTry): + try: + # begin transaction + self.cur.execute("START TRANSACTION") + # select + self.cur.execute(sql1+comment, (pandaID,)) + res = self.cur.fetchall() + if len(res) == 0: + _logger.error("archiveJobLite() : PandaID %d not found" % pandaID) + self._rollback() + return False + job = JobSpec() + job.pack(res[0]) + job.jobStatus = jobStatus + for key in param.keys(): + if param[key] != None: + setattr(job,key,param[key]) + job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + job.endTime = job.modificationTime + job.stateChangeTime = job.modificationTime + # delete + n = self.cur.execute(sql2+comment, (job.PandaID,)) + if n==0: + # already killed + _logger.debug("archiveJobLite : Not found %s" % pandaID) + else: + # insert + self.cur.execute(sql3+comment, job.values()) + # update files + sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" + for file in job.Files: + self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) + # delete downstream jobs + if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed': + # file select + sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() + sqlFile+= "WHERE PandaID=%s" + self.cur.execute(sqlFile+comment, (job.PandaID,)) + resFs = self.cur.fetchall() + for resF in resFs: + file = FileSpec() + file.pack(resF) + job.addFile(file) + # look for outputs + upOutputs = [] + for file in job.Files: + if file.type == 'output': + upOutputs.append(file.lfn) + # look for downstream jobs + sqlD = "SELECT PandaID FROM filesTable4 WHERE type='input' AND lfn='%s' GROUP BY PandaID" + sqlDJS = "SELECT %s " % JobSpec.columnNames() + sqlDJS+= "FROM jobsDefined4 WHERE PandaID=%s" + sqlDJD = "UPDATE jobsDefined4 
SET jobStatus='failed' WHERE PandaID=%s" + sqlDJI = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames() + sqlDJI+= JobSpec.valuesExpression() + for upFile in upOutputs: + _logger.debug("look for downstream jobs for %s" % upFile) + # select PandaID + self.cur.execute((sqlD+comment) % upFile) + res = self.cur.fetchall() + for downID in res: + _logger.debug("delete : %s" % downID) + # select jobs + self.cur.execute((sqlDJS+comment) % downID) + resJob = self.cur.fetchall() + if len(resJob) == 0: + continue + # instantiate JobSpec + dJob = JobSpec() + dJob.pack(resJob[0]) + # delete + retD = self.cur.execute((sqlDJD+comment) % downID) + if retD == 0: + continue + # error code + dJob.jobStatus = 'failed' + dJob.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + dJob.taskBufferErrorCode = ErrorCode.EC_Kill + dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed' + dJob.modificationTime = dJob.endTime + dJob.stateChangeTime = dJob.endTime + # insert + self.cur.execute((sqlDJI+comment), dJob.values()) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("archiveJobLite : %s retry : %s" % (pandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("archiveJobLite : %s %s" % (type,value)) + return False + + + # update Job status in jobsActive + def updateJobStatus(self,pandaID,jobStatus,param): + comment = ' /* DBProxy.updateJobStatus */' + _logger.debug("updateJobStatus : %s" % pandaID) + sql1 = "UPDATE jobsActive4 SET jobStatus=%s,modificationTime=UTC_TIMESTAMP()" + if jobStatus in ['starting']: + sql1 += ",stateChangeTime=UTC_TIMESTAMP()" + values = [jobStatus] + for key in param.keys(): + if param[key] != None: + sql1 = sql1 + (',%s=' % key) + '%s' + values.append(param[key]) + sql1 += " WHERE PandaID=%s" + values.append(pandaID) + nTry=3 + for iTry in range(nTry): + try: + # begin transaction + self.cur.execute("START TRANSACTION") + # update + self.cur.execute (sql1+comment,tuple(values)) + # get command + self.cur.execute ('SELECT commandToPilot,endTime FROM jobsActive4 WHERE PandaID=%s'+comment,(pandaID,)) + res = self.cur.fetchone() + if res != None: + ret = res[0] + # update endTime + endTime = res[1] + if jobStatus == 'holding' and endTime==None: + self.cur.execute ("UPDATE jobsActive4 SET endTime=UTC_TIMESTAMP() WHERE PandaID=%s"+comment,(pandaID,)) + else: + # already deleted + ret = 'tobekilled' + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return ret + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("updateJobStatus : %s retry : %s" % (pandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("updateJobStatus : %s %s" % (type,value)) + _logger.error("updateJobStatus : %s" % pandaID) + return False + + + # update job information in jobsActive or jobsDefined + def updateJob(self,job,inJobsDefined): + comment = ' /* DBProxy.updateJob */' + _logger.debug("updateJob : %s" % job.PandaID) + if inJobsDefined: + sql1 = "UPDATE jobsDefined4 SET %s " % JobSpec.updateExpression() + else: + sql1 = "UPDATE jobsActive4 SET %s " % JobSpec.updateExpression() + sql1+= "WHERE PandaID=%s" + if inJobsDefined: + sql1+= " AND (jobStatus='assigned' OR jobStatus='defined')" + nTry=3 + for iTry in range(nTry): + try: + job.modificationTime = time.strftime('%Y-%m-%d 
%H:%M:%S',time.gmtime()) + # set stateChangeTime for defined->assigned + if inJobsDefined: + job.stateChangeTime = job.modificationTime + # begin transaction + self.cur.execute("START TRANSACTION") + # update + n = self.cur.execute(sql1+comment, job.values()+(job.PandaID,)) + if n==0: + # already killed or activated + _logger.debug("updateJob : Not found %s" % job.PandaID) + else: + sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" + for file in job.Files: + self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("updateJob : %s retry : %s" % (job.PandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("updateJob : %s %s" % (type,value)) + return False + + + # retry analysis job + def retryJob(self,pandaID,param): + comment = ' /* DBProxy.retryJob */' + _logger.debug("retryJob : %s" % pandaID) + sql1 = "SELECT %s FROM jobsActive4 " % JobSpec.columnNames() + sql1+= "WHERE PandaID=%s" + sql2 = "UPDATE jobsActive4 SET %s " % JobSpec.updateExpression() + sql2+= "WHERE PandaID=%s" + nTry=3 + for iTry in range(nTry): + try: + retValue = False + # begin transaction + self.cur.execute("START TRANSACTION") + # select + self.cur.execute(sql1+comment, (pandaID,)) + res = self.cur.fetchall() + if len(res) == 0: + _logger.debug("retryJob() : PandaID %d not found" % pandaID) + self._rollback() + return retValue + job = JobSpec() + job.pack(res[0]) + # check if it's analysis job + if (((job.prodSourceLabel == 'user' or job.prodSourceLabel == 'panda') \ + and job.computingSite.startswith('ANALY_') and param.has_key('pilotErrorCode') \ + and param['pilotErrorCode'] in ['1200','1201'] and (not job.computingSite.startswith('ANALY_LONG_')) \ + and job.attemptNr < 2) or (job.prodSourceLabel == 'ddm' and job.cloud == 'CA' and job.attemptNr <= 10)) \ + and job.commandToPilot != 'tobekilled': + _logger.debug(' -> reset PandaID:%s #%s' % (job.PandaID,job.attemptNr)) + # reset job + job.jobStatus = 'activated' + job.startTime = None + job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + job.attemptNr = job.attemptNr + 1 + # send it to long queue for analysis jobs + oldComputingSite = job.computingSite + if job.computingSite.startswith('ANALY') and (not job.computingSite.startswith('ANALY_LONG_')): + longSite = re.sub('^ANALY_','ANALY_LONG_',job.computingSite) + longSite = re.sub('_\d+$','',longSite) + if longSite in PandaSiteIDs.keys(): + job.computingSite = longSite + # set destinationSE if queue is changed + if oldComputingSite == job.destinationSE: + job.destinationSE = job.computingSite + # select files + sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() + sqlFile+= "WHERE PandaID=%s AND (type='log' OR type='output')" + self.cur.execute(sqlFile+comment, (job.PandaID,)) + resFs = self.cur.fetchall() + for resF in resFs: + # set PandaID + file = FileSpec() + file.pack(resF) + job.addFile(file) + # set new GUID + if file.type == 'log': + file.GUID = commands.getoutput('uuidgen') + # append attemptNr to LFN + oldName = file.lfn + file.lfn = re.sub('\.\d+$','',file.lfn) + file.lfn = '%s.%s' % (file.lfn,job.attemptNr) + newName = file.lfn + # set destinationSE + if oldComputingSite == file.destinationSE: + file.destinationSE = job.computingSite + # modify jobParameters + sepPatt = "(\'|\"|%20)" + oldName + 
"(\'|\"|%20)" + matches = re.findall(sepPatt,job.jobParameters) + for match in matches: + oldPatt = match[0]+oldName+match[-1] + newPatt = match[0]+newName+match[-1] + job.jobParameters = re.sub(oldPatt,newPatt,job.jobParameters) + # update + sqlFup = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" + self.cur.execute(sqlFup+comment, file.values()+(file.rowID,)) + # update job + self.cur.execute(sql2+comment, job.values()+(job.PandaID,)) + # set return + retValue = True + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return retValue + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("retryJob : %s retry : %s" % (pandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + # error report + type, value, traceBack = sys.exc_info() + _logger.error("retryJob : %s %s" % (type,value)) + return False + + + # get jobs + def getJobs(self,nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, + atlasRelease,prodUserID): + comment = ' /* DBProxy.getJobs */' + dynamicBrokering = False + sql1 = "WHERE jobStatus=%s AND computingSite=%s AND commandToPilot<>'tobekilled' " + if not mem in [0,'0']: + sql1+= "AND (minRamCount<=%s OR minRamCount=0) " % mem + if not diskSpace in [0,'0']: + sql1+= "AND (maxDiskCount<%s OR maxDiskCount=0) " % diskSpace + if prodSourceLabel == 'user': + sql1+= "AND (prodSourceLabel='user' OR prodSourceLabel='panda') " + elif prodSourceLabel == 'ddm': + dynamicBrokering = True + sql1+= "AND prodSourceLabel='ddm' " + elif prodSourceLabel in [None,'managed']: + sql1+= "AND (prodSourceLabel='managed' OR prodSourceLabel='test') " + elif prodSourceLabel == 'software': + sql1+= "AND prodSourceLabel='software' " + elif prodSourceLabel == 'test' and computingElement != None: + dynamicBrokering = True + sql1+= "AND (computingElement='%s' OR computingElement='to.be.set' OR processingType='prod_test' OR prodSourceLabel='test') " % computingElement + else: + sql1+= "AND prodSourceLabel='%s' " % prodSourceLabel + # user ID + if prodUserID != None: + sql1+= "AND prodUserID='%s' " % prodUserID + sql2 = "SELECT %s FROM jobsActive4 " % JobSpec.columnNames() + sql2+= "WHERE PandaID=%s" + retJobs = [] + nSent = 0 + try: + timeLimit = datetime.timedelta(seconds=timeout-10) + timeStart = datetime.datetime.utcnow() + strName = datetime.datetime.isoformat(timeStart) + attLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) + attSQL = "AND ((creationTime<'%s' AND attemptNr>1) OR attemptNr<=1) " % attLimit.strftime('%Y-%m-%d %H:%M:%S') + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # get nJobs + for iJob in range(nJobs): + pandaID = 0 + # select channel for ddm jobs + if prodSourceLabel == 'ddm': + sqlDDM = "SELECT count(*),jobStatus,sourceSite,destinationSite,transferType FROM jobsActive4 WHERE computingSite=%s AND prodSourceLabel='ddm' " + attSQL + "GROUP BY jobStatus,sourceSite,destinationSite,transferType" + _logger.debug((sqlDDM+comment) % siteName) + self.cur.execute(sqlDDM+comment,(siteName,)) + resDDM = self.cur.fetchall() + # make a channel map + channelMap = {} + for tmp_count,tmp_jobStatus,tmp_sourceSite,tmp_destinationSite,tmp_transferType in resDDM: + # use source,dest,type as the key + channel = (tmp_sourceSite,tmp_destinationSite,tmp_transferType) + if not channelMap.has_key(channel): + channelMap[channel] = {} + # ignore holding + if tmp_jobStatus == 'holding': + continue + # distinguish activate from other stats + if tmp_jobStatus != 'activated': + 
tmp_jobStatus = 'others' + # append + if not channelMap[channel].has_key(tmp_jobStatus): + channelMap[channel][tmp_jobStatus] = int(tmp_count) + else: + channelMap[channel][tmp_jobStatus] += int(tmp_count) + _logger.debug(channelMap) + # choose channel + channels = channelMap.keys() + random.shuffle(channels) + foundChannel = False + for channel in channels: + # no activated jobs + if (not channelMap[channel].has_key('activated')) or channelMap[channel]['activated'] == 0: + continue + maxRunning = 10 + # prestaging job + if channel[0] == channel[1] and channel[2] == 'dis': + maxRunning = 50 + if (not channelMap[channel].has_key('others')) or channelMap[channel]['others'] < maxRunning: + # set SQL + sql1+= "AND sourceSite='%s' AND destinationSite='%s' AND transferType='%s' " \ + % channel + foundChannel = True + break + # no proper channel + if not foundChannel: + _logger.debug("getJobs : no DDM jobs for Site %s" % siteName) + break + # get job + if prodSourceLabel in ['ddm']: + # to add some delay for attempts + sql1 += attSQL + nTry=1 + for iTry in range(nTry): + # set siteID + tmpSiteID = siteName + if siteName.startswith('ANALY_BNL_ATLAS'): + tmpSiteID = 'ANALY_BNL_ATLAS_1' + # get file lock + _logger.debug("getJobs : %s -> lock" % strName) + if (datetime.datetime.utcnow() - timeStart) < timeLimit: + toGetPandaIDs = True + pandaIDs = [] + # get max priority for analysis jobs + if prodSourceLabel in ['panda','user']: + sqlMX = "SELECT MAX(currentPriority) FROM jobsActive4 " + sqlMX+= sql1 + _logger.debug((sqlMX+comment) % ("activated",tmpSiteID)) + self.cur.execute(sqlMX+comment, ("activated",tmpSiteID)) + tmpPriority, = self.cur.fetchone() + # no jobs + if tmpPriority == None: + toGetPandaIDs = False + else: + # set priority + sql1 += "AND currentPriority=%s" % tmpPriority + if toGetPandaIDs: + # get PandaIDs + sqlP = "SELECT PandaID,currentPriority FROM jobsActive4 " + sqlP+= sql1 + _logger.debug((sqlP+comment) % ("activated",tmpSiteID)) + self.cur.execute(sqlP+comment, ("activated",tmpSiteID)) + resIDs = self.cur.fetchall() + maxCurrentPriority = None + # get max priority and min PandaID + for tmpPandaID,tmpCurrentPriority in resIDs: + if maxCurrentPriority==None or maxCurrentPriority < tmpCurrentPriority: + maxCurrentPriority = tmpCurrentPriority + pandaIDs = [tmpPandaID] + elif maxCurrentPriority == tmpCurrentPriority: + pandaIDs.append(tmpPandaID) + # sort + pandaIDs.sort() + if pandaIDs == []: + _logger.debug("getJobs : %s -> no PandaIDs" % strName) + retU = 0 + else: + # get nSent for production jobs + if prodSourceLabel in [None,'managed']: + sentLimit = timeStart - datetime.timedelta(seconds=60) + sqlSent = "SELECT count(*) FROM jobsActive4 WHERE jobStatus='sent' " + sqlSent += "AND prodSourceLabel IN ('managed','test') " + sqlSent += "AND computingSite='%s' " % tmpSiteID + sqlSent += "AND modificationTime>'%s' " % sentLimit.strftime('%Y-%m-%d %H:%M:%S') + self.cur.execute(sqlSent+comment) + resSent = self.cur.fetchone() + if resSent != None: + nSent, = resSent + # update + for indexID,tmpPandaID in enumerate(pandaIDs): + # max attempts + if indexID > 10: + break + # update + sqlJ = "UPDATE jobsActive4 " + sqlJ+= "SET jobStatus=%s,modificationTime=UTC_TIMESTAMP(),modificationHost=%s,startTime=UTC_TIMESTAMP()" + # set CE + if computingElement != None: + sqlJ+= ",computingElement='%s'" % computingElement + sqlJ+= " WHERE PandaID=%s AND jobStatus=%s" + _logger.debug((sqlJ+comment) % ("sent",node,tmpPandaID,"activated")) + retU = 
self.cur.execute(sqlJ+comment,("sent",node,tmpPandaID,"activated")) + # succeeded + if retU != 0: + pandaID = tmpPandaID + # increment nSent + if prodSourceLabel in [None,'managed']: + nSent += (indexID+1) + break + else: + _logger.debug("getJobs : %s -> do nothing" % strName) + retU = 0 + # release file lock + _logger.debug("getJobs : %s -> unlock" % strName) + # succeeded + if retU != 0: + break + if iTry+1 < nTry: + #time.sleep(0.5) + pass + # failed to UPDATE + if retU == 0: + # reset pandaID + pandaID = 0 + _logger.debug("getJobs : Site %s : retU %s : PandaID %s - %s" + % (siteName,retU,pandaID,prodSourceLabel)) + if pandaID == 0: + break + # select + self.cur.execute(sql2+comment, (pandaID,)) + res = self.cur.fetchone() + if len(res) == 0: + break + # instantiate Job + job = JobSpec() + job.pack(res) + # Files + sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() + sqlFile+= "WHERE PandaID=%s" + self.cur.execute(sqlFile+comment, (job.PandaID,)) + resFs = self.cur.fetchall() + for resF in resFs: + file = FileSpec() + file.pack(resF) + job.addFile(file) + # append + retJobs.append(job) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return retJobs,nSent + except: + # roll back + self._rollback() + # error report + type, value, traceBack = sys.exc_info() + _logger.error("getJobs : %s %s" % (type,value)) + return [],0 + + + # reset job in jobsActive or jobsWaiting + def resetJob(self,pandaID,activeTable=True,keepSite=False): + comment = ' /* DBProxy.resetJob */' + _logger.debug("resetJobs : %s" % pandaID) + # select table + table = 'jobsWaiting4' + if activeTable: + table = 'jobsActive4' + sql1 = "SELECT %s FROM %s " % (JobSpec.columnNames(),table) + sql1+= "WHERE PandaID=%s" + sql2 = "DELETE FROM %s " % table + sql2+= "WHERE PandaID=%s AND (jobStatus='waiting' OR jobStatus='activated')" + sql3 = "INSERT INTO jobsDefined4 (%s) " % JobSpec.columnNames() + sql3+= JobSpec.valuesExpression() + try: + # transaction causes Request ndbd time-out in jobsActive4 + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute(sql1+comment,(pandaID,)) + res = self.cur.fetchone() + # not found + if res == None: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return None + # instantiate Job + job = JobSpec() + job.pack(res) + # if already running + if job.jobStatus != 'waiting' and job.jobStatus != 'activated': + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return None + # delete + retD = self.cur.execute(sql2+comment,(pandaID,)) + # delete failed + _logger.debug("resetJobs : retD = %s" % retD) + if retD != 1: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return None + # delete from jobsDefined4 just in case + sqlD = "DELETE FROM jobsDefined4 WHERE PandaID=%s" + self.cur.execute(sqlD+comment,(pandaID,)) + # increase priority + if job.jobStatus == 'activated' and job.currentPriority < 100: + job.currentPriority = 100 + # reset computing site and dispatchDBlocks + job.jobStatus = 'defined' + job.dispatchDBlock = None + # erase old assignment + if (not keepSite) and job.relocationFlag != 1: + job.computingSite = None + job.computingElement = None + # host and time information + job.modificationHost = self.hostname + job.modificationTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + job.stateChangeTime = job.modificationTime + # insert + self.cur.execute(sql3+comment, job.values()) + # Files + sqlFile = "SELECT %s FROM filesTable4 " % 
FileSpec.columnNames() + sqlFile+= "WHERE PandaID=%s" + self.cur.execute(sqlFile+comment, (job.PandaID,)) + resFs = self.cur.fetchall() + for resF in resFs: + file = FileSpec() + file.pack(resF) + # reset GUID to trigger LRC/LFC scanning + if file.status == 'missing': + file.GUID = None + # reset status, destinationDBlock and dispatchDBlock + file.status ='unknown' + file.dispatchDBlock = None + file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock) + # add file + job.addFile(file) + # update files + sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" + self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return job + except: + # roll back + self._rollback() + # error report + type, value, traceBack = sys.exc_info() + _logger.error("resetJobs : %s %s" % (type,value)) + _logger.error("resetJobs : %s" % pandaID) + return None + + + # reset jobs in jobsDefined + def resetDefinedJob(self,pandaID,keepSite=False): + comment = ' /* DBProxy.resetDefinedJob */' + _logger.debug("resetDefinedJob : %s" % pandaID) + sql1 = "UPDATE jobsDefined4 SET " + sql1 += "jobStatus='defined'," + sql1 += "modificationTime=UTC_TIMESTAMP()," + sql1 += "dispatchDBlock=NULL," + sql1 += "computingElement=NULL" + sql1 += " WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')" + sql2 = "SELECT %s FROM jobsDefined4 " % JobSpec.columnNames() + sql2+= "WHERE PandaID=%s" + try: + # begin transaction + self.cur.execute("START TRANSACTION") + # update + retU = self.cur.execute(sql1+comment,(pandaID,)) + # not found + job = None + if retU == 0: + _logger.debug("resetDefinedJob : Not found %s" % pandaID) + else: + # select + self.cur.execute(sql2+comment,(pandaID,)) + res = self.cur.fetchone() + # not found + if res == None: + raise RuntimeError, 'Could not SELECT : PandaID=%s' % pandaID + # instantiate Job + job = JobSpec() + job.pack(res) + job.dispatchDBlock = None + if (not keepSite) and job.relocationFlag != 1: + # erase old assignment + job.computingSite = None + job.computingElement = None + # Files + sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() + sqlFile+= "WHERE PandaID=%s" + self.cur.execute(sqlFile+comment, (job.PandaID,)) + resFs = self.cur.fetchall() + for resF in resFs: + file = FileSpec() + file.pack(resF) + # reset status, destinationDBlock and dispatchDBlock + file.status ='unknown' + file.dispatchDBlock = None + file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock) + # add file + job.addFile(file) + # update files + sqlF = ("UPDATE filesTable4 SET %s" % FileSpec.updateExpression()) + "WHERE rowID=%s" + self.cur.execute(sqlF+comment, file.values()+(file.rowID,)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return job + except: + # error report + type, value, traceBack = sys.exc_info() + _logger.error("resetDefinedJobs : %s %s" % (type,value)) + #_logger.error(traceback.format_exc()) + # roll back + self._rollback() + return None + + + # kill job + def killJob(self,pandaID,user,code,prodManager): + comment = ' /* DBProxy.killJob */' + _logger.debug("killJob : %s %s %s %s" % (code,pandaID,prodManager,user)) + # check PandaID + try: + long(pandaID) + except: + _logger.error("not an integer : %s" % pandaID) + return False + sql0 = "SELECT prodUserID FROM %s WHERE PandaID=%s" + sql1 = "UPDATE %s SET commandToPilot='tobekilled' WHERE PandaID=%s AND commandToPilot<>'tobekilled'" + sql2 = "SELECT %s " % 
JobSpec.columnNames() + sql2+= "FROM %s WHERE PandaID=%s AND jobStatus<>'running'" + sql3 = "DELETE FROM %s WHERE PandaID=%s" + sqlU = "UPDATE jobsDefined4 SET jobStatus='failed' WHERE PandaID=%s AND (jobStatus='assigned' OR jobStatus='defined')" + sql4 = "INSERT INTO jobsArchived4 (%s) " % JobSpec.columnNames() + sql4+= JobSpec.valuesExpression() + try: + flagCommand = False + flagKilled = False + # begin transaction + self.cur.execute("START TRANSACTION") + for table in ('jobsDefined4','jobsActive4','jobsWaiting4'): + # get DN if user is not production DN + if (not prodManager) and (not user.startswith('/DC=org/DC=doegrids/OU=People/CN=Nurcan Ozturk')) \ + and (not user.startswith('/DC=org/DC=doegrids/OU=People/CN=Torre Wenaus')): + self.cur.execute((sql0+comment) % (table,pandaID)) + res = self.cur.fetchone() + # not found + if res == None: + continue + # owner? + def getCN(dn): + distinguishedName = '' + for line in dn.split('/'): + if line.startswith('CN='): + distinguishedName = re.sub('^CN=','',line) + distinguishedName = re.sub('\d+$','',distinguishedName) + distinguishedName = distinguishedName.strip() + break + return distinguishedName + cn1 = getCN(res[0]) + cn2 = getCN(user) + _logger.debug("Owner:%s - Requester:%s " % (cn1,cn2)) + if cn1 != cn2: + _logger.debug("ignore killJob -> Owner != Requester") + break + # update + retU = self.cur.execute((sql1+comment) % (table,pandaID)) + if retU == 0: + continue + # set flag + flagCommand = True + # select + self.cur.execute((sql2+comment) % (table,pandaID)) + res = self.cur.fetchall() + if len(res) == 0: + continue + # instantiate JobSpec + job = JobSpec() + job.pack(res[0]) + # delete + if table=='jobsDefined4': + retD = self.cur.execute((sqlU+comment) % (pandaID,)) + else: + retD = self.cur.execute((sql3+comment) % (table,pandaID)) + if retD == 0: + continue + # error code + job.jobStatus = 'failed' + job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + job.modificationTime = job.endTime + job.stateChangeTime = job.modificationTime + if code in ['2','4']: + # expire + if code == '2': + job.taskBufferErrorCode = ErrorCode.EC_Expire + job.taskBufferErrorDiag = 'expired after 7 days since submission' + else: + # waiting timeout + job.taskBufferErrorCode = ErrorCode.EC_Expire + #job.taskBufferErrorCode = ErrorCode.EC_WaitTimeout + job.taskBufferErrorDiag = 'expired after waiting for input data for 2 days' + elif code=='3': + # aborted + job.taskBufferErrorCode = ErrorCode.EC_Aborted + job.taskBufferErrorDiag = 'aborted by ExtIF' + else: + # killed + job.taskBufferErrorCode = ErrorCode.EC_Kill + job.taskBufferErrorDiag = 'killed by %s' % user + # insert + self.cur.execute(sql4+comment, job.values()) + flagKilled = True + break + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("killJob : com=%s kill=%s " % (flagCommand,flagKilled)) + return (flagCommand or flagKilled) + except: + type, value, traceBack = sys.exc_info() + _logger.error("killJob : %s %s" % (type,value)) + # roll back + self._rollback() + return False + + + # peek at job + def peekJob(self,pandaID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal=False): + comment = ' /* DBProxy.peekJob */' + _logger.debug("peekJob : %s" % pandaID) + # return None for NULL PandaID + if pandaID in ['NULL','','None',None]: + return None + sql1_0 = "SELECT %s FROM %s " + sql1_1 = "WHERE PandaID=%s" + try: + tables=[] + if fromActive: + tables.append('jobsActive4') + if fromArchived: + tables.append('jobsArchived4') + if fromWaiting: + 
tables.append('jobsWaiting4') + if fromDefined: + # defined needs to be the last one due to InnoDB's auto_increment + tables.append('jobsDefined4') + # select + for table in tables: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1 + self.cur.execute(sql+comment, (pandaID,)) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if len(res) != 0: + # Job + job = JobSpec() + job.pack(res[0]) + # Files + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() + sqlFile+= "WHERE PandaID=%s" + self.cur.execute(sqlFile+comment, (job.PandaID,)) + resFs = self.cur.fetchall() + # metadata + if table == 'jobsArchived4' and (not forAnal): + # read metadata only for finished/failed jobs + sqlMeta = "SELECT metaData FROM metaTable WHERE PandaID=%s" + self.cur.execute(sqlMeta+comment, (job.PandaID,)) + resMeta = self.cur.fetchone() + else: + resMeta = None + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # set files + for resF in resFs: + file = FileSpec() + file.pack(resF) + job.addFile(file) + # set metadata + if resMeta != None: + job.metadata = resMeta[0] + return job + _logger.debug("peekJob() : PandaID %s not found" % pandaID) + return None + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("peekJob : %s %s" % (type,value)) + # return None for analysis + if forAnal: + return None + # return 'unknown' + job = JobSpec() + job.PandaID = pandaID + job.jobStatus = 'unknown' + return job + + + # get JobIDs in a time range + def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs): + comment = ' /* DBProxy.getJobIDsInTimeRange */' + _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) + try: + tables = ['jobsArchived4','jobsActive4','jobsWaiting4','jobsDefined4'] + # select + for table in tables: + # make sql + sql = "SELECT jobDefinitionID FROM %s " % table + sql += "WHERE prodUserID=%s AND modificationTime>%s AND prodSourceLabel='user'" + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + _logger.debug(sql+comment+str((dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))) + self.cur.execute(sql+comment, (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for tmpID, in resList: + if not tmpID in retJobIDs: + retJobIDs.append(tmpID) + _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs)) + return retJobIDs + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getJobIDsInTimeRange : %s %s" % (type,value)) + # return empty list + return [] + + + # get PandaIDs for a JobID + def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs): + comment = ' /* DBProxy.getPandIDsWithJobID */' + _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID)) + try: + tables = ['jobsArchived4','jobsActive4','jobsWaiting4','jobsDefined4'] + # select + for table in tables: + # skip if all jobs have already been gotten + if nJobs > 0 and len(idStatus) >= nJobs: + continue + # make sql + sql = "SELECT PandaID,jobStatus,commandToPilot FROM %s " % table + sql += "WHERE prodUserID=%s AND jobDefinitionID=%s " + sql += "AND prodSourceLabel in ('user','panda') " + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + 
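peekJob, getJobIDsInTimeRange and this method all follow the same pattern: scan the four job tables in a fixed order and keep only the first entry seen for each ID, so a job that appears in more than one table is reported once. A minimal sketch of that merge, with fake result sets standing in for the per-table queries (merge_id_status is an illustrative name):

# Illustrative sketch (not part of the patch): merging per-table query results
# the way getPandIDsWithJobID does, keeping the first (status, command) pair
# seen for each PandaID across the job tables.
def merge_id_status(per_table_rows, id_status=None):
    id_status = {} if id_status is None else id_status
    for rows in per_table_rows:                      # one result set per table
        for panda_id, job_status, command in rows:
            if panda_id not in id_status:            # first table wins
                id_status[panda_id] = (job_status, command)
    return id_status

# fake result sets for jobsArchived4 and jobsActive4
archived = [(1001, 'finished', None)]
active   = [(1001, 'running', None), (1002, 'activated', None)]
print(merge_id_status([archived, active]))
# -> 1001 keeps 'finished' from the first table, 1002 is 'activated'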
_logger.debug(sql+comment+str((dn,jobID))) + self.cur.execute(sql+comment, (dn,jobID)) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for tmpID,tmpStatus,tmpCommand in resList: + if not idStatus.has_key(tmpID): + idStatus[tmpID] = (tmpStatus,tmpCommand) + _logger.debug("getPandIDsWithJobID : %s" % str(idStatus)) + return idStatus + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getPandIDsWithJobID : %s %s" % (type,value)) + # return empty list + return {} + + + # query PandaID + def queryPandaID(self,jobDefID): + comment = ' /* DBProxy.queryPandaID */' + _logger.debug("queryPandaID : %s" % jobDefID) + sql0 = "SELECT PandaID,attemptNr FROM %s WHERE attemptNr=(" + sql0+= "SELECT MAX(attemptNr) FROM %s" + sql1= " WHERE prodSourceLabel=%s AND jobDefinitionID=%s) AND prodSourceLabel=%s AND jobDefinitionID=%s" + try: + ids = [] + # select + for table in ['jobsDefined4','jobsActive4','jobsArchived4','jobsWaiting4']: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = sql0 % (table,table) + sql1 + self.cur.execute(sql+comment, ('managed',jobDefID,'managed',jobDefID)) + res = self.cur.fetchall() + ids += list(res) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # look for the latest attempt + preAtt =-1 + pandaID=None + for pID,att in ids: + if att > preAtt: + pandaID = pID + preAtt = att + if att == preAtt: + if pandaID < pID: + pandaID = pID + return pandaID + except: + type, value, traceBack = sys.exc_info() + _logger.error("queryPandaID : %s %s" % (type,value)) + # roll back + self._rollback() + return None + + + # query job info per cloud + def queryJobInfoPerCloud(self,cloud,schedulerID=None): + comment = ' /* DBProxy.queryJobInfoPerCloud */' + _logger.debug("queryJobInfoPerCloud : %s %s" % (cloud,schedulerID)) + attrs = ['PandaID','jobStatus','jobName'] + sql0 = "SELECT " + for attr in attrs: + sql0 += "%s," % attr + sql0 = "%s " % sql0[:-1] + sql0+= "FROM %s " + sql0+= "WHERE cloud='%s' " % cloud + if schedulerID != None: + sql0+= "AND schedulerID='%s' " % schedulerID + try: + ids = [] + returnList = [] + # select + for table in ['jobsActive4','jobsWaiting4','jobsDefined4']: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = sql0 % table + self.cur.execute(sql+comment) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # loop over all + for res in resList: + valMap = {} + # skip if already in the list + PandaID = res[0] + if PandaID in ids: + continue + # convert to map + for idx,attr in enumerate(attrs): + valMap[attr] = res[idx] + # append to list + ids.append(PandaID) + returnList.append(valMap) + # return + return returnList + except: + type, value, traceBack = sys.exc_info() + _logger.error("queryJobInfoPerCloud : %s %s" % (type,value)) + # roll back + self._rollback() + return None + + + # get PandaIDs at Site + def getPandaIDsSite(self,site,status,limit): + comment = ' /* DBProxy.getPandaIDsSite */' + _logger.debug("getPandaIDsSite : %s %s %s" % (site,status,limit)) + try: + ids = [] + # find table + if status in ['defined','assigned']: + table = 'jobsDefined4' + elif status in ['activated','running','holding','trasnferring']: + table = 'jobsActive4' + elif status in ['waiting']: + table = 'jobsWaiting4' + elif status in ['finished','failed']: + table = 'jobsArchived4' + else: + _logger.error("unknown status:%s" % status) + 
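This method and updateProdDBUpdateTime further down both route a job status to one of the four job tables; the sketch below summarises that routing as a helper (the name is illustrative, and the branch list follows updateProdDBUpdateTime, which carries the fuller set of active states).

# Illustrative sketch (not part of the patch): the status -> table routing
# used by getPandaIDsSite and updateProdDBUpdateTime.
def table_for_status(job_status):
    if job_status in ('defined', 'assigned'):
        return 'jobsDefined4'
    if job_status in ('waiting',):
        return 'jobsWaiting4'
    if job_status in ('activated', 'sent', 'starting', 'running',
                      'holding', 'transferring'):
        return 'jobsActive4'
    if job_status in ('finished', 'failed'):
        return 'jobsArchived4'
    return None   # unknown status: the callers log an error and give up

assert table_for_status('running') == 'jobsActive4'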
return ids + # limit + limit = int(limit) + # SQL + sql = "SELECT PandaID FROM %s " % table + sql += "WHERE computingSite=%s AND jobStatus=%s AND prodSourceLabel=%s " + sql += "LIMIT %d" % limit + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute(sql+comment, (site,status,'managed')) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # convert to list + for id, in res: + ids.append(id) + return ids + except: + type, value, traceBack = sys.exc_info() + _logger.error("getPandaIDsSite : %s %s" % (type,value)) + # roll back + self._rollback() + return [] + + + # get PandaIDs to be updated in prodDB + def getPandaIDsForProdDB(self,limit,lockedby): + comment = ' /* DBProxy.getPandaIDsForProdDB */' + _logger.debug("getPandaIDsForProdDB %s" % limit) + sql0 = "SELECT PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID FROM %s " + sql0+= "WHERE prodSourceLabel IN ('managed','rc_test') AND lockedby='%s' " % lockedby + sql0+= "AND stateChangeTime>prodDBUpdateTime AND stateChangeTime<>'0000-00-00 00:00:00'" + try: + retMap = {} + totalIDs = 0 + # select + for table in ['jobsArchived4','jobsActive4','jobsWaiting4','jobsDefined4']: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = sql0 % table + self.cur.execute(sql+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + for PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID in res: + # ignore dummy jobs in jobsDefined4 + if table == 'jobsDefined4' and (not jobStatus in ['defined','assigned']): + continue + # add status + if not retMap.has_key(jobStatus): + retMap[jobStatus] = [] + # append + retMap[jobStatus].append({'PandaID':PandaID,'attemptNr':attemptNr, + 'stateChangeTime':stateChangeTime.strftime('%Y-%m-%d %H:%M:%S'), + 'jobDefinitionID':jobDefinitionID, + 'jobExecutionID':jobExecutionID}) + totalIDs += 1 + # limit + if totalIDs > limit: + break + _logger.debug("getPandaIDsForProdDB %s ret->%s" % (limit,totalIDs)) + return retMap + except: + type, value, traceBack = sys.exc_info() + _logger.error("getPandaIDsForProdDB : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # update prodDBUpdateTime + def updateProdDBUpdateTime(self,param): + comment = ' /* DBProxy.updateProdDBUpdateTime */' + _logger.debug("updateProdDBUpdateTime %s" % str(param)) + sql0 = "UPDATE %s " + sql0+= "SET prodDBUpdateTime='%s' " % param['stateChangeTime'] + sql0+= "WHERE PandaID=%s AND jobStatus='%s' AND stateChangeTime='%s'" % (param['PandaID'], + param['jobStatus'], + param['stateChangeTime']) + try: + if param['jobStatus'] in ['defined','assigned']: + table = 'jobsDefined4' + elif param['jobStatus'] in ['waiting']: + table = 'jobsWaiting4' + elif param['jobStatus'] in ['activated','sent','starting','running','holding','transferring']: + table = 'jobsActive4' + elif param['jobStatus'] in ['finished','failed']: + table = 'jobsArchived4' + else: + _logger.error("invalid status %s" % param['jobStatus']) + return False + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # update + sql = sql0 % table + _logger.debug(sql) + retU = self.cur.execute(sql+comment) + _logger.debug("updateProdDBUpdateTime %s ret=%s" % (param['PandaID'],retU)) + if retU == 1: + return True + return False + except: + type, value, traceBack = sys.exc_info() + _logger.error("updateProdDBUpdateTime : %s %s" % (type,value)) + # roll back + 
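The UPDATE issued above only matches when PandaID, jobStatus and stateChangeTime are all still what the caller reported, so a stale notification touches zero rows and the method returns False. A self-contained sketch of that guarded-update idea, using an in-memory sqlite3 table purely for illustration (the real code goes through MySQLdb):

import sqlite3

# Illustrative sketch (not part of the patch): a guarded UPDATE in the spirit
# of updateProdDBUpdateTime - the row is only touched if the status and
# stateChangeTime still match what the caller saw.
conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute("CREATE TABLE jobs (PandaID INTEGER, jobStatus TEXT,"
            " stateChangeTime TEXT, prodDBUpdateTime TEXT)")
cur.execute("INSERT INTO jobs VALUES (42, 'finished', '2013-07-02 11:00:00', NULL)")

def mark_synced(panda_id, job_status, state_change_time):
    cur.execute("UPDATE jobs SET prodDBUpdateTime=? "
                "WHERE PandaID=? AND jobStatus=? AND stateChangeTime=?",
                (state_change_time, panda_id, job_status, state_change_time))
    return cur.rowcount == 1        # 1 row touched -> success, 0 -> stale info

print(mark_synced(42, 'finished', '2013-07-02 11:00:00'))   # True
print(mark_synced(42, 'failed',   '2013-07-02 11:00:00'))   # False, stale status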
self._rollback() + return False + + + # add metadata + def addMetadata(self,pandaID,metadata): + comment = ' /* DBProxy.addMetaData */' + _logger.debug("addMetaData : %s" % pandaID) + sql0 = "SELECT PandaID FROM metaTable WHERE PandaID=%s" + sql1 = "INSERT INTO metaTable (PandaID,metaData) VALUE (%s,%s)" + nTry=3 + for iTry in range(nTry): + try: + # autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute(sql0+comment, (pandaID,)) + res = self.cur.fetchone() + # already exist + if res != None: + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + # insert + self.cur.execute(sql1+comment, (pandaID,metadata)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("addMetaData : %s retry : %s" % (pandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("addMetaData : %s %s" % (type,value)) + return False + + + # insert dataset + def insertDataset(self,dataset,tablename="Datasets"): + comment = ' /* DBProxy.insertDataset */' + _logger.debug("insertDataset(%s)" % dataset.name) + sql1 = "INSERT INTO %s " % tablename + sql1+= "(%s) " % DatasetSpec.columnNames() + sql1+= DatasetSpec.valuesExpression() + # time information + dataset.creationdate = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + dataset.modificationdate = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + try: + # get file lock + #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_EX) + # begin transaction + self.cur.execute("START TRANSACTION") + # avoid duplication + self.cur.execute("SELECT vuid FROM "+tablename+" WHERE vuid=%s"+comment, (dataset.vuid,)) + res = self.cur.fetchall() + if len(res) == 0: + # insert + self.cur.execute(sql1+comment, dataset.values()) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # release file lock + #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN) + return True + except: + # roll back + self._rollback() + # release file lock + #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN) + # error + type, value, traceBack = sys.exc_info() + _logger.error("insertDataset() : %s %s" % (type,value)) + return False + + + # query dataset with map + def queryDatasetWithMap(self,map): + comment = ' /* DBProxy.queryDatasetWithMap */' + _logger.debug("queryDatasetWithMap(%s)" % map) + sql1 = "SELECT %s FROM Datasets" % DatasetSpec.columnNames() + valueL = [] + for key in map.keys(): + if len(valueL)==0: + sql1+= " WHERE %s=" % key + else: + sql1+= " AND %s=" % key + sql1+= "%s" + valueL.append(map[key]) + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + nTry=5 + for iTry in range(nTry): + retS = self.cur.execute(sql1+comment, tuple(valueL)) + res = self.cur.fetchall() + if retS>=0 and res != None and retS==len(res): + break + if iTry+1 < nTry: + _logger.debug("queryDatasetWithMap : retS %s retry : %s" % (retS,iTry)) + time.sleep(random.randint(10,20)) + _logger.debug("queryDatasetWithMap(%s) : retS %s ret %s" % (str(map),retS,str(res))) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # instantiate Dataset + if res != None and len(res) != 0: + dataset = DatasetSpec() + dataset.pack(res[0]) + return dataset + _logger.error("queryDatasetWithMap(%s) : dataset not found" % map) + return None + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("queryDatasetWithMap(%s) : %s %s" % 
(map,type,value)) + return None + + + # update dataset + def updateDataset(self,datasets,withLock,withCriteria): + comment = ' /* DBProxy.updateDataset */' + _logger.debug("updateDataset()") + sql1 = "UPDATE Datasets SET %s " % DatasetSpec.updateExpression() + sql1+= "WHERE vuid=%s" + if withCriteria != "": + sql1+= " AND %s" % withCriteria + nTry=3 + for iTry in range(nTry): + try: + # get file lock + if withLock: + fcntl.flock(_lockSetDS.fileno(), fcntl.LOCK_EX) + retList = [] + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + for dataset in datasets: + _logger.debug("updateDataset(%s,%s)" % (dataset.name,dataset.status)) + # time information + dataset.modificationdate = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime()) + # update + retU = self.cur.execute(sql1+comment, dataset.values()+(dataset.vuid,)) + if retU != 0 and retU != 1: + raise RuntimeError, 'Invalid retrun %s' % retU + retList.append(retU) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # release file lock + if withLock: + fcntl.flock(_lockSetDS.fileno(), fcntl.LOCK_UN) + _logger.debug("updateDataset() ret:%s" % retList) + return retList + except: + # roll back + self._rollback() + # release file lock + if withLock: + fcntl.flock(_lockSetDS.fileno(), fcntl.LOCK_UN) + if iTry+1 < nTry: + _logger.debug("updateDataset : retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("updateDataset() : %s %s" % (type,value)) + return [] + + + # delete dataset + def deleteDataset(self,name): + comment = ' /* DBProxy.deleteDataset */' + sql1 = "DELETE FROM Datasets WHERE name=%s" + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # delete + self.cur.execute(sql1+comment,(name,)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("deleteDataset() : %s %s" % (type,value)) + return False + + + # get serial number for dataset, insert dummy datasets to increment SN + def getSerialNumber(self,datasetname): + comment = ' /* DBProxy.getSerialNumber */' + try: + _logger.debug("getSerialNumber(%s)" % datasetname) + # get file lock + #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_EX) + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = "SELECT COUNT(*) FROM Datasets WHERE type='output' AND name='%s'" % datasetname + nTry=3 + for iTry in range(nTry): + retS = self.cur.execute(sql+comment) + res = self.cur.fetchone() + _logger.debug("getSerialNumber : retS %s, res %s" % (retS,res)) + if retS>=0 and res != None: + break + if iTry+1 < nTry: + time.sleep(random.randint(10,20)) + # fresh dataset or not + if res != None and len(res) != 0 and res[0] > 0: + freshFlag = False + else: + freshFlag = True + # get serial number + sql = "INSERT INTO subCounter (subID) VALUES ('NULL')" + self.cur.execute(sql+comment) + sn = self.conn.insert_id() + # delete. 
'<' is needed for auto_incr of InnoDB + sql = "DELETE FROM subCounter where subID<%s" % sn + self.cur.execute(sql+comment) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # release file lock + #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN) + _logger.debug("getSerialNumber : %s %s" % (sn,freshFlag)) + return (sn,freshFlag) + except: + # roll back + self._rollback() + # release file lock + #fcntl.flock(_lockGetSN.fileno(), fcntl.LOCK_UN) + # error + type, value, traceBack = sys.exc_info() + _logger.error("getSerialNumber() : %s %s" % (type,value)) + return (-1,False) + + + # update transfer status for a dataset + def updateTransferStatus(self,datasetname,bitMap): + comment = ' /* DBProxy.updateTransferStatus */' + try: + _logger.debug("updateTransferStatus(%s,%s)" % (datasetname,hex(bitMap))) + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + retTransSt = 0 + # update bitmap + sqlU = "UPDATE Datasets SET transferStatus=transferStatus|%s WHERE name='%s'" % (bitMap,datasetname) + retU = self.cur.execute(sqlU+comment) + # get transferStatus + sqlS = "SELECT transferStatus from Datasets WHERE name='%s'" % datasetname + retS = self.cur.execute(sqlS+comment) + resS = self.cur.fetchall() + if resS != None and len(resS) != 0: + retTransSt = resS[0][0] + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("updateTransferStatus : %s" % hex(retTransSt)) + return retTransSt + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("updateTransferStatus : %s %s" % (type,value)) + return 0 + + + # get CloudTask. If not exist, create it + def getCloudTask(self,tid): + comment = ' /* getCloudTask */' + try: + _logger.debug("getCloudTask(%s)" % tid) + # check tid + if tid in [None,'NULL']: + _logger.error("invalid TID : %s" % tid) + return None + # get file lock + fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_EX) + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = "SELECT %s FROM cloudtasks " % CloudTaskSpec.columnNames() + sql += "WHERE taskid=%s" % tid + nTry=5 + for iTry in range(nTry): + retS = self.cur.execute(sql+comment) + res = self.cur.fetchall() + _logger.debug("getCloudTask : retS %s" % retS) + if retS>=0 and res != None and retS==len(res): + break + if iTry+1 < nTry: + time.sleep(random.randint(10,20)) + # already exist + if res != None and len(res) != 0: + # instantiate CloudTask + cloudTask = CloudTaskSpec() + cloudTask.pack(res[0]) + # update tmod if status <> 'assigned' + if cloudTask.status <> 'assigned': + sql = "UPDATE cloudtasks SET tmod=UTC_TIMESTAMP() WHERE taskid=%s" % cloudTask.taskid + self.cur.execute(sql+comment) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # release file lock + fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_UN) + _logger.debug("return existing CloudTask") + return cloudTask + # insert new CloudTask + _logger.debug("insert new CloudTask") + cloudTask = CloudTaskSpec() + cloudTask.taskid = tid + cloudTask.status = 'defined' + sql = "INSERT INTO cloudtasks (taskid,status,tmod,tenter) VALUES(%s,%s,UTC_TIMESTAMP(),UTC_TIMESTAMP())" + self.cur.execute(sql+comment,(cloudTask.taskid,cloudTask.status)) + # get id + cloudTask.id = self.conn.insert_id() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # release file lock + fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_UN) + _logger.debug("return new CloudTask") + return cloudTask + except: + # roll back + self._rollback() 
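[Editor's sketch, not part of the patch] The getSerialNumber code above draws a fresh number from the AUTO_INCREMENT column of a small helper table (subCounter) and then prunes rows with subID < sn, so the latest value stays in the table (the reason the comment insists on '<'). A minimal stand-alone version of the same pattern, assuming an open MySQLdb connection and the same table layout; the function name next_serial is made up for illustration:

def next_serial(conn):
    """Return a fresh, monotonically increasing serial number (sketch)."""
    cur = conn.cursor()
    # bump the AUTO_INCREMENT column by inserting a dummy row
    cur.execute("INSERT INTO subCounter (subID) VALUES (NULL)")
    sn = conn.insert_id()
    # prune older rows but keep the newest one, so the counter value
    # is never lost when the table is cleaned up
    cur.execute("DELETE FROM subCounter WHERE subID<%s", (sn,))
    conn.commit()
    return sn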
+ # release file lock + fcntl.flock(_lockGetCT.fileno(), fcntl.LOCK_UN) + # error + type, value, traceBack = sys.exc_info() + _logger.error("getCloudTask() : %s %s" % (type,value)) + return None + + + # set cloud to CloudTask + def setCloudTask(self,cloudTask): + comment = ' /* setCloudTask */' + try: + _logger.debug("setCloudTask(id=%s,taskid=%s)" % (cloudTask.id,cloudTask.taskid)) + sql = "UPDATE cloudtasks SET cloud=%s,status=%s,tmod=UTC_TIMESTAMP() WHERE id=%s AND status='defined'" + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # update + retU = self.cur.execute(sql+comment,(cloudTask.cloud,'assigned',cloudTask.id)) + # succeeded + if retU != 0: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return cloudTask + # read if it is already set by another thread + sql = "SELECT %s FROM cloudtasks " % CloudTaskSpec.columnNames() + sql += "WHERE id=%s" % cloudTask.id + # select + retS = self.cur.execute(sql+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # retrun CloudTask + if res != None and len(res) != 0: + # instantiate CloudTask + cloudTask = CloudTaskSpec() + cloudTask.pack(res[0]) + return cloudTask + _logger.error("setCloudTask() : cannot find CloudTask for %s" % cloudTask.id) + return None + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("setCloudTask() : %s %s" % (type,value)) + return None + + + # see CloudTask + def seeCloudTask(self,tid): + comment = ' /* seeCloudTask */' + try: + _logger.debug("seeCloudTask(%s)" % tid) + # check tid + if tid in [None,'NULL']: + _logger.error("invalid TID : %s" % tid) + return None + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = "SELECT cloud FROM cloudtasks WHERE taskid=%s" % tid + nTry=5 + for iTry in range(nTry): + retS = self.cur.execute(sql+comment) + res = self.cur.fetchall() + _logger.debug("seeCloudTask : retS %s" % retS) + if retS>=0 and res != None and retS==len(res): + break + if iTry+1 < nTry: + time.sleep(random.randint(10,20)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # existing task + if res != None and len(res) != 0: + # return cloud + return res[0][0] + else: + return None + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("seeCloudTask() : %s %s" % (type,value)) + return None + + + # get assigning task + def getAssigningTask(self): + comment = ' /* getAssigningTask */' + try: + _logger.debug("getAssigningTask") + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = "SELECT taskid FROM cloudtasks WHERE status<>'assigned' AND tmod>'%s'" % timeLimit.strftime('%Y-%m-%d %H:%M:%S') + self.cur.execute(sql+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # loop over all taskid + retList = [] + if res != None: + for tid, in res: + retList.append(tid) + # return + _logger.debug("getAssigningTask ret:%s" % retList) + return retList + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getAssigningTask : %s %s" % (type,value)) + return [] + + + # query files with map + def queryFilesWithMap(self,map): + comment = ' /* DBProxy.queryFilesWithMap */' + _logger.debug("queryFilesWithMap()") + sql1 = "SELECT PandaID,%s FROM filesTable4" % 
FileSpec.columnNames() + valueL = [] + for key in map.keys(): + if len(valueL)==0: + sql1+= " WHERE %s=" % key + else: + sql1+= " AND %s=" % key + sql1+= "%s" + valueL.append(map[key]) + nTry=3 + for iTry in range(nTry): + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute(sql1+comment, tuple(valueL)) + res = self.cur.fetchall() + _logger.debug("queryFilesWithMap() : %s" % str(res)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # instantiate files + retList = [] + for item in res: + # instantiate dummy JobSpec obj for PandaID + job = JobSpec() + job.PandaID = item[0] + # instantiate file + file = FileSpec() + file.pack(item[1:]) + # set owner + file.setOwner(job) + # append + retList.append(file) + return retList + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("queryFilesWithMap retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("queryFilesWithMap : %s %s" % (type,value)) + return [] + + + # count the number of files with map + def countFilesWithMap(self,map): + comment = ' /* DBProxy.countFilesWithMap */' + sql1 = "SELECT COUNT(*) FROM filesTable4" + valueL = [] + for key in map.keys(): + if len(valueL)==0: + sql1+= " WHERE %s=" % key + else: + sql1+= " AND %s=" % key + sql1+= "%s" + valueL.append(map[key]) + nTry=3 + for iTry in range(nTry): + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + _logger.debug("countFilesWithMap() : %s" % str(map)) + retS = self.cur.execute(sql1+comment, tuple(valueL)) + res = self.cur.fetchone() + _logger.debug("countFilesWithMap() : %s %s" % (retS,str(res))) + # check return + if retS != 1: + raise RuntimeError, 'Invalid return' + nFiles=0 + if res != None: + nFiles=res[0] + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return nFiles + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("countFilesWithMap() retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("countFilesWithMap(%s) : %s %s" % (map,type,value)) + return -1 + + + # update input files and return corresponding PandaIDs + def updateInFilesReturnPandaIDs(self,dataset,status): + comment = ' /* DBProxy.updateInFilesReturnPandaIDs */' + _logger.debug("updateInFilesReturnPandaIDs(%s)" % dataset) + sql0 = "SELECT rowID,PandaID FROM filesTable4 WHERE status<>%s AND dispatchDBlock=%s" + for iTry in range(self.nTry): + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + retS = self.cur.execute(sql0+comment, (status,dataset)) + resS = self.cur.fetchall() + _logger.debug("updateInFilesReturnPandaIDs : retS %s" % retS) + if retS<0 or resS==None or retS!=len(resS): + raise RuntimeError, 'SQL error' + # avoid too long expression + nDiv = 10 + nRow,tmpMod = divmod(len(resS),nDiv) + if tmpMod != 0: + nRow += 1 + # update + retList = [] + for iRow in range(nRow): + rows = [] + pandaIDs = [] + for tmpRowID,tmpPandaID in resS[iRow*nDiv:(iRow+1)*nDiv]: + rows.append(tmpRowID) + if not tmpPandaID in pandaIDs: + pandaIDs.append(tmpPandaID) + # make SQL query + sql1 = "UPDATE filesTable4 SET status=%s WHERE " + for row in rows: + if row != rows[0]: + sql1+= "OR " + sql1+= "rowID=%s " + # update + retU = self.cur.execute(sql1+comment, tuple([status]+rows)) + _logger.debug("updateInFilesReturnPandaIDs : retU %s" % retU) + # append + for tmpPandaID in pandaIDs: + if 
not tmpPandaID in retList: + retList.append(tmpPandaID) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + _logger.debug("updateInFilesReturnPandaIDs : %s" % str(retList)) + return retList + except: + # roll back + self._rollback() + # error report + if iTry+1 < self.nTry: + _logger.debug("updateInFilesReturnPandaIDs retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("updateInFilesReturnPandaIDs : %s %s" % (type, value)) + return [] + + + # update output files and return corresponding PandaIDs + def updateOutFilesReturnPandaIDs(self,dataset): + comment = ' /* DBProxy.updateOutFilesReturnPandaIDs */' + _logger.debug("updateOutFilesReturnPandaIDs(%s)" % dataset) + sql0 = "SELECT rowID,PandaID FROM filesTable4 WHERE destinationDBlock=%s AND status='transferring'" + for iTry in range(self.nTry): + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + retS = self.cur.execute(sql0+comment, (dataset,)) + resS = self.cur.fetchall() + _logger.debug("updateOutFilesReturnPandaIDs : retS %s" % retS) + if retS<0 or resS==None or retS!=len(resS): + raise RuntimeError, 'SQL error' + # avoid too long expression + nDiv = 10 + nRow,tmpMod = divmod(len(resS),nDiv) + if tmpMod != 0: + nRow += 1 + # update + retList = [] + for iRow in range(nRow): + rows = [] + pandaIDs = [] + for tmpRowID,tmpPandaID in resS[iRow*nDiv:(iRow+1)*nDiv]: + rows.append(tmpRowID) + if not tmpPandaID in pandaIDs: + pandaIDs.append(tmpPandaID) + # make SQL query + sql1 = "UPDATE filesTable4 SET status=%s WHERE " + for row in rows: + if row != rows[0]: + sql1+= "OR " + sql1+= "rowID=%s " + # update + retU = self.cur.execute(sql1+comment, tuple(['ready']+rows)) + _logger.debug("updateOutFilesReturnPandaIDs : retU %s" % retU) + # append + for tmpPandaID in pandaIDs: + if not tmpPandaID in retList: + retList.append(tmpPandaID) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + _logger.debug("updateOutFilesReturnPandaIDs : %s" % str(retList)) + return retList + except: + # roll back + self._rollback() + # error report + if iTry+1 < self.nTry: + _logger.debug("updateOutFilesReturnPandaIDs retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("updateOutFilesReturnPandaIDs : %s %s" % (type, value)) + return [] + + + # set GUIDs + def setGUIDs(self,files): + comment = ' /* DBProxy.setGUIDs */' + _logger.debug("setGUIDs(%s)" % files) + sql0 = "UPDATE filesTable4 SET GUID=%s WHERE lfn=%s" + for iTry in range(self.nTry): + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # update + for file in files: + retU = self.cur.execute(sql0+comment, (file['guid'],file['lfn'])) + _logger.debug("setGUIDs : retU %s" % retU) + if retU<0: + raise RuntimeError, 'SQL error' + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + # error report + if iTry+1 < self.nTry: + _logger.debug("setGUIDs retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("setGUIDs : %s %s" % (type, value)) + return False + + + # query PandaID with Datasets + def queryPandaIDwithDataset(self,datasets): + comment = ' /* DBProxy.queryPandaIDwithDataset */' + _logger.debug("queryPandaIDwithDataset(%s)" % datasets) + if len(datasets) == 0: + return [] + # make SQL query + sql1 = "SELECT PandaID 
FROM filesTable4 WHERE " + for dataset in datasets: + if dataset != datasets[0]: + sql1+= "OR " + sql1+= "destinationDBlock='%s' " % dataset + sql1+= "GROUP BY PandaID" + # execute + for iTry in range(self.nTry): + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute(sql1+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retList = [] + for r in res: + retList.append(r[0]) + # return + _logger.debug("queryPandaIDwithDataset : %s" % str(retList)) + return retList + except: + # roll back + self._rollback() + # error report + if iTry+1 < self.nTry: + _logger.debug("queryPandaIDwithDataset retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("queryPandaIDwithDataset : %s %s" % (type, value)) + return [] + + + # query last files in datasets + def queryLastFilesInDataset(self,datasets): + comment = ' /* DBProxy.queryLastFilesInDataset */' + _logger.debug("queryLastFilesInDataset(%s)" % datasets) + if len(datasets) == 0: + return [] + # make SQL query + sql1 = "SELECT MAX(PandaID) FROM filesTable4 WHERE dataset=%s AND type='output'" + sql2 = "SELECT lfn FROM filesTable4 WHERE PandaID=%s AND type='output'" + # execute + try: + retMap = {} + for dataset in datasets: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select PandaID + self.cur.execute(sql1+comment,(dataset,)) + res = self.cur.fetchone() + # found + retList = [] + if res != None: + pandaID = res[0] + # select LFNs + self.cur.execute(sql2+comment,(pandaID,)) + res = self.cur.fetchall() + for r in res: + retList.append(r[0]) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + retMap[dataset] = retList + # return + _logger.debug("queryLastFilesInDataset : %s" % str(retMap)) + return retMap + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("queryLastFilesInDataset : %s %s" % (type, value)) + return {} + + + # query PandaID with filenames + def queryPandaIDwithLFN(self,vlfns): + comment = ' /* DBProxy.queryPandaIDwithLFN */' + _logger.debug("queryPandaIDwithLFN(%s)" % vlfns) + if len(vlfns) == 0: + return [] + # avoid too long expression + nDiv = 15 + nLFN,tmpMod = divmod(len(vlfns),nDiv) + if tmpMod != 0: + nLFN += 1 + # execute + retList = [] + for iLFN in range(nLFN): + lfns = vlfns[iLFN*nDiv:(iLFN+1)*nDiv] + # make SQL query + sql1 = "SELECT PandaID FROM filesTable4 WHERE " + for lfn in lfns: + if lfn != lfns[0]: + sql1+= "OR " + sql1+= "lfn=%s " + sql1+= "GROUP BY PandaID" + # get generic LFNs + gLFNs = [] + for lfn in lfns: + gLFNs.append(re.sub('\.\d+$','',lfn)) + # try + for iTry in range(self.nTry): + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute(sql1+comment, tuple(gLFNs)) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append IDs + for r in res: + if not r[0] in retList: + retList.append(r[0]) + break + except: + # roll back + self._rollback() + # error report + if iTry+1 < self.nTry: + _logger.debug("queryPandaIDwithLFN retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("queryPandaIDwithLFN : %s %s" % (type, value)) + return [] + # return + _logger.debug("queryPandaIDwithLFN : %s" % str(retList)) + return retList + + + # get job statistics + def 
getJobStatistics(self,archived=False,predefined=False): + comment = ' /* DBProxy.getJobStatistics */' + _logger.debug("getJobStatistics()") + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) + sql0 = "SELECT computingSite,jobStatus,COUNT(*) FROM %s WHERE prodSourceLabel in ('managed','rc_test','user','panda','ddm') " + if predefined: + sql0 += "AND relocationFlag=1 " + sql0 += "GROUP BY computingSite,jobStatus" + sqlA = "SELECT computingSite,jobStatus,COUNT(*) FROM jobsArchived4 WHERE modificationTime>'%s' AND prodSourceLabel in ('managed','rc_test','user','panda','ddm') " \ + % (timeLimit.strftime('%Y-%m-%d %H:%M:%S')) + if predefined: + sqlA += "AND relocationFlag=1 " + sqlA += "GROUP BY computingSite,jobStatus" + tables = ['jobsActive4','jobsDefined4'] + if archived: + tables.append('jobsArchived4') + ret = {} + nTry=3 + for iTry in range(nTry): + try: + for table in tables: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + if table != 'jobsArchived4': + self.cur.execute((sql0+comment) % table) + else: + self.cur.execute(sqlA+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for item in res: + if not ret.has_key(item[0]): + ret[item[0]] = {} + if not ret[item[0]].has_key(item[1]): + ret[item[0]][item[1]] = 0 + ret[item[0]][item[1]] += item[2] + # for zero + stateList = ['assigned','activated','running'] + if archived: + stateList += ['finished','failed'] + for site in ret.keys(): + for state in stateList: + if not ret[site].has_key(state): + ret[site][state] = 0 + # return + _logger.debug("getJobStatistics() : %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("getJobStatistics() retry : %s" % iTry) + time.sleep(2) + continue + type, value, traceBack = sys.exc_info() + _logger.error("getJobStatistics : %s %s" % (type, value)) + return {} + + + # get job statistics for brokerage + def getJobStatisticsBrokerage(self): + comment = ' /* DBProxy.getJobStatisticsBrokerage */' + _logger.debug("getJobStatisticsBrokerage()") + sql0 = "SELECT computingSite,jobStatus,processingType,COUNT(*) FROM %s WHERE prodSourceLabel IN ('managed','rc_test','user','panda','ddm') " + sql0 += "GROUP BY computingSite,jobStatus,processingType" + tables = ['jobsActive4','jobsDefined4'] + ret = {} + nTry=3 + for iTry in range(nTry): + try: + for table in tables: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute((sql0+comment) % table) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for computingSite,jobStatus,processingType,count in res: + # add site + if not ret.has_key(computingSite): + ret[computingSite] = {} + # add processingType + if not ret[computingSite].has_key(processingType): + ret[computingSite][processingType] = {} + # add jobStatus + if not ret[computingSite][processingType].has_key(jobStatus): + ret[computingSite][processingType][jobStatus] = count + # for zero + for site,siteVal in ret.iteritems(): + for pType,typeVal in siteVal.iteritems(): + for stateItem in ['assigned','activated','running']: + if not typeVal.has_key(stateItem): + typeVal[stateItem] = 0 + # return + return ret + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("getJobStatisticsBrokerage retry : %s" % iTry) + time.sleep(2) + continue + type, value, traceBack = sys.exc_info() + _logger.error("getJobStatisticsBrokerage : 
%s %s" % (type, value)) + return {} + + + # get computingSite and destinationSE for a dataset + def getDestSE(self,dsname): + comment = ' /* DBProxy.getDestSE */' + _logger.debug("getDestSE(%s)" % dsname) + sql0 = "SELECT PandaID FROM filesTable4 WHERE destinationDBlock='%s' AND status='transferring' LIMIT 1" % dsname + sql1 = "SELECT computingSite,destinationSE FROM jobsActive4 WHERE PandaID=%s" + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute(sql0+comment) + res = self.cur.fetchall() + # get PandaID + pandaID = None + if len(res) != 0: + pandaID = res[0][0] + # get computingSite and destinationSE + destSE = None,None + if pandaID != None: + self.cur.execute((sql1+comment) % pandaID) + res = self.cur.fetchall() + if len(res) != 0: + destSE = res[0] + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + _logger.debug("getDestSE(%s) : %s" % (dsname,str(destSE))) + return destSE + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getDestSE : %s %s" % (type, value)) + return None,None + + + # get destinationDBlockToken for a dataset + def getDestTokens(self,dsname): + comment = ' /* DBProxy.getDestTokens */' + _logger.debug("getDestTokens(%s)" % dsname) + sql0 = "SELECT destinationDBlockToken FROM filesTable4 WHERE destinationDBlock='%s' LIMIT 1" % dsname + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute(sql0+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + retToken = None + if len(res) != 0: + retToken = res[0][0] + # return + _logger.debug("getDestTokens(%s) : %s" % (dsname,retToken)) + return retToken + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getDestTokens : %s %s" % (type, value)) + return None + + + # get the number of job for a user + def getNumberJobsUser(self,dn): + comment = ' /* DBProxy.getNumberJobsUser */' + _logger.debug("getNumberJobsUsers(%s)" % dn) + sql0 = "SELECT COUNT(*) FROM %s WHERE prodUserID='%s' AND prodSourceLabel='user'" + nTry = 1 + nJob = 0 + for iTry in range(nTry): + try: + for table in ('jobsActive4','jobsDefined4'): + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute((sql0+comment) % (table,dn)) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + if len(res) != 0: + nJob += res[0][0] + # return + _logger.debug("getNumberJobsUsers(%s) : %s" % (dn,nJob)) + return nJob + except: + # roll back + self._rollback() + if iTry+1 < nTry: + time.sleep(2) + continue + type, value, traceBack = sys.exc_info() + _logger.error("getNumberJobsUsers : %s %s" % (type, value)) + return 0 + + + # get job statistics for ExtIF + def getJobStatisticsForExtIF(self,sourcetype=None): + comment = ' /* DBProxy.getJobStatisticsForExtIF */' + _logger.debug("getJobStatisticsForExtIF()") + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) + if sourcetype == 'analysis': + sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel in ('user','panda') GROUP BY jobStatus,cloud" + sqlA = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel in ('user','panda') " + else: + sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN ('managed','rc_test') GROUP BY jobStatus,cloud" + sqlA = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE 
prodSourceLabel IN ('managed','rc_test') " + sqlA+= "AND modificationTime>'%s' GROUP BY jobStatus,cloud" % (timeLimit.strftime('%Y-%m-%d %H:%M:%S')) + ret = {} + try: + for table in ('jobsActive4','jobsWaiting4','jobsArchived4','jobsDefined4'): + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + if table != 'jobsArchived4': + self.cur.execute((sql0+comment) % table) + else: + self.cur.execute((sqlA+comment) % table) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # change NULL to US for old jobs + newRes = [] + usMap = {} + for jobStatus,count,cloud in res: + if not cloud in ['US','NULL']: + # append since no conversion is required + newRes.append((jobStatus,count,cloud)) + else: + # sum + if not usMap.has_key(jobStatus): + usMap[jobStatus] = 0 + usMap[jobStatus] += count + # append US counts + for jobStatus,count in usMap.iteritems(): + newRes.append((jobStatus,count,'US')) + # create map + for item in newRes: + # add cloud + if not ret.has_key(item[2]): + ret[item[2]] = {} + # this is needed for auto_increment of InnoDB + if not ret[item[2]].has_key(item[0]): + ret[item[2]][item[0]] = item[1] + # return + _logger.debug("getJobStatisticsForExtIF() : %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getJobStatisticsForExtIF : %s %s" % (type, value)) + return {} + + + # get job statistics per processingType + def getJobStatisticsPerProcessingType(self): + comment = ' /* DBProxy.getJobStatisticsPerProcessingType */' + _logger.debug("getJobStatisticsPerProcessingType()") + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) + sql0 = "SELECT jobStatus,COUNT(*),cloud,processingType FROM %s " + sql0 += "WHERE prodSourceLabel IN ('managed','rc_test') " + sqlT = "AND modificationTime>'%s' " % timeLimit.strftime('%Y-%m-%d %H:%M:%S') + sql1 = "GROUP BY jobStatus,cloud,processingType" + sqlN = sql0 + sql1 + sqlA = sql0 + sqlT + sql1 + ret = {} + try: + for table in ('jobsActive4','jobsWaiting4','jobsArchived4','jobsDefined4'): + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + if table == 'jobsArchived4': + self.cur.execute((sqlA+comment) % table) + else: + self.cur.execute((sqlN+comment) % table) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for jobStatus,count,cloud,processingType in res: + # add cloud + if not ret.has_key(cloud): + ret[cloud] = {} + # add processingType + if not ret[cloud].has_key(processingType): + ret[cloud][processingType] = {} + # this is needed for auto_increment of InnoDB + if not ret[cloud][processingType].has_key(jobStatus): + ret[cloud][processingType][jobStatus] = count + # return + _logger.debug("getJobStatisticsPerProcessingType() : %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getJobStatisticsPerProcessingType : %s %s" % (type, value)) + return {} + + + # get number of analysis jobs per user + def getNUserJobs(self,siteName,nJobs): + comment = ' /* DBProxy.getNUserJobs */' + _logger.debug("getNUserJobs(%s)" % siteName) + sql0 = "SELECT prodUserID FROM jobsActive4 WHERE jobStatus='activated' AND prodSourceLabel in ('user','panda') AND computingSite='%s' ORDER BY currentPriority DESC LIMIT %s" % (siteName,nJobs) + ret = {} + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select 
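[Editor's sketch, not part of the patch] getJobStatistics and its siblings around this point all fold GROUP BY rows from several job tables into a nested site -> jobStatus -> count map and then zero-fill the states callers expect to find. Condensed into one hypothetical helper (build_stats_map is an invented name; setdefault replaces the has_key checks used in the patch), the aggregation looks like:

def build_stats_map(rows):
    # rows come from "SELECT computingSite,jobStatus,COUNT(*) ... GROUP BY ..."
    # queries, possibly accumulated over several job tables
    ret = {}
    for site, status, count in rows:
        ret.setdefault(site, {})
        ret[site].setdefault(status, 0)
        ret[site][status] += count
    # zero-fill the states callers always expect to be present
    for site in ret:
        for state in ('assigned', 'activated', 'running'):
            ret[site].setdefault(state, 0)
    return ret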
+ self.cur.execute(sql0+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for prodUserID, in res: + if not ret.has_key(prodUserID): + ret[prodUserID] = 0 + ret[prodUserID] += 1 + # return + _logger.debug("getNUserJobs() : %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getNUserJobs : %s %s" % (type, value)) + return {} + + + # get number of activated analysis jobs + def getNAnalysisJobs(self,nProcesses): + comment = ' /* DBProxy.getNAnalysisJobs */' + _logger.debug("getNAnalysisJobs(%s)" % nProcesses) + sql0 = "SELECT computingSite,COUNT(*) FROM jobsActive4 WHERE jobStatus='activated' AND (prodSourceLabel='user' OR prodSourceLabel='panda') GROUP BY computingSite" + ret = {} + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute(sql0+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for item in res: + ret[item[0]] = float(item[1])/nProcesses + # return + _logger.debug("getNAnalysisJobs() : %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getNAnalysisJobs : %s %s" % (type, value)) + return {} + + + # count pilot requests + def countPilotRequests(self,ids,prodSourceLabel='None'): + comment = ' /* DBProxy.countPilotRequests */' + # prodSourceLabel + if prodSourceLabel=='user': + criteria = " AND MESSAGE REGEXP 'user$'" + else: + criteria = " AND MESSAGE REGEXP 'None$'" + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) + ret = {} + try: + for siteID in ids: + # begin transaction + self.cur.execute("START TRANSACTION") + # select + sql0 = "SELECT COUNT(*) FROM PANDALOG WHERE Type='getJob' AND BINTIME>'%s'" % \ + timeLimit.strftime('%Y-%m-%d %H:%M:%S') + sql0+= " AND MESSAGE REGEXP '%s'" % siteID + sql0+= criteria + self.cur.execute(sql0+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + ret[siteID] = res[0][0] + # return + _logger.debug("countPilotRequests() : %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("countPilotRequests : %s %s" % (type, value)) + # for zero + for siteID in ids: + if not ret.has_key(siteID): + ret[siteID]=0 + return ret + + + # generate pilot token + def genPilotToken(self,schedulerhost,scheduleruser,schedulerid): + comment = ' /* DBProxy.genPilotToken */' + try: + _logger.debug("genPilotToken(%s,%s,%s)" % (schedulerhost,scheduleruser,schedulerid)) + token = commands.getoutput('uuidgen') + timeNow = datetime.datetime.utcnow() + timeExp = timeNow + datetime.timedelta(days=4) + sql = "INSERT INTO pilottoken (token,schedulerhost,scheduleruser,schedulerid,created,expires) " + sql += "VALUES (%s,%s,%s,%s,%s,%s)" + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # execute + self.cur.execute(sql+comment,(token,schedulerhost,scheduleruser,schedulerid, + timeNow.strftime('%Y-%m-%d %H:%M:%S'), + timeExp.strftime('%Y-%m-%d %H:%M:%S'))) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + retVal = "token=%s,created=%s,expires=%s" % (token,timeNow.strftime('%Y-%m-%d %H:%M:%S'), + timeExp.strftime('%Y-%m-%d %H:%M:%S')) + _logger.debug("genPilotToken -> %s" % retVal) 
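[Editor's sketch, not part of the patch] The string genPilotToken hands back packs the token together with its creation time and a four-day expiry. A stand-alone sketch producing the same shape, with uuid.uuid4() standing in for the external 'uuidgen' call (an assumption, not what the patch does) and without the pilottoken INSERT; make_pilot_token is an invented name:

import uuid
import datetime

def make_pilot_token(valid_days=4):
    # same "token=...,created=...,expires=..." layout as genPilotToken returns
    token = str(uuid.uuid4())
    created = datetime.datetime.utcnow()
    expires = created + datetime.timedelta(days=valid_days)
    fmt = '%Y-%m-%d %H:%M:%S'
    return "token=%s,created=%s,expires=%s" % (token,
                                               created.strftime(fmt),
                                               expires.strftime(fmt))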
+ return retVal + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("genPilotToken : %s %s" % (type, value)) + return None + + + # get list of scheduler users + def getListSchedUsers(self): + comment = ' /* DBProxy.getListSchedUsers */' + try: + _logger.debug("getListSchedUsers") + sql = "SELECT token,scheduleruser FROM pilottoken WHERE expires>UTC_TIMESTAMP()" + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # execute + self.cur.execute(sql+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + retVal = {} + for token,scheduleruser in res: + retVal[token] = scheduleruser + _logger.debug("getListSchedUsers->%s" % str(retVal)) + return retVal + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getListSchedUsers : %s %s" % (type, value)) + return {} + + + # wake up connection + def wakeUp(self): + for iTry in range(5): + try: + # check if the connection is working + self.conn.ping() + return + except: + type, value, traceBack = sys.exc_info() + _logger.debug("wakeUp %d : %s %s" % (iTry,type,value)) + # wait for reconnection + time.sleep(1) + self.connect(reconnect=True) + + + # commit + def _commit(self): + try: + self.conn.commit() + return True + except: + _logger.error("commit error") + return False + + + # rollback + def _rollback(self): + try: + self.conn.rollback() + return True + except: + _logger.error("rollback error") + return False + diff --git a/current/pandaserver/taskbuffer/DBProxyPool.py b/current/pandaserver/taskbuffer/DBProxyPool.py new file mode 100755 index 000000000..53aed84bd --- /dev/null +++ b/current/pandaserver/taskbuffer/DBProxyPool.py @@ -0,0 +1,88 @@ +""" +pool for DBProxies + +""" + +import inspect +import Queue +import OraDBProxy as DBProxy +import os +import time +import random +from threading import Lock +from config import panda_config +from taskbuffer.ConBridge import ConBridge +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('DBProxyPool') + +class DBProxyPool: + + def __init__(self,dbhost,dbpasswd,nConnection,useTimeout=False): + # crate lock for callers + self.lock = Lock() + self.callers = [] + # create Proxies + _logger.debug("init") + self.proxyList = Queue.Queue(nConnection) + for i in range(nConnection): + _logger.debug("connect -> %s " % i) + if useTimeout and hasattr(panda_config,'usedbtimeout') and \ + panda_config.usedbtimeout == True: + proxy = ConBridge() + else: + proxy = DBProxy.DBProxy() + iTry = 0 + while True: + if proxy.connect(dbhost,dbpasswd,dbtimeout=60): + break + iTry += 1 + _logger.debug("failed -> %s : try %s" % (i,iTry)) + time.sleep(random.randint(60,90)) + self.proxyList.put(proxy) + time.sleep(1) + # get PID + self.pid = os.getpid() + _logger.debug("ready") + + # return a free proxy. 
this method blocks until a proxy is available + def getProxy(self): + """ + # get caller + caller = inspect.stack()[1][3] + _logger.debug("PID=%s %s getting proxy used by %s" % (self.pid,caller,str(self.callers))) + """ + # get proxy + proxy = self.proxyList.get() + """ + # lock + self.lock.acquire() + # append + self.callers.append(caller) + # release + self.lock.release() + _logger.debug("PID=%s %s got proxy used by %s" % (self.pid,caller,str(self.callers))) + """ + # wake up connection + proxy.wakeUp() + # return + return proxy + + # put back a proxy + def putProxy(self,proxy): + """ + # get caller + caller = inspect.stack()[1][3] + _logger.debug("PID=%s %s releasing. used by %s" % (self.pid,caller,str(self.callers))) + """ + self.proxyList.put(proxy) + """ + # lock + self.lock.acquire() + # append + self.callers.remove(caller) + # release + self.lock.release() + _logger.debug("PID=%s %s released. used by %s" % (self.pid,caller,str(self.callers))) + """ diff --git a/current/pandaserver/taskbuffer/DatasetSpec.py b/current/pandaserver/taskbuffer/DatasetSpec.py new file mode 100755 index 000000000..815b98a59 --- /dev/null +++ b/current/pandaserver/taskbuffer/DatasetSpec.py @@ -0,0 +1,118 @@ +""" +dataset specification + +""" + +class DatasetSpec(object): + # attributes + _attributes = ('vuid','name','version','type','status','numberfiles','currentfiles','creationdate', + 'modificationdate','MoverID','transferStatus','subType') + + # attributes which have 0 by default + _zeroAttrs = ('MoverID','transferStatus') + + + + # constructor + def __init__(self): + # install attributes + for attr in self._attributes: + setattr(self,attr,None) + + + # override __getattribute__ for SQL + def __getattribute__(self,name): + ret = object.__getattribute__(self,name) + if ret == None: + return "NULL" + return ret + + + # return a tuple of values + def values(self): + ret = [] + for attr in self._attributes: + val = getattr(self,attr) + ret.append(val) + return tuple(ret) + + + # return map of values + def valuesMap(self): + ret = {} + for attr in self._attributes: + val = getattr(self,attr) + if val == 'NULL': + if attr in self._zeroAttrs: + val = 0 + else: + val = None + ret[':%s' % attr] = val + return ret + + + # pack tuple into DatasetSpec + def pack(self,values): + for i in range(len(self._attributes)): + attr= self._attributes[i] + val = values[i] + setattr(self,attr,val) + + + # return column names for INSERT + def columnNames(cls): + ret = "" + for attr in cls._attributes: + if ret != "": + ret += ',' + ret += attr + return ret + columnNames = classmethod(columnNames) + + + # return expression of values for INSERT + def valuesExpression(cls): + ret = "VALUES(" + for attr in cls._attributes: + ret += "%s" + if attr != cls._attributes[len(cls._attributes)-1]: + ret += "," + ret += ")" + return ret + valuesExpression = classmethod(valuesExpression) + + + # return expression of bind values for INSERT + def bindValuesExpression(cls): + ret = "VALUES(" + for attr in cls._attributes: + ret += ":%s," % attr + ret = ret[:-1] + ret += ")" + return ret + bindValuesExpression = classmethod(bindValuesExpression) + + + # return an expression for UPDATE + def updateExpression(cls): + ret = "" + for attr in cls._attributes: + ret = ret + attr + "=%s" + if attr != cls._attributes[len(cls._attributes)-1]: + ret += "," + return ret + updateExpression = classmethod(updateExpression) + + + # return an expression of bind variables for UPDATE + def bindUpdateExpression(cls): + ret = "" + for attr in cls._attributes: + 
ret += '%s=:%s,' % (attr,attr) + ret = ret[:-1] + return ret + bindUpdateExpression = classmethod(bindUpdateExpression) + + + + diff --git a/current/pandaserver/taskbuffer/ErrorCode.py b/current/pandaserver/taskbuffer/ErrorCode.py new file mode 100755 index 000000000..08f72b116 --- /dev/null +++ b/current/pandaserver/taskbuffer/ErrorCode.py @@ -0,0 +1,37 @@ +############## errror code + +# killed +EC_Kill = 100 + +# transfer timeout +EC_Transfer = 101 + +# expire +EC_Expire = 102 + +# aborted +EC_Aborted = 103 + +# wait timeout +EC_WaitTimeout = 104 + +# reassigned by rebrokeage +EC_Reassigned = 105 + +# reassigned by server-side retry +EC_Retried = 106 + +# retried by pilot +EC_PilotRetried = 107 + +# lost file (=dataservice.ErrorCode.EC_LostFile) +EC_LostFile = 110 + +# file not found +class EC_NotFound: + pass + +# file relocated +class EC_Redirect: + def __init__(self,url): + self.url = url diff --git a/current/pandaserver/taskbuffer/FileSpec.py b/current/pandaserver/taskbuffer/FileSpec.py new file mode 100755 index 000000000..209b2ed65 --- /dev/null +++ b/current/pandaserver/taskbuffer/FileSpec.py @@ -0,0 +1,213 @@ +""" +file specification + +""" + + +class FileSpec(object): + # attributes + _attributes = ('row_ID','PandaID','GUID','lfn','type','dataset','status','prodDBlock', + 'prodDBlockToken','dispatchDBlock','dispatchDBlockToken','destinationDBlock', + 'destinationDBlockToken','destinationSE','fsize','md5sum','checksum','scope') + # slots + __slots__ = _attributes+('_owner','_changedAttrs','_oldPandaID') + # attributes which have 0 by default + _zeroAttrs = ('fsize',) + # mapping between sequence and attr + _seqAttrMap = {'row_ID':'ATLAS_PANDA.FILESTABLE4_ROW_ID_SEQ.nextval'} + + + # constructor + def __init__(self): + # install attributes + for attr in self._attributes: + object.__setattr__(self,attr,None) + # set owner to synchronize PandaID + object.__setattr__(self,'_owner',None) + # map of changed attributes + object.__setattr__(self,'_changedAttrs',{}) + # old PandaID + object.__setattr__(self,'_oldPandaID','NULL') + + + # override __getattribute__ for SQL and PandaID + def __getattribute__(self,name): + # PandaID + if name == 'PandaID': + if self._owner == None: + return 'NULL' + return self._owner.PandaID + # others + ret = object.__getattribute__(self,name) + if ret == None: + return "NULL" + return ret + + + # override __setattr__ to collecte the changed attributes + def __setattr__(self,name,value): + oldVal = getattr(self,name) + object.__setattr__(self,name,value) + newVal = getattr(self,name) + # collect changed attributes + if oldVal != newVal: + self._changedAttrs[name] = value + + + # set owner + def setOwner(self,owner): + self._owner = owner + self._oldPandaID = self.PandaID + + + # reset changed attribute list + def resetChangedList(self): + self._oldPandaID = self.PandaID + object.__setattr__(self,'_changedAttrs',{}) + + + # return a tuple of values + def values(self): + ret = [] + for attr in self._attributes: + val = getattr(self,attr) + ret.append(val) + return tuple(ret) + + + # return map of values + def valuesMap(self,useSeq=False,onlyChanged=False): + ret = {} + for attr in self._attributes: + if useSeq and self._seqAttrMap.has_key(attr): + continue + if onlyChanged: + if attr == 'PandaID': + if self.PandaID == self._oldPandaID: + continue + elif not self._changedAttrs.has_key(attr): + continue + val = getattr(self,attr) + if val == 'NULL': + if attr in self._zeroAttrs: + val = 0 + else: + val = None + ret[':%s' % attr] = val + return ret + + + # pack 
tuple into FileSpec + def pack(self,values): + for i in range(len(self._attributes)): + attr= self._attributes[i] + val = values[i] + object.__setattr__(self,attr,val) + + + # return state values to be pickled + def __getstate__(self): + state = [] + for attr in self._attributes: + val = getattr(self,attr) + state.append(val) + # append owner info + state.append(self._owner) + return state + + + # restore state from the unpickled state values + def __setstate__(self,state): + pandaID = 'NULL' + for i in range(len(self._attributes)): + if i+1 < len(state): + object.__setattr__(self,self._attributes[i],state[i]) + else: + object.__setattr__(self,self._attributes[i],'NULL') + if self._attributes[i] == 'PandaID': + pandaID = state[i] + object.__setattr__(self,'_owner',state[-1]) + object.__setattr__(self,'_changedAttrs',{}) + object.__setattr__(self,'_oldPandaID',pandaID) + + + # return column names for INSERT + def columnNames(cls,withMod=False): + ret = "" + for attr in cls._attributes: + if ret != "": + ret += ',' + ret += attr + # add modificationTime + if withMod: + ret += ",modificationTime" + return ret + columnNames = classmethod(columnNames) + + + # return expression of values for INSERT + def valuesExpression(cls): + ret = "VALUES(" + for attr in cls._attributes: + ret += "%s" + if attr != cls._attributes[len(cls._attributes)-1]: + ret += "," + ret += ")" + return ret + valuesExpression = classmethod(valuesExpression) + + + # return expression of bind variables for INSERT + def bindValuesExpression(cls,useSeq=False,withMod=False): + ret = "VALUES(" + for attr in cls._attributes: + if useSeq and cls._seqAttrMap.has_key(attr): + ret += "%s," % cls._seqAttrMap[attr] + else: + ret += ":%s," % attr + ret = ret[:-1] + # add modificationTime + if withMod: + ret += ",:modificationTime" + ret += ")" + return ret + bindValuesExpression = classmethod(bindValuesExpression) + + + # return an expression for UPDATE + def updateExpression(cls): + ret = "" + for attr in cls._attributes: + ret = ret + attr + "=%s" + if attr != cls._attributes[len(cls._attributes)-1]: + ret += "," + return ret + updateExpression = classmethod(updateExpression) + + + # return an expression of bind variables for UPDATE + def bindUpdateExpression(cls): + ret = "" + for attr in cls._attributes: + ret += '%s=:%s,' % (attr,attr) + ret = ret[:-1] + ret += ' ' + return ret + bindUpdateExpression = classmethod(bindUpdateExpression) + + + # return an expression of bind variables for UPDATE to update only changed attributes + def bindUpdateChangesExpression(self): + ret = "" + for attr in self._attributes: + if self._changedAttrs.has_key(attr) or \ + (attr == 'PandaID' and self.PandaID != self._oldPandaID): + ret += '%s=:%s,' % (attr,attr) + ret = ret[:-1] + ret += ' ' + return ret + + + + + diff --git a/current/pandaserver/taskbuffer/Initializer.py b/current/pandaserver/taskbuffer/Initializer.py new file mode 100644 index 000000000..a9c158b43 --- /dev/null +++ b/current/pandaserver/taskbuffer/Initializer.py @@ -0,0 +1,46 @@ +import sys +import cx_Oracle +from threading import Lock + +from config import panda_config + +# logger +from pandalogger.PandaLogger import PandaLogger +_logger = PandaLogger().getLogger('Initializer') + +# initialize cx_Oracle using dummy connection to avoid "Unable to acquire Oracle environment handle" +class Initializer: + def __init__(self): + self.lock = Lock() + self.first = True + + def init(self): + _logger.debug("init new=%s" % self.first) + # do nothing when nDBConnection is 0 + if 
panda_config.nDBConnection == 0: + return True + # lock + self.lock.acquire() + if self.first: + self.first = False + try: + _logger.debug("connect") + # connect + conn = cx_Oracle.connect(dsn=panda_config.dbhost,user=panda_config.dbuser, + password=panda_config.dbpasswd,threaded=True) + # close + conn.close() + _logger.debug("done") + except: + self.lock.release() + type, value, traceBack = sys.exc_info() + _logger.error("connect : %s %s" % (type,value)) + return False + # release + self.lock.release() + return True + + +# singleton +initializer = Initializer() +del Initializer diff --git a/current/pandaserver/taskbuffer/JobSpec.py b/current/pandaserver/taskbuffer/JobSpec.py new file mode 100755 index 000000000..7eaa764ab --- /dev/null +++ b/current/pandaserver/taskbuffer/JobSpec.py @@ -0,0 +1,239 @@ +""" +job specification + +""" + +class JobSpec(object): + # attributes + _attributes = ('PandaID','jobDefinitionID','schedulerID','pilotID','creationTime','creationHost', + 'modificationTime','modificationHost','AtlasRelease','transformation','homepackage', + 'prodSeriesLabel','prodSourceLabel','prodUserID','assignedPriority','currentPriority', + 'attemptNr','maxAttempt','jobStatus','jobName','maxCpuCount','maxCpuUnit','maxDiskCount', + 'maxDiskUnit','ipConnectivity','minRamCount','minRamUnit','startTime','endTime', + 'cpuConsumptionTime','cpuConsumptionUnit','commandToPilot','transExitCode','pilotErrorCode', + 'pilotErrorDiag','exeErrorCode','exeErrorDiag','supErrorCode','supErrorDiag', + 'ddmErrorCode','ddmErrorDiag','brokerageErrorCode','brokerageErrorDiag', + 'jobDispatcherErrorCode','jobDispatcherErrorDiag','taskBufferErrorCode', + 'taskBufferErrorDiag','computingSite','computingElement','jobParameters', + 'metadata','prodDBlock','dispatchDBlock','destinationDBlock','destinationSE', + 'nEvents','grid','cloud','cpuConversion','sourceSite','destinationSite','transferType', + 'taskID','cmtConfig','stateChangeTime','prodDBUpdateTime','lockedby','relocationFlag', + 'jobExecutionID','VO','pilotTiming','workingGroup','processingType','prodUserName', + 'nInputFiles','countryGroup','batchID','parentID','specialHandling','jobsetID', + 'coreCount','nInputDataFiles','inputFileType','inputFileProject','inputFileBytes', + 'nOutputDataFiles','outputFileBytes','jobMetrics') + # slots + __slots__ = _attributes+('Files','_changedAttrs') + # attributes which have 0 by default + _zeroAttrs = ('assignedPriority','currentPriority','attemptNr','maxAttempt','maxCpuCount','maxDiskCount', + 'minRamCount','cpuConsumptionTime','pilotErrorCode','exeErrorCode','supErrorCode','ddmErrorCode', + 'brokerageErrorCode','jobDispatcherErrorCode','taskBufferErrorCode','nEvents','relocationFlag', + 'jobExecutionID','nOutputDataFiles','outputFileBytes') + # attribute to be suppressed. 
They are in another table + _suppAttrs = ('jobParameters','metadata') + # mapping between sequence and attr + _seqAttrMap = {'PandaID':'ATLAS_PANDA.JOBSDEFINED4_PANDAID_SEQ.nextval'} + # limit length + _limitLength = {'ddmErrorDiag' : 500, + 'taskBufferErrorDiag' : 300, + 'jobDispatcherErrorDiag' : 250, + 'brokerageErrorDiag' : 250, + 'pilotErrorDiag' : 500, + 'exeErrorDiag' : 500, + } + + + # constructor + def __init__(self): + # install attributes + for attr in self._attributes: + object.__setattr__(self,attr,None) + # files list + object.__setattr__(self,'Files',[]) + # map of changed attributes + object.__setattr__(self,'_changedAttrs',{}) + + + # override __getattribute__ for SQL + def __getattribute__(self,name): + ret = object.__getattribute__(self,name) + if ret == None: + return "NULL" + return ret + + + # override __setattr__ to collecte the changed attributes + def __setattr__(self,name,value): + oldVal = getattr(self,name) + object.__setattr__(self,name,value) + newVal = getattr(self,name) + # collect changed attributes + if oldVal != newVal and not name in self._suppAttrs: + self._changedAttrs[name] = value + + + # reset changed attribute list + def resetChangedList(self): + object.__setattr__(self,'_changedAttrs',{}) + + + # add File to files list + def addFile(self,file): + # set owner + file.setOwner(self) + # append + self.Files.append(file) + + + # pack tuple into JobSpec + def pack(self,values): + for i in range(len(self._attributes)): + attr= self._attributes[i] + val = values[i] + object.__setattr__(self,attr,val) + + + # return a tuple of values + def values(self): + ret = [] + for attr in self._attributes: + val = getattr(self,attr) + ret.append(val) + return tuple(ret) + + + # return map of values + def valuesMap(self,useSeq=False,onlyChanged=False): + ret = {} + for attr in self._attributes: + if useSeq and self._seqAttrMap.has_key(attr): + continue + if onlyChanged: + if not self._changedAttrs.has_key(attr): + continue + val = getattr(self,attr) + if val == 'NULL': + if attr in self._zeroAttrs: + val = 0 + else: + val = None + # jobParameters/metadata go to another table + if attr in self._suppAttrs: + val = None + # truncate too long values + if self._limitLength.has_key(attr): + if val != None: + val = val[:self._limitLength[attr]] + ret[':%s' % attr] = val + return ret + + + # return state values to be pickled + def __getstate__(self): + state = [] + for attr in self._attributes: + val = getattr(self,attr) + state.append(val) + # append File info + state.append(self.Files) + return state + + + # restore state from the unpickled state values + def __setstate__(self,state): + for i in range(len(self._attributes)): + # schema evolution is supported only when adding attributes + if i+1 < len(state): + object.__setattr__(self,self._attributes[i],state[i]) + else: + object.__setattr__(self,self._attributes[i],'NULL') + object.__setattr__(self,'Files',state[-1]) + object.__setattr__(self,'_changedAttrs',{}) + + + # return column names for INSERT or full SELECT + def columnNames(cls): + ret = "" + for attr in cls._attributes: + if ret != "": + ret += ',' + ret += attr + return ret + columnNames = classmethod(columnNames) + + + # return expression of values for INSERT + def valuesExpression(cls): + ret = "VALUES(" + for attr in cls._attributes: + ret += "%s" + if attr != cls._attributes[len(cls._attributes)-1]: + ret += "," + ret += ")" + return ret + valuesExpression = classmethod(valuesExpression) + + + # return expression of bind values for INSERT + def 
bindValuesExpression(cls,useSeq=False): + ret = "VALUES(" + for attr in cls._attributes: + if useSeq and cls._seqAttrMap.has_key(attr): + ret += "%s," % cls._seqAttrMap[attr] + else: + ret += ":%s," % attr + ret = ret[:-1] + ret += ")" + return ret + bindValuesExpression = classmethod(bindValuesExpression) + + + # return an expression for UPDATE + def updateExpression(cls): + ret = "" + for attr in cls._attributes: + ret = ret + attr + "=%s" + if attr != cls._attributes[len(cls._attributes)-1]: + ret += "," + return ret + updateExpression = classmethod(updateExpression) + + + # return an expression of bind variables for UPDATE + def bindUpdateExpression(cls): + ret = "" + for attr in cls._attributes: + ret += '%s=:%s,' % (attr,attr) + ret = ret[:-1] + ret += ' ' + return ret + bindUpdateExpression = classmethod(bindUpdateExpression) + + + # comparison function for sort + def compFunc(cls,a,b): + iPandaID = list(cls._attributes).index('PandaID') + iPriority = list(cls._attributes).index('currentPriority') + if a[iPriority] > b[iPriority]: + return -1 + elif a[iPriority] < b[iPriority]: + return 1 + else: + if a[iPandaID] > b[iPandaID]: + return 1 + elif a[iPandaID] < b[iPandaID]: + return -1 + else: + return 0 + compFunc = classmethod(compFunc) + + + # return an expression of bind variables for UPDATE to update only changed attributes + def bindUpdateChangesExpression(self): + ret = "" + for attr in self._attributes: + if self._changedAttrs.has_key(attr): + ret += '%s=:%s,' % (attr,attr) + ret = ret[:-1] + ret += ' ' + return ret diff --git a/current/pandaserver/taskbuffer/LogDBProxy.py b/current/pandaserver/taskbuffer/LogDBProxy.py new file mode 100755 index 000000000..e32ef22fc --- /dev/null +++ b/current/pandaserver/taskbuffer/LogDBProxy.py @@ -0,0 +1,790 @@ +""" +proxy for log database connection + +""" + +import re +import sys +import time +import datetime + +import MySQLdb + +from pandalogger.PandaLogger import PandaLogger +from config import panda_config + +import SiteSpec +import CloudSpec + +from JobSpec import JobSpec +from FileSpec import FileSpec + +# logger +_logger = PandaLogger().getLogger('LogDBProxy') + +# proxy +class LogDBProxy: + + # constructor + def __init__(self): + # connection object + self.conn = None + # cursor object + self.cur = None + + # connect to DB + def connect(self,dbhost=panda_config.logdbhost,dbpasswd=panda_config.logdbpasswd, + dbuser=panda_config.logdbuser,dbname=panda_config.logdbname,reconnect=False): + # keep parameters for reconnect + if not reconnect: + self.dbhost = dbhost + self.dbpasswd = dbpasswd + self.dbuser = dbuser + self.dbname = dbname + # connect + try: + self.conn = MySQLdb.connect(host=self.dbhost,user=self.dbuser, + passwd=self.dbpasswd,db=self.dbname) + self.cur=self.conn.cursor() + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("connect : %s %s" % (type,value)) + # roll back + self._rollback() + return False + + + # query an SQL + def querySQL(self,sql): + try: + # begin transaction + self.cur.execute("START TRANSACTION") + self.cur.execute(sql) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return res + except: + type, value, traceBack = sys.exc_info() + _logger.error("querySQL : %s %s" % (type,value)) + return None + + + # get site data + def getCurrentSiteData(self): + _logger.debug("getCurrentSiteData") + sql = "SELECT SITE,getJob,updateJob FROM SiteData WHERE FLAG='production' and HOURS=3" + try: + # set autocommit on + 
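[Editor's sketch, not part of the patch] The Spec classes above (DatasetSpec, FileSpec, JobSpec) are thin attribute containers: their class methods emit the column lists and bind-variable fragments used to assemble INSERT/UPDATE statements, and __setattr__ records which attributes changed so bindUpdateChangesExpression can update only those columns. A stripped-down toy (MiniSpec and its three attributes are made up; Python 2 like the patch) showing what the helpers return:

class MiniSpec(object):
    # toy version of the *Spec classes, only to show what the SQL-fragment
    # helpers evaluate to and how changed attributes are tracked
    _attributes = ('PandaID', 'jobStatus', 'computingSite')

    def __init__(self):
        object.__setattr__(self, '_changedAttrs', {})
        for attr in self._attributes:
            object.__setattr__(self, attr, None)

    def __setattr__(self, name, value):
        # remember which attributes were modified after construction
        if getattr(self, name) != value:
            self._changedAttrs[name] = value
        object.__setattr__(self, name, value)

    @classmethod
    def columnNames(cls):
        return ','.join(cls._attributes)

    @classmethod
    def bindValuesExpression(cls):
        return 'VALUES(' + ','.join([':%s' % a for a in cls._attributes]) + ')'

    def bindUpdateChangesExpression(self):
        return ','.join(['%s=:%s' % (a, a) for a in self._attributes
                         if a in self._changedAttrs])

spec = MiniSpec()
spec.jobStatus = 'activated'
print MiniSpec.columnNames()              # PandaID,jobStatus,computingSite
print MiniSpec.bindValuesExpression()     # VALUES(:PandaID,:jobStatus,:computingSite)
print spec.bindUpdateChangesExpression()  # jobStatus=:jobStatus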
self.cur.execute("SET AUTOCOMMIT=1") + # select + self.cur.execute(sql) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + ret = {} + for item in res: + ret[item[0]] = {'getJob':item[1],'updateJob':item[2]} + _logger.debug(ret) + return ret + except: + type, value, traceBack = sys.exc_info() + _logger.error("getCurrentSiteData : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # get list of site + def getSiteList(self): + _logger.debug("getSiteList start") + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = "SELECT siteid,nickname FROM schedconfig WHERE siteid<>''" + self.cur.execute(sql) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retMap = {} + if res != None and len(res) != 0: + for siteid,nickname in res: + # append + if not retMap.has_key(siteid): + retMap[siteid] = [] + retMap[siteid].append(nickname) + _logger.debug(retMap) + _logger.debug("getSiteList done") + return retMap + except: + type, value, traceBack = sys.exc_info() + _logger.error("getSiteList : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # get site info + def getSiteInfo(self): + _logger.debug("getSiteInfo start") + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = "SELECT nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory," + sql+= "maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec," + sql+= "priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue," + sql+= "validatedreleases,accesscontrol " + sql+= "FROM schedconfig WHERE siteid<>''" + self.cur.execute(sql) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retList = {} + if resList != None: + # loop over all results + for res in resList: + nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory,\ + maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec,\ + priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue,\ + validatedreleases,accesscontrol \ + = res + # instantiate SiteSpec + ret = SiteSpec.SiteSpec() + ret.sitename = siteid + ret.nickname = nickname + ret.dq2url = dq2url + ret.cloud = cloud + ret.ddm = ddm.split(',')[0] + ret.lfchost = lfchost + ret.se = se + ret.gatekeeper = gatekeeper + ret.memory = memory + ret.maxtime = maxtime + ret.status = status + ret.space = space + ret.glexec = glexec + ret.queue = queue + ret.localqueue = localqueue + ret.accesscontrol = accesscontrol + # job recoverty + ret.retry = True + if retry == 'FALSE': + ret.retry = False + # convert releases to list + ret.releases = [] + for tmpRel in releases.split('|'): + # remove white space + tmpRel = tmpRel.strip() + if tmpRel != '': + ret.releases.append(tmpRel) + # convert validatedreleases to list + ret.validatedreleases = [] + for tmpRel in validatedreleases.split('|'): + # remove white space + tmpRel = tmpRel.strip() + if tmpRel != '': + ret.validatedreleases.append(tmpRel) + # cmtconfig + # add slc3 if the column is empty + ret.cmtconfig = ['i686-slc3-gcc323-opt'] + if cmtconfig != '': + ret.cmtconfig.append(cmtconfig) + # map between token and DQ2 ID + ret.setokens = {} + tmpTokens = setokens.split(',') + for idxToken,tmpddmID in enumerate(ddm.split(',')): + if idxToken < len(tmpTokens): + ret.setokens[tmpTokens[idxToken]] = tmpddmID + # expand [] in se path + match = re.search('([^\[]*)\[([^\]]+)\](.*)',seprodpath) + if match != 
None and len(match.groups()) == 3: + seprodpath = '' + for tmpBody in match.group(2).split(','): + seprodpath += '%s%s%s,' % (match.group(1),tmpBody,match.group(3)) + seprodpath = seprodpath[:-1] + # map between token and se path + ret.seprodpath = {} + tmpTokens = setokens.split(',') + for idxToken,tmpSePath in enumerate(seprodpath.split(',')): + if idxToken < len(tmpTokens): + ret.seprodpath[tmpTokens[idxToken]] = tmpSePath + # VO related params + ret.priorityoffset = priorityoffset + ret.allowedgroups = allowedgroups + ret.defaulttoken = defaulttoken + # append + retList[ret.nickname] = ret + _logger.debug("getSiteInfo done") + return retList + except: + type, value, traceBack = sys.exc_info() + _logger.error("getSiteInfo : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # get cloud list + def getCloudList(self): + _logger.debug("getCloudList start") + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = "SELECT name,tier1,tier1SE,relocation,weight,server,status,transtimelo," + sql += "transtimehi,waittime,validation,mcshare,countries,fasttrack,nprestage," + sql += "pilotowners " + sql+= "FROM cloudconfig" + self.cur.execute(sql) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + ret = {} + if res != None and len(res) != 0: + for name,tier1,tier1SE,relocation,weight,server,status,transtimelo,transtimehi,\ + waittime,validation,mcshare,countries,fasttrack,nprestage,pilotowners in res: + # instantiate CloudSpec + tmpC = CloudSpec.CloudSpec() + tmpC.name = name + tmpC.tier1 = tier1 + tmpC.tier1SE = re.sub(' ','',tier1SE).split(',') + tmpC.relocation = relocation + tmpC.weight = weight + tmpC.server = server + tmpC.status = status + tmpC.transtimelo = transtimelo + tmpC.transtimehi = transtimehi + tmpC.waittime = waittime + tmpC.validation = validation + tmpC.mcshare = mcshare + tmpC.countries = countries + tmpC.fasttrack = fasttrack + tmpC.nprestage = nprestage + tmpC.pilotowners = pilotowners + # append + ret[name] = tmpC + _logger.debug("getCloudList done") + return ret + except: + type, value, traceBack = sys.exc_info() + _logger.error("getCloudList : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # extract name from DN + def cleanUserID(self, id): + try: + up = re.compile('/(DC|O|OU|C|L)=[^\/]+') + username = up.sub('', id) + up2 = re.compile('/CN=[0-9]+') + username = up2.sub('', username) + up3 = re.compile(' [0-9]+') + username = up3.sub('', username) + up4 = re.compile('_[0-9]+') + username = up4.sub('', username) + username = username.replace('/CN=proxy','') + username = username.replace('/CN=limited proxy','') + username = username.replace('limited proxy','') + pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)') + mat = pat.match(username) + if mat: + username = mat.group(2) + else: + username = username.replace('/CN=','') + if username.lower().find('/email') > 0: + username = username[:username.lower().find('/email')] + pat = re.compile('.*(limited.*proxy).*') + mat = pat.match(username) + if mat: + username = mat.group(1) + username = username.replace('(','') + username = username.replace(')','') + return username + except: + return id + + + # check quota + def checkQuota(self,dn): + _logger.debug("checkQuota %s" % dn) + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + name = self.cleanUserID(dn) + sql = "SELECT cpua1,cpua7,cpua30,quotaa1,quotaa7,quotaa30 FROM users WHERE name = '%s'" % name + self.cur.execute(sql) + 
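# A standalone sketch of the DN-cleaning idea behind cleanUserID() above, simplified:
# strip the organisational RDNs (/DC,/O,/OU,/C,/L), numeric CNs and proxy markers from
# an X.509 subject and keep the human-readable CN. Illustrative only; not a drop-in
# replacement for the method in the patch.
import re

def extract_user_name(dn):
    # drop DC/O/OU/C/L components
    name = re.sub(r'/(DC|O|OU|C|L)=[^/]+', '', dn)
    # drop purely numeric CNs added by delegation
    name = re.sub(r'/CN=\d+', '', name)
    # drop proxy markers
    name = name.replace('/CN=proxy', '').replace('/CN=limited proxy', '')
    # whatever follows the last remaining /CN= is the display name
    return name.split('/CN=')[-1]

# extract_user_name('/DC=ch/DC=cern/OU=Users/CN=12345/CN=Jane Doe/CN=proxy') -> 'Jane Doe'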
res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + weight = 0.0 + if res != None and len(res) != 0: + item = res[0] + # cpu and quota + cpu1 = item[0] + cpu7 = item[1] + cpu30 = item[2] + quota1 = item[3] * 3600 + quota7 = item[4] * 3600 + quota30 = item[5] * 3600 + # CPU usage + if cpu1 == None: + cpu1 = 0.0 + # weight + weight = float(cpu1) / float(quota1) + # not exceeded the limit + if weight < 1.0: + weight = 0.0 + _logger.debug("checkQuota %s Weight:%s Quota:%s CPU:%s" % (dn,weight,quota1,cpu1)) + else: + _logger.debug("checkQuota cannot found %s" % dn) + return weight + except: + type, value, traceBack = sys.exc_info() + _logger.error("checkQuota : %s %s" % (type,value)) + # roll back + self._rollback() + return 0.0 + + + # get serialize JobID and status + def getUserParameter(self,dn,jobID): + _logger.debug("getUserParameter %s %s" % (dn,jobID)) + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + name = self.cleanUserID(dn) + sql = "SELECT jobid,status FROM users WHERE name = '%s'" % name + self.cur.execute(sql) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retJobID = jobID + retStatus = True + if res != None and len(res) != 0: + item = res[0] + # JobID in DB + dbJobID = item[0] + # check status + if item[1] in ['disabled']: + retStatus = False + # use larger JobID + if dbJobID >= int(retJobID): + retJobID = dbJobID+1 + # update DB + sql = "UPDATE users SET jobid=%d WHERE name = '%s'" % (retJobID,name) + self.cur.execute(sql) + _logger.debug("getUserParameter set JobID=%s for %s" % (retJobID,dn)) + return retJobID,retStatus + except: + type, value, traceBack = sys.exc_info() + _logger.error("getUserParameter : %s %s" % (type,value)) + # roll back + self._rollback() + return jobID,True + + + # get email address for a user + def getEmailAddr(self,name): + _logger.debug("get email for %s" % name) + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = "SELECT email FROM users WHERE name='%s'" % name + self.cur.execute(sql) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if res != None and len(res) != 0: + return res[0][0] + # return empty string + return "" + except: + type, value, traceBack = sys.exc_info() + _logger.error("getEmailAddr : %s %s" % (type,value)) + # roll back + self._rollback() + return "" + + + # register proxy key + def registerProxyKey(self,params): + _logger.debug("register ProxyKey %s" % str(params)) + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # construct SQL + sql0 = 'INSERT INTO proxykey (' + sql1 = 'VALUES (' + vals = [] + for key,val in params.iteritems(): + sql0 += '%s,' % key + sql1 += '%s,' + vals.append(val) + sql0 = sql0[:-1] + sql1 = sql1[:-1] + sql = sql0 + ') ' + sql1 + ') ' + # insert + self.cur.execute(sql,tuple(vals)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return True + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("registerProxyKey : %s %s" % (type,value)) + # roll back + self._rollback() + return "" + + + # get proxy key + def getProxyKey(self,dn): + _logger.debug("get ProxyKey %s" % dn) + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # construct SQL + sql = 'SELECT credname,expires,origin,myproxy FROM proxykey WHERE dn=%s ORDER BY expires DESC' + # select + self.cur.execute(sql,(dn,)) + res = 
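# A minimal sketch of the pattern registerProxyKey() uses above: build the INSERT
# statement from a dict of column -> value and pass the values as bind parameters
# rather than interpolating them into the SQL string. The table/column names are
# illustrative and the %s placeholder style assumes a MySQLdb-like DB-API driver.
def build_insert(table, params):
    cols = list(params.keys())
    placeholders = ','.join(['%s'] * len(cols))
    sql = 'INSERT INTO %s (%s) VALUES (%s)' % (table, ','.join(cols), placeholders)
    return sql, tuple(params[c] for c in cols)

# usage (hypothetical values):
#   sql, vals = build_insert('proxykey', {'dn': 'some-user-dn', 'credname': 'cred1'})
#   cur.execute(sql, vals)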
self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + retMap = {} + if res != None and len(res) != 0: + credname,expires,origin,myproxy = res[0] + retMap['credname'] = credname + retMap['expires'] = expires + retMap['origin'] = origin + retMap['myproxy'] = myproxy + _logger.debug(retMap) + return retMap + except: + type, value, traceBack = sys.exc_info() + _logger.error("getProxyKey : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # check site access + def checkSiteAccess(self,siteid,dn): + comment = ' /* LogDBProxy.checkSiteAccess */' + _logger.debug("checkSiteAccess %s:%s" % (siteid,dn)) + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # construct SQL + sql = 'SELECT poffset,rights,status FROM siteaccess WHERE dn=%s AND pandasite=%s' + # select + self.cur.execute(sql+comment,(dn,siteid)) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + retMap = {} + if res != None and len(res) != 0: + poffset,rights,status = res[0] + retMap['poffset'] = poffset + retMap['rights'] = rights + retMap['status'] = status + _logger.debug(retMap) + return retMap + except: + type, value, traceBack = sys.exc_info() + _logger.error("checkSiteAccess : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # add account to siteaccess + def addSiteAccess(self,siteID,dn): + comment = ' /* LogDBProxy.addSiteAccess */' + _logger.debug("addSiteAccess : %s %s" % (siteID,dn)) + try: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = 'SELECT status FROM siteaccess WHERE dn=%s AND pandasite=%s' + self.cur.execute(sql+comment, (dn,siteID)) + res = self.cur.fetchone() + if res != None: + _logger.debug("account already exists with status=%s" % res[0]) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return res[0] + # add + sql = 'INSERT INTO siteaccess (dn,pandasite,status) VALUES (%s,%s,%s)' + self.cur.execute(sql+comment, (dn,siteID,'requested')) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("account was added") + return 0 + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("addSiteAccess( : %s %s" % (type,value)) + # return None + return -1 + + + # list site access + def listSiteAccess(self,siteid=None,dn=None): + comment = ' /* LogDBProxy.listSiteAccess */' + _logger.debug("listSiteAccess %s:%s" % (siteid,dn)) + try: + if siteid==None and dn==None: + return [] + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # construct SQL + if siteid != None: + varMap = (siteid,) + sql = 'SELECT dn,status FROM siteaccess WHERE pandasite=%s ORDER BY dn' + else: + varMap = (dn,) + sql = 'SELECT pandasite,status FROM siteaccess WHERE dn=%s ORDER BY pandasite' + # select + self.cur.execute(sql+comment,varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + ret = [] + if res != None and len(res) != 0: + for tmpRes in res: + ret.append(tmpRes) + _logger.debug(ret) + return ret + except: + type, value, traceBack = sys.exc_info() + _logger.error("listSiteAccess : %s %s" % (type,value)) + # roll back + self._rollback() + return [] + + + # get list of archived tables + def getArchiveTables(self): + tables = [] + cdate = datetime.datetime.utcnow() + for iCycle in range(2): # 2 = (1 months + 2 just in case)/2 + if cdate.month==1: + cdate = 
cdate.replace(year = (cdate.year-1)) + cdate = cdate.replace(month = 12, day = 1) + else: + cdate = cdate.replace(month = (cdate.month/2)*2, day = 1) + tableName = "jobsArchived_%s%s" % (cdate.strftime('%b'),cdate.year) + if not tableName in tables: + tables.append(tableName) + # one older table + if cdate.month > 2: + cdate = cdate.replace(month = (cdate.month-2)) + else: + cdate = cdate.replace(year = (cdate.year-1), month = 12) + # return + return tables + + + # get JobIDs in a time range + def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs): + comment = ' /* LogDBProxy.getJobIDsInTimeRange */' + _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) + try: + # get list of archived tables + tables = self.getArchiveTables() + # select + for table in tables: + # make sql + sql = "SELECT jobDefinitionID FROM %s " % table + sql += "WHERE prodUserID=%s AND modificationTime>%s AND prodSourceLabel='user'" + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + _logger.debug(sql+comment+str((dn,timeRange.strftime('%Y-%m-%d %H:%M:%S')))) + self.cur.execute(sql+comment, (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for tmpID, in resList: + if not tmpID in retJobIDs: + retJobIDs.append(tmpID) + _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs)) + return retJobIDs + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getJobIDsInTimeRange : %s %s" % (type,value)) + # return empty list + return retJobIDs + + + # get PandaIDs for a JobID + def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs): + comment = ' /* LogProxy.getPandIDsWithJobID */' + _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID)) + try: + # get list of archived tables + tables = self.getArchiveTables() + # select + for table in tables: + # skip if all jobs have already been gotten + if nJobs > 0 and len(idStatus) >= nJobs: + continue + # make sql + sql = "SELECT PandaID,jobStatus,commandToPilot FROM %s " % table + sql += "WHERE prodUserID=%s AND jobDefinitionID=%s " + sql += "AND prodSourceLabel in ('user','panda') " + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + _logger.debug(sql+comment+str((dn,jobID))) + self.cur.execute(sql+comment, (dn,jobID)) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for tmpID,tmpStatus,tmpCommand in resList: + if not idStatus.has_key(tmpID): + idStatus[tmpID] = (tmpStatus,tmpCommand) + _logger.debug("getPandIDsWithJobID : %s" % str(idStatus)) + return idStatus + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getPandIDsWithJobID : %s %s" % (type,value)) + # return empty list + return {} + + + # peek at job + def peekJob(self,pandaID): + comment = ' /* LogDBProxy.peekJob */' + _logger.debug("peekJob : %s" % pandaID) + # return None for NULL PandaID + if pandaID in ['NULL','','None',None]: + return None + sql1_0 = "SELECT %s FROM %s " + sql1_1 = "WHERE PandaID=%s" + try: + # get list of archived tables + tables = self.getArchiveTables() + # select + for table in tables: + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1 + self.cur.execute(sql+comment, (pandaID,)) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 
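# A standalone sketch of the archive-table naming scheme implemented by
# getArchiveTables() above: archived jobs are split into two-month tables named
# like jobsArchived_Dec2012, and lookups walk backwards over the latest buckets.
# The bucket arithmetic below is simplified and illustrative (note that
# strftime('%b') is locale dependent).
import datetime

def archive_tables(now=None, n_buckets=2):
    now = now or datetime.datetime.utcnow()
    tables = []
    cdate = now.replace(day=1)
    for _ in range(n_buckets):
        # snap to the even month that labels the two-month bucket
        if cdate.month == 1:
            cdate = cdate.replace(year=cdate.year - 1, month=12)
        else:
            cdate = cdate.replace(month=(cdate.month // 2) * 2)
        name = 'jobsArchived_%s%s' % (cdate.strftime('%b'), cdate.year)
        if name not in tables:
            tables.append(name)
        # step back into the previous bucket
        if cdate.month > 2:
            cdate = cdate.replace(month=cdate.month - 2)
        else:
            cdate = cdate.replace(year=cdate.year - 1, month=12)
    return tables

# archive_tables(datetime.datetime(2013, 7, 2)) -> ['jobsArchived_Jun2013', 'jobsArchived_Apr2013']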
'Commit error' + if len(res) != 0: + # Job + job = JobSpec() + job.pack(res[0]) + # Files + # set autocommit on + self.cur.execute("SET AUTOCOMMIT=1") + # select + fileTableName = re.sub('jobsArchived','filesTable',table) + sqlFile = "SELECT %s " % FileSpec.columnNames() + sqlFile+= "FROM %s " % fileTableName + sqlFile+= "WHERE PandaID=%s" + self.cur.execute(sqlFile+comment, (job.PandaID,)) + resFs = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # set files + for resF in resFs: + file = FileSpec() + file.pack(resF) + job.addFile(file) + return job + _logger.debug("peekJob() : PandaID %s not found" % pandaID) + return None + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("peekJob : %s %s" % (type,value)) + # return None + return None + + + # wake up connection + def wakeUp(self): + for iTry in range(5): + try: + # check if the connection is working + self.conn.ping() + return + except: + type, value, traceBack = sys.exc_info() + _logger.debug("wakeUp %d : %s %s" % (iTry,type,value)) + # wait for reconnection + time.sleep(1) + self.connect(reconnect=True) + + + # close + def close(self): + try: + self.cur.close() + self.conn.close() + except: + type, value, traceBack = sys.exc_info() + _logger.error("close : %s %s" % (type,value)) + + + # commit + def _commit(self): + try: + self.conn.commit() + return True + except: + _logger.error("commit error") + return False + + + # rollback + def _rollback(self): + try: + self.conn.rollback() + return True + except: + _logger.error("rollback error") + return False + diff --git a/current/pandaserver/taskbuffer/LogDBProxyPool.py b/current/pandaserver/taskbuffer/LogDBProxyPool.py new file mode 100755 index 000000000..c9f986741 --- /dev/null +++ b/current/pandaserver/taskbuffer/LogDBProxyPool.py @@ -0,0 +1,52 @@ +""" +pool for LogDBProxies + +""" + +import time +import Queue +import random +import OraLogDBProxy as LogDBProxy +from config import panda_config + +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('LogDBProxyPool') + +class LogDBProxyPool: + + def __init__(self,nConnection=panda_config.nLogDBConnection): + # create Proxies + _logger.debug("init") + self.proxyList = Queue.Queue(nConnection) + for i in range(nConnection): + _logger.debug("connect -> %s " % i) + proxy = LogDBProxy.LogDBProxy() + nTry = 10 + for iTry in range(nTry): + if proxy.connect(): + break + _logger.debug("failed -> %s : try %s" % (i,iTry)) + if iTry+1 == nTry: + raise RuntimeError, 'LogDBProxyPool.__init__ failed' + time.sleep(random.randint(10,20)) + self.proxyList.put(proxy) + time.sleep(1) + _logger.debug("ready") + + # return a free proxy. 
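# A generic sketch of the blocking pool pattern that LogDBProxyPool implements
# above: a fixed number of proxies sit in a Queue, get() blocks until one is
# free and put() returns it. The factory callable stands in for the real
# LogDBProxy constructor and is purely illustrative.
try:
    import queue            # Python 3
except ImportError:
    import Queue as queue   # Python 2, as used in the patch

class SimpleProxyPool(object):
    def __init__(self, factory, size):
        self._pool = queue.Queue(size)
        for _ in range(size):
            self._pool.put(factory())

    def get(self):
        # blocks until a proxy is available, like getProxy()
        return self._pool.get()

    def put(self, proxy):
        self._pool.put(proxy)

# usage: pool = SimpleProxyPool(dict, 4); p = pool.get(); pool.put(p)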
this method blocks until a proxy is available + def getProxy(self): + # get proxy + proxy = self.proxyList.get() + # wake up connection + proxy.wakeUp() + # return + return proxy + + + # put back a proxy + def putProxy(self,proxy): + # put + self.proxyList.put(proxy) + diff --git a/current/pandaserver/taskbuffer/MemProxy.py b/current/pandaserver/taskbuffer/MemProxy.py new file mode 100644 index 000000000..025b6a815 --- /dev/null +++ b/current/pandaserver/taskbuffer/MemProxy.py @@ -0,0 +1,205 @@ +# proxy for memcached + +import sys + +from config import panda_config + +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('MemProxy') + + +# proxy +class MemProxy: + + # constructor + def __init__(self): + try: + import memcache + # initialize memcached client + _logger.debug("initialize memcache client with %s" % panda_config.memcached_srvs) + self.mclient = memcache.Client(panda_config.memcached_srvs.split(',')) + # server statistics + _logger.debug(self.mclient.get_stats()) + _logger.debug("memcache client is ready") + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("failed to initialize memcach client : %s %s" % (errType,errValue)) + + + # insert files + def setFiles(self,pandaID,site,node,files): + try: + _logger.debug("setFiles site=%s node=%s start" % (site,node)) + # key prefix + keyPrefix = self.getKeyPrefix(site,node) + # failed to get key prefix + if keyPrefix == None: + _logger.error("setFiles failed to get key prefix") + return False + # loop over all files + varMap = {} + for tmpFile in files: + newKey = tmpFile + varMap[newKey] = True + # bulk insert + failedList = self.mclient.set_multi(varMap,time=panda_config.memcached_exptime, + key_prefix=keyPrefix) + # failed + if failedList != []: + _logger.error("setFiles failed to insert %s values for site=%s node=%s" % \ + (len(failedList),site,node)) + return False + _logger.debug("setFiles site=%s node=%s completed" % (site,node)) + return True + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("setFiles failed with %s %s" % (errType,errValue)) + return False + + + # delete files + def deleteFiles(self,site,node,files): + try: + fileList = files.split(',') + # remove '' + try: + fileList.remove('') + except: + pass + _logger.debug("deleteFiles for %s:%s:%s start" % (site,node,len(fileList))) + # empty list + if len(fileList) == 0: + _logger.debug("deleteFiles skipped for empty list") + return True + # key prefix + keyPrefix = self.getKeyPrefix(site,node) + # non-existing key + if keyPrefix == None: + _logger.debug("deleteFiles skipped for non-existing key") + return True + # get the number of bunches + nKeys = 100 + tmpDiv,tmpMod = divmod(len(fileList),nKeys) + if tmpMod != 0: + tmpDiv += 1 + # loop over all bunches + retMap = {True:0,False:0} + for idxB in range(tmpDiv): + # delete + retD = self.mclient.delete_multi(fileList[idxB*nKeys:(idxB+1)*nKeys], + key_prefix=keyPrefix) + if retD == 1: + retMap[True] += 1 + else: + retMap[False] += 1 + # failed + if retMap[False] != 0: + _logger.error("deleteFiles failed %s/%s" % (retMap[False], + retMap[True]+retMap[False])) + return False + _logger.debug("deleteFiles succeeded") + return True + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("deleteFiles failed with %s %s" % (errType,errValue)) + return False + + + # check files + def checkFiles(self,pandaID,files,site,node,keyPrefix='',getDetail=False): + try: + _logger.debug("checkFiles PandaID=%s with %s:%s start" % (pandaID,site,node)) + # get key 
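# A compact sketch of the bulk-key pattern MemProxy uses above with the
# python-memcached client: store a batch of file names under a shared key
# prefix, then test membership with get_multi. The server address, prefix and
# file names are illustrative, and a running memcached instance is assumed.
import memcache

mc = memcache.Client(['127.0.0.1:11211'])
prefix = 'SITE_worker01_0'      # site_shortWN_serial, as built by getKeyPrefix()
files = ['file1.root', 'file2.root']

# bulk insert; set_multi returns the keys that could NOT be stored
failed = mc.set_multi(dict((f, True) for f in files), time=3600, key_prefix=prefix)

# bulk lookup; get_multi returns only the keys that are present
found = mc.get_multi(files, key_prefix=prefix)
missing = [f for f in files if f not in found]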
prefix + if keyPrefix == '': + keyPrefix = self.getKeyPrefix(site,node) + # non-existing key + if keyPrefix == None: + _logger.debug("checkFiles PandaID=%s with %s:%s doesn't exist" % \ + (pandaID,site,node)) + return 0 + # loop over all files + keyList = [] + for tmpFile in files: + newKey = tmpFile + if not newKey in keyList: + keyList.append(newKey) + # bulk get + retMap = self.mclient.get_multi(keyList,key_prefix=keyPrefix) + _logger.debug("checkFiles PandaID=%s with %s:%s has %s files" % \ + (pandaID,site,node,len(retMap))) + # return detailed string + if getDetail: + retStr = '' + for tmpFile in files: + if retMap.has_key(tmpFile): + retStr += '1,' + else: + retStr += '0,' + retStr = retStr[:-1] + return retStr + # return number of files + return len(retMap) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("checkFiles failed with %s %s" % (errType,errValue)) + return 0 + + + # flush files + def flushFiles(self,site,node): + try: + _logger.debug("flushFiles for %s:%s start" % (site,node)) + # key prefix stored in memcached + keyPrefix = self.getInternalKeyPrefix(site,node) + # increment + serNum = self.mclient.incr(keyPrefix) + # return if not exist + if serNum == None: + _logger.debug("flushFiles skipped for non-existing key") + return True + # avoid overflow + if serNum > 1024: + serNum = 0 + # set + retS = self.mclient.set(keyPrefix,serNum,time=panda_config.memcached_exptime) + if retS == 0: + # failed + _logger.error("flushFiles failed to set new SN") + return False + _logger.error("flushFiles completed") + return True + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("flushFiles failed with %s %s" % (errType,errValue)) + return False + + + # get internal key prefix + def getInternalKeyPrefix(self,site,node): + # get short WN name + shortWN = node.split('.')[0] + # key prefix stored in memcached + keyPrefix = '%s_%s' % (site,shortWN) + return keyPrefix + + + # get key prefix + def getKeyPrefix(self,site,node): + # key prefix stored in memcached + keyPrefix = self.getInternalKeyPrefix(site,node) + # get serial number from memcached + serNum = self.mclient.get(keyPrefix) + # use 0 if not exist + if serNum == None: + serNum = 0 + # set to avoid expiration + retS = self.mclient.set(keyPrefix,serNum,time=panda_config.memcached_exptime) + if retS == 0: + # failed + return None + else: + # return prefix site_node_sn_ + newPrefix = '%s_%s' % (keyPrefix,serNum) + return newPrefix diff --git a/current/pandaserver/taskbuffer/OraDBProxy.py b/current/pandaserver/taskbuffer/OraDBProxy.py new file mode 100755 index 000000000..1295ef360 --- /dev/null +++ b/current/pandaserver/taskbuffer/OraDBProxy.py @@ -0,0 +1,10739 @@ +""" +proxy for database connection + +""" + +import re +import os +import sys +import time +import fcntl +import types +import random +import urllib +import socket +import datetime +import commands +import traceback +import warnings +import cx_Oracle +import ErrorCode +import SiteSpec +import CloudSpec +import PrioUtil +import ProcessGroups +from JobSpec import JobSpec +from FileSpec import FileSpec +from DatasetSpec import DatasetSpec +from CloudTaskSpec import CloudTaskSpec +from pandalogger.PandaLogger import PandaLogger +from config import panda_config +from brokerage.PandaSiteIDs import PandaSiteIDs + +warnings.filterwarnings('ignore') + +# logger +_logger = PandaLogger().getLogger('DBProxy') + +# lock file +_lockGetSN = open(panda_config.lockfile_getSN, 'w') +_lockSetDS = open(panda_config.lockfile_setDS, 'w') +_lockGetCT = 
open(panda_config.lockfile_getCT, 'w') + + +# proxy +class DBProxy: + + # constructor + def __init__(self,useOtherError=False): + # connection object + self.conn = None + # cursor object + self.cur = None + # host name + self.hostname = None + # retry count + self.nTry = 5 + # use special error codes for reconnection in querySQL + self.useOtherError = useOtherError + # memcached client + self.memcache = None + # pledge resource ratio + self.beyondPledgeRatio = {} + # update time for pledge resource ratio + self.updateTimeForPledgeRatio = None + # fareshare policy + self.faresharePolicy = {} + # update time for fareshare policy + self.updateTimeForFaresharePolicy = None + # hostname + self.myHostName = socket.getfqdn() + + + # connect to DB + def connect(self,dbhost=panda_config.dbhost,dbpasswd=panda_config.dbpasswd, + dbuser=panda_config.dbuser,dbname=panda_config.dbname, + dbtimeout=None,reconnect=False): + _logger.debug("connect : re=%s" % reconnect) + # keep parameters for reconnect + if not reconnect: + self.dbhost = dbhost + self.dbpasswd = dbpasswd + self.dbuser = dbuser + self.dbname = dbname + self.dbtimeout = dbtimeout + # close old connection + if reconnect: + _logger.debug("closing old connection") + try: + self.conn.close() + except: + _logger.debug("failed to close old connection") + # connect + try: + self.conn = cx_Oracle.connect(dsn=self.dbhost,user=self.dbuser, + password=self.dbpasswd,threaded=True) + self.cur=self.conn.cursor() + try: + # use SQL dumper + if panda_config.dump_sql: + import SQLDumper + self.cur = SQLDumper.SQLDumper(self.cur) + except: + pass + # get hostname + self.cur.execute("SELECT SYS_CONTEXT('USERENV','HOST') FROM dual") + res = self.cur.fetchone() + if res != None: + self.hostname = res[0] + # set TZ + self.cur.execute("ALTER SESSION SET TIME_ZONE='UTC'") + # set DATE format + self.cur.execute("ALTER SESSION SET NLS_DATE_FORMAT='YYYY/MM/DD HH24:MI:SS'") + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("connect : %s %s" % (type,value)) + return False + + + # query an SQL + def querySQL(self,sql,arraySize=1000): + comment = ' /* DBProxy.querySQL */' + try: + _logger.debug("querySQL : %s " % sql) + # begin transaction + self.conn.begin() + self.cur.arraysize = arraySize + self.cur.execute(sql+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return res + except: + # roll back + self._rollback(self.useOtherError) + type, value, traceBack = sys.exc_info() + _logger.error("querySQL : %s " % sql) + _logger.error("querySQL : %s %s" % (type,value)) + return None + + + # query an SQL return Status + def querySQLS(self,sql,varMap,arraySize=1000): + comment = ' /* DBProxy.querySQLS */' + try: + # begin transaction + self.conn.begin() + self.cur.arraysize = arraySize + ret = self.cur.execute(sql+comment,varMap) + if sql.startswith('INSERT') or sql.startswith('UPDATE') or \ + sql.startswith('DELETE'): + res = self.cur.rowcount + else: + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return ret,res + except: + # roll back + self._rollback(self.useOtherError) + type, value, traceBack = sys.exc_info() + _logger.error("querySQLS : %s %s" % (sql,str(varMap))) + _logger.error("querySQLS : %s %s" % (type,value)) + return -1,None + + + # get CLOB + def getClobObj(self,sql,varMap,arraySize=10000): + comment = ' /* DBProxy.getClobObj */' + try: + # begin transaction + self.conn.begin() + self.cur.arraysize = arraySize + ret = 
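# A minimal standalone sketch of the connect/query pattern DBProxy sets up above:
# open a threaded cx_Oracle connection, pin the session to UTC and a fixed date
# format, then run a bind-variable query inside an explicit transaction. The DSN,
# credentials and unqualified table name below are placeholders, not the server's
# real configuration.
import cx_Oracle

conn = cx_Oracle.connect(dsn='panda_dsn', user='panda', password='secret', threaded=True)
cur = conn.cursor()
cur.execute("ALTER SESSION SET TIME_ZONE='UTC'")
cur.execute("ALTER SESSION SET NLS_DATE_FORMAT='YYYY/MM/DD HH24:MI:SS'")

conn.begin()
cur.execute("SELECT PandaID FROM jobsActive4 WHERE jobStatus=:st", {':st': 'activated'})
rows = cur.fetchall()
conn.commit()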
self.cur.execute(sql+comment,varMap) + res = [] + for items in self.cur: + resItem = [] + for item in items: + # read CLOB + resItem.append(item.read()) + # append + res.append(resItem) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return ret,res + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getClobObj : %s %s" % (sql,str(varMap))) + _logger.error("getClobObj : %s %s" % (type,value)) + return -1,None + + + # insert job to jobsDefined + def insertNewJob(self,job,user,serNum,weight=0.0,priorityOffset=0,userVO=None,groupJobSN=0,toPending=False): + comment = ' /* DBProxy.insertNewJob */' + if not toPending: + sql1 = "INSERT INTO ATLAS_PANDA.jobsDefined4 (%s) " % JobSpec.columnNames() + else: + sql1 = "INSERT INTO ATLAS_PANDA.jobsWaiting4 (%s) " % JobSpec.columnNames() + sql1+= JobSpec.bindValuesExpression(useSeq=True) + sql1+= " RETURNING PandaID INTO :newPandaID" + # make sure PandaID is NULL + job.PandaID = None + # job status + if not toPending: + job.jobStatus='defined' + else: + job.jobStatus='pending' + # host and time information + job.modificationHost = self.hostname + job.creationTime = datetime.datetime.utcnow() + job.modificationTime = job.creationTime + job.stateChangeTime = job.creationTime + job.prodDBUpdateTime = datetime.datetime(1,1,1) + # DN + if job.prodUserID == "NULL" or job.prodSourceLabel in ['user','panda']: + job.prodUserID = user + # compact user name + job.prodUserName = self.cleanUserID(job.prodUserID) + if job.prodUserName in ['','NULL']: + # use prodUserID as compact user name + job.prodUserName = job.prodUserID + # VO + job.VO = userVO + # priority + if job.assignedPriority != 'NULL': + job.currentPriority = job.assignedPriority + if job.prodSourceLabel == 'install': + job.currentPriority = 4100 + elif job.prodSourceLabel == 'user': + if job.processingType in ['usermerge'] and not job.currentPriority in ['NULL',None]: + # avoid prio reduction for merge jobs + pass + else: + job.currentPriority = PrioUtil.calculatePriority(priorityOffset,serNum,weight) + if 'express' in job.specialHandling: + job.currentPriority = 6000 + elif job.prodSourceLabel == 'panda': + job.currentPriority = 2000 + priorityOffset + if 'express' in job.specialHandling: + job.currentPriority = 6500 + # usergroup + if job.prodSourceLabel == 'regional': + job.computingSite= "BNLPROD" + # group job SN + groupJobSN = "%05d" % groupJobSN + # set attempt numbers + if job.prodSourceLabel in ['user','panda','ptest','rc_test']: + if job.attemptNr in [None,'NULL','']: + job.attemptNr = 0 + if job.maxAttempt in [None,'NULL','']: + job.maxAttempt = 0 + # set maxAttempt to attemptNr to disable server/pilot retry + if job.maxAttempt == -1: + job.maxAttempt = job.attemptNr + else: + # set maxAttempt to have server/pilot retries for retried jobs + if job.maxAttempt <= job.attemptNr: + job.maxAttempt = job.attemptNr + 2 + try: + # begin transaction + self.conn.begin() + # insert + varMap = job.valuesMap(useSeq=True) + varMap[':newPandaID'] = self.cur.var(cx_Oracle.NUMBER) + retI = self.cur.execute(sql1+comment, varMap) + # set PandaID + job.PandaID = long(varMap[':newPandaID'].getvalue()) + # get jobsetID + if job.jobsetID in [None,'NULL',-1]: + jobsetID = 0 + else: + jobsetID = job.jobsetID + jobsetID = '%06d' % jobsetID + # reset changed attribute list + job.resetChangedList() + # insert files + _logger.debug("insertNewJob : %s Label:%s prio:%s" % (job.PandaID,job.prodSourceLabel, + job.currentPriority)) + sqlFile = "INSERT 
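# A focused sketch of the RETURNING ... INTO trick that insertNewJob() relies on
# above: bind an output variable created with cursor.var() so the sequence-generated
# ID comes back from the INSERT itself, with no follow-up SELECT. The table, sequence
# and column names are illustrative.
import cx_Oracle

def insert_and_get_id(conn):
    cur = conn.cursor()
    new_id = cur.var(cx_Oracle.NUMBER)
    cur.execute(
        "INSERT INTO jobs (PandaID, jobStatus) "
        "VALUES (jobs_id_seq.nextval, :status) "
        "RETURNING PandaID INTO :new_id",
        {':status': 'defined', ':new_id': new_id})
    conn.commit()
    val = new_id.getvalue()
    # the cx_Oracle of this era returns a scalar here; recent versions return a list
    return int(val[0] if isinstance(val, list) else val)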
INTO ATLAS_PANDA.filesTable4 (%s) " % FileSpec.columnNames() + sqlFile+= FileSpec.bindValuesExpression(useSeq=True) + sqlFile+= " RETURNING row_ID INTO :newRowID" + for file in job.Files: + file.row_ID = None + if file.status != 'ready': + file.status='unknown' + # replace $PANDAID with real PandaID + file.lfn = re.sub('\$PANDAID', '%05d' % job.PandaID, file.lfn) + # replace $JOBSETID with real jobsetID + if not job.prodSourceLabel in ['managed']: + file.lfn = re.sub('\$JOBSETID', jobsetID, file.lfn) + file.lfn = re.sub('\$GROUPJOBSN', groupJobSN, file.lfn) + # set scope + if file.type in ['output','log'] and job.VO in ['atlas']: + file.scope = self.extractScope(file.dataset) + # insert + varMap = file.valuesMap(useSeq=True) + varMap[':newRowID'] = self.cur.var(cx_Oracle.NUMBER) + self.cur.execute(sqlFile+comment, varMap) + # get rowID + file.row_ID = long(varMap[':newRowID'].getvalue()) + # reset changed attribute list + file.resetChangedList() + # metadata + if job.prodSourceLabel in ['user','panda'] and job.metadata != '': + sqlMeta = "INSERT INTO ATLAS_PANDA.metaTable (PandaID,metaData) VALUES (:PandaID,:metaData)" + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':metaData'] = job.metadata + self.cur.execute(sqlMeta+comment, varMap) + # job parameters + if not job.prodSourceLabel in ['managed']: + job.jobParameters = re.sub('\$JOBSETID', jobsetID, job.jobParameters) + job.jobParameters = re.sub('\$GROUPJOBSN', groupJobSN, job.jobParameters) + sqlJob = "INSERT INTO ATLAS_PANDA.jobParamsTable (PandaID,jobParameters) VALUES (:PandaID,:param)" + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':param'] = job.jobParameters + self.cur.execute(sqlJob+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("insertNewJob : %s File OK" % job.PandaID) + # record status change + try: + self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) + except: + _logger.error('recordStatusChange in insertNewJob') + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("insertNewJob : %s %s" % (type,value)) + # roll back + self._rollback() + return False + + + # simply insert job to a table + def insertJobSimple(self,job,table,fileTable,jobParamsTable,metaTable): + comment = ' /* DBProxy.insertJobSimple */' + _logger.debug("insertJobSimple : %s" % job.PandaID) + sql1 = "INSERT INTO %s (%s) " % (table,JobSpec.columnNames()) + sql1+= JobSpec.bindValuesExpression() + try: + # begin transaction + self.conn.begin() + # insert + self.cur.execute(sql1+comment, job.valuesMap()) + # files + sqlFile = "INSERT INTO %s " % fileTable + sqlFile+= "(%s) " % FileSpec.columnNames(withMod=True) + sqlFile+= FileSpec.bindValuesExpression(withMod=True) + for file in job.Files: + varMap = file.valuesMap() + varMap[':modificationTime'] = job.modificationTime + self.cur.execute(sqlFile+comment, varMap) + # job parameters + sqlJob = "INSERT INTO %s (PandaID,jobParameters,modificationTime) VALUES (:PandaID,:param,:modificationTime)" \ + % jobParamsTable + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':param'] = job.jobParameters + varMap[':modificationTime'] = job.modificationTime + self.cur.execute(sqlJob+comment, varMap) + # metadata + if not job.metadata in [None,'NULL','']: + sqlMeta = "INSERT INTO %s (PandaID,metaData,modificationTime) VALUES(:PandaID,:metaData,:modificationTime)" \ + % metaTable + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':metaData'] = job.metadata + varMap[':modificationTime'] = 
job.modificationTime + self.cur.execute(sqlMeta+comment,varMap) + # set flag to avoid duplicated insertion attempts + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':archivedFlag'] = 1 + sqlArch = "UPDATE ATLAS_PANDA.jobsArchived4 SET archivedFlag=:archivedFlag WHERE PandaID=:PandaID" + self.cur.execute(sqlArch+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("insertJobSimple : %s %s" % (type,value)) + # roll back + self._rollback() + return False + + + # simply insert job to a table without reading + def insertJobSimpleUnread(self,pandaID,modTime): + comment = ' /* DBProxy.insertJobSimpleUnread */' + _logger.debug("insertJobSimpleUnread : %s" % pandaID) + # check + sqlC = "SELECT archivedFlag FROM ATLAS_PANDA.jobsArchived4 " + sqlC += "WHERE PandaID=:pandaID " + # job + sqlJ = "INSERT INTO ATLAS_PANDAARCH.jobsArchived (%s) " % JobSpec.columnNames() + sqlJ += "SELECT %s FROM ATLAS_PANDA.jobsArchived4 " % JobSpec.columnNames() + sqlJ += "WHERE PandaID=:pandaID " + # file + sqlF = "INSERT INTO ATLAS_PANDAARCH.filesTable_ARCH (%s) " % FileSpec.columnNames(withMod=True) + sqlF += "SELECT %s,:modTime FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames(withMod=False) + sqlF += "WHERE PandaID=:pandaID " + # parameters + sqlP = "INSERT INTO ATLAS_PANDAARCH.jobParamsTable_ARCH (PandaID,jobParameters,modificationTime) " + sqlP += "SELECT PandaID,jobParameters,:modTime FROM ATLAS_PANDA.jobParamsTable " + sqlP += "WHERE PandaID=:pandaID " + # metadata + sqlM1 = "SELECT PandaID FROM ATLAS_PANDA.metaTable " + sqlM1 += "WHERE PandaID=:pandaID AND rownum<=1 " + sqlM2 = "INSERT INTO ATLAS_PANDAARCH.metaTable_ARCH (PandaID,metaData,modificationTime) " + sqlM2 += "SELECT PandaID,metaData,:modTime FROM ATLAS_PANDA.metaTable " + sqlM2 += "WHERE PandaID=:pandaID " + try: + # begin transaction + self.conn.begin() + # check + varMap = {} + varMap[':pandaID'] = pandaID + self.cur.execute(sqlC+comment,varMap) + res = self.cur.fetchone() + if res == None or res[0] == 1: + if res == None: + _logger.error("insertJobSimpleUnread : %s cannot get archivedFlag" % pandaID) + else: + _logger.debug("insertJobSimpleUnread : %s skip" % pandaID) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + # insert + varMap = {} + varMap[':pandaID'] = pandaID + self.cur.execute(sqlJ+comment,varMap) + varMap = {} + varMap[':pandaID'] = pandaID + varMap[':modTime'] = modTime + self.cur.execute(sqlF+comment,varMap) + varMap = {} + varMap[':pandaID'] = pandaID + varMap[':modTime'] = modTime + self.cur.execute(sqlP+comment,varMap) + varMap = {} + varMap[':pandaID'] = pandaID + self.cur.execute(sqlM1+comment,varMap) + res = self.cur.fetchone() + if res != None: + varMap = {} + varMap[':pandaID'] = pandaID + varMap[':modTime'] = modTime + self.cur.execute(sqlM2+comment,varMap) + # set flag to avoid duplicated insertion attempts + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':archivedFlag'] = 1 + sqlArch = "UPDATE ATLAS_PANDA.jobsArchived4 SET archivedFlag=:archivedFlag WHERE PandaID=:PandaID" + self.cur.execute(sqlArch+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("insertJobSimpleUnread %s : %s %s" % (pandaID,type,value)) + # roll back + self._rollback() + return False + + + # delete job + def deleteJobSimple(self,pandaID): + comment = ' /* 
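# A bare-bones sketch of the server-side archiving pattern in insertJobSimpleUnread()
# above: copy the row with INSERT ... SELECT so the payload never travels to the
# client, then flag the live row so the copy is not attempted twice. Table and column
# names are trimmed-down placeholders.
def archive_job(cur, panda_id, mod_time):
    cur.execute(
        "INSERT INTO jobsArchived (PandaID, jobStatus, modificationTime) "
        "SELECT PandaID, jobStatus, :modTime FROM jobsArchived4 WHERE PandaID=:pandaID",
        {':pandaID': panda_id, ':modTime': mod_time})
    cur.execute(
        "UPDATE jobsArchived4 SET archivedFlag=1 WHERE PandaID=:pandaID",
        {':pandaID': panda_id})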
DBProxy.deleteJobSimple */' + _logger.debug("deleteJobSimple : %s" % pandaID) + try: + # begin transaction + self.conn.begin() + # delete + varMap = {} + varMap[':PandaID'] = pandaID + sql = 'DELETE from ATLAS_PANDA.jobsArchived4 WHERE PandaID=:PandaID' + self.cur.execute(sql+comment, varMap) + sql = "DELETE from ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID" + self.cur.execute(sql+comment, varMap) + sql = "DELETE from ATLAS_PANDA.metaTable WHERE PandaID=:PandaID" + self.cur.execute(sql+comment, varMap) + sql = "DELETE from ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" + self.cur.execute(sql+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + type, value = sys.exc_info()[:2] + _logger.error("deleteJobSimple %s : %s %s" % (pandaID,type,value)) + # roll back + self._rollback() + return False + + + # activate job. move job from jobsDefined to jobsActive + def activateJob(self,job): + comment = ' /* DBProxy.activateJob */' + updatedFlag = False + if job==None: + _logger.debug("activateJob : None") + return True + _logger.debug("activateJob : %s" % job.PandaID) + sql0 = "SELECT row_ID FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type AND NOT status IN (:status1,:status2) " + sql1 = "DELETE FROM ATLAS_PANDA.jobsDefined4 " + sql1+= "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2) AND commandToPilot IS NULL" + sql2 = "INSERT INTO ATLAS_PANDA.jobsActive4 (%s) " % JobSpec.columnNames() + sql2+= JobSpec.bindValuesExpression() + # host and time information + job.modificationTime = datetime.datetime.utcnow() + # set stateChangeTime for defined->activated but not for assigned->activated + if job.jobStatus in ['defined']: + job.stateChangeTime = job.modificationTime + nTry=3 + for iTry in range(nTry): + try: + # check if all files are ready + allOK = True + for file in job.Files: + if file.type == 'input' and not file.status in ['ready','cached']: + allOK = False + break + # begin transaction + self.conn.begin() + # check all inputs are ready + varMap = {} + varMap[':type'] = 'input' + varMap[':status1'] = 'ready' + varMap[':status2'] = 'cached' + varMap[':PandaID'] = job.PandaID + self.cur.arraysize = 100 + self.cur.execute(sql0+comment, varMap) + res = self.cur.fetchall() + if len(res) == 0 or allOK: + # change status + job.jobStatus = "activated" + # delete + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':oldJobStatus1'] = 'assigned' + varMap[':oldJobStatus2'] = 'defined' + self.cur.execute(sql1+comment, varMap) + n = self.cur.rowcount + if n==0: + # already killed or activated + _logger.debug("activateJob : Not found %s" % job.PandaID) + else: + # insert + self.cur.execute(sql2+comment, job.valuesMap()) + # update files + for file in job.Files: + sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" + varMap = file.valuesMap(onlyChanged=True) + if varMap != {}: + varMap[':row_ID'] = file.row_ID + _logger.debug(sqlF+comment+str(varMap)) + self.cur.execute(sqlF+comment, varMap) + # job parameters + sqlJob = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':param'] = job.jobParameters + self.cur.execute(sqlJob+comment, varMap) + updatedFlag = True + else: + # update job + sqlJ = ("UPDATE ATLAS_PANDA.jobsDefined4 SET %s " % job.bindUpdateChangesExpression()) + \ + "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR 
jobStatus=:oldJobStatus2)" + varMap = job.valuesMap(onlyChanged=True) + varMap[':PandaID'] = job.PandaID + varMap[':oldJobStatus1'] = 'assigned' + varMap[':oldJobStatus2'] = 'defined' + _logger.debug(sqlJ+comment+str(varMap)) + self.cur.execute(sqlJ+comment, varMap) + n = self.cur.rowcount + if n==0: + # already killed or activated + _logger.debug("activateJob : Not found %s" % job.PandaID) + else: + # update files + for file in job.Files: + sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" + varMap = file.valuesMap(onlyChanged=True) + if varMap != {}: + varMap[':row_ID'] = file.row_ID + _logger.debug(sqlF+comment+str(varMap)) + self.cur.execute(sqlF+comment, varMap) + # job parameters + sqlJob = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':param'] = job.jobParameters + self.cur.execute(sqlJob+comment, varMap) + updatedFlag = True + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # record status change + try: + if updatedFlag: + self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) + except: + _logger.error('recordStatusChange in activateJob') + return True + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("activateJob : %s retry : %s" % (job.PandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("activateJob : %s %s" % (type,value)) + return False + + + # send job to jobsWaiting + def keepJob(self,job): + comment = ' /* DBProxy.keepJob */' + _logger.debug("keepJob : %s" % job.PandaID) + sql1 = "DELETE FROM ATLAS_PANDA.jobsDefined4 " + sql1+= "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2) AND commandToPilot IS NULL" + sql2 = "INSERT INTO ATLAS_PANDA.jobsWaiting4 (%s) " % JobSpec.columnNames() + sql2+= JobSpec.bindValuesExpression() + # time information + job.modificationTime = datetime.datetime.utcnow() + job.stateChangeTime = job.modificationTime + updatedFlag = False + nTry=3 + for iTry in range(nTry): + try: + # begin transaction + self.conn.begin() + # delete + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':oldJobStatus1'] = 'assigned' + varMap[':oldJobStatus2'] = 'defined' + self.cur.execute(sql1+comment, varMap) + n = self.cur.rowcount + if n==0: + # already killed + _logger.debug("keepJob : Not found %s" % job.PandaID) + else: + # set status + job.jobStatus = 'waiting' + # insert + self.cur.execute(sql2+comment, job.valuesMap()) + # update files + for file in job.Files: + sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" + varMap = file.valuesMap(onlyChanged=True) + if varMap != {}: + varMap[':row_ID'] = file.row_ID + _logger.debug(sqlF+comment+str(varMap)) + self.cur.execute(sqlF+comment, varMap) + updatedFlag = True + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # record status change + try: + if updatedFlag: + self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) + except: + _logger.error('recordStatusChange in keepJob') + return True + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("keepJob : %s retry : %s" % (job.PandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("keepJob : %s %s" % (type,value)) + return False + + + # archive job to jobArchived and remove 
the job from jobsActive or jobsDefined + def archiveJob(self,job,fromJobsDefined): + comment = ' /* DBProxy.archiveJob */' + _logger.debug("archiveJob : %s" % job.PandaID) + if fromJobsDefined: + sql1 = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)" + else: + sql1 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID" + sql2 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() + sql2+= JobSpec.bindValuesExpression() + updatedJobList = [] + nTry=3 + for iTry in range(nTry): + try: + # begin transaction + self.conn.begin() + # delete + varMap = {} + varMap[':PandaID'] = job.PandaID + if fromJobsDefined: + varMap[':oldJobStatus1'] = 'assigned' + varMap[':oldJobStatus2'] = 'defined' + self.cur.execute(sql1+comment, varMap) + n = self.cur.rowcount + if n==0: + # already killed + _logger.debug("archiveJob : Not found %s" % job.PandaID) + else: + # insert + job.modificationTime = datetime.datetime.utcnow() + job.stateChangeTime = job.modificationTime + if job.endTime == 'NULL': + job.endTime = job.modificationTime + self.cur.execute(sql2+comment, job.valuesMap()) + # update files + for file in job.Files: + sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" + varMap = file.valuesMap(onlyChanged=True) + if varMap != {}: + varMap[':row_ID'] = file.row_ID + _logger.debug(sqlF+comment+str(varMap)) + self.cur.execute(sqlF+comment, varMap) + # update metadata and parameters + sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':modificationTime'] = job.modificationTime + self.cur.execute(sqlFMod+comment,varMap) + self.cur.execute(sqlMMod+comment,varMap) + self.cur.execute(sqlPMod+comment,varMap) + # increment the number of failed jobs in _dis + myDisList = [] + if job.jobStatus == 'failed' and job.prodSourceLabel in ['managed','test']: + for tmpFile in job.Files: + if tmpFile.type == 'input' and not tmpFile.dispatchDBlock in ['','NULL',None] \ + and not tmpFile.dispatchDBlock in myDisList: + varMap = {} + varMap[':name'] = tmpFile.dispatchDBlock + # check currentfiles + sqlGetCurFiles = """SELECT /*+ BEGIN_OUTLINE_DATA """ + sqlGetCurFiles += """INDEX_RS_ASC(@"SEL$1" "TAB"@"SEL$1" ("DATASETS"."NAME")) """ + sqlGetCurFiles += """OUTLINE_LEAF(@"SEL$1") ALL_ROWS """ + sqlGetCurFiles += """OPTIMIZER_FEATURES_ENABLE('10.2.0.4') """ + sqlGetCurFiles += """IGNORE_OPTIM_EMBEDDED_HINTS """ + sqlGetCurFiles += """END_OUTLINE_DATA */ """ + sqlGetCurFiles += "currentfiles,vuid FROM ATLAS_PANDA.Datasets tab WHERE name=:name" + self.cur.execute(sqlGetCurFiles+comment,varMap) + resCurFiles = self.cur.fetchone() + _logger.debug("archiveJob : %s %s" % (job.PandaID,str(resCurFiles))) + if resCurFiles != None: + # increment currentfiles only for the first failed job since that is enough + tmpCurrentFiles,tmpVUID = resCurFiles + _logger.debug("archiveJob : %s %s currentfiles=%s" % (job.PandaID,tmpFile.dispatchDBlock,tmpCurrentFiles)) + if tmpCurrentFiles == 0: + _logger.debug("archiveJob : %s %s update currentfiles" % (job.PandaID,tmpFile.dispatchDBlock)) + varMap = {} + varMap[':vuid'] = tmpVUID + sqlFailedInDis = 'UPDATE ATLAS_PANDA.Datasets 
' + sqlFailedInDis += 'SET currentfiles=currentfiles+1 WHERE vuid=:vuid' + self.cur.execute(sqlFailedInDis+comment,varMap) + myDisList.append(tmpFile.dispatchDBlock) + # collect to record state change + updatedJobList.append(job) + # delete downstream jobs + ddmIDs = [] + newJob = None + ddmAttempt = 0 + if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed': + # look for outputs + upOutputs = [] + for file in job.Files: + if file.type == 'output': + upOutputs.append(file.lfn) + toBeClosedSubList = {} + topUserDsList = [] + # look for downstream jobs + sqlD = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND lfn=:lfn GROUP BY PandaID" + sqlDJS = "SELECT %s " % JobSpec.columnNames() + sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" + sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" + sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() + sqlDJI+= JobSpec.bindValuesExpression() + sqlDFup = "UPDATE ATLAS_PANDA.filesTable4 SET status=:status WHERE PandaID=:PandaID AND type IN (:type1,:type2)" + sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlGetSub = "SELECT DISTINCT destinationDBlock FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND PandaID=:PandaID" + sqlCloseSub = 'UPDATE /*+ INDEX_RS_ASC(TAB("DATASETS"."NAME")) */ ATLAS_PANDA.Datasets tab ' + sqlCloseSub += 'SET status=:status,modificationDate=CURRENT_DATE WHERE name=:name' + for upFile in upOutputs: + _logger.debug("look for downstream jobs for %s" % upFile) + # select PandaID + varMap = {} + varMap[':lfn'] = upFile + varMap[':type'] = 'input' + self.cur.arraysize = 100000 + self.cur.execute(sqlD+comment, varMap) + res = self.cur.fetchall() + for downID, in res: + _logger.debug("delete : %s" % downID) + # select jobs + varMap = {} + varMap[':PandaID'] = downID + self.cur.arraysize = 10 + self.cur.execute(sqlDJS+comment, varMap) + resJob = self.cur.fetchall() + if len(resJob) == 0: + continue + # instantiate JobSpec + dJob = JobSpec() + dJob.pack(resJob[0]) + # delete + varMap = {} + varMap[':PandaID'] = downID + self.cur.execute(sqlDJD+comment, varMap) + retD = self.cur.rowcount + if retD == 0: + continue + # error code + dJob.jobStatus = 'cancelled' + dJob.endTime = datetime.datetime.utcnow() + dJob.taskBufferErrorCode = ErrorCode.EC_Kill + dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed' + dJob.modificationTime = dJob.endTime + dJob.stateChangeTime = dJob.endTime + # insert + self.cur.execute(sqlDJI+comment, dJob.valuesMap()) + # update file status + varMap = {} + varMap[':PandaID'] = downID + varMap[':status'] = 'failed' + varMap[':type1'] = 'output' + varMap[':type2'] = 'log' + self.cur.execute(sqlDFup+comment, varMap) + # update files,metadata,parametes + varMap = {} + varMap[':PandaID'] = downID + varMap[':modificationTime'] = dJob.modificationTime + self.cur.execute(sqlFMod+comment,varMap) + self.cur.execute(sqlMMod+comment,varMap) + self.cur.execute(sqlPMod+comment,varMap) + # collect to record state change + updatedJobList.append(dJob) + # set tobeclosed to sub datasets + if not toBeClosedSubList.has_key(dJob.jobDefinitionID): + # init + toBeClosedSubList[dJob.jobDefinitionID] = [] + # get sub datasets + varMap = {} + varMap[':type'] = 'output' + 
varMap[':PandaID'] = downID + self.cur.arraysize = 1000 + self.cur.execute(sqlGetSub+comment, varMap) + resGetSub = self.cur.fetchall() + if len(resGetSub) == 0: + continue + # loop over all sub datasets + for tmpDestinationDBlock, in resGetSub: + if re.search('_sub\d+$',tmpDestinationDBlock) == None: + continue + if not tmpDestinationDBlock in toBeClosedSubList[dJob.jobDefinitionID]: + # set tobeclosed + varMap = {} + varMap[':status'] = 'tobeclosed' + varMap[':name'] = tmpDestinationDBlock + self.cur.execute(sqlCloseSub+comment, varMap) + _logger.debug("set tobeclosed for %s" % tmpDestinationDBlock) + # append + toBeClosedSubList[dJob.jobDefinitionID].append(tmpDestinationDBlock) + # close top-level user dataset + topUserDsName = re.sub('_sub\d+$','',tmpDestinationDBlock) + if topUserDsName != tmpDestinationDBlock and not topUserDsName in topUserDsList: + # set tobeclosed + varMap = {} + if dJob.processingType.startswith('gangarobot') or \ + dJob.processingType.startswith('hammercloud'): + varMap[':status'] = 'completed' + else: + varMap[':status'] = 'tobeclosed' + varMap[':name'] = topUserDsName + self.cur.execute(sqlCloseSub+comment, varMap) + _logger.debug("set %s for %s" % (varMap[':status'],topUserDsName)) + # append + topUserDsList.append(topUserDsName) + elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='dis': + # get corresponding jobs for production movers + vuid = '' + # extract vuid + match = re.search('--callBack (\S+)',job.jobParameters) + if match != None: + try: + callbackUrl = urllib.unquote(match.group(1)) + callbackUrl = re.sub('[&\?]',' ', callbackUrl) + # look for vuid= + for item in callbackUrl.split(): + if item.startswith('vuid='): + vuid = item.split('=')[-1] + break + except: + pass + if vuid == '': + _logger.error("cannot extract vuid from %s" % job.jobParameters) + else: + # get name + varMap = {} + varMap[':vuid'] = vuid + varMap[':type'] = 'dispatch' + self.cur.arraysize = 10 + self.cur.execute("SELECT name FROM ATLAS_PANDA.Datasets WHERE vuid=:vuid AND type=:type "+comment, varMap) + res = self.cur.fetchall() + if len(res) != 0: + disName = res[0][0] + # check lost files + varMap = {} + varMap[':status'] = 'lost' + varMap[':dispatchDBlock'] = disName + sqlLost = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ distinct PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE status=:status AND dispatchDBlock=:dispatchDBlock" + self.cur.execute(sqlLost+comment,varMap) + resLost = self.cur.fetchall() + # fail jobs with lost files + sqlDJS = "SELECT %s " % JobSpec.columnNames() + sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" + sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" + sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() + sqlDJI+= JobSpec.bindValuesExpression() + sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + lostJobIDs = [] + for tmpID, in resLost: + _logger.debug("fail due to lost files : %s" % tmpID) + varMap = {} + varMap[':PandaID'] = tmpID + self.cur.arraysize = 10 + self.cur.execute(sqlDJS+comment, varMap) + resJob = self.cur.fetchall() + if len(resJob) == 0: + continue + # instantiate JobSpec + dJob = JobSpec() + dJob.pack(resJob[0]) + # delete + varMap = {} + varMap[':PandaID'] = 
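# A tiny sketch of the dataset-name handling in the downstream-cleanup code above:
# a '_sub<digits>' suffix marks a per-site sub-dataset, and stripping it yields the
# top-level user dataset that should also be closed. The example name is made up.
import re

def split_sub_dataset(name):
    if re.search(r'_sub\d+$', name) is None:
        return None, None                    # not a sub-dataset
    top = re.sub(r'_sub\d+$', '', name)
    return name, top

# split_sub_dataset('user.jane.test.123_sub0456') -> ('user.jane.test.123_sub0456', 'user.jane.test.123')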
tmpID + self.cur.execute(sqlDJD+comment, varMap) + retD = self.cur.rowcount + if retD == 0: + continue + # error code + dJob.jobStatus = 'failed' + dJob.endTime = datetime.datetime.utcnow() + dJob.ddmErrorCode = 101 #ErrorCode.EC_LostFile + dJob.ddmErrorDiag = 'lost file in SE' + dJob.modificationTime = dJob.endTime + dJob.stateChangeTime = dJob.endTime + # insert + self.cur.execute(sqlDJI+comment, dJob.valuesMap()) + # update files,metadata,parametes + varMap = {} + varMap[':PandaID'] = tmpID + varMap[':modificationTime'] = dJob.modificationTime + self.cur.execute(sqlFMod+comment,varMap) + self.cur.execute(sqlMMod+comment,varMap) + self.cur.execute(sqlPMod+comment,varMap) + # append + lostJobIDs.append(tmpID) + # collect to record state change + updatedJobList.append(dJob) + # get PandaIDs + varMap = {} + varMap[':jobStatus'] = 'assigned' + varMap[':dispatchDBlock'] = disName + self.cur.execute("SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE dispatchDBlock=:dispatchDBlock AND jobStatus=:jobStatus "+comment, + varMap) + resDDM = self.cur.fetchall() + for tmpID, in resDDM: + if not tmpID in lostJobIDs: + ddmIDs.append(tmpID) + # get offset + ddmAttempt = job.attemptNr + _logger.debug("get PandaID for reassign : %s ddmAttempt=%s" % (str(ddmIDs),ddmAttempt)) + elif job.prodSourceLabel == 'ddm' and job.jobStatus == 'failed' and job.transferType=='ddm' and job.attemptNr<2 \ + and job.commandToPilot != 'tobekilled': + # instantiate new mover to retry subscription + newJob = JobSpec() + newJob.jobDefinitionID = job.jobDefinitionID + newJob.jobName = job.jobName + newJob.attemptNr = job.attemptNr + 1 + newJob.transformation = job.transformation + newJob.destinationDBlock = job.destinationDBlock + newJob.destinationSE = job.destinationSE + newJob.currentPriority = job.currentPriority + newJob.prodSourceLabel = job.prodSourceLabel + newJob.prodUserID = job.prodUserID + newJob.computingSite = job.computingSite + newJob.transferType = job.transferType + newJob.sourceSite = job.sourceSite + newJob.destinationSite = job.destinationSite + newJob.jobParameters = job.jobParameters + if job.Files != []: + file = job.Files[0] + fileOL = FileSpec() + # add attempt nr + fileOL.lfn = re.sub("\.\d+$","",file.lfn) + fileOL.lfn = "%s.%d" % (fileOL.lfn,job.attemptNr) + fileOL.destinationDBlock = file.destinationDBlock + fileOL.destinationSE = file.destinationSE + fileOL.dataset = file.dataset + fileOL.type = file.type + newJob.addFile(fileOL) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # record status change + try: + for tmpJob in updatedJobList: + self.recordStatusChange(tmpJob.PandaID,tmpJob.jobStatus,jobInfo=tmpJob) + except: + _logger.error('recordStatusChange in archiveJob') + return True,ddmIDs,ddmAttempt,newJob + except: + # roll back + self._rollback(True) + if iTry+1 < nTry: + _logger.debug("archiveJob : %s retry : %s" % (job.PandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("archiveJob : %s" % job.PandaID) + _logger.error("archiveJob : %s %s" % (type,value)) + return False,[],0,None + + + # overload of archiveJob + def archiveJobLite(self,pandaID,jobStatus,param): + comment = ' /* DBProxy.archiveJobLite */' + _logger.debug("archiveJobLite : %s" % pandaID) + sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames() + sql1+= "WHERE PandaID=:PandaID" + sql2 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID" + sql3 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % 
JobSpec.columnNames() + sql3+= JobSpec.bindValuesExpression() + sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + nTry=3 + for iTry in range(nTry): + try: + # begin transaction + self.conn.begin() + # select + varMap = {} + varMap[':PandaID'] = pandaID + self.cur.arraysize = 10 + self.cur.execute(sql1+comment, varMap) + res = self.cur.fetchall() + if len(res) == 0: + _logger.error("archiveJobLite() : PandaID %d not found" % pandaID) + self._rollback() + return False + job = JobSpec() + job.pack(res[0]) + job.jobStatus = jobStatus + for key in param.keys(): + if param[key] != None: + setattr(job,key,param[key]) + job.modificationTime = datetime.datetime.utcnow() + job.endTime = job.modificationTime + job.stateChangeTime = job.modificationTime + # delete + self.cur.execute(sql2+comment, varMap) + n = self.cur.rowcount + if n==0: + # already killed + _logger.debug("archiveJobLite : Not found %s" % pandaID) + else: + # insert + self.cur.execute(sql3+comment, job.valuesMap()) + # update files + for file in job.Files: + sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" + varMap = file.valuesMap(onlyChanged=True) + if varMap != {}: + varMap[':row_ID'] = file.row_ID + _logger.debug(sqlF+comment+str(varMap)) + self.cur.execute(sqlF+comment, varMap) + # update files,metadata,parametes + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':modificationTime'] = job.modificationTime + self.cur.execute(sqlFMod+comment,varMap) + self.cur.execute(sqlMMod+comment,varMap) + self.cur.execute(sqlPMod+comment,varMap) + # delete downstream jobs + if job.prodSourceLabel == 'panda' and job.jobStatus == 'failed': + # file select + sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() + sqlFile+= "WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = pandaID + self.cur.arraysize = 100000 + self.cur.execute(sqlFile+comment, varMap) + resFs = self.cur.fetchall() + for resF in resFs: + file = FileSpec() + file.pack(resF) + job.addFile(file) + # look for outputs + upOutputs = [] + for file in job.Files: + if file.type == 'output': + upOutputs.append(file.lfn) + # look for downstream jobs + sqlD = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND lfn=:lfn GROUP BY PandaID" + sqlDJS = "SELECT %s " % JobSpec.columnNames() + sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" + sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" + sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() + sqlDJI+= JobSpec.bindValuesExpression() + for upFile in upOutputs: + _logger.debug("look for downstream jobs for %s" % upFile) + # select PandaID + varMap = {} + varMap[':lfn'] = upFile + varMap[':type'] = 'input' + self.cur.arraysize = 100000 + self.cur.execute(sqlD+comment, varMap) + res = self.cur.fetchall() + for downID, in res: + _logger.debug("delete : %s" % downID) + # select jobs + varMap = {} + varMap[':PandaID'] = downID + self.cur.arraysize = 10 + self.cur.execute(sqlDJS+comment, varMap) + resJob = self.cur.fetchall() + if len(resJob) == 0: + continue + # instantiate JobSpec + dJob = JobSpec() + dJob.pack(resJob[0]) + # delete + varMap = {} + varMap[':PandaID'] = downID + 
self.cur.execute(sqlDJD+comment, varMap) + retD = self.cur.rowcount + if retD == 0: + continue + # error code + dJob.jobStatus = 'failed' + dJob.endTime = datetime.datetime.utcnow() + dJob.taskBufferErrorCode = ErrorCode.EC_Kill + dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed' + dJob.modificationTime = dJob.endTime + dJob.stateChangeTime = dJob.endTime + # insert + self.cur.execute(sqlDJI+comment, dJob.valuesMap()) + # update files,metadata,parametes + varMap = {} + varMap[':PandaID'] = downID + varMap[':modificationTime'] = dJob.modificationTime + self.cur.execute(sqlFMod+comment,varMap) + self.cur.execute(sqlMMod+comment,varMap) + self.cur.execute(sqlPMod+comment,varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("archiveJobLite : %s retry : %s" % (pandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("archiveJobLite : %s %s" % (type,value)) + return False + + + # finalize pending jobs + def finalizePendingJobs(self,prodUserName,jobDefinitionID): + comment = ' /* DBProxy.finalizePendingJobs */' + _logger.debug("finalizePendingJobs : %s %s" % (prodUserName,jobDefinitionID)) + sql0 = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 " + sql0+= "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sql0+= "AND prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus " + sqlU = "UPDATE ATLAS_PANDA.jobsActive4 SET jobStatus=:newJobStatus " + sqlU+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus " + sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames() + sql1+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus " + sql2 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID AND jobStatus=:jobStatus " + sql3 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() + sql3+= JobSpec.bindValuesExpression() + sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + try: + # begin transaction + self.conn.begin() + self.cur.arraysize = 100000 + # select + varMap = {} + varMap[':jobStatus'] = 'failed' + varMap[':prodUserName'] = prodUserName + varMap[':jobDefinitionID'] = jobDefinitionID + varMap[':prodSourceLabel'] = 'user' + self.cur.execute(sql0+comment,varMap) + resPending = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # lock + pPandaIDs = [] + for pandaID, in resPending: + # begin transaction + self.conn.begin() + # update + varMap = {} + varMap[':jobStatus'] = 'failed' + varMap[':newJobStatus'] = 'holding' + varMap[':PandaID'] = pandaID + self.cur.execute(sqlU+comment,varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retU = self.cur.rowcount + if retU != 0: + pPandaIDs.append(pandaID) + # loop over all PandaIDs + for pandaID in pPandaIDs: + # begin transaction + self.conn.begin() + # get job + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':jobStatus'] = 'holding' + self.cur.arraysize = 10 + self.cur.execute(sql1+comment,varMap) + res = self.cur.fetchall() + if len(res) == 0: + _logger.debug("finalizePendingJobs : PandaID %d not found" % pandaID) + # commit + if not 
self._commit():
+                        raise RuntimeError, 'Commit error'
+                    continue
+                job = JobSpec()
+                job.pack(res[0])
+                job.jobStatus = 'failed'
+                job.modificationTime = datetime.datetime.utcnow()
+                # delete
+                self.cur.execute(sql2+comment,varMap)
+                n = self.cur.rowcount
+                if n==0:
+                    # already killed
+                    _logger.debug("finalizePendingJobs : Not found %s" % pandaID)
+                else:
+                    # insert
+                    self.cur.execute(sql3+comment,job.valuesMap())
+                    # update files,metadata,parametes
+                    varMap = {}
+                    varMap[':PandaID'] = pandaID
+                    varMap[':modificationTime'] = job.modificationTime
+                    self.cur.execute(sqlFMod+comment,varMap)
+                    self.cur.execute(sqlMMod+comment,varMap)
+                    self.cur.execute(sqlPMod+comment,varMap)
+                # commit
+                if not self._commit():
+                    raise RuntimeError, 'Commit error'
+            _logger.debug("finalizePendingJobs : %s %s done for %s" % (prodUserName,jobDefinitionID,len(pPandaIDs)))
+            return True
+        except:
+            # roll back
+            self._rollback()
+            errType,errValue = sys.exc_info()[:2]
+            _logger.error("finalizePendingJobs : %s %s" % (errType,errValue))
+            return False
+
+
+    # delete stalled jobs
+    def deleteStalledJobs(self,libFileName):
+        comment = ' /* DBProxy.deleteStalledJobs */'
+        _logger.debug("deleteStalledJobs : %s" % libFileName)
+        sql2 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
+        sql2+= JobSpec.bindValuesExpression()
+        nTry=3
+        try:
+            # begin transaction
+            self.conn.begin()
+            # look for downstream jobs
+            sqlD = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND lfn=:lfn GROUP BY PandaID"
+            sqlDJS = "SELECT %s " % JobSpec.columnNames()
+            sqlDJS+= "FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
+            sqlDJD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID"
+            sqlDJI = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames()
+            sqlDJI+= JobSpec.bindValuesExpression()
+            sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
+            sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
+            sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID"
+            _logger.debug("deleteStalledJobs : look for downstream jobs for %s" % libFileName)
+            # select PandaID
+            varMap = {}
+            varMap[':lfn'] = libFileName
+            varMap[':type'] = 'input'
+            self.cur.arraysize = 100000
+            self.cur.execute(sqlD+comment, varMap)
+            res = self.cur.fetchall()
+            for downID, in res:
+                _logger.debug("deleteStalledJobs : delete %s" % downID)
+                # select jobs
+                varMap = {}
+                varMap[':PandaID'] = downID
+                self.cur.arraysize = 10
+                self.cur.execute(sqlDJS+comment, varMap)
+                resJob = self.cur.fetchall()
+                if len(resJob) == 0:
+                    continue
+                # instantiate JobSpec
+                dJob = JobSpec()
+                dJob.pack(resJob[0])
+                # delete
+                varMap = {}
+                varMap[':PandaID'] = downID
+                self.cur.execute(sqlDJD+comment, varMap)
+                retD = self.cur.rowcount
+                if retD == 0:
+                    continue
+                # error code
+                dJob.jobStatus = 'cancelled'
+                dJob.endTime = datetime.datetime.utcnow()
+                dJob.taskBufferErrorCode = ErrorCode.EC_Kill
+                dJob.taskBufferErrorDiag = 'killed by Panda server : upstream job failed'
+                dJob.modificationTime = dJob.endTime
+                dJob.stateChangeTime = dJob.endTime
+                # insert
+                self.cur.execute(sqlDJI+comment, dJob.valuesMap())
+                # update files,metadata,parametes
+                varMap = {}
+                varMap[':PandaID'] = downID
+                varMap[':modificationTime'] = dJob.modificationTime
+                self.cur.execute(sqlFMod+comment,varMap)
+                self.cur.execute(sqlMMod+comment,varMap)
+                self.cur.execute(sqlPMod+comment,varMap)
+            # commit
+            if not self._commit():
+                raise RuntimeError, 'Commit error'
+            return True
+        except:
+            # roll back
+            self._rollback(True)
+            errtype,errvalue = sys.exc_info()[:2]
+            _logger.error("deleteStalledJobs : %s %s" % (errtype,errvalue))
+            return False
+
+
+    # update Job status in jobsActive
+    def updateJobStatus(self,pandaID,jobStatus,param,updateStateChange=False,attemptNr=None):
+        comment = ' /* DBProxy.updateJobStatus */'
+        _logger.debug("updateJobStatus : PandaID=%s attemptNr=%s status=%s" % (pandaID,attemptNr,jobStatus))
+        sql0 = "SELECT commandToPilot,endTime,specialHandling,jobStatus,computingSite,cloud,prodSourceLabel FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID "
+        varMap0 = {}
+        varMap0[':PandaID'] = pandaID
+        sql1 = "UPDATE ATLAS_PANDA.jobsActive4 SET jobStatus=:jobStatus,modificationTime=CURRENT_DATE"
+        if updateStateChange or jobStatus in ['starting']:
+            sql1 += ",stateChangeTime=CURRENT_DATE"
+        varMap = {}
+        varMap[':jobStatus'] = jobStatus
+        presetEndTime = False
+        for key in param.keys():
+            if param[key] != None:
+                sql1 += ',%s=:%s' % (key,key)
+                varMap[':%s' % key] = param[key]
+                if key == 'endTime':
+                    presetEndTime = True
+                try:
+                    # store positive error code even for pilot retry
+                    if key == 'pilotErrorCode' and param[key].startswith('-'):
+                        varMap[':%s' % key] = param[key][1:]
+                except:
+                    pass
+        sql1W = " WHERE PandaID=:PandaID "
+        varMap[':PandaID'] = pandaID
+        if attemptNr != None:
+            sql0 += "AND attemptNr=:attemptNr "
+            sql1W += "AND attemptNr=:attemptNr "
+            varMap[':attemptNr'] = attemptNr
+            varMap0[':attemptNr'] = attemptNr
+        # prevent change from holding to transferring which doesn't register files to sub/tid
+        if jobStatus == 'transferring':
+            sql1W += "AND NOT jobStatus=:ngStatus "
+            varMap[':ngStatus'] = 'holding'
+        updatedFlag = False
+        nTry=3
+        for iTry in range(nTry):
+            try:
+                # begin transaction
+                self.conn.begin()
+                # select
+                self.cur.arraysize = 10
+                self.cur.execute (sql0+comment,varMap0)
+                res = self.cur.fetchone()
+                if res != None:
+                    ret = ''
+                    commandToPilot,endTime,specialHandling,oldJobStatus,computingSite,cloud,prodSourceLabel = res
+                    # debug mode
+                    """
+                    if not specialHandling in [None,''] and 'debug' in specialHandling:
+                        ret += 'debugon,'
+                    else:
+                        ret += 'debugoff,'
+                    """
+                    # kill command
+                    if not commandToPilot in [None,'']:
+                        ret += '%s,' % commandToPilot
+                    ret = ret[:-1]
+                    # convert empty to NULL
+                    if ret == '':
+                        ret = 'NULL'
+                    # don't update holding
+                    if oldJobStatus == 'holding' and jobStatus == 'holding':
+                        _logger.debug("updateJobStatus : PandaID=%s skip to reset holding" % pandaID)
+                    else:
+                        # set endTime if undefined for holding
+                        if jobStatus == 'holding' and endTime==None and not presetEndTime:
+                            sql1 += ',endTime=CURRENT_DATE '
+                        # update
+                        self.cur.execute (sql1+sql1W+comment,varMap)
+                        nUp = self.cur.rowcount
+                        _logger.debug("updateJobStatus : PandaID=%s attemptNr=%s nUp=%s" % (pandaID,attemptNr,nUp))
+                        if nUp == 1:
+                            updatedFlag = True
+                        if nUp == 0 and jobStatus == 'transferring':
+                            _logger.debug("updateJobStatus : PandaID=%s ignore to update for transferring" % pandaID)
+                else:
+                    _logger.debug("updateJobStatus : PandaID=%s attemptNr=%s notFound" % (pandaID,attemptNr))
+                    # already deleted or bad attempt number
+                    ret = "badattemptnr"
+                    #ret = 'tobekilled'
+                # commit
+                if not self._commit():
+                    raise RuntimeError, 'Commit error'
+                # record status change
+                try:
+                    if updatedFlag and oldJobStatus != None and oldJobStatus != jobStatus:
+                        self.recordStatusChange(pandaID,jobStatus,
+                                                infoMap={'computingSite':computingSite,
+                                                         'cloud':cloud,
+
'prodSourceLabel':prodSourceLabel}) + except: + _logger.error('recordStatusChange in updateJobStatus') + return ret + except: + # roll back + self._rollback(True) + if iTry+1 < nTry: + _logger.debug("updateJobStatus : %s retry : %s" % (pandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("updateJobStatus : %s %s" % (type,value)) + _logger.error("updateJobStatus : %s" % pandaID) + return False + + + # update job information in jobsActive or jobsDefined + def updateJob(self,job,inJobsDefined): + comment = ' /* DBProxy.updateJob */' + _logger.debug("updateJob : %s" % job.PandaID) + updatedFlag = False + nTry=3 + for iTry in range(nTry): + try: + job.modificationTime = datetime.datetime.utcnow() + # set stateChangeTime for defined->assigned + if inJobsDefined: + job.stateChangeTime = job.modificationTime + # make SQL + if inJobsDefined: + sql1 = "UPDATE ATLAS_PANDA.jobsDefined4 SET %s " % job.bindUpdateChangesExpression() + else: + sql1 = "UPDATE ATLAS_PANDA.jobsActive4 SET %s " % job.bindUpdateChangesExpression() + sql1+= "WHERE PandaID=:PandaID " + if inJobsDefined: + sql1+= " AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2) " + # begin transaction + self.conn.begin() + # update + varMap = job.valuesMap(onlyChanged=True) + varMap[':PandaID'] = job.PandaID + if inJobsDefined: + varMap[':oldJobStatus1'] = 'assigned' + varMap[':oldJobStatus2'] = 'defined' + _logger.debug(sql1+comment+str(varMap)) + self.cur.execute(sql1+comment, varMap) + n = self.cur.rowcount + if n==0: + # already killed or activated + _logger.debug("updateJob : Not found %s" % job.PandaID) + else: + for file in job.Files: + sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" + varMap = file.valuesMap(onlyChanged=True) + if varMap != {}: + varMap[':row_ID'] = file.row_ID + _logger.debug(sqlF+comment+str(varMap)) + self.cur.execute(sqlF+comment, varMap) + # update job parameters + sqlJobP = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':param'] = job.jobParameters + self.cur.execute(sqlJobP+comment, varMap) + updatedFlag = True + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # record status change + try: + if updatedFlag: + self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) + except: + _logger.error('recordStatusChange in updateJob') + return True + except: + # roll back + self._rollback(True) + if iTry+1 < nTry: + _logger.debug("updateJob : %s retry : %s" % (job.PandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("updateJob : %s %s" % (type,value)) + return False + + + # retry analysis job + def retryJob(self,pandaID,param,failedInActive=False,changeJobInMem=False,inMemJob=None, + getNewPandaID=False,attemptNr=None): + comment = ' /* DBProxy.retryJob */' + _logger.debug("retryJob : %s inActive=%s" % (pandaID,failedInActive)) + sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames() + sql1+= "WHERE PandaID=:PandaID " + if failedInActive: + sql1+= "AND jobStatus=:jobStatus " + updatedFlag = False + nTry=3 + for iTry in range(nTry): + try: + retValue = False + if not changeJobInMem: + # begin transaction + self.conn.begin() + # select + varMap = {} + varMap[':PandaID'] = pandaID + if failedInActive: + varMap[':jobStatus'] = 'failed' + self.cur.arraysize = 10 + 
self.cur.execute(sql1+comment, varMap) + res = self.cur.fetchall() + if len(res) == 0: + _logger.debug("retryJob() : PandaID %d not found" % pandaID) + self._rollback() + return retValue + job = JobSpec() + job.pack(res[0]) + else: + job = inMemJob + # don't use getNewPandaID for buildJob since the order of PandaIDs is broken + if getNewPandaID and job.prodSourceLabel in ['panda']: + if not changeJobInMem: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return retValue + # convert attemptNr to int + try: + attemptNr = int(attemptNr) + except: + _logger.debug("retryJob : %s attemptNr=%s non-integer" % (pandaID,attemptNr)) + attemptNr = -999 + # check attemptNr + if attemptNr != None: + if job.attemptNr != attemptNr: + _logger.debug("retryJob : %s bad attemptNr job.%s != pilot.%s" % (pandaID,job.attemptNr,attemptNr)) + if not changeJobInMem: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return retValue + # check if already retried + if job.taskBufferErrorCode in [ErrorCode.EC_Reassigned,ErrorCode.EC_Retried,ErrorCode.EC_PilotRetried]: + _logger.debug("retryJob : %s already retried %s" % (pandaID,job.taskBufferErrorCode)) + if not changeJobInMem: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return retValue + # check pilot retry + usePilotRetry = False + if job.prodSourceLabel in ['user','panda','ptest','rc_test'] and \ + param.has_key('pilotErrorCode') and param['pilotErrorCode'].startswith('-') and \ + job.maxAttempt > job.attemptNr and \ + (not job.processingType.startswith('gangarobot') or job.processingType=='gangarobot-rctest') and \ + not job.processingType.startswith('hammercloud'): + usePilotRetry = True + # check if it's analysis job # FIXME once pilot retry works correctly the conditions below will be cleaned up + if (((job.prodSourceLabel == 'user' or job.prodSourceLabel == 'panda') \ + and not job.processingType.startswith('gangarobot') \ + and not job.processingType.startswith('hammercloud') \ + and job.computingSite.startswith('ANALY_') and param.has_key('pilotErrorCode') \ + and param['pilotErrorCode'] in ['1200','1201','1213'] and (not job.computingSite.startswith('ANALY_LONG_')) \ + and job.attemptNr < 2) or (job.prodSourceLabel == 'ddm' and job.cloud == 'CA' and job.attemptNr <= 10) \ + or failedInActive or usePilotRetry) \ + and job.commandToPilot != 'tobekilled': + _logger.debug('reset PandaID:%s #%s' % (job.PandaID,job.attemptNr)) + if not changeJobInMem: + # job parameters + sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = job.PandaID + self.cur.execute(sqlJobP+comment, varMap) + for clobJobP, in self.cur: + job.jobParameters = clobJobP.read() + break + # reset job + job.jobStatus = 'activated' + job.startTime = None + job.modificationTime = datetime.datetime.utcnow() + job.attemptNr = job.attemptNr + 1 + if usePilotRetry: + job.currentPriority -= 10 + if failedInActive: + job.endTime = None + job.transExitCode = None + for attr in job._attributes: + if attr.endswith('ErrorCode') or attr.endswith('ErrorDiag'): + setattr(job,attr,None) + # remove flag regarding to pledge-resource handling + if not job.specialHandling in [None,'NULL','']: + newSpecialHandling = re.sub(',*localpool','',job.specialHandling) + if newSpecialHandling == '': + job.specialHandling = None + else: + job.specialHandling = newSpecialHandling + # send it to long queue for analysis jobs + oldComputingSite = 
job.computingSite + if not changeJobInMem: + if job.computingSite.startswith('ANALY'): + longSite = None + tmpLongSiteList = [] + tmpLongSite = re.sub('^ANALY_','ANALY_LONG_',job.computingSite) + tmpLongSite = re.sub('_\d+$','',tmpLongSite) + tmpLongSiteList.append(tmpLongSite) + tmpLongSite = job.computingSite + '_LONG' + tmpLongSiteList.append(tmpLongSite) + tmpLongSite = re.sub('SHORT','LONG',job.computingSite) + if tmpLongSite != job.computingSite: + tmpLongSiteList.append(tmpLongSite) + # loop over all possible long sitenames + for tmpLongSite in tmpLongSiteList: + varMap = {} + varMap[':siteID'] = tmpLongSite + varMap[':status'] = 'online' + sqlSite = "SELECT COUNT(*) FROM ATLAS_PANDAMETA.schedconfig WHERE siteID=:siteID AND status=:status" + self.cur.execute(sqlSite+comment, varMap) + resSite = self.cur.fetchone() + if resSite != None and resSite[0] > 0: + longSite = tmpLongSite + break + # use long site if exists + if longSite != None: + _logger.debug('sending PandaID:%s to %s' % (job.PandaID,longSite)) + job.computingSite = longSite + # set destinationSE if queue is changed + if oldComputingSite == job.destinationSE: + job.destinationSE = job.computingSite + if not changeJobInMem: + # select files + varMap = {} + varMap[':PandaID'] = job.PandaID + if not getNewPandaID: + varMap[':type1'] = 'log' + varMap[':type2'] = 'output' + sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() + if not getNewPandaID: + sqlFile+= "WHERE PandaID=:PandaID AND (type=:type1 OR type=:type2)" + else: + sqlFile+= "WHERE PandaID=:PandaID" + self.cur.arraysize = 100 + self.cur.execute(sqlFile+comment, varMap) + resFs = self.cur.fetchall() + else: + # get log or output files only + resFs = [] + for tmpFile in job.Files: + if tmpFile.type in ['log','output']: + resFs.append(tmpFile) + # loop over all files + for resF in resFs: + if not changeJobInMem: + # set PandaID + file = FileSpec() + file.pack(resF) + job.addFile(file) + else: + file = resF + # set new GUID + if file.type == 'log': + file.GUID = commands.getoutput('uuidgen') + # don't change input and lib.tgz + if file.type == 'input' or (file.type == 'output' and job.prodSourceLabel == 'panda') or \ + (file.type == 'output' and file.lfn.endswith('.lib.tgz') and job.prodSourceLabel in ['rc_test','ptest']): + continue + # append attemptNr to LFN + oldName = file.lfn + file.lfn = re.sub('\.\d+$','',file.lfn) + file.lfn = '%s.%s' % (file.lfn,job.attemptNr) + newName = file.lfn + # set destinationSE + if oldComputingSite == file.destinationSE: + file.destinationSE = job.computingSite + # modify jobParameters + sepPatt = "(\'|\"|%20)" + oldName + "(\'|\"|%20)" + matches = re.findall(sepPatt,job.jobParameters) + for match in matches: + oldPatt = match[0]+oldName+match[-1] + newPatt = match[0]+newName+match[-1] + job.jobParameters = re.sub(oldPatt,newPatt,job.jobParameters) + if not changeJobInMem and not getNewPandaID: + # reset file status + if file.type in ['output','log']: + file.status = 'unknown' + # update files + sqlFup = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" + varMap = file.valuesMap(onlyChanged=True) + if varMap != {}: + varMap[':row_ID'] = file.row_ID + self.cur.execute(sqlFup+comment, varMap) + if not changeJobInMem: + # reuse original PandaID + if not getNewPandaID: + # update job + sql2 = "UPDATE ATLAS_PANDA.jobsActive4 SET %s " % job.bindUpdateChangesExpression() + sql2+= "WHERE PandaID=:PandaID " + varMap = job.valuesMap(onlyChanged=True) + 
varMap[':PandaID'] = job.PandaID + self.cur.execute(sql2+comment, varMap) + # update job parameters + sqlJobP = "UPDATE ATLAS_PANDA.jobParamsTable SET jobParameters=:param WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':param'] = job.jobParameters + self.cur.execute(sqlJobP+comment, varMap) + updatedFlag = True + else: + # read metadata + sqlMeta = "SELECT metaData FROM ATLAS_PANDA.metaTable WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = job.PandaID + self.cur.execute(sqlMeta+comment, varMap) + for clobJobP, in self.cur: + job.metadata = clobJobP.read() + break + # insert job with new PandaID + sql1 = "INSERT INTO ATLAS_PANDA.jobsActive4 (%s) " % JobSpec.columnNames() + sql1+= JobSpec.bindValuesExpression(useSeq=True) + sql1+= " RETURNING PandaID INTO :newPandaID" + # set parentID + job.parentID = job.PandaID + varMap = job.valuesMap(useSeq=True) + varMap[':newPandaID'] = self.cur.var(cx_Oracle.NUMBER) + # insert + retI = self.cur.execute(sql1+comment, varMap) + # set PandaID + job.PandaID = long(varMap[':newPandaID'].getvalue()) + _logger.debug('Generate new PandaID %s -> %s #%s' % (job.parentID,job.PandaID,job.attemptNr)) + # insert files + sqlFile = "INSERT INTO ATLAS_PANDA.filesTable4 (%s) " % FileSpec.columnNames() + sqlFile+= FileSpec.bindValuesExpression(useSeq=True) + sqlFile+= " RETURNING row_ID INTO :newRowID" + for file in job.Files: + # reset rowID + file.row_ID = None + # insert + varMap = file.valuesMap(useSeq=True) + varMap[':newRowID'] = self.cur.var(cx_Oracle.NUMBER) + self.cur.execute(sqlFile+comment, varMap) + # update mod time for files + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':modificationTime'] = job.modificationTime + sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + self.cur.execute(sqlFMod+comment,varMap) + # metadata + sqlMeta = "INSERT INTO ATLAS_PANDA.metaTable (PandaID,metaData,modificationTime) VALUES (:PandaID,:metaData,:modTime)" + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':metaData'] = job.metadata + varMap[':modTime'] = job.modificationTime + self.cur.execute(sqlMeta+comment, varMap) + # job parameters + sqlJob = "INSERT INTO ATLAS_PANDA.jobParamsTable (PandaID,jobParameters,modificationTime) VALUES (:PandaID,:param,:modTime)" + varMap = {} + varMap[':PandaID'] = job.PandaID + varMap[':param'] = job.jobParameters + varMap[':modTime'] = job.modificationTime + self.cur.execute(sqlJob+comment, varMap) + # set error code to original job to avoid being retried by another process + sqlE = "UPDATE ATLAS_PANDA.jobsActive4 SET taskBufferErrorCode=:errCode,taskBufferErrorDiag=:errDiag WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = job.parentID + varMap[':errCode'] = ErrorCode.EC_PilotRetried + varMap[':errDiag'] = 'retrying at the same site. 
new PandaID=%s' % job.PandaID + self.cur.execute(sqlE+comment, varMap) + # set return + if not getNewPandaID: + retValue = True + if not changeJobInMem: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # record status change + try: + if updatedFlag: + self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) + except: + _logger.error('recordStatusChange in retryJob') + return retValue + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("retryJob : %s retry : %s" % (pandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + # error report + type, value, traceBack = sys.exc_info() + _logger.error("retryJob : %s %s" % (type,value)) + return False + + + # retry failed analysis jobs in Active4 + def retryJobsInActive(self,prodUserName,jobDefinitionID): + comment = ' /* DBProxy.retryJobsInActive */' + _logger.debug("retryJobsInActive : start - %s %s" % (prodUserName,jobDefinitionID)) + try: + # begin transaction + self.conn.begin() + # count the number of jobs in Defined + sqlC = "SELECT COUNT(*) FROM ATLAS_PANDA.jobsDefined4 " + sqlC += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sqlC += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " + varMap = {} + varMap[':prodUserName'] = prodUserName + varMap[':jobDefinitionID'] = jobDefinitionID + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + self.cur.arraysize = 10 + self.cur.execute(sqlC+comment,varMap) + res = self.cur.fetchone() + # failed to get the number of jobs in Defined + if res == None: + _logger.error("retryJobsInActive : %s %s - failed to get num of jobs in Def" % (prodUserName,jobDefinitionID)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return None for DB error + return None + nJobsInDef = res[0] + # get failed PandaIDs in Active + sql0 = "SELECT PandaID,jobStatus,taskBufferErrorCode,attemptNr FROM ATLAS_PANDA.jobsActive4 " + sql0+= "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sql0+= "AND prodSourceLabel=:prodSourceLabel " + varMap = {} + varMap[':prodUserName'] = prodUserName + varMap[':jobDefinitionID'] = jobDefinitionID + varMap[':prodSourceLabel'] = 'user' + self.cur.execute(sql0+comment,varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # the number of jobs in Active + nJobsInAct = len(res) + # loop over all PandaID + failedPandaIDs = [] + for pandaID,tmpJobStatus,tmpTaskBufferErrorCode,tmpAttemptNr in res: + if tmpJobStatus == 'failed' and not tmpTaskBufferErrorCode in \ + [ErrorCode.EC_Reassigned,ErrorCode.EC_Retried,ErrorCode.EC_PilotRetried]: + failedPandaIDs.append((pandaID,tmpAttemptNr)) + _logger.debug("retryJobsInActive : %s %s - %s failed jobs" % (prodUserName,jobDefinitionID,len(failedPandaIDs))) + # there are some failed jobs in Active + if failedPandaIDs != []: + # get list of sub datasets to lock Closer + sqlF = "SELECT DISTINCT destinationDBlock FROM ATLAS_PANDA.filesTable4 " + sqlF += "WHERE PandaID=:PandaID AND type IN (:type1,:type2) " + varMap = {} + varMap[':PandaID'] = failedPandaIDs[0][0] + varMap[':type1'] = 'log' + varMap[':type2'] = 'output' + # begin transaction + self.conn.begin() + self.cur.arraysize = 100000 + self.cur.execute(sqlF+comment,varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + subDsList = [] + for tmpDSname, in res: + tmpDS = self.queryDatasetWithMap({'name':tmpDSname}) + if 
tmpDS == None: + _logger.error("retryJobsInActive : %s %s - failed to get DS=%s" % (prodUserName,jobDefinitionID,tmpDSname)) + # return None for DB error + return None + # append + subDsList.append(tmpDS) + # lock datasets + lockedDS = True + ngStatus = ['closed','tobeclosed','completed','tobemerged','merging','cleanup'] + sqlD = "UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE " + sqlD+= "WHERE vuid=:vuid AND NOT status IN (" + for tmpIdx,tmpNgStat in enumerate(ngStatus): + sqlD += ':ngSt%s,' % tmpIdx + sqlD = sqlD[:-1] + sqlD += ") " + self.conn.begin() + self.cur.arraysize = 10 + for tmpDS in subDsList: + varMap = {} + varMap[':status'] = 'locked' + varMap[':vuid'] = tmpDS.vuid + for tmpIdx,tmpNgStat in enumerate(ngStatus): + tmpKey = ':ngSt%s' % tmpIdx + varMap[tmpKey] = tmpNgStat + # update + self.cur.execute(sqlD+comment,varMap) + retD = self.cur.rowcount + # datasets already closed + if retD == 0: + # roll back + self._rollback() + # failed to lock datasets + _logger.debug("retryJobsInActive : %s %s - %s is closed" % (prodUserName,jobDefinitionID,tmpDS.name)) + lockedDS = False + break + # retry jobs + if lockedDS: + # commit for dataset lock + if not self._commit(): + raise RuntimeError, 'Commit error' + # loop over all PandaIDs + for pandaID,tmpAttemptNr in failedPandaIDs: + retryRet = self.retryJob(pandaID,{},failedInActive=True,attemptNr=tmpAttemptNr) + _logger.debug("retryJobsInActive : %s %s - PandaID=%s %s" % (prodUserName,jobDefinitionID,pandaID,retryRet)) + # unlock datasets + sqlDU = "UPDATE ATLAS_PANDA.Datasets SET status=:nStatus,modificationdate=CURRENT_DATE " + sqlDU+= "WHERE vuid=:vuid AND status=:oStatus" + self.conn.begin() + self.cur.arraysize = 10 + for tmpDS in subDsList: + varMap = {} + varMap[':oStatus'] = 'locked' + varMap[':nStatus'] = tmpDS.status + varMap[':vuid'] = tmpDS.vuid + # update + self.cur.execute(sqlDU+comment,varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return True when job is active + retVal = False + if nJobsInAct > 0 or nJobsInDef > 0: + retVal = True + _logger.debug("retryJobsInActive : end %s - %s %s" % (retVal,prodUserName,jobDefinitionID)) + return retVal + except: + # roll back + self._rollback() + # error report + errType,errValue = sys.exc_info()[:2] + _logger.error("retryJobsInActive : %s %s" % (errType,errValue)) + return None + + + # get jobs + def getJobs(self,nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, + atlasRelease,prodUserID,countryGroup,workingGroup,allowOtherCountry): + comment = ' /* DBProxy.getJobs */' + # use memcache + useMemcache = False + try: + if panda_config.memcached_enable and siteName in ['MWT2_UC','ANALY_MWT2','BNL_ATLAS_test','ANALY_BNL_test', + 'ANALY_GLASGOW']: # FIXME + # initialize memcache + if self.memcache == None: + from MemProxy import MemProxy + self.memcache = MemProxy() + if not self.memcache in [None,False]: + useMemcache = True + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("failed to initialize memcached with %s %s" % (errType,errValue)) + # aggregated sites which use different appdirs + aggSiteMap = {'CERN-PROD':{'CERN-RELEASE':'release', + 'CERN-UNVALID':'unvalid', + 'CERN-BUILDS' :'builds', + }, + } + # construct where clause + dynamicBrokering = False + getValMap = {} + getValMap[':oldJobStatus'] = 'activated' + getValMap[':computingSite'] = siteName + if not aggSiteMap.has_key(siteName): + sql1 = "WHERE jobStatus=:oldJobStatus AND computingSite=:computingSite AND 
commandToPilot IS NULL " + else: + # aggregated sites + sql1 = "WHERE jobStatus=:oldJobStatus AND computingSite IN (:computingSite," + for tmpAggIdx,tmpAggSite in enumerate(aggSiteMap[siteName].keys()): + tmpKeyName = ':computingSite%s' % tmpAggIdx + sql1 += '%s,' % tmpKeyName + getValMap[tmpKeyName] = tmpAggSite + sql1 = sql1[:-1] + sql1 += ") AND commandToPilot IS NULL " + if not mem in [0,'0']: + sql1+= "AND (minRamCount<=:minRamCount OR minRamCount=0) " + getValMap[':minRamCount'] = mem + if not diskSpace in [0,'0']: + sql1+= "AND (maxDiskCount<=:maxDiskCount OR maxDiskCount=0) " + getValMap[':maxDiskCount'] = diskSpace + if prodSourceLabel == 'user': + sql1+= "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2,:prodSourceLabel3) " + getValMap[':prodSourceLabel1'] = 'user' + getValMap[':prodSourceLabel2'] = 'panda' + getValMap[':prodSourceLabel3'] = 'install' + elif prodSourceLabel == 'ddm': + dynamicBrokering = True + sql1+= "AND prodSourceLabel=:prodSourceLabel " + getValMap[':prodSourceLabel'] = 'ddm' + elif prodSourceLabel in [None,'managed']: + sql1+= "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2,:prodSourceLabel3,:prodSourceLabel4) " + getValMap[':prodSourceLabel1'] = 'managed' + getValMap[':prodSourceLabel2'] = 'test' + getValMap[':prodSourceLabel3'] = 'prod_test' + getValMap[':prodSourceLabel4'] = 'install' + elif prodSourceLabel == 'software': + sql1+= "AND prodSourceLabel=:prodSourceLabel " + getValMap[':prodSourceLabel'] = 'software' + elif prodSourceLabel == 'test' and computingElement != None: + dynamicBrokering = True + sql1+= "AND (processingType IN (:processingType1,:processingType2,:processingType3) " + sql1+= "OR prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2,:prodSourceLabel3)) " + getValMap[':processingType1'] = 'gangarobot' + getValMap[':processingType2'] = 'analy_test' + getValMap[':processingType3'] = 'prod_test' + getValMap[':prodSourceLabel1'] = 'test' + getValMap[':prodSourceLabel2'] = 'prod_test' + getValMap[':prodSourceLabel3'] = 'install' + else: + sql1+= "AND prodSourceLabel=:prodSourceLabel " + getValMap[':prodSourceLabel'] = prodSourceLabel + # user ID + if prodUserID != None: + # get compact DN + compactDN = self.cleanUserID(prodUserID) + if compactDN in ['','NULL',None]: + compactDN = prodUserID + sql1+= "AND prodUserName=:prodUserName " + getValMap[':prodUserName'] = compactDN + # country group + specialHandled = False + if prodSourceLabel == 'user': + # update pledge resource ratio + self.getPledgeResourceRatio() + # other country is allowed to use the pilot + if allowOtherCountry=='True' and self.beyondPledgeRatio.has_key(siteName) and self.beyondPledgeRatio[siteName] > 0: + # check if countryGroup needs to be used for beyond-pledge + if self.checkCountryGroupForBeyondPledge(siteName): + countryGroup = self.beyondPledgeRatio[siteName]['countryGroup'] + specialHandled = True + else: + countryGroup = '' + # countryGroup + if not countryGroup in ['',None]: + sql1+= "AND countryGroup IN (" + idxCountry = 1 + for tmpCountry in countryGroup.split(','): + tmpKey = ":countryGroup%s" % idxCountry + sql1+= "%s," % tmpKey + getValMap[tmpKey] = tmpCountry + idxCountry += 1 + sql1 = sql1[:-1] + sql1+= ") " + # workingGroup + if not workingGroup in ['',None]: + sql1+= "AND workingGroup IN (" + idxWorking = 1 + for tmpWorking in workingGroup.split(','): + tmpKey = ":workingGroup%s" % idxWorking + sql1+= "%s," % tmpKey + getValMap[tmpKey] = tmpWorking + idxWorking += 1 + sql1 = sql1[:-1] + sql1+= ") " + # production share + if 
prodSourceLabel in ['managed',None,'sharetest']: + aggSitesForFairshare = [] + if aggSiteMap.has_key(siteName): + aggSitesForFairshare = aggSiteMap[siteName].keys() + shareSQL,shareVarMap = self.getCriteriaForProdShare(siteName,aggSitesForFairshare) + if shareVarMap != {}: + sql1 += shareSQL + for tmpShareKey in shareVarMap.keys(): + getValMap[tmpShareKey] = shareVarMap[tmpShareKey] + sql2 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames() + sql2+= "WHERE PandaID=:PandaID" + retJobs = [] + nSent = 0 + try: + timeLimit = datetime.timedelta(seconds=timeout-10) + timeStart = datetime.datetime.utcnow() + strName = datetime.datetime.isoformat(timeStart) + attLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=15) + attSQL = "AND ((creationTime<:creationTime AND attemptNr>1) OR attemptNr<=1) " + # get nJobs + for iJob in range(nJobs): + pandaID = 0 + fileMapForMem = {} + # select channel for ddm jobs + if prodSourceLabel == 'ddm': + sqlDDM = "SELECT count(*),jobStatus,sourceSite,destinationSite,transferType FROM ATLAS_PANDA.jobsActive4 WHERE computingSite=:computingSite AND prodSourceLabel=:prodSourceLabel " \ + + attSQL + "GROUP BY jobStatus,sourceSite,destinationSite,transferType" + ddmValMap = {} + ddmValMap[':computingSite'] = siteName + ddmValMap[':creationTime'] = attLimit + ddmValMap[':prodSourceLabel'] = 'ddm' + _logger.debug(sqlDDM+comment+str(ddmValMap)) + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 100 + self.cur.execute(sqlDDM+comment, ddmValMap) + resDDM = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # make a channel map + channelMap = {} + for tmp_count,tmp_jobStatus,tmp_sourceSite,tmp_destinationSite,tmp_transferType in resDDM: + # use source,dest,type as the key + channel = (tmp_sourceSite,tmp_destinationSite,tmp_transferType) + if not channelMap.has_key(channel): + channelMap[channel] = {} + # ignore holding + if tmp_jobStatus == 'holding': + continue + # distinguish activate from other stats + if tmp_jobStatus != 'activated': + tmp_jobStatus = 'others' + # append + if not channelMap[channel].has_key(tmp_jobStatus): + channelMap[channel][tmp_jobStatus] = int(tmp_count) + else: + channelMap[channel][tmp_jobStatus] += int(tmp_count) + _logger.debug(channelMap) + # choose channel + channels = channelMap.keys() + random.shuffle(channels) + foundChannel = False + for channel in channels: + # no activated jobs + if (not channelMap[channel].has_key('activated')) or channelMap[channel]['activated'] == 0: + continue + maxRunning = 15 + # prestaging job + if channel[0] == channel[1] and channel[2] == 'dis': + maxRunning = 50 + if (not channelMap[channel].has_key('others')) or channelMap[channel]['others'] < maxRunning: + # set SQL + sql1+= "AND sourceSite=:sourceSite AND destinationSite=:destinationSite AND transferType=:transferType " + getValMap[':sourceSite'] = channel[0] + getValMap[':destinationSite'] = channel[1] + getValMap[':transferType'] = channel[2] + foundChannel = True + break + # no proper channel + if not foundChannel: + _logger.debug("getJobs : no DDM jobs for Site %s" % siteName) + break + # get job + if prodSourceLabel in ['ddm']: + # to add some delay for attempts + sql1 += attSQL + getValMap[':creationTime'] = attLimit + nTry=1 + for iTry in range(nTry): + # set siteID + tmpSiteID = siteName + if siteName.startswith('ANALY_BNL_ATLAS'): + tmpSiteID = 'ANALY_BNL_ATLAS_1' + # get file lock + _logger.debug("getJobs : %s -> lock" % strName) + if 
(datetime.datetime.utcnow() - timeStart) < timeLimit: + toGetPandaIDs = True + pandaIDs = [] + specialHandlingMap = {} + # get max priority for analysis jobs + if prodSourceLabel in ['panda','user']: + sqlMX = "SELECT /*+ INDEX_RS_ASC(tab (PRODSOURCELABEL COMPUTINGSITE JOBSTATUS) ) */ MAX(currentPriority) FROM ATLAS_PANDA.jobsActive4 tab " + sqlMX+= sql1 + _logger.debug(sqlMX+comment+str(getValMap)) + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10 + self.cur.execute(sqlMX+comment, getValMap) + tmpPriority, = self.cur.fetchone() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # no jobs + if tmpPriority == None: + toGetPandaIDs = False + else: + # set priority + sql1 += "AND currentPriority=:currentPriority" + getValMap[':currentPriority'] = tmpPriority + maxAttemptIDx = 10 + if toGetPandaIDs: + # get PandaIDs + sqlP = "SELECT /*+ INDEX_RS_ASC(tab (PRODSOURCELABEL COMPUTINGSITE JOBSTATUS) ) */ PandaID,currentPriority,specialHandling FROM ATLAS_PANDA.jobsActive4 tab " + sqlP+= sql1 + _logger.debug(sqlP+comment+str(getValMap)) + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 100000 + self.cur.execute(sqlP+comment, getValMap) + resIDs = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + maxCurrentPriority = None + # get max priority and min PandaID + for tmpPandaID,tmpCurrentPriority,tmpSpecialHandling in resIDs: + if maxCurrentPriority==None or maxCurrentPriority < tmpCurrentPriority: + maxCurrentPriority = tmpCurrentPriority + pandaIDs = [tmpPandaID] + elif maxCurrentPriority == tmpCurrentPriority: + pandaIDs.append(tmpPandaID) + specialHandlingMap[tmpPandaID] = tmpSpecialHandling + # sort + pandaIDs.sort() + if pandaIDs == []: + _logger.debug("getJobs : %s -> no PandaIDs" % strName) + retU = 0 + else: + # check the number of available files + if useMemcache: + _logger.debug("getJobs : %s -> memcache check start" % strName) + # truncate + pandaIDs = pandaIDs[:maxAttemptIDx] + # get input files + availableFileMap = {} + self.cur.arraysize = 100000 + sqlMemFile = "SELECT lfn FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type" + for tmpPandaID in pandaIDs: + varMap = {} + varMap[':type'] = 'input' + varMap[':PandaID'] = tmpPandaID + # start transaction + self.conn.begin() + # select + self.cur.execute(sqlMemFile+comment,varMap) + resFiles = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # get list + fileMapForMem[tmpPandaID] = [] + for tmpItem, in resFiles: + fileMapForMem[tmpPandaID].append(tmpItem) + # get number of available files + nAvailable = self.memcache.checkFiles(tmpPandaID,fileMapForMem[tmpPandaID], + siteName,node) + # append + if not nAvailable in availableFileMap: + availableFileMap[nAvailable] = [] + availableFileMap[nAvailable].append(tmpPandaID) + # sort by the number of available files + tmpAvaKeys = availableFileMap.keys() + tmpAvaKeys.sort() + tmpAvaKeys.reverse() + pandaIDs = [] + for tmpAvaKey in tmpAvaKeys: + pandaIDs += availableFileMap[tmpAvaKey] + _logger.debug("getJobs : %s -> memcache check done" % strName) + # update + for indexID,tmpPandaID in enumerate(pandaIDs): + # max attempts + if indexID > maxAttemptIDx: + break + # update + sqlJ = "UPDATE ATLAS_PANDA.jobsActive4 " + sqlJ+= "SET jobStatus=:newJobStatus,modificationTime=CURRENT_DATE,modificationHost=:modificationHost,startTime=CURRENT_DATE" + varMap = {} + varMap[':PandaID'] = tmpPandaID + varMap[':newJobStatus'] = 'sent' 
+ varMap[':oldJobStatus'] = 'activated' + varMap[':modificationHost'] = node + # set CE + if computingElement != None: + sqlJ+= ",computingElement=:computingElement" + varMap[':computingElement'] = computingElement + # set special handlng + if specialHandled: + sqlJ+= ",specialHandling=:specialHandling" + spString = 'localpool' + if specialHandlingMap.has_key(tmpPandaID) and isinstance(specialHandlingMap[tmpPandaID],types.StringType): + if not spString in specialHandlingMap[tmpPandaID]: + varMap[':specialHandling'] = specialHandlingMap[tmpPandaID]+','+spString + else: + varMap[':specialHandling'] = specialHandlingMap[tmpPandaID] + else: + varMap[':specialHandling'] = spString + sqlJ+= " WHERE PandaID=:PandaID AND jobStatus=:oldJobStatus" + # SQL to get nSent + sentLimit = timeStart - datetime.timedelta(seconds=60) + sqlSent = "SELECT count(*) FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus " + sqlSent += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " + sqlSent += "AND computingSite=:computingSite " + sqlSent += "AND modificationTime>:modificationTime " + varMapSent = {} + varMapSent[':jobStatus'] = 'sent' + varMapSent[':computingSite'] = tmpSiteID + varMapSent[':modificationTime'] = sentLimit + varMapSent[':prodSourceLabel1'] = 'managed' + varMapSent[':prodSourceLabel2'] = 'test' + # start + _logger.debug(sqlJ+comment+str(varMap)) + # start transaction + self.conn.begin() + # update + self.cur.execute(sqlJ+comment, varMap) + retU = self.cur.rowcount + if retU != 0: + # get nSent for production jobs + if prodSourceLabel in [None,'managed']: + _logger.debug(sqlSent+comment+str(varMapSent)) + self.cur.execute(sqlSent+comment, varMapSent) + resSent = self.cur.fetchone() + if resSent != None: + nSent, = resSent + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # succeeded + if retU != 0: + pandaID = tmpPandaID + break + else: + _logger.debug("getJobs : %s -> do nothing" % strName) + retU = 0 + # release file lock + _logger.debug("getJobs : %s -> unlock" % strName) + # succeeded + if retU != 0: + break + if iTry+1 < nTry: + #time.sleep(0.5) + pass + # failed to UPDATE + if retU == 0: + # reset pandaID + pandaID = 0 + _logger.debug("getJobs : Site %s : retU %s : PandaID %s - %s" + % (siteName,retU,pandaID,prodSourceLabel)) + if pandaID == 0: + break + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[':PandaID'] = pandaID + self.cur.arraysize = 10 + self.cur.execute(sql2+comment, varMap) + res = self.cur.fetchone() + if len(res) == 0: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + break + # instantiate Job + job = JobSpec() + job.pack(res) + # Files + sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() + sqlFile+= "WHERE PandaID=:PandaID" + self.cur.arraysize = 10000 + self.cur.execute(sqlFile+comment, varMap) + resFs = self.cur.fetchall() + # job parameters + sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = job.PandaID + self.cur.execute(sqlJobP+comment, varMap) + for clobJobP, in self.cur: + job.jobParameters = clobJobP.read() + break + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + for resF in resFs: + file = FileSpec() + file.pack(resF) + job.addFile(file) + # overwrite processingType for appdir at aggrigates sites + if aggSiteMap.has_key(siteName): + if aggSiteMap[siteName].has_key(job.computingSite): + job.processingType = aggSiteMap[siteName][job.computingSite] + 
job.computingSite = job.computingSite + # append + retJobs.append(job) + # record status change + try: + self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) + except: + _logger.error('recordStatusChange in getJobs') + return retJobs,nSent + except: + # roll back + self._rollback() + # error report + type, value, traceBack = sys.exc_info() + _logger.error("getJobs : %s %s" % (type,value)) + return [],0 + + + # reset job in jobsActive or jobsWaiting + def resetJob(self,pandaID,activeTable=True,keepSite=False,getOldSubs=False,forPending=True): + comment = ' /* DBProxy.resetJob */' + _logger.debug("resetJobs : %s" % pandaID) + # select table + table = 'ATLAS_PANDA.jobsWaiting4' + if activeTable: + table = 'ATLAS_PANDA.jobsActive4' + sql1 = "SELECT %s FROM %s " % (JobSpec.columnNames(),table) + sql1+= "WHERE PandaID=:PandaID" + sql2 = "DELETE FROM %s " % table + sql2+= "WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)" + sql3 = "INSERT INTO ATLAS_PANDA.jobsDefined4 (%s) " % JobSpec.columnNames() + sql3+= JobSpec.bindValuesExpression() + try: + # transaction causes Request ndbd time-out in ATLAS_PANDA.jobsActive4 + self.conn.begin() + # select + varMap = {} + varMap[':PandaID'] = pandaID + self.cur.arraysize = 10 + self.cur.execute(sql1+comment,varMap) + res = self.cur.fetchone() + # not found + if res == None: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return None + # instantiate Job + job = JobSpec() + job.pack(res) + # if already running + if job.jobStatus != 'waiting' and job.jobStatus != 'activated' \ + and (forPending and job.jobStatus != 'pending'): + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return None + # do nothing for analysis jobs + if job.prodSourceLabel in ['user','panda']: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return None + # delete + varMap = {} + varMap[':PandaID'] = pandaID + if not forPending: + varMap[':oldJobStatus1'] = 'waiting' + else: + varMap[':oldJobStatus1'] = 'pending' + varMap[':oldJobStatus2'] = 'activated' + self.cur.execute(sql2+comment,varMap) + retD = self.cur.rowcount + # delete failed + _logger.debug("resetJobs : retD = %s" % retD) + if retD != 1: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return None + # delete from jobsDefined4 just in case + varMap = {} + varMap[':PandaID'] = pandaID + sqlD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" + self.cur.execute(sqlD+comment,varMap) + # increase priority + if job.jobStatus == 'activated' and job.currentPriority < 100: + job.currentPriority = 100 + # reset computing site and dispatchDBlocks + job.jobStatus = 'defined' + job.dispatchDBlock = None + # erase old assignment + if (not keepSite) and job.relocationFlag != 1: + job.computingSite = None + job.computingElement = None + # host and time information + job.modificationHost = self.hostname + job.modificationTime = datetime.datetime.utcnow() + job.stateChangeTime = job.modificationTime + # reset + job.brokerageErrorDiag = None + job.brokerageErrorCode = None + # insert + self.cur.execute(sql3+comment, job.valuesMap()) + # job parameters + sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" + self.cur.execute(sqlJobP+comment, varMap) + for clobJobP, in self.cur: + job.jobParameters = clobJobP.read() + break + # Files + oldSubList = [] + sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % 
FileSpec.columnNames() + sqlFile+= "WHERE PandaID=:PandaID" + self.cur.arraysize = 10000 + self.cur.execute(sqlFile+comment, varMap) + resFs = self.cur.fetchall() + for resF in resFs: + file = FileSpec() + file.pack(resF) + # reset GUID to trigger LRC/LFC scanning + if file.status == 'missing': + file.GUID = None + # collect old subs + if job.prodSourceLabel in ['managed','test'] and file.type in ['output','log'] \ + and re.search('_sub\d+$',file.destinationDBlock) != None: + if not file.destinationDBlock in oldSubList: + oldSubList.append(file.destinationDBlock) + # reset status, destinationDBlock and dispatchDBlock + file.status ='unknown' + file.dispatchDBlock = None + file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock) + # add file + job.addFile(file) + # update files + sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" + varMap = file.valuesMap(onlyChanged=True) + if varMap != {}: + varMap[':row_ID'] = file.row_ID + _logger.debug(sqlF+comment+str(varMap)) + self.cur.execute(sqlF+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # record status change + try: + self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) + except: + _logger.error('recordStatusChange in resetJobs') + if getOldSubs: + return job,oldSubList + return job + except: + # roll back + self._rollback() + # error report + type, value, traceBack = sys.exc_info() + _logger.error("resetJobs : %s %s" % (type,value)) + _logger.error("resetJobs : %s" % pandaID) + return None + + + # reset jobs in jobsDefined + def resetDefinedJob(self,pandaID,keepSite=False,getOldSubs=False): + comment = ' /* DBProxy.resetDefinedJob */' + _logger.debug("resetDefinedJob : %s" % pandaID) + sql1 = "UPDATE ATLAS_PANDA.jobsDefined4 SET " + sql1 += "jobStatus=:newJobStatus," + sql1 += "modificationTime=CURRENT_DATE," + sql1 += "dispatchDBlock=NULL," + sql1 += "computingElement=NULL" + sql1 += " WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)" + sql2 = "SELECT %s FROM ATLAS_PANDA.jobsDefined4 " % JobSpec.columnNames() + sql2+= "WHERE PandaID=:PandaID" + try: + oldSubList = [] + # begin transaction + self.conn.begin() + # update + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':newJobStatus'] = 'defined' + varMap[':oldJobStatus1'] = 'assigned' + varMap[':oldJobStatus2'] = 'defined' + self.cur.execute(sql1+comment,varMap) + retU = self.cur.rowcount + # not found + updatedFlag = False + job = None + if retU == 0: + _logger.debug("resetDefinedJob : Not found %s" % pandaID) + else: + # select + varMap = {} + varMap[':PandaID'] = pandaID + self.cur.arraysize = 10 + self.cur.execute(sql2+comment,varMap) + res = self.cur.fetchone() + # not found + if res == None: + raise RuntimeError, 'Could not SELECT : PandaID=%s' % pandaID + # instantiate Job + job = JobSpec() + job.pack(res) + # do nothing for analysis jobs + if job.prodSourceLabel in ['user','panda']: + _logger.debug('resetDefinedJob : rollback since PandaID=%s is analysis job' % pandaID) + # roll back + self._rollback() + return None + job.dispatchDBlock = None + if (not keepSite) and job.relocationFlag != 1: + # erase old assignment + job.computingSite = None + job.computingElement = None + # job parameters + sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" + self.cur.execute(sqlJobP+comment, varMap) + for clobJobP, in self.cur: + job.jobParameters = clobJobP.read() + break + # Files + sqlFile = 
"SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() + sqlFile+= "WHERE PandaID=:PandaID" + self.cur.arraysize = 10000 + self.cur.execute(sqlFile+comment, varMap) + resFs = self.cur.fetchall() + for resF in resFs: + file = FileSpec() + file.pack(resF) + # collect old subs + if job.prodSourceLabel in ['managed','test'] and file.type in ['output','log'] \ + and re.search('_sub\d+$',file.destinationDBlock) != None: + if not file.destinationDBlock in oldSubList: + oldSubList.append(file.destinationDBlock) + # reset status, destinationDBlock and dispatchDBlock + file.status ='unknown' + file.dispatchDBlock = None + file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock) + # add file + job.addFile(file) + # update files + sqlF = ("UPDATE ATLAS_PANDA.filesTable4 SET %s" % file.bindUpdateChangesExpression()) + "WHERE row_ID=:row_ID" + varMap = file.valuesMap(onlyChanged=True) + if varMap != {}: + varMap[':row_ID'] = file.row_ID + _logger.debug(sqlF+comment+str(varMap)) + self.cur.execute(sqlF+comment, varMap) + updatedFlag = True + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # record status change + try: + if updatedFlag: + self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) + except: + _logger.error('recordStatusChange in resetDefinedJobs') + if getOldSubs: + return job,oldSubList + return job + except: + # error report + type, value, traceBack = sys.exc_info() + _logger.error("resetDefinedJobs : %s %s" % (type,value)) + # roll back + self._rollback() + return None + + + # kill job + def killJob(self,pandaID,user,code,prodManager,getUserInfo=False,wgProdRole=[]): + # code + # 2 : expire + # 3 : aborted + # 4 : expire in waiting + # 7 : retry by server + # 8 : rebrokerage + # 9 : force kill + # 91 : kill user jobs with prod role + comment = ' /* DBProxy.killJob */' + _logger.debug("killJob : code=%s PandaID=%s role=%s user=%s wg=%s" % (code,pandaID,prodManager,user,wgProdRole)) + # check PandaID + try: + long(pandaID) + except: + _logger.error("not an integer : %s" % pandaID) + if getUserInfo: + return False,{} + return False + sql0 = "SELECT prodUserID,prodSourceLabel,jobDefinitionID,jobsetID,workingGroup FROM %s WHERE PandaID=:PandaID" + sql1 = "UPDATE %s SET commandToPilot=:commandToPilot,taskBufferErrorDiag=:taskBufferErrorDiag WHERE PandaID=:PandaID AND commandToPilot IS NULL" + sql1F = "UPDATE %s SET commandToPilot=:commandToPilot,taskBufferErrorDiag=:taskBufferErrorDiag WHERE PandaID=:PandaID" + sql2 = "SELECT %s " % JobSpec.columnNames() + sql2 += "FROM %s WHERE PandaID=:PandaID AND jobStatus<>:jobStatus" + sql3 = "DELETE FROM %s WHERE PandaID=:PandaID" + sqlU = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID AND (jobStatus=:oldJobStatus1 OR jobStatus=:oldJobStatus2)" + sql4 = "INSERT INTO ATLAS_PANDA.jobsArchived4 (%s) " % JobSpec.columnNames() + sql4 += JobSpec.bindValuesExpression() + sqlF = "UPDATE ATLAS_PANDA.filesTable4 SET status=:status WHERE PandaID=:PandaID AND type IN (:type1,:type2)" + sqlFMod = "UPDATE ATLAS_PANDA.filesTable4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlMMod = "UPDATE ATLAS_PANDA.metaTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + sqlPMod = "UPDATE ATLAS_PANDA.jobParamsTable SET modificationTime=:modificationTime WHERE PandaID=:PandaID" + try: + flagCommand = False + flagKilled = False + userProdUserID = '' + userProdSourceLabel = '' + userJobDefinitionID = '' + userJobsetID = '' + updatedFlag = False + # begin transaction + 
self.conn.begin() + for table in ('ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4'): + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # begin transaction + self.conn.begin() + # get DN if user is not production DN + varMap = {} + varMap[':PandaID'] = pandaID + self.cur.arraysize = 10 + self.cur.execute((sql0+comment) % table, varMap) + res = self.cur.fetchone() + # not found + if res == None: + continue + # owner? + def getCN(dn): + distinguishedName = '' + for line in dn.split('/'): + if line.startswith('CN='): + distinguishedName = re.sub('^CN=','',line) + distinguishedName = re.sub('\d+$','',distinguishedName) + distinguishedName = distinguishedName.strip() + break + if distinguishedName == '': + distinguishedName = dn + return distinguishedName + # prevent prod proxy from killing analysis jobs + userProdUserID,userProdSourceLabel,userJobDefinitionID,userJobsetID,workingGroup = res + # check group prod role + validGroupProdRole = False + if res[1] in ['managed','test'] and workingGroup != '': + for tmpGroupProdRole in wgProdRole: + if tmpGroupProdRole == '': + continue + if re.search('(^|_)'+tmpGroupProdRole+'$',workingGroup,re.I) != None: + validGroupProdRole = True + break + if prodManager: + if res[1] in ['user','panda'] and (not code in ['2','4','7','8','9','91']): + _logger.debug("ignore killJob -> prod proxy tried to kill analysis job type=%s" % res[1]) + break + _logger.debug("killJob : %s using prod role" % pandaID) + elif validGroupProdRole: + # WGs with prod role + _logger.debug("killJob : %s using group prod role for workingGroup=%s" % (pandaID,workingGroup)) + pass + else: + cn1 = getCN(res[0]) + cn2 = getCN(user) + _logger.debug("Owner:%s - Requester:%s " % (cn1,cn2)) + if cn1 != cn2: + _logger.debug("ignore killJob -> Owner != Requester") + break + # update + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':commandToPilot'] = 'tobekilled' + varMap[':taskBufferErrorDiag'] = 'killed by %s' % user + if userProdSourceLabel in ['managed','test'] and code in ['9',]: + # ignore commandToPilot for force kill + self.cur.execute((sql1F+comment) % table, varMap) + else: + self.cur.execute((sql1+comment) % table, varMap) + retU = self.cur.rowcount + if retU == 0: + continue + # set flag + flagCommand = True + # select + varMap = {} + varMap[':PandaID'] = pandaID + if (userProdSourceLabel in ['managed','test'] or 'test' in userProdSourceLabel) and code in ['9',]: + # use dummy for force kill + varMap[':jobStatus'] = 'dummy' + else: + varMap[':jobStatus'] = 'running' + self.cur.arraysize = 10 + self.cur.execute((sql2+comment) % table, varMap) + res = self.cur.fetchall() + if len(res) == 0: + continue + # instantiate JobSpec + job = JobSpec() + job.pack(res[0]) + # delete + if table=='ATLAS_PANDA.jobsDefined4': + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':oldJobStatus1'] = 'assigned' + varMap[':oldJobStatus2'] = 'defined' + self.cur.execute(sqlU+comment, varMap) + else: + varMap = {} + varMap[':PandaID'] = pandaID + self.cur.execute((sql3+comment) % table, varMap) + retD = self.cur.rowcount + if retD == 0: + continue + # error code + if job.jobStatus != 'failed': + # set status etc for non-failed jobs + job.endTime = datetime.datetime.utcnow() + job.modificationTime = job.endTime + if code in ['2','4']: + # expire + if code == '2': + job.taskBufferErrorCode = ErrorCode.EC_Expire + job.taskBufferErrorDiag = 'expired after 7 days since submission' + else: + # waiting timeout + job.taskBufferErrorCode = 
ErrorCode.EC_Expire + #job.taskBufferErrorCode = ErrorCode.EC_WaitTimeout + job.taskBufferErrorDiag = 'expired after waiting for input data for 2 days' + elif code=='3': + # aborted + job.taskBufferErrorCode = ErrorCode.EC_Aborted + job.taskBufferErrorDiag = 'aborted by ExtIF' + elif code=='8': + # reassigned by rebrokeage + job.taskBufferErrorCode = ErrorCode.EC_Reassigned + job.taskBufferErrorDiag = 'reassigned to another site by rebrokerage. new %s' % user + job.commandToPilot = None + else: + # killed + job.taskBufferErrorCode = ErrorCode.EC_Kill + job.taskBufferErrorDiag = 'killed by %s' % user + # set job status + job.jobStatus = 'cancelled' + else: + # keep status for failed jobs + job.modificationTime = datetime.datetime.utcnow() + if code=='7': + # retried by server + job.taskBufferErrorCode = ErrorCode.EC_Retried + job.taskBufferErrorDiag = 'retrying at another site. new %s' % user + job.commandToPilot = None + job.stateChangeTime = job.modificationTime + # insert + self.cur.execute(sql4+comment, job.valuesMap()) + # update file + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':status'] = 'failed' + varMap[':type1'] = 'output' + varMap[':type2'] = 'log' + self.cur.execute(sqlF+comment,varMap) + # update files,metadata,parametes + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':modificationTime'] = job.modificationTime + self.cur.execute(sqlFMod+comment,varMap) + self.cur.execute(sqlMMod+comment,varMap) + self.cur.execute(sqlPMod+comment,varMap) + flagKilled = True + updatedFlag = True + break + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("killJob : com=%s kill=%s " % (flagCommand,flagKilled)) + # record status change + try: + if updatedFlag: + self.recordStatusChange(job.PandaID,job.jobStatus,jobInfo=job) + except: + _logger.error('recordStatusChange in killJob') + if getUserInfo: + return (flagCommand or flagKilled),{'prodUserID':userProdUserID, + 'prodSourceLabel':userProdSourceLabel, + 'jobDefinitionID':userJobDefinitionID, + 'jobsetID':userJobsetID} + return (flagCommand or flagKilled) + except: + type, value, traceBack = sys.exc_info() + _logger.error("killJob : %s %s" % (type,value)) + # roll back + self._rollback() + if getUserInfo: + return False,{} + return False + + + # peek at job + def peekJob(self,pandaID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal=False): + comment = ' /* DBProxy.peekJob */' + _logger.debug("peekJob : %s" % pandaID) + # return None for NULL PandaID + if pandaID in ['NULL','','None',None]: + return None + # only int + try: + tmpID = int(pandaID) + except: + _logger.debug("peekJob : return None for %s:non-integer" % pandaID) + return None + sql1_0 = "SELECT %s FROM %s " + sql1_1 = "WHERE PandaID=:PandaID" + nTry=3 + for iTry in range(nTry): + try: + tables=[] + if fromDefined: + tables.append('ATLAS_PANDA.jobsDefined4') + if fromActive: + tables.append('ATLAS_PANDA.jobsActive4') + if fromArchived: + tables.append('ATLAS_PANDA.jobsArchived4') + if fromWaiting: + tables.append('ATLAS_PANDA.jobsWaiting4') + if fromDefined: + # for jobs which are just reset + tables.append('ATLAS_PANDA.jobsDefined4') + # select + varMap = {} + varMap[':PandaID'] = pandaID + for table in tables: + # start transaction + self.conn.begin() + # select + sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1 + self.cur.arraysize = 10 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if len(res) != 0: + # Job + job = 
JobSpec() + job.pack(res[0]) + # Files + # start transaction + self.conn.begin() + # select + sqlFile = "SELECT %s FROM ATLAS_PANDA.filesTable4 " % FileSpec.columnNames() + sqlFile+= "WHERE PandaID=:PandaID" + self.cur.arraysize = 10000 + self.cur.execute(sqlFile+comment, varMap) + resFs = self.cur.fetchall() + # metadata + resMeta = None + if table == 'ATLAS_PANDA.jobsArchived4' or forAnal: + # read metadata only for finished/failed production jobs + sqlMeta = "SELECT metaData FROM ATLAS_PANDA.metaTable WHERE PandaID=:PandaID" + self.cur.execute(sqlMeta+comment, varMap) + for clobMeta, in self.cur: + if clobMeta != None: + resMeta = clobMeta.read() + break + # job parameters + job.jobParameters = None + sqlJobP = "SELECT jobParameters FROM ATLAS_PANDA.jobParamsTable WHERE PandaID=:PandaID" + varMap = {} + varMap[':PandaID'] = job.PandaID + self.cur.execute(sqlJobP+comment, varMap) + for clobJobP, in self.cur: + if clobJobP != None: + job.jobParameters = clobJobP.read() + break + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # set files + for resF in resFs: + file = FileSpec() + file.pack(resF) + job.addFile(file) + # set metadata + job.metadata = resMeta + return job + _logger.debug("peekJob() : PandaID %s not found" % pandaID) + return None + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("peekJob : %s retry : %s" % (pandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("peekJob : %s %s %s" % (pandaID,type,value)) + # return None for analysis + if forAnal: + return None + # return 'unknown' + job = JobSpec() + job.PandaID = pandaID + job.jobStatus = 'unknown' + return job + + + # get PandaID with jobexeID + def getPandaIDwithJobExeID(self,jobexeID): + comment = ' /* DBProxy.getPandaIDwithJobExeID */' + _logger.debug("getPandaIDwithJobExeID : %s" % jobexeID) + failedRetVal = (None,None,'') + # return for wrong jobexeID + if jobexeID in ['NULL','','None',None]: + return failedRetVal + # SQL + sql = "SELECT PandaID,jobDefinitionID,jobName FROM ATLAS_PANDA.jobsWaiting4 " + sql += "WHERE jobExecutionID=:jobexeID AND prodSourceLabel=:prodSourceLabel " + sql += "AND jobStatus=:jobStatus " + varMap = {} + varMap[':jobexeID'] = jobexeID + varMap[':jobStatus'] = 'pending' + varMap[':prodSourceLabel'] = 'managed' + try: + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10 + self.cur.execute(sql+comment,varMap) + res = self.cur.fetchone() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # not found + if res == None: + _logger.debug("getPandaIDwithJobExeID : jobexeID %s not found" % jobexeID) + return failedRetVal + _logger.debug("getPandaIDwithJobExeID : %s -> %s" % (jobexeID,str(res))) + return res + except: + # roll back + self._rollback() + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getPandaIDwithJobExeID : %s %s %s" % (jobexeID,errtype,errvalue)) + return failedRetVal + + + # get express jobs + def getExpressJobs(self,dn): + comment = ' /* DBProxy.getExpressJobs */' + _logger.debug("getExpressJobs : %s" % dn) + sqlX = "SELECT specialHandling,COUNT(*) FROM %s " + sqlX += "WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel1 " + sqlX += "AND specialHandling IS NOT NULL " + sqlXJob = "SELECT PandaID,jobStatus,prodSourceLabel,modificationTime,jobDefinitionID,jobsetID,startTime,endTime FROM %s " + sqlXJob += "WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel1 " + sqlXJob += "AND 
specialHandling IS NOT NULL AND specialHandling=:specialHandling " + sqlQ = sqlX + sqlQ += "GROUP BY specialHandling " + sqlQJob = sqlXJob + sqlA = sqlX + sqlA += "AND modificationTime>:modificationTime GROUP BY specialHandling " + sqlAJob = sqlXJob + sqlAJob += "AND modificationTime>:modificationTime " + try: + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + expressStr = 'express' + activeExpressU = [] + timeUsageU = datetime.timedelta(0) + executionTimeU = datetime.timedelta(hours=1) + jobCreditU = 3 + timeCreditU = executionTimeU * jobCreditU + timeNow = datetime.datetime.utcnow() + timeLimit = timeNow - datetime.timedelta(hours=6) + # loop over tables + for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']: + varMap = {} + varMap[':prodUserName'] = compactDN + varMap[':prodSourceLabel1'] = 'user' + if table == 'ATLAS_PANDA.jobsArchived4': + varMap[':modificationTime'] = timeLimit + sql = sqlA % table + sqlJob = sqlAJob % table + else: + sql = sqlQ % table + sqlJob = sqlQJob % table + # start transaction + self.conn.begin() + # get the number of jobs for each specialHandling + self.cur.arraysize = 10 + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + _logger.debug("getExpressJobs %s" % str(res)) + for specialHandling,countJobs in res: + if specialHandling == None: + continue + # look for express jobs + if expressStr in specialHandling: + varMap[':specialHandling'] = specialHandling + self.cur.arraysize = 1000 + self.cur.execute(sqlJob+comment, varMap) + resJobs = self.cur.fetchall() + _logger.debug("getExpressJobs %s" % str(resJobs)) + for tmp_PandaID,tmp_jobStatus,tmp_prodSourceLabel,tmp_modificationTime,\ + tmp_jobDefinitionID,tmp_jobsetID,tmp_startTime,tmp_endTime \ + in resJobs: + # collect active jobs + if not tmp_jobStatus in ['finished','failed','cancelled']: + activeExpressU.append((tmp_PandaID,tmp_jobsetID,tmp_jobDefinitionID)) + # get time usage + if not tmp_jobStatus in ['defined','activated']: + # check only jobs which actually use or used CPU on WN + if tmp_startTime != None: + # running or not + if tmp_endTime == None: + # job got started before/after the time limit + if timeLimit > tmp_startTime: + timeDelta = timeNow - timeLimit + else: + timeDelta = timeNow - tmp_startTime + else: + # job got started before/after the time limit + if timeLimit > tmp_startTime: + timeDelta = tmp_endTime - timeLimit + else: + timeDelta = tmp_endTime - tmp_startTime + # add + if timeDelta > datetime.timedelta(0): + timeUsageU += timeDelta + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # check quota + rRet = True + rRetStr = '' + rQuota = 0 + if len(activeExpressU) >= jobCreditU: + rRetStr += "The number of queued runXYZ exceeds the limit = %s. " % jobCreditU + rRet = False + if timeUsageU >= timeCreditU: + rRetStr += "The total execution time for runXYZ exceeds the limit = %s min. " % (timeCreditU.seconds / 60) + rRet = False + # calculate available quota + if rRet: + tmpQuota = jobCreditU - len(activeExpressU) - timeUsageU.seconds/executionTimeU.seconds + if tmpQuota < 0: + rRetStr += "Quota for runXYZ exceeds. 
" + rRet = False + else: + rQuota = tmpQuota + # return + retVal = {'status':rRet,'quota':rQuota,'output':rRetStr,'usage':timeUsageU,'jobs':activeExpressU} + _logger.debug("getExpressJobs : %s" % str(retVal)) + return retVal + except: + # roll back + self._rollback() + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getExpressJobs : %s %s" % (errtype,errvalue)) + return None + + + # get active debug jobs + def getActiveDebugJobs(self,dn): + comment = ' /* DBProxy.getActiveDebugJobs */' + _logger.debug("getActiveDebugJobs : %s" % dn) + sqlX = "SELECT PandaID,jobStatus,specialHandling FROM %s " + sqlX += "WHERE prodUserName=:prodUserName " + sqlX += "AND specialHandling IS NOT NULL " + try: + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + debugStr = 'debug' + activeDebugJobs = [] + # loop over tables + for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']: + varMap = {} + varMap[':prodUserName'] = compactDN + sql = sqlX % table + # start transaction + self.conn.begin() + # get jobs with specialHandling + self.cur.arraysize = 100000 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # loop over all PandaIDs + for pandaID,jobStatus,specialHandling in res: + if specialHandling == None: + continue + # only active jobs + if not jobStatus in ['defined','activated','running','sent','starting']: + continue + # look for debug jobs + if debugStr in specialHandling and not pandaID in activeDebugJobs: + activeDebugJobs.append(pandaID) + # return + activeDebugJobs.sort() + _logger.debug("getActiveDebugJobs : %s -> %s" % (dn,str(activeDebugJobs))) + return activeDebugJobs + except: + # roll back + self._rollback() + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getActiveDebugJobs : %s %s" % (errtype,errvalue)) + return None + + + # set debug mode + def setDebugMode(self,dn,pandaID,prodManager,modeOn): + comment = ' /* DBProxy.setDebugMode */' + _logger.debug("turnDebugModeOn : dn=%s id=%s prod=%s mode=%s" % (dn,pandaID,prodManager,modeOn)) + sqlX = "SELECT prodUserName,jobStatus,specialHandling FROM %s " + sqlX += "WHERE PandaID=:PandaID " + sqlU = "UPDATE %s SET specialHandling=:specialHandling " + sqlU += "WHERE PandaID=:PandaID " + try: + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + debugStr = 'debug' + retStr = '' + retCode = False + # loop over tables + for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']: + varMap = {} + varMap[':PandaID'] = pandaID + sql = sqlX % table + # start transaction + self.conn.begin() + # get jobs with specialHandling + self.cur.arraysize = 10 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchone() + # not found + if res == None: + retStr = 'Not found in active DB' + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + continue + prodUserName,jobStatus,specialHandling = res + # not active + if not jobStatus in ['defined','activated','running','sent','starting']: + retStr = 'Not in one of active job status' + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + break + # not owner + if not prodManager and prodUserName != compactDN: + retStr = 'Permission denied. 
Not the owner' + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + break + # set specialHandling + updateSH = True + if specialHandling in [None,'']: + if modeOn: + # set debug mode + specialHandling = debugStr + else: + # already disabled debug mode + updateSH = False + elif debugStr in specialHandling: + if modeOn: + # already in debug mode + updateSH = False + else: + # disable debug mode + specialHandling = re.sub(debugStr,'',specialHandling) + specialHandling = re.sub(',,',',',specialHandling) + specialHandling = re.sub('^,','',specialHandling) + specialHandling = re.sub(',$','',specialHandling) + else: + if modeOn: + # set debug mode + specialHandling = '%s,%s' % (debugStr,specialHandling) + else: + # already disabled debug mode + updateSH = False + + # no update + if not updateSH: + retStr = 'Already set accordingly' + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + break + # update + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':specialHandling'] = specialHandling + self.cur.execute((sqlU+comment) % table, varMap) + retD = self.cur.rowcount + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if retD == 0: + retStr = 'Failed to update DB' + else: + retStr = 'Succeeded' + break + # return + _logger.debug("setDebugMode : %s %s -> %s" % (dn,pandaID,retStr)) + return retStr + except: + # roll back + self._rollback() + errtype,errvalue = sys.exc_info()[:2] + _logger.error("setDebugMode : %s %s" % (errtype,errvalue)) + return None + + + # get PandaID with destinationDBlock + def getPandaIDwithDestDBlock(self,destinationDBlock): + comment = ' /* DBProxy.getPandaIDwithDestDBlock */' + _logger.debug("getPandaIDwithDestDBlock : %s" % destinationDBlock) + try: + sqlP = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab " + sqlP += "WHERE type IN (:type1,:type2) AND destinationDBlock=:destinationDBlock AND rownum<=1" + # start transaction + self.conn.begin() + pandaID = None + varMap = {} + varMap[':type1'] = 'log' + varMap[':type2'] = 'output' + varMap[':destinationDBlock'] = destinationDBlock + # select + self.cur.arraysize = 10 + self.cur.execute(sqlP+comment, varMap) + res = self.cur.fetchone() + # append + if res != None: + pandaID, = res + # commit to release tables + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return pandaID + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getPandaIDwithDestDBlock : %s %s" % (errType,errValue)) + # return empty list + return None + + + # get destSE with destinationDBlock + def getDestSEwithDestDBlock(self,destinationDBlock): + comment = ' /* DBProxy.getDestSEwithDestDBlock */' + _logger.debug("getDestSEwithDestDBlock : %s" % destinationDBlock) + try: + sqlP = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ destinationSE FROM ATLAS_PANDA.filesTable4 tab " + sqlP += "WHERE type IN (:type1,:type2) AND destinationDBlock=:destinationDBlock AND rownum<=1" + # start transaction + self.conn.begin() + varMap = {} + varMap[':type1'] = 'log' + varMap[':type2'] = 'output' + varMap[':destinationDBlock'] = destinationDBlock + # select + self.cur.arraysize = 10 + self.cur.execute(sqlP+comment, varMap) + res = self.cur.fetchone() + # append + destinationSE = None + if res != None: + destinationSE, = res + # commit to release tables + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return destinationSE + except: + # roll back + 
self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getDestSEwithDestDBlock : %s %s" % (errType,errValue)) + # return empty list + return None + + + # get number of activated/defined jobs with output datasets + def getNumWaitingJobsWithOutDS(self,outputDSs): + comment = ' /* DBProxy.getNumWaitingJobsWithOutDS */' + _logger.debug("getNumWaitingJobsWithOutDS : %s" % str(outputDSs)) + try: + sqlD = "SELECT distinct destinationDBlock FROM ATLAS_PANDA.filesTable4 " + sqlD += "WHERE type IN (:type1,:type2) AND dataset=:dataset AND status IN (:status1,:status2)" + sqlP = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab " + sqlP += "WHERE type IN (:type1,:type2) AND destinationDBlock=:destinationDBlock AND status IN (:status1,:status2) AND rownum<=1" + sqlJ = "SELECT jobDefinitionID,taskID,prodUserName,jobStatus,prodSourceLabel FROM %s " + sqlJ += "WHERE PandaID=:PandaID" + sqlC = "SELECT count(*) FROM ATLAS_PANDA.jobsActive4 " + sqlC += "WHERE jobDefinitionID=:jobDefinitionID AND prodUserName=:prodUserName AND jobStatus IN (:jobStatus1)" + # start transaction + self.conn.begin() + # get sub datasets + subDSList = [] + for outputDS in outputDSs: + varMap = {} + varMap[':type1'] = 'log' + varMap[':type2'] = 'output' + varMap[':status1'] = 'unknown' + varMap[':status2'] = 'pending' + varMap[':dataset'] = outputDS + # select + self.cur.arraysize = 1000 + self.cur.execute(sqlD+comment, varMap) + resList = self.cur.fetchall() + # append + for destinationDBlock, in resList: + subDSList.append(destinationDBlock) + # get PandaIDs + pandaIDs = [] + for subDS in subDSList: + varMap = {} + varMap[':type1'] = 'log' + varMap[':type2'] = 'output' + varMap[':status1'] = 'unknown' + varMap[':status2'] = 'pending' + varMap[':destinationDBlock'] = subDS + # select + self.cur.arraysize = 10 + self.cur.execute(sqlP+comment, varMap) + res = self.cur.fetchone() + # append + if res != None: + pandaID, = res + pandaIDs.append(pandaID) + # commit to release tables + if not self._commit(): + raise RuntimeError, 'Commit error' + # loop over all PandaIDs + jobInfos = [] + for pandaID in pandaIDs: + varMap = {} + varMap[':PandaID'] = pandaID + # start transaction + self.conn.begin() + # get jobID,nJobs,jobStatus,userName + res = None + for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']: + # select + self.cur.arraysize = 10 + self.cur.execute((sqlJ % table)+comment,varMap) + res = self.cur.fetchone() + if res != None: + break + # commit to release tables + if not self._commit(): + raise RuntimeError, 'Commit error' + # not found + if res == None: + continue + # append + jobInfos.append(res) + # no jobs + if jobInfos == []: + _logger.error("getNumWaitingJobsWithOutDS : no jobs found") + return False,{} + # loop over all jobIDs + retMap = {} + for jobID,taskID,prodUserName,jobStatus,prodSourceLabel in jobInfos: + if retMap.has_key(jobID): + continue + retMap[jobID] = {} + retMap[jobID]['nJobs'] = taskID + retMap[jobID]['sourceLabel'] = prodSourceLabel + # don't check # of activated + if jobStatus in ['defined']: + retMap[jobID]['activated'] = False + retMap[jobID]['nActs'] = 0 + continue + retMap[jobID]['activated'] = True + # get # of activated jobs + varMap = {} + varMap[':prodUserName'] = prodUserName + varMap[':jobDefinitionID'] = jobID + varMap[':jobStatus1'] = 'activated' + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10 + self.cur.execute(sqlC+comment, varMap) + res = self.cur.fetchone() + # commit to release 
tables + if not self._commit(): + raise RuntimeError, 'Commit error' + if res == None: + _logger.error("getNumWaitingJobsWithOutDS : cannot get # of activated for %s:%s" % \ + (jobID,prodUserName)) + return False,{} + # set # of activated + nActs, = res + retMap[jobID]['nActs'] = nActs + # return + _logger.debug("getNumWaitingJobsWithOutDS -> %s" % str(retMap)) + return True,retMap + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getNumWaitingJobsWithOutDS : %s %s" % (errType,errValue)) + # return empty list + return False,{} + + + # get slimmed file info with PandaIDs + def getSlimmedFileInfoPandaIDs(self,pandaIDs): + comment = ' /* DBProxy.getSlimmedFileInfoPandaIDs */' + _logger.debug("getSlimmedFileInfoPandaIDs : %s len=%s" % (pandaIDs[0],len(pandaIDs))) + try: + sqlL = "SELECT lfn,type,dataset FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID" + sqlA = "SELECT /*+ INDEX(tab FILES_ARCH_PANDAID_IDX)*/ lfn,type,dataset FROM ATLAS_PANDAARCH.filesTable_ARCH tab " + sqlA += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-60)" + retMap = {'inDS':[],'outDS':[]} + # start transaction + self.conn.begin() + # select + for pandaID in pandaIDs: + # make sql + varMap = {} + varMap[':PandaID'] = pandaID + # select + self.cur.arraysize = 10000 + self.cur.execute(sqlL+comment, varMap) + resList = self.cur.fetchall() + # try archived if not found in filesTable4 + if len(resList) == 0: + self.cur.execute(sqlA+comment, varMap) + resList = self.cur.fetchall() + # append + for tmp_lfn,tmp_type,tmp_dataset in resList: + # skip lib.tgz + if tmp_lfn.endswith('.lib.tgz'): + continue + if tmp_type == 'input': + if not tmp_dataset in retMap['inDS']: + retMap['inDS'].append(tmp_dataset) + elif tmp_type == 'output': + if not tmp_dataset in retMap['outDS']: + retMap['outDS'].append(tmp_dataset) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("getSlimmedFileInfoPandaIDs : %s" % str(retMap)) + return retMap + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getSlimmedFileInfoPandaIDs : %s %s" % (type,value)) + # return empty list + return {} + + + # get JobIDs in a time range + def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs): + comment = ' /* DBProxy.getJobIDsInTimeRange */' + _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) + try: + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + tables = ['ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4'] + # select + for table in tables: + # make sql + if table == 'ATLAS_PANDA.jobsArchived4': + sql = 'SELECT /*+ INDEX_RS_ASC(TAB("JOBSARCHIVED4"."PRODUSERNAME")) NO_INDEX(TAB("JOBSARCHIVED4"."MODIFICATIONTIME")) */ jobDefinitionID FROM %s tab ' % table + elif table == 'ATLAS_PANDA.jobsActive4': + sql = 'SELECT /*+ INDEX_RS_ASC(TAB("JOBSACTIVE4"."PRODUSERNAME")) NO_INDEX(TAB("JOBSACTIVE4"."MODIFICATIONTIME")) */ jobDefinitionID FROM %s tab ' % table + else: + sql = "SELECT jobDefinitionID FROM %s " % table + sql += "WHERE prodUserName=:prodUserName AND modificationTime>:modificationTime " + sql += "AND prodSourceLabel=:prodSourceLabel GROUP BY jobDefinitionID" + varMap = {} + varMap[':prodUserName'] = compactDN + varMap[':prodSourceLabel'] = 'user' + varMap[':modificationTime'] = timeRange + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 
10000 + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for tmpID, in resList: + if not tmpID in retJobIDs: + retJobIDs.append(tmpID) + _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs)) + return retJobIDs + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getJobIDsInTimeRange : %s %s" % (type,value)) + # return empty list + return [] + + + # get PandaIDs for a JobID + def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs): + comment = ' /* DBProxy.getPandIDsWithJobID */' + _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID)) + try: + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + tables = ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsArchived4'] + buildJobID = None + # select + for table in tables: + # skip if all jobs have already been gotten + if nJobs > 0 and len(idStatus) >= nJobs: + continue + # make sql + sql = "SELECT PandaID,jobStatus,commandToPilot,prodSourceLabel,taskBufferErrorCode FROM %s " % table + sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sql += "AND prodSourceLabel in (:prodSourceLabel1,:prodSourceLabel2)" + varMap = {} + varMap[':prodUserName'] = compactDN + varMap[':jobDefinitionID'] = jobID + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + # select + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + # append + for tmpID,tmpStatus,tmpCommand,tmpProdSourceLabel,tmpTaskBufferErrorCode in resList: + # ignore jobs retried by pilot since they have new PandaIDs with the same jobsetID/jobdefID + if tmpTaskBufferErrorCode in [ErrorCode.EC_PilotRetried]: + continue + # ignore old buildJob which was replaced by rebrokerage + if tmpProdSourceLabel == 'panda': + if buildJobID == None: + # first buildJob + buildJobID = tmpID + elif buildJobID >= tmpID: + # don't append old one + continue + else: + # delete old one + del idStatus[buildJobID] + buildJobID = tmpID + # append + idStatus[tmpID] = (tmpStatus,tmpCommand) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("getPandIDsWithJobID : %s" % str(idStatus)) + return idStatus,buildJobID + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getPandIDsWithJobID : %s %s" % (type,value)) + # return empty list + return {},None + + + # lock jobs for reassign + def lockJobsForReassign(self,tableName,timeLimit,statList,labels,processTypes,sites,clouds): + comment = ' /* DBProxy.lockJobsForReassign */' + _logger.debug("lockJobsForReassign : %s %s %s %s %s %s %s" % \ + (tableName,timeLimit,statList,labels,processTypes,sites,clouds)) + try: + # make sql + sql = "SELECT PandaID FROM %s " % tableName + sql += "WHERE modificationTime<:modificationTime " + varMap = {} + varMap[':modificationTime'] = timeLimit + if statList != []: + sql += 'AND jobStatus IN (' + tmpIdx = 0 + for tmpStat in statList: + tmpKey = ':stat%s' % tmpIdx + varMap[tmpKey] = tmpStat + sql += '%s,' % tmpKey + sql = sql[:-1] + sql += ') ' + if labels != []: + sql += 'AND prodSourceLabel IN (' + tmpIdx = 0 + for tmpStat in labels: + tmpKey = ':label%s' % tmpIdx + 
varMap[tmpKey] = tmpStat + sql += '%s,' % tmpKey + sql = sql[:-1] + sql += ') ' + if processTypes != []: + sql += 'AND processingType IN (' + tmpIdx = 0 + for tmpStat in processTypes: + tmpKey = ':processType%s' % tmpIdx + varMap[tmpKey] = tmpStat + sql += '%s,' % tmpKey + sql = sql[:-1] + sql += ') ' + if sites != []: + sql += 'AND computingSite IN (' + tmpIdx = 0 + for tmpStat in sites: + tmpKey = ':site%s' % tmpIdx + varMap[tmpKey] = tmpStat + sql += '%s,' % tmpKey + sql = sql[:-1] + sql += ') ' + if clouds != []: + sql += 'AND cloud IN (' + tmpIdx = 0 + for tmpStat in clouds: + tmpKey = ':cloud%s' % tmpIdx + varMap[tmpKey] = tmpStat + sql += '%s,' % tmpKey + sql = sql[:-1] + sql += ') ' + # sql for lock + sqlLock = 'UPDATE %s SET modificationTime=CURRENT_DATE WHERE PandaID=:PandaID' % tableName + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 1000000 + self.cur.execute(sql+comment,varMap) + resList = self.cur.fetchall() + retList = [] + # lock + for tmpID, in resList: + varLock = {':PandaID':tmpID} + self.cur.execute(sqlLock+comment,varLock) + retList.append((tmpID,)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # sort + retList.sort() + _logger.debug("lockJobsForReassign : %s" % (len(retList))) + return True,retList + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("lockJobsForReassign : %s %s" % (errType,errValue)) + # return empty + return False,[] + + + # lock jobs for finisher + def lockJobsForFinisher(self,timeNow,rownum,highPrio): + comment = ' /* DBProxy.lockJobsForFinisher */' + _logger.debug("lockJobsForFinisher : %s %s %s" % (timeNow,rownum,highPrio)) + try: + varMap = {} + varMap[':jobStatus'] = 'transferring' + varMap[':currentPriority'] = 800 + varMap[':prodSourceLabel'] = 'managed' + # make sql + sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 " + sql += "WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel " + if highPrio: + varMap[':modificationTime'] = timeNow - datetime.timedelta(hours=1) + sql += "AND currentPriority>=:currentPriority AND rownum<=%s " % rownum + else: + sql += "AND currentPriority<:currentPriority AND rownum<=%s " % rownum + varMap[':modificationTime'] = timeNow - datetime.timedelta(hours=12) + sql += "FOR UPDATE " + # sql for lock + sqlLock = 'UPDATE ATLAS_PANDA.jobsActive4 SET modificationTime=CURRENT_DATE WHERE PandaID=:PandaID' + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 1000 + self.cur.execute(sql+comment,varMap) + resList = self.cur.fetchall() + retList = [] + # lock + for tmpID, in resList: + varLock = {':PandaID':tmpID} + self.cur.execute(sqlLock+comment,varLock) + retList.append(tmpID) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # sort + retList.sort() + _logger.debug("lockJobsForFinisher : %s" % (len(retList))) + return True,retList + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("lockJobsForFinisher : %s %s" % (errType,errValue)) + # return empty + return False,[] + + + # get the number of waiting jobs with a dataset + def getNumWaitingJobsForPD2P(self,datasetName): + comment = ' /* DBProxy.getNumWaitingJobsForPD2P */' + _logger.debug("getNumWaitingJobsForPD2P : %s" % datasetName) + try: + tables = ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4'] + nJobs = 0 + # select + for table in tables: + # make sql + sql = "SELECT COUNT(*) FROM %s " % table + sql += "WHERE 
prodDBlock=:prodDBlock AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " + sql += "AND jobStatus IN (:jobStatus1,:jobStatus2) " + varMap = {} + varMap[':prodDBlock'] = datasetName + varMap[':jobStatus1'] = 'defined' + varMap[':jobStatus2'] = 'activated' + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchone() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if res != None: + tmpN, = res + nJobs += tmpN + _logger.debug("getNumWaitingJobsForPD2P : %s -> %s" % (datasetName,nJobs)) + return nJobs + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getNumWaitingJobsForPD2P : %s %s" % (errType,errValue)) + # return 0 + return 0 + + + # get the number of waiting jobsets with a dataset + def getNumWaitingJobsetsForPD2P(self,datasetName): + comment = ' /* DBProxy.getNumWaitingJobsetsForPD2P */' + _logger.debug("getNumWaitingJobsetsForPD2P : %s" % datasetName) + try: + tables = ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4'] + jobsetIDuserList = [] + # select + for table in tables: + # make sql + sql = "SELECT jobsetID,prodUserName FROM %s " % table + sql += "WHERE prodDBlock=:prodDBlock AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " + sql += "AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY jobsetID,prodUserName" + varMap = {} + varMap[':prodDBlock'] = datasetName + varMap[':jobStatus1'] = 'defined' + varMap[':jobStatus2'] = 'activated' + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + for jobsetID,prodUserName in resList: + tmpKey = (jobsetID,prodUserName) + if not tmpKey in jobsetIDuserList: + jobsetIDuserList.append(tmpKey) + _logger.debug("getNumWaitingJobsetsForPD2P : %s -> %s" % (datasetName,len(jobsetIDuserList))) + return len(jobsetIDuserList) + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getNumWaitingJobsetsForPD2P : %s %s" % (errType,errValue)) + # return 0 + return 0 + + + # lock job for re-brokerage + def lockJobForReBrokerage(self,dn,jobID,simulation,forceOpt,forFailed=False): + comment = ' /* lockJobForReBrokerage */' + _logger.debug("lockJobForReBrokerage : %s %s %s %s %s" % (dn,jobID,simulation,forceOpt,forFailed)) + try: + errMsg = '' + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + # start transaction + self.conn.begin() + buildJobPandaID = None + buildJobStatus = None + buildJobDefID = None + buildCreationTime = None + runPandaID = None + minPandaIDlibDS = None + maxPandaIDlibDS = None + # get one runXYZ job + if errMsg == '': + for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4']: + sql = "SELECT PandaID FROM %s " % table + sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sql += "AND prodSourceLabel=:prodSourceLabel1 AND jobStatus IN (:jobStatus1,:jobStatus2) " + sql += "AND rownum <= 1" + varMap = {} + varMap[':prodUserName'] = compactDN + varMap[':jobDefinitionID'] = jobID + varMap[':prodSourceLabel1'] = 'user' + if not forFailed: + # lock active jobs for normal rebrokerage + 
varMap[':jobStatus1'] = 'defined' + varMap[':jobStatus2'] = 'activated' + else: + # lock failed jobs for retry + varMap[':jobStatus1'] = 'failed' + varMap[':jobStatus2'] = 'dummy' + # select + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchone() + # not found + if res != None: + runPandaID, = res + break + if runPandaID == None: + if not forFailed: + errMsg = "no defined/activated jobs to reassign. running/finished/failed jobs are not reassigned by rebrokerage " + else: + errMsg = "could not get failed runXYZ jobs" + # get libDS + libDS = '' + if errMsg == '': + sql = "SELECT lfn,dataset FROM ATLAS_PANDA.filesTable4 WHERE type=:type AND PandaID=:PandaID" + varMap = {} + varMap[':type'] = 'input' + varMap[':PandaID'] = runPandaID + # select + self.cur.arraysize = 10000 + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + for tmpLFN,tmpDS in resList: + if tmpLFN.endswith('.lib.tgz'): + libDS = tmpDS + break + # check status of corresponding buildJob + if libDS != '': + sql = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 " + sql += "WHERE type=:type AND dataset=:dataset" + varMap = {} + varMap[':type'] = 'output' + varMap[':dataset'] = libDS + # select + self.cur.arraysize = 10 + # select + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchone() + # not found in active table + if res == None: + # look for buildJob in archived table + sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODSOURCELABEL_IDX JOBS_PRODUSERNAME_IDX) */ " + sql += "PandaID,jobStatus,jobDefinitionID,creationTime " + sql += "FROM ATLAS_PANDAARCH.jobsArchived tab " + sql += "WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLable1 " + sql += "AND modificationTime>(CURRENT_DATE-10) ORDER BY PandaID DESC" + varMap = {} + varMap[':prodUserName'] = compactDN + varMap[':prodSourceLable1'] = 'panda' + # select + self.cur.arraysize = 10000 + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + # loop over PandaIDs to find corresponding libDS + sql = "SELECT /*+ INDEX(tab FILES_ARCH_PANDAID_IDX)*/ PandaID FROM ATLAS_PANDAARCH.filesTable_ARCH tab " + sql += "WHERE PandaID=:PandaID AND type=:type AND dataset=:dataset AND status=:status " + sql += "AND modificationTime>(CURRENT_DATE-10)" + self.cur.arraysize = 10 + for tmpID,tmpJobStatus,tmpJobDefID,tmpCreationTime in resList: + varMap = {} + varMap[':PandaID'] = tmpID + varMap[':type'] = 'output' + varMap[':status'] = 'ready' + varMap[':dataset'] = libDS + # select + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchone() + if res != None: + # get PandaID of buildJob + buildJobPandaID, = res + buildJobStatus = tmpJobStatus + buildJobDefID = tmpJobDefID + buildCreationTime = tmpCreationTime + break + # not found + if buildJobPandaID == None: + errMsg = "could not find successful buildJob for %s" % libDS + else: + # get PandaID of buildJob + buildJobPandaID, = res + # found buildJob + if errMsg == '': + # get current buildJob status + if buildJobStatus == None: + for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsDefined4']: + # make sql + sql = "SELECT jobStatus,jobDefinitionID,creationTime FROM %s " % table + sql += "WHERE PandaID=:PandaID " + varMap = {} + varMap[':PandaID'] = buildJobPandaID + # select + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchone() + # found + if res != None: + buildJobStatus,buildJobDefID,buildCreationTime = res + break + # not found + if buildJobStatus == None: + errMsg = "could not find buildJob=%s 
in database" % buildJobPandaID + # check status + if errMsg != '': + if not buildJobStatus in ['defined','activated','finished','cancelled']: + errMsg = "status of buildJob is '%s' != defined/activated/finished/cancelled so that jobs cannot be reassigned" \ + % buildJobStatus + # get max/min PandaIDs using the libDS + if errMsg == '': + sql = "SELECT MAX(PandaID),MIN(PandaID) FROM ATLAS_PANDA.filesTable4 " + sql += "WHERE type=:type AND dataset=:dataset" + varMap = {} + varMap[':type'] = 'input' + varMap[':dataset'] = libDS + self.cur.arraysize = 10 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchone() + if res == None: + errMsg = "cannot get MAX/MIN PandaID for multiple usage for %s" % libDS + else: + maxPandaIDlibDS,minPandaIDlibDS = res + # check creationDate of buildJob + if errMsg == '': + # buildJob has already finished + timeLimit = datetime.datetime.utcnow()-datetime.timedelta(days=6) + if buildJobStatus in ['finished','cancelled'] and buildCreationTime < timeLimit: + errMsg = "corresponding buildJob %s is too old %s" % (buildJobPandaID,buildCreationTime.strftime('%Y-%m-%d %H:%M:%S')) + # check modificationTime + if errMsg == '': + # make sql + tables = ['ATLAS_PANDA.jobsDefined4'] + if not buildJobStatus in ['defined']: + tables.append('ATLAS_PANDA.jobsActive4') + sql = "SELECT modificationTime FROM %s " + sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus IN (:jobStatus1,:jobStatus2) " + sql += "FOR UPDATE " + varMap = {} + varMap[':prodUserName'] = compactDN + varMap[':jobDefinitionID'] = jobID + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + if not forFailed: + # normal rebrokerage + varMap[':jobStatus1'] = 'defined' + varMap[':jobStatus2'] = 'activated' + else: + # retry + varMap[':jobStatus1'] = 'failed' + varMap[':jobStatus2'] = 'dummy' + for tableName in tables: + # select + self.cur.execute((sql % tableName)+comment, varMap) + res = self.cur.fetchone() + if res != None: + break + if res == None: + if not forFailed: + errMsg = "no defined/activated jobs to be reassigned" + else: + errMsg = "no failed jobs to be retried" + else: + tmpModificationTime, = res + # prevent users from rebrokering more than once in one hour + timeLimit = datetime.datetime.utcnow()-datetime.timedelta(hours=1) + if timeLimit < tmpModificationTime and not forceOpt: + errMsg = "last mod time is %s > current-1hour. 
Cannot run (re)brokerage more than once in one hour" \ + % tmpModificationTime.strftime('%Y-%m-%d %H:%M:%S') + elif simulation: + pass + else: + # update modificationTime for locking + for tableName in tables: + sql = 'UPDATE %s ' % tableName + sql += 'SET modificationTime=CURRENT_DATE ' + sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus IN (:jobStatus1,:jobStatus2) " + self.cur.execute(sql+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return failure + if errMsg != '': + _logger.debug('lockJobForReBrokerage : '+errMsg) + return False,{'err':errMsg} + # return + retMap = {'bPandaID':buildJobPandaID,'bStatus':buildJobStatus,'userName':compactDN, + 'bJobID':buildJobDefID,'rPandaID':runPandaID, + 'maxPandaIDlibDS':maxPandaIDlibDS,'minPandaIDlibDS':minPandaIDlibDS} + _logger.debug("lockJobForReBrokerage %s" % str(retMap)) + return True,retMap + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("lockJobForReBrokerage : %s %s" % (type,value)) + # return empty list + return False,{'err':'database error'} + + + # get input datasets for rebrokerage + def getInDatasetsForReBrokerage(self,jobID,userName): + comment = ' /* DBProxy.getInDatasetsForReBrokerage */' + failedRet = False,{},None + try: + _logger.debug("getInDatasetsForReBrokerage(%s,%s)" % (jobID,userName)) + # start transaction + self.conn.begin() + # get pandaID + pandaIDs = [] + maxTotalFileSize = None + for table in ['jobsActive4','jobsDefined4']: + sql = "SELECT PandaID FROM ATLAS_PANDA.%s WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " % table + sql += "AND prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2)" + varMap = {} + varMap[':prodUserName'] = userName + varMap[':jobDefinitionID'] = jobID + varMap[':prodSourceLabel'] = 'user' + varMap[':jobStatus1'] = 'defined' + varMap[':jobStatus2'] = 'activated' + self.cur.arraysize = 10000 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + if res != []: + for tmpItem in res: + pandaIDs.append(tmpItem[0]) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # not found + if pandaIDs == []: + _logger.debug("getInDatasetsForReBrokerage : PandaIDs not found") + return failedRet + # get dataset and lfn + retMapLFN = {} + sql = "SELECT dataset,lfn,fsize FROM ATLAS_PANDA.filesTable4 " + sql += "WHERE PandaID=:PandaID AND type=:type" + for pandaID in pandaIDs: + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':type'] = 'input' + # start transaction + self.conn.begin() + self.cur.arraysize = 10000 + self.cur.execute(sql+comment, varMap) + resL = self.cur.fetchall() + # append + tmpTotalFileSize = 0 + for tmpDataset,tmpLFN,tmpFileSize in resL: + # ignore lib.tgz + if tmpLFN.endswith('.lib.tgz'): + continue + if not retMapLFN.has_key(tmpDataset): + retMapLFN[tmpDataset] = [] + if not tmpLFN in retMapLFN[tmpDataset]: + retMapLFN[tmpDataset].append(tmpLFN) + try: + tmpTotalFileSize += long(tmpFileSize) + except: + pass + if maxTotalFileSize == None or maxTotalFileSize < tmpTotalFileSize: + maxTotalFileSize = tmpTotalFileSize + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("getInDatasetsForReBrokerage : done") + # max size in MB + maxTotalFileSize /= (1024*1024) + # return + return True,retMapLFN,maxTotalFileSize + except: + # roll back + self._rollback() + 
errType,errValue = sys.exc_info()[:2] + _logger.error("getInDatasetsForReBrokerage(%s,%s) : %s %s" % (jobID,userName,errType,errValue)) + return failedRet + + + # move jobs to jobsDefine4 for re-brokerage + def resetBuildJobForReBrokerage(self,pandaID): + comment = ' /* resetBuildJobForReBrokerage */' + _logger.debug("resetBuildJobForReBrokerage : start %s" % pandaID) + try: + # make sql to move jobs + sql1 = "SELECT %s FROM ATLAS_PANDA.jobsActive4 " % JobSpec.columnNames() + sql1+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus1" + sql3 = "INSERT INTO ATLAS_PANDA.jobsDefined4 (%s) " % JobSpec.columnNames() + sql3+= JobSpec.bindValuesExpression() + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':jobStatus1'] = 'activated' + self.cur.arraysize = 10 + self.cur.execute(sql1+comment,varMap) + res = self.cur.fetchone() + # not found + if res == None: + _logger.error("resetBuildJobForReBrokerage : PandaID=%s not found" % pandaID) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return False + # instantiate Job + job = JobSpec() + job.pack(res) + # delete from jobsDefined4 just in case + varMap = {} + varMap[':PandaID'] = pandaID + sqlD = "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID=:PandaID" + self.cur.execute(sqlD+comment,varMap) + # reset job status + job.jobStatus = 'defined' + # host and time information + job.modificationHost = self.hostname + job.modificationTime = datetime.datetime.utcnow() + # insert to Defined + self.cur.execute(sql3+comment, job.valuesMap()) + # delete from Active + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':jobStatus1'] = 'activated' + sql2 = "DELETE FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID AND jobStatus=:jobStatus1" + self.cur.execute(sql2+comment,varMap) + retD = self.cur.rowcount + # delete failed + if retD != 1: + _logger.error("resetBuildJobForReBrokerage : failed to delete PandaID=%s" % pandaID) + # rollback + self._rollback() + return False + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + _logger.debug("resetBuildJobForReBrokerage : end %s" % pandaID) + return True + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("resetBuildJobForReBrokerage : %s %s" % (type,value)) + # return empty list + return False + + + # get PandaIDs using userName/jobID for re-brokerage or retry + def getPandaIDsForReBrokerage(self,userName,jobID,fromActive,forFailed=False): + comment = ' /* DBProxy.getPandaIDsForReBrokerage */' + _logger.debug("getPandaIDsForReBrokerage : %s %s %s %s" % (userName,jobID,fromActive,forFailed)) + try: + returnList = [] + varMap = {} + varMap[':prodUserName'] = userName + varMap[':jobDefinitionID'] = jobID + if not forFailed: + varMap[':jobStatus1'] = 'activated' + else: + varMap[':jobStatus1'] = 'failed' + sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 " + sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sql += "AND jobStatus=:jobStatus1" + # get IDs from Active table + if fromActive: + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 20000 + self.cur.execute(sql+comment,varMap) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for tmpID, in resList: + if not tmpID in returnList: + returnList.append(tmpID) + # set holding to prevent activated jobs from being picked up + if not forFailed: + sql = 'UPDATE 
ATLAS_PANDA.jobsActive4 SET jobStatus=:newStatus '
+                    sql += 'WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID '
+                    sql += "AND jobStatus=:jobStatus1"
+                    varMap[':newStatus'] = 'holding'
+                    # start transaction
+                    self.conn.begin()
+                    # update
+                    self.cur.execute(sql+comment,varMap)
+                    # commit
+                    if not self._commit():
+                        raise RuntimeError, 'Commit error'
+            # get IDs from Defined table just in case
+            varMap = {}
+            varMap[':prodUserName'] = userName
+            varMap[':jobDefinitionID'] = jobID
+            varMap[':jobStatus1'] = 'defined'
+            varMap[':jobStatus2'] = 'assigned'
+            sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 "
+            sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
+            sql += "AND jobStatus IN (:jobStatus1,:jobStatus2)"
+            # start transaction
+            self.conn.begin()
+            # select
+            self.cur.arraysize = 20000
+            self.cur.execute(sql+comment,varMap)
+            resList = self.cur.fetchall()
+            # commit
+            if not self._commit():
+                raise RuntimeError, 'Commit error'
+            # append
+            for tmpID, in resList:
+                if not tmpID in returnList:
+                    returnList.append(tmpID)
+            # sort
+            returnList.sort()
+            # return
+            return returnList
+        except:
+            # roll back
+            self._rollback()
+            type, value, traceBack = sys.exc_info()
+            _logger.error("getPandaIDsForReBrokerage : %s %s" % (type,value))
+            # return empty list
+            return []
+
+
+    # get outDSs with userName/jobID
+    def getOutDSsForReBrokerage(self,userName,jobID):
+        comment = ' /* DBProxy.getOutDSsForReBrokerage */'
+        _logger.debug("getOutDSsForReBrokerage : %s %s" % (userName,jobID))
+        falseRet = (False,[],None,None)
+        try:
+            # get one PandaID
+            sql = "SELECT PandaID,computingSite,destinationSE FROM ATLAS_PANDA.jobsActive4 "
+            sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID "
+            sql += "AND prodSourceLabel=:prodSourceLabel AND rownum<=1"
+            varMap = {}
+            varMap[':prodUserName'] = userName
+            varMap[':jobDefinitionID'] = jobID
+            varMap[':prodSourceLabel'] = 'user'
+            # start transaction
+            self.conn.begin()
+            # select
+            self.cur.arraysize = 10
+            self.cur.execute(sql+comment, varMap)
+            res = self.cur.fetchone()
+            # not found
+            if res == None:
+                _logger.debug("getOutDSsForReBrokerage : failed to get PandaID")
+                if not self._commit():
+                    raise RuntimeError, 'Commit error'
+                return falseRet
+            pandaID,computingSite,destinationSE = res
+            # get outDSs
+            sql = "SELECT dataset FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type IN (:type1,:type2)"
+            varMap = {}
+            varMap[':type1'] = 'output'
+            varMap[':type2'] = 'log'
+            varMap[':PandaID'] = pandaID
+            self.cur.arraysize = 1000
+            self.cur.execute(sql+comment, varMap)
+            resList = self.cur.fetchall()
+            # commit
+            if not self._commit():
+                raise RuntimeError, 'Commit error'
+            # append
+            returnList = []
+            for tmpOutDS, in resList:
+                if not tmpOutDS in returnList:
+                    returnList.append(tmpOutDS)
+            # return
+            return True,returnList,computingSite,destinationSE
+        except:
+            # roll back
+            self._rollback()
+            type, value, traceBack = sys.exc_info()
+            _logger.error("getOutDSsForReBrokerage : %s %s" % (type,value))
+            # return empty list
+            return falseRet
+
+
+    # query PandaID
+    def queryPandaID(self,jobDefID):
+        comment = ' /* DBProxy.queryPandaID */'
+        _logger.debug("queryPandaID : %s" % jobDefID)
+        sql0 = "SELECT PandaID,attemptNr FROM %s WHERE attemptNr=("
+        sql0+= "SELECT MAX(attemptNr) FROM %s"
+        sql1= " WHERE prodSourceLabel=:prodSourceLabel AND jobDefinitionID=:jobDefinitionID)"
+        sql1+=" AND prodSourceLabel=:prodSourceLabel AND jobDefinitionID=:jobDefinitionID"
+        try:
+            ids = []
+            # select
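+            # For illustration only: with the sql0/sql1 fragments built above, the query
+            # issued for each table expands to (line breaks added here for readability,
+            # shown for ATLAS_PANDA.jobsActive4 as an example)
+            #   SELECT PandaID,attemptNr FROM ATLAS_PANDA.jobsActive4
+            #   WHERE attemptNr=(SELECT MAX(attemptNr) FROM ATLAS_PANDA.jobsActive4
+            #                    WHERE prodSourceLabel=:prodSourceLabel AND jobDefinitionID=:jobDefinitionID)
+            #   AND prodSourceLabel=:prodSourceLabel AND jobDefinitionID=:jobDefinitionID
+            # The loop below keeps the PandaID with the highest attemptNr across all four
+            # job tables, breaking ties by taking the largest PandaID.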
+            varMap = {}
+            varMap[':jobDefinitionID'] = jobDefID
+            varMap[':prodSourceLabel'] = 'managed'
+            for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsWaiting4']:
+                # start transaction
+                self.conn.begin()
+                # select
+                sql = sql0 % (table,table) + sql1
+                self.cur.arraysize = 10
+                self.cur.execute(sql+comment, varMap)
+                res = self.cur.fetchall()
+                ids += list(res)
+                # commit
+                if not self._commit():
+                    raise RuntimeError, 'Commit error'
+            # look for the latest attempt
+            preAtt = -1
+            pandaID = None
+            for pID,att in ids:
+                if att > preAtt:
+                    pandaID = pID
+                    preAtt = att
+                if att == preAtt:
+                    if pandaID < pID:
+                        pandaID = pID
+            return pandaID
+        except:
+            type, value, traceBack = sys.exc_info()
+            _logger.error("queryPandaID : %s %s" % (type,value))
+            # roll back
+            self._rollback()
+            return None
+
+
+    # query job info per cloud
+    def queryJobInfoPerCloud(self,cloud,schedulerID=None):
+        comment = ' /* DBProxy.queryJobInfoPerCloud */'
+        _logger.debug("queryJobInfoPerCloud : %s %s" % (cloud,schedulerID))
+        attrs = ['PandaID','jobStatus','jobName']
+        sql0 = "SELECT "
+        for attr in attrs:
+            sql0 += "%s," % attr
+        sql0 = "%s " % sql0[:-1]
+        sql0+= "FROM %s "
+        sql0+= "WHERE cloud=:cloud "
+        varMap = {}
+        varMap[':cloud'] = cloud
+        if schedulerID != None:
+            sql0+= "AND schedulerID=:schedulerID "
+            varMap[':schedulerID'] = schedulerID
+        try:
+            ids = []
+            returnList = []
+            # select
+            for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']:
+                # start transaction
+                self.conn.begin()
+                # select
+                sql = sql0 % table
+                self.cur.arraysize = 10000
+                self.cur.execute(sql+comment,varMap)
+                resList = self.cur.fetchall()
+                # commit
+                if not self._commit():
+                    raise RuntimeError, 'Commit error'
+                # loop over all
+                for res in resList:
+                    valMap = {}
+                    # skip if already in the list
+                    PandaID = res[0]
+                    if PandaID in ids:
+                        continue
+                    # convert to map
+                    for idx,attr in enumerate(attrs):
+                        valMap[attr] = res[idx]
+                    # append to list
+                    ids.append(PandaID)
+                    returnList.append(valMap)
+            # return
+            return returnList
+        except:
+            type, value, traceBack = sys.exc_info()
+            _logger.error("queryJobInfoPerCloud : %s %s" % (type,value))
+            # roll back
+            self._rollback()
+            return None
+
+
+    # get PandaIDs at Site
+    def getPandaIDsSite(self,site,status,limit):
+        comment = ' /* DBProxy.getPandaIDsSite */'
+        _logger.debug("getPandaIDsSite : %s %s %s" % (site,status,limit))
+        try:
+            ids = []
+            # find table
+            if status in ['defined','assigned']:
+                table = 'ATLAS_PANDA.jobsDefined4'
+            elif status in ['activated','running','holding','transferring']:
+                table = 'ATLAS_PANDA.jobsActive4'
+            elif status in ['waiting']:
+                table = 'ATLAS_PANDA.jobsWaiting4'
+            elif status in ['finished','failed']:
+                table = 'ATLAS_PANDA.jobsArchived4'
+            else:
+                _logger.error("unknown status:%s" % status)
+                return ids
+            # limit
+            limit = int(limit)
+            # SQL
+            sql = "SELECT PandaID FROM %s " % table
+            sql += "WHERE computingSite=:computingSite AND jobStatus=:jobStatus AND prodSourceLabel=:prodSourceLabel "
+            sql += "AND rownum<=:limit"
+            # start transaction
+            self.conn.begin()
+            # select
+            varMap = {}
+            varMap[':computingSite'] = site
+            varMap[':jobStatus'] = status
+            varMap[':limit'] = limit
+            varMap[':prodSourceLabel'] = 'managed'
+            self.cur.arraysize = limit
+            self.cur.execute(sql+comment, varMap)
+            res = self.cur.fetchall()
+            # commit
+            if not self._commit():
+                raise RuntimeError, 'Commit error'
+            # convert to list
+            for id, in res:
+                ids.append(id)
+            return ids
+        except:
type, value, traceBack = sys.exc_info() + _logger.error("getPandaIDsSite : %s %s" % (type,value)) + # roll back + self._rollback() + return [] + + + # get PandaIDs to be updated in prodDB + def getPandaIDsForProdDB(self,limit,lockedby): + comment = ' /* DBProxy.getPandaIDsForProdDB */' + _logger.debug("getPandaIDsForProdDB %s" % limit) + sql0 = "PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID FROM %s " + sqlW = "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND lockedby=:lockedby " + sqlX = "AND stateChangeTime>prodDBUpdateTime " + sqlA = "AND (CASE WHEN stateChangeTime>prodDBUpdateTime THEN 1 ELSE null END) = 1 " + sql1 = "AND rownum<=:limit " + varMap = {} + varMap[':lockedby'] = lockedby + varMap[':limit'] = limit + varMap[':prodSourceLabel1'] = 'managed' + varMap[':prodSourceLabel2'] = 'rc_test' + try: + retMap = {} + totalIDs = 0 + # select + for table in ['ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: + # start transaction + self.conn.begin() + # select + sql = sql0 % table + if table in ['ATLAS_PANDA.jobsArchived4']: + sql = "SELECT /*+ INDEX_RS_ASC(tab JOBSARCHIVED4_CHANGETIME) NO_INDEX(tab(PRODSOURCELABEL))*/ " + sql + " tab " + sqlW + sqlA + else: + sql = "SELECT " + sql + sqlW + sqlX + sql += sql1 + self.cur.arraysize = limit + _logger.debug("getPandaIDsForProdDB %s %s" % (sql+comment,str(varMap))) + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + _logger.debug("getPandaIDsForProdDB got %s" % len(res)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + for PandaID,jobStatus,stateChangeTime,attemptNr,jobDefinitionID,jobExecutionID in res: + # ignore dummy jobs in jobsDefined4 + if table == 'ATLAS_PANDA.jobsDefined4' and (not jobStatus in ['defined','assigned']): + continue + # add status + if not retMap.has_key(jobStatus): + retMap[jobStatus] = [] + # append + retMap[jobStatus].append({'PandaID':PandaID,'attemptNr':attemptNr, + 'stateChangeTime':stateChangeTime.strftime('%Y-%m-%d %H:%M:%S'), + 'jobDefinitionID':jobDefinitionID, + 'jobExecutionID':jobExecutionID}) + totalIDs += 1 + # limit + if totalIDs > limit: + break + _logger.debug("getPandaIDsForProdDB %s ret->%s" % (limit,totalIDs)) + return retMap + except: + type, value, traceBack = sys.exc_info() + _logger.error("getPandaIDsForProdDB : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # update prodDBUpdateTime + def updateProdDBUpdateTime(self,param): + comment = ' /* DBProxy.updateProdDBUpdateTime */' + _logger.debug("updateProdDBUpdateTime %s" % str(param)) + sql0 = "UPDATE %s " + sql0+= "SET prodDBUpdateTime=TO_TIMESTAMP(:prodDBUpdateTime,'YYYY-MM-DD HH24:MI:SS') " + sql0+= "WHERE PandaID=:PandaID AND jobStatus=:jobStatus AND stateChangeTime=TO_TIMESTAMP(:stateChangeTime,'YYYY-MM-DD HH24:MI:SS') " + varMap = {} + varMap[':prodDBUpdateTime'] = param['stateChangeTime'] + varMap[':PandaID'] = param['PandaID'] + varMap[':jobStatus'] = param['jobStatus'] + varMap[':stateChangeTime'] = param['stateChangeTime'] + try: + # convert to string + if isinstance(varMap[':prodDBUpdateTime'],datetime.datetime): + varMap[':prodDBUpdateTime'] = varMap[':prodDBUpdateTime'].strftime('%Y-%m-%d %H:%M:%S') + if isinstance(varMap[':stateChangeTime'],datetime.datetime): + varMap[':stateChangeTime'] = varMap[':stateChangeTime'].strftime('%Y-%m-%d %H:%M:%S') + # set table + if param['jobStatus'] in ['defined','assigned']: + table = 'ATLAS_PANDA.jobsDefined4' + elif 
param['jobStatus'] in ['waiting','pending']: + table = 'ATLAS_PANDA.jobsWaiting4' + elif param['jobStatus'] in ['activated','sent','starting','running','holding','transferring']: + table = 'ATLAS_PANDA.jobsActive4' + elif param['jobStatus'] in ['finished','failed','cancelled']: + table = 'ATLAS_PANDA.jobsArchived4' + else: + _logger.error("invalid status %s" % param['jobStatus']) + return False + # set transaction + self.conn.begin() + # update + sql = sql0 % table + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + retU = self.cur.rowcount + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("updateProdDBUpdateTime %s ret=%s" % (param['PandaID'],retU)) + if retU == 1: + return True + return False + except: + type, value, traceBack = sys.exc_info() + _logger.error("updateProdDBUpdateTime : %s %s" % (type,value)) + # roll back + self._rollback() + return False + + + # add metadata + def addMetadata(self,pandaID,metadata): + comment = ' /* DBProxy.addMetaData */' + _logger.debug("addMetaData : %s" % pandaID) + sql0 = "SELECT PandaID FROM ATLAS_PANDA.metaTable WHERE PandaID=:PandaID" + sql1 = "INSERT INTO ATLAS_PANDA.metaTable (PandaID,metaData) VALUES (:PandaID,:metaData)" + nTry=3 + for iTry in range(nTry): + try: + # autocommit on + self.conn.begin() + # select + varMap = {} + varMap[':PandaID'] = pandaID + self.cur.arraysize = 10 + self.cur.execute(sql0+comment, varMap) + res = self.cur.fetchone() + # already exist + if res != None: + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + # insert + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':metaData'] = metadata + self.cur.execute(sql1+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("addMetaData : %s retry : %s" % (pandaID,iTry)) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("addMetaData : %s %s" % (type,value)) + return False + + + # add stdout + def addStdOut(self,pandaID,stdOut): + comment = ' /* DBProxy.addStdOut */' + _logger.debug("addStdOut : %s start" % pandaID) + sqlJ = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE PandaID=:PandaID FOR UPDATE " + sqlC = "SELECT PandaID FROM ATLAS_PANDA.jobsDebug WHERE PandaID=:PandaID " + sqlI = "INSERT INTO ATLAS_PANDA.jobsDebug (PandaID,stdOut) VALUES (:PandaID,:stdOut) " + sqlU = "UPDATE ATLAS_PANDA.jobsDebug SET stdOut=:stdOut WHERE PandaID=:PandaID " + try: + # autocommit on + self.conn.begin() + # select + varMap = {} + varMap[':PandaID'] = pandaID + self.cur.arraysize = 10 + # check job table + self.cur.execute(sqlJ+comment, varMap) + res = self.cur.fetchone() + if res == None: + _logger.debug("addStdOut : %s non active" % pandaID) + else: + # check debug table + self.cur.execute(sqlC+comment, varMap) + res = self.cur.fetchone() + # already exist + if res != None: + # update + sql = sqlU + else: + # insert + sql = sqlI + # write stdout + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':stdOut'] = stdOut + self.cur.execute(sql+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + errtype,errvalue = sys.exc_info()[:2] + _logger.error("addStdOut : %s %s" % (errtype,errvalue)) + return False + + + # insert sandbox file info + def insertSandboxFileInfo(self,userName,hostName,fileName,fileSize,checkSum): 
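updateProdDBUpdateTime picks the job table from the job status, and addMetadata wraps its insert in a bounded retry with a random 10-20 second pause. A compact sketch of both idioms, with illustrative helper names:

import random
import time

# Routing reproduced from updateProdDBUpdateTime above.
STATUS_TO_TABLE = [
    (('defined', 'assigned'), 'ATLAS_PANDA.jobsDefined4'),
    (('waiting', 'pending'), 'ATLAS_PANDA.jobsWaiting4'),
    (('activated', 'sent', 'starting', 'running', 'holding', 'transferring'),
     'ATLAS_PANDA.jobsActive4'),
    (('finished', 'failed', 'cancelled'), 'ATLAS_PANDA.jobsArchived4'),
]

def table_for_status(job_status):
    for statuses, table in STATUS_TO_TABLE:
        if job_status in statuses:
            return table
    return None  # callers treat an unknown status as an error

def with_retries(operation, n_try=3):
    # Bounded retry with a 10-20 s random sleep, as in addMetadata above.
    for i_try in range(n_try):
        try:
            return operation()
        except Exception:
            if i_try + 1 >= n_try:
                raise
            time.sleep(random.randint(10, 20))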
+ comment = ' /* DBProxy.insertSandboxFileInfo */' + _logger.debug("insertSandboxFileInfo : %s %s %s %s %s" % (userName,hostName,fileName,fileSize,checkSum)) + sqlC = "SELECT userName,fileSize,checkSum FROM ATLAS_PANDAMETA.userCacheUsage " + sqlC += "WHERE hostName=:hostName AND fileName=:fileName FOR UPDATE" + sql = "INSERT INTO ATLAS_PANDAMETA.userCacheUsage " + sql += "(userName,hostName,fileName,fileSize,checkSum,creationTime,modificationTime) " + sql += "VALUES (:userName,:hostName,:fileName,:fileSize,:checkSum,CURRENT_DATE,CURRENT_DATE) " + try: + # begin transaction + self.conn.begin() + # check if it already exists + varMap = {} + varMap[':hostName'] = hostName + varMap[':fileName'] = fileName + self.cur.arraysize = 10 + self.cur.execute(sqlC+comment, varMap) + res = self.cur.fetchall() + if len(res) != 0: + _logger.debug("insertSandboxFileInfo : skip %s %s since already exists" % (hostName,fileName)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return "WARNING: file exist" + # insert + varMap = {} + varMap[':userName'] = userName + varMap[':hostName'] = hostName + varMap[':fileName'] = fileName + varMap[':fileSize'] = fileSize + varMap[':checkSum'] = checkSum + self.cur.execute(sql+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return "OK" + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("insertSandboxFileInfo : %s %s" % (type,value)) + return "ERROR: DB failure" + + + # check duplicated sandbox file + def checkSandboxFile(self,dn,fileSize,checkSum): + comment = ' /* DBProxy.checkSandboxFile */' + _logger.debug("checkSandboxFile : %s %s %s" % (dn,fileSize,checkSum)) + sqlC = "SELECT hostName,fileName FROM ATLAS_PANDAMETA.userCacheUsage " + sqlC += "WHERE userName=:userName AND fileSize=:fileSize AND checkSum=:checkSum " + sqlC += "AND hostName<>:ngHostName AND creationTime>CURRENT_DATE-3 " + sqlC += "AND creationTime>CURRENT_DATE-3 " + try: + retStr = 'NOTFOUND' + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + # begin transaction + self.conn.begin() + # check if it already exists + varMap = {} + varMap[':userName'] = compactDN + varMap[':fileSize'] = fileSize + varMap[':checkSum'] = checkSum + varMap[':ngHostName'] = 'localhost.localdomain' + self.cur.arraysize = 10 + self.cur.execute(sqlC+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if len(res) != 0: + hostName,fileName = res[0] + retStr = "FOUND:%s:%s" % (hostName,fileName) + _logger.debug("checkSandboxFile -> %s" % retStr) + return retStr + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("checkSandboxFile : %s %s" % (type,value)) + return "ERROR: DB failure" + + + # insert dataset + def insertDataset(self,dataset,tablename="ATLAS_PANDA.Datasets"): + comment = ' /* DBProxy.insertDataset */' + _logger.debug("insertDataset(%s)" % dataset.name) + sql0 = "SELECT COUNT(*) FROM %s WHERE vuid=:vuid" % tablename + sql1 = "INSERT INTO %s " % tablename + sql1+= "(%s) " % DatasetSpec.columnNames() + sql1+= DatasetSpec.bindValuesExpression() + # time information + dataset.creationdate = datetime.datetime.utcnow() + dataset.modificationdate = dataset.creationdate + try: + # subtype + if dataset.subType in ['','NULL',None]: + # define using name + if re.search('_dis\d+$',dataset.name) != None: + dataset.subType = 
'dis' + elif re.search('_sub\d+$',dataset.name) != None: + dataset.subType= 'sub' + else: + dataset.subType= 'top' + # begin transaction + self.conn.begin() + # check if it already exists + varMap = {} + varMap[':vuid'] = dataset.vuid + self.cur.execute(sql0+comment, varMap) + nDS, = self.cur.fetchone() + _logger.debug("insertDataset nDS=%s with %s" % (nDS,dataset.vuid)) + if nDS == 0: + # insert + _logger.debug("insertDataset insert %s" % dataset.name) + self.cur.execute(sql1+comment, dataset.valuesMap()) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("insertDataset() : %s %s" % (type,value)) + return False + + + # get and lock dataset with a query + def getLockDatasets(self,sqlQuery,varMapGet,modTimeOffset='',getVersion=False): + comment = ' /* DBProxy.getLockDatasets */' + _logger.debug("getLockDatasets(%s,%s,%s)" % (sqlQuery,str(varMapGet),modTimeOffset)) + sqlGet = "SELECT /*+ INDEX_RS_ASC(tab(STATUS,TYPE,MODIFICATIONDATE)) */ vuid,name,modificationdate,version,transferStatus FROM ATLAS_PANDA.Datasets tab WHERE " + sqlQuery + sqlLock = "UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE" + if modTimeOffset != '': + sqlLock += "+%s" % modTimeOffset + sqlLock += ",transferStatus=MOD(transferStatus+1,10)" + if getVersion: + sqlLock += ",version=:version" + sqlLock += " WHERE vuid=:vuid AND transferStatus=:transferStatus" + retList = [] + try: + # begin transaction + self.conn.begin() + # get datasets + self.cur.arraysize = 1000000 + self.cur.execute(sqlGet+comment,varMapGet) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # loop over all datasets + if res != None and len(res) != 0: + for vuid,name,modificationdate,version,transferStatus in res: + # lock + varMapLock = {} + varMapLock[':vuid'] = vuid + varMapLock[':transferStatus'] = transferStatus + if getVersion: + try: + varMapLock[':version'] = str(int(version) + 1) + except: + varMapLock[':version'] = str(1) + # begin transaction + self.conn.begin() + # update for lock + self.cur.execute(sqlLock+comment,varMapLock) + retU = self.cur.rowcount + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if retU > 0: + # append + if not getVersion: + retList.append((vuid,name,modificationdate)) + else: + retList.append((vuid,name,modificationdate,version)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # retrun + return retList + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getLockDatasets : %s %s" % (type,value)) + return [] + + + # query dataset with map + def queryDatasetWithMap(self,map): + comment = ' /* DBProxy.queryDatasetWithMap */' + _logger.debug("queryDatasetWithMap(%s)" % map) + if map.has_key('name'): + sql1 = """SELECT /*+ BEGIN_OUTLINE_DATA """ + sql1 += """INDEX_RS_ASC(@"SEL$1" "TAB"@"SEL$1" ("DATASETS"."NAME")) """ + sql1 += """OUTLINE_LEAF(@"SEL$1") ALL_ROWS """ + sql1 += """OPTIMIZER_FEATURES_ENABLE('10.2.0.4') """ + sql1 += """IGNORE_OPTIM_EMBEDDED_HINTS """ + sql1 += """END_OUTLINE_DATA */ """ + sql1 += "%s FROM ATLAS_PANDA.Datasets tab" % DatasetSpec.columnNames() + else: + sql1 = "SELECT %s FROM ATLAS_PANDA.Datasets" % DatasetSpec.columnNames() + varMap = {} + for key in map.keys(): + if len(varMap)==0: + sql1+= " WHERE %s=:%s" % (key,key) + else: + sql1+= " AND %s=:%s" % (key,key) + varMap[':%s' 
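insertDataset classifies a dataset by its name suffix when no subType is given. The rule in isolation (the sample dataset names are made up):

import re

def dataset_sub_type(name):
    # Same suffix rules as insertDataset: '_dis<N>' -> 'dis',
    # '_sub<N>' -> 'sub', anything else is a top-level dataset.
    if re.search(r'_dis\d+$', name):
        return 'dis'
    if re.search(r'_sub\d+$', name):
        return 'sub'
    return 'top'

assert dataset_sub_type('mc12.evgen.EVNT.e1234_dis001') == 'dis'
assert dataset_sub_type('user.someone.task_sub0042') == 'sub'
assert dataset_sub_type('mc12.evgen.EVNT.e1234') == 'top'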
% key] = map[key] + try: + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 100 + _logger.debug(sql1+comment+str(varMap)) + self.cur.execute(sql1+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # instantiate Dataset + if res != None and len(res) != 0: + dataset = DatasetSpec() + dataset.pack(res[0]) + return dataset + _logger.error("queryDatasetWithMap(%s) : dataset not found" % map) + return None + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("queryDatasetWithMap(%s) : %s %s" % (map,type,value)) + return None + + + # update dataset + def updateDataset(self,datasets,withLock,withCriteria,criteriaMap): + comment = ' /* DBProxy.updateDataset */' + _logger.debug("updateDataset()") + sql1 = "UPDATE ATLAS_PANDA.Datasets SET %s " % DatasetSpec.bindUpdateExpression() + sql1+= "WHERE vuid=:vuid" + if withCriteria != "": + sql1+= " AND %s" % withCriteria + retList = [] + try: + # start transaction + self.conn.begin() + for dataset in datasets: + _logger.debug("updateDataset(%s,%s)" % (dataset.name,dataset.status)) + # time information + dataset.modificationdate = datetime.datetime.utcnow() + # update + varMap = dataset.valuesMap() + varMap[':vuid'] = dataset.vuid + for cKey in criteriaMap.keys(): + varMap[cKey] = criteriaMap[cKey] + self.cur.execute(sql1+comment, varMap) + retU = self.cur.rowcount + if retU != 0 and retU != 1: + raise RuntimeError, 'Invalid retrun %s' % retU + retList.append(retU) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("updateDataset() ret:%s" % retList) + return retList + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("updateDataset() : %s %s" % (type,value)) + return [] + + + # delete dataset + def deleteDataset(self,name): + comment = ' /* DBProxy.deleteDataset */' + sql1 = "DELETE /*+ INDEX(tab DATASETS_NAME_IDX)*/ FROM ATLAS_PANDA.Datasets tab WHERE name=:name" + try: + # start transaction + self.conn.begin() + # delete + varMap = {} + varMap[':name'] = name + self.cur.execute(sql1+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("deleteDataset() : %s %s" % (type,value)) + return False + + + # get serial number for dataset, insert dummy datasets to increment SN + def getSerialNumber(self,datasetname,definedFreshFlag=None): + comment = ' /* DBProxy.getSerialNumber */' + try: + _logger.debug("getSerialNumber(%s,%s)" % (datasetname,definedFreshFlag)) + # start transaction + self.conn.begin() + # check freashness + if definedFreshFlag == None: + # select + varMap = {} + varMap[':name'] = datasetname + varMap[':type'] = 'output' + sql = "SELECT /*+ INDEX_RS_ASC(TAB (DATASETS.NAME)) */ COUNT(*) FROM ATLAS_PANDA.Datasets tab WHERE type=:type AND name=:name" + self.cur.arraysize = 100 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchone() + # fresh dataset or not + if res != None and len(res) != 0 and res[0] > 0: + freshFlag = False + else: + freshFlag = True + else: + # use predefined flag + freshFlag = definedFreshFlag + # get serial number + sql = "SELECT ATLAS_PANDA.SUBCOUNTER_SUBID_SEQ.nextval FROM dual"; + self.cur.arraysize = 100 + self.cur.execute(sql+comment, {}) + sn, = self.cur.fetchone() + # commit + if not self._commit(): + raise RuntimeError, 
'Commit error' + # release file lock + _logger.debug("getSerialNumber : %s %s" % (sn,freshFlag)) + return (sn,freshFlag) + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getSerialNumber() : %s %s" % (type,value)) + return (-1,False) + + + # get serial number for group job + def getSerialNumberForGroupJob(self,name): + comment = ' /* DBProxy.getSerialNumberForGroupJob */' + retVal = {'sn':'','status':False} + try: + _logger.debug("getSerialNumberForGroupJob(%s)" % name) + # start transaction + self.conn.begin() + # get serial number + sql = "SELECT ATLAS_PANDA.GROUP_JOBID_SEQ.nextval FROM dual"; + self.cur.execute(sql+comment, {}) + sn, = self.cur.fetchone() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + retVal['sn'] = sn + retVal['status'] = True + _logger.debug("getSerialNumberForGroupJob : %s %s" % (name,str(retVal))) + return retVal + except: + # roll back + self._rollback() + # error + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getSerialNumberForGroupJob : %s %s" % (errtype,errvalue)) + retVal['status'] = False + return retVal + + + # change job priorities + def changeJobPriorities(self,newPrioMap): + comment = ' /* DBProxy.changeJobPriorities */' + try: + _logger.debug("changeJobPriorities start") + sql = "UPDATE %s SET currentPriority=:currentPriority,assignedPriority=:assignedPriority " + sql += "WHERE PandaID=:PandaID" + # loop over all PandaIDs + for pandaID,newPrio in newPrioMap.iteritems(): + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':currentPriority'] = newPrio + varMap[':assignedPriority'] = newPrio + _logger.debug("changeJobPriorities PandaID=%s -> prio=%s" % (pandaID,newPrio)) + # start transaction + self.conn.begin() + # try active tables + retU = None + for tableName in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsWaiting4']: + # execute + self.cur.execute((sql % tableName)+comment,varMap) + retU = self.cur.rowcount + if retU > 0: + break + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("changeJobPriorities PandaID=%s retU=%s" % (pandaID,retU)) + # return + _logger.debug("changeJobPriorities done") + return True,'' + except: + # roll back + self._rollback() + # error + errtype,errvalue = sys.exc_info()[:2] + _logger.error("changeJobPriorities : %s %s" % (errtype,errvalue)) + return False,'database error' + + + # update transfer status for a dataset + def updateTransferStatus(self,datasetname,bitMap): + comment = ' /* DBProxy.updateTransferStatus */' + try: + _logger.debug("updateTransferStatus(%s,%s)" % (datasetname,hex(bitMap))) + # start transaction + self.conn.begin() + retTransSt = 0 + # update bitmap + sqlU = 'UPDATE /*+ INDEX_RS_ASC(TAB("DATASETS"."NAME")) */ ATLAS_PANDA.Datasets tab SET transferStatus=ATLAS_PANDA.BITOR(transferStatus,:bitMap) WHERE name=:name' + varMap = {} + varMap[':bitMap'] = bitMap + varMap[':name'] = datasetname + retU = self.cur.execute(sqlU+comment, varMap) + # get transferStatus + sqlS = 'SELECT /*+ INDEX_RS_ASC(TAB("DATASETS"."NAME")) */ transferStatus FROM ATLAS_PANDA.Datasets tab WHERE name=:name' + varMap = {} + varMap[':name'] = datasetname + self.cur.arraysize = 10 + retS = self.cur.execute(sqlS+comment, varMap) + resS = self.cur.fetchall() + if resS != None and len(resS) != 0: + retTransSt = resS[0][0] + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("updateTransferStatus : %s" % hex(retTransSt)) + return 
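changeJobPriorities above does not know which table holds the job, so it issues the same UPDATE against the active, defined and waiting tables and stops at the first non-zero rowcount. A sketch of that fallback loop; cursor is assumed to be a DB-API cursor accepting the ':name' bind style used in the patch, and the function name is illustrative:

def update_priority(cursor, panda_id, new_prio):
    # Run the same UPDATE against each live job table and stop at the
    # first one that reports an affected row.
    sql = ("UPDATE %s SET currentPriority=:currentPriority,"
           "assignedPriority=:assignedPriority WHERE PandaID=:PandaID")
    var_map = {':currentPriority': new_prio,
               ':assignedPriority': new_prio,
               ':PandaID': panda_id}
    for table in ('ATLAS_PANDA.jobsActive4',
                  'ATLAS_PANDA.jobsDefined4',
                  'ATLAS_PANDA.jobsWaiting4'):
        cursor.execute(sql % table, var_map)
        if cursor.rowcount > 0:
            return table   # job found and updated here
    return None            # job is in none of the live tables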
retTransSt + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("updateTransferStatus : %s %s" % (type,value)) + return 0 + + + # get CloudTask. If not exist, create it + def getCloudTask(self,tid): + comment = ' /* getCloudTask */' + try: + _logger.debug("getCloudTask(%s)" % tid) + # check tid + if tid in [None,'NULL']: + _logger.error("invalid TID : %s" % tid) + return None + # start transaction + self.conn.begin() + # get CloudTask + sql = "SELECT %s FROM ATLAS_PANDA.cloudtasks " % CloudTaskSpec.columnNames() + sql += "WHERE taskid=:taskid" + varMap = {} + varMap[':taskid'] = tid + # select + self.cur.arraysize = 10 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + # already exist + if res != None and len(res) != 0: + # instantiate CloudTask + cloudTask = CloudTaskSpec() + cloudTask.pack(res[0]) + # update tmod if status is defined + if cloudTask.status == 'defined': + sql = "UPDATE ATLAS_PANDA.cloudtasks SET tmod=CURRENT_DATE WHERE taskid=:taskid" + varMap = {} + varMap[':taskid'] = cloudTask.taskid + self.cur.execute(sql+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return cloudTask + # insert new CloudTask + _logger.debug("insert new CloudTask") + cloudTask = CloudTaskSpec() + cloudTask.taskid = tid + cloudTask.status = 'defined' + sql = "INSERT INTO ATLAS_PANDA.cloudtasks (id,taskid,status,tmod,tenter) VALUES(ATLAS_PANDA.CLOUDTASKS_ID_SEQ.nextval,:taskid,:status,CURRENT_DATE,CURRENT_DATE)" + sql+= " RETURNING id INTO :newID" + varMap = {} + varMap[':taskid'] = cloudTask.taskid + varMap[':status'] = cloudTask.status + varMap[':newID'] = self.cur.var(cx_Oracle.NUMBER) + self.cur.execute(sql+comment, varMap) + # get id + cloudTask.id = long(varMap[':newID'].getvalue()) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("return new CloudTask") + return cloudTask + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getCloudTask() : %s %s" % (type,value)) + return None + + + # set cloud to CloudTask + def setCloudTask(self,cloudTask): + comment = ' /* setCloudTask */' + try: + _logger.debug("setCloudTask(id=%s,taskid=%s)" % (cloudTask.id,cloudTask.taskid)) + sql = "UPDATE ATLAS_PANDA.cloudtasks SET cloud=:cloud,status=:newStatus,tmod=CURRENT_DATE WHERE id=:id AND status=:oldStatus" + # start transaction + self.conn.begin() + # update + varMap = {} + varMap[':cloud'] = cloudTask.cloud + varMap[':id'] = cloudTask.id + varMap[':newStatus'] = 'assigned' + varMap[':oldStatus'] = 'defined' + self.cur.execute(sql+comment, varMap) + retU = self.cur.rowcount + # succeeded + if retU == 1: + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return cloudTask + # read if it is already set by another thread + sql = "SELECT %s FROM ATLAS_PANDA.cloudtasks " % CloudTaskSpec.columnNames() + sql += "WHERE id=:id" + varMap = {} + varMap[':id'] = cloudTask.id + # select + self.cur.arraysize = 10 + retS = self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # retrun CloudTask + if res != None and len(res) != 0: + # instantiate CloudTask + cloudTask = CloudTaskSpec() + cloudTask.pack(res[0]) + return cloudTask + _logger.error("setCloudTask() : cannot find CloudTask for %s" % cloudTask.id) + return None + except: + # roll back + self._rollback() + # error + type, value, traceBack = 
sys.exc_info() + _logger.error("setCloudTask() : %s %s" % (type,value)) + return None + + + # see CloudTask + def seeCloudTask(self,tid): + comment = ' /* seeCloudTask */' + try: + _logger.debug("seeCloudTask(%s)" % tid) + # check tid + if tid in [None,'NULL']: + _logger.error("invalid TID : %s" % tid) + return None + # start transaction + self.conn.begin() + # select + sql = "SELECT cloud FROM ATLAS_PANDA.cloudtasks WHERE taskid=:taskid" + varMap = {} + varMap[':taskid'] = tid + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # existing task + if res != None and len(res) != 0: + # return cloud + return res[0][0] + else: + return None + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("seeCloudTask() : %s %s" % (type,value)) + return None + + + # reset modification time of a task to shorten retry interval + def resetTmodCloudTask(self,tid): + comment = ' /* resetTmodCloudTask */' + try: + _logger.debug("resetTmodCloudTask %s" % tid) + # check tid + if tid in [None,'NULL']: + _logger.error("invalid TID : %s" % tid) + return None + # start transaction + self.conn.begin() + # update + sql = "UPDATE ATLAS_PANDA.cloudtasks SET tmod=:tmod WHERE taskid=:taskid" + varMap = {} + varMap[':taskid'] = tid + varMap[':tmod'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=165) + self.cur.execute(sql+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("resetTmodCloudTask : %s %s" % (type,value)) + return False + + + # get assigning task + def getAssigningTask(self): + comment = ' /* getAssigningTask */' + try: + _logger.debug("getAssigningTask") + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) + # start transaction + self.conn.begin() + # select + sql = "SELECT taskid FROM ATLAS_PANDA.cloudtasks WHERE status=:status AND tmod>:tmod" + varMap = {} + varMap[':tmod'] = timeLimit + varMap[':status'] = 'defined' + self.cur.arraysize = 100 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # loop over all taskid + retList = [] + if res != None: + for tid, in res: + retList.append(tid) + # return + _logger.debug("getAssigningTask ret:%s" % retList) + return retList + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getAssigningTask : %s %s" % (type,value)) + return [] + + + # set CloudTask by user + def setCloudTaskByUser(self,user,tid,cloud,status): + comment = ' /* setCloudTaskByUser */' + try: + _logger.debug("setCloudTaskByUser(tid=%s,cloud=%s,status=%s) by %s" % (tid,cloud,status,user)) + # check tid + if tid in [None,'NULL']: + tmpMsg = "invalid TID : %s" % tid + _logger.error(tmpMsg) + return "ERROR: " + tmpMsg + # check status + statusList = ['tobeaborted'] + if not status in statusList: + tmpMsg = "invalid status=%s. 
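setCloudTask assigns the cloud with an optimistic compare-and-set: the UPDATE only matches while the row is still 'defined', and a losing thread re-reads the winner's choice. A condensed sketch of that pattern, with the same assumptions about cursor as above and the tmod handling simplified:

def assign_cloud(cursor, cloud_task_id, cloud):
    # Only the thread that still sees status 'defined' wins the update;
    # everyone else falls back to reading the cloud chosen by the winner.
    cursor.execute(
        "UPDATE ATLAS_PANDA.cloudtasks "
        "SET cloud=:cloud,status=:newStatus,tmod=CURRENT_DATE "
        "WHERE id=:id AND status=:oldStatus",
        {':cloud': cloud, ':newStatus': 'assigned',
         ':id': cloud_task_id, ':oldStatus': 'defined'})
    if cursor.rowcount == 1:
        return cloud
    cursor.execute("SELECT cloud FROM ATLAS_PANDA.cloudtasks WHERE id=:id",
                   {':id': cloud_task_id})
    row = cursor.fetchone()
    return row[0] if row else None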
Must be one of %s" (status,str(statusList)) + _logger.error(tmpMsg) + return "ERROR: " + tmpMsg + # start transaction + self.conn.begin() + # get CloudTask + sql = "SELECT %s FROM ATLAS_PANDA.cloudtasks " % CloudTaskSpec.columnNames() + sql += "WHERE taskid=:taskid" + varMap = {} + varMap[':taskid'] = tid + # select + self.cur.arraysize = 10 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + # already exist + if res != None and len(res) != 0: + # set status + sql = "UPDATE ATLAS_PANDA.cloudtasks SET status=:status,tmod=CURRENT_DATE WHERE taskid=:taskid" + varMap = {} + varMap[':taskid'] = tid + varMap[':status'] = status + self.cur.execute(sql+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return "SUCCEEDED" + # insert new CloudTask + sql = "INSERT INTO ATLAS_PANDA.cloudtasks (id,taskid,status,tmod,tenter) VALUES(ATLAS_PANDA.CLOUDTASKS_ID_SEQ.nextval,:taskid,:status,CURRENT_DATE,CURRENT_DATE)" + varMap = {} + varMap[':taskid'] = tid + varMap[':status'] = status + self.cur.execute(sql+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return "SUCCEEDED" + except: + # roll back + self._rollback() + # error + errType,errValue = sys.exc_info()[:2] + _logger.error("setCloudTaskByUser() : %s %s" % (errType,errValue)) + return "ERROR: database error" + + + # query files with map + def queryFilesWithMap(self,map): + comment = ' /* DBProxy.queryFilesWithMap */' + _logger.debug("queryFilesWithMap()") + sql1 = "SELECT PandaID,%s FROM ATLAS_PANDA.filesTable4" % FileSpec.columnNames() + varMap = {} + for key in map.keys(): + if len(varMap)==0: + sql1+= " WHERE %s=:%s" % (key,key) + else: + sql1+= " AND %s=:%s" % (key,key) + varMap[':%s' % key] = map[key] + nTry=3 + for iTry in range(nTry): + try: + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + self.cur.execute(sql1+comment, varMap) + res = self.cur.fetchall() + _logger.debug("queryFilesWithMap() : %s" % str(res)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # instantiate files + retList = [] + for item in res: + # instantiate dummy JobSpec obj for PandaID + job = JobSpec() + job.PandaID = item[0] + # instantiate file + file = FileSpec() + file.pack(item[1:]) + # set owner + file.setOwner(job) + # append + retList.append(file) + return retList + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("queryFilesWithMap retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("queryFilesWithMap : %s %s" % (type,value)) + return [] + + + # count the number of files with map + def countFilesWithMap(self,map): + comment = ' /* DBProxy.countFilesWithMap */' + sql1 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ COUNT(*) FROM ATLAS_PANDA.filesTable4 tab" + varMap = {} + for key in map.keys(): + if len(varMap)==0: + sql1+= " WHERE %s=:%s" % (key,key) + else: + sql1+= " AND %s=:%s" % (key,key) + varMap[':%s' % key] = map[key] + nTry=3 + for iTry in range(nTry): + try: + # start transaction + self.conn.begin() + # select + _logger.debug("countFilesWithMap() : %s %s" % (sql1,str(map))) + self.cur.arraysize = 10 + retS = self.cur.execute(sql1+comment, varMap) + res = self.cur.fetchone() + _logger.debug("countFilesWithMap() : %s %s" % (retS,str(res))) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + nFiles=0 + if res != None: + nFiles=res[0] + return nFiles + except: + # roll 
back + self._rollback() + if iTry+1 < nTry: + _logger.debug("countFilesWithMap() retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("countFilesWithMap(%s) : %s %s" % (map,type,value)) + return -1 + + + # count the number of pending files + def countPendingFiles(self,pandaID,forInput=True): + comment = ' /* DBProxy.countPendingFiles */' + varMap = {} + varMap[':pandaID'] = pandaID + varMap[':status'] = 'ready' + if forInput: + sql1 = "SELECT COUNT(*) FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:pandaID AND type=:type AND status<>:status " + varMap[':type'] = 'input' + else: + sql1 = "SELECT COUNT(*) FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:pandaID AND type IN (:type1,:type2) AND status<>:status " + varMap[':type1'] = 'output' + varMap[':type2'] = 'log' + try: + # start transaction + self.conn.begin() + # select + _logger.debug("countPendingFiles : %s start" % pandaID) + self.cur.arraysize = 10 + retS = self.cur.execute(sql1+comment, varMap) + res = self.cur.fetchone() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + nFiles = -1 + if res != None: + nFiles=res[0] + _logger.debug("countPendingFiles : %s -> %s" % (pandaID,nFiles)) + return nFiles + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("countPendingFiles : %s : %s %s" % (pandaID,errType,errValue)) + return -1 + + + # get datasets associated with file + def getDatasetWithFile(self,lfn,jobPrioity=0): + comment = ' /* DBProxy.getDatasetWithFile */' + varMap = {} + varMap[':lfn'] = lfn + varMap[':status1'] = 'pending' + varMap[':status2'] = 'transferring' + sql1 = "SELECT PandaID,status,destinationDBlock,destinationDBlockToken,dispatchDBlock FROM ATLAS_PANDA.filesTable4 " + sql1 += "WHERE lfn=:lfn AND status IN (:status1,:status2) AND modificationTime %s" % (lfn,str(retMap))) + return retMap + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getDatasetWithFile : %s : %s %s" % (lfn,errType,errValue)) + return {} + + + # get input files currently in use for analysis + def getFilesInUseForAnal(self,outDataset): + comment = ' /* DBProxy.getFilesInUseForAnal */' + sqlSub = "SELECT destinationDBlock,PandaID FROM ATLAS_PANDA.filesTable4 " + sqlSub += "WHERE dataset=:dataset AND type IN (:type1,:type2) GROUP BY destinationDBlock,PandaID" + sqlPaA = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsDefined4 " + sqlPaA += "WHERE PandaID=:PandaID " + sqlPaA += "UNION " + sqlPaA += "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsActive4 " + sqlPaA += "WHERE PandaID=:PandaID " + sqlPan = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsArchived4 " + sqlPan += "WHERE PandaID=:PandaID AND modificationTime<=CURRENT_DATE " + sqlPan += "UNION " + sqlPan += "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDAARCH.jobsArchived " + sqlPan += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)" + sqlIdA = "SELECT PandaID,jobStatus FROM ATLAS_PANDA.jobsArchived4 " + sqlIdA += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sqlIdA += "AND prodSourceLabel=:prodSourceLabel1 " + sqlIdL = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ " + sqlIdL += "PandaID,jobStatus FROM ATLAS_PANDAARCH.jobsArchived tab " + sqlIdL += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sqlIdL += "AND prodSourceLabel=:prodSourceLabel1 AND 
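queryFilesWithMap and countFilesWithMap build their WHERE clause directly from the attribute map, one bind variable per key. The construction step on its own, with no database access (the helper name is mine):

def build_where(base_sql, criteria):
    # Append one 'column=:column' bind per map entry, starting the clause
    # with WHERE and continuing with AND, as the methods above do.
    var_map = {}
    for key, value in criteria.items():
        base_sql += ' WHERE' if not var_map else ' AND'
        base_sql += ' %s=:%s' % (key, key)
        var_map[':%s' % key] = value
    return base_sql, var_map

sql, binds = build_where('SELECT COUNT(*) FROM ATLAS_PANDA.filesTable4 tab',
                         {'destinationDBlock': 'some.dataset_sub01'})
# sql   -> '... tab WHERE destinationDBlock=:destinationDBlock'
# binds -> {':destinationDBlock': 'some.dataset_sub01'}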
modificationTime>(CURRENT_DATE-30) " + sqlDis = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 " + sqlDis += "WHERE PandaID=:PandaID AND type=:type AND dispatchDBlock IS NOT NULL AND modificationTime <= CURRENT_DATE" + sqlLfn = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ lfn,PandaID FROM ATLAS_PANDA.filesTable4 tab " + sqlLfn += "WHERE dispatchDBlock=:dispatchDBlock AND type=:type " + sqlLfn += "AND (destinationDBlockToken IS NULL OR destinationDBlockToken<>:noshadow) AND modificationTime<=CURRENT_DATE" + nTry=3 + for iTry in range(nTry): + inputFilesList = [] + try: + # start transaction + self.conn.begin() + # get sub datasets + varMap = {} + varMap[':dataset'] = outDataset + varMap[':type1'] = 'output' + varMap[':type2'] = 'log' + _logger.debug("getFilesInUseForAnal : %s %s" % (sqlSub,str(varMap))) + self.cur.arraysize = 100000 + retS = self.cur.execute(sqlSub+comment, varMap) + res = self.cur.fetchall() + subDSpandaIDmap = {} + checkedPandaIDs = {} + for subDataset,pandaID in res: + # avoid redundunt lookup + if checkedPandaIDs.has_key(pandaID): + continue + if subDSpandaIDmap.has_key(subDataset): + # append jobs as running since they are not in archived tables + if not pandaID in subDSpandaIDmap[subDataset]: + checkedPandaIDs[pandaID] = 'running' + subDSpandaIDmap[subDataset].append(pandaID) + continue + # look for jobdefID and userName + varMap = {} + varMap[':PandaID'] = pandaID + _logger.debug("getFilesInUseForAnal : %s %s" % (sqlPaA,str(varMap))) + retP = self.cur.execute(sqlPaA+comment, varMap) + resP = self.cur.fetchall() + if len(resP) != 0: + jobDefinitionID,prodUserName = resP[0] + else: + _logger.debug("getFilesInUseForAnal : %s %s" % (sqlPan,str(varMap))) + retP = self.cur.execute(sqlPan+comment, varMap) + resP = self.cur.fetchall() + if len(resP) != 0: + jobDefinitionID,prodUserName = resP[0] + else: + continue + # get PandaIDs with obdefID and userName + tmpPandaIDs = [] + varMap = {} + varMap[':prodUserName'] = prodUserName + varMap[':jobDefinitionID'] = jobDefinitionID + varMap[':prodSourceLabel1'] = 'user' + _logger.debug("getFilesInUseForAnal : %s %s" % (sqlIdA,str(varMap))) + retID = self.cur.execute(sqlIdA+comment, varMap) + resID = self.cur.fetchall() + for tmpPandaID,tmpJobStatus in resID: + checkedPandaIDs[tmpPandaID] = tmpJobStatus + tmpPandaIDs.append(tmpPandaID) + _logger.debug("getFilesInUseForAnal : %s %s" % (sqlIdL,str(varMap))) + retID = self.cur.execute(sqlIdL+comment, varMap) + resID = self.cur.fetchall() + for tmpPandaID,tmpJobStatus in resID: + if not tmpPandaID in tmpPandaIDs: + checkedPandaIDs[tmpPandaID] = tmpJobStatus + tmpPandaIDs.append(tmpPandaID) + # append + if not subDSpandaIDmap.has_key(subDataset): + subDSpandaIDmap[subDataset] = [] + for tmpPandaID in tmpPandaIDs: + # reuse failed files if jobs are in Archived since they cannot change back to active + if checkedPandaIDs[tmpPandaID] in ['failed','cancelled']: + continue + # collect PandaIDs + subDSpandaIDmap[subDataset].append(tmpPandaID) + # loop over all sub datasets + for subDataset,activePandaIDs in subDSpandaIDmap.iteritems(): + # skip empty + if activePandaIDs == []: + continue + # get dispatchDBlocks + pandaID = activePandaIDs[0] + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':type'] = 'input' + _logger.debug("getFilesInUseForAnal : %s %s" % (sqlDis,str(varMap))) + self.cur.arraysize = 10000 + retD = self.cur.execute(sqlDis+comment, varMap) + resD = self.cur.fetchall() + # get LFNs + for disDataset, in resD: + # use new style only + if not 
disDataset.startswith('user_disp.'): + continue + varMap = {} + varMap[':dispatchDBlock'] = disDataset + varMap[':type'] = 'input' + varMap[':noshadow'] = 'noshadow' + _logger.debug("getFilesInUseForAnal : %s %s" % (sqlLfn,str(varMap))) + self.cur.arraysize = 100000 + retL = self.cur.execute(sqlLfn+comment, varMap) + resL = self.cur.fetchall() + # append + for lfn,filePandaID in resL: + # skip files used by archived failed or cancelled jobs + if filePandaID in activePandaIDs and not lfn in inputFilesList: + inputFilesList.append(lfn) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("getFilesInUseForAnal : %s" % len(inputFilesList)) + return inputFilesList + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("inputFilesList retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("inputFilesList(%s) : %s %s" % (outDataset,type,value)) + return [] + + + # get list of dis dataset to get input files in shadow + def getDisInUseForAnal(self,outDataset): + comment = ' /* DBProxy.getDisInUseForAnal */' + sqlSub = "SELECT destinationDBlock,PandaID,status FROM ATLAS_PANDA.filesTable4 " + sqlSub += "WHERE dataset=:dataset AND type=:type1 GROUP BY destinationDBlock,PandaID,status" + sqlPaA = "SELECT jobStatus FROM ATLAS_PANDA.jobsDefined4 " + sqlPaA += "WHERE PandaID=:PandaID " + sqlPaA += "UNION " + sqlPaA += "SELECT jobStatus FROM ATLAS_PANDA.jobsActive4 " + sqlPaA += "WHERE PandaID=:PandaID " + sqlPan = "SELECT jobStatus FROM ATLAS_PANDA.jobsArchived4 " + sqlPan += "WHERE PandaID=:PandaID AND modificationTime<=CURRENT_DATE " + sqlPan += "UNION " + sqlPan += "SELECT jobStatus FROM ATLAS_PANDAARCH.jobsArchived " + sqlPan += "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)" + sqlDis = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 " + sqlDis += "WHERE PandaID=:PandaID AND type=:type AND dispatchDBlock IS NOT NULL AND modificationTime <= CURRENT_DATE" + inputDisList = [] + try: + timeStart = datetime.datetime.utcnow() + _logger.debug("getDisInUseForAnal start for %s" % outDataset) + # start transaction + self.conn.begin() + # get sub datasets + varMap = {} + varMap[':dataset'] = outDataset + varMap[':type1'] = 'log' + _logger.debug("getDisInUseForAnal : %s %s" % (sqlSub,str(varMap))) + self.cur.arraysize = 100000 + retS = self.cur.execute(sqlSub+comment, varMap) + res = self.cur.fetchall() + subDSpandaIDmap = {} + checkedPandaIDs = {} + for subDataset,pandaID,fileStatus in res: + # add map + if not subDSpandaIDmap.has_key(subDataset): + subDSpandaIDmap[subDataset] = [] + # check job status + if fileStatus != 'ready': + varMap = {} + varMap[':PandaID'] = pandaID + _logger.debug("getDisInUseForAnal : %s %s" % (sqlPaA,str(varMap))) + retP = self.cur.execute(sqlPaA+comment, varMap) + resP = self.cur.fetchall() + if len(resP) != 0: + # append jobs as running since they are not in archived tables yet + checkedPandaIDs[pandaID] = 'running' + subDSpandaIDmap[subDataset].append(pandaID) + else: + _logger.debug("getDisInUseForAnal : %s %s" % (sqlPan,str(varMap))) + retP = self.cur.execute(sqlPan+comment, varMap) + resP = self.cur.fetchall() + if len(resP) != 0: + checkedPandaIDs[pandaID], = resP[0] + # reuse failed files if jobs are in Archived since they cannot change back to active + if checkedPandaIDs[pandaID] in ['failed','cancelled']: + continue + # collect PandaIDs + subDSpandaIDmap[subDataset].append(pandaID) + else: + # not found + continue + 
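Both getFilesInUseForAnal and getDisInUseForAnal drop PandaIDs whose jobs ended as failed or cancelled in the archive so that their input files can be reused, as the comments above note. A small standalone sketch of that per-sub-dataset filtering on already-fetched maps; names and sample values are illustrative:

def active_ids_per_sub_dataset(sub_ds_to_ids, job_status):
    # sub_ds_to_ids: {subDataset: [PandaID, ...]} built from filesTable4
    # job_status:    {PandaID: jobStatus} resolved from the job tables
    result = {}
    for sub_ds, panda_ids in sub_ds_to_ids.items():
        result[sub_ds] = [pid for pid in panda_ids
                          if job_status.get(pid) not in ('failed', 'cancelled')]
    return result

active = active_ids_per_sub_dataset(
    {'user.x.out_sub01': [1001, 1002]},
    {1001: 'finished', 1002: 'failed'})
assert active == {'user.x.out_sub01': [1001]}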
else: + # no job lookup since file was sucessfully finished + checkedPandaIDs[pandaID] = 'finished' + # collect PandaIDs + subDSpandaIDmap[subDataset].append(pandaID) + # loop over all sub datasets + for subDataset,activePandaIDs in subDSpandaIDmap.iteritems(): + # skip empty + if activePandaIDs == []: + continue + resDisList = [] + # get dispatchDBlocks + pandaID = activePandaIDs[0] + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':type'] = 'input' + _logger.debug("getDisInUseForAnal : %s %s" % (sqlDis,str(varMap))) + self.cur.arraysize = 10000 + retD = self.cur.execute(sqlDis+comment, varMap) + resD = self.cur.fetchall() + # get shadow dis + for disDataset, in resD: + # use new style only + if not disDataset.startswith('user_disp.'): + continue + if not disDataset in resDisList: + resDisList.append(disDataset) + # append + if resDisList != []: + inputDisList.append((resDisList,activePandaIDs)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + timeDelta = datetime.datetime.utcnow()-timeStart + _logger.debug("getDisInUseForAnal end for %s len=%s time=%ssec" % (outDataset,len(inputDisList),timeDelta.seconds)) + return inputDisList + except: + # roll back + self._rollback() + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getDisInUseForAnal(%s) : %s %s" % (outDataset,errtype,errvalue)) + return None + + + # get input LFNs currently in use for analysis with shadow dis + def getLFNsInUseForAnal(self,inputDisList): + comment = ' /* DBProxy.getLFNsInUseForAnal */' + sqlLfn = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ lfn,PandaID FROM ATLAS_PANDA.filesTable4 tab " + sqlLfn += "WHERE dispatchDBlock=:dispatchDBlock AND type=:type " + sqlLfn += "AND (destinationDBlockToken IS NULL OR destinationDBlockToken<>:noshadow) AND modificationTime<=CURRENT_DATE" + inputFilesList = [] + try: + token = datetime.datetime.utcnow().isoformat('/') + # loop over all shadow dis datasets + pandaIdLfnMap = {} + for disDatasetList,activePandaIDs in inputDisList: + for disDataset in disDatasetList: + # use new style only + if not disDataset.startswith('user_disp.'): + continue + # read LFNs and PandaIDs + if not pandaIdLfnMap.has_key(disDataset): + # start transaction + self.conn.begin() + varMap = {} + varMap[':dispatchDBlock'] = disDataset + varMap[':type'] = 'input' + varMap[':noshadow'] = 'noshadow' + _logger.debug("getLFNsInUseForAnal : <%s> %s %s" % (token,sqlLfn,str(varMap))) + timeStart = datetime.datetime.utcnow() + self.cur.arraysize = 100000 + retL = self.cur.execute(sqlLfn+comment, varMap) + resL = self.cur.fetchall() + # commit + timeDelta = datetime.datetime.utcnow()-timeStart + _logger.debug("getLFNsInUseForAnal : <%s> %s time=%ssec commit" % (token,disDataset,timeDelta.seconds)) + if not self._commit(): + raise RuntimeError, 'Commit error' + # make map + pandaIdLfnMap[disDataset] = {} + for lfn,filePandaID in resL: + if not pandaIdLfnMap[disDataset].has_key(filePandaID): + pandaIdLfnMap[disDataset][filePandaID] = [] + pandaIdLfnMap[disDataset][filePandaID].append(lfn) + _logger.debug("getLFNsInUseForAnal : <%s> %s map made with len=%s" % \ + (token,disDataset,len(resL))) + # append + for disDataset in disDatasetList: + _logger.debug("getLFNsInUseForAnal : <%s> %s list making pandaIDs=%s fileLen=%s" % \ + (token,disDataset,len(activePandaIDs),len(inputFilesList))) + for activePandaID in activePandaIDs: + # skip files used by archived failed or cancelled jobs + if pandaIdLfnMap[disDataset].has_key(activePandaID): + inputFilesList += 
pandaIdLfnMap[disDataset][activePandaID] + _logger.debug("getLFNsInUseForAnal : <%s> %s done" % (token,disDataset)) + _logger.debug("getLFNsInUseForAnal : <%s> %s" % (token,len(inputFilesList))) + return inputFilesList + except: + # roll back + self._rollback() + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getLFNsInUseForAnal(%s) : %s %s" % (str(inputDisList),errtype,errvalue)) + return None + + + # update input files and return corresponding PandaIDs + def updateInFilesReturnPandaIDs(self,dataset,status,fileLFN=''): + comment = ' /* DBProxy.updateInFilesReturnPandaIDs */' + _logger.debug("updateInFilesReturnPandaIDs(%s,%s)" % (dataset,fileLFN)) + sql0 = "SELECT /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ row_ID,PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE status<>:status AND dispatchDBlock=:dispatchDBlock" + sql1 = "UPDATE /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ ATLAS_PANDA.filesTable4 tab SET status=:status WHERE status<>:status AND dispatchDBlock=:dispatchDBlock" + varMap = {} + varMap[':status'] = status + varMap[':dispatchDBlock'] = dataset + if fileLFN != '': + sql0 += " AND lfn=:lfn" + sql1 += " AND lfn=:lfn" + varMap[':lfn'] = fileLFN + for iTry in range(self.nTry): + try: + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + retS = self.cur.execute(sql0+comment, varMap) + resS = self.cur.fetchall() + # update + retU = self.cur.execute(sql1+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # collect PandaIDs + retList = [] + for tmpRowID,tmpPandaID in resS: + # append + if not tmpPandaID in retList: + retList.append(tmpPandaID) + # return + _logger.debug("updateInFilesReturnPandaIDs : %s" % str(retList)) + return retList + except: + # roll back + self._rollback() + # error report + if iTry+1 < self.nTry: + _logger.debug("updateInFilesReturnPandaIDs retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("updateInFilesReturnPandaIDs : %s %s" % (type, value)) + return [] + + + # update file status in dispatch dataset + def updateFileStatusInDisp(self,dataset,fileStatusMap): + comment = ' /* DBProxy.updateFileStatusInDisp */' + _logger.debug("updateFileStatusInDisp(%s,%s)" % (dataset,fileStatusMap)) + sql1 = "UPDATE /*+ index(tab FILESTABLE4_DISPDBLOCK_IDX) */ ATLAS_PANDA.filesTable4 tab SET status=:status WHERE dispatchDBlock=:dispatchDBlock AND lfn=:lfn" + nTry = 1 + for iTry in range(nTry): + try: + # start transaction + self.conn.begin() + # update + for status,lfns in fileStatusMap.iteritems(): + varMap = {} + varMap[':status'] = status + varMap[':dispatchDBlock'] = dataset + # loop over all files + for lfn in lfns: + varMap['lfn'] = lfn + # update + retU = self.cur.execute(sql1+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + _logger.debug("updateFileStatusInDisp : done") + return True + except: + # roll back + self._rollback() + # error report + if iTry+1 < nTry: + _logger.debug("updateFileStatusInDisp retry : %s" % iTry) + time.sleep(random.randint(5,10)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("updateFileStatusInDisp : %s %s" % (type, value)) + return False + + + # update output files and return corresponding PandaIDs + def updateOutFilesReturnPandaIDs(self,dataset,fileLFN=''): + comment = ' /* DBProxy.updateOutFilesReturnPandaIDs */' + _logger.debug("updateOutFilesReturnPandaIDs(%s,%s)" % (dataset,fileLFN)) + sql0 = "SELECT /*+ 
index(tab FILESTABLE4_DESTDBLOCK_IDX) */ row_ID,PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status=:status" + sql1 = "UPDATE /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ ATLAS_PANDA.filesTable4 tab SET status='ready' WHERE destinationDBlock=:destinationDBlock AND status=:status" + varMap = {} + varMap[':status'] = 'transferring' + varMap[':destinationDBlock'] = dataset + if fileLFN != '': + sql0 += " AND lfn=:lfn" + sql1 += " AND lfn=:lfn" + varMap[':lfn'] = fileLFN + for iTry in range(self.nTry): + try: + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + retS = self.cur.execute(sql0+comment, varMap) + resS = self.cur.fetchall() + # update + retList = [] + retU = self.cur.execute(sql1+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # collect PandaIDs + retList = [] + for tmpRowID,tmpPandaID in resS: + # append + if not tmpPandaID in retList: + retList.append(tmpPandaID) + # return + _logger.debug("updateOutFilesReturnPandaIDs : %s" % str(retList)) + return retList + except: + # roll back + self._rollback() + # error report + if iTry+1 < self.nTry: + _logger.debug("updateOutFilesReturnPandaIDs retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("updateOutFilesReturnPandaIDs : %s %s" % (type, value)) + return [] + + + # get _dis datasets associated to _sub + def getAssociatedDisDatasets(self,subDsName): + comment = ' /* DBProxy.getAssociatedDisDatasets */' + _logger.debug("getAssociatedDisDatasets(%s)" % subDsName) + sqlF = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ distinct PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock" + sqlJ = "SELECT distinct dispatchDBlock FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type" + try: + # start transaction + self.conn.begin() + # get PandaIDs + varMap = {} + varMap[':destinationDBlock'] = subDsName + self.cur.arraysize = 10000 + self.cur.execute(sqlF+comment,varMap) + resS = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # loop over all PandaIDs + retList = [] + for pandaID, in resS: + # start transaction + self.conn.begin() + # get _dis name + varMap = {} + varMap[':type'] = 'input' + varMap[':PandaID'] = pandaID + self.cur.arraysize = 1000 + self.cur.execute(sqlJ+comment,varMap) + resD = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for disName, in resD: + if disName != None and not disName in retList: + retList.append(disName) + # return + _logger.debug("getAssociatedDisDatasets : %s" % str(retList)) + return retList + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getAssociatedDisDatasets : %s : %s %s" % (subDsName,errType,errValue)) + return [] + + + # set GUIDs + def setGUIDs(self,files): + comment = ' /* DBProxy.setGUIDs */' + _logger.debug("setGUIDs(%s)" % files) + sql0 = "UPDATE ATLAS_PANDA.filesTable4 SET GUID=:GUID,fsize=:fsize,checksum=:checksum,scope=:scope WHERE lfn=:lfn" + for iTry in range(self.nTry): + try: + # start transaction + self.conn.begin() + # update + for file in files: + varMap = {} + varMap[':GUID'] = file['guid'] + varMap[':lfn'] = file['lfn'] + if file['checksum'] in ['','NULL']: + varMap[':checksum'] = None + else: + varMap[':checksum'] = file['checksum'] + varMap[':fsize'] = file['fsize'] + if not 
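updateOutFilesReturnPandaIDs first selects the matching rows, then flips their status to 'ready', and finally returns the distinct PandaIDs it touched. A sketch of that select-update-collect sequence; cursor is assumed to be a DB-API cursor with the ':name' bind style, the Oracle hint is dropped for brevity, and the function name is mine:

def flip_transferring_to_ready(cursor, dataset):
    # Select the affected rows first so the caller gets the PandaIDs back,
    # then update them within the same transaction.
    binds = {':destinationDBlock': dataset, ':status': 'transferring'}
    cursor.execute(
        "SELECT row_ID,PandaID FROM ATLAS_PANDA.filesTable4 "
        "WHERE destinationDBlock=:destinationDBlock AND status=:status", binds)
    rows = cursor.fetchall()
    cursor.execute(
        "UPDATE ATLAS_PANDA.filesTable4 SET status='ready' "
        "WHERE destinationDBlock=:destinationDBlock AND status=:status", binds)
    panda_ids = []
    for _row_id, pid in rows:
        if pid not in panda_ids:   # distinct IDs, first-seen order
            panda_ids.append(pid)
    return panda_ids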
file.has_key('scope') or file['scope'] in ['','NULL']: + varMap[':scope'] = None + else: + varMap[':scope'] = file['scope'] + self.cur.execute(sql0+comment, varMap) + retU = self.cur.rowcount + _logger.debug("setGUIDs : retU %s" % retU) + if retU<0: + raise RuntimeError, 'SQL error' + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + # error report + if iTry+1 < self.nTry: + _logger.debug("setGUIDs retry : %s" % iTry) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("setGUIDs : %s %s" % (type, value)) + return False + + + # query PandaID with Datasets + def queryPandaIDwithDataset(self,datasets): + comment = ' /* DBProxy.queryPandaIDwithDataset */' + _logger.debug("queryPandaIDwithDataset(%s)" % datasets) + if len(datasets) == 0: + return [] + # make SQL query + sql1 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock GROUP BY PandaID" + # execute + try: + retList = [] + for dataset in datasets: + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + varMap = {} + varMap[':destinationDBlock'] = dataset + self.cur.execute(sql1+comment,varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # get IDs + for r in res: + retList.append(r[0]) + # return + _logger.debug("queryPandaIDwithDataset : %s" % str(retList)) + return retList + except: + # roll back + self._rollback() + # error report + type, value, traceBack = sys.exc_info() + _logger.error("queryPandaIDwithDataset : %s %s" % (type, value)) + return [] + + + # query last files in datasets + def queryLastFilesInDataset(self,datasets): + comment = ' /* DBProxy.queryLastFilesInDataset */' + _logger.debug("queryLastFilesInDataset(%s)" % datasets) + if len(datasets) == 0: + return [] + # make SQL query + sql1 = "SELECT lfn,PandaID FROM ATLAS_PANDA.filesTable4 WHERE dataset=:dataset AND type=:type ORDER BY lfn DESC" + sqlL = "SELECT processingType FROM %s WHERE PandaID=:PandaID " + sqlA = "UNION SELECT processingType FROM ATLAS_PANDAARCH.jobsArchived WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30)" + sql2 = "SELECT lfn FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type=:type" + # execute + try: + retMap = {} + for dataset in datasets: + # start transaction + self.conn.begin() + # select max LFN + varMap = {} + varMap[':type'] = 'output' + varMap[':dataset'] = dataset + self.cur.arraysize = 100000 + self.cur.execute(sql1+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # found + retList = [] + for tmpLFN,pandaID in res: + # skip log.tgz + if re.search('\.log\.tgz(\.\d+)*$',tmpLFN) != None: + continue + # start transaction + self.conn.begin() + self.cur.arraysize = 10 + # check processingType + processingType = None + for tmpTable in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']: + varMap = {} + varMap[':PandaID'] = pandaID + if tmpTable == 'ATLAS_PANDA.jobsArchived4': + self.cur.execute((sqlL % tmpTable)+sqlA+comment, varMap) + else: + self.cur.execute((sqlL % tmpTable)+comment, varMap) + resP = self.cur.fetchone() + if resP != None: + processingType = resP[0] + break + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # job not found + if processingType == None: + continue + # ignore merge 
jobs + if processingType in ['usermerge']: + continue + # start transaction + self.conn.begin() + # select LFNs + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':type'] = 'output' + self.cur.arraysize = 1000 + self.cur.execute(sql2+comment, varMap) + res = self.cur.fetchall() + for r in res: + retList.append(r[0]) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # get only the largest one + break + # append + retMap[dataset] = retList + # return + _logger.debug("queryLastFilesInDataset : %s" % str(retMap)) + return retMap + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("queryLastFilesInDataset : %s %s" % (type, value)) + return {} + + + # query PandaID with filenames + def queryPandaIDwithLFN(self,vlfns): + comment = ' /* DBProxy.queryPandaIDwithLFN */' + _logger.debug("queryPandaIDwithLFN(%s)" % vlfns) + if len(vlfns) == 0: + return [] + # make SQL query + sql1 = "SELECT PandaID FROM ATLAS_PANDA.filesTable4 WHERE lfn=:lfn GROUP BY PandaID" + # execute + retList = [] + for lfn in vlfns: + # get generic LFNs + gLFN = re.sub('\.\d+$','',lfn) + # try + try: + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[':lfn'] = gLFN + self.cur.arraysize = 10000 + self.cur.execute(sql1+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append IDs + for tmpID, in res: + if not tmpID in retList: + retList.append(tmpID) + except: + # roll back + self._rollback() + # error report + type, value, traceBack = sys.exc_info() + _logger.error("queryPandaIDwithLFN : %s %s" % (type, value)) + return [] + # return + _logger.debug("queryPandaIDwithLFN : %s" % str(retList)) + return retList + + + # get job statistics + def getJobStatistics(self,archived=False,predefined=False,workingGroup='',countryGroup='',jobType='',forAnal=None,minPriority=None): + comment = ' /* DBProxy.getJobStatistics */' + _logger.debug("getJobStatistics(%s,%s,'%s','%s','%s',%s,%s)" % (archived,predefined,workingGroup,countryGroup,jobType,forAnal,minPriority)) + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) + sql0 = "SELECT computingSite,jobStatus,COUNT(*) FROM %s " + # processingType + tmpJobTypeMap = {} + sqlJobType = '' + useWhereInSQL = True + if forAnal == None or jobType != "": + useWhereInSQL = False + elif forAnal == True: + tmpJobTypeMap[':prodSourceLabel1'] = 'user' + tmpJobTypeMap[':prodSourceLabel2'] = 'panda' + sql0 += "WHERE prodSourceLabel IN (" + sqlJobType = ":prodSourceLabel1,:prodSourceLabel2) " + else: + tmpJobTypeMap[':prodSourceLabel1'] = 'managed' + sql0 += "WHERE prodSourceLabel IN (" + sqlJobType = ":prodSourceLabel1) " + sql0 += sqlJobType + # predefined + if predefined: + if useWhereInSQL: + sql0 += "AND relocationFlag=1 " + else: + sql0 += "WHERE relocationFlag=1 " + useWhereInSQL = True + # working group + tmpGroupMap = {} + sqlGroups = '' + if workingGroup != '': + if useWhereInSQL: + sqlGroups += "AND workingGroup IN (" + else: + sqlGroups += "WHERE workingGroup IN (" + useWhereInSQL = True + # loop over all groups + idxWG = 1 + for tmpWG in workingGroup.split(','): + tmpWGkey = ':workingGroup%s' % idxWG + sqlGroups += "%s," % tmpWGkey + tmpGroupMap[tmpWGkey] = tmpWG + idxWG += 1 + sqlGroups = sqlGroups[:-1] + ") " + # country group + if countryGroup != '': + if useWhereInSQL: + sqlGroups += "AND countryGroup IN (" + else: + sqlGroups += "WHERE countryGroup IN (" + useWhereInSQL = True + # loop over all groups + 
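queryPandaIDwithLFN reduces each LFN to its generic name by stripping a trailing attempt number, and queryLastFilesInDataset skips log tarballs; both regular expressions are taken from the hunk above, while the sample LFNs are made up:

import re

def generic_lfn(lfn):
    # Strip a trailing attempt number, as queryPandaIDwithLFN does.
    return re.sub(r'\.\d+$', '', lfn)

def is_log_tarball(lfn):
    # LFNs matching this pattern are skipped by queryLastFilesInDataset.
    return re.search(r'\.log\.tgz(\.\d+)*$', lfn) is not None

assert generic_lfn('EVNT.01234._000001.pool.root.2') == 'EVNT.01234._000001.pool.root'
assert is_log_tarball('user.someone.job.log.tgz.1')
assert not is_log_tarball('AOD.01234._000001.pool.root.1')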
idxCG = 1 + for tmpCG in countryGroup.split(','): + tmpCGkey = ':countryGroup%s' % idxCG + sqlGroups += "%s," % tmpCGkey + tmpGroupMap[tmpCGkey] = tmpCG + idxCG += 1 + sqlGroups = sqlGroups[:-1] + ") " + sql0 += sqlGroups + # minimum priority + sqlPrio = '' + tmpPrioMap = {} + if minPriority != None: + if useWhereInSQL: + sqlPrio = "AND currentPriority>=:minPriority " + else: + sqlPrio = "WHERE currentPriority>=:minPriority " + useWhereInSQL = True + tmpPrioMap[':minPriority'] = minPriority + sql0 += sqlPrio + sql0 += "GROUP BY computingSite,jobStatus" + sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ computingSite,jobStatus,COUNT(*) FROM ATLAS_PANDA.jobsArchived4 tab WHERE modificationTime>:modificationTime " + if sqlJobType != "": + sqlA += "AND prodSourceLabel IN (" + sqlA += sqlJobType + if predefined: + sqlA += "AND relocationFlag=1 " + sqlA += sqlGroups + sqlA += sqlPrio + sqlA += "GROUP BY computingSite,jobStatus" + tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] + if archived: + tables.append('ATLAS_PANDA.jobsArchived4') + # sql for materialized view + sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0) + sqlMV = re.sub(':minPriority','TRUNC(:minPriority,-1)',sqlMV) + sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) + ret = {} + nTry=3 + for iTry in range(nTry): + try: + for table in tables: + # start transaction + self.conn.begin() + # select + varMap = {} + for tmpJobType in tmpJobTypeMap.keys(): + varMap[tmpJobType] = tmpJobTypeMap[tmpJobType] + for tmpGroup in tmpGroupMap.keys(): + varMap[tmpGroup] = tmpGroupMap[tmpGroup] + for tmpPrio in tmpPrioMap.keys(): + varMap[tmpPrio] = tmpPrioMap[tmpPrio] + if table != 'ATLAS_PANDA.jobsArchived4': + self.cur.arraysize = 10000 + if table == 'ATLAS_PANDA.jobsActive4': + sqlExeTmp = (sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS' + else: + sqlExeTmp = (sql0+comment) % table + _logger.debug("getJobStatistics : %s %s" % (sqlExeTmp,str(varMap))) + self.cur.execute(sqlExeTmp, varMap) + else: + varMap[':modificationTime'] = timeLimit + self.cur.arraysize = 10000 + self.cur.execute(sqlA+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for item in res: + if not ret.has_key(item[0]): + ret[item[0]] = {} + if not ret[item[0]].has_key(item[1]): + ret[item[0]][item[1]] = 0 + ret[item[0]][item[1]] += item[2] + # for zero + stateList = ['assigned','activated','running'] + if archived: + stateList += ['finished','failed'] + for site in ret.keys(): + for state in stateList: + if not ret[site].has_key(state): + ret[site][state] = 0 + # return + _logger.debug("getJobStatistics -> %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("getJobStatistics() retry : %s" % iTry) + time.sleep(2) + continue + type, value, traceBack = sys.exc_info() + _logger.error("getJobStatistics : %s %s" % (type, value)) + return {} + + + # get job statistics with label + def getJobStatisticsWithLabel(self,siteStr=''): + comment = ' /* DBProxy.getJobStatisticsWithLabel */' + _logger.debug("getJobStatisticsWithLabel(%s)" % siteStr) + sql0 = "SELECT computingSite,prodSourceLabel,jobStatus,COUNT(*) FROM %s " + # site + tmpSiteMap = {} + if siteStr != '': + sql0 += "WHERE computingSite IN (" + # loop over all sites + idxSite = 1 + for tmpSite in siteStr.split(','): + tmpSiteKey = ':site%s' % idxSite + sql0 += "%s," % tmpSiteKey + tmpSiteMap[tmpSiteKey] = tmpSite + idxSite += 1 + sql0 
= sql0[:-1] + ") " + sql0 += "GROUP BY computingSite,prodSourceLabel,jobStatus " + sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0) + sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) + tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] + returnMap = {} + try: + for table in tables: + # start transaction + self.conn.begin() + # select + varMap = {} + self.cur.arraysize = 10000 + if table == 'ATLAS_PANDA.jobsActive4': + sqlExeTmp = (sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS' + else: + sqlExeTmp = (sql0+comment) % table + self.cur.execute(sqlExeTmp,tmpSiteMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for computingSite,prodSourceLabel,jobStatus,nCount in res: + # add site + if not returnMap.has_key(computingSite): + returnMap[computingSite] = {} + # add SourceLabel + if not returnMap[computingSite].has_key(prodSourceLabel): + returnMap[computingSite][prodSourceLabel] = {} + # add jobstatus + if not returnMap[computingSite][prodSourceLabel].has_key(jobStatus): + returnMap[computingSite][prodSourceLabel][jobStatus] = 0 + # add + returnMap[computingSite][prodSourceLabel][jobStatus] += nCount + # return + _logger.debug("getJobStatisticsWithLabel() : %s" % str(returnMap)) + return returnMap + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getJobStatisticsWithLabel : %s %s" % (errType,errValue)) + return {} + + + # get job statistics for brokerage + def getJobStatisticsBrokerage(self,minPriority=None): + comment = ' /* DBProxy.getJobStatisticsBrokerage */' + _logger.debug("getJobStatisticsBrokerage(%s)" % minPriority) + sql0 = "SELECT cloud,computingSite,jobStatus,processingType,COUNT(*) FROM %s WHERE " + sql0 += "prodSourceLabel IN (:prodSourceLabel1) " + tmpPrioMap = {} + if minPriority != None: + sql0 += "AND currentPriority>=:minPriority " + tmpPrioMap[':minPriority'] = minPriority + sql0 += "GROUP BY cloud,computingSite,jobStatus,processingType" + # sql for materialized view + sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0) + sqlMV = re.sub(':minPriority','TRUNC(:minPriority,-1)',sqlMV) + sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) + tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] + if minPriority != None: + # read the number of running jobs with prio<=MIN + tables.append('ATLAS_PANDA.jobsActive4') + sqlMVforRun = re.sub('currentPriority>=','currentPriority<=',sqlMV) + ret = {} + nTry=3 + iActive = 0 + for iTry in range(nTry): + try: + for table in tables: + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[':prodSourceLabel1'] = 'managed' + for tmpPrio in tmpPrioMap.keys(): + varMap[tmpPrio] = tmpPrioMap[tmpPrio] + self.cur.arraysize = 10000 + useRunning = None + if table == 'ATLAS_PANDA.jobsActive4': + # first count non-running and then running if minPriority is specified + if minPriority != None: + if iActive == 0: + useRunning = False + else: + useRunning = True + iActive += 1 + if useRunning in [None,False]: + self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap) + else: + self.cur.execute((sqlMVforRun+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap) + else: + self.cur.execute((sql0+comment) % table, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for cloud,computingSite,jobStatus,processingType,count in res: + # check jobstatus if minPriority isspecified 
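getJobStatistics, getJobStatisticsWithLabel and getJobStatisticsBrokerage all derive the materialized-view variant of their query from the per-table SQL with regex substitutions: COUNT(*) becomes SUM(num_of_jobs), the priority bind value is rounded down with TRUNC(:minPriority,-1) so cached aggregates can be reused, and a RESULT_CACHE hint is injected after SELECT. A standalone sketch of that rewrite, using only the substitutions visible above on an abridged form of the statistics SQL:

    import re

    # Per-table statistics query, abridged from the code above.
    sql0 = ("SELECT computingSite,jobStatus,COUNT(*) FROM %s "
            "WHERE prodSourceLabel IN (:prodSourceLabel1) "
            "AND currentPriority>=:minPriority "
            "GROUP BY computingSite,jobStatus")

    # Rewrite it for the MV_JOBSACTIVE4_STATS materialized view.
    sqlMV = re.sub('COUNT\(\*\)', 'SUM(num_of_jobs)', sql0)
    sqlMV = re.sub(':minPriority', 'TRUNC(:minPriority,-1)', sqlMV)
    sqlMV = re.sub('SELECT ', 'SELECT /*+ RESULT_CACHE */ ', sqlMV)

    # sqlMV % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS' is then executed in place of
    # sql0 % 'ATLAS_PANDA.jobsActive4', with the same bind-variable map.

Because only the active-jobs table has a materialized view, the other tables keep using sql0, which is why the table loop above switches on the table name before choosing which statement to execute.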
+ if minPriority != None: + # count the number of non-running with prio>=MIN + if useRunning == True and jobStatus != 'running': + continue + # count the number of running with prio<=MIN + if useRunning == False and jobStatus == 'running': + continue + # add cloud + if not ret.has_key(cloud): + ret[cloud] = {} + # add site + if not ret[cloud].has_key(computingSite): + ret[cloud][computingSite] = {} + # add processingType + if not ret[cloud][computingSite].has_key(processingType): + ret[cloud][computingSite][processingType] = {} + # add jobStatus + if not ret[cloud][computingSite][processingType].has_key(jobStatus): + ret[cloud][computingSite][processingType][jobStatus] = count + # for zero + for cloud,cloudVal in ret.iteritems(): + for site,siteVal in cloudVal.iteritems(): + for pType,typeVal in siteVal.iteritems(): + for stateItem in ['assigned','activated','running','transferring']: + if not typeVal.has_key(stateItem): + typeVal[stateItem] = 0 + # return + _logger.debug("getJobStatisticsBrokerage -> %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.debug("getJobStatisticsBrokerage retry : %s" % iTry) + time.sleep(2) + continue + type, value, traceBack = sys.exc_info() + _logger.error("getJobStatisticsBrokerage : %s %s" % (type, value)) + return {} + + + # get job statistics for analysis brokerage + def getJobStatisticsAnalBrokerage(self,minPriority=None): + comment = ' /* DBProxy.getJobStatisticsAnalBrokerage */' + _logger.debug("getJobStatisticsAnalBrokerage(%s)" % minPriority) + sql0 = "SELECT computingSite,jobStatus,processingType,COUNT(*) FROM %s WHERE " + sql0 += "prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " + if minPriority != None: + sql0 += "AND currentPriority>=:minPriority " + sql0 += "GROUP BY cloud,computingSite,jobStatus,processingType" + # sql for materialized view + sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0) + sqlMV = re.sub(':minPriority','TRUNC(:minPriority,-1)',sqlMV) + sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) + tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] + ret = {} + nTry=3 + for iTry in range(nTry): + try: + for table in tables: + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + if minPriority != None: + varMap[':minPriority'] = minPriority + self.cur.arraysize = 10000 + if table == 'ATLAS_PANDA.jobsActive4': + self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap) + else: + self.cur.execute((sql0+comment) % table, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for computingSite,jobStatus,processingType,count in res: + # add site + if not ret.has_key(computingSite): + ret[computingSite] = {} + # add processingType + if not ret[computingSite].has_key(processingType): + ret[computingSite][processingType] = {} + # add jobStatus + if not ret[computingSite][processingType].has_key(jobStatus): + ret[computingSite][processingType][jobStatus] = count + # for zero + for site,siteVal in ret.iteritems(): + for pType,typeVal in siteVal.iteritems(): + for stateItem in ['defined','assigned','activated','running']: + if not typeVal.has_key(stateItem): + typeVal[stateItem] = 0 + # return + _logger.debug("getJobStatisticsAnalBrokerage -> %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + if iTry+1 < nTry: + 
_logger.debug("getJobStatisticsAnalBrokerage retry : %s" % iTry) + time.sleep(2) + continue + type, value, traceBack = sys.exc_info() + _logger.error("getJobStatisticsAnalBrokerage : %s %s" % (type, value)) + return {} + + + # get highest prio jobs + def getHighestPrioJobStat(self): + comment = ' /* DBProxy.getHighestPrioJobStat */' + _logger.debug("getHighestPrioJobStat()") + sql0 = "SELECT cloud,max(currentPriority) FROM %s WHERE " + sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY cloud" + sqlC = "SELECT COUNT(*) FROM %s WHERE " + sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " + sqlC += "cloud=:cloud AND currentPriority=:currentPriority" + tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] + ret = {} + try: + for table in tables: + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[':prodSourceLabel'] = 'managed' + if table == 'ATLAS_PANDA.jobsActive4': + varMap[':jobStatus1'] = 'activated' + varMap[':jobStatus2'] = 'dummy' + else: + varMap[':jobStatus1'] = 'defined' + varMap[':jobStatus2'] = 'assigned' + self.cur.arraysize = 100 + _logger.debug((sql0+comment) % table) + self.cur.execute((sql0+comment) % table, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for cloud,maxPriority in res: + # add cloud + if not ret.has_key(cloud): + ret[cloud] = {} + # add max priority + prioKey = 'highestPrio' + nNotRunKey = 'nNotRun' + getNumber = False + if not ret[cloud].has_key(prioKey): + ret[cloud][prioKey] = maxPriority + ret[cloud][nNotRunKey] = 0 + getNumber = True + else: + # use highest one + if ret[cloud][prioKey] < maxPriority: + ret[cloud][prioKey] = maxPriority + # reset + ret[cloud][nNotRunKey] = 0 + getNumber = True + elif ret[cloud][prioKey] == maxPriority: + getNumber = True + # get number of jobs with highest prio + if getNumber: + varMap[':cloud'] = cloud + varMap[':currentPriority'] = maxPriority + self.cur.arraysize = 10 + _logger.debug((sqlC+comment) % table) + self.cur.execute((sqlC+comment) % table, varMap) + resC = self.cur.fetchone() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + ret[cloud][nNotRunKey] += resC[0] + # return + return ret + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getHighestPrioJobStat : %s %s" % (type, value)) + return {} + + + # get highest prio jobs per process group + def getHighestPrioJobStatPerPG(self,useMorePG=False): + comment = ' /* DBProxy.getHighestPrioJobStatPerPG */' + _logger.debug("getHighestPrioJobStatPerPG()") + if useMorePG == False: + sql0 = "SELECT cloud,max(currentPriority),processingType FROM %s WHERE " + sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) GROUP BY cloud,processingType" + sqlC = "SELECT COUNT(*) FROM %s WHERE " + sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " + sqlC += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType" + else: + sql0 = "SELECT cloud,max(currentPriority),processingType,coreCount,workingGroup FROM %s WHERE " + sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) " + sql0 += "GROUP BY cloud,processingType,coreCount,workingGroup" + sqlC = "SELECT COUNT(*) FROM %s WHERE " + sqlC += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " + sqlC += 
"cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType AND " + sqlC += "coreCount=:coreCount AND workingGroup=:workingGroup" + sqlCN = "SELECT COUNT(*) FROM %s WHERE " + sqlCN += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND " + sqlCN += "cloud=:cloud AND currentPriority=:currentPriority AND processingType=:processingType AND " + sqlCN += "coreCount IS NULL AND workingGroup=:workingGroup" + tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] + ret = {} + try: + for table in tables: + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[':prodSourceLabel'] = 'managed' + if table == 'ATLAS_PANDA.jobsActive4': + varMap[':jobStatus1'] = 'activated' + varMap[':jobStatus2'] = 'dummy' + else: + varMap[':jobStatus1'] = 'defined' + varMap[':jobStatus2'] = 'assigned' + self.cur.arraysize = 100 + _logger.debug((sql0+comment) % table+str(varMap)) + self.cur.execute((sql0+comment) % table, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for tmpItem in res: + if useMorePG == False: + cloud,maxPriority,processingType = tmpItem + origCloud = cloud + origProcessingType = processingType + else: + origCloud,maxPriority,origProcessingType,coreCount,workingGroup = tmpItem + # convert cloud and processingType for extended process group + if useMorePG == ProcessGroups.extensionLevel_1: + # extension level 1 + cloud,processingType = ProcessGroups.converCPTforEPG(origCloud,origProcessingType, + coreCount) + else: + # extension level 2 + cloud,processingType = ProcessGroups.converCPTforEPG(origCloud,origProcessingType, + coreCount,workingGroup) + # add cloud + if not ret.has_key(cloud): + ret[cloud] = {} + # get process group + processGroup = ProcessGroups.getProcessGroup(processingType) + # add process group + if not ret[cloud].has_key(processGroup): + ret[cloud][processGroup] = {} + # add max priority + prioKey = 'highestPrio' + nNotRunKey = 'nNotRun' + getNumber = False + if not ret[cloud][processGroup].has_key(prioKey): + ret[cloud][processGroup][prioKey] = maxPriority + ret[cloud][processGroup][nNotRunKey] = 0 + getNumber = True + else: + # use highest one + if ret[cloud][processGroup][prioKey] < maxPriority: + ret[cloud][processGroup][prioKey] = maxPriority + # reset + ret[cloud][processGroup][nNotRunKey] = 0 + getNumber = True + elif ret[cloud][processGroup][prioKey] == maxPriority: + getNumber = True + # get number of jobs with highest prio + if getNumber: + varMap[':cloud'] = origCloud + varMap[':currentPriority'] = maxPriority + varMap[':processingType'] = origProcessingType + if useMorePG != False: + varMap[':workingGroup'] = workingGroup + if coreCount != None: + varMap[':coreCount'] = coreCount + self.cur.arraysize = 10 + _logger.debug((sqlC+comment) % table+str(varMap)) + self.cur.execute((sqlC+comment) % table, varMap) + resC = self.cur.fetchone() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + ret[cloud][processGroup][nNotRunKey] += resC[0] + # return + _logger.debug("getHighestPrioJobStatPerPG -> %s" % ret) + return ret + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getHighestPrioJobStatPerPG : %s %s" % (type, value)) + return {} + + + # get queued analysis jobs at a site + def getQueuedAnalJobs(self,site,dn): + comment = ' /* DBProxy.getQueuedAnalJobs */' + _logger.debug("getQueuedAnalJobs(%s,%s)" % (site,dn)) + sql0 = "SELECT 
COUNT(*),jobStatus FROM %s WHERE " + sql0 += "prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) " + sql0 += "AND computingSite=:computingSite AND prodUserName != :prodUserName " + sql0 += "GROUP BY jobStatus " + tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'] + try: + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + nQueued = 0 + nRunning = 0 + # loop over all tables + for table in tables: + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[':prodSourceLabel'] = 'user' + varMap[':computingSite'] = site + varMap[':prodUserName'] = compactDN + if table == 'ATLAS_PANDA.jobsActive4': + varMap[':jobStatus1'] = 'activated' + varMap[':jobStatus2'] = 'running' + else: + varMap[':jobStatus1'] = 'defined' + varMap[':jobStatus2'] = 'assigned' + self.cur.arraysize = 10 + self.cur.execute((sql0+comment) % table, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # sum + for cnt,jobStatus in res: + if jobStatus == 'running': + nRunning += cnt + else: + nQueued += cnt + # return + return {'queued':nQueued, 'running':nRunning} + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getQueuedAnalJobs : %s %s" % (errType,errValue)) + return {} + + + # get computingSite and destinationSE for a dataset + def getDestSE(self,dsname,fromArch=False): + comment = ' /* DBProxy.getDestSE */' + _logger.debug("getDestSE(%s,%s)" % (dsname,fromArch)) + sql0 = "SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock " + if not fromArch: + sql0 += "AND status=:status " + sql0 += "AND rownum=1" + sql1 = "SELECT computingSite,destinationSE FROM %s WHERE PandaID=:PandaID" + actTableList = ['ATLAS_PANDA.jobsActive4'] + if fromArch: + actTableList.append("ATLAS_PANDA.jobsArchived4") + try: + # start transaction + self.conn.begin() + # select + varMap = {} + if not fromArch: + varMap[':status'] = 'transferring' + varMap[':destinationDBlock'] = dsname + self.cur.arraysize = 10 + self.cur.execute(sql0+comment, varMap) + res = self.cur.fetchall() + # get PandaID + pandaID = None + if len(res) != 0: + pandaID = res[0][0] + # get computingSite and destinationSE + destSE = None,None + if pandaID != None: + varMap = {} + varMap[':PandaID'] = pandaID + # loop over all active tables + foundInActive = False + for actTable in actTableList: + self.cur.execute((sql1 % actTable)+comment, varMap) + res = self.cur.fetchall() + if len(res) != 0: + destSE = res[0] + foundInActive = True + break + # look into ARCH table + if not foundInActive: + if fromArch: + sqlA = "SELECT computingSite,destinationSE FROM ATLAS_PANDAARCH.jobsArchived WHERE PandaID=:PandaID " + sqlA += "AND modificationTime>(CURRENT_DATE-30) " + self.cur.execute(sqlA+comment, varMap) + res = self.cur.fetchall() + if len(res) != 0: + destSE = res[0] + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + _logger.debug("getDestSE(%s) : %s" % (dsname,str(destSE))) + return destSE + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getDestSE : %s %s" % (type, value)) + return None,None + + + # get destinationDBlockToken for a dataset + def getDestTokens(self,dsname): + comment = ' /* DBProxy.getDestTokens */' + _logger.debug("getDestTokens(%s)" % dsname) + sql0 = "SELECT /*+ index(tab 
FILESTABLE4_DESTDBLOCK_IDX) */ destinationDBlockToken FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND rownum=1" + try: + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[':destinationDBlock'] = dsname + self.cur.arraysize = 10 + self.cur.execute(sql0+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + retToken = None + if len(res) != 0: + retToken = res[0][0] + # convert None to NULL + if retToken == None: + retToken = 'NULL' + # return + _logger.debug("getDestTokens(%s) : %s" % (dsname,retToken)) + return retToken + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getDestTokens : %s %s" % (type, value)) + return None + + + # get the number of job for a user + def getNumberJobsUser(self,dn,workingGroup=None): + comment = ' /* DBProxy.getNumberJobsUser */' + _logger.debug("getNumberJobsUsers(%s,%s)" % (dn,workingGroup)) + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + if workingGroup != None: + sql0 = "SELECT COUNT(*) FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel AND workingGroup=:workingGroup" + else: + sql0 = "SELECT COUNT(*) FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel=:prodSourceLabel AND workingGroup IS NULL" + nTry = 1 + nJob = 0 + for iTry in range(nTry): + try: + for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'): + # start transaction + self.conn.begin() + # select + varMap = {} + varMap[':prodUserName'] = compactDN + varMap[':prodSourceLabel'] = 'user' + if workingGroup != None: + varMap[':workingGroup'] = workingGroup + self.cur.arraysize = 10 + self.cur.execute((sql0+comment) % table, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + if len(res) != 0: + nJob += res[0][0] + # return + _logger.debug("getNumberJobsUsers(%s) : %s" % (dn,nJob)) + return nJob + except: + # roll back + self._rollback() + if iTry+1 < nTry: + time.sleep(2) + continue + type, value, traceBack = sys.exc_info() + _logger.error("getNumberJobsUsers : %s %s" % (type, value)) + return 0 + + + # get job statistics for ExtIF + def getJobStatisticsForExtIF(self,sourcetype=None): + comment = ' /* DBProxy.getJobStatisticsForExtIF */' + _logger.debug("getJobStatisticsForExtIF()") + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) + if sourcetype == 'analysis': + sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) GROUP BY jobStatus,cloud" + sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ jobStatus,COUNT(*),cloud FROM %s tab WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " + else: + sql0 = "SELECT jobStatus,COUNT(*),cloud FROM %s WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) GROUP BY jobStatus,cloud" + sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ jobStatus,COUNT(*),cloud FROM %s tab WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " + sqlA+= "AND modificationTime>:modificationTime GROUP BY jobStatus,cloud" + # sql for materialized view + sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sql0) + sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) + ret = {} + try: + for table in 
('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsDefined4'): + # start transaction + self.conn.begin() + # select + varMap = {} + if sourcetype == 'analysis': + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + else: + varMap[':prodSourceLabel1'] = 'managed' + varMap[':prodSourceLabel2'] = 'rc_test' + if table != 'ATLAS_PANDA.jobsArchived4': + self.cur.arraysize = 10000 + if table == 'ATLAS_PANDA.jobsActive4': + self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap) + else: + self.cur.execute((sql0+comment) % table, varMap) + else: + varMap[':modificationTime'] = timeLimit + self.cur.arraysize = 10000 + self.cur.execute((sqlA+comment) % table, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # change NULL to US for old jobs + newRes = [] + usMap = {} + for jobStatus,count,cloud in res: + if not cloud in ['US','NULL']: + # append since no conversion is required + newRes.append((jobStatus,count,cloud)) + else: + # sum + if not usMap.has_key(jobStatus): + usMap[jobStatus] = 0 + usMap[jobStatus] += count + # append US counts + for jobStatus,count in usMap.iteritems(): + newRes.append((jobStatus,count,'US')) + # create map + for item in newRes: + # add cloud + if not ret.has_key(item[2]): + ret[item[2]] = {} + # this is needed for auto_increment of InnoDB + if not ret[item[2]].has_key(item[0]): + ret[item[2]][item[0]] = item[1] + # return + _logger.debug("getJobStatisticsForExtIF -> %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getJobStatisticsForExtIF : %s %s" % (type, value)) + return {} + + + # get job statistics per processingType + def getJobStatisticsPerProcessingType(self,useMorePG=False): + comment = ' /* DBProxy.getJobStatisticsPerProcessingType */' + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) + _logger.debug("getJobStatisticsPerProcessingType()") + if useMorePG == False: + sqlN = "SELECT jobStatus,COUNT(*),cloud,processingType FROM %s " + sqlN += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) GROUP BY jobStatus,cloud,processingType" + sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ jobStatus,COUNT(*),cloud,processingType FROM %s tab " + sqlA += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>:modificationTime GROUP BY jobStatus,cloud,processingType" + else: + sqlN = "SELECT jobStatus,COUNT(*),cloud,processingType,coreCount,workingGroup FROM %s " + sqlN += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) " + sqlN += "GROUP BY jobStatus,cloud,processingType,coreCount,workingGroup" + sqlA = "SELECT /*+ INDEX_RS_ASC(tab (MODIFICATIONTIME PRODSOURCELABEL)) */ " + sqlA += "jobStatus,COUNT(*),cloud,processingType,coreCount,workingGroup FROM %s tab " + sqlA += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>:modificationTime " + sqlA += "GROUP BY jobStatus,cloud,processingType,coreCount,workingGroup" + # sql for materialized view + sqlMV = re.sub('COUNT\(\*\)','SUM(num_of_jobs)',sqlN) + sqlMV = re.sub('SELECT ','SELECT /*+ RESULT_CACHE */ ',sqlMV) + ret = {} + try: + for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsDefined4'): + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + # select 
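These statistics methods fold the fetched rows into nested maps keyed by cloud and job status (and, in the per-processingType variants, by processingType as well), creating the intermediate dictionaries on first sight and summing the per-table counts. A minimal sketch of that accumulation step is below; the example rows are made up, and setdefault() is used here for brevity where the patch itself checks with has_key().

    # Rows come back as (jobStatus, count, cloud, processingType); fold them
    # into ret[cloud][processingType][jobStatus].  Example rows only.
    rows = [
        ('running',   120, 'US', 'reprocessing'),
        ('activated',  40, 'US', 'reprocessing'),
        ('running',    15, 'DE', 'simulation'),
    ]

    ret = {}
    for jobStatus, count, cloud, processingType in rows:
        cloudMap = ret.setdefault(cloud, {})
        typeMap = cloudMap.setdefault(processingType, {})
        typeMap.setdefault(jobStatus, 0)
        typeMap[jobStatus] += count

    # ret -> {'US': {'reprocessing': {'running': 120, 'activated': 40}},
    #         'DE': {'simulation': {'running': 15}}}

Summing across the table loop in this way is what lets the defined, active and archived tables contribute to a single per-cloud count without any extra SQL.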
+ varMap = {} + varMap[':prodSourceLabel1'] = 'managed' + varMap[':prodSourceLabel2'] = 'rc_test' + if table == 'ATLAS_PANDA.jobsArchived4': + varMap[':modificationTime'] = timeLimit + self.cur.execute((sqlA+comment) % table, varMap) + else: + if table == 'ATLAS_PANDA.jobsActive4' and useMorePG == False: + self.cur.execute((sqlMV+comment) % 'ATLAS_PANDA.MV_JOBSACTIVE4_STATS', varMap) + else: + # use real table since coreCount is unavailable in MatView + self.cur.execute((sqlN+comment) % table, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for tmpItem in res: + if useMorePG == False: + jobStatus,count,cloud,processingType = tmpItem + else: + jobStatus,count,cloud,processingType,coreCount,workingGroup = tmpItem + # convert cloud and processingType for extended process group + if useMorePG == ProcessGroups.extensionLevel_1: + # extension level 1 + cloud,processingType = ProcessGroups.converCPTforEPG(cloud,processingType, + coreCount) + else: + # extension level 2 + cloud,processingType = ProcessGroups.converCPTforEPG(cloud,processingType, + coreCount,workingGroup) + + # add cloud + if not ret.has_key(cloud): + ret[cloud] = {} + # add processingType + if not ret[cloud].has_key(processingType): + ret[cloud][processingType] = {} + # add status + if not ret[cloud][processingType].has_key(jobStatus): + ret[cloud][processingType][jobStatus] = 0 + ret[cloud][processingType][jobStatus] += count + # return + _logger.debug("getJobStatisticsPerProcessingType -> %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getJobStatisticsPerProcessingType : %s %s" % (type, value)) + return {} + + + # get the number of waiting jobs per site and user + def getJobStatisticsPerUserSite(self): + comment = ' /* DBProxy.getJobStatisticsPerUserSite */' + _logger.debug("getJobStatisticsPerUserSite()") + sqlN = "SELECT COUNT(*),prodUserID,computingSite FROM %s " + sqlN += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus=:jobStatus GROUP BY prodUserID,computingSite" + ret = {} + try: + for table in ('ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4'): + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 100000 + # select + if table == 'ATLAS_PANDA.jobsActive4': + jobStatus = 'activated' + else: + jobStatus = 'assigned' + varMap = {} + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + varMap[':jobStatus'] = jobStatus + self.cur.execute((sqlN+comment) % table, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for cnt,prodUserName,computingSite in res: + # add site + if not ret.has_key(computingSite): + ret[computingSite] = {} + # add user + if not ret[computingSite].has_key(prodUserName): + ret[computingSite][prodUserName] = {'assigned':0,'activated':0} + # add info + ret[computingSite][prodUserName][jobStatus] = cnt + # return + _logger.debug("getJobStatisticsPerUserSite -> %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + # error + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getJobStatisticsPerUserSite : %s %s" % (errtype,errvalue)) + return {} + + + # get number of analysis jobs per user + def getNUserJobs(self,siteName,nJobs): + comment = ' /* DBProxy.getNUserJobs */' + _logger.debug("getNUserJobs(%s)" % siteName) + sql0 = "SELECT * FROM (SELECT prodUserID FROM 
ATLAS_PANDA.jobsActive4 " + sql0 += "WHERE jobStatus=:jobStatus AND prodSourceLabel in (:prodSourceLabel1,:prodSourceLabel2) " + sql0 += "AND computingSite=:computingSite ORDER BY currentPriority DESC) WHERE rownum<=:nJobs" + varMap = {} + varMap[':computingSite'] = siteName + varMap[':nJobs'] = nJobs + varMap[':jobStatus'] = 'activated' + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + ret = {} + try: + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + self.cur.execute(sql0+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for prodUserID, in res: + if not ret.has_key(prodUserID): + ret[prodUserID] = 0 + ret[prodUserID] += 1 + # return + _logger.debug("getNUserJobs() : %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getNUserJobs : %s %s" % (type, value)) + return {} + + + # get number of activated analysis jobs + def getNAnalysisJobs(self,nProcesses): + comment = ' /* DBProxy.getNAnalysisJobs */' + _logger.debug("getNAnalysisJobs(%s)" % nProcesses) + sql0 = "SELECT computingSite,COUNT(*) FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus " + sql0 += "AND (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2) GROUP BY computingSite" + varMap = {} + varMap[':jobStatus'] = 'activated' + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + ret = {} + try: + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + self.cur.execute(sql0+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # create map + for item in res: + ret[item[0]] = float(item[1])/nProcesses + # return + _logger.debug("getNAnalysisJobs() : %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getNAnalysisJobs : %s %s" % (type, value)) + return {} + + + # generate pilot token + def genPilotToken(self,schedulerhost,scheduleruser,schedulerid): + comment = ' /* DBProxy.genPilotToken */' + try: + _logger.debug("genPilotToken(%s,%s,%s)" % (schedulerhost,scheduleruser,schedulerid)) + token = commands.getoutput('uuidgen') + timeNow = datetime.datetime.utcnow() + timeExp = timeNow + datetime.timedelta(days=4) + sql = "INSERT INTO ATLAS_PANDA.pilottoken (token,schedulerhost,scheduleruser,schedulerid,created,expires) " + sql += "VALUES (:token,:schedulerhost,:scheduleruser,:schedulerid,:created,:expires)" + # start transaction + self.conn.begin() + # execute + varMap = {':token':token,':schedulerhost':schedulerhost,':scheduleruser':scheduleruser, + ':schedulerid':schedulerid,':created':timeNow,':expires':timeExp} + self.cur.execute(sql+comment,varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + retVal = "token=%s,created=%s,expires=%s" % (token,timeNow.strftime('%Y-%m-%d %H:%M:%S'), + timeExp.strftime('%Y-%m-%d %H:%M:%S')) + _logger.debug("genPilotToken -> %s" % retVal) + return retVal + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("genPilotToken : %s %s" % (type, value)) + return None + + + # get list of scheduler users + def getListSchedUsers(self): + comment = ' /* DBProxy.getListSchedUsers */' + try: + _logger.debug("getListSchedUsers") + sql = "SELECT token,scheduleruser 
FROM ATLAS_PANDA.pilottoken WHERE expires>CURRENT_DATE" + # start transaction + self.conn.begin() + # execute + self.cur.arraysize = 100 + self.cur.execute(sql+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + retVal = {} + for token,scheduleruser in res: + retVal[token] = scheduleruser + _logger.debug("getListSchedUsers->%s" % str(retVal)) + return retVal + except: + # roll back + self._rollback() + # error + type, value, traceBack = sys.exc_info() + _logger.error("getListSchedUsers : %s %s" % (type, value)) + return {} + + + ########################################################################### + # + # LogDBProxy stuff + + # update site data + def updateSiteData(self,hostID,pilotRequests): + comment = ' /* DBProxy.updateSiteData */' + _logger.debug("updateSiteData start") + sqlDel = "DELETE FROM ATLAS_PANDAMETA.SiteData WHERE HOURS=:HOURS AND LASTMOD<:LASTMOD" + sqlCh = "SELECT count(*) FROM ATLAS_PANDAMETA.SiteData WHERE FLAG=:FLAG AND HOURS=:HOURS AND SITE=:SITE" + sqlIn = "INSERT INTO ATLAS_PANDAMETA.SiteData (SITE,FLAG,HOURS,GETJOB,UPDATEJOB,LASTMOD," + sqlIn += "NSTART,FINISHED,FAILED,DEFINED,ASSIGNED,WAITING,ACTIVATED,HOLDING,RUNNING,TRANSFERRING) " + sqlIn += "VALUES (:SITE,:FLAG,:HOURS,:GETJOB,:UPDATEJOB,CURRENT_DATE," + sqlIn += "0,0,0,0,0,0,0,0,0,0)" + sqlUp = "UPDATE ATLAS_PANDAMETA.SiteData SET GETJOB=:GETJOB,UPDATEJOB=:UPDATEJOB,LASTMOD=CURRENT_DATE " + sqlUp += "WHERE FLAG=:FLAG AND HOURS=:HOURS AND SITE=:SITE" + sqlAll = "SELECT getJob,updateJob,FLAG FROM ATLAS_PANDAMETA.SiteData WHERE HOURS=:HOURS AND SITE=:SITE" + try: + # delete old records + varMap = {} + varMap[':HOURS'] = 3 + varMap[':LASTMOD'] = datetime.datetime.utcnow()-datetime.timedelta(hours=varMap[':HOURS']) + self.conn.begin() + self.cur.execute(sqlDel+comment,varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # shuffle to avoid concatenation + tmpSiteList = pilotRequests.keys() + random.shuffle(tmpSiteList) + # loop over all sites + for tmpSite in tmpSiteList: + tmpVal = pilotRequests[tmpSite] + # start transaction + self.conn.begin() + # check individual host info first + varMap = {} + varMap[':FLAG'] = hostID + varMap[':SITE'] = tmpSite + varMap[':HOURS'] = 3 + self.cur.arraysize = 10 + self.cur.execute(sqlCh+comment,varMap) + res = self.cur.fetchone() + # row exists or not + if res[0] == 0: + sql = sqlIn + else: + sql = sqlUp + if tmpVal.has_key('getJob'): + varMap[':GETJOB'] = len(tmpVal['getJob']) + else: + varMap[':GETJOB'] = 0 + if tmpVal.has_key('updateJob'): + varMap[':UPDATEJOB'] = len(tmpVal['updateJob']) + else: + varMap[':UPDATEJOB'] = 0 + # update + self.cur.execute(sql+comment,varMap) + # get all info + sumExist = False + varMap = {} + varMap[':SITE'] = tmpSite + varMap[':HOURS'] = 3 + self.cur.arraysize = 100 + self.cur.execute(sqlAll+comment,varMap) + res = self.cur.fetchall() + # get total getJob/updateJob + varMap[':GETJOB'] = 0 + varMap[':UPDATEJOB'] = 0 + nCol = 0 + for tmpGetJob,tmpUpdateJob,tmpFlag in res: + # don't use summed info + if tmpFlag == 'production': + sumExist = True + continue + if tmpFlag == 'analysis': + if tmpSite.startswith('ANALY_'): + sumExist = True + continue + if tmpFlag in ['test']: + continue + # sum + varMap[':GETJOB'] += tmpGetJob + varMap[':UPDATEJOB'] += tmpUpdateJob + nCol += 1 + # get average + if nCol != 0: + if varMap[':GETJOB'] >= nCol: + varMap[':GETJOB'] /= nCol + if varMap[':UPDATEJOB'] >= nCol: + varMap[':UPDATEJOB'] /= nCol + if 
tmpSite.startswith('ANALY_'): + varMap[':FLAG'] = 'analysis' + else: + varMap[':FLAG'] = 'production' + # row exists or not + if sumExist: + sql = sqlUp + else: + sql = sqlIn + # update + self.cur.execute(sql+comment,varMap) + _logger.debug('updateSiteData : %s getJob=%s updateJob=%s' % \ + (tmpSite,varMap[':GETJOB'],varMap[':UPDATEJOB'])) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("updateSiteData done") + return True + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("updateSiteData : %s %s" % (type,value)) + return False + + + # get site data + def getCurrentSiteData(self): + comment = ' /* DBProxy.getCurrentSiteData */' + _logger.debug("getCurrentSiteData") + sql = "SELECT SITE,getJob,updateJob,FLAG FROM ATLAS_PANDAMETA.SiteData WHERE FLAG IN (:FLAG1,:FLAG2) and HOURS=3" + varMap = {} + varMap[':FLAG1'] = 'production' + varMap[':FLAG2'] = 'analysis' + try: + # set autocommit on + self.conn.begin() + # select + self.cur.arraysize = 10000 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + ret = {} + for site,getJob,updateJob,flag in res: + if site.startswith('ANALY_'): + if flag != 'analysis': + continue + else: + if flag != 'production': + continue + ret[site] = {'getJob':getJob,'updateJob':updateJob} + return ret + except: + type, value, traceBack = sys.exc_info() + _logger.error("getCurrentSiteData : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # insert nRunning in site data + def insertnRunningInSiteData(self): + comment = ' /* DBProxy.insertnRunningInSiteData */' + _logger.debug("insertnRunningInSiteData start") + sqlDel = "DELETE FROM ATLAS_PANDAMETA.SiteData WHERE FLAG IN (:FLAG1,:FLAG2) AND LASTMOD= nSiteRow: + continue + tmpIdx += 1 + if usingGroup: + workingGroup = tmpItem[tmpIdx] + tmpIdx += 1 + else: + workingGroup = None + if usingType: + processingType = tmpItem[tmpIdx] + tmpIdx += 1 + # get process group + processGroup = ProcessGroups.getProcessGroup(processingType) + else: + processingType = None + processGroup = None + if usingPrio: + currentPriority = tmpItem[tmpIdx] + tmpIdx += 1 + else: + currentPriority = None + cnt = tmpItem[tmpIdx] + tmpIdx += 1 + maxPriority = tmpItem[tmpIdx] + # append processingType list + if not processGroupInQueueMap.has_key(processGroup): + processGroupInQueueMap[processGroup] = [] + if not processingType in processGroupInQueueMap[processGroup]: + processGroupInQueueMap[processGroup].append(processingType) + # count the number of jobs for each policy + for tmpShareDef in shareDefList: + policyName = tmpShareDef['policy']['name'] + # use different list based on usage of priority + if tmpShareDef['policy']['priority'] == None: + groupInDefList = self.faresharePolicy[siteName]['groupList'] + typeInDefList = self.faresharePolicy[siteName]['typeList'][tmpShareDef['policy']['group']] + else: + groupInDefList = self.faresharePolicy[siteName]['groupListWithPrio'] + typeInDefList = self.faresharePolicy[siteName]['typeListWithPrio'][tmpShareDef['policy']['group']] + # check working group + if usingGroup: + if tmpShareDef['policy']['group'] == None: + # catchall doesn't contain WGs used by other policies + if workingGroup != None and workingGroup in groupInDefList: + continue + # check for wildcard + toBeSkippedFlag = False + for tmpPattern in groupInDefList: + if '*' in tmpPattern: + tmpPattern = '^' + tmpPattern.replace('*','.*') + '$' + # 
don't use WG if it is included in other policies + if re.search(tmpPattern,workingGroup) != None: + toBeSkippedFlag = True + break + if toBeSkippedFlag: + continue + else: + # needs to be matched if it is specified in the policy + if '*' in tmpShareDef['policy']['group']: + # using wild card + tmpPattern = '^' + tmpShareDef['policy']['group'].replace('*','.*') + '$' + if re.search(tmpPattern,workingGroup) == None: + continue + else: + if tmpShareDef['policy']['group'] != workingGroup: + continue + # collect real WGs per defined WG mainly for wildcard + if not workingGroupInQueueMap.has_key(tmpShareDef['policy']['group']): + workingGroupInQueueMap[tmpShareDef['policy']['group']] = [] + if not workingGroup in workingGroupInQueueMap[tmpShareDef['policy']['group']]: + workingGroupInQueueMap[tmpShareDef['policy']['group']].append(workingGroup) + # check processingType + if usingType: + if tmpShareDef['policy']['type'] == None: + # catchall doesn't contain processGroups used by other policies + if processGroup != None and processGroup in typeInDefList: + continue + else: + # needs to be matched if it is specified in the policy + if tmpShareDef['policy']['type'] != processGroup: + continue + # check priority + if usingPrio: + if currentPriority != None and tmpShareDef['policy']['priority'] != None: + if tmpShareDef['policy']['prioCondition'] == '>': + if currentPriority <= tmpShareDef['policy']['priority']: + continue + elif tmpShareDef['policy']['prioCondition'] == '>=': + if currentPriority < tmpShareDef['policy']['priority']: + continue + elif tmpShareDef['policy']['prioCondition'] == '<=': + if currentPriority > tmpShareDef['policy']['priority']: + continue + elif tmpShareDef['policy']['prioCondition'] == '<': + if currentPriority >= tmpShareDef['policy']['priority']: + continue + # append job status + if not tmpShareDef['count'].has_key(jobStatus): + tmpShareDef['count'][jobStatus] = 0 + # sum + tmpShareDef['count'][jobStatus] += cnt + # max priority + if not tmpShareDef['maxprio'].has_key(jobStatus): + tmpShareDef['maxprio'][jobStatus] = maxPriority + elif tmpShareDef['maxprio'][jobStatus] < maxPriority: + tmpShareDef['maxprio'][jobStatus] = maxPriority + # loop over all policies to calcurate total number of running jobs and total share + totalRunning = 0 + shareMap = {} + msgShare = 'share->' + msgShareMap = {} + totalShareNonGP = 0 + totalRunningNonGP = 0 + totalActiveShareNonGP = 0 + for tmpShareDef in shareDefList: + tmpNumMap = tmpShareDef['count'] + policyName = tmpShareDef['policy']['name'] + # policies with priorities are used only to limit the numer of jobs + if tmpShareDef['policy']['priority'] != None: + continue + # the number of activated jobs + if not tmpNumMap.has_key('activated') or tmpNumMap['activated'] == 0: + tmpNumActivated = 0 + else: + tmpNumActivated = tmpNumMap['activated'] + # get share, removing % + tmpShareValue = tmpShareDef['policy']['share'][:-1] + tmpShareValue = int(tmpShareValue) + # get the number of runnig + if not tmpNumMap.has_key('running'): + tmpNumRunning = 0 + else: + tmpNumRunning = tmpNumMap['running'] + # debug message for share + msgShareMap[policyName] = '%s:activated=%s:running=%s' % (policyName,tmpNumActivated,tmpNumRunning) + # get total share and total number of running jobs for non-GP + if tmpShareDef['policy']['group'] == None: + totalShareNonGP += tmpShareValue + totalRunningNonGP += tmpNumRunning + # get total share for active non-GP + if tmpNumActivated != 0: + totalActiveShareNonGP += tmpShareValue + # sum + totalRunning += 
tmpNumRunning + # not use the policy if no activated jobs + if tmpNumActivated == 0: + continue + # max priority + maxPriority = 0 + if tmpShareDef['maxprio'].has_key('activated'): + maxPriority = tmpShareDef['maxprio']['activated'] + # append + shareMap[policyName] = { + 'share':tmpShareValue, + 'running':tmpNumRunning, + 'policy':tmpShareDef['policy'], + 'maxprio':maxPriority, + } + # re-normalize when some non-GP policies are inactive + if totalShareNonGP != totalActiveShareNonGP and totalActiveShareNonGP != 0: + for policyName,tmpVarMap in shareMap.iteritems(): + # essentially non-GP share is multiplied by totalShareNonGP/totalActiveShareNonGP + if tmpVarMap['policy']['group'] == None: + tmpVarMap['share'] *= totalShareNonGP + else: + tmpVarMap['share'] *= totalActiveShareNonGP + # make message with share info + for policyName in msgShareMap.keys(): + if shareMap.has_key(policyName): + msgShare += '%s:share=%s,' % (msgShareMap[policyName],shareMap[policyName]['share']) + else: + msgShare += '%s:share=0,' % msgShareMap[policyName] + # get total share + totalShare = 0 + for policyName,tmpVarMap in shareMap.iteritems(): + totalShare += tmpVarMap['share'] + msgShare = msgShare[:-1] + # loop over all policies to check if the priority constraint should be activated + prioToBeImposed = [] + msgPrio = '' + if usingPrio: + msgPrio += 'prio->' + for tmpShareDef in shareDefList: + tmpNumMap = tmpShareDef['count'] + policyName = tmpShareDef['policy']['name'] + # only policies with priorities are used to limit the numer of jobs + if tmpShareDef['policy']['priority'] == None: + continue + # get the number of runnig + if not tmpNumMap.has_key('running'): + tmpNumRunning = 0 + else: + tmpNumRunning = tmpNumMap['running'] + # the number of activated jobs + if not tmpNumMap.has_key('activated') or tmpNumMap['activated'] == 0: + tmpNumActivated = 0 + else: + tmpNumActivated = tmpNumMap['activated'] + # get limit + tmpLimitValue = tmpShareDef['policy']['share'] + # check if more jobs are running than the limit + toBeImposed = False + if tmpLimitValue.endswith('%'): + # percentage based + tmpLimitValue = tmpLimitValue[:-1] + if float(tmpNumRunning) > float(totalRunning) * float(tmpLimitValue) / 100.0: + toBeImposed = True + # debug message for prio + msgPrio += '%s:total=%s:running=%s:impose=%s,' % (policyName,totalRunning,tmpNumRunning,toBeImposed) + else: + # number based + if tmpNumRunning > int(tmpLimitValue): + toBeImposed = True + # debug message for prio + msgPrio += '%s:running=%s:impose=%s,' % (policyName,tmpNumRunning,toBeImposed) + # append + if toBeImposed: + prioToBeImposed.append(tmpShareDef['policy']) + msgPrio = msgPrio[:-1] + # no activated + if shareMap == {}: + _logger.debug("getCriteriaForProdShare %s : ret=None - no activated" % siteName) + return retForNone + # no running + if totalRunning == 0: + _logger.debug("getCriteriaForProdShare %s : ret=None - no running" % siteName) + return retForNone + # zero share + if totalShare == 0: + _logger.debug("getCriteriaForProdShare %s : ret=None - zero share" % siteName) + return retForNone + # select the group where share most diverges from the definition + lowestShareRatio = None + lowestSharePolicy = None + for policyName,tmpVarMap in shareMap.iteritems(): + # ignore zero share + if tmpVarMap['share'] == 0: + continue + tmpShareDef = float(tmpVarMap['share']) / float(totalShare) + tmpShareNow = float(tmpVarMap['running']) / float(totalRunning) + tmpShareRatio = tmpShareNow / tmpShareDef + # take max priority into account for cloud share + if 
usingCloud != '': + # skip over share + if tmpShareNow > tmpShareDef: + continue + tmpShareRatio /= float(1000 + tmpVarMap['maxprio']) + if lowestShareRatio == None or lowestShareRatio > tmpShareRatio: + lowestShareRatio = tmpShareRatio + lowestSharePolicy = policyName + # make criteria + retVarMap = {} + retStr = '' + if lowestSharePolicy != None: + tmpShareDef = shareMap[lowestSharePolicy]['policy'] + # working group + if tmpShareDef['group'] == None: + groupInDefList = self.faresharePolicy[siteName]['groupList'] + # catch all except WGs used by other policies + if groupInDefList != []: + groupUsedInClause = [] + tmpIdx = 0 + # use real name of workingGroup + for tmpGroupIdx in groupInDefList: + if not workingGroupInQueueMap.has_key(tmpGroupIdx): + continue + for tmpGroup in workingGroupInQueueMap[tmpGroupIdx]: + if tmpGroup in groupUsedInClause: + continue + # add AND at the first WG + if groupUsedInClause == []: + retStr += 'AND workingGroup NOT IN (' + # add WG + tmpKey = ':shareWG%s' % tmpIdx + retVarMap[tmpKey] = tmpGroup + retStr += '%s,' % tmpKey + tmpIdx += 1 + # append + groupUsedInClause.append(tmpGroup) + if groupUsedInClause != []: + retStr = retStr[:-1] + retStr += ') ' + else: + # match with one WG + if workingGroupInQueueMap.has_key(tmpShareDef['group']): + groupUsedInClause = [] + tmpIdx = 0 + # use real name of workingGroup + for tmpGroup in workingGroupInQueueMap[tmpShareDef['group']]: + if tmpGroup in groupUsedInClause: + continue + # add AND at the first WG + if groupUsedInClause == []: + retStr += 'AND workingGroup IN (' + # add WG + tmpKey = ':shareWG%s' % tmpIdx + retVarMap[tmpKey] = tmpGroup + retStr += '%s,' % tmpKey + tmpIdx += 1 + # append + groupUsedInClause.append(tmpGroup) + if groupUsedInClause != []: + retStr = retStr[:-1] + retStr += ') ' + # processing type + if tmpShareDef['type'] == None: + typeInDefList = self.faresharePolicy[siteName]['typeList'][tmpShareDef['group']] + # catch all except WGs used by other policies + if typeInDefList != []: + # get the list of processingTypes from the list of processGroups + retVarMapP = {} + retStrP = 'AND processingType NOT IN (' + tmpIdx = 0 + for tmpTypeGroup in typeInDefList: + if processGroupInQueueMap.has_key(tmpTypeGroup): + for tmpType in processGroupInQueueMap[tmpTypeGroup]: + tmpKey = ':sharePT%s' % tmpIdx + retVarMapP[tmpKey] = tmpType + retStrP += '%s,' % tmpKey + tmpIdx += 1 + retStrP = retStrP[:-1] + retStrP += ') ' + # copy + if retVarMapP != {}: + retStr += retStrP + for tmpKey,tmpType in retVarMapP.iteritems(): + retVarMap[tmpKey] = tmpType + else: + # match with one processingGroup + if processGroupInQueueMap.has_key(tmpShareDef['type']) and processGroupInQueueMap[tmpShareDef['type']] != []: + retStr += 'AND processingType IN (' + tmpIdx = 0 + for tmpType in processGroupInQueueMap[tmpShareDef['type']]: + tmpKey = ':sharePT%s' % tmpIdx + retVarMap[tmpKey] = tmpType + retStr += '%s,' % tmpKey + tmpIdx += 1 + retStr = retStr[:-1] + retStr += ') ' + # priority + tmpIdx = 0 + for tmpDefItem in prioToBeImposed: + if tmpDefItem['group'] in [None,tmpShareDef['group']] and \ + tmpDefItem['type'] in [None,tmpShareDef['type']]: + if tmpDefItem['prioCondition'] == '>': + retStrP = '<=' + elif tmpDefItem['prioCondition'] == '>=': + retStrP = '<' + elif tmpDefItem['prioCondition'] == '<=': + retStrP = '>' + elif tmpDefItem['prioCondition'] == '<': + retStrP = '>=' + else: + continue + tmpKey = ':sharePrio%s' % tmpIdx + retVarMap[tmpKey] = tmpDefItem['priority'] + retStr += ('AND currentPriority%s%s' % 
(retStrP,tmpKey)) + tmpIdx += 1 + _logger.debug("getCriteriaForProdShare %s : sql='%s' var=%s cloud=%s %s %s" % \ + (siteName,retStr,str(retVarMap),usingCloud,msgShare,msgPrio)) + # append criteria for test jobs + if retStr != '': + retVarMap[':shareLabel1'] = 'managed' + retVarMap[':shareLabel2'] = 'test' + retVarMap[':shareLabel3'] = 'prod_test' + retVarMap[':shareLabel4'] = 'install' + retStr = 'AND (prodSourceLabel IN (:shareLabel2,:shareLabel3,:shareLabel4) OR (prodSourceLabel=:shareLabel1 ' + retStr + '))' + return retStr,retVarMap + except: + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getCriteriaForProdShare %s : %s %s" % (siteName,errtype,errvalue)) + # roll back + self._rollback() + return retForNone + + + # get beyond pledge resource ratio + def getPledgeResourceRatio(self): + comment = ' /* DBProxy.getPledgeResourceRatio */' + # check utime + if self.updateTimeForPledgeRatio != None and (datetime.datetime.utcnow()-self.updateTimeForPledgeRatio) < datetime.timedelta(hours=3): + return + # update utime + self.updateTimeForPledgeRatio = datetime.datetime.utcnow() + _logger.debug("getPledgeResourceRatio") + try: + # set autocommit on + self.conn.begin() + # select + sql = "SELECT siteid,countryGroup,availableCPU,availableStorage,pledgedCPU,pledgedStorage " + sql += "FROM ATLAS_PANDAMETA.schedconfig WHERE countryGroup IS NOT NULL AND siteid LIKE 'ANALY_%' " + self.cur.arraysize = 100000 + self.cur.execute(sql+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # update ratio + self.beyondPledgeRatio = {} + if res != None and len(res) != 0: + for siteid,countryGroup,tmp_availableCPU,tmp_availableStorage,tmp_pledgedCPU,tmp_pledgedStorage in res: + # ignore when countryGroup is undefined + if countryGroup in ['',None]: + continue + # append + self.beyondPledgeRatio[siteid] = {} + self.beyondPledgeRatio[siteid]['countryGroup'] = countryGroup + # convert to float + try: + availableCPU = float(tmp_availableCPU) + except: + availableCPU = 0 + try: + pledgedCPU = float(tmp_pledgedCPU) + except: + pledgedCPU = 0 + # calculate ratio + if availableCPU == 0 or pledgedCPU == 0: + # set 0% when CPU ratio is undefined + self.beyondPledgeRatio[siteid]['ratio'] = 0 + else: + # ratio = (availableCPU-pledgedCPU)/availableCPU*(1-storageTerm) + self.beyondPledgeRatio[siteid]['ratio'] = (availableCPU-pledgedCPU)/availableCPU + _logger.debug("getPledgeResourceRatio -> %s" % str(self.beyondPledgeRatio)) + return + except: + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getPledgeResourceRatio : %s %s" % (errtype,errvalue)) + # roll back + self._rollback() + return + + + # get fareshare policy + def getFaresharePolicy(self,getNewMap=False): + comment = ' /* DBProxy.getFaresharePolicy */' + # check utime + if not getNewMap and self.updateTimeForFaresharePolicy != None and \ + (datetime.datetime.utcnow()-self.updateTimeForFaresharePolicy) < datetime.timedelta(hours=3): + return + if not getNewMap: + # update utime + self.updateTimeForFaresharePolicy = datetime.datetime.utcnow() + _logger.debug("getFaresharePolicy") + try: + # set autocommit on + self.conn.begin() + # get default share + cloudShareMap = {} + cloudTier1Map = {} + sqlD = "SELECT name,fairshare,tier1 FROM ATLAS_PANDAMETA.cloudconfig" + self.cur.arraysize = 100000 + self.cur.execute(sqlD+comment) + res = self.cur.fetchall() + for cloudName,cloudShare,cloudTier1 in res: + try: + cloudTier1Map[cloudName] = cloudTier1.split(',') + except: + pass + if not cloudShare in 
['',None]: + cloudShareMap[cloudName] = cloudShare + # get share per site + sql = "SELECT siteid,fairsharePolicy,cloud " + sql += "FROM ATLAS_PANDAMETA.schedconfig WHERE NOT siteid LIKE 'ANALY_%' GROUP BY siteid,fairsharePolicy,cloud" + self.cur.execute(sql+comment) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # update policy + faresharePolicy = {} + for siteid,faresharePolicyStr,cloudName in res: + try: + # share is undefined + usingCloudShare = '' + if faresharePolicyStr in ['',None]: + # skip if share is not defined at site or cloud + if not cloudShareMap.has_key(cloudName): + continue + # skip if T1 doesn't define share + if cloudTier1Map.has_key(cloudName) and siteid in cloudTier1Map[cloudName]: + continue + # use cloud share + faresharePolicyStr = cloudShareMap[cloudName] + usingCloudShare = cloudName + # decompose + hasNonPrioPolicy = False + for tmpItem in faresharePolicyStr.split(','): + # skip empty + tmpItem = tmpItem.strip() + if tmpItem == '': + continue + # keep name + tmpPolicy = {'name':tmpItem} + # group + tmpPolicy['group'] = None + tmpMatch = re.search('group=([^:]+)',tmpItem) + if tmpMatch != None: + if tmpMatch.group(1) in ['','central','*','any']: + # use None for catchall + pass + else: + tmpPolicy['group'] = tmpMatch.group(1) + # type + tmpPolicy['type'] = None + tmpMatch = re.search('type=([^:]+)',tmpItem) + if tmpMatch != None: + if tmpMatch.group(1) in ['*','any']: + # use None for catchall + pass + else: + tmpPolicy['type'] = tmpMatch.group(1) + # priority + tmpPolicy['priority'] = None + tmpPolicy['prioCondition'] = None + tmpMatch = re.search('priority([=<>]+)(\d+)',tmpItem) + if tmpMatch != None: + tmpPolicy['priority'] = int(tmpMatch.group(2)) + tmpPolicy['prioCondition'] = tmpMatch.group(1) + else: + hasNonPrioPolicy = True + # share + tmpPolicy['share'] = tmpItem.split(':')[-1] + # append + if not faresharePolicy.has_key(siteid): + faresharePolicy[siteid] = {'policyList':[]} + faresharePolicy[siteid]['policyList'].append(tmpPolicy) + # add any:any if only priority policies + if not hasNonPrioPolicy: + tmpPolicy = {'name' : 'type=any', + 'group' : None, + 'type' : None, + 'priority' : None, + 'prioCondition' : None, + 'share' : '100%'} + faresharePolicy[siteid]['policyList'].append(tmpPolicy) + # some translation + faresharePolicy[siteid]['usingGroup'] = False + faresharePolicy[siteid]['usingType'] = False + faresharePolicy[siteid]['usingPrio'] = False + faresharePolicy[siteid]['usingCloud'] = usingCloudShare + faresharePolicy[siteid]['groupList'] = [] + faresharePolicy[siteid]['typeList'] = {} + faresharePolicy[siteid]['groupListWithPrio'] = [] + faresharePolicy[siteid]['typeListWithPrio'] = {} + for tmpDefItem in faresharePolicy[siteid]['policyList']: + # using WG + if tmpDefItem['group'] != None: + faresharePolicy[siteid]['usingGroup'] = True + # using PG + if tmpDefItem['type'] != None: + faresharePolicy[siteid]['usingType'] = True + # using prio + if tmpDefItem['priority'] != None: + faresharePolicy[siteid]['usingPrio'] = True + # get list of WG and PG with/without priority + if tmpDefItem['priority'] == None: + # get list of woringGroups + if tmpDefItem['group'] != None and not tmpDefItem['group'] in faresharePolicy[siteid]['groupList']: + faresharePolicy[siteid]['groupList'].append(tmpDefItem['group']) + # get list of processingGroups + if not faresharePolicy[siteid]['typeList'].has_key(tmpDefItem['group']): + faresharePolicy[siteid]['typeList'][tmpDefItem['group']] = [] + if 
tmpDefItem['type'] != None and not tmpDefItem['type'] in faresharePolicy[siteid]['typeList'][tmpDefItem['group']]: + faresharePolicy[siteid]['typeList'][tmpDefItem['group']].append(tmpDefItem['type']) + else: + # get list of woringGroups + if tmpDefItem['group'] != None and not tmpDefItem['group'] in faresharePolicy[siteid]['groupListWithPrio']: + faresharePolicy[siteid]['groupListWithPrio'].append(tmpDefItem['group']) + # get list of processingGroups + if not faresharePolicy[siteid]['typeListWithPrio'].has_key(tmpDefItem['group']): + faresharePolicy[siteid]['typeListWithPrio'][tmpDefItem['group']] = [] + if tmpDefItem['type'] != None and not tmpDefItem['type'] in faresharePolicy[siteid]['typeListWithPrio'][tmpDefItem['group']]: + faresharePolicy[siteid]['typeListWithPrio'][tmpDefItem['group']].append(tmpDefItem['type']) + except: + errtype,errvalue = sys.exc_info()[:2] + _logger.warning("getFaresharePolicy : wrond definition '%s' for %s : %s %s" % (faresharePolicy,siteid,errtype,errvalue)) + _logger.debug("getFaresharePolicy -> %s" % str(faresharePolicy)) + if not getNewMap: + self.faresharePolicy = faresharePolicy + return + else: + return faresharePolicy + except: + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getFaresharePolicy : %s %s" % (errtype,errvalue)) + # roll back + self._rollback() + if not getNewMap: + return + else: + return {} + + + # get cloud list + def getCloudList(self): + comment = ' /* DBProxy.getCloudList */' + _logger.debug("getCloudList start") + try: + # set autocommit on + self.conn.begin() + # select + sql = "SELECT name,tier1,tier1SE,relocation,weight,server,status,transtimelo," + sql += "transtimehi,waittime,validation,mcshare,countries,fasttrack,nprestage," + sql += "pilotowners " + sql+= "FROM ATLAS_PANDAMETA.cloudconfig" + self.cur.arraysize = 10000 + self.cur.execute(sql+comment) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + ret = {} + if resList != None and len(resList) != 0: + for res in resList: + # change None to '' + resTmp = [] + for tmpItem in res: + if tmpItem == None: + tmpItem = '' + resTmp.append(tmpItem) + name,tier1,tier1SE,relocation,weight,server,status,transtimelo,transtimehi,\ + waittime,validation,mcshare,countries,fasttrack,nprestage,pilotowners = resTmp + # instantiate CloudSpec + tmpC = CloudSpec.CloudSpec() + tmpC.name = name + tmpC.tier1 = tier1 + tmpC.tier1SE = re.sub(' ','',tier1SE).split(',') + tmpC.relocation = relocation + tmpC.weight = weight + tmpC.server = server + tmpC.status = status + tmpC.transtimelo = transtimelo + tmpC.transtimehi = transtimehi + tmpC.waittime = waittime + tmpC.validation = validation + tmpC.mcshare = mcshare + tmpC.countries = countries + tmpC.fasttrack = fasttrack + tmpC.nprestage = nprestage + tmpC.pilotowners = pilotowners + # append + ret[name] = tmpC + _logger.debug("getCloudList done") + return ret + except: + type, value, traceBack = sys.exc_info() + _logger.error("getCloudList : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # check sites with release/cache + def checkSitesWithRelease(self,sites,releases,caches,cmtConfig=None): + comment = ' /* DBProxy.checkSitesWithRelease */' + try: + relStr = releases + if releases != None: + relStr = releases.replace('\n',' ') + caStr = caches + if caches != None: + caStr = caches.replace('\n',' ') + _logger.debug("checkSitesWithRelease(%s,%s,%s,%s)" % (sites,relStr,caStr,cmtConfig)) + # select + sql = "SELECT distinct siteid FROM ATLAS_PANDAMETA.InstalledSW 
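For reference, the fairsharePolicy strings decomposed in getFaresharePolicy above are comma-separated items made of colon-joined key=value fields ending in a share. A rough, standalone sketch of that decomposition (the sample policy string is invented; real values come from schedconfig.fairsharePolicy):

import re

def parse_fairshare_policy(policy_str):
    # roughly mirrors the per-item decomposition in getFaresharePolicy
    policies = []
    for item in policy_str.split(','):
        item = item.strip()
        if item == '':
            continue
        policy = {'name': item, 'group': None, 'type': None,
                  'priority': None, 'prioCondition': None}
        m = re.search(r'group=([^:]+)', item)
        if m is not None and m.group(1) not in ['', 'central', '*', 'any']:
            policy['group'] = m.group(1)
        m = re.search(r'type=([^:]+)', item)
        if m is not None and m.group(1) not in ['*', 'any']:
            policy['type'] = m.group(1)
        m = re.search(r'priority([=<>]+)(\d+)', item)
        if m is not None:
            policy['priority'] = int(m.group(2))
            policy['prioCondition'] = m.group(1)
        # the share is whatever follows the last colon
        policy['share'] = item.split(':')[-1]
        policies.append(policy)
    return policies

# invented example: one group-specific share plus a catch-all
print(parse_fairshare_policy('group=AP_HIGGS:type=evgen:20%,type=any:80%'))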
WHERE " + loopKey2 = None + loopValues2 = [] + if not caches in ['','NULL',None]: + loopKey = ':cache' + loopValues = caches.split('\n') + sql += "cache=:cache " + if not releases in ['','NULL',None]: + loopKey2 = ':release' + loopValues2 = releases.split('\n') + sql += "AND release=:release " + elif not releases in ['','NULL',None]: + loopKey = ':release' + loopValues = releases.split('\n') + sql += "release=:release AND cache='None' " + else: + # don't check + return sites + checkCMT = False + if not cmtConfig in ['','NULL',None]: + sql += "AND cmtConfig=:cmtConfig " + checkCMT = True + sql += "AND siteid IN (" + # start transaction + self.conn.begin() + self.cur.arraysize = 1000 + # loop over all releases/caches + for loopIdx,loopVal in enumerate(loopValues): + # remove Atlas- + loopVal = re.sub('^Atlas-','',loopVal) + sqlSite = sql + varMap = {} + varMap[loopKey] = loopVal + if loopKey2 != None: + loopVal2 = loopValues2[loopIdx] + loopVal2 = re.sub('^Atlas-','',loopVal2) + varMap[loopKey2] = loopVal2 + if checkCMT: + varMap[':cmtConfig'] = cmtConfig + tmpRetSites = [] + # loop over sites + nSites = 10 + iSite = 0 + for siteIndex,site in enumerate(sites): + iSite += 1 + tmpSiteKey = ':siteid%s' % iSite + varMap[tmpSiteKey] = site + sqlSite += '%s,' % tmpSiteKey + if iSite == nSites or (siteIndex+1) == len(sites): + iSite = 0 + # close bracket in SQL + sqlSite = sqlSite[:-1] + sqlSite += ')' + # execute + _logger.debug(sqlSite+comment+str(varMap)) + self.cur.execute(sqlSite+comment, varMap) + resList = self.cur.fetchall() + # collect candidates + if len(resList) > 0: + for tmpSite, in resList: + # append + tmpRetSites.append(tmpSite) + # reset + sqlSite = sql + varMap = {} + varMap[loopKey] = loopVal + if loopKey2 != None: + varMap[loopKey2] = loopVal2 + if checkCMT: + varMap[':cmtConfig'] = cmtConfig + # set + sites = tmpRetSites + # escape + if sites == []: + break + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("checkSitesWithRelease -> %s" % sites) + return sites + except: + # roll back + self._rollback() + type,value,traceBack = sys.exc_info() + _logger.error("checkSitesWithRelease : %s %s" % (type,value)) + return [] + + + # get sites with release/cache in cloud + def getSitesWithReleaseInCloud(self,cloud,releases,caches,validation): + comment = ' /* DBProxy.getSitesWithReleaseInCloud */' + try: + relStr = releases + if releases != None: + relStr = releases.replace('\n',' ') + caStr = caches + if caches != None: + caStr = caches.replace('\n',' ') + _logger.debug("getSitesWithReleaseInCloud(%s,%s,%s,%s)" % (cloud,relStr,caStr,validation)) + # select + sql = "SELECT distinct siteid FROM ATLAS_PANDAMETA.InstalledSW WHERE cloud=:cloud AND " + varMap = {} + varMap[':cloud'] = cloud + if not caches in ['','NULL',None]: + loopKey = ':cache' + loopValues = caches.split('\n') + sql += "cache=:cache " + else: + loopKey = ':release' + loopValues = releases.split('\n') + sql += "release=:release AND cache='None' " + # validation + if validation: + sql += "validation=:validation " + varMap[':validation'] = 'validated' + # start transaction + self.conn.begin() + self.cur.arraysize = 100 + # loop over all releases/caches + retSites = None + for loopVal in loopValues: + # remove Atlas- + loopVal = re.sub('^Atlas-','',loopVal) + varMap[loopKey] = loopVal + # execute + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + # append + tmpRetSites = [] + for tmpItem, in resList: + if retSites == None or 
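checkSitesWithRelease above avoids building one huge IN list by binding the candidate sites in chunks of ten (:siteid1 .. :siteid10) and re-issuing the query per chunk. A hedged sketch of just that bind-variable construction, independent of any database connection:

def build_site_chunks(base_sql, sites, chunk_size=10):
    # yield (sql, varMap) pairs with :siteidN bind variables, chunk_size sites per query
    for start in range(0, len(sites), chunk_size):
        chunk = sites[start:start + chunk_size]
        var_map = {}
        placeholders = []
        for i, site in enumerate(chunk, 1):
            key = ':siteid%s' % i
            var_map[key] = site
            placeholders.append(key)
        yield base_sql + '(' + ','.join(placeholders) + ')', var_map

# hypothetical usage with a trimmed-down version of the SQL used above
base = "SELECT distinct siteid FROM ATLAS_PANDAMETA.InstalledSW WHERE cache=:cache AND siteid IN "
for sql, var_map in build_site_chunks(base, ['SITE_A', 'SITE_B', 'SITE_C'], chunk_size=2):
    pass  # self.cur.execute(sql, var_map) in the real proxy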
(tmpItem in retSites): + tmpRetSites.append(tmpItem) + # set + retSites = tmpRetSites + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + retSites = [] + for tmpItem, in resList: + retSites.append(tmpItem) + _logger.debug("getSitesWithReleaseInCloud -> %s" % retSites) + return retSites + except: + # roll back + self._rollback() + type,value,traceBack = sys.exc_info() + _logger.error("getSitesWithReleaseInCloud : %s %s" % (type,value)) + return [] + + + # get list of cache prefix + def getCachePrefixes(self): + comment = ' /* DBProxy.getCachePrefixes */' + try: + _logger.debug("getCachePrefixes") + # select + sql = "SELECT distinct cache FROM ATLAS_PANDAMETA.installedSW WHERE cache IS NOT NULL" + # start transaction + self.conn.begin() + self.cur.arraysize = 10000 + # execute + self.cur.execute(sql+comment, {}) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + tmpList = [] + for tmpItem, in resList: + match = re.search('^([^-]+)-',tmpItem) + if match != None: + tmpPrefix = match.group(1) + if not tmpPrefix in tmpList: + tmpList.append(tmpPrefix) + _logger.debug("getCachePrefixes -> %s" % tmpList) + return tmpList + except: + # roll back + self._rollback() + type,value,traceBack = sys.exc_info() + _logger.error("getCachePrefixes : %s %s" % (type,value)) + return [] + + + # get pilot owners + def getPilotOwners(self): + comment = ' /* DBProxy.getPilotOwners */' + _logger.debug("getPilotOwners") + try: + # set autocommit on + self.conn.begin() + # select + sql = "SELECT pilotowners FROM ATLAS_PANDAMETA.cloudconfig" + self.cur.arraysize = 100 + self.cur.execute(sql+comment) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + ret = [] + for tmpItem, in resList: + if tmpItem != None: + for tmpOwner in tmpItem.split('|'): + if tmpOwner != '': + ret.append(tmpOwner) + _logger.debug("getPilotOwners -> %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + type,value,traceBack = sys.exc_info() + _logger.error("getPilotOwners : %s %s" % (type,value)) + return [] + + + # get allowed nodes + def getAllowedNodes(self): + comment = ' /* DBProxy.getAllowedNodes */' + _logger.debug("getAllowedNodes") + try: + # set autocommit on + self.conn.begin() + # select + sql = "SELECT siteid,allowedNode FROM ATLAS_PANDAMETA.schedconfig " + sql += "WHERE siteid IS NOT NULL AND allowedNode IS NOT NULL" + self.cur.arraysize = 1000 + self.cur.execute(sql+comment) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + ret = {} + for tmpSiteID,tmpAllowedNode in resList: + if not ret.has_key(tmpSiteID): + ret[tmpSiteID] = tmpAllowedNode.split(',') + _logger.debug("getAllowedNodes -> %s" % str(ret)) + return ret + except: + # roll back + self._rollback() + tmpType,tmpValue = sys.exc_info()[:2] + _logger.error("getAllowedNodes : %s %s" % (tmpType,tmpValue)) + return {} + + + # extract name from DN + def cleanUserID(self, id): + try: + up = re.compile('/(DC|O|OU|C|L)=[^\/]+') + username = up.sub('', id) + up2 = re.compile('/CN=[0-9]+') + username = up2.sub('', username) + up3 = re.compile(' [0-9]+') + username = up3.sub('', username) + up4 = re.compile('_[0-9]+') + username = up4.sub('', username) + username = username.replace('/CN=proxy','') + username = username.replace('/CN=limited proxy','') + username = username.replace('limited proxy','') + username = re.sub('/CN=Robot:[^/]+','',username) + pat = 
re.compile('.*/CN=([^\/]+)/CN=([^\/]+)') + mat = pat.match(username) + if mat: + username = mat.group(2) + else: + username = username.replace('/CN=','') + if username.lower().find('/email') > 0: + username = username[:username.lower().find('/email')] + pat = re.compile('.*(limited.*proxy).*') + mat = pat.match(username) + if mat: + username = mat.group(1) + username = username.replace('(','') + username = username.replace(')','') + username = username.replace("'",'') + return username + except: + return id + + + # extract scope from dataset name + def extractScope(self,name): + try: + if name.lower().startswith('user') or \ + name.lower().startswith('group'): + # return None if there are not enough fields + if len(name.split('.')) < 2: + return None + return name.lower().split('.')[0] + '.' + name.lower().split('.')[1] + return name.split('.')[0] + except: + return None + + + # check quota + def checkQuota(self,dn): + comment = ' /* DBProxy.checkQuota */' + _logger.debug("checkQuota %s" % dn) + try: + # set autocommit on + self.conn.begin() + # select + name = self.cleanUserID(dn) + sql = "SELECT cpua1,cpua7,cpua30,quotaa1,quotaa7,quotaa30 FROM ATLAS_PANDAMETA.users WHERE name=:name" + varMap = {} + varMap[':name'] = name + self.cur.arraysize = 10 + self.cur.execute(sql+comment,varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + weight = 0.0 + if res != None and len(res) != 0: + item = res[0] + # cpu and quota + cpu1 = item[0] + cpu7 = item[1] + cpu30 = item[2] + if item[3] in [0,None]: + quota1 = 0 + else: + quota1 = item[3] * 3600 + if item[4] in [0,None]: + quota7 = 0 + else: + quota7 = item[4] * 3600 + if item[5] in [0,None]: + quota30 = 0 + else: + quota30 = item[5] * 3600 + # CPU usage + if cpu1 == None: + cpu1 = 0.0 + # weight + if quota1 > 0: + weight = float(cpu1) / float(quota1) + # not exceeded the limit + if weight < 1.0: + weight = 0.0 + _logger.debug("checkQuota %s Weight:%s Quota:%s CPU:%s" % (dn,weight,quota1,cpu1)) + else: + _logger.debug("checkQuota cannot found %s" % dn) + return weight + except: + type, value, traceBack = sys.exc_info() + _logger.error("checkQuota : %s %s" % (type,value)) + # roll back + self._rollback() + return 0.0 + + + # get serialize JobID and status + def getUserParameter(self,dn,jobID,jobsetID): + comment = ' /* DBProxy.getUserParameter */' + _logger.debug("getUserParameter %s JobID=%s JobsetID=%s" % (dn,jobID,jobsetID)) + try: + # set initial values + retStatus = True + if jobsetID == -1: + # generate new jobsetID + retJobsetID = jobID + # new jobID = 1 + new jobsetID + retJobID = retJobsetID + 1 + elif jobsetID in ['NULL',None,0]: + # no jobsetID + retJobsetID = None + retJobID = jobID + else: + # user specified jobsetID + retJobsetID = jobsetID + retJobID = jobID + # set autocommit on + self.conn.begin() + # select + name = self.cleanUserID(dn) + sql = "SELECT jobid,status FROM ATLAS_PANDAMETA.users WHERE name=:name " + sql += "FOR UPDATE " + sqlAdd = "INSERT INTO ATLAS_PANDAMETA.users " + sqlAdd += "(ID,NAME,LASTMOD,FIRSTJOB,LATESTJOB,CACHETIME,NCURRENT,JOBID) " + sqlAdd += "VALUES(ATLAS_PANDAMETA.USERS_ID_SEQ.nextval,:name," + sqlAdd += "CURRENT_DATE,CURRENT_DATE,CURRENT_DATE,CURRENT_DATE,0,1) " + varMap = {} + varMap[':name'] = name + self.cur.execute(sql+comment,varMap) + self.cur.arraysize = 10 + res = self.cur.fetchall() + # insert if no record + if res == None or len(res) == 0: + try: + self.cur.execute(sqlAdd+comment,varMap) + retI = self.cur.rowcount + 
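extractScope above derives a scope from a dataset name: user.* and group.* datasets keep the first two dot-separated fields (lower-cased), everything else keeps only the first field. A small sketch with made-up dataset names to illustrate the intent:

def extract_scope(name):
    # same rule as DBProxy.extractScope, written as a standalone helper
    try:
        lowered = name.lower()
        if lowered.startswith('user') or lowered.startswith('group'):
            fields = lowered.split('.')
            if len(fields) < 2:
                return None
            return fields[0] + '.' + fields[1]
        return name.split('.')[0]
    except Exception:
        return None

# invented dataset names, for illustration only
assert extract_scope('user.jdoe.123456.myoutput') == 'user.jdoe'
assert extract_scope('mc12_8TeV.147806.Powheg.evgen.EVNT.e1879') == 'mc12_8TeV'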
_logger.debug("getUserParameter %s inserted new row with %s" % (dn,retI)) + # emulate DB response + res = [[1,'']] + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("getUserParameter %s failed to insert new row with %s:%s" % (dn,errType,errValue)) + if res != None and len(res) != 0: + item = res[0] + # JobID in DB + dbJobID = item[0] + # check status + if item[1] in ['disabled']: + retStatus = False + # use larger JobID + if dbJobID >= int(retJobID) or (jobsetID == -1 and dbJobID >= int(retJobsetID)): + if jobsetID == -1: + # generate new jobsetID = 1 + exsiting jobID + retJobsetID = dbJobID+1 + # new jobID = 1 + new jobsetID + retJobID = retJobsetID + 1 + else: + # new jobID = 1 + exsiting jobID + retJobID = dbJobID+1 + # update DB + varMap = {} + varMap[':name'] = name + varMap[':jobid'] = retJobID + sql = "UPDATE ATLAS_PANDAMETA.users SET jobid=:jobid WHERE name=:name" + self.cur.execute(sql+comment,varMap) + _logger.debug("getUserParameter set JobID=%s for %s" % (retJobID,dn)) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("getUserParameter %s return JobID=%s JobsetID=%s Status=%s" % (dn,retJobID,retJobsetID,retStatus)) + return retJobID,retJobsetID,retStatus + except: + type, value, traceBack = sys.exc_info() + _logger.error("getUserParameter : %s %s" % (type,value)) + # roll back + self._rollback() + return retJobID,retJobsetID,retStatus + + + # get JobID for user + def getJobIdUser(self,dn): + comment = ' /* DBProxy.getJobIdUser */' + _logger.debug("getJobIdUser %s" % dn) + jobID = 0 + try: + # set autocommit on + self.conn.begin() + # select + name = self.cleanUserID(dn) + sql = "SELECT jobid FROM ATLAS_PANDAMETA.users WHERE name=:name" + varMap = {} + varMap[':name'] = name + self.cur.arraysize = 10 + self.cur.execute(sql+comment,varMap) + res = self.cur.fetchone() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if res != None: + jobID, = res + _logger.debug("getJobIdUser %s -> %s" % (name,jobID)) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("getJobIdUser : %s %s" % (errType,errValue)) + # roll back + self._rollback() + return jobID + + + # check ban user + def checkBanUser(self,dn,sourceLabel): + comment = ' /* DBProxy.checkBanUser */' + _logger.debug("checkBanUser %s %s" % (dn,sourceLabel)) + try: + # set initial values + retStatus = True + # set autocommit on + self.conn.begin() + # select + name = self.cleanUserID(dn) + sql = "SELECT status FROM ATLAS_PANDAMETA.users WHERE name=:name" + varMap = {} + varMap[':name'] = name + self.cur.execute(sql+comment,varMap) + self.cur.arraysize = 10 + res = self.cur.fetchone() + if res != None: + # check status + tmpStatus, = res + if tmpStatus in ['disabled']: + retStatus = False + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("checkBanUser %s %s Status=%s" % (dn,sourceLabel,retStatus)) + return retStatus + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("checkBanUser %s %s : %s %s" % (dn,sourceLabel,errType,errValue)) + # roll back + self._rollback() + return retStatus + + + # get email address for a user + def getEmailAddr(self,name): + comment = ' /* DBProxy.getEmailAddr */' + _logger.debug("get email for %s" % name) + try: + # set autocommit on + self.conn.begin() + # select + sql = "SELECT email FROM ATLAS_PANDAMETA.users WHERE name=:name" + varMap = {} + varMap[':name'] = name + self.cur.execute(sql+comment,varMap) + self.cur.arraysize = 10 + res = self.cur.fetchall() + # 
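The JobID/JobsetID bookkeeping in getUserParameter above is easier to follow in isolation: jobsetID == -1 asks the server to allocate a fresh jobsetID (with a jobID one above it), a NULL/0 jobsetID keeps only the jobID, and in all cases the value already stored in the users table wins if it is not smaller. A minimal sketch of that decision, detached from the table update:

def next_ids(job_id, jobset_id, db_job_id):
    # mirrors the branching in getUserParameter; db_job_id is the users.jobid column
    if jobset_id == -1:
        ret_jobset_id = job_id
        ret_job_id = ret_jobset_id + 1
    elif jobset_id in ['NULL', None, 0]:
        ret_jobset_id = None
        ret_job_id = job_id
    else:
        ret_jobset_id = jobset_id
        ret_job_id = job_id
    # the value recorded in the DB takes precedence when it is not smaller
    if db_job_id >= int(ret_job_id) or (jobset_id == -1 and db_job_id >= int(ret_jobset_id)):
        if jobset_id == -1:
            ret_jobset_id = db_job_id + 1
            ret_job_id = ret_jobset_id + 1
        else:
            ret_job_id = db_job_id + 1
    return ret_job_id, ret_jobset_id

# hypothetical user whose stored jobid is 41 and who requests a new jobset
assert next_ids(40, -1, 41) == (43, 42)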
commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if res != None and len(res) != 0: + return res[0][0] + # return empty string + return "" + except: + type, value, traceBack = sys.exc_info() + _logger.error("getEmailAddr : %s %s" % (type,value)) + # roll back + self._rollback() + return "" + + + # get client version + def getPandaClientVer(self): + comment = ' /* DBProxy.getPandaClientVer */' + _logger.debug("getPandaClientVer") + try: + # set autocommit on + self.conn.begin() + # select + sql = "SELECT pathena FROM ATLAS_PANDAMETA.pandaconfig WHERE name=:name" + varMap = {} + varMap[':name'] = 'current' + self.cur.execute(sql+comment,varMap) + self.cur.arraysize = 10 + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retStr = '' + if res != None and len(res) != 0: + retStr = res[0][0] + _logger.debug("getPandaClientVer -> %s" % retStr) + return retStr + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getPandaClientVer : %s %s" % (type,value)) + return "" + + + # add files to memcached + def addFilesToMemcached(self,site,node,files): + _logger.debug("addFilesToMemcached start %s %s" % (site,node)) + # memcached is unused + if not panda_config.memcached_enable: + _logger.debug("addFilesToMemcached skip %s %s" % (site,node)) + return True + try: + # initialize memcache if needed + if self.memcache == None: + from MemProxy import MemProxy + self.memcache = MemProxy() + # convert string to list + fileList = files.split(',') + # remove '' + try: + fileList.remove('') + except: + pass + # empty list + if len(fileList) == 0: + _logger.debug("addFilesToMemcached skipped for empty list") + return True + # list of siteIDs + siteIDs = site.split(',') + # loop over all siteIDs + for tmpSite in siteIDs: + # add + iFiles = 0 + nFiles = 100 + retS = True + while iFiles < len(fileList): + tmpRetS = self.memcache.setFiles(None,tmpSite,node,fileList[iFiles:iFiles+nFiles]) + if not tmpRetS: + retS = False + iFiles += nFiles + _logger.debug("addFilesToMemcached done %s %s with %s" % (site,node,retS)) + return retS + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("addFilesToMemcached : %s %s" % (errType,errValue)) + return False + + + # delete files from memcached + def deleteFilesFromMemcached(self,site,node,files): + _logger.debug("deleteFilesFromMemcached start %s %s" % (site,node)) + # memcached is unused + if not panda_config.memcached_enable: + _logger.debug("deleteFilesFromMemcached skip %s %s" % (site,node)) + return True + try: + # initialize memcache if needed + if self.memcache == None: + from MemProxy import MemProxy + self.memcache = MemProxy() + # list of siteIDs + siteIDs = site.split(',') + # loop over all siteIDs + for tmpSite in siteIDs: + # delete + self.memcache.deleteFiles(tmpSite,node,files) + _logger.debug("deleteFilesFromMemcached done %s %s" % (site,node)) + return True + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("deleteFilesFromMemcached : %s %s" % (errType,errValue)) + return False + + + # flush memcached + def flushMemcached(self,site,node): + _logger.debug("flushMemcached start %s %s" % (site,node)) + # memcached is unused + if not panda_config.memcached_enable: + _logger.debug("flushMemcached skip %s %s" % (site,node)) + return True + try: + # initialize memcache if needed + if self.memcache == None: + from MemProxy import MemProxy + self.memcache = MemProxy() + # list of siteIDs + siteIDs = site.split(',') + # loop 
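addFilesToMemcached above pushes files in slices of 100 per setFiles call and reports failure if any slice fails. A simplified sketch of that batching loop, with a stand-in callback instead of the real MemProxy.setFiles:

def add_in_batches(file_list, set_files, batch_size=100):
    # set_files is a callable standing in for MemProxy.setFiles(...)
    overall_ok = True
    index = 0
    while index < len(file_list):
        if not set_files(file_list[index:index + batch_size]):
            overall_ok = False
        index += batch_size
    return overall_ok

# hypothetical backend that accepts everything
assert add_in_batches(['f%d' % i for i in range(250)], lambda chunk: True) is True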
over all siteIDs + for tmpSite in siteIDs: + # flush + self.memcache.flushFiles(tmpSite,node) + _logger.debug("flushMemcached done %s %s" % (site,node)) + return True + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("flushMemcached : %s %s" % (errType,errValue)) + return False + + + # check files with memcached + def checkFilesWithMemcached(self,site,node,files): + _logger.debug("checkFilesWithMemcached start %s %s" % (site,node)) + # convert string to list + fileList = files.split(',') + # remove '' + try: + fileList.remove('') + except: + pass + # memcached is unused + if not panda_config.memcached_enable: + _logger.debug("checkFilesWithMemcached skip %s %s" % (site,node)) + # return 0 + retStr = '' + for tmpF in fileList: + retStr += '0,' + retStr = retStr[:-1] + return retStr + try: + # initialize memcache if needed + if self.memcache == None: + from MemProxy import MemProxy + self.memcache = MemProxy() + # empty list + if len(fileList) == 0: + _logger.debug("checkFilesWithMemcached skipped for empty list") + return '' + # check + iFiles = 0 + nFiles = 100 + retS = '' + while iFiles < len(fileList): + retS += self.memcache.checkFiles(None,fileList[iFiles:iFiles+nFiles],site,node,getDetail=True) + retS += ',' + iFiles += nFiles + retS = retS[:-1] + _logger.debug("checkFilesWithMemcached done %s %s with %s" % (site,node,retS)) + return retS + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("checkFilesWithMemcached : %s %s" % (errType,errValue)) + return False + + + # register proxy key + def registerProxyKey(self,params): + comment = ' /* DBProxy.registerProxyKey */' + _logger.debug("register ProxyKey %s" % str(params)) + try: + # set autocommit on + self.conn.begin() + # construct SQL + sql0 = 'INSERT INTO ATLAS_PANDAMETA.proxykey (id,' + sql1 = 'VALUES (ATLAS_PANDAMETA.PROXYKEY_ID_SEQ.nextval,' + vals = {} + for key,val in params.iteritems(): + sql0 += '%s,' % key + sql1 += ':%s,' % key + vals[':%s' % key] = val + sql0 = sql0[:-1] + sql1 = sql1[:-1] + sql = sql0 + ') ' + sql1 + ') ' + # insert + self.cur.execute(sql+comment,vals) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return True + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("registerProxyKey : %s %s" % (type,value)) + # roll back + self._rollback() + return "" + + + # get proxy key + def getProxyKey(self,dn): + comment = ' /* DBProxy.getProxyKey */' + _logger.debug("get ProxyKey %s" % dn) + try: + # set autocommit on + self.conn.begin() + # construct SQL + sql = 'SELECT credname,expires,origin,myproxy FROM ATLAS_PANDAMETA.proxykey WHERE dn=:dn ORDER BY expires DESC' + varMap = {} + varMap[':dn'] = dn + # select + self.cur.execute(sql+comment,varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + retMap = {} + if res != None and len(res) != 0: + credname,expires,origin,myproxy = res[0] + retMap['credname'] = credname + retMap['expires'] = expires + retMap['origin'] = origin + retMap['myproxy'] = myproxy + _logger.debug(retMap) + return retMap + except: + type, value, traceBack = sys.exc_info() + _logger.error("getProxyKey : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # check site access + def checkSiteAccess(self,siteid,longDN): + comment = ' /* DBProxy.checkSiteAccess */' + _logger.debug("checkSiteAccess %s:%s" % (siteid,longDN)) + try: + # use compact DN + dn = self.cleanUserID(longDN) + # construct SQL + sql = 'SELECT 
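registerProxyKey above builds its INSERT statement dynamically from the params dictionary, pairing each column name with a bind variable of the same name. A hedged sketch of just that SQL assembly (the column names in the example are the ones read back by getProxyKey; no database call is made here):

def build_proxykey_insert(params):
    # mirrors the string assembly in registerProxyKey
    sql0 = 'INSERT INTO ATLAS_PANDAMETA.proxykey (id,'
    sql1 = 'VALUES (ATLAS_PANDAMETA.PROXYKEY_ID_SEQ.nextval,'
    var_map = {}
    for key, val in params.items():
        sql0 += '%s,' % key
        sql1 += ':%s,' % key
        var_map[':%s' % key] = val
    sql = sql0[:-1] + ') ' + sql1[:-1] + ') '
    return sql, var_map

# invented parameter values for illustration
sql, var_map = build_proxykey_insert({'dn': '/DC=ch/DC=cern/CN=Some User',
                                      'credname': 'someuser',
                                      'origin': 'myproxy.cern.ch'})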
poffset,rights,status,workingGroups FROM ATLAS_PANDAMETA.siteaccess WHERE dn=:dn AND pandasite=:pandasite' + varMap = {} + varMap[':dn'] = dn + varMap[':pandasite'] = siteid + # set autocommit on + self.conn.begin() + # select + self.cur.execute(sql+comment,varMap) + self.cur.arraysize = 10 + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + retMap = {} + if res != None and len(res) != 0: + poffset,rights,status,workingGroups = res[0] + retMap['poffset'] = poffset + retMap['rights'] = rights + retMap['status'] = status + if workingGroups in ['',None]: + workingGroups = [] + else: + workingGroups = workingGroups.split(',') + retMap['workingGroups'] = workingGroups + _logger.debug(retMap) + return retMap + except: + type, value, traceBack = sys.exc_info() + _logger.error("checkSiteAccess : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # add account to siteaccess + def addSiteAccess(self,siteID,longDN): + comment = ' /* DBProxy.addSiteAccess */' + _logger.debug("addSiteAccess : %s %s" % (siteID,longDN)) + try: + # use compact DN + dn = self.cleanUserID(longDN) + # set autocommit on + self.conn.begin() + # select + sql = 'SELECT status FROM ATLAS_PANDAMETA.siteaccess WHERE dn=:dn AND pandasite=:pandasite' + varMap = {} + varMap[':dn'] = dn + varMap[':pandasite'] = siteID + self.cur.execute(sql+comment,varMap) + self.cur.arraysize = 10 + res = self.cur.fetchone() + if res != None: + _logger.debug("account already exists with status=%s" % res[0]) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return res[0] + # add + sql = 'INSERT INTO ATLAS_PANDAMETA.siteaccess (id,dn,pandasite,status,created) VALUES (ATLAS_PANDAMETA.SITEACCESS_ID_SEQ.nextval,:dn,:pandasite,:status,CURRENT_DATE)' + varMap = {} + varMap[':dn'] = dn + varMap[':pandasite'] = siteID + varMap[':status'] = 'requested' + self.cur.execute(sql+comment,varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("account was added") + return 0 + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("addSiteAccess : %s %s" % (type,value)) + # return None + return -1 + + + # list site access + def listSiteAccess(self,siteid=None,dn=None,longFormat=False): + comment = ' /* DBProxy.listSiteAccess */' + _logger.debug("listSiteAccess %s:%s" % (siteid,dn)) + try: + if siteid==None and dn==None: + return [] + longAttributes = 'status,poffset,rights,workingGroups,created' + # set autocommit on + self.conn.begin() + # construct SQL + if siteid != None: + varMap = {':pandasite':siteid} + if not longFormat: + sql = 'SELECT dn,status FROM ATLAS_PANDAMETA.siteaccess WHERE pandasite=:pandasite ORDER BY dn' + else: + sql = 'SELECT dn,%s FROM ATLAS_PANDAMETA.siteaccess ' % longAttributes + sql += 'WHERE pandasite=:pandasite ORDER BY dn' + else: + shortDN = self.cleanUserID(dn) + varMap = {':dn':shortDN} + if not longFormat: + sql = 'SELECT pandasite,status FROM ATLAS_PANDAMETA.siteaccess WHERE dn=:dn ORDER BY pandasite' + else: + sql = 'SELECT pandasite,%s FROM ATLAS_PANDAMETA.siteaccess ' % longAttributes + sql += 'WHERE dn=:dn ORDER BY pandasite' + # select + self.cur.execute(sql+comment,varMap) + self.cur.arraysize = 1000 + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + ret = [] + if res != None and len(res) != 0: + for tmpRes in res: + if not longFormat: + ret.append(tmpRes) + else: + # 
create map for long format + tmpRetMap = {} + # use first value as a primary key + tmpRetMap['primKey'] = tmpRes[0] + idxVal = 1 + for tmpKey in longAttributes.split(','): + tmpRetMap[tmpKey] = tmpRes[idxVal] + idxVal += 1 + ret.append(tmpRetMap) + _logger.debug(ret) + return ret + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("listSiteAccess : %s %s" % (type,value)) + return [] + + + # update site access + def updateSiteAccess(self,method,siteid,requesterDN,userName,attrValue): + comment = ' /* DBProxy.updateSiteAccess */' + _logger.debug("updateSiteAccess %s:%s:%s:%s:%s" % (method,siteid,requesterDN,userName,attrValue)) + try: + # set autocommit on + self.conn.begin() + # check existence + varMap = {} + varMap[':pandasite'] = siteid + varMap[':dn'] = userName + sql = 'SELECT count(*) FROM ATLAS_PANDAMETA.siteaccess WHERE pandasite=:pandasite AND dn=:dn' + self.cur.execute(sql+comment,varMap) + self.cur.arraysize = 10 + res = self.cur.fetchall() + if res == None or res[0][0] == 0: + _logger.error("updateSiteAccess : No request for %s" % varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return 'No request for %s:%s' % (siteid,userName) + # get cloud + varMap = {':pandasite':siteid} + sql = 'SELECT cloud,dn FROM ATLAS_PANDAMETA.schedconfig WHERE siteid=:pandasite AND rownum<=1' + self.cur.execute(sql+comment,varMap) + res = self.cur.fetchall() + if res == None or len(res) == 0: + _logger.error("updateSiteAccess : No cloud in schedconfig for %s" % siteid) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return "No cloud in schedconfig for %s" % siteid + cloud = res[0][0] + siteContact = res[0][1] + # get cloud responsible + varMap = {':cloud':cloud} + sql = 'SELECT dn FROM ATLAS_PANDAMETA.cloudconfig WHERE name=:cloud' + self.cur.execute(sql+comment,varMap) + res = self.cur.fetchall() + if res == None or len(res) == 0: + _logger.error("updateSiteAccess : No contact in cloudconfig for %s" % cloud) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + return "No contact in cloudconfig for %s" % cloud + contactNames = res[0][0] + if contactNames in [None,'']: + contactNames = [] + else: + contactNames = contactNames.split(',') + # get site responsible + if not siteContact in [None,'']: + contactNames += siteContact.split(',') + # check privilege + if not self.cleanUserID(requesterDN) in contactNames: + _logger.error("updateSiteAccess : %s is not one of contacts %s" % (requesterDN,str(contactNames))) + # return + return "Insufficient privilege" + # update + varMap = {} + varMap[':pandasite'] = siteid + varMap[':dn'] = userName + if method in ['approve','reject']: + # update status + sql = 'UPDATE ATLAS_PANDAMETA.siteaccess SET status=:newStatus WHERE pandasite=:pandasite AND dn=:dn' + if method == 'approve': + varMap[':newStatus'] = 'tobeapproved' + else: + varMap[':newStatus'] = 'toberejected' + elif method == 'delete': + # delete + sql = 'DELETE FROM ATLAS_PANDAMETA.siteaccess WHERE pandasite=:pandasite AND dn=:dn' + elif method == 'set': + # check value + if re.search('^[a-z,A-Z]+:[a-z,A-Z,0-9,\,_\-]+$',attrValue) == None: + errStr = "Invalid argument for set : %s. 
Must be key:value" % attrValue + _logger.error("updateSiteAccess : %s" % errStr) + # retrun + return errStr + # decompose to key and value + tmpKey = attrValue.split(':')[0].lower() + tmpVal = attrValue.split(':')[-1] + # check key + changeableKeys = ['poffset','workinggroups','rights'] + if not tmpKey in changeableKeys: + errStr = "%s cannot be set. Only %s are allowed" % (tmpKey,str(changeableKeys)) + _logger.error("updateSiteAccess : %s" % errStr) + # retrun + return errStr + # set value map + varMap[':%s' % tmpKey] = tmpVal + sql = 'UPDATE ATLAS_PANDAMETA.siteaccess SET %s=:%s WHERE pandasite=:pandasite AND dn=:dn' % (tmpKey,tmpKey) + else: + _logger.error("updateSiteAccess : Unknown method '%s'" % method) + # return + return "Unknown method '%s'" % method + # execute + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("updateSiteAccess : completed") + return True + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("updateSiteAccess : %s %s" % (type,value)) + return 'DB error %s %s' % (type,value) + + + # get list of archived tables + def getArchiveTables(self): + # return + return ['ATLAS_PANDAARCH.jobsArchived'] + + + # get JobIDs in a time range + def getJobIDsInTimeRangeLog(self,dn,timeRange,retJobIDs): + comment = ' /* DBProxy.getJobIDsInTimeRangeLog */' + _logger.debug("getJobIDsInTimeRangeLog : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) + try: + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + # get list of archived tables + tables = self.getArchiveTables() + # select + for table in tables: + # make sql + sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODSOURCELABEL_IDX JOBS_PRODUSERNAME_IDX) */ " + sql += "jobDefinitionID FROM %s tab " % table + sql += "WHERE prodUserName=:prodUserName AND modificationTime>:modificationTime " + sql += "AND prodSourceLabel=:prodSourceLabel GROUP BY jobDefinitionID" + varMap = {} + varMap[':prodUserName'] = compactDN + varMap[':prodSourceLabel'] = 'user' + varMap[':modificationTime'] = timeRange + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for tmpID, in resList: + if not tmpID in retJobIDs: + retJobIDs.append(tmpID) + _logger.debug("getJobIDsInTimeRangeLog : %s" % str(retJobIDs)) + return retJobIDs + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getJobIDsInTimeRangeLog : %s %s" % (type,value)) + # return empty list + return retJobIDs + + + # get PandaIDs for a JobID + def getPandIDsWithJobIDLog(self,dn,jobID,idStatus,nJobs,buildJobID=None): + comment = ' /* Proxy.getPandIDsWithJobIDLog */' + _logger.debug("getPandIDsWithJobIDLog : %s %s" % (dn,jobID)) + try: + # get compact DN + compactDN = self.cleanUserID(dn) + if compactDN in ['','NULL',None]: + compactDN = dn + # get list of archived tables + tables = self.getArchiveTables() + # select + for table in tables: + # skip if all jobs have already been gotten + if nJobs > 0 and len(idStatus) >= nJobs: + continue + # make sql + sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ " + sql += 
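The 'set' branch of updateSiteAccess above only accepts a key:value argument whose key is in a small whitelist (poffset, workinggroups, rights). A compact sketch of that validation step, using the same regular expression and whitelist:

import re

CHANGEABLE_KEYS = ['poffset', 'workinggroups', 'rights']

def validate_set_argument(attr_value):
    # same pattern and whitelist as the 'set' branch of updateSiteAccess
    if re.search(r'^[a-z,A-Z]+:[a-z,A-Z,0-9,\,_\-]+$', attr_value) is None:
        return None, "Invalid argument for set : %s. Must be key:value" % attr_value
    key = attr_value.split(':')[0].lower()
    value = attr_value.split(':')[-1]
    if key not in CHANGEABLE_KEYS:
        return None, "%s cannot be set. Only %s are allowed" % (key, str(CHANGEABLE_KEYS))
    return (key, value), None

# hypothetical requests
assert validate_set_argument('poffset:100') == (('poffset', '100'), None)
assert validate_set_argument('status:approved')[0] is None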
"PandaID,jobStatus,commandToPilot,prodSourceLabel,taskBufferErrorCode FROM %s tab " % table + sql += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>(CURRENT_DATE-30) " + varMap = {} + varMap[':prodUserName'] = compactDN + varMap[':jobDefinitionID'] = jobID + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + # select + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for tmpID,tmpStatus,tmpCommand,tmpProdSourceLabel,tmpTaskBufferErrorCode in resList: + # ignore jobs retried by pilot since they have new PandaIDs with the same jobsetID/jobdefID + if tmpTaskBufferErrorCode in [ErrorCode.EC_PilotRetried]: + continue + # ignore old buildJob which was replaced by rebrokerage + if tmpProdSourceLabel == 'panda': + if buildJobID == None: + # first buildJob + buildJobID = tmpID + elif buildJobID >= tmpID: + # don't append old one + continue + else: + # delete old one + del idStatus[buildJobID] + buildJobID = tmpID + # append + if not idStatus.has_key(tmpID): + idStatus[tmpID] = (tmpStatus,tmpCommand) + _logger.debug("getPandIDsWithJobIDLog : %s" % str(idStatus)) + return idStatus + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getPandIDsWithJobIDLog : %s %s" % (type,value)) + # return empty list + return {} + + + # get PandaIDs for a JobsetID or JobdefID in jobsArchived + def getPandIDsWithIdInArch(self,prodUserName,id,isJobset): + comment = ' /* Proxy.getPandIDsWithIdInArch */' + _logger.debug("getPandIDsWithIdInArch : %s %s %s" % (prodUserName,id,isJobset)) + try: + # make sql + if isJobset: + sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBSETID_IDX) */ " + else: + sql = "SELECT /*+ NO_INDEX(tab JOBS_MODTIME_IDX) INDEX_COMBINE(tab JOBS_PRODUSERNAME_IDX JOBS_JOBDEFID_IDX) */ " + sql += "PandaID FROM ATLAS_PANDAARCH.jobsArchived tab " + sql += "WHERE prodUserName=:prodUserName " + sql += "AND prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND modificationTime>(CURRENT_DATE-30) " + if isJobset: + sql += "AND jobsetID=:jobID " + else: + sql += "AND jobDefinitionID=:jobID " + varMap = {} + varMap[':prodUserName'] = prodUserName + varMap[':jobID'] = id + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 1000000 + # select + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + pandaIDs = [] + for tmpID, in resList: + pandaIDs.append(tmpID) + _logger.debug("getPandIDsWithIdInArch : %s %s -> %s" % (prodUserName,id,str(pandaIDs))) + return pandaIDs + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getPandIDsWithIdInArch : %s %s" % (errType,errValue)) + # return empty list + return [] + + + # peek at job + def peekJobLog(self,pandaID): + comment = ' /* DBProxy.peekJobLog */' + _logger.debug("peekJobLog : %s" % pandaID) + # return None for NULL PandaID + if pandaID in ['NULL','','None',None]: + return None + sql1_0 = "SELECT %s FROM %s " 
+ sql1_1 = "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-30) " + # select + varMap = {} + varMap[':PandaID'] = pandaID + nTry=3 + for iTry in range(nTry): + try: + # get list of archived tables + tables = self.getArchiveTables() + # select + for table in tables: + # start transaction + self.conn.begin() + # select + sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1 + self.cur.arraysize = 10 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if len(res) != 0: + # Job + job = JobSpec() + job.pack(res[0]) + # Files + # start transaction + self.conn.begin() + # select + fileTableName = re.sub('jobsArchived','filesTable_ARCH',table) + sqlFile = "SELECT /*+ INDEX(tab FILES_ARCH_PANDAID_IDX)*/ %s " % FileSpec.columnNames() + sqlFile+= "FROM %s tab " % fileTableName + # put constraint on modificationTime to avoid full table scan + sqlFile+= "WHERE PandaID=:PandaID AND modificationTime>(CURRENT_DATE-60)" + self.cur.arraysize = 10000 + self.cur.execute(sqlFile+comment, varMap) + resFs = self.cur.fetchall() + # metadata + job.metadata = None + metaTableName = re.sub('jobsArchived','metaTable_ARCH',table) + sqlMeta = "SELECT metaData FROM %s WHERE PandaID=:PandaID" % metaTableName + self.cur.execute(sqlMeta+comment, varMap) + for clobMeta, in self.cur: + if clobMeta != None: + job.metadata = clobMeta.read() + break + # job parameters + job.jobParameters = None + jobParamTableName = re.sub('jobsArchived','jobParamsTable_ARCH',table) + sqlJobP = "SELECT jobParameters FROM %s WHERE PandaID=:PandaID" % jobParamTableName + varMap = {} + varMap[':PandaID'] = job.PandaID + self.cur.execute(sqlJobP+comment, varMap) + for clobJobP, in self.cur: + if clobJobP != None: + job.jobParameters = clobJobP.read() + break + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # set files + for resF in resFs: + file = FileSpec() + file.pack(resF) + # remove redundant white spaces + try: + file.md5sum = file.md5sum.strip() + except: + pass + try: + file.checksum = file.checksum.strip() + except: + pass + job.addFile(file) + return job + _logger.debug("peekJobLog() : PandaID %s not found" % pandaID) + return None + except: + # roll back + self._rollback() + if iTry+1 < nTry: + _logger.error("peekJobLog : %s" % pandaID) + time.sleep(random.randint(10,20)) + continue + type, value, traceBack = sys.exc_info() + _logger.error("peekJobLog : %s %s" % (type,value)) + # return None + return None + + + # get user subscriptions + def getUserSubscriptions(self,datasetName,timeRange): + comment = ' /* DBProxy.getUserSubscriptions */' + _logger.debug("getUserSubscriptions(%s,%s)" % (datasetName,timeRange)) + sql0 = "SELECT site FROM ATLAS_PANDAMETA.UserSubs " + sql0 += "WHERE datasetName=:datasetName and modificationDate>CURRENT_DATE-:timeRange" + varMap = {} + varMap[':datasetName'] = datasetName + varMap[':timeRange'] = timeRange + try: + # start transaction + self.conn.begin() + # select + self.cur.execute(sql0+comment, varMap) + resSs = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retList = [] + for tmpSite, in resSs: + retList.append(tmpSite) + return retList + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getUserSubscriptions : %s %s" % (errType,errValue)) + return [] + + + # get the number of user subscriptions + def getNumUserSubscriptions(self): + comment = ' /* DBProxy.getNumUserSubscriptions */' + 
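peekJobLog above rebuilds the names of the companion archive tables from the job table name with simple substitutions (jobsArchived -> filesTable_ARCH, metaTable_ARCH, jobParamsTable_ARCH). A small sketch of that mapping, applied to the table name returned by getArchiveTables:

import re

def archive_companion_tables(job_table):
    # same substitutions as peekJobLog
    return {
        'files': re.sub('jobsArchived', 'filesTable_ARCH', job_table),
        'meta': re.sub('jobsArchived', 'metaTable_ARCH', job_table),
        'jobParams': re.sub('jobsArchived', 'jobParamsTable_ARCH', job_table),
    }

tables = archive_companion_tables('ATLAS_PANDAARCH.jobsArchived')
assert tables['files'] == 'ATLAS_PANDAARCH.filesTable_ARCH'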
_logger.debug("getNumUserSubscriptions") + sql0 = "SELECT site,COUNT(*) FROM ATLAS_PANDAMETA.UserSubs " + sql0 += "WHERE creationDate>CURRENT_DATE-2 GROUP BY site" + try: + # start transaction + self.conn.begin() + # select + self.cur.execute(sql0+comment,{}) + resSs = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retList = {} + for tmpSite,countNum in resSs: + retList[tmpSite] = countNum + return retList + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getNumUserSubscriptions : %s %s" % (errType,errValue)) + return [] + + + # add user subscriptions + def addUserSubscription(self,datasetName,dq2IDs): + comment = ' /* DBProxy.addUserSubscription */' + _logger.debug("addUserSubscription(%s,%s)" % (datasetName,dq2IDs)) + sql0 = "INSERT INTO ATLAS_PANDAMETA.UserSubs " + sql0 += "(datasetName,site,creationDate,modificationDate,nUsed) " + sql0 += "VALUES (:datasetName,:site,CURRENT_DATE,CURRENT_DATE,:nUsed)" + try: + # start transaction + self.conn.begin() + for site in dq2IDs: + varMap = {} + varMap[':datasetName'] = datasetName + varMap[':site'] = site + varMap[':nUsed'] = 0 + # insert + self.cur.execute(sql0+comment, varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return True + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("addUserSubscription : %s %s" % (errType,errValue)) + return False + + + # increment counter for subscription + def incrementUsedCounterSubscription(self,datasetName): + comment = ' /* DBProxy.incrementUsedCounterSubscription */' + _logger.debug("incrementUsedCounterSubscription(%s)" % datasetName) + sql0 = "UPDATE ATLAS_PANDAMETA.UserSubs SET nUsed=nUsed+1 " + sql0 += "WHERE datasetName=:datasetName AND nUsed IS NOT NULL" + sqlU = "SELECT MAX(nUsed) FROM ATLAS_PANDAMETA.UserSubs " + sqlU += "WHERE datasetName=:datasetName" + try: + # start transaction + self.conn.begin() + varMap = {} + varMap[':datasetName'] = datasetName + # update + self.cur.execute(sql0+comment,varMap) + # get nUsed + nUsed = 0 + retU = self.cur.rowcount + if retU > 0: + # get nUsed + self.cur.execute(sqlU+comment,varMap) + self.cur.arraysize = 10 + res = self.cur.fetchone() + if res != None: + nUsed = res[0] + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return nUsed + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("incrementUsedCounterSubscription : %s %s" % (errType,errValue)) + return -1 + + + # get active datasets + def getActiveDatasets(self,computingSite,prodSourceLabel): + comment = ' /* DBProxy.getActiveDatasets */' + _logger.debug("getActiveDatasets(%s,%s)" % (computingSite,prodSourceLabel)) + varMap = {} + varMap[':computingSite'] = computingSite + varMap[':jobStatus1'] = 'assigned' + varMap[':jobStatus2'] = 'activated' + varMap[':jobStatus3'] = 'waiting' + varMap[':prodSourceLabel'] = prodSourceLabel + try: + retList = [] + for table in ['jobsActive4','jobsDefined4','jobsWaiting4']: + if table == 'jobsActive4': + sql0 = "SELECT distinct prodDBlock FROM ATLAS_PANDA.%s " % table + else: + sql0 = "SELECT distinct prodDBlock FROM ATLAS_PANDA.%s " % table + sql0 += "WHERE computingSite=:computingSite AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3) " + sql0 += "AND prodSourceLabel=:prodSourceLabel" + # start transaction + self.conn.begin() + # select + self.cur.execute(sql0+comment, varMap) + resSs = self.cur.fetchall() + # commit 
+ if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for prodDBlock, in resSs: + if not prodDBlock in retList: + retList.append(prodDBlock) + # make string + retStr = '' + for tmpItem in retList: + retStr += '%s,' % tmpItem + retStr = retStr[:-1] + return retStr + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getActiveDatasets : %s %s" % (errType,errValue)) + return "" + + + # check status of all sub datasets to trigger Notifier + def checkDatasetStatusForNotifier(self,jobsetID,jobDefinitionID,prodUserName): + comment = ' /* DBProxy.checkDatasetStatusForNotifier */' + _logger.debug("checkDatasetStatusForNotifier(%s,%s,%s)" % (jobsetID,jobDefinitionID,prodUserName)) + try: + # get PandaIDs to get all associated destinationDBlocks + varMap = {} + varMap[':jobsetID'] = jobsetID + varMap[':prodUserName'] = prodUserName + sql = "SELECT MAX(PandaID),jobDefinitionID FROM %s WHERE prodUserName=:prodUserName AND jobsetID=:jobsetID GROUP BY jobDefinitionID" + pandaIDs = {} + for table in ['ATLAS_PANDA.jobsArchived4','ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsWaiting4']: + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 1000 + self.cur.execute((sql % table)+comment, varMap) + resSs = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # get PandaIDs + for tmpPandaID,tmpJobDefID in resSs: + if (not pandaIDs.has_key(tmpJobDefID)) or tmpPandaID > pandaIDs[tmpJobDefID]: + pandaIDs[tmpJobDefID] = tmpPandaID + # get all destinationDBlocks + varMap = {} + varMap[':type1'] = 'log' + varMap[':type2'] = 'output' + sql = 'SELECT DISTINCT destinationDBlock FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID AND type IN (:type1,:type2)' + datasetMap = {} + # start transaction + self.conn.begin() + self.cur.arraysize = 1000 + for tmpJobDefID,tmpPandaID in pandaIDs.iteritems(): + varMap[':PandaID'] = tmpPandaID + # select + self.cur.execute(sql+comment, varMap) + resSs = self.cur.fetchall() + # get destinationDBlock + for tmpDestDBlock, in resSs: + if not datasetMap.has_key(tmpJobDefID): + datasetMap[tmpJobDefID] = [] + if not tmpDestDBlock in datasetMap[tmpJobDefID]: + datasetMap[tmpJobDefID].append(tmpDestDBlock) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # check dataset status + allClosed = True + retInfo = {} + latestUpdate = None + latestJobDefID = None + varMap = {} + varMap[':type1'] = 'log' + varMap[':type2'] = 'output' + sql = 'SELECT status,modificationDate FROM ATLAS_PANDA.Datasets WHERE name=:name AND type IN (:type1,:type2)' + sqlJ = "SELECT MAX(modificationTime) FROM ATLAS_PANDA.jobsArchived4 " + sqlJ += "WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID" + # start transaction + self.conn.begin() + self.cur.arraysize = 1000 + for tmpJobDefID,tmpDatasets in datasetMap.iteritems(): + retInfo[tmpJobDefID] = [] + for tmpDataset in tmpDatasets: + if not tmpDataset in retInfo[tmpJobDefID]: + retInfo[tmpJobDefID].append(tmpDataset) + varMap[':name'] = tmpDataset + # select + self.cur.execute(sql+comment, varMap) + resSs = self.cur.fetchall() + # check status and mod time + for tmpStatus,tmpModificationDate in resSs: + _logger.debug("checkDatasetStatusForNotifier(%s,%s) %s has %s with %s at %s" % \ + (jobsetID,jobDefinitionID,tmpJobDefID,tmpDataset,tmpStatus,tmpModificationDate)) + if not tmpStatus in ['closed','tobeclosed','completed']: + # some datasets are still active + allClosed = False 
+ _logger.debug("checkDatasetStatusForNotifier(%s,%s) wait due to %s %s %s" % \ + (jobsetID,jobDefinitionID,tmpJobDefID,tmpDataset,tmpStatus)) + break + elif tmpStatus == 'tobeclosed': + # select latest modificationTime in job table + varMapJ = {} + varMapJ[':prodUserName'] = prodUserName + varMapJ[':jobDefinitionID'] = tmpJobDefID + self.cur.execute(sqlJ+comment, varMapJ) + resJ = self.cur.fetchone() + if resJ == None: + # error + allClosed = False + _logger.error("checkDatasetStatusForNotifier(%s,%s) %s cannot find job" % \ + (jobsetID,jobDefinitionID,tmpJobDefID)) + break + tmpModificationTime, = resJ + _logger.debug("checkDatasetStatusForNotifier(%s,%s) %s modtime:%s" % \ + (jobsetID,jobDefinitionID,tmpJobDefID,tmpModificationTime)) + if latestUpdate == None or latestUpdate < tmpModificationTime: + # use the latest updated jobDefID + latestUpdate = tmpModificationTime + latestJobDefID = tmpJobDefID + elif latestUpdate == tmpModificationTime and latestJobDefID < tmpJobDefID: + # use larger jobDefID when datasets are closed at the same time + latestJobDefID = tmpJobDefID + # escape + if not allClosed: + break + # escape + if not allClosed: + break + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + _logger.debug("checkDatasetStatusForNotifier(%s,%s) -> all:%s %s latest:%s" % \ + (jobsetID,jobDefinitionID,allClosed,latestJobDefID, + jobDefinitionID == latestJobDefID)) + # return + if not allClosed or jobDefinitionID != latestJobDefID: + return False,{} + return True,retInfo + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("checkDatasetStatusForNotifier : %s %s" % (errType,errValue)) + return False,{} + + + # get MoU share for T2 PD2P + def getMouShareForT2PD2P(self): + comment = ' /* DBProxy.getMouShareForT2PD2P */' + _logger.debug("getMouShareForT2PD2P start") + sqlG = "SELECT gid,ntup_share FROM ATLAS_GRISLI.t_tier2_groups " + sqlT = "SELECT tier2,t2group,status FROM ATLAS_GRISLI.t_m4regions_replication" + try: + # start transaction + self.conn.begin() + self.cur.arraysize = 100000 + # get weight for each group + self.cur.execute(sqlG+comment) + resG = self.cur.fetchall() + gidShareMap = {} + for gid,ntup_share in resG: + gidShareMap[gid] = {'ntup_share':ntup_share,'nSites':0} + # get group for each site + self.cur.execute(sqlT+comment) + resT = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + siteGroupMap = {} + # loop over all sites + for tier2,t2group,t2status in resT: + # unknown group + if not gidShareMap.has_key(t2group): + _logger.error("getMouShareForT2PD2P unknown group %s for %s" % (t2group,tier2)) + continue + # use only DATADISK + if not tier2.endswith('_DATADISK'): + continue + # count the number of ready sites per group + if t2status in ['ready']: + gidShareMap[t2group]['nSites'] += 1 + # append + siteGroupMap[tier2] = {'group':t2group,'status':t2status} + # normalize + _logger.debug("getMouShareForT2PD2P normalize factor = %s" % str(gidShareMap)) + weightsMap = {} + for tier2,t2Val in siteGroupMap.iteritems(): + t2group = t2Val['group'] + t2status = t2Val['status'] + if gidShareMap[t2group]['ntup_share'] == 0: + # set 0 to be skipped in the brokerage + tmpWeight = 0 + elif gidShareMap[t2group]['nSites'] > 0: + # normalize + tmpWeight = float(gidShareMap[t2group]['ntup_share']) / float(gidShareMap[t2group]['nSites']) + else: + # no site is ready in this group + tmpWeight = 0 + weightsMap[tier2] = {'weight':tmpWeight,'status':t2status} + 
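The per-site weight computed in getMouShareForT2PD2P reduces to dividing each group's ntup_share evenly over its ready DATADISK sites, with zero used both for groups whose share is zero and for groups with no ready site. A standalone sketch of that normalization, using invented group numbers:

def normalize_mou_weights(gid_share_map, site_group_map):
    # gid_share_map: gid -> {'ntup_share': float, 'nSites': int}
    # site_group_map: tier2 -> {'group': gid, 'status': str}
    weights = {}
    for tier2, val in site_group_map.items():
        share = gid_share_map[val['group']]['ntup_share']
        n_sites = gid_share_map[val['group']]['nSites']
        if share == 0 or n_sites == 0:
            weight = 0.0  # skipped in the brokerage, or no ready site in the group
        else:
            weight = float(share) / float(n_sites)
        weights[tier2] = {'weight': weight, 'status': val['status']}
    return weights

# hypothetical: one group with share 10 spread over 2 ready sites
print(normalize_mou_weights({'G1': {'ntup_share': 10, 'nSites': 2}},
                            {'SITE_A_DATADISK': {'group': 'G1', 'status': 'ready'}}))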
_logger.debug("getMouShareForT2PD2P -> %s" % str(weightsMap)) + return weightsMap + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("getMouShareForT2PD2P : %s %s" % (errType,errValue)) + return {} + + + # record status change + def recordStatusChange(self,pandaID,jobStatus,jobInfo=None,infoMap={}): + comment = ' /* DBProxy.recordStatusChange */' + # check config + if not hasattr(panda_config,'record_statuschange') or panda_config.record_statuschange != True: + return + # get job info + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':jobStatus'] = jobStatus + varMap[':modificationHost'] = self.myHostName + if jobInfo != None: + varMap[':computingSite'] = jobInfo.computingSite + varMap[':cloud'] = jobInfo.cloud + varMap[':prodSourceLabel'] = jobInfo.prodSourceLabel + elif infoMap != None: + varMap[':computingSite'] = infoMap['computingSite'] + varMap[':cloud'] = infoMap['cloud'] + varMap[':prodSourceLabel'] = infoMap['prodSourceLabel'] + else: + # no info + return + # convert NULL to None + for tmpKey in varMap.keys(): + if varMap[tmpKey] == 'NULL': + varMap[tmpKey] = None + # insert + sql = "INSERT INTO ATLAS_PANDA.jobs_StatusLog " + sql += "(PandaID,modificationTime,jobStatus,prodSourceLabel,cloud,computingSite,modificationHost) " + sql += "VALUES (:PandaID,CURRENT_DATE,:jobStatus,:prodSourceLabel,:cloud,:computingSite,:modificationHost) " + try: + # start transaction + self.conn.begin() + self.cur.execute(sql+comment,varMap) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + except: + # roll back + self._rollback() + errType,errValue = sys.exc_info()[:2] + _logger.error("recordStatusChange %s %s: %s %s" % (pandaID,jobStatus,errType,errValue)) + return + + + # wake up connection + def wakeUp(self): + for iTry in range(5): + try: + # check if the connection is working + self.conn.ping() + return + except: + type, value, traceBack = sys.exc_info() + _logger.debug("wakeUp %d : %s %s" % (iTry,type,value)) + # wait for reconnection + time.sleep(1) + self.connect(reconnect=True) + + + # commit + def _commit(self): + try: + self.conn.commit() + return True + except: + _logger.error("commit error") + return False + + + # rollback + def _rollback(self,useOtherError=False): + retVal = True + # rollback + _logger.debug("rollback") + try: + self.conn.rollback() + except: + _logger.error("rollback error") + retVal = False + # reconnect if needed + try: + # get ORA ErrorCode + errType,errValue = sys.exc_info()[:2] + oraErrCode = str(errValue).split()[0] + oraErrCode = oraErrCode[:-1] + _logger.debug("rollback EC:%s %s" % (oraErrCode,errValue)) + # error codes for connection error + error_Codes = ['ORA-01012','ORA-01033','ORA-01034','ORA-01089', + 'ORA-03113','ORA-03114','ORA-12203','ORA-12500', + 'ORA-12571','ORA-03135','ORA-25402'] + # other errors are apperantly given when connection lost contact + if useOtherError: + error_Codes += ['ORA-01861','ORA-01008'] + if oraErrCode in error_Codes: + # reconnect + retFlag = self.connect(reconnect=True) + _logger.debug("rollback reconnected %s" % retFlag) + except: + pass + # return + return retVal diff --git a/current/pandaserver/taskbuffer/OraLogDBProxy.py b/current/pandaserver/taskbuffer/OraLogDBProxy.py new file mode 100755 index 000000000..8f397db40 --- /dev/null +++ b/current/pandaserver/taskbuffer/OraLogDBProxy.py @@ -0,0 +1,727 @@ +""" +proxy for log database connection + +""" + +import re +import sys +import time + +import cx_Oracle + +from pandalogger.PandaLogger import 
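The _rollback method shown above decides whether to reconnect by pulling the ORA error code out of the exception text and comparing it against a list of connection-loss codes. A hedged sketch of just that classification step (the code list is copied from the method; the sample error message is invented):

CONNECTION_ERROR_CODES = ['ORA-01012', 'ORA-01033', 'ORA-01034', 'ORA-01089',
                          'ORA-03113', 'ORA-03114', 'ORA-12203', 'ORA-12500',
                          'ORA-12571', 'ORA-03135', 'ORA-25402']

def needs_reconnect(err_value, use_other_error=False):
    # mirror _rollback: the code is the first token of the message, minus the trailing colon
    ora_code = str(err_value).split()[0][:-1]
    codes = list(CONNECTION_ERROR_CODES)
    if use_other_error:
        codes += ['ORA-01861', 'ORA-01008']
    return ora_code in codes

# invented error message in the usual cx_Oracle format
assert needs_reconnect('ORA-03113: end-of-file on communication channel') is True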
PandaLogger +from config import panda_config + +import SiteSpec +import CloudSpec + +from JobSpec import JobSpec +from FileSpec import FileSpec + +# logger +_logger = PandaLogger().getLogger('LogDBProxy') + +# proxy +class LogDBProxy: + + # constructor + def __init__(self): + # connection object + self.conn = None + # cursor object + self.cur = None + + # connect to DB + def connect(self,dbhost=panda_config.logdbhost,dbpasswd=panda_config.logdbpasswd, + dbuser=panda_config.logdbuser,dbname=panda_config.logdbname,reconnect=False): + # keep parameters for reconnect + if not reconnect: + self.dbhost = dbhost + self.dbpasswd = dbpasswd + self.dbuser = dbuser + self.dbname = dbname + # connect + try: + self.conn = cx_Oracle.connect(dsn=self.dbhost,user=self.dbuser, + password=self.dbpasswd,threaded=True) + self.cur=self.conn.cursor() + # set TZ + self.cur.execute("ALTER SESSION SET TIME_ZONE='UTC'") + # set DATE format + self.cur.execute("ALTER SESSION SET NLS_DATE_FORMAT='YYYY/MM/DD HH24:MI:SS'") + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("connect : %s %s" % (type,value)) + # roll back + self._rollback() + return False + + + # query an SQL + def querySQL(self,sql,arraySize=1000): + try: + # begin transaction + self.conn.begin() + self.cur.arraysize = arraySize + self.cur.execute(sql) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + return res + except: + type, value, traceBack = sys.exc_info() + _logger.error("querySQL : %s %s" % (type,value)) + return None + + + # get site data + def getCurrentSiteData(self): + _logger.debug("getCurrentSiteData") + sql = "SELECT SITE,getJob,updateJob FROM SiteData WHERE FLAG='production' and HOURS=3" + try: + # set autocommit on + self.conn.begin() + # select + self.cur.arraysize = 10000 + self.cur.execute(sql) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + ret = {} + for item in res: + ret[item[0]] = {'getJob':item[1],'updateJob':item[2]} + _logger.debug(ret) + return ret + except: + type, value, traceBack = sys.exc_info() + _logger.error("getCurrentSiteData : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # get list of site + def getSiteList(self): + _logger.debug("getSiteList start") + try: + # set autocommit on + self.conn.begin() + # select + sql = "SELECT siteid,nickname FROM schedconfig WHERE siteid IS NOT NULL" + self.cur.arraysize = 10000 + self.cur.execute(sql) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retMap = {} + if res != None and len(res) != 0: + for siteid,nickname in res: + # skip invalid siteid + if siteid in [None,'']: + continue + # append + if not retMap.has_key(siteid): + retMap[siteid] = [] + retMap[siteid].append(nickname) + _logger.debug(retMap) + _logger.debug("getSiteList done") + return retMap + except: + type, value, traceBack = sys.exc_info() + _logger.error("getSiteList : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # get site info + def getSiteInfo(self): + _logger.debug("getSiteInfo start") + try: + # set autocommit on + self.conn.begin() + # select + sql = "SELECT nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory," + sql+= "maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec," + sql+= "priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue " + sql+= "FROM schedconfig WHERE siteid IS NOT NULL" + self.cur.arraysize = 10000 + 
self.cur.execute(sql) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retList = {} + if resList != None: + # loop over all results + for res in resList: + # change None to '' + resTmp = [] + for tmpItem in res: + if tmpItem == None: + tmpItem = '' + resTmp.append(tmpItem) + nickname,dq2url,cloud,ddm,lfchost,se,gatekeeper,releases,memory,\ + maxtime,status,space,retry,cmtconfig,setokens,seprodpath,glexec,\ + priorityoffset,allowedgroups,defaulttoken,siteid,queue,localqueue \ + = resTmp + # skip invalid siteid + if siteid in [None,'']: + continue + # instantiate SiteSpec + ret = SiteSpec.SiteSpec() + ret.sitename = siteid + ret.nickname = nickname + ret.dq2url = dq2url + ret.cloud = cloud + ret.ddm = ddm.split(',')[0] + ret.lfchost = lfchost + ret.se = se + ret.gatekeeper = gatekeeper + ret.memory = memory + ret.maxtime = maxtime + ret.status = status + ret.space = space + ret.glexec = glexec + ret.queue = queue + ret.localqueue = localqueue + # job recoverty + ret.retry = True + if retry == 'FALSE': + ret.retry = False + # convert releases to list + ret.releases = [] + for tmpRel in releases.split('|'): + # remove white space + tmpRel = tmpRel.strip() + if tmpRel != '': + ret.releases.append(tmpRel) + # cmtconfig + # add slc3 if the column is empty + ret.cmtconfig = ['i686-slc3-gcc323-opt'] + if cmtconfig != '': + ret.cmtconfig.append(cmtconfig) + # map between token and DQ2 ID + ret.setokens = {} + tmpTokens = setokens.split(',') + for idxToken,tmpddmID in enumerate(ddm.split(',')): + if idxToken < len(tmpTokens): + ret.setokens[tmpTokens[idxToken]] = tmpddmID + # expand [] in se path + match = re.search('([^\[]*)\[([^\]]+)\](.*)',seprodpath) + if match != None and len(match.groups()) == 3: + seprodpath = '' + for tmpBody in match.group(2).split(','): + seprodpath += '%s%s%s,' % (match.group(1),tmpBody,match.group(3)) + seprodpath = seprodpath[:-1] + # map between token and se path + ret.seprodpath = {} + tmpTokens = setokens.split(',') + for idxToken,tmpSePath in enumerate(seprodpath.split(',')): + if idxToken < len(tmpTokens): + ret.seprodpath[tmpTokens[idxToken]] = tmpSePath + # VO related params + ret.priorityoffset = priorityoffset + ret.allowedgroups = allowedgroups + ret.defaulttoken = defaulttoken + # append + retList[ret.nickname] = ret + _logger.debug("getSiteInfo done") + return retList + except: + type, value, traceBack = sys.exc_info() + _logger.error("getSiteInfo : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # get cloud list + def getCloudList(self): + _logger.debug("getCloudList start") + try: + # set autocommit on + self.conn.begin() + # select + sql = "SELECT name,tier1,tier1SE,relocation,weight,server,status,transtimelo," + sql += "transtimehi,waittime,validation,mcshare,countries,fasttrack " + sql+= "FROM cloudconfig" + self.cur.arraysize = 10000 + self.cur.execute(sql) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + ret = {} + if resList != None and len(resList) != 0: + for res in resList: + # change None to '' + resTmp = [] + for tmpItem in res: + if tmpItem == None: + tmpItem = '' + resTmp.append(tmpItem) + name,tier1,tier1SE,relocation,weight,server,status,transtimelo,transtimehi,\ + waittime,validation,mcshare,countries,fasttrack = resTmp + # instantiate CloudSpec + tmpC = CloudSpec.CloudSpec() + tmpC.name = name + tmpC.tier1 = tier1 + tmpC.tier1SE = re.sub(' ','',tier1SE).split(',') + tmpC.relocation = relocation + 
tmpC.weight = weight + tmpC.server = server + tmpC.status = status + tmpC.transtimelo = transtimelo + tmpC.transtimehi = transtimehi + tmpC.waittime = waittime + tmpC.validation = validation + tmpC.mcshare = mcshare + tmpC.countries = countries + tmpC.fasttrack = fasttrack + # append + ret[name] = tmpC + _logger.debug("getCloudList done") + return ret + except: + type, value, traceBack = sys.exc_info() + _logger.error("getCloudList : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # extract name from DN + def cleanUserID(self, id): + try: + up = re.compile('/(DC|O|OU|C|L)=[^\/]+') + username = up.sub('', id) + up2 = re.compile('/CN=[0-9]+') + username = up2.sub('', username) + up3 = re.compile(' [0-9]+') + username = up3.sub('', username) + up4 = re.compile('_[0-9]+') + username = up4.sub('', username) + username = username.replace('/CN=proxy','') + username = username.replace('/CN=limited proxy','') + username = username.replace('limited proxy','') + pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)') + mat = pat.match(username) + if mat: + username = mat.group(2) + else: + username = username.replace('/CN=','') + if username.lower().find('/email') > 0: + username = username[:username.lower().find('/email')] + pat = re.compile('.*(limited.*proxy).*') + mat = pat.match(username) + if mat: + username = mat.group(1) + username = username.replace('(','') + username = username.replace(')','') + return username + except: + return id + + + # check quota + def checkQuota(self,dn): + _logger.debug("checkQuota %s" % dn) + try: + # set autocommit on + self.conn.begin() + # select + name = self.cleanUserID(dn) + sql = "SELECT cpua1,cpua7,cpua30,quotaa1,quotaa7,quotaa30 FROM users WHERE name = :name" + varMap = {} + varMap[':name'] = name + self.cur.arraysize = 10 + self.cur.execute(sql,varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + weight = 0.0 + if res != None and len(res) != 0: + item = res[0] + # cpu and quota + cpu1 = item[0] + cpu7 = item[1] + cpu30 = item[2] + quota1 = item[3] * 3600 + quota7 = item[4] * 3600 + quota30 = item[5] * 3600 + # CPU usage + if cpu1 == None: + cpu1 = 0.0 + # weight + weight = float(cpu1) / float(quota1) + # not exceeded the limit + if weight < 1.0: + weight = 0.0 + _logger.debug("checkQuota %s Weight:%s Quota:%s CPU:%s" % (dn,weight,quota1,cpu1)) + else: + _logger.debug("checkQuota cannot found %s" % dn) + return weight + except: + type, value, traceBack = sys.exc_info() + _logger.error("checkQuota : %s %s" % (type,value)) + # roll back + self._rollback() + return 0.0 + + + # get serialize JobID and status + def getUserParameter(self,dn,jobID): + _logger.debug("getUserParameter %s %s" % (dn,jobID)) + try: + # set autocommit on + self.conn.begin() + # select + name = self.cleanUserID(dn) + sql = "SELECT jobid,status FROM users WHERE name = :name" + varMap = {} + varMap[':name'] = name + self.cur.execute(sql,varMap) + self.cur.arraysize = 10 + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + retJobID = jobID + retStatus = True + if res != None and len(res) != 0: + item = res[0] + # JobID in DB + dbJobID = item[0] + # check status + if item[1] in ['disabled']: + retStatus = False + # use larger JobID + if dbJobID >= int(retJobID): + retJobID = dbJobID+1 + # update DB + sql = "UPDATE users SET jobid=%d WHERE name = '%s'" % (retJobID,name) + self.cur.execute(sql) + _logger.debug("getUserParameter set JobID=%s for %s" % (retJobID,dn)) + 
return retJobID,retStatus + except: + type, value, traceBack = sys.exc_info() + _logger.error("getUserParameter : %s %s" % (type,value)) + # roll back + self._rollback() + return jobID,True + + + # get email address for a user + def getEmailAddr(self,name): + _logger.debug("get email for %s" % name) + try: + # set autocommit on + self.conn.begin() + # select + sql = "SELECT email FROM users WHERE name=:name" + varMap = {} + varMap[':name'] = name + self.cur.execute(sql,varMap) + self.cur.arraysize = 10 + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if res != None and len(res) != 0: + return res[0][0] + # return empty string + return "" + except: + type, value, traceBack = sys.exc_info() + _logger.error("getEmailAddr : %s %s" % (type,value)) + # roll back + self._rollback() + return "" + + + # register proxy key + def registerProxyKey(self,params): + _logger.debug("register ProxyKey %s" % str(params)) + try: + # set autocommit on + self.conn.begin() + # construct SQL + sql0 = 'INSERT INTO proxykey (' + sql1 = 'VALUES (' + vals = {} + for key,val in params.iteritems(): + sql0 += '%s,' % key + sql1 += ':%s,' % key + vals[':%s' % key] = val + sql0 = sql0[:-1] + sql1 = sql1[:-1] + sql = sql0 + ') ' + sql1 + ') ' + # insert + self.cur.execute(sql,vals) + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return True + return True + except: + type, value, traceBack = sys.exc_info() + _logger.error("registerProxyKey : %s %s" % (type,value)) + # roll back + self._rollback() + return "" + + + # get proxy key + def getProxyKey(self,dn): + _logger.debug("get ProxyKey %s" % dn) + try: + # set autocommit on + self.conn.begin() + # construct SQL + sql = 'SELECT credname,expires,origin,myproxy FROM proxykey WHERE dn=:dn ORDER BY expires DESC' + varMap = {} + varMap[':dn'] = dn + # select + self.cur.execute(sql,varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # return + retMap = {} + if res != None and len(res) != 0: + credname,expires,origin,myproxy = res[0] + retMap['credname'] = credname + retMap['expires'] = expires + retMap['origin'] = origin + retMap['myproxy'] = myproxy + _logger.debug(retMap) + return retMap + except: + type, value, traceBack = sys.exc_info() + _logger.error("getProxyKey : %s %s" % (type,value)) + # roll back + self._rollback() + return {} + + + # get list of archived tables + def getArchiveTables(self): + tables = [] + cdate = datetime.datetime.utcnow() + for iCycle in range(2): # 2 = (1 months + 2 just in case)/2 + if cdate.month==1: + cdate = cdate.replace(year = (cdate.year-1)) + cdate = cdate.replace(month = 12, day = 1) + else: + cdate = cdate.replace(month = (cdate.month/2)*2, day = 1) + tableName = "jobsArchived_%s%s" % (cdate.strftime('%b'),cdate.year) + if not tableName in tables: + tables.append(tableName) + # one older table + if cdate.month > 2: + cdate = cdate.replace(month = (cdate.month-2)) + else: + cdate = cdate.replace(year = (cdate.year-1), month = 12) + # return + return tables + + + # get JobIDs in a time range + def getJobIDsInTimeRange(self,dn,timeRange,retJobIDs): + comment = ' /* LogDBProxy.getJobIDsInTimeRange */' + _logger.debug("getJobIDsInTimeRange : %s %s" % (dn,timeRange.strftime('%Y-%m-%d %H:%M:%S'))) + try: + # get list of archived tables + tables = self.getArchiveTables() + # select + for table in tables: + # make sql + sql = "SELECT jobDefinitionID FROM %s " % table + sql += "WHERE prodUserID=:prodUserID 
AND modificationTime>:modificationTime " + sql += "AND prodSourceLabel='user' GROUP BY jobDefinitionID" + varMap = {} + varMap[':prodUserID'] = dn + varMap[':modificationTime'] = timeRange + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 10000 + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for tmpID, in resList: + if not tmpID in retJobIDs: + retJobIDs.append(tmpID) + _logger.debug("getJobIDsInTimeRange : %s" % str(retJobIDs)) + return retJobIDs + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getJobIDsInTimeRange : %s %s" % (type,value)) + # return empty list + return retJobIDs + + + # get PandaIDs for a JobID + def getPandIDsWithJobID(self,dn,jobID,idStatus,nJobs): + comment = ' /* LogProxy.getPandIDsWithJobID */' + _logger.debug("getPandIDsWithJobID : %s %s" % (dn,jobID)) + try: + # get list of archived tables + tables = self.getArchiveTables() + # select + for table in tables: + # skip if all jobs have already been gotten + if nJobs > 0 and len(idStatus) >= nJobs: + continue + # make sql + sql = "SELECT PandaID,jobStatus,commandToPilot FROM %s " % table + sql += "WHERE prodUserID=:prodUserID AND jobDefinitionID=:jobDefinitionID " + sql += "AND prodSourceLabel in ('user','panda') " + varMap = {} + varMap[':prodUserID'] = dn + varMap[':jobDefinitionID'] = jobID + # start transaction + self.conn.begin() + # select + self.cur.arraysize = 5000 + # select + _logger.debug(sql+comment+str(varMap)) + self.cur.execute(sql+comment, varMap) + resList = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # append + for tmpID,tmpStatus,tmpCommand in resList: + if not idStatus.has_key(tmpID): + idStatus[tmpID] = (tmpStatus,tmpCommand) + _logger.debug("getPandIDsWithJobID : %s" % str(idStatus)) + return idStatus + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("getPandIDsWithJobID : %s %s" % (type,value)) + # return empty list + return {} + + + # peek at job + def peekJob(self,pandaID): + comment = ' /* LogDBProxy.peekJob */' + _logger.debug("peekJob : %s" % pandaID) + # return None for NULL PandaID + if pandaID in ['NULL','','None',None]: + return None + sql1_0 = "SELECT %s FROM %s " + sql1_1 = "WHERE PandaID=:PandaID" + # select + varMap = {} + varMap[':PandaID'] = pandaID + try: + # get list of archived tables + tables = self.getArchiveTables() + # select + for table in tables: + # start transaction + self.conn.begin() + # select + sql = sql1_0 % (JobSpec.columnNames(),table) + sql1_1 + self.cur.arraysize = 10 + self.cur.execute(sql+comment, varMap) + res = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + if len(res) != 0: + # Job + job = JobSpec() + job.pack(res[0]) + # Files + # start transaction + self.conn.begin() + # select + fileTableName = re.sub('jobsArchived','filesTable',table) + sqlFile = "SELECT %s " % FileSpec.columnNames() + sqlFile+= "FROM %s " % fileTableName + sqlFile+= "WHERE PandaID=:PandaID" + self.cur.arraysize = 10000 + self.cur.execute(sqlFile+comment, varMap) + resFs = self.cur.fetchall() + # commit + if not self._commit(): + raise RuntimeError, 'Commit error' + # set files + for resF in resFs: + file = FileSpec() + file.pack(resF) + job.addFile(file) + return job + _logger.debug("peekJob() : PandaID %s not 
found" % pandaID) + return None + except: + # roll back + self._rollback() + type, value, traceBack = sys.exc_info() + _logger.error("peekJob : %s %s" % (type,value)) + # return None + return None + + + # wake up connection + def wakeUp(self): + for iTry in range(5): + try: + # check if the connection is working + self.cur.execute("select user from dual") + return + except: + type, value, traceBack = sys.exc_info() + _logger.debug("wakeUp %d : %s %s" % (iTry,type,value)) + # wait for reconnection + time.sleep(1) + self.connect(reconnect=True) + + + # close + def close(self): + try: + self.cur.close() + self.conn.close() + except: + type, value, traceBack = sys.exc_info() + _logger.error("close : %s %s" % (type,value)) + + + # commit + def _commit(self): + try: + self.conn.commit() + return True + except: + _logger.error("commit error") + return False + + + # rollback + def _rollback(self): + try: + self.conn.rollback() + return True + except: + _logger.error("rollback error") + return False + diff --git a/current/pandaserver/taskbuffer/PrioUtil.py b/current/pandaserver/taskbuffer/PrioUtil.py new file mode 100644 index 000000000..ac8d99d5f --- /dev/null +++ b/current/pandaserver/taskbuffer/PrioUtil.py @@ -0,0 +1,4 @@ +# calculate priority for user jobs +def calculatePriority(priorityOffset,serNum,weight): + priority = 1000 + priorityOffset - (serNum / 5) - int(100 * weight) + return priority diff --git a/current/pandaserver/taskbuffer/ProcessGroups.py b/current/pandaserver/taskbuffer/ProcessGroups.py new file mode 100644 index 000000000..1318ca0d1 --- /dev/null +++ b/current/pandaserver/taskbuffer/ProcessGroups.py @@ -0,0 +1,101 @@ +processGroups = [('others', []), + ('evgensimul', ['evgen','simul']), + ('reprocessing', ['reprocessing']), + ('test', ['prod_test','rc_test','validation']), + ('mcore', ['mcore']), + ('group', ['group']), + ] + +# source labels used for panda internal purpose +internalSourceLabels = ['ddm'] + +# maximum number of debug jobs per user +maxDebugJobs = 3 + +# maximum number of debug jobs for prod role +maxDebugProdJobs = 30 + +# extension level for GP +extensionLevel_1 = 1 + + +# get corresponding group +def getProcessGroup(valGroup): + tmpGroup = None + for tmpKey,tmpList in processGroups: + # set default + if tmpGroup == None: + tmpGroup = tmpKey + continue + if valGroup in tmpList: + tmpGroup = tmpKey + break + # return + return tmpGroup + + +# convert cloud and processingType for extended PG +def converCPTforEPG(cloud,processingType,coreCount,workingGroup=None): + if coreCount in [0,1,None]: + # use group queue for GP jobs + if workingGroup != None and workingGroup.startswith('GP_'): + return cloud,'group' + return cloud,processingType + else: + # use MCORE queue for MPC jobs in all clouds + return "ALL","mcore" + + +# count the number of jobs per group +def countJobsPerGroup(valMap): + ret = {} + # loop over all clouds + for cloud,cloudVal in valMap.iteritems(): + # add cloud + if not ret.has_key(cloud): + ret[cloud] = {} + # loop over all sites + for site,siteVal in cloudVal.iteritems(): + # add site + if not ret[cloud].has_key(site): + ret[cloud][site] = {} + # loop over all types + for pType,typeVal in siteVal.iteritems(): + # get process group + tmpGroup = getProcessGroup(pType) + # add group + if not ret[cloud][site].has_key(tmpGroup): + ret[cloud][site][tmpGroup] = {} + # loop over all status + for jobStatus,statVal in typeVal.iteritems(): + if not ret[cloud][site][tmpGroup].has_key(jobStatus): + ret[cloud][site][tmpGroup][jobStatus] = 0 + # add + 
ret[cloud][site][tmpGroup][jobStatus] += statVal + # return + return ret + + +# count the number of jobs per group for analysis +def countJobsPerGroupForAnal(valMap): + ret = {} + # loop over all sites + for site,siteVal in valMap.iteritems(): + # add site + if not ret.has_key(site): + ret[site] = {} + # loop over all types + for pType,typeVal in siteVal.iteritems(): + # get process group + tmpGroup = getProcessGroup(pType) + # add group + if not ret[site].has_key(tmpGroup): + ret[site][tmpGroup] = {} + # loop over all status + for jobStatus,statVal in typeVal.iteritems(): + if not ret[site][tmpGroup].has_key(jobStatus): + ret[site][tmpGroup][jobStatus] = 0 + # add + ret[site][tmpGroup][jobStatus] += statVal + # return + return ret diff --git a/current/pandaserver/taskbuffer/SQLDumper.py b/current/pandaserver/taskbuffer/SQLDumper.py new file mode 100644 index 000000000..16240d1be --- /dev/null +++ b/current/pandaserver/taskbuffer/SQLDumper.py @@ -0,0 +1,22 @@ +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('SQLDumper') + +class SQLDumper(object): + def __init__(self,cur): + self.cursor = cur + def __iter__(self): + return self + def next(self): + return self.cursor.next() + def my_execute(self,sql,var={}): + _logger.debug('SQL=%s var=%s' % (sql,str(var))) + return self.cursor.execute(sql,var) + def __getattribute__(self,name): + if name == 'execute': + return object.__getattribute__(self,'my_execute') + elif name in ['cursor','__iter__','next']: + return object.__getattribute__(self,name) + else: + return getattr(self.cursor,name) diff --git a/current/pandaserver/taskbuffer/SiteSpec.py b/current/pandaserver/taskbuffer/SiteSpec.py new file mode 100644 index 000000000..e261e08d0 --- /dev/null +++ b/current/pandaserver/taskbuffer/SiteSpec.py @@ -0,0 +1,31 @@ +""" +site specification + +""" + +class SiteSpec(object): + # attributes + _attributes = ('sitename','nickname','dq2url','cloud','ddm','lfchost','se','type','gatekeeper', + 'releases','memory','maxtime','status','space','retry','cmtconfig','setokens', + 'seprodpath','glexec','priorityoffset','allowedgroups','defaulttoken','queue', + 'localqueue','validatedreleases','accesscontrol','copysetup','maxinputsize', + 'cachedse','allowdirectaccess','comment','cloudlist','statusmodtime','lfcregister', + 'countryGroup','availableCPU','pledgedCPU','coreCount','reliabilityLevel', + 'iscvmfs','transferringlimit') + + # constructor + def __init__(self): + # install attributes + for attr in self._attributes: + setattr(self,attr,None) + + # serialize + def __str__(self): + str = '' + for attr in self._attributes: + str += '%s:%s ' % (attr,getattr(self,attr)) + return str + + + + diff --git a/current/pandaserver/taskbuffer/TaskBuffer.py b/current/pandaserver/taskbuffer/TaskBuffer.py new file mode 100755 index 000000000..9c03a1b35 --- /dev/null +++ b/current/pandaserver/taskbuffer/TaskBuffer.py @@ -0,0 +1,2294 @@ +import re +import sys +import types +import shlex +import datetime +import ProcessGroups +from threading import Lock +from DBProxyPool import DBProxyPool +from brokerage.SiteMapper import SiteMapper +from dataservice.Setupper import Setupper +from dataservice.Closer import Closer +from dataservice.TaLauncher import TaLauncher +from dataservice.ProcessLimiter import ProcessLimiter + +# logger +from pandalogger.PandaLogger import PandaLogger +_logger = PandaLogger().getLogger('TaskBuffer') + + +class TaskBuffer: + """ + task queue + + """ + + # constructor + def __init__(self): + self.proxyPool = 
None + self.lock = Lock() + self.processLimiter = None + + + # initialize + def init(self,dbname,dbpass,nDBConnection=10,useTimeout=False): + # lock + self.lock.acquire() + # create Proxy Pool + if self.proxyPool == None: + self.proxyPool = DBProxyPool(dbname,dbpass,nDBConnection,useTimeout) + # create process limiter + if self.processLimiter == None: + self.processLimiter = ProcessLimiter() + # release + self.lock.release() + + + # check production role + def checkProdRole(self,fqans): + for fqan in fqans: + # check production role + match = re.search('/([^/]+)/Role=production',fqan) + if match != None: + return True,match.group(1) + return False,None + + + # get priority parameters for user + def getPrioParameters(self,jobs,user,fqans,userDefinedWG,validWorkingGroup): + withProdRole = False + workingGroup = None + priorityOffset = 0 + serNum = 0 + weight = None + # get DB proxy + proxy = self.proxyPool.getProxy() + # check production role + withProdRole,workingGroup = self.checkProdRole(fqans) + if withProdRole: + # check dataset name + for tmpFile in jobs[-1].Files: + if tmpFile.type in ['output','log'] and not tmpFile.lfn.startswith('group'): + # reset + withProdRole,workingGroup = False,None + break + # set high prioryty for production role + """ + if withProdRole: + serNum = 0 + weight = 0.0 + priorityOffset = 2000 + """ + # reset nJob/weight for HC + if jobs[0].processingType in ['hammercloud','gangarobot'] \ + or jobs[0].processingType.startswith('gangarobot-'): + serNum = 0 + weight = 0.0 + if jobs[0].processingType in ['gangarobot','gangarobot-pft']: + priorityOffset = 3000 + # check quota + if weight == None: + weight = proxy.checkQuota(user) + # get nJob + if userDefinedWG and validWorkingGroup: + serNum = proxy.getNumberJobsUser(user,workingGroup=jobs[0].workingGroup) + else: + serNum = proxy.getNumberJobsUser(user,workingGroup=None) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return withProdRole,workingGroup,priorityOffset,serNum,weight + + + # store Jobs into DB + def storeJobs(self,jobs,user,joinThr=False,forkSetupper=False,fqans=[],hostname='',resetLocInSetupper=False, + checkSpecialHandling=True,toPending=False): + try: + _logger.debug("storeJobs : start for %s nJobs=%s" % (user,len(jobs))) + # check quota for priority calculation + weight = 0.0 + userJobID = -1 + userJobsetID = -1 + userStatus = True + priorityOffset = 0 + userVO = 'atlas' + userCountry = None + useExpress = False + nExpressJobs = 0 + useDebugMode = False + # check ban user except internally generated jobs + if len(jobs) > 0 and not jobs[0].prodSourceLabel in ProcessGroups.internalSourceLabels: + # get DB proxy + proxy = self.proxyPool.getProxy() + # check user status + tmpStatus = proxy.checkBanUser(user,jobs[0].prodSourceLabel) + # release proxy + self.proxyPool.putProxy(proxy) + # return if DN is blocked + if not tmpStatus: + _logger.debug("storeJobs : end for %s DN is blocked 1" % user) + return [] + # set parameters for user jobs + if len(jobs) > 0 and (jobs[0].prodSourceLabel in ['user','panda','ptest','rc_test','ssc']) \ + and (not jobs[0].processingType in ['merge','unmerge']): + # get DB proxy + proxy = self.proxyPool.getProxy() + # get JobID and status + userJobID,userJobsetID,userStatus = proxy.getUserParameter(user,jobs[0].jobDefinitionID,jobs[0].jobsetID) + # get site access + userSiteAccess = proxy.checkSiteAccess(jobs[0].computingSite,user) + # check quota for express jobs + if 'express' in jobs[0].specialHandling: + expressQuota = proxy.getExpressJobs(user) + if 
expressQuota != None and expressQuota['status'] and expressQuota['quota'] > 0: + nExpressJobs = expressQuota['quota'] + if nExpressJobs > 0: + useExpress = True + # debug mode + if 'debug' in jobs[0].specialHandling: + debugJobList = proxy.getActiveDebugJobs(user) + if len(debugJobList) < ProcessGroups.maxDebugJobs: + useDebugMode = True + # release proxy + self.proxyPool.putProxy(proxy) + # get site spec + siteMapper = SiteMapper(self) + tmpSiteSpec = siteMapper.getSite(jobs[0].computingSite) + # check allowed groups + if userStatus and hasattr(tmpSiteSpec,'allowedgroups') and (not tmpSiteSpec.allowedgroups in ['',None]): + # set status to False when allowedgroups is defined + userStatus = False + # loop over all groups + for tmpGroup in tmpSiteSpec.allowedgroups.split(','): + if tmpGroup == '': + continue + # loop over all FQANs + for tmpFQAN in fqans: + if re.search('^%s' % tmpGroup,tmpFQAN) != None: + userStatus = True + break + # escape + if userStatus: + break + # get priority offset + if hasattr(tmpSiteSpec,'priorityoffset') and (not tmpSiteSpec.priorityoffset in ['',None]): + # loop over all groups + for tmpGP in tmpSiteSpec.priorityoffset.split(','): + if tmpGP == '': + continue + # get group and offset + tmpGroup = tmpGP.split(':')[0] + try: + tmpOffset = int(tmpGP.split(':')[-1]) + except: + tmpOffset = 0 + # loop over all FQANs + for tmpFQAN in fqans: + _logger.debug(tmpFQAN) + if re.search('^%s/' % tmpGroup,tmpFQAN) != None or \ + re.search('%s$' % tmpGroup,tmpFQAN) != None: + # use the largest offset + if tmpOffset > priorityOffset: + priorityOffset = tmpOffset + break + # check site access + if hasattr(tmpSiteSpec,'accesscontrol') and tmpSiteSpec.accesscontrol == 'grouplist': + if userSiteAccess == {} or userSiteAccess['status'] != 'approved': + # user is not allowed + userStatus = False + # set priority offset + if userStatus: + if userSiteAccess.has_key('poffset') and userSiteAccess['poffset'] > priorityOffset: + priorityOffset = userSiteAccess['poffset'] + # extract country group + for tmpFQAN in fqans: + match = re.search('^/atlas/([^/]+)/',tmpFQAN) + if match != None: + tmpCountry = match.group(1) + # use country code or usatlas + if len(tmpCountry) == 2: + userCountry = tmpCountry + break + # usatlas + if tmpCountry in ['usatlas']: + userCountry = 'us' + break + # return if DN is blocked + if not userStatus: + _logger.debug("storeJobs : end for %s DN is blocked 2" % user) + return [] + # extract VO + for tmpFQAN in fqans: + match = re.search('^/([^/]+)/',tmpFQAN) + if match != None: + userVO = match.group(1) + break + # get number of jobs currently in PandaDB + serNum = 0 + userDefinedWG = False + validWorkingGroup = False + usingBuild = False + withProdRole = False + workingGroup = None + if len(jobs) > 0 and (jobs[0].prodSourceLabel in ['user','panda']) \ + and (not jobs[0].processingType in ['merge','unmerge']): + # check workingGroup + if not jobs[0].workingGroup in ['',None,'NULL']: + userDefinedWG = True + if userSiteAccess != {}: + if userSiteAccess['status'] == 'approved' and jobs[0].workingGroup in userSiteAccess['workingGroups']: + # valid workingGroup + validWorkingGroup = True + # using build for analysis + if jobs[0].prodSourceLabel == 'panda': + usingBuild = True + # get priority parameters for user + withProdRole,workingGroup,priorityOffset,serNum,weight = self.getPrioParameters(jobs,user,fqans,userDefinedWG, + validWorkingGroup) + # get DB proxy + proxy = self.proxyPool.getProxy() + # get group job serial number + groupJobSerialNum = 0 + if len(jobs) > 
0 and (jobs[0].prodSourceLabel in ['user','panda']) \ + and (not jobs[0].processingType in ['merge','unmerge']): + for tmpFile in jobs[-1].Files: + if tmpFile.type in ['output','log'] and '$GROUPJOBSN' in tmpFile.lfn: + tmpSnRet = proxy.getSerialNumberForGroupJob(user) + if tmpSnRet['status']: + groupJobSerialNum = tmpSnRet['sn'] + break + # loop over all jobs + ret =[] + newJobs=[] + usePandaDDM = False + firstLiveLog = True + nRunJob = 0 + for job in jobs: + # set JobID. keep original JobID when retry + if userJobID != -1 and job.prodSourceLabel in ['user','panda'] \ + and (job.attemptNr in [0,'0','NULL'] or (not job.jobExecutionID in [0,'0','NULL'])) \ + and (not jobs[0].processingType in ['merge','unmerge']): + job.jobDefinitionID = userJobID + # set jobsetID + if job.prodSourceLabel in ['user','panda','ptest','rc_test']: + job.jobsetID = userJobsetID + # set specialHandling + if job.prodSourceLabel in ['user','panda']: + if checkSpecialHandling: + specialHandling = '' + # debug mode + if useDebugMode and nRunJob == 0 and job.prodSourceLabel == 'user': + specialHandling += 'debug,' + # express mode + if useExpress and (nRunJob < nExpressJobs or job.prodSourceLabel == 'panda'): + specialHandling += 'express,' + # reset specialHandling + specialHandling = specialHandling[:-1] + job.specialHandling = specialHandling + if job.prodSourceLabel != 'panda': + nRunJob += 1 + # set relocation flag + if job.computingSite != 'NULL': + job.relocationFlag = 1 + # protection agains empty jobParameters + if job.jobParameters in ['',None,'NULL']: + job.jobParameters = ' ' + # set country group and nJobs (=taskID) + if job.prodSourceLabel in ['user','panda']: + job.countryGroup = userCountry + # set workingGroup + if not validWorkingGroup: + if withProdRole: + # set country group if submitted with production role + job.workingGroup = workingGroup + else: + if userDefinedWG: + # reset invalid working group + job.workingGroup = None + # set nJobs (=taskID) + if usingBuild: + tmpNumBuild = 1 + tmpNunRun = len(jobs) - 1 + else: + tmpNumBuild = 0 + tmpNunRun = len(jobs) + # encode + job.taskID = tmpNumBuild + (tmpNunRun << 1) + # change TRF URL just in case + if job.transformation.startswith('http://www.usatlas.bnl.gov/svn/panda/pathena/trf'): + job.transformation = re.sub('^http://www.usatlas.bnl.gov/svn/panda/pathena/trf/', + 'http://pandaserver.cern.ch:25080/trf/user/', + job.transformation) + # set hostname + if hostname != '': + job.creationHost = hostname + # insert job to DB + if not proxy.insertNewJob(job,user,serNum,weight,priorityOffset,userVO,groupJobSerialNum, + toPending): + # reset if failed + job.PandaID = None + else: + # live log + if job.prodSourceLabel in ['user','panda']: + if ' --liveLog ' in job.jobParameters: + # enable liveLog only for the first one + if firstLiveLog: + # set file name + repPatt = ' --liveLog stdout.%s ' % job.PandaID + else: + # remove the option + repPatt = ' ' + job.jobParameters = re.sub(' --liveLog ',repPatt,job.jobParameters) + firstLiveLog = False + # append + newJobs.append(job) + if job.prodSourceLabel in ['user','panda','ptest','rc_test']: + ret.append((job.PandaID,job.jobDefinitionID,{'jobsetID':job.jobsetID})) + else: + ret.append((job.PandaID,job.jobDefinitionID,job.jobName)) + serNum += 1 + # release DB proxy + self.proxyPool.putProxy(proxy) + # set up dataset + if not toPending: + if joinThr: + thr = Setupper(self,newJobs,pandaDDM=usePandaDDM,forkRun=forkSetupper,resetLocation=resetLocInSetupper) + thr.start() + thr.join() + else: + # cannot use 'thr =' 
because it may trigger garbage collector + Setupper(self,newJobs,pandaDDM=usePandaDDM,forkRun=forkSetupper,resetLocation=resetLocInSetupper).start() + # return jobIDs + _logger.debug("storeJobs : end for %s succeeded" % user) + return ret + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("storeJobs : %s %s" % (errType,errValue)) + return "ERROR: ServerError with storeJobs" + + + # lock jobs for reassign + def lockJobsForReassign(self,tableName,timeLimit,statList,labels,processTypes,sites,clouds): + # get DB proxy + proxy = self.proxyPool.getProxy() + # exec + res = proxy.lockJobsForReassign(tableName,timeLimit,statList,labels,processTypes,sites,clouds) + # release DB proxy + self.proxyPool.putProxy(proxy) + # return + return res + + + # lock jobs for finisher + def lockJobsForFinisher(self,timeNow,rownum,highPrio): + # get DB proxy + proxy = self.proxyPool.getProxy() + # exec + res = proxy.lockJobsForFinisher(timeNow,rownum,highPrio) + # release DB proxy + self.proxyPool.putProxy(proxy) + # return + return res + + + # get number of activated/defined jobs with output datasets + def getNumWaitingJobsWithOutDS(self,outputDSs): + # get DB proxy + proxy = self.proxyPool.getProxy() + # exec + res = proxy.getNumWaitingJobsWithOutDS(outputDSs) + # release DB proxy + self.proxyPool.putProxy(proxy) + # return + return res + + + # resubmit jobs + def resubmitJobs(self,jobIDs): + # get DB proxy + proxy = self.proxyPool.getProxy() + jobs=[] + # get jobs + for jobID in jobIDs: + res = proxy.peekJob(jobID,True,False,False,False) + if res: + jobs.append(res) + # release DB proxy + self.proxyPool.putProxy(proxy) + # set up dataset + if len(jobs) > 0: + Setupper(self,jobs).start() + # return jobIDs + return True + + + # update overall job information + def updateJobs(self,jobs,inJobsDefined): + # get DB proxy + proxy = self.proxyPool.getProxy() + # loop over all jobs + returns = [] + ddmIDs = [] + ddmAttempt = 0 + newMover = None + for job in jobs: + # update DB + tmpddmIDs = [] + if job.jobStatus == 'failed' and job.prodSourceLabel == 'user' and not inJobsDefined: + # keep failed analy jobs in Active4 + ret = proxy.updateJob(job,inJobsDefined) + elif job.jobStatus in ['finished','failed','cancelled']: + ret,tmpddmIDs,ddmAttempt,newMover = proxy.archiveJob(job,inJobsDefined) + else: + ret = proxy.updateJob(job,inJobsDefined) + returns.append(ret) + # collect IDs for reassign + if ret: + ddmIDs += tmpddmIDs + # release proxy + self.proxyPool.putProxy(proxy) + # retry mover + if newMover != None: + self.storeJobs([newMover],None,joinThr=True) + # reassign jobs when ddm failed + if ddmIDs != []: + self.reassignJobs(ddmIDs,ddmAttempt,joinThr=True) + # return + return returns + + + # update job jobStatus only + def updateJobStatus(self,jobID,jobStatus,param,updateStateChange=False,attemptNr=None): + # get DB proxy + proxy = self.proxyPool.getProxy() + # update DB and buffer + if re.match('^finished$',jobStatus,re.I) or re.match('^failed$',jobStatus,re.I): + ret = proxy.archiveJobLite(jobID,jobStatus,param) + else: + ret = proxy.updateJobStatus(jobID,jobStatus,param,updateStateChange,attemptNr) + # release proxy + self.proxyPool.putProxy(proxy) + return ret + + + # finalize pending analysis jobs + def finalizePendingJobs(self,prodUserName,jobDefinitionID): + # get DB proxy + proxy = self.proxyPool.getProxy() + # update DB + ret = proxy.finalizePendingJobs(prodUserName,jobDefinitionID) + # release proxy + self.proxyPool.putProxy(proxy) + return ret + + + # retry job + def 
retryJob(self,jobID,param,failedInActive=False,changeJobInMem=False,inMemJob=None, + getNewPandaID=False,attemptNr=None): + # get DB proxy + proxy = self.proxyPool.getProxy() + # update DB + ret = proxy.retryJob(jobID,param,failedInActive,changeJobInMem,inMemJob, + getNewPandaID,attemptNr) + # release proxy + self.proxyPool.putProxy(proxy) + return ret + + + # retry failed analysis jobs in Active4 + def retryJobsInActive(self,prodUserName,jobDefinitionID): + # get DB proxy + proxy = self.proxyPool.getProxy() + # update DB + ret = proxy.retryJobsInActive(prodUserName,jobDefinitionID) + # release proxy + self.proxyPool.putProxy(proxy) + return ret + + + # activate jobs + def activateJobs(self,jobs): + # get DB proxy + proxy = self.proxyPool.getProxy() + # loop over all jobs + returns = [] + for job in jobs: + # update DB + ret = proxy.activateJob(job) + returns.append(ret) + # release proxy + self.proxyPool.putProxy(proxy) + return returns + + + # send jobs to jobsWaiting + def keepJobs(self,jobs): + # get DB proxy + proxy = self.proxyPool.getProxy() + # loop over all jobs + returns = [] + for job in jobs: + # update DB + ret = proxy.keepJob(job) + returns.append(ret) + # release proxy + self.proxyPool.putProxy(proxy) + return returns + + + # delete stalled jobs + def deleteStalledJobs(self,libFileName): + # get DB proxy + proxy = self.proxyPool.getProxy() + # execute + ret = proxy.deleteStalledJobs(libFileName) + # release proxy + self.proxyPool.putProxy(proxy) + return ret + + + # set debug mode + def setDebugMode(self,dn,pandaID,prodManager,modeOn): + # get DB proxy + proxy = self.proxyPool.getProxy() + # check the number of debug jobs + if modeOn == True: + jobList = proxy.getActiveDebugJobs(dn) + else: + jobList = [] + if (not prodManager and len(jobList) >= ProcessGroups.maxDebugJobs) or \ + (prodManager and len(jobList) >= ProcessGroups.maxDebugProdJobs): + # exceeded + retStr = 'You already hit the limit on the maximum number of debug subjobs per ' + if not prodManager: + retStr += 'user (%s). ' % ProcessGroups.maxDebugJobs + else: + retStr += 'prod user (%s). 
' % ProcessGroups.maxDebugProdJobs + retStr += 'Please set the debug mode off for one of the following PandaIDs : ' + for tmpID in jobList: + retStr += '%s,' % tmpID + retStr = retStr[:-1] + else: + # execute + retStr = proxy.setDebugMode(dn,pandaID,prodManager,modeOn) + # release proxy + self.proxyPool.putProxy(proxy) + return retStr + + + # get jobs + def getJobs(self,nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, + atlasRelease,prodUserID,getProxyKey,countryGroup,workingGroup,allowOtherCountry): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get waiting jobs + jobs,nSent = proxy.getJobs(nJobs,siteName,prodSourceLabel,cpu,mem,diskSpace,node,timeout,computingElement, + atlasRelease,prodUserID,countryGroup,workingGroup,allowOtherCountry) + # release proxy + self.proxyPool.putProxy(proxy) + # get Proxy Key + proxyKey = {} + if getProxyKey and len(jobs) > 0: + # get MetaDB proxy + proxy = self.proxyPool.getProxy() + # get Proxy Key + proxyKey = proxy.getProxyKey(jobs[0].prodUserID) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return jobs+[nSent,proxyKey] + + + # run task assignment + def runTaskAssignment(self,jobs): + # get DB proxy + proxy = self.proxyPool.getProxy() + # loop over all jobs + retList =[] + newJobs =[] + for job in jobs: + ret = None + if not job.taskID in ['NULL',0,'']: + # get cloud + cloudTask = proxy.getCloudTask(job.taskID) + if cloudTask != None and cloudTask.status == 'assigned': + ret = cloudTask.cloud + if ret == None: + # append for TA + newJobs.append(job) + retList.append(ret) + # release DB proxy + self.proxyPool.putProxy(proxy) + # run setupper + if newJobs != []: + TaLauncher(self,newJobs).start() + # return clouds + return retList + + + # reset modification time of a task to shorten retry interval + def resetTmodCloudTask(self,tid): + # get DBproxy + proxy = self.proxyPool.getProxy() + # run + res = proxy.resetTmodCloudTask(tid) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return res + + + # get assigning task + def getAssigningTask(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # run + res = proxy.getAssigningTask() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return res + + + # get fareshare policy + def getFaresharePolicy(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # run + res = proxy.getFaresharePolicy(True) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return res + + + # check merge job generation status + def checkMergeGenerationStatus(self,dn,jobID): + # return for NA + retNA = {'status':'NA','mergeIDs':[]} + try: + # get at most 2 PandaIDs + idStatus = self.getPandIDsWithJobID(dn,jobID,2) + if idStatus == {}: + return retNA + # use larger PandaID which corresponds to runXYZ + tmpKeys = idStatus.keys() + tmpKeys.sort() + pandaID = tmpKeys[-1] + # get job + tmpJobs = self.getFullJobStatus([pandaID]) + if tmpJobs == [] or tmpJobs[0] == None: + return retNA + pandaJob = tmpJobs[0] + # non-merge job + if not '--mergeOutput' in pandaJob.jobParameters: + return retNA + # loop over all sub datasets + subDsList = [] + mergeStatus = None + mergeIDs = [] + for tmpFile in pandaJob.Files: + if tmpFile.type in ['output','log']: + if not tmpFile.destinationDBlock in subDsList: + subDsList.append(tmpFile.destinationDBlock) + # get dataset + tmpDsSpec = self.queryDatasetWithMap({'name':tmpFile.destinationDBlock}) + if tmpDsSpec != None: + if tmpDsSpec.status in ['tobemerged']: + # going to be merged + mergeStatus = 
'generating' + mergeIDs = [] + elif tmpDsSpec.status in ['tobeclosed','closed','completed']: + # another dataset from --individualOutDS is waiting for Merger + if mergeStatus == 'generating': + continue + # set status + mergeStatus = 'generated' + # collect JobIDs of merge jobs + tmpMergeID = tmpDsSpec.MoverID + if not tmpMergeID in [0,None,'NULL']+mergeIDs: + mergeIDs.append(tmpMergeID) + # no merger most likely because jobs were killed + if mergeStatus == 'generated' and mergeIDs == []: + mergeStatus = 'aborted' + # jobs are still runnign + if mergeStatus == None: + mergeStatus = 'standby' + # return + return {'status':mergeStatus,'mergeIDs':mergeIDs} + except: + return retNA + + + # get job status + def getJobStatus(self,jobIDs,fromDefined=True,fromActive=True,fromArchived=True,fromWaiting=True): + # get DBproxy + proxy = self.proxyPool.getProxy() + retStatus = [] + # peek at job + for jobID in jobIDs: + res = proxy.peekJob(jobID,fromDefined,fromActive,fromArchived,fromWaiting) + if res: + retStatus.append(res.jobStatus) + else: + retStatus.append(None) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retStatus + + + # peek at jobs + def peekJobs(self,jobIDs,fromDefined=True,fromActive=True,fromArchived=True,fromWaiting=True,forAnal=False): + # get DBproxy + proxy = self.proxyPool.getProxy() + retJobs = [] + # peek at job + for jobID in jobIDs: + res = proxy.peekJob(jobID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal) + if res: + retJobs.append(res) + else: + retJobs.append(None) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retJobs + + + # get PandaID with jobexeID + def getPandaIDwithJobExeID(self,jobexeIDs): + # get DBproxy + proxy = self.proxyPool.getProxy() + retJobs = [] + # peek at job + for jobexeID in jobexeIDs: + res = proxy.getPandaIDwithJobExeID(jobexeID) + retJobs.append(res) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retJobs + + + # get slimmed file info with PandaIDs + def getSlimmedFileInfoPandaIDs(self,pandaIDs): + iPandaID = 0 + nPandaID = 100 + retInfo = {} + while iPandaID < len(pandaIDs): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + tmpRetInfo = proxy.getSlimmedFileInfoPandaIDs(pandaIDs[iPandaID:iPandaID+nPandaID]) + # release proxy + self.proxyPool.putProxy(proxy) + iPandaID += nPandaID + if retInfo == {}: + retInfo = tmpRetInfo + else: + for outKey in tmpRetInfo.keys(): + if not retInfo.has_key(outKey): + retInfo[outKey] = [] + # append + for tmpItemRetInfo in tmpRetInfo[outKey]: + if not tmpItemRetInfo in retInfo[outKey]: + retInfo[outKey].append(tmpItemRetInfo) + # return + return retInfo + + + # get JobIDs in a time range + def getJobIDsInTimeRange(self,dn,timeRangeStr): + # check DN + if dn in ['NULL','','None',None]: + return [] + # check timeRange + match = re.match('^(\d+)-(\d+)-(\d+) (\d+):(\d+):(\d+)$',timeRangeStr) + if match == None: + return [] + timeRange = datetime.datetime(year = int(match.group(1)), + month = int(match.group(2)), + day = int(match.group(3)), + hour = int(match.group(4)), + minute = int(match.group(5)), + second = int(match.group(6))) + # max range is 3 months + maxRange = datetime.datetime.utcnow() - datetime.timedelta(days=30) + if timeRange < maxRange: + timeRange = maxRange + retJobIDs = [] + # get DBproxy + proxy = self.proxyPool.getProxy() + # get JobIDs + retJobIDs = proxy.getJobIDsInTimeRange(dn,timeRange,retJobIDs) + # release proxy + self.proxyPool.putProxy(proxy) + # read ARCH when time window is more than 3days (- 3 
hours as a margin) + if timeRange < datetime.datetime.utcnow() - datetime.timedelta(days=2,hours=21) : + # get ArchiveDBproxy + proxy = self.proxyPool.getProxy() + # get JobIDs + retJobIDs = proxy.getJobIDsInTimeRangeLog(dn,timeRange,retJobIDs) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retJobIDs + + + # get PandaIDs for a JobID + def getPandIDsWithJobID(self,dn,jobID,nJobs): + idStatus = {} + # check DN + if dn in ['NULL','','None',None]: + return idStatus + # check JobID + try: + jobID = long(jobID) + nJobs = long(nJobs) + except: + return idStatus + # get DBproxy + proxy = self.proxyPool.getProxy() + # get IDs + idStatus,buildJobID = proxy.getPandIDsWithJobID(dn,jobID,idStatus,nJobs) + # release proxy + self.proxyPool.putProxy(proxy) + # get ArchiveDBproxy + proxy = self.proxyPool.getProxy() + # get IDs + idStatus = proxy.getPandIDsWithJobIDLog(dn,jobID,idStatus,nJobs,buildJobID) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return idStatus + + + # get PandaIDs for a JobsetID or JobdefID in jobsArchived + def getPandIDsWithIdInArch(self,prodUserName,id,isJobset): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getPandIDsWithIdInArch(prodUserName,id,isJobset) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get beyond pledge resource ratio + # ! this method is not thread-safe + def getPledgeResourceRatio(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getPledgeResourceRatio() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return proxy.beyondPledgeRatio + + + # get the number of waiting jobs with a dataset + def getNumWaitingJobsForPD2P(self,datasetName): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + nJobs = proxy.getNumWaitingJobsForPD2P(datasetName) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return nJobs + + + # get the number of waiting jobsets with a dataset + def getNumWaitingJobsetsForPD2P(self,datasetName): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + nJobs = proxy.getNumWaitingJobsetsForPD2P(datasetName) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return nJobs + + + # lock job for re-brokerage + def lockJobForReBrokerage(self,dn,jobID,simulation,forceOpt,forFailed=False): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get IDs + ret = proxy.lockJobForReBrokerage(dn,jobID,simulation,forceOpt,forFailed) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # reset buildJob for re-brokerage + def resetBuildJobForReBrokerage(self,pandaID): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get IDs + ret = proxy.resetBuildJobForReBrokerage(pandaID) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get PandaIDs using libDS for re-brokerage + def getPandaIDsForReBrokerage(self,userName,jobID,fromActive,forFailed=False): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get IDs + ret = proxy.getPandaIDsForReBrokerage(userName,jobID,fromActive,forFailed) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get input datasets for rebroerage + def getInDatasetsForReBrokerage(self,jobID,userName): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get IDs + ret = proxy.getInDatasetsForReBrokerage(jobID,userName) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get outDSs with userName/jobID + def 
getOutDSsForReBrokerage(self,userName,jobID): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get IDs + ret = proxy.getOutDSsForReBrokerage(userName,jobID) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get full job status + def getFullJobStatus(self,jobIDs,fromDefined=True,fromActive=True,fromArchived=True,fromWaiting=True,forAnal=True): + retJobMap = {} + # peek at job + for jobID in jobIDs: + # get DBproxy for each job to avoid occupying connection for long time + proxy = self.proxyPool.getProxy() + # peek job + res = proxy.peekJob(jobID,fromDefined,fromActive,fromArchived,fromWaiting,forAnal) + retJobMap[jobID] = res + # release proxy + self.proxyPool.putProxy(proxy) + # get IDs + for jobID in jobIDs: + if retJobMap[jobID] == None: + # get ArchiveDBproxy + proxy = self.proxyPool.getProxy() + # peek job + res = proxy.peekJobLog(jobID) + retJobMap[jobID] = res + # release proxy + self.proxyPool.putProxy(proxy) + # sort + retJobs = [] + for jobID in jobIDs: + retJobs.append(retJobMap[jobID]) + # return + return retJobs + + + # get script for offline running + def getScriptOfflineRunning(self,pandaID): + try: + # get job + tmpJobs = self.getFullJobStatus([pandaID]) + if tmpJobs == [] or tmpJobs[0] == None: + return "ERROR: Cannot get PandaID=%s in DB for the last 30 days" % pandaID + tmpJob = tmpJobs[0] + # check prodSourceLabel + if not tmpJob.prodSourceLabel in ['managed','test']: + return "ERROR: Non production job : prodSourceLabel=%s. This method is only for production jobs" % tmpJob.prodSourceLabel + # release and trf + tmpRels = tmpJob.homepackage.split("\n") + tmpPars = tmpJob.jobParameters.split("\n") + tmpTrfs = tmpJob.transformation.split("\n") + if not (len(tmpRels) == len(tmpPars) == len(tmpTrfs)): + return "ERROR: The number of releases or parameters or trfs is inconsitent with others" + # construct script + scrStr = "#retrieve inputs\n\n" + # collect inputs + dsFileMap = {} + for tmpFile in tmpJob.Files: + if tmpFile.type=='input': + if not dsFileMap.has_key(tmpFile.dataset): + dsFileMap[tmpFile.dataset] = [] + if not tmpFile.lfn in dsFileMap[tmpFile.dataset]: + dsFileMap[tmpFile.dataset].append(tmpFile.lfn) + # dq2 + for tmpDS,tmpFileList in dsFileMap.iteritems(): + scrStr += "dq2-get --files " + for tmpLFN in tmpFileList: + scrStr += "%s," % tmpLFN + scrStr = scrStr[:-1] + scrStr += " %s\n" % tmpDS + # ln + for tmpLFN in tmpFileList: + scrStr += "ln -fs %s*/%s ./%s\n" % (tmpDS.rstrip("/"),tmpLFN,tmpLFN) + scrStr += "\n#transform commands\n\n" + bitNum = '32' + if 'x86_64' in tmpJob.cmtConfig: + bitNum = '64' + for tmpIdx,tmpRel in enumerate(tmpRels): + # asetup + scrStr += "asetup %s,%s,%s\n" % tuple(tmpRel.split("/")+[bitNum]) + # athenaMP + if not tmpJob.coreCount in ['NULL',None] and tmpJob.coreCount > 1: + scrStr += "export ATHENA_PROC_NUMBER=%s\n" % tmpJob.coreCount + # add double quotes for zsh + tmpParamStr = tmpPars[tmpIdx] + tmpSplitter = shlex.shlex(tmpParamStr, posix=True) + tmpSplitter.whitespace = ' ' + tmpSplitter.whitespace_split = True + # loop for params + for tmpItem in tmpSplitter: + tmpMatch = re.search('^([^=]+=)(.+)$',tmpItem) + if tmpMatch != None: + tmpArgName = tmpMatch.group(1) + tmpArgVal = tmpMatch.group(2) + tmpArgIdx = tmpParamStr.find(tmpArgName) + len(tmpArgName) + # add " + if tmpParamStr[tmpArgIdx] != '"': + tmpParamStr = tmpParamStr.replace(tmpMatch.group(0), + tmpArgName+'"'+tmpArgVal+'"') + # run trf + scrStr += "%s %s\n\n" % (tmpTrfs[tmpIdx],tmpParamStr) + return scrStr + except: + 
errType,errValue = sys.exc_info()[:2] + _logger.error("getScriptOfflineRunning : %s %s" % (errType,errValue)) + return "ERROR: ServerError with getScriptOfflineRunning" + + + # kill jobs + def killJobs(self,ids,user,code,prodManager,wgProdRole=[]): + # get DBproxy + proxy = self.proxyPool.getProxy() + rets = [] + # kill jobs + pandaIDforCloserMap = {} + for id in ids: + ret,userInfo = proxy.killJob(id,user,code,prodManager,True,wgProdRole) + rets.append(ret) + if ret and userInfo['prodSourceLabel'] in ['user','managed','test']: + jobIDKey = (userInfo['prodUserID'],userInfo['jobDefinitionID'],userInfo['jobsetID']) + if not pandaIDforCloserMap.has_key(jobIDKey): + pandaIDforCloserMap[jobIDKey] = id + # release proxy + self.proxyPool.putProxy(proxy) + # run Closer + try: + if pandaIDforCloserMap != {}: + for pandaIDforCloser in pandaIDforCloserMap.values(): + tmpJobs = self.peekJobs([pandaIDforCloser]) + tmpJob = tmpJobs[0] + if tmpJob != None: + tmpDestDBlocks = [] + # get destDBlock + for tmpFile in tmpJob.Files: + if tmpFile.type in ['output','log']: + if not tmpFile.destinationDBlock in tmpDestDBlocks: + tmpDestDBlocks.append(tmpFile.destinationDBlock) + # run + cThr = Closer(self,tmpDestDBlocks,tmpJob) + cThr.start() + cThr.join() + except: + pass + # return + return rets + + + # reassign jobs + def reassignJobs(self,ids,attempt=0,joinThr=False,forkSetupper=False,forPending=False): + # get DBproxy + proxy = self.proxyPool.getProxy() + jobs = [] + oldSubMap = {} + # keep old assignment + keepSiteFlag = False + if (attempt % 2) != 0: + keepSiteFlag = True + # reset jobs + for id in ids: + try: + # try to reset active job + if not forPending: + tmpRet = proxy.resetJob(id,keepSite=keepSiteFlag,getOldSubs=True) + if isinstance(tmpRet,types.TupleType): + ret,tmpOldSubList = tmpRet + else: + ret,tmpOldSubList = tmpRet,[] + if ret != None: + jobs.append(ret) + for tmpOldSub in tmpOldSubList: + if not oldSubMap.has_key(tmpOldSub): + oldSubMap[tmpOldSub] = ret + continue + # try to reset waiting job + tmpRet = proxy.resetJob(id,False,keepSite=keepSiteFlag,getOldSubs=False,forPending=forPending) + if isinstance(tmpRet,types.TupleType): + ret,tmpOldSubList = tmpRet + else: + ret,tmpOldSubList = tmpRet,[] + if ret != None: + jobs.append(ret) + # waiting jobs don't create sub or dis + continue + # try to reset defined job + if not forPending: + tmpRet = proxy.resetDefinedJob(id,keepSite=keepSiteFlag,getOldSubs=True) + if isinstance(tmpRet,types.TupleType): + ret,tmpOldSubList = tmpRet + else: + ret,tmpOldSubList = tmpRet,[] + if ret != None: + jobs.append(ret) + for tmpOldSub in tmpOldSubList: + if not oldSubMap.has_key(tmpOldSub): + oldSubMap[tmpOldSub] = ret + continue + except: + pass + # release DB proxy + self.proxyPool.putProxy(proxy) + # run Closer for old sub datasets + if not forPending: + for tmpOldSub,tmpJob in oldSubMap.iteritems(): + cThr = Closer(self,[tmpOldSub],tmpJob) + cThr.start() + cThr.join() + # setup dataset + if jobs != []: + if joinThr: + thr = Setupper(self,jobs,resubmit=True,ddmAttempt=attempt,forkRun=forkSetupper) + thr.start() + thr.join() + else: + # cannot use 'thr =' because it may trigger garbage collector + Setupper(self,jobs,resubmit=True,ddmAttempt=attempt,forkRun=forkSetupper).start() + # return + return True + + + # awake jobs in jobsWaiting + def awakeJobs(self,ids): + # get DBproxy + proxy = self.proxyPool.getProxy() + jobs = [] + # reset jobs + for id in ids: + # try to reset waiting job + ret = proxy.resetJob(id,False) + if ret != None: + jobs.append(ret) + # 
release DB proxy + self.proxyPool.putProxy(proxy) + # setup dataset + Setupper(self,jobs).start() + # return + return True + + + # query PandaIDs + def queryPandaIDs(self,jobDefIDs): + # get DBproxy + proxy = self.proxyPool.getProxy() + pandaIDs = [] + # query PandaID + for jobDefID in jobDefIDs: + id = proxy.queryPandaID(jobDefID) + pandaIDs.append(id) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return pandaIDs + + + # query job info per cloud + def queryJobInfoPerCloud(self,cloud,schedulerID=None): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query job info + ret = proxy.queryJobInfoPerCloud(cloud,schedulerID) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get PandaIDs to be updated in prodDB + def getPandaIDsForProdDB(self,limit,lockedby): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query PandaID + ret = proxy.getPandaIDsForProdDB(limit,lockedby) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # update prodDBUpdateTime + def updateProdDBUpdateTimes(self,paramList): + retList = [] + # get DBproxy + proxy = self.proxyPool.getProxy() + # update + for param in paramList: + ret = proxy.updateProdDBUpdateTime(param) + retList.append(ret) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # get PandaIDs at Site + def getPandaIDsSite(self,site,status,limit): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query PandaID + ids = proxy.getPandaIDsSite(site,status,limit) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ids + + + # get input files currently in used for analysis + def getFilesInUseForAnal(self,outDataset): + # get DBproxy + proxy = self.proxyPool.getProxy() + retList = [] + # query LFNs + retList = proxy.getFilesInUseForAnal(outDataset) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # get list of dis dataset to get input files in shadow + def getDisInUseForAnal(self,outDataset): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query dis + retList = proxy.getDisInUseForAnal(outDataset) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # get input LFNs currently in use for analysis with shadow dis + def getLFNsInUseForAnal(self,inputDisList): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query dis + retList = proxy.getLFNsInUseForAnal(inputDisList) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # update input files and return corresponding PandaIDs + def updateInFilesReturnPandaIDs(self,dataset,status,fileLFN=''): + # get DBproxy + proxy = self.proxyPool.getProxy() + retList = [] + # query PandaID + retList = proxy.updateInFilesReturnPandaIDs(dataset,status,fileLFN) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # update file status in dispatch dataset + def updateFileStatusInDisp(self,dataset,fileStatusMap): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query PandaID + retVal = proxy.updateFileStatusInDisp(dataset,fileStatusMap) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retVal + + + # update output files and return corresponding PandaIDs + def updateOutFilesReturnPandaIDs(self,dataset,fileLFN=''): + # get DBproxy + proxy = self.proxyPool.getProxy() + retList = [] + # query PandaID + retList = proxy.updateOutFilesReturnPandaIDs(dataset,fileLFN) + # release proxy + self.proxyPool.putProxy(proxy) + # return + 
return retList + + + # get datasets associated with file + def getDatasetWithFile(self,lfn,jobPrioity=0): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query PandaID + retList = proxy.getDatasetWithFile(lfn,jobPrioity) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # get _dis datasets associated to _sub + def getAssociatedDisDatasets(self,subDsName): + # get DBproxy + proxy = self.proxyPool.getProxy() + retList = [] + # query + retList = proxy.getAssociatedDisDatasets(subDsName) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # insert sandbox file info + def insertSandboxFileInfo(self,userName,hostName,fileName,fileSize,checkSum): + # get DBproxy + proxy = self.proxyPool.getProxy() + # exec + ret= proxy.insertSandboxFileInfo(userName,hostName,fileName,fileSize,checkSum) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # check duplicated sandbox file + def checkSandboxFile(self,userName,fileSize,checkSum): + # get DBproxy + proxy = self.proxyPool.getProxy() + # exec + ret= proxy.checkSandboxFile(userName,fileSize,checkSum) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # insert datasets + def insertDatasets(self,datasets): + # get DBproxy + proxy = self.proxyPool.getProxy() + retList = [] + # insert + for dataset in datasets: + ret= proxy.insertDataset(dataset) + retList.append(ret) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # query Dataset + def queryDatasetWithMap(self,map): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query Dataset + ret = proxy.queryDatasetWithMap(map) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # query last files in a dataset + def queryLastFilesInDataset(self,datasets): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query files + ret = proxy.queryLastFilesInDataset(datasets) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # set GUIDs + def setGUIDs(self,files): + # get DBproxy + proxy = self.proxyPool.getProxy() + # set GUIDs + ret = proxy.setGUIDs(files) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # query PandaID with dataset + def queryPandaIDwithDataset(self,datasets): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query Dataset + ret = proxy.queryPandaIDwithDataset(datasets) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # query PandaID with filenames + def queryPandaIDwithLFN(self,lfns): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query Dataset + ret = proxy.queryPandaIDwithLFN(lfns) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # update dataset + def updateDatasets(self,datasets,withLock=False,withCriteria="",criteriaMap={}): + # get DBproxy + proxy = self.proxyPool.getProxy() + # update Dataset + retList = proxy.updateDataset(datasets,withLock,withCriteria,criteriaMap) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # delete dataset + def deleteDatasets(self,datasets): + # get DBproxy + proxy = self.proxyPool.getProxy() + retList = [] + # query Dataset + for dataset in datasets: + ret = proxy.deleteDataset(dataset) + retList.append(ret) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # query files with map + def queryFilesWithMap(self,map): + # get DBproxy + proxy = 
self.proxyPool.getProxy() + # query files + ret = proxy.queryFilesWithMap(map) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # count the number of files with map + def countFilesWithMap(self,map): + # get DBproxy + proxy = self.proxyPool.getProxy() + # query files + ret = proxy.countFilesWithMap(map) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # count the number of pending files + def countPendingFiles(self,pandaID,forInput=True): + # get DBproxy + proxy = self.proxyPool.getProxy() + # count files + ret = proxy.countPendingFiles(pandaID,forInput) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get serial number for dataset + def getSerialNumber(self,datasetname,definedFreshFlag=None): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get serial number + ret = proxy.getSerialNumber(datasetname,definedFreshFlag) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get serial number for group job + def getSerialNumberForGroupJob(self,name): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get serial number + ret = proxy.getSerialNumberForGroupJob(name) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # add metadata + def addMetadata(self,ids,metadataList): + # get DBproxy + proxy = self.proxyPool.getProxy() + # add metadata + index = 0 + retList = [] + for id in ids: + ret = proxy.addMetadata(id,metadataList[index]) + retList.append(ret) + index += 1 + # release proxy + self.proxyPool.putProxy(proxy) + # return + return retList + + + # add stdout + def addStdOut(self,id,stdout): + # get DBproxy + proxy = self.proxyPool.getProxy() + # add + ret = proxy.addStdOut(id,stdout) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # extract name from DN + def cleanUserID(self,id): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.cleanUserID(id) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # extract scope from dataset name + def extractScope(self,name): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.extractScope(name) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # change job priorities + def changeJobPriorities(self,newPrioMap): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.changeJobPriorities(newPrioMap) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get destinationDBlockToken for a dataset + def getDestTokens(self,dsname): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get token + ret = proxy.getDestTokens(dsname) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get destinationSE for a dataset + def getDestSE(self,dsname,fromArch=False): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get token + ret = proxy.getDestSE(dsname,fromArch) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get job statistics + def getJobStatistics(self,archived=False,predefined=False,workingGroup='',countryGroup='',jobType='',forAnal=None,minPriority=None): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get serial number + ret = proxy.getJobStatistics(archived,predefined,workingGroup,countryGroup,jobType,forAnal,minPriority) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get job statistics with label + 
def getJobStatisticsWithLabel(self,siteStr=''): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get serial number + ret = proxy.getJobStatisticsWithLabel(siteStr) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get job statistics for brokerage + def getJobStatisticsBrokerage(self,minPrio=None): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get stat + ret = proxy.getJobStatisticsBrokerage(minPrio) + # release proxy + self.proxyPool.putProxy(proxy) + # convert + conRet = ProcessGroups.countJobsPerGroup(ret) + # return + return conRet + + + # get job statistics for analysis brokerage + def getJobStatisticsAnalBrokerage(self,minPriority=None): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get stat + ret = proxy.getJobStatisticsAnalBrokerage(minPriority=minPriority) + # release proxy + self.proxyPool.putProxy(proxy) + # convert + conRet = ProcessGroups.countJobsPerGroupForAnal(ret) + # return + return conRet + + + # get the number of waiting jobs per site and user + def getJobStatisticsPerUserSite(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get stat + ret = proxy.getJobStatisticsPerUserSite() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get highest prio jobs + def getHighestPrioJobStat(self,perPG=False,useMorePG=False): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get stat + if not perPG: + ret = proxy.getHighestPrioJobStat() + else: + ret = proxy.getHighestPrioJobStatPerPG(useMorePG) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get queued analysis jobs at a site + def getQueuedAnalJobs(self,site,dn): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get stat + ret = proxy.getQueuedAnalJobs(site,dn) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get job statistics for ExtIF + def getJobStatisticsForExtIF(self,sourcetype=None): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get serial number + ret = proxy.getJobStatisticsForExtIF(sourcetype) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get job statistics for Bamboo + def getJobStatisticsForBamboo(self,useMorePG=False): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get serial number + ret = proxy.getJobStatisticsPerProcessingType(useMorePG) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get number of analysis jobs per user + def getNUserJobs(self,siteName,nJobs): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get number of analysis jobs per user + tmpRet = proxy.getNUserJobs(siteName,nJobs) + # release proxy + self.proxyPool.putProxy(proxy) + # get log proxy + proxy = self.proxyPool.getProxy() + # get Proxy Key + ret = {} + for userID,nJobs in tmpRet.iteritems(): + proxyKey = proxy.getProxyKey(userID) + if proxyKey != {}: + # add nJobs + proxyKey['nJobs'] = nJobs + # append + ret[userID] = proxyKey + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get number of activated analysis jobs + def getNAnalysisJobs(self,nProcesses): + # get DBproxy + proxy = self.proxyPool.getProxy() + # count + ret = proxy.getNAnalysisJobs(nProcesses) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # update transfer status for a dataset + def updateTransferStatus(self,datasetname,bitMap): + # get DBproxy + proxy = self.proxyPool.getProxy() + # update + ret = 
proxy.updateTransferStatus(datasetname,bitMap) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get CloudTask + def getCloudTask(self,tid): + # get DBproxy + proxy = self.proxyPool.getProxy() + # count + ret = proxy.getCloudTask(tid) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # set cloud to CloudTask + def setCloudTask(self,cloudTask): + # get DBproxy + proxy = self.proxyPool.getProxy() + # count + ret = proxy.setCloudTask(cloudTask) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # see CloudTask + def seeCloudTask(self,tid): + # get DBproxy + proxy = self.proxyPool.getProxy() + # count + ret = proxy.seeCloudTask(tid) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # set cloud to CloudTask by user + def setCloudTaskByUser(self,user,tid,cloud,status): + # get DBproxy + proxy = self.proxyPool.getProxy() + # count + ret = proxy.setCloudTaskByUser(user,tid,cloud,status) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # update site data + def updateSiteData(self,hostID,pilotRequests): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get serial number + ret = proxy.updateSiteData(hostID,pilotRequests) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get current site data + def getCurrentSiteData(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get serial number + ret = proxy.getCurrentSiteData() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # insert nRunning in site data + def insertnRunningInSiteData(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get serial number + ret = proxy.insertnRunningInSiteData() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get nRunning in site data + def getnRunningInSiteData(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get serial number + ret = proxy.getnRunningInSiteData() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get site list + def getSiteList(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get site info + ret = proxy.getSiteList() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get site info + def getSiteInfo(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get site info + ret = proxy.getSiteInfo() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get cloud list + def getCloudList(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get cloud list + ret = proxy.getCloudList() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # check sites with release/cache + def checkSitesWithRelease(self,sites,releases=None,caches=None,cmtConfig=None): + # get DBproxy + proxy = self.proxyPool.getProxy() + # check + ret = proxy.checkSitesWithRelease(sites,releases,caches,cmtConfig) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get sites with release/cache in cloud + def getSitesWithReleaseInCloud(self,cloud,releases=None,caches=None,validation=False): + # get DBproxy + proxy = self.proxyPool.getProxy() + # check + ret = proxy.getSitesWithReleaseInCloud(cloud,releases,caches,validation) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get list of cache prefix + def getCachePrefixes(self): + # get DBproxy + 
proxy = self.proxyPool.getProxy() + # check + ret = proxy.getCachePrefixes() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get pilot owners + def getPilotOwners(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get pilot owners + ret = proxy.getPilotOwners() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get allowed nodes + def getAllowedNodes(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getAllowedNodes() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get email address + def getEmailAddr(self,name): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getEmailAddr(name) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get client version + def getPandaClientVer(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getPandaClientVer() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # register proxy key + def registerProxyKey(self,params): + # get DBproxy + proxy = self.proxyPool.getProxy() + # register proxy key + ret = proxy.registerProxyKey(params) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get proxy key + def getProxyKey(self,dn): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get proxy key + ret = proxy.getProxyKey(dn) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # add account to siteaccess + def addSiteAccess(self,siteID,dn): + # get DBproxy + proxy = self.proxyPool.getProxy() + # add account to siteaccess + ret = proxy.addSiteAccess(siteID,dn) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # list site access + def listSiteAccess(self,siteid,dn,longFormat=False): + # get DBproxy + proxy = self.proxyPool.getProxy() + # list site access + ret = proxy.listSiteAccess(siteid,dn,longFormat) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # update site access + def updateSiteAccess(self,method,siteid,requesterDN,userName,attrValue): + # get DBproxy + proxy = self.proxyPool.getProxy() + # update site access + ret = proxy.updateSiteAccess(method,siteid,requesterDN,userName,attrValue) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # generate pilot token + def genPilotToken(self,schedulerhost,scheduleruser,schedulerid): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.genPilotToken(schedulerhost,scheduleruser,schedulerid) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # add files to memcached + def addFilesToMemcached(self,site,node,files): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.addFilesToMemcached(site,node,files) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # delete files from memcached + def deleteFilesFromMemcached(self,site,node,files): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.deleteFilesFromMemcached(site,node,files) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # flush memcached + def 
flushMemcached(self,site,node): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.flushMemcached(site,node) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + # check files with memcached + def checkFilesWithMemcached(self,site,node,files): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.checkFilesWithMemcached(site,node,files) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get list of scheduler users + def getListSchedUsers(self): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getListSchedUsers() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # query an SQL return Status + def querySQLS(self,sql,varMap,arraySize=1000): + # get DBproxy + proxy = self.proxyPool.getProxy() + # get + ret = proxy.querySQLS(sql,varMap,arraySize) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # check quota + def checkQuota(self,dn): + # query an SQL return Status + proxy = self.proxyPool.getProxy() + # get + ret = proxy.checkQuota(dn) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get JobID for user + def getJobIdUser(self,dn): + # query an SQL return Status + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getJobIdUser(dn) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get user subscriptions + def getUserSubscriptions(self,datasetName,timeRange): + # query an SQL return Status + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getUserSubscriptions(datasetName,timeRange) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get the number of user subscriptions + def getNumUserSubscriptions(self): + # query an SQL return Status + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getNumUserSubscriptions() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # add user subscriptions + def addUserSubscription(self,datasetName,dq2IDs): + # query an SQL return Status + proxy = self.proxyPool.getProxy() + # get + ret = proxy.addUserSubscription(datasetName,dq2IDs) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # increment counter for subscription + def incrementUsedCounterSubscription(self,datasetName): + # query an SQL return Status + proxy = self.proxyPool.getProxy() + # get + ret = proxy.incrementUsedCounterSubscription(datasetName) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get active datasets + def getActiveDatasets(self,computingSite,prodSourceLabel): + # query an SQL return Status + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getActiveDatasets(computingSite,prodSourceLabel) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # check status of all sub datasets to trigger Notifier + def checkDatasetStatusForNotifier(self,jobsetID,jobDefinitionID,prodUserName): + # query an SQL return Status + proxy = self.proxyPool.getProxy() + # get + ret = proxy.checkDatasetStatusForNotifier(jobsetID,jobDefinitionID,prodUserName) + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + + # get MoU share for T2 PD2P + def getMouShareForT2PD2P(self): + # query an SQL return Status + proxy = self.proxyPool.getProxy() + # get + ret = proxy.getMouShareForT2PD2P() + # release proxy + self.proxyPool.putProxy(proxy) + # return + return ret + + +# 
Singleton +taskBuffer = TaskBuffer() + diff --git a/current/pandaserver/taskbuffer/Utils.py b/current/pandaserver/taskbuffer/Utils.py new file mode 100755 index 000000000..e3ad1efe9 --- /dev/null +++ b/current/pandaserver/taskbuffer/Utils.py @@ -0,0 +1,512 @@ +""" +utility service + +""" +import os +import re +import sys +import zlib +import uuid +import time +import socket +import struct +import datetime +import jobdispatcher.Protocol as Protocol +import ErrorCode +from userinterface import Client +from config import panda_config + +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('Utils') + +# check if server is alive +def isAlive(req): + return "alive=yes" + + +# extract name from DN +def cleanUserID(id): + try: + up = re.compile('/(DC|O|OU|C|L)=[^\/]+') + username = up.sub('', id) + up2 = re.compile('/CN=[0-9]+') + username = up2.sub('', username) + up3 = re.compile(' [0-9]+') + username = up3.sub('', username) + up4 = re.compile('_[0-9]+') + username = up4.sub('', username) + username = username.replace('/CN=proxy','') + username = username.replace('/CN=limited proxy','') + username = username.replace('limited proxy','') + username = re.sub('/CN=Robot:[^/]+','',username) + pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)') + mat = pat.match(username) + if mat: + username = mat.group(2) + else: + username = username.replace('/CN=','') + if username.lower().find('/email') > 0: + username = username[:username.lower().find('/email')] + pat = re.compile('.*(limited.*proxy).*') + mat = pat.match(username) + if mat: + username = mat.group(1) + username = username.replace('(','') + username = username.replace(')','') + username = username.replace("'",'') + return username + except: + return id + + +# insert with rety +def insertWithRetryCassa(familyName,keyName,valMap,msgStr,nTry=3): + for iTry in range(nTry): + try: + familyName.insert(keyName,valMap) + except pycassa.MaximumRetryException,tmpE: + if iTry+1 < nTry: + _logger.debug("%s sleep %s/%s" % (msgStr,iTry,nTry)) + time.sleep(30) + else: + raise pycassa.MaximumRetryException,tmpE.value + else: + break + + +# touch in Cassandra +def touchFileCassa(filefamily,fileKeyName,timeNow): + try: + # get old timestamp + oldFileInfo = filefamily.get(fileKeyName) + except: + _logger.warning('cannot get old fileinfo for %s from Cassandra' % fileKeyName) + return False + try: + # update time in fileTable + for splitIdx in range(oldFileInfo['nSplit']): + tmpFileKeyName = fileKeyName + if splitIdx != 0: + tmpFileKeyName += '_%s' % splitIdx + insertWithRetryCassa(filefamily,tmpFileKeyName, + {'year' : timeNow.year, + 'month' : timeNow.month, + 'day' : timeNow.day, + 'hour' : timeNow.hour, + 'minute' : timeNow.minute, + 'second' : timeNow.second}, + 'touchFileCassa : %s' % fileKeyName + ) + return True + except: + errType,errValue = sys.exc_info()[:2] + errStr = "cannot touch %s due to %s %s" % (fileKeyName,errType,errValue) + _logger.error(errStr) + return False + + +# upload file +def putFile(req,file): + if not Protocol.isSecure(req): + return False + if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']: + return False + _logger.debug("putFile : start %s %s" % (req.subprocess_env['SSL_CLIENT_S_DN'],file.filename)) + # size check + fullSizeLimit = 768*1024*1024 + if not file.filename.startswith('sources.'): + noBuild = True + sizeLimit = 10*1024*1024 + else: + noBuild = False + sizeLimit = fullSizeLimit + # get file size + contentLength = 0 + try: + contentLength = 
long(req.headers_in["content-length"]) + except: + if req.headers_in.has_key("content-length"): + _logger.error("cannot get CL : %s" % req.headers_in["content-length"]) + else: + _logger.error("no CL") + _logger.debug("size %s" % contentLength) + if contentLength > sizeLimit: + errStr = "ERROR : Upload failure. Exceeded size limit %s>%s." % (contentLength,sizeLimit) + if noBuild: + errStr += " Please submit the job without --noBuild/--libDS since those options impose a tighter size limit" + else: + errStr += " Please remove redundant files from your workarea" + _logger.error(errStr) + _logger.debug("putFile : end") + return errStr + try: + fileFullPath = '%s/%s' % (panda_config.cache_dir,file.filename.split('/')[-1]) + # avoid overwriting + if os.path.exists(fileFullPath): + # touch + os.utime(fileFullPath,None) + # send error message + errStr = "ERROR : Cannot overwrite file" + _logger.debug('putFile : cannot overwrite file %s' % file.filename) + _logger.debug("putFile : end") + return errStr + # write + fo = open(fileFullPath,'wb') + fileContent = file.file.read() + fo.write(fileContent) + fo.close() + except: + errStr = "ERROR : Cannot write file" + _logger.error(errStr) + _logger.debug("putFile : end") + return errStr + # checksum + try: + # decode Footer + footer = fileContent[-8:] + checkSum,isize = struct.unpack("II",footer) + _logger.debug("CRC from gzip Footer %s" % checkSum) + except: + # calculate on the fly + """ + import zlib + checkSum = zlib.adler32(fileContent) & 0xFFFFFFFF + """ + # use None to avoid delay for now + checkSum = None + _logger.debug("CRC calculated %s" % checkSum) + # file size + fileSize = len(fileContent) + # user name + username = cleanUserID(req.subprocess_env['SSL_CLIENT_S_DN']) + _logger.debug("putFile : written dn=%s file=%s size=%s crc=%s" % \ + (username,file.filename,fileSize,checkSum)) + # put file info to DB + statClient,outClient = Client.insertSandboxFileInfo(username,file.filename, + fileSize,checkSum) + if statClient != 0 or outClient.startswith("ERROR"): + _logger.error("putFile : failed to put sandbox to DB with %s %s" % (statClient,outClient)) + #_logger.debug("putFile : end") + #return "ERROR : Cannot insert sandbox to DB" + else: + _logger.debug("putFile : inserted sandbox to DB with %s" % outClient) + # store to cassandra + if hasattr(panda_config,'cacheUseCassandra') and panda_config.cacheUseCassandra == True: + try: + # time-stamp + timeNow = datetime.datetime.utcnow() + creationTime = timeNow.strftime('%Y-%m-%d %H:%M:%S') + # user name + username = req.subprocess_env['SSL_CLIENT_S_DN'] + username = username.replace('/CN=proxy','') + username = username.replace('/CN=limited proxy','') + # file size + fileSize = len(fileContent) + # key + fileKeyName = file.filename.split('/')[-1] + sizeCheckSum = '%s:%s' % (fileSize,checkSum) + # insert to cassandra + import pycassa + pool = pycassa.ConnectionPool(panda_config.cacheKeySpace) + filefamily = pycassa.ColumnFamily(pool,panda_config.cacheFileTable) + # avoid overwriting + gotoNextCassa = True + if filefamily.get_count(fileKeyName) > 0: + # touch + touchFlag = touchFileCassa(filefamily,fileKeyName,timeNow) + if touchFlag: + gotoNextCassa = False + # send error message + errStr = "ERROR : Cannot overwrite file in Cassandra" + _logger.error(errStr) + if not panda_config.cacheIgnoreCassandraError: + _logger.debug("putFile : end") + return errStr + # check uniqueness with size and checksum + if gotoNextCassa: + try: + uniqExp = pycassa.index.create_index_expression('uniqID',sizeCheckSum) + 
userExp = pycassa.index.create_index_expression('user',username) + tmpClause = pycassa.index.create_index_clause([uniqExp,userExp]) + tmpResults = filefamily.get_indexed_slices(tmpClause,columns=['creationTime']) + for oldFileKeyName,tmpDict in tmpResults: + _logger.debug('The same size and chksum %s found in old:%s and new:%s' % \ + (sizeCheckSum,oldFileKeyName,fileKeyName)) + # touch + touchFlag = touchFileCassa(filefamily,oldFileKeyName,timeNow) + if touchFlag: + # make alias + _logger.debug('Making alias %s->%s' % (fileKeyName,oldFileKeyName)) + insertWithRetryCassa(filefamily,fileKeyName, + {'alias':oldFileKeyName, + 'creationTime':creationTime, + 'nSplit':0, + }, + 'putFile : make alias for %s' % file.filename + ) + # set time + touchFileCassa(filefamily,fileKeyName,timeNow) + _logger.debug("putFile : end") + return True + except: + gotoNextCassa = False + errType,errValue = sys.exc_info()[:2] + errStr = "cannot make alias for %s due to %s %s" % (fileKeyName,errType,errValue) + _logger.error(errStr) + if not panda_config.cacheIgnoreCassandraError: + _logger.debug("putFile : end") + return errStr + # insert new record + if gotoNextCassa: + splitIdx = 0 + splitSize = 5 * 1024 * 1024 + nSplit,tmpMod = divmod(len(fileContent),splitSize) + if tmpMod != 0: + nSplit += 1 + _logger.debug('Inserting %s with %s blocks' % (fileKeyName,nSplit)) + for splitIdx in range(nSplit): + # split to small chunks since cassandra is not good at large files + tmpFileContent = fileContent[splitSize*splitIdx:splitSize*(splitIdx+1)] + tmpFileKeyName = fileKeyName + tmpAttMap = {'file':tmpFileContent, + 'user':username, + 'creationTime':creationTime, + } + if splitIdx == 0: + tmpAttMap['size'] = fileSize + tmpAttMap['nSplit'] = nSplit + tmpAttMap['uniqID'] = sizeCheckSum + tmpAttMap['checkSum'] = str(checkSum) + else: + tmpFileKeyName += '_%s' % splitIdx + tmpAttMap['size'] = 0 + tmpAttMap['nSplit'] = 0 + # insert with retry + insertWithRetryCassa(filefamily,tmpFileKeyName,tmpAttMap, + 'putFile : insert %s' % file.filename) + # set time + touchFileCassa(filefamily,fileKeyName,timeNow) + except: + errType,errValue = sys.exc_info()[:2] + errStr = "cannot put %s into Cassandra due to %s %s" % (fileKeyName,errType,errValue) + _logger.error(errStr) + # send error message + errStr = "ERROR : " + errStr + if not panda_config.cacheIgnoreCassandraError: + _logger.debug("putFile : end") + return errStr + _logger.debug("putFile : %s end" % file.filename) + return True + + +# get file +def getFile(req,fileName): + _logger.debug("getFile : %s start" % fileName) + try: + # look into cassandra + import pycassa + pool = pycassa.ConnectionPool(panda_config.cacheKeySpace) + filefamily = pycassa.ColumnFamily(pool,panda_config.cacheFileTable) + fileInfo = filefamily.get(fileName) + # check alias + if fileInfo.has_key('alias') and fileInfo['alias'] != '': + realFileName = fileInfo['alias'] + fileInfo = filefamily.get(realFileName) + _logger.debug("getFile : %s use alias=%s" % (fileName,realFileName)) + else: + realFileName = fileName + # check cached file + hostKey = socket.gethostname() + '_cache' + if fileInfo.has_key(hostKey) and fileInfo[hostKey] != '': + _logger.debug("getFile : %s found cache=%s" % (fileName,fileInfo[hostKey])) + try: + fileFullPath = '%s%s' % (panda_config.cache_dir,fileInfo[hostKey]) + # touch + os.utime(fileFullPath,None) + _logger.debug("getFile : %s end" % fileName) + # return + return ErrorCode.EC_Redirect('/cache%s' % fileInfo[hostKey]) + except: + errtype,errvalue = sys.exc_info()[:2] + 
_logger.debug("getFile : %s failed to touch %s due to %s:%s" % (fileName,fileFullPath,errtype,errvalue)) + # write to cache file + fileRelPath = '/cassacache/%s' % str(uuid.uuid4()) + fileFullPath = '%s%s' % (panda_config.cache_dir,fileRelPath) + _logger.debug("getFile : %s write cache to %s" % (fileName,fileFullPath)) + fo = open(fileFullPath,'wb') + fo.write(fileInfo['file']) + if fileInfo['nSplit'] > 1: + for splitIdx in range(fileInfo['nSplit']): + if splitIdx == 0: + continue + fileInfo = filefamily.get(realFileName+'_%s' % splitIdx) + fo.write(fileInfo['file']) + fo.close() + # set cache name in DB + insertWithRetryCassa(filefamily,realFileName,{hostKey:fileRelPath}, + 'getFile : set cache for %s' % fileName) + _logger.debug("getFile : %s end" % fileName) + # return + return ErrorCode.EC_Redirect('/cache%s' % fileRelPath) + except pycassa.NotFoundException: + _logger.error("getFile : %s not found" % fileName) + return ErrorCode.EC_NotFound + except: + errtype,errvalue = sys.exc_info()[:2] + errStr = "getFile : %s %s for %s" % (errtype,errvalue,fileName) + _logger.error(errStr) + raise RuntimeError,errStr + + +# get event picking request +def putEventPickingRequest(req,runEventList='',eventPickDataType='',eventPickStreamName='', + eventPickDS='',eventPickAmiTag='',userDatasetName='',lockedBy='', + params='',inputFileList=''): + if not Protocol.isSecure(req): + return "ERROR : no HTTPS" + userName = req.subprocess_env['SSL_CLIENT_S_DN'] + creationTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S') + _logger.debug("putEventPickingRequest : %s start" % userName) + # size check + sizeLimit = 10*1024*1024 + # get total size + try: + contentLength = long(req.headers_in["content-length"]) + except: + errStr = "cannot get content-length from HTTP request." + _logger.error("putEventPickingRequest : " + errStr + " " + userName) + _logger.debug("putEventPickingRequest : %s end" % userName) + return "ERROR : " + errStr + _logger.debug("size %s" % contentLength) + if contentLength > sizeLimit: + errStr = "Too large run/event list. Exceeded size limit %s>%s." 
% (contentLength,sizeLimit) + _logger.error("putEventPickingRequest : " + errStr + " " + userName) + _logger.debug("putEventPickingRequest : %s end" % userName) + return "ERROR : " + errStr + try: + # make filename + evpFileName = '%s/evp.%s' % (panda_config.cache_dir,str(uuid.uuid4())) + _logger.debug("putEventPickingRequest : %s -> %s" % (userName,evpFileName)) + # write + fo = open(evpFileName,'wb') + fo.write("userName=%s\n" % userName) + fo.write("creationTime=%s\n" % creationTime) + fo.write("eventPickDataType=%s\n" % eventPickDataType) + fo.write("eventPickStreamName=%s\n" % eventPickStreamName) + fo.write("eventPickDS=%s\n" % eventPickDS) + fo.write("eventPickAmiTag=%s\n" % eventPickAmiTag) + fo.write("userDatasetName=%s\n" % userDatasetName) + fo.write("lockedBy=%s\n" % lockedBy) + fo.write("params=%s\n" % params) + fo.write("inputFileList=%s\n" % inputFileList) + for tmpLine in runEventList.split('\n'): + tmpItems = tmpLine.split() + if len(tmpItems) != 2: + continue + fo.write("runEvent=%s,%s\n" % tuple(tmpItems)) + fo.close() + except: + errType,errValue = sys.exc_info()[:2] + errStr = "cannot put request due to %s %s" % (errType,errValue) + _logger.error("putEventPickingRequest : " + errStr + " " + userName) + return "ERROR : " + errStr + _logger.debug("putEventPickingRequest : %s end" % userName) + return True + + +# delete file +def deleteFile(req,file): + if not Protocol.isSecure(req): + return 'False' + try: + # may be reused for rebrokreage + #os.remove('%s/%s' % (panda_config.cache_dir,file.split('/')[-1])) + return 'True' + except: + return 'False' + + +# touch file +def touchFile(req,filename): + if not Protocol.isSecure(req): + return 'False' + try: + os.utime('%s/%s' % (panda_config.cache_dir,filename.split('/')[-1]),None) + return 'True' + except: + errtype,errvalue = sys.exc_info()[:2] + _logger.error("touchFile : %s %s" % (errtype,errvalue)) + return 'False' + + +# get server name:port for SSL +def getServer(req): + return "%s:%s" % (panda_config.pserverhost,panda_config.pserverport) + + +# update stdout +def updateLog(req,file): + _logger.debug("updateLog : %s start" % file.filename) + # write to file + try: + # expand + extStr = zlib.decompress(file.file.read()) + # stdout name + logName = '%s/%s' % (panda_config.cache_dir,file.filename.split('/')[-1]) + # append + ft = open(logName,'wa') + ft.write(extStr) + ft.close() + except: + type, value, traceBack = sys.exc_info() + _logger.error("updateLog : %s %s" % (type,value)) + _logger.debug("updateLog : %s end" % file.filename) + return True + + +# fetch stdout +def fetchLog(req,logName,offset=0): + _logger.debug("fetchLog : %s start offset=%s" % (logName,offset)) + # put dummy char to avoid Internal Server Error + retStr = ' ' + try: + # stdout name + fullLogName = '%s/%s' % (panda_config.cache_dir,logName.split('/')[-1]) + # read + ft = open(fullLogName,'r') + ft.seek(long(offset)) + retStr += ft.read() + ft.close() + except: + type, value, traceBack = sys.exc_info() + _logger.error("fetchLog : %s %s" % (type,value)) + _logger.debug("fetchLog : %s end read=%s" % (logName,len(retStr))) + return retStr + + +# get VOMS attributes +def getVomsAttr(req): + vomsAttrs = [] + for tmpKey,tmpVal in req.subprocess_env.iteritems(): + # compact credentials + if tmpKey.startswith('GRST_CRED_'): + vomsAttrs.append('%s : %s\n' % (tmpKey,tmpVal)) + vomsAttrs.sort() + retStr = '' + for tmpStr in vomsAttrs: + retStr += tmpStr + return retStr + + +# get all attributes +def getAttr(req): + allAttrs = [] + for tmpKey,tmpVal in 
req.subprocess_env.iteritems(): + allAttrs.append('%s : %s\n' % (tmpKey,tmpVal)) + allAttrs.sort() + retStr = '' + for tmpStr in allAttrs: + retStr += tmpStr + return retStr diff --git a/current/pandaserver/taskbuffer/WrappedPickle.py b/current/pandaserver/taskbuffer/WrappedPickle.py new file mode 100644 index 000000000..a3e1fa12f --- /dev/null +++ b/current/pandaserver/taskbuffer/WrappedPickle.py @@ -0,0 +1,38 @@ +import sys +import StringIO +import cPickle as pickle + +# wrapper to avoid de-serializing unsafe objects +class WrappedPickle(object): + # allowed modules and classes + allowedModClass = { + 'copy_reg' : ['_reconstructor'], + '__builtin__' : ['object'], + 'datetime' : ['datetime'], + 'taskbuffer.JobSpec' : ['JobSpec'], + 'taskbuffer.FileSpec' : ['FileSpec'], + } + + # check module and class + @classmethod + def find_class(cls,module,name): + # check module + if not cls.allowedModClass.has_key(module): + raise pickle.UnpicklingError,'Attempting to import disallowed module %s' % module + # import module + __import__(module) + mod = sys.modules[module] + # check class + if not name in cls.allowedModClass[module]: + raise pickle.UnpicklingError,'Attempting to get disallowed class %s in %s' % (name,module) + klass = getattr(mod,name) + return klass + + # loads + @classmethod + def loads(cls,pickle_string): + pickle_obj = pickle.Unpickler(StringIO.StringIO(pickle_string)) + pickle_obj.find_global = cls.find_class + return pickle_obj.load() + + diff --git a/current/pandaserver/taskbuffer/__init__.py b/current/pandaserver/taskbuffer/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/current/pandaserver/test/XrdAna.py b/current/pandaserver/test/XrdAna.py new file mode 100755 index 000000000..37cea8021 --- /dev/null +++ b/current/pandaserver/test/XrdAna.py @@ -0,0 +1,59 @@ +import os +import re +import sys +import commands + +tarList = [] +realTime = [] +timeStamps = {} +for item in os.listdir('.'): + if item.endswith('log.tgz'): + commands.getoutput('tar xvfz %s' % item) + for dirItem in os.listdir('.'): + if os.path.isdir(dirItem): + foundTime = False + file = open('%s/pilot_child.stdout' % dirItem) + event = -1 + for line in file: + line = re.sub('\n','',line) + if line.startswith('AthenaEventLoopMgr INFO ===>>> start of event') \ + or line.startswith('Init Time :') or line.startswith('Wake Time :'): + #event = line.split()[-2] + event += 1 + match = re.search('Wake Time : \d{4}-\d{2}-\d{2} (\d{2}:\d{2}:\d{2}\.\d{3})',line) + if line.startswith('Exec Time :') or line.startswith('Init Time :') \ + or match != None: + if match != None: + timeVal = match.group(1) + else: + timeVal = line.split()[-1] + if not (int(event) < 10 or int(event) % 10 == 0): + continue + if not timeStamps.has_key(event): + timeStamps[event] = [] + timeStamps[event].append(timeVal) + if line.startswith('real'): + rT = re.sub('m',':',line.split()[-1]) + rT = re.sub('s','',rT) + realTime.append(rT) + file.close() + commands.getoutput('rm -rf %s' % dirItem) +outReal = open('real.txt','w') +for rT in realTime: + outReal.write('%s\n' % rT) +outReal.close() +nStamp = 0 +events = timeStamps.keys() +events.sort() +outStamp = open('stamp.txt','w') +for event in events: + stamps = timeStamps[event] + if nStamp == 0: + nStamp = len(stamps) + if nStamp != len(stamps): + print "ERROR : invalid nStamp %s %s" % (nStamp,len(stamps)) + str = '%s' % event + for s in stamps: + str += ',%s' % s + outStamp.write(str+'\n') +outStamp.close() diff --git a/current/pandaserver/test/XrdTest.py 
b/current/pandaserver/test/XrdTest.py new file mode 100755 index 000000000..f377f46de --- /dev/null +++ b/current/pandaserver/test/XrdTest.py @@ -0,0 +1,65 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = "ANALY_BNL_ATLAS_1" + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = 'BNL_SE' + +jobDefinitionID = int(time.time()) % 10000 + +jobList = [] + +for i in range(2): + job = JobSpec() + job.jobDefinitionID = jobDefinitionID + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) + job.AtlasRelease = 'Atlas-12.0.6' + job.homepackage = 'AnalysisTransforms' + job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/runAthenaXrd' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 3000 + job.assignedPriority = 3000 + job.prodSourceLabel = 'user' + job.computingSite = site + + file = FileSpec() + file.lfn = "%s.AANT._%05d.root" % (job.jobName,i) + file.destinationDBlock = job.destinationDBlock + file.destinationSE = job.destinationSE + file.dataset = job.destinationDBlock + file.type = 'output' + job.addFile(file) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.type = 'log' + job.addFile(fileOL) + + fileL = FileSpec() + fileL.dataset = 'user.TadashiMaeno.acas0003.lib._000134' + fileL.prodDBlock = fileL.dataset + fileL.lfn = 'user.TadashiMaeno.acas0003.lib._000134.lib.tgz' + fileL.type = 'input' + fileL.status = 'ready' + job.addFile(fileL) + + job.jobParameters=("-l %s " % fileL.lfn) + """-r run/ -j "%20AnalysisSkeleton_topOptions.py" -i "[]" -m "[]" -n "[]" -o "{'AANT': [('AANTupleStream', 'AANT', """ + ("""'%s')]}" -c""" % file.lfn) + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/activateBNL.py b/current/pandaserver/test/activateBNL.py new file mode 100755 index 000000000..55be46f85 --- /dev/null +++ b/current/pandaserver/test/activateBNL.py @@ -0,0 +1,63 @@ +import sys +import time +from dataservice.DDM import ddm +from taskbuffer.DBProxy import DBProxy +import userinterface.Client as Client +import urllib2,urllib,datetime,time +import jobscheduler.siteinfo +import jobscheduler.Site +import brokerage.broker_util + +# password
+from config import panda_config +passwd = panda_config.dbpasswd + +# instantiate DB proxies +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +# get PandaIDs from jobsDefined +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) +sql = "SELECT dispatchDBlock from jobsDefined4 WHERE jobStatus='assigned' AND prodSourceLabel='managed' " +sql += "AND (computingSite='BNL_ATLAS_1' OR computingSite='BNL_ATLAS_2') AND modificationTime<'%s' " +sql += "GROUP BY dispatchDBlock" + +res = proxyS.querySQL(sql % timeLimit.strftime('%Y-%m-%d %H:%M:%S')) + +# emulate DDM callbacks +for dispatchDBlock, in res: + print dispatchDBlock + time.sleep(5) + # get file list + status,out = ddm.dq2.main(['listFilesInDataset',dispatchDBlock]) + if status != 0 or out.startswith('Error'): + print out + continue + # make LFN list + lfns = [] + for line in out.split('\n'): + items = line.split() + if len(items) == 2: + lfns.append(items[1]) + # skip empty datasets + if len(lfns) == 0: + print "empty dataset" + continue + # get missing file + missLFNs = brokerage.broker_util.getMissLFNsFromLRC(lfns,jobscheduler.Site.KnownSite('BNL_ATLAS_2').getDQ2URL()) + if len(missLFNs) != 0: + print "some files are missing" + continue + # get VUID and creationdate + resvuid = proxyS.querySQL("SELECT vuid from Datasets WHERE name='%s'" % dispatchDBlock) + if len(resvuid) == 1: + vuid, = resvuid[0] + # make HTTP request + node={'vuid':vuid} + url=Client.baseURLSSL+'/datasetCompleted' + rdata=urllib.urlencode(node) + req=urllib2.Request(url) + # invoke callback + fd=urllib2.urlopen(req,rdata) + diff --git a/current/pandaserver/test/activateDefJobs.py b/current/pandaserver/test/activateDefJobs.py new file mode 100755 index 000000000..d2d826c55 --- /dev/null +++ b/current/pandaserver/test/activateDefJobs.py @@ -0,0 +1,36 @@ +from taskbuffer.DBProxy import DBProxy +import userinterface.Client as Client +import urllib2,urllib,datetime,time + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# time limit +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) + +# instantiate DB proxies +proxyS = DBProxy() +proxyS.connect('adbpro.usatlas.bnl.gov',passwd,'panda-developer','PandaDevDB') + +# get PandaIDs from jobsDefined +res = proxyS.querySQL("SELECT dispatchDBlock from jobsDefined4 GROUP BY dispatchDBlock") + +# emulate DDM callbacks +jobs=[] +for dispatchDBlock, in res: + # get VUID and creationdate + resvuid = proxyS.querySQL("SELECT vuid,creationdate from Datasets WHERE name='%s'" % dispatchDBlock) + if len(resvuid) == 1: + vuid,creationdate = resvuid[0] + # convert creatindate to datetime + creation_datetime = datetime.datetime(*time.strptime(creationdate,'%Y-%m-%d %H:%M:%S')[:6]) + if creation_datetime < timeLimit: + # make HTTP request + node={'vuid':vuid} + url=Client.baseURLSSL+'/datasetCompleted' + rdata=urllib.urlencode(node) + req=urllib2.Request(url) + # invoke callback + fd=urllib2.urlopen(req,rdata) + diff --git a/current/pandaserver/test/activateDefJobs.sh b/current/pandaserver/test/activateDefJobs.sh new file mode 100755 index 000000000..b2c1bc6bf --- /dev/null +++ b/current/pandaserver/test/activateDefJobs.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +BASEPATH=/usatlas/u/sm/prod +BINPATH=/usatlas/u/sm/latest + +# for python +export PATH=$BINPATH/python/bin:$PATH +export PYTHONPATH=$BASEPATH/panda:$PYTHONPATH + +python $BASEPATH/panda/test/activateDefJobs.py diff --git 
a/current/pandaserver/test/activateJobs.py b/current/pandaserver/test/activateJobs.py new file mode 100755 index 000000000..b33769d45 --- /dev/null +++ b/current/pandaserver/test/activateJobs.py @@ -0,0 +1,41 @@ +import sys + +from taskbuffer.DBProxy import DBProxy +import userinterface.Client as Client +import urllib2,urllib,datetime,time + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +if len(sys.argv) == 2: + startID = int(sys.argv[1]) + endID = startID +else: + startID = int(sys.argv[1]) + endID = int(sys.argv[2]) + if startID > endID: + print '%d is less than %d' % (endID,startID) + sys.exit(1) + +# instantiate DB proxies +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +# get PandaIDs from jobsDefined +res = proxyS.querySQL("SELECT dispatchDBlock from jobsDefined4 WHERE PandaID>=%s AND PandaID<=%s GROUP BY dispatchDBlock" % (startID,endID)) + +# emulate DDM callbacks +for dispatchDBlock, in res: + # get VUID and creationdate + resvuid = proxyS.querySQL("SELECT vuid from Datasets WHERE name='%s'" % dispatchDBlock) + if len(resvuid) == 1: + vuid, = resvuid[0] + # make HTTP request + node={'vuid':vuid} + url=Client.baseURLSSL+'/datasetCompleted' + rdata=urllib.urlencode(node) + req=urllib2.Request(url) + # invoke callback + fd=urllib2.urlopen(req,rdata) + diff --git a/current/pandaserver/test/activator.py b/current/pandaserver/test/activator.py new file mode 100755 index 000000000..8ad5292de --- /dev/null +++ b/current/pandaserver/test/activator.py @@ -0,0 +1,24 @@ +import os +import re +import sys +import time +import datetime +import commands +from taskbuffer.TaskBuffer import taskBuffer +from pandalogger.PandaLogger import PandaLogger +from dataservice.Activator import Activator + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# instantiate TB +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +if len(sys.argv) != 2: + print "datasetname is required" + +dataset = taskBuffer.queryDatasetWithMap({'name':sys.argv[1]}) +thr = Activator(taskBuffer,dataset) +thr.start() +thr.join() diff --git a/current/pandaserver/test/add.py b/current/pandaserver/test/add.py new file mode 100755 index 000000000..a3e1437e5 --- /dev/null +++ b/current/pandaserver/test/add.py @@ -0,0 +1,434 @@ +import os +import re +import sys +import time +import glob +import fcntl +import random +import datetime +import commands +import threading +from taskbuffer.TaskBuffer import taskBuffer +from pandalogger.PandaLogger import PandaLogger +from dataservice.Adder2 import Adder +from brokerage.SiteMapper import SiteMapper + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# logger +_logger = PandaLogger().getLogger('add') + +_logger.debug("===================== start =====================") + +# overall timeout value +overallTimeout = 20 + +# current minute +currentMinute = datetime.datetime.utcnow().minute + +# kill old process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName) + for line in out.split('\n'): + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] 
+ # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill process : %s %s" % (type,value)) + + +# instantiate TB +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +# instantiate sitemapper +aSiteMapper = SiteMapper(taskBuffer) + +# delete +_logger.debug("Del session") +status,retSel = taskBuffer.querySQLS("SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4",{}) +if retSel != None: + try: + maxID = retSel[0][0] + _logger.debug("maxID : %s" % maxID) + if maxID != None: + varMap = {} + varMap[':maxID'] = maxID + varMap[':jobStatus1'] = 'activated' + varMap[':jobStatus2'] = 'waiting' + varMap[':jobStatus3'] = 'failed' + varMap[':jobStatus4'] = 'cancelled' + status,retDel = taskBuffer.querySQLS("DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",varMap) + except: + pass + +# count # of getJob/updateJob in dispatcher's log +try: + # don't update when logrotate is running + timeNow = datetime.datetime.utcnow() + logRotateTime = timeNow.replace(hour=3,minute=2,second=0,microsecond=0) + if (timeNow > logRotateTime and (timeNow-logRotateTime) < datetime.timedelta(minutes=5)) or \ + (logRotateTime > timeNow and (logRotateTime-timeNow) < datetime.timedelta(minutes=5)): + _logger.debug("skip pilotCounts session for logrotate") + else: + # log filename + dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) + # check if tgz is required + com = 'head -1 %s' % dispLogName + lostat,loout = commands.getstatusoutput(com) + useLogTgz = True + if lostat == 0: + match = re.search('^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',loout) + if match != None: + startTime = datetime.datetime(*time.strptime(match.group(0),'%Y-%m-%d %H:%M:%S')[:6]) + # current log contains all info + if startTime datetime.timedelta(minutes=1) and \ + (timeNow - modTime) < datetime.timedelta(hours=1): + cSt,cOut = commands.getstatusoutput('ps aux | grep fork | grep -v PYTH') + # if no process is running for the file + if cSt == 0 and not tmpName in cOut: + nThr += 1 + thr = ForkThr(tmpName) + thr.start() + forkThrList.append(thr) + if nThr > maxThr: + break + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s %s" % (errType,errValue)) + + +# thread pool +class ThreadPool: + def __init__(self): + self.lock = threading.Lock() + self.list = [] + + def add(self,obj): + self.lock.acquire() + self.list.append(obj) + self.lock.release() + + def remove(self,obj): + self.lock.acquire() + self.list.remove(obj) + self.lock.release() + + def join(self): + self.lock.acquire() + thrlist = tuple(self.list) + self.lock.release() + for thr in thrlist: + thr.join() + +# thread to adder +class AdderThr (threading.Thread): + def __init__(self,lock,pool,taskBuffer,aSiteMapper,pandaID,jobStatus,fileName,ignoreError=True): + threading.Thread.__init__(self) + self.lock = lock + self.pool = pool + self.pool.add(self) + self.adder = Adder(taskBuffer,pandaID,"",jobStatus,xmlFile=fileName, + ignoreDDMError=ignoreError,joinCloser=True,addOutput=True, + siteMapper=aSiteMapper) + + def run(self): + self.lock.acquire() + 
self.adder.start() + self.adder.join() + self.pool.remove(self) + self.lock.release() + + +# get buildJobs in the holding state +holdingAna = [] +varMap = {} +varMap[':prodSourceLabel'] = 'panda' +varMap[':jobStatus'] = 'holding' +status,res = taskBuffer.querySQLS("SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus",varMap) +if res != None: + for id, in res: + holdingAna.append(id) +_logger.debug("holding Ana %s " % holdingAna) + +# add files +_logger.debug("Adder session") +timeNow = datetime.datetime.utcnow() +timeInt = datetime.datetime.utcnow() +dirName = panda_config.logdir +fileList = os.listdir(dirName) +fileList.sort() +# remove duplicated files +tmpList = [] +uMap = {} +for file in fileList: + match = re.search('^(\d+)_([^_]+)_.{36}(_\d+)*$',file) + if match != None: + fileName = '%s/%s' % (dirName,file) + id = match.group(1) + if uMap.has_key(id): + try: + os.remove(fileName) + except: + pass + else: + uMap[id] = fileName + if long(id) in holdingAna: + # give a priority to buildJobs + tmpList.insert(0,file) + else: + tmpList.append(file) +nFixed = 50 +randTmp = tmpList[nFixed:] +random.shuffle(randTmp) +fileList = tmpList[:nFixed] + randTmp + +# create thread pool and semaphore +adderLock = threading.Semaphore(3) +adderThreadPool = ThreadPool() + +# add +while len(fileList) != 0: + # time limit to aviod too many copyArchve running at the sametime + if (datetime.datetime.utcnow() - timeNow) > datetime.timedelta(minutes=overallTimeout): + _logger.debug("time over in Adder session") + break + # try to get Semaphore + adderLock.acquire() + # get fileList + if (datetime.datetime.utcnow() - timeInt) > datetime.timedelta(minutes=15): + timeInt = datetime.datetime.utcnow() + # get file + fileList = os.listdir(dirName) + fileList.sort() + # remove duplicated files + tmpList = [] + uMap = {} + for file in fileList: + match = re.search('^(\d+)_([^_]+)_.{36}(_\d+)*$',file) + if match != None: + fileName = '%s/%s' % (dirName,file) + id = match.group(1) + if uMap.has_key(id): + try: + os.remove(fileName) + except: + pass + else: + uMap[id] = fileName + if long(id) in holdingAna: + # give a priority to buildJob + tmpList.insert(0,file) + else: + tmpList.append(file) + fileList = tmpList + # choose a file + file = fileList.pop(0) + # release lock + adderLock.release() + # check format + match = re.search('^(\d+)_([^_]+)_.{36}(_\d+)*$',file) + if match != None: + fileName = '%s/%s' % (dirName,file) + if not os.path.exists(fileName): + continue + try: + modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(fileName))[:7])) + if (timeNow - modTime) > datetime.timedelta(hours=24): + # last chance + _logger.debug("Last Add File : %s" % fileName) + thr = AdderThr(adderLock,adderThreadPool,taskBuffer,aSiteMapper,match.group(1), + match.group(2),fileName,False) + thr.start() + elif (timeInt - modTime) > datetime.timedelta(minutes=3): + # add + _logger.debug("Add File : %s" % fileName) + thr = AdderThr(adderLock,adderThreadPool,taskBuffer,aSiteMapper,match.group(1), + match.group(2),fileName) + thr.start() + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s %s" % (type,value)) + +# join all threads +adderThreadPool.join() + +# join sender +mailSender.join() + +# join fork threads +for thr in forkThrList: + thr.join() + +_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/add.sh b/current/pandaserver/test/add.sh new file mode 100755 index 000000000..fed990df6 --- /dev/null 
+++ b/current/pandaserver/test/add.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# Panda home +export PANDA_HOME=/home/sm/prod + +# for python +export PYTHONPATH=$PANDA_HOME/panda:$PYTHONPATH + +python $PANDA_HOME/panda/test/add.py diff --git a/current/pandaserver/test/aho.xml b/current/pandaserver/test/aho.xml new file mode 100755 index 000000000..8bfd17333 --- /dev/null +++ b/current/pandaserver/test/aho.xml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/current/pandaserver/test/analysis.py b/current/pandaserver/test/analysis.py new file mode 100755 index 000000000..91f498431 --- /dev/null +++ b/current/pandaserver/test/analysis.py @@ -0,0 +1,78 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +jobList = [] +for i in range(2): + datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') + destName = 'ANALY_BNL_ATLAS_1' + + job = JobSpec() + job.jobDefinitionID = 1 + job.jobName = commands.getoutput('uuidgen') + job.AtlasRelease = 'Atlas-12.0.2' + job.homepackage = 'AnalysisTransforms' + job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/runAthena2' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 3000 + job.prodSourceLabel = 'user' + job.computingSite = site + job.prodDBlock = 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103' + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen') + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.type = 'log' + job.addFile(fileOL) + + fileOZ = FileSpec() + fileOZ.lfn = "AANT.%s.root" % commands.getoutput('uuidgen') + fileOZ.destinationDBlock = job.destinationDBlock + fileOZ.destinationSE = job.destinationSE + fileOZ.dataset = job.destinationDBlock + fileOZ.type = 'output' + job.addFile(fileOZ) + + files = [ + 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00001.pool.root.1', + 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00002.pool.root.1', + 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00003.pool.root.1', + ] + for lfn in files: + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + fileI.status = 'ready' + job.addFile(fileI) + + fileL = FileSpec() + fileL.dataset = 'user.TadashiMaeno.lib._000157' + fileL.prodDBlock = 'user.TadashiMaeno.lib._000157' + fileL.lfn = 'user.TadashiMaeno.lib._000157.lib.tgz' + fileL.type = 'input' + fileL.status = 'ready' + job.addFile(fileL) + + job.jobParameters=""" -l user.TadashiMaeno.lib._000157.lib.tgz -r run/ -j " AnalysisSkeleton_jobOptions.py" -i "['testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00001.pool.root.1', 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00002.pool.root.1', 'testIdeal_06.005001.pythia_minbias.recon.AOD.v12000103._00003.pool.root.1']" -o "{'AANT': [('AANTupleStream', 'AANT', '%s')]}" """ % fileOZ.lfn + + jobList.append(job) + + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/analyzeLog.py b/current/pandaserver/test/analyzeLog.py new file mode 100755 index 000000000..8b9314e5c --- /dev/null +++ b/current/pandaserver/test/analyzeLog.py @@ -0,0 +1,55 @@ +import re +from 
config import panda_config + +# analyze Setupper log +logSetupper = open('%s/panda-Setupper.log' % panda_config.logdir) +# extract subscriptions +mapSub = {} +mapDataset = {} +for line in logSetupper: + items = re.findall("'registerDatasetSubscription', '(.+_dis\d+)', '([^']+)'",line) + if len(items) != 0: + dataset = items[0][0] + siteID = items[0][1] + date = '%s %s' % tuple(re.split(' |,',line)[:2]) + if not mapSub.has_key(siteID): + mapSub[siteID] = [] + # append + mapSub[siteID].append(dataset) + mapDataset[dataset] = (date,False) +logSetupper.close() + +# analyze Activator log +logActivator = open('%s/panda-Activator.log' % panda_config.logdir) +# extract callbacks +for line in logActivator: + items = re.findall("start: (\S+_dis\d+)$",line) + if len(items) != 0: + dataset = items[0] + if dataset in mapDataset.keys(): + mapDataset[dataset] = mapDataset[dataset][:-1]+(True,) +logActivator.close() + +# print +for siteID in mapSub.keys(): + print "ID : %s" % siteID + nSucceed = 0 + failedSubs = [] + for dataset in mapSub[siteID]: + # succeeded + if mapDataset[dataset][-1:][0]: + nSucceed += 1 + # failed + else: + failedSubs.append((mapDataset[dataset][0],dataset)) + # statistics + print " Total:%d Succeeded:%d" % (len(mapSub[siteID]),nSucceed) + # not completed subscriptions + print " Not completed" + for item in failedSubs: + print " %s" % item[0] + print " %s" % item[1] + print + + + diff --git a/current/pandaserver/test/archivelogs.py b/current/pandaserver/test/archivelogs.py new file mode 100644 index 000000000..86d81d8ab --- /dev/null +++ b/current/pandaserver/test/archivelogs.py @@ -0,0 +1,45 @@ +import re +import os +import glob +import stat +import commands + +from config import panda_config + +srcDir = panda_config.logdir +dstDir = '/tmp/logbackup' + srcDir + +logFiles = glob.glob(srcDir+'/*log.1.gz') + +# check time stamp +for logFile in logFiles: + baseName = logFile.split('/')[-1] + print "log name : %s" % baseName + targetFile = "%s/%s" % (dstDir,baseName) + # already exists + if os.path.exists(targetFile) and \ + os.stat(logFile)[stat.ST_SIZE] == os.stat(targetFile)[stat.ST_SIZE]: + com = 'cmp %s %s' % (logFile,targetFile) + cmpSt,cmpOut = commands.getstatusoutput(com) + if cmpSt == 0: + print " -> skip : already exists" + continue + # increment + maxIndex = 60 + if os.path.exists(targetFile): + templateName = re.sub('1\.gz$','%s.gz',baseName) + for tmpIdx in range(1,maxIndex): + renameSrc = dstDir + '/' + (templateName % (maxIndex-tmpIdx)) + renameDst = dstDir + '/' + (templateName % (maxIndex-tmpIdx+1)) + if os.path.exists(renameSrc): + com = 'mv -f %s %s' % (renameSrc,renameDst) + print com + print commands.getoutput(com) + # copy + com = 'cp -fp %s %s' % (logFile,dstDir) + print com + print commands.getoutput(com) + +# touch to avoid tmpwatch +com = 'touch %s/*' % dstDir +print commands.getoutput(com) diff --git a/current/pandaserver/test/backupJobArch.py b/current/pandaserver/test/backupJobArch.py new file mode 100755 index 000000000..6ebc8dac2 --- /dev/null +++ b/current/pandaserver/test/backupJobArch.py @@ -0,0 +1,176 @@ +import os +import re +import sys +import time +import fcntl +import types +import shelve +import random +import datetime +import commands +import threading +import userinterface.Client as Client +from dataservice.DDM import ddm +from dataservice.DDM import dashBorad +from taskbuffer.OraDBProxy import DBProxy +from taskbuffer.TaskBuffer import taskBuffer +from pandalogger.PandaLogger import PandaLogger +from jobdispatcher.Watcher import Watcher +from 
brokerage.SiteMapper import SiteMapper +from dataservice.Adder import Adder +from dataservice.Finisher import Finisher +from dataservice.MailUtils import MailUtils +from taskbuffer import ProcessGroups +import brokerage.broker_util +import brokerage.broker +import taskbuffer.ErrorCode +import dataservice.DDM + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# logger +_logger = PandaLogger().getLogger('backupJobArch') + +_logger.debug("===================== start =====================") + +# memory checker +def _memoryCheck(str): + try: + proc_status = '/proc/%d/status' % os.getpid() + procfile = open(proc_status) + name = "" + vmSize = "" + vmRSS = "" + # extract Name,VmSize,VmRSS + for line in procfile: + if line.startswith("Name:"): + name = line.split()[-1] + continue + if line.startswith("VmSize:"): + vmSize = "" + for item in line.split()[1:]: + vmSize += item + continue + if line.startswith("VmRSS:"): + vmRSS = "" + for item in line.split()[1:]: + vmRSS += item + continue + procfile.close() + _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str)) + except: + type, value, traceBack = sys.exc_info() + _logger.error("memoryCheck() : %s %s" % (type,value)) + _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str)) + return + +_memoryCheck("start") + +# kill old dq2 process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('ps axo user,pid,lstart,args | grep dq2.clientapi | grep -v PYTHONPATH | grep -v grep') + for line in out.split('\n'): + if line == '': + continue + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old dq2 process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill dq2 process : %s %s" % (type,value)) + + +# kill old process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName) + for line in out.split('\n'): + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill process : %s %s" % (type,value)) + + +# instantiate TB +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +# instantiate sitemapper +siteMapper = SiteMapper(taskBuffer) + + +# table names +jobATableName = "ATLAS_PANDAARCH.jobsArchived" 
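The `_memoryCheck` helper used here (and again in copyArchive.py later in this patch) simply scrapes the `Name:`, `VmSize:` and `VmRSS:` fields out of `/proc/<pid>/status` before logging them. A compact standalone version of that probe is sketched below; it assumes a Linux-style `/proc`, and the function and variable names are illustrative only.

    import os

    def read_proc_status(pid=None):
        """Return Name/VmSize/VmRSS for a process from /proc/<pid>/status (Linux only)."""
        if pid is None:
            pid = os.getpid()
        wanted = {'Name': '', 'VmSize': '', 'VmRSS': ''}
        with open('/proc/%d/status' % pid) as status_file:
            for line in status_file:
                key = line.split(':', 1)[0]
                if key in wanted:
                    # e.g. "VmRSS:      123456 kB" -> "123456 kB"
                    wanted[key] = ' '.join(line.split()[1:])
        return wanted

    info = read_proc_status()
    print('%s VSZ=%s RSS=%s' % (info['Name'], info['VmSize'], info['VmRSS']))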
+filesATableName = "ATLAS_PANDAARCH.filesTable_ARCH" +paramATableName = "ATLAS_PANDAARCH.jobParamsTable_ARCH" +metaATableName = "ATLAS_PANDAARCH.metaTable_ARCH" + +# time limit +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=3) + +# copy +_logger.debug("get PandaIDs for Archive") +varMap = {} +varMap[':archivedFlag'] = 0 +status,res = taskBuffer.querySQLS("SELECT PandaID,modificationTime FROM ATLAS_PANDA.jobsArchived4 WHERE archivedFlag=:archivedFlag ORDER BY PandaID", + varMap,arraySize=1000000) +if res == None: + _logger.debug("total %s " % res) +else: + _logger.debug("total %s " % len(res)) + # copy + tmpIndex = 0 + tmpTotal = len(res) + random.shuffle(res) + for (id,srcEndTime) in res: + tmpIndex += 1 + try: + # copy + proxyS = taskBuffer.proxyPool.getProxy() + proxyS.insertJobSimpleUnread(id,srcEndTime) + taskBuffer.proxyPool.putProxy(proxyS) + _logger.debug("INSERT %s" % id) + if tmpIndex % 100 == 1: + _logger.debug(" copied %s/%s" % (tmpIndex,tmpTotal)) + except: + pass + +_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/banUser.py b/current/pandaserver/test/banUser.py new file mode 100644 index 000000000..6217a058c --- /dev/null +++ b/current/pandaserver/test/banUser.py @@ -0,0 +1,41 @@ +import sys +import time +import datetime +import optparse + +from taskbuffer.OraDBProxy import DBProxy +# password +from config import panda_config + +optP = optparse.OptionParser(conflict_handler="resolve") +optP.add_option('--user', action='store',dest='user', default=None,help='prodUserName') +optP.add_option('--unban',action='store_const',const=True,dest='unban',default=False,help='unban the user') + +options,args = optP.parse_args() + +if options.user == None: + print "--user= is required" + sys.exit(1) + +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +prodUserName = sys.argv[1] +import userinterface.Client as Client + +varMap = {} +varMap[':name'] = options.user +if options.unban: + varMap[':status'] = None +else: + varMap[':status'] = 'disabled' + +sql = "UPDATE ATLAS_PANDAMETA.users SET status=:status WHERE name=:name" + +status,res = proxyS.querySQLS(sql,varMap) +if res == None: + print "Failed with database error" +else: + print "%s rows updated" % res + + diff --git a/current/pandaserver/test/boostPrio.py b/current/pandaserver/test/boostPrio.py new file mode 100755 index 000000000..4bc13fda6 --- /dev/null +++ b/current/pandaserver/test/boostPrio.py @@ -0,0 +1,20 @@ +import time +import sys + +from taskbuffer.OraDBProxy import DBProxy + +# password +from config import panda_config + +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +varMap = {} +varMap[':prodSourceLabel'] = 'managed' +varMap[':taskID'] = sys.argv[1] +varMap[':prio'] = sys.argv[2] +sql = "UPDATE %s SET currentPriority=currentPriority+:prio WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID" +for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: + status,res = proxyS.querySQLS(sql % table,varMap) + + diff --git a/current/pandaserver/test/boostUser.py b/current/pandaserver/test/boostUser.py new file mode 100755 index 000000000..17f6c1483 --- /dev/null +++ b/current/pandaserver/test/boostUser.py @@ -0,0 +1,34 @@ +import sys +from config import panda_config + +# initialize cx_Oracle using dummy connection +from taskbuffer.Initializer import initializer 
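banUser.py above shows the house pattern for small admin CLIs in this patch: optparse for the flags, and Oracle bind variables (`:name`, `:status`) collected in a `varMap` dict that is handed to `querySQLS` together with a constant SQL string. A minimal sketch of just the option-parsing and statement-building part follows, with no database call; the stubbed argv value is illustrative.

    import optparse

    parser = optparse.OptionParser(conflict_handler="resolve")
    parser.add_option('--user', action='store', dest='user', default=None, help='prodUserName')
    parser.add_option('--unban', action='store_const', const=True, dest='unban', default=False,
                      help='re-enable the user instead of disabling them')
    options, args = parser.parse_args(['--user', 'Some User'])   # argv stub for illustration

    # build the bind-variable map once; the SQL text itself never changes
    varMap = {':name': options.user,
              ':status': None if options.unban else 'disabled'}
    sql = "UPDATE ATLAS_PANDAMETA.users SET status=:status WHERE name=:name"
    print(sql)
    print(varMap)
    # in banUser.py this (sql, varMap) pair is executed via DBProxy.querySQLS(sql, varMap)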
+initializer.init() + +from dataservice.Merger import Merger +from taskbuffer.TaskBuffer import taskBuffer +from pandalogger.PandaLogger import PandaLogger + + +# logger +_logger = PandaLogger().getLogger('boostUser') +_logger.debug("================= start ==================") + +# instantiate TB +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +user = sys.stdin.read() +user = user[:-1] + +sql = "UPDATE atlas_panda.%s set currentPriority=:prio where prodUserName=:uname and prodSourceLabel IN (:label1,:label2) and currentPriority<:prio" +varMap = {} +varMap[':prio'] = 4000 +varMap[':uname'] = user +varMap[':label1'] = 'user' +varMap[':label2'] = 'panda' +for table in ('jobsactive4','jobsdefined4'): + _logger.debug((sql % table) + str(varMap)) + ret = taskBuffer.querySQLS(sql % table,varMap) + _logger.debug('ret -> %s' % str(ret)) + +_logger.debug("================= end ==================") diff --git a/current/pandaserver/test/callbackDDM.py b/current/pandaserver/test/callbackDDM.py new file mode 100755 index 000000000..8564b272e --- /dev/null +++ b/current/pandaserver/test/callbackDDM.py @@ -0,0 +1,12 @@ +import sys +import urllib2,urllib + +node={} +node['vuid']=sys.argv[1] +url='https://gridui01.usatlas.bnl.gov:25443/server/panda/datasetCompleted' +rdata=urllib.urlencode(node) +req=urllib2.Request(url) +fd=urllib2.urlopen(req,rdata) +data = fd.read() + +print data diff --git a/current/pandaserver/test/checkGetJob.py b/current/pandaserver/test/checkGetJob.py new file mode 100644 index 000000000..79d1a0ecf --- /dev/null +++ b/current/pandaserver/test/checkGetJob.py @@ -0,0 +1,18 @@ +import sys +import re +import time +import datetime +timeLimit = datetime.timedelta(seconds=10) +f = open("../../httpd/logs/panda-DBProxy.log") +for line in f: + match = re.search('unlock',line) + if match: + timeM = re.search('^(\d+-\d+-\d+ \d+:\d+:\d+),(\d+)',line) + endTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%d %H:%M:%S')[:6]) + endTime = endTime.replace(microsecond = 1000*int(timeM.group(2))) + timeM = re.search('getJobs : (\d+-\d+-\d+T\d+:\d+:\d+)\.(\d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%dT%H:%M:%S')[:6]) + startTime = startTime.replace(microsecond = int(timeM.group(2))) + if (endTime-startTime) > timeLimit: + print '%s %s' % (startTime,endTime-startTime) +f.close() diff --git a/current/pandaserver/test/checkSetupper.py b/current/pandaserver/test/checkSetupper.py new file mode 100644 index 000000000..1f1dbfdd6 --- /dev/null +++ b/current/pandaserver/test/checkSetupper.py @@ -0,0 +1,31 @@ +import re +import time +import datetime +f = open("../../httpd/logs/panda-Setupper.log") +session = [] +timeList = {} +for line in f: + match = re.search('DEBUG (.*) startRun',line) + if match: + stamp = match.group(1) + stamp = stamp.strip() + session.append(stamp) + timeM = re.search('^(\d+-\d+-\d+ \d+:\d+:\d+),',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%d %H:%M:%S')[:6]) + timeList[stamp] = startTime + continue + match = re.search('DEBUG (.*) endRun',line) + if match: + stamp = match.group(1) + stamp = stamp.strip() + session.remove(stamp) + timeM = re.search('^(\d+-\d+-\d+ \d+:\d+:\d+),',line) + endTime = datetime.datetime(*time.strptime(timeM.group(1),'%Y-%m-%d %H:%M:%S')[:6]) + if timeList.has_key(stamp): + delta = endTime - timeList[stamp] + if delta > datetime.timedelta(minutes = 10): + print "Start : %s " % stamp + print " took -> %02d:%02d:%02d" % 
(delta.seconds/(60*60),(delta.seconds%(60*60))/60,delta.seconds%60) + continue + +print session diff --git a/current/pandaserver/test/cl_testEvgen.py b/current/pandaserver/test/cl_testEvgen.py new file mode 100644 index 000000000..137c496bd --- /dev/null +++ b/current/pandaserver/test/cl_testEvgen.py @@ -0,0 +1,70 @@ +# +# eg. python cl_testEvgen.py SACLAY FR +# +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)==2: + site = sys.argv[1] + cloud='CA' +elif len(sys.argv)==3: + site = sys.argv[1] + cloud=sys.argv[2] +else: + site = None + cloud = None + +datasetName = 'panda.destDB.%s_tid999991' % commands.getoutput('uuidgen') +taskid = 999989 + +jobList = [] + +for i in range(1): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) +# job.AtlasRelease = 'Atlas-12.0.6' +# job.homepackage = 'AtlasProduction/12.0.6.5' + job.AtlasRelease = 'Atlas-12.0.7' + job.homepackage = 'AtlasProduction/12.0.7.1' + + job.transformation = 'csc_evgen_trf.py' + job.destinationDBlock = datasetName +# job.destinationSE = destName +# job.cloud = 'CA' + job.cloud = cloud + job.taskID = taskid + job.currentPriority = 1000 + job.prodSourceLabel = 'test' +# job.prodSourceLabel = 'cloudtest' + job.computingSite = site + + file = FileSpec() + file.lfn = "%s.evgen.pool.root" % job.jobName + file.destinationDBlock = job.destinationDBlock + file.destinationSE = job.destinationSE + file.dataset = job.destinationDBlock + file.type = 'output' + job.addFile(file) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="8072 0 5000 1 DC3.008072.JimmyPhotonJet1.py %s NONE NONE NONE" % file.lfn + jobList.append(job) + +for i in range(1): + s,o = Client.submitJobs(jobList) + print "---------------------" + print s + for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/cl_testG4sim.py b/current/pandaserver/test/cl_testG4sim.py new file mode 100644 index 000000000..ed1db41ab --- /dev/null +++ b/current/pandaserver/test/cl_testG4sim.py @@ -0,0 +1,120 @@ +# +# eg. 
python cl_testG4sim.py SACLAY FR +# + +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)==2: + site = sys.argv[1] + cloud='CA' +elif len(sys.argv)==3: + site = sys.argv[1] + cloud=sys.argv[2] +else: + site = None + cloud = None + +datasetName = 'panda.rod2.%s_tid999990' % commands.getoutput('uuidgen') +#destName = 'BNL_SE' + +if cloud=='UK': + files = { + 'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01035.pool.root.1':'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541', + } +# or mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01174.pool.root.1, mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01035.pool.root.1 +elif cloud=='CA': + files={'EVNT.012303._00901.pool.root.1':'mc12.005001.pythia_minbias.evgen.EVNT.v12000701_tid012303',} +elif cloud=='FR': + files={'EVNT.010822._00007.pool.root.1':'mc12.006873.PythiaWH140lnugamgam.evgen.EVNT.v12000701_tid010822',} +elif cloud in ['ES']: + files={'EVNT.016869._00187.pool.root.1':'mc12.005001.pythia_minbias.evgen.EVNT.v12000601_tid016869',} +elif cloud in ['DE']: + files={'EVNT.016869._00177.pool.root.2':'mc12.005001.pythia_minbias.evgen.EVNT.v12000601_tid016869',} +else: + print 'Cloud not known: %s'%cloud + cloud = None + files={'EVNT.012303._00545.pool.root.1':'rod.cloudtest1'} + +# UK +#'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541._01035.pool.root.1':'mc12.005802.JF17_pythia_jet_filter.evgen.EVNT.v12003105_tid004541', +# CA +# 'EVNT.012303._00901.pool.root.1':'mc12.005001.pythia_minbias.evgen.EVNT.v12000701_tid012303', + + + +jobList = [] + +for i in range(1): + for lfn in files.keys(): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = commands.getoutput('uuidgen') + job.AtlasRelease = 'Atlas-12.0.7' + job.homepackage = 'AtlasProduction/12.0.7.1' +# Need different args too +# job.AtlasRelease = 'Atlas-13.0.30' +# job.homepackage = 'AtlasProduction/13.0.30.2' + job.transformation = 'csc_simul_trf.py' + job.destinationDBlock = datasetName + job.cloud = cloud + job.computingSite = site +# job.prodDBlock = 'mc12.005001.pythia_minbias.evgen.EVNT.v12000701_tid012303' + job.prodDBlock = files[lfn] + job.prodSourceLabel = 'test' +# job.prodSourceLabel = 'cloudtest' + job.currentPriority = 1001 + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileD = FileSpec() + fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' + fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' + fileD.lfn = 'DBRelease-3.1.1.tar.gz' + fileD.type = 'input' + job.addFile(fileD) + + + fileOE = FileSpec() + fileOE.lfn = "%s.HITS.pool.root" % job.jobName + fileOE.destinationDBlock = job.destinationDBlock + fileOE.destinationSE = job.destinationSE + fileOE.dataset = job.destinationDBlock + fileOE.type = 'output' + job.addFile(fileOE) + + fileOA = FileSpec() + fileOA.lfn = "%s.RDO.pool.root" % job.jobName + fileOA.destinationDBlock = job.destinationDBlock + fileOA.destinationSE = job.destinationSE + fileOA.dataset = job.destinationDBlock + fileOA.type = 'output' + job.addFile(fileOA) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + 
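+ # fileOL is the job's log tarball; like the HITS/RDO outputs above it is placed
+ # in the same destination dataset, but is flagged with type 'log' just below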
fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="%s %s %s 1 4000 153781 ATLAS-CSC-01-02-00 NONE %s" % (fileI.lfn,fileOE.lfn,fileOA.lfn,fileD.lfn) + + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/cl_testMXreco.py b/current/pandaserver/test/cl_testMXreco.py new file mode 100644 index 000000000..1fb770bee --- /dev/null +++ b/current/pandaserver/test/cl_testMXreco.py @@ -0,0 +1,112 @@ +# +# eg. python cl_testG4sim.py SACLAY FR +# + +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)==2: + site = sys.argv[1] + cloud='CA' +elif len(sys.argv)==3: + site = sys.argv[1] + cloud=sys.argv[2] +else: + site = None + cloud = None + +datasetName = 'panda.rod2.%s_tid999990' % commands.getoutput('uuidgen') +#destName = 'BNL_SE' + +files={'daq.m5_combined.0028997.Default.L1TT-b00000110.LB0000.SFO-1._0001.data':'M5.0028997.Default.L1TT-b00000110.RAW.v010803',} + +if cloud=='IT': + files={'daq.m5_combined.0029118.Default.L1TT-b00000010.LB0000.SFO-1._0001.data':'M5.0029118.Default.L1TT-b00000010.RAW.v010803'} + + +jobList = [] + +for i in range(1): + for lfn in files.keys(): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = commands.getoutput('uuidgen') + job.AtlasRelease = 'Atlas-13.0.35' + job.homepackage = 'AtlasPoint1/13.0.35.1' + job.transformation = 'csc_cosmics_trf.py' + job.destinationDBlock = datasetName + job.cloud = cloud + job.computingSite = site + job.prodDBlock = files[lfn] + job.prodSourceLabel = 'test' + job.currentPriority = 1001 + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileD = FileSpec() + fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' + fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' + fileD.lfn = 'DBRelease-3.1.1.tar.gz' + fileD.type = 'input' +# job.addFile(fileD) + + + fileO1 = FileSpec() + fileO1.lfn = "%s.ESD.pool.root" % job.jobName + fileO1.destinationDBlock = job.destinationDBlock + fileO1.destinationSE = job.destinationSE + fileO1.dataset = job.destinationDBlock + fileO1.type = 'output' + job.addFile(fileO1) + + fileO2 = FileSpec() + fileO2.lfn = "%s.ESDF.pool.root" % job.jobName + fileO2.destinationDBlock = job.destinationDBlock + fileO2.destinationSE = job.destinationSE + fileO2.dataset = job.destinationDBlock + fileO2.type = 'output' +# job.addFile(fileO2) + + fileO3 = FileSpec() + fileO3.lfn = "%s.NTUP.pool.root" % job.jobName + fileO3.destinationDBlock = job.destinationDBlock + fileO3.destinationSE = job.destinationSE + fileO3.dataset = job.destinationDBlock + fileO3.type = 'output' + job.addFile(fileO3) + + fileO4 = FileSpec() + fileO4.lfn = "%s.HIST.pool.root" % job.jobName + fileO4.destinationDBlock = job.destinationDBlock + fileO4.destinationSE = job.destinationSE + fileO4.dataset = job.destinationDBlock + fileO4.type = 'output' + job.addFile(fileO4) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="%s LAR_TILE_MUONS_LVL1C 10 %s NONE %s %s COMCOND-002-00 NONE" % 
(fileI.lfn,fileO1.lfn,fileO3.lfn,fileO4.lfn) + + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/cleanup.py b/current/pandaserver/test/cleanup.py new file mode 100644 index 000000000..a1b170d11 --- /dev/null +++ b/current/pandaserver/test/cleanup.py @@ -0,0 +1,10 @@ +import commands + +for patt in ['dq2.clientapi.cli.cliutil.getDQ2','forkSetupper.py','LFCclient.py']: + out = commands.getoutput('ps aux | grep python | grep %s' % patt) + for line in out.split('\n'): + items = line.split() + print items[1], items[8] + if items[8] in ['Sep04','Sep05']: + commands.getoutput('kill -9 %s' % items[1]) + diff --git a/current/pandaserver/test/closeDS.py b/current/pandaserver/test/closeDS.py new file mode 100755 index 000000000..4aeface4f --- /dev/null +++ b/current/pandaserver/test/closeDS.py @@ -0,0 +1,55 @@ +import os +import time +import datetime +import commands +import jobscheduler.Site +import userinterface.Client as Client +from dataservice.DDM import ddm +from taskbuffer.DBProxy import DBProxy +from taskbuffer.TaskBuffer import taskBuffer +from pandalogger.PandaLogger import PandaLogger +from jobdispatcher.Watcher import Watcher + +# logger +_logger = PandaLogger().getLogger('closeDS') + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# instantiate DB proxies +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +# time limit for dataset closing +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7) + +# close datasets +while True: + sql = "SELECT vuid,name,modificationdate FROM Datasets " + \ + "WHERE type='output' AND (status='running' OR status='created' OR status='defined') " + \ + "AND modificationdate<'%s' AND name REGEXP '_sub[[:digit:]]+$'" + ret,res = proxyS.querySQLS(sql % timeLimit.strftime('%Y-%m-%d %H:%M:%S')) + _logger.debug("# of dataset : %s" % len(res)) + if len(res) == 0: + break + for (vuid,name,modDate) in res: + _logger.debug("start %s %s" % (modDate,name)) + retF,resF = proxyS.querySQLS("SELECT lfn FROM filesTable4 WHERE destinationDBlock='%s'" % name) + if retF<0 or retF == None or retF!=len(resF): + _logger.error("SQL error") + else: + # no files in filesTable + if len(resF) == 0: + _logger.debug("freeze %s " % name) + status,out = ddm.dq2.main(['freezeDataset',name]) + if status != 0 or (out.find('Error') != -1 and out.find('DQ2 unknown dataset exception') == -1 \ + and out.find('DQ2 security exception') == -1): + _logger.error(out) + else: + proxyS.querySQL("UPDATE Datasets SET status='completed',modificationdate=UTC_TIMESTAMP() WHERE vuid='%s'" % vuid) + else: + _logger.debug("wait %s " % name) + proxyS.querySQL("UPDATE Datasets SET modificationdate=UTC_TIMESTAMP() WHERE vuid='%s'" % vuid) + _logger.debug("end %s " % name) + time.sleep(1) diff --git a/current/pandaserver/test/copyArchive.py b/current/pandaserver/test/copyArchive.py new file mode 100755 index 000000000..486e28673 --- /dev/null +++ b/current/pandaserver/test/copyArchive.py @@ -0,0 +1,1653 @@ +import os +import re +import sys +import time +import fcntl +import types +import shelve +import random +import datetime +import commands +import threading +import userinterface.Client as Client +from dataservice.DDM import ddm +from dataservice.DDM import dashBorad +from taskbuffer.OraDBProxy import DBProxy +from taskbuffer.TaskBuffer import taskBuffer +from 
pandalogger.PandaLogger import PandaLogger +from jobdispatcher.Watcher import Watcher +from brokerage.SiteMapper import SiteMapper +from dataservice.Adder import Adder +from dataservice.Finisher import Finisher +from dataservice.MailUtils import MailUtils +from taskbuffer import ProcessGroups +import brokerage.broker_util +import brokerage.broker +import taskbuffer.ErrorCode +import dataservice.DDM + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# logger +_logger = PandaLogger().getLogger('copyArchive') + +_logger.debug("===================== start =====================") + +# memory checker +def _memoryCheck(str): + try: + proc_status = '/proc/%d/status' % os.getpid() + procfile = open(proc_status) + name = "" + vmSize = "" + vmRSS = "" + # extract Name,VmSize,VmRSS + for line in procfile: + if line.startswith("Name:"): + name = line.split()[-1] + continue + if line.startswith("VmSize:"): + vmSize = "" + for item in line.split()[1:]: + vmSize += item + continue + if line.startswith("VmRSS:"): + vmRSS = "" + for item in line.split()[1:]: + vmRSS += item + continue + procfile.close() + _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str)) + except: + type, value, traceBack = sys.exc_info() + _logger.error("memoryCheck() : %s %s" % (type,value)) + _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str)) + return + +_memoryCheck("start") + +# kill old dq2 process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('ps axo user,pid,lstart,args | grep dq2.clientapi | grep -v PYTHONPATH | grep -v grep') + for line in out.split('\n'): + if line == '': + continue + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old dq2 process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill dq2 process : %s %s" % (type,value)) + + +# kill old process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=7) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName) + for line in out.split('\n'): + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill process : %s %s" % (type,value)) + + +# instantiate TB +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +# instantiate sitemapper +siteMapper = 
SiteMapper(taskBuffer) + + + +# send email for access requests +_logger.debug("Site Access") +try: + # get contact + contactAddr = {} + siteContactAddr = {} + sql = "SELECT name,email FROM ATLAS_PANDAMETA.cloudconfig" + status,res = taskBuffer.querySQLS(sql,{}) + for cloudName,cloudEmail in res: + contactAddr[cloudName] = cloudEmail + # get requests + sql = "SELECT pandaSite,status,dn FROM ATLAS_PANDAMETA.siteaccess WHERE status IN (:status1,:status2,:status3) " + sql += "ORDER BY pandaSite,status " + varMap = {} + varMap[':status1'] = 'requested' + varMap[':status2'] = 'tobeapproved' + varMap[':status3'] = 'toberejected' + status,res = taskBuffer.querySQLS(sql,varMap) + requestsInCloud = {} + mailUtils = MailUtils() + # loop over all requests + for pandaSite,reqStatus,userName in res: + cloud = siteMapper.getSite(pandaSite).cloud + _logger.debug("request : '%s' site=%s status=%s cloud=%s" % (userName,pandaSite,reqStatus,cloud)) + # send emails to user + if reqStatus in ['tobeapproved','toberejected']: + # set status + if reqStatus == 'tobeapproved': + newStatus = 'approved' + else: + newStatus = 'rejected' + # get mail address for user + userMailAddr = '' + sqlUM = "SELECT email FROM ATLAS_PANDAMETA.users WHERE name=:userName" + varMap = {} + varMap[':userName'] = userName + stUM,resUM = taskBuffer.querySQLS(sqlUM,varMap) + if resUM == None or len(resUM) == 0: + _logger.error("email address is unavailable for '%s'" % userName) + else: + userMailAddr = resUM[0][0] + # send + if not userMailAddr in ['',None,'None','notsend']: + _logger.debug("send update to %s" % userMailAddr) + retMail = mailUtils.sendSiteAccessUpdate(userMailAddr,newStatus,pandaSite) + _logger.debug(retMail) + # update database + sqlUp = "UPDATE ATLAS_PANDAMETA.siteaccess SET status=:newStatus " + sqlUp += "WHERE pandaSite=:pandaSite AND dn=:userName" + varMap = {} + varMap[':userName'] = userName + varMap[':newStatus'] = newStatus + varMap[':pandaSite'] = pandaSite + stUp,resUp = taskBuffer.querySQLS(sqlUp,varMap) + else: + # append cloud + if not requestsInCloud.has_key(cloud): + requestsInCloud[cloud] = {} + # append site + if not requestsInCloud[cloud].has_key(pandaSite): + requestsInCloud[cloud][pandaSite] = [] + # append user + requestsInCloud[cloud][pandaSite].append(userName) + # send requests to the cloud responsible + for cloud,requestsMap in requestsInCloud.iteritems(): + _logger.debug("requests for approval : cloud=%s" % cloud) + # send + if contactAddr.has_key(cloud) and (not contactAddr[cloud] in ['',None,'None']): + # get site contact + for pandaSite,userNames in requestsMap.iteritems(): + if not siteContactAddr.has_key(pandaSite): + varMap = {} + varMap[':siteid'] = pandaSite + sqlSite = "SELECT email FROM ATLAS_PANDAMETA.schedconfig WHERE siteid=:siteid AND rownum<=1" + status,res = taskBuffer.querySQLS(sqlSite,varMap) + siteContactAddr[pandaSite] = res[0][0] + # append + if not siteContactAddr[pandaSite] in ['',None,'None']: + contactAddr[cloud] += ',%s' % siteContactAddr[pandaSite] + # send + _logger.debug("send request to %s" % contactAddr[cloud]) + retMail = mailUtils.sendSiteAccessRequest(contactAddr[cloud],requestsMap,cloud) + _logger.debug(retMail) + # update database + if retMail: + sqlUp = "UPDATE ATLAS_PANDAMETA.siteaccess SET status=:newStatus " + sqlUp += "WHERE pandaSite=:pandaSite AND dn=:userName" + for pandaSite,userNames in requestsMap.iteritems(): + for userName in userNames: + varMap = {} + varMap[':userName'] = userName + varMap[':newStatus'] = 'inprocess' + varMap[':pandaSite'] = 
pandaSite + stUp,resUp = taskBuffer.querySQLS(sqlUp,varMap) + else: + _logger.error("contact email address is unavailable for %s" % cloud) +except: + type, value, traceBack = sys.exc_info() + _logger.error("Failed with %s %s" % (type,value)) +_logger.debug("Site Access : done") + + +# finalize failed jobs +_logger.debug("AnalFinalizer session") +try: + # get min PandaID for failed jobs in Active table + sql = "SELECT MIN(PandaID),prodUserName,jobDefinitionID FROM ATLAS_PANDA.jobsActive4 " + sql += "WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus " + sql += "GROUP BY prodUserName,jobDefinitionID " + varMap = {} + varMap[':jobStatus'] = 'failed' + varMap[':prodSourceLabel'] = 'user' + status,res = taskBuffer.querySQLS(sql,varMap) + if res != None: + # loop over all user/jobdefID + for pandaID,prodUserName,jobDefinitionID in res: + # check + _logger.debug("check finalization for %s %s" % (prodUserName,jobDefinitionID)) + sqlC = "SELECT COUNT(*) FROM ATLAS_PANDA.jobsActive4 " + sqlC += "WHERE prodSourceLabel=:prodSourceLabel AND prodUserName=:prodUserName " + sqlC += "AND jobDefinitionID=:jobDefinitionID AND jobStatus<>:jobStatus " + varMap = {} + varMap[':jobStatus'] = 'failed' + varMap[':prodSourceLabel'] = 'user' + varMap[':jobDefinitionID'] = jobDefinitionID + varMap[':prodUserName'] = prodUserName + statC,resC = taskBuffer.querySQLS(sqlC,varMap) + # finalize if there is no non-failed jobs + if resC != None: + _logger.debug("n of non-failed jobs : %s" % resC[0][0]) + if resC[0][0] == 0: + _logger.debug("finalize %s %s" % (prodUserName,jobDefinitionID)) + taskBuffer.finalizePendingJobs(prodUserName,jobDefinitionID) + else: + _logger.debug("n of non-failed jobs : None") +except: + errType,errValue = sys.exc_info()[:2] + _logger.error("AnalFinalizer failed with %s %s" % (errType,errValue)) + + +_memoryCheck("watcher") + +_logger.debug("Watcher session") +# check heartbeat for analysis jobs +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) +varMap = {} +varMap[':modificationTime'] = timeLimit +varMap[':prodSourceLabel1'] = 'panda' +varMap[':prodSourceLabel2'] = 'user' +varMap[':jobStatus1'] = 'running' +varMap[':jobStatus2'] = 'starting' +varMap[':jobStatus3'] = 'stagein' +varMap[':jobStatus4'] = 'stageout' +sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2) " +sql += "AND (jobStatus=:jobStatus1 OR jobStatus=:jobStatus2 OR jobStatus=:jobStatus3 OR jobStatus=:jobStatus4) AND modificationTime<:modificationTime" +status,res = taskBuffer.querySQLS(sql,varMap) +if res == None: + _logger.debug("# of Anal Watcher : %s" % res) +else: + _logger.debug("# of Anal Watcher : %s" % len(res)) + for (id,) in res: + _logger.debug("Anal Watcher %s" % id) + thr = Watcher(taskBuffer,id,single=True,sleepTime=60,sitemapper=siteMapper) + thr.start() + thr.join() + time.sleep(1) + +# check heartbeat for sent jobs +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) +varMap = {} +varMap[':jobStatus'] = 'sent' +varMap[':modificationTime'] = timeLimit +status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime", + varMap) +if res == None: + _logger.debug("# of Sent Watcher : %s" % res) +else: + _logger.debug("# of Sent Watcher : %s" % len(res)) + for (id,) in res: + _logger.debug("Sent Watcher %s" % id) + thr = Watcher(taskBuffer,id,single=True,sleepTime=30,sitemapper=siteMapper) + thr.start() + 
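+ # each stalled analysis job gets its own single-shot Watcher; joining right away
+ # keeps the heartbeat checks strictly sequential, one job at a time, 1 s apart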
thr.join() + time.sleep(1) + +# check heartbeat for 'holding' analysis/ddm jobs +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) +# get XMLs +xmlIDs = [] +xmlFiles = os.listdir(panda_config.logdir) +for file in xmlFiles: + match = re.search('^(\d+)_([^_]+)_.{36}$',file) + if match != None: + id = match.group(1) + xmlIDs.append(int(id)) +sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND (modificationTime<:modificationTime OR (endTime IS NOT NULL AND endTime<:endTime)) AND (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2 OR prodSourceLabel=:prodSourceLabel3) AND stateChangeTime != modificationTime" +varMap = {} +varMap[':modificationTime'] = timeLimit +varMap[':endTime'] = timeLimit +varMap[':jobStatus'] = 'holding' +varMap[':prodSourceLabel1'] = 'panda' +varMap[':prodSourceLabel2'] = 'user' +varMap[':prodSourceLabel3'] = 'ddm' +status,res = taskBuffer.querySQLS(sql,varMap) +if res == None: + _logger.debug("# of Holding Anal/DDM Watcher : %s" % res) +else: + _logger.debug("# of Holding Anal/DDM Watcher : %s - XMLs : %s" % (len(res),len(xmlIDs))) + for (id,) in res: + _logger.debug("Holding Anal/DDM Watcher %s" % id) + if int(id) in xmlIDs: + _logger.debug(" found XML -> skip %s" % id) + continue + thr = Watcher(taskBuffer,id,single=True,sleepTime=180,sitemapper=siteMapper) + thr.start() + thr.join() + time.sleep(1) + +# check heartbeat for production jobs +timeOutVal = 48 +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=timeOutVal) +sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND (modificationTime<:modificationTime OR (endTime IS NOT NULL AND endTime<:endTime))" +varMap = {} +varMap[':modificationTime'] = timeLimit +varMap[':endTime'] = timeLimit +varMap[':jobStatus'] = 'holding' +status,res = taskBuffer.querySQLS(sql,varMap) +if res == None: + _logger.debug("# of Holding Watcher : %s" % res) +else: + _logger.debug("# of Holding Watcher : %s" % len(res)) + for (id,) in res: + _logger.debug("Holding Watcher %s" % id) + thr = Watcher(taskBuffer,id,single=True,sleepTime=60*timeOutVal,sitemapper=siteMapper) + thr.start() + thr.join() + time.sleep(1) + +# check heartbeat for ddm jobs +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) +varMap = {} +varMap[':modificationTime'] = timeLimit +varMap[':jobStatus1'] = 'running' +varMap[':jobStatus2'] = 'starting' +varMap[':jobStatus3'] = 'stagein' +varMap[':jobStatus4'] = 'stageout' +varMap[':prodSourceLabel'] = 'ddm' +status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (jobStatus=:jobStatus1 OR jobStatus=:jobStatus2 OR jobStatus=:jobStatus3 OR jobStatus=:jobStatus4) AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel", + varMap) +if res == None: + _logger.debug("# of DDM Watcher : %s" % res) +else: + _logger.debug("# of DDM Watcher : %s" % len(res)) + for (id,) in res: + _logger.debug("DDM Watcher %s" % id) + thr = Watcher(taskBuffer,id,single=True,sleepTime=120,sitemapper=siteMapper) + thr.start() + thr.join() + time.sleep(1) + +# check heartbeat for production jobs +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=6) +varMap = {} +varMap[':modificationTime'] = timeLimit +varMap[':jobStatus1'] = 'running' +varMap[':jobStatus2'] = 'starting' +varMap[':jobStatus3'] = 'stagein' +varMap[':jobStatus4'] = 'stageout' +status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (jobStatus=:jobStatus1 OR 
jobStatus=:jobStatus2 OR jobStatus=:jobStatus3 OR jobStatus=:jobStatus4) AND modificationTime<:modificationTime", + varMap) +if res == None: + _logger.debug("# of General Watcher : %s" % res) +else: + _logger.debug("# of General Watcher : %s" % len(res)) + for (id,) in res: + _logger.debug("General Watcher %s" % id) + thr = Watcher(taskBuffer,id,single=True,sitemapper=siteMapper) + thr.start() + thr.join() + time.sleep(1) + +_memoryCheck("reassign") + +# kill long-waiting jobs in defined table +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7) +status,res = taskBuffer.querySQLS("SELECT PandaID,cloud,prodSourceLabel FROM ATLAS_PANDA.jobsDefined4 WHERE creationTime<:creationTime", + {':creationTime':timeLimit}) +jobs=[] +dashFileMap = {} +if res != None: + for pandaID,cloud,prodSourceLabel in res: + # collect PandaIDs + jobs.append(pandaID) + try: + if cloud in ['US']: + # skip US since file info is not available in dashboard + continue + # check file status for production + if not prodSourceLabel in ['managed']: + pass + else: + # get T1 site + tmpT1siteID = siteMapper.getCloud(cloud)['source'] + t1Site = siteMapper.getSite(tmpT1siteID) + # get pending input files + sqlF = "SELECT lfn,GUID,dispatchDBlock FROM ATLAS_PANDA.filesTable4 WHERE PandaID=:PandaID " + sqlF += "AND type=:type AND status=:status" + varMap = {} + varMap[':type'] = 'input' + varMap[':status'] = 'pending' + varMap[':PandaID'] = pandaID + stFile,resFile = taskBuffer.querySQLS(sqlF,varMap) + if resFile != None: + # loop over all files + for tmpLFN,tmpGUID,tmpDispDBlock in resFile: + # get file events + tmpDQ2IDs = t1Site.setokens.values() + tmpKey = (tuple(tmpDQ2IDs),tmpLFN) + if not dashFileMap.has_key(tmpKey): + _logger.debug('getting fileEvents for %s:%s' % tmpKey) + tmpStat,tmpOut = dashBorad.listFileEvents(tmpDQ2IDs,tmpGUID) + _logger.debug(tmpStat) + _logger.debug(tmpOut) + if tmpStat != 0: + # failed + continue + # convert to list + try: + exec "tmpEvens = %s" % tmpOut + if not isinstance(tmpEvens,types.ListType): + raise TypeError,"%s is not a list" % type(tmpEvens) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error(tmpOut) + _logger.error("invalid dashboard response %s %s" % (errType,errValue)) + continue + dashFileMap[tmpKey] = None + # look for latest events + tmpLastTime = '' + for tmpEvt in tmpEvens: + # pickup only DQ2 events + if not tmpEvt['tool_id'] in ['DQ2',None]: + continue + # pickup first one or newer + if tmpLastTime == '' or tmpLastTime < tmpEvt['modified_time']: + tmpLastTime = tmpEvt['modified_time'] + dashFileMap[tmpKey] = tmpEvt['state'] + _logger.debug('got status=%s' % dashFileMap[tmpKey]) + # update failed files + if dashFileMap[tmpKey] in ['FAILED_TRANSFER','BAD']: + sqlUpF = "UPDATE ATLAS_PANDA.filesTable4 SET status=:newStatus " + sqlUpF += "WHERE PandaID=:PandaID AND lfn=:lfn" + varMap = {} + varMap[':PandaID'] = pandaID + varMap[':lfn'] = tmpLFN + varMap[':newStatus'] = dashFileMap[tmpKey].lower() + taskBuffer.querySQLS(sqlUpF,varMap) + _logger.debug('set status=%s to %s:%s' % (dashFileMap[tmpKey],pandaID,tmpLFN)) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("dashboard access failed with %s %s" % (errType,errValue)) +if len(jobs): + _logger.debug("killJobs for Defined (%s)" % str(jobs)) + Client.killJobs(jobs,2) + +# kill long-waiting jobs in active table +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7) +varMap = {} +varMap[':jobStatus'] = 'activated' +varMap[':creationTime'] = timeLimit +status,res = 
taskBuffer.querySQLS("SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND creationTime<:creationTime", + varMap) +jobs=[] +if res != None: + for (id,) in res: + jobs.append(id) +if len(jobs): + _logger.debug("killJobs for Active (%s)" % str(jobs)) + Client.killJobs(jobs,2) + + +# kill long-waiting ddm jobs for dispatch +_logger.debug("kill PandaMovers") +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) +sql = "SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND transferType=:transferType AND creationTime<:creationTime" +varMap = {} +varMap[':creationTime'] = timeLimit +varMap[':prodSourceLabel'] = 'ddm' +varMap[':transferType'] = 'dis' +_logger.debug(sql+str(varMap)) +status,res = taskBuffer.querySQLS(sql,varMap) +_logger.debug(res) +jobs=[] +if res != None: + for (id,) in res: + jobs.append(id) +if len(jobs): + _logger.debug("kill DDM Jobs (%s)" % str(jobs)) + Client.killJobs(jobs,2) + +# kill hang-up movers +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) +sql = "SELECT PandaID from ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND transferType=:transferType AND jobStatus=:jobStatus AND startTime<:startTime" +varMap = {} +varMap[':startTime'] = timeLimit +varMap[':prodSourceLabel'] = 'ddm' +varMap[':transferType'] = 'dis' +varMap[':jobStatus'] = 'running' +_logger.debug(sql+str(varMap)) +status,res = taskBuffer.querySQLS(sql,varMap) +_logger.debug(res) +jobs = [] +movers = [] +if res != None: + for id, in res: + movers.append(id) + # get dispatch dataset + sql = 'SELECT name FROM ATLAS_PANDA.Datasets WHERE MoverID=:MoverID' + stDS,resDS = taskBuffer.querySQLS(sql,{':MoverID':id}) + if resDS != None: + disDS = resDS[0][0] + # get PandaIDs associated to the dis dataset + sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND dispatchDBlock=:dispatchDBlock" + varMap = {} + varMap[':jobStatus'] = 'assigned' + varMap[':dispatchDBlock'] = disDS + stP,resP = taskBuffer.querySQLS(sql,varMap) + if resP != None: + for pandaID, in resP: + jobs.append(pandaID) +# kill movers +if len(movers): + _logger.debug("kill hangup DDM Jobs (%s)" % str(movers)) + Client.killJobs(movers,2) +# reassign jobs +if len(jobs): + nJob = 100 + iJob = 0 + while iJob < len(jobs): + _logger.debug('reassignJobs for hangup movers (%s)' % jobs[iJob:iJob+nJob]) + taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) + iJob += nJob + +# reassign defined jobs in defined table +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=4) +# get PandaIDs +status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsDefined4",timeLimit,['defined'],['managed'],[],[],[]) +jobs=[] +if res != None: + for (id,) in res: + jobs.append(id) +# reassign +_logger.debug('reassignJobs for defined jobs -> #%s' % len(jobs)) +if len(jobs) > 0: + nJob = 100 + iJob = 0 + while iJob < len(jobs): + _logger.debug('reassignJobs for defined jobs (%s)' % jobs[iJob:iJob+nJob]) + taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) + _logger.debug('reassignJobs for defined jobs done %s' % jobs[iJob]) + iJob += nJob + + +# reassign when ratio of running/notrunning is too unbalanced +""" +_logger.debug("reassign Unbalanced") +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=4) +jobStat = {} +rangeValues = ['all','limit'] +for rangeVal in rangeValues: + for jobStatus in ['running','activated','assigned']: + table = 'ATLAS_PANDA.jobsDefined4' + if jobStatus in 
['running','activated']: + table = 'ATLAS_PANDA.jobsActive4' + varMap = {} + varMap[':prodSourceLabel'] = 'managed' + varMap[':jobStatus'] = jobStatus + if rangeVal == 'all': + sql = "SELECT computingSite,cloud,processingType,count(*) FROM %s WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus GROUP BY computingSite,cloud,processingType" \ + % table + else: + sql = "SELECT computingSite,cloud,processingType,count(*) FROM %s WHERE prodSourceLabel=:prodSourceLabel AND jobStatus=:jobStatus AND modificationTime<:modificationTime GROUP BY computingSite,cloud,processingType" \ + % table + varMap[':modificationTime'] = timeLimit + # execute + status,res = taskBuffer.querySQLS(sql,varMap) + if res != None: + for computingSite,cloud,processingType,nJobs in res: + # add cloud + if not jobStat.has_key(cloud): + jobStat[cloud] = {} + # add site + if not jobStat[cloud].has_key(computingSite): + jobStat[cloud][computingSite] = {} + # add range + if not jobStat[cloud][computingSite].has_key(rangeVal): + jobStat[cloud][computingSite][rangeVal] = {} + # add process group + tmpProGroup = ProcessGroups.getProcessGroup(processingType) + if not jobStat[cloud][computingSite][rangeVal].has_key(tmpProGroup): + jobStat[cloud][computingSite][rangeVal][tmpProGroup] = {} + # set status + tmpStatus = jobStatus + if jobStatus != 'running': + tmpStatus = 'notrunning' + # add status + if not jobStat[cloud][computingSite][rangeVal][tmpProGroup].has_key(tmpStatus): + jobStat[cloud][computingSite][rangeVal][tmpProGroup][tmpStatus] = 0 + # add + jobStat[cloud][computingSite][rangeVal][tmpProGroup][tmpStatus] += nJobs +# look for unbalanced site +for cloud,siteVal in jobStat.iteritems(): + jobsCloud = {} + ngSites = {} + t1Site = siteMapper.getCloud(cloud)['source'] + _logger.debug("Cloud:%s" % cloud) + for computingSite,jobVal in siteVal.iteritems(): + # set 0 + for rangeVal in rangeValues: + for pgType,pgList in ProcessGroups.processGroups: + # add range + if not jobVal.has_key(rangeVal): + jobVal[rangeVal] = {} + # add process group + if not jobVal[rangeVal].has_key(pgType): + jobVal[rangeVal][pgType] = {} + # number of jobs + if not jobVal[rangeVal][pgType].has_key('running'): + jobVal[rangeVal][pgType]['running'] = 0 + if not jobVal[rangeVal][pgType].has_key('notrunning'): + jobVal[rangeVal][pgType]['notrunning'] = 0 + # check ratio + for pgType,pgList in ProcessGroups.processGroups: + # add process group to map + if not jobsCloud.has_key(pgType): + jobsCloud[pgType] = {'notrunning':0,'running':0,'notfull':False} + if not ngSites.has_key(pgType): + ngSites[pgType] = [] + # get ratio + checkRatio = jobVal['limit'][pgType]['notrunning'] > jobVal['all'][pgType]['running']*4 + jobsCloud[pgType]['running'] += jobVal['all'][pgType]['running'] + jobsCloud[pgType]['notrunning'] += jobVal['all'][pgType]['notrunning'] + # check ratio + if computingSite in [t1Site,'NULL']: + # skip T1 + statStr = '--' + else: + if checkRatio: + statStr = 'NG' + ngSites[pgType].append(computingSite) + else: + statStr = '--' + # not full + if jobVal['all'][pgType]['notrunning'] < jobVal['all'][pgType]['running']*2: + jobsCloud[pgType]['notfull'] = True + _logger.debug("%20s : %14s %s n:%-5s r:%-5s" % (computingSite,pgType,statStr,jobVal['limit'][pgType]['notrunning'], + jobVal['all'][pgType]['running'])) + # reassign + for pgType,pgList in ProcessGroups.processGroups: + _logger.debug(" %14s : n:%-5s r:%-5s %s" % (pgType,jobsCloud[pgType]['notrunning'], + jobsCloud[pgType]['running'],jobsCloud[pgType]['notfull'])) + if 
jobsCloud[pgType]['notrunning'] > jobsCloud[pgType]['running']*2 and ngSites[pgType] != [] and jobsCloud[pgType]['notfull']: + # reassign except reprocessing + if pgType in ['reprocessing']: + continue + # get PandaIDs + jobs = [] + for table in ['ATLAS_PANDA.jobsDefined4','ATLAS_PANDA.jobsActive4']: + varMap = {} + varMap[':prodSourceLabel'] = 'managed' + varMap[':jobStatus1'] = 'activated' + varMap[':jobStatus2'] = 'assigned' + sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND jobStatus IN (:jobStatus1,:jobStatus2) AND computingSite IN (" % table + idxSite = 1 + for ngSite in ngSites[pgType]: + tmpSiteKey = ':computingSite%s' % idxSite + sql += "%s," % tmpSiteKey + varMap[tmpSiteKey] = ngSite + idxSite += 1 + sql = sql[:-1] + if pgList != []: + sql += ") AND processingType IN (" + tmpPgList = pgList + else: + sql += ") AND processingType NOT IN (" + # get types to be excluded + tmpPgList = [] + for tmpExPgType,tmpExPgList in ProcessGroups.processGroups: + if tmpExPgType != pgType: + tmpPgList += tmpExPgList + idxPro = 1 + for pgItem in tmpPgList: + tmpProKey = ':processingType%s' % idxPro + sql += "%s," % tmpProKey + varMap[tmpProKey] = pgItem + idxPro += 1 + sql = sql[:-1] + sql += ") AND modificationTime<:modificationTime ORDER BY PandaID" + varMap[':modificationTime'] = timeLimit + # execute + _logger.debug(sql+str(varMap)) + status,res = taskBuffer.querySQLS(sql,varMap) + if res != None: + # get IDs + for id, in res: + jobs.append(id) + # reassign + if jobs != []: + if len(jobs): + nJob = 100 + iJob = 0 + while iJob < len(jobs): + #_logger.debug('reassignJobs for Unbalanced (%s)' % jobs[iJob:iJob+nJob]) + #Client.reassignJobs(jobs[iJob:iJob+nJob]) + iJob += nJob + #time.sleep(60) +""" + + +# reassign long-waiting jobs in defined table +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) +status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsDefined4",timeLimit,[],['managed'],[],[],[]) +jobs=[] +if res != None: + for (id,) in res: + jobs.append(id) +# reassign +_logger.debug('reassignJobs for long in defined table -> #%s' % len(jobs)) +if len(jobs) > 0: + nJob = 100 + iJob = 0 + while iJob < len(jobs): + _logger.debug('reassignJobs for long in defined table (%s)' % jobs[iJob:iJob+nJob]) + taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) + iJob += nJob + + +# reassign too long-standing evgen/simul jobs with active state at T1 +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=6) +for tmpCloud in siteMapper.getCloudList(): + # ignore special clouds + if tmpCloud in ['CERN','OSG']: + continue + status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsActive4",timeLimit,['activated'],['managed'], + ['evgen','simul'],[siteMapper.getCloud(tmpCloud)['tier1']],[]) + jobs = [] + if res != None: + for (id,) in res: + jobs.append(id) + _logger.debug('reassignJobs for Active T1 evgensimul in %s -> #%s' % (tmpCloud,len(jobs))) + if len(jobs) != 0: + nJob = 100 + iJob = 0 + while iJob < len(jobs): + _logger.debug('reassignJobs for Active T1 evgensimul (%s)' % jobs[iJob:iJob+nJob]) + taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) + iJob += nJob + +# reassign too long-standing evgen/simul jobs with active state at T2 +try: + _logger.debug('looking for stuck T2s to reassign evgensimul') + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=6) + varMap = {} + varMap[':jobStatus1'] = 'activated' + varMap[':jobStatus2'] = 'running' + varMap[':prodSourceLabel'] = 'managed' + varMap[':processingType1'] = 
'evgen' + varMap[':processingType2'] = 'simul' + status,res = taskBuffer.querySQLS("SELECT cloud,computingSite,jobStatus,COUNT(*) FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus IN (:jobStatus1,:jobStatus2) AND prodSourceLabel=:prodSourceLabel AND processingType IN (:processingType1,:processingType2) GROUP BY cloud,computingSite,jobStatus", + varMap) + if res != None: + # get ratio of activated/running + siteStatData = {} + for tmpCloud,tmpComputingSite,tmpJobStatus,tmpCount in res: + # skip T1 + if tmpComputingSite == siteMapper.getCloud(tmpCloud)['tier1']: + continue + # add cloud/site + tmpKey = (tmpCloud,tmpComputingSite) + if not siteStatData.has_key(tmpKey): + siteStatData[tmpKey] = {'activated':0,'running':0} + # add the number of jobs + if siteStatData[tmpKey].has_key(tmpJobStatus): + siteStatData[tmpKey][tmpJobStatus] += tmpCount + # look for stuck site + stuckThr = 10 + stuckSites = [] + for tmpKey,tmpStatData in siteStatData.iteritems(): + if tmpStatData['running'] == 0 or \ + float(tmpStatData['activated'])/float(tmpStatData['running']) > stuckThr: + tmpCloud,tmpComputingSite = tmpKey + _logger.debug(' %s:%s %s/%s > %s' % (tmpCloud,tmpComputingSite,tmpStatData['activated'],tmpStatData['running'],stuckThr)) + # get stuck jobs + status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsActive4",timeLimit,['activated'],['managed'], + ['evgen','simul'],[tmpComputingSite],[tmpCloud]) + jobs = [] + if res != None: + for (id,) in res: + jobs.append(id) + _logger.debug('reassignJobs for Active T2 evgensimul %s:%s -> #%s' % (tmpCloud,tmpComputingSite,len(jobs))) + if len(jobs) > 0: + nJob = 100 + iJob = 0 + while iJob < len(jobs): + _logger.debug('reassignJobs for Active T2 evgensimul (%s)' % jobs[iJob:iJob+nJob]) + taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) + iJob += nJob +except: + errType,errValue = sys.exc_info()[:2] + _logger.error("failed to reassign T2 evgensimul with %s:%s" % (errType,errValue)) + +# reassign too long-standing jobs in active table +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=2) +status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsActive4",timeLimit,['activated'],['managed'],[],[],[]) +jobs = [] +if res != None: + for (id,) in res: + jobs.append(id) +_logger.debug('reassignJobs for long in active table -> #%s' % len(jobs)) +if len(jobs) != 0: + nJob = 100 + iJob = 0 + while iJob < len(jobs): + _logger.debug('reassignJobs for long in active table (%s)' % jobs[iJob:iJob+nJob]) + taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) + iJob += nJob + + +# kill too long-standing analysis jobs in active table +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=7) +varMap = {} +varMap[':prodSourceLabel1'] = 'test' +varMap[':prodSourceLabel2'] = 'panda' +varMap[':prodSourceLabel3'] = 'user' +varMap[':modificationTime'] = timeLimit +status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE (prodSourceLabel=:prodSourceLabel1 OR prodSourceLabel=:prodSourceLabel2 OR prodSourceLabel=:prodSourceLabel3) AND modificationTime<:modificationTime ORDER BY PandaID", + varMap) +jobs = [] +if res != None: + for (id,) in res: + jobs.append(id) +# kill +if len(jobs): + Client.killJobs(jobs,2) + _logger.debug("killJobs for Anal Active (%s)" % str(jobs)) + + +# kill too long pending jobs +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) +varMap = {} +varMap[':jobStatus'] = 'pending' +varMap[':creationTime'] = timeLimit +status,res = taskBuffer.querySQLS("SELECT PandaID FROM 
ATLAS_PANDA.jobsWaiting4 WHERE jobStatus=:jobStatus AND creationTime<:creationTime", + varMap) +jobs = [] +if res != None: + for (id,) in res: + jobs.append(id) +# kill +if len(jobs): + Client.killJobs(jobs,4) + _logger.debug("killJobs for Pending (%s)" % str(jobs)) + +# kill too long waiting jobs +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=1) +status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsWaiting4 WHERE creationTime<:creationTime", + {':creationTime':timeLimit}) +jobs = [] +if res != None: + for (id,) in res: + jobs.append(id) +# kill +if len(jobs): + Client.killJobs(jobs,4) + _logger.debug("killJobs for Waiting (%s)" % str(jobs)) + + +# reassign long waiting jobs +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) +status,res = taskBuffer.lockJobsForReassign("ATLAS_PANDA.jobsWaiting4",timeLimit,['waiting'],['managed'],[],[],[]) +jobs = [] +if res != None: + for (id,) in res: + jobs.append(id) +_logger.debug('reassignJobs for Waiting -> #%s' % len(jobs)) +if len(jobs): + nJob = 100 + iJob = 0 + while iJob < len(jobs): + _logger.debug('reassignJobs for Waiting (%s)' % jobs[iJob:iJob+nJob]) + taskBuffer.reassignJobs(jobs[iJob:iJob+nJob],joinThr=True) + iJob += nJob + +# kill too long running jobs +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=21) +status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE creationTime<:creationTime", + {':creationTime':timeLimit}) +jobs = [] +if res != None: + for (id,) in res: + jobs.append(id) +# kill +if len(jobs): + nJob = 100 + iJob = 0 + while iJob < len(jobs): + # set tobekill + _logger.debug('killJobs for Running (%s)' % jobs[iJob:iJob+nJob]) + Client.killJobs(jobs[iJob:iJob+nJob],2) + # run watcher + for id in jobs[iJob:iJob+nJob]: + thr = Watcher(taskBuffer,id,single=True,sitemapper=siteMapper,sleepTime=60*24*21) + thr.start() + thr.join() + time.sleep(1) + iJob += nJob + time.sleep(10) + +# kill too long waiting ddm jobs +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=5) +varMap = {} +varMap[':prodSourceLabel'] = 'ddm' +varMap[':creationTime'] = timeLimit +status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND creationTime<:creationTime", + varMap) +jobs = [] +if res != None: + for (id,) in res: + jobs.append(id) +# kill +if len(jobs): + Client.killJobs(jobs,2) + _logger.debug("killJobs for DDM (%s)" % str(jobs)) + +_memoryCheck("closing") + + +# delete old datasets +""" +timeLimitDnS = datetime.datetime.utcnow() - datetime.timedelta(days=60) +timeLimitTop = datetime.datetime.utcnow() - datetime.timedelta(days=90) +nDelDS = 1000 +for dsType,dsPrefix in [('','top'),]: + sql = 'DELETE FROM ATLAS_PANDA.Datasets ' + if dsType != '': + # dis or sub + sql += 'WHERE type=:type AND modificationdate<:modificationdate ' + sql += 'AND REGEXP_LIKE(name,:pattern) AND rownum <= %s' % nDelDS + varMap = {} + varMap[':modificationdate'] = timeLimitDnS + varMap[':type'] = dsType + varMap[':pattern'] = '_%s[[:digit:]]+$' % dsPrefix + else: + # top level datasets + sql+= 'WHERE modificationdate<:modificationdate AND rownum <= %s' % nDelDS + varMap = {} + varMap[':modificationdate'] = timeLimitTop + for i in range(100): + # del datasets + ret,res = taskBuffer.querySQLS(sql, varMap) + _logger.debug('# of %s datasets deleted: %s' % (dsPrefix,res)) + # no more datasets + if res != nDelDS: + break +""" + +# thread pool +class ThreadPool: + def __init__(self): + self.lock = 
threading.Lock() + self.list = [] + + def add(self,obj): + self.lock.acquire() + self.list.append(obj) + self.lock.release() + + def remove(self,obj): + self.lock.acquire() + self.list.remove(obj) + self.lock.release() + + def join(self): + self.lock.acquire() + thrlist = tuple(self.list) + self.lock.release() + for thr in thrlist: + thr.join() + + +# thread to close dataset +class CloserThr (threading.Thread): + def __init__(self,lock,proxyLock,datasets,pool): + threading.Thread.__init__(self) + self.datasets = datasets + self.lock = lock + self.proxyLock = proxyLock + self.pool = pool + self.pool.add(self) + + def run(self): + self.lock.acquire() + try: + # loop over all datasets + for vuid,name,modDate in self.datasets: + _logger.debug("Close %s %s" % (modDate,name)) + if not name.startswith('pandaddm_'): + status,out = ddm.DQ2.main('freezeDataset',name) + else: + status,out = 0,'' + if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: + _logger.error(out) + else: + self.proxyLock.acquire() + varMap = {} + varMap[':vuid'] = vuid + varMap[':status'] = 'completed' + taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", + varMap) + self.proxyLock.release() + if name.startswith('pandaddm_'): + continue + # count # of files + status,out = ddm.DQ2.main('getNumberOfFiles',name) + _logger.debug(out) + if status != 0: + _logger.error(out) + else: + try: + nFile = int(out) + _logger.debug(nFile) + if nFile == 0: + # erase dataset + _logger.debug('erase %s' % name) + status,out = ddm.DQ2.main('eraseDataset',name) + _logger.debug(out) + except: + pass + except: + pass + self.pool.remove(self) + self.lock.release() + +# close datasets +""" +timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) +timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3) +closeLock = threading.Semaphore(5) +closeProxyLock = threading.Lock() +closeThreadPool = ThreadPool() +while True: + # lock + closeLock.acquire() + # get datasets + closeProxyLock.acquire() + varMap = {} + varMap[':modificationdateU'] = timeLimitU + varMap[':modificationdateL'] = timeLimitL + varMap[':type'] = 'output' + varMap[':status'] = 'tobeclosed' + sqlQuery = 'type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= 500' + proxyS = taskBuffer.proxyPool.getProxy() + res = proxyS.getLockDatasets(sqlQuery,varMap) + taskBuffer.proxyPool.putProxy(proxyS) + if res == None: + _logger.debug('# of datasets to be closed: %s' % res) + else: + _logger.debug('# of datasets to be closed: %s' % len(res)) + if res==None or len(res)==0: + closeProxyLock.release() + closeLock.release() + break + # release + closeProxyLock.release() + closeLock.release() + # run thread + closerThr = CloserThr(closeLock,closeProxyLock,res,closeThreadPool) + closerThr.start() + +closeThreadPool.join() +""" + +# thread to freeze dataset +class Freezer (threading.Thread): + def __init__(self,lock,proxyLock,datasets,pool): + threading.Thread.__init__(self) + self.datasets = datasets + self.lock = lock + self.proxyLock = proxyLock + self.pool = pool + self.pool.add(self) + + def run(self): + self.lock.acquire() + try: + for vuid,name,modDate in self.datasets: + _logger.debug("start %s %s" % (modDate,name)) + self.proxyLock.acquire() + 
retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ lfn FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock", + {':destinationDBlock':name}) + self.proxyLock.release() + if retF<0: + _logger.error("SQL error") + else: + # no files in filesTable + if len(resF) == 0: + _logger.debug("freeze %s " % name) + if not name.startswith('pandaddm_'): + status,out = ddm.DQ2.main('freezeDataset',name) + else: + status,out = 0,'' + if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: + _logger.error(out) + else: + self.proxyLock.acquire() + varMap = {} + varMap[':vuid'] = vuid + varMap[':status'] = 'completed' + taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", + varMap) + self.proxyLock.release() + if name.startswith('pandaddm_'): + continue + # count # of files + status,out = ddm.DQ2.main('getNumberOfFiles',name) + _logger.debug(out) + if status != 0: + _logger.error(out) + else: + try: + nFile = int(out) + _logger.debug(nFile) + if nFile == 0: + # erase dataset + _logger.debug('erase %s' % name) + status,out = ddm.DQ2.main('eraseDataset',name) + _logger.debug(out) + except: + pass + else: + _logger.debug("wait %s " % name) + self.proxyLock.acquire() + taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid}) + self.proxyLock.release() + _logger.debug("end %s " % name) + except: + pass + self.pool.remove(self) + self.lock.release() + +# freeze dataset +""" +timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(days=4) +timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=14) +freezeLock = threading.Semaphore(5) +freezeProxyLock = threading.Lock() +freezeThreadPool = ThreadPool() +while True: + # lock + freezeLock.acquire() + # get datasets + sqlQuery = 'type=:type AND status IN (:status1,:status2,:status3) ' + \ + 'AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND REGEXP_LIKE(name,:pattern) AND rownum <= 500' + varMap = {} + varMap[':modificationdateU'] = timeLimitU + varMap[':modificationdateL'] = timeLimitL + varMap[':type'] = 'output' + varMap[':status1'] = 'running' + varMap[':status2'] = 'created' + varMap[':status3'] = 'defined' + varMap[':pattern'] = '_sub[[:digit:]]+$' + freezeProxyLock.acquire() + proxyS = taskBuffer.proxyPool.getProxy() + res = proxyS.getLockDatasets(sqlQuery,varMap) + taskBuffer.proxyPool.putProxy(proxyS) + if res == None: + _logger.debug('# of datasets to be frozen: %s' % res) + else: + _logger.debug('# of datasets to be frozen: %s' % len(res)) + if res==None or len(res)==0: + freezeProxyLock.release() + freezeLock.release() + break + freezeProxyLock.release() + # release + freezeLock.release() + # run freezer + freezer = Freezer(freezeLock,freezeProxyLock,res,freezeThreadPool) + freezer.start() + +freezeThreadPool.join() +""" + +# thread to delete dataset replica from T2 +class T2Cleaner (threading.Thread): + def __init__(self,lock,proxyLock,datasets,pool): + threading.Thread.__init__(self) + self.datasets = datasets + self.lock = lock + self.proxyLock = proxyLock + self.pool = pool + self.pool.add(self) + + def run(self): + self.lock.acquire() + try: + for vuid,name,modDate in self.datasets: + _logger.debug("cleanT2 %s" % name) + # get list 
of replicas + status,out = ddm.DQ2.main('listDatasetReplicas',name,0,None,False) + if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: + _logger.error(out) + continue + else: + try: + # convert res to map + exec "tmpRepSites = %s" % out + except: + tmpRepSites = {} + _logger.error("cannot convert to replica map") + _logger.error(out) + continue + # check cloud + cloudName = None + for tmpCloudName in siteMapper.getCloudList(): + t1SiteName = siteMapper.getCloud(tmpCloudName)['source'] + t1SiteDDMs = siteMapper.getSite(t1SiteName).setokens.values() + for tmpDDM in t1SiteDDMs: + if tmpRepSites.has_key(tmpDDM): + cloudName = tmpCloudName + break + # cloud is not found + if cloudName == None: + _logger.error("cannot find cloud for %s : %s" % (name,str(tmpRepSites))) + elif not cloudName in ['DE','CA','ES','FR','IT','NL','UK','TW','RU']: + # FIXME : test only EGEE for now + pass + else: + # look for T2 IDs + t2DDMs = [] + for tmpDDM in tmpRepSites.keys(): + if not tmpDDM in t1SiteDDMs and tmpDDM.endswith('_PRODDISK'): + t2DDMs.append(tmpDDM) + # delete replica for sub + if re.search('_sub\d+$',name) != None and t2DDMs != []: + _logger.debug(('deleteDatasetReplicas',name,t2DDMs)) + status,out = ddm.DQ2.main('deleteDatasetReplicas',name,t2DDMs) + if status != 0: + _logger.error(out) + if out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \ + out.find("No replica found") == -1: + continue + # update + self.proxyLock.acquire() + varMap = {} + varMap[':vuid'] = vuid + varMap[':status'] = 'completed' + taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", + varMap) + self.proxyLock.release() + _logger.debug("end %s " % name) + except: + pass + self.pool.remove(self) + self.lock.release() + +# delete dataset replica from T2 +""" +timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) +timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3) +t2cleanLock = threading.Semaphore(5) +t2cleanProxyLock = threading.Lock() +t2cleanThreadPool = ThreadPool() +while True: + # lock + t2cleanLock.acquire() + # get datasets + varMap = {} + varMap[':modificationdateU'] = timeLimitU + varMap[':modificationdateL'] = timeLimitL + varMap[':type'] = 'output' + varMap[':status'] = 'cleanup' + sqlQuery = 'type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= 500' + t2cleanProxyLock.acquire() + proxyS = taskBuffer.proxyPool.getProxy() + res = proxyS.getLockDatasets(sqlQuery,varMap) + taskBuffer.proxyPool.putProxy(proxyS) + if res == None: + _logger.debug('# of datasets to be deleted from T2: %s' % res) + else: + _logger.debug('# of datasets to be deleted from T2: %s' % len(res)) + if res==None or len(res)==0: + t2cleanProxyLock.release() + t2cleanLock.release() + break + t2cleanProxyLock.release() + # release + t2cleanLock.release() + # run t2cleanr + t2cleanr = T2Cleaner(t2cleanLock,t2cleanProxyLock,res,t2cleanThreadPool) + t2cleanr.start() + +t2cleanThreadPool.join() +""" + + +_memoryCheck("delete XML") + +# delete old files in DA cache +timeLimit = datetime.datetime.utcnow() - 
datetime.timedelta(days=7) +files = os.listdir(panda_config.cache_dir) +for file in files: + # skip special test file + if file == 'sources.72c48dc5-f055-43e5-a86e-4ae9f8ea3497.tar.gz': + continue + if file == 'sources.090f3f51-fc81-4e80-9749-a5e4b2bd58de.tar.gz': + continue + try: + # get timestamp + timestamp = datetime.datetime.fromtimestamp(os.stat('%s/%s' % (panda_config.cache_dir,file)).st_mtime) + # delete + if timestamp < timeLimit: + _logger.debug("delete %s " % file) + os.remove('%s/%s' % (panda_config.cache_dir,file)) + except: + pass + + +_memoryCheck("delete core") + +# delete core +dirName = '%s/..' % panda_config.logdir +for file in os.listdir(dirName): + if file.startswith('core.'): + _logger.debug("delete %s " % file) + try: + os.remove('%s/%s' % (dirName,file)) + except: + pass + + +_memoryCheck("finisher") + +# finish transferring jobs +""" +timeNow = datetime.datetime.utcnow() +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) +sql = 'SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime AND rownum<=20' +for ii in range(1000): + varMap = {} + varMap[':jobStatus'] = 'transferring' + varMap[':modificationTime'] = timeLimit + ret,res = taskBuffer.querySQLS(sql, varMap) + if res == None: + _logger.debug('# of jobs to be finished : %s' % res) + break + else: + _logger.debug('# of jobs to be finished : %s' % len(res)) + if len(res) == 0: + break + # get jobs from DB + ids = [] + for (id,) in res: + ids.append(id) + jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False) + # update modificationTime to lock jobs + for job in jobs: + if job != None and job.jobStatus != 'unknown': + taskBuffer.updateJobStatus(job.PandaID,job.jobStatus,{}) + upJobs = [] + finJobs = [] + for job in jobs: + if job == None or job.jobStatus == 'unknown': + continue + # use BNL by default + dq2URL = siteMapper.getSite('BNL_ATLAS_1').dq2url + dq2SE = [] + # get LFC and SEs + if job.prodSourceLabel == 'user' and not siteMapper.siteSpecList.has_key(job.destinationSE): + # using --destSE for analysis job to transfer output + try: + dq2URL = dataservice.DDM.toa.getLocalCatalog(job.destinationSE)[-1] + match = re.search('.+://([^:/]+):*\d*/*',dataservice.DDM.toa.getSiteProperty(job.destinationSE,'srm')[-1]) + if match != None: + dq2SE.append(match.group(1)) + except: + type, value, traceBack = sys.exc_info() + _logger.error('Failed to get DQ2/SE for %s with %s %s' % (job.PandaID,type,value)) + continue + elif siteMapper.checkCloud(job.cloud): + # normal production jobs + tmpDstID = siteMapper.getCloud(job.cloud)['dest'] + tmpDstSite = siteMapper.getSite(tmpDstID) + if not tmpDstSite.lfchost in [None,'']: + # LFC + dq2URL = 'lfc://'+tmpDstSite.lfchost+':/grid/atlas/' + if tmpDstSite.se != None: + for tmpDstSiteSE in tmpDstSite.se.split(','): + match = re.search('.+://([^:/]+):*\d*/*',tmpDstSiteSE) + if match != None: + dq2SE.append(match.group(1)) + else: + # LRC + dq2URL = tmpDstSite.dq2url + dq2SE = [] + # get LFN list + lfns = [] + guids = [] + nTokens = 0 + for file in job.Files: + # only output files are checked + if file.type == 'output' or file.type == 'log': + lfns.append(file.lfn) + guids.append(file.GUID) + nTokens += len(file.destinationDBlockToken.split(',')) + # get files in LRC + _logger.debug('Cloud:%s DQ2URL:%s' % (job.cloud,dq2URL)) + okFiles = brokerage.broker_util.getFilesFromLRC(lfns,dq2URL,guids,dq2SE,getPFN=True) + # count files + nOkTokens = 0 + for okLFN,okPFNs in 
okFiles.iteritems(): + nOkTokens += len(okPFNs) + # check all files are ready + _logger.debug(' nToken:%s nOkToken:%s' % (nTokens,nOkTokens)) + if nTokens <= nOkTokens: + _logger.debug('Finisher : Finish %s' % job.PandaID) + for file in job.Files: + if file.type == 'output' or file.type == 'log': + file.status = 'ready' + # append to run Finisher + finJobs.append(job) + else: + endTime = job.endTime + if endTime == 'NULL': + endTime = job.startTime + # priority-dependent timeout + tmpCloudSpec = siteMapper.getCloud(job.cloud) + if job.currentPriority >= 900 and (not job.prodSourceLabel in ['user']): + if tmpCloudSpec.has_key('transtimehi'): + timeOutValue = tmpCloudSpec['transtimehi'] + else: + timeOutValue = 1 + else: + if tmpCloudSpec.has_key('transtimelo'): + timeOutValue = tmpCloudSpec['transtimelo'] + else: + timeOutValue = 2 + # protection + if timeOutValue < 1: + timeOutValue = 1 + timeOut = timeNow - datetime.timedelta(days=timeOutValue) + _logger.debug(' Priority:%s Limit:%s End:%s' % (job.currentPriority,str(timeOut),str(endTime))) + if endTime < timeOut: + # timeout + _logger.debug('Finisher : Kill %s' % job.PandaID) + strMiss = '' + for lfn in lfns: + if not lfn in okFiles: + strMiss += ' %s' % lfn + job.jobStatus = 'failed' + job.taskBufferErrorCode = taskbuffer.ErrorCode.EC_Transfer + job.taskBufferErrorDiag = 'transfer timeout for '+strMiss + guidMap = {} + for file in job.Files: + # set file status + if file.status == 'transferring': + file.status = 'failed' + # collect GUIDs to delete files from _tid datasets + if file.type == 'output' or file.type == 'log': + if not guidMap.has_key(file.destinationDBlock): + guidMap[file.destinationDBlock] = [] + guidMap[file.destinationDBlock].append(file.GUID) + else: + # wait + _logger.debug('Finisher : Wait %s' % job.PandaID) + for lfn in lfns: + if not lfn in okFiles: + _logger.debug(' -> %s' % lfn) + upJobs.append(job) + # update + _logger.debug('updating ...') + taskBuffer.updateJobs(upJobs,False) + # run Finisher + for job in finJobs: + fThr = Finisher(taskBuffer,None,job) + fThr.start() + fThr.join() + _logger.debug('done') + time.sleep(random.randint(1,10)) +""" + +# update email DB +_memoryCheck("email") +_logger.debug("Update emails") + +# lock file +_lockGetMail = open(panda_config.lockfile_getMail, 'w') +# lock email DB +fcntl.flock(_lockGetMail.fileno(), fcntl.LOCK_EX) +# open email DB +pDB = shelve.open(panda_config.emailDB) +# read +mailMap = {} +for name,addr in pDB.iteritems(): + mailMap[name] = addr +# close DB +pDB.close() +# release file lock +fcntl.flock(_lockGetMail.fileno(), fcntl.LOCK_UN) +# set email address +for name,addr in mailMap.iteritems(): + # remove _ + name = re.sub('_$','',name) + status,res = taskBuffer.querySQLS("SELECT email FROM ATLAS_PANDAMETA.users WHERE name=:name",{':name':name}) + # failed or not found + if status == -1 or len(res) == 0: + _logger.error("%s not found in user DB" % name) + continue + # already set + if not res[0][0] in ['','None',None]: + continue + # update email + _logger.debug("set '%s' to %s" % (name,addr)) + status,res = taskBuffer.querySQLS("UPDATE ATLAS_PANDAMETA.users SET email=:addr WHERE name=:name",{':addr':addr,':name':name}) + +# reassign reprocessing jobs in defined table +_memoryCheck("repro") +class ReassginRepro (threading.Thread): + def __init__(self,taskBuffer,lock,jobs): + threading.Thread.__init__(self) + self.jobs = jobs + self.lock = lock + self.taskBuffer = taskBuffer + + def run(self): + self.lock.acquire() + try: + if len(self.jobs): + nJob = 100 + 
iJob = 0 + while iJob < len(self.jobs): + # reassign jobs one by one to break dis dataset formation + for job in self.jobs[iJob:iJob+nJob]: + _logger.debug('reassignJobs in Pepro (%s)' % [job]) + self.taskBuffer.reassignJobs([job],joinThr=True) + iJob += nJob + except: + pass + self.lock.release() + +reproLock = threading.Semaphore(3) + +nBunch = 20 +iBunch = 0 +timeLimitMod = datetime.datetime.utcnow() - datetime.timedelta(hours=8) +timeLimitCre = datetime.datetime.utcnow() - datetime.timedelta(hours=24) +firstFlag = True +while True: + # lock + reproLock.acquire() + # get jobs + varMap = {} + varMap[':jobStatus'] = 'assigned' + varMap[':prodSourceLabel'] = 'managed' + varMap[':modificationTime'] = timeLimitMod + varMap[':creationTime'] = timeLimitCre + varMap[':processingType'] = 'reprocessing' + if firstFlag: + firstFlag = False + status,res = taskBuffer.querySQLS("SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND prodSourceLabel=:prodSourceLabel AND modificationTime<:modificationTime AND creationTime<:creationTime AND processingType=:processingType ORDER BY PandaID", + varMap) + if res != None: + _logger.debug('total Repro for reassignJobs : %s' % len(res)) + # get a bunch + status,res = taskBuffer.querySQLS("SELECT * FROM (SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND prodSourceLabel=:prodSourceLabel AND modificationTime<:modificationTime AND creationTime<:creationTime AND processingType=:processingType ORDER BY PandaID) WHERE rownum<=%s" % nBunch, + varMap) + # escape + if res == None or len(res) == 0: + reproLock.release() + break + + # get IDs + jobs=[] + for id, in res: + jobs.append(id) + + # reassign + _logger.debug('reassignJobs for Pepro %s' % (iBunch*nBunch)) + # lock + currentTime = datetime.datetime.utcnow() + for jobID in jobs: + varMap = {} + varMap[':PandaID'] = jobID + varMap[':modificationTime'] = currentTime + status,res = taskBuffer.querySQLS("UPDATE ATLAS_PANDA.jobsDefined4 SET modificationTime=:modificationTime WHERE PandaID=:PandaID", + varMap) + reproLock.release() + # run thr + reproThr = ReassginRepro(taskBuffer,reproLock,jobs) + reproThr.start() + iBunch += 1 + +_memoryCheck("end") + +_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/copyArchive.sh b/current/pandaserver/test/copyArchive.sh new file mode 100755 index 000000000..220f01ee2 --- /dev/null +++ b/current/pandaserver/test/copyArchive.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# Panda home +export PANDA_HOME=/home/sm/prod + +# for python +export PYTHONPATH=$PANDA_HOME/panda:$PYTHONPATH + +python $PANDA_HOME/panda/test/copyArchive.py diff --git a/current/pandaserver/test/copyROOT.py b/current/pandaserver/test/copyROOT.py new file mode 100644 index 000000000..aeca74801 --- /dev/null +++ b/current/pandaserver/test/copyROOT.py @@ -0,0 +1,81 @@ +import os +import re +import sys +from ftplib import FTP +from pandalogger.PandaLogger import PandaLogger + +# supported architectures +targetArchs = ['Linux-slc5-gcc4.3.tar.gz','Linux-slc5_amd64-gcc4.3.tar.gz'] + +# destination dir +destDir = '/data/atlpan/srv/var/appdir' + +# logger +_logger = PandaLogger().getLogger('copyROOT') + +_logger.debug("===================== start =====================") + +try: + # login to root repository + ftp = FTP('root.cern.ch') + output = ftp.login() + _logger.debug(output) + output = ftp.cwd('root') + _logger.debug(output) + # get list + flist = ftp.nlst() + # loop over all files + for tmpFile in flist: + # skip RC + if 
re.search('-rc\d\.',tmpFile) != None: + continue + # check arch + supportedFlag = False + for tmpArch in targetArchs: + if tmpFile.endswith(tmpArch): + supportedFlag = True + break + # copy + if supportedFlag: + _logger.debug('start %s' % tmpFile) + dstFileName = '%s/%s' % (destDir,tmpFile) + # check local + if os.path.exists(dstFileName): + # get remote size + rsize = ftp.size(tmpFile) + if rsize == None: + _logger.debug(' cannot get remote size for %s' % tmpFile) + else: + # local size + lsize = os.path.getsize(dstFileName) + if lsize == rsize: + _logger.debug('skip since alredy there %s' % tmpFile) + continue + # copy + _logger.debug('copy %s' % tmpFile) + outFile = open(dstFileName,'wb') + ftp.retrbinary('RETR %s' % tmpFile,outFile.write) + outFile.close() + _logger.debug('end %s' % tmpFile) + # quit + output = ftp.quit() + _logger.debug(output) + # make list + listFileName = 'applist' + listFilePath = '%s/%s' % (destDir,listFileName) + listFile = open(listFilePath,'w') + for tmpFile in os.listdir(destDir): + # skip hidden files + if tmpFile.startswith('.'): + continue + # skip applist + if tmpFile == listFileName: + continue + listFile.write('%s\n' % tmpFile) + listFile.close() +except: + errType,errValue = sys.exc_info()[:2] + _logger.error("Failed with %s %s" % (errType,errValue)) + + +_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/createPandaSiteIDs.py b/current/pandaserver/test/createPandaSiteIDs.py new file mode 100644 index 000000000..34f8ef816 --- /dev/null +++ b/current/pandaserver/test/createPandaSiteIDs.py @@ -0,0 +1,54 @@ +import re +from jobscheduler import siteinfo + +from taskbuffer.DBProxy import DBProxy + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +proxyN = DBProxy() +proxyN.connect(panda_config.logdbhost,panda_config.logdbpasswd,panda_config.logdbuser,'PandaMetaDB') + +status,res = proxyN.querySQLS("SELECT nickname from schedconfig") + +nicknames = [] +for (nickname,) in res: + nicknames.append(nickname) + + +print "PandaSiteIDs = {" +sites = siteinfo.sites.keys() +sites.sort() +for site in sites: + vals = siteinfo.sites[site] + okFlag = vals[10] + fName = '' + sitePat = site + sitePat = re.sub('_PAUL','',sitePat) + sitePat = re.sub('_TEST$','',sitePat) + sitePat = re.sub('_test$','',sitePat) + sitePat = re.sub('^ANALY_LONG_','',sitePat) + sitePat = re.sub('^ANALY_','',sitePat) + if site == 'SLACXRD': + sitePat = 'slac' + if site == 'UVIC': + sitePat = 'VICTORIA' + if sitePat == 'LYON': + sitePat = 'IN2P3-CC-T2' + if sitePat == 'Purdue-ITB': + sitePat = 'Purdue' + if sitePat == "BNL": + sitePat = "BNL_ATLAS" + if sitePat == "RAL": + sitePat = "RAL-LCG2" + if sitePat == "SACLAY": + sitePat = "GRIF-DAPNIA" + for nickname in nicknames: + if re.search(sitePat,nickname,re.I) != None: + fName = nickname + if fName == '': + #print site, sitePat + fName = 'BNL_ATLAS_1-condor' + print " %-22s : {'nickname':'%s','status':'%s'}," % ("'"+site+"'",fName,okFlag) +print "}" diff --git a/current/pandaserver/test/datasetManager.py b/current/pandaserver/test/datasetManager.py new file mode 100644 index 000000000..b5f8b7189 --- /dev/null +++ b/current/pandaserver/test/datasetManager.py @@ -0,0 +1,924 @@ +import os +import re +import sys +import time +import fcntl +import types +import shelve +import random +import datetime +import commands +import threading +import userinterface.Client as Client +from dataservice.DDM import ddm +from dataservice.DDM import dashBorad +from taskbuffer.OraDBProxy 
import DBProxy +from taskbuffer.TaskBuffer import taskBuffer +from pandalogger.PandaLogger import PandaLogger +from jobdispatcher.Watcher import Watcher +from brokerage.SiteMapper import SiteMapper +from dataservice.Adder import Adder +from dataservice.Finisher import Finisher +from dataservice.MailUtils import MailUtils +from taskbuffer import ProcessGroups +import brokerage.broker_util +import brokerage.broker +import taskbuffer.ErrorCode +import dataservice.DDM + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# logger +_logger = PandaLogger().getLogger('datasetManager') + +_logger.debug("===================== start =====================") + +# use native DQ2 +ddm.useDirectDQ2() + +# memory checker +def _memoryCheck(str): + try: + proc_status = '/proc/%d/status' % os.getpid() + procfile = open(proc_status) + name = "" + vmSize = "" + vmRSS = "" + # extract Name,VmSize,VmRSS + for line in procfile: + if line.startswith("Name:"): + name = line.split()[-1] + continue + if line.startswith("VmSize:"): + vmSize = "" + for item in line.split()[1:]: + vmSize += item + continue + if line.startswith("VmRSS:"): + vmRSS = "" + for item in line.split()[1:]: + vmRSS += item + continue + procfile.close() + _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str)) + except: + type, value, traceBack = sys.exc_info() + _logger.error("memoryCheck() : %s %s" % (type,value)) + _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str)) + return + +_memoryCheck("start") + +# kill old dq2 process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('ps axo user,pid,lstart,args | grep dq2.clientapi | grep -v PYTHONPATH | grep -v grep') + for line in out.split('\n'): + if line == '': + continue + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old dq2 process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill dq2 process : %s %s" % (type,value)) + + +# kill old process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=7) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName) + for line in out.split('\n'): + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill process : %s %s" % (type,value)) + + +# instantiate TB 
+taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +# instantiate sitemapper +siteMapper = SiteMapper(taskBuffer) + + +# list with lock +class ListWithLock: + def __init__(self): + self.lock = threading.Lock() + self.list = [] + + def __contains__(self,item): + self.lock.acquire() + ret = self.list.__contains__(item) + self.lock.release() + return ret + + def append(self,item): + appended = False + self.lock.acquire() + if not item in self.list: + self.list.append(item) + appended = True + self.lock.release() + return appended + + +# list of dis datasets to be deleted +deletedDisList = ListWithLock() + + +# set tobedeleted to dis dataset +def setTobeDeletedToDis(subDsName): + try: + # only production sub datasets + if subDsName.startswith('user') or subDsName.startswith('group') or \ + subDsName.startswith('pandaddm_') or re.search('_sub\d+$',subDsName)==None: + return + # get _dis names with _sub + disNameList = taskBuffer.getAssociatedDisDatasets(subDsName) + _logger.debug("setTobeDeletedToDis : sub:%s has dis:%s" % (subDsName,str(disNameList))) + # loop over all _dis datasets + for tmpDisName in disNameList: + # try to append to locked list + if not deletedDisList.append(tmpDisName): + # another thread already took care of the _dis + continue + # get dataset + _logger.debug("setTobeDeletedToDis : try to get %s in DB" % tmpDisName) + tmpDS = taskBuffer.queryDatasetWithMap({'name':tmpDisName}) + if tmpDS == None: + _logger.error("setTobeDeletedToDis : cannot get %s in DB" % tmpDisName) + continue + # check status + if tmpDS.status in ['tobedeleted','deleted']: + _logger.debug("setTobeDeletedToDis : skip %s since status=%s" % (tmpDisName,tmpDS.status)) + continue + # check the number of failed jobs associated to the _dis + if tmpDS.currentfiles == 0: + # all succeeded + tmpDS.status = 'deleting' + excStatus = 'deleted' + else: + # some failed, to reduce the lifetime + tmpDS.status = 'shortening' + excStatus = 'shortened' + # update dataset + retU = taskBuffer.updateDatasets([tmpDS],withLock=True,withCriteria="status<>:crStatus", + criteriaMap={':crStatus':excStatus}) + _logger.debug("setTobeDeletedToDis : set %s to %s with %s" % (tmpDS.status,tmpDisName,str(retU))) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("setTobeDeletedToDis : %s %s %s" % (subDsName,errType,errValue)) + + +# thread pool +class ThreadPool: + def __init__(self): + self.lock = threading.Lock() + self.list = [] + + def add(self,obj): + self.lock.acquire() + self.list.append(obj) + self.lock.release() + + def remove(self,obj): + self.lock.acquire() + self.list.remove(obj) + self.lock.release() + + def join(self): + self.lock.acquire() + thrlist = tuple(self.list) + self.lock.release() + for thr in thrlist: + thr.join() + + +# thread to close dataset +class CloserThr (threading.Thread): + def __init__(self,lock,proxyLock,datasets,pool): + threading.Thread.__init__(self) + self.datasets = datasets + self.lock = lock + self.proxyLock = proxyLock + self.pool = pool + self.pool.add(self) + + def run(self): + self.lock.acquire() + try: + # loop over all datasets + for vuid,name,modDate in self.datasets: + _logger.debug("Close %s %s" % (modDate,name)) + if not name.startswith('pandaddm_'): + status,out = ddm.DQ2.main('freezeDataset',name) + else: + status,out = 0,'' + if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + out.find("DQDeletedDatasetException") == -1 and 
out.find("DQUnknownDatasetException") == -1: + _logger.error(out) + else: + self.proxyLock.acquire() + varMap = {} + varMap[':vuid'] = vuid + varMap[':newstatus'] = 'completed' + varMap[':oldstatus'] = 'tobeclosed' + taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:newstatus,modificationdate=CURRENT_DATE WHERE vuid=:vuid AND status=:oldstatus", + varMap) + self.proxyLock.release() + if name.startswith('pandaddm_'): + continue + # set tobedeleted to dis + setTobeDeletedToDis(name) + # count # of files + status,out = ddm.DQ2.main('getNumberOfFiles',name) + _logger.debug(out) + if status != 0: + _logger.error(out) + else: + try: + nFile = int(out) + _logger.debug(nFile) + if nFile == 0: + # erase dataset + _logger.debug('erase %s' % name) + status,out = ddm.DQ2.main('eraseDataset',name) + _logger.debug('OK with %s' % name) + except: + pass + except: + pass + self.pool.remove(self) + self.lock.release() + +# close datasets +_logger.debug("==== close datasets ====") +timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) +timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3) +closeLock = threading.Semaphore(5) +closeProxyLock = threading.Lock() +closeThreadPool = ThreadPool() +maxRows = 100000 +while True: + # lock + closeLock.acquire() + # get datasets + closeProxyLock.acquire() + varMap = {} + varMap[':modificationdateU'] = timeLimitU + varMap[':modificationdateL'] = timeLimitL + varMap[':type'] = 'output' + varMap[':status'] = 'tobeclosed' + sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows + proxyS = taskBuffer.proxyPool.getProxy() + res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60') + taskBuffer.proxyPool.putProxy(proxyS) + if res == None: + _logger.debug("# of datasets to be closed: %s" % res) + else: + _logger.debug("# of datasets to be closed: %s" % len(res)) + if res==None or len(res)==0: + closeProxyLock.release() + closeLock.release() + break + # release + closeProxyLock.release() + closeLock.release() + # run thread + iRows = 0 + nRows = 500 + while iRows < len(res): + closerThr = CloserThr(closeLock,closeProxyLock,res[iRows:iRows+nRows],closeThreadPool) + closerThr.start() + iRows += nRows + closeThreadPool.join() + if len(res) < maxRows: + break + + +# thread to freeze dataset +class Freezer (threading.Thread): + def __init__(self,lock,proxyLock,datasets,pool): + threading.Thread.__init__(self) + self.datasets = datasets + self.lock = lock + self.proxyLock = proxyLock + self.pool = pool + self.pool.add(self) + + def run(self): + self.lock.acquire() + try: + for vuid,name,modDate in self.datasets: + _logger.debug("start %s %s" % (modDate,name)) + self.proxyLock.acquire() + retF,resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ lfn FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND NOT status IN (:status1,:status2,:status3)", + {':destinationDBlock':name,':status1':'ready',':status2':'failed',':status3':'skipped'}) + self.proxyLock.release() + if retF<0: + _logger.error("SQL error") + else: + # no files in filesTable + if len(resF) == 0: + _logger.debug("freeze %s " % name) + if not name.startswith('pandaddm_'): + status,out = ddm.DQ2.main('freezeDataset',name) + else: + status,out = 0,'' + if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + 
out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: + _logger.error(out) + else: + self.proxyLock.acquire() + varMap = {} + varMap[':vuid'] = vuid + varMap[':status'] = 'completed' + taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", + varMap) + self.proxyLock.release() + if name.startswith('pandaddm_'): + continue + # set tobedeleted to dis + setTobeDeletedToDis(name) + # count # of files + status,out = ddm.DQ2.main('getNumberOfFiles',name) + _logger.debug(out) + if status != 0: + _logger.error(out) + else: + try: + nFile = int(out) + _logger.debug(nFile) + if nFile == 0: + # erase dataset + _logger.debug('erase %s' % name) + status,out = ddm.DQ2.main('eraseDataset',name) + _logger.debug('OK with %s' % name) + except: + pass + else: + _logger.debug("wait %s " % name) + self.proxyLock.acquire() + taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid':vuid}) + self.proxyLock.release() + _logger.debug("end %s " % name) + except: + pass + self.pool.remove(self) + self.lock.release() + +# freeze dataset +_logger.debug("==== freeze datasets ====") +timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(days=4) +timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=14) +freezeLock = threading.Semaphore(5) +freezeProxyLock = threading.Lock() +freezeThreadPool = ThreadPool() +maxRows = 100000 +while True: + # lock + freezeLock.acquire() + # get datasets + sqlQuery = "type=:type AND status IN (:status1,:status2,:status3,:status4) " + \ + "AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND subType=:subType AND rownum <= %s" % maxRows + varMap = {} + varMap[':modificationdateU'] = timeLimitU + varMap[':modificationdateL'] = timeLimitL + varMap[':type'] = 'output' + varMap[':status1'] = 'running' + varMap[':status2'] = 'created' + varMap[':status3'] = 'defined' + varMap[':status4'] = 'locked' + varMap[':subType'] = 'sub' + freezeProxyLock.acquire() + proxyS = taskBuffer.proxyPool.getProxy() + res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60') + taskBuffer.proxyPool.putProxy(proxyS) + if res == None: + _logger.debug("# of datasets to be frozen: %s" % res) + else: + _logger.debug("# of datasets to be frozen: %s" % len(res)) + if res==None or len(res)==0: + freezeProxyLock.release() + freezeLock.release() + break + freezeProxyLock.release() + # release + freezeLock.release() + # run freezer + iRows = 0 + nRows = 500 + while iRows < len(res): + freezer = Freezer(freezeLock,freezeProxyLock,res[iRows:iRows+nRows],freezeThreadPool) + freezer.start() + iRows += nRows + freezeThreadPool.join() + if len(res) < maxRows: + break + + +# thread to delete dataset replica from T2 +class T2Cleaner (threading.Thread): + def __init__(self,lock,proxyLock,datasets,pool): + threading.Thread.__init__(self) + self.datasets = datasets + self.lock = lock + self.proxyLock = proxyLock + self.pool = pool + self.pool.add(self) + + def run(self): + self.lock.acquire() + try: + for vuid,name,modDate in self.datasets: + _logger.debug("cleanT2 %s" % name) + # get list of replicas + status,out = ddm.DQ2.main('listDatasetReplicas',name,0,None,False) + if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: + 
_logger.error(out) + continue + else: + if out.find("DQUnknownDatasetException") == -1 and out.find("DQDeletedDatasetException") == -1: + listOut = out + try: + # convert res to map + exec "tmpRepSites = %s" % out + except: + tmpRepSites = {} + _logger.error("cannot convert to replica map") + _logger.error(out) + continue + # check if there is active subscription + _logger.debug('listSubscriptions %s' % name) + subStat,subOut = ddm.DQ2.main('listSubscriptions',name) + if subStat != 0: + _logger.error("cannot get subscriptions for %s" % name) + _logger.error(subOut) + _logger.debug('subscriptions for %s = %s' % (name,subOut)) + # active subscriotions + if subOut != '[]': + _logger.debug("wait %s due to active subscription" % name) + continue + # check cloud + self.proxyLock.acquire() + proxyS = taskBuffer.proxyPool.getProxy() + destSE = proxyS.getDestSEwithDestDBlock(name) + taskBuffer.proxyPool.putProxy(proxyS) + self.proxyLock.release() + cloudName = None + if siteMapper.checkSite(destSE): + cloudName = siteMapper.getSite(destSE).cloud + # cloud is not found + if cloudName == None: + _logger.error("cannot find cloud for %s : %s" % (name,str(tmpRepSites))) + else: + _logger.debug('cloud=%s for %s' % (cloudName,name)) + t1SiteDDMs = siteMapper.getSite(destSE).setokens.values() + # look for T2 IDs + t2DDMs = [] + for tmpDDM in tmpRepSites.keys(): + if not tmpDDM in t1SiteDDMs: + # check home cloud + notDeleteFlag = False + for tmpT2siteID,tmpT2siteSpec in siteMapper.siteSpecList.iteritems(): + if tmpT2siteSpec.ddm == tmpDDM: + # not delete if src and dest are in US. OSG is regarded as US due to tier1 + if tmpT2siteSpec.cloud in ['US'] and cloudName in ['US','OSG']: + notDeleteFlag = True + if not notDeleteFlag: + t2DDMs.append(tmpDDM) + # delete replica for sub + if re.search('_sub\d+$',name) != None and t2DDMs != []: + setMetaFlag = True + for tmpT2DDM in t2DDMs: + _logger.debug('setReplicaMetaDataAttribute %s %s' % (name,tmpT2DDM)) + status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',name,tmpT2DDM,'pin_lifetime','') + if status != 0: + _logger.error(out) + if out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \ + out.find("No replica found") == -1: + setMetaFlag = False + if not setMetaFlag: + continue + _logger.debug(('deleteDatasetReplicas',name,t2DDMs)) + status,out = ddm.DQ2.main('deleteDatasetReplicas',name,t2DDMs,0,False,False,False,False,False,'00:00:00') + if status != 0: + _logger.error(out) + if out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \ + out.find("No replica found") == -1: + continue + else: + _logger.debug('no delete for %s due to empty target in %s' % (name,listOut)) + # update + self.proxyLock.acquire() + varMap = {} + varMap[':vuid'] = vuid + varMap[':status'] = 'completed' + taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", + varMap) + self.proxyLock.release() + _logger.debug("end %s " % name) + except: + pass + self.pool.remove(self) + self.lock.release() + +# delete dataset replica from T2 +_logger.debug("==== delete datasets from T2 ====") +timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) 
+timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3) +t2cleanLock = threading.Semaphore(5) +t2cleanProxyLock = threading.Lock() +t2cleanThreadPool = ThreadPool() +maxRows = 100000 +while True: + # lock + t2cleanLock.acquire() + # get datasets + varMap = {} + varMap[':modificationdateU'] = timeLimitU + varMap[':modificationdateL'] = timeLimitL + varMap[':type'] = 'output' + varMap[':status'] = 'cleanup' + sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows + t2cleanProxyLock.acquire() + proxyS = taskBuffer.proxyPool.getProxy() + res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60') + taskBuffer.proxyPool.putProxy(proxyS) + if res == None: + _logger.debug("# of datasets to be deleted from T2: %s" % res) + else: + _logger.debug("# of datasets to be deleted from T2: %s" % len(res)) + if res==None or len(res)==0: + t2cleanProxyLock.release() + t2cleanLock.release() + break + t2cleanProxyLock.release() + # release + t2cleanLock.release() + # run t2cleanr + iRows = 0 + nRows = 500 + while iRows < len(res): + t2cleanr = T2Cleaner(t2cleanLock,t2cleanProxyLock,res[iRows:iRows+nRows],t2cleanThreadPool) + t2cleanr.start() + iRows += nRows + t2cleanThreadPool.join() + if len(res) < maxRows: + break + + +# delete dis datasets +class EraserThr (threading.Thread): + def __init__(self,lock,proxyLock,datasets,pool,operationType): + threading.Thread.__init__(self) + self.datasets = datasets + self.lock = lock + self.proxyLock = proxyLock + self.pool = pool + self.pool.add(self) + self.operationType = operationType + + def run(self): + self.lock.acquire() + try: + # loop over all datasets + for vuid,name,modDate in self.datasets: + # only dis datasets + if re.search('_dis\d+$',name) == None: + _logger.error("Eraser : non disDS %s" % name) + continue + # delete + _logger.debug("Eraser %s dis %s %s" % (self.operationType,modDate,name)) + # delete or shorten + if self.operationType == 'deleting': + # erase + endStatus = 'deleted' + status,out = ddm.DQ2.main('eraseDataset',name) + if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: + _logger.error(out) + continue + else: + # change replica lifetime + endStatus = 'shortened' + # get list of replicas + status,out = ddm.DQ2.main('listDatasetReplicas',name,0,None,False) + if status != 0 and out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1: + _logger.error(out) + continue + if out.find("DQUnknownDatasetException") == -1 and out.find("DQDeletedDatasetException") == -1: + try: + # convert res to map + exec "tmpRepSites = %s" % out + except: + tmpRepSites = {} + _logger.error("cannot convert to replica map") + _logger.error(out) + continue + # set replica lifetime + setMetaFlag = True + for tmpDDM in tmpRepSites.keys(): + _logger.debug('setReplicaMetaDataAttribute %s %s' % (name,tmpDDM)) + status,out = ddm.DQ2.main('setReplicaMetaDataAttribute',name,tmpDDM,'lifetime','1 days') + if status != 0: + _logger.error(out) + if out.find('DQFrozenDatasetException') == -1 and \ + out.find("DQUnknownDatasetException") == -1 and out.find("DQSecurityException") == -1 and \ + 
out.find("DQDeletedDatasetException") == -1 and out.find("DQUnknownDatasetException") == -1 and \ + out.find("No replica found") == -1: + setMetaFlag = False + if not setMetaFlag: + continue + _logger.debug('OK with %s' % name) + # update + self.proxyLock.acquire() + varMap = {} + varMap[':vuid'] = vuid + varMap[':status'] = endStatus + taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", + varMap) + self.proxyLock.release() + except: + pass + self.pool.remove(self) + self.lock.release() + +# delete dis datasets +_logger.debug("==== delete dis datasets ====") +timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=30) +timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=3) +disEraseLock = threading.Semaphore(5) +disEraseProxyLock = threading.Lock() +disEraseThreadPool = ThreadPool() +maxRows = 100000 +for targetStatus in ['deleting','shortening']: + # lock + disEraseLock.acquire() + # get datasets + varMap = {} + varMap[':modificationdateU'] = timeLimitU + varMap[':modificationdateL'] = timeLimitL + varMap[':type'] = 'dispatch' + varMap[':status'] = targetStatus + sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows + disEraseProxyLock.acquire() + proxyS = taskBuffer.proxyPool.getProxy() + res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60') + taskBuffer.proxyPool.putProxy(proxyS) + if res == None: + _logger.debug("# of dis datasets for %s: None" % targetStatus) + else: + _logger.debug("# of dis datasets for %s: %s" % (targetStatus,len(res))) + if res==None or len(res)==0: + disEraseProxyLock.release() + disEraseLock.release() + break + disEraseProxyLock.release() + # release + disEraseLock.release() + # run disEraser + iRows = 0 + nRows = 500 + while iRows < len(res): + disEraser = EraserThr(disEraseLock,disEraseProxyLock,res[iRows:iRows+nRows], + disEraseThreadPool,targetStatus) + disEraser.start() + iRows += nRows + disEraseThreadPool.join() + + +_memoryCheck("finisher") + +# finisher thread +class FinisherThr (threading.Thread): + def __init__(self,lock,proxyLock,ids,pool,timeNow): + threading.Thread.__init__(self) + self.ids = ids + self.lock = lock + self.proxyLock = proxyLock + self.pool = pool + self.timeNow = timeNow + self.pool.add(self) + + def run(self): + self.lock.acquire() + try: + # get jobs from DB + ids = self.ids + self.proxyLock.acquire() + jobs = taskBuffer.peekJobs(ids,fromDefined=False,fromArchived=False,fromWaiting=False) + self.proxyLock.release() + upJobs = [] + finJobs = [] + for job in jobs: + if job == None or job.jobStatus == 'unknown': + continue + # use BNL by default + dq2URL = siteMapper.getSite('BNL_ATLAS_1').dq2url + dq2SE = [] + # get LFC and SEs + if job.prodSourceLabel == 'user' and not siteMapper.siteSpecList.has_key(job.destinationSE): + # using --destSE for analysis job to transfer output + try: + dq2URL = dataservice.DDM.toa.getLocalCatalog(job.destinationSE)[-1] + match = re.search('.+://([^:/]+):*\d*/*',dataservice.DDM.toa.getSiteProperty(job.destinationSE,'srm')[-1]) + if match != None: + dq2SE.append(match.group(1)) + except: + type, value, traceBack = sys.exc_info() + _logger.error("%s Failed to get DQ2/SE with %s %s" % (job.PandaID,type,value)) + continue + elif siteMapper.checkCloud(job.cloud): + # normal production jobs + tmpDstID = siteMapper.getCloud(job.cloud)['dest'] + tmpDstSite = siteMapper.getSite(tmpDstID) + if not tmpDstSite.lfchost 
in [None,'']: + # LFC + dq2URL = 'lfc://'+tmpDstSite.lfchost+':/grid/atlas/' + if tmpDstSite.se != None: + for tmpDstSiteSE in tmpDstSite.se.split(','): + match = re.search('.+://([^:/]+):*\d*/*',tmpDstSiteSE) + if match != None: + dq2SE.append(match.group(1)) + else: + # LRC + dq2URL = tmpDstSite.dq2url + dq2SE = [] + # get LFN list + lfns = [] + guids = [] + nTokens = 0 + for file in job.Files: + # only output files are checked + if file.type == 'output' or file.type == 'log': + lfns.append(file.lfn) + guids.append(file.GUID) + nTokens += len(file.destinationDBlockToken.split(',')) + # get files in LRC + _logger.debug("%s Cloud:%s DQ2URL:%s" % (job.PandaID,job.cloud,dq2URL)) + okFiles = brokerage.broker_util.getFilesFromLRC(lfns,dq2URL,guids,dq2SE,getPFN=True) + # count files + nOkTokens = 0 + for okLFN,okPFNs in okFiles.iteritems(): + nOkTokens += len(okPFNs) + # check all files are ready + _logger.debug("%s nToken:%s nOkToken:%s" % (job.PandaID,nTokens,nOkTokens)) + if nTokens <= nOkTokens: + _logger.debug("%s Finisher : Finish" % job.PandaID) + for file in job.Files: + if file.type == 'output' or file.type == 'log': + file.status = 'ready' + # append to run Finisher + finJobs.append(job) + else: + endTime = job.endTime + if endTime == 'NULL': + endTime = job.startTime + # priority-dependent timeout + tmpCloudSpec = siteMapper.getCloud(job.cloud) + if job.currentPriority >= 800 and (not job.prodSourceLabel in ['user']): + if tmpCloudSpec.has_key('transtimehi'): + timeOutValue = tmpCloudSpec['transtimehi'] + else: + timeOutValue = 1 + else: + if tmpCloudSpec.has_key('transtimelo'): + timeOutValue = tmpCloudSpec['transtimelo'] + else: + timeOutValue = 2 + # protection + if timeOutValue < 1: + timeOutValue = 1 + timeOut = self.timeNow - datetime.timedelta(days=timeOutValue) + _logger.debug("%s Priority:%s Limit:%s End:%s" % (job.PandaID,job.currentPriority,str(timeOut),str(endTime))) + if endTime < timeOut: + # timeout + _logger.debug("%s Finisher : Kill" % job.PandaID) + strMiss = '' + for lfn in lfns: + if not lfn in okFiles: + strMiss += ' %s' % lfn + job.jobStatus = 'failed' + job.taskBufferErrorCode = taskbuffer.ErrorCode.EC_Transfer + job.taskBufferErrorDiag = 'transfer timeout for '+strMiss + guidMap = {} + for file in job.Files: + # set file status + if file.status == 'transferring': + file.status = 'failed' + # collect GUIDs to delete files from _tid datasets + if file.type == 'output' or file.type == 'log': + if not guidMap.has_key(file.destinationDBlock): + guidMap[file.destinationDBlock] = [] + guidMap[file.destinationDBlock].append(file.GUID) + else: + # wait + _logger.debug("%s Finisher : Wait" % job.PandaID) + for lfn in lfns: + if not lfn in okFiles: + _logger.debug("%s -> %s" % (job.PandaID,lfn)) + upJobs.append(job) + # update + _logger.debug("updating ...") + self.proxyLock.acquire() + taskBuffer.updateJobs(upJobs,False) + self.proxyLock.release() + # run Finisher + for job in finJobs: + fThr = Finisher(taskBuffer,None,job) + fThr.start() + fThr.join() + _logger.debug("done") + time.sleep(1) + except: + pass + self.pool.remove(self) + self.lock.release() + +# finish transferring jobs +_logger.debug("==== finish transferring jobs ====") +finisherLock = threading.Semaphore(3) +finisherProxyLock = threading.Lock() +finisherThreadPool = ThreadPool() +for loopIdx in ['low','high']: + timeNow = datetime.datetime.utcnow() + if loopIdx == 'high': + highPrioFlag = True + else: + highPrioFlag = False + # get jobs + for ii in range(1000): + # lock + finisherLock.acquire() + 
finisherProxyLock.acquire() + ret,res = taskBuffer.lockJobsForFinisher(timeNow,200,highPrioFlag) + finisherProxyLock.release() + finisherLock.release() + if res == None: + _logger.debug("# of jobs to be finished for %s : %s" % (loopIdx,res)) + else: + _logger.debug("# of jobs to be finished for %s : %s" % (loopIdx,len(res))) + if res == None or len(res) == 0: + break + # run thread + finThr = FinisherThr(finisherLock,finisherProxyLock,res,finisherThreadPool,timeNow) + finThr.start() + # wait + finisherThreadPool.join() + + +_memoryCheck("end") + +_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/deleteJobs.py b/current/pandaserver/test/deleteJobs.py new file mode 100755 index 000000000..18195c27c --- /dev/null +++ b/current/pandaserver/test/deleteJobs.py @@ -0,0 +1,175 @@ +import os +import re +import sys +import time +import fcntl +import types +import shelve +import random +import datetime +import commands +import threading +import userinterface.Client as Client +from dataservice.DDM import ddm +from dataservice.DDM import dashBorad +from taskbuffer.OraDBProxy import DBProxy +from taskbuffer.TaskBuffer import taskBuffer +from pandalogger.PandaLogger import PandaLogger +from jobdispatcher.Watcher import Watcher +from brokerage.SiteMapper import SiteMapper +from dataservice.Adder import Adder +from dataservice.Finisher import Finisher +from dataservice.MailUtils import MailUtils +from taskbuffer import ProcessGroups +import brokerage.broker_util +import brokerage.broker +import taskbuffer.ErrorCode +import dataservice.DDM + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# logger +_logger = PandaLogger().getLogger('deleteJobs') + +_logger.debug("===================== start =====================") + +# memory checker +def _memoryCheck(str): + try: + proc_status = '/proc/%d/status' % os.getpid() + procfile = open(proc_status) + name = "" + vmSize = "" + vmRSS = "" + # extract Name,VmSize,VmRSS + for line in procfile: + if line.startswith("Name:"): + name = line.split()[-1] + continue + if line.startswith("VmSize:"): + vmSize = "" + for item in line.split()[1:]: + vmSize += item + continue + if line.startswith("VmRSS:"): + vmRSS = "" + for item in line.split()[1:]: + vmRSS += item + continue + procfile.close() + _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str)) + except: + type, value, traceBack = sys.exc_info() + _logger.error("memoryCheck() : %s %s" % (type,value)) + _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str)) + return + +_memoryCheck("start") + +# kill old process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=2) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName) + for line in out.split('\n'): + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill process 
: %s %s" % (type,value)) + + +# instantiate TB +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +# instantiate sitemapper +siteMapper = SiteMapper(taskBuffer) + + +# table names +jobATableName = "ATLAS_PANDAARCH.jobsArchived" +filesATableName = "ATLAS_PANDAARCH.filesTable_ARCH" +paramATableName = "ATLAS_PANDAARCH.jobParamsTable_ARCH" +metaATableName = "ATLAS_PANDAARCH.metaTable_ARCH" + +# time limit +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=3) + +# delete +_logger.debug("get PandaIDs for Delete") +sql = "SELECT COUNT(*) FROM ATLAS_PANDA.jobsArchived4 WHERE modificationTime<:modificationTime" +varMap = {} +varMap[':modificationTime'] = timeLimit +status,res = taskBuffer.querySQLS(sql,varMap) +if res != None: + tmpTotal = res[0][0] +else: + tmpTotal = None +maxBunch = 1000 +nBunch = 500 +tmpIndex = 0 +while True: + sql = "SELECT PandaID,modificationTime FROM ATLAS_PANDA.jobsArchived4 " + sql += "WHERE modificationTime<:modificationTime AND archivedFlag=:archivedFlag AND rownum<=:rowRange" + varMap = {} + varMap[':modificationTime'] = timeLimit + varMap[':archivedFlag'] = 1 + varMap[':rowRange'] = maxBunch + status,res = taskBuffer.querySQLS(sql,varMap) + if res == None: + _logger.error("failed to get PandaIDs to be deleted") + break + else: + _logger.debug("got %s for deletion" % len(res)) + if len(res) == 0: + _logger.debug("no jobs left for for deletion") + break + else: + maxBunch = len(res) + random.shuffle(res) + res = res[:nBunch] + # loop over all jobs + for (id,srcEndTime) in res: + tmpIndex += 1 + try: + # check + sql = "SELECT PandaID from %s WHERE PandaID=:PandaID" % jobATableName + varMap = {} + varMap[':PandaID'] = id + status,check = taskBuffer.querySQLS(sql,varMap) + if check == None or len(check) == 0: + # no record in ArchivedDB + _logger.error("No backup for %s" % id) + else: + # delete + _logger.debug("DEL %s : endTime %s" % (id,srcEndTime)) + proxyS = taskBuffer.proxyPool.getProxy() + proxyS.deleteJobSimple(id) + taskBuffer.proxyPool.putProxy(proxyS) + if tmpIndex % 1000 == 1: + _logger.debug(" deleted %s/%s" % (tmpIndex,tmpTotal)) + except: + pass + # terminate + if maxBunch < nBunch: + break +_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/directSubmit.py b/current/pandaserver/test/directSubmit.py new file mode 100755 index 000000000..81c96e953 --- /dev/null +++ b/current/pandaserver/test/directSubmit.py @@ -0,0 +1,163 @@ +import re +import sys +import time +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv) != 2: + print "task file is missing" + sys.exit(0) + +# open task file +taskFile = open(sys.argv[1]) + +# read common parameters +line = taskFile.readline() +items = line.split() + +# common parameters +taskID = items[0] +inTaskName = items[1] +taskName = items[2] +formats = items[3].split('.') +lparams = items[4].split(',') +vparams = items[5].split(',') +trf = items[7] +trfVer = items[8] +grid = items[10] +priority = items[11] +totalJob = items[14] +cpu = items[15] +memory = items[16] + + +# input dataset +iDataset = 'NULL' +m = re.search('(.+)\.([^\.]+)\.([^\.]+)$',inTaskName) +if m != None: + step = m.group(2) + if step == 'evgen': + format = 'EVENT' + elif step == 'digit': + format = 'RDO' + else: + format = 'AOO' + #### FIXME : _tidXXXX is missing + iDataset = '%s.%s.%s.%s' % (m.group(1),step,format,m.group(3)) + + +# output datasets +m = 
re.search('(.+)\.([^\.]+)\.([^\.]+)$',taskName) +oDatasets = [] +for format in formats: + step = m.group(2) + if format=='HITS': + step = 'simul' + # append + oDatasets.append('%s.%s.%s.%s_tid%06d' % (m.group(1),step,format,m.group(3),int(taskID))) + +# log dataset +lDataset = '%s.%s.%s.%s_tid%06d' % (m.group(1),m.group(2),'log',m.group(3),int(taskID)) + + +# instantiate JobSpecs +iJob = 0 +jobList = [] +for line in taskFile: + iJob += 1 + job = JobSpec() + # job ID ###### FIXME + job.jobDefinitionID = int(time.time()) % 10000 + # job name + job.jobName = "%s_%05d.job" % (taskName,iJob) + # AtlasRelease + if len(re.findall('\.',trfVer)) > 2: + match = re.search('^(\d+\.\d+\.\d+)',trfVer) + job.AtlasRelease = 'Atlas-%s' % match.group(1) + else: + job.AtlasRelease = 'Atlas-%s' % trfVer + # homepackage + vers = trfVer.split('.') + if int(vers[0]) <= 11: + job.homepackage = 'JobTransforms' + for ver in vers: + job.homepackage += "-%02d" % int(ver) + else: + job.homepackage = 'AtlasProduction/%s' % trfVer + # trf + job.transformation = trf + job.destinationDBlock = oDatasets[0] + # prod DBlock + job.prodDBlock = iDataset + # souce lavel + job.prodSeriesLabel = 'pandatest' + job.prodSourceLabel = 'managed' + # priority + job.assignedPriority = priority + job.currentPriority = priority + # CPU, memory,disk ### FIXME + + # attempt number ### FIXME + + # input files + if iDataset != 'NULL': + # remove _tidXXX + pat = re.sub('_tid\d+$','',iDataset) + # search + m = re.search('('+pat+'\S+)',line) + if m != None: + file = FileSpec() + file.lfn = m.group(1) + file.type = 'input' + file.dataset = iDataset + file.prodDBlock = iDataset + job.addFile(file) + # DB release + for i,lpar in enumerate(lparams): + if lpar == 'DBRelease': + file = FileSpec() + file.lfn = "%s-%s.tgz" % (lpar,vparams[i]) + file.type = 'input' + file.dataset = iDataset + file.prodDBlock = iDataset + job.addFile(file) + break + # output files + for oDataset in oDatasets: + # remove _tidXXX + pat = re.sub('_tid\d+$','',oDataset) + # search + m = re.search('('+pat+'\S+)',line) + if m != None: + file = FileSpec() + file.lfn = m.group(1) + file.type = 'output' + file.dataset = oDataset + file.destinationDBlock = oDataset + job.addFile(file) + # log + file = FileSpec() + file.lfn = "%s._%05d.log.tgz" % (lDataset,iJob) + file.type = 'log' + file.dataset = lDataset + file.destinationDBlock = lDataset + job.addFile(file) + + # job par + job.jobParameters = line[:-1] + + """ + print job.values() + for file in job.Files: + print file.values() + sys.exit(0) + """ + jobList.append(job) + + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/distributeDefJobs.py b/current/pandaserver/test/distributeDefJobs.py new file mode 100755 index 000000000..c1cee20a2 --- /dev/null +++ b/current/pandaserver/test/distributeDefJobs.py @@ -0,0 +1,53 @@ +import datetime +from taskbuffer.DBProxy import DBProxy +import userinterface.Client as Client +import jobscheduler.Site +import random +import time + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# time limit +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) + +# instantiate DB proxies +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +# get PandaIDs from jobsDefined +res = proxyS.querySQL("SELECT PandaID,modificationTime from jobsDefined4 ORDER BY modificationTime") + +# list of known 
sites +tmpSites = jobscheduler.Site.KnownSite.getAllSitesID() +allSites = [] +for site in tmpSites: + # _allSites may conain NULL after sort() + if site == 'NULL': + continue + # ignore test sites + if site.endswith('test') or site.endswith('Test'): + continue + # append + allSites.append(site) + +# reassign jobs +jobs=[] +for (id,modTime) in res: + if modTime < timeLimit: + jobs.append(id) + +# reassign +if len(jobs): + nJob = 20 + iJob = 0 + while iJob < len(jobs): + print 'reassignJobs(%s)' % jobs[iJob:iJob+nJob] + index = random.randint(1,len(allSites)) + site = allSites[int(index)-1] + print 'site=%s' % site + Client.reassignJobs(jobs[iJob:iJob+nJob],site) + iJob += nJob + time.sleep(10) + diff --git a/current/pandaserver/test/dq2cr.py b/current/pandaserver/test/dq2cr.py new file mode 100755 index 000000000..b28b0ccea --- /dev/null +++ b/current/pandaserver/test/dq2cr.py @@ -0,0 +1,45 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = 'BNL_SE' + +jobList = [] + +for i in range(1): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) + job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/run_dq2_cr' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 100000 + #job.prodSourceLabel = 'test' + job.prodSourceLabel = 'user' + job.computingSite = site + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="8072 0 5000 1 DC3.008072.JimmyPhotonJet1.py NONE NONE NONE" + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/emailfix.py b/current/pandaserver/test/emailfix.py new file mode 100755 index 000000000..a39bd3bc4 --- /dev/null +++ b/current/pandaserver/test/emailfix.py @@ -0,0 +1,16 @@ +''' +notifier + +''' + +import shelve + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# open DB +pDB = shelve.open(panda_config.emailDB) + + + + diff --git a/current/pandaserver/test/evpPD2P.py b/current/pandaserver/test/evpPD2P.py new file mode 100644 index 000000000..27cb721f8 --- /dev/null +++ b/current/pandaserver/test/evpPD2P.py @@ -0,0 +1,156 @@ +import re +import sys +import glob +import time +import os.path +import commands +import datetime +import threading +from config import panda_config +from taskbuffer.TaskBuffer import taskBuffer +from brokerage import SiteMapper +from dataservice.EventPicker import EventPicker +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('evpPD2P') + +_logger.debug("===================== start =====================") + +# overall timeout value +overallTimeout = 60 +# prefix of evp files +prefixEVP = 'evp.' 
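# Illustrative aside, not part of the patch: the archive-maintenance script above
# and this one throttle their workers the same way -- each worker thread wraps its
# run() body in acquire/release on a shared Semaphore and registers itself in a
# small ThreadPool so the driver can join() everything at the end. The sketch
# below is a minimal, self-contained version of that pattern; the DemoPool and
# DemoWorker names and the sleep() payload are invented for illustration only.
import threading
import time

class DemoPool:                          # plays the role of ThreadPool below
    def __init__(self):
        self.lock = threading.Lock()
        self.list = []
    def add(self,obj):
        self.lock.acquire()
        self.list.append(obj)
        self.lock.release()
    def remove(self,obj):
        self.lock.acquire()
        self.list.remove(obj)
        self.lock.release()
    def join(self):
        self.lock.acquire()
        thrlist = tuple(self.list)
        self.lock.release()
        for thr in thrlist:
            thr.join()

class DemoWorker(threading.Thread):      # plays the role of EvpThr below
    def __init__(self,lock,pool,item):
        threading.Thread.__init__(self)
        self.lock = lock                 # shared Semaphore
        self.pool = pool
        self.item = item
        self.pool.add(self)
    def run(self):
        self.lock.acquire()              # at most N bodies execute concurrently
        try:
            time.sleep(0.1)              # stand-in for the real work
        finally:
            self.pool.remove(self)
            self.lock.release()

if __name__ == '__main__':
    demoLock = threading.Semaphore(3)    # N = 3, as in the scripts of this patch
    demoPool = DemoPool()
    for item in range(10):
        DemoWorker(demoLock,demoPool,item).start()
    demoPool.join()                      # wait for every worker to finish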
+# file pattern of evp files +evpFilePatt = panda_config.cache_dir + '/' + prefixEVP + '*' + +# kill old process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName) + for line in out.split('\n'): + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill process : %s %s" % (type,value)) + +# instantiate PD2P +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) +siteMapper = SiteMapper.SiteMapper(taskBuffer) + + +# thread pool +class ThreadPool: + def __init__(self): + self.lock = threading.Lock() + self.list = [] + + def add(self,obj): + self.lock.acquire() + self.list.append(obj) + self.lock.release() + + def remove(self,obj): + self.lock.acquire() + self.list.remove(obj) + self.lock.release() + + def join(self): + self.lock.acquire() + thrlist = tuple(self.list) + self.lock.release() + for thr in thrlist: + thr.join() + + +# thread to ev-pd2p +class EvpThr (threading.Thread): + def __init__(self,lock,pool,aTaskBuffer,aSiteMapper,fileName,ignoreError): + threading.Thread.__init__(self) + self.lock = lock + self.pool = pool + self.fileName = fileName + self.evp = EventPicker(aTaskBuffer,aSiteMapper,fileName,ignoreError) + self.pool.add(self) + + def run(self): + self.lock.acquire() + retRun = self.evp.run() + _logger.debug("%s : %s" % (retRun,self.fileName)) + self.pool.remove(self) + self.lock.release() + + +# get files +_logger.debug("EVP session") +timeNow = datetime.datetime.utcnow() +timeInt = datetime.datetime.utcnow() +fileList = glob.glob(evpFilePatt) +fileList.sort() + +# create thread pool and semaphore +adderLock = threading.Semaphore(3) +adderThreadPool = ThreadPool() + +# add +while len(fileList) != 0: + # time limit to avoid too many copyArchive running at the same time + if (datetime.datetime.utcnow() - timeNow) > datetime.timedelta(minutes=overallTimeout): + _logger.debug("time over in EVP session") + break + # try to get Semaphore + adderLock.acquire() + # get fileList + if (datetime.datetime.utcnow() - timeInt) > datetime.timedelta(minutes=15): + timeInt = datetime.datetime.utcnow() + # get file + fileList = glob.glob(evpFilePatt) + fileList.sort() + # choose a file + fileName = fileList.pop(0) + # release lock + adderLock.release() + if not os.path.exists(fileName): + continue + try: + modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(fileName))[:7])) + if (timeNow - modTime) > datetime.timedelta(hours=24): + # last chance + _logger.debug("Last event picking : %s" % fileName) + thr = EvpThr(adderLock,adderThreadPool,taskBuffer,siteMapper,fileName,False) + thr.start() + elif (timeInt - modTime) > datetime.timedelta(minutes=1): + # try + _logger.debug("event picking : %s" % fileName) + thr = EvpThr(adderLock,adderThreadPool,taskBuffer,siteMapper,fileName,True) + thr.start() + else: + 
_logger.debug("%s : %s" % ((timeInt - modTime),fileName)) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("%s %s" % (errType,errValue)) + +# join all threads +adderThreadPool.join() + +_logger.debug("===================== end =====================") + diff --git a/current/pandaserver/test/execute.py b/current/pandaserver/test/execute.py new file mode 100755 index 000000000..8cc2f2429 --- /dev/null +++ b/current/pandaserver/test/execute.py @@ -0,0 +1,66 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = 'BNL_ATLAS_2' + +jobList = [] +for i in range(20): + + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = commands.getoutput('uuidgen') + job.AtlasRelease = 'Atlas-11.0.41' + #job.AtlasRelease = 'Atlas-11.0.3' + job.homepackage = 'AnalysisTransforms' + job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/runAthena' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 100 + job.prodSourceLabel = 'user' + job.computingSite = site + #job.prodDBlock = "pandatest.b1599dfa-cd36-4fc5-92f6-495781a94c66" + job.prodDBlock = "pandatest.f228b051-077b-4f81-90bf-496340644379" + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = "lib.f228b051-077b-4f81-90bf-496340644379.tgz" + fileI.type = 'input' + job.addFile(fileI) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen') + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.type = 'log' + job.addFile(fileOL) + + fileOZ = FileSpec() + fileOZ.lfn = "%s.pool.root" % commands.getoutput('uuidgen') + fileOZ.destinationDBlock = job.destinationDBlock + fileOZ.destinationSE = job.destinationSE + fileOZ.dataset = job.destinationDBlock + fileOZ.type = 'output' + job.addFile(fileOZ) + + job.jobParameters="""-l %s -r PhysicsAnalysis/AnalysisCommon/UserAnalysis/UserAnalysis-00-05-11/run -j " jobOptions.pythia.py" -i "[]" -o "{'Stream1': '%s'}" """ % (fileI.lfn,fileOZ.lfn) + + jobList.append(job) + + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/fileCallbackListener.py b/current/pandaserver/test/fileCallbackListener.py new file mode 100644 index 000000000..bad0c76cd --- /dev/null +++ b/current/pandaserver/test/fileCallbackListener.py @@ -0,0 +1,253 @@ +import os +import re +import sys +import time +import signal +import socket +import commands +import optparse +import datetime +import cPickle as pickle + +from dq2.common import log as logging +from dq2.common import stomp +from config import panda_config +from brokerage.SiteMapper import SiteMapper +from dataservice.Finisher import Finisher + +# logger +from pandalogger.PandaLogger import PandaLogger +_logger = PandaLogger().getLogger('fileCallbackListener') + +# keep PID +pidFile = '%s/file_callback_listener.pid' % panda_config.logdir + +# overall timeout value +overallTimeout = 60 * 59 + +# expiration time +expirationTime = datetime.datetime.utcnow() + datetime.timedelta(minutes=overallTimeout) + + +# kill whole process +def catch_sig(sig, frame): + try: + os.remove(pidFile) + 
except: + pass + # kill + _logger.debug('terminating ...') + commands.getoutput('kill -9 -- -%s' % os.getpgrp()) + # exit + sys.exit(0) + + +# callback listener +class FileCallbackListener(stomp.ConnectionListener): + + def __init__(self,conn,tb,sm): + # connection + self.conn = conn + # task buffer + self.taskBuffer = tb + # site mapper + self.siteMapper = sm + + + def on_error(self,headers,body): + _logger.error("on_error : %s" % headers['message']) + + + def on_disconnected(self,headers,body): + _logger.error("on_disconnected : %s" % headers['message']) + + + def on_message(self, headers, message): + try: + lfn = 'UNKNOWN' + # send ack + id = headers['message-id'] + self.conn.ack({'message-id':id}) + # check message type + messageType = headers['cbtype'] + if not messageType in ['FileDoneMessage']: + _logger.debug('%s skip' % messageType) + return + _logger.debug('%s start' % messageType) + # re-construct message + messageObj = pickle.loads(message) + evtTime = datetime.datetime.utcfromtimestamp(messageObj.getItem('eventTime')) + lfn = messageObj.getItem('lfn') + guid = messageObj.getItem('guid') + ddmSite = messageObj.getItem('site') + _logger.debug('%s site=%s type=%s time=%s' % \ + (lfn,ddmSite,messageType,evtTime.strftime('%Y-%m-%d %H:%M:%S'))) + # ignore non production files + flagNgPrefix = False + for ngPrefix in ['user','step']: + if lfn.startswith(ngPrefix): + flagNgPrefix = True + break + if flagNgPrefix: + _logger.debug('%s skip' % lfn) + return + # get datasets associated with the file only for high priority jobs + dsNameMap = self.taskBuffer.getDatasetWithFile(lfn,800) + _logger.debug('%s ds=%s' % (lfn,str(dsNameMap))) + # loop over all datasets + for dsName,dsData in dsNameMap.iteritems(): + pandaSite,dsToken = dsData + # skip multiple destination since each file doesn't have + # transferStatus + if not dsToken in ['',None] and ',' in dsToken: + _logger.debug('%s ignore ds=%s token=%s' % (lfn,dsName,dsToken)) + continue + # check site + tmpSiteSpec = self.siteMapper.getSite(pandaSite) + if tmpSiteSpec.setokens.has_key(dsToken): + pandaSiteDdmID = tmpSiteSpec.setokens[dsToken] + else: + pandaSiteDdmID = tmpSiteSpec.ddm + if pandaSiteDdmID != ddmSite: + _logger.debug('%s ignore ds=%s site=%s:%s <> %s' % \ + (lfn,dsName,pandaSite,pandaSiteDdmID,ddmSite)) + continue + # update file + forInput = None + if re.search('_dis\d+$',dsName) != None: + # dispatch datasets + forInput = True + ids = self.taskBuffer.updateInFilesReturnPandaIDs(dsName,'ready',lfn) + elif re.search('_sub\d+$',dsName) != None: + # sub datasets + forInput = False + ids = self.taskBuffer.updateOutFilesReturnPandaIDs(dsName,lfn) + _logger.debug('%s ds=%s ids=%s' % (lfn,dsName,str(ids))) + # loop over all PandaIDs + if forInput != None and len(ids) != 0: + # remove None and unknown + targetIDs = [] + for tmpID in ids: + # count the number of pending files + nPending = self.taskBuffer.countPendingFiles(tmpID,forInput) + _logger.debug('%s PandaID=%s nPen=%s' % (lfn,tmpID,nPending)) + if nPending != 0: + continue + targetIDs.append(tmpID) + # get jobs + targetJobs = [] + if targetIDs != []: + if forInput: + jobs = self.taskBuffer.peekJobs(targetIDs,fromActive=False,fromArchived=False, + fromWaiting=False) + else: + jobs = self.taskBuffer.peekJobs(targetIDs,fromDefined=False,fromArchived=False, + fromWaiting=False) + for tmpJob in jobs: + if tmpJob == None or tmpJob.jobStatus == 'unknown': + continue + targetJobs.append(tmpJob) + # trigger subsequent processe + if targetJobs == []: + _logger.debug('%s no jobs to be 
triggerd for subsequent processe' % lfn) + else: + if forInput: + # activate + _logger.debug('%s activate %s' % (lfn,str(targetIDs))) + self.taskBuffer.activateJobs(targetJobs) + else: + # finish + _logger.debug('%s finish %s' % (lfn,str(targetIDs))) + for tmpJob in targetJobs: + fThr = Finisher(self.taskBuffer,None,tmpJob) + fThr.start() + fThr.join() + _logger.debug('%s done' % lfn) + except: + errtype,errvalue = sys.exc_info()[:2] + _logger.error("on_message : %s %s %s" % (lfn,errtype,errvalue)) + + +# main +def main(backGround=False): + _logger.debug('starting ...') + # register signal handler + signal.signal(signal.SIGINT, catch_sig) + signal.signal(signal.SIGHUP, catch_sig) + signal.signal(signal.SIGTERM,catch_sig) + signal.signal(signal.SIGALRM,catch_sig) + signal.alarm(overallTimeout) + # forking + pid = os.fork() + if pid != 0: + # watch child process + os.wait() + time.sleep(1) + else: + # main loop + from taskbuffer.TaskBuffer import taskBuffer + # initialize cx_Oracle using dummy connection + from taskbuffer.Initializer import initializer + initializer.init() + # instantiate TB + taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + # instantiate sitemapper + siteMapper = SiteMapper(taskBuffer) + # ActiveMQ params + clientid = 'PANDA-' + socket.getfqdn() + queue = '/queue/Consumer.PANDA.atlas.ddm.siteservices' + ssl_opts = {'use_ssl' : True, + 'ssl_cert_file' : '/data/atlpan/pandasv1_usercert.pem', + 'ssl_key_file' : '/data/atlpan/pandasv1_userkey.pem'} + # resolve multiple brokers + brokerList = socket.gethostbyname_ex('atlasddm-mb.cern.ch')[-1] + # set listener + for tmpBroker in brokerList: + try: + _logger.debug('setting listener on %s' % tmpBroker) + conn = stomp.Connection(host_and_ports = [(tmpBroker, 6162)], **ssl_opts) + conn.set_listener('FileCallbackListener', FileCallbackListener(conn,taskBuffer,siteMapper)) + conn.start() + conn.connect(headers = {'client-id': clientid}) + conn.subscribe(destination=queue, ack='client-individual') + #,headers = {'selector':"cbtype='FileDoneMessage'"}) + if not conn.is_connected(): + _logger.error("connection failure to %s" % tmpBroker) + except: + errtype,errvalue = sys.exc_info()[:2] + _logger.error("failed to set listener on %s : %s %s" % (tmpBroker,errtype,errvalue)) + catch_sig(None,None) + +# entry +if __name__ == "__main__": + optP = optparse.OptionParser(conflict_handler="resolve") + options,args = optP.parse_args() + try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(seconds=overallTimeout-180) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName) + for line in out.split('\n'): + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) + except: + errtype,errvalue = sys.exc_info()[:2] + _logger.error("kill process : %s %s" % (errtype,errvalue)) + # main loop + main() diff --git a/current/pandaserver/test/fileClean.py b/current/pandaserver/test/fileClean.py new file mode 100755 index 
000000000..edef84ea5 --- /dev/null +++ b/current/pandaserver/test/fileClean.py @@ -0,0 +1,145 @@ +import re +import sys +import datetime +from taskbuffer.DBProxy import DBProxy +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# table names +cdate = datetime.datetime.utcnow() +if cdate.month==1: + cdate = cdate.replace(year = (cdate.year-1)) + cdate = cdate.replace(month = 12, day = 1) +else: + cdate = cdate.replace(month = (cdate.month/2)*2, day = 1) +currentSuffix = "_%s%s" % (cdate.strftime('%b'),cdate.year) +if cdate.month > 2: + odate = cdate.replace(month = (cdate.month-2)) +else: + odate = cdate.replace(year = (cdate.year-1), month = 12) +previousSuffix = "_%s%s" % (odate.strftime('%b'),odate.year) + +# instantiate DB proxies +proxyS = DBProxy() +proxyN = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) +proxyN.connect(panda_config.logdbhost,panda_config.logdbpasswd,panda_config.logdbuser,'PandaArchiveDB') + +# get tables +fileTables = [] +jobsTables = {} +status,res = proxyN.querySQLS("show tables") +if res != None: + for table, in res: + if table.startswith('filesTable'): + fileTables.append(table) + if table.startswith('jobsArchived'): + # get MAX PandaID + statusJ,resJ = proxyN.querySQLS("SELECT MAX(PandaID) FROM %s" % table) + jobsTables[table] = resJ[0][0] + +# for the cumulative tables +cumulativeSuffix = '4_current' +cumulativePandaID = jobsTables['jobsArchived%s' % cumulativeSuffix] + +# create a map between MAX PandaID and suffix +suffixMap = {} +for table,maxPandaID in jobsTables.iteritems(): + # get suffix + match = re.search('(\d??_.+)$',table) + suffix = match.group(1) + # special treatment is required for the cumulative tables + if suffix == cumulativeSuffix: + continue + # name of corresponding file table + name = "filesTable%s" % suffix + if not name in fileTables: + print "%s is not found" % name + sys.exit(0) + # check duplication + if suffixMap.has_key(maxPandaID): + print "%s is already used by %s" % (maxPandaID,suffixMap[maxPandaID]) + sys.exit(0) + # append + suffixMap[maxPandaID] = suffix + +# print the cumulative +print "%8d %s" % (cumulativePandaID,cumulativeSuffix) +# sort by max PandaID +suffixKeys = suffixMap.keys() +suffixKeys.sort() +for key in suffixKeys: + print "%8d %s" % (key,suffixMap[key]) + +# get files +minPandaID = -1 +sql = "SELECT PandaID FROM filesTable4 WHERE PandaID > %s GROUP BY PandaID ORDER BY PandaID LIMIT 100" +#while True: +for i in range(5): + status,res = proxyS.querySQLS(sql % minPandaID) + # no more job + if len(res) == 0: + break + # set min + minPandaID = res[-1][0] + # loop over all PandaIDs + for id, in res: + # look for corresponding table + tableSuffix = '' + if id < cumulativePandaID: + # use the cumulative + tableSuffix = cumulativeSuffix + else: + for key in suffixKeys: + if id < key: + tableSuffix = suffixMap[key] + break + # check suffix + if tableSuffix in ['',currentSuffix,previousSuffix]: + print "Terminated since fresh PandID=%s found for '%s'" % (id,tableSuffix) + sys.exit(0) + print "PandaID:%s Suffix:%s" % (id,tableSuffix) + # get FileSpec + sqlFile = "SELECT %s FROM filesTable4 " % FileSpec.columnNames() + sqlFile+= "WHERE PandaID=%s" % id + statusF,resFs = proxyS.querySQLS(sqlFile) + for resF in resFs: + file = FileSpec() + file.pack(resF) + # create a dummy Job to set PandaID + job = JobSpec() + job.PandaID = id + job.addFile(file) + # 
file table + fileTable = 'filesTable%s' % tableSuffix + # check + sqlFileCheck = "SELECT PandaID FROM %s WHERE rowID=%s" % (fileTable,file.rowID) + statusC,resC = proxyN.querySQLS(sqlFileCheck) + if len(resC) != 0: + if resC[0][0] != id: + print "PandaID mismatch PandaArchive:%s PandaDB:%s for rowID=%s" % \ + (resC[0][0],id,file.rowID) + else: + print "rowID=%s not found" % file.rowID + """ + # construct SQL + sqlFileIn = "INSERT INTO %s " % fileTable + sqlFileIn+= "(%s) " % FileSpec.columnNames() + sqlFileIn+= FileSpec.valuesExpression() + try: + proxyN.cur.execute("SET AUTOCOMMIT=1") + ret = proxyN.cur.execute(sqlFileIn,file.values()) + res = proxyN.cur.fetchall() + # commit + if not proxyN._commit(): + raise RuntimeError, 'Commit error' + except: + type, value, traceBack = sys.exc_info() + print "insert error : %s %s" % (type,value) + # roll back + proxyN._rollback() + """ diff --git a/current/pandaserver/test/finishJob.py b/current/pandaserver/test/finishJob.py new file mode 100755 index 000000000..559bd61c3 --- /dev/null +++ b/current/pandaserver/test/finishJob.py @@ -0,0 +1,74 @@ +import os +import re +import sys +import urllib2,urllib + +import userinterface.Client as Client +from userinterface.Client import baseURLSSL + +import httplib +import commands + +id = sys.argv[1] +s,o = Client.getJobStatus([id]) + +if s != 0: + print "failed to get job with:%s" % s + sys.exit(0) + +job = o[0] + +if job == None: + print "got None" + sys.exit(0) + +xml = """ + + + +""" + +for file in job.Files: + if file.type in ['output','log']: + xml += """ + + + + + + + + """ % (commands.getoutput('uuidgen'),file.lfn,file.lfn) + +xml += """ + +""" + +node={} +node['jobId']=id +node['state']='finished' +node['metaData']='finished' +#node['state']='failed' +#node['pilotErrorCode']=1200 +node['siteName']='BNL_ATLAS_test' + +node['xml']=xml +url='%s/updateJob' % baseURLSSL + +match = re.search('[^:/]+://([^/]+)(/.+)',url) +host = match.group(1) +path = match.group(2) + +if os.environ.has_key('X509_USER_PROXY'): + certKey = os.environ['X509_USER_PROXY'] +else: + certKey = '/tmp/x509up_u%s' % os.getuid() + +rdata=urllib.urlencode(node) + +conn = httplib.HTTPSConnection(host,key_file=certKey,cert_file=certKey) +conn.request('POST',path,rdata) +resp = conn.getresponse() +data = resp.read() + +print data diff --git a/current/pandaserver/test/getJobs.py b/current/pandaserver/test/getJobs.py new file mode 100755 index 000000000..10fd553eb --- /dev/null +++ b/current/pandaserver/test/getJobs.py @@ -0,0 +1,54 @@ +import sys +import time +import datetime +import commands +import threading +import urllib2,urllib + +import httplib + +import re +import os + +from userinterface.Client import baseURLSSL + +node={} +node['siteName']=sys.argv[1] +node['mem']=1000 +node['node']=commands.getoutput('hostname -f') +#node['prodSourceLabel']='user' +url='%s/getJob' % baseURLSSL + +match = re.search('[^:/]+://([^/]+)(/.+)',url) +host = match.group(1) +path = match.group(2) + +if os.environ.has_key('X509_USER_PROXY'): + certKey = os.environ['X509_USER_PROXY'] +else: + certKey = '/tmp/x509up_u%s' % os.getuid() + +rdata=urllib.urlencode(node) + +class Thr(threading.Thread): + def __init__(self): + threading.Thread.__init__(self) + + def run(self): + print datetime.datetime.utcnow().isoformat(' ') + conn = httplib.HTTPSConnection(host,key_file=certKey,cert_file=certKey) + conn.request('POST',path,rdata) + resp = conn.getresponse() + data = resp.read() + conn.close() + print datetime.datetime.utcnow().isoformat(' ') + import cgi + print 
cgi.parse_qs(data) + +nThr = 1 +thrs = [] +for i in range(nThr): + thrs.append(Thr()) + +for thr in thrs: + thr.start() diff --git a/current/pandaserver/test/input.data b/current/pandaserver/test/input.data new file mode 100755 index 000000000..08272e947 --- /dev/null +++ b/current/pandaserver/test/input.data @@ -0,0 +1,2 @@ +pandatest.000003.dd.input:pandatest.000003.dd.input._00047.junk +pandatest.000003.dd.input:pandatest.000003.dd.input._00001.junk diff --git a/current/pandaserver/test/installSW.py b/current/pandaserver/test/installSW.py new file mode 100755 index 000000000..1dbb349bf --- /dev/null +++ b/current/pandaserver/test/installSW.py @@ -0,0 +1,83 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +# extract pacball and site +argStr = "" +pacball = None +pacFlag = False +siteName = None +siteFlag = False +for arg in sys.argv[1:]: + if arg == '--pacball': + pacFlag = True + continue + if pacFlag: + pacball = arg + pacFlag = False + continue + if arg == '--sitename': + siteFlag = True + continue + if siteFlag: + siteName = arg + siteFlag = False + continue + argStr += "%s " % arg + +# check site +if siteName == None: + print "ERROR : --sitename needs to be specified" + sys.exit(1) +# append sitename +argStr += "--sitename %s " % siteName + +# check pacball format +if pacball != None and pacball.find(':') != -1: + pacDS = pacball.split(':')[0] + pacFile = pacball.split(':')[-1] +else: + pacDS = None + pacFile = pacball + +# append pacball to arg +if pacFile != None: + argStr += "--pacball %s " % pacFile + +job = JobSpec() +job.jobDefinitionID = int(time.time()) % 10000 +job.jobName = "%s_%s" % (siteName,commands.getoutput('uuidgen')) +job.transformation = 'http://www.usatlas.bnl.gov/svn/panda/apps/sw/installAtlasSW' +job.destinationDBlock = 'panda.%s' % job.jobName +job.currentPriority = 10000 +job.prodSourceLabel = 'software' +job.computingSite = siteName +job.cloud = 'US' + +fileOL = FileSpec() +fileOL.lfn = "%s.job.log.tgz" % job.jobName +fileOL.destinationDBlock = job.destinationDBlock +fileOL.dataset = job.destinationDBlock +fileOL.type = 'log' +job.addFile(fileOL) + +# pacball +if pacDS != None: + job.prodDBlock = pacDS + fileP = FileSpec() + fileP.dataset = pacDS + fileP.prodDBlock = pacDS + fileP.lfn = pacFile + fileP.type = 'input' + job.addFile(fileP) + +job.jobParameters = argStr + +s,o = Client.submitJobs([job]) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/killDefJobs.py b/current/pandaserver/test/killDefJobs.py new file mode 100755 index 000000000..a646ea202 --- /dev/null +++ b/current/pandaserver/test/killDefJobs.py @@ -0,0 +1,26 @@ +import datetime +from taskbuffer.DBProxy import DBProxy +import userinterface.Client as Client + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# time limit +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=1) + +# instantiate DB proxies +proxyS = DBProxy() +proxyS.connect('adbpro.usatlas.bnl.gov',passwd,'panda-developer','PandaDevDB') + +# get PandaIDs from jobsDefined +res = proxyS.querySQL("SELECT PandaID,modificationTime from jobsDefined4 ORDER BY modificationTime") + +# kill f old +jobs=[] +for (id,modTime) in res: + if modTime < timeLimit: + jobs.append(id) + +Client.killJobs(jobs) + diff --git a/current/pandaserver/test/killJob.py b/current/pandaserver/test/killJob.py new file mode 100755 index 
000000000..0238f2e79 --- /dev/null +++ b/current/pandaserver/test/killJob.py @@ -0,0 +1,36 @@ +import sys +import optparse +import userinterface.Client as Client + +optP = optparse.OptionParser(conflict_handler="resolve") +optP.add_option('-9',action='store_const',const=True,dest='forceKill', + default=False,help='kill jobs before next heartbeat is coming') +optP.add_option('--killOwnProdJobs',action='store_const',const=True,dest='killOwnProdJobs', + default=False,help='kill own production jobs without a production role') +optP.add_option('--killUserJobs',action='store_const',const=True,dest='killUserJobs', + default=False,help='kill user jobs using a production role') +options,args = optP.parse_args() + + +aSrvID = None + +codeV = None +useMailAsIDV = False + +if options.forceKill: + codeV = 9 +elif options.killUserJobs: + codeV = 91 +if options.killOwnProdJobs: + useMailAsIDV = True + +if len(args) == 1: + Client.killJobs([args[0]],code=codeV,useMailAsID=useMailAsIDV) +else: + startID = int(args[0]) + endID = int(args[1]) + if startID > endID: + print '%d is less than %d' % (endID,startID) + sys.exit(1) + Client.killJobs(range(startID,endID+1),code=codeV,useMailAsID=useMailAsIDV) + diff --git a/current/pandaserver/test/killJobLowPrio.py b/current/pandaserver/test/killJobLowPrio.py new file mode 100755 index 000000000..347da336a --- /dev/null +++ b/current/pandaserver/test/killJobLowPrio.py @@ -0,0 +1,86 @@ +import time +import sys +import optparse + +import userinterface.Client as Client + +aSrvID = None + +from taskbuffer.OraDBProxy import DBProxy +# password +from config import panda_config + +usageStr = """%prog [options] + +Description: kill jobs with low priorities below a given value""" +optP = optparse.OptionParser(conflict_handler="resolve",usage=usageStr) +optP.add_option('-9',action='store_const',const=True,dest='forceKill', + default=False,help='kill jobs before next heartbeat is coming') +optP.add_option('--running',action='store_const',const=True,dest='killRunning', + default=False,help='kill running jobs to free up CPU slots. 
jobs will be killed regardless of job status if omitted') +optP.add_option('--site',action='store',dest='site',default=None,help='computingSite') +optP.add_option('--cloud',action='store',dest='cloud',default=None,help='cloud') +optP.add_option('--maxJobs',action='store',dest='maxJobs',default=None,help='max number of jobs to be killed') +options,args = optP.parse_args() + +if options.cloud == None and options.site == None: + optP.error("--site= and/or --cloud= is required") + +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +jobsMap = {} + +if len(args) == 0: + optP.error('priority is required') + +varMap = {} +varMap[':prodSourceLabel'] = 'managed' +varMap[':currentPriority'] = args[0] +sql = "SELECT PandaID,currentPriority FROM %s WHERE prodSourceLabel=:prodSourceLabel AND currentPriority<:currentPriority " +if options.killRunning: + sql += "AND jobStatus=:jobStatus " + varMap[':jobStatus'] = 'running' +if options.cloud != None: + sql += "AND cloud=:cloud " + varMap[':cloud'] = options.cloud +if options.site != None: + sql += "AND computingSite=:site " + varMap[':site'] = options.site +for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: + status,res = proxyS.querySQLS(sql % table,varMap) + if res != None: + for id,prio in res: + if not jobsMap.has_key(prio): + jobsMap[prio] = [] + if not id in jobsMap[prio]: + jobsMap[prio].append(id) + +# order by PandaID and currentPriority +jobs = [] +prioList = jobsMap.keys() +prioList.sort() +for prio in prioList: + # reverse order by PandaID to kill newer jobs + ids = jobsMap[prio] + ids.sort() + ids.reverse() + jobs += ids + +if options.maxJobs != None: + jobs = jobs[:int(options.maxJobs)] + +print 'The number of jobs with priorities below %s : %s' % (args[0],len(jobs)) +if len(jobs): + nJob = 100 + iJob = 0 + while iJob < len(jobs): + print 'kill %s' % str(jobs[iJob:iJob+nJob]) + if options.forceKill: + Client.killJobs(jobs[iJob:iJob+nJob],9) + else: + Client.killJobs(jobs[iJob:iJob+nJob]) + iJob += nJob + time.sleep(1) + + diff --git a/current/pandaserver/test/killJobsInTask.py b/current/pandaserver/test/killJobsInTask.py new file mode 100755 index 000000000..26c9ddb16 --- /dev/null +++ b/current/pandaserver/test/killJobsInTask.py @@ -0,0 +1,53 @@ +import time +import sys +import optparse + +import userinterface.Client as Client + +aSrvID = None + +from taskbuffer.OraDBProxy import DBProxy +# password +from config import panda_config + +optP = optparse.OptionParser(conflict_handler="resolve") +optP.add_option('-9',action='store_const',const=True,dest='forceKill', + default=False,help='kill jobs before next heartbeat is coming') +optP.add_option('--killOwnProdJobs',action='store_const',const=True,dest='killOwnProdJobs', + default=False,help='kill own production jobs without a production role') +options,args = optP.parse_args() + +useMailAsIDV = False +if options.killOwnProdJobs: + useMailAsIDV = True + +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +jobs = [] + +varMap = {} +varMap[':prodSourceLabel'] = 'managed' +varMap[':taskID'] = args[0] +varMap[':pandaIDl'] = args[1] +varMap[':pandaIDu'] = args[2] +sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND PandaID BETWEEN :pandaIDl AND :pandaIDu ORDER BY PandaID" +for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: + status,res = proxyS.querySQLS(sql % table,varMap) + if res != None: + for id, in res: + if not id in jobs: + jobs.append(id) + +print 'The number of jobs to be killed : %s' % 
len(jobs) +if len(jobs): + nJob = 100 + iJob = 0 + while iJob < len(jobs): + print 'kill %s' % str(jobs[iJob:iJob+nJob]) + if options.forceKill: + Client.killJobs(jobs[iJob:iJob+nJob],9,useMailAsID=useMailAsIDV) + else: + Client.killJobs(jobs[iJob:iJob+nJob],useMailAsID=useMailAsIDV) + iJob += nJob + time.sleep(1) + + diff --git a/current/pandaserver/test/killProdJobs.py b/current/pandaserver/test/killProdJobs.py new file mode 100755 index 000000000..85e8113ea --- /dev/null +++ b/current/pandaserver/test/killProdJobs.py @@ -0,0 +1,30 @@ +import sys + +import userinterface.Client as Client + +if len(sys.argv) == 2: + jobDefIDs = [sys.argv[1]] +else: + startID = int(sys.argv[1]) + endID = int(sys.argv[2]) + if startID > endID: + print '%d is less than %d' % (endID,startID) + sys.exit(1) + jobDefIDs = range(startID,endID+1) + +# quesry PandaID +status, ids = Client.queryPandaIDs(jobDefIDs) + +if status != 0: + sys.exit(0) + +# remove None +while True: + if not None in ids: + break + ids.remove(None) + +# kill +if len(ids) != 0: + Client.killJobs(ids) + diff --git a/current/pandaserver/test/killTask.py b/current/pandaserver/test/killTask.py new file mode 100755 index 000000000..0784a18b9 --- /dev/null +++ b/current/pandaserver/test/killTask.py @@ -0,0 +1,53 @@ +import time +import sys +import optparse + +import userinterface.Client as Client + +aSrvID = None + +from taskbuffer.OraDBProxy import DBProxy +# password +from config import panda_config + +optP = optparse.OptionParser(conflict_handler="resolve") +optP.add_option('-9',action='store_const',const=True,dest='forceKill', + default=False,help='kill jobs even if they are still running') +optP.add_option('--noRunning',action='store_const',const=True,dest='noRunning', + default=False,help='kill only activated/assigned/waiting jobs') +options,args = optP.parse_args() + +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +jobs = [] + +varMap = {} +varMap[':prodSourceLabel'] = 'managed' +varMap[':taskID'] = args[0] +if not options.noRunning: + sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID ORDER BY PandaID" +else: + sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND jobStatus<>:jobStatus ORDER BY PandaID" + varMap[':jobStatus'] = 'running' +for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: + status,res = proxyS.querySQLS(sql % table,varMap) + if res != None: + for id, in res: + if not id in jobs: + jobs.append(id) + +print 'The number of jobs to be killed : %s' % len(jobs) +if len(jobs): + nJob = 100 + iJob = 0 + while iJob < len(jobs): + print 'kill %s' % str(jobs[iJob:iJob+nJob]) + if options.forceKill: + Client.killJobs(jobs[iJob:iJob+nJob],9) + else: + Client.killJobs(jobs[iJob:iJob+nJob]) + iJob += nJob + time.sleep(1) + + diff --git a/current/pandaserver/test/killUser.py b/current/pandaserver/test/killUser.py new file mode 100644 index 000000000..4e3bbaa19 --- /dev/null +++ b/current/pandaserver/test/killUser.py @@ -0,0 +1,71 @@ +import sys +import time +import datetime +import optparse + +from taskbuffer.OraDBProxy import DBProxy +# password +from config import panda_config + +optP = optparse.OptionParser(conflict_handler="resolve") +optP.add_option('--user', action='store',dest='user', default=None,help='prodUserName') +optP.add_option('--jobID',action='store',dest='jobID',default=None,help='jobDefinitionID') 
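# (illustrative usage note, not part of the original script: killUser.py is run
#  with a prodUserName plus either a jobID or a jobsetID, e.g.
#      python killUser.py --user 'Some User' --jobID 123
#      python killUser.py --user 'Some User' --jobsetID all
#  where 'Some User' and 123 are placeholders; '--jobsetID all' drops the
#  jobsetID constraint so every matching job of that user is killed)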
+optP.add_option('--jobsetID',action='store',dest='jobsetID',default=None,help="jobsetID, or 'all' to kill all jobs") +optP.add_option('--prodSourceLabel',action='store',dest='prodSourceLabel',default=None,help='additional prodSourceLabel') + + +options,args = optP.parse_args() + +if options.user == None: + print "--user= is required" + sys.exit(1) +if options.jobID == None and options.jobsetID == None: + print "--jobID= or --jobsetID= is required" + sys.exit(1) + + +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +prodUserName = sys.argv[1] +import userinterface.Client as Client + +varMap = {} +varMap[':src1'] = 'user' +varMap[':src2'] = 'panda' +varMap[':prodUserName'] = options.user +srcSQL = '(:src1,:src2' +if options.jobID != None: + varMap[':jobDefinitionID'] = options.jobID +if not options.jobsetID in (None,'all'): + varMap[':jobsetID'] = options.jobsetID +if options.prodSourceLabel != None: + varMap[':src3'] = options.prodSourceLabel + srcSQL += ',:src3' +srcSQL += ')' + +jobs = [] +tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4'] +for table in tables: + sql = "SELECT PandaID FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel IN %s " % (table,srcSQL) + if options.jobID != None: + sql += "AND jobDefinitionID=:jobDefinitionID " + if not options.jobsetID in (None,'all'): + sql += "AND jobsetID=:jobsetID " + sql += "ORDER BY PandaID " + status,res = proxyS.querySQLS(sql,varMap) + if res != None: + for id, in res: + if not id in jobs: + jobs.append(id) +if len(jobs): + iJob = 0 + nJob = 1000 + while iJob < len(jobs): + subJobs = jobs[iJob:iJob+nJob] + print "kill %s %s/%s" % (str(subJobs),iJob,len(jobs)) + Client.killJobs(subJobs,code=9) + iJob += nJob +else: + print "no job was killed" + diff --git a/current/pandaserver/test/killWaiting.py b/current/pandaserver/test/killWaiting.py new file mode 100755 index 000000000..fe76014a8 --- /dev/null +++ b/current/pandaserver/test/killWaiting.py @@ -0,0 +1,35 @@ +import sys +import time +import datetime +from taskbuffer.DBProxy import DBProxy +import userinterface.Client as Client + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +cloud = sys.argv[1] + +# instantiate DB proxies +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +while True: + # get PandaIDs + res = proxyS.querySQL("SELECT PandaID FROM jobsWaiting4 WHERE cloud='%s' ORDER BY PandaID" % cloud) + # escape + if len(res) == 0: + break + # convert to list + jobs = [] + for id, in res: + jobs.append(id) + # reassign + nJob = 300 + iJob = 0 + while iJob < len(jobs): + print 'killJobs(%s)' % jobs[iJob:iJob+nJob] + Client.killJobs(jobs[iJob:iJob+nJob]) + iJob += nJob + time.sleep(60) + diff --git a/current/pandaserver/test/logrotate.sh b/current/pandaserver/test/logrotate.sh new file mode 100755 index 000000000..51db686c0 --- /dev/null +++ b/current/pandaserver/test/logrotate.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +/usr/sbin/logrotate /usatlas/u/sm/prod/panda/config/logrotate.conf -s /usatlas/u/sm/logrotate.status diff --git a/current/pandaserver/test/missing.py b/current/pandaserver/test/missing.py new file mode 100755 index 000000000..b77eaeecf --- /dev/null +++ b/current/pandaserver/test/missing.py @@ -0,0 +1,43 @@ +import re +import commands + +stMap = [] +tmpMap = {} +nLog = 30 +for i in range(0,nLog): + if i == 0: + out = commands.getoutput('cat 
/data/sm/prod/httpd/logs/panda-Adder.log') + else: + out = commands.getoutput('zcat /data/sm/prod/httpd/logs/panda-Adder.log.%s.gz' % (nLog-i)) + for line in out.split('\n'): + stStr = re.search('start: finished',line) + idsStr = re.search('ids = .*$',line) + mapStr = re.search('idMap = .*$',line) + if stStr == None and idsStr == None and mapStr == None: + continue + items = line.split() + try: + pandaID = int(items[4]) + except: + continue + if stStr != None: + stMap.append(pandaID) + if idsStr != None: + exec idsStr.group(0) + tmpMap[pandaID] = ids + if mapStr != None: + exec mapStr.group(0) + if (pandaID in stMap) and idMap == {} and tmpMap[pandaID] != ([], []): + print pandaID + print tmpMap[pandaID] + try: + del tmpMap[pandaID] + except: + pass + try: + stMap.remove(pandaID) + except: + pass +if tmpMap != {}: + print tmpMap + diff --git a/current/pandaserver/test/pandadb.sql b/current/pandaserver/test/pandadb.sql new file mode 100644 index 000000000..5bc00b59d --- /dev/null +++ b/current/pandaserver/test/pandadb.sql @@ -0,0 +1,430 @@ +DROP TABLE jobsDefined4; +DROP TABLE jobsActive4; +DROP TABLE jobsArchived4; +DROP TABLE jobsWaiting4; +DROP TABLE filesTable4; +DROP TABLE Datasets; +DROP TABLE metaTable; +DROP TABLE subCounter; + + +CREATE TABLE jobsDefined4 +( + PandaID NUMBER(11) default 0 primary key, + jobDefinitionID NUMBER(11) default 0, + schedulerID VARCHAR(128), + pilotID VARCHAR(128), + creationTime DATE, + creationHost VARCHAR(128), + modificationTime DATE, + modificationHost VARCHAR(128), + AtlasRelease VARCHAR(64), + transformation VARCHAR(250), + homepackage VARCHAR(64), + prodSeriesLabel VARCHAR(20) default 'pandatest', + prodSourceLabel VARCHAR(20) default 'managed', + prodUserID VARCHAR(250), + assignedPriority NUMBER(9) default 0, + currentPriority NUMBER(9) default 0, + attemptNr NUMBER(2) default 0, + maxAttempt NUMBER(2) default 0, + jobStatus VARCHAR(15) default 'defined', + jobName VARCHAR(128), + maxCpuCount NUMBER(9) default 0, + maxCpuUnit VARCHAR(32), + maxDiskCount NUMBER(9) default 0, + maxDiskUnit CHAR(2), + ipConnectivity CHAR(3), + minRamCount NUMBER(9) default 0, + minRamUnit CHAR(2), + startTime DATE, + endTime DATE, + cpuConsumptionTime NUMBER(20) default 0, + cpuConsumptionUnit VARCHAR(128), + commandToPilot VARCHAR(250), + transExitCode VARCHAR(128), + pilotErrorCode NUMBER(6) default 0, + pilotErrorDiag VARCHAR(250), + exeErrorCode NUMBER(6) default 0, + exeErrorDiag VARCHAR(250), + supErrorCode NUMBER(6) default 0, + supErrorDiag VARCHAR(250) default NULL, + ddmErrorCode NUMBER(6) default 0, + ddmErrorDiag VARCHAR(250) default NULL, + brokerageErrorCode NUMBER(6) default 0, + brokerageErrorDiag VARCHAR(250) default NULL, + jobDispatcherErrorCode NUMBER(6) default 0, + jobDispatcherErrorDiag VARCHAR(250) default NULL, + taskBufferErrorCode NUMBER(6) default 0, + taskBufferErrorDiag VARCHAR(250) default NULL, + computingSite VARCHAR(128), + computingElement VARCHAR(128), + jobParameters VARCHAR(4000) default NULL, + metadata VARCHAR(32) default NULL, + prodDBlock VARCHAR(250), + dispatchDBlock VARCHAR(250), + destinationDBlock VARCHAR(250), + destinationSE VARCHAR(250), + nEvents NUMBER(9) default 0, + grid VARCHAR(32), + cloud VARCHAR(32), + cpuConversion NUMBER(9,4) default NULL, + sourceSite VARCHAR(36), + destinationSite VARCHAR(36), + transferType VARCHAR(10), + taskID NUMBER(9) default NULL, + cmtConfig VARCHAR(250), + stateChangeTime DATE, + prodDBUpdateTime DATE, + lockedby VARCHAR(128), + relocationFlag NUMBER(1) default 0, + 
jobExecutionID NUMBER(11) default 0, + VO VARCHAR(16), + pilotTiming VARCHAR(100), + workingGroup VARCHAR(20) +); + + +CREATE TABLE jobsActive4 +( + PandaID NUMBER(11) default 0 primary key, + jobDefinitionID NUMBER(11) default 0, + schedulerID VARCHAR(128), + pilotID VARCHAR(128), + creationTime DATE, + creationHost VARCHAR(128), + modificationTime DATE, + modificationHost VARCHAR(128), + AtlasRelease VARCHAR(64), + transformation VARCHAR(250), + homepackage VARCHAR(64), + prodSeriesLabel VARCHAR(20) default 'pandatest', + prodSourceLabel VARCHAR(20) default 'managed', + prodUserID VARCHAR(250), + assignedPriority NUMBER(9) default 0, + currentPriority NUMBER(9) default 0, + attemptNr NUMBER(2) default 0, + maxAttempt NUMBER(2) default 0, + jobStatus VARCHAR(15) default 'activated', + jobName VARCHAR(128), + maxCpuCount NUMBER(9) default 0, + maxCpuUnit VARCHAR(32), + maxDiskCount NUMBER(9) default 0, + maxDiskUnit CHAR(2), + ipConnectivity CHAR(3), + minRamCount NUMBER(9) default 0, + minRamUnit CHAR(2), + startTime DATE, + endTime DATE, + cpuConsumptionTime NUMBER(20) default 0, + cpuConsumptionUnit VARCHAR(128), + commandToPilot VARCHAR(250), + transExitCode VARCHAR(128), + pilotErrorCode NUMBER(6) default 0, + pilotErrorDiag VARCHAR(250), + exeErrorCode NUMBER(6) default 0, + exeErrorDiag VARCHAR(250), + supErrorCode NUMBER(6) default 0, + supErrorDiag VARCHAR(250) default NULL, + ddmErrorCode NUMBER(6) default 0, + ddmErrorDiag VARCHAR(250) default NULL, + brokerageErrorCode NUMBER(6) default 0, + brokerageErrorDiag VARCHAR(250) default NULL, + jobDispatcherErrorCode NUMBER(6) default 0, + jobDispatcherErrorDiag VARCHAR(250) default NULL, + taskBufferErrorCode NUMBER(6) default 0, + taskBufferErrorDiag VARCHAR(250) default NULL, + computingSite VARCHAR(128), + computingElement VARCHAR(128), + jobParameters VARCHAR(4000) default NULL, + metadata VARCHAR(32) default NULL, + prodDBlock VARCHAR(250), + dispatchDBlock VARCHAR(250), + destinationDBlock VARCHAR(250), + destinationSE VARCHAR(250), + nEvents NUMBER(9) default 0, + grid VARCHAR(32), + cloud VARCHAR(32), + cpuConversion NUMBER(9,4) default NULL, + sourceSite VARCHAR(36), + destinationSite VARCHAR(36), + transferType VARCHAR(10), + taskID NUMBER(9) default NULL, + cmtConfig VARCHAR(250), + stateChangeTime DATE, + prodDBUpdateTime DATE, + lockedby VARCHAR(128), + relocationFlag NUMBER(1) default 0, + jobExecutionID NUMBER(11) default 0, + VO VARCHAR(16), + pilotTiming VARCHAR(100), + workingGroup VARCHAR(20) +); + +CREATE TABLE jobsWaiting4 +( + PandaID NUMBER(11) default 0 primary key, + jobDefinitionID NUMBER(11) default 0, + schedulerID VARCHAR(128), + pilotID VARCHAR(128), + creationTime DATE, + creationHost VARCHAR(128), + modificationTime DATE, + modificationHost VARCHAR(128), + AtlasRelease VARCHAR(64), + transformation VARCHAR(250), + homepackage VARCHAR(64), + prodSeriesLabel VARCHAR(20) default 'pandatest', + prodSourceLabel VARCHAR(20) default 'managed', + prodUserID VARCHAR(250), + assignedPriority NUMBER(9) default 0, + currentPriority NUMBER(9) default 0, + attemptNr NUMBER(2) default 0, + maxAttempt NUMBER(2) default 0, + jobStatus VARCHAR(15) default 'activated', + jobName VARCHAR(128), + maxCpuCount NUMBER(9) default 0, + maxCpuUnit VARCHAR(32), + maxDiskCount NUMBER(9) default 0, + maxDiskUnit CHAR(2), + ipConnectivity CHAR(3), + minRamCount NUMBER(9) default 0, + minRamUnit CHAR(2), + startTime DATE, + endTime DATE, + cpuConsumptionTime NUMBER(20) default 0, + cpuConsumptionUnit VARCHAR(128), + commandToPilot 
VARCHAR(250), + transExitCode VARCHAR(128), + pilotErrorCode NUMBER(6) default 0, + pilotErrorDiag VARCHAR(250), + exeErrorCode NUMBER(6) default 0, + exeErrorDiag VARCHAR(250), + supErrorCode NUMBER(6) default 0, + supErrorDiag VARCHAR(250) default NULL, + ddmErrorCode NUMBER(6) default 0, + ddmErrorDiag VARCHAR(250) default NULL, + brokerageErrorCode NUMBER(6) default 0, + brokerageErrorDiag VARCHAR(250) default NULL, + jobDispatcherErrorCode NUMBER(6) default 0, + jobDispatcherErrorDiag VARCHAR(250) default NULL, + taskBufferErrorCode NUMBER(6) default 0, + taskBufferErrorDiag VARCHAR(250) default NULL, + computingSite VARCHAR(128), + computingElement VARCHAR(128), + jobParameters VARCHAR(4000) default NULL, + metadata VARCHAR(32) default NULL, + prodDBlock VARCHAR(250), + dispatchDBlock VARCHAR(250), + destinationDBlock VARCHAR(250), + destinationSE VARCHAR(250), + nEvents NUMBER(9) default 0, + grid VARCHAR(32), + cloud VARCHAR(32), + cpuConversion NUMBER(9,4) default NULL, + sourceSite VARCHAR(36), + destinationSite VARCHAR(36), + transferType VARCHAR(10), + taskID NUMBER(9) default NULL, + cmtConfig VARCHAR(250), + stateChangeTime DATE, + prodDBUpdateTime DATE, + lockedby VARCHAR(128), + relocationFlag NUMBER(1) default 0, + jobExecutionID NUMBER(11) default 0, + VO VARCHAR(16), + pilotTiming VARCHAR(100), + workingGroup VARCHAR(20) +); + +CREATE TABLE jobsArchived4 +( + PandaID NUMBER(11) default 0 primary key, + jobDefinitionID NUMBER(11) default 0, + schedulerID VARCHAR(128), + pilotID VARCHAR(128), + creationTime DATE, + creationHost VARCHAR(128), + modificationTime DATE, + modificationHost VARCHAR(128), + AtlasRelease VARCHAR(64), + transformation VARCHAR(250), + homepackage VARCHAR(64), + prodSeriesLabel VARCHAR(20) default 'pandatest', + prodSourceLabel VARCHAR(20) default 'managed', + prodUserID VARCHAR(250), + assignedPriority NUMBER(9) default 0, + currentPriority NUMBER(9) default 0, + attemptNr NUMBER(2) default 0, + maxAttempt NUMBER(2) default 0, + jobStatus VARCHAR(15) default 'activated', + jobName VARCHAR(128), + maxCpuCount NUMBER(9) default 0, + maxCpuUnit VARCHAR(32), + maxDiskCount NUMBER(9) default 0, + maxDiskUnit CHAR(2), + ipConnectivity CHAR(3), + minRamCount NUMBER(9) default 0, + minRamUnit CHAR(2), + startTime DATE, + endTime DATE, + cpuConsumptionTime NUMBER(20) default 0, + cpuConsumptionUnit VARCHAR(128), + commandToPilot VARCHAR(250), + transExitCode VARCHAR(128), + pilotErrorCode NUMBER(6) default 0, + pilotErrorDiag VARCHAR(250), + exeErrorCode NUMBER(6) default 0, + exeErrorDiag VARCHAR(250), + supErrorCode NUMBER(6) default 0, + supErrorDiag VARCHAR(250) default NULL, + ddmErrorCode NUMBER(6) default 0, + ddmErrorDiag VARCHAR(250) default NULL, + brokerageErrorCode NUMBER(6) default 0, + brokerageErrorDiag VARCHAR(250) default NULL, + jobDispatcherErrorCode NUMBER(6) default 0, + jobDispatcherErrorDiag VARCHAR(250) default NULL, + taskBufferErrorCode NUMBER(6) default 0, + taskBufferErrorDiag VARCHAR(250) default NULL, + computingSite VARCHAR(128), + computingElement VARCHAR(128), + jobParameters VARCHAR(4000) default NULL, + metadata VARCHAR(32) default NULL, + prodDBlock VARCHAR(250), + dispatchDBlock VARCHAR(250), + destinationDBlock VARCHAR(250), + destinationSE VARCHAR(250), + nEvents NUMBER(9) default 0, + grid VARCHAR(32), + cloud VARCHAR(32), + cpuConversion NUMBER(9,4) default NULL, + sourceSite VARCHAR(36), + destinationSite VARCHAR(36), + transferType VARCHAR(10), + taskID NUMBER(9) default NULL, + cmtConfig VARCHAR(250), + 
stateChangeTime DATE, + prodDBUpdateTime DATE, + lockedby VARCHAR(128), + relocationFlag NUMBER(1) default 0, + jobExecutionID NUMBER(11) default 0, + VO VARCHAR(16), + pilotTiming VARCHAR(100), + workingGroup VARCHAR(20) +); + + +CREATE TABLE filesTable4 +( + row_ID NUMBER(11) default 0 primary key, + PandaID NUMBER(11) default 0, + GUID VARCHAR(64), + lfn VARCHAR(256), + type VARCHAR(20), + dataset VARCHAR(128), + status VARCHAR(64), + prodDBlock VARCHAR(250), + prodDBlockToken VARCHAR(250), + dispatchDBlock VARCHAR(250), + dispatchDBlockToken VARCHAR(250), + destinationDBlock VARCHAR(250), + destinationDBlockToken VARCHAR(250), + destinationSE VARCHAR(250), + fsize NUMBER(10) default 0, + md5sum CHAR(36), + checksum CHAR(36) +); + + +CREATE TABLE Datasets +( + vuid VARCHAR(40) default '' primary key, + name VARCHAR(250), + version VARCHAR(10) default NULL, + type VARCHAR(20) default NULL, + status VARCHAR(10) default NULL, + numberfiles NUMBER(9) default NULL, + currentfiles NUMBER(9) default NULL, + creationdate DATE, + modificationdate DATE, + MoverID NUMBER(11) default 0, + transferStatus NUMBER(2) default 0 +); + + +CREATE TABLE metaTable +( + PandaID NUMBER(11) default 0 primary key, + metaData VARCHAR(4000) default NULL +); + + +CREATE TABLE subCounter +( + subID NUMBER(11) default 0 +); + + + +CREATE INDEX jobsA4_currentPriority_IDX ON jobsActive4 (currentPriority); +CREATE INDEX jobsA4_jobStatus_IDX ON jobsActive4 (jobStatus); +CREATE INDEX jobsA4_computingSite_IDX ON jobsActive4 (computingSite); + +CREATE INDEX file4_PandaID_IDX ON filesTable4 (PandaID); +CREATE INDEX file4_status_IDX ON filesTable4 (status); +CREATE INDEX file4_dispDBlock_IDX ON filesTable4 (dispatchDBlock); +CREATE INDEX file4_destDBlock_IDX ON filesTable4 (destinationDBlock); + +CREATE INDEX Datasets_name_IDX ON Datasets (name); + +DROP SEQUENCE PandaID_SEQ; +DROP SEQUENCE rowID_SEQ; +DROP SEQUENCE subID_SEQ; + + +CREATE SEQUENCE PandaID_SEQ; +CREATE SEQUENCE rowID_SEQ; +CREATE SEQUENCE subID_SEQ; + + +CREATE OR REPLACE TRIGGER PandaID_TRIGGER +BEFORE INSERT ON jobsDefined4 +FOR EACH ROW +BEGIN + IF (:NEW.PandaID IS NULL) THEN + SELECT PandaID_SEQ.NEXTVAL INTO :NEW.PandaID FROM DUAL ; + END IF; +END; +/ + + +CREATE OR REPLACE TRIGGER rowID_TRIGGER +BEFORE INSERT ON filesTable4 +FOR EACH ROW +BEGIN + SELECT rowID_SEQ.NEXTVAL INTO :NEW.row_ID FROM DUAL ; +END; +/ + + +CREATE OR REPLACE TRIGGER subID_TRIGGER +BEFORE INSERT ON subCounter +FOR EACH ROW +BEGIN + SELECT subID_SEQ.NEXTVAL INTO :NEW.subID FROM DUAL ; +END; +/ + + +CREATE OR REPLACE FUNCTION BITOR( P_BITS1 IN NATURAL, P_BITS2 IN NATURAL ) +RETURN NATURAL +IS +BEGIN + RETURN UTL_RAW.CAST_TO_BINARY_INTEGER( + UTL_RAW.BIT_OR( + UTL_RAW.CAST_FROM_BINARY_INTEGER(P_BITS1), + UTL_RAW.CAST_FROM_BINARY_INTEGER(P_BITS2) + ) + ); +END; +/ diff --git a/current/pandaserver/test/pandameta.sql b/current/pandaserver/test/pandameta.sql new file mode 100644 index 000000000..ed234a5d2 --- /dev/null +++ b/current/pandaserver/test/pandameta.sql @@ -0,0 +1,97 @@ +DROP TABLE cloudconfig; +DROP TABLE schedconfig; + + +CREATE TABLE cloudconfig +( + name VARCHAR(20) primary key, + description VARCHAR(50), + tier1 VARCHAR(20), + tier1SE VARCHAR(400), + relocation VARCHAR(10), + weight NUMBER(11) default 0, + server VARCHAR(100), + status VARCHAR(20), + transtimelo NUMBER(11) default 0, + transtimehi NUMBER(11) default 0, + waittime NUMBER(11) default 0, + cloudcomment VARCHAR(200), + space NUMBER(11) default 0, + moduser VARCHAR(30), + modtime DATE default CURRENT_DATE, + 
validation VARCHAR(20), + mcshare NUMBER(11) default 0, + countries VARCHAR(80) +); + + +CREATE TABLE schedconfig +( + name VARCHAR(60) default 'default', + nickname VARCHAR(60) primary key, + queue VARCHAR(60), + localqueue VARCHAR(20), + system VARCHAR(60), + sysconfig VARCHAR(20), + environ VARCHAR(250), + gatekeeper VARCHAR(40), + jobmanager VARCHAR(80), + se VARCHAR(250), + ddm VARCHAR(80), + jdladd CLOB default NULL, + globusadd VARCHAR(100), + jdl VARCHAR(60), + jdltxt CLOB default NULL, + version VARCHAR(60), + site VARCHAR(60), + region VARCHAR(60), + gstat VARCHAR(60), + tags VARCHAR(200), + cmd VARCHAR(200), + lastmod TIMESTAMP default CURRENT_TIMESTAMP, + errinfo VARCHAR(80), + nqueue NUMBER(11) default 0, + queuecomment CLOB default NULL, + appdir VARCHAR(80), + datadir VARCHAR(80), + tmpdir VARCHAR(80), + wntmpdir VARCHAR(80), + dq2url VARCHAR(80), + special_par VARCHAR(80), + python_path VARCHAR(80), + nodes NUMBER(11) default 0, + status VARCHAR(10), + copytool VARCHAR(80), + copysetup VARCHAR(200), + releases VARCHAR(500), + sepath VARCHAR(80), + envsetup VARCHAR(200), + copyprefix VARCHAR(160), + lfcpath VARCHAR(80), + seopt VARCHAR(60), + sein VARCHAR(60), + seinopt VARCHAR(60), + lfchost VARCHAR(80), + cloud VARCHAR(60), + siteid VARCHAR(60), + proxy VARCHAR(80), + retry VARCHAR(10), + queuehours NUMBER(9) default 0, + envsetupin VARCHAR(200), + copytoolin VARCHAR(180), + copysetupin VARCHAR(200), + seprodpath VARCHAR(200), + lfcprodpath VARCHAR(80), + copyprefixin VARCHAR(80), + recoverdir VARCHAR(80), + memory NUMBER(11) default 0, + maxtime NUMBER(11) default 0, + space NUMBER(11) default 0, + tspace TIMESTAMP default TO_DATE('0001-01-01 00:00:00','YYYY-MM-DD HH24:MI:SS'), + cmtconfig VARCHAR(250), + setokens VARCHAR(80), + glexec VARCHAR(10), + priorityoffset VARCHAR(60), + allowedgroups VARCHAR(100), + defaulttoken VARCHAR(100) +); diff --git a/current/pandaserver/test/pcron.sh b/current/pandaserver/test/pcron.sh new file mode 100755 index 000000000..4cb8f3653 --- /dev/null +++ b/current/pandaserver/test/pcron.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +"exec" "python" "$0" "$@" + +import os +import sys +import time +import commands + +_python = "/direct/usatlas+u/gfg/python-latest/python-2.4.1/python-2.4.1/bin/python" + +class Woker: + # constructor + def __init__(self): + pass + + # main + def run(self): + os.chdir('/direct/usatlas+u/sm/panda/pilot2') + com = "python pilot.py -a /usatlas/projects/OSG -d /tmp -l /usatlas/prodjob/share/ -q http://dms02.usatlas.bnl.gov:8000/dq2/ -s BNL_ATLAS_DDM" + os.spawnv(os.P_NOWAIT,_python,com.split()) + +# count # of processes +out = commands.getoutput('ps auxww | grep pilot.py | grep -v auxww | grep -v "sh -c" | grep -v grep' ) +if out == '': + nPilot = 0 +else: + nPilot = len(out.split('\n')) +maxPilot = 10 +print nPilot +if nPilot >= maxPilot: + sys.exit(0) + +for i in range(maxPilot-nPilot): + thr = Woker() + thr.run() + time.sleep(5) diff --git a/current/pandaserver/test/pdq2_cr b/current/pandaserver/test/pdq2_cr new file mode 100755 index 000000000..538a6c5a6 --- /dev/null +++ b/current/pandaserver/test/pdq2_cr @@ -0,0 +1,159 @@ +#!/bin/bash + +"exec" "python" "$0" "$@" + + +def _usage(): + print \ +""" +NAME + pdq2_cr - copy and register DQ2 dataset via PANDA + +SYNOPSIS + + pdq2_cr [ -h | --help] + [ -p | --parallel n ] + [ -t | --timeout n ] + [ -d | --destination destination ] + [ -r | --remote remoteSite ] + [ -s | --source sourceSite ] + datasetname + [lfn1 [lfn2 [...]]] +DESCRIPTION + + dq2_cr copies and registers DQ2 
dataset. It scans the LRC to find missing or corrupted + files in a dataset, copies the files to the local SE using 3rd-party transfers, and + registers the files to the LRC. + +OPTIONS + + -h | --help Print this message + + -p | --parallel Number of copy threads (default:3) + + -t | --timeout Timeout limit in second for each file transfer (default:1800) + + -d | --destination Directory in the storage element where files will be put. + + -r | --remote Specify remote site to which files get copied + + -s | --source Specify source site from which files get copied + +""" + +# error codes +EC_Configuration = 20 +EC_VUID = 30 +EC_QueryFiles = 40 +EC_Location = 50 +EC_Copy = 60 +EC_Main = 70 +EC_PFNfromLFC = 80 +EC_INVALIDSIZE = 90 +EC_RegisterLRC = 100 +EC_LS = 110 + +#################################################################### +# main +def main(): + import sys + import getopt + + # option class + class _options: + def __init__(self): + pass + options = _options() + del _options + # set default values + options.source = '' + options.destination = '' + options.remote = '' + # get command-line parameters + try: + opts, args = getopt.getopt(sys.argv[1:],"hvn:cd:p:t:s:r:l:u", + ["help","verbose","ntry=","choose", + "destination=","parallel=","timeout=", + "source=","remote=","location=","uber", + "noSleep","uberHost=","gsiHost=","srmHost=", + "guids=","lfns=","debug", + ]) + except: + _usage() + print "ERROR : Invalid options" + sys.exit(EC_Main) + # set options + for o, a in opts: + if o in ("-h","--help"): + _usage() + sys.exit() + if o in ("-s","--source"): + options.source = a + if o in ("-r","--remote"): + options.remote = a + if o in ("-d","--destination"): + options.destination = a + # datasetname + if len(args) == 0: + print "ERROR : no datasetname" + sys.exit(EC_Main) + # source + if options.source == "": + print "ERROR : no source. use -s" + sys.exit(EC_Main) + # destination + if options.destination == "": + print "ERROR : no destination. use -d" + sys.exit(EC_Main) + # remote + if options.remote == "": + print "ERROR : no remote. 
use -r" + sys.exit(EC_Main) + + # submit + import time + import commands + import userinterface.Client as Client + from taskbuffer.JobSpec import JobSpec + from taskbuffer.FileSpec import FileSpec + + site = "BNL_ATLAS_DDM" + + datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') + destName = 'BNL_SE' + + jobList = [] + + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s" % commands.getoutput('uuidgen') + job.transformation = 'https://gridui01.usatlas.bnl.gov:24443/dav/test/run_dq2_cr' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 100000 + job.prodSourceLabel = 'test' + job.computingSite = site + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.type = 'log' + job.addFile(fileOL) + + argStr = "" + for arg in sys.argv[1:]: + argStr += "%s " % arg + job.jobParameters = argStr + + jobList.append(job) + + s,o = Client.submitJobs(jobList) + print "---------------------" + print s + for x in o: + print "PandaID=%s" % x[0] + +if __name__ == "__main__": + main() diff --git a/current/pandaserver/test/plot.py b/current/pandaserver/test/plot.py new file mode 100755 index 000000000..9d37de977 --- /dev/null +++ b/current/pandaserver/test/plot.py @@ -0,0 +1,51 @@ +import re +import time +import datetime +import pylab +file = open('panda-DBProxy.log') +datesMap = {} +valuesMap = {} +for line in file: + items = re.findall('countPilotRequests[^\']+\'([^\']+)\': (\d+)',line) + if len(items) != 0: + # statistics + site = items[0][0] + count = float(items[0][1]) + # date + items = re.split(' |,',line) + if len(items) >= 2: + strDate = '%s %s' % tuple(items[:2]) + datetimeTime = datetime.datetime(*time.strptime(strDate,'%Y-%m-%d %H:%M:%S')[:6]) + # assign + if not datesMap.has_key(site): + datesMap[site] = [] + valuesMap[site] = [] + datesMap[site].append(pylab.date2num(datetimeTime)) + valuesMap[site].append(count) +# close file +file.close() +# plot +nRow = 1 #len(datesMap.keys()) +nCol = 1 +nFig = 1 +tFig = 1 +sites = datesMap.keys() +sites.sort() +for site in sites: + if nFig == (nRow*nCol+1): + pylab.savefig('pilot%d.png' % tFig) + tFig += 1 + pylab.figure(tFig) + nFig = 1 + pylab.subplot(int('%d%d%d' % (nRow,nCol,nFig))) + pylab.title('Number of pilots @%s' % site) + pylab.plot_date(datesMap[site],valuesMap[site]) + nFig += 1 +# save the last figure +pylab.savefig('pilot%d.png' % tFig) +# show +#pylab.show() + + + + diff --git a/current/pandaserver/test/prioryMassage.py b/current/pandaserver/test/prioryMassage.py new file mode 100644 index 000000000..887bca19f --- /dev/null +++ b/current/pandaserver/test/prioryMassage.py @@ -0,0 +1,364 @@ +import os +import re +import sys +import datetime +from taskbuffer.TaskBuffer import taskBuffer +from pandalogger.PandaLogger import PandaLogger + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# logger +_logger = PandaLogger().getLogger('prioryMassage') + +_logger.debug("================= start ==================") + +# instantiate TB +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +# get usage breakdown +usageBreakDownPerUser = {} +usageBreakDownPerSite = {} +workingGroupList = [] +for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsArchived4']: + varMap = {} + varMap[':prodSourceLabel'] = 'user' + if table == 'ATLAS_PANDA.jobsActive4': + sql = 
"SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table + else: + # with time range for archived table + varMap[':modificationTime'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=60) + sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND modificationTime>:modificationTime GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table + # exec + status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10000) + if res == None: + _logger.debug("total %s " % res) + else: + _logger.debug("total %s " % len(res)) + # make map + for cnt,prodUserName,jobStatus,workingGroup,computingSite in res: + # use workingGroup name as prodUserName + if workingGroup != None: + if not workingGroup in workingGroupList: + workingGroupList.append(workingGroup) + prodUserName = workingGroup + workingGroup = None + # append to PerUser map + if not usageBreakDownPerUser.has_key(prodUserName): + usageBreakDownPerUser[prodUserName] = {} + if not usageBreakDownPerUser[prodUserName].has_key(workingGroup): + usageBreakDownPerUser[prodUserName][workingGroup] = {} + if not usageBreakDownPerUser[prodUserName][workingGroup].has_key(computingSite): + usageBreakDownPerUser[prodUserName][workingGroup][computingSite] = {'rundone':0,'activated':0} + # append to PerSite map + if not usageBreakDownPerSite.has_key(computingSite): + usageBreakDownPerSite[computingSite] = {} + if not usageBreakDownPerSite[computingSite].has_key(prodUserName): + usageBreakDownPerSite[computingSite][prodUserName] = {} + if not usageBreakDownPerSite[computingSite][prodUserName].has_key(workingGroup): + usageBreakDownPerSite[computingSite][prodUserName][workingGroup] = {'rundone':0,'activated':0} + # count # of running/done and activated + if jobStatus in ['activated']: + usageBreakDownPerUser[prodUserName][workingGroup][computingSite]['activated'] += cnt + usageBreakDownPerSite[computingSite][prodUserName][workingGroup]['activated'] += cnt + elif jobStatus in ['cancelled','holding']: + pass + else: + usageBreakDownPerUser[prodUserName][workingGroup][computingSite]['rundone'] += cnt + usageBreakDownPerSite[computingSite][prodUserName][workingGroup]['rundone'] += cnt + +# get total number of users and running/done jobs +totalUsers = 0 +totalRunDone = 0 +for prodUserName,wgValMap in usageBreakDownPerUser.iteritems(): + for workingGroup,siteValMap in wgValMap.iteritems(): + # ignore group production + if workingGroup != None: + continue + totalUsers += 1 + for computingSite,statValMap in siteValMap.iteritems(): + totalRunDone += statValMap['rundone'] + +_logger.debug("total users : %s" % totalUsers) +_logger.debug("total RunDone : %s" % totalRunDone) +_logger.debug("") + +if totalUsers == 0: + sys.exit(0) + +# global average +globalAverageRunDone = float(totalRunDone)/float(totalUsers) + +_logger.debug("global average : %s" % globalAverageRunDone) + +# count the number of users and run/done jobs for each site +siteRunDone = {} +siteUsers = {} +for computingSite,userValMap in usageBreakDownPerSite.iteritems(): + for prodUserName,wgValMap in userValMap.iteritems(): + for workingGroup,statValMap in wgValMap.iteritems(): + # ignore group production + if workingGroup != None: + continue + # count the number of users and running/done jobs + if not siteUsers.has_key(computingSite): + siteUsers[computingSite] = 0 + siteUsers[computingSite] += 1 + if not 
siteRunDone.has_key(computingSite): + siteRunDone[computingSite] = 0 + siteRunDone[computingSite] += statValMap['rundone'] + +# get site average +_logger.debug("site average") +siteAverageRunDone = {} +for computingSite,nRunDone in siteRunDone.iteritems(): + siteAverageRunDone[computingSite] = float(nRunDone)/float(siteUsers[computingSite]) + _logger.debug(" %-25s : %s" % (computingSite,siteAverageRunDone[computingSite])) + +# check if the number of user's jobs is lower than the average +for prodUserName,wgValMap in usageBreakDownPerUser.iteritems(): + _logger.debug("---> %s" % prodUserName) + # no private jobs + if not wgValMap.has_key(None): + _logger.debug("no private jobs") + continue + # count the number of running/done jobs + userTotalRunDone = 0 + for workingGroup,siteValMap in wgValMap.iteritems(): + if workingGroup != None: + continue + for computingSite,statValMap in siteValMap.iteritems(): + userTotalRunDone += statValMap['rundone'] + # no priority boost when the number of jobs is higher than the average + if userTotalRunDone >= globalAverageRunDone: + _logger.debug("enough running %s > %s (global average)" % (userTotalRunDone,globalAverageRunDone)) + continue + _logger.debug("user total:%s global average:%s" % (userTotalRunDone,globalAverageRunDone)) + # check with site average + toBeBoostedSites = [] + for computingSite,statValMap in wgValMap[None].iteritems(): + # the number of running/done jobs is lower than the average and activated jobs are waiting + if statValMap['rundone'] >= siteAverageRunDone[computingSite]: + _logger.debug("enough running %s > %s (site average) at %s" % \ + (statValMap['rundone'],siteAverageRunDone[computingSite],computingSite)) + elif statValMap['activated'] == 0: + _logger.debug("no activated jobs at %s" % computingSite) + else: + toBeBoostedSites.append(computingSite) + # no boost is required + if toBeBoostedSites == []: + _logger.debug("no sites to be boosted") + continue + # check special prioritized site + siteAccessForUser = {} + varMap = {} + varMap[':dn'] = prodUserName + sql = "SELECT pandaSite,pOffset,status,workingGroups FROM ATLAS_PANDAMETA.siteAccess WHERE dn=:dn" + status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10000) + if res != None: + for pandaSite,pOffset,pStatus,workingGroups in res: + # ignore special working group for now + if not workingGroups in ['',None]: + continue + # only approved sites + if pStatus != 'approved': + continue + # no priority boost + if pOffset == 0: + continue + # append + siteAccessForUser[pandaSite] = pOffset + # set weight + totalW = 0 + defaultW = 100 + for computingSite in toBeBoostedSites: + totalW += defaultW + if siteAccessForUser.has_key(computingSite): + totalW += siteAccessForUser[computingSite] + totalW = float(totalW) + # the total number of jobs to be boosted + numBoostedJobs = globalAverageRunDone - float(userTotalRunDone) + # get quota + quotaFactor = 1.0 + taskBuffer.checkQuota(prodUserName) + _logger.debug("quota factor:%s" % quotaFactor) + # make priority boost + nJobsPerPrioUnit = 5 + highestPrio = 1000 + for computingSite in toBeBoostedSites: + weight = float(defaultW) + if siteAccessForUser.has_key(computingSite): + weight += float(siteAccessForUser[computingSite]) + weight /= totalW + # the number of boosted jobs at the site + numBoostedJobsSite = int(numBoostedJobs * weight / quotaFactor) + _logger.debug("nSite:%s nAll:%s W:%s Q:%s at %s" % (numBoostedJobsSite,numBoostedJobs,weight,quotaFactor,computingSite)) + if numBoostedJobsSite/nJobsPerPrioUnit == 0: + _logger.debug("too 
small number of jobs %s to be boosted at %s" % (numBoostedJobsSite,computingSite)) + continue + # get the highest prio of activated jobs at the site + varMap = {} + varMap[':jobStatus'] = 'activated' + varMap[':prodSourceLabel'] = 'user' + varMap[':prodUserName'] = prodUserName + varMap[':computingSite'] = computingSite + sql = "SELECT MAX(currentPriority) FROM ATLAS_PANDA.jobsActive4 WHERE prodSourceLabel=:prodSourceLabel AND prodUserName=:prodUserName AND workingGroup IS NULL AND jobStatus=:jobStatus AND computingSite=:computingSite" + status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10) + maxPrio = None + if res != None: + try: + maxPrio = res[0][0] + except: + pass + if maxPrio == None: + _logger.debug("cannot get the highest prio at %s" % computingSite) + continue + # delta for priority boost + prioDelta = highestPrio - maxPrio + # already boosted + if prioDelta <= 0: + _logger.debug("already boosted (prio=%s) at %s" % (maxPrio,computingSite)) + continue + # lower limit + minPrio = maxPrio - numBoostedJobsSite/nJobsPerPrioUnit + # SQL for priority boost + varMap = {} + varMap[':jobStatus'] = 'activated' + varMap[':prodSourceLabel'] = 'user' + varMap[':prodUserName'] = prodUserName + varMap[':computingSite'] = computingSite + varMap[':prioDelta'] = prioDelta + varMap[':maxPrio'] = maxPrio + varMap[':minPrio'] = minPrio + varMap[':rlimit'] = numBoostedJobsSite + sql = "UPDATE ATLAS_PANDA.jobsActive4 SET currentPriority=currentPriority+:prioDelta " + sql += "WHERE prodSourceLabel=:prodSourceLabel " + if prodUserName in workingGroupList: + sql += "AND workingGroup=:prodUserName " + else: + sql += "AND prodUserName=:prodUserName AND workingGroup IS NULL " + sql += "AND jobStatus=:jobStatus AND computingSite=:computingSite AND currentPriority>:minPrio " + sql += "AND currentPriority<=:maxPrio AND rownum<=:rlimit" + _logger.debug("boost %s" % str(varMap)) + status,res = taskBuffer.querySQLS(sql,varMap,arraySize=10) + _logger.debug(" database return : %s" % res) + + +# redo stalled analysis jobs +_logger.debug("=== redo stalled jobs") +try: + varMap = {} + varMap[':prodSourceLabel'] = 'user' + sqlJ = "SELECT jobDefinitionID,prodUserName FROM ATLAS_PANDA.jobsDefined4 " + sqlJ += "WHERE prodSourceLabel=:prodSourceLabel AND modificationTime delete downstream jobs") + # FIXME + #taskBuffer.deleteStalledJobs(libLFN) + else: + # activate + if useLib and libStatus == 'ready' and (not libGUID in [None,'']) and (not libDSName in [None,'']): + # update GUID + _logger.debug(" set GUID:%s for %s" % (libGUID,libLFN)) + #retG = taskBuffer.setGUIDs([{'lfn':libLFN,'guid':libGUID}]) + # FIXME + retG = True + if not retG: + _logger.error(" failed to update GUID for %s" % libLFN) + else: + # get PandaID with lib.tgz + #ids = taskBuffer.updateInFilesReturnPandaIDs(libDSName,'ready') + ids = [] + # get jobs + jobs = taskBuffer.peekJobs(ids,fromActive=False,fromArchived=False,fromWaiting=False) + # remove None and unknown + acJobs = [] + for job in jobs: + if job == None or job.jobStatus == 'unknown': + continue + acJobs.append(job) + # activate + _logger.debug(" -> activate downstream jobs") + #taskBuffer.activateJobs(acJobs) + else: + # wait + _logger.debug(" -> wait") + varMap = {} + varMap[':prodSourceLabel'] = 'user' + varMap[':jobDefinitionID'] = jobDefinitionID + varMap[':prodUserName'] = prodUserName + # FIXME + #stU,resU = taskBuffer.querySQLS(sqlU,varMap) +except: + errtype,errvalue = sys.exc_info()[:2] + _logger.error("failed to redo stalled jobs with %s %s" % (errtype,errvalue)) + 
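+# Worked example for the priority boost above (hypothetical numbers, added for
+# illustration): the UPDATE raises at most numBoostedJobsSite activated jobs per
+# site by prioDelta = highestPrio - maxPrio, touching only jobs whose
+# currentPriority lies in (minPrio, maxPrio] with
+# minPrio = maxPrio - numBoostedJobsSite/nJobsPerPrioUnit.
+# E.g. with maxPrio=700, numBoostedJobsSite=20 and nJobsPerPrioUnit=5 this gives
+# prioDelta=300 and minPrio=696, so up to 20 activated jobs with priority 697-700
+# are raised towards 1000; sites with fewer than nJobsPerPrioUnit boostable jobs
+# are skipped by the numBoostedJobsSite/nJobsPerPrioUnit == 0 check.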
+_logger.debug("-------------- end") diff --git a/current/pandaserver/test/proxy.sh b/current/pandaserver/test/proxy.sh new file mode 100755 index 000000000..674e1d248 --- /dev/null +++ b/current/pandaserver/test/proxy.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l + +echo '************** start' +date +source /afs/cern.ch/project/gd/LCG-share/current/external/etc/profile.d/grid-env.sh +echo '************** check proxy' +voms-proxy-info -all +echo '************** check novoms' +voms-proxy-info -all -file /tmp/x509up_u`id -u`_novoms +echo '************** voms-proxy-init' +voms-proxy-init -voms atlas:/atlas/usatlas/Role=production -valid 100000:0 -noregen -debug -cert /tmp/x509up_u`id -u`_novoms +echo '************** check new proxy' +voms-proxy-info -all +echo '************** end' +echo diff --git a/current/pandaserver/test/reassignDefJobs.py b/current/pandaserver/test/reassignDefJobs.py new file mode 100755 index 000000000..3aecd1374 --- /dev/null +++ b/current/pandaserver/test/reassignDefJobs.py @@ -0,0 +1,63 @@ +import sys +import time +import datetime +from taskbuffer.OraDBProxy import DBProxy +import userinterface.Client as Client +from dataservice.DDM import ddm + +timeL = 60 +if len(sys.argv) == 2: + timeL = int(sys.argv[1]) + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# erase datasets +def eraseDispDatasets(ids): + datasets = [] + # get jobs + status,jobs = Client.getJobStatus(ids) + if status != 0: + return + # gather dispDBlcoks + for job in jobs: + for file in job.Files: + if not file.dispatchDBlock in datasets: + datasets.append(file.dispatchDBlock) + # erase + for dataset in datasets: + ddm.DQ2.main(['eraseDataset',datasets]) + +# time limit +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=int(timeL)) + +# instantiate DB proxies +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +while True: + # get PandaIDs + varMap = {} + varMap[':jobStatus'] = 'defined' + varMap[':modificationTime'] = timeLimit + varMap[':prodSourceLabel'] = 'managed' + sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE jobStatus=:jobStatus AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" + status,res = proxyS.querySQLS(sql,varMap) + # escape + if len(res) == 0: + break + # convert to list + jobs = [] + for id, in res: + jobs.append(id) + # reassign + nJob = 100 + iJob = 0 + while iJob < len(jobs): + print 'reassignJobs(%s)' % jobs[iJob:iJob+nJob] + Client.reassignJobs(jobs[iJob:iJob+nJob]) + iJob += nJob + time.sleep(120) + + diff --git a/current/pandaserver/test/reassignJobs.py b/current/pandaserver/test/reassignJobs.py new file mode 100755 index 000000000..ab17c5b42 --- /dev/null +++ b/current/pandaserver/test/reassignJobs.py @@ -0,0 +1,14 @@ +import sys + +import userinterface.Client as Client + +if len(sys.argv) == 2: + Client.reassignJobs([sys.argv[1]]) +else: + startID = int(sys.argv[1]) + endID = int(sys.argv[2]) + if startID > endID: + print '%d is less than %d' % (endID,startID) + sys.exit(1) + Client.reassignJobs(range(startID,endID+1)) + diff --git a/current/pandaserver/test/reassignSite.py b/current/pandaserver/test/reassignSite.py new file mode 100644 index 000000000..2d80aaa36 --- /dev/null +++ b/current/pandaserver/test/reassignSite.py @@ -0,0 +1,64 @@ +import sys +import time +import datetime + +from taskbuffer.OraDBProxy import DBProxy +# password +from config import panda_config + +proxyS = DBProxy() 
+proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +site = sys.argv[1] +import userinterface.Client as Client + +# erase dispatch datasets +def eraseDispDatasets(ids): + print "eraseDispDatasets" + datasets = [] + # get jobs + status,jobs = Client.getJobStatus(ids) + if status != 0: + return + # gather dispDBlcoks + for job in jobs: + # dispatchDS is not a DQ2 dataset in US + if job.cloud == 'US': + continue + # erase disp datasets for production jobs only + if job.prodSourceLabel != 'managed': + continue + for file in job.Files: + if file.dispatchDBlock == 'NULL': + continue + if (not file.dispatchDBlock in datasets) and \ + re.search('_dis\d+$',file.dispatchDBlock) != None: + datasets.append(file.dispatchDBlock) + # erase + for dataset in datasets: + print 'erase %s' % dataset + status,out = ddm.DQ2.main('eraseDataset',dataset) + print out + +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=4) +varMap[':jobStatus'] = 'activated' +varMap[':modificationTime'] = timeLimit +varMap[':prodSourceLabel'] = 'managed' +varMap[':computingSite'] = site +sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND computingSite=:computingSite AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" +status,res = proxyS.querySQLS(sql,varMap) + +jobs = [] +if res != None: + for (id,) in res: + jobs.append(id) +if len(jobs): + nJob = 100 + iJob = 0 + while iJob < len(jobs): + print 'reassign %s' % str(jobs[iJob:iJob+nJob]) + eraseDispDatasets(jobs[iJob:iJob+nJob]) + Client.reassignJobs(jobs[iJob:iJob+nJob]) + iJob += nJob + time.sleep(10) + diff --git a/current/pandaserver/test/reassignTask.py b/current/pandaserver/test/reassignTask.py new file mode 100644 index 000000000..475975aeb --- /dev/null +++ b/current/pandaserver/test/reassignTask.py @@ -0,0 +1,60 @@ +import re +import sys +import time +import datetime + +from taskbuffer.OraDBProxy import DBProxy +# password +from config import panda_config + +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +taskid = sys.argv[1] +import userinterface.Client as Client + +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) +varMap = {} +varMap[':modificationTime'] = timeLimit +varMap[':prodSourceLabel'] = 'managed' +varMap[':taskID'] = taskid +sql = "SELECT PandaID FROM ATLAS_PANDA.jobsDefined4 WHERE taskID=:taskID AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" +status,res = proxyS.querySQLS(sql,varMap) + +jobs = [] +if res != None: + for (id,) in res: + jobs.append(id) +if len(jobs): + nJob = 100 + iJob = 0 + while iJob < len(jobs): + print 'reassign %s' % str(jobs[iJob:iJob+nJob]) + Client.reassignJobs(jobs[iJob:iJob+nJob]) + iJob += nJob + time.sleep(10) + +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) +varMap = {} +varMap[':jobStatus'] = 'activated' +varMap[':modificationTime'] = timeLimit +varMap[':prodSourceLabel'] = 'managed' +varMap[':taskID'] = taskid +sql = "SELECT PandaID FROM ATLAS_PANDA.jobsActive4 WHERE jobStatus=:jobStatus AND taskID=:taskID AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" +status,res = proxyS.querySQLS(sql,varMap) + +jobs = [] +if res != None: + for (id,) in res: + jobs.append(id) +if len(jobs): + nJob = 100 + iJob = 0 + while iJob < len(jobs): + print 'reassign %s' % str(jobs[iJob:iJob+nJob]) + 
Client.reassignJobs(jobs[iJob:iJob+nJob]) + iJob += nJob + time.sleep(10) + + + diff --git a/current/pandaserver/test/reassignWaiting.py b/current/pandaserver/test/reassignWaiting.py new file mode 100755 index 000000000..24c8a232f --- /dev/null +++ b/current/pandaserver/test/reassignWaiting.py @@ -0,0 +1,39 @@ +import time +import datetime +from taskbuffer.OraDBProxy import DBProxy +import userinterface.Client as Client + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# time limit +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1) + +# instantiate DB proxies +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +while True: + # get PandaIDs + varMap = {} + varMap[':modificationTime'] = timeLimit + sql = "SELECT PandaID FROM ATLAS_PANDA.jobsWaiting4 WHERE modificationTime<:modificationTime ORDER BY PandaID" + status,res = proxyS.querySQLS(sql,varMap) + + # escape + if len(res) == 0: + break + # convert to list + jobs = [] + for id, in res: + jobs.append(id) + # reassign + nJob = 300 + iJob = 0 + while iJob < len(jobs): + print 'reassignJobs(%s)' % jobs[iJob:iJob+nJob] + Client.reassignJobs(jobs[iJob:iJob+nJob]) + iJob += nJob + time.sleep(60) + diff --git a/current/pandaserver/test/redirectLog.py b/current/pandaserver/test/redirectLog.py new file mode 100755 index 000000000..351d4a192 --- /dev/null +++ b/current/pandaserver/test/redirectLog.py @@ -0,0 +1,40 @@ + +""" +redirect apache log to the logging server + +""" + +import re +from pandalogger.PandaLogger import PandaLogger + +# logger +_loggerMap = {} +pandaLogger = PandaLogger() + +while True: + # read line + line = raw_input() + # extract host, request and response + items = re.findall('(\S+) - - \[[^\]]+\] ("[^"]+") (\d+)',line) + if len(items) == 1: + # host + host = items[0][0] + # request + request = items[0][1].split()[1].split('/')[-1] + if request == 'isAlive': + # somehow isAlive is not recorded + request = 'IsAlive' + # set logtype + if request.startswith('datasetCompleted'): + logtype = 'datasetCompleted' + else: + logtype = request + # response + response = items[0][2] + # make message + message = '%s - %s %s' % (host,request,response) + # get logger + pandaLogger.setParam('Type',logtype) + logger = pandaLogger.getHttpLogger('prod') + # add message + logger.info(message) diff --git a/current/pandaserver/test/redirectLog.sh b/current/pandaserver/test/redirectLog.sh new file mode 100755 index 000000000..c60e9ff27 --- /dev/null +++ b/current/pandaserver/test/redirectLog.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +BASEPATH=/usatlas/u/sm/prod +BINPATH=/usatlas/u/sm/latest +LOG=$BASEPATH/httpd/logs/access_log + +# for python +export PATH=$BINPATH/python/bin:$PATH +export PYTHONPATH=$BASEPATH/panda:$PYTHONPATH + +tail -F $LOG | python $BASEPATH/panda/test/redirectLog.py diff --git a/current/pandaserver/test/resubmitJobs.py b/current/pandaserver/test/resubmitJobs.py new file mode 100755 index 000000000..7272d19ca --- /dev/null +++ b/current/pandaserver/test/resubmitJobs.py @@ -0,0 +1,14 @@ +import sys + +import userinterface.Client as Client + +if len(sys.argv) == 2: + Client.resubmitJobs([sys.argv[1]]) +else: + startID = int(sys.argv[1]) + endID = int(sys.argv[2]) + if startID > endID: + print '%d is less than %d' % (endID,startID) + sys.exit(1) + Client.resubmitJobs(range(startID,endID+1)) + diff --git a/current/pandaserver/test/runMerger.py b/current/pandaserver/test/runMerger.py new file mode 100644 index 
000000000..ba765b16f --- /dev/null +++ b/current/pandaserver/test/runMerger.py @@ -0,0 +1,219 @@ +import os +import re +import sys +import time +import datetime +import commands +import threading + +from config import panda_config + +# initialize cx_Oracle using dummy connection +from taskbuffer.Initializer import initializer +initializer.init() + +from dataservice.Merger import Merger +from taskbuffer.TaskBuffer import taskBuffer +from pandalogger.PandaLogger import PandaLogger + + +# logger +_logger = PandaLogger().getLogger('runMerger') + +_logger.debug("================= start ==================") + +# overall timeout value +overallTimeout = 60 + +# kill old process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName) + for line in out.split('\n'): + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill process : %s %s" % (type,value)) + +# time limit +timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(minutes=5) +timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(hours=12) +timeLimitX = datetime.datetime.utcnow() - datetime.timedelta(hours=6) + +# instantiate TB +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +# thread pool +class ThreadPool: + def __init__(self): + self.lock = threading.Lock() + self.list = [] + + def add(self,obj): + self.lock.acquire() + self.list.append(obj) + self.lock.release() + + def remove(self,obj): + self.lock.acquire() + self.list.remove(obj) + self.lock.release() + + def join(self): + self.lock.acquire() + thrlist = tuple(self.list) + self.lock.release() + for thr in thrlist: + thr.join() + + +# thread to merge dataset +class MergerThr (threading.Thread): + def __init__(self,lock,proxyLock,datasets,pool): + threading.Thread.__init__(self) + self.datasets = datasets + self.lock = lock + self.proxyLock = proxyLock + self.pool = pool + self.maxTry = 3 + self.pool.add(self) + + def run(self): + self.lock.acquire() + try: + # loop over all datasets + for vuid,name,modDate,verNum in self.datasets: + try: + try: + verNum = int(verNum) + except: + verNum = 0 + _logger.debug("Merge %s %s %s" % (modDate,name,verNum)) + toBeClosed = False + # close old datasets anyway + if modDate < timeLimitX or verNum >= self.maxTry: + toBeClosed = True + # check version + dsSpec = taskBuffer.queryDatasetWithMap({'vuid':vuid}) + if dsSpec == None: + _logger.error("failed to get dataset spec for %s:%s" % (name,vuid)) + continue + try: + if int(dsSpec.version) != verNum+1: + _logger.debug("skip %s due to version mismatch %s != %s+1" % (name,dsSpec.version,verNum)) + continue + except: + _logger.error("failed to convert version='%s' to int for %s" % (dsSpec.version,name)) + continue + # get PandaID + self.proxyLock.acquire() + proxyS = taskBuffer.proxyPool.getProxy() + pandaID 
= proxyS.getPandaIDwithDestDBlock(name) + taskBuffer.proxyPool.putProxy(proxyS) + self.proxyLock.release() + if pandaID == None: + _logger.error("failed to find PandaID for %s" % name) + toBeClosed = True + else: + # get job + self.proxyLock.acquire() + pandaJob = taskBuffer.peekJobs([pandaID])[0] + self.proxyLock.release() + if pandaJob == None: + _logger.error("failed to get job for %s PandaID=%s" % (name,pandaID)) + toBeClosed = True + else: + # run merger + _logger.debug("run merger for %s" % name) + merger = Merger(taskBuffer,pandaJob) + mRet = merger.run() + if mRet == None: + _logger.debug("got unrecoverable for %s" % name) + toBeClosed = True + elif mRet == True: + _logger.debug("succeeded for %s" % name) + toBeClosed = True + else: + _logger.debug("failed for %s" % name) + # close dataset + if toBeClosed: + _logger.debug("close %s" % name) + self.proxyLock.acquire() + varMap = {} + varMap[':vuid'] = vuid + varMap[':status'] = 'tobeclosed' + taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", + varMap) + self.proxyLock.release() + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("Failed %s with %s:%s" % (name,errType,errValue)) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("MergerThr failed with %s:%s" % (errType,errValue)) + self.pool.remove(self) + self.lock.release() + + +# start merger +mergeLock = threading.Semaphore(3) +mergeProxyLock = threading.Lock() +mergeThreadPool = ThreadPool() +maxRows = 10000 +sqlQuery = "type=:type AND status=:status AND (modificationdate BETWEEN :modificationdateL AND :modificationdateU) AND rownum <= %s" % maxRows +while True: + # lock + mergeLock.acquire() + # get datasets + mergeProxyLock.acquire() + varMap = {} + varMap[':modificationdateU'] = timeLimitU + varMap[':modificationdateL'] = timeLimitL + varMap[':type'] = 'output' + varMap[':status'] = 'tobemerged' + proxyS = taskBuffer.proxyPool.getProxy() + res = proxyS.getLockDatasets(sqlQuery,varMap,modTimeOffset='90/24/60',getVersion=True) + taskBuffer.proxyPool.putProxy(proxyS) + if res == None: + _logger.debug("# of datasets to be merged: %s" % res) + else: + _logger.debug("# of datasets to be merged: %s" % len(res)) + if res==None or len(res)==0: + mergeProxyLock.release() + mergeLock.release() + break + # release + mergeProxyLock.release() + mergeLock.release() + # run thread + iRows = 0 + nRows = 100 + while iRows < len(res): + mergerThr = MergerThr(mergeLock,mergeProxyLock,res[iRows:iRows+nRows],mergeThreadPool) + mergerThr.start() + iRows += nRows + mergeThreadPool.join() + if len(res) < maxRows: + break + + +_logger.debug("================= end ==================") diff --git a/current/pandaserver/test/runRebro.py b/current/pandaserver/test/runRebro.py new file mode 100755 index 000000000..494a0798d --- /dev/null +++ b/current/pandaserver/test/runRebro.py @@ -0,0 +1,198 @@ +import os +import re +import sys +import pytz +import time +import fcntl +import types +import shelve +import random +import datetime +import commands +import threading +import userinterface.Client as Client +from dataservice.DDM import ddm +from dataservice.DDM import dashBorad +from taskbuffer.OraDBProxy import DBProxy +from taskbuffer.TaskBuffer import taskBuffer +from pandalogger.PandaLogger import PandaLogger +from jobdispatcher.Watcher import Watcher +from brokerage.SiteMapper import SiteMapper +from dataservice.Adder import Adder +from dataservice.Finisher import Finisher +from dataservice.MailUtils import 
MailUtils +from taskbuffer import ProcessGroups +import brokerage.broker_util +import brokerage.broker +import taskbuffer.ErrorCode +import dataservice.DDM + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# logger +_logger = PandaLogger().getLogger('runRebro') + +_logger.debug("===================== start =====================") + +# memory checker +def _memoryCheck(str): + try: + proc_status = '/proc/%d/status' % os.getpid() + procfile = open(proc_status) + name = "" + vmSize = "" + vmRSS = "" + # extract Name,VmSize,VmRSS + for line in procfile: + if line.startswith("Name:"): + name = line.split()[-1] + continue + if line.startswith("VmSize:"): + vmSize = "" + for item in line.split()[1:]: + vmSize += item + continue + if line.startswith("VmRSS:"): + vmRSS = "" + for item in line.split()[1:]: + vmRSS += item + continue + procfile.close() + _logger.debug('MemCheck - %s Name=%s VSZ=%s RSS=%s : %s' % (os.getpid(),name,vmSize,vmRSS,str)) + except: + type, value, traceBack = sys.exc_info() + _logger.error("memoryCheck() : %s %s" % (type,value)) + _logger.debug('MemCheck - %s unknown : %s' % (os.getpid(),str)) + return + +_memoryCheck("start") + +# kill old process +try: + # time limit + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=7) + # get process list + scriptName = sys.argv[0] + out = commands.getoutput('ps axo user,pid,lstart,args | grep %s' % scriptName) + for line in out.split('\n'): + items = line.split() + # owned process + if not items[0] in ['sm','atlpan','root']: # ['os.getlogin()']: doesn't work in cron + continue + # look for python + if re.search('python',line) == None: + continue + # PID + pid = items[1] + # start time + timeM = re.search('(\S+\s+\d+ \d+:\d+:\d+ \d+)',line) + startTime = datetime.datetime(*time.strptime(timeM.group(1),'%b %d %H:%M:%S %Y')[:6]) + # kill old process + if startTime < timeLimit: + _logger.debug("old process : %s %s" % (pid,startTime)) + _logger.debug(line) + commands.getoutput('kill -9 %s' % pid) +except: + type, value, traceBack = sys.exc_info() + _logger.error("kill process : %s %s" % (type,value)) + + +# instantiate TB +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +# instantiate sitemapper +siteMapper = SiteMapper(taskBuffer) + +_memoryCheck("rebroker") + +# rebrokerage +_logger.debug("Rebrokerage start") +try: + normalTimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=24) + sortTimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) + sql = "SELECT jobDefinitionID,prodUserName,prodUserID,computingSite,MAX(modificationTime) FROM ATLAS_PANDA.jobsActive4 " + sql += "WHERE prodSourceLabel IN (:prodSourceLabel1,:prodSourceLabel2) AND jobStatus=:jobStatus " + sql += "AND modificationTime<:modificationTime " + sql += "AND jobsetID IS NOT NULL " + sql += "AND processingType IN (:processingType1,:processingType2) " + sql += "GROUP BY jobDefinitionID,prodUserName,prodUserID,computingSite " + varMap = {} + varMap[':prodSourceLabel1'] = 'user' + varMap[':prodSourceLabel2'] = 'panda' + varMap[':modificationTime'] = sortTimeLimit + varMap[':processingType1'] = 'pathena' + varMap[':processingType2'] = 'prun' + varMap[':jobStatus'] = 'activated' + # get jobs older than threshold + ret,res = taskBuffer.querySQLS(sql, varMap) + sql = "SELECT PandaID,modificationTime FROM %s WHERE prodUserName=:prodUserName AND jobDefinitionID=:jobDefinitionID " + sql += "AND modificationTime>:modificationTime AND rownum <= 1" + if res != None: + from 
userinterface.ReBroker import ReBroker + recentRuntimeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3) + # loop over all user/jobID combinations + iComb = 0 + nComb = len(res) + _logger.debug("total combinations = %s" % nComb) + for jobDefinitionID,prodUserName,prodUserID,computingSite,maxModificationTime in res: + # check time if it is closed to log-rotate + timeNow = datetime.datetime.now(pytz.timezone('Europe/Zurich')) + timeCron = timeNow.replace(hour=4,minute=0,second=0,microsecond=0) + if (timeNow-timeCron) < datetime.timedelta(seconds=60*10) and \ + (timeCron-timeNow) < datetime.timedelta(seconds=60*30): + _logger.debug("terminate since close to log-rotate time") + break + # check if jobs with the jobID have run recently + varMap = {} + varMap[':prodUserName'] = prodUserName + varMap[':jobDefinitionID'] = jobDefinitionID + varMap[':modificationTime'] = recentRuntimeLimit + _logger.debug(" rebro:%s/%s:ID=%s:%s" % (iComb,nComb,jobDefinitionID,prodUserName)) + iComb += 1 + hasRecentJobs = False + # check site + if not siteMapper.checkSite(computingSite): + _logger.debug(" -> skip unknown site=%s" % computingSite) + continue + # check site status + tmpSiteStatus = siteMapper.getSite(computingSite).status + if not tmpSiteStatus in ['offline','test']: + # use normal time limit for nornal site status + if maxModificationTime > normalTimeLimit: + _logger.debug(" -> skip wait for normal timelimit=%s skip %s ran recently at %s" % (resU[0][0],resU[0][1])) + break + else: + _logger.debug(" -> immidiate rebro due to site status=%s" % tmpSiteStatus) + if hasRecentJobs: + # skip since some jobs have run recently + continue + else: + reBroker = ReBroker(taskBuffer) + # try to lock + rebRet,rebOut = reBroker.lockJob(prodUserID,jobDefinitionID) + if not rebRet: + # failed to lock + _logger.debug(" -> failed to lock : %s" % rebOut) + continue + else: + # start + _logger.debug(" -> start") + reBroker.start() + reBroker.join() +except: + errType,errValue = sys.exc_info()[:2] + _logger.error("rebrokerage failed with %s:%s" % (errType,errValue)) + +_logger.debug("===================== end =====================") diff --git a/current/pandaserver/test/setPriority.py b/current/pandaserver/test/setPriority.py new file mode 100755 index 000000000..7dab5b3c2 --- /dev/null +++ b/current/pandaserver/test/setPriority.py @@ -0,0 +1,30 @@ +import time +import sys +import optparse + + +from taskbuffer.OraDBProxy import DBProxy + +# password +from config import panda_config + +usage = """%prog + + Set a priority to jobs in a task""" + +optP = optparse.OptionParser(usage=usage,conflict_handler="resolve") +options,args = optP.parse_args() + + +proxyS = DBProxy() +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +varMap = {} +varMap[':prodSourceLabel'] = 'managed' +varMap[':taskID'] = sys.argv[1] +varMap[':prio'] = sys.argv[2] +sql = "UPDATE %s SET currentPriority=:prio WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID" +for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: + status,res = proxyS.querySQLS(sql % table,varMap) + + diff --git a/current/pandaserver/test/testDB.py b/current/pandaserver/test/testDB.py new file mode 100755 index 000000000..752bf3f77 --- /dev/null +++ b/current/pandaserver/test/testDB.py @@ -0,0 +1,88 @@ +#!/usr/bin/python + +""" +test DB access + +""" + +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec +from taskbuffer.DatasetSpec import 
DatasetSpec +from taskbuffer.DBProxyPool import DBProxyPool + +import getpass +passwd = getpass.getpass() + +pool = DBProxyPool('adbpro.usatlas.bnl.gov',passwd,2) + +proxy = pool.getProxy() + +import sys +import commands + +job1 = JobSpec() +job1.PandaID='NULL' +job1.jobStatus='unknown' +job1.computingSite="aaa" +f11 = FileSpec() +f11.lfn = 'in1.pool.root' +f11.type = 'input' +job1.addFile(f11) +f12 = FileSpec() +f12.lfn = 'out1.pool.root' +f12.type = 'output' +job1.addFile(f12) + +job2 = JobSpec() +job2.PandaID='NULL' +job2.jobStatus='unknown' +job2.computingSite="bbb" +f21 = FileSpec() +f21.lfn = 'in2.pool.root' +f21.type = 'input' +job2.addFile(f21) +f22 = FileSpec() +f22.lfn = 'out2.pool.root' +f22.type = 'output' +job2.addFile(f22) + +proxy.insertNewJob(job1) +proxy.insertNewJob(job2) +print "Inserted %d %d" % (job1.PandaID,job2.PandaID) +proxy.activateJob(job1) +proxy.activateJob(job2) +print "activated" +ret = proxy.getJobs(1,"aaa") +print "Got Jobs" +for j in ret: + print j.PandaID +print proxy.peekJob(job1.PandaID).jobStatus +proxy.updateJobStatus(job1.PandaID,"unknown") +print " ->" ,proxy.peekJob(job1.PandaID).jobStatus + +print proxy.peekJob(job2.PandaID).jobStatus +job2.jobStatus = "running" +proxy.updateJob(job2,False) +print " ->" ,proxy.peekJob(job2.PandaID).jobStatus +print "Updated" +proxy.archiveJob(job1,False) +proxy.archiveJobLite(job2.PandaID,job2.jobStatus) +print "Archived" +proxy.querySQL("DELETE FROM jobsArchived3 WHERE PandaID=%d" % job1.PandaID) +proxy.querySQL("DELETE FROM jobsArchived3 WHERE PandaID=%d" % job2.PandaID) +print "job Deleted" + +print "dataset" +dataset = DatasetSpec() +dataset.vuid = commands.getoutput('/usr/bin/uuidgen') +dataset.name = 'test.%s' % dataset.vuid + +proxy.insertDataset(dataset) +print dataset.vuid +dataset2 = proxy.queryDataset(dataset.vuid) +print dataset2.values() +dataset2.type = 'test' +proxy.updateDataset(dataset2) +dataset3 = proxy.queryDataset(dataset.vuid) +print dataset3.values() +proxy.querySQL("DELETE FROM Datasets WHERE vuid='%s'" % dataset.vuid) diff --git a/current/pandaserver/test/testDQ.py b/current/pandaserver/test/testDQ.py new file mode 100755 index 000000000..381cdece8 --- /dev/null +++ b/current/pandaserver/test/testDQ.py @@ -0,0 +1,102 @@ +import commands +from dataservice.DDM import ddm + +#print ddm.DQ2ProductionClient.generateUUID() +#print ddm.DQ2.getFilesFromCatalog('aho.xml') +#print ddm.DQ2ProductionClient.dq2_makeblocks('input.data') + +ids=['pandatest.000003.dd.input._00047.junk','09801b0a-9fd0-4237-8caf-a37932c26e39', + 'pandatest.000003.dd.input._00050.junk','6dd3d367-4aa3-4e1a-9ac3-9ad14b7311f4', + 'pandatest.000003.dd.input._00037.junk','817c2c92-467b-4a1b-9482-f2ec8468cf2e', + 'pandatest.000003.dd.input._00021.junk','7720527f-817e-40c7-9e29-ce237f59edfa', + 'pandatest.000003.dd.input._00023.junk','5f1f9982-85a3-4d1a-9ee9-f1de22c02544', + 'pandatest.000003.dd.input._00042.junk','610cc91a-c731-4bce-ac7a-ff5133e7d18b', + 'pandatest.000003.dd.input._00027.junk','bd987478-3c59-4551-b12b-2853bac25613', + 'pandatest.000003.dd.input._00032.junk','9d0424f3-7552-4282-92f2-dfe74e9a6c12', + 'pandatest.000003.dd.input._00009.junk','dce33d4a-4569-49ee-95c5-b619b161c777', + 'pandatest.000003.dd.input._00036.junk','2fc9836b-82d6-41b0-b966-a5c37662172d', + 'pandatest.000003.dd.input._00031.junk','65b957e0-5ecc-44bb-a1f9-cccb61ca2d16', + 'pandatest.000003.dd.input._00025.junk','be29fe82-17e2-4122-b4c8-f49a0b76c81f', + 'pandatest.000003.dd.input._00029.junk','afa4322f-409b-4327-9169-229d8d48ad5a', + 
'pandatest.000003.dd.input._00013.junk','cf236d3b-45fd-4b58-bdfb-59abc983c886', + 'pandatest.000003.dd.input._00020.junk','b02f98da-0138-4b58-89ba-a88f37214a89', + 'pandatest.000003.dd.input._00001.junk','12ab5bb9-944e-4e75-bb90-b64c462d4cd8', + 'pandatest.000003.dd.input._00001.junk','12ab5bb9-944e-4e75-bb90-b64c462d4cd8', + 'pandatest.000003.dd.input._00006.junk','c0a422ad-e9f1-44bb-9539-cfef7e739da2', + 'pandatest.000003.dd.input._00034.junk','da670db3-3638-4f06-b650-a9315eb2bd63', + 'pandatest.000003.dd.input._00046.junk','2fcef270-2e41-472d-83c0-53749b401b74', + 'pandatest.000003.dd.input._00012.junk','5e212fa1-201f-494d-a2b2-420b229b08fc', + 'pandatest.000003.dd.input._00044.junk','87c8ebcc-a637-4204-b77b-8219e68b98d7', + 'pandatest.000003.dd.input._00030.junk','87ad811f-7d39-43d9-8a13-e117079bb208', + 'pandatest.000003.dd.input._00022.junk','6b902506-1ee1-46b1-a105-1521a8c0dbca', + 'pandatest.000003.dd.input._00017.junk','2bbed213-943c-41be-b9d7-7d86a309b0b2', + 'pandatest.000003.dd.input._00049.junk','8366e269-f9ae-4b9c-bd98-df4027c992c7', + 'pandatest.000003.dd.input._00015.junk','f3c5f37c-b4c2-4933-9633-467ba3a7c364', + 'pandatest.000003.dd.input._00004.junk','35d66be2-9d21-44a3-96f7-903a7abf4a87', + 'pandatest.000003.dd.input._00010.junk','2279ea3e-ebbb-4b19-9a69-9868f0cce694', + 'pandatest.000003.dd.input._00040.junk','a847dbbb-4f98-4b5b-b353-e29e3e3b3fd5', + 'pandatest.000003.dd.input._00007.junk','abfef002-62ca-4d84-9813-6329764e38bd', + 'pandatest.000003.dd.input._00048.junk','52854023-67d8-4a0f-99ac-bb1f0bd1dc98', + 'pandatest.000003.dd.input._00016.junk','bddf7441-6ac9-4087-bafe-32e47448cdc1', + 'pandatest.000003.dd.input._00041.junk','c76999ba-4cdf-49e9-bfa5-ff3525fbf1ab', + 'pandatest.000003.dd.input._00003.junk','4865119e-367f-4dd8-bdff-505bd878dfde', + 'pandatest.000003.dd.input._00019.junk','b9fce1fd-8d4c-4fc4-932f-12b13263ca0c', + 'pandatest.000003.dd.input._00011.junk','f93a4e08-fd4f-45fc-b324-91ff59555b1c', + 'pandatest.000003.dd.input._00018.junk','e4894561-9589-40d8-871b-b57d70564384', + 'pandatest.000003.dd.input._00002.junk','58934980-5ab3-4a66-b3da-55f86d4b54bd', + 'pandatest.000003.dd.input._00005.junk','5993fe60-bc8c-4fd8-aac1-dfd55700c9c3', + 'pandatest.000003.dd.input._00028.junk','6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27', + 'pandatest.000003.dd.input._00033.junk','98f79ba1-1793-4253-aac7-bdf90a51d1ee', + 'pandatest.000003.dd.input._00039.junk','33660dd5-7cef-422a-a7fc-6c24cb10deb1', + 'pandatest.000003.dd.input._00014.junk','5c0e9ed8-05a6-41c4-8c07-39b2be33ebc1', + 'pandatest.000003.dd.input._00008.junk','b0c184d1-5f5e-45a6-9cc8-8b0f20a85463', + 'pandatest.000003.dd.input._00038.junk','b9171997-4d2b-4075-b154-579ebe9438fa', + 'pandatest.000003.dd.input._00026.junk','89e5bdf1-15de-44ae-a388-06c1e7d7e2fc', + 'pandatest.000003.dd.input._00024.junk','c77b77a2-e6d1-4360-8751-19d9fb77e1f1', + 'pandatest.000003.dd.input._00043.junk','cc6ac2a1-4616-4551-80a7-d96f79252b64', + 'pandatest.000003.dd.input._00045.junk','ddbed17a-6d65-4e8d-890a-21e1eaa3e9d6', + 'pandatest.000003.dd.input._00035.junk','8ed1875a-eb90-4906-8fc4-0449d300ddfe' + ] + +for i in range(1): + datasetName='testDQ.%s' % commands.getoutput('/usr/bin/uuidgen') + print datasetName + + #['pandatest.000003.dd.input._00004.junk','35d66be2-9d21-44a3-96f7-903a7abf4a87'] + #'pandatest.000003.dd.input._00028.junk','6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27', + # 'pandatest.000003.dd.input._00033.junk','98f79ba1-1793-4253-aac7-bdf90a51d1ee'] + print (['registerNewDataset','-c',datasetName]+ids[i*2:i*2+2]) + 
ddm.DQ2.main(['registerNewDataset','-c',datasetName]+ids[i*2:i*2+2]) + ''' + status,out = ddm.RepositoryClient.main(['queryDatasetByName',datasetName]) + exec "vuids = %s" % out.split('\n')[0] + if vuids.has_key(datasetName): + vuid = vuids[datasetName] + print vuid + status,out = ddm.RepositoryClient.main(['resolveVUID',vuid]) + status,out = ddm.DQ2.getFilesFromCatalog('baka.xml') + exec "rets = %s" % out.split('\n')[0] + print rets[0] + exec "ids = %s" % out + print ddm.DQ2.main(['addFilesToDataset',datasetName]+ids) + status,out = ddm.DQ2.main(['listFilesInDataset',datasetName]) + print out + ''' + print (['registerDatasetLocations','-c',datasetName,'http://dms02.usatlas.bnl.gov/sites/bnl/lrc']) + ddm.DQ2.main(['registerDatasetLocations','-c',datasetName, + 'http://dms02.usatlas.bnl.gov/sites/bnl/lrc']) + print (['registerDatasetSubscription',datasetName,'http://doe-dhcp241.bu.edu:8000/dq2/']) + ddm.DQ2.main(['registerDatasetSubscription',datasetName,'http://doe-dhcp241.bu.edu:8000/dq2/']) +#print ddm.DQ2.main(['eraseDataset',datasetName]) + +#print ddm.DQ2.main(['eraseDataset',datasetName]) +#print ddm.DQ2ProductionClient.dq2_create_dataset(datasetName) +#status,out = ddm.DQ2ProductionClient.dq2_assign_destination(datasetName,'BNL_SE') +#print out +#print ddm.DQ2.main(['eraseDataset',datasetName]) +#status,out = ddm.DQ2.main(['listFilesInDataset','panda.destDB.11aed982-8079-4db9-964c-37a284b8597a']) +#print out + +ddm.DQ2_iter.listFileReplicasBySites('mc11_7TeV.151900.madgraph_SM_SG_SS_direct_1200_600_395.merge.AOD.e1095_a131_s1353_a145_r2993_tid723983_00', + 0,['SARA-MATRIX_DATADISK'], + 0,300) diff --git a/current/pandaserver/test/testEvgen.py b/current/pandaserver/test/testEvgen.py new file mode 100755 index 000000000..db636a439 --- /dev/null +++ b/current/pandaserver/test/testEvgen.py @@ -0,0 +1,59 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = None + +jobList = [] + +for i in range(1): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) + job.AtlasRelease = 'Atlas-14.1.0' + job.homepackage = 'AtlasProduction/14.1.0.3' + job.transformation = 'csc_evgen_trf.py' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 100 + job.prodSourceLabel = 'test' + job.computingSite = site + job.cloud = 'US' + job.cmtConfig = 'i686-slc4-gcc34-opt' + + file = FileSpec() + file.lfn = "%s.evgen.pool.root" % job.jobName + file.destinationDBlock = job.destinationDBlock + file.destinationSE = job.destinationSE + file.dataset = job.destinationDBlock + file.destinationDBlockToken = 'ATLASDATADISK' + file.type = 'output' + job.addFile(file) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="5144 1 5000 1 CSC.005144.PythiaZee.py %s NONE NONE NONE" % file.lfn + jobList.append(job) + +for i in range(1): + s,o = Client.submitJobs(jobList) + print "---------------------" + print s + for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testEvgen14.py 
b/current/pandaserver/test/testEvgen14.py new file mode 100755 index 000000000..af53c0e95 --- /dev/null +++ b/current/pandaserver/test/testEvgen14.py @@ -0,0 +1,59 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = 'BNL_SE' + +jobList = [] + +for i in range(1): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) + job.AtlasRelease = 'Atlas-14.1.0' + job.homepackage = 'AtlasProduction/14.1.0.3' + job.transformation = 'csc_evgen_trf.py' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 1000 + job.prodSourceLabel = 'test' + job.computingSite = site + job.processingType = 'test' + job.cmtConfig = 'i686-slc4-gcc34-opt' + + file = FileSpec() + file.lfn = "%s.evgen.pool.root" % job.jobName + file.destinationDBlock = job.destinationDBlock + file.destinationSE = job.destinationSE + file.dataset = job.destinationDBlock + file.destinationDBlockToken = 'ATLASDATADISK' + file.type = 'output' + job.addFile(file) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="5144 1 5000 1 CSC.005144.PythiaZee.py %s NONE NONE NONE" % file.lfn + jobList.append(job) + +for i in range(1): + s,o = Client.submitJobs(jobList) + print "---------------------" + print s + for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testEvgen15.py b/current/pandaserver/test/testEvgen15.py new file mode 100755 index 000000000..0753e3329 --- /dev/null +++ b/current/pandaserver/test/testEvgen15.py @@ -0,0 +1,57 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +site = sys.argv[1] +cloud = sys.argv[2] + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = None + +jobList = [] + +for i in range(1): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) + job.AtlasRelease = 'Atlas-15.6.10' + job.homepackage = 'AtlasProduction/15.6.10.1' + job.transformation = 'Evgen_trf.py' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 10000 + job.prodSourceLabel = 'test' + job.computingSite = site + job.cloud = cloud + job.cmtConfig = 'i686-slc5-gcc43-opt' + + file = FileSpec() + file.lfn = "%s.evgen.pool.root" % job.jobName + file.destinationDBlock = job.destinationDBlock + file.destinationSE = job.destinationSE + file.dataset = job.destinationDBlock + file.destinationDBlockToken = 'ATLASDATADISK' + file.type = 'output' + job.addFile(file) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="10000 105815 12330001 5000 12467 MC9.105815.JF140_pythia_jet_filter.py %s NONE NONE 
NONE MC09JobOpts-00-01-88.tar.gz" % file.lfn + jobList.append(job) + +for i in range(1): + s,o = Client.submitJobs(jobList) + print "---------------------" + print s + for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testEvgen16.py b/current/pandaserver/test/testEvgen16.py new file mode 100755 index 000000000..0c0cc67f4 --- /dev/null +++ b/current/pandaserver/test/testEvgen16.py @@ -0,0 +1,57 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +site = sys.argv[1] +cloud = sys.argv[2] + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = None + +jobList = [] + +for i in range(1): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) + job.AtlasRelease = 'Atlas-16.6.2' + job.homepackage = 'AtlasProduction/16.6.2.1' + job.transformation = 'Evgen_trf.py' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 10000 + job.prodSourceLabel = 'test' + job.computingSite = site + job.cloud = cloud + job.cmtConfig = 'i686-slc5-gcc43-opt' + + file = FileSpec() + file.lfn = "%s.evgen.pool.root" % job.jobName + file.destinationDBlock = job.destinationDBlock + file.destinationSE = job.destinationSE + file.dataset = job.destinationDBlock + file.destinationDBlockToken = 'ATLASDATADISK' + file.type = 'output' + job.addFile(file) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="2760 105048 19901 101 200 MC10.105048.PythiaB_ccmu3mu1X.py %s NONE NONE NONE MC10JobOpts-latest-test.tar.gz" % file.lfn + jobList.append(job) + +for i in range(1): + s,o = Client.submitJobs(jobList) + print "---------------------" + print s + for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testEvgen17.py b/current/pandaserver/test/testEvgen17.py new file mode 100755 index 000000000..ce808e4e6 --- /dev/null +++ b/current/pandaserver/test/testEvgen17.py @@ -0,0 +1,58 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +site = sys.argv[1] +cloud = sys.argv[2] + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = None + +jobList = [] + +for i in range(1): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) + job.AtlasRelease = 'Atlas-17.0.5' + job.homepackage = 'AtlasProduction/17.0.5.6' + job.transformation = 'Evgen_trf.py' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 10000 + job.prodSourceLabel = 'test' + job.computingSite = site + job.cloud = cloud + job.cmtConfig = 'i686-slc5-gcc43-opt' + + file = FileSpec() + file.lfn = "%s.evgen.pool.root" % job.jobName + file.destinationDBlock = job.destinationDBlock + file.destinationSE = job.destinationSE + file.dataset = job.destinationDBlock + file.destinationDBlockToken = 'ATLASDATADISK' + file.type = 'output' + job.addFile(file) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + 
fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="7000 108316 1 5000 1 MC11.108316.Pythia8_minbias_ND.py %s" % file.lfn + + jobList.append(job) + +for i in range(1): + s,o = Client.submitJobs(jobList) + print "---------------------" + print s + for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testFinder.py b/current/pandaserver/test/testFinder.py new file mode 100644 index 000000000..09bb9574d --- /dev/null +++ b/current/pandaserver/test/testFinder.py @@ -0,0 +1,69 @@ +import sys +from taskbuffer.OraDBProxy import DBProxy + +from dataservice import AddressFinder + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# instantiate DB proxies +proxyS = DBProxy(True) +proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) + +# get DN and address +status,res = proxyS.querySQLS("SELECT dn,email,name FROM ATLAS_PANDAMETA.users",{},arraySize=1000000) +if res == None: + print "SQL error" + sys.exit(0) + +# to upper chrs +def toUpper(emails): + retA = [] + for email in emails: + retA.append(email.upper()) + return retA + +outF = open('newemail.sql','w') + +for dn,origEmail,name in res: + if dn == None: + dn = name + if dn == None: + continue + emailsP = AddressFinder.getEmailPhonebook(dn) + emailsX = AddressFinder.getEmailXwho(dn) + if toUpper(emailsP) != toUpper(emailsX) and len(emailsP) != 0: + print dn + print "ERROR : xwho != phone" + print "phone : %s" % str(emailsP) + print "xwho : %s" % str(emailsX) + print "DB : %s" % origEmail + print + elif len(emailsP) == 0: + print dn + print "ERROR : not found" + print "DB : %s" % origEmail + print + elif len(emailsP) > 1: + print dn + print "ERROR : non-unique %s" % str(emailsP) + print "DB : %s" % origEmail + print + elif origEmail == None or origEmail.upper() != emailsP[0].upper() and origEmail != 'notsend': + print dn + print "phone : %s" % str(emailsP) + print "xwho : %s" % str(emailsX) + print "ERROR : %-40s new: %s\n" % (origEmail,emailsP[0]) + outF.write("/* %-40s new: %s */\n" % (origEmail,emailsP[0])) + outF.write("UPDATE atlas_pandameta.users SET email='%s' WHERE name='%s';\n" % (emailsP[0],name)) + pass + else: + pass + #print dn + #print "OK" + +outF.write('COMMIT;') +outF.close() + + diff --git a/current/pandaserver/test/testG4sim.py b/current/pandaserver/test/testG4sim.py new file mode 100755 index 000000000..b2f8f2f9a --- /dev/null +++ b/current/pandaserver/test/testG4sim.py @@ -0,0 +1,83 @@ +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = 'BNL_ATLAS_2' +#destName = 'BU_ATLAS_Tier2' + +files = { + 'mc11.007204.singlepart_mu4.evgen.EVNT.v11000302._00037.pool.root.1':None, + 'mc11.007204.singlepart_mu4.evgen.EVNT.v11000302._00038.pool.root.1':None, + } + +jobList = [] + +for lfn in files.keys(): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = commands.getoutput('uuidgen') + job.AtlasRelease = 'Atlas-11.0.3' + job.homepackage = 'JobTransforms-11-00-03-02' + job.transformation = 'share/csc.simul.trf' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.computingSite = site 
+ job.prodDBlock = 'mc11.007204.singlepart_mu4.evgen.EVNT.v11000302' + job.cmtConfig = 'i686-slc4-gcc34-opt' + + job.prodSourceLabel = 'test' + job.currentPriority = 1000 + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileOE = FileSpec() + fileOE.lfn = "%s.HITS.pool.root" % commands.getoutput('uuidgen') + fileOE.destinationDBlock = job.destinationDBlock + fileOE.destinationSE = job.destinationSE + fileOE.dataset = job.destinationDBlock + fileOE.destinationDBlockToken = 'ATLASDATADISK' + fileOE.type = 'output' + job.addFile(fileOE) + + fileOA = FileSpec() + fileOA.lfn = "%s.RDO.pool.root" % commands.getoutput('uuidgen') + fileOA.destinationDBlock = job.destinationDBlock + fileOA.destinationSE = job.destinationSE + fileOA.dataset = job.destinationDBlock + fileOA.destinationDBlockToken = 'ATLASDATADISK' + fileOA.type = 'output' + job.addFile(fileOA) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen') + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="%s %s %s 100 700 2158" % (fileI.lfn,fileOE.lfn,fileOA.lfn) + + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testG4sim15.py b/current/pandaserver/test/testG4sim15.py new file mode 100644 index 000000000..19b8d4e4b --- /dev/null +++ b/current/pandaserver/test/testG4sim15.py @@ -0,0 +1,88 @@ +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +site = sys.argv[1] +cloud = sys.argv[2] + +prodDBlock = 'mc09_10TeV.105807.JF35_pythia_jet_filter.evgen.EVNT.e469_tid095268' +inputFile = 'EVNT.095268._000110.pool.root.1' + +if len(sys.argv)==5: + site = sys.argv[1] + cloud = sys.argv[2] + prodDBlock = sys.argv[3] + inputFile = sys.argv[4] + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') + +files = { + inputFile:None, + } + +jobList = [] + +index = 0 +for lfn in files.keys(): + index += 1 + job = JobSpec() + job.jobDefinitionID = (time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) + job.AtlasRelease = 'Atlas-15.3.1' + job.homepackage = 'AtlasProduction/15.3.1.5' + job.transformation = 'csc_atlasG4_trf.py' + job.destinationDBlock = datasetName + job.computingSite = site + job.prodDBlock = prodDBlock + + job.prodSourceLabel = 'test' + job.processingType = 'test' + job.currentPriority = 10000 + job.cloud = cloud + job.cmtConfig = 'i686-slc4-gcc34-opt' + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileD = FileSpec() + fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v070302' + fileD.prodDBlock = fileD.dataset + fileD.lfn = 'DBRelease-7.3.2.tar.gz' + fileD.type = 'input' + job.addFile(fileD) + + fileOA = FileSpec() + fileOA.lfn = "%s.HITS.pool.root" % job.jobName + fileOA.destinationDBlock = job.destinationDBlock + fileOA.destinationSE = job.destinationSE + fileOA.dataset = job.destinationDBlock + fileOA.destinationDBlockToken = 'ATLASDATADISK' + fileOA.type = 'output' + job.addFile(fileOA) + + fileOL = 
FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="%s %s 5 1850 8738 ATLAS-GEO-08-00-01 QGSP_BERT VertexPos.py %s OFLCOND-SIM-01-00-00 False s595" % \ + (fileI.lfn,fileOA.lfn,fileD.lfn) + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testG4sim16.py b/current/pandaserver/test/testG4sim16.py new file mode 100644 index 000000000..c540c4cba --- /dev/null +++ b/current/pandaserver/test/testG4sim16.py @@ -0,0 +1,88 @@ +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +site = sys.argv[1] +cloud = sys.argv[2] + +prodDBlock = 'mc10_7TeV.105001.pythia_minbias.evgen.EVNT.e574_tid153937_00' +inputFile = 'EVNT.153937._000184.pool.root.1' + +if len(sys.argv)==5: + site = sys.argv[1] + cloud = sys.argv[2] + prodDBlock = sys.argv[3] + inputFile = sys.argv[4] + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') + +files = { + inputFile:None, + } + +jobList = [] + +index = 0 +for lfn in files.keys(): + index += 1 + job = JobSpec() + job.jobDefinitionID = (time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) + job.AtlasRelease = 'Atlas-16.6.2' + job.homepackage = 'AtlasProduction/16.6.2.1' + job.transformation = 'AtlasG4_trf.py' + job.destinationDBlock = datasetName + job.computingSite = site + job.prodDBlock = prodDBlock + + job.prodSourceLabel = 'test' + job.processingType = 'test' + job.currentPriority = 10000 + job.cloud = cloud + job.cmtConfig = 'i686-slc5-gcc43-opt' + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileD = FileSpec() + fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v140201' + fileD.prodDBlock = fileD.dataset + fileD.lfn = 'DBRelease-14.2.1.tar.gz' + fileD.type = 'input' + job.addFile(fileD) + + fileOA = FileSpec() + fileOA.lfn = "%s.HITS.pool.root" % job.jobName + fileOA.destinationDBlock = job.destinationDBlock + fileOA.destinationSE = job.destinationSE + fileOA.dataset = job.destinationDBlock + fileOA.destinationDBlockToken = 'ATLASDATADISK' + fileOA.type = 'output' + job.addFile(fileOA) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters='inputEvgenFile=%s outputHitsFile=%s maxEvents=3 skipEvents=1700 DBRelease=%s preInclude=SimuJobTransforms/VertexFromCondDB.py postExec="from InDetBeamSpotService.InDetBeamSpotServiceConf import BeamCondSvc;ServiceMgr+=BeamCondSvc();ServiceMgr.BeamCondSvc.useDB=False;ServiceMgr.BeamCondSvc.posX=0.1352;ServiceMgr.BeamCondSvc.posY=1.1621;ServiceMgr.BeamCondSvc.posZ=2.87;ServiceMgr.BeamCondSvc.sigmaX=0;ServiceMgr.BeamCondSvc.sigmaY=0;ServiceMgr.BeamCondSvc.sigmaZ=0" geometryVersion=ATLAS-GEO-16-00-00 conditionsTag=OFLCOND-SDR-BS7T-02 AMITag=s1019 randomSeed=568 physicsList=QGSP_BERT firstEvent=1701 RunNumber=106047' % \ + 
(fileI.lfn,fileOA.lfn,fileD.lfn) + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testG4sim17.py b/current/pandaserver/test/testG4sim17.py new file mode 100644 index 000000000..0b53acb0d --- /dev/null +++ b/current/pandaserver/test/testG4sim17.py @@ -0,0 +1,88 @@ +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +site = sys.argv[1] +cloud = sys.argv[2] + +prodDBlock = 'mc10_7TeV.105001.pythia_minbias.evgen.EVNT.e574_tid153937_00' +inputFile = 'EVNT.153937._000184.pool.root.1' + +if len(sys.argv)==5: + site = sys.argv[1] + cloud = sys.argv[2] + prodDBlock = sys.argv[3] + inputFile = sys.argv[4] + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') + +files = { + inputFile:None, + } + +jobList = [] + +index = 0 +for lfn in files.keys(): + index += 1 + job = JobSpec() + job.jobDefinitionID = (time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) + job.AtlasRelease = 'Atlas-17.0.5' + job.homepackage = 'AtlasProduction/17.0.5.6' + job.transformation = 'AtlasG4_trf.py' + job.destinationDBlock = datasetName + job.computingSite = site + job.prodDBlock = prodDBlock + + job.prodSourceLabel = 'test' + job.processingType = 'test' + job.currentPriority = 10000 + job.cloud = cloud + job.cmtConfig = 'i686-slc5-gcc43-opt' + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileD = FileSpec() + fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v170602' + fileD.prodDBlock = fileD.dataset + fileD.lfn = 'DBRelease-17.6.2.tar.gz' + fileD.type = 'input' + job.addFile(fileD) + + fileOA = FileSpec() + fileOA.lfn = "%s.HITS.pool.root" % job.jobName + fileOA.destinationDBlock = job.destinationDBlock + fileOA.destinationSE = job.destinationSE + fileOA.dataset = job.destinationDBlock + fileOA.destinationDBlockToken = 'ATLASDATADISK' + fileOA.type = 'output' + job.addFile(fileOA) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters='inputEvgenFile=%s outputHitsFile=%s maxEvents=3 skipEvents=0 DBRelease=%s geometryVersion=ATLAS-GEO-18-01-03_VALIDATION conditionsTag=OFLCOND-SDR-BS7T-05-14 randomSeed=1 physicsList=QGSP_BERT RunNumber=116870 firstEvent=1' % (fileI.lfn,fileOA.lfn,fileD.lfn) + + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testGetJobStatus.py b/current/pandaserver/test/testGetJobStatus.py new file mode 100755 index 000000000..4e47c2547 --- /dev/null +++ b/current/pandaserver/test/testGetJobStatus.py @@ -0,0 +1,17 @@ +import sys +import time +import commands +import userinterface.Client as Client + +id = sys.argv[1] + +s,o = Client.getJobStatus([id]) +print s +if s == 0: + for job in o: + if job == None: + continue + print job.PandaID + for file in job.Files: + print file.lfn,file.type + diff --git a/current/pandaserver/test/testMultiTRF.py b/current/pandaserver/test/testMultiTRF.py new file mode 100755 index 
000000000..c9fcd9853 --- /dev/null +++ b/current/pandaserver/test/testMultiTRF.py @@ -0,0 +1,95 @@ +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') + +index = 0 + +job = JobSpec() +job.jobDefinitionID = int(time.time()) % 10000 +job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) +job.AtlasRelease = 'Atlas-14.1.0\nAtlas-14.1.0' +job.homepackage = 'AtlasProduction/14.1.0.3\nAtlasProduction/14.1.0.3' +job.transformation = 'csc_digi_trf.py\ncsc_reco_trf.py' +job.destinationDBlock = datasetName + +job.computingSite = site + +job.prodDBlock = 'valid1.005200.T1_McAtNlo_Jimmy.simul.HITS.e322_s429_tid022081' + +job.prodSourceLabel = 'test' +job.currentPriority = 10000 +job.cloud = 'US' + +for lfn in ['HITS.022081._00001.pool.root','HITS.022081._00002.pool.root']: + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + +fileD1 = FileSpec() +fileD1.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v050001' +fileD1.prodDBlock = fileD1.dataset +fileD1.lfn = 'DBRelease-5.0.1.tar.gz' +fileD1.type = 'input' +job.addFile(fileD1) + +fileD2 = FileSpec() +fileD2.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v050101' +fileD2.prodDBlock = fileD2.dataset +fileD2.lfn = 'DBRelease-5.1.1.tar.gz' +fileD2.type = 'input' +job.addFile(fileD2) + +fileOE = FileSpec() +fileOE.lfn = "%s.ESD.pool.root" % job.jobName +fileOE.destinationDBlock = job.destinationDBlock +fileOE.destinationSE = job.destinationSE +fileOE.dataset = job.destinationDBlock +fileOE.type = 'output' +job.addFile(fileOE) + +fileOA = FileSpec() +fileOA.lfn = "%s.AOD.pool.root" % job.jobName +fileOA.destinationDBlock = job.destinationDBlock +fileOA.destinationSE = job.destinationSE +fileOA.dataset = job.destinationDBlock +fileOA.type = 'output' +job.addFile(fileOA) + +fileOC = FileSpec() +fileOC.lfn = "%s.NTUP.root" % job.jobName +fileOC.destinationDBlock = job.destinationDBlock +fileOC.destinationSE = job.destinationSE +fileOC.dataset = job.destinationDBlock +fileOC.type = 'output' +job.addFile(fileOC) + +fileOL = FileSpec() +fileOL.lfn = "%s.job.log.tgz" % job.jobName +fileOL.destinationDBlock = job.destinationDBlock +fileOL.destinationSE = job.destinationSE +fileOL.dataset = job.destinationDBlock +fileOL.type = 'log' +job.addFile(fileOL) + +job.jobParameters="HITS.022081._[00001,00002].pool.root RDO.TMP._00001_tmp.pool.root 250 0 ATLAS-CSC-05-00-00 1 1 NONE NONE None %s AtRndmGenSvc QGSP_EMV DEFAULT NONE NONE NONE NONE NONE\n RDO.TMP._00001_tmp.pool.root %s %s %s 250 0 ATLAS-CSC-05-00-00 DEFAULT None %s NONE" % \ + (fileD1.lfn,fileOE.lfn,fileOA.lfn,fileOC.lfn,fileD2.lfn) + +s,o = Client.submitJobs([job]) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testReco.py b/current/pandaserver/test/testReco.py new file mode 100755 index 000000000..0eb597e45 --- /dev/null +++ b/current/pandaserver/test/testReco.py @@ -0,0 +1,106 @@ +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') 
+destName = None + +files = { + 'misal1_mc12.005802.JF17_pythia_jet_filter.digit.RDO.v12000601_tid008610._11615.pool.root.1':None, + #'misal1_mc12.005802.JF17_pythia_jet_filter.digit.RDO.v12000601_tid008610._11639.pool.root.1':None, + #'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554._03634.pool.root.1':None, + #'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554._03248.pool.root.1':None, + #'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554._03634.pool.root.1':None, + } + +jobList = [] + +index = 0 +for lfn in files.keys(): + index += 1 + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) + job.AtlasRelease = 'Atlas-12.0.6' + job.homepackage = 'AtlasProduction/12.0.6.4' + job.transformation = 'csc_reco_trf.py' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.computingSite = site + #job.prodDBlock = 'misal1_mc12.005200.T1_McAtNlo_Jimmy.digit.RDO.v12000601_tid007554' + job.prodDBlock = 'misal1_mc12.005802.JF17_pythia_jet_filter.digit.RDO.v12000601_tid008610' + job.cloud = 'US' + + job.prodSourceLabel = 'test' + job.currentPriority = 10000 + job.cmtConfig = 'i686-slc4-gcc34-opt' + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileD = FileSpec() + fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' + fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' + fileD.lfn = 'DBRelease-3.1.1.tar.gz' + fileD.type = 'input' + job.addFile(fileD) + + fileOE = FileSpec() + fileOE.lfn = "%s.ESD.pool.root" % job.jobName + fileOE.destinationDBlock = job.destinationDBlock + fileOE.destinationSE = job.destinationSE + fileOE.dataset = job.destinationDBlock + fileOE.destinationDBlockToken = 'ATLASDATADISK' + fileOE.type = 'output' + job.addFile(fileOE) + + fileOA = FileSpec() + fileOA.lfn = "%s.AOD.pool.root" % job.jobName + fileOA.destinationDBlock = job.destinationDBlock + fileOA.destinationSE = job.destinationSE + fileOA.dataset = job.destinationDBlock + fileOA.destinationDBlockToken = 'ATLASDATADISK' + fileOA.type = 'output' + job.addFile(fileOA) + + fileOC = FileSpec() + fileOC.lfn = "%s.NTUP.root" % job.jobName + fileOC.destinationDBlock = job.destinationDBlock + fileOC.destinationSE = job.destinationSE + fileOC.dataset = job.destinationDBlock + fileOC.destinationDBlockToken = 'ATLASDATADISK' + fileOC.type = 'output' + job.addFile(fileOC) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="%s %s %s %s 250 0 ATLAS-CSC-01-02-00 CSC-06 NoRestrictedESDRecConfig.py %s" % \ + (fileI.lfn,fileOE.lfn,fileOA.lfn,fileOC.lfn,fileD.lfn) + + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testRepro.py b/current/pandaserver/test/testRepro.py new file mode 100755 index 000000000..9b0b7f679 --- /dev/null +++ b/current/pandaserver/test/testRepro.py @@ -0,0 +1,116 @@ +import re +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +cloud = 
sys.argv[1] +if len(sys.argv)>2: + site = sys.argv[2] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = None + +files = { + 'daq.ATLAS.0092045.physics.RPCwBeam.LB0016.SFO-2._0009.data':None, + } + +jobList = [] + +index = 0 +for lfn in files.keys(): + index += 1 + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) + job.AtlasRelease = 'Atlas-14.4.0' + job.homepackage = 'AtlasTier0/14.4.0.2' + job.transformation = 'Reco_trf.py' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.computingSite = site + job.prodDBlock = 'data08_cos.00092045.physics_RPCwBeam.daq.RAW.o4_T1224560091' + + job.prodSourceLabel = 'test' + job.processingType = 'reprocessing' + job.currentPriority = 10000 + job.cloud = cloud + job.cmtConfig = 'i686-slc4-gcc34-opt' + + origParams = """inputBSFile=daq.ATLAS.0092045.physics.RPCwBeam.LB0016.SFO-2._0009.data maxEvents=5 skipEvents=0 autoConfiguration=FieldAndGeo preInclude=RecExCommission/RecExCommission.py,RecExCommission/MinimalCommissioningSetup.py,RecJobTransforms/UseOracle.py preExec="jetFlags.Enabled.set_Value_and_Lock(False)" DBRelease=DBRelease-6.2.1.5.tar.gz conditionsTag=COMCOND-ES1C-000-00 RunNumber=92045 beamType=cosmics AMITag=r595 projectName=data08_cos trigStream=physics_RPCwBeam outputTypes=DPDCOMM outputESDFile=ESD.029868._01110.pool.root outputTAGComm=TAG_COMM.029868._01110.pool.root outputAODFile=AOD.029868._01110.pool.root outputMergedDQMonitorFile=DQM_MERGED.029868._01110.root DPD_PIXELCOMM=DPD_PIXELCOMM.029868._01110.pool.root DPD_SCTCOMM=DPD_SCTCOMM.029868._01110.pool.root DPD_IDCOMM=DPD_IDCOMM.029868._01110.pool.root DPD_IDPROJCOMM=DPD_IDPROJCOMM.029868._01110.pool.root DPD_CALOCOMM=DPD_CALOCOMM.029868._01110.pool.root DPD_TILECOMM=DPD_TILECOMM.029868._01110.pool.root DPD_EMCLUSTCOMM=DPD_EMCLUSTCOMM.029868._01110.pool.root DPD_EGAMMACOMM=DPD_EGAMMACOMM.029868._01110.pool.root DPD_RPCCOMM=DPD_RPCCOMM.029868._01110.pool.root DPD_TGCCOMM=DPD_TGCCOMM.029868._01110.pool.root --ignoreunknown""" + + match = re.findall("([^\s]+=[^\s]+)",origParams) + outMap = {} + for item in match: + arg = item.split('=')[0] + var = item.split('=')[-1] + # output + if arg.startswith('output') or arg.startswith('DPD_'): + # skip some keys + if arg in ['outputTypes']: + continue + prefix = var.split('.')[0] + sumatch = re.search('(\.[^\.]+\.[^\.]+)(\.\d+)*$',var) + suffix = sumatch.group(1) + newName = '%s.%s%s' % (job.jobName,prefix,suffix) + outMap[arg] = (var,newName) + # DBRelease + elif arg == 'DBRelease': + dbrMap = (arg,var) + # input + elif arg.startswith('input') and arg.endswith('File'): + inputMap = (arg,var) + + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileD = FileSpec() + fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v06020105' + fileD.prodDBlock = fileD.dataset + fileD.lfn = 'DBRelease-6.2.1.5.tar.gz' + fileD.type = 'input' + job.addFile(fileD) + + newParams = origParams + newParams = newParams.replace(dbrMap[0]+'='+dbrMap[1],dbrMap[0]+'='+fileD.lfn) + newParams = newParams.replace(inputMap[0]+'='+inputMap[1],inputMap[0]+'='+fileI.lfn) + + for arg,vars in outMap.iteritems(): + fileO = FileSpec() + fileO.lfn = vars[1] + fileO.destinationDBlock = job.destinationDBlock + fileO.destinationSE = job.destinationSE + fileO.dataset = job.destinationDBlock + fileO.destinationDBlockToken = 'ATLASDATADISK' + 
fileO.type = 'output' + job.addFile(fileO) + newParams = newParams.replace(arg+'='+vars[0],arg+'='+fileO.lfn) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters=newParams + + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testScript.py b/current/pandaserver/test/testScript.py new file mode 100755 index 000000000..2299a441d --- /dev/null +++ b/current/pandaserver/test/testScript.py @@ -0,0 +1,45 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +aSrvID = None + +for idx,argv in enumerate(sys.argv): + if argv == '-s': + aSrvID = sys.argv[idx+1] + sys.argv = sys.argv[:idx] + break + +site = sys.argv[1] + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = None + +job = JobSpec() +job.jobDefinitionID = int(time.time()) % 10000 +job.jobName = "%s" % commands.getoutput('uuidgen') +job.transformation = 'https://atlpan.web.cern.ch/atlpan/test.sh' +job.destinationDBlock = datasetName +job.destinationSE = destName +job.currentPriority = 1000 +job.prodSourceLabel = 'test' +job.computingSite = site + +job.jobParameters="aaaaa" + +fileOL = FileSpec() +fileOL.lfn = "%s.job.log.tgz" % job.jobName +fileOL.destinationDBlock = job.destinationDBlock +fileOL.destinationSE = job.destinationSE +fileOL.dataset = job.destinationDBlock +fileOL.type = 'log' +job.addFile(fileOL) + + +s,o = Client.submitJobs([job],srvID=aSrvID) +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testSimul13.py b/current/pandaserver/test/testSimul13.py new file mode 100644 index 000000000..4b8ef5247 --- /dev/null +++ b/current/pandaserver/test/testSimul13.py @@ -0,0 +1,81 @@ +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = 'BNL_ATLAS_2' + +files = { + 'EVNT.019128._00011.pool.root.1':None, + } + +jobList = [] + +index = 0 +for lfn in files.keys(): + index += 1 + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) + job.AtlasRelease = 'Atlas-13.0.40' + job.homepackage = 'AtlasProduction/13.0.40.3' + job.transformation = 'csc_simul_trf.py' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.computingSite = site + job.prodDBlock = 'valid1.005001.pythia_minbias.evgen.EVNT.e306_tid019128' + + job.prodSourceLabel = 'test' + job.currentPriority = 10000 + job.cloud = 'IT' + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileD = FileSpec() + fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v040701' + fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v030101' + fileD.lfn = 'DBRelease-4.7.1.tar.gz' + fileD.type = 'input' + job.addFile(fileD) + + fileOE = FileSpec() + fileOE.lfn = "%s.HITS.pool.root" % 
job.jobName + fileOE.destinationDBlock = job.destinationDBlock + fileOE.destinationSE = job.destinationSE + fileOE.dataset = job.destinationDBlock + fileOE.destinationDBlockToken = 'ATLASDATADISK' + fileOE.type = 'output' + job.addFile(fileOE) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="%s %s NONE 1 3250 55866 ATLAS-CSC-02-01-00 55866 55866 QGSP_EMV None %s DEFAULT" % \ + (fileI.lfn,fileOE.lfn,fileD.lfn) + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testSimulReco14.py b/current/pandaserver/test/testSimulReco14.py new file mode 100644 index 000000000..41c78c68d --- /dev/null +++ b/current/pandaserver/test/testSimulReco14.py @@ -0,0 +1,101 @@ +import sys +import time +import random +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] + cloud = None +else: + site = None + cloud = 'US' + + + +#cloud = 'TW' +#Recent changes (BNL migration to LFC?) force the cloud to be specified +cloud = 'US' + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = 'BNL_ATLAS_2' + +files = { + 'EVNT.023986._00001.pool.root.1':None, + #'EVNT.023989._00001.pool.root.1':None, + } + +jobList = [] + +index = 0 +for lfn in files.keys(): + index += 1 + job = JobSpec() + job.jobDefinitionID = (time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),index) + job.AtlasRelease = 'Atlas-14.2.20' + job.homepackage = 'AtlasProduction/14.2.20.1' + job.transformation = 'csc_simul_reco_trf.py' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.computingSite = site + job.prodDBlock = 'mc08.105031.Jimmy_jetsJ2.evgen.EVNT.e347_tid023986' + #job.prodDBlock = 'mc08.105034.Jimmy_jetsJ5.evgen.EVNT.e347_tid023989' + + job.prodSourceLabel = 'test' + job.processingType = 'test' + job.currentPriority = 10000 + job.cloud = cloud + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileD = FileSpec() + fileD.dataset = 'ddo.000001.Atlas.Ideal.DBRelease.v050601' + fileD.prodDBlock = 'ddo.000001.Atlas.Ideal.DBRelease.v050601' + fileD.lfn = 'DBRelease-5.6.1.tar.gz' + fileD.type = 'input' + job.addFile(fileD) + + fileOA = FileSpec() + fileOA.lfn = "%s.AOD.pool.root" % job.jobName + fileOA.destinationDBlock = job.destinationDBlock + fileOA.destinationSE = job.destinationSE + fileOA.dataset = job.destinationDBlock + fileOA.destinationDBlockToken = 'ATLASDATADISK' + fileOA.type = 'output' + job.addFile(fileOA) + + fileOE = FileSpec() + fileOE.lfn = "%s.ESD.pool.root" % job.jobName + fileOE.destinationDBlock = job.destinationDBlock + fileOE.destinationSE = job.destinationSE + fileOE.dataset = job.destinationDBlock + fileOE.destinationDBlockToken = 'ATLASDATADISK' + fileOE.type = 'output' + job.addFile(fileOE) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.destinationDBlockToken = 
'ATLASDATADISK' + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="%s %s 30 500 3 ATLAS-GEO-02-01-00 3 3 QGSP_BERT jobConfig.VertexPosFastIDKiller.py FastSimulationJobTransforms/FastCaloSimAddCellsRecConfig.py,NoTrackSlimming.py %s OFF NONE NONE %s NONE" % (fileI.lfn, fileOA.lfn, fileD.lfn, fileOE.lfn) + + jobList.append(job) + +s,o = Client.submitJobs(jobList) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] diff --git a/current/pandaserver/test/testSiteMap.py b/current/pandaserver/test/testSiteMap.py new file mode 100755 index 000000000..f11053958 --- /dev/null +++ b/current/pandaserver/test/testSiteMap.py @@ -0,0 +1,23 @@ +import os +import re +import sys +import time +import random +import datetime +import commands +from taskbuffer.TaskBuffer import taskBuffer +from brokerage import SiteMapper + +# password +from config import panda_config +passwd = panda_config.dbpasswd + +# instantiate TB +taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + +siteMapper = SiteMapper.SiteMapper(taskBuffer) + +#x = siteMapper.getSite('BNL_ATLAS_1') +#print x + + diff --git a/current/pandaserver/test/testTB.py b/current/pandaserver/test/testTB.py new file mode 100755 index 000000000..d94e06560 --- /dev/null +++ b/current/pandaserver/test/testTB.py @@ -0,0 +1,145 @@ +""" +test TaskBuffer and JobDispatcher on local PC + +$ python -i testTB.py +>>> testGetJobs(10) +>>> testGetJobStatus(1) +>>> testUpdateJob(1,'running') +>>> testGetJobStatus(1) +>>> testUpdateJob(1,'finished') +>>> testGetJobStatus(1) +>>> taskBuffer.peekJobs([1,]) +>>> taskBuffer.queryPandaIDs([0,]) + + +""" + + +import time +import commands +import threading + +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +class TestThread (threading.Thread): + def __init__(self,tb,i,n,siteName): + threading.Thread.__init__(self) + self.taskbuffer = tb + self.interval = i + self.jobDefinitionID = n + self.siteName = siteName + + def run(self): + for i in range(1): + prodDBlock = 'rome.004201.evgen.ZeeJimmy' + destinationDBlock = 'pandatest.000123.test.simul' + destinationSE = 'BNL_SE' + jobs = [] + #for i in range(self.interval): + for i in range(2): + job = JobSpec() + job.jobDefinitionID=self.jobDefinitionID + job.AtlasRelease='Atlas-11.0.1' + job.prodDBlock=prodDBlock + job.destinationDBlock=destinationDBlock + job.destinationSE=destinationSE + job.currentPriority=i + + lfnI = 'rome.004201.evgen.ZeeJimmy._00001.pool.root' + file = FileSpec() + file.lfn = lfnI + file.dataset = 'rome.004201.evgen.ZeeJimmy' + file.type = 'input' + file.prodDBlock = prodDBlock + file.dataset = prodDBlock + job.addFile(file) + + lfnO ='%s.pool.root.1' % commands.getoutput('uuidgen') + file = FileSpec() + file.lfn = lfnO + file.type = 'output' + file.destinationDBlock = destinationDBlock + file.dataset = destinationDBlock + file.destinationSE = destinationSE + job.addFile(file) + + job.homepackage='JobTransforms-11-00-01-01' + job.transformation='share/rome.g4sim.standard.trf' + job.jobParameters='%s %s 1 2 14268' % (lfnI,lfnO) + jobs.append(job) + self.taskbuffer.storeJobs(jobs,None) + time.sleep(self.interval) + +from taskbuffer.TaskBuffer import taskBuffer +from jobdispatcher.JobDispatcher import jobDispatcher +from userinterface.UserIF import userIF + +import getpass +passwd = getpass.getpass() + +taskBuffer.init('adbpro.usatlas.bnl.gov',passwd,nDBConnection=3) + +jobDispatcher.init(taskBuffer) +userIF.init(taskBuffer) + +jobDefID = int(time.time()) % 10000 
+thr1 = TestThread(taskBuffer,4,jobDefID,"myhost") +thr2 = TestThread(taskBuffer,3,jobDefID+1,"testsite") + +thr1.start() +#thr2.start() + +from jobdispatcher.JobDispatcher import getJob,updateJob +from userinterface.UserIF import submitJobs,getJobStatus,queryPandaIDs + + +### emulate HTTP requests + +class Request: + def __init__(self): + self.subprocess_env = {} + self.subprocess_env['SSL_CLIENT_S_DN'] = "aaa" + self.subprocess_env['HTTPS'] = "on" + +req = Request() + +def testGetJob(): + print getJob(req,"BNL_ATLAS_2") + +def testGetJobStatus(arg): + print getJobStatus(req,arg) + +def testSubmitJobs(arg): + print submitJobs(req,arg) + +def testUpdateJob(arg0,arg1): + print updateJob(req,arg0,arg1) + +def testQueryPandaIDs(arg): + print queryPandaIDs(req,arg) + +""" + +import cPickle as pickle +ids=[3023,3414] +testGetJobStatus(pickle.dumps(ids)) + +job = JobSpec() +job.jobDefinitionID='user.%s' % commands.getoutput('/usr/bin/uuidgen') +ids = {'pandatest.000003.dd.input._00028.junk':'6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27', + 'pandatest.000003.dd.input._00033.junk':'98f79ba1-1793-4253-aac7-bdf90a51d1ee', + 'pandatest.000003.dd.input._00039.junk':'33660dd5-7cef-422a-a7fc-6c24cb10deb1'} +for lfn in ids.keys(): + file = FileSpec() + file.lfn = lfn + file.GUID = ids[file.lfn] + file.dataset = 'pandatest.000003.dd.input' + file.type = 'input' + job.addFile(file) + +testSubmitJobs(pickle.dumps([job])) + +testQueryPandaIDs(pickle.dumps([10])) + +""" diff --git a/current/pandaserver/test/testTaskA2.py b/current/pandaserver/test/testTaskA2.py new file mode 100755 index 000000000..e54e3948f --- /dev/null +++ b/current/pandaserver/test/testTaskA2.py @@ -0,0 +1,64 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +#destName = 'BNL_SE' + +jobList = [] + +for i in [999905,999906,999907]: + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = "%s_%d" % (commands.getoutput('uuidgen'),i) + job.AtlasRelease = 'Atlas-14.1.0' + job.homepackage = 'AtlasProduction/12.0.6.2' + job.transformation = 'csc_evgen_trf.py' + job.destinationDBlock = datasetName + #job.destinationSE = destName + job.currentPriority = 1000 + job.prodSourceLabel = 'managed' + #job.prodSourceLabel = 'test' + #job.computingSite = site + job.cmtConfig = 'i686-slc4-gcc34-opt' + job.metadata = 'evgen;%s;%s;%s' % (str({'FR': 46, 'NL': 45, 'NDGF': 300, 'CERN': 19, 'TW': 44110, 'CA': 2922, 'DE': 9903, 'IT': 1168, 'US': 6226, 'UK': 1026, 'ES': 26619}),str({999907:100,999906:200,999905:300}),str({999905:100,999906:910,999907:500})) + #job.metadata = 'evgen;%s' % str({'FR': 46, 'NL': 45, 'NDGF': 300, 'CERN': 19, 'TW': 44110, 'CA': 2922, 'DE': 9903, 'IT': 1168, 'US': 6226, 'UK': 1026, 'ES': 26619}) + + #job.cloud = "UK" + job.taskID = i + + file = FileSpec() + file.lfn = "%s.evgen.pool.root" % job.jobName + file.destinationDBlock = job.destinationDBlock + file.destinationSE = job.destinationSE + file.dataset = job.destinationDBlock + #file.destinationDBlockToken = 'ATLASDATADISK' + file.type = 'output' + job.addFile(file) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % job.jobName + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.type = 'log' + job.addFile(fileOL) + + 
job.jobParameters="7087 0 500000 1 DC3.007087.singlepart_fwdgamma_etaplus_E500.py %s NONE NONE NONE" % file.lfn + jobList.append(job) + +for i in range(1): + #s,o = Client.submitJobs(jobList) + s,outS = Client.runTaskAssignment(jobList) + print "---------------------" + print s + for tmpOut in outS: + print tmpOut diff --git a/current/pandaserver/test/testUser.py b/current/pandaserver/test/testUser.py new file mode 100755 index 000000000..fd51cd1af --- /dev/null +++ b/current/pandaserver/test/testUser.py @@ -0,0 +1,44 @@ +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +job = JobSpec() +job.jobDefinitionID = int(time.time()) % 10000 +job.jobName = commands.getoutput('/usr/bin/uuidgen') +job.AtlasRelease = 'Atlas-9.0.4' +job.prodDBlock = 'pandatest.000003.dd.input' +job.destinationDBlock = 'panda.destDB.%s' % commands.getoutput('/usr/bin/uuidgen') +job.destinationSE = 'BNL_SE' + +ids = {'pandatest.000003.dd.input._00028.junk':'6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27', + 'pandatest.000003.dd.input._00033.junk':'98f79ba1-1793-4253-aac7-bdf90a51d1ee', + 'pandatest.000003.dd.input._00039.junk':'33660dd5-7cef-422a-a7fc-6c24cb10deb1'} +for lfn in ids.keys(): + file = FileSpec() + file.lfn = lfn + file.GUID = ids[file.lfn] + file.dataset = 'pandatest.000003.dd.input' + file.type = 'input' + job.addFile(file) + +s,o = Client.submitJobs([job]) +print "---------------------" +print s +print o +print "---------------------" +s,o = Client.getJobStatus([4934, 4766, 4767, 4768, 4769]) +print s +if s == 0: + for job in o: + if job == None: + continue + print job.PandaID + for file in job.Files: + print file.lfn,file.type +print "---------------------" +s,o = Client.queryPandaIDs([0]) +print s +print o + diff --git a/current/pandaserver/test/testWait.py b/current/pandaserver/test/testWait.py new file mode 100755 index 000000000..adbd9c246 --- /dev/null +++ b/current/pandaserver/test/testWait.py @@ -0,0 +1,119 @@ +import sys +import time +import commands +import userinterface.Client as Client +from taskbuffer.JobSpec import JobSpec +from taskbuffer.FileSpec import FileSpec + +if len(sys.argv)>1: + site = sys.argv[1] +else: + site = None + +datasetName = 'panda.destDB.%s' % commands.getoutput('uuidgen') +destName = 'BNL_SE' + +jobListE = [] +lfnListE = [] + +for i in range(2): + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = commands.getoutput('uuidgen') + job.AtlasRelease = 'Atlas-11.0.3' + job.homepackage = 'JobTransforms-11-00-03-03' + job.transformation = 'share/csc.evgen.trf' + job.destinationDBlock = datasetName + job.destinationSE = destName + job.currentPriority = 1000 + job.prodSourceLabel = 'test' + job.computingSite = site + + file = FileSpec() + file.lfn = "%s.evgen.pool.root" % commands.getoutput('uuidgen') + lfnListE.append(file.lfn) + file.lfn += ('.%d' % (i+1)) + file.destinationDBlock = job.destinationDBlock + file.destinationSE = job.destinationSE + file.dataset = job.destinationDBlock + file.type = 'output' + job.addFile(file) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen') + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="5056 %s NONE 81000 9000 10 DC3.005056.PythiaPhotonJet2.py NONE" % file.lfn + jobListE.append(job) + +s,o = Client.submitJobs(jobListE) +print 
"---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] + +time.sleep(20) + +datasetNameS = 'panda.simu.%s' % commands.getoutput('uuidgen') + +jobListS = [] + +for lfn in lfnListE: + job = JobSpec() + job.jobDefinitionID = int(time.time()) % 10000 + job.jobName = commands.getoutput('uuidgen') + job.AtlasRelease = 'Atlas-11.0.3' + job.homepackage = 'JobTransforms-11-00-03-04' + job.transformation = 'share/csc.simul.trf' + job.destinationDBlock = datasetNameS + job.destinationSE = destName + job.prodDBlock = datasetName + + job.prodSourceLabel = 'test' + job.currentPriority = 1000 + + fileI = FileSpec() + fileI.dataset = job.prodDBlock + fileI.prodDBlock = job.prodDBlock + fileI.lfn = lfn + fileI.type = 'input' + job.addFile(fileI) + + fileOE = FileSpec() + fileOE.lfn = "%s.HITS.pool.root" % commands.getoutput('uuidgen') + fileOE.destinationDBlock = job.destinationDBlock + fileOE.destinationSE = job.destinationSE + fileOE.dataset = job.destinationDBlock + fileOE.type = 'output' + job.addFile(fileOE) + + fileOA = FileSpec() + fileOA.lfn = "%s.RDO.pool.root" % commands.getoutput('uuidgen') + fileOA.destinationDBlock = job.destinationDBlock + fileOA.destinationSE = job.destinationSE + fileOA.dataset = job.destinationDBlock + fileOA.type = 'output' + job.addFile(fileOA) + + fileOL = FileSpec() + fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen') + fileOL.destinationDBlock = job.destinationDBlock + fileOL.destinationSE = job.destinationSE + fileOL.dataset = job.destinationDBlock + fileOL.type = 'log' + job.addFile(fileOL) + + job.jobParameters="%s %s %s 100 4900 400" % (fileI.lfn,fileOE.lfn,fileOA.lfn) + + jobListS.append(job) + +s,o = Client.submitJobs(jobListS) +print "---------------------" +print s +for x in o: + print "PandaID=%s" % x[0] + diff --git a/current/pandaserver/test/tmpwatch.py b/current/pandaserver/test/tmpwatch.py new file mode 100644 index 000000000..ee75d2720 --- /dev/null +++ b/current/pandaserver/test/tmpwatch.py @@ -0,0 +1,47 @@ +import os +import glob +import optparse +import datetime + +# options +optP = optparse.OptionParser(conflict_handler="resolve") +optP.add_option('-t',action='store_const',const=True,dest='test',default=False, + help='test mode') +optP.add_option('-h',action='store',type='int',dest='limit',default=12, + help='time limit in hour') +options,args = optP.parse_args() + +# patterns of tmp files +tmpPatts = ['/tmp/tmp*','/tmp/atlpan/tmp*'] + +# limit +timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=options.limit) + +# loop over all pattern +for tmpPatt in tmpPatts: + tmpFiles = glob.glob(tmpPatt) + # loop over all files + for tmpFile in tmpFiles: + try: + print 'INFO: tmpfile -> %s' % tmpFile + # only file + if not os.path.isfile(tmpFile): + continue + # not symlink + if os.path.islink(tmpFile): + continue + # writable + if not os.access(tmpFile,os.W_OK): + continue + # check time stamp + timeStamp = os.path.getmtime(tmpFile) + timeStamp = datetime.datetime.fromtimestamp(timeStamp) + if timeStamp > timeLimit: + continue + # remove + print 'INFO: remove %s' % tmpFile + if not options.test: + os.remove(tmpFile) + except: + errType,errValue = sys.exc_info()[:2] + print 'ERROR: failed with %s:%s' % (errType,errValue) diff --git a/current/pandaserver/test/update.sh b/current/pandaserver/test/update.sh new file mode 100755 index 000000000..c1edbf515 --- /dev/null +++ b/current/pandaserver/test/update.sh @@ -0,0 +1,19 @@ +#!/usr/bin/python + +import os +import sys + +os.chdir('..') + +option = '' +if len(sys.argv) > 
1 and sys.argv[1] == '-n': + option = ' -n' + +packages = ['liveconfigparser','pandalogger','taskbuffer', + 'brokerage','jobdispatcher','userinterface', + 'dataservice','test','server'] #,'config'] + +for pack in packages: + com = 'cvs%s update %s' % (option,pack) + print com + os.system(com) diff --git a/current/pandaserver/test/valConf.py b/current/pandaserver/test/valConf.py new file mode 100644 index 000000000..69ec8688c --- /dev/null +++ b/current/pandaserver/test/valConf.py @@ -0,0 +1,15 @@ +from config import panda_config +from config import panda_config_new + +for item in dir(panda_config): + if item.startswith('__'): + continue + old = getattr(panda_config,item) + if not hasattr(panda_config_new,item): + print "NG : %s not found" % item + continue + new = getattr(panda_config_new,item) + if old != new: + print "NG : %s mismatch" % item + print " old:%s" % old + print " new:%s" % new diff --git a/current/pandaserver/userinterface/Client.py b/current/pandaserver/userinterface/Client.py new file mode 100755 index 000000000..529e2d11c --- /dev/null +++ b/current/pandaserver/userinterface/Client.py @@ -0,0 +1,880 @@ +''' +client methods + +''' + + +import os +import re +import sys +import urllib +import commands +import cPickle as pickle + + +# configuration +try: + baseURL = os.environ['PANDA_URL'] +except: + baseURL = 'http://pandaserver.cern.ch:25080/server/panda' +try: + baseURLSSL = os.environ['PANDA_URL_SSL'] +except: + baseURLSSL = 'https://pandaserver.cern.ch:25443/server/panda' + + +# exit code +EC_Failed = 255 + + +# panda server URLs +if os.environ.has_key('PANDA_URL_MAP'): + serverURLs = {'default' : {'URL' : baseURL, + 'URLSSL' : baseURLSSL}, + } + # decode envvar to map + try: + for tmpCompStr in os.environ['PANDA_URL_MAP'].split('|'): + tmpKey,tmpURL,tmpURLSSL = tmpCompStr.split(',') + # append + serverURLs[tmpKey] = {'URL' : tmpURL, + 'URLSSL' : tmpURLSSL} + except: + pass +else: + # default + serverURLs = {'default' : {'URL' : baseURL, + 'URLSSL' : baseURLSSL}, + 'CERN' : {'URL' : 'http://pandaserver.cern.ch:25080/server/panda', + 'URLSSL' : 'https://pandaserver.cern.ch:25443/server/panda'}, + } + +# bamboo +baseURLBAMBOO = 'http://pandabamboo.cern.ch:25070/bamboo/bamboo' + + +# get URL +def _getURL(type,srvID=None): + if serverURLs.has_key(srvID): + urls = serverURLs[srvID] + else: + urls = serverURLs['default'] + return urls[type] + + +# get Panda srvIDs +def getPandas(): + srvs = serverURLs.keys() + # remove 'default' + try: + srvs.remove('default') + except: + pass + return srvs + + +# look for a grid proxy certificate +def _x509(): + # see X509_USER_PROXY + try: + return os.environ['X509_USER_PROXY'] + except: + pass + # see the default place + x509 = '/tmp/x509up_u%s' % os.getuid() + if os.access(x509,os.R_OK): + return x509 + # no valid proxy certificate + # FIXME + print "No valid grid proxy certificate found" + return '' + + +# curl class +class _Curl: + # constructor + def __init__(self): + # path to curl + self.path = 'curl' + # verification of the host certificate + self.verifyHost = False + # request a compressed response + self.compress = True + # SSL cert/key + self.sslCert = '' + self.sslKey = '' + # verbose + self.verbose = False + + + # GET method + def get(self,url,data): + # make command + com = '%s --silent --get' % self.path + if not self.verifyHost: + com += ' --insecure' + if self.compress: + com += ' --compressed' + if self.sslCert != '': + com += ' --cert %s' % self.sslCert + if self.sslKey != '': + com += ' --key %s' % self.sslKey + # timeout 
+ com += ' -m 600' + # data + strData = '' + for key in data.keys(): + strData += 'data="%s"\n' % urllib.urlencode({key:data[key]}) + # write data to temporary config file + try: + tmpName = os.environ['PANDA_TMP'] + except: + tmpName = '/tmp' + tmpName += '/%s_%s' % (commands.getoutput('whoami'),commands.getoutput('uuidgen')) + tmpFile = open(tmpName,'w') + tmpFile.write(strData) + tmpFile.close() + com += ' --config %s' % tmpName + com += ' %s' % url + # execute + if self.verbose: + print com + print commands.getoutput('cat %s' % tmpName) + ret = commands.getstatusoutput(com) + # remove temporary file + os.remove(tmpName) + if ret[0] != 0: + ret = (ret[0]%255,ret[1]) + if self.verbose: + print ret + return ret + + + # POST method + def post(self,url,data): + # make command + com = '%s --silent' % self.path + if not self.verifyHost: + com += ' --insecure' + if self.compress: + com += ' --compressed' + if self.sslCert != '': + com += ' --cert %s' % self.sslCert + if self.sslKey != '': + com += ' --key %s' % self.sslKey + # timeout + com += ' -m 600' + # data + strData = '' + for key in data.keys(): + strData += 'data="%s"\n' % urllib.urlencode({key:data[key]}) + # write data to temporary config file + try: + tmpName = os.environ['PANDA_TMP'] + except: + tmpName = '/tmp' + tmpName += '/%s_%s' % (commands.getoutput('whoami'),commands.getoutput('uuidgen')) + tmpFile = open(tmpName,'w') + tmpFile.write(strData) + tmpFile.close() + com += ' --config %s' % tmpName + com += ' %s' % url + # execute + if self.verbose: + print com + print commands.getoutput('cat %s' % tmpName) + ret = commands.getstatusoutput(com) + # remove temporary file + os.remove(tmpName) + if ret[0] != 0: + ret = (ret[0]%255,ret[1]) + if self.verbose: + print ret + return ret + + + # PUT method + def put(self,url,data): + # make command + com = '%s --silent' % self.path + if not self.verifyHost: + com += ' --insecure' + if self.compress: + com += ' --compressed' + if self.sslCert != '': + com += ' --cert %s' % self.sslCert + if self.sslKey != '': + com += ' --key %s' % self.sslKey + # emulate PUT + for key in data.keys(): + com += ' -F "%s=@%s"' % (key,data[key]) + com += ' %s' % url + # execute + if self.verbose: + print com + ret = commands.getstatusoutput(com) + if ret[0] != 0: + ret = (ret[0]%255,ret[1]) + if self.verbose: + print ret + return ret + + +''' +public methods + +''' + +# use web cache +def useWebCache(): + global baseURL + baseURL = 'http://pandaserver.cern.ch:25085/server/panda' + global serverURLs + for tmpKey,tmpVal in serverURLs.iteritems(): + tmpVal['URL'] = baseURL + + +# submit jobs +def submitJobs(jobs,srvID=None,toPending=False): + # set hostname + hostname = commands.getoutput('hostname') + for job in jobs: + job.creationHost = hostname + # serialize + strJobs = pickle.dumps(jobs) + # instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + # execute + url = _getURL('URLSSL',srvID) + '/submitJobs' + data = {'jobs':strJobs} + if toPending: + data['toPending'] = True + status,output = curl.post(url,data) + if status!=0: + print output + return status,output + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR submitJobs : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# run task assignment +def runTaskAssignment(jobs): + # set hostname + hostname = commands.getoutput('hostname') + for job in jobs: + job.creationHost = hostname + # serialize + strJobs = pickle.dumps(jobs) + # 
instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + # execute + url = baseURLSSL + '/runTaskAssignment' + data = {'jobs':strJobs} + status,output = curl.post(url,data) + if status!=0: + print output + return status,output + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR runTaskAssignment : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get job status +def getJobStatus(ids,srvID=None): + # serialize + strIDs = pickle.dumps(ids) + # instantiate curl + curl = _Curl() + # execute + url = _getURL('URL',srvID) + '/getJobStatus' + data = {'ids':strIDs} + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR getJobStatus : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get PandaID with jobexeID +def getPandaIDwithJobExeID(ids): + # serialize + strIDs = pickle.dumps(ids) + # instantiate curl + curl = _Curl() + # execute + url = _getURL('URL') + '/getPandaIDwithJobExeID' + data = {'ids':strIDs} + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR getPandaIDwithJobExeID : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get assigning task +def getAssigningTask(): + # instantiate curl + curl = _Curl() + # execute + url = baseURL + '/getAssigningTask' + status,output = curl.get(url,{}) + try: + return status,pickle.loads(output) + except: + print output + type, value, traceBack = sys.exc_info() + errStr = "ERROR getAssigningTask : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get assigned cloud for tasks +def seeCloudTask(ids): + # serialize + strIDs = pickle.dumps(ids) + # instantiate curl + curl = _Curl() + # execute + url = baseURL + '/seeCloudTask' + data = {'ids':strIDs} + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR seeCloudTask : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# kill jobs +def killJobs(ids,code=None,verbose=False,srvID=None,useMailAsID=False): + # serialize + strIDs = pickle.dumps(ids) + # instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + curl.verbose = verbose + # execute + url = _getURL('URLSSL',srvID) + '/killJobs' + data = {'ids':strIDs,'code':code,'useMailAsID':useMailAsID} + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR killJobs : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# reassign jobs +def reassignJobs(ids,forPending=False): + # serialize + strIDs = pickle.dumps(ids) + # instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + # execute + url = baseURLSSL + '/reassignJobs' + data = {'ids':strIDs} + if forPending: + data['forPending'] = True + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR reassignJobs : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# query PandaIDs +def queryPandaIDs(ids): + # serialize + strIDs = pickle.dumps(ids) + # instantiate curl + curl = _Curl() + # 
execute + url = baseURL + '/queryPandaIDs' + data = {'ids':strIDs} + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR queryPandaIDs : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# query job info per cloud +def queryJobInfoPerCloud(cloud,schedulerID=None): + # instantiate curl + curl = _Curl() + # execute + url = baseURL + '/queryJobInfoPerCloud' + data = {'cloud':cloud} + if schedulerID != None: + data['schedulerID'] = schedulerID + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR queryJobInfoPerCloud : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get job statistics +def getJobStatistics(sourcetype=None): + # instantiate curl + curl = _Curl() + # execute + ret = {} + for srvID in getPandas(): + url = _getURL('URL',srvID) + '/getJobStatistics' + data = {} + if sourcetype != None: + data['sourcetype'] = sourcetype + status,output = curl.get(url,data) + try: + tmpRet = status,pickle.loads(output) + if status != 0: + return tmpRet + except: + print output + type, value, traceBack = sys.exc_info() + errStr = "ERROR getJobStatistics : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + # gather + for tmpCloud,tmpVal in tmpRet[1].iteritems(): + if not ret.has_key(tmpCloud): + # append cloud values + ret[tmpCloud] = tmpVal + else: + # sum statistics + for tmpStatus,tmpCount in tmpVal.iteritems(): + if ret[tmpCloud].has_key(tmpStatus): + ret[tmpCloud][tmpStatus] += tmpCount + else: + ret[tmpCloud][tmpStatus] = tmpCount + return 0,ret + + +# get job statistics for Bamboo +def getJobStatisticsForBamboo(useMorePG=False): + # instantiate curl + curl = _Curl() + # execute + ret = {} + for srvID in getPandas(): + url = _getURL('URL',srvID) + '/getJobStatisticsForBamboo' + data = {} + if useMorePG != False: + data['useMorePG'] = useMorePG + status,output = curl.get(url,data) + try: + tmpRet = status,pickle.loads(output) + if status != 0: + return tmpRet + except: + print output + type, value, traceBack = sys.exc_info() + errStr = "ERROR getJobStatisticsForBamboo : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + # gather + for tmpCloud,tmpMap in tmpRet[1].iteritems(): + if not ret.has_key(tmpCloud): + # append cloud values + ret[tmpCloud] = tmpMap + else: + # sum statistics + for tmpPType,tmpVal in tmpMap.iteritems(): + if not ret[tmpCloud].has_key(tmpPType): + ret[tmpCloud][tmpPType] = tmpVal + else: + for tmpStatus,tmpCount in tmpVal.iteritems(): + if ret[tmpCloud][tmpPType].has_key(tmpStatus): + ret[tmpCloud][tmpPType][tmpStatus] += tmpCount + else: + ret[tmpCloud][tmpPType][tmpStatus] = tmpCount + return 0,ret + + +# get highest prio jobs +def getHighestPrioJobStat(perPG=False,useMorePG=False): + # instantiate curl + curl = _Curl() + # execute + ret = {} + url = baseURL + '/getHighestPrioJobStat' + data = {'perPG':perPG} + if useMorePG != False: + data['useMorePG'] = useMorePG + status,output = curl.get(url,data) + try: + return status,pickle.loads(output) + except: + print output + type, value, traceBack = sys.exc_info() + errStr = "ERROR getHighestPrioJobStat : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get jobs updated recently +def getJobsToBeUpdated(limit=5000,lockedby='',srvID=None): + # instantiate curl + curl = _Curl() + # execute + 
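The query helpers above all share one calling convention: they return a (status, result) tuple, where status is 0 on success and result is the unpickled server response, or the raw output plus an error string on failure. A hypothetical session, assuming the module is importable as Client, the server is reachable, the PandaIDs are invented, and the server answers getJobStatus with one JobSpec-like object (or None) per requested ID; killJobs additionally needs a valid grid proxy:

import Client

status, jobSpecs = Client.getJobStatus([1234567890, 1234567891])
if status == 0:
    for job in jobSpecs:
        if job == None:
            print 'unknown PandaID'
        else:
            print job.PandaID, job.jobStatus
else:
    print 'getJobStatus failed: %s' % jobSpecs

# kill the first job (requires a grid proxy)
status, results = Client.killJobs([1234567890], verbose=True)
print status, results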
url = _getURL('URL',srvID) + '/getJobsToBeUpdated' + status,output = curl.get(url,{'limit':limit,'lockedby':lockedby}) + try: + return status,pickle.loads(output) + except: + print output + type, value, traceBack = sys.exc_info() + errStr = "ERROR getJobsToBeUpdated : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# update prodDBUpdateTimes +def updateProdDBUpdateTimes(params,verbose=False,srvID=None): + # serialize + strPar = pickle.dumps(params) + # instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + curl.verbose = verbose + # execute + url = _getURL('URLSSL',srvID) + '/updateProdDBUpdateTimes' + data = {'params':strPar} + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR updateProdDBUpdateTimes : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get PandaID at site +def getPandaIDsSite(site,status,limit=500): + # instantiate curl + curl = _Curl() + # execute + url = baseURL + '/getPandaIDsSite' + status,output = curl.get(url,{'site':site,'status':status,'limit':limit}) + try: + return status,pickle.loads(output) + except: + print output + type, value, traceBack = sys.exc_info() + errStr = "ERROR getPandaIDsSite : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get job statistics per site +def getJobStatisticsPerSite(predefined=False,workingGroup='',countryGroup='',jobType='',minPriority=None, + readArchived=None): + # instantiate curl + curl = _Curl() + # execute + ret = {} + for srvID in getPandas(): + url = _getURL('URL',srvID) + '/getJobStatisticsPerSite' + data = {'predefined':predefined} + if not workingGroup in ['',None]: + data['workingGroup'] = workingGroup + if not countryGroup in ['',None]: + data['countryGroup'] = countryGroup + if not jobType in ['',None]: + data['jobType'] = jobType + if not minPriority in ['',None]: + data['minPriority'] = minPriority + if not readArchived in ['',None]: + data['readArchived'] = readArchived + status,output = curl.get(url,data) + try: + tmpRet = status,pickle.loads(output) + if status != 0: + return tmpRet + except: + print output + type, value, traceBack = sys.exc_info() + errStr = "ERROR getJobStatisticsPerSite : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + # gather + for tmpSite,tmpVal in tmpRet[1].iteritems(): + if not ret.has_key(tmpSite): + # append site values + ret[tmpSite] = tmpVal + else: + # sum statistics + for tmpStatus,tmpCount in tmpVal.iteritems(): + if ret[tmpSite].has_key(tmpStatus): + ret[tmpSite][tmpStatus] += tmpCount + else: + ret[tmpSite][tmpStatus] = tmpCount + return 0,ret + + +# get job statistics per site with label +def getJobStatisticsWithLabel(site=''): + # instantiate curl + curl = _Curl() + # execute + url = baseURL + '/getJobStatisticsWithLabel' + data = {} + if not site in ['',None]: + data['site'] = site + status,output = curl.get(url,data) + try: + return status,pickle.loads(output) + except: + print output + type, value, traceBack = sys.exc_info() + errStr = "ERROR getJobStatisticsWithLabel : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get the number of waiting jobs per site and user +def getJobStatisticsPerUserSite(): + # instantiate curl + curl = _Curl() + # execute + url = baseURL + '/getJobStatisticsPerUserSite' + data = {} + status,output = curl.get(url,data) + try: + return status,pickle.loads(output) 
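The gather loops in getJobStatistics, getJobStatisticsForBamboo and getJobStatisticsPerSite above merge the maps returned by each PanDA instance by summing counts per status. A self-contained sketch of that merge, using two invented per-server maps:

ret = {}
perServerResults = [{'ANALY_CERN': {'running': 10, 'activated': 5}},
                    {'ANALY_CERN': {'running': 3}, 'ANALY_BNL': {'holding': 7}}]
for tmpMap in perServerResults:
    for tmpSite, tmpVal in tmpMap.iteritems():
        if not ret.has_key(tmpSite):
            # first server reporting this site
            ret[tmpSite] = dict(tmpVal)
        else:
            # sum statistics from additional servers
            for tmpStatus, tmpCount in tmpVal.iteritems():
                ret[tmpSite][tmpStatus] = ret[tmpSite].get(tmpStatus, 0) + tmpCount

print ret['ANALY_CERN']   # {'running': 13, 'activated': 5}  (key order may differ)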
+ except: + print output + type, value, traceBack = sys.exc_info() + errStr = "ERROR getJobStatisticsPerUserSite : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# query last files in datasets +def queryLastFilesInDataset(datasets): + # serialize + strDSs = pickle.dumps(datasets) + # instantiate curl + curl = _Curl() + # execute + url = baseURL + '/queryLastFilesInDataset' + data = {'datasets':strDSs} + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + print "ERROR queryLastFilesInDataset : %s %s" % (type,value) + return EC_Failed,None + + +# insert sandbox file info +def insertSandboxFileInfo(userName,fileName,fileSize,checkSum,verbose=False): + # instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + curl.verbose = verbose + # execute + url = baseURLSSL + '/insertSandboxFileInfo' + data = {'userName':userName,'fileName':fileName,'fileSize':fileSize,'checkSum':checkSum} + return curl.post(url,data) + + +# put file +def putFile(file): + # instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + # execute + url = baseURLSSL + '/putFile' + data = {'file':file} + return curl.put(url,data) + + +# delete file +def deleteFile(file): + # instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + # execute + url = baseURLSSL + '/deleteFile' + data = {'file':file} + return curl.post(url,data) + + +# touch file +def touchFile(sourceURL,filename): + # instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + # execute + url = sourceURL + '/server/panda/touchFile' + data = {'filename':filename} + return curl.post(url,data) + + +# resubmit jobs +def resubmitJobs(ids): + # serialize + strIDs = pickle.dumps(ids) + # instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + # execute + url = baseURLSSL + '/resubmitJobs' + data = {'ids':strIDs} + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + print "ERROR resubmitJobs : %s %s" % (type,value) + return EC_Failed,None + + +# get site specs +def getSiteSpecs(siteType=None): + # instantiate curl + curl = _Curl() + # execute + url = baseURL + '/getSiteSpecs' + data = {} + if siteType != None: + data = {'siteType':siteType} + status,output = curl.get(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR getSiteSpecs : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get cloud specs +def getCloudSpecs(): + # instantiate curl + curl = _Curl() + # execute + url = baseURL + '/getCloudSpecs' + status,output = curl.get(url,{}) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR getCloudSpecs : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# get nPilots +def getNumPilots(): + # instantiate curl + curl = _Curl() + # execute + url = baseURL + '/getNumPilots' + status,output = curl.get(url,{}) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR getNumPilots : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# run brokerage +def runBrokerage(sites,atlasRelease,cmtConfig=None): + # serialize + strSites = pickle.dumps(sites) + # instantiate curl + curl = 
_Curl() + # execute + url = baseURL + '/runBrokerage' + data = {'sites':strSites, + 'atlasRelease':atlasRelease} + if cmtConfig != None: + data['cmtConfig'] = cmtConfig + return curl.get(url,data) + + +# get RW +def getRW(priority=0): + # instantiate curl + curl = _Curl() + # execute + url = baseURLBAMBOO + '/getRW' + # get RWs for high priority tasks + data = {'priority':priority} + status,output = curl.get(url,data) + try: + return status,pickle.loads(output) + except: + type, value, traceBack = sys.exc_info() + errStr = "ERROR getRW : %s %s" % (type,value) + print errStr + return EC_Failed,output+'\n'+errStr + + +# change job priorities +def changeJobPriorities(newPrioMap): + # serialize + newPrioMapStr = pickle.dumps(newPrioMap) + # instantiate curl + curl = _Curl() + curl.sslCert = _x509() + curl.sslKey = _x509() + # execute + url = baseURLSSL + '/changeJobPriorities' + data = {'newPrioMap':newPrioMapStr} + status,output = curl.post(url,data) + try: + return status,pickle.loads(output) + except: + errtype,errvalue = sys.exc_info()[:2] + errStr = "ERROR changeJobPriorities : %s %s" % (errtype,errvalue) + return EC_Failed,output+'\n'+errStr + + diff --git a/current/pandaserver/userinterface/RbLauncher.py b/current/pandaserver/userinterface/RbLauncher.py new file mode 100755 index 000000000..a23a6fbcf --- /dev/null +++ b/current/pandaserver/userinterface/RbLauncher.py @@ -0,0 +1,52 @@ +''' +launcer for ReBroker + +''' + +import sys +import time +import commands +import threading + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('RbLauncher') + + +class RbLauncher (threading.Thread): + # constructor + def __init__(self,dn,jobID,cloud=None,excludedSite=None): + threading.Thread.__init__(self) + self.dn = dn + self.jobID = jobID + self.cloud = cloud + self.excludedSite = excludedSite + # time stamp + self.timestamp = time.asctime() + + + # main + def run(self): + try: + _logger.debug('%s startRun' % self.timestamp) + # run + com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd) + com += 'source %s; ' % panda_config.glite_source + com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/userinterface/runReBroker.py ' % \ + (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python, + panda_config.pandaPython_dir) + com += '-j %s -d "%s" ' % (self.jobID,self.dn) + if self.cloud != None: + com += '-c %s ' % self.cloud + if self.excludedSite != None: + com += '-e %s ' % self.excludedSite + # exeute + _logger.debug('%s com=%s' % (self.timestamp,com)) + status,output = commands.getstatusoutput(com) + _logger.debug("%s Ret from another process: %s %s" % (self.timestamp,status,output)) + _logger.debug('%s endRun' % self.timestamp) + except: + type, value, traceBack = sys.exc_info() + _logger.error("run() : %s %s" % (type,value)) diff --git a/current/pandaserver/userinterface/ReBroker.py b/current/pandaserver/userinterface/ReBroker.py new file mode 100644 index 000000000..205b375ee --- /dev/null +++ b/current/pandaserver/userinterface/ReBroker.py @@ -0,0 +1,1022 @@ +''' +find another candidate site for analysis + +''' + +import re +import sys +import time +import random +import datetime +import threading + +from dataservice.DDM import ddm +from dataservice.DDM import dq2Common +from taskbuffer.JobSpec import JobSpec +from taskbuffer.OraDBProxy import DBProxy +from dataservice.Setupper import Setupper +from brokerage.SiteMapper import SiteMapper 
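A hypothetical call to the changeJobPriorities client method above. The shape of the map (PandaID mapped to its new priority) is an assumption based on the parameter name; the server side requires a grid proxy with production role, as enforced in UserIF further below.

import Client

# invented PandaIDs mapped to their new priorities
newPrioMap = {1234567890: 500, 1234567891: -100}
status, res = Client.changeJobPriorities(newPrioMap)
print status, res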
+import brokerage.broker + +from config import panda_config +from pandalogger.PandaLogger import PandaLogger + +# logger +_logger = PandaLogger().getLogger('ReBroker') + +def initLogger(pLogger): + # redirect logging to parent as it doesn't work in nested threads + global _logger + _logger = pLogger + + +class ReBroker (threading.Thread): + + # constructor + def __init__(self,taskBuffer,cloud=None,excludedSite=None,overrideSite=True, + simulation=False,forceOpt=False,userRequest=False,forFailed=False, + avoidSameSite=False): + threading.Thread.__init__(self) + self.job = None + self.jobID = None + self.pandaID = None + self.cloud = cloud + self.pandaJobList = [] + self.buildStatus = None + self.taskBuffer = taskBuffer + self.token = None + self.newDatasetMap = {} + self.simulation = simulation + self.forceOpt = forceOpt + self.excludedSite = excludedSite + self.overrideSite = overrideSite + self.maxPandaIDlibDS = None + self.userRequest = userRequest + self.forFailed = forFailed + self.revNum = 0 + self.avoidSameSite = avoidSameSite + self.brokerageInfo = [] + + + # main + def run(self): + try: + # get job + tmpJobs = self.taskBuffer.getFullJobStatus([self.rPandaID]) + if tmpJobs == [] or tmpJobs[0] == None: + _logger.debug("cannot find job for PandaID=%s" % self.rPandaID) + return + self.job = tmpJobs[0] + _logger.debug("%s start %s:%s:%s" % (self.token,self.job.jobDefinitionID,self.job.prodUserName,self.job.computingSite)) + # using output container + if not self.job.destinationDBlock.endswith('/'): + _logger.debug("%s ouput dataset container is required" % self.token) + _logger.debug("%s end" % self.token) + return + # FIXEME : dont' touch group jobs for now + if self.job.destinationDBlock.startswith('group') and (not self.userRequest): + _logger.debug("%s skip group jobs" % self.token) + _logger.debug("%s end" % self.token) + return + # check processingType + typesForRebro = ['pathena','prun','ganga','ganga-rbtest'] + if not self.job.processingType in typesForRebro: + _logger.debug("%s skip processingType=%s not in %s" % \ + (self.token,self.job.processingType,str(typesForRebro))) + _logger.debug("%s end" % self.token) + return + # check jobsetID + if self.job.jobsetID in [0,'NULL',None]: + _logger.debug("%s jobsetID is undefined" % self.token) + _logger.debug("%s end" % self.token) + return + # check metadata + if self.job.metadata in [None,'NULL']: + _logger.debug("%s metadata is unavailable" % self.token) + _logger.debug("%s end" % self.token) + return + # check --disableRebrokerage + match = re.search("--disableRebrokerage",self.job.metadata) + if match != None and (not self.simulation) and (not self.forceOpt) \ + and (not self.userRequest): + _logger.debug("%s diabled rebrokerage" % self.token) + _logger.debug("%s end" % self.token) + return + # check --site + match = re.search("--site",self.job.metadata) + if match != None and (not self.simulation) and (not self.forceOpt) \ + and (not self.userRequest): + _logger.debug("%s --site is used" % self.token) + _logger.debug("%s end" % self.token) + return + # check --libDS + match = re.search("--libDS",self.job.metadata) + if match != None: + _logger.debug("%s --libDS is used" % self.token) + _logger.debug("%s end" % self.token) + return + # check --workingGroup since it is site-specific + match = re.search("--workingGroup",self.job.metadata) + if match != None: + _logger.debug("%s workingGroup is specified" % self.token) + _logger.debug("%s end" % self.token) + return + # avoid too many rebrokerage + if not self.checkRev(): + 
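ReBroker.run() above bails out when the user's submission options make the job unsuitable for rebrokerage (unless the run is forced or user-requested). The screening is a set of plain regex searches on job.metadata; a standalone sketch with an invented metadata string:

import re

# made-up metadata as stored for a pathena/prun job
metadata = '--inDS data12_8TeV.mydata/ --site ANALY_CERN --nFilesPerJob 5'
for blocker in ['--disableRebrokerage', '--site', '--libDS', '--workingGroup']:
    if re.search(blocker, metadata) != None:
        print 'skip rebrokerage: %s is used' % blocker
# -> skip rebrokerage: --site is used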
_logger.debug("%s avoid too many rebrokerage" % self.token) + _logger.debug("%s end" % self.token) + return + # check if multiple JobIDs use the same libDS + if self.bPandaID != None and self.buildStatus not in ['finished','failed']: + if self.minPandaIDlibDS == None or self.maxPandaIDlibDS == None: + _logger.debug("%s max/min PandaIDs are unavailable for the libDS" % self.token) + _logger.debug("%s end" % self.token) + return + tmpPandaIDsForLibDS = self.taskBuffer.getFullJobStatus([self.minPandaIDlibDS,self.maxPandaIDlibDS]) + if len(tmpPandaIDsForLibDS) != 2 or tmpPandaIDsForLibDS[0] == None or tmpPandaIDsForLibDS[1] == None: + _logger.debug("%s failed to get max/min PandaIDs for the libDS" % self.token) + _logger.debug("%s end" % self.token) + return + # check + if tmpPandaIDsForLibDS[0].jobDefinitionID != tmpPandaIDsForLibDS[1].jobDefinitionID: + _logger.debug("%s multiple JobIDs use the libDS %s:%s %s:%s" % (self.token,tmpPandaIDsForLibDS[0].jobDefinitionID, + self.minPandaIDlibDS,tmpPandaIDsForLibDS[1].jobDefinitionID, + self.maxPandaIDlibDS)) + _logger.debug("%s end" % self.token) + return + # check excludedSite + if self.excludedSite == None: + self.excludedSite = [] + match = re.search("--excludedSite( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata) + if match != None: + self.excludedSite = match.group(3).split(',') + # remove empty + try: + self.excludedSite.remove('') + except: + pass + _logger.debug("%s excludedSite=%s" % (self.token,str(self.excludedSite))) + # check cloud + if self.cloud == None: + match = re.search("--cloud( +|=)\s*(\'|\")*([^ \"\';$]+)",self.job.metadata) + if match != None: + self.cloud = match.group(3) + _logger.debug("%s cloud=%s" % (self.token,self.cloud)) + # get inDS/LFNs + status,tmpMapInDS,maxFileSize = self.taskBuffer.getInDatasetsForReBrokerage(self.jobID,self.userName) + if not status: + # failed + _logger.error("%s failed to get inDS/LFN from DB" % self.token) + return + status,inputDS = self.getListDatasetsUsedByJob(tmpMapInDS) + if not status: + # failed + _logger.error("%s failed" % self.token) + return + # get relicas + replicaMap = {} + unknownSites = {} + for tmpDS in inputDS: + if tmpDS.endswith('/'): + # container + status,tmpRepMaps = self.getListDatasetReplicasInContainer(tmpDS) + else: + # normal dataset + status,tmpRepMap = self.getListDatasetReplicas(tmpDS) + tmpRepMaps = {tmpDS:tmpRepMap} + if not status: + # failed + _logger.debug("%s failed" % self.token) + return + # make map per site + for tmpDS,tmpRepMap in tmpRepMaps.iteritems(): + for tmpSite,tmpStat in tmpRepMap.iteritems(): + # ignore special sites + if tmpSite in ['CERN-PROD_TZERO','CERN-PROD_DAQ','CERN-PROD_TMPDISK']: + continue + # ignore tape sites + if tmpSite.endswith('TAPE'): + continue + # keep sites with unknown replica info + if tmpStat[-1]['found'] == None: + if not unknownSites.has_key(tmpDS): + unknownSites[tmpDS] = [] + unknownSites[tmpDS].append(tmpSite) + # ignore ToBeDeleted + if tmpStat[-1]['archived'] in ['ToBeDeleted',]: + continue + # change EOS + if tmpSite.startswith('CERN-PROD_EOS'): + tmpSite = 'CERN-PROD_EOS' + # change EOS TMP + if tmpSite.startswith('CERN-PROD_TMP'): + tmpSite = 'CERN-PROD_TMP' + # change DISK to SCRATCHDISK + tmpSite = re.sub('_[^_-]+DISK$','',tmpSite) + # change PERF-XYZ to SCRATCHDISK + tmpSite = re.sub('_PERF-[^_-]+$','',tmpSite) + # change PHYS-XYZ to SCRATCHDISK + tmpSite = re.sub('_PHYS-[^_-]+$','',tmpSite) + # patch for BNLPANDA + if tmpSite in ['BNLPANDA']: + tmpSite = 'BNL-OSG2' + # add to map + if not 
replicaMap.has_key(tmpSite): + replicaMap[tmpSite] = {} + replicaMap[tmpSite][tmpDS] = tmpStat[-1] + _logger.debug("%s replica map -> %s" % (self.token,str(replicaMap))) + # refresh replica info in needed + self.refreshReplicaInfo(unknownSites) + # instantiate SiteMapper + siteMapper = SiteMapper(self.taskBuffer) + # get original DDM + origSiteDDM = self.getAggName(siteMapper.getSite(self.job.computingSite).ddm) + # check all datasets + maxDQ2Sites = [] + if inputDS != []: + # loop over all sites + for tmpSite,tmpDsVal in replicaMap.iteritems(): + # loop over all datasets + appendFlag = True + for tmpOrigDS in inputDS: + # check completeness + if tmpDsVal.has_key(tmpOrigDS) and tmpDsVal[tmpOrigDS]['found'] != None and \ + tmpDsVal[tmpOrigDS]['total'] == tmpDsVal[tmpOrigDS]['found']: + pass + else: + appendFlag = False + # append + if appendFlag: + if not tmpSite in maxDQ2Sites: + maxDQ2Sites.append(tmpSite) + _logger.debug("%s candidate DQ2s -> %s" % (self.token,str(maxDQ2Sites))) + if inputDS != [] and maxDQ2Sites == []: + _logger.debug("%s no DQ2 candidate" % self.token) + else: + maxPandaSites = [] + # original maxinputsize + origMaxInputSize = siteMapper.getSite(self.job.computingSite).maxinputsize + # look for Panda siteIDs + for tmpSiteID,tmpSiteSpec in siteMapper.siteSpecList.iteritems(): + # use ANALY_ only + if not tmpSiteID.startswith('ANALY_'): + continue + # remove test and local + if re.search('_test',tmpSiteID,re.I) != None: + continue + if re.search('_local',tmpSiteID,re.I) != None: + continue + # avoid same site + if self.avoidSameSite and self.getAggName(tmpSiteSpec.ddm) == origSiteDDM: + continue + # check DQ2 ID + if self.cloud in [None,tmpSiteSpec.cloud] \ + and (self.getAggName(tmpSiteSpec.ddm) in maxDQ2Sites or inputDS == []): + # excluded sites + excludedFlag = False + for tmpExcSite in self.excludedSite: + if re.search(tmpExcSite,tmpSiteID) != None: + excludedFlag = True + break + if excludedFlag: + _logger.debug("%s skip %s since excluded" % (self.token,tmpSiteID)) + continue + # use online only + if tmpSiteSpec.status != 'online': + _logger.debug("%s skip %s status=%s" % (self.token,tmpSiteID,tmpSiteSpec.status)) + continue + # check maxinputsize + if (maxFileSize == None and origMaxInputSize > siteMapper.getSite(tmpSiteID).maxinputsize) or \ + maxFileSize > siteMapper.getSite(tmpSiteID).maxinputsize: + _logger.debug("%s skip %s due to maxinputsize" % (self.token,tmpSiteID)) + continue + # append + if not tmpSiteID in maxPandaSites: + maxPandaSites.append(tmpSiteID) + # choose at most 20 sites randomly to avoid too many lookup + random.shuffle(maxPandaSites) + maxPandaSites = maxPandaSites[:20] + _logger.debug("%s candidate PandaSites -> %s" % (self.token,str(maxPandaSites))) + # no Panda siteIDs + if maxPandaSites == []: + _logger.debug("%s no Panda site candidate" % self.token) + else: + # set AtlasRelease and cmtConfig to dummy job + tmpJobForBrokerage = JobSpec() + if self.job.AtlasRelease in ['NULL',None]: + tmpJobForBrokerage.AtlasRelease = '' + else: + tmpJobForBrokerage.AtlasRelease = self.job.AtlasRelease + # use nightlies + matchNight = re.search('^AnalysisTransforms-.*_(rel_\d+)$',self.job.homepackage) + if matchNight != None: + tmpJobForBrokerage.AtlasRelease += ':%s' % matchNight.group(1) + # use cache + else: + matchCache = re.search('^AnalysisTransforms-([^/]+)',self.job.homepackage) + if matchCache != None: + tmpJobForBrokerage.AtlasRelease = matchCache.group(1).replace('_','-') + if not self.job.cmtConfig in ['NULL',None]: + 
tmpJobForBrokerage.cmtConfig = self.job.cmtConfig + # memory size + if not self.job.minRamCount in ['NULL',None,0]: + tmpJobForBrokerage.minRamCount = self.job.minRamCount + # CPU count + if not self.job.maxCpuCount in ['NULL',None,0]: + tmpJobForBrokerage.maxCpuCount = self.job.maxCpuCount + # run brokerage + brokerage.broker.schedule([tmpJobForBrokerage],self.taskBuffer,siteMapper,forAnalysis=True, + setScanSiteList=maxPandaSites,trustIS=True,reportLog=True) + newSiteID = tmpJobForBrokerage.computingSite + self.brokerageInfo += tmpJobForBrokerage.brokerageErrorDiag + _logger.debug("%s runBrokerage - > %s" % (self.token,newSiteID)) + # unknown site + if not siteMapper.checkSite(newSiteID): + _logger.error("%s unknown site" % self.token) + _logger.debug("%s failed" % self.token) + return + # get new site spec + newSiteSpec = siteMapper.getSite(newSiteID) + # avoid repetition + if self.getAggName(newSiteSpec.ddm) == origSiteDDM: + _logger.debug("%s assigned to the same site %s " % (self.token,newSiteID)) + _logger.debug("%s end" % self.token) + return + # simulation mode + if self.simulation: + _logger.debug("%s end simulation" % self.token) + return + # prepare jobs + status = self.prepareJob(newSiteID,newSiteSpec.cloud) + if status: + # run SetUpper + statusSetUp = self.runSetUpper() + if not statusSetUp: + _logger.debug("%s runSetUpper failed" % self.token) + else: + _logger.debug("%s successfully assigned to %s" % (self.token,newSiteID)) + _logger.debug("%s end" % self.token) + except: + errType,errValue,errTraceBack = sys.exc_info() + _logger.error("%s run() : %s %s" % (self.token,errType,errValue)) + + + # get aggregated DQ2 ID + def getAggName(self,origName): + if origName.startswith('CERN-PROD_EOS'): + return 'CERN-PROD_EOS' + if origName.startswith('CERN-PROD_TMP'): + return 'CERN-PROD_TMP' + return re.sub('_[^_-]+DISK$','',origName) + + + # lock job to disable multiple broker running in parallel + def lockJob(self,dn,jobID): + # make token + tmpProxy = DBProxy() + self.token = "%s:%s:" % (tmpProxy.cleanUserID(dn),jobID) + _logger.debug("%s lockJob" % self.token) + # lock + resST,resVal = self.taskBuffer.lockJobForReBrokerage(dn,jobID,self.simulation,self.forceOpt, + forFailed=self.forFailed) + # failed + if not resST: + _logger.debug("%s lockJob failed since %s" % (self.token,resVal['err'])) + return False,resVal['err'] + # keep jobID + self.jobID = jobID + # set PandaID,buildStatus,userName + self.rPandaID = resVal['rPandaID'] + self.bPandaID = resVal['bPandaID'] + self.userName = resVal['userName'] + self.buildStatus = resVal['bStatus'] + self.buildJobID = resVal['bJobID'] + self.minPandaIDlibDS = resVal['minPandaIDlibDS'] + self.maxPandaIDlibDS = resVal['maxPandaIDlibDS'] + # use JobID as rev num + self.revNum = self.taskBuffer.getJobIdUser(dn) + _logger.debug("%s run PandaID=%s / build PandaID=%s Status=%s JobID=%s rev=%s" % \ + (self.token,self.rPandaID,self.bPandaID,self.buildStatus, + self.buildJobID,self.revNum)) + # return + return True,'' + + + # move build job to jobsDefined4 + def prepareJob(self,site,cloud): + _logger.debug("%s prepareJob" % self.token) + # reuse buildJob + all runJobs + if self.jobID == self.buildJobID and self.buildStatus in ['defined','activated']: + if self.buildStatus == 'activated': + # move build job to jobsDefined4 + ret = self.taskBuffer.resetBuildJobForReBrokerage(self.bPandaID) + if not ret: + _logger.error("%s failed to move build job %s to jobsDefined" % (self.token,self.bPandaID)) + return False + # get PandaIDs from jobsDefined4 + 
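getAggName and the replica-map loop above reduce DQ2/DDM endpoint names to an aggregated site name by collapsing the CERN EOS/TMP endpoints and stripping the space-token suffixes (*DISK, _PERF-*, _PHYS-*). A quick standalone check that combines those substitutions into one helper; the endpoint names are illustrative:

import re

def aggName(origName):
    # normalization as applied in the replica-map loop and getAggName above
    if origName.startswith('CERN-PROD_EOS'):
        return 'CERN-PROD_EOS'
    if origName.startswith('CERN-PROD_TMP'):
        return 'CERN-PROD_TMP'
    name = re.sub('_[^_-]+DISK$', '', origName)
    name = re.sub('_PERF-[^_-]+$', '', name)
    name = re.sub('_PHYS-[^_-]+$', '', name)
    return name

for endpoint in ['CERN-PROD_SCRATCHDISK', 'BNL-OSG2_PERF-JETS', 'CERN-PROD_EOSDATADISK']:
    print endpoint, '->', aggName(endpoint)
# CERN-PROD_SCRATCHDISK -> CERN-PROD
# BNL-OSG2_PERF-JETS -> BNL-OSG2
# CERN-PROD_EOSDATADISK -> CERN-PROD_EOS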
tmpPandaIDs = self.taskBuffer.getPandaIDsForReBrokerage(self.userName,self.jobID,False, + forFailed=self.forFailed) + if tmpPandaIDs == []: + _logger.error("%s cannot find PandaDSs" % self.token) + return False + # get jobSpecs + iBunchJobs = 0 + nBunchJobs = 500 + tmpJobsMap = {} + while iBunchJobs < len(tmpPandaIDs): + # get IDs + tmpJobs = self.taskBuffer.peekJobs(tmpPandaIDs[iBunchJobs:iBunchJobs+nBunchJobs],True,False,False,False) + for tmpJob in tmpJobs: + if tmpJob != None and tmpJob.jobStatus in ['defined','assigned']: + # remove _sub suffix + for tmpFile in tmpJob.Files: + if tmpFile.type != 'input': + tmpFile.destinationDBlock = re.sub('_sub\d+$','',tmpFile.destinationDBlock) + self.pandaJobList.append(tmpJob) + # increment index + iBunchJobs += nBunchJobs + # make new bunch + else: + # make new buildJob + if self.bPandaID != None: + tmpJobs = self.taskBuffer.getFullJobStatus([self.bPandaID]) + if tmpJobs == [] or tmpJobs[0] == None: + _logger.debug("cannot find build job for PandaID=%s" % self.bPandaID) + return False + # make + tmpBuildJob,oldLibDS,newLibDS = self.makeNewBuildJobForRebrokerage(tmpJobs[0]) + # set parameters + tmpBuildJob.jobExecutionID = self.jobID + tmpBuildJob.jobsetID = -1 + tmpBuildJob.sourceSite = self.job.jobsetID + # register + status = self.registerNewDataset(newLibDS) + if not status: + _logger.debug("%s failed to register new libDS" % self.token) + return False + # append + self.pandaJobList = [tmpBuildJob] + # prepare outputDS + status = self.prepareDS() + if not status: + _logger.error("%s failed to prepare outputDS" % self.token) + return False + # get PandaIDs + if self.buildStatus in ['finished',None]: + # from jobsActivated when buildJob already finished or noBuild + tmpPandaIDs = self.taskBuffer.getPandaIDsForReBrokerage(self.userName,self.jobID,True, + forFailed=self.forFailed) + else: + # from jobsDefined + tmpPandaIDs = self.taskBuffer.getPandaIDsForReBrokerage(self.userName,self.jobID,False, + forFailed=self.forFailed) + if tmpPandaIDs == []: + _logger.error("%s cannot find PandaDSs" % self.token) + return False + # get jobSpecs + iBunchJobs = 0 + nBunchJobs = 500 + tmpJobsMap = {} + while iBunchJobs < len(tmpPandaIDs): + # get jobs + tmpJobs = self.taskBuffer.peekJobs(tmpPandaIDs[iBunchJobs:iBunchJobs+nBunchJobs],True,True,False,False,True) + for tmpJob in tmpJobs: + # reset parameters for retry + if self.forFailed and tmpJob != None: + self.taskBuffer.retryJob(tmpJob.PandaID,{},failedInActive=True, + changeJobInMem=True,inMemJob=tmpJob) + # set holding to be compatible with rebro jobs + tmpJob.jobStatus = 'holding' + # check job status. 
activated jobs were changed to holding by getPandaIDsForReBrokerage + if tmpJob != None and tmpJob.jobStatus in ['defined','assigned','holding']: + # reset parameter + tmpJob.parentID = tmpJob.PandaID + tmpJob.PandaID = None + tmpJob.jobExecutionID = tmpJob.jobDefinitionID + tmpJob.jobsetID = -1 + tmpJob.sourceSite = self.job.jobsetID + if self.bPandaID != None: + tmpJob.jobParameters = re.sub(oldLibDS,newLibDS,tmpJob.jobParameters) + for tmpFile in tmpJob.Files: + tmpFile.row_ID = None + tmpFile.PandaID = None + if tmpFile.type == 'input': + if self.bPandaID != None and tmpFile.dataset == oldLibDS: + tmpFile.status = 'unknown' + tmpFile.GUID = None + tmpFile.dataset = newLibDS + tmpFile.dispatchDBlock = newLibDS + tmpFile.lfn = re.sub(oldLibDS,newLibDS,tmpFile.lfn) + else: + # use new dataset + tmpFile.destinationDBlock = re.sub('_sub\d+$','',tmpFile.destinationDBlock) + if not self.newDatasetMap.has_key(tmpFile.destinationDBlock): + _logger.error("%s cannot find new dataset for %s:%s" % (self.token,tmpFile.PandaID,tmpFile.destinationDBlock)) + return False + tmpFile.destinationDBlock = self.newDatasetMap[tmpFile.destinationDBlock] + # append + self.pandaJobList.append(tmpJob) + # increment index + iBunchJobs += nBunchJobs + # no jobs + if self.pandaJobList == []: + _logger.error("%s no jobs" % self.token) + return False + # set cloud, site, and specialHandling + for tmpJob in self.pandaJobList: + # set specialHandling + if tmpJob.specialHandling in [None,'NULL','']: + if not self.forFailed: + tmpJob.specialHandling = 'rebro' + else: + tmpJob.specialHandling = 'sretry' + else: + if not self.forFailed: + tmpJob.specialHandling += ',rebro' + else: + tmpJob.specialHandling += ',sretry' + # check if --destSE is used + oldComputingSite = tmpJob.computingSite + if tmpJob.destinationSE == oldComputingSite: + tmpJob.destinationSE = site + # set site and cloud + tmpJob.computingSite = site + tmpJob.cloud = cloud + # reset destinationDBlock + for tmpFile in tmpJob.Files: + if tmpFile.type in ['output','log']: + # set destSE + if tmpFile.destinationSE == oldComputingSite: + tmpFile.destinationSE = site + # set the same specialHandling since new build may have different specialHandling + self.pandaJobList[0].specialHandling = self.pandaJobList[-1].specialHandling + # return + return True + + + # prepare libDS + def prepareDS(self): + _logger.debug("%s prepareDS" % self.token) + # get all outDSs + shadowDsName = None + for tmpFile in self.job.Files: + if tmpFile.type in ['output','log']: + tmpDS = re.sub('_sub\d+$','',tmpFile.destinationDBlock) + # append new rev number + match = re.search('_rev(\d+)$',tmpDS) + if match == None: + newDS = tmpDS + '_rev%s' % self.revNum + else: + newDS = re.sub('_rev(\d+)$','_rev%s' % self.revNum,tmpDS) + # add shadow + """ + if shadowDsName == None and tmpFile.type == 'log': + shadowDsName = "%s_shadow" % newDS + status = self.registerNewDataset(shadowDsName) + if not status: + _logger.debug("%s prepareDS failed for shadow" % self.token) + return False + """ + # add datasets + if not tmpDS in self.newDatasetMap: + # register + status = self.registerNewDataset(newDS,tmpFile.dataset) + if not status: + _logger.debug("%s prepareDS failed" % self.token) + return False + # append + self.newDatasetMap[tmpDS] = newDS + return True + + + # run SetUpper + def runSetUpper(self): + # reuse buildJob + all runJobs + reuseFlag = False + if self.jobID == self.buildJobID and self.buildStatus in ['defined','activated']: + reuseFlag = True + _logger.debug("%s start Setupper for 
JobID=%s" % (self.token,self.jobID)) + thr = Setupper(self.taskBuffer,self.pandaJobList,resetLocation=True) + thr.start() + thr.join() + # new bunch + else: + # fake FQANs + fqans = [] + if not self.job.countryGroup in ['','NULL',None]: + fqans.append('/atlas/%s/Role=NULL' % self.job.countryGroup) + if self.job.destinationDBlock.startswith('group') and not self.job.workingGroup in ['','NULL',None]: + fqans.append('/atlas/%s/Role=production' % self.job.workingGroup) + # insert jobs + _logger.debug("%s start storeJobs for JobID=%s" % (self.token,self.jobID)) + ret = self.taskBuffer.storeJobs(self.pandaJobList,self.job.prodUserID,True,False,fqans, + self.job.creationHost,True,checkSpecialHandling=False) + if ret == []: + _logger.error("%s storeJobs failed with [] for JobID=%s" % (self.token,self.jobID)) + return False + # get PandaIDs to be killed + pandaIDsTobeKilled = [] + newJobDefinitionID = None + newJobsetID = None + strNewIDsList = [] + for tmpIndex,tmpItem in enumerate(ret): + if not tmpItem[0] in ['NULL',None]: + tmpJob = self.pandaJobList[tmpIndex] + if not tmpJob.parentID in [0,None,'NULL']: + pandaIDsTobeKilled.append(tmpJob.parentID) + if newJobDefinitionID == None: + newJobDefinitionID = tmpItem[1] + if newJobsetID == None: + newJobsetID = tmpItem[2]['jobsetID'] + strNewIDs = 'PandaID=%s JobsetID=%s JobID=%s' % (tmpItem[0],newJobsetID,newJobDefinitionID) + strNewIDsList.append(strNewIDs) + if pandaIDsTobeKilled != []: + strNewJobIDs = "JobsetID=%s JobID=%s" % (newJobsetID,newJobDefinitionID) + _logger.debug("%s kill jobs for JobID=%s -> new %s : %s" % \ + (self.token,self.jobID,strNewJobIDs,str(pandaIDsTobeKilled))) + for tmpIdx,tmpPandaID in enumerate(pandaIDsTobeKilled): + if not self.forFailed: + self.taskBuffer.killJobs([tmpPandaID],strNewIDsList[tmpIdx],'8',True) + else: + self.taskBuffer.killJobs([tmpPandaID],strNewIDsList[tmpIdx],'7',True) + # send brokerage info + if not self.forFailed: + tmpMsg = 'action=rebrokerage ntry=%s ' % self.pandaJobList[0].specialHandling.split(',').count('rebro') + else: + tmpMsg = 'action=serverretry ntry=%s ' % self.pandaJobList[0].specialHandling.split(',').count('sretry') + tmpMsg += 'old_jobset=%s old_jobdef=%s old_site=%s' % (self.job.jobsetID,self.jobID,self.job.computingSite) + self.brokerageInfo.append(tmpMsg) + brokerage.broker.sendMsgToLoggerHTTP(self.brokerageInfo,self.pandaJobList[0]) + # succeeded + _logger.debug("%s completed for JobID=%s" % (self.token,self.jobID)) + return True + + + # check DDM response + def isDQ2ok(self,out): + if out.find("DQ2 internal server exception") != -1 \ + or out.find("An error occurred on the central catalogs") != -1 \ + or out.find("MySQL server has gone away") != -1 \ + or out == '()': + return False + return True + + + # get list of datasets + def getListDatasets(self,dataset): + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug("%s %s/%s listDatasets %s" % (self.token,iDDMTry,nTry,dataset)) + status,out = ddm.DQ2.main('listDatasets',dataset,0,True) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + _logger.error('%s bad DQ2 response for %s' % (self.token,dataset)) + return False,{} + try: + # convert res to map + exec "tmpDatasets = %s" % out + # remove _sub/_dis + resList = [] + for tmpDS in tmpDatasets.keys(): + if re.search('(_sub|_dis)\d+$',tmpDS) == None and re.search('(_shadow$',tmpDS) == None: + resList.append(tmpDS) + _logger.debug('%s getListDatasets->%s' % 
(self.token,str(resList))) + return True,resList + except: + _logger.error(self.token+' '+out) + _logger.error('%s could not convert HTTP-res to datasets for %s' % (self.token,dataset)) + return False,{} + + + # get list of replicas for a dataset + def getListDatasetReplicas(self,dataset): + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug("%s %s/%s listDatasetReplicas %s" % (self.token,iDDMTry,nTry,dataset)) + status,out = ddm.DQ2.main('listDatasetReplicas',dataset,0,None,False) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + _logger.error('%s bad DQ2 response for %s' % (self.token,dataset)) + return False,{} + try: + # convert res to map + exec "tmpRepSites = %s" % out + _logger.debug('%s getListDatasetReplicas->%s' % (self.token,str(tmpRepSites))) + return True,tmpRepSites + except: + _logger.error(self.token+' '+out) + _logger.error('%s could not convert HTTP-res to replica map for %s' % (self.token,dataset)) + return False,{} + + + # get replicas for a container + def getListDatasetReplicasInContainer(self,container): + # response for failure + resForFailure = False,{} + # get datasets in container + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug('%s %s/%s listDatasetsInContainer %s' % (self.token,iDDMTry,nTry,container)) + status,out = ddm.DQ2.main('listDatasetsInContainer',container) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + _logger.error('%s bad DQ2 response for %s' % (self.token,container)) + return resForFailure + datasets = [] + try: + # convert to list + exec "datasets = %s" % out + except: + _logger.error('%s could not convert HTTP-res to dataset list for %s' % (self.token,container)) + return resForFailure + # loop over all datasets + allRepMap = {} + for dataset in datasets: + # get replicas + status,tmpRepSites = self.getListDatasetReplicas(dataset) + if not status: + return resForFailure + # append + allRepMap[dataset] = tmpRepSites + # return + _logger.debug('%s getListDatasetReplicasInContainer done') + return True,allRepMap + + + # delete original locations + def deleteDatasetReplicas(self,datasets): + # loop over all datasets + for dataset in datasets: + # get locations + status,tmpRepSites = self.getListDatasetReplicas(dataset) + if not status: + return False + # no replicas + if len(tmpRepSites.keys()) == 0: + continue + # delete + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug("%s %s/%s deleteDatasetReplicas %s" % (self.token,iDDMTry,nTry,dataset)) + status,out = ddm.DQ2.main('deleteDatasetReplicas',dataset,tmpRepSites.keys()) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + _logger.error('%s bad DQ2 response for %s' % (self.token,dataset)) + return False + _logger.debug(self.token+' '+out) + # return + _logger.debug('%s deleted replicas for %s' % (self.token,str(datasets))) + return True + + + # check if datasets are empty + def checkDatasetContents(self,datasets): + # loop over all datasets + for dataset in datasets: + # check + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug("%s %s/%s getNumberOfFiles %s" % (self.token,iDDMTry,nTry,dataset)) + status,out = ddm.DQ2.main('getNumberOfFiles',dataset) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + 
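Every DQ2 call in this class follows the same retry idiom: up to nTry attempts, with responses that isDQ2ok flags as server-side hiccups retried after a 60 second pause. A generic sketch of that pattern under simplified assumptions (the callable, its arguments, and the retriable-output test are placeholders):

import time

def callWithRetry(func, *args):
    # generic form of the nTry loop used by the DQ2 helpers above
    nTry = 3
    status, out = None, None
    for iDDMTry in range(nTry):
        status, out = func(*args)
        if status != 0 or 'server exception' in out:   # simplified stand-in for isDQ2ok
            time.sleep(60)   # transient problem, retry after a pause
        else:
            break
    return status, out

print callWithRetry(lambda: (0, '{}'))   # (0, '{}')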
# result + if status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + _logger.error('%s bad DQ2 response for %s' % (self.token,dataset)) + return False + # convert to int + _logger.debug(self.token+' '+out) + try: + nFile = int(out) + # not empty + if nFile != 0: + _logger.error('%s %s is not empty' % (self.token,dataset)) + return False + except: + _logger.error("%s could not convert HTTP-res to nFiles" % (self.token,dataset)) + return False + # all OK + return True + + + # register dataset + def registerNewDataset(self,dataset,container=''): + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug("%s %s/%s registerNewDataset %s" % (self.token,iDDMTry,nTry,dataset)) + status,out = ddm.DQ2.main('registerNewDataset',dataset) + if out.find('DQDatasetExistsException') != -1: + break + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if out.find('DQDatasetExistsException') != -1: + # ignore DQDatasetExistsException + pass + elif status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + _logger.error('%s failed to register new dataset %s' % (self.token,dataset)) + return False + # remove /CN=proxy and /CN=limited from DN + tmpRealDN = self.job.prodUserID + tmpRealDN = re.sub('/CN=limited proxy','',tmpRealDN) + tmpRealDN = re.sub('/CN=proxy','',tmpRealDN) + status,out = dq2Common.parse_dn(tmpRealDN) + if status != 0: + _logger.error(self.token+' '+out) + _logger.error('%s failed to truncate DN:%s' % (self.token,self.job.prodUserID)) + return False + tmpRealDN = out + # set owner + for iDDMTry in range(nTry): + _logger.debug("%s %s/%s setMetaDataAttribute %s %s" % (self.token,iDDMTry,nTry,dataset,tmpRealDN)) + status,out = ddm.DQ2.main('setMetaDataAttribute',dataset,'owner',tmpRealDN) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + _logger.error('%s failed to set owner to dataset %s' % (self.token,dataset)) + return False + # add to contaner + if container != '' and container.endswith('/'): + for iDDMTry in range(nTry): + _logger.debug("%s %s/%s registerDatasetsInContainer %s to %s" % (self.token,iDDMTry,nTry,dataset,container)) + status,out = ddm.DQ2.main('registerDatasetsInContainer',container,[dataset]) + if out.find('DQContainerAlreadyHasDataset') != -1: + break + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if out.find('DQContainerAlreadyHasDataset') != -1: + # ignore DQContainerAlreadyHasDataset + pass + elif status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + _logger.error('%s add %s to container:%s' % (self.token,dataset,container)) + return False + # return + return True + + + # get list of dataset used by the job + def getListDatasetsUsedByJob(self,mapDsLFN): + # response for failure + resForFailure = False,[] + # loop over all datasets + retList = [] + for tmpDsContainer,tmpLFNs in mapDsLFN.iteritems(): + # not a container + if not tmpDsContainer.endswith('/'): + if not tmpDsContainer in retList: + retList.append(tmpDsContainer) + continue + # get datasets in container + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug('%s %s/%s listDatasetsInContainer %s' % (self.token,iDDMTry,nTry,tmpDsContainer)) + status,out = ddm.DQ2.main('listDatasetsInContainer',tmpDsContainer) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + 
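registerNewDataset above sets the dataset owner from the job's prodUserID after stripping proxy components from the DN and passing the result through dq2Common.parse_dn. The string cleanup step alone looks like this; the sample DN is fictitious:

import re

prodUserID = '/DC=ch/DC=cern/OU=Users/CN=Jane Doe/CN=proxy/CN=limited proxy'
tmpRealDN = prodUserID
tmpRealDN = re.sub('/CN=limited proxy', '', tmpRealDN)
tmpRealDN = re.sub('/CN=proxy', '', tmpRealDN)
print tmpRealDN   # /DC=ch/DC=cern/OU=Users/CN=Jane Doe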
_logger.error('%s bad DQ2 response for %s' % (self.token,tmpDsContainer)) + return resForFailure + tmpDatasets = [] + try: + # convert to list + exec "tmpDatasets = %s" % out + except: + _logger.error('%s could not convert HTTP-res to dataset list for %s' % (self.token,tmpDsContainer)) + return resForFailure + # get files in dataset + for tmpDS in tmpDatasets: + if tmpDS in retList: + continue + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug('%s %s/%s listFilesInDataset %s' % (self.token,iDDMTry,nTry,tmpDS)) + status,out = ddm.DQ2.main('listFilesInDataset',tmpDS) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + if status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + _logger.error('%s bad DQ2 response for %s' % (self.token,tmpDS)) + return resForFailure + # get LFN map + tmpMapDQ2 = {} + try: + # convert to list + exec "tmpMapDQ2 = %s[0]" % out + for tmpGUID,tmpVal in tmpMapDQ2.iteritems(): + # check if a file in DS is used by the job + if tmpVal['lfn'] in tmpLFNs: + # append + if not tmpDS in retList: + retList.append(tmpDS) + break + except: + _logger.error('%s could not convert HTTP-res to LFN map for %s' % (self.token,tmpDS)) + return resForFailure + # return + _logger.debug('%s getListDatasetsUsedByJob done %s' % (self.token,str(retList))) + return True,retList + + + # refresh replica info in needed + def refreshReplicaInfo(self,unknownSites): + for tmpDS,sites in unknownSites.iteritems(): + nTry = 3 + for iDDMTry in range(nTry): + _logger.debug("%s %s/%s listFileReplicasBySites %s %s" % (self.token,iDDMTry,nTry,tmpDS,str(sites))) + status,out = ddm.DQ2_iter.listFileReplicasBySites(tmpDS,0,sites,0,300) + if status != 0 or (not self.isDQ2ok(out)): + time.sleep(60) + else: + break + # result + if status != 0 or out.startswith('Error'): + _logger.error(self.token+' '+out) + _logger.error('%s bad DQ2 response for %s' % (self.token,dataset)) + # return + return True + + + # check rev to avoid too many rebrokerage + def checkRev(self): + # check specialHandling + if self.job.specialHandling in [None,'NULL','']: + revNum = 0 + else: + revNum = self.job.specialHandling.split(',').count('rebro') + revNum += self.job.specialHandling.split(',').count('sretry') + # check with limit + if revNum < 5: + return True + return False + + + # make buildJob for re-brokerage + def makeNewBuildJobForRebrokerage(self,buildJob): + # new libDS + oldLibDS = buildJob.destinationDBlock + match = re.search('_rev(\d+)$',oldLibDS) + if match == None: + newLibDS = oldLibDS + '__id%s_rev%s' % (self.job.jobDefinitionID,self.revNum) + else: + newLibDS = re.sub('_rev(\d+)$','_rev%s' % self.revNum,oldLibDS) + # reset parameters + buildJob.PandaID = None + buildJob.jobStatus = None + buildJob.commandToPilot = None + buildJob.schedulerID = None + buildJob.pilotID = None + for attr in buildJob._attributes: + if attr.endswith('ErrorCode') or attr.endswith('ErrorDiag'): + setattr(buildJob,attr,None) + buildJob.transExitCode = None + buildJob.creationTime = datetime.datetime.utcnow() + buildJob.modificationTime = buildJob.creationTime + buildJob.startTime = None + buildJob.endTime = None + buildJob.destinationDBlock = newLibDS + buildJob.jobParameters = re.sub(oldLibDS,newLibDS,buildJob.jobParameters) + for tmpFile in buildJob.Files: + tmpFile.row_ID = None + tmpFile.GUID = None + tmpFile.status = 'unknown' + tmpFile.PandaID = None + tmpFile.dataset = newLibDS + tmpFile.destinationDBlock = tmpFile.dataset + tmpFile.lfn = re.sub(oldLibDS,newLibDS,tmpFile.lfn) + 
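checkRev above caps the number of automatic reassignments by counting how many times 'rebro' and 'sretry' already appear in the job's specialHandling field and allowing another pass only while the total stays below 5. In isolation, with an invented specialHandling value:

specialHandling = 'rebro,sretry,rebro'
revNum = specialHandling.split(',').count('rebro')
revNum += specialHandling.split(',').count('sretry')
print revNum, revNum < 5   # 3 True -> one more rebrokerage is still allowed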
return buildJob,oldLibDS,newLibDS diff --git a/current/pandaserver/userinterface/UserIF.py b/current/pandaserver/userinterface/UserIF.py new file mode 100755 index 000000000..31aa5cc0c --- /dev/null +++ b/current/pandaserver/userinterface/UserIF.py @@ -0,0 +1,1570 @@ +''' +provide web interface to users + +''' + +import re +import sys +import time +import types +import cPickle as pickle +import jobdispatcher.Protocol as Protocol +import brokerage.broker +import taskbuffer.ProcessGroups +from config import panda_config +from taskbuffer.JobSpec import JobSpec +from taskbuffer.WrappedPickle import WrappedPickle +from brokerage.SiteMapper import SiteMapper +from pandalogger.PandaLogger import PandaLogger +from RbLauncher import RbLauncher +from ReBroker import ReBroker +from taskbuffer import PrioUtil +from dataservice.DDM import dq2Info + +# logger +_logger = PandaLogger().getLogger('UserIF') + + +# main class +class UserIF: + # constructor + def __init__(self): + self.taskBuffer = None + + + # initialize + def init(self,taskBuffer): + self.taskBuffer = taskBuffer + + + # submit jobs + def submitJobs(self,jobsStr,user,host,userFQANs,prodRole=False,toPending=False): + try: + # deserialize jobspecs + jobs = WrappedPickle.loads(jobsStr) + _logger.debug("submitJobs %s len:%s FQAN:%s" % (user,len(jobs),str(userFQANs))) + maxJobs = 5000 + if len(jobs) > maxJobs: + _logger.error("too may jobs more than %s" % maxJobs) + jobs = jobs[:maxJobs] + except: + type, value, traceBack = sys.exc_info() + _logger.error("submitJobs : %s %s" % (type,value)) + jobs = [] + # check prodSourceLabel + try: + goodProdSourceLabel = True + for tmpJob in jobs: + # prevent internal jobs from being submitted from outside + if tmpJob.prodSourceLabel in taskbuffer.ProcessGroups.internalSourceLabels: + _logger.error("submitJobs %s wrong prodSourceLabel=%s" % (user,tmpJob.prodSourceLabel)) + goodProdSourceLabel = False + break + # check production role + if tmpJob.prodSourceLabel in ['managed']: + if not prodRole: + _logger.error("submitJobs %s missing prod-role for prodSourceLabel=%s" % (user,tmpJob.prodSourceLabel)) + goodProdSourceLabel = False + break + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("submitJobs : checking goodProdSourceLabel %s %s" % (errType,errValue)) + goodProdSourceLabel = False + # reject injection for bad prodSourceLabel + if not goodProdSourceLabel: + return "ERROR: production role is required for production jobs" + # store jobs + ret = self.taskBuffer.storeJobs(jobs,user,forkSetupper=True,fqans=userFQANs, + hostname=host,toPending=toPending) + _logger.debug("submitJobs %s ->:%s" % (user,len(ret))) + # serialize + return pickle.dumps(ret) + + + # logger interface + def sendLogInfo(self,user,msgType,msgListStr): + try: + # deserialize message + msgList = WrappedPickle.loads(msgListStr) + # short user name + cUID = self.taskBuffer.cleanUserID(user) + # logging + iMsg = 0 + for msgBody in msgList: + # make message + message = "dn='%s' %s" % (cUID,msgBody) + # send message to logger + if msgType in ['analy_brokerage']: + brokerage.broker.sendMsgToLogger(message) + # get logger + _pandaLogger = PandaLogger() + _pandaLogger.lock() + _pandaLogger.setParams({'Type':msgType}) + logger = _pandaLogger.getHttpLogger(panda_config.loggername) + # add message + logger.info(message) + # release HTTP handler + _pandaLogger.release() + # sleep + iMsg += 1 + if iMsg % 5 == 0: + time.sleep(1) + except: + pass + # return + return True + + + # run task assignment + def runTaskAssignment(self,jobsStr): + 
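UserIF.submitJobs above truncates oversized submissions at 5000 jobs and rejects jobs whose prodSourceLabel is reserved for internal use, as well as 'managed' jobs submitted without a production role. A simplified standalone version of that gate; internalSourceLabels and the dummy job class are placeholders, since the real values live in taskbuffer.ProcessGroups and JobSpec:

def goodProdSourceLabel(jobs, prodRole, internalSourceLabels=['install']):
    # placeholder list; the real labels come from taskbuffer.ProcessGroups
    for tmpJob in jobs:
        if tmpJob.prodSourceLabel in internalSourceLabels:
            return False
        if tmpJob.prodSourceLabel in ['managed'] and not prodRole:
            return False
    return True

class _DummyJob:
    prodSourceLabel = 'managed'

print goodProdSourceLabel([_DummyJob()], prodRole=False)   # False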
try: + # deserialize jobspecs + jobs = WrappedPickle.loads(jobsStr) + except: + type, value, traceBack = sys.exc_info() + _logger.error("runTaskAssignment : %s %s" % (type,value)) + jobs = [] + # run + ret = self.taskBuffer.runTaskAssignment(jobs) + # serialize + return pickle.dumps(ret) + + + # get serial number for group job + def getSerialNumberForGroupJob(self,name): + # get + ret = self.taskBuffer.getSerialNumberForGroupJob(name) + # serialize + return pickle.dumps(ret) + + + # change job priorities + def changeJobPriorities(self,user,prodRole,newPrioMapStr): + # check production role + if not prodRole: + return False,"production role is required" + try: + # deserialize map + newPrioMap = WrappedPickle.loads(newPrioMapStr) + _logger.debug("changeJobPriorities %s : %s" % (user,str(newPrioMap))) + # change + ret = self.taskBuffer.changeJobPriorities(newPrioMap) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("changeJobPriorities : %s %s" % (errType,errValue)) + return False,'internal server error' + # serialize + return ret + + + # run rebrokerage + def runReBrokerage(self,dn,jobID,cloud,excludedSite,forceRebro): + returnVal = "True" + try: + # lock job in simulation mode to check + checker = ReBroker(self.taskBuffer,simulation=True,userRequest=True) + stLock,retLock = checker.lockJob(dn,jobID) + # failed + if not stLock: + returnVal = "ERROR: "+retLock + return returnVal + # continue to run rebrokerage in background + if excludedSite in [None,'']: + # use None for empty excludedSite + excludedSite = None + _logger.debug("runReBrokerage %s JobID:%s cloud=%s ex=%s forceOpt=%s" % (dn,jobID,cloud,str(excludedSite),forceRebro)) + # instantiate ReBroker + thr = RbLauncher(dn,jobID,cloud,excludedSite) + # start ReBroker + thr.start() + except: + errType,errValue,errTraceBack = sys.exc_info() + _logger.error("runReBrokerage: %s %s" % (errType,errValue)) + returnVal = "ERROR: runReBrokerage crashed" + # return + return returnVal + + + # retry failed subjobs in running job + def retryFailedJobsInActive(self,dn,jobID): + returnVal = False + try: + _logger.debug("retryFailedJobsInActive %s JobID:%s" % (dn,jobID)) + cUID = self.taskBuffer.cleanUserID(dn) + # instantiate ReBroker + tmpRet = self.taskBuffer.retryJobsInActive(cUID,jobID) + returnVal = True + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("retryFailedJobsInActive: %s %s" % (errType,errValue)) + returnVal = "ERROR: server side crash" + # return + return returnVal + + + # set debug mode + def setDebugMode(self,dn,pandaID,prodManager,modeOn): + ret = self.taskBuffer.setDebugMode(dn,pandaID,prodManager,modeOn) + # return + return ret + + + # insert sandbox file info + def insertSandboxFileInfo(self,userName,hostName,fileName,fileSize,checkSum): + ret = self.taskBuffer.insertSandboxFileInfo(userName,hostName,fileName,fileSize,checkSum) + # return + return ret + + + # check duplicated sandbox file + def checkSandboxFile(self,userName,fileSize,checkSum): + ret = self.taskBuffer.checkSandboxFile(userName,fileSize,checkSum) + # return + return ret + + + # get job status + def getJobStatus(self,idsStr): + try: + # deserialize jobspecs + ids = WrappedPickle.loads(idsStr) + _logger.debug("getJobStatus len : %s" % len(ids)) + maxIDs = 5500 + if len(ids) > maxIDs: + _logger.error("too long ID list more than %s" % maxIDs) + ids = ids[:maxIDs] + except: + type, value, traceBack = sys.exc_info() + _logger.error("getJobStatus : %s %s" % (type,value)) + ids = [] + _logger.debug("getJobStatus start : %s" % ids) + # peek 
jobs + ret = self.taskBuffer.peekJobs(ids) + _logger.debug("getJobStatus end") + # serialize + return pickle.dumps(ret) + + + # get PandaID with jobexeID + def getPandaIDwithJobExeID(self,idsStr): + try: + # deserialize jobspecs + ids = WrappedPickle.loads(idsStr) + _logger.debug("getPandaIDwithJobExeID len : %s" % len(ids)) + maxIDs = 5500 + if len(ids) > maxIDs: + _logger.error("too long ID list more than %s" % maxIDs) + ids = ids[:maxIDs] + except: + errtype,errvalue = sys.exc_info()[:2] + _logger.error("getPandaIDwithJobExeID : %s %s" % (errtype,errvalue)) + ids = [] + _logger.debug("getPandaIDwithJobExeID start : %s" % ids) + # peek jobs + ret = self.taskBuffer.getPandaIDwithJobExeID(ids) + _logger.debug("getPandaIDwithJobExeID end") + # serialize + return pickle.dumps(ret) + + + # get assigned cloud for tasks + def seeCloudTask(self,idsStr): + try: + # deserialize jobspecs + ids = WrappedPickle.loads(idsStr) + except: + type, value, traceBack = sys.exc_info() + _logger.error("seeCloudTask : %s %s" % (type,value)) + ids = [] + _logger.debug("seeCloudTask start : %s" % ids) + # peek jobs + ret = {} + for id in ids: + tmpRet = self.taskBuffer.seeCloudTask(id) + ret[id] = tmpRet + _logger.debug("seeCloudTask end") + # serialize + return pickle.dumps(ret) + + + # get active datasets + def getActiveDatasets(self,computingSite,prodSourceLabel): + # run + ret = self.taskBuffer.getActiveDatasets(computingSite,prodSourceLabel) + # return + return ret + + + # get assigning task + def getAssigningTask(self): + # run + ret = self.taskBuffer.getAssigningTask() + # serialize + return pickle.dumps(ret) + + + # set task by user + def setCloudTaskByUser(self,user,tid,cloud,status): + # run + ret = self.taskBuffer.setCloudTaskByUser(user,tid,cloud,status) + return ret + + + # add files to memcached + def addFilesToMemcached(self,site,node,files): + # add + ret = self.taskBuffer.addFilesToMemcached(site,node,files) + # return + return ret + + + # delete files from memcached + def deleteFilesFromMemcached(self,site,node,files): + # delete + ret = self.taskBuffer.deleteFilesFromMemcached(site,node,files) + # return + return ret + + + # flush memcached + def flushMemcached(self,site,node): + # flush + ret = self.taskBuffer.flushMemcached(site,node) + # return + return ret + + + # check files with memcached + def checkFilesWithMemcached(self,site,node,files): + # check + ret = self.taskBuffer.checkFilesWithMemcached(site,node,files) + # return + return ret + + + # get job statistics + def getJobStatistics(self,sourcetype=None): + # get job statistics + ret = self.taskBuffer.getJobStatisticsForExtIF(sourcetype) + # serialize + return pickle.dumps(ret) + + + # get highest prio jobs + def getHighestPrioJobStat(self,perPG=False,useMorePG=False): + # get job statistics + ret = self.taskBuffer.getHighestPrioJobStat(perPG,useMorePG) + # serialize + return pickle.dumps(ret) + + + # get queued analysis jobs at a site + def getQueuedAnalJobs(self,site,dn): + # get job statistics + ret = self.taskBuffer.getQueuedAnalJobs(site,dn) + # serialize + return pickle.dumps(ret) + + + # get job statistics for Bamboo + def getJobStatisticsForBamboo(self,useMorePG=False): + # get job statistics + ret = self.taskBuffer.getJobStatisticsForBamboo(useMorePG) + # serialize + return pickle.dumps(ret) + + + # get job statistics per site + def getJobStatisticsPerSite(self,predefined=False,workingGroup='',countryGroup='',jobType='', + minPriority=None,readArchived=True): + # get job statistics + ret = 
self.taskBuffer.getJobStatistics(readArchived,predefined,workingGroup,countryGroup,jobType, + minPriority=minPriority) + # serialize + return pickle.dumps(ret) + + + # get the number of waiting jobs per site and use + def getJobStatisticsPerUserSite(self): + # get job statistics + ret = self.taskBuffer.getJobStatisticsPerUserSite() + # serialize + return pickle.dumps(ret) + + + # get job statistics per site with label + def getJobStatisticsWithLabel(self,site): + # get job statistics + ret = self.taskBuffer.getJobStatisticsWithLabel(site) + # serialize + return pickle.dumps(ret) + + + # query PandaIDs + def queryPandaIDs(self,idsStr): + # deserialize IDs + ids = WrappedPickle.loads(idsStr) + # query PandaIDs + ret = self.taskBuffer.queryPandaIDs(ids) + # serialize + return pickle.dumps(ret) + + + # get number of analysis jobs per user + def getNUserJobs(self,siteName,nJobs): + # get + ret = self.taskBuffer.getNUserJobs(siteName,nJobs) + # serialize + return pickle.dumps(ret) + + + # query job info per cloud + def queryJobInfoPerCloud(self,cloud,schedulerID): + # query PandaIDs + ret = self.taskBuffer.queryJobInfoPerCloud(cloud,schedulerID) + # serialize + return pickle.dumps(ret) + + + # query PandaIDs at site + def getPandaIDsSite(self,site,status,limit): + # query PandaIDs + ret = self.taskBuffer.getPandaIDsSite(site,status,limit) + # serialize + return pickle.dumps(ret) + + + # get PandaIDs to be updated in prodDB + def getJobsToBeUpdated(self,limit,lockedby): + # query PandaIDs + ret = self.taskBuffer.getPandaIDsForProdDB(limit,lockedby) + # serialize + return pickle.dumps(ret) + + + # update prodDBUpdateTimes + def updateProdDBUpdateTimes(self,paramsStr): + # deserialize IDs + params = WrappedPickle.loads(paramsStr) + # get jobs + ret = self.taskBuffer.updateProdDBUpdateTimes(params) + # serialize + return pickle.dumps(True) + + + # query last files in datasets + def queryLastFilesInDataset(self,datasetStr): + # deserialize names + datasets = WrappedPickle.loads(datasetStr) + # get files + ret = self.taskBuffer.queryLastFilesInDataset(datasets) + # serialize + return pickle.dumps(ret) + + + # get input files currently in used for analysis + def getFilesInUseForAnal(self,outDataset): + # get files + ret = self.taskBuffer.getFilesInUseForAnal(outDataset) + # serialize + return pickle.dumps(ret) + + + # get list of dis dataset to get input files in shadow + def getDisInUseForAnal(self,outDataset): + # get files + ret = self.taskBuffer.getDisInUseForAnal(outDataset) + # serialize + return pickle.dumps(ret) + + + # get input LFNs currently in use for analysis with shadow dis + def getLFNsInUseForAnal(self,inputDisListStr): + # deserialize IDs + inputDisList = WrappedPickle.loads(inputDisListStr) + # get files + ret = self.taskBuffer.getLFNsInUseForAnal(inputDisList) + # serialize + return pickle.dumps(ret) + + + # kill jobs + def killJobs(self,idsStr,user,host,code,prodManager,useMailAsID,fqans): + # deserialize IDs + ids = WrappedPickle.loads(idsStr) + if not isinstance(ids,types.ListType): + ids = [ids] + _logger.debug("killJob : %s %s %s %s %s" % (user,code,prodManager,fqans,ids)) + try: + if useMailAsID: + _logger.debug("killJob : getting mail address for %s" % user) + realDN = re.sub('/CN=limited proxy','',user) + realDN = re.sub('(/CN=proxy)+','',realDN) + nTry = 3 + for iDDMTry in range(nTry): + status,out = dq2Info.finger(realDN) + if status == 0: + exec "userInfo=%s" % out + _logger.debug("killJob : %s is converted to %s" % (user,userInfo['email'])) + user = userInfo['email'] + 
break + time.sleep(1) + except: + errType,errValue = sys.exc_info()[:2] + _logger.error("killJob : failed to convert email address %s : %s %s" % (user,errType,errValue)) + # get working groups with prod role + wgProdRole = [] + for fqan in fqans: + tmpMatch = re.search('/atlas/([^/]+)/Role=production',fqan) + if tmpMatch != None: + # ignore usatlas since it is used as atlas prod role + tmpWG = tmpMatch.group(1) + if not tmpWG in ['','usatlas']+wgProdRole: + wgProdRole.append(tmpWG) + # group production + wgProdRole.append('gr_%s' % tmpWG) + # kill jobs + ret = self.taskBuffer.killJobs(ids,user,code,prodManager,wgProdRole) + # logging + try: + # make message + message = '%s - PandaID =' % host + maxID = 10 + for id in ids[:maxID]: + message += ' %s' % id + if len(ids) > maxID: + message += ' ...' + # get logger + _pandaLogger = PandaLogger() + _pandaLogger.lock() + _pandaLogger.setParams({'Type':'killJobs','User':user}) + logger = _pandaLogger.getHttpLogger(panda_config.loggername) + # add message + logger.info(message) + # release HTTP handler + _pandaLogger.release() + except: + pass + # serialize + return pickle.dumps(ret) + + + # reassign jobs + def reassignJobs(self,idsStr,user,host,forPending): + # deserialize IDs + ids = WrappedPickle.loads(idsStr) + # reassign jobs + ret = self.taskBuffer.reassignJobs(ids,forkSetupper=True,forPending=forPending) + # logging + try: + # make message + message = '%s - PandaID =' % host + maxID = 10 + for id in ids[:maxID]: + message += ' %s' % id + if len(ids) > maxID: + message += ' ...' + # get logger + _pandaLogger = PandaLogger() + _pandaLogger.lock() + _pandaLogger.setParams({'Type':'reassignJobs','User':user}) + logger = _pandaLogger.getHttpLogger(panda_config.loggername) + # add message + logger.info(message) + # release HTTP handler + _pandaLogger.release() + except: + pass + # serialize + return pickle.dumps(ret) + + + # resubmit jobs + def resubmitJobs(self,idsStr): + # deserialize IDs + ids = WrappedPickle.loads(idsStr) + # kill jobs + ret = self.taskBuffer.resubmitJobs(ids) + # serialize + return pickle.dumps(ret) + + + # get list of site spec + def getSiteSpecs(self,siteType='analysis'): + # get analysis site list + specList = {} + siteMapper = SiteMapper(self.taskBuffer) + for id,spec in siteMapper.siteSpecList.iteritems(): + if siteType == 'all' or spec.type == siteType: + # convert to map + tmpSpec = {} + for attr in spec._attributes: + tmpSpec[attr] = getattr(spec,attr) + specList[id] = tmpSpec + # serialize + return pickle.dumps(specList) + + + # get list of cloud spec + def getCloudSpecs(self): + # get cloud list + siteMapper = SiteMapper(self.taskBuffer) + # serialize + return pickle.dumps(siteMapper.cloudSpec) + + + # get list of cache prefix + def getCachePrefixes(self): + # get + ret = self.taskBuffer.getCachePrefixes() + # serialize + return pickle.dumps(ret) + + + # get nPilots + def getNumPilots(self): + # get nPilots + ret = self.taskBuffer.getCurrentSiteData() + numMap = {} + for siteID,siteNumMap in ret.iteritems(): + nPilots = 0 + # nPilots = getJob+updateJob + if siteNumMap.has_key('getJob'): + nPilots += siteNumMap['getJob'] + if siteNumMap.has_key('updateJob'): + nPilots += siteNumMap['updateJob'] + # append + numMap[siteID] = {'nPilots':nPilots} + # serialize + return pickle.dumps(numMap) + + + # run brokerage + def runBrokerage(self,sitesStr,cmtConfig,atlasRelease,trustIS=False,processingType=None, + dn=None,loggingFlag=False,memorySize=None,workingGroup=None,fqans=[], + 
nJobs=None,preferHomeCountry=False,siteReliability=None,maxCpuCount=None): + if not loggingFlag: + ret = 'NULL' + else: + ret = {'site':'NULL','logInfo':[]} + try: + # deserialize sites + sites = WrappedPickle.loads(sitesStr) + # instantiate siteMapper + siteMapper = SiteMapper(self.taskBuffer) + # instantiate job + job = JobSpec() + job.AtlasRelease = atlasRelease + job.cmtConfig = cmtConfig + if processingType != None: + job.processingType = processingType + if memorySize != None: + job.minRamCount = memorySize + if workingGroup != None: + userDefinedWG = True + validWorkingGroup = True + job.workingGroup = workingGroup + else: + userDefinedWG = False + validWorkingGroup = False + if maxCpuCount != None: + job.maxCpuCount = maxCpuCount + # get parameters related to priority + withProdRole,workingGroup,priorityOffset,serNum,weight = self.taskBuffer.getPrioParameters([job],dn,fqans, + userDefinedWG, + validWorkingGroup) + # get min priority using nJobs + try: + nJobs = long(nJobs) + except: + # use 200 as a default # of jobs + nJobs =200 + minPrio = PrioUtil.calculatePriority(priorityOffset,serNum+nJobs,weight) + # get countryGroup + prefCountries = [] + if preferHomeCountry: + for tmpFQAN in fqans: + match = re.search('^/atlas/([^/]+)/',tmpFQAN) + if match != None: + tmpCountry = match.group(1) + # use country code or usatlas + if len(tmpCountry) == 2: + prefCountries.append(tmpCountry) + break + # usatlas + if tmpCountry in ['usatlas']: + prefCountries.append('us') + break + # run brokerage + _logger.debug("runBrokerage for dn=%s FQAN=%s minPrio=%s preferred:%s:%s" % (dn,str(fqans),minPrio, + preferHomeCountry, + str(prefCountries))) + brokerage.broker.schedule([job],self.taskBuffer,siteMapper,True,sites,trustIS,dn, + reportLog=loggingFlag,minPriority=minPrio,preferredCountries=prefCountries, + siteReliability=siteReliability) + # get computingSite + if not loggingFlag: + ret = job.computingSite + else: + ret = pickle.dumps({'site':job.computingSite,'logInfo':job.brokerageErrorDiag}) + except: + type, value, traceBack = sys.exc_info() + _logger.error("runBrokerage : %s %s" % (type,value)) + return ret + + + # get script for offline running + def getScriptOfflineRunning(self,pandaID): + # register + ret = self.taskBuffer.getScriptOfflineRunning(pandaID) + # return + return ret + + + # register proxy key + def registerProxyKey(self,params): + # register + ret = self.taskBuffer.registerProxyKey(params) + # return + return ret + + + # get client version + def getPandaClientVer(self): + # get + ret = self.taskBuffer.getPandaClientVer() + # return + return ret + + + # get proxy key + def getProxyKey(self,dn): + # get files + ret = self.taskBuffer.getProxyKey(dn) + # serialize + return pickle.dumps(ret) + + + # get slimmed file info with PandaIDs + def getSlimmedFileInfoPandaIDs(self,pandaIDsStr,dn): + try: + # deserialize IDs + pandaIDs = WrappedPickle.loads(pandaIDsStr) + # truncate + maxIDs = 5500 + if len(pandaIDs) > maxIDs: + _logger.error("too long ID list more than %s" % maxIDs) + pandaIDs = pandaIDs[:maxIDs] + # get + _logger.debug("getSlimmedFileInfoPandaIDs start : %s %s" % (dn,len(pandaIDs))) + ret = self.taskBuffer.getSlimmedFileInfoPandaIDs(pandaIDs) + _logger.debug("getSlimmedFileInfoPandaIDs end") + except: + ret = {} + # serialize + return pickle.dumps(ret) + + + # get JobIDs in a time range + def getJobIDsInTimeRange(self,dn,timeRange): + # get IDs + ret = self.taskBuffer.getJobIDsInTimeRange(dn,timeRange) + # serialize + return pickle.dumps(ret) + + + # get PandaIDs for a 
JobID + def getPandIDsWithJobID(self,dn,jobID,nJobs): + # get IDs + ret = self.taskBuffer.getPandIDsWithJobID(dn,jobID,nJobs) + # serialize + return pickle.dumps(ret) + + + # check merge job generation status + def checkMergeGenerationStatus(self,dn,jobID): + # check + ret = self.taskBuffer.checkMergeGenerationStatus(dn,jobID) + # serialize + return pickle.dumps(ret) + + + # get full job status + def getFullJobStatus(self,idsStr,dn): + try: + # deserialize jobspecs + ids = WrappedPickle.loads(idsStr) + # truncate + maxIDs = 5500 + if len(ids) > maxIDs: + _logger.error("too long ID list more than %s" % maxIDs) + ids = ids[:maxIDs] + except: + type, value, traceBack = sys.exc_info() + _logger.error("getFullJobStatus : %s %s" % (type,value)) + ids = [] + _logger.debug("getFullJobStatus start : %s %s" % (dn,str(ids))) + # peek jobs + ret = self.taskBuffer.getFullJobStatus(ids) + _logger.debug("getFullJobStatus end") + # serialize + return pickle.dumps(ret) + + + # add account to siteaccess + def addSiteAccess(self,siteID,dn): + # add + ret = self.taskBuffer.addSiteAccess(siteID,dn) + # serialize + return pickle.dumps(ret) + + + # list site access + def listSiteAccess(self,siteID,dn,longFormat=False): + # list + ret = self.taskBuffer.listSiteAccess(siteID,dn,longFormat) + # serialize + return pickle.dumps(ret) + + + # update site access + def updateSiteAccess(self,method,siteid,requesterDN,userName,attrValue): + # list + ret = self.taskBuffer.updateSiteAccess(method,siteid,requesterDN,userName,attrValue) + # serialize + return str(ret) + + +# Singleton +userIF = UserIF() +del UserIF + + +# get FQANs +def _getFQAN(req): + fqans = [] + for tmpKey,tmpVal in req.subprocess_env.iteritems(): + # compact credentials + if tmpKey.startswith('GRST_CRED_'): + # VOMS attribute + if tmpVal.startswith('VOMS'): + # FQAN + fqan = tmpVal.split()[-1] + # append + fqans.append(fqan) + # old style + elif tmpKey.startswith('GRST_CONN_'): + tmpItems = tmpVal.split(':') + # FQAN + if len(tmpItems)==2 and tmpItems[0]=='fqan': + fqans.append(tmpItems[-1]) + # return + return fqans + + +# get DN +def _getDN(req): + realDN = '' + if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + realDN = req.subprocess_env['SSL_CLIENT_S_DN'] + # remove redundant CN + realDN = re.sub('/CN=limited proxy','',realDN) + realDN = re.sub('/CN=proxy(/CN=proxy)+','/CN=proxy',realDN) + return realDN + + +# check role +def _isProdRoleATLAS(req): + # check role + prodManager = False + # get FQANs + fqans = _getFQAN(req) + # loop over all FQANs + for fqan in fqans: + # check production role + for rolePat in ['/atlas/usatlas/Role=production','/atlas/Role=production']: + if fqan.startswith(rolePat): + return True + return False + + + +""" +web service interface + +""" + +# security check +def isSecure(req): + # check security + if not Protocol.isSecure(req): + return False + # disable limited proxy + if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']: + _logger.warning("access via limited proxy : %s" % req.subprocess_env['SSL_CLIENT_S_DN']) + return False + return True + + +# submit jobs +def submitJobs(req,jobs,toPending=None): + # check security + if not isSecure(req): + return False + # get DN + user = None + if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + user = _getDN(req) + # get FQAN + fqans = _getFQAN(req) + # hostname + host = req.get_remote_host() + # production Role + prodRole = _isProdRoleATLAS(req) + # to pending + if toPending == 'True': + toPending = True + else: + toPending = False + return 
userIF.submitJobs(jobs,user,host,fqans,prodRole,toPending) + + +# run task assignment +def runTaskAssignment(req,jobs): + # check security + if not isSecure(req): + return "False" + return userIF.runTaskAssignment(jobs) + + +# get job status +def getJobStatus(req,ids): + return userIF.getJobStatus(ids) + + +# get PandaID with jobexeID +def getPandaIDwithJobExeID(req,ids): + return userIF.getPandaIDwithJobExeID(ids) + + +# get queued analysis jobs at a site +def getQueuedAnalJobs(req,site): + # check security + if not isSecure(req): + return "ERROR: SSL is required" + # get DN + user = None + if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + user = _getDN(req) + return userIF.getQueuedAnalJobs(site,user) + + +# get active datasets +def getActiveDatasets(req,computingSite,prodSourceLabel='managed'): + return userIF.getActiveDatasets(computingSite,prodSourceLabel) + + +# get assigning task +def getAssigningTask(req): + return userIF.getAssigningTask() + + +# get assigned cloud for tasks +def seeCloudTask(req,ids): + return userIF.seeCloudTask(ids) + + +# set task by user +def setCloudTaskByUser(req,tid,cloud='',status=''): + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return "ERROR: SSL connection is required" + user = _getDN(req) + # check role + if not _isProdRoleATLAS(req): + return "ERROR: production role is required" + return userIF.setCloudTaskByUser(user,tid,cloud,status) + + +# set debug mode +def setDebugMode(req,pandaID,modeOn): + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return "ERROR: SSL connection is required" + user = _getDN(req) + # check role + prodManager = _isProdRoleATLAS(req) + # mode + if modeOn == 'True': + modeOn = True + else: + modeOn = False + # exec + return userIF.setDebugMode(user,pandaID,prodManager,modeOn) + + +# insert sandbox file info +def insertSandboxFileInfo(req,userName,fileName,fileSize,checkSum): + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return "ERROR: SSL connection is required" + user = _getDN(req) + # check role + prodManager = _isProdRoleATLAS(req) + if not prodManager: + return "ERROR: missing role" + # hostname + hostName = req.get_remote_host() + # exec + return userIF.insertSandboxFileInfo(userName,hostName,fileName,fileSize,checkSum) + + +# check duplicated sandbox file +def checkSandboxFile(req,fileSize,checkSum): + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return "ERROR: SSL connection is required" + user = _getDN(req) + # exec + return userIF.checkSandboxFile(user,fileSize,checkSum) + + +# add files to memcached +def addFilesToCacheDB(req,site,node,guids='',lfns=''): + # exec + return userIF.addFilesToMemcached(site,node,lfns) + + +# delete files from memcached +def deleteFilesFromCacheDB(req,site,node,guids='',lfns=''): + # exec + return userIF.deleteFilesFromMemcached(site,node,lfns) + + +# flush memcached +def flushCacheDB(req,site,node): + # exec + return userIF.flushMemcached(site,node) + + +# check files with memcached +def checkFilesWithCacheDB(req,site,node,guids='',lfns=''): + # exec + return userIF.checkFilesWithMemcached(site,node,lfns) + + +# query PandaIDs +def queryPandaIDs(req,ids): + return userIF.queryPandaIDs(ids) + + +# query job info per cloud +def queryJobInfoPerCloud(req,cloud,schedulerID=None): + return userIF.queryJobInfoPerCloud(cloud,schedulerID) + + +# get PandaIDs at site +def getPandaIDsSite(req,site,status,limit=500): + return userIF.getPandaIDsSite(site,status,limit) + + +# get PandaIDs to be updated in prodDB 
+def getJobsToBeUpdated(req,limit=5000,lockedby=''): + limit = int(limit) + return userIF.getJobsToBeUpdated(limit,lockedby) + + +# update prodDBUpdateTimes +def updateProdDBUpdateTimes(req,params): + # check security + if not isSecure(req): + return False + return userIF.updateProdDBUpdateTimes(params) + + +# get job statistics +def getJobStatistics(req,sourcetype=None): + return userIF.getJobStatistics(sourcetype) + + +# get highest prio jobs +def getHighestPrioJobStat(req,perPG=None,useMorePG=None): + if perPG == 'True': + perPG = True + else: + perPG = False + if useMorePG == 'True': + useMorePG = taskbuffer.ProcessGroups.extensionLevel_1 + elif useMorePG in ['False',None]: + useMorePG = False + else: + try: + useMorePG = int(useMorePG) + except: + useMorePG = False + return userIF.getHighestPrioJobStat(perPG,useMorePG) + + +# get job statistics for Babmoo +def getJobStatisticsForBamboo(req,useMorePG=None): + if useMorePG == 'True': + useMorePG = taskbuffer.ProcessGroups.extensionLevel_1 + elif useMorePG in ['False',None]: + useMorePG = False + else: + try: + useMorePG = int(useMorePG) + except: + useMorePG = False + return userIF.getJobStatisticsForBamboo(useMorePG) + + +# get the number of waiting jobs per site and user +def getJobStatisticsPerUserSite(req): + return userIF.getJobStatisticsPerUserSite() + + +# get job statistics per site +def getJobStatisticsPerSite(req,predefined='False',workingGroup='',countryGroup='',jobType='', + minPriority=None,readArchived=None): + if predefined=='True': + predefined=True + else: + predefined=False + if minPriority != None: + try: + minPriority = int(minPriority) + except: + minPriority = None + if readArchived=='True': + readArchived = True + elif readArchived=='False': + readArchived = False + else: + host = req.get_remote_host() + # read jobsArchived for panglia + if re.search('panglia.*\.triumf\.ca$',host) != None or host in ['gridweb.triumf.ca']: + readArchived = True + else: + readArchived = False + return userIF.getJobStatisticsPerSite(predefined,workingGroup,countryGroup,jobType, + minPriority,readArchived) + + +# get job statistics per site with label +def getJobStatisticsWithLabel(req,site=''): + return userIF.getJobStatisticsWithLabel(site) + + +# query last files in datasets +def queryLastFilesInDataset(req,datasets): + return userIF.queryLastFilesInDataset(datasets) + + +# get input files currently in used for analysis +def getFilesInUseForAnal(req,outDataset): + return userIF.getFilesInUseForAnal(outDataset) + + +# get list of dis dataset to get input files in shadow +def getDisInUseForAnal(req,outDataset): + return userIF.getDisInUseForAnal(outDataset) + + +# get input LFNs currently in use for analysis with shadow dis +def getLFNsInUseForAnal(req,inputDisList): + return userIF.getLFNsInUseForAnal(inputDisList) + + +# kill jobs +def killJobs(req,ids,code=None,useMailAsID=None): + # check security + if not isSecure(req): + return False + # get DN + user = None + if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + user = _getDN(req) + # check role + prodManager = False + # get FQANs + fqans = _getFQAN(req) + # loop over all FQANs + for fqan in fqans: + # check production role + for rolePat in ['/atlas/usatlas/Role=production','/atlas/Role=production']: + if fqan.startswith(rolePat): + prodManager = True + break + # escape + if prodManager: + break + # use email address as ID + if useMailAsID == 'True': + useMailAsID = True + else: + useMailAsID = False + # hostname + host = req.get_remote_host() + return 
userIF.killJobs(ids,user,host,code,prodManager,useMailAsID,fqans) + + +# reassign jobs +def reassignJobs(req,ids,forPending=None): + # check security + if not isSecure(req): + return False + # get DN + user = None + if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + user = _getDN(req) + # hostname + host = req.get_remote_host() + # for pending + if forPending == 'True': + forPending = True + else: + forPending = False + return userIF.reassignJobs(ids,user,host,forPending) + + +# resubmit jobs +def resubmitJobs(req,ids): + # check security + if not isSecure(req): + return False + return userIF.resubmitJobs(ids) + + +# change job priorities +def changeJobPriorities(req,newPrioMap=None): + # check security + if not isSecure(req): + return pickle.dumps((False,'secure connection is required')) + # get DN + user = None + if req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + user = _getDN(req) + # check role + prodRole = _isProdRoleATLAS(req) + ret = userIF.changeJobPriorities(user,prodRole,newPrioMap) + return pickle.dumps(ret) + + +# get list of site spec +def getSiteSpecs(req,siteType=None): + if siteType != None: + return userIF.getSiteSpecs(siteType) + else: + return userIF.getSiteSpecs() + +# get list of cloud spec +def getCloudSpecs(req): + return userIF.getCloudSpecs() + +# get list of cache prefix +def getCachePrefixes(req): + return userIF.getCachePrefixes() + +# get client version +def getPandaClientVer(req): + return userIF.getPandaClientVer() + +# get nPilots +def getNumPilots(req): + return userIF.getNumPilots() + +# run brokerage +def runBrokerage(req,sites,cmtConfig=None,atlasRelease=None,trustIS=False,processingType=None, + loggingFlag=False,memorySize=None,workingGroup=None,nJobs=None, + siteGroup=None,maxCpuCount=None): + if trustIS=='True': + trustIS = True + else: + trustIS = False + if loggingFlag=='True': + loggingFlag = True + else: + loggingFlag = False + if memorySize != None: + try: + memorySize = long(memorySize) + except: + pass + if siteGroup != None: + try: + siteGroup = int(siteGroup) + except: + siteGroup = None + if maxCpuCount != None: + try: + maxCpuCount = int(maxCpuCount) + except: + maxCpuCount = None + preferHomeCountry = True + dn = _getDN(req) + fqans = _getFQAN(req) + return userIF.runBrokerage(sites,cmtConfig,atlasRelease,trustIS,processingType,dn, + loggingFlag,memorySize,workingGroup,fqans,nJobs,preferHomeCountry, + siteGroup,maxCpuCount) + +# run rebrokerage +def runReBrokerage(req,jobID,libDS='',cloud=None,excludedSite=None,forceOpt=None): + # check SSL + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return "ERROR: SSL connection is required" + # get DN + dn = _getDN(req) + if dn == '': + return "ERROR: could not get DN" + # convert jobID to long + try: + jobID = long(jobID) + except: + return "ERROR: jobID is not an integer" + # force option + if forceOpt == 'True': + forceOpt = True + else: + forceOpt = False + return userIF.runReBrokerage(dn,jobID,cloud,excludedSite,forceOpt) + + +# retry failed subjobs in running job +def retryFailedJobsInActive(req,jobID): + # check SSL + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return "ERROR: SSL connection is required" + # get DN + dn = _getDN(req) + if dn == '': + return "ERROR: could not get DN" + # convert jobID to long + try: + jobID = long(jobID) + except: + return "ERROR: jobID is not an integer" + return userIF.retryFailedJobsInActive(dn,jobID) + + +# logger interface +def sendLogInfo(req,msgType,msgList): + # check SSL + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + 
return "ERROR: SSL connection is required" + # get DN + dn = _getDN(req) + if dn == '': + return "ERROR: could not get DN" + return userIF.sendLogInfo(dn,msgType,msgList) + + +# get serial number for group job +def getSerialNumberForGroupJob(req): + # check SSL + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return "ERROR: SSL connection is required" + # get DN + dn = _getDN(req) + if dn == '': + return "ERROR: could not get DN" + return userIF.getSerialNumberForGroupJob(dn) + + +# get script for offline running +def getScriptOfflineRunning(req,pandaID): + return userIF.getScriptOfflineRunning(pandaID) + + +# register proxy key +def registerProxyKey(req,credname,origin,myproxy): + # check security + if not isSecure(req): + return False + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return False + # get expiration date + if not req.subprocess_env.has_key('SSL_CLIENT_V_END'): + return False + params = {} + params['dn'] = _getDN(req) + # set parameters + params['credname'] = credname + params['origin'] = origin + params['myproxy'] = myproxy + # convert SSL_CLIENT_V_END + try: + expTime = req.subprocess_env['SSL_CLIENT_V_END'] + # remove redundant white spaces + expTime = re.sub('\s+',' ',expTime) + # convert to timestamp + expTime = time.strptime(expTime,'%b %d %H:%M:%S %Y %Z') + params['expires'] = time.strftime('%Y-%m-%d %H:%M:%S',expTime) + except: + _logger.error("registerProxyKey : failed to convert %s" % \ + req.subprocess_env['SSL_CLIENT_V_END']) + # execute + return userIF.registerProxyKey(params) + + +# register proxy key +def getProxyKey(req): + # check security + if not isSecure(req): + return False + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return False + dn = _getDN(req) + # execute + return userIF.getProxyKey(dn) + + +# get JobIDs in a time range +def getJobIDsInTimeRange(req,timeRange,dn=None): + # check security + if not isSecure(req): + return False + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return False + if dn == None: + dn = _getDN(req) + _logger.debug("getJobIDsInTimeRange %s %s" % (dn,timeRange)) + # execute + return userIF.getJobIDsInTimeRange(dn,timeRange) + + +# get PandaIDs for a JobID +def getPandIDsWithJobID(req,jobID,nJobs,dn=None): + # check security + if not isSecure(req): + return False + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return False + if dn == None: + dn = _getDN(req) + _logger.debug("getPandIDsWithJobID %s JobID=%s nJobs=%s" % (dn,jobID,nJobs)) + # execute + return userIF.getPandIDsWithJobID(dn,jobID,nJobs) + + +# check merge job generation status +def checkMergeGenerationStatus(req,jobID,dn=None): + # check security + if not isSecure(req): + return False + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return False + if dn == None: + dn = _getDN(req) + _logger.debug("checkMergeGenerationStatus %s JobID=%s" % (dn,jobID)) + # execute + return userIF.checkMergeGenerationStatus(dn,jobID) + + +# get slimmed file info with PandaIDs +def getSlimmedFileInfoPandaIDs(req,ids): + # check security + if not isSecure(req): + return False + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return False + dn = _getDN(req) + return userIF.getSlimmedFileInfoPandaIDs(ids,dn) + + +# get full job status +def getFullJobStatus(req,ids): + # check security + if not isSecure(req): + return False + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return False + dn = _getDN(req) + return 
userIF.getFullJobStatus(ids,dn) + + +# get number of analysis jobs per user +def getNUserJobs(req,siteName,nJobs=100): + # check security + prodManager = False + if not isSecure(req): + return "Failed : HTTPS connection is required" + # get FQANs + fqans = _getFQAN(req) + # loop over all FQANs + for fqan in fqans: + # check production role + for rolePat in ['/atlas/usatlas/Role=production', + '/atlas/Role=production', + '/atlas/usatlas/Role=pilot', + '/atlas/Role=pilot', + ]: + if fqan.startswith(rolePat): + prodManager = True + break + # escape + if prodManager: + break + # only prod managers can use this method + if not prodManager: + return "Failed : VOMS authorization failure" + # convert nJobs to int + try: + nJobs = int(nJobs) + except: + nJobs = 100 + # execute + return userIF.getNUserJobs(siteName,nJobs) + + +# add account to siteaccess +def addSiteAccess(req,siteID): + # check security + if not isSecure(req): + return "False" + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return "False" + dn = req.subprocess_env['SSL_CLIENT_S_DN'] + return userIF.addSiteAccess(siteID,dn) + + +# list site access +def listSiteAccess(req,siteID=None,longFormat=False): + # check security + if not isSecure(req): + return "False" + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return "False" + # set DN if siteID is none + dn = None + if siteID==None: + dn = req.subprocess_env['SSL_CLIENT_S_DN'] + # convert longFormat option + if longFormat == 'True': + longFormat = True + else: + longFormat = False + return userIF.listSiteAccess(siteID,dn,longFormat) + + +# update site access +def updateSiteAccess(req,method,siteid,userName,attrValue=''): + # check security + if not isSecure(req): + return "non HTTPS" + # get DN + if not req.subprocess_env.has_key('SSL_CLIENT_S_DN'): + return "invalid DN" + # set requester's DN + requesterDN = req.subprocess_env['SSL_CLIENT_S_DN'] + # update + return userIF.updateSiteAccess(method,siteid,requesterDN,userName,attrValue) diff --git a/current/pandaserver/userinterface/__init__.py b/current/pandaserver/userinterface/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/current/pandaserver/userinterface/runReBroker.py b/current/pandaserver/userinterface/runReBroker.py new file mode 100755 index 000000000..e20e6d595 --- /dev/null +++ b/current/pandaserver/userinterface/runReBroker.py @@ -0,0 +1,70 @@ +# exec +def run(dn,jobID,cloud=None,excludedSite=None): + # check parameters + if dn == '': + return False + if jobID < 0: + return False + # password + from config import panda_config + passwd = panda_config.dbpasswd + # initialize cx_Oracle using dummy connection + from taskbuffer.Initializer import initializer + initializer.init() + # instantiate TB + from taskbuffer.TaskBuffer import taskBuffer + taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) + # run ReBroker + from userinterface.ReBroker import ReBroker + reThr = ReBroker(taskBuffer,cloud,excludedSite,userRequest=True) + # lock + stLock,retLock = reThr.lockJob(dn,jobID) + # failed + if not stLock: + return False + # start + reThr.start() + reThr.join() + return True + + +#################################################################### +# main +def main(): + import sys + import getopt + # option class + class _options: + def __init__(self): + pass + options = _options() + del _options + # set default values + options.jobID = -1 + options.dn = '' + options.cloud = None + options.excludedSite = None + # get command-line parameters + 
try: + opts, args = getopt.getopt(sys.argv[1:],"j:d:c:e:") + # set options + for o, a in opts: + if o in ("-j",): + options.jobID = long(a) + if o in ("-d",): + options.dn = a + if o in ("-c",): + options.cloud = a + if o in ("-e",): + options.excludedSite = a.split(',') + except: + print("ERROR : Invalid options") + sys.exit(1) + # run + run(options.dn,options.jobID,options.cloud,options.excludedSite) + # return + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/current/setup.cfg b/current/setup.cfg new file mode 100644 index 000000000..74c606520 --- /dev/null +++ b/current/setup.cfg @@ -0,0 +1,7 @@ +[global] + +[bdist_rpm] +provides = panda-server +release = 1 +packager = Panda Team +requires = python, panda-common diff --git a/current/setup.py b/current/setup.py new file mode 100755 index 000000000..c88adbd41 --- /dev/null +++ b/current/setup.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python +# +# Setup prog for Panda Server +# +# +release_version='0.0.5' + +import re +import sys +import commands +from distutils.core import setup +from distutils.command.install import install as install_org +from distutils.command.install_data import install_data as install_data_org + +# get panda specific params +optPanda = {} +newArgv = [] +idx = 0 +while idx < len(sys.argv): + tmpArg = sys.argv[idx] + if tmpArg.startswith('--panda_'): + # panda params + idx += 1 + if len(tmpArg.split('=')) == 2: + # split to par and val if = is contained + tmpVal = tmpArg.split('=')[-1] + tmpArg = tmpArg.split('=')[0] + elif len(tmpArg.split('=')) == 1: + tmpVal = sys.argv[idx] + idx += 1 + else: + raise RuntimeError,"invalid panda option : %s" % tmpArg + # get key + tmpKey = re.sub('--panda_','',tmpArg) + # set params + optPanda[tmpKey] = tmpVal + else: + # normal opts + idx += 1 + newArgv.append(tmpArg) +# set new argv +sys.argv = newArgv + + +# set overall prefix for bdist_rpm +class install_panda(install_org): + def initialize_options (self): + install_org.initialize_options(self) + self.prefix = '/data/atlpan/srv' + + +# generates files using templates and install them +class install_data_panda (install_data_org): + + def initialize_options (self): + install_data_org.initialize_options (self) + self.install_purelib = None + + def finalize_options (self): + # set install_purelib + self.set_undefined_options('install', + ('install_purelib','install_purelib')) + # set reaming params + install_data_org.finalize_options(self) + # set hostname + if optPanda.has_key('hostname') and optPanda['hostname'] != '': + self.hostname = optPanda['hostname'] + else: + self.hostname = commands.getoutput('hostname -f') + # set user and group + if optPanda.has_key('username') and optPanda['username'] != '': + self.username = optPanda['username'] + else: + self.username = commands.getoutput('id -un') + if optPanda.has_key('usergroup') and optPanda['usergroup'] != '': + self.usergroup = optPanda['usergroup'] + else: + self.usergroup = commands.getoutput('id -gn') + + + def run (self): + # remove /usr for bdist/bdist_rpm + match = re.search('(build/[^/]+/dumb)/usr',self.install_dir) + if match != None: + self.install_dir = re.sub(match.group(0),match.group(1),self.install_dir) + # remove /var/tmp/*-buildroot for bdist_rpm + match = re.search('(/var/tmp/.*-buildroot)/usr',self.install_dir) + if match != None: + self.install_dir = re.sub(match.group(0),match.group(1),self.install_dir) + # create tmp area + tmpDir = 'build/tmp' + self.mkpath(tmpDir) + new_data_files = [] + for destDir,dataFiles in self.data_files: + 
newFilesList = [] + for srcFile in dataFiles: + # check extension + if not srcFile.endswith('.template'): + raise RuntimeError,"%s doesn't have the .template extension" % srcFile + # dest filename + destFile = re.sub('(\.exe)*\.template$','',srcFile) + destFile = destFile.split('/')[-1] + destFile = '%s/%s' % (tmpDir,destFile) + # open src + inFile = open(srcFile) + # read + filedata=inFile.read() + # close + inFile.close() + # replace patterns + for item in re.findall('@@([^@]+)@@',filedata): + if not hasattr(self,item): + raise RuntimeError,'unknown pattern %s in %s' % (item,srcFile) + # get pattern + patt = getattr(self,item) + # remove build/*/dump for bdist + patt = re.sub('build/[^/]+/dumb','',patt) + # remove /var/tmp/*-buildroot for bdist_rpm + patt = re.sub('/var/tmp/.*-buildroot','',patt) + # replace + filedata = filedata.replace('@@%s@@' % item, patt) + # write to dest + oFile = open(destFile,'w') + oFile.write(filedata) + oFile.close() + # chmod for exe + if srcFile.endswith('.exe.template'): + commands.getoutput('chmod +x %s' % destFile) + # append + newFilesList.append(destFile) + # replace dataFiles to install generated file + new_data_files.append((destDir,newFilesList)) + # install + self.data_files = new_data_files + install_data_org.run(self) + + +# setup for distutils +setup( + name="panda-server", + version=release_version, + description=' PanDA Server Package', + long_description='''This package contains PanDA Server Components''', + license='GPL', + author='Panda Team', + author_email='hn-atlas-panda-pathena@cern.ch', + url='https://twiki.cern.ch/twiki/bin/view/Atlas/PanDA', + packages=[ 'pandaserver', + 'pandaserver.brokerage', + 'pandaserver.config', + 'pandaserver.dataservice', + 'pandaserver.jobdispatcher', + 'pandaserver.server', + 'pandaserver.taskbuffer', + 'pandaserver.test', + 'pandaserver.userinterface', + ], + data_files=[ + # config files + ('etc/panda', ['templates/panda_server-httpd.conf.rpmnew.template', + 'templates/panda_server-httpd-FastCGI.conf.rpmnew.template', + 'templates/panda_server.cfg.rpmnew.template', + 'templates/panda_server-grid-env.sh.template', + ] + ), + # sysconfig + ('etc/sysconfig', ['templates/panda_server-sysconfig.rpmnew.template', + ] + ), + # logrotate + ('etc/logrotate.d', ['templates/panda_server-logrotate.template', + ] + ), + # init script + ('etc/init.d', ['templates/panda_server-ctl.exe.template', + ] + ), + # crons + ('usr/bin', ['templates/panda_server-add.sh.exe.template', + 'templates/panda_server-priority.sh.exe.template', + 'templates/panda_server-copyArchive.sh.exe.template', + 'templates/panda_server-copyROOT.sh.exe.template', + 'templates/panda_server-vomsrenew.sh.exe.template', + 'templates/panda_server-archivelog.sh.exe.template', + 'templates/panda_server-tmpwatch.sh.exe.template', + 'templates/panda_server-backupJobArch.sh.exe.template', + 'templates/panda_server-deleteJobs.sh.exe.template', + 'templates/panda_server-merge.sh.exe.template', + 'templates/panda_server-datasetManager.sh.exe.template', + 'templates/panda_server-evpPD2P.sh.exe.template', + 'templates/panda_server-callback.sh.exe.template', + 'templates/panda_server-makeSlsXml.exe.template', + 'templates/panda_server-boostUser.sh.exe.template', + 'templates/panda_server-runRebro.sh.exe.template', + ] + ), + # var dirs + #('var/log/panda', []), + #('var/cache/pandaserver', []), + ], + cmdclass={'install': install_panda, + 'install_data': install_data_panda} +) diff --git a/current/templates/panda_server-add.sh.exe.template 
b/current/templates/panda_server-add.sh.exe.template new file mode 100755 index 000000000..cce611988 --- /dev/null +++ b/current/templates/panda_server-add.sh.exe.template @@ -0,0 +1,12 @@ +#!/bin/bash + +# setup grid stuff +source /opt/glite/etc/profile.d/grid-env.sh + +# import env vars from sysconfig +source @@install_dir@@/etc/sysconfig/panda_server-sysconfig + +# set PYTHONPATH for LFC.py +export PYTHONPATH=/opt/lcg/lib64/python2.5/site-packages:$PYTHONPATH + +python2.5 @@install_purelib@@/pandaserver/test/add.py diff --git a/current/templates/panda_server-archivelog.sh.exe.template b/current/templates/panda_server-archivelog.sh.exe.template new file mode 100755 index 000000000..8a0a2c5ab --- /dev/null +++ b/current/templates/panda_server-archivelog.sh.exe.template @@ -0,0 +1,6 @@ +#!/bin/bash + +# import env vars from sysconfig +source @@install_dir@@/etc/sysconfig/panda_server-sysconfig + +python @@install_purelib@@/pandaserver/test/archivelogs.py diff --git a/current/templates/panda_server-backupJobArch.sh.exe.template b/current/templates/panda_server-backupJobArch.sh.exe.template new file mode 100644 index 000000000..bc896d843 --- /dev/null +++ b/current/templates/panda_server-backupJobArch.sh.exe.template @@ -0,0 +1,6 @@ +#!/bin/bash + +# import env vars from sysconfig +source @@install_dir@@/etc/sysconfig/panda_server-sysconfig + +python @@install_purelib@@/pandaserver/test/backupJobArch.py diff --git a/current/templates/panda_server-boostUser.sh.exe.template b/current/templates/panda_server-boostUser.sh.exe.template new file mode 100755 index 000000000..f1541998e --- /dev/null +++ b/current/templates/panda_server-boostUser.sh.exe.template @@ -0,0 +1,6 @@ +#!/bin/bash + +# import env vars from sysconfig +source @@install_dir@@/etc/sysconfig/panda_server-sysconfig + +echo $1 | python2.5 @@install_purelib@@/pandaserver/test/boostUser.py diff --git a/current/templates/panda_server-callback.sh.exe.template b/current/templates/panda_server-callback.sh.exe.template new file mode 100755 index 000000000..da833c70c --- /dev/null +++ b/current/templates/panda_server-callback.sh.exe.template @@ -0,0 +1,9 @@ +#!/bin/bash + +# setup grid stuff +source /opt/glite/etc/profile.d/grid-env.sh + +# import env vars from sysconfig +source @@install_dir@@/etc/sysconfig/panda_server-sysconfig + +python2.5 @@install_purelib@@/pandaserver/test/fileCallbackListener.py diff --git a/current/templates/panda_server-copyArchive.sh.exe.template b/current/templates/panda_server-copyArchive.sh.exe.template new file mode 100755 index 000000000..8005b4d3e --- /dev/null +++ b/current/templates/panda_server-copyArchive.sh.exe.template @@ -0,0 +1,9 @@ +#!/bin/bash + +# setup grid stuff +source /opt/glite/etc/profile.d/grid-env.sh + +# import env vars from sysconfig +source @@install_dir@@/etc/sysconfig/panda_server-sysconfig + +python2.5 @@install_purelib@@/pandaserver/test/copyArchive.py diff --git a/current/templates/panda_server-copyROOT.sh.exe.template b/current/templates/panda_server-copyROOT.sh.exe.template new file mode 100755 index 000000000..efbd483be --- /dev/null +++ b/current/templates/panda_server-copyROOT.sh.exe.template @@ -0,0 +1,6 @@ +#!/bin/bash + +# import env vars from sysconfig +source @@install_dir@@/etc/sysconfig/panda_server-sysconfig + +python @@install_purelib@@/pandaserver/test/copyROOT.py diff --git a/current/templates/panda_server-ctl.exe.template b/current/templates/panda_server-ctl.exe.template new file mode 100755 index 000000000..70a849b9c --- /dev/null +++ 
b/current/templates/panda_server-ctl.exe.template @@ -0,0 +1,139 @@ +#!/bin/sh +# +# Copyright 2000-2004 The Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Apache control script designed to allow an easy command line interface +# to controlling Apache. Written by Marc Slemko, 1997/08/23 +# +# The exit codes returned are: +# XXX this doc is no longer correct now that the interesting +# XXX functions are handled by httpd +# 0 - operation completed successfully +# 1 - +# 2 - usage error +# 3 - httpd could not be started +# 4 - httpd could not be stopped +# 5 - httpd could not be started during a restart +# 6 - httpd could not be restarted during a restart +# 7 - httpd could not be restarted during a graceful restart +# 8 - configuration syntax error +# +# When multiple arguments are given, only the error from the _last_ +# one is reported. Run "apachectl help" for usage info +# +ARGV="$@" +# +# |||||||||||||||||||| START CONFIGURATION SECTION |||||||||||||||||||| +# -------------------- -------------------- +# +# the path to your httpd binary, including options if necessary +HTTPD='/usr/sbin/httpd.worker' + +# +# a command that outputs a formatted text version of the HTML at the +# url given on the command line. Designed for lynx, however other +# programs may work. +if [ -x /usr/bin/links ]; then + LYNX="links -dump" +elif [ -x /usr/bin/lynx ]; then + LYNX="lynx -dump" +else + LYNX="none" +fi + +# +# the URL to your server's mod_status status page. If you do not +# have one, then status and fullstatus will not work. +STATUSURL="http://localhost:80/server-status" + +# Source /etc/sysconfig/httpd for $HTTPD setting, etc. +if [ -r @@install_dir@@/etc/sysconfig/panda_server-sysconfig ]; then + . @@install_dir@@/etc/sysconfig/panda_server-sysconfig +fi + +ERROR=0 +if [ "x$ARGV" = "x" ] ; then + ARGV="-h" +fi + +function check13() { +# check for 1.3 configuration +GONE="(ServerType|BindAddress|Port|AddModule|ClearModuleList|" +GONE="${GONE}AgentLog|RefererLog|RefererIgnore|FancyIndexing|" +GONE="${GONE}AccessConfig|ResourceConfig)" +if grep -Eiq "^[[:space:]]*($GONE)" /etc/httpd/conf/httpd.conf; then + echo "$0: Apache 1.3 configuration directives found" + echo "$0: please read /usr/share/doc/httpd-2.0.52/migration.html" + exit 2 +fi +} + +function checklynx() { +if [ "$LYNX" = "none" ]; then + echo "The 'links' package is required for this functionality." + exit 8 +fi +} + +function testconfig() { +# httpd is denied terminal access in SELinux, so run in the +# current context to get stdout from $HTTPD -t. +if test -x /usr/sbin/selinuxenabled && /usr/sbin/selinuxenabled; then + runcon -- `id -Z` $HTTPD $OPTIONS -t +else + $HTTPD $OPTIONS -t +fi +ERROR=$? +} + +case $ARGV in +restart|graceful) + if $HTTPD -t >&/dev/null; then + $HTTPD $OPTIONS -k $ARGV + ERROR=$? + else + echo "apachectl: Configuration syntax error, will not run \"$ARGV\":" + testconfig + fi + ;; +start|stop) + check13 + $HTTPD $OPTIONS -k $ARGV + ERROR=$? 
+ ;; +startssl|sslstart|start-SSL) + check13 + $HTTPD $OPTIONS -DSSL -k start + ERROR=$? + ;; +configtest) + testconfig + ;; +status) + checklynx + $LYNX $STATUSURL | awk ' /process$/ { print; exit } { print } ' + ;; +fullstatus) + checklynx + $LYNX $STATUSURL + ;; +*) + $HTTPD $OPTIONS $ARGV + ERROR=$? +esac + +exit $ERROR + diff --git a/current/templates/panda_server-datasetManager.sh.exe.template b/current/templates/panda_server-datasetManager.sh.exe.template new file mode 100644 index 000000000..32abd2976 --- /dev/null +++ b/current/templates/panda_server-datasetManager.sh.exe.template @@ -0,0 +1,9 @@ +#!/bin/bash + +# setup grid stuff +source /opt/glite/etc/profile.d/grid-env.sh + +# import env vars from sysconfig +source @@install_dir@@/etc/sysconfig/panda_server-sysconfig + +python2.5 @@install_purelib@@/pandaserver/test/datasetManager.py diff --git a/current/templates/panda_server-deleteJobs.sh.exe.template b/current/templates/panda_server-deleteJobs.sh.exe.template new file mode 100644 index 000000000..fd48e9e7e --- /dev/null +++ b/current/templates/panda_server-deleteJobs.sh.exe.template @@ -0,0 +1,6 @@ +#!/bin/bash + +# import env vars from sysconfig +source @@install_dir@@/etc/sysconfig/panda_server-sysconfig + +python @@install_purelib@@/pandaserver/test/deleteJobs.py diff --git a/current/templates/panda_server-evpPD2P.sh.exe.template b/current/templates/panda_server-evpPD2P.sh.exe.template new file mode 100755 index 000000000..8786da667 --- /dev/null +++ b/current/templates/panda_server-evpPD2P.sh.exe.template @@ -0,0 +1,9 @@ +#!/bin/bash + +# setup grid stuff +source /opt/glite/etc/profile.d/grid-env.sh + +# import env vars from sysconfig +source @@install_dir@@/etc/sysconfig/panda_server-sysconfig + +python2.5 @@install_purelib@@/pandaserver/test/evpPD2P.py diff --git a/current/templates/panda_server-grid-env.sh.template b/current/templates/panda_server-grid-env.sh.template new file mode 100644 index 000000000..c1e0d3321 --- /dev/null +++ b/current/templates/panda_server-grid-env.sh.template @@ -0,0 +1,3 @@ +export LD_LIBRARY_PATH=/opt/glite/lib64:/opt/globus/lib:/opt/lcg/lib64:$LD_LIBRARY_PATH +export PYTHONPATH=/opt/glite/lib64/python:/opt/lcg/lib64/python:$PYTHONPATH +export PATH=/opt/edg/bin:/opt/glite/bin:/opt/globus/bin:/opt/lcg/bin:$PATH diff --git a/current/templates/panda_server-httpd-FastCGI.conf.rpmnew.template b/current/templates/panda_server-httpd-FastCGI.conf.rpmnew.template new file mode 100644 index 000000000..0148c1eb0 --- /dev/null +++ b/current/templates/panda_server-httpd-FastCGI.conf.rpmnew.template @@ -0,0 +1,177 @@ +LoadModule access_module modules/mod_access.so +LoadModule alias_module modules/mod_alias.so +LoadModule rewrite_module modules/mod_rewrite.so +LoadModule mime_magic_module modules/mod_mime_magic.so +LoadModule mime_module modules/mod_mime.so +LoadModule include_module modules/mod_include.so +LoadModule log_config_module modules/mod_log_config.so +LoadModule env_module modules/mod_env.so +LoadModule deflate_module modules/mod_deflate.so +LoadModule setenvif_module modules/mod_setenvif.so +LoadModule dir_module modules/mod_dir.so +LoadModule ssl_module modules/mod_ssl.so +LoadModule headers_module modules/mod_headers.so +LoadModule gridsite_module modules/mod_gridsite.so + +# FastCGI/WSGI +#LoadModule fastcgi_module modules/mod_fastcgi.so +LoadModule wsgi_module modules/mod_wsgi.so + + +User atlpan +Group zp + + +StartServers 25 +MinSpareServers 25 +ServerLimit 512 +MaxSpareServers 512 +MaxClients 512 +MaxRequestsPerChild 2000 + + 
+ServerName pandaserver.cern.ch
+
+DocumentRoot "@@install_purelib@@/pandaserver"
+
+
+    Order allow,deny
+    Deny from all
+
+
+RedirectMatch 403 "/panda.py$"
+
+
+    Options FollowSymLinks
+    AllowOverride None
+    Order allow,deny
+    Allow from all
+    Deny from 192.203.218.14
+
+
+Alias /trf/ "@@install_dir@@/var/trf/"
+Alias /cache/ "@@install_dir@@/var/cache/pandaserver/"
+Alias /appdir/ "@@install_dir@@/var/appdir/"
+
+
+    Options FollowSymLinks
+    AllowOverride None
+    Order allow,deny
+    Allow from all
+    Deny from 192.203.218.14
+
+
+
+    FastCgiIpcDir @@install_dir@@/var/log/panda/fastsocks
+    FastCgiServer @@install_purelib@@/pandaserver/server/panda.py \
+        -processes 25 -idle-timeout 300 -listen-queue-depth 1 -flush \
+        -initial-env PYTHONPATH \
+        -initial-env TZ \
+        -initial-env HOME \
+        -initial-env PANDA_HOME \
+        -initial-env X509_CERT_DIR \
+        -initial-env X509_USER_PROXY \
+        -initial-env PANDA_URL \
+        -initial-env PANDA_URL_SSL
+    ScriptAliasMatch ^/server/panda/(.+)$ @@install_purelib@@/pandaserver/server/panda.py
+
+
+
+    WSGIDaemonProcess pandasrv_daemon processes=25 threads=2 home=/home/atlpan
+    WSGIProcessGroup pandasrv_daemon
+    WSGIApplicationGroup %{GLOBAL}
+    WSGIScriptAliasMatch ^/server/panda/(.+)$ @@install_purelib@@/pandaserver/server/panda.py
+    WSGISocketPrefix @@install_dir@@/var/log/panda/wsgisocks/wsgi
+
+
+
+Listen 25080
+
+
+RewriteEngine on
+RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK)
+RewriteRule .* - [F]
+# use Cassandra for cache
+RewriteRule ^/cscache/(.*)$ /server/panda/getFile?fileName=$1 [PT,L]
+
+
+
+
+    Order allow,deny
+    Allow from all
+    Deny from 192.203.218.14
+
+
+    # allow .py
+
+    Order allow,deny
+    Allow from all
+
+
+    # enable CGI for FastCGI/WSGI
+    Options FollowSymLinks +ExecCGI
+
+    # mod_gridsite
+    GridSiteIndexes on
+    GridSiteAuth on
+    GridSiteDNlists /etc/grid-security/dn-lists/
+    GridSiteEnvs on
+
+
+
+
+
+Listen 25443
+
+
+RewriteEngine on
+RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK)
+RewriteRule .* - [F]
+# use Cassandra for cache
+RewriteRule ^/cscache/(.*)$ /server/panda/getFile?fileName=$1 [PT,L]
+
+# CERN security recommendation to only allow the seven strongest ssl ciphers
+SSLProtocol -all +TLSv1 +SSLv3
+SSLCipherSuite HIGH:MEDIUM:+SSLv3
+
+SSLEngine on
+SSLCertificateFile /etc/grid-security/hostcert.pem
+SSLCertificateKeyFile /etc/grid-security/hostkey.pem
+SSLCACertificatePath /etc/grid-security/certificates
+SSLVerifyClient optional
+SSLVerifyDepth 10
+SSLOptions +ExportCertData +StdEnvVars
+
+
+
+    # allow .py
+
+    Order allow,deny
+    Allow from all
+
+
+    # enable CGI for FastCGI/WSGI
+    Options FollowSymLinks +ExecCGI
+
+    # mod_gridsite
+    GridSiteIndexes on
+    GridSiteAuth on
+    GridSiteDNlists /etc/grid-security/dn-lists/
+    GridSiteGSIProxyLimit 1
+    GridSiteEnvs on
+
+
+
+
+LogLevel info
+
+LogFormat "%t %h \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
+LogFormat "%t %h \"%r\" %>s %b" common
+LogFormat "%{Referer}i -> %U" referer
+LogFormat "%{User-agent}i" agent
+CustomLog @@install_dir@@/var/log/panda/panda_server_access_log common
+ErrorLog @@install_dir@@/var/log/panda/panda_server_error_log
+
+PidFile @@install_dir@@/var/log/panda/panda_server_httpd.pid
+
+TypesConfig /etc/mime.types
diff --git a/current/templates/panda_server-httpd.conf.rpmnew.template b/current/templates/panda_server-httpd.conf.rpmnew.template
new file mode 100644
index 000000000..6057f0cc4
--- /dev/null
+++ b/current/templates/panda_server-httpd.conf.rpmnew.template
@@ -0,0 +1,141 @@
+LoadModule access_module modules/mod_access.so
+LoadModule alias_module modules/mod_alias.so
+LoadModule rewrite_module modules/mod_rewrite.so
+LoadModule mime_magic_module modules/mod_mime_magic.so
+LoadModule mime_module modules/mod_mime.so
+LoadModule include_module modules/mod_include.so
+LoadModule log_config_module modules/mod_log_config.so
+LoadModule env_module modules/mod_env.so
+LoadModule deflate_module modules/mod_deflate.so
+LoadModule setenvif_module modules/mod_setenvif.so
+LoadModule dir_module modules/mod_dir.so
+LoadModule ssl_module modules/mod_ssl.so
+LoadModule python_module modules/mod_python.so
+LoadModule gridsite_module modules/mod_gridsite.so
+
+User atlpan
+Group zp
+
+
+StartServers 50
+MinSpareServers 50
+MaxSpareServers 50
+MaxClients 50
+MaxRequestsPerChild 0
+
+
+
+ServerLimit 10
+StartServers 10
+MaxClients 50
+MinSpareThreads 50
+MaxSpareThreads 50
+ThreadsPerChild 5
+MaxRequestsPerChild 0
+
+
+ServerName pandaserver.cern.ch
+
+DocumentRoot "@@install_purelib@@/pandaserver"
+
+
+    Order allow,deny
+    Deny from all
+
+
+
+    Options FollowSymLinks
+    AllowOverride None
+    Order allow,deny
+    Allow from all
+    Deny from 192.203.218.14
+
+
+Alias /cache/ "@@install_dir@@/var/cache/pandaserver/"
+
+
+    Options FollowSymLinks
+    AllowOverride None
+    Order allow,deny
+    Allow from all
+    Deny from 192.203.218.14
+
+
+Listen 25080
+
+
+RewriteEngine on
+RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK)
+RewriteRule .* - [F]
+
+
+
+
+    Order allow,deny
+    Allow from all
+    Deny from 192.203.218.14
+
+
+    # mod_python
+    SetHandler python-program
+    PythonHandler mod_python.publisher
+    PythonDebug On
+
+    # mod_gridsite
+    GridSiteIndexes on
+    GridSiteAuth on
+    GridSiteDNlists /etc/grid-security/dn-lists/
+    GridSiteEnvs on
+
+
+
+
+
+Listen 25443
+
+
+RewriteEngine on
+RewriteCond %{REQUEST_METHOD} ^(TRACE|TRACK)
+RewriteRule .* - [F]
+
+# CERN security recommendation to only allow the seven strongest ssl ciphers
+SSLProtocol -all +TLSv1 +SSLv3
+SSLCipherSuite HIGH:MEDIUM:+SSLv3
+
+SSLEngine on
+SSLCertificateFile /etc/grid-security/hostcert.pem
+SSLCertificateKeyFile /etc/grid-security/hostkey.pem
+SSLCACertificatePath /etc/grid-security/certificates
+SSLVerifyClient optional
+SSLVerifyDepth 10
+SSLOptions +ExportCertData +StdEnvVars
+
+
+
+    # mod_python
+    SetHandler python-program
+    PythonHandler mod_python.publisher
+    PythonDebug On
+
+    # mod_gridsite
+    GridSiteIndexes on
+    GridSiteAuth on
+    GridSiteDNlists /etc/grid-security/dn-lists/
+    GridSiteGSIProxyLimit 1
+    GridSiteEnvs on
+
+
+
+
+LogLevel info
+
+LogFormat "%t %h \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" combined
+LogFormat "%t %h \"%r\" %>s %b" common
+LogFormat "%{Referer}i -> %U" referer
+LogFormat "%{User-agent}i" agent
+CustomLog @@install_dir@@/var/log/panda/panda_server_access_log common
+ErrorLog @@install_dir@@/var/log/panda/panda_server_error_log
+
+PidFile @@install_dir@@/var/log/panda/panda_server_httpd.pid
+
+TypesConfig /etc/mime.types
diff --git a/current/templates/panda_server-logrotate.template b/current/templates/panda_server-logrotate.template
new file mode 100644
index 000000000..a474741f6
--- /dev/null
+++ b/current/templates/panda_server-logrotate.template
@@ -0,0 +1,14 @@
+@@install_dir@@/var/log/panda/*log {
+    rotate 180
+    daily
+    compress
+    missingok
+    notifempty
+    sharedscripts
+    daily
+    postrotate
+        killall -u atlpan python || true
+        killall -u atlpan python2.5 || true
+        /sbin/service httpd-pandasrv restart > /dev/null 2>/dev/null || true
+    endscript
+}
diff --git a/current/templates/panda_server-makeSlsXml.exe.template b/current/templates/panda_server-makeSlsXml.exe.template
new file mode 100755
index 000000000..23e23b3d3
--- /dev/null
+++ b/current/templates/panda_server-makeSlsXml.exe.template
@@ -0,0 +1,334 @@
+#!/usr/bin/python2.5
+
+import SLSxml
+import socket
+import subprocess
+import re
+import sys
+import optparse
+
+###########################################
+## define options
+###########################################
+parser = optparse.OptionParser()
+parser.add_option( "-u", "--use", dest="use", type="string",
+                   help="Use of xml, allowed values: 'mon', 'server' or 'bamboo'" )
+parser.add_option( "--host", dest="host", type="string",
+                   help="Hostname of server to check, default is current machine hostname" )
+parser.add_option( "-d", "--dir", dest="dir", type="string",
+                   help="Filename of the xml file output. Default is " +
+                        "/data/atlpan/oracle/panda/monitoring" )
+parser.add_option( "--debug", action="store_true", dest="debug",
+                   default=False, help="Print out debug statements." )
+
+( options, args ) = parser.parse_args()
+
+def __main__() :
+
+    if( options.host ) :
+        host = options.host
+    else :
+        host = socket.gethostname()
+        host = re.sub( r'^(\w+).*', r'\1', host )
+
+    if( options.use == 'mon' ) :
+        tmp_xml = make_monitor( host )
+        file_part = 'PandaMon'
+    elif( options.use == 'server' ) :
+        tmp_xml = make_server( host )
+        file_part = 'PandaServer'
+    elif( options.use == 'bamboo' ) :
+        tmp_xml = make_bamboo( host )
+        file_part = 'PandaBamboo'
+    else :
+        print "Err: please choose a use, 'mon', 'server' or 'bamboo'."
+        return
+
+    if( options.dir ) :
+        file_dir = options.dir
+    else :
+        file_dir = '/data/atlpan/oracle/panda/monitoring'
+
+    file_name = '%s/%s_%s.xml' % ( file_dir, file_part, host )
+    tmp_file = open( file_name, 'w' )
+    tmp_file.write( tmp_xml )
+    tmp_file.close()
+
+def make_server( host ) :
+
+    if( options.debug ) : print "Creating the server monitoring xml"
+
+    server_avail = server_availability( host )
+    add_processes = count_add_processes()
+    num_holdings = count_holdings()
+    data_used = volume_use( 'data' )
+    var_used = volume_use( 'var' )
+    ave_regtime = registration_time()
+    ave_regtimeDQ2 = registration_time(onlyDQ2=True)
+
+    sls_xml = SLSxml.xml_doc()
+    sls_xml.set_id( 'PandaServer_%s' % ( host ) )
+    sls_xml.set_shortname( 'PandaServer monitoring service at %s' % ( host ) )
+    sls_xml.set_fullname( 'PandaServer monitoring service at %s' % ( host ) )
+    sls_xml.set_availability( str( server_avail ) )
+
+    sls_xml.add_data( "AddProcesses", "Number of processes for DQ2+LFC registration",
+                      str( add_processes ) )
+    sls_xml.add_data( "HoldingJobs", "Number of holding jobs to be registered",
+                      str( num_holdings ) )
+    sls_xml.add_data( "RegistrationTime", "Average time for DQ2+LFC registration in second",
+                      str( ave_regtime ) )
+    sls_xml.add_data( "RegistrationTimeDQ2", "Average time for DQ2 registration in second",
+                      str( ave_regtimeDQ2 ) )
+    sls_xml.add_data( "DataVolumeUse", "Percent use of the local /data volume",
+                      str( data_used ) )
+    sls_xml.add_data( "VarVolumeUse", "Percent use of the local /var volume",
+                      str( var_used ) )
+
+    return sls_xml.print_xml()
+
+def make_bamboo( host ) :
+
+    if( options.debug ) : print "Creating the bamboo monitoring xml"
+
+    server_avail = bamboo_availability( host )
+
+    sls_xml = SLSxml.xml_doc()
+    sls_xml.set_id( 'PandaBamboo_%s' % ( host ) )
+    sls_xml.set_shortname( 'PandaBamboo monitoring service at %s' % ( host ) )
+    sls_xml.set_fullname( 'PandaBamboo monitoring service at %s' % ( host ) )
+    sls_xml.set_availability( str( server_avail ) )
+    return sls_xml.print_xml()
+
+def make_monitor( host ) :
+
+    if( options.debug ) : print "Creating the monitor monitoring xml"
+
+    errormes = False
+    messagetext = ''
+
+    http_avail = httpd_availability( host )
+    if( http_avail == '0' ) :
+        errormes = True
+        messagetext += "Error: web server on %s not working\n" % ( host )
+
+    squid_avail = squid_availability()
+    if( squid_avail == '0' ) :
+        errormes = True
+        messagetext += "Error: squid server on %s not working\n" % ( host )
+
+    panda_avail = panda_availability( host )
+    if( panda_avail == '0' ) :
+        errormes = True
+        messagetext += "Error: panda monitor on %s not working\n" % ( host )
+
+    http_processes = count_processes()
+
+    data_used = volume_use( 'data' )
+    var_used = volume_use( 'var' )
+
+    if( errormes ) :
+        error_mail( host, messagetext )
+
+    if( options.debug ) :
+        print 'web - %s, squid - %s, panda - %s' % ( http_avail, squid_avail,
+                                                     panda_avail )
+
+    sls_xml = SLSxml.xml_doc()
+    sls_xml.set_id( 'PandaMon_%s' % ( host ) )
+    sls_xml.set_shortname( 'PandaMonitor monitoring service at %s' % ( host ) )
+    sls_xml.set_fullname( 'PandaMonitor monitoring service at %s' % ( host ) )
+    sls_xml.set_availability( str( panda_avail ) )
+
+    #adding intervention by hand here
+    #sls_xml.add_intervention( "2011-01-16T20:00:00", "PT36H",
+    #                          "Panda services will be out for over a day due to database server changes." )
+
+    sls_xml.add_data( "HttpdAvailability", "Availability of the httpd server",
+                      str( http_avail ) )
+    sls_xml.add_data( "SquidAvailability", "Availability of the squid server",
+                      str( squid_avail ) )
+    sls_xml.add_data( "PandaAvailability", "Availability of the panda monitor",
+                      str( panda_avail ) )
+    sls_xml.add_data( "HttpProcesses", "Number of processes for the panda monitor",
+                      str( http_processes ) )
+    sls_xml.add_data( "DataVolumeUse", "Percent use of the local /data volume",
+                      str( data_used ) )
+    sls_xml.add_data( "VarVolumeUse", "Percent use of the local /var volume",
+                      str( var_used ) )
+    return sls_xml.print_xml()
+
+def httpd_availability( host ) :
+    url = 'http://%s.cern.ch/robots.txt' % ( host )
+    return check_url( url, "go away" )
+
+def squid_availability() :
+    command = '/usr/bin/squidclient -p 25980 cache_object://localhost/info'
+    return check_command( command, 'OK' )
+
+def panda_availability( host ) :
+
+    port = '25980'
+    baseurl = 'http://' + host + ':' + port + '/server/pandamon/query?'
+
+    reply = check_url( baseurl + 'isAlive', 'yes' )
+    if( reply != '100' ) : return '0'
+
+    return '100'
+
+    #The above is a simpler test of the python code, for now, until the
+    #panda monitor migration is more stable, and all network tweaks are
+    #in quattor, so things are stable on reboot/upgrade. Once that is
+    #true the below tests should be put back.
+
+    reply = check_url( baseurl + 'dash=prod', 'CERN:OK' )
+    if( reply != '100' ) : return '0'
+
+    reply = check_url( baseurl + 'dash=clouds', 'Cloud status' )
+    if( reply != '100' ) : return '0'
+
+    reply = check_url( baseurl + 'overview=incidents', 'Recorded incidents' )
+    if( reply != '100' ) : return '0'
+
+    reply = check_url( baseurl + 'dash=ddm', 'Space available' )
+    if( reply != '100' ) : return '0'
+
+    return '100'
+
+def server_availability( host ) :
+
+    tmp_url = '--no-check-certificate https://%s:25443/server/panda/isAlive' % ( host )
+    reply = check_url( tmp_url, 'alive=yes' )
+    if( reply != '100' ) : return '0'
+
+    return '100'
+
+def bamboo_availability( host ) :
+
+    tmp_url = 'http://%s:25070/bamboo/bamboo/isAlive' % ( host )
+    reply = check_url( tmp_url, 'alive=yes' )
+    if( reply != '100' ) : return '0'
+
+    return '100'
+
+def check_url( url, check_string ) :
+    command = "wget -q -O - " + url
+    return check_command( command, check_string )
+
+def check_command( command, check_string ) :
+
+    if( options.debug ) :
+        print "Checking command : %s" % ( command )
+        print "For string : %s" % ( check_string )
+
+    tmp_array = command.split()
+    output = subprocess.Popen( tmp_array, stdout=subprocess.PIPE ).communicate()[0]
+
+    if( re.search( check_string, output ) ) :
+        if( options.debug ) : print "Found the string, return 100"
+        return '100'
+    else :
+        if( options.debug ) : print "String not found, return 0"
+        return '0'
+
+def count_processes() :
+    output = subprocess.Popen( ['ps', 'aux'], stdout=subprocess.PIPE ).communicate()[0]
+    count = 0
+    for line in output.split( '\n' ) :
+        if( re.match( 'atlpan', line ) ) :
+            if( re.search( 'http', line ) ) :
+                count += 1
+    return count
+
+def count_add_processes() :
+    output = subprocess.Popen( "pgrep -f add.py",
+                               stdout=subprocess.PIPE,shell=True).communicate()[0]
+    count = 0
+    for line in output.split( '\n' ) :
+        line = line.strip()
+        if line == '':
+            continue
+        count += 1
+    return count
+
+def count_holdings() :
+    output = subprocess.Popen("ls /data/atlpan/srv/var/log/panda/ | egrep '(finished|failed)'",
+                              stdout=subprocess.PIPE,shell=True).communicate()[0]
+    count = 0
+    for line in output.split( '\n' ) :
+        line = line.strip()
+        if line == '':
+            continue
+        count += 1
+    return count
+
+def registration_time(timeSlice=False,onlyDQ2=False) :
+    aveRegTime = '0.0'
+    try:
+        if onlyDQ2:
+            com = "grep registraion /data/atlpan/srv/var/log/panda/panda-Adder.log | grep DQ2 | grep -v LFC"
+        else:
+            com = "grep 'LFC+DQ2' /data/atlpan/srv/var/log/panda/panda-Adder.log"
+        if not timeSlice:
+            com += ' | tail -1000'
+        output = subprocess.Popen(com,stdout=subprocess.PIPE,shell=True).communicate()[0]
+        regtimeMap = {}
+        for line in output.split('\n'):
+            try:
+                items = line.split()
+                timestamp = items[1][:2]
+                regtime = float(items[-2])
+                if not regtimeMap.has_key(timestamp):
+                    regtimeMap[timestamp] = {'totalTime':0.,'totalReg':0}
+                regtimeMap[timestamp]['totalTime'] += regtime
+                regtimeMap[timestamp]['totalReg'] += 1
+            except:
+                pass
+        timestamps = regtimeMap.keys()
+        if timeSlice:
+            timestamps.sort()
+            for timestamp in timestamps:
+                print "%s %4.1fsec" % (timestamp,regtimeMap[timestamp]['totalTime']/float(regtimeMap[timestamp]['totalReg']))
+        else:
+            totalTime = 0.
+            totalReg = 0
+            for timestamp in timestamps:
+                totalTime += regtimeMap[timestamp]['totalTime']
+                totalReg += regtimeMap[timestamp]['totalReg']
+            if totalReg > 0:
+                aveRegTime = '%4.1f' % (totalTime/float(totalReg))
+    except:
+        errtype,ervalue = sys.exc_info()[:2]
+        print "ERROR : %s:%s in registration_time" % (errtype,ervalue)
+    return aveRegTime
+
+def volume_use( volume_name ) :
+    command = "df -Pkh /" + volume_name
+
+    tmp_array = command.split()
+    output = subprocess.Popen( tmp_array, stdout=subprocess.PIPE ).communicate()[0]
+
+    for line in output.split( '\n' ) :
+        if( re.search( volume_name, line ) ) :
+            used_amount = re.search( r"(\d+)\%", line ).group(1)
+
+    return used_amount
+
+def error_mail( host, message ) :
+
+    mail_cmd = []
+    mail_cmd.append( 'mail' )
+    mail_cmd.append( '-s' )
+    mail_cmd.append( 'Problems with %s' % ( host ) )
+    mail_cmd.append( 'douglas@cern.ch' )
+
+    text = "Problems with %s :\n\n" % ( host )
+    text += message
+
+    p = subprocess.Popen( mail_cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE )
+    p.stdin.write( text )
+    p.stdin.close()
+
+
+#run program
+__main__()
diff --git a/current/templates/panda_server-merge.sh.exe.template b/current/templates/panda_server-merge.sh.exe.template
new file mode 100755
index 000000000..6acf67c5f
--- /dev/null
+++ b/current/templates/panda_server-merge.sh.exe.template
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# setup grid stuff
+source /opt/glite/etc/profile.d/grid-env.sh
+
+# import env vars from sysconfig
+source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
+
+python2.5 @@install_purelib@@/pandaserver/test/runMerger.py
diff --git a/current/templates/panda_server-priority.sh.exe.template b/current/templates/panda_server-priority.sh.exe.template
new file mode 100755
index 000000000..70363d85b
--- /dev/null
+++ b/current/templates/panda_server-priority.sh.exe.template
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# import env vars from sysconfig
+source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
+
+python2.5 @@install_purelib@@/pandaserver/test/prioryMassage.py
diff --git a/current/templates/panda_server-runRebro.sh.exe.template b/current/templates/panda_server-runRebro.sh.exe.template
new file mode 100755
index 000000000..24dfc91c7
--- /dev/null
+++ b/current/templates/panda_server-runRebro.sh.exe.template
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# setup grid stuff
+source /opt/glite/etc/profile.d/grid-env.sh
+
+# import env vars from sysconfig
+source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
+
+python2.5 @@install_purelib@@/pandaserver/test/runRebro.py
diff --git a/current/templates/panda_server-sysconfig.rpmnew.template b/current/templates/panda_server-sysconfig.rpmnew.template
new file mode 100644
index 000000000..7d4d5f482
--- /dev/null
+++ b/current/templates/panda_server-sysconfig.rpmnew.template
@@ -0,0 +1,31 @@
+# Configuration file for the httpd service.
+
+OPTIONS="-f @@install_dir@@/etc/panda/panda_server-httpd.conf"
+
+# for FastCGI/WSGI
+#OPTIONS="-f @@install_dir@@/etc/panda/panda_server-httpd-FastCGI.conf"
+#HTTPD='/usr/sbin/httpd'
+
+# for DQ2
+export X509_CERT_DIR=/etc/grid-security/certificates
+export RUCIO_ACCOUNT=panda
+export RUCIO_APPID=pandasrv
+
+# panda home
+export PANDA_HOME=@@install_dir@@
+
+# timezone
+export TZ=UTC
+
+# import panda modules
+export PYTHONPATH=@@install_purelib@@/pandacommon:@@install_purelib@@/pandaserver
+
+# avoid using AFS
+export HOME=/home/atlpan
+
+# set user's proxy
+export X509_USER_PROXY=FIXME
+
+# panda server URLs
+export PANDA_URL='http://localhost:25080/server/panda'
+export PANDA_URL_SSL='https://localhost:25443/server/panda'
diff --git a/current/templates/panda_server-tmpwatch.sh.exe.template b/current/templates/panda_server-tmpwatch.sh.exe.template
new file mode 100644
index 000000000..40fbd2711
--- /dev/null
+++ b/current/templates/panda_server-tmpwatch.sh.exe.template
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# import env vars from sysconfig
+source @@install_dir@@/etc/sysconfig/panda_server-sysconfig
+
+python @@install_purelib@@/pandaserver/test/tmpwatch.py
diff --git a/current/templates/panda_server-vomsrenew.sh.exe.template b/current/templates/panda_server-vomsrenew.sh.exe.template
new file mode 100755
index 000000000..c4771655e
--- /dev/null
+++ b/current/templates/panda_server-vomsrenew.sh.exe.template
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+source /etc/profile.d/grid-env.sh
+
+NOVOMS=/data/atlpan/x509up_u25606_novoms
+
+voms-proxy-init -voms atlas:/atlas/Role=production -out /data/atlpan/x509up_u25606 -valid 96:00 -cert=$NOVOMS
+
+# check lifetime of certificate
+grid-proxy-info -e -h 504 -f $NOVOMS
+if [ $? -ne 0 ]; then
+    echo $NOVOMS expires in 3 weeks on `hostname` | mail -s "WARNING : Grid certificate expires soon on panda server" atlas-adc-panda-support@cern.ch
+fi
+
+# check lifetime of proxy
+voms-proxy-info -exists -hours 72 -file /data/atlpan/x509up_u25606
+if [ $? -ne 0 ]; then
+    echo /data/atlpan/x509up_u25606 expires in 3 days on `hostname` | mail -s "WARNING : Grid proxy expires soon on panda server" atlas-adc-panda-support@cern.ch,atlas-adc-expert@cern.ch
+fi
+
diff --git a/current/templates/panda_server.cfg.rpmnew.template b/current/templates/panda_server.cfg.rpmnew.template
new file mode 100644
index 000000000..f3274cec3
--- /dev/null
+++ b/current/templates/panda_server.cfg.rpmnew.template
@@ -0,0 +1,258 @@
+[server]
+
+
+##########################
+#
+# Logger parameters
+#
+
+# log directory
+logdir=@@install_dir@@/var/log/panda
+
+# logger name
+loggername = prod
+
+
+
+##########################
+#
+# Transaction parameters
+#
+
+# lock file for getJobs
+lockfile_getJobs = %(logdir)s/getJobs.lock
+
+# lock file for getSerialNumber
+lockfile_getSN = %(logdir)s/getSN.lock
+
+# lock file for accessing email DB
+lockfile_getMail = %(logdir)s/getMail.lock
+
+# lock file for updateDatasetStatus
+lockfile_setDS = %(logdir)s/setDS.lock
+
+# lock file for getCloudTask
+lockfile_getCT = %(logdir)s/getCT.lock
+
+# lock file for uuidgen
+lockfile_getUU = %(logdir)s/getUU.lock
+
+
+
+##########################
+#
+# DA parameters
+#
+
+# cache space
+cache_dir = @@install_dir@@/var/cache/pandaserver
+
+
+
+##########################
+#
+# DDM parameters
+#
+
+# dq2 dir
+dq2_dir = /opt/dq2
+
+# globus dir
+globus_dir = /opt/globus
+
+# path to native python
+native_python = /data/atlpan/bin
+
+# path to python for lfc client (/data/atlpan/bin/python cannot be used due to lack of libpythonX.Y.so)
+native_python32 = /usr/bin
+
+# glite source file
+glite_source = /opt/glite/etc/profile.d/grid-env.sh
+
+# location for Panda common
+pandaCommon_dir = @@install_purelib@@/pandacommon
+
+# location for Panda server
+pandaPython_dir = @@install_purelib@@/pandaserver
+
+# location for LFCclient
+lfcClient_dir = %(pandaPython_dir)s/brokerage
+
+# home dir to change CWD
+home_dir_cwd = /home/atlpan
+
+
+
+##########################
+#
+# Database parameters
+#
+
+# host
+dbhost = ADCR_PANDA
+
+# user
+dbuser = ATLAS_PANDA_WRITER
+
+# password
+dbpasswd = FIXME
+
+# database
+dbname = PandaDB
+
+# number of connections
+nDBConnection = 2
+
+# number of connections for FastCGI/WSGI
+nDBConForFastCGIWSGI = 1
+
+# use timeout
+usedbtimeout = True
+
+# timeout value
+dbtimeout = 300
+
+# verbose in bridge
+dbbridgeverbose = False
+
+# SQL dumper
+dump_sql = False
+
+
+
+##########################
+#
+# Panda server parameters
+#
+
+# port
+pserverport = 25443
+
+
+
+##########################
+#
+# proxy parameters
+#
+
+# http
+httpProxy = ""
+
+
+
+##########################
+#
+# E-mail DB parameters
+#
+
+# database name for local caching
+emailDB = %(logdir)s/email_db
+
+# SMTP server
+emailSMTPsrv = cernmx.cern.ch
+
+# sender address for notification
+emailSender = atlpan@cern.ch
+
+# login name for SMTP
+emailLogin = atlpan
+
+# login password for SMTP
+emailPass = FIXME
+
+
+
+##########################
+#
+# parameters for dynamic task assignment
+#
+
+# enable dynamic task assignment
+enableDynamicTA = True
+
+
+
+##########################
+#
+# parameters for redirection service
+#
+
+# enable redirection service
+enableRedirection = False
+
+
+
+##########################
+#
+# parameters for FastCGI/WSGI
+#
+
+# use FastCGI with flup
+useFastCGI = False
+
+# use WSGI without flup
+useWSGI = True
+
+# verbose in entry point
+entryVerbose = False
+
+
+
+##########################
+#
+# parameters for memcached
+#
+
+# use memcached
+memcached_enable = True
+
+# memcached servers
+memcached_srvs = voatlas248.cern.ch:11211,voatlas249.cern.ch:11211,voatlas250.cern.ch:11211,voatlas251.cern.ch:11211,voatlas252.cern.ch:11211,voatlas253.cern.ch:11211
+
+# expiration time in memcached
+memcached_exptime = 86400
+
+
+
+##########################
+#
+# nRunning parameters
+#
+
+# interval
+nrun_interval = 5
+
+# the number of hosts
+nrun_hosts = 3
+
+# serial number
+nrun_snum = 999
+
+
+
+##########################
+#
+# Cassandra
+#
+
+# use Cassandra for PandaCache
+cacheUseCassandra = False
+
+# ignore Cassandra error
+cacheIgnoreCassandraError = True
+
+# keyspace for PandaCache
+cacheKeySpace = PandaCacheKeySpace
+
+# column family for files
+cacheFileTable = FileTable
+
+
+
+##########################
+#
+# Job Status Monitor
+#
+
+# enable job status change monitoring
+record_statuschange = False
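
Once the @@install_dir@@ / @@install_purelib@@ placeholders are substituted at install time, panda_server.cfg is an ordinary INI file whose [server] keys (including the %(logdir)s interpolations) are read by the server at start-up. A minimal sketch of that pattern is shown below, assuming an installed path of /etc/panda/panda_server.cfg and a ConfigParser-based loader similar in spirit to pandaserver/config/panda_config.py; it is illustrative only, not code from this patch.

    # Illustrative sketch only -- not part of the patch; the file path and the
    # attribute-style access are assumptions about how [server] is consumed.
    import ConfigParser

    class _ServerConfig(object):
        """Holds each key of the [server] section as an attribute."""
        pass

    def load_server_config(path='/etc/panda/panda_server.cfg'):
        parser = ConfigParser.ConfigParser()
        parser.read(path)
        conf = _ServerConfig()
        # items() resolves %(logdir)s-style references within the section
        for key, val in parser.items('server'):
            setattr(conf, key, val)
        return conf

    if __name__ == '__main__':
        conf = load_server_config()
        # e.g. split the comma-separated memcached server list defined above
        print conf.logdir, conf.pserverport
        print conf.memcached_srvs.split(',')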