diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..8623cfc --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = Longbow +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/_static/table-overflow.css b/docs/_static/table-overflow.css new file mode 100644 index 0000000..d756b5c --- /dev/null +++ b/docs/_static/table-overflow.css @@ -0,0 +1,3 @@ +.wy-table-responsive table td { +white-space: normal; +} diff --git a/docs/changelog.rst b/docs/changelog.rst new file mode 100644 index 0000000..565b052 --- /dev/null +++ b/docs/changelog.rst @@ -0,0 +1 @@ +.. include:: ../CHANGELOG.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..bc1e2dd --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import os +import sys +sys.path.insert(0, os.path.abspath('../longbow')) + + + +# -- Project information ----------------------------------------------------- + +project = u'Longbow' +copyright = u'2018, James T. Gebbie-Rayet, Gareth B. Shannon, Charles A. Laughton' +author = u'James T. Gebbie-Rayet, Gareth B. Shannon, Charles A. Laughton' + +# The short X.Y version +version = u'' +# The full version, including alpha/beta/rc tags +release = u'' + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +'sphinx.ext.autodoc' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Longbowdoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Longbow.tex', u'Longbow Documentation', + u'James T. Gebbie-Rayet, Gareth B. Shannon, Charles A. Laughton', 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [ + (master_doc, 'longbow', u'Longbow Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Longbow', u'Longbow Documentation', + author, 'Longbow', 'One line description of project.', + 'Miscellaneous'), +] + +def setup(app): + app.add_stylesheet('table-overflow.css') diff --git a/docs/dev-contributing.rst b/docs/dev-contributing.rst new file mode 100644 index 0000000..8e31057 --- /dev/null +++ b/docs/dev-contributing.rst @@ -0,0 +1,14 @@ +Contributing +************ + +Contributions to Longbow are very welcome. To prevent things from becoming unwieldy for those of us that have to support the code base long-term we ask just a few things of our contributors. + +The first step in contributing is to clone the Longbow repository and then create a branch where you will work on your code/doc contribution. Please try to limit changes to one specific or set of related changes per branch, this way you can name your branch with something meaningful. If there is an existing issue in the github tracker covering your contribution, please name your branch with the issue number. + +Once you have written your code change, then please modify the documentation both in the source code and in the user documentation to reflect your changes. You should also update the unit tests or add more to test your changes. We have sections of this documentation detailing how to do local docs and test builds so that you can get these working before pushing back to github. + +You should make sure you add your name to the contributors section of the AUTHORS.rst file. 
+ +Then push your branch back to our repository and open a pull request for merger, your code will then be reviewed by us and if everything above checks out and your feature is in the interest of our userbase then it will be merged for the next release. + +That is all there is to it. If you are unsure about your idea, then please do contact us either through the issue tracker or via email and we will be happy to discuss it with you. diff --git a/docs/dev-documenting.rst b/docs/dev-documenting.rst new file mode 100644 index 0000000..93c581e --- /dev/null +++ b/docs/dev-documenting.rst @@ -0,0 +1,25 @@ +Documenting +*********** + +Developers wishing to add to/modify existing features of Longbow are responsible for documenting them. As is similar with missing unit tests, pull requests that arrive with an absence of documentation for the addition/modification will be rejected upon review. The Longbow project should have good documentation both at a source code level and in the written documentation that goes out to users. The Longbow project is using Sphinx to compile its documentation, you will need to get some tools installed and try out building the documentation yourself locally before adding to them. The following process will show you how to achieve this: + +**1. Install the required packages** + +Before you can start documenting, you'll need some packages. So install these if you haven't already:: + + pip install --user sphinx sphinx-autobuild sphinx_rtd_theme + + +**2. Try and make the documentation** + +The next step is to see if you can build the documentation html from the source. So change into the "docs" directory inside the Longbow source directory and run:: + + make html + +If everything has gone to plan then you should be able to now view the documentation in your web browser by navigating to the index.html file inside the "_build/html" directory. + +Adding to the documentation is easy, each page in the documentation has a .rst file associated with it. 
So to add documentation to an existing page then simply modify the relevant .rst file, if you are unsure which .rst file belongs to which page, then you can find out by looking at the index.rst table of contents and the titles at the top of each .rst file. + +The documentation simply uses reStructuredText format, not all features that are available in the reStructuredText will be available through Sphinx so it is best to use the Sphinx documentation for the reST format rather than the actual reST format documentation. This can be found at http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html + +happy documenting! diff --git a/docs/dev-integrating.rst b/docs/dev-integrating.rst new file mode 100644 index 0000000..debbb93 --- /dev/null +++ b/docs/dev-integrating.rst @@ -0,0 +1,56 @@ +Integrating Longbow +******************* + +Perhaps one of the most useful features of Longbow is the ease in which it can be utilised in other software. Longbow has been successfully applied in a range of other software products that require jobs to be offloaded onto high performance computing systems. + +There are a number of ways in which Longbow can be integrated into other software, and which method should be used is largely dependent on how Longbow is going to be utilised. The following sections will introduce some of the most popular ways to integrate Longbow into other software and some of the use cases that each method should apply to. + +Wrapping the Application +======================== + +The quickest and most stable way to integrate Longbow into another application is to wrap the executable itself. This is the most stable way to integrate Longbow when considered from a code change perspective, since it is unlikely that the way the executable is called will change, even if the API changes considerably. + +This method is extremely useful for use in scripts, where the user might wish to make use of the power of Longbow to launch jobs after some automated setup. 
This does not mean that this is unsuitable for integration within applications, however, there are a number of considerations that should be made before choosing this method: + +1. Configuration - you will need to provide a configuration file with at least the details of the HPC machines and some job defaults. This is useful in cases where a very loose level of integration is required, for example if these settings are not collected as part of the application that is being integrated with Longbow. If these parameters are intended to be collected (say in a GUI) then you would have to write out configuration files to pass any parameters you wish to change that are not available for configuration via the Longbow command-line. + +2. Exceptions - exceptions will be difficult to deal with in this way of integrating, what will happen is that Longbow will handle its exceptions internally and the executable will exit should an error occur that Longbow can't handle by itself, this means that exceptions won't percolate up from Longbow and into your application. This will create an extra layer of complications when trying to deal with errors. + +3. Logging - when linking the executable you will end up creating a log file by default either in the user home directory or inside the directory that Longbow is launched from. The only way you can capture the logging output is to turn on the logging to stdout and capture this output, on which you will likely need to some form of processing on to get this into a form you want. This means that you have very little control over the logging that comes out of Longbow. + +The best way to start learning to use Longbow for an integration is to start using it, the following examples will show how to call the Longbow application in scripts. 
Before you can do these examples, you should complete the following prerequisites: + +* Longbow should be installed as an application on the machine that the script/application subject to the integration is installed upon. + +* Longbow should be configured with at least 1 HPC resource, it should be verified that you can run at least one of the examples. + +**A Simple Python Script** + +This example will highlight the simplest example of calling and capturing the output of Longbow in a python script:: + + #!/usr/bin/env python + + import subprocess + + # Format a call to python subprocess. Here we are using a non-shell call so + # the executable is in the format of a list, the first entry in the list should + # be the longbow executable and the second entry the arguments to Longbow. We + # are also piping stdout so we can capture the output. + inst = subprocess.Popen(["longbow", "--help"], stdout=subprocess.PIPE) + + # Loop until the pipe is closed + while inst.poll() is None: + + # Extract each line. + line = inst.stdout.readline() + + # Output to the console, strip added newline chars. + print line.rstrip() + +Integration by API +================== + +The most flexible way to include Longbow into other software, is to integrate at an API level, this means that developers of other software can use Longbow in a seemless way without having to make external calls to the executable. There are many benefits to doing this, such as being able to create internal data structures directly, without having to firstly create Longbow configuration files, you can get access to the logging information and show this to users in a way you define and can interact with the Longbow exceptions. + +Over the next few months, this part of the documentation will be developed further. 
To get you started though, the easiest way to get going with integrating Longbow into your software, is to copy what the longbow() method is doing, for some developers simply calling this method using the "parameters" dictionary to override internal configuration will be all that is needed. But for others, a more fine-grained approach will be necessary. We will be adding examples of this to this section over the coming months. + diff --git a/docs/dev-plugins.rst b/docs/dev-plugins.rst new file mode 100644 index 0000000..041e9a3 --- /dev/null +++ b/docs/dev-plugins.rst @@ -0,0 +1,313 @@ +Writing Plugins +*************** + +The longbow plugins are the way we chose to incorporate software/scheduler specific features into an otherwise generically written library. The scope of this guide will cover how to create a whole new plugin within an existing category of plugins. The reason that creating new categories of plugins is not covered here is because to do so, the core of Longbow would have to be modified thus would not be update safe unless you submitted your changes to us and they get accepted into the core code. If making entirely new classes of plugins interests you, then please do get in touch so that we can sort out the details and support your plugins. + +Plugins are used within Longbow to extend its functionality and support to new scheduling platforms and to support new applications. In this section of the guide we will look at how to create new plugins for both new applications and new schedulers. If you do write a plugin or many plugins, and feel others would benefit from them, get in touch, we would be happy to add them into Longbow core, it doesn't matter how messy or incomplete your code is as we will help. + +Application Plugins +=================== + +Whilst it is not necessary to have an application plugin to launch Longbow jobs for a given application, they help when things go wrong. 
Application plugins help Longbow to understand what basic command-line arguments an executable needs, what the common executable names are and there are "hooks" from within Longbow core that allow basic functions to be called to check that files exist that are called inside input files on the command-line. + +If a plugin does not exist then Longbow will simply try and submit a job given the command-line it was launched with, if there are files missing, or there is a typo, this might not become apparent until a decent amount of time has been wasted following error messages. These plugins help to eliminate this process entirely, plugins are also useful for beginners just starting out with HPC as they can pick out some of the basic mistakes that these types of users make. + +To create a new applications plugin, follow these simple steps: + +Create a new plugin file. This file should be a python file and should be placed inside the apps directory of the longbow source directory, and should be named after the application (currently Longbow uses this name to derive its default module to load). + +Create the EXECDATA dictionary with the following format:: + + EXECDATA = { + "executable": { + "subexecutables": [], + "requiredfiles": [] + } + } + +Where you will replace "executable" with the name of the executable, if there are sub-executables such as those in the GROMACS packages then provide a comma separated list between the square brackets, and add a comma separated list of flags for required files so that Longbow can check they are provided at runtime. + +There are a number of special cases for the "requiredfiles" parameter: + + a. In cases where the command-line uses piped input or if the only argument is the input file, simply add "<" to the list of required files. + b. In cases where either one of a number of parameters can be given use the "||" operator between two parameters. + +A number of examples have been given below to illustrate the above process. 
+ +The EXECDATA dictionary provided as part of the AMBER plugin:: + + EXECDATA = { + "pmemd": { + "subexecutables": [], + "requiredfiles": ["-c", "-i", "-p"], + }, + "pmemd.MPI": { + "subexecutables": [], + "requiredfiles": ["-c", "-i", "-p"], + }, + "pmemd.cuda": { + "subexecutables": [], + "requiredfiles": ["-c", "-i", "-p"], + } + } + +The EXECDATA dictionary provided as part of the GROMACS plugin:: + + EXECDATA = { + "gmx": { + "subexecutables": ["mdrun", "mdrun_mpi"], + "requiredfiles": ["-s || -deffnm"], + }, + "gmx_d": { + "subexecutables": ["mdrun", "mdrun_mpi"], + "requiredfiles": ["-s || -deffnm"], + }, + "mdrun": { + "subexecutables": [], + "requiredfiles": ["-s || -deffnm"], + }, + "mdrun_d": { + "subexecutables": [], + "requiredfiles": ["-s || -deffnm"], + }, + "mdrun_mpi": { + "subexecutables": [], + "requiredfiles": ["-s || -deffnm"], + }, + "mdrun_mpi_d": { + "subexecutables": [], + "requiredfiles": ["-s || -deffnm"], + } + } + +The EXECDATA dictionary provided as part of the NAMD plugin:: + + EXECDATA = { + "namd2": { + "subexecutables": [], + "requiredfiles": ["<"], + }, + "namd2.mpi": { + "subexecutables": [], + "requiredfiles": ["<"], + }, + "namd2.cuda": { + "subexecutables": [], + "requiredfiles": ["<"], + } + } + +Adding new plugins in this fashion should provide an easy way to add support for new applications. We would like to encourage contributions from fields other than computational biology so that we can start to increase our domain of support out of the box. + +Scheduler Plugins +================= + +To have Longbow run jobs on schedulers that are not supported out of the box, it is necessary to write plugins to tell Longbow how to submit to this new scheduling system and then do basic tasks such as query the status etc. Whilst we endeavour to make our best effort to support fully the main schedulers, new ones crop up all the time and users might find themselves needing to write plugins to make use of say a new local machine. 
+ +To get started creating a new scheduler plugin, you will first have to create a new python file within the schedulers directory of the Longbow install (usually will be in .local/lib/python2.7/site-packages/longbow/schedulers/). It is recommended that you name this file after the scheduler to make things easier to remember. Once you have done this, the following snippets of code will explain how to build up the plugin. + +Firstly copy and paste the following block of code at the top of your newly created python file:: + + # Imports should go here + import os + + # A query to the environment that will test positive for + # this scheduler + QUERY_STRING = "unique query here." + +You'll notice that there is a reserved place at the top for imports, as you are building up your plugin and need to import modules, then please add these here, this will keep things tidy should things go wrong. + +Next up is the "QUERY_STRING" parameter. This should be a bash query that enables Longbow to detect the scheduler within the linux environment, usually the scheduler will have created many different environment variables so you should normally be able to build this with 'env' and 'grep'. For example, the PBS query string is "env | grep -i 'pbs'". + +**The delete job function** + +Next up is the function to allow Longbow to kill jobs. Copy and paste the following block below what you have done from above:: + + def delete(job): + """A Method for deleting a single job.""" + jobid = job["jobid"] + + # Try and delete the job, otherwise raise job delete exception. + try: + + shellout = shellwrappers.sendtossh(job, ["bkill " + jobid]) + + except exceptions.SSHError: + + raise exceptions.JobdeleteError("Unable to delete job.") + + return shellout[0] + +The above code block contains the code for a delete function, Longbow will pass this function a job dictionary with all of the parameters for that current job. 
Usually though, for most schedulers, deleting simply requires the jobid in a simple bash kill command. The simplest way to do this is to use the above example, and modify the '"bkill " + jobid' part of the delete command to use the syntax of how you would normally delete a job in a command terminal window. + +**The prepare script function** + +The next step is to create the function that will allow Longbow to write job submit files for this new scheduler. Copy the following code block below what you have already done from above:: + + def prepare(job): + """Create the LSF jobfile ready for submitting jobs""" + + # Open file for script. + lsffile = os.path.join(job["localworkdir"], "submit.extension") + jobfile = open(lsffile, "w+") + + # Write the script + jobfile.write("#!/bin/bash --login\n") + + # Your code here. + + # Append submitfile to list of files ready for staging. + job["subfile"] = "submit.extension" # IMPORTANT + +This method is slightly more tricky, we have included the bioler-plate for creating the submit file and then appending it to the job data structure. You will need to do several things here, firstly you can change the extension in "submit.extension" to match that of the scheduler name for example, submit.pbs or submit.sge etc. Then you will need to add in the logic to create your submission files where the text "# Your code here." appears. The best way to write one of these functions is to firstly look at the existing plugins for other schedulers, then grab one of your previously made job submit scripts and start to pull out the key parts, such as the scheduler directives and then the submission part. You will find that by using existing plugins, your own submit scripts and the documentation for the Longbow data structures will easily allow you to write this part. + +**The job status function** + +Next up is the method to allow Longbow to grab the status of a job. 
Copy and paste the following code block below what you have done from above:: + + def status(job): + """Method for querying job.""" + + # Dictionary of states a job can take in the scheduler, + # mapped onto Longbow states. + states = { + "DONE": "Job Exited Properly", + "EXIT": "Job Exited in Error", + "PEND": "Queued", + "PSUSP": "Suspended", + "RUN": "Running", + "SSUSP": "Suspended", + "UNKWN": "Unknown Status", + "USUSP": "Suspended", + "WAIT": "Waiting for Start Time", + "ZOMBI": "Zombie Job" + } + + # Initialise job state to blank. + jobstate = "" + + # Query the job state + shellout = shellwrappers.sendtossh(job, ["bjobs -u " + job["user"]]) + + # Scheduler will return a table, so split lines into a list. + stdout = shellout[0].split("\n") + + # Loop over jobs in table. + for line in stdout: + + # Split each line into its columns. + line = line.split() + + # If the job id of our job is present in column 0. + if len(line) > 0 and job["jobid"] in line[0]: + + # Read the jobstate from column 2 and exit loop. + jobstate = states[line[2]] + break + + # If jobstate is still blank, then it must have finished. + if jobstate == "": + + jobstate = "Finished" + + return jobstate + +The code above gives a good example of how to get the status from the scheduler, this code was taken from the LSF plugin already supplied with Longbow, you will have to modify this to work with your scheduler. A few important points to note: + +1. The states dictionary, will need to be updated to reflect the states that your new scheduler uses, the left hand column containing "PEND" and "RUN" are the scheduler states, and those on the right are Longbow states. Currently, only the "Queued" and "Running" states are required, so all of the other states can in theory be omitted, although then Longbow would not be able to report on them, it is better to include them where possible. + +2. 
The following line:: + + shellout = shellwrappers.sendtossh(job, ["bjobs -u " + job["user"]]) + + +Will need to be modified, you will need to change the last part "bjobs -u " + job["user"] within the square brackets (important that the outer square brackets remain) to match the command you would normally type into your terminal to query all jobs running under your user id (the user query gives nicer and more generic output than per jobid). + +3. The following lines:: + + # If the job id of our job is present in column 0. + if len(line) > 0 and job["jobid"] in line[0]: + + # Read the jobstate from column 2 and exit loop. + jobstate = states[line[2]] + break + + +Will need to be modified to take account for any difference in how the data is returned by the scheduler. This code is assuming the job id appears in column 0 and that the state appears in column 2, these will both have to be corrected if this is not the case. + +**The job submit function** + +Next up is the method Longbow will use to submit jobs to the scheduler. Copy the following block of code below what you have done from above:: + + def submit(job): + """A method to submit a job.""" + # command to change into working directory and then submit the job. + cmd = ["cd " + job["destdir"] + "\n", "bsub < " + job["subfile"]] + + try: + + # Send the submit command. + shellout = shellwrappers.sendtossh(job, cmd) + + except exceptions.SSHError as inst: + + # If we have hit a queue limit, raise a special exception to trigger + # subqueuing (not all machines will have this setup). + if "limit" in inst.stderr: + + raise exceptions.QueuemaxError + + # Otherwise raise a submission exception and attach error information. + else: + + raise exceptions.JobsubmitError( + "Something went wrong when submitting. The following output " + "came back from the SSH call:\nstdout: {0}\nstderr {1}" + .format(inst.stdout, inst.stderr)) + + try: + + # Do the regex to extract the job id. 
+ jobid = re.search(r'\d+', shellout[0]).group() + + except AttributeError: + + raise exceptions.JobsubmitError( + "Could not detect the job id during submission, this means that " + "either the submission failed in an unexpected way, or that " + "Longbow could not understand the returned information.") + + # Put jobid into the job dictionary. + job["jobid"] = jobid + +The above code block shows the basic layout of how a submit method should work. There are a number of ways this method can be adapted: + +1. Firstly the line:: + + cmd = ["cd " + job["destdir"] + "\n", "bsub < " + job["subfile"]] + + +Should be modified so that the second part with the bsub command, matches the command that your scheduler normally uses to submit jobs to its queue. + +2. If the machine that you are using, or you know the scheduler doesn't support queue slot limits, then you can remove the following block of code:: + + # If we have hit a queue limit, raise a special exception to trigger + # subqueuing (not all machines will have this setup). + if "limit" in inst.stderr: + + raise exceptions.QueuemaxError + + +and just raise the job submit error without an if/else. + +3. In the same way the code in point 2 was deleted, you can also add extra checks to this to check for common scheduler errors and raise the job submit exception with a custom error message. This is useful for example, if there is an obscure error that keeps tripping you up and forcing you to read the scheduler documentation to find out what it means. See the pbs plugin for examples of this. + +4. If the following line fails to extract the job id from what is returned:: + + jobid = re.search(r'\d+', shellout[0]).group() + +Then you will need to write your own parsing line. + +All of the above steps should get you well on your way to producing a new scheduler plugin, if any of the documentation above is not clear, or you need help then please get in touch for support through our support channels. 
+ + diff --git a/docs/dev-testing.rst b/docs/dev-testing.rst new file mode 100644 index 0000000..0223686 --- /dev/null +++ b/docs/dev-testing.rst @@ -0,0 +1,63 @@ +Testing +******* + +Developers contributing code to the Longbow base code will be expected to provide unit tests for their contributions and have these passing at time of merger, pull requests with failing tests will not be accepted. It is also strongly suggested to gain as much testing coverage as possible for contributions, as poor coverage will also lead to rejected contributions. + +The unit tests are run via Travis CI automatically upon commits and pull requests to assist maintainers with assessing whether code contributions represent the quality and stability our users deserve. + +Developers can rely on the Travis tests if they like, but this can often mean a lot of little tweaks need commiting to GitHub branches before they are ready. It is often a better idea to implement local testing with your installed Python toolchain and then push to GitHub and have Travis do the multi-version based testing. + +A convenient way to set up a simple test environment locally is to implement the following recipe. + +**1. Install testing tools** + +To start testing, you'll need some tools, so first thing is to get these if you haven't already. + +unit testing:: + + pip install --user pytest + +mock:: + + pip install --user mock + +code coverage:: + + pip install --user coverage + +beautify output (optional):: + + pip install --user pytest-sugar + +**2. Make testing script to copy source and launch test suite** + +Next, you will need a way to run your tests without disturbing your pristine source code. The simplest way to do this is have a simple script copy and launch your tests. 
To do this create a bash script:: + + nano ~/.local/bin/test-longbow + +and add:: + + #!/usr/bin/env bash + + # copy source to user home directory + cp -r /path/to/your/longbow-source ~ + + # change path to longbow source + cd ~/Longbow + + # run tests and report coverage + coverage run --source longbow -m py.test + coverage report -m + + # after testing, clean up + cd .. + rm -rf ~/Longbow + +after saving, make it executable:: + + chmod +x ~/.local/bin/test-longbow + +that should be it. You should simply be able to run "test-longbow" and see the unit testing suite run its tests locally on your machine. This will give details of the coverage report so that you can see lines that are not covered by testing and details of any tests that fail as a result of your changes. Failing tests are not always a bad sign, you may have altered core functionality to fix a bug that is currently passing an existing test, you should then fix the existing tests to test your new code. + +That's it, happy coding..... 
+ diff --git a/docs/images/cosec.jpg b/docs/images/cosec.jpg new file mode 100644 index 0000000..0552bb4 Binary files /dev/null and b/docs/images/cosec.jpg differ diff --git a/docs/images/hecbiosim.jpg b/docs/images/hecbiosim.jpg new file mode 100644 index 0000000..9f0e0a3 Binary files /dev/null and b/docs/images/hecbiosim.jpg differ diff --git a/docs/images/nottingham.png b/docs/images/nottingham.png new file mode 100644 index 0000000..f7614b6 Binary files /dev/null and b/docs/images/nottingham.png differ diff --git a/docs/images/priority.png b/docs/images/priority.png new file mode 100644 index 0000000..4fe080f Binary files /dev/null and b/docs/images/priority.png differ diff --git a/docs/images/stfc.jpg b/docs/images/stfc.jpg new file mode 100644 index 0000000..f3f8325 Binary files /dev/null and b/docs/images/stfc.jpg differ diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..e2d7685 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,70 @@ +Longbow Documentation +************************** + +**Longbow is a lightweight, simple and intuitive remote job submission utility.** + +The philosophy behind Longbow is that it should be easy to run simulations on a High Performance Computing (HPC) machine from your desktop and have the results delivered on a plate. No more SSH, writing job submission scripts, SFTP, ... + +Longbow is geared towards lowering the barriers surrounding simulation with HPC resources for non-traditional users and early-career scientists. + +Longbow works by creating job submission scripts for you, these are customised for the job scheduling system of the HPC machine you wish to use. Longbow sends this script and input files from the local working directory of your desktop computer to the remote HPC machine; and subsequently submits the job. Whilst the job is running, Longbow can periodically bring the results back to your local computer, bringing the power of HPC to your desktop... 
+ +Licensing ++++++++++ + +Longbow is released under the BSD 3-clause license. A copy of this license is provided when Longbow is downloaded and installed. + +Citing +++++++ + +If you make use of Longbow in your own code or in production simulations that result in publishable output, then please reference our paper: + +Gebbie-Rayet, J, Shannon, G, Loeffler, H H and Laughton, C A 2016 Longbow: A Lightweight Remote Job Submission Tool. Journal of Open Research Software, 4: e1, DOI: http://dx.doi.org/10.5334/jors.95 + +.. _support: + +Support ++++++++ + +Support for any issues arising from using Longbow, whether these are questions, to report a bug or to suggest new ideas. You should use the Longbow issue tracker here: https://github.com/HECBioSim/Longbow/issues + +.. toctree:: + :maxdepth: 2 + :caption: User Documentation: + + usr-installation + usr-getting-started + usr-configuration + usr-running-jobs + usr-troubleshooting + changelog + +.. toctree:: + :maxdepth: 2 + :caption: API Documentation: + +.. toctree:: + :maxdepth: 2 + :caption: Developers Documentation: + + dev-integrating + dev-plugins + dev-contributing + dev-documenting + dev-testing + +Longbow has been brought to you by a collaboration between STFC and Nottingham university through the HECBioSim consortium (a part of CoSeC). + +.. image:: images/stfc.jpg + :width: 32% + :target: https://stfc.ukri.org +.. image:: images/nottingham.png + :width: 23% + :target: https://www.nottingham.ac.uk +.. image:: images/hecbiosim.jpg + :width: 27% + :target: http://www.hecbiosim.ac.uk +.. 
image:: images/cosec.jpg + :width: 15% + :target: https://www.scd.stfc.ac.uk/Pages/CoSeC.aspx + diff --git a/docs/usr-configuration.rst b/docs/usr-configuration.rst new file mode 100644 index 0000000..a825ef7 --- /dev/null +++ b/docs/usr-configuration.rst @@ -0,0 +1,467 @@ +Longbow Configuration +********************* + +**This section explains how to best make use of Longbow parameters.** + +To communicate, copy files and submit jobs on a remote resource, Longbow needs to be supplied with the details of the HPC machine. Also, to automatically create a job submission script to your specification, Longbow also needs to be supplied with details of the job such as the executable to run or the number of cores to use. In this documentation, these pieces of information are referred to as **parameters**. + +There are a number of ways that you can feed Longbow with these parameters, this section of documentation will explain in detail the ways and also the kinds of parameters that can be supplied to Longbow. + +Basic Principles of Configuration +================================= + +Longbow has been designed to be extremely flexible in terms of configuration, to keep things simple we have maintained a well-defined system of configuration that will assimilate information from a number of different sources. You are not limited to providing all parameters from one source either. + +This means that you can use a combination of input sources for such purposes as representing complex configuration patterns (facility specific configuration differences, or job type differences) or simply for the convenience of having a base configuration for quick job submission but that can be overridden easily if necessary. + +There are several sources from which Longbow will bring in configuration parameters, these are the command-line, job configuration files, host configuration files and Longbow internal defaults. 
Longbow will load up all parameters from these sources and then for each parameter make a decision which one to use based upon a simple but structured hierarchy of priority. + +This order of priority is outlined in the following diagram, it can be seen that the command-line has the highest priority. This means that a given parameter on the command-line would override any value provided for this parameter from any of the other sources. + +.. figure:: images/priority.png + :align: center + + A diagrammatic representation of the order of priority that the Longbow parameterisation streams take. + +Although Longbow has this well defined structure, not every parameter can be provided in each configuration source. For example, only a subset of the total set of parameters may be given on the command-line. For obvious reasons, there are not Longbow defaults for every single parameter either. This means that the user does have to provide certain information as a requirement in certain sources (for example the host file is always required). The following sections will discuss this in detail. + +The command-line +================ + +Provided that at least the host configuration file has been configured (**instructions for basic setup**) with at least the minimum required parameters, simple jobs can be launched from the command-line. A simple Longbow command-line looks like the following:: + + longbow [longbow arguments] executable [executableargs] + +The above command-line shows how a simple command-line job would be launched, the Longbow parameters should always come before the executable and its arguments. So looking at the first half of the Longbow command-line above, it is well known that you will use the program executable name first in the command-line (in this case "longbow"). + +The "longbow arguments" are how we configure Longbow from the command-line, we do this by providing flags for Longbow to look for. 
There are two categories of flags, those that aren't related to running jobs (such as --help) and those that are explicitly related to running jobs (such as --replicates). + +Starting with command-line flags that are not related to running jobs we have + +--about This flag will output information about Longbow to the console terminal. + +--examples This flag will download the Longbow example set, you can control where these are placed by navigating in your terminal to the desired location before launching. + +--help This flag will output the Longbow help to the console, this is useful for quick command-line flag look-up. + +--version This flag will output the current version of Longbow that is installed on your system. + +The above command-line flags are designed to be run alone, not in conjunction with any other flags. They will typically perform the function described above and then Longbow will exit. An example of running with one of the above flags, would be to query which version of Longbow is installed on your machine, to do this one simply opens a terminal and types:: + + longbow --version + +The following list contains the command-line flags that are explicitly related to running jobs + +--debug This flag will trigger the output of debugging information to both your log file and the console terminal. Should only be used when requesting support. + +--disconnect This flag will activate dis-connect mode **link**. + +--hosts [/path/to/file] + + This flag will make Longbow use the host file and path specified and not the default ~/.longbow/host.conf. If only a file name is given and not a full path then longbow will search the current working directory and then the ~/.longbow directory in that order for the named file, it will use the first one it discovers. + +--job [/path/to/file] + + This flag will make Longbow look for and load parameters from a job configuration file. Note that a job configuration file is not strictly necessary to use Longbow. 
If the filename is provided but not the path, Longbow will search in the directory that you launch your experiment from (the current working directory) and then the directory in which the longbow executable resides if necessary. There is no default filename that is assumed for the job configuration file. + +--jobname [jobname] + + This flag will set the job name. This will NOT override the same parameters in configuration files making it an exception to the usual parameter hierarchy. +--log [/path/to/file] + + If this flag is set then the log file will be written to the specified path. If the filename is provided, but not the path, the file will be output to the current working directory. If this flag is not included, a default file called log will be output to the current working directory. + +--maxtime [HH:MM] + + This flag will override the walltime for each job. +--nochecks This flag will disable checks that are performed on the application availability on the remote HPC machine. This is for cases where the path to the executable is too complex, such that Longbow has a hard time trying to find it but you are certain that it should work. + +--recover [/path/to/file] + + This flag will start the recovery of a failed or disconnected Longbow session. Longbow will save recovery files into the ~/.longbow directory with a date and time stamp in the file name, you should supply the path to this file to initiate the recovery and continuation of the session **link** + +--resource [resource name] + + This flag specifies which HPC machine in the host configuration file to send the job to. This will overrule the same parameters in any configuration files. + +--replicates [number] + + This flag specifies the number of replicate jobs to run. This will overrule the same parameters in any configuration files. 
+ +--verbose This flag, will turn on logging to the console terminal in addition to the log file, this is useful in cases where you are running Longbow on a desktop computer and wish to monitor the progress live rather than from file. Longbow is set to only log to file by default, so that it can be used in conjunction with local batch queue systems without duplicate output. + +Now we have seen the Longbow configuration side of the command-line all that remains is the executable side of the command-line:: + + longbow [longbow arguments] executable [executableargs] + +This is simply the command-line of the program that you would normally use if you were not using it in conjunction with Longbow. In the above example the parameter "executable" can be one of pmemd, pmemd.MPI, charmm, namd2, mdrun, mdrun_d, mdrun_mpi, mdrun_mpi_d, lmp_xc30 (Longbow can be extended to work with other software **link**). The parameter "executableargs" should be replaced with the arguments that you wish to supply to your MD program, for example + +for charmm:: + + "<" example.inp ">" output + +or for amber:: + + -i example.in -c example.min -p example.top -o output for Amber + +The executableargs must immediately follow the executable on the Longbow command line. So putting all of this together, we saw in the **Running Longbow Examples** section an example of the executable and executableargs parameters being provided on the longbow command line. The example below is similar but demonstrates how the user can specify which remote resource to use using the --resource flag:: + + longbow --resource archer-short --replicates 5 --jobname TestJob pmemd.MPI -O -i test.in -c test.min -p test.top -o test.out + +Longbow will submit the replicate job named "TestJob" to ARCHER. 
If a job is submitted without a job configuration file and also without the --jobname flag being set then the job name will default to "Longbowjob" and will show up as such if you query the batch queue system of the remote resource. + +The Job Configuration File +========================== + +Compared to the host configuration file (discussed in the next part), the job configuration file is designed to be changed frequently, perhaps for each job submitted. The idea is that the job configuration file will overrule parameters in the host configuration file on a parameter by parameter basis. For example, a user might have a large set of jobs that fit into different categories of resource requirement (small, medium and large), this user could then have generic job configuration files to set up the number of cores and wall time requirements of those three categories of job. The job configuration file allows the user to overrule parameters listed in the host configuration file. + +A job configuration file is not necessarily needed as demonstrated in the **Running Longbow Examples** section of this guide. However, not all parameters that may change on a job by job basis can be provided on the command line, and the user may not wish to change the defaults they have specified in the host configuration file. The job configuration file was introduced to deal with such issues or prevent the need for writing lengthy commands such as:: + + longbow --cores 48 --maxtime 96:00 --memory 20 --polling-frequency 60 pmemd.MPI -i example.in -c example.min -p example.top -o output + +Instead, these parameters can be provided in the job configuration file like this:: + + [example] + resource = Archer + maxtime = 01:00 + cores = 48 + polling-frequency = 120 + remoteworkdir = /work/myproject/myproject/myusername/longbow + +Where the jobname appears within the square brackets "[ ]" and each parameter and value is listed under this jobname. 
Multiple jobs can be specified by including multiple sections, each starting with a jobname and then listing its parameters (more on this here **link**). Once the job configuration file has been prepared the above command-line shrinks down to:: + + longbow --job /path/filename + +If the filename is provided but not the path, Longbow will search in the (local) directory that you launch your experiment from (i.e. the working directory on your desktop) and then the directory in which the longbow executable resides if necessary. There is no default filename that is assumed for the job configuration file unlike with the host configuration file. + +Upon launch, the above job would run on Archer in /work/myproject/myproject/myusername/longbow/exampleXXXXX where XXXXX represents a generated 5-digit number. You see, the remoteworkdir parameter in the job configuration file would overrule that in the host configuration file. Note that the name of the job "example" is used as the subdirectory of remoteworkdir in which the job runs only with a random 5-digit number appended on the end. These random numbers are appended for all jobs regardless of the configuration methods used, this is to prevent jobs of the same name clashing on the remote resource. + +Any parameter listed in the **parameters** section can be included in the job configuration file with the exception of host and user because these are strongly tied to the HPC resource rather than the job. + +The Host Configuration File +=========================== + +The host configuration file is designed to be a reasonably static entity which contains the details to access remote HPC machines. In addition, any **parameters** the user deems would not change across jobs but across hosts or those that hardly change would be a good candidate to appear with the hosts. 
+ +Upon installing Longbow, a default host configuration file is created in the ~/.longbow/hosts.conf file and as such the --hosts longbow argument flag needn't be provided if you are using this file. Many users will wish to modify this file to reflect their HPC resource(s). However, some users may wish to specify their own file using the --hosts longbow argument flag on the command line when submitting jobs:: + + longbow --hosts /path/filename ..... + +If the filename is provided but not the path, Longbow will search in the directory that you launch your experiment from (i.e. the working directory), then the directory the longbow executable is stored and then the ~/.longbow directory. + +The host configuration file is structured in the below format which consists of sections in square brackets (Longbow uses this as the name of the HPC machine) followed by a list of named parameters and their values:: + + [Archer] + host = login.archer.ac.uk + user = myusername + remoteworkdir = /work/myproject/myproject/myusername/ + account = myproject-Surname + +Any parameter listed in the **parameters** section below can be included in the host configuration file with the exception of the parameter "resource". There are three parameters that must be included in the host configuration file otherwise an error will occur, these are + +user +host +remoteworkdir + +Of these three, only remoteworkdir can be also supplied in the job configuration file. Due to the hierarchy of parameters, if remoteworkdir, or any other common parameter is specified in both, the value in the job configuration file will be preferentially selected. + +On a final note, if Longbow is not told which remote resource to submit jobs to using the --resource flag on the command line or parameter of the same name in the job configuration file, Longbow will default to submitting jobs to the HPC machine that appears first in the host configuration file. 
+ +List of Configuration File Parameters +===================================== + +This section contains a list of parameters that may be used in either of the host or job configuration files, a small selection however should only be used in the host configuration file. + ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| account | If the HPC machine requires an account code (ARCHER does) supply it using this parameter or else jobs may be rejected. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| accountflag | Sometimes system administrators for whatever reason decide that they need to change the default for the account | +| | directive option. If this is the case then the user can specify what Longbow should supply with this parameter. | +| | Longbow defaults to -A for PBS, SGE and SLURM but for LSF will default to -P. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| cores | The total number of cores to request. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| corespernode | This parameter is important for Longbow to be be able to properly resource jobs and should be provided for all | +| | machines. Longbow has an internal default of 24 cores per node as this is currently a common configuration, however if | +| | the machine you are using differs then you should set it using this parameter in your host configuration file. You can | +| | normally find this information from the hardware section of your HPC machine webpages or ask their support staff. 
| ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| download-include | Exposes the rsync --include flag for downloads, these flags are used to get fine grained control over what is | +| | transferred using rysnc. Users should specify a comma separated list of files to include whilst simultaneously setting | +| | the download exclude parameter to all (download-exclude = \*) when making use of this parameter. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| download-exclude | Exposes the rsync --exclude flag for downloads, these flags are used to get fine grained control over what is | +| | transferred using rysnc. Users should either specify a comma separated list of files (black-listing) they wish to | +| | exclude from the download staging or set to all "*" in conjunction with providing a list of files to the | +| | download-include parameter listed above (white-listing). | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| email-address | This parameter allows the user to set an email address that will be written into the job submission script so that the | +| | scheduler can send an email on job completion. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| email-flags | This parameter allows the user to set the email flags that tell the scheduler how you want to receive emails about | +| | running jobs. You should use the format that you normally use in your job submission scripts ie PBS "email-flags = b" | +| | or for SGE "email-flags = beas". 
| ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| executable | The name of the executable to use on the HPC machine. Using this argument in conjunction with executableargs parameter | +| | is a good way of avoiding having to write long command-lines. A list of executables supported out of the box is: | +| | | +| | +-------------+---------------------------------------------------------+ | +| | | **Package** | **Executables** | | +| | +-------------+---------------------------------------------------------+ | +| | | AMBER | pmemd pmemd.MPI pmemd.cuda | | +| | +-------------+---------------------------------------------------------+ | +| | | CHARMM | charmm charmm_mpi charmm_cuda | | +| | +-------------+---------------------------------------------------------+ | +| | | GROMACS | gmx gmx_d mdrun mdrun_d mdrun_mpi mdrun_mpi_d | | +| | +-------------+---------------------------------------------------------+ | +| | | LAMMPS | lmp_xc30 lmp_linux lmp_gpu lmp_mpi lmp_cuda lmp | | +| | +-------------+---------------------------------------------------------+ | +| | | NAMD | namd2 namd2.mpi namd2.cuda | | +| | +-------------+---------------------------------------------------------+ | +| | | +| | New programs and/or executables can be added by following this guide **link** | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| executableargs | As an alternative to providing the arguments for the MD package on the Longbow command-line, these arguments can be | +| | specified in a configuration file. 
| +| | | +| | For example, a NAMD job could be submitted on your local machine in the following fashion:: | +| | | +| | longbow --verbose namd2 "<" example.in | +| | | +| | An equivalent way to submit this job would be to specify the arguments in the job configuration file, job.conf:: | +| | | +| | [myjob] | +| | .. | +| | executable = namd2 | +| | executableargs = example.in | +| | .. | +| | | +| | and then use the following command:: | +| | | +| | longbow --verbose --job job.conf | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| handler | This parameter enables users to specify the name of the job handler that should be used to run the parallel MD job | +| | (e.g. mpirun, aprun etc). If this parameter is not included, the code will automatically try to determine which | +| | handler is present on the HPC machine. A minority of users may wish to provide additional arguments to the executable | +| | (e.g. -np for mpirun) and can do so using this parameter. Simply use for example:: | +| | | +| | [Archer] | +| | .. | +| | handler = mpirun -np 16 | +| | .. | +| | | +| | **Note**, that for the aprun handler, the -n and -N flags are provided by default by Longbow. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| host | The address of the HPC machine. For example login.archer.ac.uk | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| localworkdir | Path to the directory on the desktop from which the job should be run if this should not be the current working | +| | directory. 
This is optional and will override where the input files required for the MD job are to be found and where | +| | the results files should be directed to (most users should ignore this unless there is a good reason). | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| lsf-cluster | For users of HPC machines that run an LSF scheduler, the cluster the job should be submitted to can be specified with | +| | this parameter. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| modules | The modules to be loaded on the remote resource using the "module load" command. A comma separated list can be entered | +| | here if multiple modules are required. For example, if you would normally have the following lines in your submission | +| | script:: | +| | | +| | module load intel-mpi | +| | module load charmm | +| | | +| | simply include the following in the job configuration file:: | +| | | +| | modules = intel-mpi, charmm | +| | | +| | If the modules parameter isn't specified, Longbow will try assume which modules are required according to the | +| | executable name. However, this only works if the executable supplied is supported by a plugin. 
Out of the box the | +| | following executables map onto the following module names by default | +| | | +| | +------------------------------------------------------+------------+ | +| | | **Executable** | **Module** | | +| | +------------------------------------------------------+------------+ | +| | | pmemd, pmemd.MPI, pmemd.cuda | amber | | +| | +------------------------------------------------------+------------+ | +| | | charmm, charmm_mpi, charmm_cuda | charmm | | +| | +------------------------------------------------------+------------+ | +| | | gmx, gmx_d, mdrun, mdrun_d, mdrun_mpi, mdrun_mpi_d | gromacs | | +| | +------------------------------------------------------+------------+ | +| | | lmp, lmp_xc30, lmp_linux, lmp_gpu, lmp_mpi, lmp_cuda | lammps | | +| | +------------------------------------------------------+------------+ | +| | | namd2, namd2.mpi, namd2.cuda | namd | | +| | +------------------------------------------------------+------------+ | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| maxtime | Maximum wall clock time, this will be used to tell the scheduler how long the job should last and will likely be | +| | kicked out of the queue if it overruns. This should be given in the format "HH:MM". Longbow will automatically add | +| | zero seconds onto your entry if your scheduler requires the format "HH:MM:SS". | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| memory | Integer representing the number of GB to be assigned to the scheduler memory directive in your submit script. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| mpiprocs | Allows undersubscription or to change mpiprocs freely without hacking the corespernode parameter. 
This is often needed | +| | to properly run LAMMPS SMP builds. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| polling-frequency | The interval for Longbow to query the status of a job/s, this is given in seconds and should not be set too small | +| | (not less than 60) otherwise the system admins may not like you. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| port | The port number if the remote resource is using an unusual port for ssh, Longbow defaults to 22 if nothing is given. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| queue | The queue the job should be submitted to on the remote resource. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| remoteworkdir | Name the working directory on the HPC machine. If it doesn't already exist Longbow will create it. Longbow will then | +| | use remoteworkdir as its staging area, so for each job a subdirectory will be created by Longbow in which the job will | +| | run. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| replicates | Number of desired replicates for job arrays (**see the Running Jobs section**). | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| replicate-naming | Users that wish to have different naming prefix for replicate directories than the default of rep1, rep2, rep3, ... 
| +| | can supply something like this in their configuration scripts:: | +| | | +| | replicate-naming = run | +| | | +| | and this will result in the directories run1, run2, run3, ... getting used. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| resource | This specifies the name of the HPC machine to use, which refers to the name given within the square brackets [] in the | +| | host configuration file. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| scheduler | This is the name of the job scheduling environment (PBS/LSF/SGE/SLURM) this can be used to force Longbow to use the | +| | logic for a given scheduler if the internal tests run by Longbow are struggling to identify the setup for your HPC | +| | machine. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| scripts | This parameter is for including scripts in the job submission script written by Longbow. The script/s must already be | +| | present on the HPC machine, Longbow will not transfer these for you, so any paths must be valid for the script path on | +| | the HPC machine. 
| +| | | +| | An example of using this in a configuration file is to include the Linux module initialisation script:: | +| | | +| | scripts = source /etc/profile.d/modules.sh | +| | | +| | and this will append this line into your job submission file like this:: | +| | | +| | #!/bin/bash --login | +| | #$ -cwd -V | +| | #$ -N single | +| | #$ -q gpu | +| | #$ -l h_rt=24:00:00 | +| | | +| | source /etc/profile.d/modules.sh | +| | | +| | module load apps/intel/gromacs/4.6.1 | +| | | +| | mpirun mdrun -s example.tpr -deffnm output | +| | | +| | Multiple scripts can be included by referencing a comma separated list of commands:: | +| | | +| | scripts = source /etc/profile.d/modules.sh, source /this/one.too | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| sge-peflag | This parameter is specific to users that are using machines with SGE. This parameter controls the name of the parallel | +| | environment that the job is asking for, by default this is set to "mpi" which yields the following line in your job | +| | submission script:: | +| | | +| | #$ -pe mpi 8 | +| | | +| | however if you have to provide something different here such as "gpu-env" then set this flag to that name in your host | +| | or job configuration file. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| sge-peoverride | This parameter is specific to users that are using machines with SGE. Users of some machines, possibly using | +| | accelerators, will want to use a single core. In this instance Longbow would not write out a line for the parallel | +| | environment directive "#$ -pe mpi 1", however some machines have been configured in a way that jobs fail if this is | +| | not given. 
So to provide this for cases with 1 core jobs then provide the following in your host or job configuration | +| | file:: | +| | | +| | sge-peoverride = true | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| slurm-gres | This parameter is specific to users that are using machines with slurm. Users of machines that have generic resources | +| | configured can use them by supplying the slurm-gres parameter in a job or host configuration file:: | +| | | +| | slurm-gres = gpu:1 | +| | | +| | results in the following being added to your job submit script generated by Longbow:: | +| | | +| | #SBATCH --gres=gpu:1 | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| staging-frequency | The frequency in seconds in which files should be synced between the remote and local machine. If the frequency should | +| | be the same as the polling frequency then leave this unset and it will default to the same. This parameter should not | +| | be set too small, especially if you are syncing large files otherwise you will be syncing constantly. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| stderr | This parameter will rename the stderr file that is created by the scheduling system. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| stdout | This parameter will rename the stdout file that is created by the scheduling system. 
| ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| subfile | Advanced users that use other tools to generate submission scripts but would like to take advantage of the staging and | +| | submission parts of Longbow can do so using the subfile parameter to give the existing submit file. This is for | +| | advanced users and workflow developers that understand the implications of doing this. You will still have to provide | +| | normal command-lines etc and go through all the checks and tests. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ +| user | Used to supply your user name on the HPC machine. This is the user name that you would normally use with SSH. | ++-------------------+------------------------------------------------------------------------------------------------------------------------+ + +Default values +============== + +To give users a solid starting point and to also minimise the number of false starts for new users, some parameters will have default values. These will be based on machine specific metrics for the UK national HPC - currently ARCHER. Below are listed some of the parameters that will have a default value and what that default is. 
+ ++-------------------+-------------------+----------+ +| **Parameter** | **Default Value** | **Unit** | ++-------------------+-------------------+----------+ +| cores | 24 | - | ++-------------------+-------------------+----------+ +| corespernode | 24 | - | ++-------------------+-------------------+----------+ +| jobname | LongbowJob | - | ++-------------------+-------------------+----------+ +| maxtime | 24:00 | hrs:min | ++-------------------+-------------------+----------+ +| polling-frequency | 300 | seconds | ++-------------------+-------------------+----------+ +| port | 22 | - | ++-------------------+-------------------+----------+ +| staging-frequency | 300 | seconds | ++-------------------+-------------------+----------+ +| replicates | 1 | - | ++-------------------+-------------------+----------+ + +A Worked Example +================ + +To demonstrate the interplay of configuration files, this section provides a demonstration of a PBS job submission script that is generated using a job configuration file and also the default host configuration file (~/.longbow/hosts.conf) and how that final submit file is assembled. 
+ +Host configuration file:: + + [myhpc] + host = login.archer.ac.uk + user = myusername + scheduler = PBS + handler = mpirun + corespernode = 16 + +Job configuration file:: + + [single] + resource = myhpc + executable = mdrun + remoteworkdir = /work/longbow + maxtime = 01:00 + cores = 32 + memory = 20 + queue = dev + executableargs = -s example.tpr -deffnm output + +Longbow command-line:: + + longbow --verbose --job job.conf + +Below is the resultant PBS submission script that is generated by Longbow, this and all associated job files will then be copied to /work/longbow/singleXXXXX on the remote resource (where XXXXX represents a random 5-digit number) and submitted to the scheduler:: + + #!/bin/bash --login + #PBS -N single + #PBS -q dev + #PBS -l select=2:ncpus=16:mpiprocs=16:mem=20gb + #PBS -l walltime=01:00:00 + + export PBS_O_WORKDIR=$(readlink -f $PBS_O_WORKDIR) + cd $PBS_O_WORKDIR + export OMP_NUM_THREADS=1 + + module load gromacs + + mpirun mdrun -s example.tpr -deffnm output + diff --git a/docs/usr-getting-started.rst b/docs/usr-getting-started.rst new file mode 100644 index 0000000..5158afb --- /dev/null +++ b/docs/usr-getting-started.rst @@ -0,0 +1,144 @@ +Getting Started +*************** + +**Longbow is designed with beginner users in mind** + +The best way to get started with using Longbow is to start out with the examples provided by us. This section will help you get going as quickly as possible, however whilst this section does enable you to get up and running quickly it is recommended that you read the later sections of this documentation to see the full range of features that are available. + +It is intended that if you are completely new to Longbow or for fresh installs, you should work through the following parts sequentially in order to get a feel for the steps involved in configuring Longbow. + +Create Password-less SSH +======================== + +Once you have selected the first HPC machine that you would like to use with Longbow. 
The first and most important part (if you don't wish to write your password hundreds of times) is to configure SSH to connect using keyfiles instead of a password. A handy guide has been written on this `here `_. + +Once you have done this, then you are ready for the next part, below. + +.. _add-hpc: + +Adding a HPC machine to Longbow +=============================== + +This part is going to explain how to get your first HPC machine configured ready for running jobs. We are going to assume that you have not configured this file before, users that have previously configured their hosts can simply skip this part and use their existing information. + +The first step is to make sure that the ~/.longbow/hosts.conf file got created during installation. You can do this by opening up a terminal and do:: + + ls ~/.longbow + +If you can see the hosts.conf file in the output of the above command then you can skip ahead to the next step, if however you see that the ~/.longbow directory is missing then you should create it by:: + + mkdir ~/.longbow + +Now we want to open up ~/.longbow/hosts.conf in our favourite text editor, here we will use nano. This step is valid for all users whether or not hosts.conf exists, as it will be created if it does not exist and for those that have it already, any contents will be replaced in the next step:: + + nano ~/.longbow/hosts.conf + +Now we want to add in the configuration for our HPC resource. In this example we will be adding configuration for an account on ARCHER but you can add something else in place of this. Longbow automatically chooses the HPC machine at the top of your hosts.conf if you do not specify one to use when running a job, so by keeping your favourite HPC machine at the top, this is where Longbow will run your jobs unless you specify otherwise. 
Copy and paste the following into your hosts.conf:: + + [Archer] + host = login.archer.ac.uk + user = myusername + remoteworkdir = /work/myproject/myproject/myusername/ + account = account-code + +Now to explain a little about the information here. + +The name of the HPC resource goes in the square brackets, this is important since we will use this name later when referring to resources to run on. + +You should then go through and edit the options underneath by replacing "myusername" with your login username on ARCHER, "myproject" with your project code (for example e280) on ARCHER and "account-code" with your account code (for example e280-Surname). + +That is it for the basic setup! There are lots more parameters that can be configured at the host level (see the **Longbow Configuration** and **Running Jobs** sections for more information). A good rule of thumb to decide where to use a parameter is if it doesn't change on a given HPC between jobs then you can put it in hosts.conf under the correct HPC machine. + +Download Longbow Examples +========================= + +Before getting started with running Longbow examples, you will need to download them. There are two ways to get these examples: + +1. On the command-line, change to the location where you wish to download the examples to and run:: + + longbow --examples + +2. Download them manually from `here `_ and unzip to a location of your choice. + +Quick Start Examples +==================== + +In the examples directory you extracted in the previous part you should find a "quick_start" directory. Here there are input files for five common MD packages. Change into your favourite one to run a simple MD job. The command-line for each is given below. 
+ +longbow-examples/quick_start/amber:: + + longbow --verbose pmemd.MPI -O -i example.in -c example.min -p example.top -o example.out + +longbow-examples/quick_start/charmm:: + + longbow --verbose charmm -i example.inp ">" example.out + +longbow-examples/quick_start/gromacs:: + + longbow --verbose gmx mdrun -s example.tpr -deffnm output + +longbow-examples/quick_start/lammps:: + + longbow --verbose lmp_xc30 -i example.in -l output + +longbow-examples/quick_start/namd:: + + longbow --verbose namd2 example.in ">" example.out + +or for SMP builds (NAMD v2.12+)*:: + + longbow --verbose namd2 +ppn 23 +pemap 1-23 +commap 0 example.in ">" example.out + +\*The parameters "cores" and "corespernode" must be set to "1" in your hosts.conf + +And that's it! Longbow should submit a job to the HPC machine specified at the top of ~/.longbow/hosts.conf. + +Notice that the above commands are similar to ordinary MD commands except the longbow executable precedes them. This is designed to make Longbow as intuitive to use as possible. + +For most users the job will run successfully first time. If your job does not, go to the :ref:`troubleshooting` or ask for :ref:`support`. + +A Simple Replicate Job Example +============================== + +Replicate jobs are convenient for submitting ensembles of jobs where the command-line for submission is identical for each job but either a number of different runs of the same files or slight variations of the input files are desirable. Replicates enables you to rapidly setup and launch large volumes of such simulations. + +The replicate job example can be found in 'longbow-examples/replicate_job'. This particular example is a replicate job consisting of 5 NAMD replicates. You will notice that the jobs are split over 5 different directories of the naming structure repx where x = 1:5. Each directory then contains a portion of the input files which could contain slightly different parameters/variables. 
This job is also showing how global files are used, these are files that are input files that are identical between each replicate and thus we can save on transfer time and disk space only having one copy. Longbow will detect such files placed at the same directory level as the repx directories and automatically handle them for you. + +To run this replicate job, you will notice it is not too much different from the simple NAMD example in the previous section. The difference being the --replicates flag to Longbow:: + + longbow --verbose --replicates 5 namd2 example.in + +or for SMP builds (NAMD v2.12+)*:: + + longbow --verbose --replicates 5 namd2 +ppn 23 +pemap 1-23 +commap 0 example.in ">" example.out + +\*The parameters "cores" and "corespernode" must be set to "1" in your hosts.conf + +Each of the replicates will have been submitted and run and their results downloaded into the correct directories. That's it you have run your first replicate job! + +Multijob Examples +================= + +Multi-jobs are the most flexible type of job Longbow offers, they are basically a fully customisable ensemble of jobs. The following two examples show the flexibility and power of using this type of job. These jobs use a job configuration file to gain control over each jobs parameters separately. + +Many Different Single Jobs +-------------------------- + +In the 'multiple-jobs/different_applications' directory, you can find a number of jobs that each use a different MD code and a job configuration script. This job configuration script allows us to provide parameters that differ on a per job basis, this means we can submit very different jobs to the same HPC machine all at once. + +In the 'longbow-examples/multiple-jobs/different_applications' run:: + + longbow --job job.conf --verbose + +Longbow will launch each job to the same HPC machine but for each one, will use the correct MD code. 
+ +Mixed Job Types +--------------- + +Have a bunch of simulations where some are replicates and some are simple single use jobs? Then you can mix these too. + +Change into 'longbow-examples/multiple-jobs/different_job_types' and run:: + + longbow --job job.conf --verbose + +You will notice that the command-line for multijobs looks identical for each use case, that was intentional! You can use this simple method to build extremely complex job workflows involving different input files, different codes, different HPC machines or different resource levels. diff --git a/docs/usr-installation.rst b/docs/usr-installation.rst new file mode 100644 index 0000000..4c738dd --- /dev/null +++ b/docs/usr-installation.rst @@ -0,0 +1,49 @@ +Installation +************ + +**Longbow is designed to be as simple as possible to install** + +The software has been written in vanilla python to be compatible with versions 2.6, 2.7, 3.2, 3.3, 3.4, 3.5 and 3.6 and has no other dependencies on other python packages. The result is that Longbow is very quick and simple to install. + +There are two ways to install Longbow, we recommend you use the pip method, however your circumstances may mean that you require other ways to install (permissions, no pip, no outbound connection, firewall, or testing a development build etc). Each method is detailed below. + +Installation with pip +--------------------- + +By far, the easiest method of installation is to use pip. To install via pip, simply open up a terminal window and type:: + + pip install longbow + +Better still, to avoid permissions problems or pollution of the system python libraries:: + + pip install longbow --user + +Test that the installation went ahead:: + + longbow --about + +If a welcome message is output then you have successfully installed Longbow! If you get an error go to the :ref:`installation-troubleshooting` section to help diagnose your problem. 
+ +Finally, notice that the installation process has created the directory ~/.longbow which contains a file called hosts.conf and will be used later to store recovery files. + +Installation with setup.py +-------------------------- + +If you don't/can't have access to pip on your computer then Longbow can be installed via its setup.py script. Before completing the installation, firstly you will need to download Longbow from `here `_ and then extract the archive. Upon extraction of the zip archive you will find a directory called "Longbow", change into this directory, within this directory you will see there is a python script in there called setup.py. Execute this script:: + + python setup.py install + +Better still, to avoid permissions problems or pollution of the system python libraries:: + + python setup.py install --user + +Test that the installation went ahead:: + + longbow --about + +If a welcome message is output then you have successfully installed Longbow! If you get an error go to the :ref:`installation-troubleshooting` section to help diagnose your problem. + +Finally, notice that the installation process has created the directory ~/.longbow which contains a file called hosts.conf and will be used later to store recovery files. + + + diff --git a/docs/usr-running-jobs.rst b/docs/usr-running-jobs.rst new file mode 100644 index 0000000..847ae49 --- /dev/null +++ b/docs/usr-running-jobs.rst @@ -0,0 +1,332 @@ +Running Jobs +************ + +**This section explains Longbow concepts for running jobs.** + +Running jobs with Longbow is designed to be as intuitive as possible. In fact, the command to submit a job using Longbow deliberately mimics that to run the MD code itself:: + + longbow [longbow arguments] executable [executableargs] + +Due to the way Longbow handles its command-line, in many cases, users can simply place the command "longbow" in front of the ordinary MD command-line to run their job using Longbow. 
In the following sections, guidelines to running the various types of jobs Longbow supports are outlined. Namely, single jobs, replicate jobs and multi-jobs. + +Single Jobs +=========== + +Single jobs are the simplest type of job that Longbow can run, these are the Longbow enabled equivalent of single jobs submitted to the batch system with the added benefit of having Longbow handle all staging and monitoring of jobs for you. + +For a simple CHARMM job that may be submitted with the following:: + + charmm -i example.inp + +The Longbow equivalent could be as simple as:: + + longbow charmm -i example.inp + +Generally users like to see output on the terminal they are using so our command-line becomes a tad longer:: + + longbow --verbose charmm -i input.inp + +Or with longbow arguments hosts, job and log files explicitly stated:: + + longbow --job /path/filename --log /path/filename --hosts /path/filename charmm -i example.inp + +That is all there is to it, the results files will appear in the working directory of your local machine as if the jobs had run there. There are examples of single jobs for all five supported packages covered in our **quick start section**. + +**Referring to other files within input files** + +Note that for a single job, Longbow would expect and require all executableargs input files to be in the current working directory where the job is being run from. Furthermore, Longbow requires just filenames to be provided, both on the Longbow command line in executableargs and within input files. DO NOT provide the paths to the files. The exception is when the user wishes to use a file that is already on the HPC machine as an additional input file. In this case the user should give the full path to the file. 
+ +Below is a CHARMM example demonstrating these points:: + + longbow --job /path/filename --log /path/filename --hosts /path/filename charmm -i example.inp + +Notice in the command-line that the path to example.inp has NOT been provided as this should be in the current working directory. But inside that input file we might have:: + + ... + # input file in the current working directory provided WITHOUT the path as required + read param card name par_all27_prot_lipid.prm + ... + # input file on the remote resource provided WITH the path as required + OPEN UNIT 1 CARD READ NAME /charmm/c34b2/toppar/top_all27_prot_na.rtf + ... + +The file supplied without its full path should reside in the current working directory along with your other job files and Longbow will detect it and stage it to the simulation directory on the HPC machine. However the file with the full path points to a path of a file that is already stored on the HPC machine, perhaps as part of a large library of common files in a database. + + + +Replicate Jobs +============== + +Replicate jobs are the Longbow equivalent of job arrays, they are useful for submitting larger numbers of jobs that have similar files and command-line structures. + +By default, replicate jobs have a very defined directory structure, the subdirectories must be of the format /repX where X is a number from 1 to N where N is the number of replicate jobs you wish to run. However the naming scheme can be changed such that different directory naming can be used to match your use case for example by supplying "replicate-naming = run" in a configuration file you can now have directories of the format runX where x is a number from 1 to N where N is the number of replicates. + +Longbow can also handle "global files", these are files that would be identical across all of the replicate jobs and thus would simply be duplicating files if they were to be transferred, this is a waste of disk space. 
So to prevent this wastage, Longbow allows files to be placed in the job parent directory (ie the same level as the repx directories). Longbow will then detect these files and automatically change the paths in the generated job submit files to point to the global ones. Global files can also act like overrides so if there is a file of the same name within the repX directory and parent directory then the parent directory file overrides the ones in the individual jobs. + +Furthermore, if all input files are "global", you have no need to create the repX directories at all, Longbow will generate them for you, but this is only useful if the input files are identical across all simulations. + +A real example of a similar structure for a NAMD replicate job can be found in the **quick start examples** section of this guide. An illustrative example of how such a job would be structured and its corresponding command-line can be seen below:: + + current working directory/ + solvated.pdb + solvated.psf + par_all27_prot_lipid.prm + /rep1 + example.in + relres.coor + relres.vel + relres.xsc + /rep2 + example.in + relres.coor + relres.vel + relres.xsc + /rep3 + example.in + relres.coor + relres.vel + relres.xsc + . + . + . + /repN + example.in + relres.coor + relres.vel + relres.xsc + +This job can be executed with a command of the form:: + + longbow --verbose -replicates N namd2 example.in + +**Referring to other files within input files** + +Some simulation codes allow files to be referenced from within the input files (the ones you gave on the command-line) and if Longbow is to work for this, it needs to detect and transfer those files to the HPC machine. You will also need to make sure you reference the paths correctly in these files based on how your job is configured. The following scenarios will show you how to do this for each different scenario. 
+ +**Input files in the repX subdirectories** + +Input files in the repX subdirectories should refer to files in the same directory by providing just the filename. On the other hand files in the job parent directory should be referred to using ../filename. + +Below is a modified extract from longbow-examples/ReplicateJob/rep1/example.in that demonstrates these points:: + + # files in the job parent directory (longbow-examples/replicate_job/) + structure ../solvated.psf + parameters ../par_all27_prot_lipid.prm + coordinates ../solvated.pdb + + # files in the rep1 subdirectory (longbow-examples/replicate_job/rep1) + binvelocities relres.vel + bincoordinates relres.coor + ExtendedSystem relres.xsc + +**Input files in job parent directory** + +Input files in the parent directory of the repX subdirectories (current working directory e.g. longbow-examples/replicate_job) can also refer to files in the same directory and in the repX subdirectories. Files in the same directory as the input file in question can be referred to by providing no path. Files in the repX subdirectories on the other hand can be referred to by ./repX/filename. 
+ +Below is a fictitious file that is not included in the example in longbow-examples/replicate_job, but is shown here just to demonstrate the principles just outlined. + +In longbow-examples/replicate_job/fictitiousfile.in:: + + # files in the job parent directory (longbow-examples/replicate_job/) + structure solvated.psf + parameters par_all27_prot_lipid.prm + coordinates solvated.pdb + + # files in the rep1 subdirectory (longbow-examples/replicate_job/rep1) + binvelocities ./rep1/relres.vel + bincoordinates ./rep1/relres.coor + ExtendedSystem ./rep1/relres.xsc + +**How to reference files on the remote resource** + +Files that are on the remote resource should be referred to in input files by providing the full path to the file, this differentiation in path types allows Longbow to make the distinction between intentional files missing locally and a user mistake (which would be reported accordingly):: + + ... + parameters /namdfiles/on/the/remote/resource/par_all27_prot_lipid.prm + ... + +Multi-Jobs +========== + +A powerful feature of Longbow is its ability to send multiple single and replicate jobs off to many different HPC machines with the execution of a single command. Two examples of this can be found in the Longbow examples. In those examples there is an example of running a single and replicate Amber job simultaneously and an example of running multiple applications. These illustrate just two use cases of this job type, in reality all kinds of things are possible here such as running portions of jobs on different HPCs, to using different accounts or queues etc. To run a multi-job, you simply include more than one job in a job configuration file. Below is the example taken from longbow-examples/multiple_jobs/different_job_types. 
+ +longbow-examples/multiple_jobs/different_job_types/job.conf:: + + [single] + resource = Archer + queue = short + executable = pmemd.MPI + maxtime = 00:10 + executableargs = -i example.in -c example.rst -p example.top -o example.out + + [replicate] + resource = Archer + queue = short + executable = pmemd.MPI + maxtime = 00:10 + executableargs = -i example.in -c example.rst -p example.top -o example.out + replicates = 5 + +The job directory structure would look like the following:: + + longbow-examples/multiple_jobs/different_job_types/ + job.conf + single/ + example.in + example.rst + example.top + replicate/ + example.rst + example.top + rep1/ + example.in + rep2/ + example.in + rep3/ + example.in + rep4/ + example.in + rep5/ + example.in + +This job is simply run by executing the following from the directory containing job.conf:: + + longbow --job job.conf --verbose + +Note that it is essential for the subdirectory names to be the same as the names of the jobs in the square brackets in the job configuration file, job.conf. Longbow can handle very large numbers of jobs, even if the HPC resource you are submitting to has a limit on how many jobs can be in the queue at any single time, in these cases Longbow will batch up the jobs and submit new ones as old ones finish so as to make full use of your individual queue limits. + +Supported Executables and Command-line Flags +============================================ + +Users should use the same command line flags and operators when running an MD package through longbow as they would normally. Below are the flags that are required by Longbow for each supported MD package. If those listed below are not provided Longbow will issue an error. + +**Amber** + +Executables: pmemd, pmemd.MPI, pmemd.cuda + +Amber command line flags: -i, -c, -p + +**CHARMM** + +Executables: charmm, charmm_mpi, charmm_cuda + +CHARMM command line flags: None are mandatory. The user must decide whether to use -i, <, ... 
+ +However, if using < on the command line, ensure that it is used in quotation marks (""). For example:: + + longbow charmm "<" input.inp + +**Gromacs** + +Executables: gmx, gmx_d, mdrun, mdrun_d, mdrun_mpi, mdrun_mpi_d + +Gromacs command line flags: -s or -deffnm + +**LAMMPS** + +Executable: lmp_xc30, lmp_linux, lmp_gpu, lmp_mpi, lmp_cuda, lmp + +LAMMPS command line flags: -i + +**NAMD** + +Executable: namd2, namd2.mpi, namd2.cuda + +NAMD command line flags: None are mandatory. An input file is expected to follow the executable: namd2 + +Supported Substitutions +======================= + +Longbow will detect input files such as topology files that need to be copied to the HPC machine along with the primary input file to the executable. Longbow does this by searching the primary input file for references to other files. Any additional input files that are found are also searched for references to input files in a recursive fashion until all input files are found. + +Longbow can detect when a user has performed a parameter substitution for input files either when provided on the command line as executableargs or within an input file itself. Below the substitutions that are supported are outlined package by package. 
+

**CHARMM**

Format of command line substitutions supported::

    longbow charmm myvar:myprot "<" example.inp
    longbow charmm myvar=myprot "<" example.inp

In-file substitutions supported::

    SET myvar = myprot
    OPEN UNIT 1 CARD READ NAME @myvar.pdb

and::

    SET myvar myprot
    OPEN UNIT 1 CARD READ NAME @myvar.pdb

**LAMMPS**

Format of command line substitutions supported::

    longbow lmp_xc30 -var myvar mydata -i example.in -l output
    longbow lmp_xc30 -v p myprot -i example.in -l output

In-file substitutions supported::

    variable myvar equal mydata
    read_data ${myvar}.data

and::

    variable p equal myprot
    coordinates $p.pdb

**NAMD**

In-file substitutions supported::

    set myvar = myprot
    ExtendedSystem $myvar.xsc

and::

    set myvar2 myparam
    parameters $myvar2.prm

**Amber**

Not currently supported.

**Gromacs**

Not currently supported.

Disconnectable Sessions
=======================

A useful feature is the ability for Longbow to disconnect itself shortly after submitting jobs off to the HPC machines. This is useful for people running Longbow on desktop/laptop computers that don't have the luxury of being able to keep a connection live for the duration of simulations. By supplying a simple flag --disconnect, this tells Longbow that you simply want to submit and forget your jobs.

Longbow will simply submit your jobs and then write out the details to a recovery file. By doing this, the user always has the option to reconnect to the session later to automatically download files or to continue polling if desired.
To initiate this feature one simply adds the --disconnect flag to the Longbow part of the command-line::

    longbow --verbose --disconnect --log new.log namd2 ">" output.out

Persistent Reconnect/Recover Sessions
=====================================

For recovering an intentionally disconnected Longbow session, or for the hopefully rarer occasions that Longbow for some reason crashes, be it due to a spate of network instability or simply rotten luck, Longbow has a recovery mode. This recovery mode is designed to reconnect Longbow with jobs that are running on the HPC.

Even if you know that all your jobs have managed to finish since Longbow crashed, you can still reconnect and have Longbow complete the final staging for you. This is particularly handy if you had many jobs running through your Longbow session.

To start Longbow in recovery mode, you will need to supply the following command-line::

    longbow --recover recoveryfilename

You do not need to provide the path to a recovery file as Longbow stores these in ~/.longbow so it knows where to find them. They will typically have the time stamp of when the Longbow session was started. Further inspection of the internals of the recovery file can confirm the job information to assist with choosing the correct recovery file (the filename will also appear in your logfile).

A small number of flags can be provided with the recover flag, such as the debug, verbose or the log flag. Often users will want to display the outcome of the recovery to their terminal to make sure the session is recovered, or to change the location of the logging to a new file such that if anything goes wrong they have all information at hand.
Here is an example of a user that wants to log to the screen to monitor the recovery, but also to log to a new file so there is a record of what went wrong in the original log file::

    longbow --verbose --log new.log --recover recoveryfilename

Update Disconnected Sessions
============================

For grabbing an update of job status and to download a snapshot of the current simulation output (can save transfer time at the end) an update mode is available. This mode will simply connect and grab the latest job status, and it will update the state of downloaded files. Also, if you have jobs that have been held back by Longbow due to queue slot limits, and jobs already submitted have finished running, then Longbow will submit these before disconnecting again. Once all jobs are finished and downloaded, then running this update mode will trigger the correct cleanup and exit procedure as if it was running in persistent mode.

To invoke this update mode, you simply need to provide the recovery file to the --update flag::

    longbow --update recoveryfilename



diff --git a/docs/usr-troubleshooting.rst b/docs/usr-troubleshooting.rst
new file mode 100644
index 0000000..7473c78
--- /dev/null
+++ b/docs/usr-troubleshooting.rst
@@ -0,0 +1,77 @@
+.. _troubleshooting:
+
+Troubleshooting
+***************
+
+If you are unable to find a solution to your problem listed here, then you should contact us for support.
+
+.. _installation-troubleshooting:
+
+Installation Troubleshooting
+============================
+
+The issues in this section relate to problems encountered during or immediately related to the installation of Longbow.
+
+**When I try to launch Longbow I see a "Command not found" error message**
+
+If after installing Longbow, the command::
+
+    longbow --about
+
+produces an error message similar to "longbow: Command not found", first open a new terminal and try again.
If the same output is observed, execute the command:: + + echo $PATH + +If in the output of this command, you cannot see the directory in which the longbow executable of the same name is installed (usually in ~/.local/bin), then you need to add this path to your environment. + +If you use the Bash shell, add the following lines to your ~/.bashrc file:: + + PATH="/home/user/.local/bin:${PATH}" + export PATH + +On the other hand, if you use a C shell, add the following lines to your ~/.cshrc file:: + + set path = ( /home/user/.local/bin $path ) + +To activate these changes, either close and reopen your terminal or use the source command:: + + source ~/.bashrc + +for bash or for c shell:: + + source ~/.cshrc + +**When I try to launch Longbow I see the message "Permission Denied"** + + +This is usually due to the execute permission not being granted on the Longbow executable file during installation. To remedy this you will need to grant permission for the Longbow executable to be executed. To do this you will need to run chmod on the longbow executable (usually this is in ~/.local/bin) to add the execute permission by doing:: + + chmod +x path/to/longbow + +If you are having difficulties finding the Longbow executable then the following might help you based on which installation methods you chose during installation. + +1. using pip or the manual setup script as root - usually when using this method the executable should be in /usr/local/bin/ + +2. using pip or setup script with --user - usually with this method the executable will be in ~/.local/bin/ + +3. manual install - the Longbow executable will be where you unpacked the archive after download. + +Troubleshooting Longbow Examples +================================ + +Due to the inevitable variation between environments, some users may find that the example job in the Running Longbow Examples section of this user guide will not run first time. 
The point of failure will be output to the log file and also the console if the --verbose flag is used on the command line.

To overcome the variation between systems, Longbow has numerous parameters that can be specified in configuration files to support a range of requirements. Read the Longbow Configuration section of this user guide to see which other parameters or command line options could be included to enable your job to run.

If you can successfully reach the stage where a submit.* file is created in the example directory, compare this file to a standard submission script you would normally use to run jobs on the remote resource. In this way, one can identify what information may be missing with the current configuration setup.


General troubleshooting
=======================

**When I use < or > on the desktop terminal to supply an input script for an MD job or to redirect output, it doesn't work**

When launching longbow with the < or > characters on the command line, your shell will interpret these as redirection of input and output for longbow itself. To get around this, put the < or > in quotation marks, e.g.::

    longbow charmm "<" input.inp