diff --git a/.github/workflows/kb_sdk_test.yaml b/.github/workflows/kb_sdk_test.yaml new file mode 100644 index 0000000..a31e626 --- /dev/null +++ b/.github/workflows/kb_sdk_test.yaml @@ -0,0 +1,62 @@ +name: KBase SDK Tests + +on: + push: + branches: + - master + - main + pull_request: + branches: + - master + - main + - develop + +jobs: + + sdk_tests: + runs-on: ubuntu-latest + steps: + + - name: Check out GitHub repo + if: "!contains(github.event.head_commit.message, 'skip ci')" + uses: actions/checkout@v2 + + - name: Check out Actions CI files + if: "!contains(github.event.head_commit.message, 'skip ci')" + uses: actions/checkout@v2 + with: + repository: 'kbaseapps/kb_sdk_actions' + path: 'kb_sdk_actions' + + + - name: Set up test environment + if: "!contains(github.event.head_commit.message, 'skip ci')" + shell: bash + env: + KBASE_TEST_TOKEN: ${{ secrets.KBASE_TEST_TOKEN }} + run: | + # Verify kb_sdk_actions clone worked + test -f "$HOME/kb_sdk_actions/bin/kb-sdk" && echo "CI files cloned" + # Pull kb-sdk & create startup script + docker pull kbase/kb-sdk + + sh $GITHUB_WORKSPACE/kb_sdk_actions/bin/make_testdir && echo "Created test_local" + test -f "test_local/test.cfg" && echo "Confirmed config exists" + + - name: Configure authentication + if: "!contains(github.event.head_commit.message, 'skip ci')" + shell: bash + env: + KBASE_TEST_TOKEN: ${{ secrets.KBASE_TEST_TOKEN }} + run: | + # Add token to config + sed -ie "s/^test_token=.*$/&$KBASE_TEST_TOKEN/g" ./test_local/test.cfg + + - name: Run tests + if: "!contains(github.event.head_commit.message, 'skip ci')" + shell: bash + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + run: | + sh $GITHUB_WORKSPACE/kb_sdk_actions/bin/kb-sdk test + bash <(curl -s https://codecov.io/bash) diff --git a/Dockerfile b/Dockerfile index 60c5fa0..14978df 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM kbase/kbase:sdkbase2.latest +FROM kbase/sdkpython:3.8.10 MAINTAINER KBase Developer [Dylan Chivian 
(DCChivian@lbl.gov)] # ----------------------------------------- @@ -7,15 +7,21 @@ MAINTAINER KBase Developer [Dylan Chivian (DCChivian@lbl.gov)] # install line here, a git checkout to download code, or run any other # installation scripts. -#RUN apt-get update +# Update +RUN apt-get update + +# udpate certs +RUN apt-get upgrade -y +RUN sed -i 's/\(.*DST_Root_CA_X3.crt\)/!\1/' /etc/ca-certificates.conf +RUN update-ca-certificates + # Install ETE3 -#RUN apt-get -y --fix-missing install python-numpy python-qt4 python-lxml python-six -# only need qt4 -#RUN apt-get -y install python-qt4 RUN apt-get update && \ - apt-get -y install xvfb python-qt4 && \ - pip install ete3==3.0.0b35 + apt-get -y install xvfb +RUN pip install --upgrade pip +# Note: You must use PyQt5==5.11.3 on debian +RUN pip install ete3==3.1.2 PyQt5==5.11.3 numpy==1.23.1 # ----------------------------------------- @@ -31,17 +37,12 @@ RUN make all # RUN mkdir -p /kb/module/FastTree/bin WORKDIR /kb/module/FastTree/bin -#RUN curl http://www.microbesonline.org/fasttree/FastTree > FastTree2.1.9_64 -RUN \ - git clone https://github.com/kbaseapps/kb_fasttree && \ - cp kb_fasttree/src/FastTree2.1.9_64 . && \ -# INCLUDES ARE FAILING -# gcc -Wall -O3 -finline-functions -funroll-loops -o FastTree2.1.9_64 -lm kb_fasttree/src/FastTree.c && \ -# cp kb_fasttree/src/FastTree2.1.9_64_DEBUG ./FastTree2.1.9_64 && \ +RUN curl -o FastTree2.1.11_64 http://www.microbesonline.org/fasttree/FastTree && \ #RUN \ -# curl https://github.com/dcchivian/kb_fasttree/blob/master/src/FastTree2.1.9_64 > FastTree2.1.9_64 && \ - chmod 555 FastTree2.1.9_64 && \ - ln -s FastTree2.1.9_64 FastTree +# git clone https://github.com/kbaseapps/kb_fasttree && \ +# cp kb_fasttree/src/FastTree2.1.11_64 . 
&& \ + chmod 555 FastTree2.1.11_64 && \ + ln -s FastTree2.1.11_64 FastTree WORKDIR /kb/module diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 63943f4..0cb3356 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,10 @@ +### Version 1.1.0 +- update FastTree to v2.1.11 +- update Docker base image to kbase/sdkpython:3.8.10 +- update ETE3 to 3.1.2 +- update PyQt5 (note: must use PyQt5==5.11.3 on debian) +- added Github Actions testing + ### Version 1.0.3 - update base image to sdkbase2 diff --git a/dependencies.json b/dependencies.json new file mode 100644 index 0000000..279274c --- /dev/null +++ b/dependencies.json @@ -0,0 +1,13 @@ +[ { + "module_name" : "DataFileUtil", + "type" : "sdk", + "version_tag" : "release" +}, { + "module_name" : "KBaseReport", + "type" : "sdk", + "version_tag" : "release" +}, { + "module_name" : "Workspace", + "type" : "core", + "file_path" : "https://raw.githubusercontent.com/kbase/workspace_deluxe/master/workspace.spec" +} ] \ No newline at end of file diff --git a/kbase.yml b/kbase.yml index 98e26b8..614695e 100644 --- a/kbase.yml +++ b/kbase.yml @@ -8,7 +8,7 @@ service-language: python module-version: - 1.0.3 + 1.1.0 owners: [dylan] diff --git a/lib/DataFileUtil/authclient.py b/lib/DataFileUtil/authclient.py deleted file mode 100644 index 9a15713..0000000 --- a/lib/DataFileUtil/authclient.py +++ /dev/null @@ -1,91 +0,0 @@ -''' -Created on Aug 1, 2016 - -A very basic KBase auth client for the Python server. - -@author: gaprice@lbl.gov -''' -import time as _time -import requests as _requests -import threading as _threading -import hashlib - - -class TokenCache(object): - ''' A basic cache for tokens. 
''' - - _MAX_TIME_SEC = 5 * 60 # 5 min - - _lock = _threading.RLock() - - def __init__(self, maxsize=2000): - self._cache = {} - self._maxsize = maxsize - self._halfmax = maxsize / 2 # int division to round down - - def get_user(self, token): - token = hashlib.sha256(token).hexdigest() - with self._lock: - usertime = self._cache.get(token) - if not usertime: - return None - - user, intime = usertime - if _time.time() - intime > self._MAX_TIME_SEC: - return None - return user - - def add_valid_token(self, token, user): - if not token: - raise ValueError('Must supply token') - if not user: - raise ValueError('Must supply user') - token = hashlib.sha256(token).hexdigest() - with self._lock: - self._cache[token] = [user, _time.time()] - if len(self._cache) > self._maxsize: - for i, (t, _) in enumerate(sorted(self._cache.items(), - key=lambda (_, v): v[1])): - if i <= self._halfmax: - del self._cache[t] - else: - break - - -class KBaseAuth(object): - ''' - A very basic KBase auth client for the Python server. 
- ''' - - _LOGIN_URL = 'https://kbase.us/services/authorization/Sessions/Login' - - def __init__(self, auth_url=None): - ''' - Constructor - ''' - self._authurl = auth_url - if not self._authurl: - self._authurl = self._LOGIN_URL - self._cache = TokenCache() - - def get_user(self, token): - if not token: - raise ValueError('Must supply token') - user = self._cache.get_user(token) - if user: - return user - - d = {'token': token, 'fields': 'user_id'} - ret = _requests.post(self._authurl, data=d) - if not ret.ok: - try: - err = ret.json() - except: - ret.raise_for_status() - raise ValueError('Error connecting to auth service: {} {}\n{}' - .format(ret.status_code, ret.reason, - err['error_msg'])) - - user = ret.json()['user_id'] - self._cache.add_valid_token(token, user) - return user diff --git a/lib/DataFileUtil/baseclient.py b/lib/DataFileUtil/baseclient.py deleted file mode 100644 index 3d2a61a..0000000 --- a/lib/DataFileUtil/baseclient.py +++ /dev/null @@ -1,268 +0,0 @@ -############################################################ -# -# Autogenerated by the KBase type compiler - -# any changes made here will be overwritten -# -############################################################ - -from __future__ import print_function - -import json as _json -import requests as _requests -import random as _random -import os as _os - -try: - from configparser import ConfigParser as _ConfigParser # py 3 -except ImportError: - from ConfigParser import ConfigParser as _ConfigParser # py 2 - -try: - from urllib.parse import urlparse as _urlparse # py3 -except ImportError: - from urlparse import urlparse as _urlparse # py2 -import time - -_CT = 'content-type' -_AJ = 'application/json' -_URL_SCHEME = frozenset(['http', 'https']) - - -def _get_token(user_id, password, auth_svc): - # This is bandaid helper function until we get a full - # KBase python auth client released - # note that currently globus usernames, and therefore kbase usernames, - # cannot contain non-ascii 
characters. In python 2, quote doesn't handle - # unicode, so if this changes this client will need to change. - body = ('user_id=' + _requests.utils.quote(user_id) + '&password=' + - _requests.utils.quote(password) + '&fields=token') - ret = _requests.post(auth_svc, data=body, allow_redirects=True) - status = ret.status_code - if status >= 200 and status <= 299: - tok = _json.loads(ret.text) - elif status == 403: - raise Exception('Authentication failed: Bad user_id/password ' + - 'combination for user %s' % (user_id)) - else: - raise Exception(ret.text) - return tok['token'] - - -def _read_inifile(file=_os.environ.get( # @ReservedAssignment - 'KB_DEPLOYMENT_CONFIG', _os.environ['HOME'] + - '/.kbase_config')): - # Another bandaid to read in the ~/.kbase_config file if one is present - authdata = None - if _os.path.exists(file): - try: - config = _ConfigParser() - config.read(file) - # strip down whatever we read to only what is legit - authdata = {x: config.get('authentication', x) - if config.has_option('authentication', x) - else None for x in ('user_id', 'token', - 'client_secret', 'keyfile', - 'keyfile_passphrase', 'password')} - except Exception as e: - print('Error while reading INI file {}: {}'.format(file, e)) - return authdata - - -class ServerError(Exception): - - def __init__(self, name, code, message, data=None, error=None): - super(Exception, self).__init__(message) - self.name = name - self.code = code - self.message = '' if message is None else message - self.data = data or error or '' - # data = JSON RPC 2.0, error = 1.1 - - def __str__(self): - return self.name + ': ' + str(self.code) + '. ' + self.message + \ - '\n' + self.data - - -class _JSONObjectEncoder(_json.JSONEncoder): - - def default(self, obj): - if isinstance(obj, set): - return list(obj) - if isinstance(obj, frozenset): - return list(obj) - return _json.JSONEncoder.default(self, obj) - - -class BaseClient(object): - ''' - The KBase base client. 
- Required initialization arguments (positional): - url - the url of the the service to contact: - For SDK methods: either the url of the callback service or the - Narrative Job Service Wrapper. - For SDK dynamic services: the url of the Service Wizard. - For other services: the url of the service. - Optional arguments (keywords in positional order): - timeout - methods will fail if they take longer than this value in seconds. - Default 1800. - user_id - a KBase user name. - password - the password corresponding to the user name. - token - a KBase authentication token. - ignore_authrc - if True, don't read auth configuration from - ~/.kbase_config. - trust_all_ssl_certificates - set to True to trust self-signed certificates. - If you don't understand the implications, leave as the default, False. - auth_svc - the url of the KBase authorization service. - lookup_url - set to true when contacting KBase dynamic services. - async_job_check_time_ms - the wait time between checking job state for - asynchronous jobs run with the run_job method. 
- ''' - def __init__( - self, url=None, timeout=30 * 60, user_id=None, - password=None, token=None, ignore_authrc=False, - trust_all_ssl_certificates=False, - auth_svc='https://kbase.us/services/authorization/Sessions/Login', - lookup_url=False, - async_job_check_time_ms=100, - async_job_check_time_scale_percent=150, - async_job_check_max_time_ms=300000): - if url is None: - raise ValueError('A url is required') - scheme, _, _, _, _, _ = _urlparse(url) - if scheme not in _URL_SCHEME: - raise ValueError(url + " isn't a valid http url") - self.url = url - self.timeout = int(timeout) - self._headers = dict() - self.trust_all_ssl_certificates = trust_all_ssl_certificates - self.lookup_url = lookup_url - self.async_job_check_time = async_job_check_time_ms / 1000.0 - self.async_job_check_time_scale_percent = ( - async_job_check_time_scale_percent) - self.async_job_check_max_time = async_job_check_max_time_ms / 1000.0 - # token overrides user_id and password - if token is not None: - self._headers['AUTHORIZATION'] = token - elif user_id is not None and password is not None: - self._headers['AUTHORIZATION'] = _get_token( - user_id, password, auth_svc) - elif 'KB_AUTH_TOKEN' in _os.environ: - self._headers['AUTHORIZATION'] = _os.environ.get('KB_AUTH_TOKEN') - elif not ignore_authrc: - authdata = _read_inifile() - if authdata is not None: - if authdata.get('token') is not None: - self._headers['AUTHORIZATION'] = authdata['token'] - elif(authdata.get('user_id') is not None and - authdata.get('password') is not None): - self._headers['AUTHORIZATION'] = _get_token( - authdata['user_id'], authdata['password'], auth_svc) - if self.timeout < 1: - raise ValueError('Timeout value must be at least 1 second') - - def _call(self, url, method, params, context=None): - arg_hash = {'method': method, - 'params': params, - 'version': '1.1', - 'id': str(_random.random())[2:] - } - if context: - if type(context) is not dict: - raise ValueError('context is not type dict as required.') - 
arg_hash['context'] = context - - body = _json.dumps(arg_hash, cls=_JSONObjectEncoder) - ret = _requests.post(url, data=body, headers=self._headers, - timeout=self.timeout, - verify=not self.trust_all_ssl_certificates) - ret.encoding = 'utf-8' - if ret.status_code == 500: - if ret.headers.get(_CT) == _AJ: - err = ret.json() - if 'error' in err: - raise ServerError(**err['error']) - else: - raise ServerError('Unknown', 0, ret.text) - else: - raise ServerError('Unknown', 0, ret.text) - if not ret.ok: - ret.raise_for_status() - resp = ret.json() - if 'result' not in resp: - raise ServerError('Unknown', 0, 'An unknown server error occurred') - if not resp['result']: - return - if len(resp['result']) == 1: - return resp['result'][0] - return resp['result'] - - def _get_service_url(self, service_method, service_version): - if not self.lookup_url: - return self.url - service, _ = service_method.split('.') - service_status_ret = self._call( - self.url, 'ServiceWizard.get_service_status', - [{'module_name': service, 'version': service_version}]) - return service_status_ret['url'] - - def _set_up_context(self, service_ver=None, context=None): - if service_ver: - if not context: - context = {} - context['service_ver'] = service_ver - return context - - def _check_job(self, service, job_id): - return self._call(self.url, service + '._check_job', [job_id]) - - def _submit_job(self, service_method, args, service_ver=None, - context=None): - context = self._set_up_context(service_ver, context) - mod, meth = service_method.split('.') - return self._call(self.url, mod + '._' + meth + '_submit', - args, context) - - def run_job(self, service_method, args, service_ver=None, context=None): - ''' - Run a SDK method asynchronously. - Required arguments: - service_method - the service and method to run, e.g. myserv.mymeth. - args - a list of arguments to the method. - Optional arguments: - service_ver - the version of the service to run, e.g. a git hash - or dev/beta/release. 
- context - the rpc context dict. - ''' - mod, _ = service_method.split('.') - job_id = self._submit_job(service_method, args, service_ver, context) - async_job_check_time = self.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self.async_job_check_time_scale_percent / - 100.0) - if async_job_check_time > self.async_job_check_max_time: - async_job_check_time = self.async_job_check_max_time - job_state = self._check_job(mod, job_id) - if job_state['finished']: - if not job_state['result']: - return - if len(job_state['result']) == 1: - return job_state['result'][0] - return job_state['result'] - - def call_method(self, service_method, args, service_ver=None, - context=None): - ''' - Call a standard or dynamic service synchronously. - Required arguments: - service_method - the service and method to run, e.g. myserv.mymeth. - args - a list of arguments to the method. - Optional arguments: - service_ver - the version of the service to run, e.g. a git hash - or dev/beta/release. - context - the rpc context dict. - ''' - url = self._get_service_url(service_method, service_ver) - context = self._set_up_context(service_ver, context) - return self._call(url, service_method, args, context) diff --git a/lib/KBaseReport/KBaseReportClient.py b/lib/KBaseReport/KBaseReportClient.py deleted file mode 100644 index e0ad97e..0000000 --- a/lib/KBaseReport/KBaseReportClient.py +++ /dev/null @@ -1,184 +0,0 @@ -# -*- coding: utf-8 -*- -############################################################ -# -# Autogenerated by the KBase type compiler - -# any changes made here will be overwritten -# -############################################################ - -from __future__ import print_function -# the following is a hack to get the baseclient to import whether we're in a -# package or not. This makes pep8 unhappy hence the annotations. 
-try: - # baseclient and this client are in a package - from .baseclient import BaseClient as _BaseClient # @UnusedImport -except: - # no they aren't - from baseclient import BaseClient as _BaseClient # @Reimport -import time - - -class KBaseReport(object): - - def __init__( - self, url=None, timeout=30 * 60, user_id=None, - password=None, token=None, ignore_authrc=False, - trust_all_ssl_certificates=False, - auth_svc='https://kbase.us/services/authorization/Sessions/Login', - service_ver='release', - async_job_check_time_ms=100, async_job_check_time_scale_percent=150, - async_job_check_max_time_ms=300000): - if url is None: - raise ValueError('A url is required') - self._service_ver = service_ver - self._client = _BaseClient( - url, timeout=timeout, user_id=user_id, password=password, - token=token, ignore_authrc=ignore_authrc, - trust_all_ssl_certificates=trust_all_ssl_certificates, - auth_svc=auth_svc, - async_job_check_time_ms=async_job_check_time_ms, - async_job_check_time_scale_percent=async_job_check_time_scale_percent, - async_job_check_max_time_ms=async_job_check_max_time_ms) - - def _check_job(self, job_id): - return self._client._check_job('KBaseReport', job_id) - - def _create_submit(self, params, context=None): - return self._client._submit_job( - 'KBaseReport.create', [params], - self._service_ver, context) - - def create(self, params, context=None): - """ - Create a KBaseReport with a brief summary of an App run. - :param params: instance of type "CreateParams" (Provide the report - information. The structure is: params = { report: { text_message: - '', warnings: ['w1'], objects_created: [ { ref: 'ws/objid', - description: '' }] }, workspace_name: 'ws' }) -> structure: - parameter "report" of type "Report" (A simple Report of a method - run in KBase. It only provides for now a way to display a fixed - width text output summary message, a list of warnings, and a list - of objects created (each with descriptions). 
@optional warnings - file_links html_links direct_html direct_html_link_index @metadata - ws length(warnings) as Warnings @metadata ws length(text_message) - as Size(characters) @metadata ws length(objects_created) as - Objects Created) -> structure: parameter "text_message" of String, - parameter "warnings" of list of String, parameter - "objects_created" of list of type "WorkspaceObject" (Represents a - Workspace object with some brief description text that can be - associated with the object. @optional description) -> structure: - parameter "ref" of type "ws_id" (@id ws), parameter "description" - of String, parameter "file_links" of list of type "LinkedFile" - (Represents a file or html archive that the report should like to - @optional description label) -> structure: parameter "handle" of - type "handle_ref" (Reference to a handle @id handle), parameter - "description" of String, parameter "name" of String, parameter - "label" of String, parameter "URL" of String, parameter - "html_links" of list of type "LinkedFile" (Represents a file or - html archive that the report should like to @optional description - label) -> structure: parameter "handle" of type "handle_ref" - (Reference to a handle @id handle), parameter "description" of - String, parameter "name" of String, parameter "label" of String, - parameter "URL" of String, parameter "direct_html" of String, - parameter "direct_html_link_index" of Long, parameter - "workspace_name" of String - :returns: instance of type "ReportInfo" (The reference to the saved - KBaseReport. 
The structure is: reportInfo = { ref: - 'ws/objid/ver', name: 'myreport.2262323452' }) -> structure: - parameter "ref" of type "ws_id" (@id ws), parameter "name" of - String - """ - job_id = self._create_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _create_extended_report_submit(self, params, context=None): - return self._client._submit_job( - 'KBaseReport.create_extended_report', [params], - self._service_ver, context) - - def create_extended_report(self, params, context=None): - """ - A more complex function to create a report that enables the user to specify files and html view that the report should link to - :param params: instance of type "CreateExtendedReportParams" - (Parameters used to create a more complex report with file and - html links The following arguments allow the user to specify the - classical data fields in the report object: string message - - simple text message to store in report object list - objects_created; list warnings - a list - of warning messages in simple text The following argument allows - the user to specify the location of html files/directories that - the report widget will render link to: list - html_links - a list of paths or shock node IDs pointing to a - single flat html file or to the top level directory of a website - The report widget can render one html view directly. 
Set one of - the following fields to decide which view to render: string - direct_html - simple html text that will be rendered within the - report widget int direct_html_link_index - use this to specify - the index of the page in html_links to view directly in the report - widget (ignored if html_string is set) The following argument - allows the user to specify the location of files that the report - widget should link for download: list file_links - a - list of paths or shock node IDs pointing to a single flat file The - following parameters indicate where the report object should be - saved in the workspace: string report_object_name - name to use - for the report object (job ID is used if left unspecified) - html_window_height - height of the html window in the narrative - output widget summary_window_height - height of summary window in - the narrative output widget string workspace_name - name of - workspace where object should be saved) -> structure: parameter - "message" of String, parameter "objects_created" of list of type - "WorkspaceObject" (Represents a Workspace object with some brief - description text that can be associated with the object. 
@optional - description) -> structure: parameter "ref" of type "ws_id" (@id - ws), parameter "description" of String, parameter "warnings" of - list of String, parameter "html_links" of list of type "File" -> - structure: parameter "path" of String, parameter "shock_id" of - String, parameter "name" of String, parameter "description" of - String, parameter "direct_html" of String, parameter - "direct_html_link_index" of Long, parameter "file_links" of list - of type "File" -> structure: parameter "path" of String, parameter - "shock_id" of String, parameter "name" of String, parameter - "description" of String, parameter "report_object_name" of String, - parameter "html_window_height" of Double, parameter - "summary_window_height" of Double, parameter "workspace_name" of - String - :returns: instance of type "ReportInfo" (The reference to the saved - KBaseReport. The structure is: reportInfo = { ref: - 'ws/objid/ver', name: 'myreport.2262323452' }) -> structure: - parameter "ref" of type "ws_id" (@id ws), parameter "name" of - String - """ - job_id = self._create_extended_report_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def status(self, context=None): - job_id = self._client._submit_job('KBaseReport.status', - [], self._service_ver, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = 
self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] diff --git a/lib/KBaseReport/__init__.py b/lib/KBaseReport/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lib/DataFileUtil/DataFileUtilClient.py b/lib/installed_clients/DataFileUtilClient.py similarity index 60% rename from lib/DataFileUtil/DataFileUtilClient.py rename to lib/installed_clients/DataFileUtilClient.py index 16a8aab..54cfc59 100644 --- a/lib/DataFileUtil/DataFileUtilClient.py +++ b/lib/installed_clients/DataFileUtilClient.py @@ -12,10 +12,9 @@ try: # baseclient and this client are in a package from .baseclient import BaseClient as _BaseClient # @UnusedImport -except: +except ImportError: # no they aren't from baseclient import BaseClient as _BaseClient # @Reimport -import time class DataFileUtil(object): @@ -24,7 +23,7 @@ def __init__( self, url=None, timeout=30 * 60, user_id=None, password=None, token=None, ignore_authrc=False, trust_all_ssl_certificates=False, - auth_svc='https://kbase.us/services/authorization/Sessions/Login', + auth_svc='https://ci.kbase.us/services/auth/api/legacy/KBase/Sessions/Login', service_ver='release', async_job_check_time_ms=100, async_job_check_time_scale_percent=150, async_job_check_max_time_ms=300000): @@ -40,14 +39,6 @@ def __init__( async_job_check_time_scale_percent=async_job_check_time_scale_percent, async_job_check_max_time_ms=async_job_check_max_time_ms) - def _check_job(self, job_id): - return self._client._check_job('DataFileUtil', job_id) - - def _shock_to_file_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.shock_to_file', [params], - self._service_ver, context) - def shock_to_file(self, params, context=None): """ Download a file from Shock. 
@@ -84,22 +75,8 @@ def shock_to_file(self, params, context=None): parameter "file_path" of String, parameter "size" of Long, parameter "attributes" of mapping from String to unspecified object """ - job_id = self._shock_to_file_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _shock_to_file_mass_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.shock_to_file_mass', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.shock_to_file', + [params], self._service_ver, context) def shock_to_file_mass(self, params, context=None): """ @@ -137,22 +114,8 @@ def shock_to_file_mass(self, params, context=None): parameter "file_path" of String, parameter "size" of Long, parameter "attributes" of mapping from String to unspecified object """ - job_id = self._shock_to_file_mass_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _file_to_shock_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.file_to_shock', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.shock_to_file_mass', + [params], self._service_ver, 
context) def file_to_shock(self, params, context=None): """ @@ -160,26 +123,27 @@ def file_to_shock(self, params, context=None): :param params: instance of type "FileToShockParams" (Input for the file_to_shock function. Required parameters: file_path - the location of the file (or directory if using the pack parameter) to - load to Shock. Optional parameters: attributes - user-specified - attributes to save to the Shock node along with the file. - make_handle - make a Handle Service handle for the shock node. - Default false. pack - compress a file or archive a directory - before loading to Shock. The file_path argument will be appended - with the appropriate file extension prior to writing. For gzips - only, if the file extension denotes that the file is already - compressed, it will be skipped. If file_path is a directory and - tarring or zipping is specified, the created file name will be set - to the directory name, possibly overwriting an existing file. - Attempting to pack the root directory is an error. Do not attempt - to pack the scratch space root as noted in the module description. - The allowed values are: gzip - gzip the file given by file_path. - targz - tar and gzip the directory specified by the directory - portion of the file_path into the file specified by the file_path. - zip - as targz but zip the directory.) -> structure: parameter - "file_path" of String, parameter "attributes" of mapping from - String to unspecified object, parameter "make_handle" of type - "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), - parameter "pack" of String + load to Shock. Optional parameters: attributes - DEPRECATED: + attributes are currently ignored by the upload function and will + be removed entirely in a future version. User-specified attributes + to save to the Shock node along with the file. make_handle - make + a Handle Service handle for the shock node. Default false. pack - + compress a file or archive a directory before loading to Shock. 
+ The file_path argument will be appended with the appropriate file + extension prior to writing. For gzips only, if the file extension + denotes that the file is already compressed, it will be skipped. + If file_path is a directory and tarring or zipping is specified, + the created file name will be set to the directory name, possibly + overwriting an existing file. Attempting to pack the root + directory is an error. Do not attempt to pack the scratch space + root as noted in the module description. The allowed values are: + gzip - gzip the file given by file_path. targz - tar and gzip the + directory specified by the directory portion of the file_path into + the file specified by the file_path. zip - as targz but zip the + directory.) -> structure: parameter "file_path" of String, + parameter "attributes" of mapping from String to unspecified + object, parameter "make_handle" of type "boolean" (A boolean - 0 + for false, 1 for true. @range (0, 1)), parameter "pack" of String :returns: instance of type "FileToShockOutput" (Output of the file_to_shock function. shock_id - the ID of the new Shock node. handle - the new handle, if created. Null otherwise. 
@@ -196,22 +160,8 @@ def file_to_shock(self, params, context=None): parameter "type" of String, parameter "remote_md5" of String, parameter "node_file_name" of String, parameter "size" of String """ - job_id = self._file_to_shock_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _unpack_file_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.unpack_file', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.file_to_shock', + [params], self._service_ver, context) def unpack_file(self, params, context=None): """ @@ -225,22 +175,8 @@ def unpack_file(self, params, context=None): :returns: instance of type "UnpackFileResult" -> structure: parameter "file_path" of String """ - job_id = self._unpack_file_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _pack_file_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.pack_file', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.unpack_file', + [params], self._service_ver, context) def pack_file(self, params, context=None): """ @@ -266,22 +202,8 @@ def 
pack_file(self, params, context=None): pack_file function. file_path - the path to the packed file.) -> structure: parameter "file_path" of String """ - job_id = self._pack_file_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _package_for_download_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.package_for_download', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.pack_file', + [params], self._service_ver, context) def package_for_download(self, params, context=None): """ @@ -298,11 +220,13 @@ def package_for_download(self, params, context=None): produce info-files in JSON format containing workspace metadata and provenance structures. It produces new files in folder pointed by file_path (or folder containing file pointed by file_path if - it's not folder). Optional parameters: attributes - user-specified - attributes to save to the Shock node along with the file.) -> - structure: parameter "file_path" of String, parameter "attributes" - of mapping from String to unspecified object, parameter "ws_refs" - of list of String + it's not folder). Optional parameters: attributes - DEPRECATED: + attributes are currently ignored by the upload function and will + be removed entirely in a future version. User-specified attributes + to save to the Shock node along with the file.) 
-> structure: + parameter "file_path" of String, parameter "attributes" of mapping + from String to unspecified object, parameter "ws_refs" of list of + String :returns: instance of type "PackageForDownloadOutput" (Output of the package_for_download function. shock_id - the ID of the new Shock node. node_file_name - the name of the file stored in Shock. size @@ -310,22 +234,8 @@ def package_for_download(self, params, context=None): "shock_id" of String, parameter "node_file_name" of String, parameter "size" of String """ - job_id = self._package_for_download_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _file_to_shock_mass_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.file_to_shock_mass', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.package_for_download', + [params], self._service_ver, context) def file_to_shock_mass(self, params, context=None): """ @@ -334,7 +244,9 @@ def file_to_shock_mass(self, params, context=None): for the file_to_shock function. Required parameters: file_path - the location of the file (or directory if using the pack parameter) to load to Shock. Optional parameters: attributes - - user-specified attributes to save to the Shock node along with the + DEPRECATED: attributes are currently ignored by the upload + function and will be removed entirely in a future version. + User-specified attributes to save to the Shock node along with the file. make_handle - make a Handle Service handle for the shock node. Default false. 
pack - compress a file or archive a directory before loading to Shock. The file_path argument will be appended @@ -369,22 +281,8 @@ def file_to_shock_mass(self, params, context=None): parameter "type" of String, parameter "remote_md5" of String, parameter "node_file_name" of String, parameter "size" of String """ - job_id = self._file_to_shock_mass_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _copy_shock_node_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.copy_shock_node', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.file_to_shock_mass', + [params], self._service_ver, context) def copy_shock_node(self, params, context=None): """ @@ -410,22 +308,8 @@ def copy_shock_node(self, params, context=None): of String, parameter "type" of String, parameter "remote_md5" of String """ - job_id = self._copy_shock_node_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _own_shock_node_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.own_shock_node', [params], - self._service_ver, context) + return 
self._client.run_job('DataFileUtil.copy_shock_node', + [params], self._service_ver, context) def own_shock_node(self, params, context=None): """ @@ -459,22 +343,8 @@ def own_shock_node(self, params, context=None): of String, parameter "type" of String, parameter "remote_md5" of String """ - job_id = self._own_shock_node_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _ws_name_to_id_submit(self, name, context=None): - return self._client._submit_job( - 'DataFileUtil.ws_name_to_id', [name], - self._service_ver, context) + return self._client.run_job('DataFileUtil.own_shock_node', + [params], self._service_ver, context) def ws_name_to_id(self, name, context=None): """ @@ -482,27 +352,18 @@ def ws_name_to_id(self, name, context=None): :param name: instance of String :returns: instance of Long """ - job_id = self._ws_name_to_id_submit(name, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _save_objects_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.save_objects', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.ws_name_to_id', + [name], self._service_ver, context) def 
save_objects(self, params, context=None): """ - Save objects to the workspace. Saving over a deleted object undeletes - it. + Save objects to the workspace. + The objects will be sorted prior to saving to avoid the Workspace sort memory limit. + Note that if the object contains workspace object refs in mapping keys that may cause + the Workspace to resort the data. To avoid this, convert any refs in mapping keys to UPA + format (e.g. #/#/#, where # is a positive integer). + If the data is very large, using the WSLargeDataIO SDK module is advised. + Saving over a deleted object undeletes it. :param params: instance of type "SaveObjectsParams" (Input parameters for the "save_objects" function. Required parameters: id - the numerical ID of the workspace. objects - the objects to save. The @@ -511,21 +372,31 @@ def save_objects(self, params, context=None): type "ObjectSaveData" (An object and associated data required for saving. Required parameters: type - the workspace type string for the object. Omit the version information to use the latest - version. data - the object data. Optional parameters: One of an - object name or id. If no name or id is provided the name will be - set to 'auto' with the object id appended as a string, possibly - with -\d+ appended if that object id already exists as a name. - name - the name of the object. objid - the id of the object to - save over. meta - arbitrary user-supplied metadata for the object, - not to exceed 16kb; if the object type specifies automatic - metadata extraction with the 'meta ws' annotation, and your - metadata name conflicts, then your metadata will be silently + version. data - the object data. One of an object name or id: name + - the name of the object. objid - the id of the object to save + over. 
Optional parameters: meta - arbitrary user-supplied metadata + for the object, not to exceed 16kb; if the object type specifies + automatic metadata extraction with the 'meta ws' annotation, and + your metadata name conflicts, then your metadata will be silently overwritten. hidden - true if this object should not be listed - when listing workspace objects.) -> structure: parameter "type" of + when listing workspace objects. extra_provenance_input_refs - + (optional) if set, these refs will be appended to the primary + ProveanceAction input_ws_objects reference list. In general, if + the input WS object ref was passed in from a narrative App, this + will be set for you. However, there are cases where the object ref + passed to the App is a container, and you are operating on a + member or subobject of the container, in which case to maintain + that direct mapping to those subobjects in the provenance of new + objects, you can provide additional object refs here. For example, + if the input is a ReadsSet, and your App creates a new WS object + for each read library in the set, you may want a direct reference + from each new WS object not only to the set, but also to the + individual read library.) -> structure: parameter "type" of String, parameter "data" of unspecified object, parameter "name" of String, parameter "objid" of Long, parameter "meta" of mapping from String to String, parameter "hidden" of type "boolean" (A - boolean - 0 for false, 1 for true. @range (0, 1)) + boolean - 0 for false, 1 for true. @range (0, 1)), parameter + "extra_provenance_input_refs" of list of String :returns: instance of list of type "object_info" (Information about an object, including user provided metadata. objid - the numerical id of the object. name - the name of the object. 
type - the type @@ -542,22 +413,8 @@ def save_objects(self, params, context=None): parameter "chsum" of String, parameter "size" of Long, parameter "meta" of mapping from String to String """ - job_id = self._save_objects_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _get_objects_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.get_objects', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.save_objects', + [params], self._service_ver, context) def get_objects(self, params, context=None): """ @@ -567,12 +424,18 @@ def get_objects(self, params, context=None): a list of object references in the form X/Y/Z, where X is the workspace name or id, Y is the object name or id, and Z is the (optional) object version. In general, always use ids rather than - names if possible to avoid race conditions. Optional parameters: - ignore_errors - ignore any errors that occur when fetching an - object and instead insert a null into the returned list.) -> - structure: parameter "object_refs" of list of String, parameter - "ignore_errors" of type "boolean" (A boolean - 0 for false, 1 for - true. @range (0, 1)) + names if possible to avoid race conditions. A reference path may + be specified by separating references by a semicolon, e.g. + 4/5/6;5/7/2;8/9/4 specifies that the user wishes to retrieve the + fourth version of the object with id 9 in workspace 8, and that + there exists a reference path from the sixth version of the object + with id 5 in workspace 4, to which the user has access. 
The user + may or may not have access to workspaces 5 and 8. Optional + parameters: ignore_errors - ignore any errors that occur when + fetching an object and instead insert a null into the returned + list.) -> structure: parameter "object_refs" of list of String, + parameter "ignore_errors" of type "boolean" (A boolean - 0 for + false, 1 for true. @range (0, 1)) :returns: instance of type "GetObjectsResults" (Results from the get_objects function. list data - the returned objects.) -> structure: parameter "data" of list of type @@ -596,22 +459,8 @@ def get_objects(self, params, context=None): parameter "chsum" of String, parameter "size" of Long, parameter "meta" of mapping from String to String """ - job_id = self._get_objects_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _versions_submit(self, context=None): - return self._client._submit_job( - 'DataFileUtil.versions', [], - self._service_ver, context) + return self._client.run_job('DataFileUtil.get_objects', + [params], self._service_ver, context) def versions(self, context=None): """ @@ -619,22 +468,8 @@ def versions(self, context=None): :returns: multiple set - (1) parameter "wsver" of String, (2) parameter "shockver" of String """ - job_id = self._versions_submit(context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = 
self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'] - - def _download_staging_file_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.download_staging_file', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.versions', + [], self._service_ver, context) def download_staging_file(self, params, context=None): """ @@ -652,22 +487,8 @@ def download_staging_file(self, params, context=None): scratch area path) -> structure: parameter "copy_file_path" of String """ - job_id = self._download_staging_file_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] - - def _download_web_file_submit(self, params, context=None): - return self._client._submit_job( - 'DataFileUtil.download_web_file', [params], - self._service_ver, context) + return self._client.run_job('DataFileUtil.download_staging_file', + [params], self._service_ver, context) def download_web_file(self, params, context=None): """ @@ -681,28 +502,9 @@ def download_web_file(self, params, context=None): download_web_file function. 
copy_file_path: copied file scratch area path) -> structure: parameter "copy_file_path" of String """ - job_id = self._download_web_file_submit(params, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] + return self._client.run_job('DataFileUtil.download_web_file', + [params], self._service_ver, context) def status(self, context=None): - job_id = self._client._submit_job('DataFileUtil.status', - [], self._service_ver, context) - async_job_check_time = self._client.async_job_check_time - while True: - time.sleep(async_job_check_time) - async_job_check_time = (async_job_check_time * - self._client.async_job_check_time_scale_percent / 100.0) - if async_job_check_time > self._client.async_job_check_max_time: - async_job_check_time = self._client.async_job_check_max_time - job_state = self._check_job(job_id) - if job_state['finished']: - return job_state['result'][0] + return self._client.run_job('DataFileUtil.status', + [], self._service_ver, context) diff --git a/lib/installed_clients/KBaseReportClient.py b/lib/installed_clients/KBaseReportClient.py new file mode 100644 index 0000000..61d35de --- /dev/null +++ b/lib/installed_clients/KBaseReportClient.py @@ -0,0 +1,308 @@ +# -*- coding: utf-8 -*- +############################################################ +# +# Autogenerated by the KBase type compiler - +# any changes made here will be overwritten +# +############################################################ + +from __future__ import print_function +# the following is a hack to get the baseclient to import whether we're in a +# package or not. 
This makes pep8 unhappy hence the annotations. +try: + # baseclient and this client are in a package + from .baseclient import BaseClient as _BaseClient # @UnusedImport +except ImportError: + # no they aren't + from baseclient import BaseClient as _BaseClient # @Reimport + + +class KBaseReport(object): + + def __init__( + self, url=None, timeout=30 * 60, user_id=None, + password=None, token=None, ignore_authrc=False, + trust_all_ssl_certificates=False, + auth_svc='https://ci.kbase.us/services/auth/api/legacy/KBase/Sessions/Login', + service_ver='release', + async_job_check_time_ms=100, async_job_check_time_scale_percent=150, + async_job_check_max_time_ms=300000): + if url is None: + raise ValueError('A url is required') + self._service_ver = service_ver + self._client = _BaseClient( + url, timeout=timeout, user_id=user_id, password=password, + token=token, ignore_authrc=ignore_authrc, + trust_all_ssl_certificates=trust_all_ssl_certificates, + auth_svc=auth_svc, + async_job_check_time_ms=async_job_check_time_ms, + async_job_check_time_scale_percent=async_job_check_time_scale_percent, + async_job_check_max_time_ms=async_job_check_max_time_ms) + + def create(self, params, context=None): + """ + Function signature for the create() method -- generate a simple, + text-based report for an app run. + @deprecated KBaseReport.create_extended_report + :param params: instance of type "CreateParams" (* Parameters for the + create() method * * Pass in *either* workspace_name or + workspace_id -- only one is needed. * Note that workspace_id is + preferred over workspace_name because workspace_id immutable. If * + both are provided, the workspace_id will be used. * * Required + arguments: * SimpleReport report - See the structure above * + string workspace_name - Workspace name of the running app. + Required * if workspace_id is absent * int + workspace_id - Workspace ID of the running app. 
Required if * + workspace_name is absent) -> structure: parameter "report" of type + "SimpleReport" (* A simple report for use in create() * Optional + arguments: * string text_message - Readable plain-text report + message * string direct_html - Simple HTML text that will be + rendered within the report widget * TemplateParams template - + a template file and template data to be rendered and displayed * + as HTML. Use in place of 'direct_html' * list warnings + - A list of plain-text warning messages * + list objects_created - List of result workspace + objects that this app * has created. They will get linked + in the report view) -> structure: parameter "text_message" of + String, parameter "direct_html" of String, parameter "template" of + type "TemplateParams" (* Structure representing a template to be + rendered. 'template_file' must be provided, * 'template_data_json' + is optional) -> structure: parameter "template_file" of String, + parameter "template_data_json" of String, parameter "warnings" of + list of String, parameter "objects_created" of list of type + "WorkspaceObject" (* Represents a Workspace object with some brief + description text * that can be associated with the object. * + Required arguments: * ws_id ref - workspace ID in the format + 'workspace_id/object_id/version' * Optional arguments: * + string description - A plaintext, human-readable description of + the * object created) -> structure: parameter "ref" of + type "ws_id" (* Workspace ID reference in the format + 'workspace_id/object_id/version' * @id ws), parameter + "description" of String, parameter "workspace_name" of String, + parameter "workspace_id" of Long + :returns: instance of type "ReportInfo" (* The reference to the saved + KBaseReport. This is the return object for * both create() and + create_extended() * Returned data: * ws_id ref - reference to a + workspace object in the form of * + 'workspace_id/object_id/version'. 
This is a reference to a saved * + Report object (see KBaseReportWorkspace.spec) * string name - + Plaintext unique name for the report. In * create_extended, + this can optionally be set in a parameter) -> structure: parameter + "ref" of type "ws_id" (* Workspace ID reference in the format + 'workspace_id/object_id/version' * @id ws), parameter "name" of + String + """ + return self._client.run_job('KBaseReport.create', + [params], self._service_ver, context) + + def create_extended_report(self, params, context=None): + """ + Create a report for the results of an app run. This method handles file + and HTML zipping, uploading, and linking as well as HTML rendering. + :param params: instance of type "CreateExtendedReportParams" (* + Parameters used to create a more complex report with file and HTML + links * * Pass in *either* workspace_name or workspace_id -- only + one is needed. * Note that workspace_id is preferred over + workspace_name because workspace_id immutable. * * Note that it is + possible to pass both 'html_links'/'direct_html_link_index' and + 'direct_html' * as parameters for an extended report; in such + cases, the file specified by the * 'direct_html_link_links' + parameter is used for the report and the 'direct_html' is ignored. + * * Required arguments: * string workspace_name - Name of the + workspace where the report * should be saved. Required if + workspace_id is absent * int workspace_id - ID of workspace + where the report should be saved. * Required if + workspace_name is absent * Optional arguments: * string + message - Simple text message to store in the report object * + list objects_created - List of result workspace + objects that this app * has created. They will be linked + in the report view * list warnings - A list of + plain-text warning messages * string direct_html - Simple HTML + text content to be rendered within the report widget. * + Set only one of 'direct_html', 'template', and + 'html_links'/'direct_html_link_index'. 
* Setting both + 'template' and 'direct_html' will generate an error. * + TemplateParams template - render a template to produce HTML text + content that will be * rendered within the report widget. + Setting 'template' and 'direct_html' or * + 'html_links'/'direct_html_link_index' will generate an error. * + list html_links - A list of paths, shock IDs, or template + specs pointing to HTML files or directories. * If you pass + in paths to directories, they will be zipped and uploaded * + int direct_html_link_index - Index in html_links to set the + direct/default view in the report. * Set only one of + 'direct_html', 'template', and + 'html_links'/'direct_html_link_index'. * Setting both + 'template' and 'html_links'/'direct_html_link_index' will generate + an error. * list file_links - Allows the user to specify + files that the report widget * should link for download. + If you pass in paths to directories, they will be zipped. * + Each entry should be a path, shock ID, or template specification. + * string report_object_name - Name to use for the report + object (will * be auto-generated if unspecified) * + html_window_height - Fixed height in pixels of the HTML window for + the report * summary_window_height - Fixed height in pixels of + the summary window for the report) -> structure: parameter + "message" of String, parameter "objects_created" of list of type + "WorkspaceObject" (* Represents a Workspace object with some brief + description text * that can be associated with the object. 
* + Required arguments: * ws_id ref - workspace ID in the format + 'workspace_id/object_id/version' * Optional arguments: * + string description - A plaintext, human-readable description of + the * object created) -> structure: parameter "ref" of + type "ws_id" (* Workspace ID reference in the format + 'workspace_id/object_id/version' * @id ws), parameter + "description" of String, parameter "warnings" of list of String, + parameter "html_links" of list of type "File" (* A file to be + linked in the report. Pass in *either* a shock_id or a * path. If + a path to a file is given, then the file will be uploaded. If a * + path to a directory is given, then it will be zipped and uploaded. + * Required arguments: * string name - Plain-text filename (eg. + "results.zip") -- shown to the user * One of the following + identifiers is required: * string path - Can be a file or + directory path. * string shock_id - Shock node ID. * + TemplateParams template - template to be rendered and saved as a + file. * Optional arguments: * string label - A short + description for the file (eg. "Filter results") * string + description - A more detailed, human-readable description of the + file) -> structure: parameter "path" of String, parameter + "shock_id" of String, parameter "template" of type + "TemplateParams" (* Structure representing a template to be + rendered. 'template_file' must be provided, * 'template_data_json' + is optional) -> structure: parameter "template_file" of String, + parameter "template_data_json" of String, parameter "name" of + String, parameter "label" of String, parameter "description" of + String, parameter "template" of type "TemplateParams" (* Structure + representing a template to be rendered. 
'template_file' must be + provided, * 'template_data_json' is optional) -> structure: + parameter "template_file" of String, parameter + "template_data_json" of String, parameter "direct_html" of String, + parameter "direct_html_link_index" of Long, parameter "file_links" + of list of type "File" (* A file to be linked in the report. Pass + in *either* a shock_id or a * path. If a path to a file is given, + then the file will be uploaded. If a * path to a directory is + given, then it will be zipped and uploaded. * Required arguments: + * string name - Plain-text filename (eg. "results.zip") -- + shown to the user * One of the following identifiers is required: + * string path - Can be a file or directory path. * string + shock_id - Shock node ID. * TemplateParams template - template + to be rendered and saved as a file. * Optional arguments: * + string label - A short description for the file (eg. "Filter + results") * string description - A more detailed, + human-readable description of the file) -> structure: parameter + "path" of String, parameter "shock_id" of String, parameter + "template" of type "TemplateParams" (* Structure representing a + template to be rendered. 'template_file' must be provided, * + 'template_data_json' is optional) -> structure: parameter + "template_file" of String, parameter "template_data_json" of + String, parameter "name" of String, parameter "label" of String, + parameter "description" of String, parameter "report_object_name" + of String, parameter "html_window_height" of Double, parameter + "summary_window_height" of Double, parameter "workspace_name" of + String, parameter "workspace_id" of Long + :returns: instance of type "ReportInfo" (* The reference to the saved + KBaseReport. This is the return object for * both create() and + create_extended() * Returned data: * ws_id ref - reference to a + workspace object in the form of * + 'workspace_id/object_id/version'. 
This is a reference to a saved * + Report object (see KBaseReportWorkspace.spec) * string name - + Plaintext unique name for the report. In * create_extended, + this can optionally be set in a parameter) -> structure: parameter + "ref" of type "ws_id" (* Workspace ID reference in the format + 'workspace_id/object_id/version' * @id ws), parameter "name" of + String + """ + return self._client.run_job('KBaseReport.create_extended_report', + [params], self._service_ver, context) + + def render_template(self, params, context=None): + """ + Render a file from a template. This method takes a template file and + a data structure, renders the template, and saves the results to a file. + It returns the output file path in the form + { 'path': '/path/to/file' } + To ensure that the template and the output file are accessible to both + the KBaseReport service and the app requesting the template rendering, the + template file should be copied into the shared `scratch` directory and the + output_file location should also be in `scratch`. + See https://github.com/kbaseIncubator/kbase_report_templates for sample + page templates, standard includes, and instructions on creating your own + templates. + :param params: instance of type "RenderTemplateParams" (* Render a + template using the supplied data, saving the results to an output + * file in the scratch directory. * * Required arguments: * + string template_file - Path to the template file to be rendered. + * string output_file - Path to the file where the rendered + template * should be saved. Must be + in the scratch directory. * Optional: * string + template_data_json - Data for rendering in the template.) -> + structure: parameter "template_file" of String, parameter + "output_file" of String, parameter "template_data_json" of String + :returns: instance of type "File" (* A file to be linked in the + report. Pass in *either* a shock_id or a * path. If a path to a + file is given, then the file will be uploaded. 
If a * path to a + directory is given, then it will be zipped and uploaded. * + Required arguments: * string name - Plain-text filename (eg. + "results.zip") -- shown to the user * One of the following + identifiers is required: * string path - Can be a file or + directory path. * string shock_id - Shock node ID. * + TemplateParams template - template to be rendered and saved as a + file. * Optional arguments: * string label - A short + description for the file (eg. "Filter results") * string + description - A more detailed, human-readable description of the + file) -> structure: parameter "path" of String, parameter + "shock_id" of String, parameter "template" of type + "TemplateParams" (* Structure representing a template to be + rendered. 'template_file' must be provided, * 'template_data_json' + is optional) -> structure: parameter "template_file" of String, + parameter "template_data_json" of String, parameter "name" of + String, parameter "label" of String, parameter "description" of + String + """ + return self._client.run_job('KBaseReport.render_template', + [params], self._service_ver, context) + + def render_templates(self, params, context=None): + """ + Render files from a list of template specifications. Input is a list of dicts + with the keys 'template_file', 'output_file', and 'template_data_json', and output + is a list of dicts containing the path of the rendered files, returned in the order + that the input was specified. All 'output_file' paths must be unique. + If any template fails to render, the endpoint will return an error. + :param params: instance of list of type "RenderTemplateParams" (* + Render a template using the supplied data, saving the results to + an output * file in the scratch directory. * * Required arguments: + * string template_file - Path to the template file to be + rendered. * string output_file - Path to the file where + the rendered template * should be + saved. Must be in the scratch directory. 
* Optional: * string + template_data_json - Data for rendering in the template.) -> + structure: parameter "template_file" of String, parameter + "output_file" of String, parameter "template_data_json" of String + :returns: instance of list of type "File" (* A file to be linked in + the report. Pass in *either* a shock_id or a * path. If a path to + a file is given, then the file will be uploaded. If a * path to a + directory is given, then it will be zipped and uploaded. * + Required arguments: * string name - Plain-text filename (eg. + "results.zip") -- shown to the user * One of the following + identifiers is required: * string path - Can be a file or + directory path. * string shock_id - Shock node ID. * + TemplateParams template - template to be rendered and saved as a + file. * Optional arguments: * string label - A short + description for the file (eg. "Filter results") * string + description - A more detailed, human-readable description of the + file) -> structure: parameter "path" of String, parameter + "shock_id" of String, parameter "template" of type + "TemplateParams" (* Structure representing a template to be + rendered. 
'template_file' must be provided, * 'template_data_json' + is optional) -> structure: parameter "template_file" of String, + parameter "template_data_json" of String, parameter "name" of + String, parameter "label" of String, parameter "description" of + String + """ + return self._client.run_job('KBaseReport.render_templates', + [params], self._service_ver, context) + + def status(self, context=None): + return self._client.run_job('KBaseReport.status', + [], self._service_ver, context) diff --git a/lib/installed_clients/WorkspaceClient.py b/lib/installed_clients/WorkspaceClient.py new file mode 100644 index 0000000..bd20541 --- /dev/null +++ b/lib/installed_clients/WorkspaceClient.py @@ -0,0 +1,5255 @@ +# -*- coding: utf-8 -*- +############################################################ +# +# Autogenerated by the KBase type compiler - +# any changes made here will be overwritten +# +############################################################ + +from __future__ import print_function +# the following is a hack to get the baseclient to import whether we're in a +# package or not. This makes pep8 unhappy hence the annotations. 
+try: + # baseclient and this client are in a package + from .baseclient import BaseClient as _BaseClient # @UnusedImport +except ImportError: + # no they aren't + from baseclient import BaseClient as _BaseClient # @Reimport + + +class Workspace(object): + + def __init__( + self, url=None, timeout=30 * 60, user_id=None, + password=None, token=None, ignore_authrc=False, + trust_all_ssl_certificates=False, + auth_svc='https://ci.kbase.us/services/auth/api/legacy/KBase/Sessions/Login'): + if url is None: + raise ValueError('A url is required') + self._service_ver = None + self._client = _BaseClient( + url, timeout=timeout, user_id=user_id, password=password, + token=token, ignore_authrc=ignore_authrc, + trust_all_ssl_certificates=trust_all_ssl_certificates, + auth_svc=auth_svc) + + def ver(self, context=None): + """ + Returns the version of the workspace service. + :returns: instance of String + """ + return self._client.call_method('Workspace.ver', + [], self._service_ver, context) + + def create_workspace(self, params, context=None): + """ + Creates a new workspace. + :param params: instance of type "CreateWorkspaceParams" (Input + parameters for the "create_workspace" function. Required + arguments: ws_name workspace - name of the workspace to be + created. Optional arguments: permission globalread - 'r' to set + the new workspace globally readable, default 'n'. string + description - A free-text description of the new workspace, 1000 + characters max. Longer strings will be mercilessly and brutally + truncated. usermeta meta - arbitrary user-supplied metadata for + the workspace.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. 
+ kbasetest:my_workspace.), parameter "globalread" of type + "permission" (Represents the permissions a user or users have to a + workspace: 'a' - administrator. All operations allowed. 'w' - + read/write. 'r' - read. 'n' - no permissions.), parameter + "description" of String, parameter "meta" of type "usermeta" (User + provided metadata about an object. Arbitrary key-value pairs + provided by the user.) -> mapping from String to String + :returns: instance of type "workspace_info" (Information about a + workspace. ws_id id - the numerical ID of the workspace. ws_name + workspace - name of the workspace. username owner - name of the + user who owns (e.g. created) this workspace. timestamp moddate - + date when the workspace was last modified. int max_objid - the + maximum object ID appearing in this workspace. Since cloning a + workspace preserves object IDs, this number may be greater than + the number of objects in a newly cloned workspace. permission + user_permission - permissions for the authenticated user of this + workspace. permission globalread - whether this workspace is + globally readable. lock_status lockstat - the status of the + workspace lock. usermeta metadata - arbitrary user-supplied + metadata about the workspace.) -> tuple of size 9: parameter "id" + of type "ws_id" (The unique, permanent numerical ID of a + workspace.), parameter "workspace" of type "ws_name" (A string + used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. 
+ kbasetest:my_workspace.), parameter "owner" of type "username" + (Login name of a KBase user account.), parameter "moddate" of type + "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is + either the character Z (representing the UTC timezone) or the + difference in time to UTC in the format +/-HHMM, eg: + 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC + time) 2013-04-03T08:56:32Z (UTC time)), parameter "max_objid" of + Long, parameter "user_permission" of type "permission" (Represents + the permissions a user or users have to a workspace: 'a' - + administrator. All operations allowed. 'w' - read/write. 'r' - + read. 'n' - no permissions.), parameter "globalread" of type + "permission" (Represents the permissions a user or users have to a + workspace: 'a' - administrator. All operations allowed. 'w' - + read/write. 'r' - read. 'n' - no permissions.), parameter + "lockstat" of type "lock_status" (The lock status of a workspace. + One of 'unlocked', 'locked', or 'published'.), parameter + "metadata" of type "usermeta" (User provided metadata about an + object. Arbitrary key-value pairs provided by the user.) -> + mapping from String to String + """ + return self._client.call_method('Workspace.create_workspace', + [params], self._service_ver, context) + + def alter_workspace_metadata(self, params, context=None): + """ + Change the metadata associated with a workspace. + :param params: instance of type "AlterWorkspaceMetadataParams" (Input + parameters for the "alter_workspace_metadata" function. Required + arguments: WorkspaceIdentity wsi - the workspace to be altered One + or both of the following arguments are required: usermeta new - + metadata to assign to the workspace. Duplicate keys will be + overwritten. list remove - these keys will be removed from + the workspace metadata key/value pairs.) -> structure: parameter + "wsi" of type "WorkspaceIdentity" (A workspace identifier. 
Select + a workspace by one, and only one, of the numerical id or name. + ws_id id - the numerical ID of the workspace. ws_name workspace - + the name of the workspace.) -> structure: parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "id" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "new" + of type "usermeta" (User provided metadata about an object. + Arbitrary key-value pairs provided by the user.) -> mapping from + String to String, parameter "remove" of list of String + """ + return self._client.call_method('Workspace.alter_workspace_metadata', + [params], self._service_ver, context) + + def clone_workspace(self, params, context=None): + """ + Clones a workspace. + :param params: instance of type "CloneWorkspaceParams" (Input + parameters for the "clone_workspace" function. Note that deleted + objects are not cloned, although hidden objects are and remain + hidden in the new workspace. Required arguments: WorkspaceIdentity + wsi - the workspace to be cloned. ws_name workspace - name of the + workspace to be cloned into. This must be a non-existant workspace + name. Optional arguments: permission globalread - 'r' to set the + new workspace globally readable, default 'n'. string description - + A free-text description of the new workspace, 1000 characters max. + Longer strings will be mercilessly and brutally truncated. + usermeta meta - arbitrary user-supplied metadata for the + workspace. list exclude - exclude the specified + objects from the cloned workspace. Either an object ID or a object + name must be specified in each ObjectIdentity - any supplied + reference strings, workspace names or IDs, and versions are + ignored.) 
-> structure: parameter "wsi" of type + "WorkspaceIdentity" (A workspace identifier. Select a workspace by + one, and only one, of the numerical id or name. ws_id id - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace.) -> structure: parameter "workspace" of type "ws_name" + (A string used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "id" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter + "workspace" of type "ws_name" (A string used as a name for a + workspace. Any string consisting of alphanumeric characters and + "_", ".", or "-" that is not an integer is acceptable. The name + may optionally be prefixed with the workspace owner's user name + and a colon, e.g. kbasetest:my_workspace.), parameter "globalread" + of type "permission" (Represents the permissions a user or users + have to a workspace: 'a' - administrator. All operations allowed. + 'w' - read/write. 'r' - read. 'n' - no permissions.), parameter + "description" of String, parameter "meta" of type "usermeta" (User + provided metadata about an object. Arbitrary key-value pairs + provided by the user.) -> mapping from String to String, parameter + "exclude" of list of type "ObjectIdentity" (An object identifier. + Select an object by either: One, and only one, of the numerical id + or name of the workspace. ws_id wsid - the numerical ID of the + workspace. ws_name workspace - the name of the workspace. AND One, + and only one, of the numerical id or name of the object. obj_id + objid- the numerical ID of the object. obj_name name - name of the + object. OPTIONALLY obj_ver ver - the version of the object. OR an + object reference string: obj_ref ref - an object reference + string.) 
-> structure: parameter "workspace" of type "ws_name" (A + string used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + :returns: instance of type "workspace_info" (Information about a + workspace. ws_id id - the numerical ID of the workspace. ws_name + workspace - name of the workspace. username owner - name of the + user who owns (e.g. created) this workspace. timestamp moddate - + date when the workspace was last modified. int max_objid - the + maximum object ID appearing in this workspace. Since cloning a + workspace preserves object IDs, this number may be greater than + the number of objects in a newly cloned workspace. 
permission + user_permission - permissions for the authenticated user of this + workspace. permission globalread - whether this workspace is + globally readable. lock_status lockstat - the status of the + workspace lock. usermeta metadata - arbitrary user-supplied + metadata about the workspace.) -> tuple of size 9: parameter "id" + of type "ws_id" (The unique, permanent numerical ID of a + workspace.), parameter "workspace" of type "ws_name" (A string + used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "owner" of type "username" + (Login name of a KBase user account.), parameter "moddate" of type + "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is + either the character Z (representing the UTC timezone) or the + difference in time to UTC in the format +/-HHMM, eg: + 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC + time) 2013-04-03T08:56:32Z (UTC time)), parameter "max_objid" of + Long, parameter "user_permission" of type "permission" (Represents + the permissions a user or users have to a workspace: 'a' - + administrator. All operations allowed. 'w' - read/write. 'r' - + read. 'n' - no permissions.), parameter "globalread" of type + "permission" (Represents the permissions a user or users have to a + workspace: 'a' - administrator. All operations allowed. 'w' - + read/write. 'r' - read. 'n' - no permissions.), parameter + "lockstat" of type "lock_status" (The lock status of a workspace. + One of 'unlocked', 'locked', or 'published'.), parameter + "metadata" of type "usermeta" (User provided metadata about an + object. Arbitrary key-value pairs provided by the user.) 
-> + mapping from String to String + """ + return self._client.call_method('Workspace.clone_workspace', + [params], self._service_ver, context) + + def lock_workspace(self, wsi, context=None): + """ + Lock a workspace, preventing further changes. + WARNING: Locking a workspace is permanent. A workspace, once locked, + cannot be unlocked. + + The only changes allowed for a locked workspace are changing user + based permissions or making a private workspace globally readable, + thus permanently publishing the workspace. A locked, globally readable + workspace cannot be made private. + :param wsi: instance of type "WorkspaceIdentity" (A workspace + identifier. Select a workspace by one, and only one, of the + numerical id or name. ws_id id - the numerical ID of the + workspace. ws_name workspace - the name of the workspace.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "id" of type "ws_id" (The unique, permanent numerical ID + of a workspace.) + :returns: instance of type "workspace_info" (Information about a + workspace. ws_id id - the numerical ID of the workspace. ws_name + workspace - name of the workspace. username owner - name of the + user who owns (e.g. created) this workspace. timestamp moddate - + date when the workspace was last modified. int max_objid - the + maximum object ID appearing in this workspace. Since cloning a + workspace preserves object IDs, this number may be greater than + the number of objects in a newly cloned workspace. permission + user_permission - permissions for the authenticated user of this + workspace. permission globalread - whether this workspace is + globally readable. lock_status lockstat - the status of the + workspace lock. 
usermeta metadata - arbitrary user-supplied + metadata about the workspace.) -> tuple of size 9: parameter "id" + of type "ws_id" (The unique, permanent numerical ID of a + workspace.), parameter "workspace" of type "ws_name" (A string + used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "owner" of type "username" + (Login name of a KBase user account.), parameter "moddate" of type + "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is + either the character Z (representing the UTC timezone) or the + difference in time to UTC in the format +/-HHMM, eg: + 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC + time) 2013-04-03T08:56:32Z (UTC time)), parameter "max_objid" of + Long, parameter "user_permission" of type "permission" (Represents + the permissions a user or users have to a workspace: 'a' - + administrator. All operations allowed. 'w' - read/write. 'r' - + read. 'n' - no permissions.), parameter "globalread" of type + "permission" (Represents the permissions a user or users have to a + workspace: 'a' - administrator. All operations allowed. 'w' - + read/write. 'r' - read. 'n' - no permissions.), parameter + "lockstat" of type "lock_status" (The lock status of a workspace. + One of 'unlocked', 'locked', or 'published'.), parameter + "metadata" of type "usermeta" (User provided metadata about an + object. Arbitrary key-value pairs provided by the user.) -> + mapping from String to String + """ + return self._client.call_method('Workspace.lock_workspace', + [wsi], self._service_ver, context) + + def get_workspacemeta(self, params, context=None): + """ + Retrieves the metadata associated with the specified workspace. + Provided for backwards compatibility. 
+ @deprecated Workspace.get_workspace_info + :param params: instance of type "get_workspacemeta_params" + (DEPRECATED Input parameters for the "get_workspacemeta" function. + Provided for backwards compatibility. One, and only one of: + ws_name workspace - name of the workspace. ws_id id - the + numerical ID of the workspace. Optional arguments: string auth - + the authentication token of the KBase account accessing the + workspace. Overrides the client provided authorization credentials + if they exist. @deprecated Workspace.WorkspaceIdentity) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "id" of type "ws_id" (The unique, permanent numerical ID + of a workspace.), parameter "auth" of String + :returns: instance of type "workspace_metadata" (Meta data associated + with a workspace. Provided for backwards compatibility. To be + replaced by workspace_info. ws_name id - name of the workspace + username owner - name of the user who owns (who created) this + workspace timestamp moddate - date when the workspace was last + modified int objects - the approximate number of objects currently + stored in the workspace. permission user_permission - permissions + for the currently logged in user for the workspace permission + global_permission - default permissions for the workspace for all + KBase users ws_id num_id - numerical ID of the workspace + @deprecated Workspace.workspace_info) -> tuple of size 7: + parameter "id" of type "ws_name" (A string used as a name for a + workspace. Any string consisting of alphanumeric characters and + "_", ".", or "-" that is not an integer is acceptable. The name + may optionally be prefixed with the workspace owner's user name + and a colon, e.g. 
kbasetest:my_workspace.), parameter "owner" of + type "username" (Login name of a KBase user account.), parameter + "moddate" of type "timestamp" (A time in the format + YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "objects" of Long, parameter "user_permission" + of type "permission" (Represents the permissions a user or users + have to a workspace: 'a' - administrator. All operations allowed. + 'w' - read/write. 'r' - read. 'n' - no permissions.), parameter + "global_permission" of type "permission" (Represents the + permissions a user or users have to a workspace: 'a' - + administrator. All operations allowed. 'w' - read/write. 'r' - + read. 'n' - no permissions.), parameter "num_id" of type "ws_id" + (The unique, permanent numerical ID of a workspace.) + """ + return self._client.call_method('Workspace.get_workspacemeta', + [params], self._service_ver, context) + + def get_workspace_info(self, wsi, context=None): + """ + Get information associated with a workspace. + :param wsi: instance of type "WorkspaceIdentity" (A workspace + identifier. Select a workspace by one, and only one, of the + numerical id or name. ws_id id - the numerical ID of the + workspace. ws_name workspace - the name of the workspace.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "id" of type "ws_id" (The unique, permanent numerical ID + of a workspace.) + :returns: instance of type "workspace_info" (Information about a + workspace. 
ws_id id - the numerical ID of the workspace. ws_name + workspace - name of the workspace. username owner - name of the + user who owns (e.g. created) this workspace. timestamp moddate - + date when the workspace was last modified. int max_objid - the + maximum object ID appearing in this workspace. Since cloning a + workspace preserves object IDs, this number may be greater than + the number of objects in a newly cloned workspace. permission + user_permission - permissions for the authenticated user of this + workspace. permission globalread - whether this workspace is + globally readable. lock_status lockstat - the status of the + workspace lock. usermeta metadata - arbitrary user-supplied + metadata about the workspace.) -> tuple of size 9: parameter "id" + of type "ws_id" (The unique, permanent numerical ID of a + workspace.), parameter "workspace" of type "ws_name" (A string + used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "owner" of type "username" + (Login name of a KBase user account.), parameter "moddate" of type + "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is + either the character Z (representing the UTC timezone) or the + difference in time to UTC in the format +/-HHMM, eg: + 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC + time) 2013-04-03T08:56:32Z (UTC time)), parameter "max_objid" of + Long, parameter "user_permission" of type "permission" (Represents + the permissions a user or users have to a workspace: 'a' - + administrator. All operations allowed. 'w' - read/write. 'r' - + read. 'n' - no permissions.), parameter "globalread" of type + "permission" (Represents the permissions a user or users have to a + workspace: 'a' - administrator. All operations allowed. 'w' - + read/write. 
'r' - read. 'n' - no permissions.), parameter + "lockstat" of type "lock_status" (The lock status of a workspace. + One of 'unlocked', 'locked', or 'published'.), parameter + "metadata" of type "usermeta" (User provided metadata about an + object. Arbitrary key-value pairs provided by the user.) -> + mapping from String to String + """ + return self._client.call_method('Workspace.get_workspace_info', + [wsi], self._service_ver, context) + + def get_workspace_description(self, wsi, context=None): + """ + Get a workspace's description. + :param wsi: instance of type "WorkspaceIdentity" (A workspace + identifier. Select a workspace by one, and only one, of the + numerical id or name. ws_id id - the numerical ID of the + workspace. ws_name workspace - the name of the workspace.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "id" of type "ws_id" (The unique, permanent numerical ID + of a workspace.) + :returns: instance of String + """ + return self._client.call_method('Workspace.get_workspace_description', + [wsi], self._service_ver, context) + + def set_permissions(self, params, context=None): + """ + Set permissions for a workspace. + :param params: instance of type "SetPermissionsParams" (Input + parameters for the "set_permissions" function. One, and only one, + of the following is required: ws_id id - the numerical ID of the + workspace. ws_name workspace - the name of the workspace. Required + arguments: permission new_permission - the permission to assign to + the users. list users - the users whose permissions will + be altered.) -> structure: parameter "workspace" of type "ws_name" + (A string used as a name for a workspace. 
Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "id" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter + "new_permission" of type "permission" (Represents the permissions + a user or users have to a workspace: 'a' - administrator. All + operations allowed. 'w' - read/write. 'r' - read. 'n' - no + permissions.), parameter "users" of list of type "username" (Login + name of a KBase user account.) + """ + return self._client.call_method('Workspace.set_permissions', + [params], self._service_ver, context) + + def set_global_permission(self, params, context=None): + """ + Set the global permission for a workspace. + :param params: instance of type "SetGlobalPermissionsParams" (Input + parameters for the "set_global_permission" function. One, and only + one, of the following is required: ws_id id - the numerical ID of + the workspace. ws_name workspace - the name of the workspace. + Required arguments: permission new_permission - the permission to + assign to all users, either 'n' or 'r'. 'r' means that all users + will be able to read the workspace; otherwise users must have + specific permission to access the workspace.) -> structure: + parameter "workspace" of type "ws_name" (A string used as a name + for a workspace. Any string consisting of alphanumeric characters + and "_", ".", or "-" that is not an integer is acceptable. The + name may optionally be prefixed with the workspace owner's user + name and a colon, e.g. kbasetest:my_workspace.), parameter "id" of + type "ws_id" (The unique, permanent numerical ID of a workspace.), + parameter "new_permission" of type "permission" (Represents the + permissions a user or users have to a workspace: 'a' - + administrator. All operations allowed. 'w' - read/write. 'r' - + read. 'n' - no permissions.) 
+ """ + return self._client.call_method('Workspace.set_global_permission', + [params], self._service_ver, context) + + def set_workspace_description(self, params, context=None): + """ + Set the description for a workspace. + :param params: instance of type "SetWorkspaceDescriptionParams" + (Input parameters for the "set_workspace_description" function. + One, and only one, of the following is required: ws_id id - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace. Optional arguments: string description - A free-text + description of the workspace, 1000 characters max. Longer strings + will be mercilessly and brutally truncated. If omitted, the + description is set to null.) -> structure: parameter "workspace" + of type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "id" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter + "description" of String + """ + return self._client.call_method('Workspace.set_workspace_description', + [params], self._service_ver, context) + + def get_permissions_mass(self, mass, context=None): + """ + Get permissions for multiple workspaces. + :param mass: instance of type "GetPermissionsMassParams" (Input + parameters for the "get_permissions_mass" function. workspaces - + the workspaces for which to return the permissions, maximum 1000.) + -> structure: parameter "workspaces" of list of type + "WorkspaceIdentity" (A workspace identifier. Select a workspace by + one, and only one, of the numerical id or name. ws_id id - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace.) -> structure: parameter "workspace" of type "ws_name" + (A string used as a name for a workspace. 
Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "id" of type "ws_id" (The + unique, permanent numerical ID of a workspace.) + :returns: instance of type "WorkspacePermissions" (A set of workspace + permissions. perms - the list of permissions for each requested + workspace) -> structure: parameter "perms" of list of mapping from + type "username" (Login name of a KBase user account.) to type + "permission" (Represents the permissions a user or users have to a + workspace: 'a' - administrator. All operations allowed. 'w' - + read/write. 'r' - read. 'n' - no permissions.) + """ + return self._client.call_method('Workspace.get_permissions_mass', + [mass], self._service_ver, context) + + def get_permissions(self, wsi, context=None): + """ + Get permissions for a workspace. + @deprecated get_permissions_mass + :param wsi: instance of type "WorkspaceIdentity" (A workspace + identifier. Select a workspace by one, and only one, of the + numerical id or name. ws_id id - the numerical ID of the + workspace. ws_name workspace - the name of the workspace.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "id" of type "ws_id" (The unique, permanent numerical ID + of a workspace.) + :returns: instance of mapping from type "username" (Login name of a + KBase user account.) to type "permission" (Represents the + permissions a user or users have to a workspace: 'a' - + administrator. All operations allowed. 'w' - read/write. 'r' - + read. 'n' - no permissions.) 
+ """ + return self._client.call_method('Workspace.get_permissions', + [wsi], self._service_ver, context) + + def save_object(self, params, context=None): + """ + Saves the input object data and metadata into the selected workspace, + returning the object_metadata of the saved object. Provided + for backwards compatibility. + @deprecated Workspace.save_objects + :param params: instance of type "save_object_params" (Input + parameters for the "save_object" function. Provided for backwards + compatibility. Required arguments: type_string type - type of the + object to be saved ws_name workspace - name of the workspace where + the object is to be saved obj_name id - name behind which the + object will be saved in the workspace UnspecifiedObject data - + data to be saved in the workspace Optional arguments: usermeta + metadata - arbitrary user-supplied metadata for the object, not to + exceed 16kb; if the object type specifies automatic metadata + extraction with the 'meta ws' annotation, and your metadata name + conflicts, then your metadata will be silently overwritten. string + auth - the authentication token of the KBase account accessing the + workspace. Overrides the client provided authorization credentials + if they exist. @deprecated) -> structure: parameter "id" of type + "obj_name" (A string used as a name for an object. Any string + consisting of alphanumeric characters and the characters |._- that + is not an integer is acceptable.), parameter "type" of type + "type_string" (A type string. Specifies the type and its version + in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. 
A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "data" of unspecified object, + parameter "workspace" of type "ws_name" (A string used as a name + for a workspace. Any string consisting of alphanumeric characters + and "_", ".", or "-" that is not an integer is acceptable. The + name may optionally be prefixed with the workspace owner's user + name and a colon, e.g. kbasetest:my_workspace.), parameter + "metadata" of mapping from String to String, parameter "auth" of + String + :returns: instance of type "object_metadata" (Meta data associated + with an object stored in a workspace. Provided for backwards + compatibility. obj_name id - name of the object. type_string type + - type of the object. timestamp moddate - date when the object was + saved obj_ver instance - the version of the object string command + - Deprecated. Always returns the empty string. username + lastmodifier - name of the user who last saved the object, + including copying the object username owner - Deprecated. Same as + lastmodifier. ws_name workspace - name of the workspace in which + the object is stored string ref - Deprecated. Always returns the + empty string. string chsum - the md5 checksum of the object. + usermeta metadata - arbitrary user-supplied metadata about the + object. obj_id objid - the numerical id of the object. @deprecated + object_info) -> tuple of size 12: parameter "id" of type + "obj_name" (A string used as a name for an object. Any string + consisting of alphanumeric characters and the characters |._- that + is not an integer is acceptable.), parameter "type" of type + "type_string" (A type string. 
Specifies the type and its version + in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "moddate" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "instance" of Long, + parameter "command" of String, parameter "lastmodifier" of type + "username" (Login name of a KBase user account.), parameter + "owner" of type "username" (Login name of a KBase user account.), + parameter "workspace" of type "ws_name" (A string used as a name + for a workspace. Any string consisting of alphanumeric characters + and "_", ".", or "-" that is not an integer is acceptable. The + name may optionally be prefixed with the workspace owner's user + name and a colon, e.g. kbasetest:my_workspace.), parameter "ref" + of String, parameter "chsum" of String, parameter "metadata" of + type "usermeta" (User provided metadata about an object. Arbitrary + key-value pairs provided by the user.) -> mapping from String to + String, parameter "objid" of type "obj_id" (The unique, permanent + numerical ID of an object.) 
+ """ + return self._client.call_method('Workspace.save_object', + [params], self._service_ver, context) + + def save_objects(self, params, context=None): + """ + Save objects to the workspace. Saving over a deleted object undeletes + it. + :param params: instance of type "SaveObjectsParams" (Input parameters + for the "save_objects" function. One, and only one, of the + following is required: ws_id id - the numerical ID of the + workspace. ws_name workspace - the name of the workspace. Required + arguments: list objects - the objects to save.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "id" of type "ws_id" (The unique, permanent numerical ID + of a workspace.), parameter "objects" of list of type + "ObjectSaveData" (An object and associated data required for + saving. Required arguments: type_string type - the type of the + object. Omit the version information to use the latest version. + UnspecifiedObject data - the object data. One, and only one, of: + obj_name name - the name of the object. obj_id objid - the id of + the object to save over. Optional arguments: usermeta meta - + arbitrary user-supplied metadata for the object, not to exceed + 16kb; if the object type specifies automatic metadata extraction + with the 'meta ws' annotation, and your metadata name conflicts, + then your metadata will be silently overwritten. + list provenance - provenance data for the + object. boolean hidden - true if this object should not be listed + when listing workspace objects.) -> structure: parameter "type" of + type "type_string" (A type string. Specifies the type and its + version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. 
The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "data" of unspecified object, + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "objid" of type "obj_id" (The unique, permanent + numerical ID of an object.), parameter "meta" of type "usermeta" + (User provided metadata about an object. Arbitrary key-value pairs + provided by the user.) -> mapping from String to String, parameter + "provenance" of list of type "ProvenanceAction" (A provenance + action. A provenance action (PA) is an action taken while + transforming one data object to another. There may be several PAs + taken in series. A PA is typically running a script, running an + api command, etc. All of the following fields are optional, but + more information provided equates to better data provenance. If a + provenance action has no fields defined at all, it is silently + dropped from the list. resolved_ws_objects should never be set by + the user; it is set by the workspace service when returning data. + On input, only one of the time or epoch may be supplied. Both are + supplied on output. The maximum size of the entire provenance + object, including all actions, is 1MB. timestamp time - the time + the action was started epoch epoch - the time the action was + started. 
string caller - the name or id of the invoker of this + provenance action. In most cases, this will be the same for all + PAs. string service - the name of the service that performed this + action. string service_ver - the version of the service that + performed this action. string method - the method of the service + that performed this action. list method_params + - the parameters of the method that performed this action. If an + object in the parameters is a workspace object, also put the + object reference in the input_ws_object list. string script - the + name of the script that performed this action. string script_ver - + the version of the script that performed this action. string + script_command_line - the command line provided to the script that + performed this action. If workspace objects were provided in the + command line, also put the object reference in the input_ws_object + list. list input_ws_objects - the workspace objects + that were used as input to this action; typically these will also + be present as parts of the method_params or the + script_command_line arguments. A reference path into the object + graph may be supplied. list resolved_ws_objects - the + workspace objects ids from input_ws_objects resolved to permanent + workspace object references by the workspace service. list + intermediate_incoming - if the previous action produced output + that 1) was not stored in a referrable way, and 2) is used as + input for this action, provide it with an arbitrary and unique ID + here, in the order of the input arguments to this action. These + IDs can be used in the method_params argument. list + intermediate_outgoing - if this action produced output that 1) was + not stored in a referrable way, and 2) is used as input for the + next action, provide it with an arbitrary and unique ID here, in + the order of the output values from this action. These IDs can be + used in the intermediate_incoming argument in the next action. 
+ list external_data - data external to the + workspace that was either imported to the workspace or used to + create a workspace object. list subactions - the + subactions taken as a part of this action. mapping + custom - user definable custom provenance fields and their values. + string description - a free text description of this action.) -> + structure: parameter "time" of type "timestamp" (A time in the + format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "caller" + of String, parameter "service" of String, parameter "service_ver" + of String, parameter "method" of String, parameter "method_params" + of list of unspecified object, parameter "script" of String, + parameter "script_ver" of String, parameter "script_command_line" + of String, parameter "input_ws_objects" of list of type + "ref_string" (A chain of objects with references to one another as + a string. A single string that is semantically identical to + ref_chain above. Represents a path from one workspace object to + another through an arbitrarily number of intermediate objects + where each object has a dependency or provenance reference to the + next object. Each entry is an obj_ref as defined earlier. Entries + are separated by semicolons. Whitespace is ignored. Examples: + 3/5/6; kbaseuser:myworkspace/myobject; 5/myobject/2 aworkspace/6), + parameter "resolved_ws_objects" of list of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. 
+ For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "intermediate_incoming" of list of String, parameter + "intermediate_outgoing" of list of String, parameter + "external_data" of list of type "ExternalDataUnit" (An external + data unit. A piece of data from a source outside the Workspace. On + input, only one of the resource_release_date or + resource_release_epoch may be supplied. Both are supplied on + output. All fields are optional, but at least one field must be + present. string resource_name - the name of the resource, for + example JGI. string resource_url - the url of the resource, for + example http://genome.jgi.doe.gov string resource_version - + version of the resource timestamp resource_release_date - the + release date of the resource epoch resource_release_epoch - the + release date of the resource string data_url - the url of the + data, for example + http://genome.jgi.doe.gov/pages/dynamicOrganismDownload.jsf? + organism=BlaspURHD0036 string data_id - the id of the data, for + example 7625.2.79179.AGTTCC.adnq.fastq.gz string description - a + free text description of the data.) 
-> structure: parameter + "resource_name" of String, parameter "resource_url" of String, + parameter "resource_version" of String, parameter + "resource_release_date" of type "timestamp" (A time in the format + YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "resource_release_epoch" of type "epoch" (A Unix + epoch (the time since 00:00:00 1/1/1970 UTC) in milliseconds.), + parameter "data_url" of String, parameter "data_id" of String, + parameter "description" of String, parameter "subactions" of list + of type "SubAction" (Information about a subaction that is invoked + by a provenance action. A provenance action (PA) may invoke + subactions (SA), e.g. calling a separate piece of code, a service, + or a script. In most cases these calls are the same from PA to PA + and so do not need to be listed in the provenance since providing + information about the PA alone provides reproducibility. In some + cases, however, SAs may change over time, such that invoking the + same PA with the same parameters may produce different results. + For example, if a PA calls a remote server, that server may be + updated between a PA invoked on day T and another PA invoked on + day T+1. The SubAction structure allows for specifying information + about SAs that may dynamically change from PA invocation to PA + invocation. All fields are optional but at least one field must be + present. string name - the name of the SA. string ver - the + version of SA. string code_url - a url pointing to the SA's + codebase. string commit - a version control commit ID for the SA. + string endpoint_url - a url pointing to the access point for the + SA - a server url, for instance.) 
-> structure: parameter "name" + of String, parameter "ver" of String, parameter "code_url" of + String, parameter "commit" of String, parameter "endpoint_url" of + String, parameter "custom" of mapping from String to String, + parameter "description" of String, parameter "hidden" of type + "boolean" (A boolean. 0 = false, other = true.) + :returns: instance of list of type "object_info" (Information about + an object, including user provided metadata. obj_id objid - the + numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. 
In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String + """ + return self._client.call_method('Workspace.save_objects', + [params], self._service_ver, context) + + def get_object(self, params, context=None): + """ + Retrieves the specified object from the specified workspace. + Both the object data and metadata are returned. + Provided for backwards compatibility. + @deprecated Workspace.get_objects + :param params: instance of type "get_object_params" (Input parameters + for the "get_object" function. Provided for backwards + compatibility. 
Required arguments: ws_name workspace - Name of the + workspace containing the object to be retrieved obj_name id - Name + of the object to be retrieved Optional arguments: int instance - + Version of the object to be retrieved, enabling retrieval of any + previous version of an object string auth - the authentication + token of the KBase account accessing the object. Overrides the + client provided authorization credentials if they exist. + @deprecated Workspace.ObjectIdentity) -> structure: parameter "id" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "workspace" + of type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "instance" of Long, parameter + "auth" of String + :returns: instance of type "get_object_output" (Output generated by + the "get_object" function. Provided for backwards compatibility. + UnspecifiedObject data - The object's data. object_metadata + metadata - Metadata for object retrieved/ @deprecated + Workspaces.ObjectData) -> structure: parameter "data" of + unspecified object, parameter "metadata" of type "object_metadata" + (Meta data associated with an object stored in a workspace. + Provided for backwards compatibility. obj_name id - name of the + object. type_string type - type of the object. timestamp moddate - + date when the object was saved obj_ver instance - the version of + the object string command - Deprecated. Always returns the empty + string. username lastmodifier - name of the user who last saved + the object, including copying the object username owner - + Deprecated. Same as lastmodifier. 
ws_name workspace - name of the + workspace in which the object is stored string ref - Deprecated. + Always returns the empty string. string chsum - the md5 checksum + of the object. usermeta metadata - arbitrary user-supplied + metadata about the object. obj_id objid - the numerical id of the + object. @deprecated object_info) -> tuple of size 12: parameter + "id" of type "obj_name" (A string used as a name for an object. + Any string consisting of alphanumeric characters and the + characters |._- that is not an integer is acceptable.), parameter + "type" of type "type_string" (A type string. Specifies the type + and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "moddate" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "instance" of Long, + parameter "command" of String, parameter "lastmodifier" of type + "username" (Login name of a KBase user account.), parameter + "owner" of type "username" (Login name of a KBase user account.), + parameter "workspace" of type "ws_name" (A string used as a name + for a workspace. 
Any string consisting of alphanumeric characters + and "_", ".", or "-" that is not an integer is acceptable. The + name may optionally be prefixed with the workspace owner's user + name and a colon, e.g. kbasetest:my_workspace.), parameter "ref" + of String, parameter "chsum" of String, parameter "metadata" of + type "usermeta" (User provided metadata about an object. Arbitrary + key-value pairs provided by the user.) -> mapping from String to + String, parameter "objid" of type "obj_id" (The unique, permanent + numerical ID of an object.) + """ + return self._client.call_method('Workspace.get_object', + [params], self._service_ver, context) + + def get_object_provenance(self, object_ids, context=None): + """ + DEPRECATED + Get object provenance from the workspace. + @deprecated Workspace.get_objects2 + :param object_ids: instance of list of type "ObjectIdentity" (An + object identifier. Select an object by either: One, and only one, + of the numerical id or name of the workspace. ws_id wsid - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. 
Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + :returns: instance of list of type "ObjectProvenanceInfo" (DEPRECATED + The provenance and supplemental info for an object. object_info + info - information about the object. list + provenance - the object's provenance. username creator - the user + that first saved the object to the workspace. ws_id orig_wsid - + the id of the workspace in which this object was originally saved. + Missing for objects saved prior to version 0.4.1. timestamp + created - the date the object was first saved to the workspace. + epoch epoch - the date the object was first saved to the + workspace. list - the references contained within the + object. obj_ref copied - the reference of the source object if + this object is a copy and the copy source exists and is + accessible. null otherwise. boolean copy_source_inaccessible - + true if the object was copied from another object, but that object + is no longer accessible to the user. False otherwise. + mapping> extracted_ids - any ids + extracted from the object. 
string handle_error - if an error + occurs while setting ACLs on embedded external IDs, it will be + reported here. If not for historical reasons the parameter would + be called "external_id_error". string handle_stacktrace - the + stacktrace for handle_error. As above, the parameter should be + called "external_id_stacktrace". @deprecated) -> structure: + parameter "info" of type "object_info" (Information about an + object, including user provided metadata. obj_id objid - the + numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. 
In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String, parameter + "provenance" of list of type "ProvenanceAction" (A provenance + action. A provenance action (PA) is an action taken while + transforming one data object to another. There may be several PAs + taken in series. A PA is typically running a script, running an + api command, etc. All of the following fields are optional, but + more information provided equates to better data provenance. If a + provenance action has no fields defined at all, it is silently + dropped from the list. resolved_ws_objects should never be set by + the user; it is set by the workspace service when returning data. + On input, only one of the time or epoch may be supplied. Both are + supplied on output. The maximum size of the entire provenance + object, including all actions, is 1MB. 
timestamp time - the time + the action was started epoch epoch - the time the action was + started. string caller - the name or id of the invoker of this + provenance action. In most cases, this will be the same for all + PAs. string service - the name of the service that performed this + action. string service_ver - the version of the service that + performed this action. string method - the method of the service + that performed this action. list method_params + - the parameters of the method that performed this action. If an + object in the parameters is a workspace object, also put the + object reference in the input_ws_object list. string script - the + name of the script that performed this action. string script_ver - + the version of the script that performed this action. string + script_command_line - the command line provided to the script that + performed this action. If workspace objects were provided in the + command line, also put the object reference in the input_ws_object + list. list input_ws_objects - the workspace objects + that were used as input to this action; typically these will also + be present as parts of the method_params or the + script_command_line arguments. A reference path into the object + graph may be supplied. list resolved_ws_objects - the + workspace objects ids from input_ws_objects resolved to permanent + workspace object references by the workspace service. list + intermediate_incoming - if the previous action produced output + that 1) was not stored in a referrable way, and 2) is used as + input for this action, provide it with an arbitrary and unique ID + here, in the order of the input arguments to this action. These + IDs can be used in the method_params argument. list + intermediate_outgoing - if this action produced output that 1) was + not stored in a referrable way, and 2) is used as input for the + next action, provide it with an arbitrary and unique ID here, in + the order of the output values from this action. 
These IDs can be + used in the intermediate_incoming argument in the next action. + list external_data - data external to the + workspace that was either imported to the workspace or used to + create a workspace object. list subactions - the + subactions taken as a part of this action. mapping + custom - user definable custom provenance fields and their values. + string description - a free text description of this action.) -> + structure: parameter "time" of type "timestamp" (A time in the + format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "caller" + of String, parameter "service" of String, parameter "service_ver" + of String, parameter "method" of String, parameter "method_params" + of list of unspecified object, parameter "script" of String, + parameter "script_ver" of String, parameter "script_command_line" + of String, parameter "input_ws_objects" of list of type + "ref_string" (A chain of objects with references to one another as + a string. A single string that is semantically identical to + ref_chain above. Represents a path from one workspace object to + another through an arbitrarily number of intermediate objects + where each object has a dependency or provenance reference to the + next object. Each entry is an obj_ref as defined earlier. Entries + are separated by semicolons. Whitespace is ignored. Examples: + 3/5/6; kbaseuser:myworkspace/myobject; 5/myobject/2 aworkspace/6), + parameter "resolved_ws_objects" of list of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. 
+ For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "intermediate_incoming" of list of String, parameter + "intermediate_outgoing" of list of String, parameter + "external_data" of list of type "ExternalDataUnit" (An external + data unit. A piece of data from a source outside the Workspace. On + input, only one of the resource_release_date or + resource_release_epoch may be supplied. Both are supplied on + output. All fields are optional, but at least one field must be + present. string resource_name - the name of the resource, for + example JGI. string resource_url - the url of the resource, for + example http://genome.jgi.doe.gov string resource_version - + version of the resource timestamp resource_release_date - the + release date of the resource epoch resource_release_epoch - the + release date of the resource string data_url - the url of the + data, for example + http://genome.jgi.doe.gov/pages/dynamicOrganismDownload.jsf? + organism=BlaspURHD0036 string data_id - the id of the data, for + example 7625.2.79179.AGTTCC.adnq.fastq.gz string description - a + free text description of the data.) 
-> structure: parameter + "resource_name" of String, parameter "resource_url" of String, + parameter "resource_version" of String, parameter + "resource_release_date" of type "timestamp" (A time in the format + YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "resource_release_epoch" of type "epoch" (A Unix + epoch (the time since 00:00:00 1/1/1970 UTC) in milliseconds.), + parameter "data_url" of String, parameter "data_id" of String, + parameter "description" of String, parameter "subactions" of list + of type "SubAction" (Information about a subaction that is invoked + by a provenance action. A provenance action (PA) may invoke + subactions (SA), e.g. calling a separate piece of code, a service, + or a script. In most cases these calls are the same from PA to PA + and so do not need to be listed in the provenance since providing + information about the PA alone provides reproducibility. In some + cases, however, SAs may change over time, such that invoking the + same PA with the same parameters may produce different results. + For example, if a PA calls a remote server, that server may be + updated between a PA invoked on day T and another PA invoked on + day T+1. The SubAction structure allows for specifying information + about SAs that may dynamically change from PA invocation to PA + invocation. All fields are optional but at least one field must be + present. string name - the name of the SA. string ver - the + version of SA. string code_url - a url pointing to the SA's + codebase. string commit - a version control commit ID for the SA. + string endpoint_url - a url pointing to the access point for the + SA - a server url, for instance.) 
-> structure: parameter "name" + of String, parameter "ver" of String, parameter "code_url" of + String, parameter "commit" of String, parameter "endpoint_url" of + String, parameter "custom" of mapping from String to String, + parameter "description" of String, parameter "creator" of type + "username" (Login name of a KBase user account.), parameter + "orig_wsid" of type "ws_id" (The unique, permanent numerical ID of + a workspace.), parameter "created" of type "timestamp" (A time in + the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "refs" + of list of type "obj_ref" (A string that uniquely identifies an + object in the workspace service. The format is [ws_name or + id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "copied" of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "copy_source_inaccessible" of type "boolean" (A boolean. 0 = + false, other = true.), parameter "extracted_ids" of mapping from + type "id_type" (An id type (e.g. from a typespec @id annotation: + @id [idtype])) to list of type "extracted_id" (An id extracted + from an object.), parameter "handle_error" of String, parameter + "handle_stacktrace" of String + """ + return self._client.call_method('Workspace.get_object_provenance', + [object_ids], self._service_ver, context) + + def get_objects(self, object_ids, context=None): + """ + DEPRECATED + Get objects from the workspace. + @deprecated Workspace.get_objects2 + :param object_ids: instance of list of type "ObjectIdentity" (An + object identifier. Select an object by either: One, and only one, + of the numerical id or name of the workspace. ws_id wsid - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. 
Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + :returns: instance of list of type "ObjectData" (The data and + supplemental info for an object. UnspecifiedObject data - the + object's data or subset data. object_info info - information about + the object. list path - the path to the object through + the object reference graph. All the references in the path are + absolute. list provenance - the object's + provenance. username creator - the user that first saved the + object to the workspace. ws_id orig_wsid - the id of the workspace + in which this object was originally saved. Missing for objects + saved prior to version 0.4.1. timestamp created - the date the + object was first saved to the workspace. epoch epoch - the date + the object was first saved to the workspace. list refs - + the references contained within the object. obj_ref copied - the + reference of the source object if this object is a copy and the + copy source exists and is accessible. null otherwise. boolean + copy_source_inaccessible - true if the object was copied from + another object, but that object is no longer accessible to the + user. 
False otherwise. mapping> + extracted_ids - any ids extracted from the object. string + handle_error - if an error occurs while setting ACLs on embedded + external IDs, it will be reported here. If not for historical + reasons the parameter would be called "external_id_error". string + handle_stacktrace - the stacktrace for handle_error. As above, the + parameter should be called "external_id_stacktrace".) -> + structure: parameter "data" of unspecified object, parameter + "info" of type "object_info" (Information about an object, + including user provided metadata. obj_id objid - the numerical id + of the object. obj_name name - the name of the object. type_string + type - the type of the object. timestamp save_date - the save date + of the object. obj_ver ver - the version of the object. username + saved_by - the user that saved or copied the object. ws_id wsid - + the workspace containing the object. ws_name workspace - the + workspace containing the object. string chsum - the md5 checksum + of the object. int size - the size of the object in bytes. + usermeta meta - arbitrary user-supplied metadata about the + object.) -> tuple of size 11: parameter "objid" of type "obj_id" + (The unique, permanent numerical ID of an object.), parameter + "name" of type "obj_name" (A string used as a name for an object. + Any string consisting of alphanumeric characters and the + characters |._- that is not an integer is acceptable.), parameter + "type" of type "type_string" (A type string. Specifies the type + and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. 
A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String, parameter "path" of + list of type "obj_ref" (A string that uniquely identifies an + object in the workspace service. The format is [ws_name or + id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. 
Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "provenance" of list of type + "ProvenanceAction" (A provenance action. A provenance action (PA) + is an action taken while transforming one data object to another. + There may be several PAs taken in series. A PA is typically + running a script, running an api command, etc. All of the + following fields are optional, but more information provided + equates to better data provenance. If a provenance action has no + fields defined at all, it is silently dropped from the list. + resolved_ws_objects should never be set by the user; it is set by + the workspace service when returning data. On input, only one of + the time or epoch may be supplied. Both are supplied on output. + The maximum size of the entire provenance object, including all + actions, is 1MB. timestamp time - the time the action was started + epoch epoch - the time the action was started. string caller - the + name or id of the invoker of this provenance action. In most + cases, this will be the same for all PAs. string service - the + name of the service that performed this action. string service_ver + - the version of the service that performed this action. string + method - the method of the service that performed this action. + list method_params - the parameters of the + method that performed this action. If an object in the parameters + is a workspace object, also put the object reference in the + input_ws_object list. string script - the name of the script that + performed this action. string script_ver - the version of the + script that performed this action. string script_command_line - + the command line provided to the script that performed this + action. If workspace objects were provided in the command line, + also put the object reference in the input_ws_object list. 
+ list input_ws_objects - the workspace objects that + were used as input to this action; typically these will also be + present as parts of the method_params or the script_command_line + arguments. A reference path into the object graph may be supplied. + list resolved_ws_objects - the workspace objects ids from + input_ws_objects resolved to permanent workspace object references + by the workspace service. list intermediate_incoming - if + the previous action produced output that 1) was not stored in a + referrable way, and 2) is used as input for this action, provide + it with an arbitrary and unique ID here, in the order of the input + arguments to this action. These IDs can be used in the + method_params argument. list intermediate_outgoing - if + this action produced output that 1) was not stored in a referrable + way, and 2) is used as input for the next action, provide it with + an arbitrary and unique ID here, in the order of the output values + from this action. These IDs can be used in the + intermediate_incoming argument in the next action. + list external_data - data external to the + workspace that was either imported to the workspace or used to + create a workspace object. list subactions - the + subactions taken as a part of this action. mapping + custom - user definable custom provenance fields and their values. + string description - a free text description of this action.) 
-> + structure: parameter "time" of type "timestamp" (A time in the + format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "caller" + of String, parameter "service" of String, parameter "service_ver" + of String, parameter "method" of String, parameter "method_params" + of list of unspecified object, parameter "script" of String, + parameter "script_ver" of String, parameter "script_command_line" + of String, parameter "input_ws_objects" of list of type + "ref_string" (A chain of objects with references to one another as + a string. A single string that is semantically identical to + ref_chain above. Represents a path from one workspace object to + another through an arbitrarily number of intermediate objects + where each object has a dependency or provenance reference to the + next object. Each entry is an obj_ref as defined earlier. Entries + are separated by semicolons. Whitespace is ignored. Examples: + 3/5/6; kbaseuser:myworkspace/myobject; 5/myobject/2 aworkspace/6), + parameter "resolved_ws_objects" of list of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "intermediate_incoming" of list of String, parameter + "intermediate_outgoing" of list of String, parameter + "external_data" of list of type "ExternalDataUnit" (An external + data unit. A piece of data from a source outside the Workspace. On + input, only one of the resource_release_date or + resource_release_epoch may be supplied. Both are supplied on + output. All fields are optional, but at least one field must be + present. string resource_name - the name of the resource, for + example JGI. string resource_url - the url of the resource, for + example http://genome.jgi.doe.gov string resource_version - + version of the resource timestamp resource_release_date - the + release date of the resource epoch resource_release_epoch - the + release date of the resource string data_url - the url of the + data, for example + http://genome.jgi.doe.gov/pages/dynamicOrganismDownload.jsf? + organism=BlaspURHD0036 string data_id - the id of the data, for + example 7625.2.79179.AGTTCC.adnq.fastq.gz string description - a + free text description of the data.) 
-> structure: parameter + "resource_name" of String, parameter "resource_url" of String, + parameter "resource_version" of String, parameter + "resource_release_date" of type "timestamp" (A time in the format + YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "resource_release_epoch" of type "epoch" (A Unix + epoch (the time since 00:00:00 1/1/1970 UTC) in milliseconds.), + parameter "data_url" of String, parameter "data_id" of String, + parameter "description" of String, parameter "subactions" of list + of type "SubAction" (Information about a subaction that is invoked + by a provenance action. A provenance action (PA) may invoke + subactions (SA), e.g. calling a separate piece of code, a service, + or a script. In most cases these calls are the same from PA to PA + and so do not need to be listed in the provenance since providing + information about the PA alone provides reproducibility. In some + cases, however, SAs may change over time, such that invoking the + same PA with the same parameters may produce different results. + For example, if a PA calls a remote server, that server may be + updated between a PA invoked on day T and another PA invoked on + day T+1. The SubAction structure allows for specifying information + about SAs that may dynamically change from PA invocation to PA + invocation. All fields are optional but at least one field must be + present. string name - the name of the SA. string ver - the + version of SA. string code_url - a url pointing to the SA's + codebase. string commit - a version control commit ID for the SA. + string endpoint_url - a url pointing to the access point for the + SA - a server url, for instance.) 
-> structure: parameter "name" + of String, parameter "ver" of String, parameter "code_url" of + String, parameter "commit" of String, parameter "endpoint_url" of + String, parameter "custom" of mapping from String to String, + parameter "description" of String, parameter "creator" of type + "username" (Login name of a KBase user account.), parameter + "orig_wsid" of type "ws_id" (The unique, permanent numerical ID of + a workspace.), parameter "created" of type "timestamp" (A time in + the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "refs" + of list of type "obj_ref" (A string that uniquely identifies an + object in the workspace service. The format is [ws_name or + id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "copied" of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "copy_source_inaccessible" of type "boolean" (A boolean. 0 = + false, other = true.), parameter "extracted_ids" of mapping from + type "id_type" (An id type (e.g. from a typespec @id annotation: + @id [idtype])) to list of type "extracted_id" (An id extracted + from an object.), parameter "handle_error" of String, parameter + "handle_stacktrace" of String + """ + return self._client.call_method('Workspace.get_objects', + [object_ids], self._service_ver, context) + + def get_objects2(self, params, context=None): + """ + Get objects from the workspace. + :param params: instance of type "GetObjects2Params" (Input parameters + for the get_objects2 function. Required parameters: + list objects - the list of object + specifications for the objects to return (via reference chain and + as a subset if specified). Optional parameters: boolean + ignoreErrors - Don't throw an exception if an object cannot be + accessed; return null for that object's information instead. + Default false. boolean no_data - return the provenance, + references, and object_info for this object without the object + data. Default false. boolean skip_external_system_updates - if the + objects contain any external IDs, don't contact external systems + to perform any updates for those IDs (often ACL updates, e.g. for + handle / blobstore / sample IDs). In some cases this can speed up + fetching the data. Default false. boolean + batch_external_system_updates - if the objects contain any + external IDs, send all external system updates in a batch to each + external system when possible rather than object by object. 
This + can potentially speed up the updates, but the drawback is that if + the external update fails for any object, all the objects that + required updates for that system will be marked as having a failed + update. Has no effect if skip_external_system_updates is true. + Default false.) -> structure: parameter "objects" of list of type + "ObjectSpecification" (An Object Specification (OS). Inherits from + ObjectIdentity (OI). Specifies which object, and which parts of + that object, to retrieve from the Workspace Service. The fields + wsid, workspace, objid, name, and ver are identical to the OI + fields. The ref field's behavior is extended from OI. It maintains + its previous behavior, but now also can act as a reference string. + See reference following below for more information. REFERENCE + FOLLOWING: Reference following guarantees that a user that has + access to an object can always see a) objects that are referenced + inside the object and b) objects that are referenced in the + object's provenance. This ensures that the user has visibility + into the entire provenance of the object and the object's object + dependencies (e.g. references). The user must have at least read + access to the object specified in this SO, but need not have + access to any further objects in the reference chain, and those + objects may be deleted. Optional reference following fields: Note + that only one of the following fields may be specified. ref_chain + obj_path - a path to the desired object from the object specified + in this OS. In other words, the object specified in this OS is + assumed to be accessible to the user, and the objects in the + object path represent a chain of references to the desired object + at the end of the object path. If the references are all valid, + the desired object will be returned. - OR - list + obj_ref_path - shorthand for the obj_path. 
- OR - ref_chain
+ to_obj_path - identical to obj_path, except that the path is TO
+ the object specified in this OS, rather than from the object. In
+ other words the object specified by wsid/objid/ref etc. is the end
+ of the path, and to_obj_path is the rest of the path. The user
+ must have access to the first object in the to_obj_path. - OR -
+ list to_obj_ref_path - shorthand for the to_obj_path. -
+ OR - ref_string ref - A string representing a reference path from
+ one object to another. Unlike the previous reference following
+ options, the ref_string represents the ENTIRE path from the source
+ object to the target object. As with the OI object, the ref field
+ may contain a single reference. - OR - boolean find_reference_path -
+ This is the last, slowest, and most expensive resort for getting a
+ referenced object - do not use this method unless the path to the
+ object is unavailable by any other means. Setting the
+ find_reference_path parameter to true means that the workspace
+ service will search through the object reference graph from the
+ object specified in this OS to find an object that 1) the user can
+ access, and 2) has an unbroken reference path to the target
+ object. If the search succeeds, the object will be returned as
+ normal. Note that the search will automatically fail after a
+ certain (but much larger than necessary for the vast majority of
+ cases) number of objects are traversed. OBJECT SUBSETS: When
+ selecting a subset of an array in an object, the returned array is
+ compressed to the size of the subset, but the ordering of the
+ array is maintained. For example, if the array stored at the
+ 'feature' key of a Genome object has 4000 entries, and the object
+ paths provided are: /feature/7 /feature/3015 /feature/700 The
+ returned feature array will be of length three and the entries
+ will consist, in order, of the 7th, 700th, and 3015th entries of
+ the original array. 
Optional object subset fields: + list included - the portions of the object to include + in the object subset. boolean strict_maps - if true, throw an + exception if the subset specification traverses a non-existent map + key (default false) boolean strict_arrays - if true, throw an + exception if the subset specification exceeds the size of an array + (default true)) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type + "ref_string" (A chain of objects with references to one another as + a string. A single string that is semantically identical to + ref_chain above. Represents a path from one workspace object to + another through an arbitrarily number of intermediate objects + where each object has a dependency or provenance reference to the + next object. Each entry is an obj_ref as defined earlier. Entries + are separated by semicolons. Whitespace is ignored. Examples: + 3/5/6; kbaseuser:myworkspace/myobject; 5/myobject/2 aworkspace/6), + parameter "obj_path" of type "ref_chain" (A chain of objects with + references to one another. 
An object reference chain consists of a + list of objects where the nth object possesses a reference, either + in the object itself or in the object provenance, to the n+1th + object.) -> list of type "ObjectIdentity" (An object identifier. + Select an object by either: One, and only one, of the numerical id + or name of the workspace. ws_id wsid - the numerical ID of the + workspace. ws_name workspace - the name of the workspace. AND One, + and only one, of the numerical id or name of the object. obj_id + objid- the numerical ID of the object. obj_name name - name of the + object. OPTIONALLY obj_ver ver - the version of the object. OR an + object reference string: obj_ref ref - an object reference + string.) -> structure: parameter "workspace" of type "ws_name" (A + string used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "obj_ref_path" of list of type "obj_ref" (A string that uniquely + identifies an object in the workspace service. The format is + [ws_name or id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "to_obj_path" of type + "ref_chain" (A chain of objects with references to one another. An + object reference chain consists of a list of objects where the nth + object possesses a reference, either in the object itself or in + the object provenance, to the n+1th object.) -> list of type + "ObjectIdentity" (An object identifier. Select an object by + either: One, and only one, of the numerical id or name of the + workspace. ws_id wsid - the numerical ID of the workspace. ws_name + workspace - the name of the workspace. AND One, and only one, of + the numerical id or name of the object. obj_id objid- the + numerical ID of the object. obj_name name - name of the object. + OPTIONALLY obj_ver ver - the version of the object. OR an object + reference string: obj_ref ref - an object reference string.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. 
kbasetest:my_workspace.), + parameter "wsid" of type "ws_id" (The unique, permanent numerical + ID of a workspace.), parameter "name" of type "obj_name" (A string + used as a name for an object. Any string consisting of + alphanumeric characters and the characters |._- that is not an + integer is acceptable.), parameter "objid" of type "obj_id" (The + unique, permanent numerical ID of an object.), parameter "ver" of + type "obj_ver" (An object version. The version of the object, + starting at 1.), parameter "ref" of type "obj_ref" (A string that + uniquely identifies an object in the workspace service. The format + is [ws_name or id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "to_obj_ref_path" of list of + type "obj_ref" (A string that uniquely identifies an object in the + workspace service. The format is [ws_name or id]/[obj_name or + id]/[obj_ver]. For example, MyFirstWorkspace/MyFirstObject/3 would + identify the third version of an object called MyFirstObject in + the workspace called MyFirstWorkspace. 42/Panic/1 would identify + the first version of the object name Panic in workspace with id + 42. Towel/1/6 would identify the 6th version of the object with id + 1 in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "find_reference_path" of type "boolean" (A boolean. 0 = false, + other = true.), parameter "included" of list of type "object_path" + (A path into an object. 
Identify a sub portion of an object by + providing the path, delimited by a slash (/), to that portion of + the object. Thus the path may not have slashes in the structure or + mapping keys. Examples: /foo/bar/3 - specifies the bar key of the + foo mapping and the 3rd entry of the array if bar maps to an array + or the value mapped to the string "3" if bar maps to a map. + /foo/bar/[*]/baz - specifies the baz field of all the objects in + the list mapped by the bar key in the map foo. /foo/asterisk/baz - + specifies the baz field of all the objects in the values of the + foo mapping. Swap 'asterisk' for * in the path. In case you need + to use '/' or '~' in path items use JSON Pointer notation defined + here: http://tools.ietf.org/html/rfc6901), parameter "strict_maps" + of type "boolean" (A boolean. 0 = false, other = true.), parameter + "strict_arrays" of type "boolean" (A boolean. 0 = false, other = + true.), parameter "ignoreErrors" of type "boolean" (A boolean. 0 = + false, other = true.), parameter "no_data" of type "boolean" (A + boolean. 0 = false, other = true.), parameter + "skip_external_system_updates" of type "boolean" (A boolean. 0 = + false, other = true.), parameter "batch_external_system_updates" + of type "boolean" (A boolean. 0 = false, other = true.) + :returns: instance of type "GetObjects2Results" (Results from the + get_objects2 function. list data - the returned + objects.) -> structure: parameter "data" of list of type + "ObjectData" (The data and supplemental info for an object. + UnspecifiedObject data - the object's data or subset data. + object_info info - information about the object. list + path - the path to the object through the object reference graph. + All the references in the path are absolute. + list provenance - the object's provenance. + username creator - the user that first saved the object to the + workspace. ws_id orig_wsid - the id of the workspace in which this + object was originally saved. 
Missing for objects saved prior to + version 0.4.1. timestamp created - the date the object was first + saved to the workspace. epoch epoch - the date the object was + first saved to the workspace. list refs - the references + contained within the object. obj_ref copied - the reference of the + source object if this object is a copy and the copy source exists + and is accessible. null otherwise. boolean + copy_source_inaccessible - true if the object was copied from + another object, but that object is no longer accessible to the + user. False otherwise. mapping> + extracted_ids - any ids extracted from the object. string + handle_error - if an error occurs while setting ACLs on embedded + external IDs, it will be reported here. If not for historical + reasons the parameter would be called "external_id_error". string + handle_stacktrace - the stacktrace for handle_error. As above, the + parameter should be called "external_id_stacktrace".) -> + structure: parameter "data" of unspecified object, parameter + "info" of type "object_info" (Information about an object, + including user provided metadata. obj_id objid - the numerical id + of the object. obj_name name - the name of the object. type_string + type - the type of the object. timestamp save_date - the save date + of the object. obj_ver ver - the version of the object. username + saved_by - the user that saved or copied the object. ws_id wsid - + the workspace containing the object. ws_name workspace - the + workspace containing the object. string chsum - the md5 checksum + of the object. int size - the size of the object in bytes. + usermeta meta - arbitrary user-supplied metadata about the + object.) -> tuple of size 11: parameter "objid" of type "obj_id" + (The unique, permanent numerical ID of an object.), parameter + "name" of type "obj_name" (A string used as a name for an object. 
+ Any string consisting of alphanumeric characters and the + characters |._- that is not an integer is acceptable.), parameter + "type" of type "type_string" (A type string. Specifies the type + and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) 
-> mapping from String to String, parameter "path" of + list of type "obj_ref" (A string that uniquely identifies an + object in the workspace service. The format is [ws_name or + id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "provenance" of list of type + "ProvenanceAction" (A provenance action. A provenance action (PA) + is an action taken while transforming one data object to another. + There may be several PAs taken in series. A PA is typically + running a script, running an api command, etc. All of the + following fields are optional, but more information provided + equates to better data provenance. If a provenance action has no + fields defined at all, it is silently dropped from the list. + resolved_ws_objects should never be set by the user; it is set by + the workspace service when returning data. On input, only one of + the time or epoch may be supplied. Both are supplied on output. + The maximum size of the entire provenance object, including all + actions, is 1MB. timestamp time - the time the action was started + epoch epoch - the time the action was started. string caller - the + name or id of the invoker of this provenance action. In most + cases, this will be the same for all PAs. string service - the + name of the service that performed this action. string service_ver + - the version of the service that performed this action. string + method - the method of the service that performed this action. + list method_params - the parameters of the + method that performed this action. 
If an object in the parameters + is a workspace object, also put the object reference in the + input_ws_object list. string script - the name of the script that + performed this action. string script_ver - the version of the + script that performed this action. string script_command_line - + the command line provided to the script that performed this + action. If workspace objects were provided in the command line, + also put the object reference in the input_ws_object list. + list input_ws_objects - the workspace objects that + were used as input to this action; typically these will also be + present as parts of the method_params or the script_command_line + arguments. A reference path into the object graph may be supplied. + list resolved_ws_objects - the workspace objects ids from + input_ws_objects resolved to permanent workspace object references + by the workspace service. list intermediate_incoming - if + the previous action produced output that 1) was not stored in a + referrable way, and 2) is used as input for this action, provide + it with an arbitrary and unique ID here, in the order of the input + arguments to this action. These IDs can be used in the + method_params argument. list intermediate_outgoing - if + this action produced output that 1) was not stored in a referrable + way, and 2) is used as input for the next action, provide it with + an arbitrary and unique ID here, in the order of the output values + from this action. These IDs can be used in the + intermediate_incoming argument in the next action. + list external_data - data external to the + workspace that was either imported to the workspace or used to + create a workspace object. list subactions - the + subactions taken as a part of this action. mapping + custom - user definable custom provenance fields and their values. + string description - a free text description of this action.) 
-> + structure: parameter "time" of type "timestamp" (A time in the + format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "caller" + of String, parameter "service" of String, parameter "service_ver" + of String, parameter "method" of String, parameter "method_params" + of list of unspecified object, parameter "script" of String, + parameter "script_ver" of String, parameter "script_command_line" + of String, parameter "input_ws_objects" of list of type + "ref_string" (A chain of objects with references to one another as + a string. A single string that is semantically identical to + ref_chain above. Represents a path from one workspace object to + another through an arbitrarily number of intermediate objects + where each object has a dependency or provenance reference to the + next object. Each entry is an obj_ref as defined earlier. Entries + are separated by semicolons. Whitespace is ignored. Examples: + 3/5/6; kbaseuser:myworkspace/myobject; 5/myobject/2 aworkspace/6), + parameter "resolved_ws_objects" of list of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "intermediate_incoming" of list of String, parameter + "intermediate_outgoing" of list of String, parameter + "external_data" of list of type "ExternalDataUnit" (An external + data unit. A piece of data from a source outside the Workspace. On + input, only one of the resource_release_date or + resource_release_epoch may be supplied. Both are supplied on + output. All fields are optional, but at least one field must be + present. string resource_name - the name of the resource, for + example JGI. string resource_url - the url of the resource, for + example http://genome.jgi.doe.gov string resource_version - + version of the resource timestamp resource_release_date - the + release date of the resource epoch resource_release_epoch - the + release date of the resource string data_url - the url of the + data, for example + http://genome.jgi.doe.gov/pages/dynamicOrganismDownload.jsf? + organism=BlaspURHD0036 string data_id - the id of the data, for + example 7625.2.79179.AGTTCC.adnq.fastq.gz string description - a + free text description of the data.) 
-> structure: parameter + "resource_name" of String, parameter "resource_url" of String, + parameter "resource_version" of String, parameter + "resource_release_date" of type "timestamp" (A time in the format + YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "resource_release_epoch" of type "epoch" (A Unix + epoch (the time since 00:00:00 1/1/1970 UTC) in milliseconds.), + parameter "data_url" of String, parameter "data_id" of String, + parameter "description" of String, parameter "subactions" of list + of type "SubAction" (Information about a subaction that is invoked + by a provenance action. A provenance action (PA) may invoke + subactions (SA), e.g. calling a separate piece of code, a service, + or a script. In most cases these calls are the same from PA to PA + and so do not need to be listed in the provenance since providing + information about the PA alone provides reproducibility. In some + cases, however, SAs may change over time, such that invoking the + same PA with the same parameters may produce different results. + For example, if a PA calls a remote server, that server may be + updated between a PA invoked on day T and another PA invoked on + day T+1. The SubAction structure allows for specifying information + about SAs that may dynamically change from PA invocation to PA + invocation. All fields are optional but at least one field must be + present. string name - the name of the SA. string ver - the + version of SA. string code_url - a url pointing to the SA's + codebase. string commit - a version control commit ID for the SA. + string endpoint_url - a url pointing to the access point for the + SA - a server url, for instance.) 
-> structure: parameter "name" + of String, parameter "ver" of String, parameter "code_url" of + String, parameter "commit" of String, parameter "endpoint_url" of + String, parameter "custom" of mapping from String to String, + parameter "description" of String, parameter "creator" of type + "username" (Login name of a KBase user account.), parameter + "orig_wsid" of type "ws_id" (The unique, permanent numerical ID of + a workspace.), parameter "created" of type "timestamp" (A time in + the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "refs" + of list of type "obj_ref" (A string that uniquely identifies an + object in the workspace service. The format is [ws_name or + id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "copied" of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "copy_source_inaccessible" of type "boolean" (A boolean. 0 = + false, other = true.), parameter "extracted_ids" of mapping from + type "id_type" (An id type (e.g. from a typespec @id annotation: + @id [idtype])) to list of type "extracted_id" (An id extracted + from an object.), parameter "handle_error" of String, parameter + "handle_stacktrace" of String + """ + return self._client.call_method('Workspace.get_objects2', + [params], self._service_ver, context) + + def get_object_subset(self, sub_object_ids, context=None): + """ + DEPRECATED + Get portions of objects from the workspace. + When selecting a subset of an array in an object, the returned + array is compressed to the size of the subset, but the ordering of + the array is maintained. For example, if the array stored at the + 'feature' key of a Genome object has 4000 entries, and the object paths + provided are: + /feature/7 + /feature/3015 + /feature/700 + The returned feature array will be of length three and the entries will + consist, in order, of the 7th, 700th, and 3015th entries of the + original array. + @deprecated Workspace.get_objects2 + :param sub_object_ids: instance of list of type "SubObjectIdentity" + (DEPRECATED An object subset identifier. Select a subset of an + object by: EITHER One, and only one, of the numerical id or name + of the workspace. ws_id wsid - the numerical ID of the workspace. + ws_name workspace - name of the workspace. AND One, and only one, + of the numerical id or name of the object. obj_id objid- the + numerical ID of the object. obj_name name - name of the object. + OPTIONALLY obj_ver ver - the version of the object. OR an object + reference string: obj_ref ref - an object reference string. 
AND a
+ subset specification: list included - the portions of
+ the object to include in the object subset. boolean strict_maps -
+ if true, throw an exception if the subset specification traverses
+ a non-existent map key (default false) boolean strict_arrays - if
+ true, throw an exception if the subset specification exceeds the
+ size of an array (default true) @deprecated
+ Workspace.ObjectSpecification) -> structure: parameter "workspace"
+ of type "ws_name" (A string used as a name for a workspace. Any
+ string consisting of alphanumeric characters and "_", ".", or "-"
+ that is not an integer is acceptable. The name may optionally be
+ prefixed with the workspace owner's user name and a colon, e.g.
+ kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The
+ unique, permanent numerical ID of a workspace.), parameter "name"
+ of type "obj_name" (A string used as a name for an object. Any
+ string consisting of alphanumeric characters and the characters
+ |._- that is not an integer is acceptable.), parameter "objid" of
+ type "obj_id" (The unique, permanent numerical ID of an object.),
+ parameter "ver" of type "obj_ver" (An object version. The version
+ of the object, starting at 1.), parameter "ref" of type "obj_ref"
+ (A string that uniquely identifies an object in the workspace
+ service. The format is [ws_name or id]/[obj_name or id]/[obj_ver].
+ For example, MyFirstWorkspace/MyFirstObject/3 would identify the
+ third version of an object called MyFirstObject in the workspace
+ called MyFirstWorkspace. 42/Panic/1 would identify the first
+ version of the object name Panic in workspace with id 42.
+ Towel/1/6 would identify the 6th version of the object with id 1
+ in the Towel workspace. If the version number is omitted, the
+ latest version of the object is assumed.), parameter "included" of
+ list of type "object_path" (A path into an object. 
Identify a sub + portion of an object by providing the path, delimited by a slash + (/), to that portion of the object. Thus the path may not have + slashes in the structure or mapping keys. Examples: /foo/bar/3 - + specifies the bar key of the foo mapping and the 3rd entry of the + array if bar maps to an array or the value mapped to the string + "3" if bar maps to a map. /foo/bar/[*]/baz - specifies the baz + field of all the objects in the list mapped by the bar key in the + map foo. /foo/asterisk/baz - specifies the baz field of all the + objects in the values of the foo mapping. Swap 'asterisk' for * in + the path. In case you need to use '/' or '~' in path items use + JSON Pointer notation defined here: + http://tools.ietf.org/html/rfc6901), parameter "strict_maps" of + type "boolean" (A boolean. 0 = false, other = true.), parameter + "strict_arrays" of type "boolean" (A boolean. 0 = false, other = + true.) + :returns: instance of list of type "ObjectData" (The data and + supplemental info for an object. UnspecifiedObject data - the + object's data or subset data. object_info info - information about + the object. list path - the path to the object through + the object reference graph. All the references in the path are + absolute. list provenance - the object's + provenance. username creator - the user that first saved the + object to the workspace. ws_id orig_wsid - the id of the workspace + in which this object was originally saved. Missing for objects + saved prior to version 0.4.1. timestamp created - the date the + object was first saved to the workspace. epoch epoch - the date + the object was first saved to the workspace. list refs - + the references contained within the object. obj_ref copied - the + reference of the source object if this object is a copy and the + copy source exists and is accessible. null otherwise. 
boolean + copy_source_inaccessible - true if the object was copied from + another object, but that object is no longer accessible to the + user. False otherwise. mapping> + extracted_ids - any ids extracted from the object. string + handle_error - if an error occurs while setting ACLs on embedded + external IDs, it will be reported here. If not for historical + reasons the parameter would be called "external_id_error". string + handle_stacktrace - the stacktrace for handle_error. As above, the + parameter should be called "external_id_stacktrace".) -> + structure: parameter "data" of unspecified object, parameter + "info" of type "object_info" (Information about an object, + including user provided metadata. obj_id objid - the numerical id + of the object. obj_name name - the name of the object. type_string + type - the type of the object. timestamp save_date - the save date + of the object. obj_ver ver - the version of the object. username + saved_by - the user that saved or copied the object. ws_id wsid - + the workspace containing the object. ws_name workspace - the + workspace containing the object. string chsum - the md5 checksum + of the object. int size - the size of the object in bytes. + usermeta meta - arbitrary user-supplied metadata about the + object.) -> tuple of size 11: parameter "objid" of type "obj_id" + (The unique, permanent numerical ID of an object.), parameter + "name" of type "obj_name" (A string used as a name for an object. + Any string consisting of alphanumeric characters and the + characters |._- that is not an integer is acceptable.), parameter + "type" of type "type_string" (A type string. Specifies the type + and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. 
A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String, parameter "path" of + list of type "obj_ref" (A string that uniquely identifies an + object in the workspace service. The format is [ws_name or + id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. 
Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "provenance" of list of type + "ProvenanceAction" (A provenance action. A provenance action (PA) + is an action taken while transforming one data object to another. + There may be several PAs taken in series. A PA is typically + running a script, running an api command, etc. All of the + following fields are optional, but more information provided + equates to better data provenance. If a provenance action has no + fields defined at all, it is silently dropped from the list. + resolved_ws_objects should never be set by the user; it is set by + the workspace service when returning data. On input, only one of + the time or epoch may be supplied. Both are supplied on output. + The maximum size of the entire provenance object, including all + actions, is 1MB. timestamp time - the time the action was started + epoch epoch - the time the action was started. string caller - the + name or id of the invoker of this provenance action. In most + cases, this will be the same for all PAs. string service - the + name of the service that performed this action. string service_ver + - the version of the service that performed this action. string + method - the method of the service that performed this action. + list method_params - the parameters of the + method that performed this action. If an object in the parameters + is a workspace object, also put the object reference in the + input_ws_object list. string script - the name of the script that + performed this action. string script_ver - the version of the + script that performed this action. string script_command_line - + the command line provided to the script that performed this + action. If workspace objects were provided in the command line, + also put the object reference in the input_ws_object list. 
+ list input_ws_objects - the workspace objects that + were used as input to this action; typically these will also be + present as parts of the method_params or the script_command_line + arguments. A reference path into the object graph may be supplied. + list resolved_ws_objects - the workspace objects ids from + input_ws_objects resolved to permanent workspace object references + by the workspace service. list intermediate_incoming - if + the previous action produced output that 1) was not stored in a + referrable way, and 2) is used as input for this action, provide + it with an arbitrary and unique ID here, in the order of the input + arguments to this action. These IDs can be used in the + method_params argument. list intermediate_outgoing - if + this action produced output that 1) was not stored in a referrable + way, and 2) is used as input for the next action, provide it with + an arbitrary and unique ID here, in the order of the output values + from this action. These IDs can be used in the + intermediate_incoming argument in the next action. + list external_data - data external to the + workspace that was either imported to the workspace or used to + create a workspace object. list subactions - the + subactions taken as a part of this action. mapping + custom - user definable custom provenance fields and their values. + string description - a free text description of this action.) 
-> + structure: parameter "time" of type "timestamp" (A time in the + format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "caller" + of String, parameter "service" of String, parameter "service_ver" + of String, parameter "method" of String, parameter "method_params" + of list of unspecified object, parameter "script" of String, + parameter "script_ver" of String, parameter "script_command_line" + of String, parameter "input_ws_objects" of list of type + "ref_string" (A chain of objects with references to one another as + a string. A single string that is semantically identical to + ref_chain above. Represents a path from one workspace object to + another through an arbitrarily number of intermediate objects + where each object has a dependency or provenance reference to the + next object. Each entry is an obj_ref as defined earlier. Entries + are separated by semicolons. Whitespace is ignored. Examples: + 3/5/6; kbaseuser:myworkspace/myobject; 5/myobject/2 aworkspace/6), + parameter "resolved_ws_objects" of list of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "intermediate_incoming" of list of String, parameter + "intermediate_outgoing" of list of String, parameter + "external_data" of list of type "ExternalDataUnit" (An external + data unit. A piece of data from a source outside the Workspace. On + input, only one of the resource_release_date or + resource_release_epoch may be supplied. Both are supplied on + output. All fields are optional, but at least one field must be + present. string resource_name - the name of the resource, for + example JGI. string resource_url - the url of the resource, for + example http://genome.jgi.doe.gov string resource_version - + version of the resource timestamp resource_release_date - the + release date of the resource epoch resource_release_epoch - the + release date of the resource string data_url - the url of the + data, for example + http://genome.jgi.doe.gov/pages/dynamicOrganismDownload.jsf? + organism=BlaspURHD0036 string data_id - the id of the data, for + example 7625.2.79179.AGTTCC.adnq.fastq.gz string description - a + free text description of the data.) 
-> structure: parameter + "resource_name" of String, parameter "resource_url" of String, + parameter "resource_version" of String, parameter + "resource_release_date" of type "timestamp" (A time in the format + YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "resource_release_epoch" of type "epoch" (A Unix + epoch (the time since 00:00:00 1/1/1970 UTC) in milliseconds.), + parameter "data_url" of String, parameter "data_id" of String, + parameter "description" of String, parameter "subactions" of list + of type "SubAction" (Information about a subaction that is invoked + by a provenance action. A provenance action (PA) may invoke + subactions (SA), e.g. calling a separate piece of code, a service, + or a script. In most cases these calls are the same from PA to PA + and so do not need to be listed in the provenance since providing + information about the PA alone provides reproducibility. In some + cases, however, SAs may change over time, such that invoking the + same PA with the same parameters may produce different results. + For example, if a PA calls a remote server, that server may be + updated between a PA invoked on day T and another PA invoked on + day T+1. The SubAction structure allows for specifying information + about SAs that may dynamically change from PA invocation to PA + invocation. All fields are optional but at least one field must be + present. string name - the name of the SA. string ver - the + version of SA. string code_url - a url pointing to the SA's + codebase. string commit - a version control commit ID for the SA. + string endpoint_url - a url pointing to the access point for the + SA - a server url, for instance.) 
-> structure: parameter "name" + of String, parameter "ver" of String, parameter "code_url" of + String, parameter "commit" of String, parameter "endpoint_url" of + String, parameter "custom" of mapping from String to String, + parameter "description" of String, parameter "creator" of type + "username" (Login name of a KBase user account.), parameter + "orig_wsid" of type "ws_id" (The unique, permanent numerical ID of + a workspace.), parameter "created" of type "timestamp" (A time in + the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "refs" + of list of type "obj_ref" (A string that uniquely identifies an + object in the workspace service. The format is [ws_name or + id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "copied" of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "copy_source_inaccessible" of type "boolean" (A boolean. 0 = + false, other = true.), parameter "extracted_ids" of mapping from + type "id_type" (An id type (e.g. from a typespec @id annotation: + @id [idtype])) to list of type "extracted_id" (An id extracted + from an object.), parameter "handle_error" of String, parameter + "handle_stacktrace" of String + """ + return self._client.call_method('Workspace.get_object_subset', + [sub_object_ids], self._service_ver, context) + + def get_object_history(self, object, context=None): + """ + Get an object's history. The version argument of the ObjectIdentity is + ignored. + :param object: instance of type "ObjectIdentity" (An object + identifier. Select an object by either: One, and only one, of the + numerical id or name of the workspace. ws_id wsid - the numerical + ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. 
Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + :returns: instance of list of type "object_info" (Information about + an object, including user provided metadata. obj_id objid - the + numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. 
Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) 
-> mapping from String to String + """ + return self._client.call_method('Workspace.get_object_history', + [object], self._service_ver, context) + + def list_referencing_objects(self, object_ids, context=None): + """ + List objects that reference one or more specified objects. References + in the deleted state are not returned. + :param object_ids: instance of list of type "ObjectIdentity" (An + object identifier. Select an object by either: One, and only one, + of the numerical id or name of the workspace. ws_id wsid - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. 
+ For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + :returns: instance of list of list of type "object_info" (Information + about an object, including user provided metadata. obj_id objid - + the numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. 
A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String + """ + return self._client.call_method('Workspace.list_referencing_objects', + [object_ids], self._service_ver, context) + + def list_referencing_object_counts(self, object_ids, context=None): + """ + DEPRECATED + List the number of times objects have been referenced. + This count includes both provenance and object-to-object references + and, unlike list_referencing_objects, includes objects that are + inaccessible to the user. + @deprecated + :param object_ids: instance of list of type "ObjectIdentity" (An + object identifier. Select an object by either: One, and only one, + of the numerical id or name of the workspace. 
ws_id wsid - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) 
+ :returns: instance of list of Long + """ + return self._client.call_method('Workspace.list_referencing_object_counts', + [object_ids], self._service_ver, context) + + def get_referenced_objects(self, ref_chains, context=None): + """ + DEPRECATED + Get objects by references from other objects. + NOTE: In the vast majority of cases, this method is not necessary and + get_objects should be used instead. + + get_referenced_objects guarantees that a user that has access to an + object can always see a) objects that are referenced inside the object + and b) objects that are referenced in the object's provenance. This + ensures that the user has visibility into the entire provenance of the + object and the object's object dependencies (e.g. references). + + The user must have at least read access to the first object in each + reference chain, but need not have access to any further objects in + the chain, and those objects may be deleted. + + @deprecated Workspace.get_objects2 + :param ref_chains: instance of list of type "ref_chain" (A chain of + objects with references to one another. An object reference chain + consists of a list of objects where the nth object possesses a + reference, either in the object itself or in the object + provenance, to the n+1th object.) -> list of type "ObjectIdentity" + (An object identifier. Select an object by either: One, and only + one, of the numerical id or name of the workspace. ws_id wsid - + the numerical ID of the workspace. ws_name workspace - the name of + the workspace. AND One, and only one, of the numerical id or name + of the object. obj_id objid- the numerical ID of the object. + obj_name name - name of the object. OPTIONALLY obj_ver ver - the + version of the object. OR an object reference string: obj_ref ref + - an object reference string.) -> structure: parameter "workspace" + of type "ws_name" (A string used as a name for a workspace. 
Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + :returns: instance of list of type "ObjectData" (The data and + supplemental info for an object. UnspecifiedObject data - the + object's data or subset data. object_info info - information about + the object. list path - the path to the object through + the object reference graph. All the references in the path are + absolute. list provenance - the object's + provenance. username creator - the user that first saved the + object to the workspace. ws_id orig_wsid - the id of the workspace + in which this object was originally saved. Missing for objects + saved prior to version 0.4.1. timestamp created - the date the + object was first saved to the workspace. 
epoch epoch - the date + the object was first saved to the workspace. list refs - + the references contained within the object. obj_ref copied - the + reference of the source object if this object is a copy and the + copy source exists and is accessible. null otherwise. boolean + copy_source_inaccessible - true if the object was copied from + another object, but that object is no longer accessible to the + user. False otherwise. mapping> + extracted_ids - any ids extracted from the object. string + handle_error - if an error occurs while setting ACLs on embedded + external IDs, it will be reported here. If not for historical + reasons the parameter would be called "external_id_error". string + handle_stacktrace - the stacktrace for handle_error. As above, the + parameter should be called "external_id_stacktrace".) -> + structure: parameter "data" of unspecified object, parameter + "info" of type "object_info" (Information about an object, + including user provided metadata. obj_id objid - the numerical id + of the object. obj_name name - the name of the object. type_string + type - the type of the object. timestamp save_date - the save date + of the object. obj_ver ver - the version of the object. username + saved_by - the user that saved or copied the object. ws_id wsid - + the workspace containing the object. ws_name workspace - the + workspace containing the object. string chsum - the md5 checksum + of the object. int size - the size of the object in bytes. + usermeta meta - arbitrary user-supplied metadata about the + object.) -> tuple of size 11: parameter "objid" of type "obj_id" + (The unique, permanent numerical ID of an object.), parameter + "name" of type "obj_name" (A string used as a name for an object. + Any string consisting of alphanumeric characters and the + characters |._- that is not an integer is acceptable.), parameter + "type" of type "type_string" (A type string. 
Specifies the type + and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String, parameter "path" of + list of type "obj_ref" (A string that uniquely identifies an + object in the workspace service. 
The format is [ws_name or + id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "provenance" of list of type + "ProvenanceAction" (A provenance action. A provenance action (PA) + is an action taken while transforming one data object to another. + There may be several PAs taken in series. A PA is typically + running a script, running an api command, etc. All of the + following fields are optional, but more information provided + equates to better data provenance. If a provenance action has no + fields defined at all, it is silently dropped from the list. + resolved_ws_objects should never be set by the user; it is set by + the workspace service when returning data. On input, only one of + the time or epoch may be supplied. Both are supplied on output. + The maximum size of the entire provenance object, including all + actions, is 1MB. timestamp time - the time the action was started + epoch epoch - the time the action was started. string caller - the + name or id of the invoker of this provenance action. In most + cases, this will be the same for all PAs. string service - the + name of the service that performed this action. string service_ver + - the version of the service that performed this action. string + method - the method of the service that performed this action. + list method_params - the parameters of the + method that performed this action. If an object in the parameters + is a workspace object, also put the object reference in the + input_ws_object list. string script - the name of the script that + performed this action. 
string script_ver - the version of the + script that performed this action. string script_command_line - + the command line provided to the script that performed this + action. If workspace objects were provided in the command line, + also put the object reference in the input_ws_object list. + list input_ws_objects - the workspace objects that + were used as input to this action; typically these will also be + present as parts of the method_params or the script_command_line + arguments. A reference path into the object graph may be supplied. + list resolved_ws_objects - the workspace objects ids from + input_ws_objects resolved to permanent workspace object references + by the workspace service. list intermediate_incoming - if + the previous action produced output that 1) was not stored in a + referrable way, and 2) is used as input for this action, provide + it with an arbitrary and unique ID here, in the order of the input + arguments to this action. These IDs can be used in the + method_params argument. list intermediate_outgoing - if + this action produced output that 1) was not stored in a referrable + way, and 2) is used as input for the next action, provide it with + an arbitrary and unique ID here, in the order of the output values + from this action. These IDs can be used in the + intermediate_incoming argument in the next action. + list external_data - data external to the + workspace that was either imported to the workspace or used to + create a workspace object. list subactions - the + subactions taken as a part of this action. mapping + custom - user definable custom provenance fields and their values. + string description - a free text description of this action.) 
-> + structure: parameter "time" of type "timestamp" (A time in the + format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "caller" + of String, parameter "service" of String, parameter "service_ver" + of String, parameter "method" of String, parameter "method_params" + of list of unspecified object, parameter "script" of String, + parameter "script_ver" of String, parameter "script_command_line" + of String, parameter "input_ws_objects" of list of type + "ref_string" (A chain of objects with references to one another as + a string. A single string that is semantically identical to + ref_chain above. Represents a path from one workspace object to + another through an arbitrarily number of intermediate objects + where each object has a dependency or provenance reference to the + next object. Each entry is an obj_ref as defined earlier. Entries + are separated by semicolons. Whitespace is ignored. Examples: + 3/5/6; kbaseuser:myworkspace/myobject; 5/myobject/2 aworkspace/6), + parameter "resolved_ws_objects" of list of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "intermediate_incoming" of list of String, parameter + "intermediate_outgoing" of list of String, parameter + "external_data" of list of type "ExternalDataUnit" (An external + data unit. A piece of data from a source outside the Workspace. On + input, only one of the resource_release_date or + resource_release_epoch may be supplied. Both are supplied on + output. All fields are optional, but at least one field must be + present. string resource_name - the name of the resource, for + example JGI. string resource_url - the url of the resource, for + example http://genome.jgi.doe.gov string resource_version - + version of the resource timestamp resource_release_date - the + release date of the resource epoch resource_release_epoch - the + release date of the resource string data_url - the url of the + data, for example + http://genome.jgi.doe.gov/pages/dynamicOrganismDownload.jsf? + organism=BlaspURHD0036 string data_id - the id of the data, for + example 7625.2.79179.AGTTCC.adnq.fastq.gz string description - a + free text description of the data.) 
-> structure: parameter + "resource_name" of String, parameter "resource_url" of String, + parameter "resource_version" of String, parameter + "resource_release_date" of type "timestamp" (A time in the format + YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "resource_release_epoch" of type "epoch" (A Unix + epoch (the time since 00:00:00 1/1/1970 UTC) in milliseconds.), + parameter "data_url" of String, parameter "data_id" of String, + parameter "description" of String, parameter "subactions" of list + of type "SubAction" (Information about a subaction that is invoked + by a provenance action. A provenance action (PA) may invoke + subactions (SA), e.g. calling a separate piece of code, a service, + or a script. In most cases these calls are the same from PA to PA + and so do not need to be listed in the provenance since providing + information about the PA alone provides reproducibility. In some + cases, however, SAs may change over time, such that invoking the + same PA with the same parameters may produce different results. + For example, if a PA calls a remote server, that server may be + updated between a PA invoked on day T and another PA invoked on + day T+1. The SubAction structure allows for specifying information + about SAs that may dynamically change from PA invocation to PA + invocation. All fields are optional but at least one field must be + present. string name - the name of the SA. string ver - the + version of SA. string code_url - a url pointing to the SA's + codebase. string commit - a version control commit ID for the SA. + string endpoint_url - a url pointing to the access point for the + SA - a server url, for instance.) 
-> structure: parameter "name" + of String, parameter "ver" of String, parameter "code_url" of + String, parameter "commit" of String, parameter "endpoint_url" of + String, parameter "custom" of mapping from String to String, + parameter "description" of String, parameter "creator" of type + "username" (Login name of a KBase user account.), parameter + "orig_wsid" of type "ws_id" (The unique, permanent numerical ID of + a workspace.), parameter "created" of type "timestamp" (A time in + the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "epoch" of type "epoch" (A Unix epoch (the time + since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "refs" + of list of type "obj_ref" (A string that uniquely identifies an + object in the workspace service. The format is [ws_name or + id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "copied" of type "obj_ref" (A + string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "copy_source_inaccessible" of type "boolean" (A boolean. 0 = + false, other = true.), parameter "extracted_ids" of mapping from + type "id_type" (An id type (e.g. from a typespec @id annotation: + @id [idtype])) to list of type "extracted_id" (An id extracted + from an object.), parameter "handle_error" of String, parameter + "handle_stacktrace" of String + """ + return self._client.call_method('Workspace.get_referenced_objects', + [ref_chains], self._service_ver, context) + + def list_workspaces(self, params, context=None): + """ + Lists the metadata of all workspaces a user has access to. Provided for + backwards compatibility - to be replaced by the functionality of + list_workspace_info + @deprecated Workspace.list_workspace_info + :param params: instance of type "list_workspaces_params" (Input + parameters for the "list_workspaces" function. Provided for + backwards compatibility. Optional parameters: string auth - the + authentication token of the KBase account accessing the list of + workspaces. Overrides the client provided authorization + credentials if they exist. boolean excludeGlobal - if + excludeGlobal is true exclude world readable workspaces. Defaults + to false. @deprecated Workspace.ListWorkspaceInfoParams) -> + structure: parameter "auth" of String, parameter "excludeGlobal" + of type "boolean" (A boolean. 0 = false, other = true.) + :returns: instance of list of type "workspace_metadata" (Meta data + associated with a workspace. Provided for backwards compatibility. + To be replaced by workspace_info. ws_name id - name of the + workspace username owner - name of the user who owns (who created) + this workspace timestamp moddate - date when the workspace was + last modified int objects - the approximate number of objects + currently stored in the workspace. 
permission user_permission - + permissions for the currently logged in user for the workspace + permission global_permission - default permissions for the + workspace for all KBase users ws_id num_id - numerical ID of the + workspace @deprecated Workspace.workspace_info) -> tuple of size + 7: parameter "id" of type "ws_name" (A string used as a name for a + workspace. Any string consisting of alphanumeric characters and + "_", ".", or "-" that is not an integer is acceptable. The name + may optionally be prefixed with the workspace owner's user name + and a colon, e.g. kbasetest:my_workspace.), parameter "owner" of + type "username" (Login name of a KBase user account.), parameter + "moddate" of type "timestamp" (A time in the format + YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "objects" of Long, parameter "user_permission" + of type "permission" (Represents the permissions a user or users + have to a workspace: 'a' - administrator. All operations allowed. + 'w' - read/write. 'r' - read. 'n' - no permissions.), parameter + "global_permission" of type "permission" (Represents the + permissions a user or users have to a workspace: 'a' - + administrator. All operations allowed. 'w' - read/write. 'r' - + read. 'n' - no permissions.), parameter "num_id" of type "ws_id" + (The unique, permanent numerical ID of a workspace.) + """ + return self._client.call_method('Workspace.list_workspaces', + [params], self._service_ver, context) + + def list_workspace_info(self, params, context=None): + """ + List workspaces viewable by the user. + :param params: instance of type "ListWorkspaceInfoParams" (Input + parameters for the "list_workspace_info" function. Only one of + each timestamp/epoch pair may be supplied. 
Optional parameters: + permission perm - filter workspaces by minimum permission level. + 'None' and 'readable' are ignored. list owners - filter + workspaces by owner. usermeta meta - filter workspaces by the user + supplied metadata. NOTE: only one key/value pair is supported at + this time. A full map is provided as input for the possibility for + expansion in the future. timestamp after - only return workspaces + that were modified after this date. timestamp before - only return + workspaces that were modified before this date. epoch after_epoch + - only return workspaces that were modified after this date. epoch + before_epoch - only return workspaces that were modified before + this date. boolean excludeGlobal - if excludeGlobal is true + exclude world readable workspaces. Defaults to false. boolean + showDeleted - show deleted workspaces that are owned by the user. + boolean showOnlyDeleted - only show deleted workspaces that are + owned by the user.) -> structure: parameter "perm" of type + "permission" (Represents the permissions a user or users have to a + workspace: 'a' - administrator. All operations allowed. 'w' - + read/write. 'r' - read. 'n' - no permissions.), parameter "owners" + of list of type "username" (Login name of a KBase user account.), + parameter "meta" of type "usermeta" (User provided metadata about + an object. Arbitrary key-value pairs provided by the user.) 
-> + mapping from String to String, parameter "after" of type + "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is + either the character Z (representing the UTC timezone) or the + difference in time to UTC in the format +/-HHMM, eg: + 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC + time) 2013-04-03T08:56:32Z (UTC time)), parameter "before" of type + "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is + either the character Z (representing the UTC timezone) or the + difference in time to UTC in the format +/-HHMM, eg: + 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC + time) 2013-04-03T08:56:32Z (UTC time)), parameter "after_epoch" of + type "epoch" (A Unix epoch (the time since 00:00:00 1/1/1970 UTC) + in milliseconds.), parameter "before_epoch" of type "epoch" (A + Unix epoch (the time since 00:00:00 1/1/1970 UTC) in + milliseconds.), parameter "excludeGlobal" of type "boolean" (A + boolean. 0 = false, other = true.), parameter "showDeleted" of + type "boolean" (A boolean. 0 = false, other = true.), parameter + "showOnlyDeleted" of type "boolean" (A boolean. 0 = false, other = + true.) + :returns: instance of list of type "workspace_info" (Information + about a workspace. ws_id id - the numerical ID of the workspace. + ws_name workspace - name of the workspace. username owner - name + of the user who owns (e.g. created) this workspace. timestamp + moddate - date when the workspace was last modified. int max_objid + - the maximum object ID appearing in this workspace. Since cloning + a workspace preserves object IDs, this number may be greater than + the number of objects in a newly cloned workspace. permission + user_permission - permissions for the authenticated user of this + workspace. permission globalread - whether this workspace is + globally readable. lock_status lockstat - the status of the + workspace lock. usermeta metadata - arbitrary user-supplied + metadata about the workspace.) 
-> tuple of size 9: parameter "id" + of type "ws_id" (The unique, permanent numerical ID of a + workspace.), parameter "workspace" of type "ws_name" (A string + used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "owner" of type "username" + (Login name of a KBase user account.), parameter "moddate" of type + "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is + either the character Z (representing the UTC timezone) or the + difference in time to UTC in the format +/-HHMM, eg: + 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC + time) 2013-04-03T08:56:32Z (UTC time)), parameter "max_objid" of + Long, parameter "user_permission" of type "permission" (Represents + the permissions a user or users have to a workspace: 'a' - + administrator. All operations allowed. 'w' - read/write. 'r' - + read. 'n' - no permissions.), parameter "globalread" of type + "permission" (Represents the permissions a user or users have to a + workspace: 'a' - administrator. All operations allowed. 'w' - + read/write. 'r' - read. 'n' - no permissions.), parameter + "lockstat" of type "lock_status" (The lock status of a workspace. + One of 'unlocked', 'locked', or 'published'.), parameter + "metadata" of type "usermeta" (User provided metadata about an + object. Arbitrary key-value pairs provided by the user.) -> + mapping from String to String + """ + return self._client.call_method('Workspace.list_workspace_info', + [params], self._service_ver, context) + + def list_workspace_ids(self, params, context=None): + """ + List workspace IDs to which the user has access. + This function returns a subset of the information in the + list_workspace_info method and should be substantially faster. 
+ :param params: instance of type "ListWorkspaceIDsParams" (Input + parameters for the "list_workspace_ids" function. Optional + parameters: permission perm - filter workspaces by minimum + permission level. 'None' and 'readable' are ignored. boolean + onlyGlobal - if onlyGlobal is true only include world readable + workspaces. Defaults to false. If true, excludeGlobal is ignored. + boolean excludeGlobal - if excludeGlobal is true exclude world + readable workspaces. Defaults to true.) -> structure: parameter + "perm" of type "permission" (Represents the permissions a user or + users have to a workspace: 'a' - administrator. All operations + allowed. 'w' - read/write. 'r' - read. 'n' - no permissions.), + parameter "excludeGlobal" of type "boolean" (A boolean. 0 = false, + other = true.), parameter "onlyGlobal" of type "boolean" (A + boolean. 0 = false, other = true.) + :returns: instance of type "ListWorkspaceIDsResults" (Results of the + "list_workspace_ids" function. list workspaces - the + workspaces to which the user has explicit access. list pub - + the workspaces to which the user has access because they're + globally readable.) -> structure: parameter "workspaces" of list + of Long, parameter "pub" of list of Long + """ + return self._client.call_method('Workspace.list_workspace_ids', + [params], self._service_ver, context) + + def list_workspace_objects(self, params, context=None): + """ + Lists the metadata of all objects in the specified workspace with the + specified type (or with any type). Provided for backwards compatibility. + @deprecated Workspace.list_objects + :param params: instance of type "list_workspace_objects_params" + (Input parameters for the "list_workspace_objects" function. + Provided for backwards compatibility. Required arguments: ws_name + workspace - Name of the workspace for which objects should be + listed Optional arguments: type_string type - type of the objects + to be listed. 
Here, omitting version information will find any + objects that match the provided type - e.g. Foo.Bar-0 will match + Foo.Bar-0.X where X is any existing version. boolean + showDeletedObject - show objects that have been deleted string + auth - the authentication token of the KBase account requesting + access. Overrides the client provided authorization credentials if + they exist. @deprecated Workspace.ListObjectsParams) -> structure: + parameter "workspace" of type "ws_name" (A string used as a name + for a workspace. Any string consisting of alphanumeric characters + and "_", ".", or "-" that is not an integer is acceptable. The + name may optionally be prefixed with the workspace owner's user + name and a colon, e.g. kbasetest:my_workspace.), parameter "type" + of type "type_string" (A type string. Specifies the type and its + version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "showDeletedObject" of type + "boolean" (A boolean. 0 = false, other = true.), parameter "auth" + of String + :returns: instance of list of type "object_metadata" (Meta data + associated with an object stored in a workspace. Provided for + backwards compatibility. obj_name id - name of the object. + type_string type - type of the object. 
timestamp moddate - date + when the object was saved obj_ver instance - the version of the + object string command - Deprecated. Always returns the empty + string. username lastmodifier - name of the user who last saved + the object, including copying the object username owner - + Deprecated. Same as lastmodifier. ws_name workspace - name of the + workspace in which the object is stored string ref - Deprecated. + Always returns the empty string. string chsum - the md5 checksum + of the object. usermeta metadata - arbitrary user-supplied + metadata about the object. obj_id objid - the numerical id of the + object. @deprecated object_info) -> tuple of size 12: parameter + "id" of type "obj_name" (A string used as a name for an object. + Any string consisting of alphanumeric characters and the + characters |._- that is not an integer is acceptable.), parameter + "type" of type "type_string" (A type string. Specifies the type + and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. 
Example: + MyModule.MyType-3.1), parameter "moddate" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "instance" of Long, + parameter "command" of String, parameter "lastmodifier" of type + "username" (Login name of a KBase user account.), parameter + "owner" of type "username" (Login name of a KBase user account.), + parameter "workspace" of type "ws_name" (A string used as a name + for a workspace. Any string consisting of alphanumeric characters + and "_", ".", or "-" that is not an integer is acceptable. The + name may optionally be prefixed with the workspace owner's user + name and a colon, e.g. kbasetest:my_workspace.), parameter "ref" + of String, parameter "chsum" of String, parameter "metadata" of + type "usermeta" (User provided metadata about an object. Arbitrary + key-value pairs provided by the user.) -> mapping from String to + String, parameter "objid" of type "obj_id" (The unique, permanent + numerical ID of an object.) + """ + return self._client.call_method('Workspace.list_workspace_objects', + [params], self._service_ver, context) + + def list_objects(self, params, context=None): + """ + List objects in one or more workspaces. + :param params: instance of type "ListObjectsParams" (Parameters for + the 'list_objects' function. At least one, and no more than 10000, + workspaces must be specified in one of the two following + parameters. It is strongly recommended that the list is restricted + to the workspaces of interest, or the results may be very large: + list ids - the numerical IDs of the workspaces of interest. + list workspaces - the names of the workspaces of + interest. Only one of each timestamp/epoch pair may be supplied. 
+ Optional arguments: type_string type - type of the objects to be + listed. Here, omitting version information will find any objects + that match the provided type - e.g. Foo.Bar-0 will match + Foo.Bar-0.X where X is any existing version. permission perm - + DEPRECATED, no longer useful. Filter on minimum permission by + providing only workspaces with the desired permission levels in + the input list(s). list savedby - filter objects by the + user that saved or copied the object. usermeta meta - filter + objects by the user supplied metadata. NOTE: only one key/value + pair is supported at this time. A full map is provided as input + for the possibility for expansion in the future. timestamp after - + only return objects that were created after this date. timestamp + before - only return objects that were created before this date. + epoch after_epoch - only return objects that were created after + this date. epoch before_epoch - only return objects that were + created before this date. string startafter - a reference-like + string that determines where the list of objects will begin. It + takes the form X/Y/Z, where X is the workspace ID, Y the object + ID, and Z the version. The version may be omitted, and the object + ID omitted if the version is also omitted. After a '/' separator + either an integer or no characters at all, including whitespace, + may occur. Whitespace strings are ignored. If startafter is + provided, after, before, after_epoch, before_epoch, savedby, meta, + minObjectID, and maxObjectID may not be provided. Only objects + that are ordered after the reference, exclusive, will be included + in the result, and the resulting list will be sorted by reference. + obj_id minObjectID - only return objects with an object id greater + or equal to this value. obj_id maxObjectID - only return objects + with an object id less than or equal to this value. boolean + showDeleted - show deleted objects in workspaces to which the user + has write access. 
boolean showOnlyDeleted - only show deleted + objects in workspaces to which the user has write access. boolean + showHidden - show hidden objects. boolean showAllVersions - show + all versions of each object that match the filters rather than + only the most recent version. boolean includeMetadata - include + the user provided metadata in the returned object_info. If false + (0 or null), the default, the metadata will be null. boolean + excludeGlobal - DEPRECATED, no longer useful. Filter on global + workspaces by excluding them from the input workspace list(s). int + limit - limit the output to X objects. Default and maximum value + is 10000. Limit values < 1 are treated as 10000, the default.) -> + structure: parameter "workspaces" of list of type "ws_name" (A + string used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "ids" of list of type "ws_id" + (The unique, permanent numerical ID of a workspace.), parameter + "type" of type "type_string" (A type string. Specifies the type + and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. 
Example: + MyModule.MyType-3.1), parameter "perm" of type "permission" + (Represents the permissions a user or users have to a workspace: + 'a' - administrator. All operations allowed. 'w' - read/write. 'r' + - read. 'n' - no permissions.), parameter "savedby" of list of + type "username" (Login name of a KBase user account.), parameter + "meta" of type "usermeta" (User provided metadata about an object. + Arbitrary key-value pairs provided by the user.) -> mapping from + String to String, parameter "after" of type "timestamp" (A time in + the format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "before" of type "timestamp" (A time in the + format YYYY-MM-DDThh:mm:ssZ, where Z is either the character Z + (representing the UTC timezone) or the difference in time to UTC + in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 (EST time) + 2013-04-03T08:56:32+0000 (UTC time) 2013-04-03T08:56:32Z (UTC + time)), parameter "after_epoch" of type "epoch" (A Unix epoch (the + time since 00:00:00 1/1/1970 UTC) in milliseconds.), parameter + "before_epoch" of type "epoch" (A Unix epoch (the time since + 00:00:00 1/1/1970 UTC) in milliseconds.), parameter "startafter" + of String, parameter "minObjectID" of type "obj_id" (The unique, + permanent numerical ID of an object.), parameter "maxObjectID" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "showDeleted" of type "boolean" (A boolean. 0 = false, + other = true.), parameter "showOnlyDeleted" of type "boolean" (A + boolean. 0 = false, other = true.), parameter "showHidden" of type + "boolean" (A boolean. 0 = false, other = true.), parameter + "showAllVersions" of type "boolean" (A boolean. 
0 = false, other = + true.), parameter "includeMetadata" of type "boolean" (A boolean. + 0 = false, other = true.), parameter "excludeGlobal" of type + "boolean" (A boolean. 0 = false, other = true.), parameter "limit" + of Long + :returns: instance of list of type "object_info" (Information about + an object, including user provided metadata. obj_id objid - the + numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. 
In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String + """ + return self._client.call_method('Workspace.list_objects', + [params], self._service_ver, context) + + def get_objectmeta(self, params, context=None): + """ + Retrieves the metadata for a specified object from the specified + workspace. Provides access to metadata for all versions of the object + via the instance parameter. Provided for backwards compatibility. + @deprecated Workspace.get_object_info3 + :param params: instance of type "get_objectmeta_params" (Input + parameters for the "get_objectmeta" function. 
Required arguments: + ws_name workspace - name of the workspace containing the object + for which metadata is to be retrieved obj_name id - name of the + object for which metadata is to be retrieved Optional arguments: + int instance - Version of the object for which metadata is to be + retrieved, enabling retrieval of any previous version of an object + string auth - the authentication token of the KBase account + requesting access. Overrides the client provided authorization + credentials if they exist. @deprecated Workspace.ObjectIdentity) + -> structure: parameter "id" of type "obj_name" (A string used as + a name for an object. Any string consisting of alphanumeric + characters and the characters |._- that is not an integer is + acceptable.), parameter "workspace" of type "ws_name" (A string + used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "instance" of Long, parameter + "auth" of String + :returns: instance of type "object_metadata" (Meta data associated + with an object stored in a workspace. Provided for backwards + compatibility. obj_name id - name of the object. type_string type + - type of the object. timestamp moddate - date when the object was + saved obj_ver instance - the version of the object string command + - Deprecated. Always returns the empty string. username + lastmodifier - name of the user who last saved the object, + including copying the object username owner - Deprecated. Same as + lastmodifier. ws_name workspace - name of the workspace in which + the object is stored string ref - Deprecated. Always returns the + empty string. string chsum - the md5 checksum of the object. + usermeta metadata - arbitrary user-supplied metadata about the + object. obj_id objid - the numerical id of the object. 
@deprecated + object_info) -> tuple of size 12: parameter "id" of type + "obj_name" (A string used as a name for an object. Any string + consisting of alphanumeric characters and the characters |._- that + is not an integer is acceptable.), parameter "type" of type + "type_string" (A type string. Specifies the type and its version + in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "moddate" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "instance" of Long, + parameter "command" of String, parameter "lastmodifier" of type + "username" (Login name of a KBase user account.), parameter + "owner" of type "username" (Login name of a KBase user account.), + parameter "workspace" of type "ws_name" (A string used as a name + for a workspace. Any string consisting of alphanumeric characters + and "_", ".", or "-" that is not an integer is acceptable. The + name may optionally be prefixed with the workspace owner's user + name and a colon, e.g. 
kbasetest:my_workspace.), parameter "ref" + of String, parameter "chsum" of String, parameter "metadata" of + type "usermeta" (User provided metadata about an object. Arbitrary + key-value pairs provided by the user.) -> mapping from String to + String, parameter "objid" of type "obj_id" (The unique, permanent + numerical ID of an object.) + """ + return self._client.call_method('Workspace.get_objectmeta', + [params], self._service_ver, context) + + def get_object_info(self, object_ids, includeMetadata, context=None): + """ + Get information about objects from the workspace. + Set includeMetadata true to include the user specified metadata. + Otherwise the metadata in the object_info will be null. + This method will be replaced by the behavior of get_object_info_new + in the future. + @deprecated Workspace.get_object_info3 + :param object_ids: instance of list of type "ObjectIdentity" (An + object identifier. Select an object by either: One, and only one, + of the numerical id or name of the workspace. ws_id wsid - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. 
Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + :param includeMetadata: instance of type "boolean" (A boolean. 0 = + false, other = true.) + :returns: instance of list of type "object_info" (Information about + an object, including user provided metadata. obj_id objid - the + numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. 
Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) 
-> mapping from String to String + """ + return self._client.call_method('Workspace.get_object_info', + [object_ids, includeMetadata], self._service_ver, context) + + def get_object_info_new(self, params, context=None): + """ + Get information about objects from the workspace. + @deprecated Workspace.get_object_info3 + :param params: instance of type "GetObjectInfoNewParams" (Input + parameters for the "get_object_info_new" function. Required + arguments: list objects - the objects for + which the information should be fetched. Subsetting related + parameters are ignored. Optional arguments: boolean + includeMetadata - include the object metadata in the returned + information. Default false. boolean ignoreErrors - Don't throw an + exception if an object cannot be accessed; return null for that + object's information instead. Default false. @deprecated + Workspace.GetObjectInfo3Params) -> structure: parameter "objects" + of list of type "ObjectSpecification" (An Object Specification + (OS). Inherits from ObjectIdentity (OI). Specifies which object, + and which parts of that object, to retrieve from the Workspace + Service. The fields wsid, workspace, objid, name, and ver are + identical to the OI fields. The ref field's behavior is extended + from OI. It maintains its previous behavior, but now also can act + as a reference string. See reference following below for more + information. REFERENCE FOLLOWING: Reference following guarantees + that a user that has access to an object can always see a) objects + that are referenced inside the object and b) objects that are + referenced in the object's provenance. This ensures that the user + has visibility into the entire provenance of the object and the + object's object dependencies (e.g. references). The user must have + at least read access to the object specified in this SO, but need + not have access to any further objects in the reference chain, and + those objects may be deleted. 
Optional reference following fields: + Note that only one of the following fields may be specified. + ref_chain obj_path - a path to the desired object from the object + specified in this OS. In other words, the object specified in this + OS is assumed to be accessible to the user, and the objects in the + object path represent a chain of references to the desired object + at the end of the object path. If the references are all valid, + the desired object will be returned. - OR - list + obj_ref_path - shorthand for the obj_path. - OR - ref_chain + to_obj_path - identical to obj_path, except that the path is TO + the object specified in this OS, rather than from the object. In + other words the object specified by wsid/objid/ref etc. is the end + of the path, and to_obj_path is the rest of the path. The user + must have access to the first object in the to_obj_path. - OR - + list to_obj_ref_path - shorthand for the to_obj_path. - + OR - ref_string ref - A string representing a reference path from + one object to another. Unlike the previous reference following + options, the ref_string represents the ENTIRE path from the source + object to the target object. As with the OI object, the ref field + may contain a single reference. - OR - boolean find_refence_path - + This is the last, slowest, and most expensive resort for getting a + referenced object - do not use this method unless the path to the + object is unavailable by any other means. Setting the + find_refence_path parameter to true means that the workspace + service will search through the object reference graph from the + object specified in this OS to find an object that 1) the user can + access, and 2) has an unbroken reference path to the target + object. If the search succeeds, the object will be returned as + normal. Note that the search will automatically fail after a + certain (but much larger than necessary for the vast majority of + cases) number of objects are traversed. 
OBJECT SUBSETS: When + selecting a subset of an array in an object, the returned array is + compressed to the size of the subset, but the ordering of the + array is maintained. For example, if the array stored at the + 'feature' key of a Genome object has 4000 entries, and the object + paths provided are: /feature/7 /feature/3015 /feature/700 The + returned feature array will be of length three and the entries + will consist, in order, of the 7th, 700th, and 3015th entries of + the original array. Optional object subset fields: + list included - the portions of the object to include + in the object subset. boolean strict_maps - if true, throw an + exception if the subset specification traverses a non-existent map + key (default false) boolean strict_arrays - if true, throw an + exception if the subset specification exceeds the size of an array + (default true)) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type + "ref_string" (A chain of objects with references to one another as + a string. A single string that is semantically identical to + ref_chain above. 
Represents a path from one workspace object to + another through an arbitrarily number of intermediate objects + where each object has a dependency or provenance reference to the + next object. Each entry is an obj_ref as defined earlier. Entries + are separated by semicolons. Whitespace is ignored. Examples: + 3/5/6; kbaseuser:myworkspace/myobject; 5/myobject/2 aworkspace/6), + parameter "obj_path" of type "ref_chain" (A chain of objects with + references to one another. An object reference chain consists of a + list of objects where the nth object possesses a reference, either + in the object itself or in the object provenance, to the n+1th + object.) -> list of type "ObjectIdentity" (An object identifier. + Select an object by either: One, and only one, of the numerical id + or name of the workspace. ws_id wsid - the numerical ID of the + workspace. ws_name workspace - the name of the workspace. AND One, + and only one, of the numerical id or name of the object. obj_id + objid- the numerical ID of the object. obj_name name - name of the + object. OPTIONALLY obj_ver ver - the version of the object. OR an + object reference string: obj_ref ref - an object reference + string.) -> structure: parameter "workspace" of type "ws_name" (A + string used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. 
The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "obj_ref_path" of list of type "obj_ref" (A string that uniquely + identifies an object in the workspace service. The format is + [ws_name or id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "to_obj_path" of type + "ref_chain" (A chain of objects with references to one another. An + object reference chain consists of a list of objects where the nth + object possesses a reference, either in the object itself or in + the object provenance, to the n+1th object.) -> list of type + "ObjectIdentity" (An object identifier. Select an object by + either: One, and only one, of the numerical id or name of the + workspace. ws_id wsid - the numerical ID of the workspace. ws_name + workspace - the name of the workspace. AND One, and only one, of + the numerical id or name of the object. obj_id objid- the + numerical ID of the object. obj_name name - name of the object. 
+ OPTIONALLY obj_ver ver - the version of the object. OR an object + reference string: obj_ref ref - an object reference string.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "wsid" of type "ws_id" (The unique, permanent numerical + ID of a workspace.), parameter "name" of type "obj_name" (A string + used as a name for an object. Any string consisting of + alphanumeric characters and the characters |._- that is not an + integer is acceptable.), parameter "objid" of type "obj_id" (The + unique, permanent numerical ID of an object.), parameter "ver" of + type "obj_ver" (An object version. The version of the object, + starting at 1.), parameter "ref" of type "obj_ref" (A string that + uniquely identifies an object in the workspace service. The format + is [ws_name or id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "to_obj_ref_path" of list of + type "obj_ref" (A string that uniquely identifies an object in the + workspace service. The format is [ws_name or id]/[obj_name or + id]/[obj_ver]. For example, MyFirstWorkspace/MyFirstObject/3 would + identify the third version of an object called MyFirstObject in + the workspace called MyFirstWorkspace. 42/Panic/1 would identify + the first version of the object name Panic in workspace with id + 42. 
Towel/1/6 would identify the 6th version of the object with id + 1 in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "find_reference_path" of type "boolean" (A boolean. 0 = false, + other = true.), parameter "included" of list of type "object_path" + (A path into an object. Identify a sub portion of an object by + providing the path, delimited by a slash (/), to that portion of + the object. Thus the path may not have slashes in the structure or + mapping keys. Examples: /foo/bar/3 - specifies the bar key of the + foo mapping and the 3rd entry of the array if bar maps to an array + or the value mapped to the string "3" if bar maps to a map. + /foo/bar/[*]/baz - specifies the baz field of all the objects in + the list mapped by the bar key in the map foo. /foo/asterisk/baz - + specifies the baz field of all the objects in the values of the + foo mapping. Swap 'asterisk' for * in the path. In case you need + to use '/' or '~' in path items use JSON Pointer notation defined + here: http://tools.ietf.org/html/rfc6901), parameter "strict_maps" + of type "boolean" (A boolean. 0 = false, other = true.), parameter + "strict_arrays" of type "boolean" (A boolean. 0 = false, other = + true.), parameter "includeMetadata" of type "boolean" (A boolean. + 0 = false, other = true.), parameter "ignoreErrors" of type + "boolean" (A boolean. 0 = false, other = true.) + :returns: instance of list of type "object_info" (Information about + an object, including user provided metadata. obj_id objid - the + numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. 
string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. 
Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String + """ + return self._client.call_method('Workspace.get_object_info_new', + [params], self._service_ver, context) + + def get_object_info3(self, params, context=None): + """ + :param params: instance of type "GetObjectInfo3Params" (Input + parameters for the "get_object_info3" function. Required + arguments: list objects - the objects for + which the information should be fetched. Subsetting related + parameters are ignored. Optional arguments: boolean + includeMetadata - include the object metadata in the returned + information. Default false. boolean ignoreErrors - Don't throw an + exception if an object cannot be accessed; return null for that + object's information and path instead. Default false.) -> + structure: parameter "objects" of list of type + "ObjectSpecification" (An Object Specification (OS). Inherits from + ObjectIdentity (OI). Specifies which object, and which parts of + that object, to retrieve from the Workspace Service. The fields + wsid, workspace, objid, name, and ver are identical to the OI + fields. The ref field's behavior is extended from OI. It maintains + its previous behavior, but now also can act as a reference string. + See reference following below for more information. REFERENCE + FOLLOWING: Reference following guarantees that a user that has + access to an object can always see a) objects that are referenced + inside the object and b) objects that are referenced in the + object's provenance. 
This ensures that the user has visibility + into the entire provenance of the object and the object's object + dependencies (e.g. references). The user must have at least read + access to the object specified in this SO, but need not have + access to any further objects in the reference chain, and those + objects may be deleted. Optional reference following fields: Note + that only one of the following fields may be specified. ref_chain + obj_path - a path to the desired object from the object specified + in this OS. In other words, the object specified in this OS is + assumed to be accessible to the user, and the objects in the + object path represent a chain of references to the desired object + at the end of the object path. If the references are all valid, + the desired object will be returned. - OR - list + obj_ref_path - shorthand for the obj_path. - OR - ref_chain + to_obj_path - identical to obj_path, except that the path is TO + the object specified in this OS, rather than from the object. In + other words the object specified by wsid/objid/ref etc. is the end + of the path, and to_obj_path is the rest of the path. The user + must have access to the first object in the to_obj_path. - OR - + list to_obj_ref_path - shorthand for the to_obj_path. - + OR - ref_string ref - A string representing a reference path from + one object to another. Unlike the previous reference following + options, the ref_string represents the ENTIRE path from the source + object to the target object. As with the OI object, the ref field + may contain a single reference. - OR - boolean find_refence_path - + This is the last, slowest, and most expensive resort for getting a + referenced object - do not use this method unless the path to the + object is unavailable by any other means. 
Setting the + find_refence_path parameter to true means that the workspace + service will search through the object reference graph from the + object specified in this OS to find an object that 1) the user can + access, and 2) has an unbroken reference path to the target + object. If the search succeeds, the object will be returned as + normal. Note that the search will automatically fail after a + certain (but much larger than necessary for the vast majority of + cases) number of objects are traversed. OBJECT SUBSETS: When + selecting a subset of an array in an object, the returned array is + compressed to the size of the subset, but the ordering of the + array is maintained. For example, if the array stored at the + 'feature' key of a Genome object has 4000 entries, and the object + paths provided are: /feature/7 /feature/3015 /feature/700 The + returned feature array will be of length three and the entries + will consist, in order, of the 7th, 700th, and 3015th entries of + the original array. Optional object subset fields: + list included - the portions of the object to include + in the object subset. boolean strict_maps - if true, throw an + exception if the subset specification traverses a non-existent map + key (default false) boolean strict_arrays - if true, throw an + exception if the subset specification exceeds the size of an array + (default true)) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. 
Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type + "ref_string" (A chain of objects with references to one another as + a string. A single string that is semantically identical to + ref_chain above. Represents a path from one workspace object to + another through an arbitrarily number of intermediate objects + where each object has a dependency or provenance reference to the + next object. Each entry is an obj_ref as defined earlier. Entries + are separated by semicolons. Whitespace is ignored. Examples: + 3/5/6; kbaseuser:myworkspace/myobject; 5/myobject/2 aworkspace/6), + parameter "obj_path" of type "ref_chain" (A chain of objects with + references to one another. An object reference chain consists of a + list of objects where the nth object possesses a reference, either + in the object itself or in the object provenance, to the n+1th + object.) -> list of type "ObjectIdentity" (An object identifier. + Select an object by either: One, and only one, of the numerical id + or name of the workspace. ws_id wsid - the numerical ID of the + workspace. ws_name workspace - the name of the workspace. AND One, + and only one, of the numerical id or name of the object. obj_id + objid- the numerical ID of the object. obj_name name - name of the + object. OPTIONALLY obj_ver ver - the version of the object. OR an + object reference string: obj_ref ref - an object reference + string.) -> structure: parameter "workspace" of type "ws_name" (A + string used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. 
+ kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "obj_ref_path" of list of type "obj_ref" (A string that uniquely + identifies an object in the workspace service. The format is + [ws_name or id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "to_obj_path" of type + "ref_chain" (A chain of objects with references to one another. 
An + object reference chain consists of a list of objects where the nth + object possesses a reference, either in the object itself or in + the object provenance, to the n+1th object.) -> list of type + "ObjectIdentity" (An object identifier. Select an object by + either: One, and only one, of the numerical id or name of the + workspace. ws_id wsid - the numerical ID of the workspace. ws_name + workspace - the name of the workspace. AND One, and only one, of + the numerical id or name of the object. obj_id objid- the + numerical ID of the object. obj_name name - name of the object. + OPTIONALLY obj_ver ver - the version of the object. OR an object + reference string: obj_ref ref - an object reference string.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "wsid" of type "ws_id" (The unique, permanent numerical + ID of a workspace.), parameter "name" of type "obj_name" (A string + used as a name for an object. Any string consisting of + alphanumeric characters and the characters |._- that is not an + integer is acceptable.), parameter "objid" of type "obj_id" (The + unique, permanent numerical ID of an object.), parameter "ver" of + type "obj_ver" (An object version. The version of the object, + starting at 1.), parameter "ref" of type "obj_ref" (A string that + uniquely identifies an object in the workspace service. The format + is [ws_name or id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. 
Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "to_obj_ref_path" of list of + type "obj_ref" (A string that uniquely identifies an object in the + workspace service. The format is [ws_name or id]/[obj_name or + id]/[obj_ver]. For example, MyFirstWorkspace/MyFirstObject/3 would + identify the third version of an object called MyFirstObject in + the workspace called MyFirstWorkspace. 42/Panic/1 would identify + the first version of the object name Panic in workspace with id + 42. Towel/1/6 would identify the 6th version of the object with id + 1 in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter + "find_reference_path" of type "boolean" (A boolean. 0 = false, + other = true.), parameter "included" of list of type "object_path" + (A path into an object. Identify a sub portion of an object by + providing the path, delimited by a slash (/), to that portion of + the object. Thus the path may not have slashes in the structure or + mapping keys. Examples: /foo/bar/3 - specifies the bar key of the + foo mapping and the 3rd entry of the array if bar maps to an array + or the value mapped to the string "3" if bar maps to a map. + /foo/bar/[*]/baz - specifies the baz field of all the objects in + the list mapped by the bar key in the map foo. /foo/asterisk/baz - + specifies the baz field of all the objects in the values of the + foo mapping. Swap 'asterisk' for * in the path. In case you need + to use '/' or '~' in path items use JSON Pointer notation defined + here: http://tools.ietf.org/html/rfc6901), parameter "strict_maps" + of type "boolean" (A boolean. 0 = false, other = true.), parameter + "strict_arrays" of type "boolean" (A boolean. 0 = false, other = + true.), parameter "includeMetadata" of type "boolean" (A boolean. 
+ 0 = false, other = true.), parameter "ignoreErrors" of type + "boolean" (A boolean. 0 = false, other = true.) + :returns: instance of type "GetObjectInfo3Results" (Output from the + get_object_info3 function. list infos - the + object_info data for each object. list paths - the + path to the object through the object reference graph for each + object. All the references in the path are absolute.) -> + structure: parameter "infos" of list of type "object_info" + (Information about an object, including user provided metadata. + obj_id objid - the numerical id of the object. obj_name name - the + name of the object. type_string type - the type of the object. + timestamp save_date - the save date of the object. obj_ver ver - + the version of the object. username saved_by - the user that saved + or copied the object. ws_id wsid - the workspace containing the + object. ws_name workspace - the workspace containing the object. + string chsum - the md5 checksum of the object. int size - the size + of the object in bytes. usermeta meta - arbitrary user-supplied + metadata about the object.) -> tuple of size 11: parameter "objid" + of type "obj_id" (The unique, permanent numerical ID of an + object.), parameter "name" of type "obj_name" (A string used as a + name for an object. Any string consisting of alphanumeric + characters and the characters |._- that is not an integer is + acceptable.), parameter "type" of type "type_string" (A type + string. Specifies the type and its version in a single string in + the format [module].[typename]-[major].[minor]: module - a string. + The module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. 
+ A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String, parameter "paths" of + list of list of type "obj_ref" (A string that uniquely identifies + an object in the workspace service. The format is [ws_name or + id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.) 
+ """ + return self._client.call_method('Workspace.get_object_info3', + [params], self._service_ver, context) + + def rename_workspace(self, params, context=None): + """ + Rename a workspace. + :param params: instance of type "RenameWorkspaceParams" (Input + parameters for the 'rename_workspace' function. Required + arguments: WorkspaceIdentity wsi - the workspace to rename. + ws_name new_name - the new name for the workspace.) -> structure: + parameter "wsi" of type "WorkspaceIdentity" (A workspace + identifier. Select a workspace by one, and only one, of the + numerical id or name. ws_id id - the numerical ID of the + workspace. ws_name workspace - the name of the workspace.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "id" of type "ws_id" (The unique, permanent numerical ID + of a workspace.), parameter "new_name" of type "ws_name" (A string + used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.) + :returns: instance of type "workspace_info" (Information about a + workspace. ws_id id - the numerical ID of the workspace. ws_name + workspace - name of the workspace. username owner - name of the + user who owns (e.g. created) this workspace. timestamp moddate - + date when the workspace was last modified. int max_objid - the + maximum object ID appearing in this workspace. Since cloning a + workspace preserves object IDs, this number may be greater than + the number of objects in a newly cloned workspace. 
permission + user_permission - permissions for the authenticated user of this + workspace. permission globalread - whether this workspace is + globally readable. lock_status lockstat - the status of the + workspace lock. usermeta metadata - arbitrary user-supplied + metadata about the workspace.) -> tuple of size 9: parameter "id" + of type "ws_id" (The unique, permanent numerical ID of a + workspace.), parameter "workspace" of type "ws_name" (A string + used as a name for a workspace. Any string consisting of + alphanumeric characters and "_", ".", or "-" that is not an + integer is acceptable. The name may optionally be prefixed with + the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "owner" of type "username" + (Login name of a KBase user account.), parameter "moddate" of type + "timestamp" (A time in the format YYYY-MM-DDThh:mm:ssZ, where Z is + either the character Z (representing the UTC timezone) or the + difference in time to UTC in the format +/-HHMM, eg: + 2012-12-17T23:24:06-0500 (EST time) 2013-04-03T08:56:32+0000 (UTC + time) 2013-04-03T08:56:32Z (UTC time)), parameter "max_objid" of + Long, parameter "user_permission" of type "permission" (Represents + the permissions a user or users have to a workspace: 'a' - + administrator. All operations allowed. 'w' - read/write. 'r' - + read. 'n' - no permissions.), parameter "globalread" of type + "permission" (Represents the permissions a user or users have to a + workspace: 'a' - administrator. All operations allowed. 'w' - + read/write. 'r' - read. 'n' - no permissions.), parameter + "lockstat" of type "lock_status" (The lock status of a workspace. + One of 'unlocked', 'locked', or 'published'.), parameter + "metadata" of type "usermeta" (User provided metadata about an + object. Arbitrary key-value pairs provided by the user.) 
-> + mapping from String to String + """ + return self._client.call_method('Workspace.rename_workspace', + [params], self._service_ver, context) + + def rename_object(self, params, context=None): + """ + Rename an object. User meta data is always returned as null. + :param params: instance of type "RenameObjectParams" (Input + parameters for the 'rename_object' function. Required arguments: + ObjectIdentity obj - the object to rename. obj_name new_name - the + new name for the object.) -> structure: parameter "obj" of type + "ObjectIdentity" (An object identifier. Select an object by + either: One, and only one, of the numerical id or name of the + workspace. ws_id wsid - the numerical ID of the workspace. ws_name + workspace - the name of the workspace. AND One, and only one, of + the numerical id or name of the object. obj_id objid- the + numerical ID of the object. obj_name name - name of the object. + OPTIONALLY obj_ver ver - the version of the object. OR an object + reference string: obj_ref ref - an object reference string.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "wsid" of type "ws_id" (The unique, permanent numerical + ID of a workspace.), parameter "name" of type "obj_name" (A string + used as a name for an object. Any string consisting of + alphanumeric characters and the characters |._- that is not an + integer is acceptable.), parameter "objid" of type "obj_id" (The + unique, permanent numerical ID of an object.), parameter "ver" of + type "obj_ver" (An object version. The version of the object, + starting at 1.), parameter "ref" of type "obj_ref" (A string that + uniquely identifies an object in the workspace service. 
The format + is [ws_name or id]/[obj_name or id]/[obj_ver]. For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.), parameter "new_name" of type "obj_name" + (A string used as a name for an object. Any string consisting of + alphanumeric characters and the characters |._- that is not an + integer is acceptable.) + :returns: instance of type "object_info" (Information about an + object, including user provided metadata. obj_id objid - the + numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. 
major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String + """ + return self._client.call_method('Workspace.rename_object', + [params], self._service_ver, context) + + def copy_object(self, params, context=None): + """ + Copy an object. Returns the object_info for the newest version. + :param params: instance of type "CopyObjectParams" (Input parameters + for the 'copy_object' function. 
If the 'from' ObjectIdentity + includes no version and the object is copied to a new name, the + entire version history of the object is copied. In all other cases + only the version specified, or the latest version if no version is + specified, is copied. The version from the 'to' ObjectIdentity is + always ignored. Required arguments: ObjectIdentity from - the + object to copy. ObjectIdentity to - where to copy the object.) -> + structure: parameter "from" of type "ObjectIdentity" (An object + identifier. Select an object by either: One, and only one, of the + numerical id or name of the workspace. ws_id wsid - the numerical + ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. 
+ For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.), parameter "to" of type + "ObjectIdentity" (An object identifier. Select an object by + either: One, and only one, of the numerical id or name of the + workspace. ws_id wsid - the numerical ID of the workspace. ws_name + workspace - the name of the workspace. AND One, and only one, of + the numerical id or name of the object. obj_id objid- the + numerical ID of the object. obj_name name - name of the object. + OPTIONALLY obj_ver ver - the version of the object. OR an object + reference string: obj_ref ref - an object reference string.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "wsid" of type "ws_id" (The unique, permanent numerical + ID of a workspace.), parameter "name" of type "obj_name" (A string + used as a name for an object. Any string consisting of + alphanumeric characters and the characters |._- that is not an + integer is acceptable.), parameter "objid" of type "obj_id" (The + unique, permanent numerical ID of an object.), parameter "ver" of + type "obj_ver" (An object version. The version of the object, + starting at 1.), parameter "ref" of type "obj_ref" (A string that + uniquely identifies an object in the workspace service. The format + is [ws_name or id]/[obj_name or id]/[obj_ver]. 
For example, + MyFirstWorkspace/MyFirstObject/3 would identify the third version + of an object called MyFirstObject in the workspace called + MyFirstWorkspace. 42/Panic/1 would identify the first version of + the object name Panic in workspace with id 42. Towel/1/6 would + identify the 6th version of the object with id 1 in the Towel + workspace.If the version number is omitted, the latest version of + the object is assumed.) + :returns: instance of type "object_info" (Information about an + object, including user provided metadata. obj_id objid - the + numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. 
A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String + """ + return self._client.call_method('Workspace.copy_object', + [params], self._service_ver, context) + + def revert_object(self, object, context=None): + """ + Revert an object. + The object specified in the ObjectIdentity is reverted to the version + specified in the ObjectIdentity. + :param object: instance of type "ObjectIdentity" (An object + identifier. Select an object by either: One, and only one, of the + numerical id or name of the workspace. ws_id wsid - the numerical + ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. 
obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + :returns: instance of type "object_info" (Information about an + object, including user provided metadata. obj_id objid - the + numerical id of the object. obj_name name - the name of the + object. type_string type - the type of the object. timestamp + save_date - the save date of the object. obj_ver ver - the version + of the object. 
username saved_by - the user that saved or copied + the object. ws_id wsid - the workspace containing the object. + ws_name workspace - the workspace containing the object. string + chsum - the md5 checksum of the object. int size - the size of the + object in bytes. usermeta meta - arbitrary user-supplied metadata + about the object.) -> tuple of size 11: parameter "objid" of type + "obj_id" (The unique, permanent numerical ID of an object.), + parameter "name" of type "obj_name" (A string used as a name for + an object. Any string consisting of alphanumeric characters and + the characters |._- that is not an integer is acceptable.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. 
Example: + MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A + time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the + character Z (representing the UTC timezone) or the difference in + time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500 + (EST time) 2013-04-03T08:56:32+0000 (UTC time) + 2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long, + parameter "saved_by" of type "username" (Login name of a KBase + user account.), parameter "wsid" of type "ws_id" (The unique, + permanent numerical ID of a workspace.), parameter "workspace" of + type "ws_name" (A string used as a name for a workspace. Any + string consisting of alphanumeric characters and "_", ".", or "-" + that is not an integer is acceptable. The name may optionally be + prefixed with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "chsum" of String, parameter + "size" of Long, parameter "meta" of type "usermeta" (User provided + metadata about an object. Arbitrary key-value pairs provided by + the user.) -> mapping from String to String + """ + return self._client.call_method('Workspace.revert_object', + [object], self._service_ver, context) + + def get_names_by_prefix(self, params, context=None): + """ + Get object names matching a prefix. At most 1000 names are returned. + No particular ordering is guaranteed, nor is which names will be + returned if more than 1000 are found. + This function is intended for use as an autocomplete helper function. + :param params: instance of type "GetNamesByPrefixParams" (Input + parameters for the get_names_by_prefix function. Required + arguments: list workspaces - the workspaces to + search. string prefix - the prefix of the object names to return. + Optional arguments: boolean includeHidden - include names of + hidden objects in the results. Default false.) -> structure: + parameter "workspaces" of list of type "WorkspaceIdentity" (A + workspace identifier. 
Select a workspace by one, and only one, of + the numerical id or name. ws_id id - the numerical ID of the + workspace. ws_name workspace - the name of the workspace.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "id" of type "ws_id" (The unique, permanent numerical ID + of a workspace.), parameter "prefix" of String, parameter + "includeHidden" of type "boolean" (A boolean. 0 = false, other = + true.) + :returns: instance of type "GetNamesByPrefixResults" (Results object + for the get_names_by_prefix function. list> names - + the names matching the provided prefix, listed in order of the + input workspaces.) -> structure: parameter "names" of list of list + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.) + """ + return self._client.call_method('Workspace.get_names_by_prefix', + [params], self._service_ver, context) + + def hide_objects(self, object_ids, context=None): + """ + Hide objects. All versions of an object are hidden, regardless of + the version specified in the ObjectIdentity. Hidden objects do not + appear in the list_objects method. + :param object_ids: instance of list of type "ObjectIdentity" (An + object identifier. Select an object by either: One, and only one, + of the numerical id or name of the workspace. ws_id wsid - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. 
OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + """ + return self._client.call_method('Workspace.hide_objects', + [object_ids], self._service_ver, context) + + def unhide_objects(self, object_ids, context=None): + """ + Unhide objects. All versions of an object are unhidden, regardless + of the version specified in the ObjectIdentity. + :param object_ids: instance of list of type "ObjectIdentity" (An + object identifier. Select an object by either: One, and only one, + of the numerical id or name of the workspace. 
ws_id wsid - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + """ + return self._client.call_method('Workspace.unhide_objects', + [object_ids], self._service_ver, context) + + def delete_objects(self, object_ids, context=None): + """ + Delete objects. 
All versions of an object are deleted, regardless of + the version specified in the ObjectIdentity. + :param object_ids: instance of list of type "ObjectIdentity" (An + object identifier. Select an object by either: One, and only one, + of the numerical id or name of the workspace. ws_id wsid - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. 
+ Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + """ + return self._client.call_method('Workspace.delete_objects', + [object_ids], self._service_ver, context) + + def undelete_objects(self, object_ids, context=None): + """ + Undelete objects. All versions of an object are undeleted, regardless + of the version specified in the ObjectIdentity. If an object is not + deleted, no error is thrown. + :param object_ids: instance of list of type "ObjectIdentity" (An + object identifier. Select an object by either: One, and only one, + of the numerical id or name of the workspace. ws_id wsid - the + numerical ID of the workspace. ws_name workspace - the name of the + workspace. AND One, and only one, of the numerical id or name of + the object. obj_id objid- the numerical ID of the object. obj_name + name - name of the object. OPTIONALLY obj_ver ver - the version of + the object. OR an object reference string: obj_ref ref - an object + reference string.) -> structure: parameter "workspace" of type + "ws_name" (A string used as a name for a workspace. Any string + consisting of alphanumeric characters and "_", ".", or "-" that is + not an integer is acceptable. The name may optionally be prefixed + with the workspace owner's user name and a colon, e.g. + kbasetest:my_workspace.), parameter "wsid" of type "ws_id" (The + unique, permanent numerical ID of a workspace.), parameter "name" + of type "obj_name" (A string used as a name for an object. Any + string consisting of alphanumeric characters and the characters + |._- that is not an integer is acceptable.), parameter "objid" of + type "obj_id" (The unique, permanent numerical ID of an object.), + parameter "ver" of type "obj_ver" (An object version. 
The version + of the object, starting at 1.), parameter "ref" of type "obj_ref" + (A string that uniquely identifies an object in the workspace + service. The format is [ws_name or id]/[obj_name or id]/[obj_ver]. + For example, MyFirstWorkspace/MyFirstObject/3 would identify the + third version of an object called MyFirstObject in the workspace + called MyFirstWorkspace. 42/Panic/1 would identify the first + version of the object name Panic in workspace with id 42. + Towel/1/6 would identify the 6th version of the object with id 1 + in the Towel workspace.If the version number is omitted, the + latest version of the object is assumed.) + """ + return self._client.call_method('Workspace.undelete_objects', + [object_ids], self._service_ver, context) + + def delete_workspace(self, wsi, context=None): + """ + Delete a workspace. All objects contained in the workspace are deleted. + :param wsi: instance of type "WorkspaceIdentity" (A workspace + identifier. Select a workspace by one, and only one, of the + numerical id or name. ws_id id - the numerical ID of the + workspace. ws_name workspace - the name of the workspace.) -> + structure: parameter "workspace" of type "ws_name" (A string used + as a name for a workspace. Any string consisting of alphanumeric + characters and "_", ".", or "-" that is not an integer is + acceptable. The name may optionally be prefixed with the workspace + owner's user name and a colon, e.g. kbasetest:my_workspace.), + parameter "id" of type "ws_id" (The unique, permanent numerical ID + of a workspace.) + """ + return self._client.call_method('Workspace.delete_workspace', + [wsi], self._service_ver, context) + + def request_module_ownership(self, mod, context=None): + """ + Request ownership of a module name. A Workspace administrator + must approve the request. + :param mod: instance of type "modulename" (A module name defined in a + KIDL typespec.) 
+ """ + return self._client.call_method('Workspace.request_module_ownership', + [mod], self._service_ver, context) + + def register_typespec(self, params, context=None): + """ + Register a new typespec or recompile a previously registered typespec + with new options. + See the documentation of RegisterTypespecParams for more details. + Also see the release_types function. + :param params: instance of type "RegisterTypespecParams" (Parameters + for the register_typespec function. Required arguments: One of: + typespec spec - the new typespec to register. modulename mod - the + module to recompile with updated options (see below). Optional + arguments: boolean dryrun - Return, but do not save, the results + of compiling the spec. Default true. Set to false for making + permanent changes. list new_types - types in the spec to + make available in the workspace service. When compiling a spec for + the first time, if this argument is empty no types will be made + available. Previously available types remain so upon recompilation + of a spec or compilation of a new spec. list + remove_types - no longer make these types available in the + workspace service for the new version of the spec. This does not + remove versions of types previously compiled. mapping dependencies - By default, the latest released + versions of spec dependencies will be included when compiling a + spec. Specific versions can be specified here. spec_version + prev_ver - the id of the previous version of the typespec. An + error will be thrown if this is set and prev_ver is not the most + recent version of the typespec. This prevents overwriting of + changes made since retrieving a spec and compiling an edited spec. + This argument is ignored if a modulename is passed.) 
-> structure: + parameter "spec" of type "typespec" (A type specification + (typespec) file in the KBase Interface Description Language + (KIDL).), parameter "mod" of type "modulename" (A module name + defined in a KIDL typespec.), parameter "new_types" of list of + type "typename" (A type definition name in a KIDL typespec.), + parameter "remove_types" of list of type "typename" (A type + definition name in a KIDL typespec.), parameter "dependencies" of + mapping from type "modulename" (A module name defined in a KIDL + typespec.) to type "spec_version" (The version of a typespec + file.), parameter "dryrun" of type "boolean" (A boolean. 0 = + false, other = true.), parameter "prev_ver" of type "spec_version" + (The version of a typespec file.) + :returns: instance of mapping from type "type_string" (A type string. + Specifies the type and its version in a single string in the + format [module].[typename]-[major].[minor]: module - a string. The + module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1) to type "jsonschema" (The JSON Schema (v4) + representation of a type definition.) + """ + return self._client.call_method('Workspace.register_typespec', + [params], self._service_ver, context) + + def register_typespec_copy(self, params, context=None): + """ + Register a copy of new typespec or refresh an existing typespec which is + loaded from another workspace for synchronization. 
Method returns new + version of module in current workspace. + Also see the release_types function. + :param params: instance of type "RegisterTypespecCopyParams" + (Parameters for the register_typespec_copy function. Required + arguments: string external_workspace_url - the URL of the + workspace server from which to copy a typespec. modulename mod - + the name of the module in the workspace server Optional arguments: + spec_version version - the version of the module in the workspace + server) -> structure: parameter "external_workspace_url" of + String, parameter "mod" of type "modulename" (A module name + defined in a KIDL typespec.), parameter "version" of type + "spec_version" (The version of a typespec file.) + :returns: instance of type "spec_version" (The version of a typespec + file.) + """ + return self._client.call_method('Workspace.register_typespec_copy', + [params], self._service_ver, context) + + def release_module(self, mod, context=None): + """ + Release a module for general use of its types. + Releases the most recent version of a module. Releasing a module does + two things to the module's types: + 1) If a type's major version is 0, it is changed to 1. A major + version of 0 implies that the type is in development and may have + backwards incompatible changes from minor version to minor version. + Once a type is released, backwards incompatible changes always + cause a major version increment. + 2) This version of the type becomes the default version, and if a + specific version is not supplied in a function call, this version + will be used. This means that newer, unreleased versions of the + type may be skipped. + :param mod: instance of type "modulename" (A module name defined in a + KIDL typespec.) + :returns: instance of list of type "type_string" (A type string. + Specifies the type and its version in a single string in the + format [module].[typename]-[major].[minor]: module - a string. 
The + module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1) + """ + return self._client.call_method('Workspace.release_module', + [mod], self._service_ver, context) + + def list_modules(self, params, context=None): + """ + List typespec modules. + :param params: instance of type "ListModulesParams" (Parameters for + the list_modules() function. Optional arguments: username owner - + only list modules owned by this user.) -> structure: parameter + "owner" of type "username" (Login name of a KBase user account.) + :returns: instance of list of type "modulename" (A module name + defined in a KIDL typespec.) + """ + return self._client.call_method('Workspace.list_modules', + [params], self._service_ver, context) + + def list_module_versions(self, params, context=None): + """ + List typespec module versions. + :param params: instance of type "ListModuleVersionsParams" + (Parameters for the list_module_versions function. Required + arguments: One of: modulename mod - returns all versions of the + module. type_string type - returns all versions of the module + associated with the type.) -> structure: parameter "mod" of type + "modulename" (A module name defined in a KIDL typespec.), + parameter "type" of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. 
The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1) + :returns: instance of type "ModuleVersions" (A set of versions from a + module. modulename mod - the name of the module. + list - a set or subset of versions associated with + the module. list - a set or subset of released + versions associated with the module.) -> structure: parameter + "mod" of type "modulename" (A module name defined in a KIDL + typespec.), parameter "vers" of list of type "spec_version" (The + version of a typespec file.), parameter "released_vers" of list of + type "spec_version" (The version of a typespec file.) + """ + return self._client.call_method('Workspace.list_module_versions', + [params], self._service_ver, context) + + def get_module_info(self, params, context=None): + """ + :param params: instance of type "GetModuleInfoParams" (Parameters for + the get_module_info function. Required arguments: modulename mod - + the name of the module to retrieve. Optional arguments: + spec_version ver - the version of the module to retrieve. Defaults + to the latest version.) -> structure: parameter "mod" of type + "modulename" (A module name defined in a KIDL typespec.), + parameter "ver" of type "spec_version" (The version of a typespec + file.) + :returns: instance of type "ModuleInfo" (Information about a module. + list owners - the owners of the module. spec_version ver + - the version of the module. 
typespec spec - the typespec. string + description - the description of the module from the typespec. + mapping types - the types associated with + this module and their JSON schema. mapping included_spec_version - names of included modules + associated with their versions. string chsum - the md5 checksum of + the object. list functions - list of names of + functions registered in spec. boolean is_released - shows if this + version of module was released (and hence can be seen by others).) + -> structure: parameter "owners" of list of type "username" (Login + name of a KBase user account.), parameter "ver" of type + "spec_version" (The version of a typespec file.), parameter "spec" + of type "typespec" (A type specification (typespec) file in the + KBase Interface Description Language (KIDL).), parameter + "description" of String, parameter "types" of mapping from type + "type_string" (A type string. Specifies the type and its version + in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1) to type "jsonschema" (The JSON Schema (v4) + representation of a type definition.), parameter + "included_spec_version" of mapping from type "modulename" (A + module name defined in a KIDL typespec.) 
to type "spec_version" + (The version of a typespec file.), parameter "chsum" of String, + parameter "functions" of list of type "func_string" (A function + string for referencing a funcdef. Specifies the function and its + version in a single string in the format + [modulename].[funcname]-[major].[minor]: modulename - a string. + The name of the module containing the function. funcname - a + string. The name of the function as assigned by the funcdef + statement. major - an integer. The major version of the function. + A change in the major version implies the function has changed in + a non-backwards compatible way. minor - an integer. The minor + version of the function. A change in the minor version implies + that the function has changed in a way that is backwards + compatible with previous function definitions. In many cases, the + major and minor versions are optional, and if not provided the + most recent version will be used. Example: MyModule.MyFunc-3.1), + parameter "is_released" of type "boolean" (A boolean. 0 = false, + other = true.) + """ + return self._client.call_method('Workspace.get_module_info', + [params], self._service_ver, context) + + def get_jsonschema(self, type, context=None): + """ + Get JSON schema for a type. + :param type: instance of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. 
In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1) + :returns: instance of type "jsonschema" (The JSON Schema (v4) + representation of a type definition.) + """ + return self._client.call_method('Workspace.get_jsonschema', + [type], self._service_ver, context) + + def translate_from_MD5_types(self, md5_types, context=None): + """ + Translation from types qualified with MD5 to their semantic versions + :param md5_types: instance of list of type "type_string" (A type + string. Specifies the type and its version in a single string in + the format [module].[typename]-[major].[minor]: module - a string. + The module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1) + :returns: instance of mapping from type "type_string" (A type string. + Specifies the type and its version in a single string in the + format [module].[typename]-[major].[minor]: module - a string. The + module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. 
+ A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1) to list of type "type_string" (A type string. + Specifies the type and its version in a single string in the + format [module].[typename]-[major].[minor]: module - a string. The + module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1) + """ + return self._client.call_method('Workspace.translate_from_MD5_types', + [md5_types], self._service_ver, context) + + def translate_to_MD5_types(self, sem_types, context=None): + """ + Translation from types qualified with semantic versions to their MD5'ed versions + :param sem_types: instance of list of type "type_string" (A type + string. Specifies the type and its version in a single string in + the format [module].[typename]-[major].[minor]: module - a string. + The module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. 
+ A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1) + :returns: instance of mapping from type "type_string" (A type string. + Specifies the type and its version in a single string in the + format [module].[typename]-[major].[minor]: module - a string. The + module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1) to type "type_string" (A type string. + Specifies the type and its version in a single string in the + format [module].[typename]-[major].[minor]: module - a string. The + module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. 
Example: + MyModule.MyType-3.1) + """ + return self._client.call_method('Workspace.translate_to_MD5_types', + [sem_types], self._service_ver, context) + + def get_type_info(self, type, context=None): + """ + :param type: instance of type "type_string" (A type string. Specifies + the type and its version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1) + :returns: instance of type "TypeInfo" (Information about a type + type_string type_def - resolved type definition id. string + description - the description of the type from spec file. string + spec_def - reconstruction of type definition from spec file. + jsonschema json_schema - JSON schema of this type. string + parsing_structure - json document describing parsing structure of + type in spec file including involved sub-types. list + module_vers - versions of spec-files containing given type + version. list released_module_vers - versions of + released spec-files containing given type version. + list type_vers - all versions of type with given type + name. list released_type_vers - all released versions + of type with given type name. list using_func_defs - + list of functions (with versions) referring to this type version. + list using_type_defs - list of types (with versions) + referring to this type version. 
list used_type_defs - + list of types (with versions) referred from this type version.) -> + structure: parameter "type_def" of type "type_string" (A type + string. Specifies the type and its version in a single string in + the format [module].[typename]-[major].[minor]: module - a string. + The module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "description" of String, parameter + "spec_def" of String, parameter "json_schema" of type "jsonschema" + (The JSON Schema (v4) representation of a type definition.), + parameter "parsing_structure" of String, parameter "module_vers" + of list of type "spec_version" (The version of a typespec file.), + parameter "released_module_vers" of list of type "spec_version" + (The version of a typespec file.), parameter "type_vers" of list + of type "type_string" (A type string. Specifies the type and its + version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. 
A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "released_type_vers" of list of + type "type_string" (A type string. Specifies the type and its + version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "using_func_defs" of list of type + "func_string" (A function string for referencing a funcdef. + Specifies the function and its version in a single string in the + format [modulename].[funcname]-[major].[minor]: modulename - a + string. The name of the module containing the function. funcname - + a string. The name of the function as assigned by the funcdef + statement. major - an integer. The major version of the function. + A change in the major version implies the function has changed in + a non-backwards compatible way. minor - an integer. The minor + version of the function. A change in the minor version implies + that the function has changed in a way that is backwards + compatible with previous function definitions. In many cases, the + major and minor versions are optional, and if not provided the + most recent version will be used. 
Example: MyModule.MyFunc-3.1), + parameter "using_type_defs" of list of type "type_string" (A type + string. Specifies the type and its version in a single string in + the format [module].[typename]-[major].[minor]: module - a string. + The module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "used_type_defs" of list of type + "type_string" (A type string. Specifies the type and its version + in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1) + """ + return self._client.call_method('Workspace.get_type_info', + [type], self._service_ver, context) + + def get_all_type_info(self, mod, context=None): + """ + :param mod: instance of type "modulename" (A module name defined in a + KIDL typespec.) 
+ :returns: instance of list of type "TypeInfo" (Information about a + type type_string type_def - resolved type definition id. string + description - the description of the type from spec file. string + spec_def - reconstruction of type definition from spec file. + jsonschema json_schema - JSON schema of this type. string + parsing_structure - json document describing parsing structure of + type in spec file including involved sub-types. list + module_vers - versions of spec-files containing given type + version. list released_module_vers - versions of + released spec-files containing given type version. + list type_vers - all versions of type with given type + name. list released_type_vers - all released versions + of type with given type name. list using_func_defs - + list of functions (with versions) referring to this type version. + list using_type_defs - list of types (with versions) + referring to this type version. list used_type_defs - + list of types (with versions) referred from this type version.) -> + structure: parameter "type_def" of type "type_string" (A type + string. Specifies the type and its version in a single string in + the format [module].[typename]-[major].[minor]: module - a string. + The module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. 
Example: + MyModule.MyType-3.1), parameter "description" of String, parameter + "spec_def" of String, parameter "json_schema" of type "jsonschema" + (The JSON Schema (v4) representation of a type definition.), + parameter "parsing_structure" of String, parameter "module_vers" + of list of type "spec_version" (The version of a typespec file.), + parameter "released_module_vers" of list of type "spec_version" + (The version of a typespec file.), parameter "type_vers" of list + of type "type_string" (A type string. Specifies the type and its + version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "released_type_vers" of list of + type "type_string" (A type string. Specifies the type and its + version in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. 
In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "using_func_defs" of list of type + "func_string" (A function string for referencing a funcdef. + Specifies the function and its version in a single string in the + format [modulename].[funcname]-[major].[minor]: modulename - a + string. The name of the module containing the function. funcname - + a string. The name of the function as assigned by the funcdef + statement. major - an integer. The major version of the function. + A change in the major version implies the function has changed in + a non-backwards compatible way. minor - an integer. The minor + version of the function. A change in the minor version implies + that the function has changed in a way that is backwards + compatible with previous function definitions. In many cases, the + major and minor versions are optional, and if not provided the + most recent version will be used. Example: MyModule.MyFunc-3.1), + parameter "using_type_defs" of list of type "type_string" (A type + string. Specifies the type and its version in a single string in + the format [module].[typename]-[major].[minor]: module - a string. + The module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1), parameter "used_type_defs" of list of type + "type_string" (A type string. 
Specifies the type and its version + in a single string in the format + [module].[typename]-[major].[minor]: module - a string. The module + name of the typespec containing the type. typename - a string. The + name of the type as assigned by the typedef statement. major - an + integer. The major version of the type. A change in the major + version implies the type has changed in a non-backwards compatible + way. minor - an integer. The minor version of the type. A change + in the minor version implies that the type has changed in a way + that is backwards compatible with previous type definitions. In + many cases, the major and minor versions are optional, and if not + provided the most recent version will be used. Example: + MyModule.MyType-3.1) + """ + return self._client.call_method('Workspace.get_all_type_info', + [mod], self._service_ver, context) + + def get_func_info(self, func, context=None): + """ + @deprecated + :param func: instance of type "func_string" (A function string for + referencing a funcdef. Specifies the function and its version in a + single string in the format + [modulename].[funcname]-[major].[minor]: modulename - a string. + The name of the module containing the function. funcname - a + string. The name of the function as assigned by the funcdef + statement. major - an integer. The major version of the function. + A change in the major version implies the function has changed in + a non-backwards compatible way. minor - an integer. The minor + version of the function. A change in the minor version implies + that the function has changed in a way that is backwards + compatible with previous function definitions. In many cases, the + major and minor versions are optional, and if not provided the + most recent version will be used. Example: MyModule.MyFunc-3.1) + :returns: instance of type "FuncInfo" (DEPRECATED @deprecated) -> + structure: parameter "func_def" of type "func_string" (A function + string for referencing a funcdef. 
Specifies the function and its + version in a single string in the format + [modulename].[funcname]-[major].[minor]: modulename - a string. + The name of the module containing the function. funcname - a + string. The name of the function as assigned by the funcdef + statement. major - an integer. The major version of the function. + A change in the major version implies the function has changed in + a non-backwards compatible way. minor - an integer. The minor + version of the function. A change in the minor version implies + that the function has changed in a way that is backwards + compatible with previous function definitions. In many cases, the + major and minor versions are optional, and if not provided the + most recent version will be used. Example: MyModule.MyFunc-3.1), + parameter "description" of String, parameter "spec_def" of String, + parameter "parsing_structure" of String, parameter "module_vers" + of list of type "spec_version" (The version of a typespec file.), + parameter "released_module_vers" of list of type "spec_version" + (The version of a typespec file.), parameter "func_vers" of list + of type "func_string" (A function string for referencing a + funcdef. Specifies the function and its version in a single string + in the format [modulename].[funcname]-[major].[minor]: modulename + - a string. The name of the module containing the function. + funcname - a string. The name of the function as assigned by the + funcdef statement. major - an integer. The major version of the + function. A change in the major version implies the function has + changed in a non-backwards compatible way. minor - an integer. The + minor version of the function. A change in the minor version + implies that the function has changed in a way that is backwards + compatible with previous function definitions. In many cases, the + major and minor versions are optional, and if not provided the + most recent version will be used. 
Example: MyModule.MyFunc-3.1), + parameter "released_func_vers" of list of type "func_string" (A + function string for referencing a funcdef. Specifies the function + and its version in a single string in the format + [modulename].[funcname]-[major].[minor]: modulename - a string. + The name of the module containing the function. funcname - a + string. The name of the function as assigned by the funcdef + statement. major - an integer. The major version of the function. + A change in the major version implies the function has changed in + a non-backwards compatible way. minor - an integer. The minor + version of the function. A change in the minor version implies + that the function has changed in a way that is backwards + compatible with previous function definitions. In many cases, the + major and minor versions are optional, and if not provided the + most recent version will be used. Example: MyModule.MyFunc-3.1), + parameter "used_type_defs" of list of type "type_string" (A type + string. Specifies the type and its version in a single string in + the format [module].[typename]-[major].[minor]: module - a string. + The module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. + A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. 
Example: + MyModule.MyType-3.1) + """ + return self._client.call_method('Workspace.get_func_info', + [func], self._service_ver, context) + + def get_all_func_info(self, mod, context=None): + """ + @deprecated + :param mod: instance of type "modulename" (A module name defined in a + KIDL typespec.) + :returns: instance of list of type "FuncInfo" (DEPRECATED + @deprecated) -> structure: parameter "func_def" of type + "func_string" (A function string for referencing a funcdef. + Specifies the function and its version in a single string in the + format [modulename].[funcname]-[major].[minor]: modulename - a + string. The name of the module containing the function. funcname - + a string. The name of the function as assigned by the funcdef + statement. major - an integer. The major version of the function. + A change in the major version implies the function has changed in + a non-backwards compatible way. minor - an integer. The minor + version of the function. A change in the minor version implies + that the function has changed in a way that is backwards + compatible with previous function definitions. In many cases, the + major and minor versions are optional, and if not provided the + most recent version will be used. Example: MyModule.MyFunc-3.1), + parameter "description" of String, parameter "spec_def" of String, + parameter "parsing_structure" of String, parameter "module_vers" + of list of type "spec_version" (The version of a typespec file.), + parameter "released_module_vers" of list of type "spec_version" + (The version of a typespec file.), parameter "func_vers" of list + of type "func_string" (A function string for referencing a + funcdef. Specifies the function and its version in a single string + in the format [modulename].[funcname]-[major].[minor]: modulename + - a string. The name of the module containing the function. + funcname - a string. The name of the function as assigned by the + funcdef statement. major - an integer. 
The major version of the + function. A change in the major version implies the function has + changed in a non-backwards compatible way. minor - an integer. The + minor version of the function. A change in the minor version + implies that the function has changed in a way that is backwards + compatible with previous function definitions. In many cases, the + major and minor versions are optional, and if not provided the + most recent version will be used. Example: MyModule.MyFunc-3.1), + parameter "released_func_vers" of list of type "func_string" (A + function string for referencing a funcdef. Specifies the function + and its version in a single string in the format + [modulename].[funcname]-[major].[minor]: modulename - a string. + The name of the module containing the function. funcname - a + string. The name of the function as assigned by the funcdef + statement. major - an integer. The major version of the function. + A change in the major version implies the function has changed in + a non-backwards compatible way. minor - an integer. The minor + version of the function. A change in the minor version implies + that the function has changed in a way that is backwards + compatible with previous function definitions. In many cases, the + major and minor versions are optional, and if not provided the + most recent version will be used. Example: MyModule.MyFunc-3.1), + parameter "used_type_defs" of list of type "type_string" (A type + string. Specifies the type and its version in a single string in + the format [module].[typename]-[major].[minor]: module - a string. + The module name of the typespec containing the type. typename - a + string. The name of the type as assigned by the typedef statement. + major - an integer. The major version of the type. A change in the + major version implies the type has changed in a non-backwards + compatible way. minor - an integer. The minor version of the type. 
+ A change in the minor version implies that the type has changed in + a way that is backwards compatible with previous type definitions. + In many cases, the major and minor versions are optional, and if + not provided the most recent version will be used. Example: + MyModule.MyType-3.1) + """ + return self._client.call_method('Workspace.get_all_func_info', + [mod], self._service_ver, context) + + def grant_module_ownership(self, params, context=None): + """ + Grant ownership of a module. You must have grant ability on the + module. + :param params: instance of type "GrantModuleOwnershipParams" + (Parameters for the grant_module_ownership function. Required + arguments: modulename mod - the module to modify. username + new_owner - the user to add to the module's list of owners. + Optional arguments: boolean with_grant_option - true to allow the + user to add owners to the module.) -> structure: parameter "mod" + of type "modulename" (A module name defined in a KIDL typespec.), + parameter "new_owner" of type "username" (Login name of a KBase + user account.), parameter "with_grant_option" of type "boolean" (A + boolean. 0 = false, other = true.) + """ + return self._client.call_method('Workspace.grant_module_ownership', + [params], self._service_ver, context) + + def remove_module_ownership(self, params, context=None): + """ + Remove ownership from a current owner. You must have the grant ability + on the module. + :param params: instance of type "RemoveModuleOwnershipParams" + (Parameters for the remove_module_ownership function. Required + arguments: modulename mod - the module to modify. username + old_owner - the user to remove from the module's list of owners.) + -> structure: parameter "mod" of type "modulename" (A module name + defined in a KIDL typespec.), parameter "old_owner" of type + "username" (Login name of a KBase user account.) 
+ """ + return self._client.call_method('Workspace.remove_module_ownership', + [params], self._service_ver, context) + + def list_all_types(self, params, context=None): + """ + List all released types with released version from all modules. Return + mapping from module name to mapping from type name to released type + version. + :param params: instance of type "ListAllTypesParams" (Parameters for + list_all_types function. Optional arguments: boolean + with_empty_modules - include empty module names, optional flag, + default value is false.) -> structure: parameter + "with_empty_modules" of type "boolean" (A boolean. 0 = false, + other = true.) + :returns: instance of mapping from type "modulename" (A module name + defined in a KIDL typespec.) to mapping from type "typename" (A + type definition name in a KIDL typespec.) to type "typever" (A + version of a type. Specifies the version of the type in a single + string in the format [major].[minor]: major - an integer. The + major version of the type. A change in the major version implies + the type has changed in a non-backwards compatible way. minor - an + integer. The minor version of the type. A change in the minor + version implies that the type has changed in a way that is + backwards compatible with previous type definitions.) + """ + return self._client.call_method('Workspace.list_all_types', + [params], self._service_ver, context) + + def administer(self, command, context=None): + """ + The administration interface. 
+ :param command: instance of unspecified object + :returns: instance of unspecified object + """ + return self._client.call_method('Workspace.administer', + [command], self._service_ver, context) + + def status(self, context=None): + return self._client.call_method('Workspace.status', + [], self._service_ver, context) diff --git a/lib/DataFileUtil/__init__.py b/lib/installed_clients/__init__.py similarity index 100% rename from lib/DataFileUtil/__init__.py rename to lib/installed_clients/__init__.py diff --git a/lib/KBaseReport/authclient.py b/lib/installed_clients/authclient.py similarity index 81% rename from lib/KBaseReport/authclient.py rename to lib/installed_clients/authclient.py index 9a15713..844f9b0 100644 --- a/lib/KBaseReport/authclient.py +++ b/lib/installed_clients/authclient.py @@ -24,7 +24,7 @@ def __init__(self, maxsize=2000): self._halfmax = maxsize / 2 # int division to round down def get_user(self, token): - token = hashlib.sha256(token).hexdigest() + token = hashlib.sha256(token.encode('utf-8')).hexdigest() with self._lock: usertime = self._cache.get(token) if not usertime: @@ -40,12 +40,15 @@ def add_valid_token(self, token, user): raise ValueError('Must supply token') if not user: raise ValueError('Must supply user') - token = hashlib.sha256(token).hexdigest() + token = hashlib.sha256(token.encode('utf-8')).hexdigest() with self._lock: self._cache[token] = [user, _time.time()] if len(self._cache) > self._maxsize: - for i, (t, _) in enumerate(sorted(self._cache.items(), - key=lambda (_, v): v[1])): + sorted_items = sorted( + list(self._cache.items()), + key=(lambda v: v[1][1]) + ) + for i, (t, _) in enumerate(sorted_items): if i <= self._halfmax: del self._cache[t] else: @@ -57,7 +60,7 @@ class KBaseAuth(object): A very basic KBase auth client for the Python server. 
''' - _LOGIN_URL = 'https://kbase.us/services/authorization/Sessions/Login' + _LOGIN_URL = 'https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login' def __init__(self, auth_url=None): ''' @@ -80,11 +83,11 @@ def get_user(self, token): if not ret.ok: try: err = ret.json() - except: + except Exception as e: ret.raise_for_status() raise ValueError('Error connecting to auth service: {} {}\n{}' .format(ret.status_code, ret.reason, - err['error_msg'])) + err['error']['message'])) user = ret.json()['user_id'] self._cache.add_valid_token(token, user) diff --git a/lib/KBaseReport/baseclient.py b/lib/installed_clients/baseclient.py similarity index 94% rename from lib/KBaseReport/baseclient.py rename to lib/installed_clients/baseclient.py index 3d2a61a..7dc1ce1 100644 --- a/lib/KBaseReport/baseclient.py +++ b/lib/installed_clients/baseclient.py @@ -11,6 +11,9 @@ import requests as _requests import random as _random import os as _os +import traceback as _traceback +from requests.exceptions import ConnectionError +from urllib3.exceptions import ProtocolError try: from configparser import ConfigParser as _ConfigParser # py 3 @@ -26,6 +29,7 @@ _CT = 'content-type' _AJ = 'application/json' _URL_SCHEME = frozenset(['http', 'https']) +_CHECK_JOB_RETRYS = 3 def _get_token(user_id, password, auth_svc): @@ -121,7 +125,7 @@ def __init__( self, url=None, timeout=30 * 60, user_id=None, password=None, token=None, ignore_authrc=False, trust_all_ssl_certificates=False, - auth_svc='https://kbase.us/services/authorization/Sessions/Login', + auth_svc='https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login', lookup_url=False, async_job_check_time_ms=100, async_job_check_time_scale_percent=150, @@ -236,20 +240,30 @@ def run_job(self, service_method, args, service_ver=None, context=None): mod, _ = service_method.split('.') job_id = self._submit_job(service_method, args, service_ver, context) async_job_check_time = self.async_job_check_time - while True: + check_job_failures = 0 + 
while check_job_failures < _CHECK_JOB_RETRYS: time.sleep(async_job_check_time) async_job_check_time = (async_job_check_time * self.async_job_check_time_scale_percent / 100.0) if async_job_check_time > self.async_job_check_max_time: async_job_check_time = self.async_job_check_max_time - job_state = self._check_job(mod, job_id) + + try: + job_state = self._check_job(mod, job_id) + except (ConnectionError, ProtocolError): + _traceback.print_exc() + check_job_failures += 1 + continue + if job_state['finished']: if not job_state['result']: return if len(job_state['result']) == 1: return job_state['result'][0] return job_state['result'] + raise RuntimeError("_check_job failed {} times and exceeded limit".format( + check_job_failures)) def call_method(self, service_method, args, service_ver=None, context=None): diff --git a/lib/kb_fasttree/authclient.py b/lib/kb_fasttree/authclient.py index 9a15713..844f9b0 100644 --- a/lib/kb_fasttree/authclient.py +++ b/lib/kb_fasttree/authclient.py @@ -24,7 +24,7 @@ def __init__(self, maxsize=2000): self._halfmax = maxsize / 2 # int division to round down def get_user(self, token): - token = hashlib.sha256(token).hexdigest() + token = hashlib.sha256(token.encode('utf-8')).hexdigest() with self._lock: usertime = self._cache.get(token) if not usertime: @@ -40,12 +40,15 @@ def add_valid_token(self, token, user): raise ValueError('Must supply token') if not user: raise ValueError('Must supply user') - token = hashlib.sha256(token).hexdigest() + token = hashlib.sha256(token.encode('utf-8')).hexdigest() with self._lock: self._cache[token] = [user, _time.time()] if len(self._cache) > self._maxsize: - for i, (t, _) in enumerate(sorted(self._cache.items(), - key=lambda (_, v): v[1])): + sorted_items = sorted( + list(self._cache.items()), + key=(lambda v: v[1][1]) + ) + for i, (t, _) in enumerate(sorted_items): if i <= self._halfmax: del self._cache[t] else: @@ -57,7 +60,7 @@ class KBaseAuth(object): A very basic KBase auth client for the Python 
server. ''' - _LOGIN_URL = 'https://kbase.us/services/authorization/Sessions/Login' + _LOGIN_URL = 'https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login' def __init__(self, auth_url=None): ''' @@ -80,11 +83,11 @@ def get_user(self, token): if not ret.ok: try: err = ret.json() - except: + except Exception as e: ret.raise_for_status() raise ValueError('Error connecting to auth service: {} {}\n{}' .format(ret.status_code, ret.reason, - err['error_msg'])) + err['error']['message'])) user = ret.json()['user_id'] self._cache.add_valid_token(token, user) diff --git a/lib/kb_fasttree/baseclient.py b/lib/kb_fasttree/baseclient.py index 3d2a61a..7dc1ce1 100644 --- a/lib/kb_fasttree/baseclient.py +++ b/lib/kb_fasttree/baseclient.py @@ -11,6 +11,9 @@ import requests as _requests import random as _random import os as _os +import traceback as _traceback +from requests.exceptions import ConnectionError +from urllib3.exceptions import ProtocolError try: from configparser import ConfigParser as _ConfigParser # py 3 @@ -26,6 +29,7 @@ _CT = 'content-type' _AJ = 'application/json' _URL_SCHEME = frozenset(['http', 'https']) +_CHECK_JOB_RETRYS = 3 def _get_token(user_id, password, auth_svc): @@ -121,7 +125,7 @@ def __init__( self, url=None, timeout=30 * 60, user_id=None, password=None, token=None, ignore_authrc=False, trust_all_ssl_certificates=False, - auth_svc='https://kbase.us/services/authorization/Sessions/Login', + auth_svc='https://kbase.us/services/auth/api/legacy/KBase/Sessions/Login', lookup_url=False, async_job_check_time_ms=100, async_job_check_time_scale_percent=150, @@ -236,20 +240,30 @@ def run_job(self, service_method, args, service_ver=None, context=None): mod, _ = service_method.split('.') job_id = self._submit_job(service_method, args, service_ver, context) async_job_check_time = self.async_job_check_time - while True: + check_job_failures = 0 + while check_job_failures < _CHECK_JOB_RETRYS: time.sleep(async_job_check_time) async_job_check_time = 
(async_job_check_time * self.async_job_check_time_scale_percent / 100.0) if async_job_check_time > self.async_job_check_max_time: async_job_check_time = self.async_job_check_max_time - job_state = self._check_job(mod, job_id) + + try: + job_state = self._check_job(mod, job_id) + except (ConnectionError, ProtocolError): + _traceback.print_exc() + check_job_failures += 1 + continue + if job_state['finished']: if not job_state['result']: return if len(job_state['result']) == 1: return job_state['result'][0] return job_state['result'] + raise RuntimeError("_check_job failed {} times and exceeded limit".format( + check_job_failures)) def call_method(self, service_method, args, service_ver=None, context=None): diff --git a/lib/kb_fasttree/kb_fasttreeClient.py b/lib/kb_fasttree/kb_fasttreeClient.py index b509f13..67a7877 100644 --- a/lib/kb_fasttree/kb_fasttreeClient.py +++ b/lib/kb_fasttree/kb_fasttreeClient.py @@ -12,7 +12,7 @@ try: # baseclient and this client are in a package from .baseclient import BaseClient as _BaseClient # @UnusedImport -except: +except ImportError: # no they aren't from baseclient import BaseClient as _BaseClient # @Reimport @@ -23,7 +23,7 @@ def __init__( self, url=None, timeout=30 * 60, user_id=None, password=None, token=None, ignore_authrc=False, trust_all_ssl_certificates=False, - auth_svc='https://kbase.us/services/authorization/Sessions/Login'): + auth_svc='https://ci.kbase.us/services/auth/api/legacy/KBase/Sessions/Login'): if url is None: raise ValueError('A url is required') self._service_ver = None @@ -61,9 +61,8 @@ def run_FastTree(self, params, context=None): parameter "report_ref" of type "data_obj_ref", parameter "output_ref" of type "data_obj_ref" """ - return self._client.call_method( - 'kb_fasttree.run_FastTree', - [params], self._service_ver, context) + return self._client.call_method('kb_fasttree.run_FastTree', + [params], self._service_ver, context) def status(self, context=None): return 
self._client.call_method('kb_fasttree.status', diff --git a/lib/kb_fasttree/kb_fasttreeImpl.py b/lib/kb_fasttree/kb_fasttreeImpl.py index 9170c7e..e3859fa 100644 --- a/lib/kb_fasttree/kb_fasttreeImpl.py +++ b/lib/kb_fasttree/kb_fasttreeImpl.py @@ -11,18 +11,16 @@ import uuid from datetime import datetime from pprint import pprint, pformat -import numpy as np -import gzip - -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord -from Bio.Alphabet import generic_protein -from biokbase.workspace.client import Workspace as workspaceService -from requests_toolbelt import MultipartEncoder -from biokbase.AbstractHandle.Client import AbstractHandle as HandleService -from DataFileUtil.DataFileUtilClient import DataFileUtil as DFUClient -from KBaseReport.KBaseReportClient import KBaseReport +#import numpy as np +#import gzip + +#from Bio import SeqIO +#from Bio.Seq import Seq +#from Bio.SeqRecord import SeqRecord +#from Bio.Alphabet import generic_protein +from installed_clients.WorkspaceClient import Workspace as workspaceService +from installed_clients.DataFileUtilClient import DataFileUtil as DFUClient +from installed_clients.KBaseReportClient import KBaseReport # silence whining import requests @@ -51,20 +49,21 @@ class kb_fasttree: # state. A method could easily clobber the state set by another while # the latter method is running. 
######################################### noqa - VERSION = "1.0.3" + VERSION = "1.1.0" GIT_URL = "https://github.com/kbaseapps/kb_fasttree" - GIT_COMMIT_HASH = "efac401948d543c0cc8972f39e6b987aceb4c377" + GIT_COMMIT_HASH = "b967ee863c008d6b131ffb70569e536dc863f127" #BEGIN_CLASS_HEADER - workspaceURL = None - shockURL = None - handleURL = None - FASTTREE_bin = '/kb/module/FastTree/bin/FastTree' - # target is a list for collecting log messages + def now_ISO(self): + now_timestamp = datetime.now() + now_secs_from_epoch = (now_timestamp - datetime(1970,1,1)).total_seconds() + now_timestamp_in_iso = datetime.fromtimestamp(int(now_secs_from_epoch)).strftime('%Y-%m-%d_%T') + return now_timestamp_in_iso + def log(self, target, message): - # we should do something better here... + message = '['+self.now_ISO()+'] '+message if target is not None: target.append(message) print(message) @@ -83,117 +82,6 @@ def get_genome_set_feature_seqs(self, ws_data, ws_info): pass - # Helper script borrowed from the transform service, logger removed - # - def upload_file_to_shock(self, - console, # DEBUG - shock_service_url = None, - filePath = None, - ssl_verify = True, - token = None): - """ - Use HTTP multi-part POST to save a file to a SHOCK instance. 
- """ - self.log(console,"UPLOADING FILE "+filePath+" TO SHOCK") - - if token is None: - raise Exception("Authentication token required!") - - #build the header - header = dict() - header["Authorization"] = "Oauth {0}".format(token) - if filePath is None: - raise Exception("No file given for upload to SHOCK!") - - dataFile = open(os.path.abspath(filePath), 'rb') - m = MultipartEncoder(fields={'upload': (os.path.split(filePath)[-1], dataFile)}) - header['Content-Type'] = m.content_type - - #logger.info("Sending {0} to {1}".format(filePath,shock_service_url)) - try: - response = requests.post(shock_service_url + "/node", headers=header, data=m, allow_redirects=True, verify=ssl_verify) - dataFile.close() - except: - dataFile.close() - raise - if not response.ok: - response.raise_for_status() - result = response.json() - if result['error']: - raise Exception(result['error'][0]) - else: - return result["data"] - - - def upload_SingleEndLibrary_to_shock_and_ws (self, - ctx, - console, # DEBUG - workspace_name, - obj_name, - file_path, - provenance, - sequencing_tech): - - self.log(console,'UPLOADING FILE '+file_path+' TO '+workspace_name+'/'+obj_name) - - # 1) upload files to shock - token = ctx['token'] - forward_shock_file = self.upload_file_to_shock( - console, # DEBUG - shock_service_url = self.shockURL, - filePath = file_path, - token = token - ) - #pprint(forward_shock_file) - self.log(console,'SHOCK UPLOAD DONE') - - # 2) create handle - self.log(console,'GETTING HANDLE') - hs = HandleService(url=self.handleURL, token=token) - forward_handle = hs.persist_handle({ - 'id' : forward_shock_file['id'], - 'type' : 'shock', - 'url' : self.shockURL, - 'file_name': forward_shock_file['file']['name'], - 'remote_md5': forward_shock_file['file']['checksum']['md5']}) - - - # 3) save to WS - self.log(console,'SAVING TO WORKSPACE') - single_end_library = { - 'lib': { - 'file': { - 'hid':forward_handle, - 'file_name': forward_shock_file['file']['name'], - 'id': 
forward_shock_file['id'], - 'url': self.shockURL, - 'type':'shock', - 'remote_md5':forward_shock_file['file']['checksum']['md5'] - }, - 'encoding':'UTF8', - 'type':'fasta', - 'size':forward_shock_file['file']['size'] - }, - 'sequencing_tech':sequencing_tech - } - self.log(console,'GETTING WORKSPACE SERVICE OBJECT') - ws = workspaceService(self.workspaceURL, token=ctx['token']) - self.log(console,'SAVE OPERATION...') - new_obj_info = ws.save_objects({ - 'workspace':workspace_name, - 'objects':[ - { - 'type':'KBaseFile.SingleEndLibrary', - 'data':single_end_library, - 'name':obj_name, - 'meta':{}, - 'provenance':provenance - }] - })[0] - self.log(console,'SAVED TO WORKSPACE') - - return new_obj_info[0] - #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't @@ -201,8 +89,6 @@ def upload_SingleEndLibrary_to_shock_and_ws (self, def __init__(self, config): #BEGIN_CONSTRUCTOR self.workspaceURL = config['workspace-url'] - self.shockURL = config['shock-url'] - self.handleURL = config['handle-service-url'] self.serviceWizardURL = config['service-wizard-url'] self.callbackURL = os.environ.get('SDK_CALLBACK_URL') @@ -306,8 +192,8 @@ def run_FastTree(self, ctx, params): else: for row_id in row_order: default_row_labels[row_id] = row_id - if len(row_order) < 2: - self.log(invalid_msgs,"must have multiple records in MSA: "+params['input_ref']) + if len(row_order) <= 2: + self.log(invalid_msgs,"must have 3 or more records in MSA: "+params['input_ref']) # DEBUG #for row_id in row_order: # self.log(console, "row_id: '"+row_id+"' default_row_label: '"+default_row_labels[row_id]+"'") @@ -321,15 +207,15 @@ def run_FastTree(self, ctx, params): for row_id in row_order: # take care of characters that will mess up newick and/or fasttree row_id_disp = re.sub('\s','_',row_id) - row_id_disp = re.sub('\/','%'+'/'.encode("hex"), row_id_disp) - row_id_disp = re.sub(r'\\','%'+'\\'.encode("hex"), row_id_disp) - row_id_disp = 
re.sub('\(','%'+'('.encode("hex"), row_id_disp) - row_id_disp = re.sub('\)','%'+')'.encode("hex"), row_id_disp) - row_id_disp = re.sub('\[','%'+'['.encode("hex"), row_id_disp) - row_id_disp = re.sub('\]','%'+']'.encode("hex"), row_id_disp) - row_id_disp = re.sub('\:','%'+':'.encode("hex"), row_id_disp) - row_id_disp = re.sub('\;','%'+';'.encode("hex"), row_id_disp) - row_id_disp = re.sub('\|','%'+';'.encode("hex"), row_id_disp) + row_id_disp = re.sub('\/','%'+'/'.encode("utf-8").hex(), row_id_disp) + row_id_disp = re.sub(r'\\','%'+'\\'.encode("utf-8").hex(), row_id_disp) + row_id_disp = re.sub('\(','%'+'('.encode("utf-8").hex(), row_id_disp) + row_id_disp = re.sub('\)','%'+')'.encode("utf-8").hex(), row_id_disp) + row_id_disp = re.sub('\[','%'+'['.encode("utf-8").hex(), row_id_disp) + row_id_disp = re.sub('\]','%'+']'.encode("utf-8").hex(), row_id_disp) + row_id_disp = re.sub('\:','%'+':'.encode("utf-8").hex(), row_id_disp) + row_id_disp = re.sub('\;','%'+';'.encode("utf-8").hex(), row_id_disp) + row_id_disp = re.sub('\|','%'+';'.encode("utf-8").hex(), row_id_disp) new_ids[row_id] = row_id_disp #self.log(console,"row_id: '"+row_id+"' row_id_disp: '"+row_id_disp+"'") # DEBUG @@ -342,7 +228,7 @@ def run_FastTree(self, ctx, params): records.extend(['>'+row_id_disp, MSA_in['alignment'][row_id] ]) - with open(input_MSA_file_path,'w',0) as input_MSA_file_handle: + with open(input_MSA_file_path,'w') as input_MSA_file_handle: input_MSA_file_handle.write("\n".join(records)+"\n") # DEBUG @@ -384,7 +270,7 @@ def run_FastTree(self, ctx, params): tree_in = data intree_newick_file_path = os.path.join(self.scratch, intree_name+".newick") self.log(console, 'writing intree file: '+intree_newick_file_path) - intree_newick_file_handle = open(intree_newick_file_path, 'w', 0) + intree_newick_file_handle = open(intree_newick_file_path, 'w') intree_newick_file_handle.write(tree_in['tree']) intree_newick_file_handle.close() else: @@ -392,7 +278,7 @@ def run_FastTree(self, ctx, params): # 
DEBUG: check the MSA file contents -# with open(input_MSA_file_path, 'r', 0) as input_MSA_file_handle: +# with open(input_MSA_file_path, 'r') as input_MSA_file_handle: # for line in input_MSA_file_handle: # #self.log(console,"MSA_LINE: '"+line+"'") # too big for console # self.log(invalid_msgs,"MSA_LINE: '"+line+"'") @@ -464,7 +350,7 @@ def run_FastTree(self, ctx, params): raise ValueError("empty file '"+input_MSA_file_path+"'") # DEBUG -# with open(input_MSA_file_path,'r',0) as input_MSA_file_handle: +# with open(input_MSA_file_path,'r') as input_MSA_file_handle: # for line in input_MSA_file_handle: # #self.log(console,"MSA LINE: '"+line+"'") # too big for console # self.log(invalid_msgs,"MSA LINE: '"+line+"'") @@ -557,9 +443,9 @@ def run_FastTree(self, ctx, params): # write MSA to process for FastTree # - with open(input_MSA_file_path,'r',0) as input_MSA_file_handle: + with open(input_MSA_file_path,'r') as input_MSA_file_handle: for line in input_MSA_file_handle: - p.stdin.write(line) + p.stdin.write(line.encode()) p.stdin.close() p.wait() @@ -614,7 +500,7 @@ def run_FastTree(self, ctx, params): if 'species_tree_flag' in params and params['species_tree_flag'] != None and params['species_tree_flag'] != 0: tree_type = 'SpeciesTree' - with open(output_newick_file_path,'r',0) as output_newick_file_handle: + with open(output_newick_file_path,'r') as output_newick_file_handle: output_newick_buf = output_newick_file_handle.read() output_newick_buf = output_newick_buf.rstrip() if not output_newick_buf.endswith(';'): @@ -631,12 +517,13 @@ def run_FastTree(self, ctx, params): if default_row_labels: default_node_labels = dict() leaf_list = [] - for row_id in default_row_labels.keys(): + #for row_id in default_row_labels.keys(): # some row ids don't wind up in trimmed MSA + for row_id in row_order: new_row_id = new_ids[row_id] #default_node_labels[row_id] = default_row_labels[row_id] default_node_labels[new_row_id] = default_row_labels[row_id] leaf_list.append(new_row_id) - 
+ if 'ws_refs' in MSA_in.keys() and MSA_in['ws_refs'] != None: ws_refs = MSA_in['ws_refs'] if 'kb_refs' in MSA_in.keys() and MSA_in['kb_refs'] != None: @@ -719,22 +606,22 @@ def run_FastTree(self, ctx, params): new_id = new_ids[row_id] label = default_node_labels[new_id] label = re.sub('\s','_',label) - label = re.sub('\/','%'+'/'.encode("hex"), label) - label = re.sub(r'\\','%'+'\\'.encode("hex"), label) - label = re.sub('\(','%'+'('.encode("hex"), label) - label = re.sub('\)','%'+')'.encode("hex"), label) - label = re.sub('\[','%'+'['.encode("hex"), label) - label = re.sub('\]','%'+']'.encode("hex"), label) - label = re.sub('\:','%'+':'.encode("hex"), label) - label = re.sub('\;','%'+';'.encode("hex"), label) - label = re.sub('\|','%'+';'.encode("hex"), label) + label = re.sub('\/','%'+'/'.encode("utf-8").hex(), label) + label = re.sub(r'\\','%'+'\\'.encode("utf-8").hex(), label) + label = re.sub('\(','%'+'('.encode("utf-8").hex(), label) + label = re.sub('\)','%'+')'.encode("utf-8").hex(), label) + label = re.sub('\[','%'+'['.encode("utf-8").hex(), label) + label = re.sub('\]','%'+']'.encode("utf-8").hex(), label) + label = re.sub('\:','%'+':'.encode("utf-8").hex(), label) + label = re.sub('\;','%'+';'.encode("utf-8").hex(), label) + label = re.sub('\|','%'+';'.encode("utf-8").hex(), label) mod_newick_buf = re.sub ('\('+new_id+'\:', '('+label+':', mod_newick_buf) mod_newick_buf = re.sub ('\,'+new_id+'\:', ','+label+':', mod_newick_buf) #self.log(console, "new_id: '"+new_id+"' label: '"+label+"'") # DEBUG mod_newick_buf = re.sub ('_', ' ', mod_newick_buf) - with open (output_newick_labels_file_path, 'w', 0) as output_newick_labels_file_handle: + with open (output_newick_labels_file_path, 'w') as output_newick_labels_file_handle: output_newick_labels_file_handle.write(mod_newick_buf) # upload @@ -835,7 +722,7 @@ def run_FastTree(self, ctx, params): html_report_lines += [''] html_report_str = "\n".join(html_report_lines) - with open (output_html_file_path, 'w', 0) 
as html_handle: + with open (output_html_file_path, 'w') as html_handle: html_handle.write(html_report_str) diff --git a/lib/kb_fasttree/kb_fasttreeServer.py b/lib/kb_fasttree/kb_fasttreeServer.py index d4957fc..7b98e4a 100644 --- a/lib/kb_fasttree/kb_fasttreeServer.py +++ b/lib/kb_fasttree/kb_fasttreeServer.py @@ -1,23 +1,29 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from wsgiref.simple_server import make_server -import sys +import datetime import json +import os +import random as _random +import sys import traceback -import datetime -from multiprocessing import Process from getopt import getopt, GetoptError -from jsonrpcbase import JSONRPCService, InvalidParamsError, KeywordError,\ +from multiprocessing import Process +from os import environ +from wsgiref.simple_server import make_server + +import requests as _requests +from jsonrpcbase import JSONRPCService, InvalidParamsError, KeywordError, \ JSONRPCError, InvalidRequestError from jsonrpcbase import ServerError as JSONServerError -from os import environ -from ConfigParser import ConfigParser + from biokbase import log -import requests as _requests -import random as _random -import os from kb_fasttree.authclient import KBaseAuth as _KBaseAuth +try: + from ConfigParser import ConfigParser +except ImportError: + from configparser import ConfigParser + DEPLOY = 'KB_DEPLOYMENT_CONFIG' SERVICE = 'KB_SERVICE_NAME' AUTH = 'auth-service-url' @@ -109,11 +115,10 @@ def _call_method(self, ctx, request): # Exception was raised inside the method. 
newerr = JSONServerError() newerr.trace = traceback.format_exc() - if isinstance(e.message, basestring): - newerr.data = e.message + if len(e.args) == 1: + newerr.data = repr(e.args[0]) else: - # Some exceptions embed other exceptions as the message - newerr.data = repr(e.message) + newerr.data = repr(e.args) raise newerr return result @@ -175,7 +180,7 @@ def call_py(self, ctx, jsondata): def _handle_request(self, ctx, request): """Handles given request and returns its response.""" - if self.method_data[request['method']].has_key('types'): # noqa @IgnorePep8 + if 'types' in self.method_data[request['method']]: self._validate_params_types(request['method'], request['params']) result = self._call_method(ctx, request) @@ -404,7 +409,7 @@ def __call__(self, environ, start_response): ctx['user_id'] = user ctx['authenticated'] = 1 ctx['token'] = token - except Exception, e: + except Exception as e: if auth_req == 'required': err = JSONServerError() err.data = \ @@ -435,11 +440,11 @@ def __call__(self, environ, start_response): rpc_result = self.process_error(err, ctx, req, traceback.format_exc()) - # print 'Request method was %s\n' % environ['REQUEST_METHOD'] - # print 'Environment dictionary is:\n%s\n' % pprint.pformat(environ) - # print 'Request body was: %s' % request_body - # print 'Result from the method call is:\n%s\n' % \ - # pprint.pformat(rpc_result) + # print('Request method was %s\n' % environ['REQUEST_METHOD']) + # print('Environment dictionary is:\n%s\n' % pprint.pformat(environ)) + # print('Request body was: %s' % request_body) + # print('Result from the method call is:\n%s\n' % \ + # pprint.pformat(rpc_result)) if rpc_result: response_body = rpc_result @@ -453,7 +458,7 @@ def __call__(self, environ, start_response): ('content-type', 'application/json'), ('content-length', str(len(response_body)))] start_response(status, response_headers) - return [response_body] + return [response_body.encode('utf8')] def process_error(self, error, context, request, 
trace=None): if trace: @@ -505,7 +510,7 @@ def now_in_utc(self): # a wsgi container that has enabled gevent, such as # uwsgi with the --gevent option if config is not None and config.get('gevent_monkeypatch_all', False): - print "Monkeypatching std libraries for async" + print("Monkeypatching std libraries for async") from gevent import monkey monkey.patch_all() uwsgi.applications = {'': application} @@ -529,7 +534,7 @@ def start_server(host='localhost', port=0, newprocess=False): raise RuntimeError('server is already running') httpd = make_server(host, port, application) port = httpd.server_address[1] - print "Listening on port %s" % port + print("Listening on port %s" % port) if newprocess: _proc = Process(target=httpd.serve_forever) _proc.daemon = True @@ -608,7 +613,7 @@ def process_async_cli(input_file_path, output_file_path, token): opts, args = getopt(sys.argv[1:], "", ["port=", "host="]) except GetoptError as err: # print help information and exit: - print str(err) # will print something like "option -a not recognized" + print(str(err)) # will print something like "option -a not recognized" sys.exit(2) port = 9999 host = 'localhost' @@ -617,12 +622,12 @@ def process_async_cli(input_file_path, output_file_path, token): port = int(a) elif o == '--host': host = a - print "Host set to %s" % host + print("Host set to %s" % host) else: assert False, "unhandled option" start_server(host=host, port=port) -# print "Listening on port %s" % port +# print("Listening on port %s" % port) # httpd = make_server( host, port, application) # # httpd.serve_forever() diff --git a/scripts/prepare_deploy_cfg.py b/scripts/prepare_deploy_cfg.py index 395343f..8c3c781 100644 --- a/scripts/prepare_deploy_cfg.py +++ b/scripts/prepare_deploy_cfg.py @@ -1,6 +1,6 @@ import sys from jinja2 import Template -from ConfigParser import ConfigParser +from configparser import ConfigParser # py3 if __name__ == "__main__": if len(sys.argv) != 3: diff --git a/src/FastTree-2.1.11.c 
b/src/FastTree-2.1.11.c new file mode 100644 index 0000000..997b5a7 --- /dev/null +++ b/src/FastTree-2.1.11.c @@ -0,0 +1,10304 @@ +/* + * FastTree -- inferring approximately-maximum-likelihood trees for large + * multiple sequence alignments. + * + * Morgan N. Price + * http://www.microbesonline.org/fasttree/ + * + * Thanks to Jim Hester of the Cleveland Clinic Foundation for + * providing the first parallel (OpenMP) code, Siavash Mirarab of + * UT Austin for implementing the WAG option, Samuel Shepard + * at the CDC for suggesting and helping with the -quote option, and + * Aaron Darling (University of Technology, Sydney) for numerical changes + * for wide alignments of closely-related sequences. + * + * Copyright (C) 2008-2015 The Regents of the University of California + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * or visit http://www.gnu.org/copyleft/gpl.html + * + * Disclaimer + * + * NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY, + * NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, + * OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY, + * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT, + * OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE + * PRIVATELY OWNED RIGHTS. + */ + +/* + * To compile FastTree, do: + * gcc -Wall -O3 -finline-functions -funroll-loops -o FastTree -lm FastTree.c + * Use -DNO_SSE to turn off use of SSE3 instructions + * (should not be necessary because compiler should not set __SSE__ if + * not available, and modern mallocs should return 16-byte-aligned values) + * Use -DOPENMP -fopenmp to use multiple threads (note, old versions of gcc + * may not support -fopenmp) + * Use -DTRACK_MEMORY if you want detailed reports of memory usage, + * but results are not correct above 4GB because mallinfo stores int values. + * It also makes FastTree run significantly slower. + * + * To get usage guidance, do: + * FastTree -help + * + * FastTree uses profiles instead of a distance matrix, and computes + * support values for each split from the profiles of the 4 nodes + * around the split. It stores a profile for each node and a average + * profile over all active nodes (the "out-profile" for computing the + * total sum of distance to other nodes). The neighbor joining phase + * requires O(N*L*a) space, where N is the number of sequences, L is + * the alignment width, and a is the alphabet size. The top-hits + * heuristic requires an additional O(N sqrt(N)) memory. After + * neighbor-joining, FastTree improves the topology with + * nearest-neighbor interchanges (NNIs) and subtree-prune-regraft + * moves (SPRs), which does not have a significant additional memory + * requirement. 
(We need only store "up-profiles" on the path from our + * current traversal point to the root.) These take O(NLa) time per + * round, and with default settings, O(N log(N) L a) time total. + * FastTree further improves the topology with maximum-likelihood + * NNIs, using similar data structures and complexity, but with a + * higher constant factor, and now the "profiles" are actually + * posterior distributions for that subtree. Finally, FastTree + * resamples the site likelihoods around each NNI and uses + * the Shimodaira Hasegawa test to estimate the reliability of each split. + * + * Overview of the neighbor-joining phase: + * + * Although FastTree uses a log correction on profile distances to + * account for multiple substitutions when doing NNIs and SPRs, the + * operations on the profiles themselves involve "additive" distances + * -- either %different (for nucleotide) or by using an amino acid + * similarity matrix (for proteins). If we are using %different as + * our distance matrix then + * + * Profile_distance(A,B) = 1 - sum over characters of freq(A)*freq(B) + * + * and we can average this value over positions. Positions with gaps + * are weighted by %ungapped(A) * %ungapped(B). + * + * If we are using an amino acid dissimilarity matrix D(i,j) then at + * each position + * + * Profile_distance(A,B) = sum(i,j) freq(A==i) * freq(B==j) * D(i,j) + * = sum(k) Ak * Bk * Lambda(k) + * + * where k iterates over 20 eigenvectors, Lambda(k) is the eigenvalue, + * and if A==i, then Ak is the kth column of the inverse of the + * eigenvector matrix. + * + * The exhaustive approach (-slow) takes O(N**3*L*a) time, but + * this can be reduced to as little as O(N**(3/2)*log(N)*L*a) time + * by using heuristics. + * + * It uses a combination of three heuristics: a visible set similar to + * that of FastTree (Elias & Lagergren 2005), a local hill-climbing + * search for a better join (as in relaxed neighbor-joining, Evans et + * al. 
2006), and a top-hit list to reduce the search space (see + * below). + * + * The "visible" set stores, for each node, the best join for that + * node, as identified at some point in the past + * + * If top-hits are not being used, then the neighbor-joining phase can + * be summarized as: + * + * Compute the out-profile by averaging the leaves + * Compute the out-distance of each leaf quickly, using the out-profile + * Compute the visible set (or approximate it using top-hits, see below) + * Until we're down to 3 active nodes: + * Find the best join in the visible set + * (This involves recomputing the neighbor-joining criterion, + * as out-distances and #active nodes may have changed) + * Follow a chain of best hits (again recomputing the criterion) + * until we find a locally best join, as in relaxed neighbor joining + * Create a profile of the parent node, either using simple averages (default) + * or using weighted joining as in BIONJ (if -bionj was specified) + * Update the out-profile and the out-distances + * Update the visible set: + * find the best join for the new joined node + * replace hits to the joined children with hits to the parent + * if we stumble across a join for the new node that is better + * than the corresponding entry in the visible set, "reset" + * that entry. + * + * For each iteration, this method does + * O(N) work to find the best hit in the visible set + * O(L*N*a*log(N)) work to do the local search, where log(N) + * is a pessimistic estimate of the number of iterations. In + * practice, we average <1 iteration for 2,000 sequences. + * With -fastest, this step is omitted. + * O(N*a) work to compute the joined profile and update the out-profile + * O(L*N*a) work to update the out-distances + * O(L*N*a) work to compare the joined profile to the other nodes + * (to find the new entry in the visible set) + * + * and there are N-3 iterations, so it takes O(N**2 * L * log(N) * a) time. 
+ * + * The profile distances give exactly the same result as matrix + * distances in neighbor-joining or BIONJ would if there are no gaps + * in the alignment. If there are gaps, then it is an + * approximation. To get the same result we also store a "diameter" + * for each node (diameter is 0 for leaves). + * + * In the simpler case (NJ rather than BIONJ), when we join A and B to + * give a new node AB, + * + * Profile(AB) = (A+B)/2 + * Profile_distance(AB,C) = (Profile_distance(A,C)+Profile_distance(B,C))/2 + * because the formulas above are linear + * + * And according to the neighor-joining rule, + * d(AB,C) = (d(A,C)+d(B,C)-d(A,B))/2 + * + * and we can achieve the same value by writing + * diameter(AB) = pd(A,B)/2 + * diameter(leaf) = 0 + * d(A,B) = pd(A,B) - diameter(A) - diameter(B) + * + * because + * d(AB,C) = (d(A,C)+d(B,C)-d(A,B))/2 + * = (pd(A,C)-diam(A)-diam(C)+pd(B,C)-diam(B)-diam(C)-d(A,B)+diam(A)+diam(B))/2 + * = (pd(A,C)+pd(B,C))/2 - diam(C) - pd(A,B) + * = pd(AB,C) - diam(AB) - diam(C) + * + * If we are using BIONJ, with weight lambda for the join: + * Profile(AB) = lambda*A + (1-lambda)*B + * then a similar argument gives + * diam(AB) = lambda*diam(A) + (1-lambda)*diam(B) + lambda*d(A,AB) + (1-lambda)*d(B,AB), + * + * where, as in neighbor joining, + * d(A,AB) = d(A,B) + (total out_distance(A) - total out_distance(B))/(n-2) + * + * A similar recursion formula works for the "variance" matrix of BIONJ, + * var(AB,C) = lambda*var(A,C) + (1-lambda)*var(B,C) - lambda*(1-lambda)*var(A,B) + * is equivalent to + * var(A,B) = pv(A,B) - vd(A) - vd(B), where + * pv(A,B) = pd(A,B) + * vd(A) = 0 for leaves + * vd(AB) = lambda*vd(A) + (1-lambda)*vd(B) + lambda*(1-lambda)*var(A,B) + * + * The top-hist heuristic to reduce the work below O(N**2*L) stores a top-hit + * list of size m=sqrt(N) for each active node. 
+ * + * The list can be initialized for all the leaves in sub (N**2 * L) time as follows: + * Pick a "seed" sequence and compare it to all others + * Store the top m hits of the seed as its top-hit list + * Take "close" hits of the seed (within the top m, and see the "close" parameter), + * and assume that their top m hits lie within the top 2*m hits of the seed. + * So, compare them to the seed's neighbors (if they do not already + * have a top hit list) and set their top hits. + * + * This method does O(N*L) work for each seed, or O(N**(3/2)*L) work total. + * + * To avoid doing O(N*L) work at each iteration, we need to avoid + * updating the visible set and the out-distances. So, we use "stale" + * out-distances, and when searching the visible set for the best hit, + * we only inspect the top m=sqrt(N) entries. We then update those + * out-distances (up to 2*m*L*a work) and then find the best hit. + * + * To avoid searching the entire visible set, FastTree keeps + * and updates a list of the top sqrt(N) entries in the visible set. + * This costs O(sqrt(N)) time per join to find the best entry and to + * update, or (N sqrt(N)) time overall. + * + * Similarly, when doing the local hill-climbing, we avoid O(N*L) work + * by only considering the top-hits for the current node. So this adds + * O(m*a*log(N)) work per iteration. + * + * When we join two nodes, we compute profiles and update the + * out-profile as before. We need to compute the best hits of the node + * -- we merge the lists for the children and select the best up-to-m + * hits. If the top hit list contains a stale node we replace it with + * its parent. If we still have +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TRACK_MEMORY +/* malloc.h apparently doesn't exist on MacOS */ +#include +#endif + +/* Compile with -DOPENMP to turn on multithreading */ +#ifdef OPENMP +#include +#endif + +/* By default, tries to compile with SSE instructions for greater speed.
+ But if compiled with -DUSE_DOUBLE, uses double precision instead of single-precision + floating point (2x memory required), does not use SSE, and allows much shorter + branch lengths. +*/ +#ifdef __SSE__ +#if !defined(NO_SSE) && !defined(USE_DOUBLE) +#define USE_SSE3 +#endif +#endif + + +#ifdef USE_DOUBLE +#define SSE_STRING "Double precision (No SSE3)" +typedef double numeric_t; +#define ScanNumericSpec "%lf" +#else +typedef float numeric_t; +#define ScanNumericSpec "%f" +#endif + +#ifdef USE_SSE3 +#define SSE_STRING "SSE3" +#define ALIGNED __attribute__((aligned(16))) +#define IS_ALIGNED(X) ((((unsigned long) new) & 15L) == 0L) +#include + +#else + +#define ALIGNED +#define IS_ALIGNED(X) 1 + +#ifndef USE_DOUBLE +#define SSE_STRING "No SSE3" +#endif + +#endif /* USE_SSE3 */ + +#define FT_VERSION "2.1.11" + +char *usage = + " FastTree protein_alignment > tree\n" + " FastTree < protein_alignment > tree\n" + " FastTree -out tree protein_alignment\n" + " FastTree -nt nucleotide_alignment > tree\n" + " FastTree -nt -gtr < nucleotide_alignment > tree\n" + " FastTree < nucleotide_alignment > tree\n" + "FastTree accepts alignments in fasta or phylip interleaved formats\n" + "\n" + "Common options (must be before the alignment file):\n" + " -quiet to suppress reporting information\n" + " -nopr to suppress progress indicator\n" + " -log logfile -- save intermediate trees, settings, and model details\n" + " -fastest -- speed up the neighbor joining phase & reduce memory usage\n" + " (recommended for >50,000 sequences)\n" + " -n to analyze multiple alignments (phylip format only)\n" + " (use for global bootstrap, with seqboot and CompareToBootstrap.pl)\n" + " -nosupport to not compute support values\n" + " -intree newick_file to set the starting tree(s)\n" + " -intree1 newick_file to use this starting tree for all the alignments\n" + " (for faster global bootstrap on huge alignments)\n" + " -pseudo to use pseudocounts (recommended for highly gapped sequences)\n" + " -gtr -- 
generalized time-reversible model (nucleotide alignments only)\n" + " -lg -- Le-Gascuel 2008 model (amino acid alignments only)\n" + " -wag -- Whelan-And-Goldman 2001 model (amino acid alignments only)\n" + " -quote -- allow spaces and other restricted characters (but not ' ) in\n" + " sequence names and quote names in the output tree (fasta input only;\n" + " FastTree will not be able to read these trees back in)\n" + " -noml to turn off maximum-likelihood\n" + " -nome to turn off minimum-evolution NNIs and SPRs\n" + " (recommended if running additional ML NNIs with -intree)\n" + " -nome -mllen with -intree to optimize branch lengths for a fixed topology\n" + " -cat # to specify the number of rate categories of sites (default 20)\n" + " or -nocat to use constant rates\n" + " -gamma -- after optimizing the tree under the CAT approximation,\n" + " rescale the lengths to optimize the Gamma20 likelihood\n" + " -constraints constraintAlignment to constrain the topology search\n" + " constraintAlignment should have 1s or 0s to indicates splits\n" + " -expert -- see more options\n" + "For more information, see http://www.microbesonline.org/fasttree/\n"; + +char *expertUsage = + "FastTree [-nt] [-n 100] [-quote] [-pseudo | -pseudo 1.0]\n" + " [-boot 1000 | -nosupport]\n" + " [-intree starting_trees_file | -intree1 starting_tree_file]\n" + " [-quiet | -nopr]\n" + " [-nni 10] [-spr 2] [-noml | -mllen | -mlnni 10]\n" + " [-mlacc 2] [-cat 20 | -nocat] [-gamma]\n" + " [-slow | -fastest] [-2nd | -no2nd] [-slownni] [-seed 1253] \n" + " [-top | -notop] [-topm 1.0 [-close 0.75] [-refresh 0.8]]\n" + " [-gtr] [-gtrrates ac ag at cg ct gt] [-gtrfreq A C G T]\n" + " [ -lg | -wag | -trans transitionmatrixfile ]\n" + " [-matrix Matrix | -nomatrix] [-nj | -bionj]\n" + " [ -constraints constraintAlignment [ -constraintWeight 100.0 ] ]\n" + " [-log logfile]\n" + " [ alignment_file ]\n" + " [ -out output_newick_file | > newick_tree]\n" + "\n" + "or\n" + "\n" + "FastTree [-nt] [-matrix 
Matrix | -nomatrix] [-rawdist] -makematrix [alignment]\n" + " [-n 100] > phylip_distance_matrix\n" + "\n" + " FastTree supports fasta or phylip interleaved alignments\n" + " By default FastTree expects protein alignments, use -nt for nucleotides\n" + " FastTree reads standard input if no alignment file is given\n" + "\n" + "Input/output options:\n" + " -n -- read in multiple alignments in. This only\n" + " works with phylip interleaved format. For example, you can\n" + " use it with the output from phylip's seqboot. If you use -n, FastTree\n" + " will write 1 tree per line to standard output.\n" + " -intree newickfile -- read the starting tree in from newickfile.\n" + " Any branch lengths in the starting trees are ignored.\n" + " -intree with -n will read a separate starting tree for each alignment.\n" + " -intree1 newickfile -- read the same starting tree for each alignment\n" + " -quiet -- do not write to standard error during normal operation (no progress\n" + " indicator, no options summary, no likelihood values, etc.)\n" + " -nopr -- do not write the progress indicator to stderr\n" + " -log logfile -- save intermediate trees so you can extract\n" + " the trees and restart long-running jobs if they crash\n" + " -log also reports the per-site rates (1 means slowest category)\n" + " -quote -- quote sequence names in the output and allow spaces, commas,\n" + " parentheses, and colons in them but not ' characters (fasta files only)\n" + "\n" + "Distances:\n" + " Default: For protein sequences, log-corrected distances and an\n" + " amino acid dissimilarity matrix derived from BLOSUM45\n" + " or for nucleotide sequences, Jukes-Cantor distances\n" + " To specify a different matrix, use -matrix FilePrefix or -nomatrix\n" + " Use -rawdist to turn the log-correction off\n" + " or to use %different instead of Jukes-Cantor\n" + " (These options affect minimum-evolution computations only;\n" + " use -trans to affect maximum-likelihoood computations)\n" + "\n" + " -pseudo 
[weight] -- Use pseudocounts to estimate distances between\n" + " sequences with little or no overlap. (Off by default.) Recommended\n" + " if analyzing the alignment has sequences with little or no overlap.\n" + " If the weight is not specified, it is 1.0\n" + "\n" + "Topology refinement:\n" + " By default, FastTree tries to improve the tree with up to 4*log2(N)\n" + " rounds of minimum-evolution nearest-neighbor interchanges (NNI),\n" + " where N is the number of unique sequences, 2 rounds of\n" + " subtree-prune-regraft (SPR) moves (also min. evo.), and\n" + " up to 2*log(N) rounds of maximum-likelihood NNIs.\n" + " Use -nni to set the number of rounds of min. evo. NNIs,\n" + " and -spr to set the rounds of SPRs.\n" + " Use -noml to turn off both min-evo NNIs and SPRs (useful if refining\n" + " an approximately maximum-likelihood tree with further NNIs)\n" + " Use -sprlength set the maximum length of a SPR move (default 10)\n" + " Use -mlnni to set the number of rounds of maximum-likelihood NNIs\n" + " Use -mlacc 2 or -mlacc 3 to always optimize all 5 branches at each NNI,\n" + " and to optimize all 5 branches in 2 or 3 rounds\n" + " Use -mllen to optimize branch lengths without ML NNIs\n" + " Use -mllen -nome with -intree to optimize branch lengths on a fixed topology\n" + " Use -slownni to turn off heuristics to avoid constant subtrees (affects both\n" + " ML and ME NNIs)\n" + "\n" + "Maximum likelihood model options:\n" + " -lg -- Le-Gascuel 2008 model instead of (default) Jones-Taylor-Thorton 1992 model (a.a. only)\n" + " -wag -- Whelan-And-Goldman 2001 model instead of (default) Jones-Taylor-Thorton 1992 model (a.a. 
only)\n" + " -gtr -- generalized time-reversible instead of (default) Jukes-Cantor (nt only)\n" + " -cat # -- specify the number of rate categories of sites (default 20)\n" + " -nocat -- no CAT model (just 1 category)\n" + " -trans filename -- use the transition matrix from filename\n" + " This is supported for amino acid alignments only\n" + " The file must be tab-delimited with columns in the order ARNDCQEGHILKMFPSTWYV*\n" + " The additional column named * is for the stationary distribution\n" + " Each row must have a row name in the same order ARNDCQEGHILKMFPSTWYV\n" + " -gamma -- after the final round of optimizing branch lengths with the CAT model,\n" + " report the likelihood under the discrete gamma model with the same\n" + " number of categories. FastTree uses the same branch lengths but\n" + " optimizes the gamma shape parameter and the scale of the lengths.\n" + " The final tree will have rescaled lengths. Used with -log, this\n" + " also generates per-site likelihoods for use with CONSEL, see\n" + " GammaLogToPaup.pl and documentation on the FastTree web site.\n" + "\n" + "Support value options:\n" + " By default, FastTree computes local support values by resampling the site\n" + " likelihoods 1,000 times and the Shimodaira Hasegawa test.
If you specify -nome,\n" + " it will compute minimum-evolution bootstrap supports instead\n" + " In either case, the support values are proportions ranging from 0 to 1\n" + "\n" + " Use -nosupport to turn off support values or -boot 100 to use just 100 resamples\n" + " Use -seed to initialize the random number generator\n" + "\n" + "Searching for the best join:\n" + " By default, FastTree combines the 'visible set' of fast neighbor-joining with\n" + " local hill-climbing as in relaxed neighbor-joining\n" + " -slow -- exhaustive search (like NJ or BIONJ, but different gap handling)\n" + " -slow takes half an hour instead of 8 seconds for 1,250 proteins\n" + " -fastest -- search the visible set (the top hit for each node) only\n" + " Unlike the original fast neighbor-joining, -fastest updates visible(C)\n" + " after joining A and B if join(AB,C) is better than join(C,visible(C))\n" + " -fastest also updates out-distances in a very lazy way,\n" + " -fastest sets -2nd on as well, use -fastest -no2nd to avoid this\n" + "\n" + "Top-hit heuristics:\n" + " By default, FastTree uses a top-hit list to speed up search\n" + " Use -notop (or -slow) to turn this feature off\n" + " and compare all leaves to each other,\n" + " and all new joined nodes to each other\n" + " -topm 1.0 -- set the top-hit list size to parameter*sqrt(N)\n" + " FastTree estimates the top m hits of a leaf from the\n" + " top 2*m hits of a 'close' neighbor, where close is\n" + " defined as d(seed,close) < 0.75 * d(seed, hit of rank 2*m),\n" + " and updates the top-hits as joins proceed\n" + " -close 0.75 -- modify the close heuristic, lower is more conservative\n" + " -refresh 0.8 -- compare a joined node to all other nodes if its\n" + " top-hit list is less than 80% of the desired length,\n" + " or if the age of the top-hit list is log2(m) or greater\n" + " -2nd or -no2nd to turn 2nd-level top hits heuristic on or off\n" + " This reduces memory usage and running time but may lead to\n" + " marginal 
reductions in tree quality.\n" + " (By default, -fastest turns on -2nd.)\n" + "\n" + "Join options:\n" + " -nj: regular (unweighted) neighbor-joining (default)\n" + " -bionj: weighted joins as in BIONJ\n" + " FastTree will also weight joins during NNIs\n" + "\n" + "Constrained topology search options:\n" + " -constraints alignmentfile -- an alignment with values of 0, 1, and -\n" + " Not all sequences need be present. A column of 0s and 1s defines a\n" + " constrained split. Some constraints may be violated\n" + " (see 'violating constraints:' in standard error).\n" + " -constraintWeight -- how strongly to weight the constraints. A value of 1\n" + " means a penalty of 1 in tree length for violating a constraint\n" + " Default: 100.0\n" + "\n" + "For more information, see http://www.microbesonline.org/fasttree/\n" + " or the comments in the source code\n"; +; + + +#define MAXCODES 20 +#define NOCODE 127 +/* Note -- sequence lines longer than BUFFER_SIZE are + allowed, but FASTA header lines must be within this limit */ +#define BUFFER_SIZE 5000 +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) + +typedef struct { + int nPos; + int nSeq; + char **names; + char **seqs; + int nSaved; /* actual allocated size of names and seqs */ +} alignment_t; + +/* For each position in a profile, we have a weight (% non-gapped) and a + frequency vector. (If using a matrix, the frequency vector is in eigenspace). 
+ We also store codes for simple profile positions (all gaps or only 1 value) + If weight[pos] > 0 && codes[pos] == NOCODE then we store the vector + vectors itself is sets of nCodes long, so the vector for the ith nonconstant position + starts at &vectors[nCodes*i] + + To speed up comparison of outprofile to a sequence or other simple profile, we also + (for outprofiles) store codeDist[iPos*nCodes+k] = dist(k,profile[iPos]) + + For constraints, we store a vector of nOn and nOff + If not using constraints, those will be NULL +*/ +typedef struct { + /* alignment profile */ + numeric_t *weights; + unsigned char *codes; + numeric_t *vectors; /* NULL if no non-constant positions, e.g. for leaves */ + int nVectors; + numeric_t *codeDist; /* Optional -- distance to each code at each position */ + + /* constraint profile */ + int *nOn; + int *nOff; +} profile_t; + +/* A visible node is a pair of nodes i, j such that j is the best hit of i, + using the neighbor-joining criterion, at the time the comparison was made, + or approximately so since then. + + Note that variance = dist because in BIONJ, constant factors of variance do not matter, + and because we weight ungapped sequences higher naturally when averaging profiles, + so we do not take this into account in the computation of "lambda" for BIONJ. + + For the top-hit list heuristic, if the top hit list becomes "too short", + we store invalid entries with i=j=-1 and dist/criterion very high. +*/ +typedef struct { + int i, j; + numeric_t weight; /* Total product of weights (maximum value is nPos) + This is needed for weighted joins and for pseudocounts, + but not in most other places. 
+ For example, it is not maintained by the top hits code */ + numeric_t dist; /* The uncorrected distance (includes diameter correction) */ + numeric_t criterion; /* changes when we update the out-profile or change nActive */ +} besthit_t; + +typedef struct { + int nChild; + int child[3]; +} children_t; + +typedef struct { + /* Distances between amino acids */ + numeric_t distances[MAXCODES][MAXCODES]; + + /* Inverse of the eigenvalue matrix, for rotating a frequency vector + into eigenspace so that profile similarity computations are + O(alphabet) not O(alphabet*alphabet) time. + */ + numeric_t eigeninv[MAXCODES][MAXCODES]; + numeric_t eigenval[MAXCODES]; /* eigenvalues */ + + + /* eigentot=eigeninv times the all-1s frequency vector + useful for normalizing rotated frequency vectors + */ + numeric_t eigentot[MAXCODES]; + + /* codeFreq is the transpose of the eigeninv matrix is + the rotated frequency vector for each code */ + numeric_t codeFreq[MAXCODES][MAXCODES]; + numeric_t gapFreq[MAXCODES]; +} distance_matrix_t; + + +/* A transition matrix gives the instantaneous rate of change of frequencies + df/dt = M . f + which is solved by + f(t) = exp(M) . f(0) + and which is not a symmetric matrix because of + non-uniform stationary frequencies stat, so that + M stat = 0 + M(i,j) is instantaneous rate of j -> i, not of i -> j + + S = diag(sqrt(stat)) is a correction so that + M' = S**-1 M S is symmetric + Let W L W**-1 = M' be an eigendecomposition of M' + Because M' is symmetric, W can be a rotation, and W**-1 = t(W) + Set V = S*W + M = V L V**-1 is an eigendecomposition of M + Note V**-1 = W**-1 S**-1 = t(W) S**-1 + + Evolution by time t is given by + + exp(M*t) = V exp(L*t) V**-1 + P(A & B | t) = B . exp(M*t) . (A * stat) + note this is *not* the same as P(A->B | t) + + and we can reduce some of the computations from O(a**2) to O(a) time, + where a is the alphabet size, by storing frequency vectors as + t(V) . f = t(W) . t(S) . f + + Then + P(f0 & f1 | t) = f1 . 
exp(M*t) . f0 * (f0 . stat) = sum(r0j * r1j * exp(l_j*t)) + where r0 and r1 are the transformed vectors + + Posterior distribution of P given children f0 and f1 is given by + P(i | f0, f1, t0, t1) = stat * P(i->f0 | t0) * P(i->f1 | t1) + = P(i & f0 | t0) * P(i & f1 | t1) / stat + ~ (V . exp(t0*L) . r0) * (V . exp(t1*L) . r1) / stat + + When normalize this posterior distribution (to sum to 1), divide by stat, + and transform by t(V) -- this is the "profile" of internal nodes + + To eliminate the O(N**2) step of transforming by t(V), if the posterior + distribution of an amino acid is near 1 then we can approximate it by + P(i) ~= (i==A) * w + nearP(i) * (1-w), where + w is fit so that P(i==A) is correct + nearP = Posterior(i | i, i, 0.1, 0.1) [0.1 is an arbitrary choice] + and we confirm that the approximation works well before we use it. + + Given this parameter w we can set + rotated_posterior = rotation(w * (i==A)/stat + (1-w) * nearP/stat) + = codeFreq(A) * w/stat(A) + nearFreq(A) * (1-w) + */ +typedef struct { + numeric_t stat[MAXCODES]; /* The stationary distribution */ + numeric_t statinv[MAXCODES]; /* 1/stat */ + /* the eigenmatrix, with the eigenvectors as columns and rotations of individual + characters as rows. 
Also includes a NOCODE entry for gaps */ + numeric_t codeFreq[NOCODE+1][MAXCODES]; + numeric_t eigeninv[MAXCODES][MAXCODES]; /* Inverse of eigenmatrix */ + numeric_t eigeninvT[MAXCODES][MAXCODES]; /* transpose of eigeninv */ + numeric_t eigenval[MAXCODES]; /* Eigenvalues */ + /* These are for approximate posteriors (off by default) */ + numeric_t nearP[MAXCODES][MAXCODES]; /* nearP[i][j] = P(parent=j | both children are i, both lengths are 0.1 */ + numeric_t nearFreq[MAXCODES][MAXCODES]; /* rotation of nearP/stat */ +} transition_matrix_t; + +typedef struct { + int nRateCategories; + numeric_t *rates; /* 1 per rate category */ + unsigned int *ratecat; /* 1 category per position */ +} rates_t; + +typedef struct { + /* The input */ + int nSeq; + int nPos; + char **seqs; /* the aligment sequences array (not reallocated) */ + distance_matrix_t *distance_matrix; /* a pointer (not reallocated), or NULL if using %identity distance */ + transition_matrix_t *transmat; /* a pointer (is allocated), or NULL for Jukes-Cantor */ + /* Topological constraints are represented for each sequence as binary characters + with values of '0', '1', or '-' (for missing data) + Sequences that have no constraint may have a NULL string + */ + int nConstraints; + char **constraintSeqs; + + /* The profile data structures */ + int maxnode; /* The next index to allocate */ + int maxnodes; /* Space allocated in data structures below */ + profile_t **profiles; /* Profiles of leaves and intermediate nodes */ + numeric_t *diameter; /* To correct for distance "up" from children (if any) */ + numeric_t *varDiameter; /* To correct variances for distance "up" */ + numeric_t *selfdist; /* Saved for use in some formulas */ + numeric_t *selfweight; /* Saved for use in some formulas */ + + /* Average profile of all active nodes, the "outprofile" + * If all inputs are ungapped, this has weight 1 (not nSequences) at each position + * The frequencies all sum to one (or that is implied by the 
eigen-representation) + */ + profile_t *outprofile; + double totdiam; + + /* We sometimes use stale out-distances, so we remember what nActive was */ + numeric_t *outDistances; /* Sum of distances to other active (parent==-1) nodes */ + int *nOutDistActive; /* What nActive was when this outDistance was computed */ + + /* the inferred tree */ + int root; /* index of the root. Unlike other internal nodes, it has 3 children */ + int *parent; /* -1 or index of parent */ + children_t *child; + numeric_t *branchlength; /* Distance to parent */ + numeric_t *support; /* 1 for high-confidence nodes */ + + /* auxiliary data for maximum likelihood (defaults to 1 category of rate=1.0) */ + rates_t rates; +} NJ_t; + +/* Uniquify sequences in an alignment -- map from indices + in the alignment to unique indices in a NJ_t +*/ +typedef struct { + int nSeq; + int nUnique; + int *uniqueFirst; /* iUnique -> iAln */ + int *alnNext; /* iAln -> next, or -1 */ + int *alnToUniq; /* iAln -> iUnique, or -1 if another was the exemplar */ + char **uniqueSeq; /* indexed by iUniq -- points to strings allocated elsewhere */ +} uniquify_t; + +/* Describes which switch to do */ +typedef enum {ABvsCD,ACvsBD,ADvsBC} nni_t; + +/* A list of these describes a chain of NNI moves in a rooted tree, + making up, in total, an SPR move +*/ +typedef struct { + int nodes[2]; + double deltaLength; /* change in tree length for this step (lower is better) */ +} spr_step_t; + +/* Keep track of hits for the top-hits heuristic without wasting memory + j = -1 means empty + If j is an inactive node, this may be replaced by that node's parent (and dist recomputed) + */ +typedef struct { + int j; + numeric_t dist; +} hit_t; + +typedef struct { + int nHits; /* the allocated and desired size; some of them may be empty */ + hit_t *hits; + int hitSource; /* where to refresh hits from if a 2nd-level top-hit list, or -1 */ + int age; /* number of joins since a refresh */ +} top_hits_list_t; + +typedef struct { + int m; /*
size of a full top hits list, usually sqrt(N) */ + int q; /* size of a 2nd-level top hits, usually sqrt(m) */ + int maxnodes; + top_hits_list_t *top_hits_lists; /* one per node */ + hit_t *visible; /* the "visible" (very best) hit for each node */ + + /* The top-visible set is a subset, usually of size m, of the visible set -- + it is the set of joins to select from + Each entry is either a node whose visible set entry has a good (low) criterion, + or -1 for empty, or is an obsolete node (which is effectively the same). + Whenever we update the visible set, should also call UpdateTopVisible() + which ensures that none of the topvisible set are stale (that is, they + all point to an active node). + */ + int nTopVisible; /* nTopVisible = m * topvisibleMult */ + int *topvisible; + + int topvisibleAge; /* joins since the top-visible list was recomputed */ + +#ifdef OPENMP + /* 1 lock to read or write any top hits list, no thread grabs more than one */ + omp_lock_t *locks; +#endif +} top_hits_t; + +/* Global variables */ +/* Options */ +int verbose = 1; +int showProgress = 1; +int slow = 0; +int fastest = 0; +bool useTopHits2nd = false; /* use the second-level top hits heuristic? */ +int bionj = 0; +double tophitsMult = 1.0; /* 0 means compare nodes to all other nodes */ +double tophitsClose = -1.0; /* Parameter for how close is close; also used as a coverage req. */ +double topvisibleMult = 1.5; /* nTopVisible = m * topvisibleMult; 1 or 2 did not make much difference + in either running time or accuracy so I chose a compromise. 
*/ + +double tophitsRefresh = 0.8; /* Refresh if fraction of top-hit-length drops to this */ +double tophits2Mult = 1.0; /* Second-level top heuristic -- only with -fastest */ +int tophits2Safety = 3; /* Safety factor for second level of top-hits heuristic */ +double tophits2Refresh = 0.6; /* Refresh 2nd-level top hits if drops down to this fraction of length */ + +double staleOutLimit = 0.01; /* nActive changes by at most this amount before we recompute + an out-distance. (Only applies if using the top-hits heuristic) */ +double fResetOutProfile = 0.02; /* Recompute out profile from scratch if nActive has changed + by more than this proportion, and */ +int nResetOutProfile = 200; /* nActive has also changed more than this amount */ +int nCodes=20; /* 20 if protein, 4 if nucleotide */ +bool useMatrix=true; /* If false, use %different as the uncorrected distance */ +bool logdist = true; /* If true, do a log-correction (scoredist-like or Jukes-Cantor) + but only during NNIs and support values, not during neighbor-joining */ +double pseudoWeight = 0.0; /* The weight of pseudocounts to avoid artificial long branches when + nearby sequences in the tree have little or no overlap + (off by default). The prior distance is based on + all overlapping positions among the quartet or triplet under + consideration. The log correction takes place after the + pseudocount is used. */ +double constraintWeight = 100.0;/* Cost of violation of a topological constraint in evolutionary distance + or likelihood */ +double MEMinDelta = 1.0e-4; /* Changes of less than this in tree-length are discounted for + purposes of identifying fixed subtrees */ +bool fastNNI = true; +bool gammaLogLk = false; /* compute gamma likelihood without reoptimizing branch lengths? 
*/ + +/* Maximum likelihood options and constants */ +/* These are used to rescale likelihood values and avoid taking a logarithm at each position */ +const double LkUnderflow = 1.0e-4; +const double LkUnderflowInv = 1.0e4; +const double LogLkUnderflow = 9.21034037197618; /* -log(LkUnderflowInv) */ +const double Log2 = 0.693147180559945; +/* These are used to limit the optimization of branch lengths. + Also very short branch lengths can create numerical problems. + In version 2.1.7, the minimum branch lengths (MLMinBranchLength and MLMinRelBranchLength) + were increased to prevent numerical problems in rare cases. + In version 2.1.8, to provide useful branch lengths for genome-wide alignments, + the minimum branch lengths were dramatically decreased if USE_DOUBLE is defined. +*/ +#ifndef USE_DOUBLE +const double MLMinBranchLengthTolerance = 1.0e-4; /* absolute tolerance for optimizing branch lengths */ +const double MLFTolBranchLength = 0.001; /* fractional tolerance for optimizing branch lengths */ +const double MLMinBranchLength = 5.0e-4; /* minimum value for branch length */ +const double MLMinRelBranchLength = 2.5e-4; /* minimum of rate * length */ +const double fPostTotalTolerance = 1.0e-10; /* posterior vector must sum to at least this before rescaling */ +#else +const double MLMinBranchLengthTolerance = 1.0e-9; +const double MLFTolBranchLength = 0.001; +const double MLMinBranchLength = 5.0e-9; +const double MLMinRelBranchLength = 2.5e-9; +const double fPostTotalTolerance = 1.0e-20; +#endif + +int mlAccuracy = 1; /* Rounds of optimization of branch lengths; 1 means do 2nd round only if close */ +double closeLogLkLimit = 5.0; /* If partial optimization of an NNI looks like it would decrease the log likelihood + by this much or more then do not optimize it further */ +double treeLogLkDelta = 0.1; /* Give up if tree log-lk changes by less than this; NNIs that change + likelihood by less than this also are considered unimportant + by some heuristics */ +bool 
exactML = true; /* Exact or approximate posterior distributions for a.a.s */ +double approxMLminf = 0.95; /* Only try to approximate posterior distributions if max. value is at least this high */ +double approxMLminratio = 2/3.0;/* Ratio of approximated/true posterior values must be at least this high */ +double approxMLnearT = 0.2; /* 2nd component of near-constant posterior distribution uses this time scale */ +const int nDefaultRateCats = 20; + +/* Performance and memory usage */ +long profileOps = 0; /* Full profile-based distance operations */ +long outprofileOps = 0; /* How many of profileOps are comparisons to outprofile */ +long seqOps = 0; /* Faster leaf-based distance operations */ +long profileAvgOps = 0; /* Number of profile-average steps */ +long nHillBetter = 0; /* Number of hill-climbing steps */ +long nCloseUsed = 0; /* Number of "close" neighbors we avoid full search for */ +long nClose2Used = 0; /* Number of "close" neighbors we use 2nd-level top hits for */ +long nRefreshTopHits = 0; /* Number of full-blown searches (interior nodes) */ +long nVisibleUpdate = 0; /* Number of updates of the visible set */ +long nNNI = 0; /* Number of NNI changes performed */ +long nSPR = 0; /* Number of SPR changes performed */ +long nML_NNI = 0; /* Number of max-lik. 
NNI changes performed */ +long nSuboptimalSplits = 0; /* # of splits that are rejected given final tree (during bootstrap) */ +long nSuboptimalConstrained = 0; /* Bad splits that are due to constraints */ +long nConstraintViolations = 0; /* Number of constraint violations */ +long nProfileFreqAlloc = 0; +long nProfileFreqAvoid = 0; +long szAllAlloc = 0; +long mymallocUsed = 0; /* useful allocations by mymalloc */ +long maxmallocHeap = 0; /* Maximum of mi.arena+mi.hblkhd from mallinfo (actual mem usage) */ +long nLkCompute = 0; /* # of likelihood computations for pairs of probability vectors */ +long nPosteriorCompute = 0; /* # of computations of posterior probabilities */ +long nAAPosteriorExact = 0; /* # of times compute exact AA posterior */ +long nAAPosteriorRough = 0; /* # of times use rough approximation */ +long nStarTests = 0; /* # of times we use star test to avoid testing an NNI */ + +/* Protein character set */ +unsigned char *codesStringAA = (unsigned char*) "ARNDCQEGHILKMFPSTWYV"; +unsigned char *codesStringNT = (unsigned char*) "ACGT"; +unsigned char *codesString = NULL; + +distance_matrix_t *ReadDistanceMatrix(char *prefix); +void SetupDistanceMatrix(/*IN/OUT*/distance_matrix_t *); /* set eigentot, codeFreq, gapFreq */ +void ReadMatrix(char *filename, /*OUT*/numeric_t codes[MAXCODES][MAXCODES], bool check_codes); +void ReadVector(char *filename, /*OUT*/numeric_t codes[MAXCODES]); +alignment_t *ReadAlignment(/*READ*/FILE *fp, bool bQuote); /* Returns a list of strings (exits on failure) */ +alignment_t *FreeAlignment(alignment_t *); /* returns NULL */ +void FreeAlignmentSeqs(/*IN/OUT*/alignment_t *); + +/* Takes as input the transpose of the matrix V, with i -> j + This routine takes care of setting the diagonals +*/ +transition_matrix_t *CreateTransitionMatrix(/*IN*/double matrix[MAXCODES][MAXCODES], + /*IN*/double stat[MAXCODES]); +transition_matrix_t *CreateGTR(double *gtrrates/*ac,ag,at,cg,ct,gt*/, double *gtrfreq/*ACGT*/); +transition_matrix_t 
*ReadAATransitionMatrix(/*IN*/char *filename); + +/* For converting profiles from 1 rotation to another, or converts NULL to NULL */ +distance_matrix_t *TransMatToDistanceMat(transition_matrix_t *transmat); + +/* Allocates memory, initializes leaf profiles */ +NJ_t *InitNJ(char **sequences, int nSeqs, int nPos, + /*IN OPTIONAL*/char **constraintSeqs, int nConstraints, + /*IN OPTIONAL*/distance_matrix_t *, + /*IN OPTIONAL*/transition_matrix_t *); + +NJ_t *FreeNJ(NJ_t *NJ); /* returns NULL */ +void FastNJ(/*IN/OUT*/NJ_t *NJ); /* Does the joins */ +void ReliabilityNJ(/*IN/OUT*/NJ_t *NJ, int nBootstrap); /* Estimates the reliability of the joins */ + +/* nni_stats_t is meaningless for leaves and root, so all of those entries + will just be high (for age) or 0 (for delta) +*/ +typedef struct { + int age; /* number of rounds since this node was modified by an NNI */ + int subtreeAge; /* number of rounds since self or descendent had a significant improvement */ + double delta; /* improvement in score for this node (or 0 if no change) */ + double support; /* improvement of score for self over better of alternatives */ +} nni_stats_t; + +/* One round of nearest-neighbor interchanges according to the + minimum-evolution or approximate maximum-likelihood criterion. + If doing maximum likelihood then this modifies the branch lengths. 
+ age is the # of rounds since a node was NNId + Returns the # of topological changes performed +*/ +int NNI(/*IN/OUT*/NJ_t *NJ, int iRound, int nRounds, bool useML, + /*IN/OUT*/nni_stats_t *stats, + /*OUT*/double *maxDeltaCriterion); +nni_stats_t *InitNNIStats(NJ_t *NJ); +nni_stats_t *FreeNNIStats(nni_stats_t *, NJ_t *NJ); /* returns NULL */ + +/* One round of subtree-prune-regraft moves (minimum evolution) */ +void SPR(/*IN/OUT*/NJ_t *NJ, int maxSPRLength, int iRound, int nRounds); + +/* Recomputes all branch lengths by minimum evolution criterion*/ +void UpdateBranchLengths(/*IN/OUT*/NJ_t *NJ); + +/* Recomputes all branch lengths and, optionally, internal profiles */ +double TreeLength(/*IN/OUT*/NJ_t *NJ, bool recomputeProfiles); + +typedef struct { + int nBadSplits; + int nConstraintViolations; + int nBadBoth; + int nSplits; + /* How much length would be reduce or likelihood would be increased by the + best NNI we find (the worst "miss") */ + double dWorstDeltaUnconstrained; + double dWorstDeltaConstrained; +} SplitCount_t; + +void TestSplitsMinEvo(NJ_t *NJ, /*OUT*/SplitCount_t *splitcount); + +/* Sets SH-like support values if nBootstrap>0 */ +void TestSplitsML(/*IN/OUT*/NJ_t *NJ, /*OUT*/SplitCount_t *splitcount, int nBootstrap); + +/* Pick columns for resampling, stored as returned_vector[iBoot*nPos + j] */ +int *ResampleColumns(int nPos, int nBootstrap); + +/* Use out-profile and NJ->totdiam to recompute out-distance for node iNode + Only does this computation if the out-distance is "stale" (nOutDistActive[iNode] != nActive) + Note "IN/UPDATE" for NJ always means that we may update out-distances but otherwise + make no changes. 
+ */ +void SetOutDistance(/*IN/UPDATE*/NJ_t *NJ, int iNode, int nActive); + +/* Always sets join->criterion; may update NJ->outDistance and NJ->nOutDistActive, + assumes join's weight and distance are already set, + and that the constraint penalty (if any) is included in the distance +*/ +void SetCriterion(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *join); + +/* Computes weight and distance (which includes the constraint penalty) + and then sets the criterion (maybe update out-distances) +*/ +void SetDistCriterion(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *join); + +/* If join->i or join->j are inactive nodes, replaces them with their active ancestors. + After doing this, if i == j, or either is -1, sets weight to 0 and dist and criterion to 1e20 + and returns false (not a valid join) + Otherwise, if i or j changed, recomputes the distance and criterion. + Note that if i and j are unchanged then the criterion could be stale + If bUpdateDist is false, and i or j change, then it just sets dist to a negative number +*/ +bool UpdateBestHit(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *join, + bool bUpdateDist); + +/* This recomputes the criterion, or returns false if the visible node + is no longer active. +*/ +bool GetVisible(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/top_hits_t *tophits, + int iNode, /*OUT*/besthit_t *visible); + +int ActiveAncestor(/*IN*/NJ_t *NJ, int node); + +/* Compute the constraint penalty for a join. This is added to the "distance" + by SetCriterion */ +int JoinConstraintPenalty(/*IN*/NJ_t *NJ, int node1, int node2); +int JoinConstraintPenaltyPiece(NJ_t *NJ, int node1, int node2, int iConstraint); + +/* Helper function for computing the number of constraints violated by + a split, represented as counts of on and off on each side */ +int SplitConstraintPenalty(int nOn1, int nOff1, int nOn2, int nOff2); + +/* Reports the (min. evo.) support for the (1,2) vs. 
(3,4) split + col[iBoot*nPos+j] is column j for bootstrap iBoot +*/ +double SplitSupport(profile_t *p1, profile_t *p2, profile_t *p3, profile_t *p4, + /*OPTIONAL*/distance_matrix_t *dmat, + int nPos, + int nBootstrap, + int *col); + +/* Returns SH-like support given resampling spec. (in col) and site likelihods + for the three quartets +*/ +double SHSupport(int nPos, int nBoostrap, int *col, double loglk[3], double *site_likelihoods[3]); + +profile_t *SeqToProfile(/*IN/OUT*/NJ_t *NJ, + char *seq, int nPos, + /*OPTIONAL*/char *constraintSeqs, int nConstraints, + int iNode, + unsigned long counts[256]); + +/* ProfileDist and SeqDist only set the dist and weight fields + If using an outprofile, use the second argument of ProfileDist + for better performance. + + These produce uncorrected distances. +*/ +void ProfileDist(profile_t *profile1, profile_t *profile2, int nPos, + /*OPTIONAL*/distance_matrix_t *distance_matrix, + /*OUT*/besthit_t *hit); +void SeqDist(unsigned char *codes1, unsigned char *codes2, int nPos, + /*OPTIONAL*/distance_matrix_t *distance_matrix, + /*OUT*/besthit_t *hit); + +/* Computes all pairs of profile distances, applies pseudocounts + if pseudoWeight > 0, and applies log-correction if logdist is true. + The lower index is compared to the higher index, e.g. for profiles + A,B,C,D the comparison will be as in quartet_pair_t +*/ +typedef enum {qAB,qAC,qAD,qBC,qBD,qCD} quartet_pair_t; +void CorrectedPairDistances(profile_t **profiles, int nProfiles, + /*OPTIONAL*/distance_matrix_t *distance_matrix, + int nPos, + /*OUT*/double *distances); + +/* output is indexed by nni_t + To ensure good behavior while evaluating a subtree-prune-regraft move as a series + of nearest-neighbor interchanges, this uses a distance-ish model of constraints, + as given by PairConstraintDistance(), rather than + counting the number of violated splits (which is what FastTree does + during neighbor-joining). 
+ Thus, penalty values may well be >0 even if no constraints are violated, but the + relative scores for the three NNIs will be correct. + */ +void QuartetConstraintPenalties(profile_t *profiles[4], int nConstraints, /*OUT*/double d[3]); + +double PairConstraintDistance(int nOn1, int nOff1, int nOn2, int nOff2); + +/* the split is consistent with the constraint if any of the profiles have no data + or if three of the profiles have the same uniform value (all on or all off) + or if AB|CD = 00|11 or 11|00 (all uniform) + */ +bool SplitViolatesConstraint(profile_t *profiles[4], int iConstraint); + +/* If false, no values were set because this constraint was not relevant. + output is for the 3 splits +*/ +bool QuartetConstraintPenaltiesPiece(profile_t *profiles[4], int iConstraint, /*OUT*/double penalty[3]); + +/* Apply Jukes-Cantor or scoredist-like log(1-d) transform + to correct the distance for multiple substitutions. +*/ +double LogCorrect(double distance); + +/* AverageProfile is used to do a weighted combination of nodes + when doing a join. If weight is negative, then the value is ignored and the profiles + are averaged. The weight is *not* adjusted for the gap content of the nodes. + Also, the weight does not affect the representation of the constraints +*/ +profile_t *AverageProfile(profile_t *profile1, profile_t *profile2, + int nPos, int nConstraints, + distance_matrix_t *distance_matrix, + double weight1); + +/* PosteriorProfile() is like AverageProfile() but it computes posterior probabilities + rather than an average +*/ +profile_t *PosteriorProfile(profile_t *profile1, profile_t *profile2, + double len1, double len2, + /*OPTIONAL*/transition_matrix_t *transmat, + rates_t *rates, + int nPos, int nConstraints); + +/* Set a node's profile from its children. 
+ Deletes the previous profile if it exists + Use -1.0 for a balanced join + Fails unless the node has two children (e.g., no leaves or root) +*/ +void SetProfile(/*IN/OUT*/NJ_t *NJ, int node, double weight1); + +/* OutProfile does an unweighted combination of nodes to create the + out-profile. It always sets code to NOCODE so that UpdateOutProfile + can work. +*/ +profile_t *OutProfile(profile_t **profiles, int nProfiles, + int nPos, int nConstraints, + distance_matrix_t *distance_matrix); + +void UpdateOutProfile(/*UPDATE*/profile_t *out, profile_t *old1, profile_t *old2, + profile_t *new, int nActiveOld, + int nPos, int nConstraints, + distance_matrix_t *distance_matrix); + +profile_t *NewProfile(int nPos, int nConstraints); /* returned has no vectors */ +profile_t *FreeProfile(profile_t *profile, int nPos, int nConstraints); /* returns NULL */ + +void AllocRateCategories(/*IN/OUT*/rates_t *rates, int nRateCategories, int nPos); + +/* f1 can be NULL if code1 != NOCODE, and similarly for f2 + Or, if (say) weight1 was 0, then can have code1==NOCODE *and* f1==NULL + In that case, returns an arbitrary large number. 
+*/ +double ProfileDistPiece(unsigned int code1, unsigned int code2, + numeric_t *f1, numeric_t *f2, + /*OPTIONAL*/distance_matrix_t *dmat, + /*OPTIONAL*/numeric_t *codeDist2); + +/* Adds (or subtracts, if weight is negative) fIn/codeIn from fOut + fOut is assumed to exist (as from an outprofile) + do not call unless weight of input profile > 0 + */ +void AddToFreq(/*IN/OUT*/numeric_t *fOut, double weight, + unsigned int codeIn, /*OPTIONAL*/numeric_t *fIn, + /*OPTIONAL*/distance_matrix_t *dmat); + +/* Divide the vector (of length nCodes) by a constant + so that the total (unrotated) frequency is 1.0 */ +void NormalizeFreq(/*IN/OUT*/numeric_t *freq, distance_matrix_t *distance_matrix); + +/* Allocate, if necessary, and recompute the codeDist*/ +void SetCodeDist(/*IN/OUT*/profile_t *profile, int nPos, distance_matrix_t *dmat); + +/* The allhits list contains the distances of the node to all other active nodes + This is useful for the "reset" improvement to the visible set + Note that the following routines do not handle the tophits heuristic + and assume that out-distances are up to date. +*/ +void SetBestHit(int node, NJ_t *NJ, int nActive, + /*OUT*/besthit_t *bestjoin, + /*OUT OPTIONAL*/besthit_t *allhits); +void ExhaustiveNJSearch(NJ_t *NJ, int nActive, /*OUT*/besthit_t *bestjoin); + +/* Searches the visible set */ +void FastNJSearch(NJ_t *NJ, int nActive, /*UPDATE*/besthit_t *visible, /*OUT*/besthit_t *bestjoin); + +/* Subroutines for handling the tophits heuristic */ + +top_hits_t *InitTopHits(NJ_t *NJ, int m); +top_hits_t *FreeTopHits(top_hits_t *tophits); /* returns NULL */ + +/* Before we do any joins -- sets tophits and visible + NJ may be modified by setting out-distances + */ +void SetAllLeafTopHits(/*IN/UPDATE*/NJ_t *NJ, /*IN/OUT*/top_hits_t *tophits); + +/* Find the best join to do. 
*/ +void TopHitNJSearch(/*IN/UPDATE*/NJ_t *NJ, + int nActive, + /*IN/OUT*/top_hits_t *tophits, + /*OUT*/besthit_t *bestjoin); + +/* Returns the best hit within top hits + NJ may be modified because it updates out-distances if they are too stale + Does *not* update visible set +*/ +void GetBestFromTopHits(int iNode, /*IN/UPDATE*/NJ_t *NJ, int nActive, + /*IN*/top_hits_t *tophits, + /*OUT*/besthit_t *bestjoin); + +/* visible set is modifiable so that we can reset it more globally when we do + a "refresh", but we also set the visible set for newnode and do any + "reset" updates too. And, we update many outdistances. + */ +void TopHitJoin(int newnode, + /*IN/UPDATE*/NJ_t *NJ, int nActive, + /*IN/OUT*/top_hits_t *tophits); + +/* Sort the input besthits by criterion + and save the best nOut hits as a new array in top_hits_lists + Does not update criterion or out-distances + Ignores (silently removes) hit to self + Saved list may be shorter than requested if there are insufficient entries +*/ +void SortSaveBestHits(int iNode, /*IN/SORT*/besthit_t *besthits, + int nIn, int nOut, + /*IN/OUT*/top_hits_t *tophits); + +/* Given candidate hits from one node, "transfer" them to another node: + Stores them in a new place in the same order + searches up to active nodes if hits involve non-active nodes + If update flag is set, it also recomputes distance and criterion + (and ensures that out-distances are updated); otherwise + it sets dist to -1e20 and criterion to 1e20 + + */ +void TransferBestHits(/*IN/UPDATE*/NJ_t *NJ, int nActive, + int iNode, + /*IN*/besthit_t *oldhits, + int nOldHits, + /*OUT*/besthit_t *newhits, + bool updateDistance); + +/* Create best hit objects from 1 or more hits. Do not update out-distances or set criteria */ +void HitsToBestHits(/*IN*/hit_t *hits, int nHits, int iNode, /*OUT*/besthit_t *newhits); +besthit_t HitToBestHit(int i, hit_t hit); + +/* Given a set of besthit entries, + look for improvements to the visible set of the j entries. 
+ Updates out-distances as it goes. + Also replaces stale nodes with this node, because a join is usually + how this happens (i.e. it does not need to walk up to ancestors). + Note this calls UpdateTopVisible() on any change +*/ +void UpdateVisible(/*IN/UPDATE*/NJ_t *NJ, int nActive, + /*IN*/besthit_t *tophitsNode, + int nTopHits, + /*IN/OUT*/top_hits_t *tophits); + +/* Update the top-visible list to perhaps include this hit (O(sqrt(N)) time) */ +void UpdateTopVisible(/*IN*/NJ_t * NJ, int nActive, + int iNode, /*IN*/hit_t *hit, + /*IN/OUT*/top_hits_t *tophits); + +/* Recompute the top-visible subset of the visible set */ +void ResetTopVisible(/*IN/UPDATE*/NJ_t *NJ, + int nActive, + /*IN/OUT*/top_hits_t *tophits); + +/* Make a shorter list with only unique entries. + Replaces any "dead" hits to nodes that have parents with their active ancestors + and ignores any that become dead. + Updates all criteria. + Combined gets sorted by i & j + The returned list is allocated to nCombined even though only *nUniqueOut entries are filled +*/ +besthit_t *UniqueBestHits(/*IN/UPDATE*/NJ_t *NJ, int nActive, + /*IN/SORT*/besthit_t *combined, int nCombined, + /*OUT*/int *nUniqueOut); + +nni_t ChooseNNI(profile_t *profiles[4], + /*OPTIONAL*/distance_matrix_t *dmat, + int nPos, int nConstraints, + /*OUT*/double criteria[3]); /* The three internal branch lengths or log likelihoods*/ + +/* length[] is ordered as described by quartet_length_t, but after we do the swap + of B with C (to give AC|BD) or B with D (to get AD|BC), if that is the returned choice + bFast means do not consider NNIs if AB|CD is noticeably better than the star topology + (as implemented by MLQuartetOptimize). 
+ If there are constraints, then the constraint penalty is included in criteria[] +*/ +nni_t MLQuartetNNI(profile_t *profiles[4], + /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, + int nPos, int nConstraints, + /*OUT*/double criteria[3], /* The three potential quartet log-likelihoods */ + /*IN/OUT*/numeric_t length[5], + bool bFast); + +void OptimizeAllBranchLengths(/*IN/OUT*/NJ_t *NJ); +double TreeLogLk(/*IN*/NJ_t *NJ, /*OPTIONAL OUT*/double *site_loglk); +double MLQuartetLogLk(profile_t *pA, profile_t *pB, profile_t *pC, profile_t *pD, + int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, + /*IN*/double branch_lengths[5], + /*OPTIONAL OUT*/double *site_likelihoods); + +/* Given a topology and branch lengths, estimate rates & recompute profiles */ +void SetMLRates(/*IN/OUT*/NJ_t *NJ, int nRateCategories); + +/* Returns a set of nRateCategories potential rates; the caller must free it */ +numeric_t *MLSiteRates(int nRateCategories); + +/* returns site_loglk so that + site_loglk[nPos*iRate + j] is the log likelihood of site j with rate iRate + The caller must free it. +*/ +double *MLSiteLikelihoodsByRate(/*IN*/NJ_t *NJ, /*IN*/numeric_t *rates, int nRateCategories); + +typedef struct { + double mult; /* multiplier for the rates / divisor for the tree-length */ + double alpha; + int nPos; + int nRateCats; + numeric_t *rates; + double *site_loglk; +} siteratelk_t; + +double GammaLogLk(/*IN*/siteratelk_t *s, /*OPTIONAL OUT*/double *gamma_loglk_sites); + +/* Input site_loglk must be for each rate. Note that FastTree does not reoptimize + the branch lengths under the Gamma model -- it optimizes the overall scale. + Reports the gamma log likelihhod (and logs site likelihoods if fpLog is set), + and reports the rescaling value. 
+*/ +double RescaleGammaLogLk(int nPos, int nRateCats, + /*IN*/numeric_t *rates, /*IN*/double *site_loglk, + /*OPTIONAL*/FILE *fpLog); + +/* P(value<=x) for the gamma distribution with shape parameter alpha and scale 1/alpha */ +double PGamma(double x, double alpha); + +/* Given a topology and branch lengths, optimize GTR rates and quickly reoptimize branch lengths + If gtrfreq is NULL, then empirical frequencies are used +*/ +void SetMLGtr(/*IN/OUT*/NJ_t *NJ, /*OPTIONAL IN*/double *gtrfreq, /*OPTIONAL WRITE*/FILE *fpLog); + +/* P(A & B | len) = P(B | A, len) * P(A) + If site_likelihoods is present, multiplies those values by the site likelihood at each point + (Note it does not handle underflow) + */ +double PairLogLk(/*IN*/profile_t *p1, /*IN*/profile_t *p2, double length, + int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, + /*OPTIONAL IN/OUT*/double *site_likelihoods); + +/* Branch lengths for 4-taxon tree ((A,B),C,D); I means internal */ +typedef enum {LEN_A,LEN_B,LEN_C,LEN_D,LEN_I} quartet_length_t; + +typedef struct { + int nPos; + transition_matrix_t *transmat; + rates_t *rates; + int nEval; /* number of likelihood evaluations */ + /* The pair to optimize */ + profile_t *pair1; + profile_t *pair2; +} quartet_opt_t; + +double PairNegLogLk(double x, void *data); /* data must be a quartet_opt_t */ + +typedef struct { + NJ_t *NJ; + double freq[4]; + double rates[6]; + int iRate; /* which rate to set x from */ + FILE *fpLog; /* OPTIONAL WRITE */ +} gtr_opt_t; + +/* Returns -log_likelihood for the tree with the given rates + data must be a gtr_opt_t and x is used to set rate iRate + Does not recompute profiles -- assumes that the caller will +*/ +double GTRNegLogLk(double x, void *data); + +/* Returns the resulting log likelihood. Optionally returns whether other + topologies should be abandoned, based on the difference between AB|CD and + the "star topology" (AB|CD with a branch length of MLMinBranchLength) exceeding + closeLogLkLimit. 
+ If bStarTest is passed in, it only optimized the internal branch if + the star test is true. Otherwise, it optimized all 5 branch lengths + in turn. + */ +double MLQuartetOptimize(profile_t *pA, profile_t *pB, profile_t *pC, profile_t *pD, + int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, + /*IN/OUT*/double branch_lengths[5], + /*OPTIONAL OUT*/bool *pStarTest, + /*OPTIONAL OUT*/double *site_likelihoods); + +/* Returns the resulting log likelihood */ +double MLPairOptimize(profile_t *pA, profile_t *pB, + int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, + /*IN/OUT*/double *branch_length); + +/* Returns the number of steps considered, with the actual steps in steps[] + Modifies the tree by this chain of NNIs +*/ +int FindSPRSteps(/*IN/OUT*/NJ_t *NJ, + int node, + int parent, /* sibling or parent of node to NNI to start the chain */ + /*IN/OUT*/profile_t **upProfiles, + /*OUT*/spr_step_t *steps, + int maxSteps, + bool bFirstAC); + +/* Undo a single NNI */ +void UnwindSPRStep(/*IN/OUT*/NJ_t *NJ, + /*IN*/spr_step_t *step, + /*IN/OUT*/profile_t **upProfiles); + + +/* Update the profile of node and its ancestor, and delete nearby out-profiles */ +void UpdateForNNI(/*IN/OUT*/NJ_t *NJ, int node, /*IN/OUT*/profile_t **upProfiles, bool useML); + +/* Sets NJ->parent[newchild] and replaces oldchild with newchild + in the list of children of parent +*/ +void ReplaceChild(/*IN/OUT*/NJ_t *NJ, int parent, int oldchild, int newchild); + +int CompareHitsByCriterion(const void *c1, const void *c2); +int CompareHitsByIJ(const void *c1, const void *c2); + +int NGaps(NJ_t *NJ, int node); /* only handles leaf sequences */ + +/* node is the parent of AB, sibling of C + node cannot be root or a leaf + If node is the child of root, then D is the other sibling of node, + and the 4th profile is D's profile. 
+ Otherwise, D is the parent of node, and we use its upprofile + Call this with profiles=NULL to get the nodes, without fetching or + computing profiles +*/ +void SetupABCD(NJ_t *NJ, int node, + /* the 4 profiles for ABCD; the last one is an upprofile */ + /*OPTIONAL OUT*/profile_t *profiles[4], + /*OPTIONAL IN/OUT*/profile_t **upProfiles, + /*OUT*/int nodeABCD[4], + bool useML); + +int Sibling(NJ_t *NJ, int node); /* At root, no unique sibling so returns -1 */ +void RootSiblings(NJ_t *NJ, int node, /*OUT*/int sibs[2]); + +/* JC probability of nucleotide not changing, for each rate category */ +double *PSameVector(double length, rates_t *rates); + +/* JC probability of nucleotide not changing, for each rate category */ +double *PDiffVector(double *pSame, rates_t *rates); + +/* expeigen[iRate*nCodes + j] = exp(length * rate iRate * eigenvalue j) */ +numeric_t *ExpEigenRates(double length, transition_matrix_t *transmat, rates_t *rates); + +/* Print a progress report if more than 0.1 second has gone by since the progress report */ +/* Format should include 0-4 %d references and no newlines */ +void ProgressReport(char *format, int iArg1, int iArg2, int iArg3, int iArg4); +void LogTree(char *format, int round, /*OPTIONAL WRITE*/FILE *fp, NJ_t *NJ, char **names, uniquify_t *unique, bool bQuote); +void LogMLRates(/*OPTIONAL WRITE*/FILE *fpLog, NJ_t *NJ); + +void *mymalloc(size_t sz); /* Prints "Out of memory" and exits on failure */ +void *myfree(void *, size_t sz); /* Always returns NULL */ + +/* One-dimensional minimization using brent's function, with + a fractional and an absolute tolerance */ +double onedimenmin(double xmin, double xguess, double xmax, double (*f)(double,void*), void *data, + double ftol, double atol, + /*OUT*/double *fx, /*OUT*/double *f2x); + +double brent(double ax, double bx, double cx, double (*f)(double, void *), void *data, + double ftol, double atol, + double *foptx, double *f2optx, double fax, double fbx, double fcx); + +/* Vector 
operations, either using SSE3 or not + Code assumes that vectors are a multiple of 4 in size +*/ +void vector_multiply(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, int n, /*OUT*/numeric_t *fOut); +numeric_t vector_multiply_sum(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, int n); +void vector_add_mult(/*IN/OUT*/numeric_t *f, /*IN*/numeric_t *add, numeric_t weight, int n); + +/* multiply the transpose of a matrix by a vector */ +void matrixt_by_vector4(/*IN*/numeric_t mat[4][MAXCODES], /*IN*/numeric_t vec[4], /*OUT*/numeric_t out[4]); + +/* sum(f1*fBy)*sum(f2*fBy) */ +numeric_t vector_dot_product_rot(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, /*IN*/numeric_t* fBy, int n); + +/* sum(f1*f2*f3) */ +numeric_t vector_multiply3_sum(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, /*IN*/numeric_t* f3, int n); + +numeric_t vector_sum(/*IN*/numeric_t *f1, int n); +void vector_multiply_by(/*IN/OUT*/numeric_t *f, /*IN*/numeric_t fBy, int n); + +double clockDiff(/*IN*/struct timeval *clock_start); +int timeval_subtract (/*OUT*/struct timeval *result, /*IN*/struct timeval *x, /*IN*/struct timeval *y); + +char *OpenMPString(void); + +void ran_start(long seed); +double knuth_rand(); /* Random number between 0 and 1 */ +void tred2 (double *a, const int n, const int np, double *d, double *e); +double pythag(double a, double b); +void tqli(double *d, double *e, int n, int np, double *z); + +/* Like mymalloc; duplicates the input (returns NULL if given NULL) */ +void *mymemdup(void *data, size_t sz); +void *myrealloc(void *data, size_t szOld, size_t szNew, bool bCopy); + +double pnorm(double z); /* Probability(value <=z) */ + +/* Hashtable functions */ +typedef struct +{ + char *string; + int nCount; /* number of times this entry was seen */ + int first; /* index of first entry with this value */ +} hashbucket_t; + +typedef struct { + int nBuckets; + /* hashvalue -> bucket. 
Or look in bucket + 1, +2, etc., till you hit a NULL string */ + hashbucket_t *buckets; +} hashstrings_t; +typedef int hashiterator_t; + +hashstrings_t *MakeHashtable(char **strings, int nStrings); +hashstrings_t *FreeHashtable(hashstrings_t* hash); /*returns NULL*/ +hashiterator_t FindMatch(hashstrings_t *hash, char *string); + +/* Return NULL if we have run out of values */ +char *GetHashString(hashstrings_t *hash, hashiterator_t hi); +int HashCount(hashstrings_t *hash, hashiterator_t hi); +int HashFirst(hashstrings_t *hash, hashiterator_t hi); + +void PrintNJ(/*WRITE*/FILE *, NJ_t *NJ, char **names, uniquify_t *unique, bool bShowSupport, bool bQuoteNames); + +/* Print topology using node indices as node names */ +void PrintNJInternal(/*WRITE*/FILE *, NJ_t *NJ, bool useLen); + +uniquify_t *UniquifyAln(/*IN*/alignment_t *aln); +uniquify_t *FreeUniquify(uniquify_t *); /* returns NULL */ + +/* Convert a constraint alignment to a list of sequences. The returned array is indexed + by iUnique and points to values in the input alignment +*/ +char **AlnToConstraints(alignment_t *constraints, uniquify_t *unique, hashstrings_t *hashnames); + +/* ReadTree ignores non-unique leaves after the first instance. + At the end, it prunes the tree to ignore empty children and it + unroots the tree if necessary. 
+*/ +void ReadTree(/*IN/OUT*/NJ_t *NJ, + /*IN*/uniquify_t *unique, + /*IN*/hashstrings_t *hashnames, + /*READ*/FILE *fpInTree); +char *ReadTreeToken(/*READ*/FILE *fp); /* returns a static array, or NULL on EOF */ +void ReadTreeAddChild(int parent, int child, /*IN/OUT*/int *parents, /*IN/OUT*/children_t *children); +/* Do not add the leaf if we already set this unique-set to another parent */ +void ReadTreeMaybeAddLeaf(int parent, char *name, + hashstrings_t *hashnames, uniquify_t *unique, + /*IN/OUT*/int *parents, /*IN/OUT*/children_t *children); +void ReadTreeRemove(/*IN/OUT*/int *parents, /*IN/OUT*/children_t *children, int node); + +/* Routines to support tree traversal and prevent visiting a node >1 time + (esp. if topology changes). +*/ +typedef bool *traversal_t; +traversal_t InitTraversal(NJ_t*); +void SkipTraversalInto(int node, /*IN/OUT*/traversal_t traversal); +traversal_t FreeTraversal(traversal_t, NJ_t*); /*returns NULL*/ + +/* returns new node, or -1 if nothing left to do. Use root for the first call. + Will return every node and then root. + Uses postorder tree traversal (depth-first search going down to leaves first) + Keeps track of which nodes are visited, so even after an NNI that swaps a + visited child with an unvisited uncle, the next call will visit the + was-uncle-now-child. (However, after SPR moves, there is no such guarantee.) + + If pUp is not NULL, then, if going "back up" through a previously visited node + (presumably due to an NNI), then it will return the node another time, + with *pUp = true. 
+*/ +int TraversePostorder(int lastnode, NJ_t *NJ, /*IN/OUT*/traversal_t, + /*OUT OPTIONAL*/bool *pUp); + +/* Routines to support storing up-profiles during tree traversal + Eventually these should be smart enough to do weighted joins and + to minimize memory usage +*/ +profile_t **UpProfiles(NJ_t *NJ); +profile_t *GetUpProfile(/*IN/OUT*/profile_t **upProfiles, NJ_t *NJ, int node, bool useML); +profile_t *DeleteUpProfile(/*IN/OUT*/profile_t **upProfiles, NJ_t *NJ, int node); /* returns NULL */ +profile_t **FreeUpProfiles(profile_t **upProfiles, NJ_t *NJ); /* returns NULL */ + +/* Recomputes the profile for a node, presumably to reflect topology changes + If bionj is set, does a weighted join -- which requires using upProfiles + If useML is set, computes the posterior probability instead of averaging + */ +void RecomputeProfile(/*IN/OUT*/NJ_t *NJ, /*IN/OUT*/profile_t **upProfiles, int node, bool useML); + +/* Recompute profiles going up from the leaves, using the provided distance matrix + and unweighted joins +*/ +void RecomputeProfiles(/*IN/OUT*/NJ_t *NJ, /*OPTIONAL*/distance_matrix_t *dmat); + +void RecomputeMLProfiles(/*IN/OUT*/NJ_t *NJ); + +/* If bionj is set, computes the weight to be given to A when computing the + profile for the ancestor of A and B. C and D are the other profiles in the quartet + If bionj is not set, returns -1 (which means unweighted in AverageProfile). 
+ (A and B are the first two profiles in the array) +*/ +double QuartetWeight(profile_t *profiles[4], distance_matrix_t *dmat, int nPos); + +/* Returns a list of nodes, starting with node and ending with root */ +int *PathToRoot(NJ_t *NJ, int node, /*OUT*/int *depth); +int *FreePath(int *path, NJ_t *NJ); /* returns NULL */ + +/* The default amino acid distance matrix, derived from the BLOSUM45 similarity matrix */ +distance_matrix_t matrixBLOSUM45; + +/* The default amino acid transition matrix (Jones Taylor Thorton 1992) */ +double matrixJTT92[MAXCODES][MAXCODES]; +double statJTT92[MAXCODES]; + +/* The Le-Gascuel 2008 amino acid transition matrix */ +double matrixLG08[MAXCODES][MAXCODES]; +double statLG08[MAXCODES]; + +/* The WAG amino acid transition matrix (Whelan-And-Goldman 2001) */ +double matrixWAG01[MAXCODES][MAXCODES]; +double statWAG01[MAXCODES]; + + +int main(int argc, char **argv) { + int nAlign = 1; /* number of alignments to read */ + int iArg; + char *matrixPrefix = NULL; + char *transitionFile = NULL; + distance_matrix_t *distance_matrix = NULL; + bool make_matrix = false; + char *constraintsFile = NULL; + char *intreeFile = NULL; + bool intree1 = false; /* the same starting tree each round */ + int nni = -1; /* number of rounds of NNI, defaults to 4*log2(n) */ + int spr = 2; /* number of rounds of SPR */ + int maxSPRLength = 10; /* maximum distance to move a node */ + int MLnni = -1; /* number of rounds of ML NNI, defaults to 2*log2(n) */ + bool MLlen = false; /* optimize branch lengths; no topology changes */ + int nBootstrap = 1000; /* If set, number of replicates of local bootstrap to do */ + int nRateCats = nDefaultRateCats; + char *logfile = NULL; + bool bUseGtr = false; + bool bUseLg = false; + bool bUseWag = false; + bool bUseGtrRates = false; + double gtrrates[6] = {1,1,1,1,1,1}; + bool bUseGtrFreq = false; + double gtrfreq[4] = {0.25,0.25,0.25,0.25}; + bool bQuote = false; + FILE *fpOut = stdout; + + if (isatty(STDIN_FILENO) && argc == 1) 
{ + fprintf(stderr,"Usage for FastTree version %s %s%s:\n%s", + FT_VERSION, SSE_STRING, OpenMPString(), usage); +#if (defined _WIN32 || defined WIN32 || defined WIN64 || defined _WIN64) + fprintf(stderr, "Windows users: Please remember to run this inside a command shell\n"); + fprintf(stderr,"Hit return to continue\n"); + fgetc(stdin); +#endif + exit(0); + } + for (iArg = 1; iArg < argc; iArg++) { + if (strcmp(argv[iArg],"-makematrix") == 0) { + make_matrix = true; + } else if (strcmp(argv[iArg],"-logdist") == 0) { + fprintf(stderr, "Warning: logdist is now on by default and obsolete\n"); + } else if (strcmp(argv[iArg],"-rawdist") == 0) { + logdist = false; + } else if (strcmp(argv[iArg],"-verbose") == 0 && iArg < argc-1) { + verbose = atoi(argv[++iArg]); + } else if (strcmp(argv[iArg],"-quiet") == 0) { + verbose = 0; + showProgress = 0; + } else if (strcmp(argv[iArg],"-nopr") == 0) { + showProgress = 0; + } else if (strcmp(argv[iArg],"-slow") == 0) { + slow = 1; + } else if (strcmp(argv[iArg],"-fastest") == 0) { + fastest = 1; + tophitsRefresh = 0.5; + useTopHits2nd = true; + } else if (strcmp(argv[iArg],"-2nd") == 0) { + useTopHits2nd = true; + } else if (strcmp(argv[iArg],"-no2nd") == 0) { + useTopHits2nd = false; + } else if (strcmp(argv[iArg],"-slownni") == 0) { + fastNNI = false; + } else if (strcmp(argv[iArg], "-matrix") == 0 && iArg < argc-1) { + iArg++; + matrixPrefix = argv[iArg]; + } else if (strcmp(argv[iArg], "-nomatrix") == 0) { + useMatrix = false; + } else if (strcmp(argv[iArg], "-n") == 0 && iArg < argc-1) { + iArg++; + nAlign = atoi(argv[iArg]); + if (nAlign < 1) { + fprintf(stderr, "-n argument for #input alignments must be > 0 not %s\n", argv[iArg]); + exit(1); + } + } else if (strcmp(argv[iArg], "-quote") == 0) { + bQuote = true; + } else if (strcmp(argv[iArg], "-nt") == 0) { + nCodes = 4; + } else if (strcmp(argv[iArg], "-intree") == 0 && iArg < argc-1) { + iArg++; + intreeFile = argv[iArg]; + } else if (strcmp(argv[iArg], "-intree1") == 0 && 
iArg < argc-1) { + iArg++; + intreeFile = argv[iArg]; + intree1 = true; + } else if (strcmp(argv[iArg], "-nj") == 0) { + bionj = 0; + } else if (strcmp(argv[iArg], "-bionj") == 0) { + bionj = 1; + } else if (strcmp(argv[iArg], "-boot") == 0 && iArg < argc-1) { + iArg++; + nBootstrap = atoi(argv[iArg]); + } else if (strcmp(argv[iArg], "-noboot") == 0 || strcmp(argv[iArg], "-nosupport") == 0) { + nBootstrap = 0; + } else if (strcmp(argv[iArg], "-seed") == 0 && iArg < argc-1) { + iArg++; + long seed = atol(argv[iArg]); + ran_start(seed); + } else if (strcmp(argv[iArg],"-top") == 0) { + if(tophitsMult < 0.01) + tophitsMult = 1.0; + } else if (strcmp(argv[iArg],"-notop") == 0) { + tophitsMult = 0.0; + } else if (strcmp(argv[iArg], "-topm") == 0 && iArg < argc-1) { + iArg++; + tophitsMult = atof(argv[iArg]); + } else if (strcmp(argv[iArg], "-close") == 0 && iArg < argc-1) { + iArg++; + tophitsClose = atof(argv[iArg]); + if (tophitsMult <= 0) { + fprintf(stderr, "Cannot use -close unless -top is set above 0\n"); + exit(1); + } + if (tophitsClose <= 0 || tophitsClose >= 1) { + fprintf(stderr, "-close argument must be between 0 and 1\n"); + exit(1); + } + } else if (strcmp(argv[iArg], "-refresh") == 0 && iArg < argc-1) { + iArg++; + tophitsRefresh = atof(argv[iArg]); + if (tophitsMult <= 0) { + fprintf(stderr, "Cannot use -refresh unless -top is set above 0\n"); + exit(1); + } + if (tophitsRefresh <= 0 || tophitsRefresh >= 1) { + fprintf(stderr, "-refresh argument must be between 0 and 1\n"); + exit(1); + } + } else if (strcmp(argv[iArg],"-nni") == 0 && iArg < argc-1) { + iArg++; + nni = atoi(argv[iArg]); + if (nni == 0) + spr = 0; + } else if (strcmp(argv[iArg],"-spr") == 0 && iArg < argc-1) { + iArg++; + spr = atoi(argv[iArg]); + } else if (strcmp(argv[iArg],"-sprlength") == 0 && iArg < argc-1) { + iArg++; + maxSPRLength = atoi(argv[iArg]); + } else if (strcmp(argv[iArg],"-mlnni") == 0 && iArg < argc-1) { + iArg++; + MLnni = atoi(argv[iArg]); + } else if 
(strcmp(argv[iArg],"-noml") == 0) { + MLnni = 0; + } else if (strcmp(argv[iArg],"-mllen") == 0) { + MLnni = 0; + MLlen = true; + } else if (strcmp(argv[iArg],"-nome") == 0) { + spr = 0; + nni = 0; + } else if (strcmp(argv[iArg],"-help") == 0) { + fprintf(stderr,"FastTree %s %s%s:\n%s", FT_VERSION, SSE_STRING, OpenMPString(), usage); + exit(0); + } else if (strcmp(argv[iArg],"-expert") == 0) { + fprintf(stderr, "Detailed usage for FastTree %s %s%s:\n%s", + FT_VERSION, SSE_STRING, OpenMPString(), expertUsage); + exit(0); + } else if (strcmp(argv[iArg],"-pseudo") == 0) { + if (iArg < argc-1 && isdigit(argv[iArg+1][0])) { + iArg++; + pseudoWeight = atof(argv[iArg]); + if (pseudoWeight < 0.0) { + fprintf(stderr,"Illegal argument to -pseudo: %s\n", argv[iArg]); + exit(1); + } + } else { + pseudoWeight = 1.0; + } + } else if (strcmp(argv[iArg],"-constraints") == 0 && iArg < argc-1) { + iArg++; + constraintsFile = argv[iArg]; + } else if (strcmp(argv[iArg],"-constraintWeight") == 0 && iArg < argc-1) { + iArg++; + constraintWeight = atof(argv[iArg]); + if (constraintWeight <= 0.0) { + fprintf(stderr, "Illegal argument to -constraintWeight (must be greater than zero): %s\n", argv[iArg]); + exit(1); + } + } else if (strcmp(argv[iArg],"-mlacc") == 0 && iArg < argc-1) { + iArg++; + mlAccuracy = atoi(argv[iArg]); + if (mlAccuracy < 1) { + fprintf(stderr, "Illlegal -mlacc argument: %s\n", argv[iArg]); + exit(1); + } + } else if (strcmp(argv[iArg],"-exactml") == 0 || strcmp(argv[iArg],"-mlexact") == 0) { + fprintf(stderr,"-exactml is not required -- exact posteriors is the default now\n"); + } else if (strcmp(argv[iArg],"-approxml") == 0 || strcmp(argv[iArg],"-mlapprox") == 0) { + exactML = false; + } else if (strcmp(argv[iArg],"-cat") == 0 && iArg < argc-1) { + iArg++; + nRateCats = atoi(argv[iArg]); + if (nRateCats < 1) { + fprintf(stderr, "Illlegal argument to -ncat (must be greater than zero): %s\n", argv[iArg]); + exit(1); + } + } else if (strcmp(argv[iArg],"-nocat") == 0) { 
+ nRateCats = 1; + } else if (strcmp(argv[iArg], "-lg") == 0) { + bUseLg = true; + } else if (strcmp(argv[iArg], "-wag") == 0) { + bUseWag = true; + } else if (strcmp(argv[iArg], "-gtr") == 0) { + bUseGtr = true; + } else if (strcmp(argv[iArg], "-trans") == 0 && iArg < argc-1) { + iArg++; + transitionFile = argv[iArg]; + } else if (strcmp(argv[iArg], "-gtrrates") == 0 && iArg < argc-6) { + bUseGtr = true; + bUseGtrRates = true; + int i; + for (i = 0; i < 6; i++) { + gtrrates[i] = atof(argv[++iArg]); + if (gtrrates[i] < 1e-5) { + fprintf(stderr, "Illegal or too small value of GTR rate: %s\n", argv[iArg]); + exit(1); + } + } + } else if (strcmp(argv[iArg],"-gtrfreq") == 0 && iArg < argc-4) { + bUseGtr = true; + bUseGtrFreq = true; + int i; + double sum = 0; + for (i = 0; i < 4; i++) { + gtrfreq[i] = atof(argv[++iArg]); + sum += gtrfreq[i]; + if (gtrfreq[i] < 1e-5) { + fprintf(stderr, "Illegal or too small value of GTR frequency: %s\n", argv[iArg]); + exit(1); + } + } + if (fabs(1.0-sum) > 0.01) { + fprintf(stderr, "-gtrfreq values do not sum to 1\n"); + exit(1); + } + for (i = 0; i < 4; i++) + gtrfreq[i] /= sum; + } else if (strcmp(argv[iArg],"-log") == 0 && iArg < argc-1) { + iArg++; + logfile = argv[iArg]; + } else if (strcmp(argv[iArg],"-gamma") == 0) { + gammaLogLk = true; + } else if (strcmp(argv[iArg],"-out") == 0 && iArg < argc-1) { + iArg++; + fpOut = fopen(argv[iArg],"w"); + if(fpOut==NULL) { + fprintf(stderr,"Cannot write to %s\n",argv[iArg]); + exit(1); + } + } else if (argv[iArg][0] == '-') { + fprintf(stderr, "Unknown or incorrect use of option %s\n%s", argv[iArg], usage); + exit(1); + } else + break; + } + if(iArg < argc-1) { + fprintf(stderr, "%s", usage); + exit(1); + } + + codesString = nCodes == 20 ? 
codesStringAA : codesStringNT; + if (nCodes == 4 && matrixPrefix == NULL) + useMatrix = false; /* no default nucleotide matrix */ + if (transitionFile && nCodes != 20) { + fprintf(stderr, "The -trans option is only supported for amino acid alignments\n"); + exit(1); + } +#ifndef USE_DOUBLE + if (transitionFile) + fprintf(stderr, + "Warning: custom matrices may create numerical problems for single-precision FastTree.\n" + "You may want to recompile with -DUSE_DOUBLE\n"); +#endif + + char *fileName = iArg == (argc-1) ? argv[argc-1] : NULL; + + if (slow && fastest) { + fprintf(stderr,"Cannot be both slow and fastest\n"); + exit(1); + } + if (slow && tophitsMult > 0) { + tophitsMult = 0.0; + } + + FILE *fpLog = NULL; + if (logfile != NULL) { + fpLog = fopen(logfile, "w"); + if (fpLog == NULL) { + fprintf(stderr, "Cannot write to: %s\n", logfile); + exit(1); + } + fprintf(fpLog, "Command:"); + int i; + for (i=0; i < argc; i++) + fprintf(fpLog, " %s", argv[i]); + fprintf(fpLog,"\n"); + fflush(fpLog); + } + + int i; + FILE *fps[2] = {NULL,NULL}; + int nFPs = 0; + if (verbose) + fps[nFPs++] = stderr; + if (fpLog != NULL) + fps[nFPs++] = fpLog; + + if (!make_matrix) { /* Report settings */ + char tophitString[100] = "no"; + char tophitsCloseStr[100] = "default"; + if(tophitsClose > 0) sprintf(tophitsCloseStr,"%.2f",tophitsClose); + if(tophitsMult>0) sprintf(tophitString,"%.2f*sqrtN close=%s refresh=%.2f", + tophitsMult, tophitsCloseStr, tophitsRefresh); + char supportString[100] = "none"; + if (nBootstrap>0) { + if (MLnni != 0 || MLlen) + sprintf(supportString, "SH-like %d", nBootstrap); + else + sprintf(supportString,"Local boot %d",nBootstrap); + } + char nniString[100] = "(no NNI)"; + if (nni > 0) + sprintf(nniString, "+NNI (%d rounds)", nni); + if (nni == -1) + strcpy(nniString, "+NNI"); + char sprString[100] = "(no SPR)"; + if (spr > 0) + sprintf(sprString, "+SPR (%d rounds range %d)", spr, maxSPRLength); + char mlnniString[100] = "(no ML-NNI)"; + if(MLnni > 0) + 
sprintf(mlnniString, "+ML-NNI (%d rounds)", MLnni); + else if (MLnni == -1) + sprintf(mlnniString, "+ML-NNI"); + else if (MLlen) + sprintf(mlnniString, "+ML branch lengths"); + if ((MLlen || MLnni != 0) && !exactML) + strcat(mlnniString, " approx"); + if (MLnni != 0) + sprintf(mlnniString+strlen(mlnniString), " opt-each=%d",mlAccuracy); + + for (i = 0; i < nFPs; i++) { + FILE *fp = fps[i]; + fprintf(fp,"FastTree Version %s %s%s\nAlignment: %s", + FT_VERSION, SSE_STRING, OpenMPString(), fileName != NULL ? fileName : "standard input"); + if (nAlign>1) + fprintf(fp, " (%d alignments)", nAlign); + fprintf(fp,"\n%s distances: %s Joins: %s Support: %s\n", + nCodes == 20 ? "Amino acid" : "Nucleotide", + matrixPrefix ? matrixPrefix : (useMatrix? "BLOSUM45" + : (nCodes==4 && logdist ? "Jukes-Cantor" : "%different")), + bionj ? "weighted" : "balanced" , + supportString); + if (intreeFile == NULL) + fprintf(fp, "Search: %s%s %s %s %s\nTopHits: %s\n", + slow?"Exhaustive (slow)" : (fastest ? "Fastest" : "Normal"), + useTopHits2nd ? "+2nd" : "", + nniString, sprString, mlnniString, + tophitString); + else + fprintf(fp, "Start at tree from %s %s %s\n", intreeFile, nniString, sprString); + + if (MLnni != 0 || MLlen) { + fprintf(fp, "ML Model: %s,", + (nCodes == 4) ? + (bUseGtr ? "Generalized Time-Reversible" : "Jukes-Cantor") : + (transitionFile ? transitionFile : + (bUseLg ? "Le-Gascuel 2008" : (bUseWag ? 
"Whelan-And-Goldman" : "Jones-Taylor-Thorton")))); + if (nRateCats == 1) + fprintf(fp, " No rate variation across sites"); + else + fprintf(fp, " CAT approximation with %d rate categories", nRateCats); + fprintf(fp, "\n"); + if (nCodes == 4 && bUseGtrRates) + fprintf(fp, "GTR rates(ac ag at cg ct gt) %.4f %.4f %.4f %.4f %.4f %.4f\n", + gtrrates[0],gtrrates[1],gtrrates[2],gtrrates[3],gtrrates[4],gtrrates[5]); + if (nCodes == 4 && bUseGtrFreq) + fprintf(fp, "GTR frequencies(A C G T) %.4f %.4f %.4f %.4f\n", + gtrfreq[0],gtrfreq[1],gtrfreq[2],gtrfreq[3]); + } + if (constraintsFile != NULL) + fprintf(fp, "Constraints: %s Weight: %.3f\n", constraintsFile, constraintWeight); + if (pseudoWeight > 0) + fprintf(fp, "Pseudocount weight for comparing sequences with little overlap: %.3lf\n",pseudoWeight); + fflush(fp); + } + } + if (matrixPrefix != NULL) { + if (!useMatrix) { + fprintf(stderr,"Cannot use both -matrix and -nomatrix arguments!"); + exit(1); + } + distance_matrix = ReadDistanceMatrix(matrixPrefix); + } else if (useMatrix) { /* use default matrix */ + assert(nCodes==20); + distance_matrix = &matrixBLOSUM45; + SetupDistanceMatrix(distance_matrix); + } else { + distance_matrix = NULL; + } + + int iAln; + FILE *fpIn = fileName != NULL ? 
fopen(fileName, "r") : stdin; + if (fpIn == NULL) { + fprintf(stderr, "Cannot read %s\n", fileName); + exit(1); + } + FILE *fpConstraints = NULL; + if (constraintsFile != NULL) { + fpConstraints = fopen(constraintsFile, "r"); + if (fpConstraints == NULL) { + fprintf(stderr, "Cannot read %s\n", constraintsFile); + exit(1); + } + } + + FILE *fpInTree = NULL; + if (intreeFile != NULL) { + fpInTree = fopen(intreeFile,"r"); + if (fpInTree == NULL) { + fprintf(stderr, "Cannot read %s\n", intreeFile); + exit(1); + } + } + + for(iAln = 0; iAln < nAlign; iAln++) { + alignment_t *aln = ReadAlignment(fpIn, bQuote); + if (aln->nSeq < 1) { + fprintf(stderr, "No alignment sequences\n"); + exit(1); + } + if (fpLog) { + fprintf(fpLog, "Read %d sequences, %d positions\n", aln->nSeq, aln->nPos); + fflush(fpLog); + } + + struct timeval clock_start; + gettimeofday(&clock_start,NULL); + ProgressReport("Read alignment",0,0,0,0); + + /* Check that all names in alignment are unique */ + hashstrings_t *hashnames = MakeHashtable(aln->names, aln->nSeq); + int i; + for (i=0; inSeq; i++) { + hashiterator_t hi = FindMatch(hashnames,aln->names[i]); + if (HashCount(hashnames,hi) != 1) { + fprintf(stderr,"Non-unique name '%s' in the alignment\n",aln->names[i]); + exit(1); + } + } + + /* Make a list of unique sequences -- note some lists are bigger than required */ + ProgressReport("Hashed the names",0,0,0,0); + if (make_matrix) { + NJ_t *NJ = InitNJ(aln->seqs, aln->nSeq, aln->nPos, + /*constraintSeqs*/NULL, /*nConstraints*/0, + distance_matrix, /*transmat*/NULL); + printf(" %d\n",aln->nSeq); + int i,j; + for(i = 0; i < NJ->nSeq; i++) { + printf("%s",aln->names[i]); + for (j = 0; j < NJ->nSeq; j++) { + besthit_t hit; + SeqDist(NJ->profiles[i]->codes,NJ->profiles[j]->codes,NJ->nPos,NJ->distance_matrix,/*OUT*/&hit); + if (logdist) + hit.dist = LogCorrect(hit.dist); + /* Make sure -0 prints as 0 */ + printf(" %f", hit.dist <= 0.0 ? 
0.0 : hit.dist); + } + printf("\n"); + } + } else { + /* reset counters*/ + profileOps = 0; + outprofileOps = 0; + seqOps = 0; + profileAvgOps = 0; + nHillBetter = 0; + nCloseUsed = 0; + nClose2Used = 0; + nRefreshTopHits = 0; + nVisibleUpdate = 0; + nNNI = 0; + nML_NNI = 0; + nProfileFreqAlloc = 0; + nProfileFreqAvoid = 0; + szAllAlloc = 0; + mymallocUsed = 0; + maxmallocHeap = 0; + nLkCompute = 0; + nPosteriorCompute = 0; + nAAPosteriorExact = 0; + nAAPosteriorRough = 0; + nStarTests = 0; + + uniquify_t *unique = UniquifyAln(aln); + ProgressReport("Identified unique sequences",0,0,0,0); + + /* read constraints */ + alignment_t *constraints = NULL; + char **uniqConstraints = NULL; + if (constraintsFile != NULL) { + constraints = ReadAlignment(fpConstraints, bQuote); + if (constraints->nSeq < 4) { + fprintf(stderr, "Warning: constraints file with less than 4 sequences ignored:\nalignment #%d in %s\n", + iAln+1, constraintsFile); + constraints = FreeAlignment(constraints); + } else { + uniqConstraints = AlnToConstraints(constraints, unique, hashnames); + ProgressReport("Read the constraints",0,0,0,0); + } + } /* end load constraints */ + + transition_matrix_t *transmat = NULL; + if (nCodes == 20) { + transmat = transitionFile? ReadAATransitionMatrix(transitionFile) : + (bUseLg? CreateTransitionMatrix(matrixLG08,statLG08) : + (bUseWag? CreateTransitionMatrix(matrixWAG01,statWAG01) : + CreateTransitionMatrix(matrixJTT92,statJTT92))); + } else if (nCodes == 4 && bUseGtr && (bUseGtrRates || bUseGtrFreq)) { + transmat = CreateGTR(gtrrates,gtrfreq); + } + NJ_t *NJ = InitNJ(unique->uniqueSeq, unique->nUnique, aln->nPos, + uniqConstraints, + uniqConstraints != NULL ? constraints->nPos : 0, /* nConstraints */ + distance_matrix, + transmat); + if (verbose>2) fprintf(stderr, "read %s seqs %d (%d unique) positions %d nameLast %s seqLast %s\n", + fileName ? 
fileName : "standard input", + aln->nSeq, unique->nUnique, aln->nPos, aln->names[aln->nSeq-1], aln->seqs[aln->nSeq-1]); + FreeAlignmentSeqs(/*IN/OUT*/aln); /*no longer needed*/ + if (fpInTree != NULL) { + if (intree1) + fseek(fpInTree, 0L, SEEK_SET); + ReadTree(/*IN/OUT*/NJ, /*IN*/unique, /*IN*/hashnames, /*READ*/fpInTree); + if (verbose > 2) + fprintf(stderr, "Read tree from %s\n", intreeFile); + if (verbose > 2) + PrintNJ(stderr, NJ, aln->names, unique, /*support*/false, bQuote); + } else { + FastNJ(NJ); + } + LogTree("NJ", 0, fpLog, NJ, aln->names, unique, bQuote); + + /* profile-frequencies for the "up-profiles" in ReliabilityNJ take only diameter(Tree)*L*a + space not N*L*a space, because we can free them as we go. + And up-profile by their nature tend to be complicated. + So save the profile-frequency memory allocation counters now to exclude later results. + */ +#ifdef TRACK_MEMORY + long svProfileFreqAlloc = nProfileFreqAlloc; + long svProfileFreqAvoid = nProfileFreqAvoid; +#endif + int nniToDo = nni == -1 ? (int)(0.5 + 4.0 * log(NJ->nSeq)/log(2)) : nni; + int sprRemaining = spr; + int MLnniToDo = (MLnni != -1) ? 
MLnni : (int)(0.5 + 2.0*log(NJ->nSeq)/log(2)); + if(verbose>0) { + if (fpInTree == NULL) + fprintf(stderr, "Initial topology in %.2f seconds\n", clockDiff(&clock_start)); + if (spr > 0 || nniToDo > 0 || MLnniToDo > 0) + fprintf(stderr,"Refining topology: %d rounds ME-NNIs, %d rounds ME-SPRs, %d rounds ML-NNIs\n", nniToDo, spr, MLnniToDo); + } + + if (nniToDo>0) { + int i; + bool bConverged = false; + nni_stats_t *nni_stats = InitNNIStats(NJ); + for (i=0; i < nniToDo; i++) { + double maxDelta; + if (!bConverged) { + int nChange = NNI(/*IN/OUT*/NJ, i, nniToDo, /*use ml*/false, /*IN/OUT*/nni_stats, /*OUT*/&maxDelta); + LogTree("ME_NNI%d",i+1, fpLog, NJ, aln->names, unique, bQuote); + if (nChange == 0) { + bConverged = true; + if (verbose>1) + fprintf(stderr, "Min_evolution NNIs converged at round %d -- skipping some rounds\n", i+1); + if (fpLog) + fprintf(fpLog, "Min_evolution NNIs converged at round %d -- skipping some rounds\n", i+1); + } + } + + /* Interleave SPRs with NNIs (typically 1/3rd NNI, SPR, 1/3rd NNI, SPR, 1/3rd NNI */ + if (sprRemaining > 0 && (nniToDo/(spr+1) > 0 && ((i+1) % (nniToDo/(spr+1))) == 0)) { + SPR(/*IN/OUT*/NJ, maxSPRLength, spr-sprRemaining, spr); + LogTree("ME_SPR%d",spr-sprRemaining+1, fpLog, NJ, aln->names, unique, bQuote); + sprRemaining--; + /* Restart the NNIs -- set all ages to 0, etc. */ + bConverged = false; + nni_stats = FreeNNIStats(nni_stats, NJ); + nni_stats = InitNNIStats(NJ); + } + } + nni_stats = FreeNNIStats(nni_stats, NJ); + } + while(sprRemaining > 0) { /* do any remaining SPR rounds */ + SPR(/*IN/OUT*/NJ, maxSPRLength, spr-sprRemaining, spr); + LogTree("ME_SPR%d",spr-sprRemaining+1, fpLog, NJ, aln->names, unique, bQuote); + sprRemaining--; + } + + /* In minimum-evolution mode, update branch lengths, even if no NNIs or SPRs, + so that they are log-corrected, do not include penalties from constraints, + and avoid errors due to approximation of out-distances. 
+ If doing maximum-likelihood NNIs, then we'll also use these + to get estimates of starting distances for quartets, etc. + */ + UpdateBranchLengths(/*IN/OUT*/NJ); + LogTree("ME_Lengths",0, fpLog, NJ, aln->names, unique, bQuote); + + double total_len = 0; + int iNode; + for (iNode = 0; iNode < NJ->maxnode; iNode++) + total_len += fabs(NJ->branchlength[iNode]); + + if (verbose>0) { + fprintf(stderr, "Total branch-length %.3f after %.2f sec\n", + total_len, clockDiff(&clock_start)); + fflush(stderr); + } + if (fpLog) { + fprintf(fpLog, "Total branch-length %.3f after %.2f sec\n", + total_len, clockDiff(&clock_start)); + fflush(stderr); + } + +#ifdef TRACK_MEMORY + if (verbose>1) { + struct mallinfo mi = mallinfo(); + fprintf(stderr, "Memory @ end of ME phase: %.2f MB (%.1f byte/pos) useful %.2f expected %.2f\n", + (mi.arena+mi.hblkhd)/1.0e6, (mi.arena+mi.hblkhd)/(double)(NJ->nSeq*(double)NJ->nPos), + mi.uordblks/1.0e6, mymallocUsed/1e6); + } +#endif + + SplitCount_t splitcount = {0,0,0,0,0.0,0.0}; + + if (MLnniToDo > 0 || MLlen) { + bool warn_len = total_len/NJ->maxnode < 0.001 && MLMinBranchLengthTolerance > 1.0/aln->nPos; + bool warn = warn_len || (total_len/NJ->maxnode < 0.001 && aln->nPos >= 10000); + if (warn) + fprintf(stderr, "\nWARNING! This alignment consists of closely-related and very-long sequences.\n"); + if (warn_len) + fprintf(stderr, + "This version of FastTree may not report reasonable branch lengths!\n" +#ifdef USE_DOUBLE + "Consider changing MLMinBranchLengthTolerance.\n" +#else + "Consider recompiling FastTree with -DUSE_DOUBLE.\n" +#endif + "For more information, visit\n" + "http://www.microbesonline.org/fasttree/#BranchLen\n\n"); + if (warn) + fprintf(stderr, "WARNING! 
FastTree (or other standard maximum-likelihood tools)\n" + "may not be appropriate for aligments of very closely-related sequences\n" + "like this one, as FastTree does not account for recombination or gene conversion\n\n"); + + /* Do maximum-likelihood computations */ + /* Convert profiles to use the transition matrix */ + distance_matrix_t *tmatAsDist = TransMatToDistanceMat(/*OPTIONAL*/NJ->transmat); + RecomputeProfiles(NJ, /*OPTIONAL*/tmatAsDist); + tmatAsDist = myfree(tmatAsDist, sizeof(distance_matrix_t)); + double lastloglk = -1e20; + nni_stats_t *nni_stats = InitNNIStats(NJ); + bool resetGtr = nCodes == 4 && bUseGtr && !bUseGtrRates; + + if (MLlen) { + int iRound; + int maxRound = (int)(0.5 + log(NJ->nSeq)/log(2)); + double dLastLogLk = -1e20; + for (iRound = 1; iRound <= maxRound; iRound++) { + int node; + numeric_t *oldlength = (numeric_t*)mymalloc(sizeof(numeric_t)*NJ->maxnodes); + for (node = 0; node < NJ->maxnode; node++) + oldlength[node] = NJ->branchlength[node]; + OptimizeAllBranchLengths(/*IN/OUT*/NJ); + LogTree("ML_Lengths",iRound, fpLog, NJ, aln->names, unique, bQuote); + double dMaxChange = 0; /* biggest change in branch length */ + for (node = 0; node < NJ->maxnode; node++) { + double d = fabs(oldlength[node] - NJ->branchlength[node]); + if (dMaxChange < d) + dMaxChange = d; + } + oldlength = myfree(oldlength, sizeof(numeric_t)*NJ->maxnodes); + double loglk = TreeLogLk(NJ, /*site_likelihoods*/NULL); + bool bConverged = iRound > 1 && (dMaxChange < 0.001 || loglk < (dLastLogLk+treeLogLkDelta)); + if (verbose) + fprintf(stderr, "%d rounds ML lengths: LogLk %s= %.3lf Max-change %.4lf%s Time %.2f\n", + iRound, + exactML || nCodes != 20 ? "" : "~", + loglk, + dMaxChange, + bConverged ? " (converged)" : "", + clockDiff(&clock_start)); + if (fpLog) + fprintf(fpLog, "TreeLogLk\tLength%d\t%.4lf\tMaxChange\t%.4lf\n", + iRound, loglk, dMaxChange); + if (iRound == 1) { + if (resetGtr) + SetMLGtr(/*IN/OUT*/NJ, bUseGtrFreq ? 
gtrfreq : NULL, fpLog); + SetMLRates(/*IN/OUT*/NJ, nRateCats); + LogMLRates(fpLog, NJ); + } + if (bConverged) + break; + } + } + + if (MLnniToDo > 0) { + /* This may help us converge faster, and is fast */ + OptimizeAllBranchLengths(/*IN/OUT*/NJ); + LogTree("ML_Lengths%d",1, fpLog, NJ, aln->names, unique, bQuote); + } + + int iMLnni; + double maxDelta; + bool bConverged = false; + for (iMLnni = 0; iMLnni < MLnniToDo; iMLnni++) { + int changes = NNI(/*IN/OUT*/NJ, iMLnni, MLnniToDo, /*use ml*/true, /*IN/OUT*/nni_stats, /*OUT*/&maxDelta); + LogTree("ML_NNI%d",iMLnni+1, fpLog, NJ, aln->names, unique, bQuote); + double loglk = TreeLogLk(NJ, /*site_likelihoods*/NULL); + bool bConvergedHere = (iMLnni > 0) && ((loglk < lastloglk + treeLogLkDelta) || maxDelta < treeLogLkDelta); + if (verbose) + fprintf(stderr, "ML-NNI round %d: LogLk %s= %.3f NNIs %d max delta %.2f Time %.2f%s\n", + iMLnni+1, + exactML || nCodes != 20 ? "" : "~", + loglk, changes, maxDelta, clockDiff(&clock_start), + bConverged ? " (final)" : ""); + if (fpLog) + fprintf(fpLog, "TreeLogLk\tML_NNI%d\t%.4lf\tMaxChange\t%.4lf\n", iMLnni+1, loglk, maxDelta); + if (bConverged) + break; /* we did our extra round */ + if (bConvergedHere) + bConverged = true; + if (bConverged || iMLnni == MLnniToDo-2) { + /* last round uses high-accuracy seettings -- reset NNI stats to tone down heuristics */ + nni_stats = FreeNNIStats(nni_stats, NJ); + nni_stats = InitNNIStats(NJ); + if (verbose) + fprintf(stderr, "Turning off heuristics for final round of ML NNIs%s\n", + bConvergedHere? " (converged)" : ""); + if (fpLog) + fprintf(fpLog, "Turning off heuristics for final round of ML NNIs%s\n", + bConvergedHere? " (converged)" : ""); + } + lastloglk = loglk; + if (iMLnni == 0 && NJ->rates.nRateCategories == 1) { + if (resetGtr) + SetMLGtr(/*IN/OUT*/NJ, bUseGtrFreq ? 
gtrfreq : NULL, fpLog); + SetMLRates(/*IN/OUT*/NJ, nRateCats); + LogMLRates(fpLog, NJ); + } + } + nni_stats = FreeNNIStats(nni_stats, NJ); + + /* This does not take long and improves the results */ + if (MLnniToDo > 0) { + OptimizeAllBranchLengths(/*IN/OUT*/NJ); + LogTree("ML_Lengths%d",2, fpLog, NJ, aln->names, unique, bQuote); + if (verbose || fpLog) { + double loglk = TreeLogLk(NJ, /*site_likelihoods*/NULL); + if (verbose) + fprintf(stderr, "Optimize all lengths: LogLk %s= %.3f Time %.2f\n", + exactML || nCodes != 20 ? "" : "~", + loglk, + clockDiff(&clock_start)); + if (fpLog) { + fprintf(fpLog, "TreeLogLk\tML_Lengths%d\t%.4f\n", 2, loglk); + fflush(fpLog); + } + } + } + + /* Count bad splits and compute SH-like supports if desired */ + if ((MLnniToDo > 0 && !fastest) || nBootstrap > 0) + TestSplitsML(NJ, /*OUT*/&splitcount, nBootstrap); + + /* Compute gamma-based likelihood? */ + if (gammaLogLk && nRateCats > 1) { + numeric_t *rates = MLSiteRates(nRateCats); + double *site_loglk = MLSiteLikelihoodsByRate(NJ, rates, nRateCats); + double scale = RescaleGammaLogLk(NJ->nPos, nRateCats, rates, /*IN*/site_loglk, /*OPTIONAL*/fpLog); + rates = myfree(rates, sizeof(numeric_t) * nRateCats); + site_loglk = myfree(site_loglk, sizeof(double) * nRateCats * NJ->nPos); + + for (i = 0; i < NJ->maxnodes; i++) + NJ->branchlength[i] *= scale; + } + } else { + /* Minimum evolution supports */ + TestSplitsMinEvo(NJ, /*OUT*/&splitcount); + if (nBootstrap > 0) + ReliabilityNJ(NJ, nBootstrap); + } + + for (i = 0; i < nFPs; i++) { + FILE *fp = fps[i]; + fprintf(fp, "Total time: %.2f seconds Unique: %d/%d Bad splits: %d/%d", + clockDiff(&clock_start), + NJ->nSeq, aln->nSeq, + splitcount.nBadSplits, splitcount.nSplits); + if (splitcount.dWorstDeltaUnconstrained > 0) + fprintf(fp, " Worst %sdelta-%s %.3f", + uniqConstraints != NULL ? "unconstrained " : "", + (MLnniToDo > 0 || MLlen) ? 
"LogLk" : "Len", + splitcount.dWorstDeltaUnconstrained); + fprintf(fp,"\n"); + if (NJ->nSeq > 3 && NJ->nConstraints > 0) { + fprintf(fp, "Violating constraints: %d both bad: %d", + splitcount.nConstraintViolations, splitcount.nBadBoth); + if (splitcount.dWorstDeltaConstrained > 0) + fprintf(fp, " Worst delta-%s due to constraints: %.3f", + (MLnniToDo > 0 || MLlen) ? "LogLk" : "Len", + splitcount.dWorstDeltaConstrained); + fprintf(fp,"\n"); + } + if (verbose > 1 || fp == fpLog) { + double dN2 = NJ->nSeq*(double)NJ->nSeq; + fprintf(fp, "Dist/N**2: by-profile %.3f (out %.3f) by-leaf %.3f avg-prof %.3f\n", + profileOps/dN2, outprofileOps/dN2, seqOps/dN2, profileAvgOps/dN2); + if (nCloseUsed>0 || nClose2Used > 0 || nRefreshTopHits>0) + fprintf(fp, "Top hits: close neighbors %ld/%d 2nd-level %ld refreshes %ld", + nCloseUsed, NJ->nSeq, nClose2Used, nRefreshTopHits); + if(!slow) fprintf(fp, " Hill-climb: %ld Update-best: %ld\n", nHillBetter, nVisibleUpdate); + if (nniToDo > 0 || spr > 0 || MLnniToDo > 0) + fprintf(fp, "NNI: %ld SPR: %ld ML-NNI: %ld\n", nNNI, nSPR, nML_NNI); + if (MLnniToDo > 0) { + fprintf(fp, "Max-lk operations: lk %ld posterior %ld", nLkCompute, nPosteriorCompute); + if (nAAPosteriorExact > 0 || nAAPosteriorRough > 0) + fprintf(fp, " approximate-posteriors %.2f%%", + (100.0*nAAPosteriorRough)/(double)(nAAPosteriorExact+nAAPosteriorRough)); + if (mlAccuracy < 2) + fprintf(fp, " star-only %ld", nStarTests); + fprintf(fp, "\n"); + } + } +#ifdef TRACK_MEMORY + fprintf(fp, "Memory: %.2f MB (%.1f byte/pos) ", + maxmallocHeap/1.0e6, maxmallocHeap/(double)(aln->nSeq*(double)aln->nPos)); + /* Only report numbers from before we do reliability estimates */ + fprintf(fp, "profile-freq-alloc %ld avoided %.2f%%\n", + svProfileFreqAlloc, + svProfileFreqAvoid > 0 ? 
+ 100.0*svProfileFreqAvoid/(double)(svProfileFreqAlloc+svProfileFreqAvoid) + : 0); +#endif + fflush(fp); + } + PrintNJ(fpOut, NJ, aln->names, unique, /*support*/nBootstrap > 0, bQuote); + fflush(fpOut); + if (fpLog) { + fprintf(fpLog,"TreeCompleted\n"); + fflush(fpLog); + } + FreeNJ(NJ); + if (uniqConstraints != NULL) + uniqConstraints = myfree(uniqConstraints, sizeof(char*) * unique->nUnique); + constraints = FreeAlignment(constraints); + unique = FreeUniquify(unique); + } /* end build tree */ + hashnames = FreeHashtable(hashnames); + aln = FreeAlignment(aln); + } /* end loop over alignments */ + if (fpLog != NULL) + fclose(fpLog); + if (fpOut != stdout) fclose(fpOut); + exit(0); +} + +void ProgressReport(char *format, int i1, int i2, int i3, int i4) { + static bool time_set = false; + static struct timeval time_last; + static struct timeval time_begin; + + if (!showProgress) + return; + + static struct timeval time_now; + gettimeofday(&time_now,NULL); + if (!time_set) { + time_begin = time_last = time_now; + time_set = true; + } + static struct timeval elapsed; + timeval_subtract(&elapsed,&time_now,&time_last); + + if (elapsed.tv_sec > 1 || elapsed.tv_usec > 100*1000 || verbose > 1) { + timeval_subtract(&elapsed,&time_now,&time_begin); + fprintf(stderr, "%7i.%2.2i seconds: ", (int)elapsed.tv_sec, (int)(elapsed.tv_usec/10000)); + fprintf(stderr, format, i1, i2, i3, i4); + if (verbose > 1 || !isatty(STDERR_FILENO)) { + fprintf(stderr, "\n"); + } else { + fprintf(stderr, " \r"); + } + fflush(stderr); + time_last = time_now; + } +} + +void LogMLRates(/*OPTIONAL WRITE*/FILE *fpLog, NJ_t *NJ) { + if (fpLog != NULL) { + rates_t *rates = &NJ->rates; + fprintf(fpLog, "NCategories\t%d\nRates",rates->nRateCategories); + assert(rates->nRateCategories > 0); + int iRate; + for (iRate = 0; iRate < rates->nRateCategories; iRate++) + fprintf(fpLog, " %f", rates->rates[iRate]); + fprintf(fpLog,"\nSiteCategories"); + int iPos; + for (iPos = 0; iPos < NJ->nPos; iPos++) { + iRate = 
rates->ratecat[iPos]; + fprintf(fpLog," %d",iRate+1); + } + fprintf(fpLog,"\n"); + fflush(fpLog); + } +} + +void LogTree(char *format, int i, /*OPTIONAL WRITE*/FILE *fpLog, NJ_t *NJ, char **names, uniquify_t *unique, bool bQuote) { + if(fpLog != NULL) { + fprintf(fpLog, format, i); + fprintf(fpLog, "\t"); + PrintNJ(fpLog, NJ, names, unique, /*support*/false, bQuote); + fflush(fpLog); + } +} + +NJ_t *InitNJ(char **sequences, int nSeq, int nPos, + /*OPTIONAL*/char **constraintSeqs, int nConstraints, + /*OPTIONAL*/distance_matrix_t *distance_matrix, + /*OPTIONAL*/transition_matrix_t *transmat) { + int iNode; + + NJ_t *NJ = (NJ_t*)mymalloc(sizeof(NJ_t)); + NJ->root = -1; /* set at end of FastNJ() */ + NJ->maxnode = NJ->nSeq = nSeq; + NJ->nPos = nPos; + NJ->maxnodes = 2*nSeq; + NJ->seqs = sequences; + NJ->distance_matrix = distance_matrix; + NJ->transmat = transmat; + NJ->nConstraints = nConstraints; + NJ->constraintSeqs = constraintSeqs; + + NJ->profiles = (profile_t **)mymalloc(sizeof(profile_t*) * NJ->maxnodes); + + unsigned long counts[256]; + int i; + for (i = 0; i < 256; i++) + counts[i] = 0; + for (iNode = 0; iNode < NJ->nSeq; iNode++) { + NJ->profiles[iNode] = SeqToProfile(NJ, NJ->seqs[iNode], nPos, + constraintSeqs != NULL ? constraintSeqs[iNode] : NULL, + nConstraints, + iNode, + /*IN/OUT*/counts); + } + unsigned long totCount = 0; + for (i = 0; i < 256; i++) + totCount += counts[i]; + + /* warnings about unknown characters */ + for (i = 0; i < 256; i++) { + if (counts[i] == 0 || i == '.' 
|| i == '-') + continue; + unsigned char *codesP; + bool bMatched = false; + for (codesP = codesString; *codesP != '\0'; codesP++) { + if (*codesP == i || tolower(*codesP) == i) { + bMatched = true; + break; + } + } + if (!bMatched) + fprintf(stderr, "Ignored unknown character %c (seen %lu times)\n", i, counts[i]); + } + + + /* warnings about the counts */ + double fACGTUN = (counts['A'] + counts['C'] + counts['G'] + counts['T'] + counts['U'] + counts['N'] + + counts['a'] + counts['c'] + counts['g'] + counts['t'] + counts['u'] + counts['n']) + / (double)(totCount - counts['-'] - counts['.']); + if (nCodes == 4 && fACGTUN < 0.9) + fprintf(stderr, "WARNING! ONLY %.1f%% NUCLEOTIDE CHARACTERS -- IS THIS REALLY A NUCLEOTIDE ALIGNMENT?\n", + 100.0 * fACGTUN); + else if (nCodes == 20 && fACGTUN >= 0.9) + fprintf(stderr, "WARNING! %.1f%% NUCLEOTIDE CHARACTERS -- IS THIS REALLY A PROTEIN ALIGNMENT?\n", + 100.0 * fACGTUN); + + if(verbose>10) fprintf(stderr,"Made sequence profiles\n"); + for (iNode = NJ->nSeq; iNode < NJ->maxnodes; iNode++) + NJ->profiles[iNode] = NULL; /* not yet exists */ + + NJ->outprofile = OutProfile(NJ->profiles, NJ->nSeq, + NJ->nPos, NJ->nConstraints, + NJ->distance_matrix); + if(verbose>10) fprintf(stderr,"Made out-profile\n"); + + NJ->totdiam = 0.0; + + NJ->diameter = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); + for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->diameter[iNode] = 0; + + NJ->varDiameter = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); + for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->varDiameter[iNode] = 0; + + NJ->selfdist = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); + for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->selfdist[iNode] = 0; + + NJ->selfweight = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); + for (iNode = 0; iNode < NJ->nSeq; iNode++) + NJ->selfweight[iNode] = NJ->nPos - NGaps(NJ,iNode); + + NJ->outDistances = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); + 
NJ->nOutDistActive = (int *)mymalloc(sizeof(int)*NJ->maxnodes); + for (iNode = 0; iNode < NJ->maxnodes; iNode++) + NJ->nOutDistActive[iNode] = NJ->nSeq * 10; /* unreasonably high value */ + NJ->parent = NULL; /* so SetOutDistance ignores it */ + for (iNode = 0; iNode < NJ->nSeq; iNode++) + SetOutDistance(/*IN/UPDATE*/NJ, iNode, /*nActive*/NJ->nSeq); + + if (verbose>2) { + for (iNode = 0; iNode < 4 && iNode < NJ->nSeq; iNode++) + fprintf(stderr, "Node %d outdist %f\n", iNode, NJ->outDistances[iNode]); + } + + NJ->parent = (int *)mymalloc(sizeof(int)*NJ->maxnodes); + for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->parent[iNode] = -1; + + NJ->branchlength = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); /* distance to parent */ + for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->branchlength[iNode] = 0; + + NJ->support = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); + for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->support[iNode] = -1.0; + + NJ->child = (children_t*)mymalloc(sizeof(children_t)*NJ->maxnodes); + for (iNode= 0; iNode < NJ->maxnode; iNode++) NJ->child[iNode].nChild = 0; + + NJ->rates.nRateCategories = 0; + NJ->rates.rates = NULL; + NJ->rates.ratecat = NULL; + AllocRateCategories(&NJ->rates, 1, NJ->nPos); + return(NJ); +} + +NJ_t *FreeNJ(NJ_t *NJ) { + if (NJ==NULL) + return(NJ); + + int i; + for (i=0; i < NJ->maxnode; i++) + NJ->profiles[i] = FreeProfile(NJ->profiles[i], NJ->nPos, NJ->nConstraints); + NJ->profiles = myfree(NJ->profiles, sizeof(profile_t*) * NJ->maxnodes); + NJ->outprofile = FreeProfile(NJ->outprofile, NJ->nPos, NJ->nConstraints); + NJ->diameter = myfree(NJ->diameter, sizeof(numeric_t)*NJ->maxnodes); + NJ->varDiameter = myfree(NJ->varDiameter, sizeof(numeric_t)*NJ->maxnodes); + NJ->selfdist = myfree(NJ->selfdist, sizeof(numeric_t)*NJ->maxnodes); + NJ->selfweight = myfree(NJ->selfweight, sizeof(numeric_t)*NJ->maxnodes); + NJ->outDistances = myfree(NJ->outDistances, sizeof(numeric_t)*NJ->maxnodes); + NJ->nOutDistActive 
= myfree(NJ->nOutDistActive, sizeof(int)*NJ->maxnodes); + NJ->parent = myfree(NJ->parent, sizeof(int)*NJ->maxnodes); + NJ->branchlength = myfree(NJ->branchlength, sizeof(numeric_t)*NJ->maxnodes); + NJ->support = myfree(NJ->support, sizeof(numeric_t)*NJ->maxnodes); + NJ->child = myfree(NJ->child, sizeof(children_t)*NJ->maxnodes); + NJ->transmat = myfree(NJ->transmat, sizeof(transition_matrix_t)); + AllocRateCategories(&NJ->rates, 0, NJ->nPos); + return(myfree(NJ, sizeof(NJ_t))); +} + +/* Allocate or reallocate the rate categories, and set every position + to category 0 and every category's rate to 1.0 + If nRateCategories=0, just deallocate +*/ +void AllocRateCategories(/*IN/OUT*/rates_t *rates, int nRateCategories, int nPos) { + assert(nRateCategories >= 0); + rates->rates = myfree(rates->rates, sizeof(numeric_t)*rates->nRateCategories); + rates->ratecat = myfree(rates->ratecat, sizeof(unsigned int)*nPos); + rates->nRateCategories = nRateCategories; + if (rates->nRateCategories > 0) { + rates->rates = (numeric_t*)mymalloc(sizeof(numeric_t)*rates->nRateCategories); + int i; + for (i = 0; i < nRateCategories; i++) + rates->rates[i] = 1.0; + rates->ratecat = (unsigned int *)mymalloc(sizeof(unsigned int)*nPos); + for (i = 0; i < nPos; i++) + rates->ratecat[i] = 0; + } +} + +void FastNJ(NJ_t *NJ) { + int iNode; + + assert(NJ->nSeq >= 1); + if (NJ->nSeq < 3) { + NJ->root = NJ->maxnode++; + NJ->child[NJ->root].nChild = NJ->nSeq; + for (iNode = 0; iNode < NJ->nSeq; iNode++) { + NJ->parent[iNode] = NJ->root; + NJ->child[NJ->root].child[iNode] = iNode; + } + if (NJ->nSeq == 1) { + NJ->branchlength[0] = 0; + } else { + assert (NJ->nSeq == 2); + besthit_t hit; + SeqDist(NJ->profiles[0]->codes,NJ->profiles[1]->codes,NJ->nPos,NJ->distance_matrix,/*OUT*/&hit); + NJ->branchlength[0] = hit.dist/2.0; + NJ->branchlength[1] = hit.dist/2.0; + } + return; + } + + /* else 3 or more sequences */ + + /* The visible set stores the best hit of each node (unless using top hits, in which case 
+ it is handled by the top hits routines) */ + besthit_t *visible = NULL; /* Not used if doing top hits */ + besthit_t *besthitNew = NULL; /* All hits of new node -- not used if doing top-hits */ + + /* The top-hits lists, with the key parameter m = length of each top-hit list */ + top_hits_t *tophits = NULL; + int m = 0; /* maximum length of a top-hits list */ + if (tophitsMult > 0) { + m = (int)(0.5 + tophitsMult*sqrt(NJ->nSeq)); + if(m<4 || 2*m >= NJ->nSeq) { + m=0; + if(verbose>1) fprintf(stderr,"Too few leaves, turning off top-hits\n"); + } else { + if(verbose>2) fprintf(stderr,"Top-hit-list size = %d of %d\n", m, NJ->nSeq); + } + } + assert(!(slow && m>0)); + + /* Initialize top-hits or visible set */ + if (m>0) { + tophits = InitTopHits(NJ, m); + SetAllLeafTopHits(/*IN/UPDATE*/NJ, /*OUT*/tophits); + ResetTopVisible(/*IN/UPDATE*/NJ, /*nActive*/NJ->nSeq, /*IN/OUT*/tophits); + } else if (!slow) { + visible = (besthit_t*)mymalloc(sizeof(besthit_t)*NJ->maxnodes); + besthitNew = (besthit_t*)mymalloc(sizeof(besthit_t)*NJ->maxnodes); + for (iNode = 0; iNode < NJ->nSeq; iNode++) + SetBestHit(iNode, NJ, /*nActive*/NJ->nSeq, /*OUT*/&visible[iNode], /*OUT IGNORED*/NULL); + } + + /* Iterate over joins */ + int nActiveOutProfileReset = NJ->nSeq; + int nActive; + for (nActive = NJ->nSeq; nActive > 3; nActive--) { + int nJoinsDone = NJ->nSeq - nActive; + if (nJoinsDone > 0 && (nJoinsDone % 100) == 0) + ProgressReport("Joined %6d of %6d", nJoinsDone, NJ->nSeq-3, 0, 0); + + besthit_t join; /* the join to do */ + if (slow) { + ExhaustiveNJSearch(NJ,nActive,/*OUT*/&join); + } else if (m>0) { + TopHitNJSearch(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits, /*OUT*/&join); + } else { + FastNJSearch(NJ, nActive, /*IN/OUT*/visible, /*OUT*/&join); + } + + if (verbose>2) { + double penalty = constraintWeight + * (double)JoinConstraintPenalty(NJ, join.i, join.j); + if (penalty > 0.001) { + fprintf(stderr, "Constraint violation during neighbor-joining %d %d into %d penalty %.3f\n", + 
join.i, join.j, NJ->maxnode, penalty); + int iC; + for (iC = 0; iC < NJ->nConstraints; iC++) { + int local = JoinConstraintPenaltyPiece(NJ, join.i, join.j, iC); + if (local > 0) + fprintf(stderr, "Constraint %d piece %d %d/%d %d/%d %d/%d\n", iC, local, + NJ->profiles[join.i]->nOn[iC], + NJ->profiles[join.i]->nOff[iC], + NJ->profiles[join.j]->nOn[iC], + NJ->profiles[join.j]->nOff[iC], + NJ->outprofile->nOn[iC] - NJ->profiles[join.i]->nOn[iC] - NJ->profiles[join.j]->nOn[iC], + NJ->outprofile->nOff[iC] - NJ->profiles[join.i]->nOff[iC] - NJ->profiles[join.j]->nOff[iC]); + } + } + } + + /* because of the stale out-distance heuristic, make sure that these are up-to-date */ + SetOutDistance(NJ, join.i, nActive); + SetOutDistance(NJ, join.j, nActive); + /* Make sure weight is set and criterion is up to date */ + SetDistCriterion(NJ, nActive, /*IN/OUT*/&join); + assert(NJ->nOutDistActive[join.i] == nActive); + assert(NJ->nOutDistActive[join.j] == nActive); + + int newnode = NJ->maxnode++; + NJ->parent[join.i] = newnode; + NJ->parent[join.j] = newnode; + NJ->child[newnode].nChild = 2; + NJ->child[newnode].child[0] = join.i < join.j ? join.i : join.j; + NJ->child[newnode].child[1] = join.i > join.j ? 
join.i : join.j; + + double rawIJ = join.dist + NJ->diameter[join.i] + NJ->diameter[join.j]; + double distIJ = join.dist; + + double deltaDist = (NJ->outDistances[join.i]-NJ->outDistances[join.j])/(double)(nActive-2); + NJ->branchlength[join.i] = (distIJ + deltaDist)/2; + NJ->branchlength[join.j] = (distIJ - deltaDist)/2; + + double bionjWeight = 0.5; /* IJ = bionjWeight*I + (1-bionjWeight)*J */ + double varIJ = rawIJ - NJ->varDiameter[join.i] - NJ->varDiameter[join.j]; + + if (bionj && join.weight > 0.01 && varIJ > 0.001) { + /* Set bionjWeight according to the BIONJ formula, where + the variance matrix is approximated by + + Vij = ProfileVar(i,j) - varDiameter(i) - varDiameter(j) + ProfileVar(i,j) = distance(i,j) = top(i,j)/weight(i,j) + + (The node's distance diameter does not affect the variances.) + + The BIONJ formula is equation 9 from Gascuel 1997: + + bionjWeight = 1/2 + sum(k!=i,j) (Vjk - Vik) / ((nActive-2)*Vij) + sum(k!=i,j) (Vjk - Vik) = sum(k!=i,j) Vik - varDiameter(j) + varDiameter(i) + = sum(k!=i,j) ProfileVar(j,k) - sum(k!=i,j) ProfileVar(i,k) + (nActive-2)*(varDiameter(i)-varDiameter(j)) + + sum(k!=i,j) ProfileVar(i,k) + ~= (sum(k!=i,j) distance(i,k) * weight(i,k))/(mean(k!=i,j) weight(i,k)) + ~= (N-2) * top(i, Out-i-j) / weight(i, Out-i-j) + + weight(i, Out-i-j) = N*weight(i,Out) - weight(i,i) - weight(i,j) + top(i, Out-i-j) = N*top(i,Out) - top(i,i) - top(i,j) + */ + besthit_t outI; + besthit_t outJ; + ProfileDist(NJ->profiles[join.i],NJ->outprofile,NJ->nPos,NJ->distance_matrix,/*OUT*/&outI); + ProfileDist(NJ->profiles[join.j],NJ->outprofile,NJ->nPos,NJ->distance_matrix,/*OUT*/&outJ); + outprofileOps += 2; + + double varIWeight = (nActive * outI.weight - NJ->selfweight[join.i] - join.weight); + double varJWeight = (nActive * outJ.weight - NJ->selfweight[join.j] - join.weight); + + double varITop = outI.dist * outI.weight * nActive + - NJ->selfdist[join.i] * NJ->selfweight[join.i] - rawIJ * join.weight; + double varJTop = outJ.dist * outJ.weight 
* nActive + - NJ->selfdist[join.j] * NJ->selfweight[join.j] - rawIJ * join.weight; + + double deltaProfileVarOut = (nActive-2) * (varJTop/varJWeight - varITop/varIWeight); + double deltaVarDiam = (nActive-2)*(NJ->varDiameter[join.i] - NJ->varDiameter[join.j]); + if (varJWeight > 0.01 && varIWeight > 0.01) + bionjWeight = 0.5 + (deltaProfileVarOut+deltaVarDiam)/(2*(nActive-2)*varIJ); + if(bionjWeight<0) bionjWeight=0; + if(bionjWeight>1) bionjWeight=1; + if (verbose>2) fprintf(stderr,"dVarO %f dVarDiam %f varIJ %f from dist %f weight %f (pos %d) bionjWeight %f %f\n", + deltaProfileVarOut, deltaVarDiam, + varIJ, join.dist, join.weight, NJ->nPos, + bionjWeight, 1-bionjWeight); + if (verbose>3 && (newnode%5) == 0) { + /* Compare weight estimated from outprofiles from weight made by summing over other nodes */ + double deltaProfileVarTot = 0; + for (iNode = 0; iNode < newnode; iNode++) { + if (NJ->parent[iNode] < 0) { /* excludes join.i, join.j */ + besthit_t di, dj; + ProfileDist(NJ->profiles[join.i],NJ->profiles[iNode],NJ->nPos,NJ->distance_matrix,/*OUT*/&di); + ProfileDist(NJ->profiles[join.j],NJ->profiles[iNode],NJ->nPos,NJ->distance_matrix,/*OUT*/&dj); + deltaProfileVarTot += dj.dist - di.dist; + } + } + double lambdaTot = 0.5 + (deltaProfileVarTot+deltaVarDiam)/(2*(nActive-2)*varIJ); + if (lambdaTot < 0) lambdaTot = 0; + if (lambdaTot > 1) lambdaTot = 1; + if (fabs(bionjWeight-lambdaTot) > 0.01 || verbose > 4) + fprintf(stderr, "deltaProfileVar actual %.6f estimated %.6f lambda actual %.3f estimated %.3f\n", + deltaProfileVarTot,deltaProfileVarOut,lambdaTot,bionjWeight); + } + } + if (verbose > 2) fprintf(stderr, "Join\t%d\t%d\t%.6f\tlambda\t%.6f\tselfw\t%.3f\t%.3f\tnew\t%d\n", + join.i < join.j ? join.i : join.j, + join.i < join.j ? join.j : join.i, + join.criterion, bionjWeight, + NJ->selfweight[join.i < join.j ? join.i : join.j], + NJ->selfweight[join.i < join.j ? 
join.j : join.i], + newnode); + + NJ->diameter[newnode] = bionjWeight * (NJ->branchlength[join.i] + NJ->diameter[join.i]) + + (1-bionjWeight) * (NJ->branchlength[join.j] + NJ->diameter[join.j]); + NJ->varDiameter[newnode] = bionjWeight * NJ->varDiameter[join.i] + + (1-bionjWeight) * NJ->varDiameter[join.j] + + bionjWeight * (1-bionjWeight) * varIJ; + + NJ->profiles[newnode] = AverageProfile(NJ->profiles[join.i],NJ->profiles[join.j], + NJ->nPos, NJ->nConstraints, + NJ->distance_matrix, + bionj ? bionjWeight : /*noweight*/-1.0); + + /* Update out-distances and total diameters */ + int changedActiveOutProfile = nActiveOutProfileReset - (nActive-1); + if (changedActiveOutProfile >= nResetOutProfile + && changedActiveOutProfile >= fResetOutProfile * nActiveOutProfileReset) { + /* Recompute the outprofile from scratch to avoid roundoff error */ + profile_t **activeProfiles = (profile_t**)mymalloc(sizeof(profile_t*)*(nActive-1)); + int nSaved = 0; + NJ->totdiam = 0; + for (iNode=0;iNode<NJ->maxnode;iNode++) { + if (NJ->parent[iNode]<0) { + assert(nSaved < nActive-1); + activeProfiles[nSaved++] = NJ->profiles[iNode]; + NJ->totdiam += NJ->diameter[iNode]; + } + } + assert(nSaved==nActive-1); + FreeProfile(NJ->outprofile, NJ->nPos, NJ->nConstraints); + if(verbose>2) fprintf(stderr,"Recomputing outprofile %d %d\n",nActiveOutProfileReset,nActive-1); + NJ->outprofile = OutProfile(activeProfiles, nSaved, + NJ->nPos, NJ->nConstraints, + NJ->distance_matrix); + activeProfiles = myfree(activeProfiles, sizeof(profile_t*)*(nActive-1)); + nActiveOutProfileReset = nActive-1; + } else { + UpdateOutProfile(/*OUT*/NJ->outprofile, + NJ->profiles[join.i], NJ->profiles[join.j], NJ->profiles[newnode], + nActive, + NJ->nPos, NJ->nConstraints, + NJ->distance_matrix); + NJ->totdiam += NJ->diameter[newnode] - NJ->diameter[join.i] - NJ->diameter[join.j]; + } + + /* Store self-dist for use in other computations */ + besthit_t selfdist; + 
ProfileDist(NJ->profiles[newnode],NJ->profiles[newnode],NJ->nPos,NJ->distance_matrix,/*OUT*/&selfdist); + NJ->selfdist[newnode] = selfdist.dist; + NJ->selfweight[newnode] = selfdist.weight; + + /* Find the best hit of the joined node IJ */ + if (m>0) { + TopHitJoin(newnode, /*IN/UPDATE*/NJ, nActive-1, /*IN/OUT*/tophits); + } else { + /* Not using top-hits, so we update all out-distances */ + for (iNode = 0; iNode < NJ->maxnode; iNode++) { + if (NJ->parent[iNode] < 0) { + /* True nActive is now nActive-1 */ + SetOutDistance(/*IN/UPDATE*/NJ, iNode, nActive-1); + } + } + + if(visible != NULL) { + SetBestHit(newnode, NJ, nActive-1, /*OUT*/&visible[newnode], /*OUT OPTIONAL*/besthitNew); + if (verbose>2) + fprintf(stderr,"Visible %d %d %f %f\n", + visible[newnode].i, visible[newnode].j, + visible[newnode].dist, visible[newnode].criterion); + if (besthitNew != NULL) { + /* Use distances to new node to update visible set entries that are non-optimal */ + for (iNode = 0; iNode < NJ->maxnode; iNode++) { + if (NJ->parent[iNode] >= 0 || iNode == newnode) + continue; + int iOldVisible = visible[iNode].j; + assert(iOldVisible>=0); + assert(visible[iNode].i == iNode); + + /* Update the criterion; use nActive-1 because haven't decremented nActive yet */ + if (NJ->parent[iOldVisible] < 0) + SetCriterion(/*IN/OUT*/NJ, nActive-1, &visible[iNode]); + + if (NJ->parent[iOldVisible] >= 0 + || besthitNew[iNode].criterion < visible[iNode].criterion) { + if(verbose>3) fprintf(stderr,"Visible %d reset from %d to %d (%f vs. 
%f)\n", + iNode, iOldVisible, + newnode, visible[iNode].criterion, besthitNew[iNode].criterion); + if(NJ->parent[iOldVisible] < 0) nVisibleUpdate++; + visible[iNode].j = newnode; + visible[iNode].dist = besthitNew[iNode].dist; + visible[iNode].criterion = besthitNew[iNode].criterion; + } + } /* end loop over all nodes */ + } /* end if recording all hits of new node */ + } /* end if keeping a visible set */ + } /* end else (m==0) */ + } /* end loop over nActive */ + +#ifdef TRACK_MEMORY + if (verbose>1) { + struct mallinfo mi = mallinfo(); + fprintf(stderr, "Memory @ end of FastNJ(): %.2f MB (%.1f byte/pos) useful %.2f expected %.2f\n", + (mi.arena+mi.hblkhd)/1.0e6, (mi.arena+mi.hblkhd)/(double)(NJ->nSeq*(double)NJ->nPos), + mi.uordblks/1.0e6, mymallocUsed/1e6); + } +#endif + + /* We no longer need the tophits, visible set, etc. */ + if (visible != NULL) visible = myfree(visible,sizeof(besthit_t)*NJ->maxnodes); + if (besthitNew != NULL) besthitNew = myfree(besthitNew,sizeof(besthit_t)*NJ->maxnodes); + tophits = FreeTopHits(tophits); + + /* Add a root for the 3 remaining nodes */ + int top[3]; + int nTop = 0; + for (iNode = 0; iNode < NJ->maxnode; iNode++) { + if (NJ->parent[iNode] < 0) { + assert(nTop <= 2); + top[nTop++] = iNode; + } + } + assert(nTop==3); + + NJ->root = NJ->maxnode++; + NJ->child[NJ->root].nChild = 3; + for (nTop = 0; nTop < 3; nTop++) { + NJ->parent[top[nTop]] = NJ->root; + NJ->child[NJ->root].child[nTop] = top[nTop]; + } + + besthit_t dist01, dist02, dist12; + ProfileDist(NJ->profiles[top[0]], NJ->profiles[top[1]], NJ->nPos, NJ->distance_matrix, /*OUT*/&dist01); + ProfileDist(NJ->profiles[top[0]], NJ->profiles[top[2]], NJ->nPos, NJ->distance_matrix, /*OUT*/&dist02); + ProfileDist(NJ->profiles[top[1]], NJ->profiles[top[2]], NJ->nPos, NJ->distance_matrix, /*OUT*/&dist12); + + double d01 = dist01.dist - NJ->diameter[top[0]] - NJ->diameter[top[1]]; + double d02 = dist02.dist - NJ->diameter[top[0]] - NJ->diameter[top[2]]; + double d12 = dist12.dist - 
NJ->diameter[top[1]] - NJ->diameter[top[2]]; + NJ->branchlength[top[0]] = (d01 + d02 - d12)/2; + NJ->branchlength[top[1]] = (d01 + d12 - d02)/2; + NJ->branchlength[top[2]] = (d02 + d12 - d01)/2; + + /* Check how accurate the outprofile is */ + if (verbose>2) { + profile_t *p[3] = {NJ->profiles[top[0]], NJ->profiles[top[1]], NJ->profiles[top[2]]}; + profile_t *out = OutProfile(p, 3, NJ->nPos, NJ->nConstraints, NJ->distance_matrix); + int i; + double freqerror = 0; + double weighterror = 0; + for (i=0;i<NJ->nPos;i++) { + weighterror += fabs(out->weights[i] - NJ->outprofile->weights[i]); + int k; + for(k=0;k<nCodes;k++) + freqerror += fabs(out->vectors[nCodes*i+k] - NJ->outprofile->vectors[nCodes*i+k]); + } + fprintf(stderr,"Roundoff error in outprofile@end: WeightError %f FreqError %f\n", weighterror, freqerror); + FreeProfile(out, NJ->nPos, NJ->nConstraints); + } + return; +} + +void ExhaustiveNJSearch(NJ_t *NJ, int nActive, /*OUT*/besthit_t *join) { + join->i = -1; + join->j = -1; + join->weight = 0; + join->dist = 1e20; + join->criterion = 1e20; + double bestCriterion = 1e20; + + int i, j; + for (i = 0; i < NJ->maxnode-1; i++) { + if (NJ->parent[i] < 0) { + for (j = i+1; j < NJ->maxnode; j++) { + if (NJ->parent[j] < 0) { + besthit_t hit; + hit.i = i; + hit.j = j; + SetDistCriterion(NJ, nActive, /*IN/OUT*/&hit); + if (hit.criterion < bestCriterion) { + *join = hit; + bestCriterion = hit.criterion; + } + } + } + } + } + assert (join->i >= 0 && join->j >= 0); +} + +void FastNJSearch(NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *besthits, /*OUT*/besthit_t *join) { + join->i = -1; + join->j = -1; + join->dist = 1e20; + join->weight = 0; + join->criterion = 1e20; + int iNode; + for (iNode = 0; iNode < NJ->maxnode; iNode++) { + int jNode = besthits[iNode].j; + if (NJ->parent[iNode] < 0 && NJ->parent[jNode] < 0) { /* both i and j still active */ + /* recompute criterion to reflect the current out-distances */ + SetCriterion(NJ, nActive, /*IN/OUT*/&besthits[iNode]); + if (besthits[iNode].criterion < join->criterion) 
+ *join = besthits[iNode]; + } + } + + if(!fastest) { + int changed; + do { + changed = 0; + assert(join->i >= 0 && join->j >= 0); + SetBestHit(join->i, NJ, nActive, /*OUT*/&besthits[join->i], /*OUT IGNORED*/NULL); + if (besthits[join->i].j != join->j) { + changed = 1; + if (verbose>2) + fprintf(stderr,"BetterI\t%d\t%d\t%d\t%d\t%f\t%f\n", + join->i,join->j,besthits[join->i].i,besthits[join->i].j, + join->criterion,besthits[join->i].criterion); + } + + /* Save the best hit either way, because the out-distance has probably changed + since we started the computation. */ + join->j = besthits[join->i].j; + join->weight = besthits[join->i].weight; + join->dist = besthits[join->i].dist; + join->criterion = besthits[join->i].criterion; + + SetBestHit(join->j, NJ, nActive, /*OUT*/&besthits[join->j], /*OUT IGNORE*/NULL); + if (besthits[join->j].j != join->i) { + changed = 1; + if (verbose>2) + fprintf(stderr,"BetterJ\t%d\t%d\t%d\t%d\t%f\t%f\n", + join->i,join->j,besthits[join->j].i,besthits[join->j].j, + join->criterion,besthits[join->j].criterion); + join->i = besthits[join->j].j; + join->weight = besthits[join->j].weight; + join->dist = besthits[join->j].dist; + join->criterion = besthits[join->j].criterion; + } + if(changed) nHillBetter++; + } while(changed); + } +} + +/* A token is one of ():;, or an alphanumeric string without whitespace + Any whitespace between tokens is ignored */ +char *ReadTreeToken(FILE *fp) { + static char buf[BUFFER_SIZE]; + int len = 0; + int c; + for (c = fgetc(fp); c != EOF; c = fgetc(fp)) { + if (c == '(' || c == ')' || c == ':' || c == ';' || c == ',') { + /* standalone token */ + if (len == 0) { + buf[len++] = c; + buf[len] = '\0'; + return(buf); + } else { + ungetc(c, fp); + buf[len] = '\0'; + return(buf); + } + } else if (isspace(c)) { + if (len > 0) { + buf[len] = '\0'; + return(buf); + } + /* else ignore whitespace at beginning of token */ + } else { + /* not whitespace or standalone token */ + buf[len++] = c; + if (len >= BUFFER_SIZE) 
{ + buf[BUFFER_SIZE-1] = '\0'; + fprintf(stderr, "Token too long in tree file, token begins with\n%s\n", buf); + exit(1); + } + } + } + if (len > 0) { + /* return the token we have so far */ + buf[len] = '\0'; + return(buf); + } + /* else */ + return(NULL); +} + +void ReadTreeError(char *err, char *token) { + fprintf(stderr, "Tree parse error: unexpected token '%s' -- %s\n", + token == NULL ? "(End of file)" : token, + err); + exit(1); +} + +void ReadTreeAddChild(int parent, int child, /*IN/OUT*/int *parents, /*IN/OUT*/children_t *children) { + assert(parent >= 0); + assert(child >= 0); + assert(parents[child] < 0); + assert(children[parent].nChild < 3); + parents[child] = parent; + children[parent].child[children[parent].nChild++] = child; +} + +void ReadTreeMaybeAddLeaf(int parent, char *name, + hashstrings_t *hashnames, uniquify_t *unique, + /*IN/OUT*/int *parents, /*IN/OUT*/children_t *children) { + hashiterator_t hi = FindMatch(hashnames,name); + if (HashCount(hashnames,hi) != 1) + ReadTreeError("not recognized as a sequence name", name); + + int iSeqNonunique = HashFirst(hashnames,hi); + assert(iSeqNonunique >= 0 && iSeqNonunique < unique->nSeq); + int iSeqUnique = unique->alnToUniq[iSeqNonunique]; + assert(iSeqUnique >= 0 && iSeqUnique < unique->nUnique); + /* Either record this leaves' parent (if it is -1) or ignore this leaf (if already seen) */ + if (parents[iSeqUnique] < 0) { + ReadTreeAddChild(parent, iSeqUnique, /*IN/OUT*/parents, /*IN/OUT*/children); + if(verbose > 5) + fprintf(stderr, "Found leaf uniq%d name %s child of %d\n", iSeqUnique, name, parent); + } else { + if (verbose > 5) + fprintf(stderr, "Skipped redundant leaf uniq%d name %s\n", iSeqUnique, name); + } +} + +void ReadTreeRemove(/*IN/OUT*/int *parents, /*IN/OUT*/children_t *children, int node) { + if(verbose > 5) + fprintf(stderr,"Removing node %d parent %d\n", node, parents[node]); + assert(parents[node] >= 0); + int parent = parents[node]; + parents[node] = -1; + children_t *pc = 
&children[parent]; + int oldn; + for (oldn = 0; oldn < pc->nChild; oldn++) { + if (pc->child[oldn] == node) + break; + } + assert(oldn < pc->nChild); + + /* move successor nodes back in child list and shorten list */ + int i; + for (i = oldn; i < pc->nChild-1; i++) + pc->child[i] = pc->child[i+1]; + pc->nChild--; + + /* add its children to parent's child list */ + children_t *nc = &children[node]; + if (nc->nChild > 0) { + assert(nc->nChild<=2); + assert(pc->nChild < 3); + assert(pc->nChild + nc->nChild <= 3); + int j; + for (j = 0; j < nc->nChild; j++) { + if(verbose > 5) + fprintf(stderr,"Repointing parent %d to child %d\n", parent, nc->child[j]); + pc->child[pc->nChild++] = nc->child[j]; + parents[nc->child[j]] = parent; + } + nc->nChild = 0; + } +} + +void ReadTree(/*IN/OUT*/NJ_t *NJ, + /*IN*/uniquify_t *unique, + /*IN*/hashstrings_t *hashnames, + /*READ*/FILE *fpInTree) { + assert(NJ->nSeq == unique->nUnique); + /* First, do a preliminary parse of the tree to with non-unique leaves ignored + We need to store this separately from NJ because it may have too many internal nodes + (matching sequences show up once in the NJ but could be in multiple places in the tree) + Will use iUnique as the index of nodes, as in the NJ structure + */ + int maxnodes = unique->nSeq*2; + int maxnode = unique->nSeq; + int *parent = (int*)mymalloc(sizeof(int)*maxnodes); + children_t *children = (children_t *)mymalloc(sizeof(children_t)*maxnodes); + int root = maxnode++; + int i; + for (i = 0; i < maxnodes; i++) { + parent[i] = -1; + children[i].nChild = 0; + } + + /* The stack is the current path to the root, with the root at the first (top) position */ + int stack_size = 1; + int *stack = (int*)mymalloc(sizeof(int)*maxnodes); + stack[0] = root; + int nDown = 0; + int nUp = 0; + + char *token; + token = ReadTreeToken(fpInTree); + if (token == NULL || *token != '(') + ReadTreeError("No '(' at start", token); + /* nDown is still 0 because we have created the root */ + + while ((token = 
ReadTreeToken(fpInTree)) != NULL) { + if (nDown > 0) { /* In a stream of parentheses */ + if (*token == '(') + nDown++; + else if (*token == ',' || *token == ';' || *token == ':' || *token == ')') + ReadTreeError("while reading parentheses", token); + else { + /* Add intermediate nodes if nDown was > 1 (for nDown=1, the only new node is the leaf) */ + while (nDown-- > 0) { + int new = maxnode++; + assert(new < maxnodes); + ReadTreeAddChild(stack[stack_size-1], new, /*IN/OUT*/parent, /*IN/OUT*/children); + if(verbose > 5) + fprintf(stderr, "Added internal child %d of %d, stack size increase to %d\n", + new, stack[stack_size-1],stack_size+1); + stack[stack_size++] = new; + assert(stack_size < maxnodes); + } + ReadTreeMaybeAddLeaf(stack[stack_size-1], token, + hashnames, unique, + /*IN/OUT*/parent, /*IN/OUT*/children); + } + } else if (nUp > 0) { + if (*token == ';') { /* end the tree? */ + if (nUp != stack_size) + ReadTreeError("unbalanced parentheses", token); + else + break; + } else if (*token == ')') + nUp++; + else if (*token == '(') + ReadTreeError("unexpected '(' after ')'", token); + else if (*token == ':') { + token = ReadTreeToken(fpInTree); + /* Read the branch length and ignore it */ + if (token == NULL || (*token != '-' && !isdigit(*token))) + ReadTreeError("not recognized as a branch length", token); + } else if (*token == ',') { + /* Go back up the stack the correct #times */ + while (nUp-- > 0) { + stack_size--; + if(verbose > 5) + fprintf(stderr, "Up to nUp=%d stack size %d at %d\n", + nUp, stack_size, stack[stack_size-1]); + if (stack_size <= 0) + ReadTreeError("too many ')'", token); + } + nUp = 0; + } else if (*token == '-' || isdigit(*token)) + ; /* ignore bootstrap value */ + else + fprintf(stderr, "Warning while parsing tree: non-numeric label %s for internal node\n", + token); + } else if (*token == '(') { + nDown = 1; + } else if (*token == ')') { + nUp = 1; + } else if (*token == ':') { + token = ReadTreeToken(fpInTree); + if (token == NULL 
|| (*token != '-' && !isdigit(*token))) + ReadTreeError("not recognized as a branch length", token); + } else if (*token == ',') { + ; /* do nothing */ + } else if (*token == ';') + ReadTreeError("unexpected token", token); + else + ReadTreeMaybeAddLeaf(stack[stack_size-1], token, + hashnames, unique, + /*IN/OUT*/parent, /*IN/OUT*/children); + } + + /* Verify that all sequences were seen */ + for (i = 0; i < unique->nUnique; i++) { + if (parent[i] < 0) { + fprintf(stderr, "Alignment sequence %d (unique %d) absent from input tree\n" + "The starting tree (the argument to -intree) must include all sequences in the alignment!\n", + unique->uniqueFirst[i], i); + exit(1); + } + } + + /* Simplify the tree -- remove all internal nodes with < 2 children + Keep trying until no nodes get removed + */ + int nRemoved; + do { + nRemoved = 0; + /* Here stack is the list of nodes we haven't visited yet while doing + a tree traversal */ + stack_size = 1; + stack[0] = root; + while (stack_size > 0) { + int node = stack[--stack_size]; + if (node >= unique->nUnique) { /* internal node */ + if (children[node].nChild <= 1) { + if (node != root) { + ReadTreeRemove(/*IN/OUT*/parent,/*IN/OUT*/children,node); + nRemoved++; + } else if (node == root && children[node].nChild == 1) { + int newroot = children[node].child[0]; + parent[newroot] = -1; + children[root].nChild = 0; + nRemoved++; + if(verbose > 5) + fprintf(stderr,"Changed root from %d to %d\n",root,newroot); + root = newroot; + stack[stack_size++] = newroot; + } + } else { + int j; + for (j = 0; j < children[node].nChild; j++) { + assert(stack_size < maxnodes); + stack[stack_size++] = children[node].child[j]; + if(verbose > 5) + fprintf(stderr,"Added %d to stack\n", stack[stack_size-1]); + } + } + } + } + } while (nRemoved > 0); + + /* Simplify the root node to 3 children if it has 2 */ + if (children[root].nChild == 2) { + for (i = 0; i < 2; i++) { + int child = children[root].child[i]; + assert(child >= 0 && child < maxnodes); + 
if (children[child].nChild == 2) { + ReadTreeRemove(parent,children,child); /* replace root -> child -> A,B with root->A,B */ + break; + } + } + } + + for (i = 0; i < maxnodes; i++) + if(verbose > 5) + fprintf(stderr,"Simplfied node %d has parent %d nchild %d\n", + i, parent[i], children[i].nChild); + + /* Map the remaining internal nodes to NJ nodes */ + int *map = (int*)mymalloc(sizeof(int)*maxnodes); + for (i = 0; i < unique->nUnique; i++) + map[i] = i; + for (i = unique->nUnique; i < maxnodes; i++) + map[i] = -1; + stack_size = 1; + stack[0] = root; + while (stack_size > 0) { + int node = stack[--stack_size]; + if (node >= unique->nUnique) { /* internal node */ + assert(node == root || children[node].nChild > 1); + map[node] = NJ->maxnode++; + for (i = 0; i < children[node].nChild; i++) { + assert(stack_size < maxnodes); + stack[stack_size++] = children[node].child[i]; + } + } + } + for (i = 0; i < maxnodes; i++) + if(verbose > 5) + fprintf(stderr,"Map %d to %d (parent %d nchild %d)\n", + i, map[i], parent[i], children[i].nChild); + + /* Set NJ->parent, NJ->children, NJ->root */ + NJ->root = map[root]; + int node; + for (node = 0; node < maxnodes; node++) { + int njnode = map[node]; + if (njnode >= 0) { + NJ->child[njnode].nChild = children[node].nChild; + for (i = 0; i < children[node].nChild; i++) { + assert(children[node].child[i] >= 0 && children[node].child[i] < maxnodes); + NJ->child[njnode].child[i] = map[children[node].child[i]]; + } + if (parent[node] >= 0) + NJ->parent[njnode] = map[parent[node]]; + } + } + + /* Make sure that parent/child relationships match */ + for (i = 0; i < NJ->maxnode; i++) { + children_t *c = &NJ->child[i]; + int j; + for (j = 0; j < c->nChild;j++) + assert(c->child[j] >= 0 && c->child[j] < NJ->maxnode && NJ->parent[c->child[j]] == i); + } + assert(NJ->parent[NJ->root] < 0); + + map = myfree(map,sizeof(int)*maxnodes); + stack = myfree(stack,sizeof(int)*maxnodes); + children = myfree(children,sizeof(children_t)*maxnodes); + 
parent = myfree(parent,sizeof(int)*maxnodes); + + /* Compute profiles as balanced -- the NNI stage will recompute these + profiles anyway + */ + traversal_t traversal = InitTraversal(NJ); + node = NJ->root; + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { + if (node >= NJ->nSeq && node != NJ->root) + SetProfile(/*IN/OUT*/NJ, node, /*noweight*/-1.0); + } + traversal = FreeTraversal(traversal,NJ); +} + +/* Print topology using node indices as node names */ +void PrintNJInternal(FILE *fp, NJ_t *NJ, bool useLen) { + if (NJ->nSeq < 4) { + return; + } + typedef struct { int node; int end; } stack_t; + stack_t *stack = (stack_t *)mymalloc(sizeof(stack_t)*NJ->maxnodes); + int stackSize = 1; + stack[0].node = NJ->root; + stack[0].end = 0; + + while(stackSize>0) { + stack_t *last = &stack[stackSize-1]; + stackSize--; + /* Save last, as we are about to overwrite it */ + int node = last->node; + int end = last->end; + + if (node < NJ->nSeq) { + if (NJ->child[NJ->parent[node]].child[0] != node) fputs(",",fp); + fprintf(fp, "%d", node); + if (useLen) + fprintf(fp, ":%.4f", NJ->branchlength[node]); + } else if (end) { + fprintf(fp, ")%d", node); + if (useLen) + fprintf(fp, ":%.4f", NJ->branchlength[node]); + } else { + if (node != NJ->root && NJ->child[NJ->parent[node]].child[0] != node) fprintf(fp, ","); + fprintf(fp, "("); + stackSize++; + stack[stackSize-1].node = node; + stack[stackSize-1].end = 1; + children_t *c = &NJ->child[node]; + /* put children on in reverse order because we use the last one first */ + int i; + for (i = c->nChild-1; i >=0; i--) { + stackSize++; + stack[stackSize-1].node = c->child[i]; + stack[stackSize-1].end = 0; + } + } + } + fprintf(fp, ";\n"); + stack = myfree(stack, sizeof(stack_t)*NJ->maxnodes); +} + +void PrintNJ(FILE *fp, NJ_t *NJ, char **names, uniquify_t *unique, bool bShowSupport, bool bQuote) { + /* And print the tree: depth first search + * The stack contains + * list of remaining children with their 
depth + * parent node, with a flag of -1 so I know to print right-paren + */ + if (NJ->nSeq==1 && unique->alnNext[unique->uniqueFirst[0]] >= 0) { + /* Special case -- otherwise we end up with double parens */ + int first = unique->uniqueFirst[0]; + assert(first >= 0 && first < unique->nSeq); + fprintf(fp, bQuote ? "('%s':0.0" : "(%s:0.0", names[first]); + int iName = unique->alnNext[first]; + while (iName >= 0) { + assert(iName < unique->nSeq); + fprintf(fp, bQuote ? ",'%s':0.0" : ",%s:0.0", names[iName]); + iName = unique->alnNext[iName]; + } + fprintf(fp,");\n"); + return; + } + + typedef struct { int node; int end; } stack_t; + stack_t *stack = (stack_t *)mymalloc(sizeof(stack_t)*NJ->maxnodes); + int stackSize = 1; + stack[0].node = NJ->root; + stack[0].end = 0; + + while(stackSize>0) { + stack_t *last = &stack[stackSize-1]; + stackSize--; + /* Save last, as we are about to overwrite it */ + int node = last->node; + int end = last->end; + + if (node < NJ->nSeq) { + if (NJ->child[NJ->parent[node]].child[0] != node) fputs(",",fp); + int first = unique->uniqueFirst[node]; + assert(first >= 0 && first < unique->nSeq); + /* Print the name, or the subtree of duplicate names */ + if (unique->alnNext[first] == -1) { + fprintf(fp, bQuote ? "'%s'" : "%s", names[first]); + } else { + fprintf(fp, bQuote ? "('%s':0.0" : "(%s:0.0", names[first]); + int iName = unique->alnNext[first]; + while (iName >= 0) { + assert(iName < unique->nSeq); + fprintf(fp, bQuote ? 
",'%s':0.0" : ",%s:0.0", names[iName]); + iName = unique->alnNext[iName]; + } + fprintf(fp,")"); + } + /* Print the branch length */ +#ifdef USE_DOUBLE +#define FP_FORMAT "%.9f" +#else +#define FP_FORMAT "%.5f" +#endif + fprintf(fp, ":" FP_FORMAT, NJ->branchlength[node]); + } else if (end) { + if (node == NJ->root) + fprintf(fp, ")"); + else if (bShowSupport) + fprintf(fp, ")%.3f:" FP_FORMAT, NJ->support[node], NJ->branchlength[node]); + else + fprintf(fp, "):" FP_FORMAT, NJ->branchlength[node]); + } else { + if (node != NJ->root && NJ->child[NJ->parent[node]].child[0] != node) fprintf(fp, ","); + fprintf(fp, "("); + stackSize++; + stack[stackSize-1].node = node; + stack[stackSize-1].end = 1; + children_t *c = &NJ->child[node]; + /* put children on in reverse order because we use the last one first */ + int i; + for (i = c->nChild-1; i >=0; i--) { + stackSize++; + stack[stackSize-1].node = c->child[i]; + stack[stackSize-1].end = 0; + } + } + } + fprintf(fp, ";\n"); + stack = myfree(stack, sizeof(stack_t)*NJ->maxnodes); +} + +alignment_t *ReadAlignment(/*IN*/FILE *fp, bool bQuote) { + /* bQuote supports the -quote option */ + int nSeq = 0; + int nPos = 0; + char **names = NULL; + char **seqs = NULL; + char buf[BUFFER_SIZE] = ""; + if (fgets(buf,sizeof(buf),fp) == NULL) { + fprintf(stderr, "Error reading header line\n"); + exit(1); + } + int nSaved = 100; + if (buf[0] == '>') { + /* FASTA, truncate names at any of these */ + char *nameStop = bQuote ? 
"'\t\r\n" : "(),: \t\r\n"; + char *seqSkip = " \t\r\n"; /* skip these characters in the sequence */ + seqs = (char**)mymalloc(sizeof(char*) * nSaved); + names = (char**)mymalloc(sizeof(char*) * nSaved); + + do { + /* loop over lines */ + if (buf[0] == '>') { + /* truncate the name */ + char *p, *q; + for (p = buf+1; *p != '\0'; p++) { + for (q = nameStop; *q != '\0'; q++) { + if (*p == *q) { + *p = '\0'; + break; + } + } + if (*p == '\0') break; + } + + /* allocate space for another sequence */ + nSeq++; + if (nSeq > nSaved) { + int nNewSaved = nSaved*2; + seqs = myrealloc(seqs,sizeof(char*)*nSaved,sizeof(char*)*nNewSaved, /*copy*/false); + names = myrealloc(names,sizeof(char*)*nSaved,sizeof(char*)*nNewSaved, /*copy*/false); + nSaved = nNewSaved; + } + names[nSeq-1] = (char*)mymemdup(buf+1,strlen(buf)); + seqs[nSeq-1] = NULL; + } else { + /* count non-space characters and append to sequence */ + int nKeep = 0; + char *p, *q; + for (p=buf; *p != '\0'; p++) { + for (q=seqSkip; *q != '\0'; q++) { + if (*p == *q) + break; + } + if (*p != *q) + nKeep++; + } + int nOld = (seqs[nSeq-1] == NULL) ? 
0 : strlen(seqs[nSeq-1]); + seqs[nSeq-1] = (char*)myrealloc(seqs[nSeq-1], nOld, nOld+nKeep+1, /*copy*/false); + if (nOld+nKeep > nPos) + nPos = nOld + nKeep; + char *out = seqs[nSeq-1] + nOld; + for (p=buf; *p != '\0'; p++) { + for (q=seqSkip; *q != '\0'; q++) { + if (*p == *q) + break; + } + if (*p != *q) { + *out = *p; + out++; + } + } + assert(out-seqs[nSeq-1] == nKeep + nOld); + *out = '\0'; + } + } while(fgets(buf,sizeof(buf),fp) != NULL); + + if (seqs[nSeq-1] == NULL) { + fprintf(stderr, "No sequence data for last entry %s\n",names[nSeq-1]); + exit(1); + } + names = myrealloc(names,sizeof(char*)*nSaved,sizeof(char*)*nSeq, /*copy*/false); + seqs = myrealloc(seqs,sizeof(char*)*nSaved,sizeof(char*)*nSeq, /*copy*/false); + } else { + /* PHYLIP interleaved-like format + Allow arbitrary length names, require spaces between names and sequences + Allow multiple alignments, either separated by a single empty line (e.g. seqboot output) + or not. + */ + if (buf[0] == '\n' || buf[0] == '\r') { + if (fgets(buf,sizeof(buf),fp) == NULL) { + fprintf(stderr, "Empty header line followed by EOF\n"); + exit(1); + } + } + if (sscanf(buf, "%d%d", &nSeq, &nPos) != 2 + || nSeq < 1 || nPos < 1) { + fprintf(stderr, "Error parsing header line:%s\n", buf); + exit(1); + } + names = (char **)mymalloc(sizeof(char*) * nSeq); + seqs = (char **)mymalloc(sizeof(char*) * nSeq); + nSaved = nSeq; + + int i; + for (i = 0; i < nSeq; i++) { + names[i] = NULL; + seqs[i] = (char *)mymalloc(nPos+1); /* null-terminate */ + seqs[i][0] = '\0'; + } + int iSeq = 0; + + while(fgets(buf,sizeof(buf),fp)) { + if ((buf[0] == '\n' || buf[0] == '\r') && (iSeq == nSeq || iSeq == 0)) { + iSeq = 0; + } else { + int j = 0; /* character just past end of name */ + if (buf[0] == ' ') { + if (names[iSeq] == NULL) { + fprintf(stderr, "No name in phylip line %s", buf); + exit(1); + } + } else { + while (buf[j] != '\n' && buf[j] != '\0' && buf[j] != ' ') + j++; + if (buf[j] != ' ' || j == 0) { + fprintf(stderr, "No sequence 
in phylip line %s", buf); + exit(1); + } + if (iSeq >= nSeq) { + fprintf(stderr, "No empty line between sequence blocks (is the sequence count wrong?)\n"); + exit(1); + } + if (names[iSeq] == NULL) { + /* save the name */ + names[iSeq] = (char *)mymalloc(j+1); + int k; + for (k = 0; k < j; k++) names[iSeq][k] = buf[k]; + names[iSeq][j] = '\0'; + } else { + /* check the name */ + int k; + int match = 1; + for (k = 0; k < j; k++) { + if (names[iSeq][k] != buf[k]) { + match = 0; + break; + } + } + if (!match || names[iSeq][j] != '\0') { + fprintf(stderr, "Wrong name in phylip line %s\nExpected %s\n", buf, names[iSeq]); + exit(1); + } + } + } + int seqlen = strlen(seqs[iSeq]); + for (; buf[j] != '\n' && buf[j] != '\0'; j++) { + if (buf[j] != ' ') { + if (seqlen >= nPos) { + fprintf(stderr, "Too many characters (expected %d) for sequence named %s\nSo far have:\n%s\n", + nPos, names[iSeq], seqs[iSeq]); + exit(1); + } + seqs[iSeq][seqlen++] = toupper(buf[j]); + } + } + seqs[iSeq][seqlen] = '\0'; /* null-terminate */ + if(verbose>10) fprintf(stderr,"Read iSeq %d name %s seqsofar %s\n", iSeq, names[iSeq], seqs[iSeq]); + iSeq++; + if (iSeq == nSeq && strlen(seqs[0]) == nPos) + break; /* finished alignment */ + } /* end else non-empty phylip line */ + } + if (iSeq != nSeq && iSeq != 0) { + fprintf(stderr, "Wrong number of sequences: expected %d\n", nSeq); + exit(1); + } + } + /* Check lengths of sequences */ + int i; + for (i = 0; i < nSeq; i++) { + int seqlen = strlen(seqs[i]); + if (seqlen != nPos) { + fprintf(stderr, "Wrong number of characters for %s: expected %d but have %d instead.\n" + "This sequence may be truncated, or another sequence may be too long.\n", + names[i], nPos, seqlen); + exit(1); + } + } + /* Replace "." 
with "-" and warn if we find any */ + /* If nucleotide sequences, replace U with T and N with X */ + bool findDot = false; + for (i = 0; i < nSeq; i++) { + char *p; + for (p = seqs[i]; *p != '\0'; p++) { + if (*p == '.') { + findDot = true; + *p = '-'; + } + if (nCodes == 4 && *p == 'U') + *p = 'T'; + if (nCodes == 4 && *p == 'N') + *p = 'X'; + } + } + if (findDot) + fprintf(stderr, "Warning! Found \".\" character(s). These are treated as gaps\n"); + + if (ferror(fp)) { + fprintf(stderr, "Error reading input file\n"); + exit(1); + } + + alignment_t *align = (alignment_t*)mymalloc(sizeof(alignment_t)); + align->nSeq = nSeq; + align->nPos = nPos; + align->names = names; + align->seqs = seqs; + align->nSaved = nSaved; + return(align); +} + +void FreeAlignmentSeqs(/*IN/OUT*/alignment_t *aln) { + assert(aln != NULL); + int i; + for (i = 0; i < aln->nSeq; i++) + aln->seqs[i] = myfree(aln->seqs[i], aln->nPos+1); +} + +alignment_t *FreeAlignment(alignment_t *aln) { + if(aln==NULL) + return(NULL); + int i; + for (i = 0; i < aln->nSeq; i++) { + aln->names[i] = myfree(aln->names[i],strlen(aln->names[i])+1); + aln->seqs[i] = myfree(aln->seqs[i], aln->nPos+1); + } + aln->names = myfree(aln->names, sizeof(char*)*aln->nSaved); + aln->seqs = myfree(aln->seqs, sizeof(char*)*aln->nSaved); + myfree(aln, sizeof(alignment_t)); + return(NULL); +} + +char **AlnToConstraints(alignment_t *constraints, uniquify_t *unique, hashstrings_t *hashnames) { + /* look up constraints as names and map to unique-space */ + char ** uniqConstraints = (char**)mymalloc(sizeof(char*) * unique->nUnique); + int i; + for (i = 0; i < unique->nUnique; i++) + uniqConstraints[i] = NULL; + for (i = 0; i < constraints->nSeq; i++) { + char *name = constraints->names[i]; + char *constraintSeq = constraints->seqs[i]; + hashiterator_t hi = FindMatch(hashnames,name); + if (HashCount(hashnames,hi) != 1) { + fprintf(stderr, "Sequence %s from constraints file is not in the alignment\n", name); + exit(1); + } + int 
iSeqNonunique = HashFirst(hashnames,hi); + assert(iSeqNonunique >= 0 && iSeqNonunique < unique->nSeq); + int iSeqUnique = unique->alnToUniq[iSeqNonunique]; + assert(iSeqUnique >= 0 && iSeqUnique < unique->nUnique); + if (uniqConstraints[iSeqUnique] != NULL) { + /* Already set a constraint for this group of sequences! + Warn that we are ignoring this one unless the constraints match */ + if (strcmp(uniqConstraints[iSeqUnique],constraintSeq) != 0) { + fprintf(stderr, + "Warning: ignoring constraints for %s:\n%s\n" + "Another sequence has the same sequence but different constraints\n", + name, constraintSeq); + } + } else { + uniqConstraints[iSeqUnique] = constraintSeq; + } + } + return(uniqConstraints); +} + + +profile_t *SeqToProfile(/*IN/OUT*/NJ_t *NJ, + char *seq, int nPos, + /*OPTIONAL*/char *constraintSeq, int nConstraints, + int iNode, + unsigned long counts[256]) { + static unsigned char charToCode[256]; + static int codeSet = 0; + int c, i; + + if (!codeSet) { + for (c = 0; c < 256; c++) { + charToCode[c] = nCodes; + } + for (i = 0; codesString[i]; i++) { + charToCode[codesString[i]] = i; + charToCode[tolower(codesString[i])] = i; + } + charToCode['-'] = NOCODE; + codeSet=1; + } + + assert(strlen(seq) == nPos); + profile_t *profile = NewProfile(nPos,nConstraints); + + for (i = 0; i < nPos; i++) { + unsigned int character = (unsigned int) seq[i]; + counts[character]++; + c = charToCode[character]; + if(verbose>10 && i < 2) fprintf(stderr,"pos %d char %c code %d\n", i, seq[i], c); + /* treat unknowns as gaps */ + if (c == nCodes || c == NOCODE) { + profile->codes[i] = NOCODE; + profile->weights[i] = 0.0; + } else { + profile->codes[i] = c; + profile->weights[i] = 1.0; + } + } + if (nConstraints > 0) { + for (i = 0; i < nConstraints; i++) { + profile->nOn[i] = 0; + profile->nOff[i] = 0; + } + bool bWarn = false; + if (constraintSeq != NULL) { + assert(strlen(constraintSeq) == nConstraints); + for (i = 0; i < nConstraints; i++) { + if (constraintSeq[i] == '1') { 
+ profile->nOn[i] = 1; + } else if (constraintSeq[i] == '0') { + profile->nOff[i] = 1; + } else if (constraintSeq[i] != '-') { + if (!bWarn) { + fprintf(stderr, "Constraint characters in unique sequence %d replaced with gap:", iNode+1); + bWarn = true; + } + fprintf(stderr, " %c%d", constraintSeq[i], i+1); + /* For the benefit of ConstraintSequencePenalty -- this is a bit of a hack, as + this modifies the value read from the alignment + */ + constraintSeq[i] = '-'; + } + } + if (bWarn) + fprintf(stderr, "\n"); + } + } + return profile; +} + +void SeqDist(unsigned char *codes1, unsigned char *codes2, int nPos, + distance_matrix_t *dmat, + /*OUT*/besthit_t *hit) { + double top = 0; /* summed over positions */ + int nUse = 0; + int i; + if (dmat==NULL) { + int nDiff = 0; + for (i = 0; i < nPos; i++) { + if (codes1[i] != NOCODE && codes2[i] != NOCODE) { + nUse++; + if (codes1[i] != codes2[i]) nDiff++; + } + } + top = (double)nDiff; + } else { + for (i = 0; i < nPos; i++) { + if (codes1[i] != NOCODE && codes2[i] != NOCODE) { + nUse++; + top += dmat->distances[(unsigned int)codes1[i]][(unsigned int)codes2[i]]; + } + } + } + hit->weight = (double)nUse; + hit->dist = nUse > 0 ? 
top/(double)nUse : 1.0; + seqOps++; +} + +void CorrectedPairDistances(profile_t **profiles, int nProfiles, + /*OPTIONAL*/distance_matrix_t *distance_matrix, + int nPos, + /*OUT*/double *distances) { + assert(distances != NULL); + assert(profiles != NULL); + assert(nProfiles>1 && nProfiles <= 4); + besthit_t hit[6]; + int iHit,i,j; + + for (iHit=0, i=0; i < nProfiles; i++) { + for (j=i+1; j < nProfiles; j++, iHit++) { + ProfileDist(profiles[i],profiles[j],nPos,distance_matrix,/*OUT*/&hit[iHit]); + distances[iHit] = hit[iHit].dist; + } + } + if (pseudoWeight > 0) { + /* Estimate the prior distance */ + double dTop = 0; + double dBottom = 0; + for (iHit=0; iHit < (nProfiles*(nProfiles-1))/2; iHit++) { + dTop += hit[iHit].dist * hit[iHit].weight; + dBottom += hit[iHit].weight; + } + double prior = (dBottom > 0.01) ? dTop/dBottom : 3.0; + for (iHit=0; iHit < (nProfiles*(nProfiles-1))/2; iHit++) + distances[iHit] = (distances[iHit] * hit[iHit].weight + prior * pseudoWeight) + / (hit[iHit].weight + pseudoWeight); + } + if (logdist) { + for (iHit=0; iHit < (nProfiles*(nProfiles-1))/2; iHit++) + distances[iHit] = LogCorrect(distances[iHit]); + } +} + +/* During the neighbor-joining phase, a join only violates our constraints if + node1, node2, and other are all represented in the constraint + and if one of the 3 is split and the other two do not agree + */ +int JoinConstraintPenalty(/*IN*/NJ_t *NJ, int node1, int node2) { + if (NJ->nConstraints == 0) + return(0.0); + int penalty = 0; + int iC; + for (iC = 0; iC < NJ->nConstraints; iC++) + penalty += JoinConstraintPenaltyPiece(NJ, node1, node2, iC); + return(penalty); +} + +int JoinConstraintPenaltyPiece(NJ_t *NJ, int node1, int node2, int iC) { + profile_t *pOut = NJ->outprofile; + profile_t *p1 = NJ->profiles[node1]; + profile_t *p2 = NJ->profiles[node2]; + int nOn1 = p1->nOn[iC]; + int nOff1 = p1->nOff[iC]; + int nOn2 = p2->nOn[iC]; + int nOff2 = p2->nOff[iC]; + int nOnOut = pOut->nOn[iC] - nOn1 - nOn2; + int nOffOut = 
pOut->nOff[iC] - nOff1 - nOff2; + + if ((nOn1+nOff1) > 0 && (nOn2+nOff2) > 0 && (nOnOut+nOffOut) > 0) { + /* code is -1 for split, 0 for off, 1 for on */ + int code1 = (nOn1 > 0 && nOff1 > 0) ? -1 : (nOn1 > 0 ? 1 : 0); + int code2 = (nOn2 > 0 && nOff2 > 0) ? -1 : (nOn2 > 0 ? 1 : 0); + int code3 = (nOnOut > 0 && nOffOut) > 0 ? -1 : (nOnOut > 0 ? 1 : 0); + int nSplit = (code1 == -1 ? 1 : 0) + (code2 == -1 ? 1 : 0) + (code3 == -1 ? 1 : 0); + int nOn = (code1 == 1 ? 1 : 0) + (code2 == 1 ? 1 : 0) + (code3 == 1 ? 1 : 0); + if (nSplit == 1 && nOn == 1) + return(SplitConstraintPenalty(nOn1+nOn2, nOff1+nOff2, nOnOut, nOffOut)); + } + /* else */ + return(0); +} + +void QuartetConstraintPenalties(profile_t *profiles[4], int nConstraints, /*OUT*/double penalty[3]) { + int i; + for (i=0; i < 3; i++) + penalty[i] = 0.0; + if(nConstraints == 0) + return; + int iC; + for (iC = 0; iC < nConstraints; iC++) { + double part[3]; + if (QuartetConstraintPenaltiesPiece(profiles, iC, /*OUT*/part)) { + for (i=0;i<3;i++) + penalty[i] += part[i]; + + if (verbose>2 + && (fabs(part[ABvsCD]-part[ACvsBD]) > 0.001 || fabs(part[ABvsCD]-part[ADvsBC]) > 0.001)) + fprintf(stderr, "Constraint Penalties at %d: ABvsCD %.3f ACvsBD %.3f ADvsBC %.3f %d/%d %d/%d %d/%d %d/%d\n", + iC, part[ABvsCD], part[ACvsBD], part[ADvsBC], + profiles[0]->nOn[iC], profiles[0]->nOff[iC], + profiles[1]->nOn[iC], profiles[1]->nOff[iC], + profiles[2]->nOn[iC], profiles[2]->nOff[iC], + profiles[3]->nOn[iC], profiles[3]->nOff[iC]); + } + } + if (verbose>2) + fprintf(stderr, "Total Constraint Penalties: ABvsCD %.3f ACvsBD %.3f ADvsBC %.3f\n", + penalty[ABvsCD], penalty[ACvsBD], penalty[ADvsBC]); +} + +double PairConstraintDistance(int nOn1, int nOff1, int nOn2, int nOff2) { + double f1 = nOn1/(double)(nOn1+nOff1); + double f2 = nOn2/(double)(nOn2+nOff2); + /* 1 - f1 * f2 - (1-f1)*(1-f2) = 1 - f1 * f2 - 1 + f1 + f2 - f1 * f2 */ + return(f1 + f2 - 2.0 * f1 * f2); +} + +bool QuartetConstraintPenaltiesPiece(profile_t *profiles[4], int 
iC, /*OUT*/double piece[3]) { + int nOn[4]; + int nOff[4]; + int i; + int nSplit = 0; + int nPlus = 0; + int nMinus = 0; + + for (i=0; i < 4; i++) { + nOn[i] = profiles[i]->nOn[iC]; + nOff[i] = profiles[i]->nOff[iC]; + if (nOn[i] + nOff[i] == 0) + return(false); /* ignore */ + else if (nOn[i] > 0 && nOff[i] > 0) + nSplit++; + else if (nOn[i] > 0) + nPlus++; + else + nMinus++; + } + /* If just one of them is split or on the other side and the others all agree, also ignore */ + if (nPlus >= 3 || nMinus >= 3) + return(false); + piece[ABvsCD] = constraintWeight + * (PairConstraintDistance(nOn[0],nOff[0],nOn[1],nOff[1]) + + PairConstraintDistance(nOn[2],nOff[2],nOn[3],nOff[3])); + piece[ACvsBD] = constraintWeight + * (PairConstraintDistance(nOn[0],nOff[0],nOn[2],nOff[2]) + + PairConstraintDistance(nOn[1],nOff[1],nOn[3],nOff[3])); + piece[ADvsBC] = constraintWeight + * (PairConstraintDistance(nOn[0],nOff[0],nOn[3],nOff[3]) + + PairConstraintDistance(nOn[2],nOff[2],nOn[1],nOff[1])); + return(true); +} + +/* Minimum number of constrained leaves that need to be moved + to satisfy the constraint (or 0 if constraint is satisfied) + Defining it this way should ensure that SPR moves that break + constraints get a penalty +*/ +int SplitConstraintPenalty(int nOn1, int nOff1, int nOn2, int nOff2) { + return(nOn1 + nOff2 < nOn2 + nOff1 ? + (nOn1 < nOff2 ? nOn1 : nOff2) + : (nOn2 < nOff1 ? 
nOn2 : nOff1)); +} + +bool SplitViolatesConstraint(profile_t *profiles[4], int iConstraint) { + int i; + int codes[4]; /* 0 for off, 1 for on, -1 for split (quit if not constrained at all) */ + for (i = 0; i < 4; i++) { + if (profiles[i]->nOn[iConstraint] + profiles[i]->nOff[iConstraint] == 0) + return(false); + else if (profiles[i]->nOn[iConstraint] > 0 && profiles[i]->nOff[iConstraint] == 0) + codes[i] = 1; + else if (profiles[i]->nOn[iConstraint] == 0 && profiles[i]->nOff[iConstraint] > 0) + codes[i] = 0; + else + codes[i] = -1; + } + int n0 = 0; + int n1 = 0; + for (i = 0; i < 4; i++) { + if (codes[i] == 0) + n0++; + else if (codes[i] == 1) + n1++; + } + /* 3 on one side means no violation, even if other is code -1 + otherwise must have code != -1 and agreement on the split + */ + if (n0 >= 3 || n1 >= 3) + return(false); + if (n0==2 && n1==2 && codes[0] == codes[1] && codes[2] == codes[3]) + return(false); + return(true); +} + +double LogCorrect(double dist) { + const double maxscore = 3.0; + if (nCodes == 4 && !useMatrix) { /* Jukes-Cantor */ + dist = dist < 0.74 ? -0.75*log(1.0 - dist * 4.0/3.0) : maxscore; + } else { /* scoredist-like */ + dist = dist < 0.99 ? -1.3*log(1.0 - dist) : maxscore; + } + return (dist < maxscore ? dist : maxscore); +} + +/* A helper function -- f1 and f2 can be NULL if the corresponding code != NOCODE +*/ +double ProfileDistPiece(unsigned int code1, unsigned int code2, + numeric_t *f1, numeric_t *f2, + /*OPTIONAL*/distance_matrix_t *dmat, + /*OPTIONAL*/numeric_t *codeDist2) { + if (dmat) { + if (code1 != NOCODE && code2 != NOCODE) { /* code1 vs code2 */ + return(dmat->distances[code1][code2]); + } else if (codeDist2 != NULL && code1 != NOCODE) { /* code1 vs. 
codeDist2 */ + return(codeDist2[code1]); + } else { /* f1 vs f2 */ + if (f1 == NULL) { + if(code1 == NOCODE) return(10.0); + f1 = &dmat->codeFreq[code1][0]; + } + if (f2 == NULL) { + if(code2 == NOCODE) return(10.0); + f2 = &dmat->codeFreq[code2][0]; + } + return(vector_multiply3_sum(f1,f2,dmat->eigenval,nCodes)); + } + } else { + /* no matrix */ + if (code1 != NOCODE) { + if (code2 != NOCODE) { + return(code1 == code2 ? 0.0 : 1.0); /* code1 vs code2 */ + } else { + if(f2 == NULL) return(10.0); + return(1.0 - f2[code1]); /* code1 vs. f2 */ + } + } else { + if (code2 != NOCODE) { + if(f1 == NULL) return(10.0); + return(1.0 - f1[code2]); /* f1 vs code2 */ + } else { /* f1 vs. f2 */ + if (f1 == NULL || f2 == NULL) return(10.0); + double piece = 1.0; + int k; + for (k = 0; k < nCodes; k++) { + piece -= f1[k] * f2[k]; + } + return(piece); + } + } + } + assert(0); +} + +/* E.g. GET_FREQ(profile,iPos,iVector) + Gets the next element of the vectors (and updates iVector), or + returns NULL if we didn't store a vector +*/ +#define GET_FREQ(P,I,IVECTOR) \ +(P->weights[I] > 0 && P->codes[I] == NOCODE ? &P->vectors[nCodes*(IVECTOR++)] : NULL) + +void ProfileDist(profile_t *profile1, profile_t *profile2, int nPos, + /*OPTIONAL*/distance_matrix_t *dmat, + /*OUT*/besthit_t *hit) { + double top = 0; + double denom = 0; + int iFreq1 = 0; + int iFreq2 = 0; + int i = 0; + for (i = 0; i < nPos; i++) { + numeric_t *f1 = GET_FREQ(profile1,i,/*IN/OUT*/iFreq1); + numeric_t *f2 = GET_FREQ(profile2,i,/*IN/OUT*/iFreq2); + if (profile1->weights[i] > 0 && profile2->weights[i] > 0) { + double weight = profile1->weights[i] * profile2->weights[i]; + denom += weight; + double piece = ProfileDistPiece(profile1->codes[i],profile2->codes[i],f1,f2,dmat, + profile2->codeDist ? &profile2->codeDist[i*nCodes] : NULL); + top += weight * piece; + } + } + assert(iFreq1 == profile1->nVectors); + assert(iFreq2 == profile2->nVectors); + hit->weight = denom > 0 ? 
denom : 0.01; /* 0.01 is an arbitrarily low value of weight (normally >>1) */ + hit->dist = denom > 0 ? top/denom : 1; + profileOps++; +} + +/* This should not be called if the update weight is 0, as + in that case code==NOCODE and in=NULL is possible, and then + it will fail. +*/ +void AddToFreq(/*IN/OUT*/numeric_t *fOut, + double weight, + unsigned int codeIn, /*OPTIONAL*/numeric_t *fIn, + /*OPTIONAL*/distance_matrix_t *dmat) { + assert(fOut != NULL); + if (fIn != NULL) { + vector_add_mult(fOut, fIn, weight, nCodes); + } else if (dmat) { + assert(codeIn != NOCODE); + vector_add_mult(fOut, dmat->codeFreq[codeIn], weight, nCodes); + } else { + assert(codeIn != NOCODE); + fOut[codeIn] += weight; + } +} + +void SetProfile(/*IN/OUT*/NJ_t *NJ, int node, double weight1) { + children_t *c = &NJ->child[node]; + assert(c->nChild == 2); + assert(NJ->profiles[c->child[0]] != NULL); + assert(NJ->profiles[c->child[1]] != NULL); + if (NJ->profiles[node] != NULL) + FreeProfile(NJ->profiles[node], NJ->nPos, NJ->nConstraints); + NJ->profiles[node] = AverageProfile(NJ->profiles[c->child[0]], + NJ->profiles[c->child[1]], + NJ->nPos, NJ->nConstraints, + NJ->distance_matrix, + weight1); +} + +/* bionjWeight is the weight of the first sequence (between 0 and 1), + or -1 to do the average. 
+ */ +profile_t *AverageProfile(profile_t *profile1, profile_t *profile2, + int nPos, int nConstraints, + distance_matrix_t *dmat, + double bionjWeight) { + int i; + if (bionjWeight < 0) { + bionjWeight = 0.5; + } + + /* First, set codes and weights and see how big vectors will be */ + profile_t *out = NewProfile(nPos, nConstraints); + + for (i = 0; i < nPos; i++) { + out->weights[i] = bionjWeight * profile1->weights[i] + + (1-bionjWeight) * profile2->weights[i]; + out->codes[i] = NOCODE; + if (out->weights[i] > 0) { + if (profile1->weights[i] > 0 && profile1->codes[i] != NOCODE + && (profile2->weights[i] <= 0 || profile1->codes[i] == profile2->codes[i])) { + out->codes[i] = profile1->codes[i]; + } else if (profile1->weights[i] <= 0 + && profile2->weights[i] > 0 + && profile2->codes[i] != NOCODE) { + out->codes[i] = profile2->codes[i]; + } + if (out->codes[i] == NOCODE) out->nVectors++; + } + } + + /* Allocate and set the vectors */ + out->vectors = (numeric_t*)mymalloc(sizeof(numeric_t)*nCodes*out->nVectors); + for (i = 0; i < nCodes * out->nVectors; i++) out->vectors[i] = 0; + nProfileFreqAlloc += out->nVectors; + nProfileFreqAvoid += nPos - out->nVectors; + int iFreqOut = 0; + int iFreq1 = 0; + int iFreq2 = 0; + for (i=0; i < nPos; i++) { + numeric_t *f = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); + numeric_t *f1 = GET_FREQ(profile1,i,/*IN/OUT*/iFreq1); + numeric_t *f2 = GET_FREQ(profile2,i,/*IN/OUT*/iFreq2); + if (f != NULL) { + if (profile1->weights[i] > 0) + AddToFreq(/*IN/OUT*/f, profile1->weights[i] * bionjWeight, + profile1->codes[i], f1, dmat); + if (profile2->weights[i] > 0) + AddToFreq(/*IN/OUT*/f, profile2->weights[i] * (1.0-bionjWeight), + profile2->codes[i], f2, dmat); + NormalizeFreq(/*IN/OUT*/f, dmat); + } /* end if computing f */ + if (verbose > 10 && i < 5) { + fprintf(stderr,"Average profiles: pos %d in-w1 %f in-w2 %f bionjWeight %f to weight %f code %d\n", + i, profile1->weights[i], profile2->weights[i], bionjWeight, + out->weights[i], out->codes[i]); 
+ if (f!= NULL) { + int k; + for (k = 0; k < nCodes; k++) + fprintf(stderr, "\t%c:%f", codesString[k], f ? f[k] : -1.0); + fprintf(stderr,"\n"); + } + } + } /* end loop over positions */ + assert(iFreq1 == profile1->nVectors); + assert(iFreq2 == profile2->nVectors); + assert(iFreqOut == out->nVectors); + + /* compute total constraints */ + for (i = 0; i < nConstraints; i++) { + out->nOn[i] = profile1->nOn[i] + profile2->nOn[i]; + out->nOff[i] = profile1->nOff[i] + profile2->nOff[i]; + } + profileAvgOps++; + return(out); +} + +/* Make the (unrotated) frequencies sum to 1 + Simply dividing by total_weight is not ideal because of roundoff error + So compute total_freq instead +*/ +void NormalizeFreq(/*IN/OUT*/numeric_t *freq, distance_matrix_t *dmat) { + double total_freq = 0; + int k; + if (dmat != NULL) { + /* The total frequency is dot_product(true_frequencies, 1) + So we rotate the 1 vector by eigeninv (stored in eigentot) + */ + total_freq = vector_multiply_sum(freq, dmat->eigentot, nCodes); + } else { + for (k = 0; k < nCodes; k++) + total_freq += freq[k]; + } + if (total_freq > fPostTotalTolerance) { + numeric_t inverse_weight = 1.0/total_freq; + vector_multiply_by(/*IN/OUT*/freq, inverse_weight, nCodes); + } else { + /* This can happen if we are in a very low-weight region, e.g. 
if a mostly-gap position gets weighted down + repeatedly; just set them all to arbitrary but legal values */ + if (dmat == NULL) { + for (k = 0; k < nCodes; k++) + freq[k] = 1.0/nCodes; + } else { + for (k = 0; k < nCodes; k++) + freq[k] = dmat->codeFreq[0][k]; + } + } +} + +/* OutProfile() computes the out-profile */ +profile_t *OutProfile(profile_t **profiles, int nProfiles, + int nPos, int nConstraints, + distance_matrix_t *dmat) { + int i; /* position */ + int in; /* profile */ + profile_t *out = NewProfile(nPos, nConstraints); + + double inweight = 1.0/(double)nProfiles; /* The maximal output weight is 1.0 */ + + /* First, set weights -- code is always NOCODE, prevent weight=0 */ + for (i = 0; i < nPos; i++) { + out->weights[i] = 0; + for (in = 0; in < nProfiles; in++) + out->weights[i] += profiles[in]->weights[i] * inweight; + if (out->weights[i] <= 0) out->weights[i] = 1e-20; /* always store a vector */ + out->nVectors++; + out->codes[i] = NOCODE; /* outprofile is normally complicated */ + } + + /* Initialize the frequencies to 0 */ + out->vectors = (numeric_t*)mymalloc(sizeof(numeric_t)*nCodes*out->nVectors); + for (i = 0; i < nCodes*out->nVectors; i++) + out->vectors[i] = 0; + + /* Add up the weights, going through each sequence in turn */ + for (in = 0; in < nProfiles; in++) { + int iFreqOut = 0; + int iFreqIn = 0; + for (i = 0; i < nPos; i++) { + numeric_t *fIn = GET_FREQ(profiles[in],i,/*IN/OUT*/iFreqIn); + numeric_t *fOut = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); + if (profiles[in]->weights[i] > 0) + AddToFreq(/*IN/OUT*/fOut, profiles[in]->weights[i], + profiles[in]->codes[i], fIn, dmat); + } + assert(iFreqOut == out->nVectors); + assert(iFreqIn == profiles[in]->nVectors); + } + + /* And normalize the frequencies to sum to 1 */ + int iFreqOut = 0; + for (i = 0; i < nPos; i++) { + numeric_t *fOut = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); + if (fOut) + NormalizeFreq(/*IN/OUT*/fOut, dmat); + } + assert(iFreqOut == out->nVectors); + if (verbose > 10) 
fprintf(stderr,"Average %d profiles\n", nProfiles); + if(dmat) + SetCodeDist(/*IN/OUT*/out, nPos, dmat); + + /* Compute constraints */ + for (i = 0; i < nConstraints; i++) { + out->nOn[i] = 0; + out->nOff[i] = 0; + for (in = 0; in < nProfiles; in++) { + out->nOn[i] += profiles[in]->nOn[i]; + out->nOff[i] += profiles[in]->nOff[i]; + } + } + return(out); +} + +void UpdateOutProfile(/*IN/OUT*/profile_t *out, profile_t *old1, profile_t *old2, + profile_t *new, int nActiveOld, + int nPos, int nConstraints, + distance_matrix_t *dmat) { + int i, k; + int iFreqOut = 0; + int iFreq1 = 0; + int iFreq2 = 0; + int iFreqNew = 0; + assert(nActiveOld > 0); + + for (i = 0; i < nPos; i++) { + numeric_t *fOut = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); + numeric_t *fOld1 = GET_FREQ(old1,i,/*IN/OUT*/iFreq1); + numeric_t *fOld2 = GET_FREQ(old2,i,/*IN/OUT*/iFreq2); + numeric_t *fNew = GET_FREQ(new,i,/*IN/OUT*/iFreqNew); + + assert(out->codes[i] == NOCODE && fOut != NULL); /* No no-vector optimization for outprofiles */ + if (verbose > 3 && i < 3) { + fprintf(stderr,"Updating out-profile position %d weight %f (mult %f)\n", + i, out->weights[i], out->weights[i]*nActiveOld); + } + double originalMult = out->weights[i]*nActiveOld; + double newMult = originalMult + new->weights[i] - old1->weights[i] - old2->weights[i]; + out->weights[i] = newMult/(nActiveOld-1); + if (out->weights[i] <= 0) out->weights[i] = 1e-20; /* always use the vector */ + + for (k = 0; k < nCodes; k++) fOut[k] *= originalMult; + + if (old1->weights[i] > 0) + AddToFreq(/*IN/OUT*/fOut, -old1->weights[i], old1->codes[i], fOld1, dmat); + if (old2->weights[i] > 0) + AddToFreq(/*IN/OUT*/fOut, -old2->weights[i], old2->codes[i], fOld2, dmat); + if (new->weights[i] > 0) + AddToFreq(/*IN/OUT*/fOut, new->weights[i], new->codes[i], fNew, dmat); + + /* And renormalize */ + NormalizeFreq(/*IN/OUT*/fOut, dmat); + + if (verbose > 2 && i < 3) { + fprintf(stderr,"Updated out-profile position %d weight %f (mult %f)", + i, out->weights[i], 
out->weights[i]*nActiveOld);
+      if(out->weights[i] > 0)
+        for (k=0;k<nCodes;k++)
+          fprintf(stderr, " %c:%f", codesString[k], fOut[k]);
+      fprintf(stderr,"\n");
+    }
+  } /* end loop over positions */
+  assert(iFreqOut == out->nVectors);
+  assert(iFreq1 == old1->nVectors);
+  assert(iFreq2 == old2->nVectors);
+  assert(iFreqNew == new->nVectors);
+  if(dmat)
+    SetCodeDist(/*IN/OUT*/out,nPos,dmat);
+
+  /* update constraints -- note in practice this should be a no-op */
+  for (i = 0; i < nConstraints; i++) {
+    out->nOn[i] += new->nOn[i] - old1->nOn[i] - old2->nOn[i];
+    out->nOff[i] += new->nOff[i] - old1->nOff[i] - old2->nOff[i];
+  }
+}
+
+void SetCodeDist(/*IN/OUT*/profile_t *profile, int nPos,
+                 distance_matrix_t *dmat) {
+  if (profile->codeDist == NULL)
+    profile->codeDist = (numeric_t*)mymalloc(sizeof(numeric_t)*nPos*nCodes);
+  int i;
+  int iFreq = 0;
+  for (i = 0; i < nPos; i++) {
+    numeric_t *f = GET_FREQ(profile,i,/*IN/OUT*/iFreq);
+
+    int k;
+    for (k = 0; k < nCodes; k++)
+      profile->codeDist[i*nCodes+k] = ProfileDistPiece(/*code1*/profile->codes[i], /*code2*/k,
+                                                       /*f1*/f, /*f2*/NULL,
+                                                       dmat, NULL);
+  }
+  assert(iFreq==profile->nVectors);
+}
+
+
+void SetBestHit(int node, NJ_t *NJ, int nActive,
+                /*OUT*/besthit_t *bestjoin, /*OUT OPTIONAL*/besthit_t *allhits) {
+  assert(NJ->parent[node] < 0);
+
+  bestjoin->i = node;
+  bestjoin->j = -1;
+  bestjoin->dist = 1e20;
+  bestjoin->criterion = 1e20;
+
+  int j;
+  besthit_t tmp;
+
+#ifdef OPENMP
+  /* Note -- if we are already in a parallel region, this will be ignored */
+  #pragma omp parallel for schedule(dynamic, 50)
+#endif
+  for (j = 0; j < NJ->maxnode; j++) {
+    besthit_t *sv = allhits != NULL ? &allhits[j] : &tmp;
+    sv->i = node;
+    sv->j = j;
+    if (NJ->parent[j] >= 0) {
+      sv->i = -1;               /* illegal/empty join */
+      sv->weight = 0.0;
+      sv->criterion = sv->dist = 1e20;
+      continue;
+    }
+    /* Note that we compute self-distances (allow j==node) because the top-hit heuristic
+       expects self to be within its top hits, but we exclude those from the bestjoin
+       that we return... 
+ */ + SetDistCriterion(NJ, nActive, /*IN/OUT*/sv); + if (sv->criterion < bestjoin->criterion && node != j) + *bestjoin = *sv; + } + if (verbose>5) { + fprintf(stderr, "SetBestHit %d %d %f %f\n", bestjoin->i, bestjoin->j, bestjoin->dist, bestjoin->criterion); + } +} + +void ReadMatrix(char *filename, /*OUT*/numeric_t codes[MAXCODES][MAXCODES], bool checkCodes) { + char buf[BUFFER_SIZE] = ""; + FILE *fp = fopen(filename, "r"); + if (fp == NULL) { + fprintf(stderr, "Cannot read %s\n",filename); + exit(1); + } + if (fgets(buf,sizeof(buf),fp) == NULL) { + fprintf(stderr, "Error reading header line for %s:\n%s\n", filename, buf); + exit(1); + } + if (checkCodes) { + int i; + int iBufPos; + for (iBufPos=0,i=0;i BUFFER_SIZE-20) { + fprintf(stderr,"Filename %s too long\n", prefix); + exit(1); + } + + strcpy(buffer, prefix); + strcat(buffer, ".distances"); + ReadMatrix(buffer, /*OUT*/dmat->distances, /*checkCodes*/true); + + strcpy(buffer, prefix); + strcat(buffer, ".inverses"); + ReadMatrix(buffer, /*OUT*/dmat->eigeninv, /*checkCodes*/false); + + strcpy(buffer, prefix); + strcat(buffer, ".eigenvalues"); + ReadVector(buffer, /*OUT*/dmat->eigenval); + + if(verbose>1) fprintf(stderr, "Read distance matrix from %s\n",prefix); + SetupDistanceMatrix(/*IN/OUT*/dmat); + return(dmat); +} + +void SetupDistanceMatrix(/*IN/OUT*/distance_matrix_t *dmat) { + /* Check that the eigenvalues and eigen-inverse are consistent with the + distance matrix and that the matrix is symmetric */ + int i,j,k; + for (i = 0; i < nCodes; i++) { + for (j = 0; j < nCodes; j++) { + if(fabs(dmat->distances[i][j]-dmat->distances[j][i]) > 1e-6) { + fprintf(stderr,"Distance matrix not symmetric for %d,%d: %f vs %f\n", + i+1,j+1, + dmat->distances[i][j], + dmat->distances[j][i]); + exit(1); + } + double total = 0.0; + for (k = 0; k < nCodes; k++) + total += dmat->eigenval[k] * dmat->eigeninv[k][i] * dmat->eigeninv[k][j]; + if(fabs(total - dmat->distances[i][j]) > 1e-6) { + fprintf(stderr,"Distance matrix entry 
%d,%d should be %f but eigen-representation gives %f\n", + i+1,j+1,dmat->distances[i][j],total); + exit(1); + } + } + } + + /* And compute eigentot */ + for (k = 0; k < nCodes; k++) { + dmat->eigentot[k] = 0.; + int j; + for (j = 0; j < nCodes; j++) + dmat->eigentot[k] += dmat->eigeninv[k][j]; + } + + /* And compute codeFreq */ + int code; + for(code = 0; code < nCodes; code++) { + for (k = 0; k < nCodes; k++) { + dmat->codeFreq[code][k] = dmat->eigeninv[k][code]; + } + } + /* And gapFreq */ + for(code = 0; code < nCodes; code++) { + double gapFreq = 0.0; + for (k = 0; k < nCodes; k++) + gapFreq += dmat->codeFreq[k][code]; + dmat->gapFreq[code] = gapFreq / nCodes; + } + + if(verbose>10) fprintf(stderr, "Made codeFreq\n"); +} + +nni_t ChooseNNI(profile_t *profiles[4], + /*OPTIONAL*/distance_matrix_t *dmat, + int nPos, int nConstraints, + /*OUT*/double criteria[3]) { + double d[6]; + CorrectedPairDistances(profiles, 4, dmat, nPos, /*OUT*/d); + double penalty[3]; /* indexed as nni_t */ + QuartetConstraintPenalties(profiles, nConstraints, /*OUT*/penalty); + criteria[ABvsCD] = d[qAB] + d[qCD] + penalty[ABvsCD]; + criteria[ACvsBD] = d[qAC] + d[qBD] + penalty[ACvsBD]; + criteria[ADvsBC] = d[qAD] + d[qBC] + penalty[ADvsBC]; + + nni_t choice = ABvsCD; + if (criteria[ACvsBD] < criteria[ABvsCD] && criteria[ACvsBD] <= criteria[ADvsBC]) { + choice = ACvsBD; + } else if (criteria[ADvsBC] < criteria[ABvsCD] && criteria[ADvsBC] <= criteria[ACvsBD]) { + choice = ADvsBC; + } + if (verbose > 1 && penalty[choice] > penalty[ABvsCD] + 1e-6) { + fprintf(stderr, "Worsen constraint: from %.3f to %.3f distance %.3f to %.3f: ", + penalty[ABvsCD], penalty[choice], + criteria[ABvsCD], choice == ACvsBD ? 
criteria[ACvsBD] : criteria[ADvsBC]); + int iC; + for (iC = 0; iC < nConstraints; iC++) { + double ppart[3]; + if (QuartetConstraintPenaltiesPiece(profiles, iC, /*OUT*/ppart)) { + double old_penalty = ppart[ABvsCD]; + double new_penalty = ppart[choice]; + if (new_penalty > old_penalty + 1e-6) + fprintf(stderr, " %d (%d/%d %d/%d %d/%d %d/%d)", iC, + profiles[0]->nOn[iC], profiles[0]->nOff[iC], + profiles[1]->nOn[iC], profiles[1]->nOff[iC], + profiles[2]->nOn[iC], profiles[2]->nOff[iC], + profiles[3]->nOn[iC], profiles[3]->nOff[iC]); + } + } + fprintf(stderr,"\n"); + } + if (verbose > 3) + fprintf(stderr, "NNI scores ABvsCD %.5f ACvsBD %.5f ADvsBC %.5f choice %s\n", + criteria[ABvsCD], criteria[ACvsBD], criteria[ADvsBC], + choice == ABvsCD ? "AB|CD" : (choice == ACvsBD ? "AC|BD" : "AD|BC")); + return(choice); +} + +profile_t *PosteriorProfile(profile_t *p1, profile_t *p2, + double len1, double len2, + /*OPTIONAL*/transition_matrix_t *transmat, + rates_t *rates, + int nPos, int nConstraints) { + if (len1 < MLMinBranchLength) + len1 = MLMinBranchLength; + if (len2 < MLMinBranchLength) + len2 = MLMinBranchLength; + + int i,j,k; + profile_t *out = NewProfile(nPos, nConstraints); + for (i = 0; i < nPos; i++) { + out->codes[i] = NOCODE; + out->weights[i] = 1.0; + } + out->nVectors = nPos; + out->vectors = (numeric_t*)mymalloc(sizeof(numeric_t)*nCodes*out->nVectors); + for (i = 0; i < nCodes * out->nVectors; i++) out->vectors[i] = 0; + int iFreqOut = 0; + int iFreq1 = 0; + int iFreq2 = 0; + numeric_t *expeigenRates1 = NULL, *expeigenRates2 = NULL; + + if (transmat != NULL) { + expeigenRates1 = ExpEigenRates(len1, transmat, rates); + expeigenRates2 = ExpEigenRates(len2, transmat, rates); + } + + if (transmat == NULL) { /* Jukes-Cantor */ + assert(nCodes == 4); + + double *PSame1 = PSameVector(len1, rates); + double *PDiff1 = PDiffVector(PSame1, rates); + double *PSame2 = PSameVector(len2, rates); + double *PDiff2 = PDiffVector(PSame2, rates); + + numeric_t mix1[4], mix2[4]; 
+ + for (i=0; i < nPos; i++) { + int iRate = rates->ratecat[i]; + double w1 = p1->weights[i]; + double w2 = p2->weights[i]; + int code1 = p1->codes[i]; + int code2 = p2->codes[i]; + numeric_t *f1 = GET_FREQ(p1,i,/*IN/OUT*/iFreq1); + numeric_t *f2 = GET_FREQ(p2,i,/*IN/OUT*/iFreq2); + + /* First try to store a simple profile */ + if (f1 == NULL && f2 == NULL) { + if (code1 == NOCODE && code2 == NOCODE) { + out->codes[i] = NOCODE; + out->weights[i] = 0.0; + continue; + } else if (code1 == NOCODE) { + /* Posterior(parent | character & gap, len1, len2) = Posterior(parent | character, len1) + = PSame() for matching characters and 1-PSame() for the rest + = (pSame - pDiff) * character + (1-(pSame-pDiff)) * gap + */ + out->codes[i] = code2; + out->weights[i] = w2 * (PSame2[iRate] - PDiff2[iRate]); + continue; + } else if (code2 == NOCODE) { + out->codes[i] = code1; + out->weights[i] = w1 * (PSame1[iRate] - PDiff1[iRate]); + continue; + } else if (code1 == code2) { + out->codes[i] = code1; + double f12code = (w1*PSame1[iRate] + (1-w1)*0.25) * (w2*PSame2[iRate] + (1-w2)*0.25); + double f12other = (w1*PDiff1[iRate] + (1-w1)*0.25) * (w2*PDiff2[iRate] + (1-w2)*0.25); + /* posterior probability of code1/code2 after scaling */ + double pcode = f12code/(f12code+3*f12other); + /* Now f = w * (code ? 
1 : 0) + (1-w) * 0.25, so to get pcode we need + fcode = 1/4 + w1*3/4 or w = (f-1/4)*4/3 + */ + out->weights[i] = (pcode - 0.25) * 4.0/3.0; + /* This can be zero because of numerical problems, I think */ + if (out->weights[i] < 1e-6) { + if (verbose > 1) + fprintf(stderr, "Replaced weight %f with %f from w1 %f w2 %f PSame %f %f f12code %f f12other %f\n", + out->weights[i], 1e-6, + w1, w2, + PSame1[iRate], PSame2[iRate], + f12code, f12other); + out->weights[i] = 1e-6; + } + continue; + } + } + /* if we did not compute a simple profile, then do the full computation and + store the full vector + */ + if (f1 == NULL) { + for (j = 0; j < 4; j++) + mix1[j] = (1-w1)*0.25; + if(code1 != NOCODE) + mix1[code1] += w1; + f1 = mix1; + } + if (f2 == NULL) { + for (j = 0; j < 4; j++) + mix2[j] = (1-w2)*0.25; + if(code2 != NOCODE) + mix2[code2] += w2; + f2 = mix2; + } + out->codes[i] = NOCODE; + out->weights[i] = 1.0; + numeric_t *f = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); + double lkAB = 0; + for (j = 0; j < 4; j++) { + f[j] = (f1[j] * PSame1[iRate] + (1.0-f1[j]) * PDiff1[iRate]) + * (f2[j] * PSame2[iRate] + (1.0-f2[j]) * PDiff2[iRate]); + lkAB += f[j]; + } + double lkABInv = 1.0/lkAB; + for (j = 0; j < 4; j++) + f[j] *= lkABInv; + } + PSame1 = myfree(PSame1, sizeof(double) * rates->nRateCategories); + PSame2 = myfree(PSame2, sizeof(double) * rates->nRateCategories); + PDiff1 = myfree(PDiff1, sizeof(double) * rates->nRateCategories); + PDiff2 = myfree(PDiff2, sizeof(double) * rates->nRateCategories); + } else if (nCodes == 4) { /* matrix model on nucleotides */ + numeric_t *fGap = &transmat->codeFreq[NOCODE][0]; + numeric_t f1mix[4], f2mix[4]; + + for (i=0; i < nPos; i++) { + if (p1->codes[i] == NOCODE && p2->codes[i] == NOCODE + && p1->weights[i] == 0 && p2->weights[i] == 0) { + /* aligning gap with gap -- just output a gap + out->codes[i] is already set to NOCODE so need not set that */ + out->weights[i] = 0; + continue; + } + int iRate = rates->ratecat[i]; + numeric_t *expeigen1 
= &expeigenRates1[iRate*4]; + numeric_t *expeigen2 = &expeigenRates2[iRate*4]; + numeric_t *f1 = GET_FREQ(p1,i,/*IN/OUT*/iFreq1); + numeric_t *f2 = GET_FREQ(p2,i,/*IN/OUT*/iFreq2); + numeric_t *fOut = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); + assert(fOut != NULL); + + if (f1 == NULL) { + f1 = &transmat->codeFreq[p1->codes[i]][0]; /* codeFreq includes an entry for NOCODE */ + double w = p1->weights[i]; + if (w > 0.0 && w < 1.0) { + for (j = 0; j < 4; j++) + f1mix[j] = w * f1[j] + (1.0-w) * fGap[j]; + f1 = f1mix; + } + } + if (f2 == NULL) { + f2 = &transmat->codeFreq[p2->codes[i]][0]; + double w = p2->weights[i]; + if (w > 0.0 && w < 1.0) { + for (j = 0; j < 4; j++) + f2mix[j] = w * f2[j] + (1.0-w) * fGap[j]; + f2 = f2mix; + } + } + numeric_t fMult1[4] ALIGNED; /* rotated1 * expeigen1 */ + numeric_t fMult2[4] ALIGNED; /* rotated2 * expeigen2 */ +#if 0 /* SSE3 is slower */ + vector_multiply(f1, expeigen1, 4, /*OUT*/fMult1); + vector_multiply(f2, expeigen2, 4, /*OUT*/fMult2); +#else + for (j = 0; j < 4; j++) { + fMult1[j] = f1[j]*expeigen1[j]; + fMult2[j] = f2[j]*expeigen2[j]; + } +#endif + numeric_t fPost[4] ALIGNED; /* in unrotated space */ + for (j = 0; j < 4; j++) { +#if 0 /* SSE3 is slower */ + fPost[j] = vector_dot_product_rot(fMult1, fMult2, &transmat->codeFreq[j][0], 4) + * transmat->statinv[j]; */ +#else + double out1 = 0; + double out2 = 0; + for (k = 0; k < 4; k++) { + out1 += fMult1[k] * transmat->codeFreq[j][k]; + out2 += fMult2[k] * transmat->codeFreq[j][k]; + } + fPost[j] = out1*out2*transmat->statinv[j]; +#endif + } + double fPostTot = 0; + for (j = 0; j < 4; j++) + fPostTot += fPost[j]; + assert(fPostTot > fPostTotalTolerance); + double fPostInv = 1.0/fPostTot; +#if 0 /* SSE3 is slower */ + vector_multiply_by(fPost, fPostInv, 4); +#else + for (j = 0; j < 4; j++) + fPost[j] *= fPostInv; +#endif + + /* and finally, divide by stat again & rotate to give the new frequencies */ + matrixt_by_vector4(transmat->eigeninvT, fPost, /*OUT*/fOut); + } /* end loop over 
position i */ + } else if (nCodes == 20) { /* matrix model on amino acids */ + numeric_t *fGap = &transmat->codeFreq[NOCODE][0]; + numeric_t f1mix[20] ALIGNED; + numeric_t f2mix[20] ALIGNED; + + for (i=0; i < nPos; i++) { + if (p1->codes[i] == NOCODE && p2->codes[i] == NOCODE + && p1->weights[i] == 0 && p2->weights[i] == 0) { + /* aligning gap with gap -- just output a gap + out->codes[i] is already set to NOCODE so need not set that */ + out->weights[i] = 0; + continue; + } + int iRate = rates->ratecat[i]; + numeric_t *expeigen1 = &expeigenRates1[iRate*20]; + numeric_t *expeigen2 = &expeigenRates2[iRate*20]; + numeric_t *f1 = GET_FREQ(p1,i,/*IN/OUT*/iFreq1); + numeric_t *f2 = GET_FREQ(p2,i,/*IN/OUT*/iFreq2); + numeric_t *fOut = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); + assert(fOut != NULL); + + if (f1 == NULL) { + f1 = &transmat->codeFreq[p1->codes[i]][0]; /* codeFreq includes an entry for NOCODE */ + double w = p1->weights[i]; + if (w > 0.0 && w < 1.0) { + for (j = 0; j < 20; j++) + f1mix[j] = w * f1[j] + (1.0-w) * fGap[j]; + f1 = f1mix; + } + } + if (f2 == NULL) { + f2 = &transmat->codeFreq[p2->codes[i]][0]; + double w = p2->weights[i]; + if (w > 0.0 && w < 1.0) { + for (j = 0; j < 20; j++) + f2mix[j] = w * f2[j] + (1.0-w) * fGap[j]; + f2 = f2mix; + } + } + numeric_t fMult1[20] ALIGNED; /* rotated1 * expeigen1 */ + numeric_t fMult2[20] ALIGNED; /* rotated2 * expeigen2 */ + vector_multiply(f1, expeigen1, 20, /*OUT*/fMult1); + vector_multiply(f2, expeigen2, 20, /*OUT*/fMult2); + numeric_t fPost[20] ALIGNED; /* in unrotated space */ + for (j = 0; j < 20; j++) { + numeric_t value = vector_dot_product_rot(fMult1, fMult2, &transmat->codeFreq[j][0], 20) + * transmat->statinv[j]; + /* Added this logic try to avoid rare numerical problems */ + fPost[j] = value >= 0 ? 
value : 0; + } + double fPostTot = vector_sum(fPost, 20); + assert(fPostTot > fPostTotalTolerance); + double fPostInv = 1.0/fPostTot; + vector_multiply_by(/*IN/OUT*/fPost, fPostInv, 20); + int ch = -1; /* the dominant character, if any */ + if (!exactML) { + for (j = 0; j < 20; j++) { + if (fPost[j] >= approxMLminf) { + ch = j; + break; + } + } + } + + /* now, see if we can use the approximation + fPost ~= (1 or 0) * w + nearP * (1-w) + to avoid rotating */ + double w = 0; + if (ch >= 0) { + w = (fPost[ch] - transmat->nearP[ch][ch]) / (1.0 - transmat->nearP[ch][ch]); + for (j = 0; j < 20; j++) { + if (j != ch) { + double fRough = (1.0-w) * transmat->nearP[ch][j]; + if (fRough < fPost[j] * approxMLminratio) { + ch = -1; /* give up on the approximation */ + break; + } + } + } + } + if (ch >= 0) { + nAAPosteriorRough++; + double wInvStat = w * transmat->statinv[ch]; + for (j = 0; j < 20; j++) + fOut[j] = wInvStat * transmat->codeFreq[ch][j] + (1.0-w) * transmat->nearFreq[ch][j]; + } else { + /* and finally, divide by stat again & rotate to give the new frequencies */ + nAAPosteriorExact++; + for (j = 0; j < 20; j++) + fOut[j] = vector_multiply_sum(fPost, &transmat->eigeninv[j][0], 20); + } + } /* end loop over position i */ + } else { + assert(0); /* illegal nCodes */ + } + + if (transmat != NULL) { + expeigenRates1 = myfree(expeigenRates1, sizeof(numeric_t) * rates->nRateCategories * nCodes); + expeigenRates2 = myfree(expeigenRates2, sizeof(numeric_t) * rates->nRateCategories * nCodes); + } + + /* Reallocate out->vectors to be the right size */ + out->nVectors = iFreqOut; + if (out->nVectors == 0) + out->vectors = (numeric_t*)myfree(out->vectors, sizeof(numeric_t)*nCodes*nPos); + else + out->vectors = (numeric_t*)myrealloc(out->vectors, + /*OLDSIZE*/sizeof(numeric_t)*nCodes*nPos, + /*NEWSIZE*/sizeof(numeric_t)*nCodes*out->nVectors, + /*copy*/true); /* try to save space */ + nProfileFreqAlloc += out->nVectors; + nProfileFreqAvoid += nPos - out->nVectors; + + /* 
compute total constraints */ + for (i = 0; i < nConstraints; i++) { + out->nOn[i] = p1->nOn[i] + p2->nOn[i]; + out->nOff[i] = p1->nOff[i] + p2->nOff[i]; + } + nPosteriorCompute++; + return(out); +} + +double *PSameVector(double length, rates_t *rates) { + double *pSame = mymalloc(sizeof(double) * rates->nRateCategories); + int iRate; + for (iRate = 0; iRate < rates->nRateCategories; iRate++) + pSame[iRate] = 0.25 + 0.75 * exp((-4.0/3.0) * fabs(length*rates->rates[iRate])); + return(pSame); +} + +double *PDiffVector(double *pSame, rates_t *rates) { + double *pDiff = mymalloc(sizeof(double) * rates->nRateCategories); + int iRate; + for (iRate = 0; iRate < rates->nRateCategories; iRate++) + pDiff[iRate] = (1.0 - pSame[iRate])/3.0; + return(pDiff); +} + +numeric_t *ExpEigenRates(double length, transition_matrix_t *transmat, rates_t *rates) { + numeric_t *expeigen = mymalloc(sizeof(numeric_t) * nCodes * rates->nRateCategories); + int iRate, j; + for (iRate = 0; iRate < rates->nRateCategories; iRate++) { + for (j = 0; j < nCodes; j++) { + double relLen = length * rates->rates[iRate]; + /* very short branch lengths lead to numerical problems so prevent them */ + if (relLen < MLMinRelBranchLength) + relLen = MLMinRelBranchLength; + expeigen[iRate*nCodes + j] = exp(relLen * transmat->eigenval[j]); + } + } + return(expeigen); +} + +double PairLogLk(profile_t *pA, profile_t *pB, double length, int nPos, + /*OPTIONAL*/transition_matrix_t *transmat, + rates_t *rates, + /*OPTIONAL IN/OUT*/double *site_likelihoods) { + double lk = 1.0; + double loglk = 0.0; /* stores underflow of lk during the loop over positions */ + int i,j; + assert(rates != NULL && rates->nRateCategories > 0); + numeric_t *expeigenRates = NULL; + if (transmat != NULL) + expeigenRates = ExpEigenRates(length, transmat, rates); + + if (transmat == NULL) { /* Jukes-Cantor */ + assert (nCodes == 4); + double *pSame = PSameVector(length, rates); + double *pDiff = PDiffVector(pSame, rates); + + int iFreqA = 0; + int 
iFreqB = 0; + for (i = 0; i < nPos; i++) { + int iRate = rates->ratecat[i]; + double wA = pA->weights[i]; + double wB = pB->weights[i]; + int codeA = pA->codes[i]; + int codeB = pB->codes[i]; + numeric_t *fA = GET_FREQ(pA,i,/*IN/OUT*/iFreqA); + numeric_t *fB = GET_FREQ(pB,i,/*IN/OUT*/iFreqB); + double lkAB = 0; + + if (fA == NULL && fB == NULL) { + if (codeA == NOCODE) { /* A is all gaps */ + /* gap to gap is sum(j) 0.25 * (0.25 * pSame + 0.75 * pDiff) = sum(i) 0.25*0.25 = 0.25 + gap to any character gives the same result + */ + lkAB = 0.25; + } else if (codeB == NOCODE) { /* B is all gaps */ + lkAB = 0.25; + } else if (codeA == codeB) { /* A and B match */ + lkAB = pSame[iRate] * wA*wB + 0.25 * (1-wA*wB); + } else { /* codeA != codeB */ + lkAB = pDiff[iRate] * wA*wB + 0.25 * (1-wA*wB); + } + } else if (fA == NULL) { + /* Compare codeA to profile of B */ + if (codeA == NOCODE) + lkAB = 0.25; + else + lkAB = wA * (pDiff[iRate] + fB[codeA] * (pSame[iRate]-pDiff[iRate])) + (1.0-wA) * 0.25; + /* because lkAB = wA * P(codeA->B) + (1-wA) * 0.25 + P(codeA -> B) = sum(j) P(B==j) * (j==codeA ? 
pSame : pDiff) + = sum(j) P(B==j) * pDiff + + = pDiff + P(B==codeA) * (pSame-pDiff) + */ + } else if (fB == NULL) { /* Compare codeB to profile of A */ + if (codeB == NOCODE) + lkAB = 0.25; + else + lkAB = wB * (pDiff[iRate] + fA[codeB] * (pSame[iRate]-pDiff[iRate])) + (1.0-wB) * 0.25; + } else { /* both are full profiles */ + for (j = 0; j < 4; j++) + lkAB += fB[j] * (fA[j] * pSame[iRate] + (1-fA[j])* pDiff[iRate]); /* P(A|B) */ + } + assert(lkAB > 0); + lk *= lkAB; + while (lk < LkUnderflow) { + lk *= LkUnderflowInv; + loglk -= LogLkUnderflow; + } + if (site_likelihoods != NULL) + site_likelihoods[i] *= lkAB; + } + pSame = myfree(pSame, sizeof(double) * rates->nRateCategories); + pDiff = myfree(pDiff, sizeof(double) * rates->nRateCategories); + } else if (nCodes == 4) { /* matrix model on nucleotides */ + int iFreqA = 0; + int iFreqB = 0; + numeric_t fAmix[4], fBmix[4]; + numeric_t *fGap = &transmat->codeFreq[NOCODE][0]; + + for (i = 0; i < nPos; i++) { + int iRate = rates->ratecat[i]; + numeric_t *expeigen = &expeigenRates[iRate*4]; + double wA = pA->weights[i]; + double wB = pB->weights[i]; + if (wA == 0 && wB == 0 && pA->codes[i] == NOCODE && pB->codes[i] == NOCODE) { + /* Likelihood of A vs B is 1, so nothing changes + Do not need to advance iFreqA or iFreqB */ + continue; + } + numeric_t *fA = GET_FREQ(pA,i,/*IN/OUT*/iFreqA); + numeric_t *fB = GET_FREQ(pB,i,/*IN/OUT*/iFreqB); + if (fA == NULL) + fA = &transmat->codeFreq[pA->codes[i]][0]; + if (wA > 0.0 && wA < 1.0) { + for (j = 0; j < 4; j++) + fAmix[j] = wA*fA[j] + (1.0-wA)*fGap[j]; + fA = fAmix; + } + if (fB == NULL) + fB = &transmat->codeFreq[pB->codes[i]][0]; + if (wB > 0.0 && wB < 1.0) { + for (j = 0; j < 4; j++) + fBmix[j] = wB*fB[j] + (1.0-wB)*fGap[j]; + fB = fBmix; + } + /* SSE3 instructions do not speed this step up: + numeric_t lkAB = vector_multiply3_sum(expeigen, fA, fB); */ + // dsp this is where check for <=0 was added in 2.1.1.LG + double lkAB = 0; + for (j = 0; j < 4; j++) + lkAB += 
expeigen[j]*fA[j]*fB[j]; + assert(lkAB > 0); + if (site_likelihoods != NULL) + site_likelihoods[i] *= lkAB; + lk *= lkAB; + while (lk < LkUnderflow) { + lk *= LkUnderflowInv; + loglk -= LogLkUnderflow; + } + while (lk > LkUnderflowInv) { + lk *= LkUnderflow; + loglk += LogLkUnderflow; + } + } + } else if (nCodes == 20) { /* matrix model on amino acids */ + int iFreqA = 0; + int iFreqB = 0; + numeric_t fAmix[20], fBmix[20]; + numeric_t *fGap = &transmat->codeFreq[NOCODE][0]; + + for (i = 0; i < nPos; i++) { + int iRate = rates->ratecat[i]; + numeric_t *expeigen = &expeigenRates[iRate*20]; + double wA = pA->weights[i]; + double wB = pB->weights[i]; + if (wA == 0 && wB == 0 && pA->codes[i] == NOCODE && pB->codes[i] == NOCODE) { + /* Likelihood of A vs B is 1, so nothing changes + Do not need to advance iFreqA or iFreqB */ + continue; + } + numeric_t *fA = GET_FREQ(pA,i,/*IN/OUT*/iFreqA); + numeric_t *fB = GET_FREQ(pB,i,/*IN/OUT*/iFreqB); + if (fA == NULL) + fA = &transmat->codeFreq[pA->codes[i]][0]; + if (wA > 0.0 && wA < 1.0) { + for (j = 0; j < 20; j++) + fAmix[j] = wA*fA[j] + (1.0-wA)*fGap[j]; + fA = fAmix; + } + if (fB == NULL) + fB = &transmat->codeFreq[pB->codes[i]][0]; + if (wB > 0.0 && wB < 1.0) { + for (j = 0; j < 20; j++) + fBmix[j] = wB*fB[j] + (1.0-wB)*fGap[j]; + fB = fBmix; + } + numeric_t lkAB = vector_multiply3_sum(expeigen, fA, fB, 20); + if (!(lkAB > 0)) { + /* If this happens, it indicates a numerical problem that needs to be addressed elsewhere, + so report all the details */ + fprintf(stderr, "# FastTree.c::PairLogLk -- numerical problem!\n"); + fprintf(stderr, "# This block is intended for loading into R\n"); + + fprintf(stderr, "lkAB = %.8g\n", lkAB); + fprintf(stderr, "Branch_length= %.8g\nalignment_position=%d\nnCodes=%d\nrate_category=%d\nrate=%.8g\n", + length, i, nCodes, iRate, rates->rates[iRate]); + fprintf(stderr, "wA=%.8g\nwB=%.8g\n", wA, wB); + fprintf(stderr, "codeA = %d\ncodeB = %d\n", pA->codes[i], pB->codes[i]); + + fprintf(stderr, 
"fA = c("); + for (j = 0; j < nCodes; j++) fprintf(stderr, "%s %.8g", j==0?"":",", fA[j]); + fprintf(stderr,")\n"); + + fprintf(stderr, "fB = c("); + for (j = 0; j < nCodes; j++) fprintf(stderr, "%s %.8g", j==0?"":",", fB[j]); + fprintf(stderr,")\n"); + + fprintf(stderr, "stat = c("); + for (j = 0; j < nCodes; j++) fprintf(stderr, "%s %.8g", j==0?"":",", transmat->stat[j]); + fprintf(stderr,")\n"); + + fprintf(stderr, "eigenval = c("); + for (j = 0; j < nCodes; j++) fprintf(stderr, "%s %.8g", j==0?"":",", transmat->eigenval[j]); + fprintf(stderr,")\n"); + + fprintf(stderr, "expeigen = c("); + for (j = 0; j < nCodes; j++) fprintf(stderr, "%s %.8g", j==0?"":",", expeigen[j]); + fprintf(stderr,")\n"); + + int k; + fprintf(stderr, "codeFreq = c("); + for (j = 0; j < nCodes; j++) for(k = 0; k < nCodes; k++) fprintf(stderr, "%s %.8g", j==0 && k==0?"":",", + transmat->codeFreq[j][k]); + fprintf(stderr,")\n"); + + fprintf(stderr, "eigeninv = c("); + for (j = 0; j < nCodes; j++) for(k = 0; k < nCodes; k++) fprintf(stderr, "%s %.8g", j==0 && k==0?"":",", + transmat->eigeninv[j][k]); + fprintf(stderr,")\n"); + + fprintf(stderr, "# Transform into matrices and compute un-rotated vectors for profiles A and B\n"); + fprintf(stderr, "codeFreq = matrix(codeFreq,nrow=20);\n"); + fprintf(stderr, "eigeninv = matrix(eigeninv,nrow=20);\n"); + fputs("unrotA = stat * (eigeninv %*% fA)\n", stderr); + fputs("unrotB = stat * (eigeninv %*% fB)\n", stderr); + fprintf(stderr,"# End of R block\n"); + } + assert(lkAB > 0); + if (site_likelihoods != NULL) + site_likelihoods[i] *= lkAB; + lk *= lkAB; + while (lk < LkUnderflow) { + lk *= LkUnderflowInv; + loglk -= LogLkUnderflow; + } + while (lk > LkUnderflowInv) { + lk *= LkUnderflow; + loglk += LogLkUnderflow; + } + } + } else { + assert(0); /* illegal nCodes */ + } + if (transmat != NULL) + expeigenRates = myfree(expeigenRates, sizeof(numeric_t) * rates->nRateCategories * 20); + loglk += log(lk); + nLkCompute++; + return(loglk); +} + +double 
MLQuartetLogLk(profile_t *pA, profile_t *pB, profile_t *pC, profile_t *pD, + int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, + /*IN*/double branch_lengths[5], + /*OPTIONAL OUT*/double *site_likelihoods) { + profile_t *pAB = PosteriorProfile(pA, pB, + branch_lengths[0], branch_lengths[1], + transmat, + rates, + nPos, /*nConstraints*/0); + profile_t *pCD = PosteriorProfile(pC, pD, + branch_lengths[2], branch_lengths[3], + transmat, + rates, + nPos, /*nConstraints*/0); + if (site_likelihoods != NULL) { + int i; + for (i = 0; i < nPos; i++) + site_likelihoods[i] = 1.0; + } + /* Roughly, P(A,B,C,D) = P(A) P(B|A) P(D|C) P(AB | CD) */ + double loglk = PairLogLk(pA, pB, branch_lengths[0]+branch_lengths[1], + nPos, transmat, rates, /*OPTIONAL IN/OUT*/site_likelihoods) + + PairLogLk(pC, pD, branch_lengths[2]+branch_lengths[3], + nPos, transmat, rates, /*OPTIONAL IN/OUT*/site_likelihoods) + + PairLogLk(pAB, pCD, branch_lengths[4], + nPos, transmat, rates, /*OPTIONAL IN/OUT*/site_likelihoods); + pAB = FreeProfile(pAB, nPos, /*nConstraints*/0); + pCD = FreeProfile(pCD, nPos, /*nConstraints*/0); + return(loglk); +} + +double PairNegLogLk(double x, void *data) { + quartet_opt_t *qo = (quartet_opt_t *)data; + assert(qo != NULL); + assert(qo->pair1 != NULL && qo->pair2 != NULL); + qo->nEval++; + double loglk = PairLogLk(qo->pair1, qo->pair2, x, qo->nPos, qo->transmat, qo->rates, /*site_lk*/NULL); + assert(loglk < 1e100); + if (verbose > 5) + fprintf(stderr, "PairLogLk(%.4f) = %.4f\n", x, loglk); + return(-loglk); +} + +double MLQuartetOptimize(profile_t *pA, profile_t *pB, profile_t *pC, profile_t *pD, + int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, + /*IN/OUT*/double branch_lengths[5], + /*OPTIONAL OUT*/bool *pStarTest, + /*OPTIONAL OUT*/double *site_likelihoods) { + int j; + double start_length[5]; + for (j = 0; j < 5; j++) { + start_length[j] = branch_lengths[j]; + if (branch_lengths[j] < MLMinBranchLength) + branch_lengths[j] = 
MLMinBranchLength; + } + quartet_opt_t qopt = { nPos, transmat, rates, /*nEval*/0, + /*pair1*/NULL, /*pair2*/NULL }; + double f2x, negloglk; + + if (pStarTest != NULL) + *pStarTest = false; + + /* First optimize internal branch, then branch to A, B, C, D, in turn + May use star test to quit after internal branch + */ + profile_t *pAB = PosteriorProfile(pA, pB, + branch_lengths[LEN_A], branch_lengths[LEN_B], + transmat, rates, nPos, /*nConstraints*/0); + profile_t *pCD = PosteriorProfile(pC, pD, + branch_lengths[LEN_C], branch_lengths[LEN_D], + transmat, rates, nPos, /*nConstraints*/0); + qopt.pair1 = pAB; + qopt.pair2 = pCD; + branch_lengths[LEN_I] = onedimenmin(/*xmin*/MLMinBranchLength, + /*xguess*/branch_lengths[LEN_I], + /*xmax*/6.0, + PairNegLogLk, + /*data*/&qopt, + /*ftol*/MLFTolBranchLength, + /*atol*/MLMinBranchLengthTolerance, + /*OUT*/&negloglk, + /*OUT*/&f2x); + + if (pStarTest != NULL) { + assert(site_likelihoods == NULL); + double loglkStar = -PairNegLogLk(MLMinBranchLength, &qopt); + if (loglkStar < -negloglk - closeLogLkLimit) { + *pStarTest = true; + double off = PairLogLk(pA, pB, + branch_lengths[LEN_A] + branch_lengths[LEN_B], + qopt.nPos, qopt.transmat, qopt.rates, /*site_lk*/NULL) + + PairLogLk(pC, pD, + branch_lengths[LEN_C] + branch_lengths[LEN_D], + qopt.nPos, qopt.transmat, qopt.rates, /*site_lk*/NULL); + pAB = FreeProfile(pAB, nPos, /*nConstraints*/0); + pCD = FreeProfile(pCD, nPos, /*nConstraints*/0); + return (-negloglk + off); + } + } + pAB = FreeProfile(pAB, nPos, /*nConstraints*/0); + profile_t *pBCD = PosteriorProfile(pB, pCD, + branch_lengths[LEN_B], branch_lengths[LEN_I], + transmat, rates, nPos, /*nConstraints*/0); + qopt.pair1 = pA; + qopt.pair2 = pBCD; + branch_lengths[LEN_A] = onedimenmin(/*xmin*/MLMinBranchLength, + /*xguess*/branch_lengths[LEN_A], + /*xmax*/6.0, + PairNegLogLk, + /*data*/&qopt, + /*ftol*/MLFTolBranchLength, + /*atol*/MLMinBranchLengthTolerance, + /*OUT*/&negloglk, + /*OUT*/&f2x); + pBCD = FreeProfile(pBCD, 
nPos, /*nConstraints*/0); + profile_t *pACD = PosteriorProfile(pA, pCD, + branch_lengths[LEN_A], branch_lengths[LEN_I], + transmat, rates, nPos, /*nConstraints*/0); + qopt.pair1 = pB; + qopt.pair2 = pACD; + branch_lengths[LEN_B] = onedimenmin(/*xmin*/MLMinBranchLength, + /*xguess*/branch_lengths[LEN_B], + /*xmax*/6.0, + PairNegLogLk, + /*data*/&qopt, + /*ftol*/MLFTolBranchLength, + /*atol*/MLMinBranchLengthTolerance, + /*OUT*/&negloglk, + /*OUT*/&f2x); + pACD = FreeProfile(pACD, nPos, /*nConstraints*/0); + pCD = FreeProfile(pCD, nPos, /*nConstraints*/0); + pAB = PosteriorProfile(pA, pB, + branch_lengths[LEN_A], branch_lengths[LEN_B], + transmat, rates, nPos, /*nConstraints*/0); + profile_t *pABD = PosteriorProfile(pAB, pD, + branch_lengths[LEN_I], branch_lengths[LEN_D], + transmat, rates, nPos, /*nConstraints*/0); + qopt.pair1 = pC; + qopt.pair2 = pABD; + branch_lengths[LEN_C] = onedimenmin(/*xmin*/MLMinBranchLength, + /*xguess*/branch_lengths[LEN_C], + /*xmax*/6.0, + PairNegLogLk, + /*data*/&qopt, + /*ftol*/MLFTolBranchLength, + /*atol*/MLMinBranchLengthTolerance, + /*OUT*/&negloglk, + /*OUT*/&f2x); + pABD = FreeProfile(pABD, nPos, /*nConstraints*/0); + profile_t *pABC = PosteriorProfile(pAB, pC, + branch_lengths[LEN_I], branch_lengths[LEN_C], + transmat, rates, nPos, /*nConstraints*/0); + qopt.pair1 = pD; + qopt.pair2 = pABC; + branch_lengths[LEN_D] = onedimenmin(/*xmin*/MLMinBranchLength, + /*xguess*/branch_lengths[LEN_D], + /*xmax*/6.0, + PairNegLogLk, + /*data*/&qopt, + /*ftol*/MLFTolBranchLength, + /*atol*/MLMinBranchLengthTolerance, + /*OUT*/&negloglk, + /*OUT*/&f2x); + + /* Compute the total quartet likelihood + PairLogLk(ABC,D) + PairLogLk(AB,C) + PairLogLk(A,B) + */ + double loglkABCvsD = -negloglk; + if (site_likelihoods) { + for (j = 0; j < nPos; j++) + site_likelihoods[j] = 1.0; + PairLogLk(pABC, pD, branch_lengths[LEN_D], + qopt.nPos, qopt.transmat, qopt.rates, /*IN/OUT*/site_likelihoods); + } + double quartetloglk = loglkABCvsD + + PairLogLk(pAB, pC, 
branch_lengths[LEN_I] + branch_lengths[LEN_C], + qopt.nPos, qopt.transmat, qopt.rates, + /*IN/OUT*/site_likelihoods) + + PairLogLk(pA, pB, branch_lengths[LEN_A] + branch_lengths[LEN_B], + qopt.nPos, qopt.transmat, qopt.rates, + /*IN/OUT*/site_likelihoods); + + pABC = FreeProfile(pABC, nPos, /*nConstraints*/0); + pAB = FreeProfile(pAB, nPos, /*nConstraints*/0); + + if (verbose > 3) { + double loglkStart = MLQuartetLogLk(pA, pB, pC, pD, nPos, transmat, rates, start_length, /*site_lk*/NULL); + fprintf(stderr, "Optimize loglk from %.5f to %.5f eval %d lengths from\n" + " %.5f %.5f %.5f %.5f %.5f to\n" + " %.5f %.5f %.5f %.5f %.5f\n", + loglkStart, quartetloglk, qopt.nEval, + start_length[0], start_length[1], start_length[2], start_length[3], start_length[4], + branch_lengths[0], branch_lengths[1], branch_lengths[2], branch_lengths[3], branch_lengths[4]); + } + return(quartetloglk); +} + +nni_t MLQuartetNNI(profile_t *profiles[4], + /*OPTIONAL*/transition_matrix_t *transmat, + rates_t *rates, + int nPos, int nConstraints, + /*OUT*/double criteria[3], /* The three potential quartet log-likelihoods */ + /*IN/OUT*/numeric_t len[5], + bool bFast) +{ + int i; + double lenABvsCD[5] = {len[LEN_A], len[LEN_B], len[LEN_C], len[LEN_D], len[LEN_I]}; + double lenACvsBD[5] = {len[LEN_A], len[LEN_C], len[LEN_B], len[LEN_D], len[LEN_I]}; /* Swap B & C */ + double lenADvsBC[5] = {len[LEN_A], len[LEN_D], len[LEN_C], len[LEN_B], len[LEN_I]}; /* Swap B & D */ + bool bConsiderAC = true; + bool bConsiderAD = true; + int iRound; + int nRounds = mlAccuracy < 2 ? 
2 : mlAccuracy; + double penalty[3]; + QuartetConstraintPenalties(profiles, nConstraints, /*OUT*/penalty); + if (penalty[ABvsCD] > penalty[ACvsBD] || penalty[ABvsCD] > penalty[ADvsBC]) + bFast = false; +#ifdef OPENMP + bFast = false; /* turn off star topology test */ +#endif + + for (iRound = 0; iRound < nRounds; iRound++) { + bool bStarTest = false; + { +#ifdef OPENMP + #pragma omp parallel + #pragma omp sections +#endif + { +#ifdef OPENMP + #pragma omp section +#endif + { + criteria[ABvsCD] = MLQuartetOptimize(profiles[0], profiles[1], profiles[2], profiles[3], + nPos, transmat, rates, + /*IN/OUT*/lenABvsCD, + bFast ? &bStarTest : NULL, + /*site_likelihoods*/NULL) + - penalty[ABvsCD]; /* subtract penalty b/c we are trying to maximize log lk */ + } + +#ifdef OPENMP + #pragma omp section +#else + if (bStarTest) { + nStarTests++; + criteria[ACvsBD] = -1e20; + criteria[ADvsBC] = -1e20; + len[LEN_I] = lenABvsCD[LEN_I]; + return(ABvsCD); + } +#endif + { + if (bConsiderAC) + criteria[ACvsBD] = MLQuartetOptimize(profiles[0], profiles[2], profiles[1], profiles[3], + nPos, transmat, rates, + /*IN/OUT*/lenACvsBD, NULL, /*site_likelihoods*/NULL) + - penalty[ACvsBD]; + } + +#ifdef OPENMP + #pragma omp section +#endif + { + if (bConsiderAD) + criteria[ADvsBC] = MLQuartetOptimize(profiles[0], profiles[3], profiles[2], profiles[1], + nPos, transmat, rates, + /*IN/OUT*/lenADvsBC, NULL, /*site_likelihoods*/NULL) + - penalty[ADvsBC]; + } + } + } /* end parallel sections */ + if (mlAccuracy < 2) { + /* If clearly worse then ABvsCD, or have short internal branch length and worse, then + give up */ + if (criteria[ACvsBD] < criteria[ABvsCD] - closeLogLkLimit + || (lenACvsBD[LEN_I] <= 2.0*MLMinBranchLength && criteria[ACvsBD] < criteria[ABvsCD])) + bConsiderAC = false; + if (criteria[ADvsBC] < criteria[ABvsCD] - closeLogLkLimit + || (lenADvsBC[LEN_I] <= 2.0*MLMinBranchLength && criteria[ADvsBC] < criteria[ABvsCD])) + bConsiderAD = false; + if (!bConsiderAC && !bConsiderAD) + break; + /* 
If clearly better than either alternative, then give up + (Comparison is probably biased in favor of ABvsCD anyway) */ + if (criteria[ACvsBD] > criteria[ABvsCD] + closeLogLkLimit + && criteria[ACvsBD] > criteria[ADvsBC] + closeLogLkLimit) + break; + if (criteria[ADvsBC] > criteria[ABvsCD] + closeLogLkLimit + && criteria[ADvsBC] > criteria[ACvsBD] + closeLogLkLimit) + break; + } + } /* end loop over rounds */ + + if (verbose > 2) { + fprintf(stderr, "Optimized quartet for %d rounds: ABvsCD %.5f ACvsBD %.5f ADvsBC %.5f\n", + iRound, criteria[ABvsCD], criteria[ACvsBD], criteria[ADvsBC]); + } + if (criteria[ACvsBD] > criteria[ABvsCD] && criteria[ACvsBD] > criteria[ADvsBC]) { + for (i = 0; i < 5; i++) len[i] = lenACvsBD[i]; + return(ACvsBD); + } else if (criteria[ADvsBC] > criteria[ABvsCD] && criteria[ADvsBC] > criteria[ACvsBD]) { + for (i = 0; i < 5; i++) len[i] = lenADvsBC[i]; + return(ADvsBC); + } else { + for (i = 0; i < 5; i++) len[i] = lenABvsCD[i]; + return(ABvsCD); + } +} + +double TreeLength(/*IN/OUT*/NJ_t *NJ, bool recomputeProfiles) { + if (recomputeProfiles) { + traversal_t traversal2 = InitTraversal(NJ); + int j = NJ->root; + while((j = TraversePostorder(j, NJ, /*IN/OUT*/traversal2, /*pUp*/NULL)) >= 0) { + /* nothing to do for leaves or root */ + if (j >= NJ->nSeq && j != NJ->root) + SetProfile(/*IN/OUT*/NJ, j, /*noweight*/-1.0); + } + traversal2 = FreeTraversal(traversal2,NJ); + } + UpdateBranchLengths(/*IN/OUT*/NJ); + double total_len = 0; + int iNode; + for (iNode = 0; iNode < NJ->maxnode; iNode++) + total_len += NJ->branchlength[iNode]; + return(total_len); +} + +double TreeLogLk(/*IN*/NJ_t *NJ, /*OPTIONAL OUT*/double *site_loglk) { + int i; + if (NJ->nSeq < 2) + return(0.0); + double loglk = 0.0; + double *site_likelihood = NULL; + if (site_loglk != NULL) { + site_likelihood = mymalloc(sizeof(double)*NJ->nPos); + for (i = 0; i < NJ->nPos; i++) { + site_likelihood[i] = 1.0; + site_loglk[i] = 0.0; + } + } + traversal_t traversal = InitTraversal(NJ); + 
int node = NJ->root; + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { + int nChild = NJ->child[node].nChild; + if (nChild == 0) + continue; + assert(nChild >= 2); + int *children = NJ->child[node].child; + double loglkchild = PairLogLk(NJ->profiles[children[0]], NJ->profiles[children[1]], + NJ->branchlength[children[0]]+NJ->branchlength[children[1]], + NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/site_likelihood); + loglk += loglkchild; + if (site_likelihood != NULL) { + /* prevent underflows */ + for (i = 0; i < NJ->nPos; i++) { + while(site_likelihood[i] < LkUnderflow) { + site_likelihood[i] *= LkUnderflowInv; + site_loglk[i] -= LogLkUnderflow; + } + } + } + if (verbose > 2) + fprintf(stderr, "At %d: LogLk(%d:%.4f,%d:%.4f) = %.3f\n", + node, + children[0], NJ->branchlength[children[0]], + children[1], NJ->branchlength[children[1]], + loglkchild); + if (NJ->child[node].nChild == 3) { + assert(node == NJ->root); + /* Infer the common parent of the 1st two to define the third... 
*/ + profile_t *pAB = PosteriorProfile(NJ->profiles[children[0]], + NJ->profiles[children[1]], + NJ->branchlength[children[0]], + NJ->branchlength[children[1]], + NJ->transmat, &NJ->rates, + NJ->nPos, /*nConstraints*/0); + double loglkup = PairLogLk(pAB, NJ->profiles[children[2]], + NJ->branchlength[children[2]], + NJ->nPos, NJ->transmat, &NJ->rates, + /*IN/OUT*/site_likelihood); + loglk += loglkup; + if (verbose > 2) + fprintf(stderr, "At root %d: LogLk((%d/%d),%d:%.3f) = %.3f\n", + node, children[0], children[1], children[2], + NJ->branchlength[children[2]], + loglkup); + pAB = FreeProfile(pAB, NJ->nPos, NJ->nConstraints); + } + } + traversal = FreeTraversal(traversal,NJ); + if (site_likelihood != NULL) { + for (i = 0; i < NJ->nPos; i++) { + site_loglk[i] += log(site_likelihood[i]); + } + site_likelihood = myfree(site_likelihood, sizeof(double)*NJ->nPos); + } + + /* For Jukes-Cantor, with a tree of size 4, if the children of the root are + (A,B), C, and D, then + P(ABCD) = P(A) P(B|A) P(C|AB) P(D|ABC) + + Above we compute P(B|A) P(C|AB) P(D|ABC) -- note P(B|A) is at the child of root + and P(C|AB) P(D|ABC) is at root. + + Similarly if the children of the root are C, D, and (A,B), then + P(ABCD) = P(C|D) P(A|B) P(AB|CD) P(D), and above we compute that except for P(D) + + So we need to multiply by P(A) = 0.25, so we pay log(4) at each position + (if ungapped). Each gapped position in any sequence reduces the payment by log(4) + + For JTT or GTR, we are computing P(A & B) and the posterior profiles are scaled to take + the prior into account, so we do not need any correction. + codeFreq[NOCODE] is scaled x higher so that P(-) = 1 not P(-)=1/nCodes, so gaps + do not need to be corrected either. 
+ */ + + if (nCodes == 4 && NJ->transmat == NULL) { + int nGaps = 0; + double logNCodes = log((double)nCodes); + for (i = 0; i < NJ->nPos; i++) { + int nGapsThisPos = 0; + for (node = 0; node < NJ->nSeq; node++) { + unsigned char *codes = NJ->profiles[node]->codes; + if (codes[i] == NOCODE) + nGapsThisPos++; + } + nGaps += nGapsThisPos; + if (site_loglk != NULL) { + site_loglk[i] += nGapsThisPos * logNCodes; + if (nCodes == 4 && NJ->transmat == NULL) + site_loglk[i] -= logNCodes; + } + } + loglk -= NJ->nPos * logNCodes; + loglk += nGaps * logNCodes; /* do not pay for gaps -- only Jukes-Cantor */ + } + return(loglk); +} + +void SetMLGtr(/*IN/OUT*/NJ_t *NJ, /*OPTIONAL IN*/double *freq_in, /*OPTIONAL WRITE*/FILE *fpLog) { + int i; + assert(nCodes==4); + gtr_opt_t gtr; + gtr.NJ = NJ; + gtr.fpLog = fpLog; + if (freq_in != NULL) { + for (i=0; i<4; i++) + gtr.freq[i]=freq_in[i]; + } else { + /* n[] and sum were int in FastTree 2.1.9 and earlier -- this + caused gtr analyses to fail on analyses with >2e9 positions */ + long n[4] = {1,1,1,1}; /* pseudocounts */ + for (i=0; inSeq; i++) { + unsigned char *codes = NJ->profiles[i]->codes; + int iPos; + for (iPos=0; iPosnPos; iPos++) + if (codes[iPos] < 4) + n[codes[iPos]]++; + } + long sum = n[0]+n[1]+n[2]+n[3]; + for (i=0; i<4; i++) + gtr.freq[i] = n[i]/(double)sum; + } + for (i=0; i<6; i++) + gtr.rates[i] = 1.0; + int nRounds = mlAccuracy < 2 ? 
2 : mlAccuracy; + for (i = 0; i < nRounds; i++) { + for (gtr.iRate = 0; gtr.iRate < 6; gtr.iRate++) { + ProgressReport("Optimizing GTR model, step %d of %d", i*6+gtr.iRate+1, 12, 0, 0); + double negloglk, f2x; + gtr.rates[gtr.iRate] = onedimenmin(/*xmin*/0.05, + /*xguess*/gtr.rates[gtr.iRate], + /*xmax*/20.0, + GTRNegLogLk, + /*data*/>r, + /*ftol*/0.001, + /*atol*/0.0001, + /*OUT*/&negloglk, + /*OUT*/&f2x); + } + } + /* normalize gtr so last rate is 1 -- specifying that rate separately is useful for optimization only */ + for (i = 0; i < 5; i++) + gtr.rates[i] /= gtr.rates[5]; + gtr.rates[5] = 1.0; + if (verbose) { + fprintf(stderr, "GTR Frequencies: %.4f %.4f %.4f %.4f\n", gtr.freq[0], gtr.freq[1], gtr.freq[2], gtr.freq[3]); + fprintf(stderr, "GTR rates(ac ag at cg ct gt) %.4f %.4f %.4f %.4f %.4f %.4f\n", + gtr.rates[0],gtr.rates[1],gtr.rates[2],gtr.rates[3],gtr.rates[4],gtr.rates[5]); + } + if (fpLog != NULL) { + fprintf(fpLog, "GTRFreq\t%.4f\t%.4f\t%.4f\t%.4f\n", gtr.freq[0], gtr.freq[1], gtr.freq[2], gtr.freq[3]); + fprintf(fpLog, "GTRRates\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\n", + gtr.rates[0],gtr.rates[1],gtr.rates[2],gtr.rates[3],gtr.rates[4],gtr.rates[5]); + } + myfree(NJ->transmat, sizeof(transition_matrix_t)); + NJ->transmat = CreateGTR(gtr.rates, gtr.freq); + RecomputeMLProfiles(/*IN/OUT*/NJ); + OptimizeAllBranchLengths(/*IN/OUT*/NJ); +} + +double GTRNegLogLk(double x, void *data) { + + gtr_opt_t *gtr = (gtr_opt_t*)data; + assert(nCodes == 4); + assert(gtr->NJ != NULL); + assert(gtr->iRate >= 0 && gtr->iRate < 6); + assert(x > 0); + transition_matrix_t *old = gtr->NJ->transmat; + double rates[6]; + int i; + for (i = 0; i < 6; i++) + rates[i] = gtr->rates[i]; + rates[gtr->iRate] = x; + + FILE *fpLog = gtr->fpLog; + if (fpLog) + fprintf(fpLog, "GTR_Opt\tfreq %.5f %.5f %.5f %.5f rates %.5f %.5f %.5f %.5f %.5f %.5f\n", + gtr->freq[0], gtr->freq[1], gtr->freq[2], gtr->freq[3], + rates[0], rates[1], rates[2], rates[3], rates[4], rates[5]); + + 
gtr->NJ->transmat = CreateGTR(rates, gtr->freq); + RecomputeMLProfiles(/*IN/OUT*/gtr->NJ); + double loglk = TreeLogLk(gtr->NJ, /*site_loglk*/NULL); + myfree(gtr->NJ->transmat, sizeof(transition_matrix_t)); + gtr->NJ->transmat = old; + /* Do not recompute profiles -- assume the caller will do that */ + if (verbose > 2) + fprintf(stderr, "GTR LogLk(%.5f %.5f %.5f %.5f %.5f %.5f) = %f\n", + rates[0], rates[1], rates[2], rates[3], rates[4], rates[5], loglk); + if (fpLog) + fprintf(fpLog, "GTR_Opt\tGTR LogLk(%.5f %.5f %.5f %.5f %.5f %.5f) = %f\n", + rates[0], rates[1], rates[2], rates[3], rates[4], rates[5], loglk); + return(-loglk); +} + +/* Caller must free the resulting vector of n rates */ +numeric_t *MLSiteRates(int nRateCategories) { + /* Even spacing from 1/nRate to nRate */ + double logNCat = log((double)nRateCategories); + double logMinRate = -logNCat; + double logMaxRate = logNCat; + double logd = (logMaxRate-logMinRate)/(double)(nRateCategories-1); + + numeric_t *rates = mymalloc(sizeof(numeric_t)*nRateCategories); + int i; + for (i = 0; i < nRateCategories; i++) + rates[i] = exp(logMinRate + logd*(double)i); + return(rates); +} + +double *MLSiteLikelihoodsByRate(/*IN*/NJ_t *NJ, /*IN*/numeric_t *rates, int nRateCategories) { + double *site_loglk = mymalloc(sizeof(double)*NJ->nPos*nRateCategories); + + /* save the original rates */ + assert(NJ->rates.nRateCategories > 0); + numeric_t *oldRates = NJ->rates.rates; + NJ->rates.rates = mymalloc(sizeof(numeric_t) * NJ->rates.nRateCategories); + + /* Compute site likelihood for each rate */ + int iPos; + int iRate; + for (iRate = 0; iRate < nRateCategories; iRate++) { + int i; + for (i = 0; i < NJ->rates.nRateCategories; i++) + NJ->rates.rates[i] = rates[iRate]; + RecomputeMLProfiles(/*IN/OUT*/NJ); + double loglk = TreeLogLk(NJ, /*OUT*/&site_loglk[NJ->nPos*iRate]); + ProgressReport("Site likelihoods with rate category %d of %d", iRate+1, nRateCategories, 0, 0); + if(verbose > 2) { + fprintf(stderr, "Rate %.3f Loglk 
%.3f SiteLogLk", rates[iRate], loglk); + for (iPos = 0; iPos < NJ->nPos; iPos++) + fprintf(stderr,"\t%.3f", site_loglk[NJ->nPos*iRate + iPos]); + fprintf(stderr,"\n"); + } + } + + /* restore original rates and profiles */ + myfree(NJ->rates.rates, sizeof(numeric_t) * NJ->rates.nRateCategories); + NJ->rates.rates = oldRates; + RecomputeMLProfiles(/*IN/OUT*/NJ); + + return(site_loglk); +} + +void SetMLRates(/*IN/OUT*/NJ_t *NJ, int nRateCategories) { + assert(nRateCategories > 0); + AllocRateCategories(/*IN/OUT*/&NJ->rates, 1, NJ->nPos); /* set to 1 category of rate 1 */ + if (nRateCategories == 1) { + RecomputeMLProfiles(/*IN/OUT*/NJ); + return; + } + numeric_t *rates = MLSiteRates(nRateCategories); + double *site_loglk = MLSiteLikelihoodsByRate(/*IN*/NJ, /*IN*/rates, nRateCategories); + + /* Select best rate for each site, correcting for the prior + For a prior, use a gamma distribution with shape parameter 3, scale 1/3, so + Prior(rate) ~ rate**2 * exp(-3*rate) + log Prior(rate) = C + 2 * log(rate) - 3 * rate + */ + double sumRates = 0; + int iPos; + int iRate; + for (iPos = 0; iPos < NJ->nPos; iPos++) { + int iBest = -1; + double dBest = -1e20; + for (iRate = 0; iRate < nRateCategories; iRate++) { + double site_loglk_with_prior = site_loglk[NJ->nPos*iRate + iPos] + + 2.0 * log(rates[iRate]) - 3.0 * rates[iRate]; + if (site_loglk_with_prior > dBest) { + iBest = iRate; + dBest = site_loglk_with_prior; + } + } + if (verbose > 2) + fprintf(stderr, "Selected rate category %d rate %.3f for position %d\n", + iBest, rates[iBest], iPos+1); + NJ->rates.ratecat[iPos] = iBest; + sumRates += rates[iBest]; + } + site_loglk = myfree(site_loglk, sizeof(double)*NJ->nPos*nRateCategories); + + /* Force the rates to average to 1 */ + double avgRate = sumRates/NJ->nPos; + for (iRate = 0; iRate < nRateCategories; iRate++) + rates[iRate] /= avgRate; + + /* Save the rates */ + NJ->rates.rates = myfree(NJ->rates.rates, sizeof(numeric_t) * NJ->rates.nRateCategories); + NJ->rates.rates = 
rates; + NJ->rates.nRateCategories = nRateCategories; + + /* Update profiles based on rates */ + RecomputeMLProfiles(/*IN/OUT*/NJ); + + if (verbose) { + fprintf(stderr, "Switched to using %d rate categories (CAT approximation)\n", nRateCategories); + fprintf(stderr, "Rate categories were divided by %.3f so that average rate = 1.0\n", avgRate); + fprintf(stderr, "CAT-based log-likelihoods may not be comparable across runs\n"); + if (!gammaLogLk) + fprintf(stderr, "Use -gamma for approximate but comparable Gamma(20) log-likelihoods\n"); + } +} + +double GammaLogLk(/*IN*/siteratelk_t *s, /*OPTIONAL OUT*/double *gamma_loglk_sites) { + int iRate, iPos; + double *dRate = mymalloc(sizeof(double) * s->nRateCats); + for (iRate = 0; iRate < s->nRateCats; iRate++) { + /* The probability density for each rate is approximated by the total + density between the midpoints */ + double pMin = iRate == 0 ? 0.0 : + PGamma(s->mult * (s->rates[iRate-1] + s->rates[iRate])/2.0, s->alpha); + double pMax = iRate == s->nRateCats-1 ? 
1.0 : + PGamma(s->mult * (s->rates[iRate]+s->rates[iRate+1])/2.0, s->alpha); + dRate[iRate] = pMax-pMin; + } + + double loglk = 0.0; + for (iPos = 0; iPos < s->nPos; iPos++) { + /* Prevent underflow on large trees by comparing to maximum loglk */ + double maxloglk = -1e20; + for (iRate = 0; iRate < s->nRateCats; iRate++) { + double site_loglk = s->site_loglk[s->nPos*iRate + iPos]; + if (site_loglk > maxloglk) + maxloglk = site_loglk; + } + double rellk = 0; /* likelihood scaled by exp(maxloglk) */ + for (iRate = 0; iRate < s->nRateCats; iRate++) { + double lk = exp(s->site_loglk[s->nPos*iRate + iPos] - maxloglk); + rellk += lk * dRate[iRate]; + } + double loglk_site = maxloglk + log(rellk); + loglk += loglk_site; + if (gamma_loglk_sites != NULL) + gamma_loglk_sites[iPos] = loglk_site; + } + dRate = myfree(dRate, sizeof(double)*s->nRateCats); + return(loglk); +} + +double OptAlpha(double alpha, void *data) { + siteratelk_t *s = (siteratelk_t *)data; + s->alpha = alpha; + return(-GammaLogLk(s, NULL)); +} + +double OptMult(double mult, void *data) { + siteratelk_t *s = (siteratelk_t *)data; + s->mult = mult; + return(-GammaLogLk(s, NULL)); +} + +/* Input site_loglk must be for each rate */ +double RescaleGammaLogLk(int nPos, int nRateCats, /*IN*/numeric_t *rates, /*IN*/double *site_loglk, + /*OPTIONAL*/FILE *fpLog) { + siteratelk_t s = { /*mult*/1.0, /*alpha*/1.0, nPos, nRateCats, rates, site_loglk }; + double fx, f2x; + int i; + fx = -GammaLogLk(&s, NULL); + if (verbose>2) + fprintf(stderr, "Optimizing alpha, starting at loglk %.3f\n", -fx); + for (i = 0; i < 10; i++) { + ProgressReport("Optimizing alpha round %d", i+1, 0, 0, 0); + double start = fx; + s.alpha = onedimenmin(0.01, s.alpha, 10.0, OptAlpha, &s, 0.001, 0.001, &fx, &f2x); + if (verbose>2) + fprintf(stderr, "Optimize alpha round %d to %.3f lk %.3f\n", i+1, s.alpha, -fx); + s.mult = onedimenmin(0.01, s.mult, 10.0, OptMult, &s, 0.001, 0.001, &fx, &f2x); + if (verbose>2) + fprintf(stderr, "Optimize mult round 
%d to %.3f lk %.3f\n", i+1, s.mult, -fx); + if (fx > start - 0.001) { + if (verbose>2) + fprintf(stderr, "Optimizing alpha & mult converged\n"); + break; + } + } + + double *gamma_loglk_sites = mymalloc(sizeof(double) * nPos); + double gammaLogLk = GammaLogLk(&s, /*OUT*/gamma_loglk_sites); + if (verbose > 0) + fprintf(stderr, "Gamma(%d) LogLk = %.3f alpha = %.3f rescaling lengths by %.3f\n", + nRateCats, gammaLogLk, s.alpha, 1/s.mult); + if (fpLog) { + int iPos; + int iRate; + fprintf(fpLog, "Gamma%dLogLk\t%.3f\tApproximate\tAlpha\t%.3f\tRescale\t%.3f\n", + nRateCats, gammaLogLk, s.alpha, 1/s.mult); + fprintf(fpLog, "Gamma%d\tSite\tLogLk", nRateCats); + for (iRate = 0; iRate < nRateCats; iRate++) + fprintf(fpLog, "\tr=%.3f", rates[iRate]/s.mult); + fprintf(fpLog,"\n"); + for (iPos = 0; iPos < nPos; iPos++) { + fprintf(fpLog, "Gamma%d\t%d\t%.3f", nRateCats, iPos, gamma_loglk_sites[iPos]); + for (iRate = 0; iRate < nRateCats; iRate++) + fprintf(fpLog, "\t%.3f", site_loglk[nPos*iRate + iPos]); + fprintf(fpLog,"\n"); + } + } + gamma_loglk_sites = myfree(gamma_loglk_sites, sizeof(double) * nPos); + return(1.0/s.mult); +} + +double MLPairOptimize(profile_t *pA, profile_t *pB, + int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, + /*IN/OUT*/double *branch_length) { + quartet_opt_t qopt = { nPos, transmat, rates, + /*nEval*/0, /*pair1*/pA, /*pair2*/pB }; + double f2x,negloglk; + *branch_length = onedimenmin(/*xmin*/MLMinBranchLength, + /*xguess*/*branch_length, + /*xmax*/6.0, + PairNegLogLk, + /*data*/&qopt, + /*ftol*/MLFTolBranchLength, + /*atol*/MLMinBranchLengthTolerance, + /*OUT*/&negloglk, + /*OUT*/&f2x); + return(-negloglk); /* the log likelihood */ +} + +void OptimizeAllBranchLengths(/*IN/OUT*/NJ_t *NJ) { + if (NJ->nSeq < 2) + return; + if (NJ->nSeq == 2) { + int parent = NJ->root; + assert(NJ->child[parent].nChild==2); + int nodes[2] = { NJ->child[parent].child[0], NJ->child[parent].child[1] }; + double length = 1.0; + 
(void)MLPairOptimize(NJ->profiles[nodes[0]], NJ->profiles[nodes[1]], + NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/&length); + NJ->branchlength[nodes[0]] = length/2.0; + NJ->branchlength[nodes[1]] = length/2.0; + return; + }; + + traversal_t traversal = InitTraversal(NJ); + profile_t **upProfiles = UpProfiles(NJ); + int node = NJ->root; + int iDone = 0; + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { + int nChild = NJ->child[node].nChild; + if (nChild > 0) { + if ((iDone % 100) == 0) + ProgressReport("ML Lengths %d of %d splits", iDone+1, NJ->maxnode - NJ->nSeq, 0, 0); + iDone++; + + /* optimize the branch lengths between self, parent, and children, + with two iterations + */ + assert(nChild == 2 || nChild == 3); + int nodes[3] = { NJ->child[node].child[0], + NJ->child[node].child[1], + nChild == 3 ? NJ->child[node].child[2] : node }; + profile_t *profiles[3] = { NJ->profiles[nodes[0]], + NJ->profiles[nodes[1]], + nChild == 3 ? NJ->profiles[nodes[2]] + : GetUpProfile(/*IN/OUT*/upProfiles, NJ, node, /*useML*/true) }; + int iter; + for (iter = 0; iter < 2; iter++) { + int i; + for (i = 0; i < 3; i++) { + profile_t *pA = profiles[i]; + int b1 = (i+1) % 3; + int b2 = (i+2) % 3; + profile_t *pB = PosteriorProfile(profiles[b1], profiles[b2], + NJ->branchlength[nodes[b1]], + NJ->branchlength[nodes[b2]], + NJ->transmat, &NJ->rates, NJ->nPos, /*nConstraints*/0); + double len = NJ->branchlength[nodes[i]]; + if (len < MLMinBranchLength) + len = MLMinBranchLength; + (void)MLPairOptimize(pA, pB, NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/&len); + NJ->branchlength[nodes[i]] = len; + pB = FreeProfile(pB, NJ->nPos, /*nConstraints*/0); + if (verbose>3) + fprintf(stderr, "Optimize length for %d to %.3f\n", + nodes[i], NJ->branchlength[nodes[i]]); + } + } + if (node != NJ->root) { + RecomputeProfile(/*IN/OUT*/NJ, /*IN/OUT*/upProfiles, node, /*useML*/true); + DeleteUpProfile(upProfiles, NJ, node); + } + } + } + traversal = 
FreeTraversal(traversal,NJ); + upProfiles = FreeUpProfiles(upProfiles,NJ); +} + +void RecomputeMLProfiles(/*IN/OUT*/NJ_t *NJ) { + traversal_t traversal = InitTraversal(NJ); + int node = NJ->root; + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { + if (NJ->child[node].nChild == 2) { + NJ->profiles[node] = FreeProfile(NJ->profiles[node], NJ->nPos, NJ->nConstraints); + int *children = NJ->child[node].child; + NJ->profiles[node] = PosteriorProfile(NJ->profiles[children[0]], NJ->profiles[children[1]], + NJ->branchlength[children[0]], NJ->branchlength[children[1]], + NJ->transmat, &NJ->rates, NJ->nPos, NJ->nConstraints); + } + } + traversal = FreeTraversal(traversal, NJ); +} + +void RecomputeProfiles(/*IN/OUT*/NJ_t *NJ, /*OPTIONAL*/distance_matrix_t *dmat) { + traversal_t traversal = InitTraversal(NJ); + int node = NJ->root; + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { + if (NJ->child[node].nChild == 2) { + int *child = NJ->child[node].child; + NJ->profiles[node] = FreeProfile(NJ->profiles[node], NJ->nPos, NJ->nConstraints); + NJ->profiles[node] = AverageProfile(NJ->profiles[child[0]], NJ->profiles[child[1]], + NJ->nPos, NJ->nConstraints, + dmat, /*unweighted*/-1.0); + } + } + traversal = FreeTraversal(traversal,NJ); +} + +int NNI(/*IN/OUT*/NJ_t *NJ, int iRound, int nRounds, bool useML, + /*IN/OUT*/nni_stats_t *stats, + /*OUT*/double *dMaxDelta) { + /* For each non-root node N, with children A,B, sibling C, and uncle D, + we compare the current topology AB|CD to the alternate topologies + AC|BD and AD|BC, by using the 4 relevant profiles. + + If useML is true, it uses quartet maximum likelihood, and it + updates branch lengths as it goes. + + If useML is false, it uses the minimum-evolution criterion with + log-corrected distances on profiles. (If logdist is false, then + the log correction is not done.) If useML is false, then NNI() + does NOT modify the branch lengths. 
+ + Regardless of whether it changes the topology, it recomputes the + profile for the node, using the pairwise distances and BIONJ-like + weightings (if bionj is set). The parent's profile has changed, + but recomputing it is not necessary because we will visit it + before we need it (we use postorder, so we may visit the sibling + and its children before we visit the parent, but we never + consider an ancestor's profile, so that is OK). When we change + the parent's profile, this alters the uncle's up-profile, so we + remove that. Finally, if the topology has changed, we remove the + up-profiles of the nodes. + + If we do an NNI during post-order traversal, the result is a bit + tricky. E.g. if we are at node N, and have visited its children A + and B but not its uncle C, and we do an NNI that swaps B & C, + then the post-order traversal will visit C, and its children, but + then on the way back up, it will skip N, as it has already + visited it. So, the profile of N will not be recomputed: any + changes beneath C will not be reflected in the profile of N, and + the profile of N will be slightly stale. This will be corrected + on the next round of NNIs. + */ + double supportThreshold = useML ? treeLogLkDelta : MEMinDelta; + int i; + *dMaxDelta = 0.0; + int nNNIThisRound = 0; + + if (NJ->nSeq <= 3) + return(0); /* nothing to do */ + if (verbose > 2) { + fprintf(stderr, "Beginning round %d of NNIs with ml? %d\n", iRound, useML?1:0); + PrintNJInternal(/*WRITE*/stderr, NJ, /*useLen*/useML && iRound > 0 ? 
1 : 0); + } + /* For each node the upProfile or NULL */ + profile_t **upProfiles = UpProfiles(NJ); + + traversal_t traversal = InitTraversal(NJ); + + /* Identify nodes we can skip traversing into */ + int node; + if (fastNNI) { + for (node = 0; node < NJ->maxnode; node++) { + if (node != NJ->root + && node >= NJ->nSeq + && stats[node].age >= 2 + && stats[node].subtreeAge >= 2 + && stats[node].support > supportThreshold) { + int nodeABCD[4]; + SetupABCD(NJ, node, NULL, NULL, /*OUT*/nodeABCD, useML); + for (i = 0; i < 4; i++) + if (stats[nodeABCD[i]].age == 0 && stats[nodeABCD[i]].support > supportThreshold) + break; + if (i == 4) { + SkipTraversalInto(node, /*IN/OUT*/traversal); + if (verbose > 2) + fprintf(stderr, "Skipping subtree at %d: child %d %d parent %d age %d subtreeAge %d support %.3f\n", + node, nodeABCD[0], nodeABCD[1], NJ->parent[node], + stats[node].age, stats[node].subtreeAge, stats[node].support); + } + } + } + } + + int iDone = 0; + bool bUp; + node = NJ->root; + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, &bUp)) >= 0) { + if (node < NJ->nSeq || node == NJ->root) + continue; /* nothing to do for leaves or root */ + if (bUp) { + if(verbose > 2) + fprintf(stderr, "Going up back to node %d\n", node); + /* No longer needed */ + for (i = 0; i < NJ->child[node].nChild; i++) + DeleteUpProfile(upProfiles, NJ, NJ->child[node].child[i]); + DeleteUpProfile(upProfiles, NJ, node); + RecomputeProfile(/*IN/OUT*/NJ, /*IN/OUT*/upProfiles, node, useML); + continue; + } + if ((iDone % 100) == 0) { + char buf[100]; + sprintf(buf, "%s NNI round %%d of %%d, %%d of %%d splits", useML ? 
"ML" : "ME"); + if (iDone > 0) + sprintf(buf+strlen(buf), ", %d changes", nNNIThisRound); + if (nNNIThisRound > 0) + sprintf(buf+strlen(buf), " (max delta %.3f)", *dMaxDelta); + ProgressReport(buf, iRound+1, nRounds, iDone+1, NJ->maxnode - NJ->nSeq); + } + iDone++; + + profile_t *profiles[4]; + int nodeABCD[4]; + /* Note -- during the first round of ML NNIs, we use the min-evo-based branch lengths, + which may be suboptimal */ + SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, useML); + + /* Given our 4 profiles, consider doing a swap */ + int nodeA = nodeABCD[0]; + int nodeB = nodeABCD[1]; + int nodeC = nodeABCD[2]; + int nodeD = nodeABCD[3]; + + nni_t choice = ABvsCD; + + if (verbose > 2) + fprintf(stderr,"Considering NNI around %d: Swap A=%d B=%d C=%d D=up(%d) or parent %d\n", + node, nodeA, nodeB, nodeC, nodeD, NJ->parent[node]); + if (verbose > 3 && useML) { + double len[5] = { NJ->branchlength[nodeA], NJ->branchlength[nodeB], NJ->branchlength[nodeC], NJ->branchlength[nodeD], + NJ->branchlength[node] }; + for (i=0; i < 5; i++) + if (len[i] < MLMinBranchLength) + len[i] = MLMinBranchLength; + fprintf(stderr, "Starting quartet likelihood %.3f len %.3f %.3f %.3f %.3f %.3f\n", + MLQuartetLogLk(profiles[0],profiles[1],profiles[2],profiles[3],NJ->nPos,NJ->transmat,&NJ->rates,len, /*site_lk*/NULL), + len[0], len[1], len[2], len[3], len[4]); + } + + numeric_t newlength[5]; + double criteria[3]; + if (useML) { + for (i = 0; i < 4; i++) + newlength[i] = NJ->branchlength[nodeABCD[i]]; + newlength[4] = NJ->branchlength[node]; + bool bFast = mlAccuracy < 2 && stats[node].age > 0; + choice = MLQuartetNNI(profiles, NJ->transmat, &NJ->rates, NJ->nPos, NJ->nConstraints, + /*OUT*/criteria, /*IN/OUT*/newlength, bFast); + } else { + choice = ChooseNNI(profiles, NJ->distance_matrix, NJ->nPos, NJ->nConstraints, + /*OUT*/criteria); + /* invert criteria so that higher is better, as in ML case, to simplify code below */ + for (i = 0; i < 3; i++) + 
criteria[i] = -criteria[i]; + } + + if (choice == ACvsBD) { + /* swap B and C */ + ReplaceChild(/*IN/OUT*/NJ, node, nodeB, nodeC); + ReplaceChild(/*IN/OUT*/NJ, NJ->parent[node], nodeC, nodeB); + } else if (choice == ADvsBC) { + /* swap A and C */ + ReplaceChild(/*IN/OUT*/NJ, node, nodeA, nodeC); + ReplaceChild(/*IN/OUT*/NJ, NJ->parent[node], nodeC, nodeA); + } + + if (useML) { + /* update branch length for the internal branch, and of any + branches that lead to leaves, b/c those will not are not + the internal branch for NNI and would not otherwise be set. + */ + if (choice == ADvsBC) { + /* For ADvsBC, MLQuartetNNI swaps B with D, but we swap A with C */ + double length2[5] = { newlength[LEN_C], newlength[LEN_D], + newlength[LEN_A], newlength[LEN_B], + newlength[LEN_I] }; + int i; + for (i = 0; i < 5; i++) newlength[i] = length2[i]; + /* and swap A and C */ + double tmp = newlength[LEN_A]; + newlength[LEN_A] = newlength[LEN_C]; + newlength[LEN_C] = tmp; + } else if (choice == ACvsBD) { + /* swap B and C */ + double tmp = newlength[LEN_B]; + newlength[LEN_B] = newlength[LEN_C]; + newlength[LEN_C] = tmp; + } + + NJ->branchlength[node] = newlength[LEN_I]; + NJ->branchlength[nodeA] = newlength[LEN_A]; + NJ->branchlength[nodeB] = newlength[LEN_B]; + NJ->branchlength[nodeC] = newlength[LEN_C]; + NJ->branchlength[nodeD] = newlength[LEN_D]; + } + + if (verbose>2 && (choice != ABvsCD || verbose > 2)) + fprintf(stderr,"NNI around %d: Swap A=%d B=%d C=%d D=out(C) -- choose %s %s %.4f\n", + node, nodeA, nodeB, nodeC, + choice == ACvsBD ? "AC|BD" : (choice == ABvsCD ? "AB|CD" : "AD|BC"), + useML ? "delta-loglk" : "-deltaLen", + criteria[choice] - criteria[ABvsCD]); + if(verbose >= 3 && slow && useML) + fprintf(stderr, "Old tree lk -- %.4f\n", TreeLogLk(NJ, /*site_likelihoods*/NULL)); + + /* update stats, *dMaxDelta, etc. 
*/ + if (choice == ABvsCD) { + stats[node].age++; + } else { + if (useML) + nML_NNI++; + else + nNNI++; + nNNIThisRound++; + stats[node].age = 0; + stats[nodeA].age = 0; + stats[nodeB].age = 0; + stats[nodeC].age = 0; + stats[nodeD].age = 0; + } + stats[node].delta = criteria[choice] - criteria[ABvsCD]; /* 0 if ABvsCD */ + if (stats[node].delta > *dMaxDelta) + *dMaxDelta = stats[node].delta; + + /* support is improvement of score for self over better of alternatives */ + stats[node].support = 1e20; + for (i = 0; i < 3; i++) + if (choice != i && criteria[choice]-criteria[i] < stats[node].support) + stats[node].support = criteria[choice]-criteria[i]; + + /* subtreeAge is the number of rounds since self or descendent had a significant improvement */ + if (stats[node].delta > supportThreshold) + stats[node].subtreeAge = 0; + else { + stats[node].subtreeAge++; + for (i = 0; i < 2; i++) { + int child = NJ->child[node].child[i]; + if (stats[node].subtreeAge > stats[child].subtreeAge) + stats[node].subtreeAge = stats[child].subtreeAge; + } + } + + /* update profiles and free up unneeded up-profiles */ + if (choice == ABvsCD) { + /* No longer needed */ + DeleteUpProfile(upProfiles, NJ, nodeA); + DeleteUpProfile(upProfiles, NJ, nodeB); + DeleteUpProfile(upProfiles, NJ, nodeC); + RecomputeProfile(/*IN/OUT*/NJ, /*IN/OUT*/upProfiles, node, useML); + if(slow && useML) + UpdateForNNI(NJ, node, upProfiles, useML); + } else { + UpdateForNNI(NJ, node, upProfiles, useML); + } + if(verbose > 2 && slow && useML) { + /* Note we recomputed profiles back up to root already if slow */ + PrintNJInternal(/*WRITE*/stderr, NJ, /*useLen*/true); + fprintf(stderr, "New tree lk -- %.4f\n", TreeLogLk(NJ, /*site_likelihoods*/NULL)); + } + } /* end postorder traversal */ + traversal = FreeTraversal(traversal,NJ); + if (verbose>=2) { + int nUp = 0; + for (i = 0; i < NJ->maxnodes; i++) + if (upProfiles[i] != NULL) + nUp++; + fprintf(stderr, "N up profiles at end of NNI: %d\n", nUp); + } + upProfiles = 
FreeUpProfiles(upProfiles,NJ); + return(nNNIThisRound); +} + +nni_stats_t *InitNNIStats(NJ_t *NJ) { + nni_stats_t *stats = mymalloc(sizeof(nni_stats_t)*NJ->maxnode); + const int LargeAge = 1000000; + int i; + for (i = 0; i < NJ->maxnode; i++) { + stats[i].delta = 0; + stats[i].support = 0; + if (i == NJ->root || i < NJ->nSeq) { + stats[i].age = LargeAge; + stats[i].subtreeAge = LargeAge; + } else { + stats[i].age = 0; + stats[i].subtreeAge = 0; + } + } + return(stats); +} + +nni_stats_t *FreeNNIStats(nni_stats_t *stats, NJ_t *NJ) { + return(myfree(stats, sizeof(nni_stats_t)*NJ->maxnode)); +} + +int FindSPRSteps(/*IN/OUT*/NJ_t *NJ, + int nodeMove, /* the node to move multiple times */ + int nodeAround, /* sibling or parent of node to NNI to start the chain */ + /*IN/OUT*/profile_t **upProfiles, + /*OUT*/spr_step_t *steps, + int maxSteps, + bool bFirstAC) { + int iStep; + for (iStep = 0; iStep < maxSteps; iStep++) { + if (NJ->child[nodeAround].nChild != 2) + break; /* no further to go */ + + /* Consider the NNIs around nodeAround */ + profile_t *profiles[4]; + int nodeABCD[4]; + SetupABCD(NJ, nodeAround, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, /*useML*/false); + double criteria[3]; + (void) ChooseNNI(profiles, NJ->distance_matrix, NJ->nPos, NJ->nConstraints, + /*OUT*/criteria); + + /* Do & save the swap */ + spr_step_t *step = &steps[iStep]; + if (iStep == 0 ? 
bFirstAC : criteria[ACvsBD] < criteria[ADvsBC]) { + /* swap B & C to put AC together */ + step->deltaLength = criteria[ACvsBD] - criteria[ABvsCD]; + step->nodes[0] = nodeABCD[1]; + step->nodes[1] = nodeABCD[2]; + } else { + /* swap AC to put AD together */ + step->deltaLength = criteria[ADvsBC] - criteria[ABvsCD]; + step->nodes[0] = nodeABCD[0]; + step->nodes[1] = nodeABCD[2]; + } + + if (verbose>3) { + fprintf(stderr, "SPR chain step %d for %d around %d swap %d %d deltaLen %.5f\n", + iStep+1, nodeAround, nodeMove, step->nodes[0], step->nodes[1], step->deltaLength); + if (verbose>4) + PrintNJInternal(stderr, NJ, /*useLen*/false); + } + ReplaceChild(/*IN/OUT*/NJ, nodeAround, step->nodes[0], step->nodes[1]); + ReplaceChild(/*IN/OUT*/NJ, NJ->parent[nodeAround], step->nodes[1], step->nodes[0]); + UpdateForNNI(/*IN/OUT*/NJ, nodeAround, /*IN/OUT*/upProfiles, /*useML*/false); + + /* set the new nodeAround -- either parent(nodeMove) or sibling(nodeMove) -- + so that it different from current nodeAround + */ + int newAround[2] = { NJ->parent[nodeMove], Sibling(NJ, nodeMove) }; + if (NJ->parent[nodeMove] == NJ->root) + RootSiblings(NJ, nodeMove, /*OUT*/newAround); + assert(newAround[0] == nodeAround || newAround[1] == nodeAround); + assert(newAround[0] != newAround[1]); + nodeAround = newAround[newAround[0] == nodeAround ? 
1 : 0]; + } + return(iStep); +} + +void UnwindSPRStep(/*IN/OUT*/NJ_t *NJ, + /*IN*/spr_step_t *step, + /*IN/OUT*/profile_t **upProfiles) { + int parents[2]; + int i; + for (i = 0; i < 2; i++) { + assert(step->nodes[i] >= 0 && step->nodes[i] < NJ->maxnodes); + parents[i] = NJ->parent[step->nodes[i]]; + assert(parents[i] >= 0); + } + assert(parents[0] != parents[1]); + ReplaceChild(/*IN/OUT*/NJ, parents[0], step->nodes[0], step->nodes[1]); + ReplaceChild(/*IN/OUT*/NJ, parents[1], step->nodes[1], step->nodes[0]); + int iYounger = 0; + if (NJ->parent[parents[0]] == parents[1]) { + iYounger = 0; + } else { + assert(NJ->parent[parents[1]] == parents[0]); + iYounger = 1; + } + UpdateForNNI(/*IN/OUT*/NJ, parents[iYounger], /*IN/OUT*/upProfiles, /*useML*/false); +} + +/* Update the profile of node and its ancestor, and delete nearby out-profiles */ +void UpdateForNNI(/*IN/OUT*/NJ_t *NJ, int node, /*IN/OUT*/profile_t **upProfiles, + bool useML) { + int i; + if (slow) { + /* exhaustive update */ + for (i = 0; i < NJ->maxnodes; i++) + DeleteUpProfile(upProfiles, NJ, i); + + /* update profiles back to root */ + int ancestor; + for (ancestor = node; ancestor >= 0; ancestor = NJ->parent[ancestor]) + RecomputeProfile(/*IN/OUT*/NJ, upProfiles, ancestor, useML); + + /* remove any up-profiles made while doing that*/ + for (i = 0; i < NJ->maxnodes; i++) + DeleteUpProfile(upProfiles, NJ, i); + } else { + /* if fast, only update around self + note that upProfile(parent) is still OK after an NNI, but + up-profiles of uncles may not be + */ + DeleteUpProfile(upProfiles, NJ, node); + for (i = 0; i < NJ->child[node].nChild; i++) + DeleteUpProfile(upProfiles, NJ, NJ->child[node].child[i]); + assert(node != NJ->root); + int parent = NJ->parent[node]; + int neighbors[2] = { parent, Sibling(NJ, node) }; + if (parent == NJ->root) + RootSiblings(NJ, node, /*OUT*/neighbors); + DeleteUpProfile(upProfiles, NJ, neighbors[0]); + DeleteUpProfile(upProfiles, NJ, neighbors[1]); + int uncle = Sibling(NJ, 
parent); + if (uncle >= 0) + DeleteUpProfile(upProfiles, NJ, uncle); + RecomputeProfile(/*IN/OUT*/NJ, upProfiles, node, useML); + RecomputeProfile(/*IN/OUT*/NJ, upProfiles, parent, useML); + } +} + +void SPR(/*IN/OUT*/NJ_t *NJ, int maxSPRLength, int iRound, int nRounds) { + /* Given a non-root node N with children A,B, sibling C, and uncle D, + we can try to move A by doing three types of moves (4 choices): + "down" -- swap A with a child of B (if B is not a leaf) [2 choices] + "over" -- swap B with C + "up" -- swap A with D + We follow down moves with down moves, over moves with down moves, and + up moves with either up or over moves. (Other choices are just backing + up and hence useless.) + + As with NNIs, we keep track of up-profiles as we go. However, some of the regular + profiles may also become "stale" so it is a bit trickier. + + We store the traversal before we do SPRs to avoid any possible infinite loop + */ + double last_tot_len = 0.0; + if (NJ->nSeq <= 3 || maxSPRLength < 1) + return; + if (slow) + last_tot_len = TreeLength(NJ, /*recomputeLengths*/true); + int *nodeList = mymalloc(sizeof(int) * NJ->maxnodes); + int nodeListLen = 0; + traversal_t traversal = InitTraversal(NJ); + int node = NJ->root; + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { + nodeList[nodeListLen++] = node; + } + assert(nodeListLen == NJ->maxnode); + traversal = FreeTraversal(traversal,NJ); + + profile_t **upProfiles = UpProfiles(NJ); + spr_step_t *steps = mymalloc(sizeof(spr_step_t) * maxSPRLength); /* current chain of SPRs */ + + int i; + for (i = 0; i < nodeListLen; i++) { + node = nodeList[i]; + if ((i % 100) == 0) + ProgressReport("SPR round %3d of %3d, %d of %d nodes", + iRound+1, nRounds, i+1, nodeListLen); + if (node == NJ->root) + continue; /* nothing to do for root */ + /* The nodes to NNI around */ + int nodeAround[2] = { NJ->parent[node], Sibling(NJ, node) }; + if (NJ->parent[node] == NJ->root) { + /* NNI around both siblings 
instead */ + RootSiblings(NJ, node, /*OUT*/nodeAround); + } + bool bChanged = false; + int iAround; + for (iAround = 0; iAround < 2 && bChanged == false; iAround++) { + int ACFirst; + for (ACFirst = 0; ACFirst < 2 && bChanged == false; ACFirst++) { + if(verbose > 3) + PrintNJInternal(stderr, NJ, /*useLen*/false); + int chainLength = FindSPRSteps(/*IN/OUT*/NJ, node, nodeAround[iAround], + upProfiles, /*OUT*/steps, maxSPRLength, (bool)ACFirst); + double dMinDelta = 0.0; + int iCBest = -1; + double dTotDelta = 0.0; + int iC; + for (iC = 0; iC < chainLength; iC++) { + dTotDelta += steps[iC].deltaLength; + if (dTotDelta < dMinDelta) { + dMinDelta = dTotDelta; + iCBest = iC; + } + } + + if (verbose>3) { + fprintf(stderr, "SPR %s %d around %d chainLength %d of %d deltaLength %.5f swaps:", + iCBest >= 0 ? "move" : "abandoned", + node,nodeAround[iAround],iCBest+1,chainLength,dMinDelta); + for (iC = 0; iC < chainLength; iC++) + fprintf(stderr, " (%d,%d)%.4f", steps[iC].nodes[0], steps[iC].nodes[1], steps[iC].deltaLength); + fprintf(stderr,"\n"); + } + for (iC = chainLength - 1; iC > iCBest; iC--) + UnwindSPRStep(/*IN/OUT*/NJ, /*IN*/&steps[iC], /*IN/OUT*/upProfiles); + if(verbose > 3) + PrintNJInternal(stderr, NJ, /*useLen*/false); + while (slow && iCBest >= 0) { + double expected_tot_len = last_tot_len + dMinDelta; + double new_tot_len = TreeLength(NJ, /*recompute*/true); + if (verbose > 2) + fprintf(stderr, "Total branch-length is now %.4f was %.4f expected %.4f\n", + new_tot_len, last_tot_len, expected_tot_len); + if (new_tot_len < last_tot_len) { + last_tot_len = new_tot_len; + break; /* no rewinding necessary */ + } + if (verbose > 2) + fprintf(stderr, "Rewinding SPR to %d\n",iCBest); + UnwindSPRStep(/*IN/OUT*/NJ, /*IN*/&steps[iCBest], /*IN/OUT*/upProfiles); + dMinDelta -= steps[iCBest].deltaLength; + iCBest--; + } + if (iCBest >= 0) + bChanged = true; + } /* loop over which step to take at 1st NNI */ + } /* loop over which node to pivot around */ + + if (bChanged) { + 
nSPR++; /* the SPR move is OK */ + /* make sure all the profiles are OK */ + int j; + for (j = 0; j < NJ->maxnodes; j++) + DeleteUpProfile(upProfiles, NJ, j); + int ancestor; + for (ancestor = NJ->parent[node]; ancestor >= 0; ancestor = NJ->parent[ancestor]) + RecomputeProfile(/*IN/OUT*/NJ, upProfiles, ancestor, /*useML*/false); + } + } /* end loop over subtrees to prune & regraft */ + steps = myfree(steps, sizeof(spr_step_t) * maxSPRLength); + upProfiles = FreeUpProfiles(upProfiles,NJ); + nodeList = myfree(nodeList, sizeof(int) * NJ->maxnodes); +} + +void RecomputeProfile(/*IN/OUT*/NJ_t *NJ, /*IN/OUT*/profile_t **upProfiles, int node, + bool useML) { + if (node < NJ->nSeq || node == NJ->root) + return; /* no profile to compute */ + assert(NJ->child[node].nChild==2); + + profile_t *profiles[4]; + double weight = 0.5; + if (useML || !bionj) { + profiles[0] = NJ->profiles[NJ->child[node].child[0]]; + profiles[1] = NJ->profiles[NJ->child[node].child[1]]; + } else { + int nodeABCD[4]; + SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, useML); + weight = QuartetWeight(profiles, NJ->distance_matrix, NJ->nPos); + } + if (verbose>3) { + if (useML) { + fprintf(stderr, "Recompute %d from %d %d lengths %.4f %.4f\n", + node, + NJ->child[node].child[0], + NJ->child[node].child[1], + NJ->branchlength[NJ->child[node].child[0]], + NJ->branchlength[NJ->child[node].child[1]]); + } else { + fprintf(stderr, "Recompute %d from %d %d weight %.3f\n", + node, NJ->child[node].child[0], NJ->child[node].child[1], weight); + } + } + NJ->profiles[node] = FreeProfile(NJ->profiles[node], NJ->nPos, NJ->nConstraints); + if (useML) { + NJ->profiles[node] = PosteriorProfile(profiles[0], profiles[1], + NJ->branchlength[NJ->child[node].child[0]], + NJ->branchlength[NJ->child[node].child[1]], + NJ->transmat, &NJ->rates, NJ->nPos, NJ->nConstraints); + } else { + NJ->profiles[node] = AverageProfile(profiles[0], profiles[1], + NJ->nPos, NJ->nConstraints, + NJ->distance_matrix, 
weight); + } +} + +/* The BIONJ-like formula for the weight of A when building a profile for AB is + 1/2 + (avgD(B,CD) - avgD(A,CD))/(2*d(A,B)) +*/ +double QuartetWeight(profile_t *profiles[4], distance_matrix_t *dmat, int nPos) { + if (!bionj) + return(-1.0); /* even weighting */ + double d[6]; + CorrectedPairDistances(profiles, 4, dmat, nPos, /*OUT*/d); + if (d[qAB] < 0.01) + return -1.0; + double weight = 0.5 + ((d[qBC]+d[qBD])-(d[qAC]+d[qAD]))/(4*d[qAB]); + if (weight < 0) + weight = 0; + if (weight > 1) + weight = 1; + return (weight); +} + +/* Resets the children entry of parent and also the parent entry of newchild */ +void ReplaceChild(/*IN/OUT*/NJ_t *NJ, int parent, int oldchild, int newchild) { + NJ->parent[newchild] = parent; + + int iChild; + for (iChild = 0; iChild < NJ->child[parent].nChild; iChild++) { + if (NJ->child[parent].child[iChild] == oldchild) { + NJ->child[parent].child[iChild] = newchild; + return; + } + } + assert(0); +} + +/* Recomputes all branch lengths + + For internal branches such as (A,B) vs. (C,D), uses the formula + + length(AB|CD) = (d(A,C)+d(A,D)+d(B,C)+d(B,D))/4 - d(A,B)/2 - d(C,D)/2 + + (where all distances are profile distances - diameters). + + For external branches (e.g. to leaves) A vs. 
(B,C), use the formula + + length(A|BC) = (d(A,B)+d(A,C)-d(B,C))/2 +*/ +void UpdateBranchLengths(/*IN/OUT*/NJ_t *NJ) { + if (NJ->nSeq < 2) + return; + else if (NJ->nSeq == 2) { + int root = NJ->root; + int nodeA = NJ->child[root].child[0]; + int nodeB = NJ->child[root].child[1]; + besthit_t h; + ProfileDist(NJ->profiles[nodeA],NJ->profiles[nodeB], + NJ->nPos, NJ->distance_matrix, /*OUT*/&h); + if (logdist) + h.dist = LogCorrect(h.dist); + NJ->branchlength[nodeA] = h.dist/2.0; + NJ->branchlength[nodeB] = h.dist/2.0; + return; + } + + profile_t **upProfiles = UpProfiles(NJ); + traversal_t traversal = InitTraversal(NJ); + int node = NJ->root; + + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { + /* reset branch length of node (distance to its parent) */ + if (node == NJ->root) + continue; /* no branch length to set */ + if (node < NJ->nSeq) { /* a leaf */ + profile_t *profileA = NJ->profiles[node]; + profile_t *profileB = NULL; + profile_t *profileC = NULL; + + int sib = Sibling(NJ,node); + if (sib == -1) { /* at root, have 2 siblings */ + int sibs[2]; + RootSiblings(NJ, node, /*OUT*/sibs); + profileB = NJ->profiles[sibs[0]]; + profileC = NJ->profiles[sibs[1]]; + } else { + profileB = NJ->profiles[sib]; + profileC = GetUpProfile(/*IN/OUT*/upProfiles, NJ, NJ->parent[node], /*useML*/false); + } + profile_t *profiles[3] = {profileA,profileB,profileC}; + double d[3]; /*AB,AC,BC*/ + CorrectedPairDistances(profiles, 3, NJ->distance_matrix, NJ->nPos, /*OUT*/d); + /* d(A,BC) = (dAB+dAC-dBC)/2 */ + NJ->branchlength[node] = (d[0]+d[1]-d[2])/2.0; + } else { + profile_t *profiles[4]; + int nodeABCD[4]; + SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, /*useML*/false); + double d[6]; + CorrectedPairDistances(profiles, 4, NJ->distance_matrix, NJ->nPos, /*OUT*/d); + NJ->branchlength[node] = (d[qAC]+d[qAD]+d[qBC]+d[qBD])/4.0 - (d[qAB]+d[qCD])/2.0; + + /* no longer needed */ + DeleteUpProfile(upProfiles, NJ, 
nodeABCD[0]); + DeleteUpProfile(upProfiles, NJ, nodeABCD[1]); + } + } + traversal = FreeTraversal(traversal,NJ); + upProfiles = FreeUpProfiles(upProfiles,NJ); +} + +/* Pick columns for resampling, stored as returned_vector[iBoot*nPos + j] */ +int *ResampleColumns(int nPos, int nBootstrap) { + long lPos = nPos; /* to prevent overflow on very long alignments when multiplying nPos * nBootstrap */ + int *col = (int*)mymalloc(sizeof(int)*lPos*(size_t)nBootstrap); + int i; + for (i = 0; i < nBootstrap; i++) { + int j; + for (j = 0; j < nPos; j++) { + int pos = (int)(knuth_rand() * nPos); + if (pos<0) + pos = 0; + else if (pos == nPos) + pos = nPos-1; + col[i*lPos + j] = pos; + } + } + if (verbose > 5) { + for (i=0; i < 3 && i < nBootstrap; i++) { + fprintf(stderr,"Boot%d",i); + int j; + for (j = 0; j < nPos; j++) { + fprintf(stderr,"\t%d",col[i*lPos+j]); + } + fprintf(stderr,"\n"); + } + } + return(col); +} + +void ReliabilityNJ(/*IN/OUT*/NJ_t *NJ, int nBootstrap) { + /* For each non-root node N, with children A,B, parent P, sibling C, and grandparent G, + we test the reliability of the split (A,B) versus rest by comparing the profiles + of A, B, C, and the "up-profile" of P. + + Each node's upProfile is the average of its sibling's (down)-profile + its parent's up-profile + (If node's parent is the root, then there are two siblings and we don't need an up-profile) + + To save memory, we do depth-first-search down from the root, and we only keep + up-profiles for nodes in the active path. 
+ */ + if (NJ->nSeq <= 3 || nBootstrap <= 0) + return; /* nothing to do */ + int *col = ResampleColumns(NJ->nPos, nBootstrap); + + profile_t **upProfiles = UpProfiles(NJ); + traversal_t traversal = InitTraversal(NJ); + int node = NJ->root; + int iNodesDone = 0; + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { + if (node < NJ->nSeq || node == NJ->root) + continue; /* nothing to do for leaves or root */ + + if(iNodesDone > 0 && (iNodesDone % 100) == 0) + ProgressReport("Local bootstrap for %6d of %6d internal splits", iNodesDone, NJ->nSeq-3, 0, 0); + iNodesDone++; + + profile_t *profiles[4]; + int nodeABCD[4]; + SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, /*useML*/false); + + NJ->support[node] = SplitSupport(profiles[0], profiles[1], profiles[2], profiles[3], + NJ->distance_matrix, + NJ->nPos, + nBootstrap, + col); + + /* no longer needed */ + DeleteUpProfile(upProfiles, NJ, nodeABCD[0]); + DeleteUpProfile(upProfiles, NJ, nodeABCD[1]); + DeleteUpProfile(upProfiles, NJ, nodeABCD[2]); + } + traversal = FreeTraversal(traversal,NJ); + upProfiles = FreeUpProfiles(upProfiles,NJ); + col = myfree(col, sizeof(int)*((size_t)NJ->nPos)*nBootstrap); +} + +profile_t *NewProfile(int nPos, int nConstraints) { + profile_t *profile = (profile_t *)mymalloc(sizeof(profile_t)); + profile->weights = mymalloc(sizeof(numeric_t)*nPos); + profile->codes = mymalloc(sizeof(unsigned char)*nPos); + profile->vectors = NULL; + profile->nVectors = 0; + profile->codeDist = NULL; + if (nConstraints == 0) { + profile->nOn = NULL; + profile->nOff = NULL; + } else { + profile->nOn = mymalloc(sizeof(int)*nConstraints); + profile->nOff = mymalloc(sizeof(int)*nConstraints); + } + return(profile); +} + +profile_t *FreeProfile(profile_t *profile, int nPos, int nConstraints) { + if(profile==NULL) return(NULL); + myfree(profile->codes, nPos); + myfree(profile->weights, nPos); + myfree(profile->vectors, 
sizeof(numeric_t)*nCodes*profile->nVectors); + myfree(profile->codeDist, sizeof(numeric_t)*nCodes*nPos); + if (nConstraints > 0) { + myfree(profile->nOn, sizeof(int)*nConstraints); + myfree(profile->nOff, sizeof(int)*nConstraints); + } + return(myfree(profile, sizeof(profile_t))); +} + +void SetupABCD(NJ_t *NJ, int node, + /* the 4 profiles; the last one is an outprofile */ + /*OPTIONAL OUT*/profile_t *profiles[4], + /*OPTIONAL IN/OUT*/profile_t **upProfiles, + /*OUT*/int nodeABCD[4], + bool useML) { + int parent = NJ->parent[node]; + assert(parent >= 0); + assert(NJ->child[node].nChild == 2); + nodeABCD[0] = NJ->child[node].child[0]; /*A*/ + nodeABCD[1] = NJ->child[node].child[1]; /*B*/ + + profile_t *profile4 = NULL; + if (parent == NJ->root) { + int sibs[2]; + RootSiblings(NJ, node, /*OUT*/sibs); + nodeABCD[2] = sibs[0]; + nodeABCD[3] = sibs[1]; + if (profiles == NULL) + return; + profile4 = NJ->profiles[sibs[1]]; + } else { + nodeABCD[2] = Sibling(NJ,node); + assert(nodeABCD[2] >= 0); + nodeABCD[3] = parent; + if (profiles == NULL) + return; + profile4 = GetUpProfile(upProfiles,NJ,parent,useML); + } + assert(upProfiles != NULL); + int i; + for (i = 0; i < 3; i++) + profiles[i] = NJ->profiles[nodeABCD[i]]; + profiles[3] = profile4; +} + + +int Sibling(NJ_t *NJ, int node) { + int parent = NJ->parent[node]; + if (parent < 0 || parent == NJ->root) + return(-1); + int iChild; + for(iChild=0;iChildchild[parent].nChild;iChild++) { + if(NJ->child[parent].child[iChild] != node) + return (NJ->child[parent].child[iChild]); + } + assert(0); + return(-1); +} + +void RootSiblings(NJ_t *NJ, int node, /*OUT*/int sibs[2]) { + assert(NJ->parent[node] == NJ->root); + assert(NJ->child[NJ->root].nChild == 3); + + int nSibs = 0; + int iChild; + for(iChild=0; iChild < NJ->child[NJ->root].nChild; iChild++) { + int child = NJ->child[NJ->root].child[iChild]; + if (child != node) sibs[nSibs++] = child; + } + assert(nSibs==2); +} + +void TestSplitsML(/*IN/OUT*/NJ_t *NJ, 
/*OUT*/SplitCount_t *splitcount, int nBootstrap) { + const double tolerance = 1e-6; + splitcount->nBadSplits = 0; + splitcount->nConstraintViolations = 0; + splitcount->nBadBoth = 0; + splitcount->nSplits = 0; + splitcount->dWorstDeltaUnconstrained = 0; + splitcount->dWorstDeltaConstrained = 0; + + profile_t **upProfiles = UpProfiles(NJ); + traversal_t traversal = InitTraversal(NJ); + int node = NJ->root; + + int *col = nBootstrap > 0 ? ResampleColumns(NJ->nPos, nBootstrap) : NULL; + double *site_likelihoods[3]; + int choice; + for (choice = 0; choice < 3; choice++) + site_likelihoods[choice] = mymalloc(sizeof(double)*NJ->nPos); + + int iNodesDone = 0; + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { + if (node < NJ->nSeq || node == NJ->root) + continue; /* nothing to do for leaves or root */ + + if(iNodesDone > 0 && (iNodesDone % 100) == 0) + ProgressReport("ML split tests for %6d of %6d internal splits", iNodesDone, NJ->nSeq-3, 0, 0); + iNodesDone++; + + profile_t *profiles[4]; + int nodeABCD[4]; + SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, /*useML*/true); + double loglk[3]; + double len[5]; + int i; + for (i = 0; i < 4; i++) + len[i] = NJ->branchlength[nodeABCD[i]]; + len[4] = NJ->branchlength[node]; + double lenABvsCD[5] = {len[LEN_A], len[LEN_B], len[LEN_C], len[LEN_D], len[LEN_I]}; + double lenACvsBD[5] = {len[LEN_A], len[LEN_C], len[LEN_B], len[LEN_D], len[LEN_I]}; /* Swap B & C */ + double lenADvsBC[5] = {len[LEN_A], len[LEN_D], len[LEN_C], len[LEN_B], len[LEN_I]}; /* Swap B & D */ + + { +#ifdef OPENMP + #pragma omp parallel + #pragma omp sections +#endif + { +#ifdef OPENMP + #pragma omp section +#endif + { + /* Lengths are already optimized for ABvsCD */ + loglk[ABvsCD] = MLQuartetLogLk(profiles[0], profiles[1], profiles[2], profiles[3], + NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/lenABvsCD, + /*OUT*/site_likelihoods[ABvsCD]); + } + +#ifdef OPENMP + #pragma omp section +#endif + { + 
loglk[ACvsBD] = MLQuartetOptimize(profiles[0], profiles[2], profiles[1], profiles[3], + NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/lenACvsBD, /*pStarTest*/NULL, + /*OUT*/site_likelihoods[ACvsBD]); + } + +#ifdef OPENMP + #pragma omp section +#endif + { + loglk[ADvsBC] = MLQuartetOptimize(profiles[0], profiles[3], profiles[2], profiles[1], + NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/lenADvsBC, /*pStarTest*/NULL, + /*OUT*/site_likelihoods[ADvsBC]); + } + } + } + + /* do a second pass on the better alternative if it is close */ + if (loglk[ACvsBD] > loglk[ADvsBC]) { + if (mlAccuracy > 1 || loglk[ACvsBD] > loglk[ABvsCD] - closeLogLkLimit) { + loglk[ACvsBD] = MLQuartetOptimize(profiles[0], profiles[2], profiles[1], profiles[3], + NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/lenACvsBD, /*pStarTest*/NULL, + /*OUT*/site_likelihoods[ACvsBD]); + } + } else { + if (mlAccuracy > 1 || loglk[ADvsBC] > loglk[ABvsCD] - closeLogLkLimit) { + loglk[ADvsBC] = MLQuartetOptimize(profiles[0], profiles[3], profiles[2], profiles[1], + NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/lenADvsBC, /*pStarTest*/NULL, + /*OUT*/site_likelihoods[ADvsBC]); + } + } + + if (loglk[ABvsCD] >= loglk[ACvsBD] && loglk[ABvsCD] >= loglk[ADvsBC]) + choice = ABvsCD; + else if (loglk[ACvsBD] >= loglk[ABvsCD] && loglk[ACvsBD] >= loglk[ADvsBC]) + choice = ACvsBD; + else + choice = ADvsBC; + bool badSplit = loglk[choice] > loglk[ABvsCD] + treeLogLkDelta; /* ignore small changes in likelihood */ + + /* constraint penalties, indexed by nni_t (lower is better) */ + double p[3]; + QuartetConstraintPenalties(profiles, NJ->nConstraints, /*OUT*/p); + bool bBadConstr = p[ABvsCD] > p[ACvsBD] + tolerance || p[ABvsCD] > p[ADvsBC] + tolerance; + bool violateConstraint = false; + int iC; + for (iC=0; iC < NJ->nConstraints; iC++) { + if (SplitViolatesConstraint(profiles, iC)) { + violateConstraint = true; + break; + } + } + splitcount->nSplits++; + if (violateConstraint) + splitcount->nConstraintViolations++; + if 
(badSplit) + splitcount->nBadSplits++; + if (badSplit && bBadConstr) + splitcount->nBadBoth++; + if (badSplit) { + double delta = loglk[choice] - loglk[ABvsCD]; + /* If ABvsCD is favored over the more likely NNI by constraints, + then this is probably a bad split because of the constraint */ + if (p[choice] > p[ABvsCD] + tolerance) + splitcount->dWorstDeltaConstrained = MAX(delta, splitcount->dWorstDeltaConstrained); + else + splitcount->dWorstDeltaUnconstrained = MAX(delta, splitcount->dWorstDeltaUnconstrained); + } + if (nBootstrap>0) + NJ->support[node] = badSplit ? 0.0 : SHSupport(NJ->nPos, nBootstrap, col, loglk, site_likelihoods); + + /* No longer needed */ + DeleteUpProfile(upProfiles, NJ, nodeABCD[0]); + DeleteUpProfile(upProfiles, NJ, nodeABCD[1]); + DeleteUpProfile(upProfiles, NJ, nodeABCD[2]); + } + traversal = FreeTraversal(traversal,NJ); + upProfiles = FreeUpProfiles(upProfiles,NJ); + if (nBootstrap>0) + col = myfree(col, sizeof(int)*((size_t)NJ->nPos)*nBootstrap); + for (choice = 0; choice < 3; choice++) + site_likelihoods[choice] = myfree(site_likelihoods[choice], sizeof(double)*NJ->nPos); +} + + +void TestSplitsMinEvo(NJ_t *NJ, /*OUT*/SplitCount_t *splitcount) { + const double tolerance = 1e-6; + splitcount->nBadSplits = 0; + splitcount->nConstraintViolations = 0; + splitcount->nBadBoth = 0; + splitcount->nSplits = 0; + splitcount->dWorstDeltaUnconstrained = 0.0; + splitcount->dWorstDeltaConstrained = 0.0; + + profile_t **upProfiles = UpProfiles(NJ); + traversal_t traversal = InitTraversal(NJ); + int node = NJ->root; + + while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { + if (node < NJ->nSeq || node == NJ->root) + continue; /* nothing to do for leaves or root */ + + profile_t *profiles[4]; + int nodeABCD[4]; + SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, /*useML*/false); + + if (verbose>2) + fprintf(stderr,"Testing Split around %d: A=%d B=%d C=%d D=up(%d) or node parent %d\n", + node, 
nodeABCD[0], nodeABCD[1], nodeABCD[2], nodeABCD[3], NJ->parent[node]); + + double d[6]; /* distances, perhaps log-corrected distances, no constraint penalties */ + CorrectedPairDistances(profiles, 4, NJ->distance_matrix, NJ->nPos, /*OUT*/d); + + /* alignment-based scores for each split (lower is better) */ + double sABvsCD = d[qAB] + d[qCD]; + double sACvsBD = d[qAC] + d[qBD]; + double sADvsBC = d[qAD] + d[qBC]; + + /* constraint penalties, indexed by nni_t (lower is better) */ + double p[3]; + QuartetConstraintPenalties(profiles, NJ->nConstraints, /*OUT*/p); + + int nConstraintsViolated = 0; + int iC; + for (iC=0; iC < NJ->nConstraints; iC++) { + if (SplitViolatesConstraint(profiles, iC)) { + nConstraintsViolated++; + if (verbose > 2) { + double penalty[3] = {0.0,0.0,0.0}; + (void)QuartetConstraintPenaltiesPiece(profiles, iC, /*OUT*/penalty); + fprintf(stderr, "Violate constraint %d at %d (children %d %d) penalties %.3f %.3f %.3f %d/%d %d/%d %d/%d %d/%d\n", + iC, node, NJ->child[node].child[0], NJ->child[node].child[1], + penalty[ABvsCD], penalty[ACvsBD], penalty[ADvsBC], + profiles[0]->nOn[iC], profiles[0]->nOff[iC], + profiles[1]->nOn[iC], profiles[1]->nOff[iC], + profiles[2]->nOn[iC], profiles[2]->nOff[iC], + profiles[3]->nOn[iC], profiles[3]->nOff[iC]); + } + } + } + + double delta = sABvsCD - MIN(sACvsBD,sADvsBC); + bool bBadDist = delta > tolerance; + bool bBadConstr = p[ABvsCD] > p[ACvsBD] + tolerance || p[ABvsCD] > p[ADvsBC] + tolerance; + + splitcount->nSplits++; + if (bBadDist) { + nni_t choice = sACvsBD < sADvsBC ? 
ACvsBD : ADvsBC; + /* If ABvsCD is favored over the shorter NNI by constraints, + then this is probably a bad split because of the constraint */ + if (p[choice] > p[ABvsCD] + tolerance) + splitcount->dWorstDeltaConstrained = MAX(delta, splitcount->dWorstDeltaConstrained); + else + splitcount->dWorstDeltaUnconstrained = MAX(delta, splitcount->dWorstDeltaUnconstrained); + } + + if (nConstraintsViolated > 0) + splitcount->nConstraintViolations++; /* count splits with any violations, not #constraints in a splits */ + if (bBadDist) + splitcount->nBadSplits++; + if (bBadDist && bBadConstr) + splitcount->nBadBoth++; + if (bBadConstr && verbose > 2) { + /* Which NNI would be better */ + double dist_advantage = 0; + double constraint_penalty = 0; + if (p[ACvsBD] < p[ADvsBC]) { + dist_advantage = sACvsBD - sABvsCD; + constraint_penalty = p[ABvsCD] - p[ACvsBD]; + } else { + dist_advantage = sADvsBC - sABvsCD; + constraint_penalty = p[ABvsCD] - p[ADvsBC]; + } + fprintf(stderr, "Violate constraints %d distance_advantage %.3f constraint_penalty %.3f (children %d %d):", + node, dist_advantage, constraint_penalty, + NJ->child[node].child[0], NJ->child[node].child[1]); + /* list the constraints with a penalty, meaning that ABCD all have non-zero + values and that AB|CD worse than others */ + for (iC = 0; iC < NJ->nConstraints; iC++) { + double ppart[6]; + if (QuartetConstraintPenaltiesPiece(profiles, iC, /*OUT*/ppart)) { + if (ppart[qAB] + ppart[qCD] > ppart[qAD] + ppart[qBC] + tolerance + || ppart[qAB] + ppart[qCD] > ppart[qAC] + ppart[qBD] + tolerance) + fprintf(stderr, " %d (%d/%d %d/%d %d/%d %d/%d)", iC, + profiles[0]->nOn[iC], profiles[0]->nOff[iC], + profiles[1]->nOn[iC], profiles[1]->nOff[iC], + profiles[2]->nOn[iC], profiles[2]->nOff[iC], + profiles[3]->nOn[iC], profiles[3]->nOff[iC]); + } + } + fprintf(stderr, "\n"); + } + + /* no longer needed */ + DeleteUpProfile(upProfiles, NJ, nodeABCD[0]); + DeleteUpProfile(upProfiles, NJ, nodeABCD[1]); + } + traversal = 
FreeTraversal(traversal,NJ); + upProfiles = FreeUpProfiles(upProfiles,NJ); +} + +/* Computes support for (A,B),(C,D) compared to that for (A,C),(B,D) and (A,D),(B,C) */ +double SplitSupport(profile_t *pA, profile_t *pB, profile_t *pC, profile_t *pD, + /*OPTIONAL*/distance_matrix_t *dmat, + int nPos, + int nBootstrap, + int *col) { + int i,j; + long lPos = nPos; /* to avoid overflow when multiplying */ + + /* Note distpieces are weighted */ + double *distpieces[6]; + double *weights[6]; + for (j = 0; j < 6; j++) { + distpieces[j] = (double*)mymalloc(sizeof(double)*nPos); + weights[j] = (double*)mymalloc(sizeof(double)*nPos); + } + + int iFreqA = 0; + int iFreqB = 0; + int iFreqC = 0; + int iFreqD = 0; + for (i = 0; i < nPos; i++) { + numeric_t *fA = GET_FREQ(pA, i, /*IN/OUT*/iFreqA); + numeric_t *fB = GET_FREQ(pB, i, /*IN/OUT*/iFreqB); + numeric_t *fC = GET_FREQ(pC, i, /*IN/OUT*/iFreqC); + numeric_t *fD = GET_FREQ(pD, i, /*IN/OUT*/iFreqD); + + weights[qAB][i] = pA->weights[i] * pB->weights[i]; + weights[qAC][i] = pA->weights[i] * pC->weights[i]; + weights[qAD][i] = pA->weights[i] * pD->weights[i]; + weights[qBC][i] = pB->weights[i] * pC->weights[i]; + weights[qBD][i] = pB->weights[i] * pD->weights[i]; + weights[qCD][i] = pC->weights[i] * pD->weights[i]; + + distpieces[qAB][i] = weights[qAB][i] * ProfileDistPiece(pA->codes[i], pB->codes[i], fA, fB, dmat, NULL); + distpieces[qAC][i] = weights[qAC][i] * ProfileDistPiece(pA->codes[i], pC->codes[i], fA, fC, dmat, NULL); + distpieces[qAD][i] = weights[qAD][i] * ProfileDistPiece(pA->codes[i], pD->codes[i], fA, fD, dmat, NULL); + distpieces[qBC][i] = weights[qBC][i] * ProfileDistPiece(pB->codes[i], pC->codes[i], fB, fC, dmat, NULL); + distpieces[qBD][i] = weights[qBD][i] * ProfileDistPiece(pB->codes[i], pD->codes[i], fB, fD, dmat, NULL); + distpieces[qCD][i] = weights[qCD][i] * ProfileDistPiece(pC->codes[i], pD->codes[i], fC, fD, dmat, NULL); + } + assert(iFreqA == pA->nVectors); + assert(iFreqB == pB->nVectors); + 
assert(iFreqC == pC->nVectors); + assert(iFreqD == pD->nVectors); + + double totpieces[6]; + double totweights[6]; + double dists[6]; + for (j = 0; j < 6; j++) { + totpieces[j] = 0.0; + totweights[j] = 0.0; + for (i = 0; i < nPos; i++) { + totpieces[j] += distpieces[j][i]; + totweights[j] += weights[j][i]; + } + dists[j] = totweights[j] > 0.01 ? totpieces[j]/totweights[j] : 3.0; + if (logdist) + dists[j] = LogCorrect(dists[j]); + } + + /* Support1 = Support(AB|CD over AC|BD) = d(A,C)+d(B,D)-d(A,B)-d(C,D) + Support2 = Support(AB|CD over AD|BC) = d(A,D)+d(B,C)-d(A,B)-d(C,D) + */ + double support1 = dists[qAC] + dists[qBD] - dists[qAB] - dists[qCD]; + double support2 = dists[qAD] + dists[qBC] - dists[qAB] - dists[qCD]; + + if (support1 < 0 || support2 < 0) { + nSuboptimalSplits++; /* Another split seems superior */ + } + + assert(nBootstrap > 0); + int nSupport = 0; + + int iBoot; + for (iBoot=0;iBoot 0.01 ? totp/totw : 3.0; + if (logdist) + dists[j] = LogCorrect(dists[j]); + } + support1 = dists[qAC] + dists[qBD] - dists[qAB] - dists[qCD]; + support2 = dists[qAD] + dists[qBC] - dists[qAB] - dists[qCD]; + if (support1 > 0 && support2 > 0) + nSupport++; + } /* end loop over bootstrap replicates */ + + for (j = 0; j < 6; j++) { + distpieces[j] = myfree(distpieces[j], sizeof(double)*nPos); + weights[j] = myfree(weights[j], sizeof(double)*nPos); + } + return( nSupport/(double)nBootstrap ); +} + +double SHSupport(int nPos, int nBootstrap, int *col, double loglk[3], double *site_likelihoods[3]) { + long lPos = nPos; /* to avoid overflow when multiplying */ + assert(nBootstrap>0); + double delta1 = loglk[0]-loglk[1]; + double delta2 = loglk[0]-loglk[2]; + double delta = delta1 < delta2 ? 
delta1 : delta2; + + double *siteloglk[3]; + int i,j; + for (i = 0; i < 3; i++) { + siteloglk[i] = mymalloc(sizeof(double)*nPos); + for (j = 0; j < nPos; j++) + siteloglk[i][j] = log(site_likelihoods[i][j]); + } + + int nSupport = 0; + int iBoot; + for (iBoot = 0; iBoot < nBootstrap; iBoot++) { + double resampled[3]; + for (i = 0; i < 3; i++) + resampled[i] = -loglk[i]; + for (j = 0; j < nPos; j++) { + int pos = col[iBoot*lPos+j]; + for (i = 0; i < 3; i++) + resampled[i] += siteloglk[i][pos]; + } + int iBest = 0; + for (i = 1; i < 3; i++) + if (resampled[i] > resampled[iBest]) + iBest = i; + double resample1 = resampled[iBest] - resampled[(iBest+1)%3]; + double resample2 = resampled[iBest] - resampled[(iBest+2)%3]; + double resampleDelta = resample1 < resample2 ? resample1 : resample2; + if (resampleDelta < delta) + nSupport++; + } + for (i=0;i<3;i++) + siteloglk[i] = myfree(siteloglk[i], sizeof(double)*nPos); + return(nSupport/(double)nBootstrap); +} + + +void SetDistCriterion(/*IN/OUT*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *hit) { + if (hit->i < NJ->nSeq && hit->j < NJ->nSeq) { + SeqDist(NJ->profiles[hit->i]->codes, + NJ->profiles[hit->j]->codes, + NJ->nPos, NJ->distance_matrix, /*OUT*/hit); + } else { + ProfileDist(NJ->profiles[hit->i], + NJ->profiles[hit->j], + NJ->nPos, NJ->distance_matrix, /*OUT*/hit); + hit->dist -= (NJ->diameter[hit->i] + NJ->diameter[hit->j]); + } + hit->dist += constraintWeight + * (double)JoinConstraintPenalty(NJ, hit->i, hit->j); + SetCriterion(NJ,nActive,/*IN/OUT*/hit); +} + +void SetCriterion(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *join) { + if(join->i < 0 + || join->j < 0 + || NJ->parent[join->i] >= 0 + || NJ->parent[join->j] >= 0) + return; + assert(NJ->nOutDistActive[join->i] >= nActive); + assert(NJ->nOutDistActive[join->j] >= nActive); + + int nDiffAllow = tophitsMult > 0 ? 
(int)(nActive*staleOutLimit) : 0; + if (NJ->nOutDistActive[join->i] - nActive > nDiffAllow) + SetOutDistance(NJ, join->i, nActive); + if (NJ->nOutDistActive[join->j] - nActive > nDiffAllow) + SetOutDistance(NJ, join->j, nActive); + double outI = NJ->outDistances[join->i]; + if (NJ->nOutDistActive[join->i] != nActive) + outI *= (nActive-1)/(double)(NJ->nOutDistActive[join->i]-1); + double outJ = NJ->outDistances[join->j]; + if (NJ->nOutDistActive[join->j] != nActive) + outJ *= (nActive-1)/(double)(NJ->nOutDistActive[join->j]-1); + join->criterion = join->dist - (outI+outJ)/(double)(nActive-2); + if (verbose > 2 && nActive <= 5) { + fprintf(stderr, "Set Criterion to join %d %d with nActive=%d dist+penalty %.3f criterion %.3f\n", + join->i, join->j, nActive, join->dist, join->criterion); + } +} + +void SetOutDistance(NJ_t *NJ, int iNode, int nActive) { + if (NJ->nOutDistActive[iNode] == nActive) + return; + + /* May be called by InitNJ before we have parents */ + assert(iNode>=0 && (NJ->parent == NULL || NJ->parent[iNode]<0)); + besthit_t dist; + ProfileDist(NJ->profiles[iNode], NJ->outprofile, NJ->nPos, NJ->distance_matrix, &dist); + outprofileOps++; + + /* out(A) = sum(X!=A) d(A,X) + = sum(X!=A) (profiledist(A,X) - diam(A) - diam(X)) + = sum(X!=A) profiledist(A,X) - (N-1)*diam(A) - (totdiam - diam(A)) + + in the absence of gaps: + profiledist(A,out) = mean profiledist(A, all active nodes) + sum(X!=A) profiledist(A,X) = N * profiledist(A,out) - profiledist(A,A) + + With gaps, we need to take the weights of the comparisons into account, where + w(Ai) is the weight of position i in profile A: + w(A,B) = sum_i w(Ai) * w(Bi) + d(A,B) = sum_i w(Ai) * w(Bi) * d(Ai,Bi) / w(A,B) + + sum(X!=A) profiledist(A,X) ~= (N-1) * profiledist(A, Out w/o A) + profiledist(A, Out w/o A) = sum_X!=A sum_i d(Ai,Xi) * w(Ai) * w(Bi) / ( sum_X!=A sum_i w(Ai) * w(Bi) ) + d(A, Out) = sum_A sum_i d(Ai,Xi) * w(Ai) * w(Bi) / ( sum_X sum_i w(Ai) * w(Bi) ) + + and so we get + profiledist(A,out w/o A) 
= (top of d(A,Out) - top of d(A,A)) / (weight of d(A,Out) - weight of d(A,A)) + top = dist * weight + with another correction of nActive because the weight of the out-profile is the average + weight not the total weight. + */ + double top = (nActive-1) + * (dist.dist * dist.weight * nActive - NJ->selfweight[iNode] * NJ->selfdist[iNode]); + double bottom = (dist.weight * nActive - NJ->selfweight[iNode]); + double pdistOutWithoutA = top/bottom; + NJ->outDistances[iNode] = bottom > 0.01 ? + pdistOutWithoutA - NJ->diameter[iNode] * (nActive-1) - (NJ->totdiam - NJ->diameter[iNode]) + : 3.0; + NJ->nOutDistActive[iNode] = nActive; + + if(verbose>3 && iNode < 5) + fprintf(stderr,"NewOutDist for %d %f from dist %f selfd %f diam %f totdiam %f newActive %d\n", + iNode, NJ->outDistances[iNode], dist.dist, NJ->selfdist[iNode], NJ->diameter[iNode], + NJ->totdiam, nActive); + if (verbose>6 && (iNode % 10) == 0) { + /* Compute the actual out-distance and compare */ + double total = 0.0; + double total_pd = 0.0; + int j; + for (j=0;jmaxnode;j++) { + if (j!=iNode && (NJ->parent==NULL || NJ->parent[j]<0)) { + besthit_t bh; + ProfileDist(NJ->profiles[iNode], NJ->profiles[j], NJ->nPos, NJ->distance_matrix, /*OUT*/&bh); + total_pd += bh.dist; + total += bh.dist - (NJ->diameter[iNode] + NJ->diameter[j]); + } + } + fprintf(stderr,"OutDist for Node %d %f truth %f profiled %f truth %f pd_err %f\n", + iNode, NJ->outDistances[iNode], total, pdistOutWithoutA, total_pd,fabs(pdistOutWithoutA-total_pd)); + } +} + +top_hits_t *FreeTopHits(top_hits_t *tophits) { + if (tophits == NULL) + return(NULL); + int iNode; + for (iNode = 0; iNode < tophits->maxnodes; iNode++) { + top_hits_list_t *l = &tophits->top_hits_lists[iNode]; + if (l->hits != NULL) + l->hits = myfree(l->hits, sizeof(hit_t) * l->nHits); + } + tophits->top_hits_lists = myfree(tophits->top_hits_lists, sizeof(top_hits_list_t) * tophits->maxnodes); + tophits->visible = myfree(tophits->visible, sizeof(hit_t*) * tophits->maxnodes); + 
tophits->topvisible = myfree(tophits->topvisible, sizeof(int) * tophits->nTopVisible); +#ifdef OPENMP + for (iNode = 0; iNode < tophits->maxnodes; iNode++) + omp_destroy_lock(&tophits->locks[iNode]); + tophits->locks = myfree(tophits->locks, sizeof(omp_lock_t) * tophits->maxnodes); +#endif + return(myfree(tophits, sizeof(top_hits_t))); +} + +top_hits_t *InitTopHits(NJ_t *NJ, int m) { + int iNode; + assert(m > 0); + top_hits_t *tophits = mymalloc(sizeof(top_hits_t)); + tophits->m = m; + tophits->q = (int)(0.5 + tophits2Mult * sqrt(tophits->m)); + if (!useTopHits2nd || tophits->q >= tophits->m) + tophits->q = 0; + tophits->maxnodes = NJ->maxnodes; + tophits->top_hits_lists = mymalloc(sizeof(top_hits_list_t) * tophits->maxnodes); + tophits->visible = mymalloc(sizeof(hit_t) * tophits->maxnodes); + tophits->nTopVisible = (int)(0.5 + topvisibleMult*m); + tophits->topvisible = mymalloc(sizeof(int) * tophits->nTopVisible); +#ifdef OPENMP + tophits->locks = mymalloc(sizeof(omp_lock_t) * tophits->maxnodes); + for (iNode = 0; iNode < tophits->maxnodes; iNode++) + omp_init_lock(&tophits->locks[iNode]); +#endif + int i; + for (i = 0; i < tophits->nTopVisible; i++) + tophits->topvisible[i] = -1; /* empty */ + tophits->topvisibleAge = 0; + + for (iNode = 0; iNode < tophits->maxnodes; iNode++) { + top_hits_list_t *l = &tophits->top_hits_lists[iNode]; + l->nHits = 0; + l->hits = NULL; + l->hitSource = -1; + l->age = 0; + hit_t *v = &tophits->visible[iNode]; + v->j = -1; + v->dist = 1e20; + } + return(tophits); +} + +/* Helper function for sorting in SetAllLeafTopHits, + and the global variables it needs +*/ +NJ_t *CompareSeedNJ = NULL; +int *CompareSeedGaps = NULL; +int CompareSeeds(const void *c1, const void *c2) { + int seed1 = *(int *)c1; + int seed2 = *(int *)c2; + int gapdiff = CompareSeedGaps[seed1] - CompareSeedGaps[seed2]; + if (gapdiff != 0) return(gapdiff); /* fewer gaps is better */ + double outdiff = CompareSeedNJ->outDistances[seed1] - 
CompareSeedNJ->outDistances[seed2]; + if(outdiff < 0) return(-1); /* closer to more nodes is better */ + if(outdiff > 0) return(1); + return(0); +} + +/* Using the seed heuristic and the close global variable */ +void SetAllLeafTopHits(/*IN/UPDATE*/NJ_t *NJ, /*IN/OUT*/top_hits_t *tophits) { + double close = tophitsClose; + if (close < 0) { + if (fastest && NJ->nSeq >= 50000) { + close = 0.99; + } else { + double logN = log((double)NJ->nSeq)/log(2.0); + close = logN/(logN+2.0); + } + } + /* Sort the potential seeds, by a combination of nGaps and NJ->outDistances + We don't store nGaps so we need to compute that + */ + int *nGaps = (int*)mymalloc(sizeof(int)*NJ->nSeq); + int iNode; + for(iNode=0; iNode<NJ->nSeq; iNode++) { + nGaps[iNode] = (int)(0.5 + NJ->nPos - NJ->selfweight[iNode]); + } + int *seeds = (int*)mymalloc(sizeof(int)*NJ->nSeq); + for (iNode=0; iNode<NJ->nSeq; iNode++) seeds[iNode] = iNode; + CompareSeedNJ = NJ; + CompareSeedGaps = nGaps; + qsort(/*IN/OUT*/seeds, NJ->nSeq, sizeof(int), CompareSeeds); + CompareSeedNJ = NULL; + CompareSeedGaps = NULL; + + /* For each seed, save its top 2*m hits and then look for close neighbors */ + assert(2 * tophits->m <= NJ->nSeq); + int iSeed; + int nHasTopHits = 0; +#ifdef OPENMP + #pragma omp parallel for schedule(dynamic, 50) +#endif + for(iSeed=0; iSeed < NJ->nSeq; iSeed++) { + int seed = seeds[iSeed]; + if (iSeed > 0 && (iSeed % 100) == 0) { +#ifdef OPENMP + #pragma omp critical +#endif + ProgressReport("Top hits for %6d of %6d seqs (at seed %6d)", + nHasTopHits, NJ->nSeq, + iSeed, 0); + } + if (tophits->top_hits_lists[seed].nHits > 0) { + if(verbose>2) fprintf(stderr, "Skipping seed %d\n", seed); + continue; + } + + besthit_t *besthitsSeed = (besthit_t*)mymalloc(sizeof(besthit_t)*NJ->nSeq); + besthit_t *besthitsNeighbor = (besthit_t*)mymalloc(sizeof(besthit_t) * 2 * tophits->m); + besthit_t bestjoin; + + if(verbose>2) fprintf(stderr,"Trying seed %d\n", seed); + SetBestHit(seed, NJ, /*nActive*/NJ->nSeq, /*OUT*/&bestjoin, 
/*OUT*/besthitsSeed); + + /* sort & save top hits of self. besthitsSeed is now sorted. */ + SortSaveBestHits(seed, /*IN/SORT*/besthitsSeed, /*IN-SIZE*/NJ->nSeq, + /*OUT-SIZE*/tophits->m, /*IN/OUT*/tophits); + nHasTopHits++; + + /* find "close" neighbors and compute their top hits */ + double neardist = besthitsSeed[2 * tophits->m - 1].dist * close; + /* must have at least average weight, rem higher is better + and allow a bit more than average, e.g. if we are looking for within 30% away, + 20% more gaps than usual seems OK + Alternatively, have a coverage requirement in case neighbor is short + If fastest, consider the top q/2 hits to be close neighbors, regardless + */ + double nearweight = 0; + int iClose; + for (iClose = 0; iClose < 2 * tophits->m; iClose++) + nearweight += besthitsSeed[iClose].weight; + nearweight = nearweight/(2.0 * tophits->m); /* average */ + nearweight *= (1.0-2.0*neardist/3.0); + double nearcover = 1.0 - neardist/2.0; + + if(verbose>2) fprintf(stderr,"Distance limit for close neighbors %f weight %f ungapped %d\n", + neardist, nearweight, NJ->nPos-nGaps[seed]); + for (iClose = 0; iClose < tophits->m; iClose++) { + besthit_t *closehit = &besthitsSeed[iClose]; + int closeNode = closehit->j; + if (tophits->top_hits_lists[closeNode].nHits > 0) + continue; + + /* If within close-distance, or identical, use as close neighbor */ + bool close = closehit->dist <= neardist + && (closehit->weight >= nearweight + || closehit->weight >= (NJ->nPos-nGaps[closeNode])*nearcover); + bool identical = closehit->dist < 1e-6 + && fabs(closehit->weight - (NJ->nPos - nGaps[seed])) < 1e-5 + && fabs(closehit->weight - (NJ->nPos - nGaps[closeNode])) < 1e-5; + if (useTopHits2nd && iClose < tophits->q && (close || identical)) { + nHasTopHits++; + nClose2Used++; + int nUse = MIN(tophits->q * tophits2Safety, 2 * tophits->m); + besthit_t *besthitsClose = mymalloc(sizeof(besthit_t) * nUse); + TransferBestHits(NJ, /*nActive*/NJ->nSeq, + closeNode, + /*IN*/besthitsSeed, 
/*SIZE*/nUse, + /*OUT*/besthitsClose, + /*updateDistance*/true); + SortSaveBestHits(closeNode, /*IN/SORT*/besthitsClose, + /*IN-SIZE*/nUse, /*OUT-SIZE*/tophits->q, + /*IN/OUT*/tophits); + tophits->top_hits_lists[closeNode].hitSource = seed; + besthitsClose = myfree(besthitsClose, sizeof(besthit_t) * nUse); + } else if (close || identical || (fastest && iClose < (tophits->q+1)/2)) { + nHasTopHits++; + nCloseUsed++; + if(verbose>2) fprintf(stderr, "Near neighbor %d (rank %d weight %f ungapped %d %d)\n", + closeNode, iClose, besthitsSeed[iClose].weight, + NJ->nPos-nGaps[seed], + NJ->nPos-nGaps[closeNode]); + + /* compute top 2*m hits */ + TransferBestHits(NJ, /*nActive*/NJ->nSeq, + closeNode, + /*IN*/besthitsSeed, /*SIZE*/2 * tophits->m, + /*OUT*/besthitsNeighbor, + /*updateDistance*/true); + SortSaveBestHits(closeNode, /*IN/SORT*/besthitsNeighbor, + /*IN-SIZE*/2 * tophits->m, /*OUT-SIZE*/tophits->m, + /*IN/OUT*/tophits); + + /* And then try for a second level of transfer. We assume we + are in a good area, because of the 1st + level of transfer, and in a small neighborhood, because q is + small (32 for 1 million sequences), so we do not make any close checks. 
+ */ + int iClose2; + for (iClose2 = 0; iClose2 < tophits->q && iClose2 < 2 * tophits->m; iClose2++) { + int closeNode2 = besthitsNeighbor[iClose2].j; + assert(closeNode2 >= 0); + if (tophits->top_hits_lists[closeNode2].hits == NULL) { + nClose2Used++; + nHasTopHits++; + int nUse = MIN(tophits->q * tophits2Safety, 2 * tophits->m); + besthit_t *besthitsClose2 = mymalloc(sizeof(besthit_t) * nUse); + TransferBestHits(NJ, /*nActive*/NJ->nSeq, + closeNode2, + /*IN*/besthitsNeighbor, /*SIZE*/nUse, + /*OUT*/besthitsClose2, + /*updateDistance*/true); + SortSaveBestHits(closeNode2, /*IN/SORT*/besthitsClose2, + /*IN-SIZE*/nUse, /*OUT-SIZE*/tophits->q, + /*IN/OUT*/tophits); + tophits->top_hits_lists[closeNode2].hitSource = closeNode; + besthitsClose2 = myfree(besthitsClose2, sizeof(besthit_t) * nUse); + } /* end if should do 2nd-level transfer */ + } + } + } /* end loop over close candidates */ + besthitsSeed = myfree(besthitsSeed, sizeof(besthit_t)*NJ->nSeq); + besthitsNeighbor = myfree(besthitsNeighbor, sizeof(besthit_t) * 2 * tophits->m); + } /* end loop over seeds */ + + for (iNode=0; iNode<NJ->nSeq; iNode++) { + top_hits_list_t *l = &tophits->top_hits_lists[iNode]; + assert(l->hits != NULL); + assert(l->hits[0].j >= 0); + assert(l->hits[0].j < NJ->nSeq); + assert(l->hits[0].j != iNode); + tophits->visible[iNode] = l->hits[0]; + } + + if (verbose >= 2) fprintf(stderr, "#Close neighbors among leaves: 1st-level %ld 2nd-level %ld seeds %ld\n", + nCloseUsed, nClose2Used, NJ->nSeq-nCloseUsed-nClose2Used); + nGaps = myfree(nGaps, sizeof(int)*NJ->nSeq); + seeds = myfree(seeds, sizeof(int)*NJ->nSeq); + + /* Now add a "checking phase" where we ensure that the q or 2*sqrt(m) hits + of i are represented in j (if they should be) + */ + long lReplace = 0; + int nCheck = tophits->q > 0 ? 
tophits->q : (int)(0.5 + 2.0*sqrt(tophits->m)); + for (iNode = 0; iNode < NJ->nSeq; iNode++) { + if ((iNode % 100) == 0) + ProgressReport("Checking top hits for %6d of %6d seqs", + iNode+1, NJ->nSeq, 0, 0); + top_hits_list_t *lNode = &tophits->top_hits_lists[iNode]; + int iHit; + for (iHit = 0; iHit < nCheck && iHit < lNode->nHits; iHit++) { + besthit_t bh = HitToBestHit(iNode, lNode->hits[iHit]); + SetCriterion(NJ, /*nActive*/NJ->nSeq, /*IN/OUT*/&bh); + top_hits_list_t *lTarget = &tophits->top_hits_lists[bh.j]; + + /* If this criterion is worse than the nCheck-1 entry of the target, + then skip the check. + This logic is based on assuming that the list is sorted, + which is true initially but may not be true later. + Still, is a good heuristic. + */ + assert(nCheck > 0); + assert(nCheck <= lTarget->nHits); + besthit_t bhCheck = HitToBestHit(bh.j, lTarget->hits[nCheck-1]); + SetCriterion(NJ, /*nActive*/NJ->nSeq, /*IN/OUT*/&bhCheck); + if (bhCheck.criterion < bh.criterion) + continue; /* no check needed */ + + /* Check if this is present in the top-hit list */ + int iHit2; + bool bFound = false; + for (iHit2 = 0; iHit2 < lTarget->nHits && !bFound; iHit2++) + if (lTarget->hits[iHit2].j == iNode) + bFound = true; + if (!bFound) { + /* Find the hit with the worst criterion and replace it with this one */ + int iWorst = -1; + double dWorstCriterion = -1e20; + for (iHit2 = 0; iHit2 < lTarget->nHits; iHit2++) { + besthit_t bh2 = HitToBestHit(bh.j, lTarget->hits[iHit2]); + SetCriterion(NJ, /*nActive*/NJ->nSeq, /*IN/OUT*/&bh2); + if (bh2.criterion > dWorstCriterion) { + iWorst = iHit2; + dWorstCriterion = bh2.criterion; + } + } + if (dWorstCriterion > bh.criterion) { + assert(iWorst >= 0); + lTarget->hits[iWorst].j = iNode; + lTarget->hits[iWorst].dist = bh.dist; + lReplace++; + /* and perhaps update visible */ + besthit_t v; + bool bSuccess = GetVisible(NJ, /*nActive*/NJ->nSeq, tophits, bh.j, /*OUT*/&v); + assert(bSuccess); + if (bh.criterion < v.criterion) + 
tophits->visible[bh.j] = lTarget->hits[iWorst]; + } + } + } + } + + if (verbose >= 2) + fprintf(stderr, "Replaced %ld top hit entries\n", lReplace); +} + +/* Updates out-distances but does not reset or update visible set */ +void GetBestFromTopHits(int iNode, + /*IN/UPDATE*/NJ_t *NJ, + int nActive, + /*IN*/top_hits_t *tophits, + /*OUT*/besthit_t *bestjoin) { + assert(iNode >= 0); + assert(NJ->parent[iNode] < 0); + top_hits_list_t *l = &tophits->top_hits_lists[iNode]; + assert(l->nHits > 0); + assert(l->hits != NULL); + + if(!fastest) + SetOutDistance(NJ, iNode, nActive); /* ensure out-distances are not stale */ + + bestjoin->i = -1; + bestjoin->j = -1; + bestjoin->dist = 1e20; + bestjoin->criterion = 1e20; + + int iBest; + for(iBest=0; iBest < l->nHits; iBest++) { + besthit_t bh = HitToBestHit(iNode, l->hits[iBest]); + if (UpdateBestHit(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/&bh, /*update dist*/true)) { + SetCriterion(/*IN/OUT*/NJ, nActive, /*IN/OUT*/&bh); /* make sure criterion is correct */ + if (bh.criterion < bestjoin->criterion) + *bestjoin = bh; + } + } + assert(bestjoin->j >= 0); /* a hit was found */ + assert(bestjoin->i == iNode); +} + +int ActiveAncestor(/*IN*/NJ_t *NJ, int iNode) { + if (iNode < 0) + return(iNode); + while(NJ->parent[iNode] >= 0) + iNode = NJ->parent[iNode]; + return(iNode); +} + +bool UpdateBestHit(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *hit, + bool bUpdateDist) { + int i = ActiveAncestor(/*IN*/NJ, hit->i); + int j = ActiveAncestor(/*IN*/NJ, hit->j); + if (i < 0 || j < 0 || i == j) { + hit->i = -1; + hit->j = -1; + hit->weight = 0; + hit->dist = 1e20; + hit->criterion = 1e20; + return(false); + } + if (i != hit->i || j != hit->j) { + hit->i = i; + hit->j = j; + if (bUpdateDist) { + SetDistCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/hit); + } else { + hit->dist = -1e20; + hit->criterion = 1e20; + } + } + return(true); +} + +bool GetVisible(/*IN/UPDATE*/NJ_t *NJ, int nActive, + /*IN/OUT*/top_hits_t *tophits, + int iNode, 
/*OUT*/besthit_t *visible) { + if (iNode < 0 || NJ->parent[iNode] >= 0) + return(false); + hit_t *v = &tophits->visible[iNode]; + if (v->j < 0 || NJ->parent[v->j] >= 0) + return(false); + *visible = HitToBestHit(iNode, *v); + SetCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/visible); + return(true); +} + +besthit_t *UniqueBestHits(/*IN/UPDATE*/NJ_t *NJ, int nActive, + /*IN/SORT*/besthit_t *combined, int nCombined, + /*OUT*/int *nUniqueOut) { + int iHit; + for (iHit = 0; iHit < nCombined; iHit++) { + besthit_t *hit = &combined[iHit]; + UpdateBestHit(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/hit, /*update*/false); + } + qsort(/*IN/OUT*/combined, nCombined, sizeof(besthit_t), CompareHitsByIJ); + + besthit_t *uniqueList = (besthit_t*)mymalloc(sizeof(besthit_t)*nCombined); + int nUnique = 0; + int iSavedLast = -1; + + /* First build the new list */ + for (iHit = 0; iHit < nCombined; iHit++) { + besthit_t *hit = &combined[iHit]; + if (hit->i < 0 || hit->j < 0) + continue; + if (iSavedLast >= 0) { + /* toss out duplicates */ + besthit_t *saved = &combined[iSavedLast]; + if (saved->i == hit->i && saved->j == hit->j) + continue; + } + assert(nUnique < nCombined); + assert(hit->j >= 0 && NJ->parent[hit->j] < 0); + uniqueList[nUnique++] = *hit; + iSavedLast = iHit; + } + *nUniqueOut = nUnique; + + /* Then do any updates to the criterion or the distances in parallel */ +#ifdef OPENMP + #pragma omp parallel for schedule(dynamic, 50) +#endif + for (iHit = 0; iHit < nUnique; iHit++) { + besthit_t *hit = &uniqueList[iHit]; + if (hit->dist < 0.0) + SetDistCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/hit); + else + SetCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/hit); + } + return(uniqueList); +} + +/* + Create a top hit list for the new node, either + from children (if there are enough best hits left) or by a "refresh" + Also set visible set for newnode + Also update visible set for other nodes if we stumble across a "better" hit +*/ + +void TopHitJoin(int newnode, + /*IN/UPDATE*/NJ_t 
*NJ, + int nActive, + /*IN/OUT*/top_hits_t *tophits) { + long startProfileOps = profileOps; + long startOutProfileOps = outprofileOps; + assert(NJ->child[newnode].nChild == 2); + top_hits_list_t *lNew = &tophits->top_hits_lists[newnode]; + assert(lNew->hits == NULL); + + /* Copy the hits */ + int i; + top_hits_list_t *lChild[2]; + for (i = 0; i< 2; i++) { + lChild[i] = &tophits->top_hits_lists[NJ->child[newnode].child[i]]; + assert(lChild[i]->hits != NULL && lChild[i]->nHits > 0); + } + int nCombined = lChild[0]->nHits + lChild[1]->nHits; + besthit_t *combinedList = (besthit_t*)mymalloc(sizeof(besthit_t)*nCombined); + HitsToBestHits(lChild[0]->hits, lChild[0]->nHits, NJ->child[newnode].child[0], + /*OUT*/combinedList); + HitsToBestHits(lChild[1]->hits, lChild[1]->nHits, NJ->child[newnode].child[1], + /*OUT*/combinedList + lChild[0]->nHits); + int nUnique; + /* UniqueBestHits() replaces children (used in the calls to HitsToBestHits) + with active ancestors, so all distances & criteria will be recomputed */ + besthit_t *uniqueList = UniqueBestHits(/*IN/UPDATE*/NJ, nActive, + /*IN/SORT*/combinedList, + nCombined, + /*OUT*/&nUnique); + int nUniqueAlloc = nCombined; + combinedList = myfree(combinedList, sizeof(besthit_t)*nCombined); + + /* Forget the top-hit lists of the joined nodes */ + for (i = 0; i < 2; i++) { + lChild[i]->hits = myfree(lChild[i]->hits, sizeof(hit_t) * lChild[i]->nHits); + lChild[i]->nHits = 0; + } + + /* Use the average age, rounded up, by 1 Versions 2.0 and earlier + used the maximum age, which leads to more refreshes without + improving the accuracy of the NJ phase. Intuitively, if one of + them was just refreshed then another refresh is unlikely to help. + */ + lNew->age = (lChild[0]->age+lChild[1]->age+1)/2 + 1; + + /* If top hit ages always match (perfectly balanced), then a + limit of log2(m) would mean a refresh after + m joins, which is about what we want. 
+ */ + int tophitAgeLimit = MAX(1, (int)(0.5 + log((double)tophits->m)/log(2.0))); + + /* Either use the merged list as candidate top hits, or + move from 2nd level to 1st level, or do a refresh + UniqueBestHits eliminates hits to self, so if nUnique==nActive-1, + we've already done the exhaustive search. + + Either way, we set tophits, visible(newnode), update visible of its top hits, + and modify topvisible: if we do a refresh, then we reset it, otherwise we update + */ + bool bSecondLevel = lChild[0]->hitSource >= 0 && lChild[1]->hitSource >= 0; + bool bUseUnique = nUnique==nActive-1 + || (lNew->age <= tophitAgeLimit + && nUnique >= (bSecondLevel ? (int)(0.5 + tophits2Refresh * tophits->q) + : (int)(0.5 + tophits->m * tophitsRefresh) )); + if (bUseUnique && verbose > 2) + fprintf(stderr,"Top hits for %d from combined %d nActive=%d tophitsage %d %s\n", + newnode,nUnique,nActive,lNew->age, + bSecondLevel ? "2ndlevel" : "1stlevel"); + + if (!bUseUnique + && bSecondLevel + && lNew->age <= tophitAgeLimit) { + int source = ActiveAncestor(NJ, lChild[0]->hitSource); + if (source == newnode) + source = ActiveAncestor(NJ, lChild[1]->hitSource); + /* In parallel mode, it is possible that we would select a node as the + hit-source and then over-write that top hit with a short list. + So we need this sanity check. 
+ */ + if (source != newnode + && source >= 0 + && tophits->top_hits_lists[source].hitSource < 0) { + + /* switch from 2nd-level to 1st-level top hits -- compute top hits list + of node from what we have so far plus the active source plus its top hits */ + top_hits_list_t *lSource = &tophits->top_hits_lists[source]; + assert(lSource->hitSource < 0); + assert(lSource->nHits > 0); + int nMerge = 1 + lSource->nHits + nUnique; + besthit_t *mergeList = mymalloc(sizeof(besthit_t) * nMerge); + memcpy(/*to*/mergeList, /*from*/uniqueList, nUnique * sizeof(besthit_t)); + + int iMerge = nUnique; + mergeList[iMerge].i = newnode; + mergeList[iMerge].j = source; + SetDistCriterion(NJ, nActive, /*IN/OUT*/&mergeList[iMerge]); + iMerge++; + HitsToBestHits(lSource->hits, lSource->nHits, newnode, /*OUT*/mergeList+iMerge); + for (i = 0; i < lSource->nHits; i++) { + SetDistCriterion(NJ, nActive, /*IN/OUT*/&mergeList[iMerge]); + iMerge++; + } + assert(iMerge == nMerge); + + uniqueList = myfree(uniqueList, nUniqueAlloc * sizeof(besthit_t)); + uniqueList = UniqueBestHits(/*IN/UPDATE*/NJ, nActive, + /*IN/SORT*/mergeList, + nMerge, + /*OUT*/&nUnique); + nUniqueAlloc = nMerge; + mergeList = myfree(mergeList, sizeof(besthit_t)*nMerge); + + assert(nUnique > 0); + bUseUnique = nUnique >= (int)(0.5 + tophits->m * tophitsRefresh); + bSecondLevel = false; + + if (bUseUnique && verbose > 2) + fprintf(stderr, "Top hits for %d from children and source %d's %d hits, nUnique %d\n", + newnode, source, lSource->nHits, nUnique); + } + } + + if (bUseUnique) { + if (bSecondLevel) { + /* pick arbitrarily */ + lNew->hitSource = lChild[0]->hitSource; + } + int nSave = MIN(nUnique, bSecondLevel ? 
tophits->q : tophits->m); + assert(nSave>0); + if (verbose > 2) + fprintf(stderr, "Combined %d ops so far %ld\n", nUnique, profileOps - startProfileOps); + SortSaveBestHits(newnode, /*IN/SORT*/uniqueList, /*nIn*/nUnique, + /*nOut*/nSave, /*IN/OUT*/tophits); + assert(lNew->hits != NULL); /* set by sort/save */ + tophits->visible[newnode] = lNew->hits[0]; + UpdateTopVisible(/*IN*/NJ, nActive, newnode, &tophits->visible[newnode], + /*IN/OUT*/tophits); + UpdateVisible(/*IN/UPDATE*/NJ, nActive, /*IN*/uniqueList, nSave, /*IN/OUT*/tophits); + } else { + /* need to refresh: set top hits for node and for its top hits */ + if(verbose > 2) fprintf(stderr,"Top hits for %d by refresh (%d unique age %d) nActive=%d\n", + newnode,nUnique,lNew->age,nActive); + nRefreshTopHits++; + lNew->age = 0; + + int iNode; + /* ensure all out-distances are up to date ahead of time + to avoid any data overwriting issues. + */ +#ifdef OPENMP + #pragma omp parallel for schedule(dynamic, 50) +#endif + for (iNode = 0; iNode < NJ->maxnode; iNode++) { + if (NJ->parent[iNode] < 0) { + if (fastest) { + besthit_t bh; + bh.i = iNode; + bh.j = iNode; + bh.dist = 0; + SetCriterion(/*IN/UPDATE*/NJ, nActive, &bh); + } else { + SetOutDistance(/*IN/UDPATE*/NJ, iNode, nActive); + } + } + } + + /* exhaustively get the best 2*m hits for newnode, set visible, and save the top m */ + besthit_t *allhits = (besthit_t*)mymalloc(sizeof(besthit_t)*NJ->maxnode); + assert(2 * tophits->m <= NJ->maxnode); + besthit_t bh; + SetBestHit(newnode, NJ, nActive, /*OUT*/&bh, /*OUT*/allhits); + qsort(/*IN/OUT*/allhits, NJ->maxnode, sizeof(besthit_t), CompareHitsByCriterion); + SortSaveBestHits(newnode, /*IN/SORT*/allhits, /*nIn*/NJ->maxnode, + /*nOut*/tophits->m, /*IN/OUT*/tophits); + + /* Do not need to call UpdateVisible because we set visible below */ + + /* And use the top 2*m entries to expand other best-hit lists, but only for top m */ + int iHit; +#ifdef OPENMP + #pragma omp parallel for schedule(dynamic, 50) +#endif + for 
(iHit=0; iHit < tophits->m; iHit++) { + if (allhits[iHit].i < 0) continue; + int iNode = allhits[iHit].j; + assert(iNode>=0); + if (NJ->parent[iNode] >= 0) continue; + top_hits_list_t *l = &tophits->top_hits_lists[iNode]; + int nHitsOld = l->nHits; + assert(nHitsOld <= tophits->m); + l->age = 0; + + /* Merge: old hits into 0->nHitsOld and hits from iNode above that */ + besthit_t *bothList = (besthit_t*)mymalloc(sizeof(besthit_t) * 3 * tophits->m); + HitsToBestHits(/*IN*/l->hits, nHitsOld, iNode, /*OUT*/bothList); /* does not compute criterion */ + for (i = 0; i < nHitsOld; i++) + SetCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/&bothList[i]); + if (nActive <= 2 * tophits->m) + l->hitSource = -1; /* abandon the 2nd-level top-hits heuristic */ + int nNewHits = l->hitSource >= 0 ? tophits->q : tophits->m; + assert(nNewHits > 0); + + TransferBestHits(/*IN/UPDATE*/NJ, nActive, iNode, + /*IN*/allhits, /*nOldHits*/2 * nNewHits, + /*OUT*/&bothList[nHitsOld], + /*updateDist*/false); /* rely on UniqueBestHits to update dist and/or criterion */ + int nUnique2; + besthit_t *uniqueList2 = UniqueBestHits(/*IN/UPDATE*/NJ, nActive, + /*IN/SORT*/bothList, nHitsOld + 2 * nNewHits, + /*OUT*/&nUnique2); + assert(nUnique2 > 0); + bothList = myfree(bothList,3 * tophits->m * sizeof(besthit_t)); + + /* Note this will overwrite l, but we saved nHitsOld */ + SortSaveBestHits(iNode, /*IN/SORT*/uniqueList2, /*nIn*/nUnique2, + /*nOut*/nNewHits, /*IN/OUT*/tophits); + /* will update topvisible below */ + tophits->visible[iNode] = tophits->top_hits_lists[iNode].hits[0]; + uniqueList2 = myfree(uniqueList2, (nHitsOld + 2 * tophits->m) * sizeof(besthit_t)); + } + + ResetTopVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits); /* outside of the parallel phase */ + allhits = myfree(allhits,sizeof(besthit_t)*NJ->maxnode); + } + uniqueList = myfree(uniqueList, nUniqueAlloc * sizeof(besthit_t)); + if (verbose > 2) { + fprintf(stderr, "New top-hit list for %d profile-ops %ld (out-ops %ld): source %d age 
%d members ", + newnode, + profileOps - startProfileOps, + outprofileOps - startOutProfileOps, + lNew->hitSource, lNew->age); + + int i; + for (i = 0; i < lNew->nHits; i++) + fprintf(stderr, " %d", lNew->hits[i].j); + fprintf(stderr,"\n"); + } +} + +void UpdateVisible(/*IN/UPDATE*/NJ_t *NJ, int nActive, + /*IN*/besthit_t *tophitsNode, + int nTopHits, + /*IN/OUT*/top_hits_t *tophits) { + int iHit; + + for(iHit = 0; iHit < nTopHits; iHit++) { + besthit_t *hit = &tophitsNode[iHit]; + if (hit->i < 0) continue; /* possible empty entries */ + assert(NJ->parent[hit->i] < 0); + assert(hit->j >= 0 && NJ->parent[hit->j] < 0); + besthit_t visible; + bool bSuccess = GetVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits, hit->j, /*OUT*/&visible); + if (!bSuccess || hit->criterion < visible.criterion) { + if (bSuccess) + nVisibleUpdate++; + hit_t *v = &tophits->visible[hit->j]; + v->j = hit->i; + v->dist = hit->dist; + UpdateTopVisible(NJ, nActive, hit->j, v, /*IN/OUT*/tophits); + if(verbose>5) fprintf(stderr,"NewVisible %d %d %f\n", + hit->j,v->j,v->dist); + } + } /* end loop over hits */ +} + +/* Update the top-visible list to perhaps include visible[iNode] */ +void UpdateTopVisible(/*IN*/NJ_t * NJ, int nActive, + int iIn, /*IN*/hit_t *hit, + /*IN/OUT*/top_hits_t *tophits) { + assert(tophits != NULL); + bool bIn = false; /* placed in the list */ + int i; + + /* First, if the list is not full, put it in somewhere */ + for (i = 0; i < tophits->nTopVisible && !bIn; i++) { + int iNode = tophits->topvisible[i]; + if (iNode == iIn) { + /* this node is already in the top hit list */ + bIn = true; + } else if (iNode < 0 || NJ->parent[iNode] >= 0) { + /* found an empty spot */ + bIn = true; + tophits->topvisible[i] = iIn; + } + } + + int iPosWorst = -1; + double dCriterionWorst = -1e20; + if (!bIn) { + /* Search for the worst hit */ + for (i = 0; i < tophits->nTopVisible && !bIn; i++) { + int iNode = tophits->topvisible[i]; + assert(iNode >= 0 && NJ->parent[iNode] < 0 && iNode != 
iIn); + besthit_t visible; + if (!GetVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits, iNode, /*OUT*/&visible)) { + /* found an empty spot */ + tophits->topvisible[i] = iIn; + bIn = true; + } else if (visible.i == hit->j && visible.j == iIn) { + /* the reverse hit is already in the top hit list */ + bIn = true; + } else if (visible.criterion >= dCriterionWorst) { + iPosWorst = i; + dCriterionWorst = visible.criterion; + } + } + } + + if (!bIn && iPosWorst >= 0) { + besthit_t visible = HitToBestHit(iIn, *hit); + SetCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/&visible); + if (visible.criterion < dCriterionWorst) { + if (verbose > 2) { + int iOld = tophits->topvisible[iPosWorst]; + fprintf(stderr, "TopVisible replace %d=>%d with %d=>%d\n", + iOld, tophits->visible[iOld].j, visible.i, visible.j); + } + tophits->topvisible[iPosWorst] = iIn; + } + } + + if (verbose > 2) { + fprintf(stderr, "Updated TopVisible: "); + for (i = 0; i < tophits->nTopVisible; i++) { + int iNode = tophits->topvisible[i]; + if (iNode >= 0 && NJ->parent[iNode] < 0) { + besthit_t bh = HitToBestHit(iNode, tophits->visible[iNode]); + SetDistCriterion(NJ, nActive, &bh); + fprintf(stderr, " %d=>%d:%.4f", bh.i, bh.j, bh.criterion); + } + } + fprintf(stderr,"\n"); + } +} + +/* Recompute the topvisible list */ +void ResetTopVisible(/*IN/UPDATE*/NJ_t *NJ, + int nActive, + /*IN/OUT*/top_hits_t *tophits) { + besthit_t *visibleSorted = mymalloc(sizeof(besthit_t)*nActive); + int nVisible = 0; /* #entries in visibleSorted */ + int iNode; + for (iNode = 0; iNode < NJ->maxnode; iNode++) { + /* skip joins involving stale nodes */ + if (NJ->parent[iNode] >= 0) + continue; + besthit_t v; + if (GetVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits, iNode, /*OUT*/&v)) { + assert(nVisible < nActive); + visibleSorted[nVisible++] = v; + } + } + assert(nVisible > 0); + + qsort(/*IN/OUT*/visibleSorted,nVisible,sizeof(besthit_t),CompareHitsByCriterion); + + /* Only keep the top m items, and try to avoid duplicating 
i->j with j->i + Note that visible(i) -> j does not necessarily imply visible(j) -> i, + so we store what the pairing was (or -1 for not used yet) + */ + int *inTopVisible = malloc(sizeof(int) * NJ->maxnodes); + int i; + for (i = 0; i < NJ->maxnodes; i++) + inTopVisible[i] = -1; + + if (verbose > 2) + fprintf(stderr, "top-hit search: nActive %d nVisible %d considering up to %d items\n", + nActive, nVisible, tophits->m); + + /* save the sorted indices in topvisible */ + int iSave = 0; + for (i = 0; i < nVisible && iSave < tophits->nTopVisible; i++) { + besthit_t *v = &visibleSorted[i]; + if (inTopVisible[v->i] != v->j) { /* not seen already */ + tophits->topvisible[iSave++] = v->i; + inTopVisible[v->i] = v->j; + inTopVisible[v->j] = v->i; + } + } + while(iSave < tophits->nTopVisible) + tophits->topvisible[iSave++] = -1; + myfree(visibleSorted, sizeof(besthit_t)*nActive); + myfree(inTopVisible, sizeof(int) * NJ->maxnodes); + tophits->topvisibleAge = 0; + if (verbose > 2) { + fprintf(stderr, "Reset TopVisible: "); + for (i = 0; i < tophits->nTopVisible; i++) { + int iNode = tophits->topvisible[i]; + if (iNode < 0) + break; + fprintf(stderr, " %d=>%d", iNode, tophits->visible[iNode].j); + } + fprintf(stderr,"\n"); + } +} + +/* + Find best hit to do in O(N*log(N) + m*L*log(N)) time, by + copying and sorting the visible list + updating out-distances for the top (up to m) candidates + selecting the best hit + if !fastest then + local hill-climbing for a better join, + using best-hit lists only, and updating + all out-distances in every best-hit list +*/ +void TopHitNJSearch(/*IN/UPDATE*/NJ_t *NJ, int nActive, + /*IN/OUT*/top_hits_t *tophits, + /*OUT*/besthit_t *join) { + /* first, do we have at least m/2 candidates in topvisible? 
+ And remember the best one */ + int nCandidate = 0; + int iNodeBestCandidate = -1; + double dBestCriterion = 1e20; + + int i; + for (i = 0; i < tophits->nTopVisible; i++) { + int iNode = tophits->topvisible[i]; + besthit_t visible; + if (GetVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits, iNode, /*OUT*/&visible)) { + nCandidate++; + if (iNodeBestCandidate < 0 || visible.criterion < dBestCriterion) { + iNodeBestCandidate = iNode; + dBestCriterion = visible.criterion; + } + } + } + + tophits->topvisibleAge++; + /* Note we may have only nActive/2 joins b/c we try to store them once */ + if (2 * tophits->topvisibleAge > tophits->m + || (3*nCandidate < tophits->nTopVisible && 3*nCandidate < nActive)) { + /* recompute top visible */ + if (verbose > 2) + fprintf(stderr, "Resetting the top-visible list at nActive=%d\n",nActive); + + /* If age is low, then our visible set is becoming too sparse, because we have + recently recomputed the top visible subset. This is very rare but can happen + with -fastest. A quick-and-dirty solution is to walk up + the parents to get additional entries in top hit lists. To ensure that the + visible set becomes full, pick an arbitrary node if walking up terminates at self. 
+ */ + if (tophits->topvisibleAge <= 2) { + if (verbose > 2) + fprintf(stderr, "Expanding visible set by walking up to active nodes at nActive=%d\n", nActive); + int iNode; + for (iNode = 0; iNode < NJ->maxnode; iNode++) { + if (NJ->parent[iNode] >= 0) + continue; + hit_t *v = &tophits->visible[iNode]; + int newj = ActiveAncestor(NJ, v->j); + if (newj >= 0 && newj != v->j) { + if (newj == iNode) { + /* pick arbitrarily */ + newj = 0; + while (NJ->parent[newj] >= 0 || newj == iNode) + newj++; + } + assert(newj >= 0 && newj < NJ->maxnodes + && newj != iNode + && NJ->parent[newj] < 0); + + /* Set v to point to newj */ + besthit_t bh = { iNode, newj, -1e20, -1e20, -1e20 }; + SetDistCriterion(NJ, nActive, /*IN/OUT*/&bh); + v->j = newj; + v->dist = bh.dist; + } + } + } + ResetTopVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits); + /* and recurse to try again */ + TopHitNJSearch(NJ, nActive, tophits, join); + return; + } + if (verbose > 2) + fprintf(stderr, "Top-visible list size %d (nActive %d m %d)\n", + nCandidate, nActive, tophits->m); + assert(iNodeBestCandidate >= 0 && NJ->parent[iNodeBestCandidate] < 0); + bool bSuccess = GetVisible(NJ, nActive, tophits, iNodeBestCandidate, /*OUT*/join); + assert(bSuccess); + assert(join->i >= 0 && NJ->parent[join->i] < 0); + assert(join->j >= 0 && NJ->parent[join->j] < 0); + + if(fastest) + return; + + int changed; + do { + changed = 0; + + besthit_t bestI; + GetBestFromTopHits(join->i, NJ, nActive, tophits, /*OUT*/&bestI); + assert(bestI.i == join->i); + if (bestI.j != join->j && bestI.criterion < join->criterion) { + changed = 1; + if (verbose>2) + fprintf(stderr,"BetterI\t%d\t%d\t%d\t%d\t%f\t%f\n", + join->i,join->j,bestI.i,bestI.j, + join->criterion,bestI.criterion); + *join = bestI; + } + + besthit_t bestJ; + GetBestFromTopHits(join->j, NJ, nActive, tophits, /*OUT*/&bestJ); + assert(bestJ.i == join->j); + if (bestJ.j != join->i && bestJ.criterion < join->criterion) { + changed = 1; + if (verbose>2) + 
fprintf(stderr,"BetterJ\t%d\t%d\t%d\t%d\t%f\t%f\n", + join->i,join->j,bestJ.i,bestJ.j, + join->criterion,bestJ.criterion); + *join = bestJ; + } + if(changed) nHillBetter++; + } while(changed); +} + +int NGaps(/*IN*/NJ_t *NJ, int iNode) { + assert(iNode < NJ->nSeq); + int nGaps = 0; + int p; + for(p=0; pnPos; p++) { + if (NJ->profiles[iNode]->codes[p] == NOCODE) + nGaps++; + } + return(nGaps); +} + +int CompareHitsByCriterion(const void *c1, const void *c2) { + const besthit_t *hit1 = (besthit_t*)c1; + const besthit_t *hit2 = (besthit_t*)c2; + if (hit1->criterion < hit2->criterion) return(-1); + if (hit1->criterion > hit2->criterion) return(1); + return(0); +} + +int CompareHitsByIJ(const void *c1, const void *c2) { + const besthit_t *hit1 = (besthit_t*)c1; + const besthit_t *hit2 = (besthit_t*)c2; + return hit1->i != hit2->i ? hit1->i - hit2->i : hit1->j - hit2->j; +} + +void SortSaveBestHits(int iNode, /*IN/SORT*/besthit_t *besthits, + int nIn, int nOut, + /*IN/OUT*/top_hits_t *tophits) { + assert(nIn > 0); + assert(nOut > 0); + top_hits_list_t *l = &tophits->top_hits_lists[iNode]; + /* */ + qsort(/*IN/OUT*/besthits,nIn,sizeof(besthit_t),CompareHitsByCriterion); + + /* First count how many we will save + Not sure if removing duplicates is actually necessary. 
+ */ + int nSave = 0; + int jLast = -1; + int iBest; + for (iBest = 0; iBest < nIn && nSave < nOut; iBest++) { + if (besthits[iBest].i < 0) + continue; + assert(besthits[iBest].i == iNode); + int j = besthits[iBest].j; + if (j != iNode && j != jLast && j >= 0) { + nSave++; + jLast = j; + } + } + + assert(nSave > 0); + +#ifdef OPENMP + omp_set_lock(&tophits->locks[iNode]); +#endif + if (l->hits != NULL) { + l->hits = myfree(l->hits, l->nHits * sizeof(hit_t)); + l->nHits = 0; + } + l->hits = mymalloc(sizeof(hit_t) * nSave); + l->nHits = nSave; + int iSave = 0; + jLast = -1; + for (iBest = 0; iBest < nIn && iSave < nSave; iBest++) { + int j = besthits[iBest].j; + if (j != iNode && j != jLast && j >= 0) { + l->hits[iSave].j = j; + l->hits[iSave].dist = besthits[iBest].dist; + iSave++; + jLast = j; + } + } +#ifdef OPENMP + omp_unset_lock(&tophits->locks[iNode]); +#endif + assert(iSave == nSave); +} + +void TransferBestHits(/*IN/UPDATE*/NJ_t *NJ, + int nActive, + int iNode, + /*IN*/besthit_t *oldhits, + int nOldHits, + /*OUT*/besthit_t *newhits, + bool updateDistances) { + assert(iNode >= 0); + assert(NJ->parent[iNode] < 0); + + int iBest; + for(iBest = 0; iBest < nOldHits; iBest++) { + besthit_t *old = &oldhits[iBest]; + besthit_t *new = &newhits[iBest]; + new->i = iNode; + new->j = ActiveAncestor(/*IN*/NJ, old->j); + new->dist = old->dist; /* may get reset below */ + new->weight = old->weight; + new->criterion = old->criterion; + + if(new->j < 0 || new->j == iNode) { + new->weight = 0; + new->dist = -1e20; + new->criterion = 1e20; + } else if (new->i != old->i || new->j != old->j) { + if (updateDistances) + SetDistCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/new); + else { + new->dist = -1e20; + new->criterion = 1e20; + } + } else { + if (updateDistances) + SetCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/new); + else + new->criterion = 1e20; /* leave dist alone */ + } + } +} + +void HitsToBestHits(/*IN*/hit_t *hits, int nHits, int iNode, /*OUT*/besthit_t *newhits) { 
+ int i; + for (i = 0; i < nHits; i++) { + hit_t *hit = &hits[i]; + besthit_t *bh = &newhits[i]; + bh->i = iNode; + bh->j = hit->j; + bh->dist = hit->dist; + bh->criterion = 1e20; + bh->weight = -1; /* not the true value -- we compute these directly when needed */ + } +} + +besthit_t HitToBestHit(int i, hit_t hit) { + besthit_t bh; + bh.i = i; + bh.j = hit.j; + bh.dist = hit.dist; + bh.criterion = 1e20; + bh.weight = -1; + return(bh); +} + +char *OpenMPString(void) { +#ifdef OPENMP + static char buf[100]; + sprintf(buf, ", OpenMP (%d threads)", omp_get_max_threads()); + return(buf); +#else + return(""); +#endif +} + +/* Algorithm 26.2.17 from Abromowitz and Stegun, Handbook of Mathematical Functions + Absolute accuracy of only about 1e-7, which is enough for us +*/ +double pnorm(double x) +{ + double b1 = 0.319381530; + double b2 = -0.356563782; + double b3 = 1.781477937; + double b4 = -1.821255978; + double b5 = 1.330274429; + double p = 0.2316419; + double c = 0.39894228; + + if(x >= 0.0) { + double t = 1.0 / ( 1.0 + p * x ); + return (1.0 - c * exp( -x * x / 2.0 ) * t * + ( t *( t * ( t * ( t * b5 + b4 ) + b3 ) + b2 ) + b1 )); + } + /*else*/ + double t = 1.0 / ( 1.0 - p * x ); + return ( c * exp( -x * x / 2.0 ) * t * + ( t *( t * ( t * ( t * b5 + b4 ) + b3 ) + b2 ) + b1 )); +} + +void *mymalloc(size_t sz) { + if (sz == 0) return(NULL); + void *new = malloc(sz); + if (new == NULL) { + fprintf(stderr, "Out of memory\n"); + exit(1); + } + szAllAlloc += sz; + mymallocUsed += sz; +#ifdef TRACK_MEMORY + struct mallinfo mi = mallinfo(); + if (mi.arena+mi.hblkhd > maxmallocHeap) + maxmallocHeap = mi.arena+mi.hblkhd; +#endif + /* gcc malloc should always return 16-byte-aligned values... 
*/ + assert(IS_ALIGNED(new)); + return (new); +} + +void *mymemdup(void *data, size_t sz) { + if(data==NULL) return(NULL); + void *new = mymalloc(sz); + memcpy(/*to*/new, /*from*/data, sz); + return(new); +} + +void *myrealloc(void *data, size_t szOld, size_t szNew, bool bCopy) { + if (data == NULL && szOld == 0) + return(mymalloc(szNew)); + if (data == NULL || szOld == 0 || szNew == 0) { + fprintf(stderr,"Empty myrealloc\n"); + exit(1); + } + if (szOld == szNew) + return(data); + void *new = NULL; + if (bCopy) { + /* Try to reduce memory fragmentation by allocating anew and copying + Seems to help in practice */ + new = mymemdup(data, szNew); + myfree(data, szOld); + } else { + new = realloc(data,szNew); + if (new == NULL) { + fprintf(stderr, "Out of memory\n"); + exit(1); + } + assert(IS_ALIGNED(new)); + szAllAlloc += (szNew-szOld); + mymallocUsed += (szNew-szOld); +#ifdef TRACK_MEMORY + struct mallinfo mi = mallinfo(); + if (mi.arena+mi.hblkhd > maxmallocHeap) + maxmallocHeap = mi.arena+mi.hblkhd; +#endif + } + return(new); +} + +void *myfree(void *p, size_t sz) { + if(p==NULL) return(NULL); + free(p); + mymallocUsed -= sz; + return(NULL); +} + +/******************************************************************************/ +/* Minimization of a 1-dimensional function by Brent's method (Numerical Recipes) + * Borrowed from Tree-Puzzle 5.1 util.c under GPL + * Modified by M.N.P to pass in the accessory data for the optimization function, + * to use 2x bounds around the starting guess and expand them if necessary, + * and to use both a fractional and an absolute tolerance + */ + +#define ITMAX 100 +#define CGOLD 0.3819660 +#define TINY 1.0e-20 +#define ZEPS 1.0e-10 +#define SHFT(a,b,c,d) (a)=(b);(b)=(c);(c)=(d); +#define SIGN(a,b) ((b) >= 0.0 ? 
fabs(a) : -fabs(a)) + +/* Brents method in one dimension */ +double brent(double ax, double bx, double cx, double (*f)(double, void *), void *data, + double ftol, double atol, + double *foptx, double *f2optx, double fax, double fbx, double fcx) +{ + int iter; + double a,b,d=0,etemp,fu,fv,fw,fx,p,q,r,tol1,tol2,u,v,w,x,xm; + double xw,wv,vx; + double e=0.0; + + a=(ax < cx ? ax : cx); + b=(ax > cx ? ax : cx); + x=bx; + fx=fbx; + if (fax < fcx) { + w=ax; + fw=fax; + v=cx; + fv=fcx; + } else { + w=cx; + fw=fcx; + v=ax; + fv=fax; + } + for (iter=1;iter<=ITMAX;iter++) { + xm=0.5*(a+b); + tol1=ftol*fabs(x); + tol2=2.0*(tol1+ZEPS); + if (fabs(x-xm) <= (tol2-0.5*(b-a)) + || fabs(a-b) < atol) { + *foptx = fx; + xw = x-w; + wv = w-v; + vx = v-x; + *f2optx = 2.0*(fv*xw + fx*wv + fw*vx)/ + (v*v*xw + x*x*wv + w*w*vx); + return x; + } + if (fabs(e) > tol1) { + r=(x-w)*(fx-fv); + q=(x-v)*(fx-fw); + p=(x-v)*q-(x-w)*r; + q=2.0*(q-r); + if (q > 0.0) p = -p; + q=fabs(q); + etemp=e; + e=d; + if (fabs(p) >= fabs(0.5*q*etemp) || p <= q*(a-x) || p >= q*(b-x)) + d=CGOLD*(e=(x >= xm ? a-x : b-x)); + else { + d=p/q; + u=x+d; + if (u-a < tol2 || b-u < tol2) + d=SIGN(tol1,xm-x); + } + } else { + d=CGOLD*(e=(x >= xm ? a-x : b-x)); + } + u=(fabs(d) >= tol1 ? 
x+d : x+SIGN(tol1,d)); + fu=(*f)(u,data); + if (fu <= fx) { + if (u >= x) a=x; else b=x; + SHFT(v,w,x,u) + SHFT(fv,fw,fx,fu) + } else { + if (u < x) a=u; else b=u; + if (fu <= fw || w == x) { + v=w; + w=u; + fv=fw; + fw=fu; + } else if (fu <= fv || v == x || v == w) { + v=u; + fv=fu; + } + } + } + *foptx = fx; + xw = x-w; + wv = w-v; + vx = v-x; + *f2optx = 2.0*(fv*xw + fx*wv + fw*vx)/ + (v*v*xw + x*x*wv + w*w*vx); + return x; +} /* brent */ +#undef ITMAX +#undef CGOLD +#undef ZEPS +#undef SHFT +#undef SIGN + +/* one-dimensional minimization - as input a lower and an upper limit and a trial + value for the minimum is needed: xmin < xguess < xmax + the function and a fractional tolerance has to be specified + onedimenmin returns the optimal x value and the value of the function + and its second derivative at this point + */ +double onedimenmin(double xmin, double xguess, double xmax, double (*f)(double,void*), void *data, + double ftol, double atol, + /*OUT*/double *fx, /*OUT*/double *f2x) +{ + double optx, ax, bx, cx, fa, fb, fc; + + /* first attempt to bracketize minimum */ + if (xguess == xmin) { + ax = xmin; + bx = 2.0*xguess; + cx = 10.0*xguess; + } else if (xguess <= 2.0 * xmin) { + ax = xmin; + bx = xguess; + cx = 5.0*xguess; + } else { + ax = 0.5*xguess; + bx = xguess; + cx = 2.0*xguess; + } + if (cx > xmax) + cx = xmax; + if (bx >= cx) + bx = 0.5*(ax+cx); + if (verbose > 4) + fprintf(stderr, "onedimenmin lo %.4f guess %.4f hi %.4f range %.4f %.4f\n", + ax, bx, cx, xmin, xmax); + /* ideally this range includes the true minimum, i.e., + fb < fa and fb < fc + if not, we gradually expand the boundaries until it does, + or we near the boundary of the allowed range and use that + */ + fa = (*f)(ax,data); + fb = (*f)(bx,data); + fc = (*f)(cx,data); + while(fa < fb && ax > xmin) { + ax = (ax+xmin)/2.0; + if (ax < 2.0*xmin) /* give up on shrinking the region */ + ax = xmin; + fa = (*f)(ax,data); + } + while(fc < fb && cx < xmax) { + cx = (cx+xmax)/2.0; + if (cx > 
xmax * 0.95) + cx = xmax; + fc = (*f)(cx,data); + } + optx = brent(ax, bx, cx, f, data, ftol, atol, fx, f2x, fa, fb, fc); + + if (verbose > 4) + fprintf(stderr, "onedimenmin reaches optimum f(%.4f) = %.4f f2x %.4f\n", optx, *fx, *f2x); + return optx; /* return optimal x */ +} /* onedimenmin */ + +/* Numerical code for the gamma distribution is modified from the PhyML 3 code + (GNU public license) of Stephane Guindon +*/ + +double LnGamma (double alpha) +{ +/* returns ln(gamma(alpha)) for alpha>0, accurate to 10 decimal places. + Stirling's formula is used for the central polynomial part of the procedure. + Pike MC & Hill ID (1966) Algorithm 291: Logarithm of the gamma function. + Communications of the Association for Computing Machinery, 9:684 +*/ + double x=alpha, f=0, z; + if (x<7) { + f=1; z=x-1; + while (++z<7) f*=z; + x=z; f=-(double)log(f); + } + z = 1/(x*x); + return f + (x-0.5)*(double)log(x) - x + .918938533204673 + + (((-.000595238095238*z+.000793650793651)*z-.002777777777778)*z + +.083333333333333)/x; +} + +double IncompleteGamma(double x, double alpha, double ln_gamma_alpha) +{ +/* returns the incomplete gamma ratio I(x,alpha) where x is the upper + limit of the integration and alpha is the shape parameter. + returns (-1) if in error + ln_gamma_alpha = ln(Gamma(alpha)), is almost redundant. + (1) series expansion if (alpha>x || x<=1) + (2) continued fraction otherwise + RATNEST FORTRAN by + Bhattacharjee GP (1970) The incomplete gamma integral. 
Applied Statistics, + 19: 285-287 (AS32) +*/ + int i; + double p=alpha, g=ln_gamma_alpha; + double accurate=1e-8, overflow=1e30; + double factor, gin=0, rn=0, a=0,b=0,an=0,dif=0, term=0, pn[6]; + + if (x==0) return (0); + if (x<0 || p<=0) return (-1); + + factor=(double)exp(p*(double)log(x)-x-g); + if (x>1 && x>=p) goto l30; + /* (1) series expansion */ + gin=1; term=1; rn=p; + l20: + rn++; + term*=x/rn; gin+=term; + + if (term > accurate) goto l20; + gin*=factor/p; + goto l50; + l30: + /* (2) continued fraction */ + a=1-p; b=a+x+1; term=0; + pn[0]=1; pn[1]=x; pn[2]=x+1; pn[3]=x*b; + gin=pn[2]/pn[3]; + l32: + a++; b+=2; term++; an=a*term; + for (i=0; i<2; i++) pn[i+4]=b*pn[i+2]-an*pn[i]; + if (pn[5] == 0) goto l35; + rn=pn[4]/pn[5]; dif=fabs(gin-rn); + if (dif>accurate) goto l34; + if (dif<=accurate*rn) goto l42; + l34: + gin=rn; + l35: + for (i=0; i<4; i++) pn[i]=pn[i+2]; + if (fabs(pn[4]) < overflow) goto l32; + for (i=0; i<4; i++) pn[i]/=overflow; + goto l32; + l42: + gin=1-factor*gin; + + l50: + return (gin); +} + +double PGamma(double x, double alpha) +{ + /* scale = 1/alpha */ + return IncompleteGamma(x*alpha,alpha,LnGamma(alpha)); +} + +/* helper function to subtract timval structures */ +/* Subtract the `struct timeval' values X and Y, + storing the result in RESULT. + Return 1 if the difference is negative, otherwise 0. */ +int timeval_subtract (struct timeval *result, struct timeval *x, struct timeval *y) +{ + /* Perform the carry for the later subtraction by updating y. */ + if (x->tv_usec < y->tv_usec) { + int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; + y->tv_usec -= 1000000 * nsec; + y->tv_sec += nsec; + } + if (x->tv_usec - y->tv_usec > 1000000) { + int nsec = (x->tv_usec - y->tv_usec) / 1000000; + y->tv_usec += 1000000 * nsec; + y->tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + tv_usec is certainly positive. 
*/ + result->tv_sec = x->tv_sec - y->tv_sec; + result->tv_usec = x->tv_usec - y->tv_usec; + + /* Return 1 if result is negative. */ + return x->tv_sec < y->tv_sec; +} + +double clockDiff(/*IN*/struct timeval *clock_start) { + struct timeval time_now, elapsed; + gettimeofday(/*OUT*/&time_now,NULL); + timeval_subtract(/*OUT*/&elapsed,/*IN*/&time_now,/*IN*/clock_start); + return(elapsed.tv_sec + elapsed.tv_usec*1e-6); +} + + +/* The random number generator is taken from D E Knuth + http://www-cs-faculty.stanford.edu/~knuth/taocp.html +*/ + +/* This program by D E Knuth is in the public domain and freely copyable. + * It is explained in Seminumerical Algorithms, 3rd edition, Section 3.6 + * (or in the errata to the 2nd edition --- see + * http://www-cs-faculty.stanford.edu/~knuth/taocp.html + * in the changes to Volume 2 on pages 171 and following). */ + +/* N.B. The MODIFICATIONS introduced in the 9th printing (2002) are + included here; there's no backwards compatibility with the original. */ + +/* This version also adopts Brendan McKay's suggestion to + accommodate naive users who forget to call ran_start(seed). */ + +/* If you find any bugs, please report them immediately to + * taocp@cs.stanford.edu + * (and you will be rewarded if the bug is genuine). Thanks! */ + +/************ see the book for explanations and caveats! 
+ *******************/
+/************ in particular, you need two's complement arithmetic **********/
+
+#define KK 100                     /* the long lag */
+#define LL  37                     /* the short lag */
+#define MM (1L<<30)                /* the modulus */
+#define mod_diff(x,y) (((x)-(y))&(MM-1)) /* subtraction mod MM */
+
+long ran_x[KK];                    /* the generator state */
+
+#ifdef __STDC__
+void ran_array(long aa[],int n)
+#else
+     void ran_array(aa,n)    /* put n new random numbers in aa */
+  long *aa;   /* destination */
+  int n;      /* array length (must be at least KK) */
+#endif
+{
+  register int i,j;
+  /* NOTE(review): the tail of ran_array, the declarations below, and the head
+     of ran_start were lost in this hunk (text between unspaced '<' and '>'
+     was stripped); restored from Knuth's published rng.c -- confirm against
+     upstream FastTree.c */
+  for (j=0;j<KK;j++) aa[j]=ran_x[j];
+  for (;j<n;j++) aa[j]=mod_diff(aa[j-KK],aa[j-LL]);
+  for (i=0;i<LL;i++,j++) ran_x[i]=mod_diff(aa[j-KK],aa[j-LL]);
+  for (;i<KK;i++,j++) ran_x[i]=mod_diff(aa[j-KK],ran_x[i-LL]);
+}
+
+/* the following routines are used after calling ran_start(seed) */
+
+#define QUALITY 1009 /* recommended quality level for high-res use */
+long ran_arr_buf[QUALITY];
+long ran_arr_dummy=-1, ran_arr_started=-1;
+long *ran_arr_ptr=&ran_arr_dummy; /* the next random number, or -1 */
+
+#define TT  70   /* guaranteed separation between streams */
+#define is_odd(x)  ((x)&1)          /* units bit of x */
+
+#ifdef __STDC__
+void ran_start(long seed)
+#else
+     void ran_start(seed)    /* do this before using ran_array */
+  long seed;            /* selector for different streams */
+#endif
+{
+  register int t,j;
+  long x[KK+KK-1];              /* the preparation buffer */
+  register long ss=(seed+2)&(MM-2);
+  for (j=0;j<KK;j++) {
+    x[j]=ss;                      /* bootstrap the buffer */
+    ss<<=1; if (ss>=MM) ss-=MM-2; /* cyclic shift 29 bits */
+  }
+  x[1]++;              /* make x[1] (and only x[1]) odd */
+  for (ss=seed&(MM-1),t=TT-1; t; ) {
+    for (j=KK-1;j>0;j--) x[j+j]=x[j], x[j+j-1]=0; /* "square" */
+    for (j=KK+KK-2;j>=KK;j--)
+      x[j-(KK-LL)]=mod_diff(x[j-(KK-LL)],x[j]),
+      x[j-KK]=mod_diff(x[j-KK],x[j]);
+    if (is_odd(ss)) {              /* "multiply by z" */
+      for (j=KK;j>0;j--)  x[j]=x[j-1];
+      x[0]=x[KK];            /* shift the buffer cyclically */
+      x[LL]=mod_diff(x[LL],x[KK]);
+    }
+    if (ss) ss>>=1; else t--;
+  }
+  for (j=0;j<LL;j++) ran_x[j+KK-LL]=x[j];
+  for (;j<KK;j++) ran_x[j-LL]=x[j];
+  for (j=0;j<10;j++) ran_array(x,KK+KK-1);  /* warm things up */
+  ran_arr_ptr=&ran_arr_started;
+}
+
+#define ran_arr_next() (*ran_arr_ptr>=0? *ran_arr_ptr++: ran_arr_cycle())
+long ran_arr_cycle()
+{
+  if (ran_arr_ptr==&ran_arr_dummy)
+    ran_start(314159L); /* the user forgot to initialize */
+  ran_array(ran_arr_buf,QUALITY);
+  ran_arr_buf[KK]=-1;
+  ran_arr_ptr=ran_arr_buf+1;
+  return ran_arr_buf[0];
+}
+
+/* end of code from Knuth */
+
+/* Uniform deviate in [0,1) from Knuth's generator */
+double knuth_rand() {
+  return(9.31322574615479e-10 * ran_arr_next()); /* multiply by 2**-30 */
+}
+
+/* Build a chained hash table over nStrings strings; duplicates are counted
+   and remember the index of their first occurrence */
+hashstrings_t *MakeHashtable(char **strings, int nStrings) {
+  hashstrings_t *hash = (hashstrings_t*)mymalloc(sizeof(hashstrings_t));
+  hash->nBuckets = 8*nStrings;	/* low load factor so linear probing stays cheap */
+  hash->buckets = (hashbucket_t*)mymalloc(sizeof(hashbucket_t) * hash->nBuckets);
+  int i;
+  for (i=0; i < hash->nBuckets; i++) {
+    hash->buckets[i].string = NULL;
+    hash->buckets[i].nCount = 0;
+    hash->buckets[i].first = -1;
+  }
+  for (i=0; i < nStrings; i++) {
+    hashiterator_t hi = FindMatch(hash, strings[i]);
+    if (hash->buckets[hi].string == NULL) {
+      /* save a unique entry */
+      assert(hash->buckets[hi].nCount == 0);
+      hash->buckets[hi].string = strings[i];
+      hash->buckets[hi].nCount = 1;
+      hash->buckets[hi].first = i;
+    } else {
+      /* record a duplicate entry */
+      assert(hash->buckets[hi].string != NULL);
+      assert(strcmp(hash->buckets[hi].string, strings[i]) == 0);
+      assert(hash->buckets[hi].first >= 0);
+      hash->buckets[hi].nCount++;
+    }
+  }
+  return(hash);
+}
+
+hashstrings_t *FreeHashtable(hashstrings_t* hash) {
+  if (hash != NULL) {
+    myfree(hash->buckets, sizeof(hashbucket_t) * hash->nBuckets);
+    myfree(hash, sizeof(hashstrings_t));
+  }
+  return(NULL);
+}
+
+#define MAXADLER 65521
+/* Locate the bucket for string (Adler-32 checksum + linear probing);
+   returns either the bucket holding string or the empty slot where it
+   would be inserted */
+hashiterator_t FindMatch(hashstrings_t *hash, char *string) {
+  /* Adler-32 checksum */
+  unsigned int hashA = 1;
+  unsigned int hashB = 0;
+  char *p;
+  for (p = string; *p != '\0'; p++) {
+    hashA = ((unsigned int)*p + hashA);
+    hashB = hashA+hashB;
+  }
+  hashA %= MAXADLER;
+  hashB %= MAXADLER;
+  hashiterator_t hi = (hashB*65536+hashA) % hash->nBuckets;
+  while(hash->buckets[hi].string != NULL
+	&& strcmp(hash->buckets[hi].string, string) != 0) {
+    hi++;
+    if (hi >= hash->nBuckets)
+      hi = 0;
+  }
+  return(hi);
+}
+
+char *GetHashString(hashstrings_t *hash, hashiterator_t hi) {
+  return(hash->buckets[hi].string);
+}
+
+int HashCount(hashstrings_t *hash, hashiterator_t hi) {
+  return(hash->buckets[hi].nCount);
+}
+
+int HashFirst(hashstrings_t *hash, hashiterator_t hi) {
+  return(hash->buckets[hi].first);
+}
+
+/* Collapse identical sequences in the alignment into unique entries,
+   recording the mapping in both directions */
+uniquify_t *UniquifyAln(alignment_t *aln) {
+  int nUniqueSeq = 0;
+  char **uniqueSeq = (char**)mymalloc(aln->nSeq * sizeof(char*)); /* iUnique -> seq */
+  int *uniqueFirst = (int*)mymalloc(aln->nSeq * sizeof(int)); /* iUnique -> iFirst in aln */
+  int *alnNext = (int*)mymalloc(aln->nSeq * sizeof(int)); /* i in aln -> next, or -1 */
+  int *alnToUniq = (int*)mymalloc(aln->nSeq * sizeof(int)); /* i in aln -> iUnique; many -> -1 */
+
+  int i;
+  for (i = 0; i < aln->nSeq; i++) {
+    uniqueSeq[i] = NULL;
+    uniqueFirst[i] = -1;
+    alnNext[i] = -1;
+    alnToUniq[i] = -1;
+  }
+  hashstrings_t *hashseqs = MakeHashtable(aln->seqs, aln->nSeq);
+  /* NOTE(review): loop bound restored ('<aln->' span was stripped) */
+  for (i=0; i < aln->nSeq; i++) {
+    hashiterator_t hi = FindMatch(hashseqs,aln->seqs[i]);
+    int first = HashFirst(hashseqs,hi);
+    if (first == i) {
+      uniqueSeq[nUniqueSeq] = aln->seqs[i];
+      uniqueFirst[nUniqueSeq] = i;
+      alnToUniq[i] = nUniqueSeq;
+      nUniqueSeq++;
+    } else {
+      /* append i to the chain of duplicates of its first occurrence */
+      int last = first;
+      while (alnNext[last] != -1)
+	last = alnNext[last];
+      assert(last>=0);
+      alnNext[last] = i;
+      assert(alnToUniq[last] >= 0 && alnToUniq[last] < nUniqueSeq);
+      alnToUniq[i] = alnToUniq[last];
+    }
+  }
+  assert(nUniqueSeq>0);
+  hashseqs = FreeHashtable(hashseqs);
+
+  uniquify_t *uniquify = (uniquify_t*)mymalloc(sizeof(uniquify_t));
+  uniquify->nSeq = aln->nSeq;
+  uniquify->nUnique = nUniqueSeq;
+  uniquify->uniqueFirst = uniqueFirst;
+  uniquify->alnNext = alnNext;
+  uniquify->alnToUniq = alnToUniq;
+  uniquify->uniqueSeq = uniqueSeq;
+  return(uniquify);
+}
+
+uniquify_t *FreeUniquify(uniquify_t *unique) {
+  if (unique != NULL) {
+    myfree(unique->uniqueFirst, sizeof(int)*unique->nSeq);
+    myfree(unique->alnNext, sizeof(int)*unique->nSeq);
+    myfree(unique->alnToUniq, sizeof(int)*unique->nSeq);
+    myfree(unique->uniqueSeq, sizeof(char*)*unique->nSeq);
+    myfree(unique,sizeof(uniquify_t));
+    unique = NULL;
+  }
+  return(unique);
+}
+
+/* Allocate a visited-flag array over all possible nodes, initialized false */
+traversal_t InitTraversal(NJ_t *NJ) {
+  traversal_t worked = (bool*)mymalloc(sizeof(bool)*NJ->maxnodes);
+  int i;
+  /* NOTE(review): loop bound restored ('<NJ->' span was stripped) */
+  for (i=0; i < NJ->maxnodes; i++)
+    worked[i] = false;
+  return(worked);
+}
+
+/* Mark node as already visited so TraversePostorder will not descend into it */
+void SkipTraversalInto(int node, /*IN/OUT*/traversal_t traversal) {
+  traversal[node] = true;
+}
+
+int TraversePostorder(int node, NJ_t *NJ, /*IN/OUT*/traversal_t traversal,
+		      /*OPTIONAL OUT*/bool *pUp) {
+  if (pUp)
+    *pUp = false;
+  while(1) {
+    assert(node >= 0);
+
+    /* move to a child if possible */
+    bool found = false;
+    int iChild;
+    for (iChild=0; iChild < NJ->child[node].nChild; iChild++) {
+      int child = NJ->child[node].child[iChild];
+      if (!traversal[child]) {
+	node = child;
+	found = true;
+	break;
+      }
+    }
+    if (found)
+      continue; /* keep moving down */
+    if (!traversal[node]) {
+      traversal[node] = true;
+      return(node);
+    }
+    /* If we've already done this node, need to move up */
+    if (node == NJ->root)
+      return(-1); /* nowhere to go -- done traversing */
+    node = NJ->parent[node];
+    /* If we go up to someplace that was already marked as visited, this is due
+       to a change in topology, so return it marked as "up" */
+    if (pUp && traversal[node]) {
+      *pUp = true;
+      return(node);
+    }
+  }
+}
+
+traversal_t FreeTraversal(traversal_t traversal, NJ_t *NJ) {
+  myfree(traversal, sizeof(bool)*NJ->maxnodes);
+  return(NULL);
+}
+
+/* Allocate an (initially empty) array of up-profiles, one slot per node */
+profile_t **UpProfiles(NJ_t *NJ) {
+  profile_t **upProfiles = (profile_t**)mymalloc(sizeof(profile_t*)*NJ->maxnodes);
+  int i;
+  /* NOTE(review): loop bound restored ('<NJ->' span was stripped) */
+  for (i=0; i < NJ->maxnodes; i++)
+    upProfiles[i] = NULL;
+  return(upProfiles);
+}
+
+/* Return (computing and caching if necessary) the profile of everything
+   outside the subtree rooted at outnode, walking down from the root */
+profile_t *GetUpProfile(/*IN/OUT*/profile_t **upProfiles, NJ_t *NJ, int outnode, bool useML) {
+  assert(outnode != NJ->root && outnode >= NJ->nSeq); /* not for root or leaves */
+  if (upProfiles[outnode] != NULL)
+    return(upProfiles[outnode]);
+
+  int depth;
+  int *pathToRoot = PathToRoot(NJ, outnode, /*OUT*/&depth);
+  int i;
+  /* depth-1 is root */
+  for (i = depth-2; i>=0; i--) {
+    int node = pathToRoot[i];
+
+    if (upProfiles[node] == NULL) {
+      /* Note -- SetupABCD may call GetUpProfile, but it should do it farther
+	 up in the path to the root
+      */
+      profile_t *profiles[4];
+      int nodeABCD[4];
+      SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, useML);
+      if (useML) {
+	/* If node is a child of root, then the 4th profile is of the 2nd root-sibling of node
+	   Otherwise, the 4th profile is the up-profile of the parent of node, and that
+	   is the branch-length we need
+	*/
+	double lenC = NJ->branchlength[nodeABCD[2]];
+	double lenD = NJ->branchlength[nodeABCD[3]];
+	if (verbose > 3) {
+	  fprintf(stderr, "Computing UpProfile for node %d with lenC %.4f lenD %.4f pair-loglk %.3f\n",
+		  node, lenC, lenD,
+		  PairLogLk(profiles[2],profiles[3],lenC+lenD,NJ->nPos,NJ->transmat,&NJ->rates, /*site_lk*/NULL));
+	  PrintNJInternal(stderr, NJ, /*useLen*/true);
+	}
+	upProfiles[node] = PosteriorProfile(/*C*/profiles[2], /*D*/profiles[3],
+					    lenC, lenD,
+					    NJ->transmat, &NJ->rates, NJ->nPos, NJ->nConstraints);
+      } else {
+	profile_t *profilesCDAB[4] = { profiles[2], profiles[3], profiles[0], profiles[1] };
+	double weight = QuartetWeight(profilesCDAB, NJ->distance_matrix, NJ->nPos);
+	if (verbose>3)
+	  fprintf(stderr, "Compute upprofile of %d from %d and parents (vs. children %d %d) with weight %.3f\n",
+		  node, nodeABCD[2], nodeABCD[0], nodeABCD[1], weight);
+	upProfiles[node] = AverageProfile(profiles[2], profiles[3],
+					  NJ->nPos, NJ->nConstraints,
+					  NJ->distance_matrix,
+					  weight);
+      }
+    }
+  }
+  FreePath(pathToRoot,NJ);
+  assert(upProfiles[outnode] != NULL);
+  return(upProfiles[outnode]);
+}
+
+profile_t *DeleteUpProfile(/*IN/OUT*/profile_t **upProfiles, NJ_t *NJ, int node) {
+  assert(node>=0 && node < NJ->maxnodes);
+  if (upProfiles[node] != NULL)
+    upProfiles[node] = FreeProfile(upProfiles[node], NJ->nPos, NJ->nConstraints); /* returns NULL */
+  return(NULL);
+}
+
+profile_t **FreeUpProfiles(profile_t **upProfiles, NJ_t *NJ) {
+  int i;
+  int nUsed = 0;
+  for (i=0; i < NJ->maxnodes; i++) {
+    if (upProfiles[i] != NULL)
+      nUsed++;
+    DeleteUpProfile(upProfiles, NJ, i);
+  }
+  myfree(upProfiles, sizeof(profile_t*)*NJ->maxnodes);
+  if (verbose >= 3)
+    fprintf(stderr,"FreeUpProfiles -- freed %d\n", nUsed);
+  return(NULL);
+}
+
+/* Return the node's path to the root (node first, root last) and its length */
+int *PathToRoot(NJ_t *NJ, int node, /*OUT*/int *outDepth) {
+  int *pathToRoot = (int*)mymalloc(sizeof(int)*NJ->maxnodes);
+  int depth = 0;
+  int ancestor = node;
+  while(ancestor >= 0) {
+    pathToRoot[depth] = ancestor;
+    ancestor = NJ->parent[ancestor];
+    depth++;
+  }
+  *outDepth = depth;
+  return(pathToRoot);
+}
+
+int *FreePath(int *path, NJ_t *NJ) {
+  myfree(path, sizeof(int)*NJ->maxnodes);
+  return(NULL);
+}
+
+/* Build a GTR transition matrix from the 6 exchangeabilities r and the
+   stationary frequencies f, normalized to mean rate 1 */
+transition_matrix_t *CreateGTR(double *r/*ac ag at cg ct gt*/, double *f/*acgt*/) {
+  double matrix[4][MAXCODES];
+  assert(nCodes==4);
+  int i, j;
+  /* Place rates onto a symmetric matrix, but correct by f(target), so that
+     stationary distribution f[] is maintained
+     Leave diagonals as 0 (CreateTransitionMatrix will fix them)
+  */
+  int imat = 0;
+  for (i = 0; i < nCodes; i++) {
+    matrix[i][i] = 0;
+    for (j = i+1; j < nCodes; j++) {
+      double rate = r[imat++];
+      assert(rate > 0);
+      /* Want t(matrix) * f to be 0 */
+      matrix[i][j] = rate * f[i];
+      matrix[j][i] = rate * f[j];
+    }
+  }
+  /* Compute average mutation rate */
+  double total_rate = 0;
+  for (i = 0; i < nCodes; i++)
+    for (j = 0; j < nCodes; j++)
+      total_rate += f[i] * matrix[i][j];
+  assert(total_rate > 1e-6);
+  double inv = 1.0/total_rate;
+  for (i = 0; i < nCodes; i++)
+    for (j = 0; j < nCodes; j++)
+      matrix[i][j] *= inv;
+  return(CreateTransitionMatrix(matrix,f));
+}
+
+transition_matrix_t *CreateTransitionMatrix(/*IN*/double matrix[MAXCODES][MAXCODES],
+					    /*IN*/double stat[MAXCODES]) {
+  int i,j,k;
+  transition_matrix_t *transmat = mymalloc(sizeof(transition_matrix_t));
+  double sqrtstat[20];
+  for (i = 0; i < nCodes; i++) {
+    transmat->stat[i] = stat[i];
+    transmat->statinv[i] = 1.0/stat[i];
+    sqrtstat[i] = sqrt(stat[i]);
+  }
+
+  double sym[20*20];		/* symmetrized matrix M' */
+  /* set diagonals so columns sums are 0 before symmetrization */
+  for (i = 0; i < nCodes; i++)
+    for (j = 0; j < nCodes; j++)
+      sym[nCodes*i+j] = matrix[i][j];
+  for (j = 0; j < nCodes; j++) {
+    double sum = 0;
+    sym[nCodes*j+j] = 0;
+    for (i = 0; i < nCodes; i++)
+      sum += sym[nCodes*i+j];
+    sym[nCodes*j+j] = -sum;
+  }
+  /* M' = S**-1 M S */
+  for (i = 0; i < nCodes; i++)
+    for (j = 0; j < nCodes; j++)
+      sym[nCodes*i+j] *= sqrtstat[j]/sqrtstat[i];
+
+  /* eigen decomposition of M' -- note that eigenW is the transpose of what we want,
+     which is eigenvectors in columns */
+  double eigenW[20*20], eval[20], e[20];
+  for (i = 0; i < nCodes*nCodes; i++)
+    eigenW[i] = sym[i];
+  tred2(eigenW, nCodes, nCodes, eval, e);
+  tqli(eval, e, nCodes , nCodes, eigenW);
+
+  /* save eigenvalues */
+  for (i = 0; i < nCodes; i++)
+    transmat->eigenval[i] = eval[i];
+
+  /* compute eigen decomposition of M into t(codeFreq): V = S*W */
+  /* compute inverse of V in eigeninv: V**-1 = t(W) S**-1  */
+  for (i = 0; i < nCodes; i++) {
+    for (j = 0; j < nCodes; j++) {
+      transmat->eigeninv[i][j] = eigenW[nCodes*i+j] / sqrtstat[j];
+      transmat->eigeninvT[j][i] = transmat->eigeninv[i][j];
+    }
+  }
+  for (i = 0; i < nCodes; i++)
+    for (j = 0; j < nCodes; j++)
+      transmat->codeFreq[i][j] = eigenW[j*nCodes+i] * sqrtstat[i];
+  /* codeFreq[NOCODE] is the rotation of (1,1,...) not (1/nCodes,1/nCodes,...), which
+     gives correct posterior probabilities
+  */
+  for (j = 0; j < nCodes; j++) {
+    transmat->codeFreq[NOCODE][j] = 0.0;
+    for (i = 0; i < nCodes; i++)
+      transmat->codeFreq[NOCODE][j] += transmat->codeFreq[i][j];
+  }
+  /* save some posterior probabilities for approximating later:
+     first, we compute P(B | A, t) for t = approxMLnearT, by using
+     V * exp(L*t) * V**-1 */
+  double expvalues[MAXCODES];
+  for (i = 0; i < nCodes; i++)
+    expvalues[i] = exp(approxMLnearT * transmat->eigenval[i]);
+  double LVinv[MAXCODES][MAXCODES]; /* exp(L*t) * V**-1 */
+  for (i = 0; i < nCodes; i++) {
+    for (j = 0; j < nCodes; j++)
+      LVinv[i][j] = transmat->eigeninv[i][j] * expvalues[i];
+  }
+  /* matrix transform for converting A -> B given t: transt[i][j] = P(j->i | t) */
+  double transt[MAXCODES][MAXCODES];
+  for (i = 0; i < nCodes; i++) {
+    for (j = 0; j < nCodes; j++) {
+      transt[i][j] = 0;
+      for (k = 0; k < nCodes; k++)
+	transt[i][j] += transmat->codeFreq[i][k] * LVinv[k][j];
+    }
+  }
+  /* nearP[i][j] = P(parent = j | both children are i) = P(j | i,i) ~ stat(j) * P(j->i | t)**2 */
+  for (i = 0; i < nCodes; i++) {
+    double nearP[MAXCODES];
+    double tot = 0;
+    for (j = 0; j < nCodes; j++) {
+      assert(transt[j][i] > 0);
+      assert(transmat->stat[j] > 0);
+      nearP[j] = transmat->stat[j] * transt[i][j] * transt[i][j];
+      tot += nearP[j];
+    }
+    assert(tot > 0);
+    for (j = 0; j < nCodes; j++)
+      nearP[j] *= 1.0/tot;
+    /* save nearP in transmat->nearP[i][] */
+    for (j = 0; j < nCodes; j++)
+      transmat->nearP[i][j] = nearP[j];
+    /* multiply by 1/stat and rotate nearP */
+    for (j = 0; j < nCodes; j++)
+      nearP[j] /= transmat->stat[j];
+    for (j = 0; j < nCodes; j++) {
+      double rot = 0;
+      for (k = 0; k < nCodes; k++)
+	rot += nearP[k] * transmat->codeFreq[i][j];
+      transmat->nearFreq[i][j] = rot;
+    }
+  }
+  /* NOTE(review): an unreachable assert(0) after this return was removed */
+  return(transmat);
+}
+
+distance_matrix_t *TransMatToDistanceMat(transition_matrix_t *transmat) { + if (transmat == NULL) + return(NULL); + distance_matrix_t *dmat = mymalloc(sizeof(distance_matrix_t)); + int i, j; + for (i=0; idistances[i][j] = 0; /* never actually used */ + dmat->eigeninv[i][j] = transmat->eigeninv[i][j]; + dmat->codeFreq[i][j] = transmat->codeFreq[i][j]; + } + } + /* eigentot . rotated-vector is the total frequency of the unrotated vector + (used to normalize in NormalizeFreq() + For transition matrices, we rotate by transpose of eigenvectors, so + we need to multiply by the inverse matrix by 1....1 to get this vector, + or in other words, sum the columns + */ + for(i = 0; ieigentot[i] = 0.0; + for (j = 0; jeigentot[i] += transmat->eigeninv[i][j]; + } + return(dmat); +} + +/* Numerical recipes code for eigen decomposition (actually taken from RAxML rev_functions.c) */ +void tred2 (double *a, const int n, const int np, double *d, double *e) +{ +#define a(i,j) a[(j-1)*np + (i-1)] +#define e(i) e[i-1] +#define d(i) d[i-1] + int i, j, k, l; + double f, g, h, hh, scale; + for (i = n; i > 1; i--) { + l = i-1; + h = 0; + scale = 0; + if ( l > 1 ) { + for ( k = 1; k <= l; k++ ) + scale += fabs(a(i,k)); + if (scale == 0) + e(i) = a(i,l); + else { + for (k = 1; k <= l; k++) { + a(i,k) /= scale; + h += a(i,k) * a(i,k); + } + f = a(i,l); + g = -sqrt(h); + if (f < 0) g = -g; + e(i) = scale *g; + h -= f*g; + a(i,l) = f-g; + f = 0; + for (j = 1; j <=l ; j++) { + a(j,i) = a(i,j) / h; + g = 0; + for (k = 1; k <= j; k++) + g += a(j,k)*a(i,k); + for (k = j+1; k <= l; k++) + g += a(k,j)*a(i,k); + e(j) = g/h; + f += e(j)*a(i,j); + } + hh = f/(h+h); + for (j = 1; j <= l; j++) { + f = a(i,j); + g = e(j) - hh * f; + e(j) = g; + for (k = 1; k <= j; k++) + a(j,k) -= f*e(k) + g*a(i,k); + } + } + } else + e(i) = a(i,l); + d(i) = h; + } + d(1) = 0; + e(1) = 0; + for (i = 1; i <= n; i++) { + l = i-1; + if (d(i) != 0) { + for (j = 1; j <=l; j++) { + g = 0; + for (k = 1; k <= l; k++) + g += 
a(i,k)*a(k,j); + for (k=1; k <=l; k++) + a(k,j) -= g * a(k,i); + } + } + d(i) = a(i,i); + a(i,i) = 1; + for (j=1; j<=l; j++) + a(i,j) = a(j,i) = 0; + } + + return; +#undef a +#undef e +#undef d +} + +double pythag(double a, double b) { + double absa = fabs(a), absb = fabs(b); + return (absa > absb) ? + absa * sqrt(1+ (absb/absa)*(absb/absa)) : + absb == 0 ? + 0 : + absb * sqrt(1+ (absa/absb)*(absa/absb)); +} + +void tqli(double *d, double *e, int n, int np, double *z) +{ +#define z(i,j) z[(j-1)*np + (i-1)] +#define e(i) e[i-1] +#define d(i) d[i-1] + + int i = 0, iter = 0, k = 0, l = 0, m = 0; + double b = 0, c = 0, dd = 0, f = 0, g = 0, p = 0, r = 0, s = 0; + + for(i=2; i<=n; i++) + e(i-1) = e(i); + e(n) = 0; + + for (l = 1; l <= n; l++) + { + iter = 0; + labelExtra: + + for (m = l; (m < n); m++) + { + dd = fabs(d(m))+fabs(d(m+1)); + + if (fabs(e(m))+dd == dd) + break; + } + + if (m != l) + { + assert(iter < 30); + + iter++; + g = (d(l+1)-d(l))/(2*e(l)); + r = pythag(g,1.); + g = d(m)-d(l)+e(l)/(g+(g<0?-r:r)); + s = 1; + c = 1; + p = 0; + + for (i = m-1; i>=l; i--) + { + f = s*e(i); + b = c*e(i); + r = pythag(f,g); + + e(i+1) = r; + if (r == 0) + { + d (i+1) -= p; + e (m) = 0; + + goto labelExtra; + } + s = f/r; + c = g/r; + g = d(i+1)-p; + r = (d(i)-g)*s + 2*c*b; + p = s*r; + d(i+1) = g + p; + g = c*r - b; + for (k=1; k <= n; k++) + { + f = z(k,i+1); + z(k,i+1) = s * z(k,i) + c*f; + z(k,i) = c * z(k,i) - s*f; + } + } + d(l) -= p; + e(l) = g; + e(m) = 0; + + goto labelExtra; + } + } + + return; +#undef z +#undef e +#undef d + +} + +#ifdef USE_SSE3 +inline float mm_sum(register __m128 sum) { +#if 1 + /* stupider but faster */ + float f[4] ALIGNED; + _mm_store_ps(f,sum); + return(f[0]+f[1]+f[2]+f[3]); +#else + /* first we get sum[0]+sum[1], sum[2]+sum[3] by selecting 0/1 and 2/3 */ + sum = _mm_add_ps(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(0,1,2,3))); + /* then get sum[0]+sum[1]+sum[2]+sum[3] by selecting 0/1 and 0/1 */ + sum = 
_mm_add_ps(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(0,1,0,1))); + float f; + _mm_store_ss(&f, sum); /* save the lowest word */ + return(f); +#endif +} +#endif + +void vector_multiply(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, int n, /*OUT*/numeric_t *fOut) { +#ifdef USE_SSE3 + int i; + for (i = 0; i < n; i += 4) { + __m128 a, b, c; + a = _mm_load_ps(f1+i); + b = _mm_load_ps(f2+i); + c = _mm_mul_ps(a, b); + _mm_store_ps(fOut+i,c); + } +#else + int i; + for (i = 0; i < n; i++) + fOut[i] = f1[i]*f2[i]; +#endif +} + +numeric_t vector_multiply_sum(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, int n) { +#ifdef USE_SSE3 + if (n == 4) + return(f1[0]*f2[0]+f1[1]*f2[1]+f1[2]*f2[2]+f1[3]*f2[3]); + __m128 sum = _mm_setzero_ps(); + int i; + for (i = 0; i < n; i += 4) { + __m128 a, b, c; + a = _mm_load_ps(f1+i); + b = _mm_load_ps(f2+i); + c = _mm_mul_ps(a, b); + sum = _mm_add_ps(c, sum); + } + return(mm_sum(sum)); +#else + int i; + numeric_t out = 0.0; + for (i=0; i < n; i++) + out += f1[i]*f2[i]; + return(out); +#endif +} + +/* sum(f1*f2*f3) */ +numeric_t vector_multiply3_sum(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, /*IN*/numeric_t* f3, int n) { +#ifdef USE_SSE3 + __m128 sum = _mm_setzero_ps(); + int i; + for (i = 0; i < n; i += 4) { + __m128 a1, a2, a3; + a1 = _mm_load_ps(f1+i); + a2 = _mm_load_ps(f2+i); + a3 = _mm_load_ps(f3+i); + sum = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(a1,a2),a3),sum); + } + return(mm_sum(sum)); +#else + int i; + numeric_t sum = 0.0; + for (i = 0; i < n; i++) + sum += f1[i]*f2[i]*f3[i]; + return(sum); +#endif +} + +numeric_t vector_dot_product_rot(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, /*IN*/numeric_t *fBy, int n) { +#ifdef USE_SSE3 + __m128 sum1 = _mm_setzero_ps(); + __m128 sum2 = _mm_setzero_ps(); + int i; + for (i = 0; i < n; i += 4) { + __m128 a1, a2, aBy; + a1 = _mm_load_ps(f1+i); + a2 = _mm_load_ps(f2+i); + aBy = _mm_load_ps(fBy+i); + sum1 = _mm_add_ps(_mm_mul_ps(a1, aBy), sum1); + sum2 = _mm_add_ps(_mm_mul_ps(a2, aBy), sum2); + } + 
return(mm_sum(sum1)*mm_sum(sum2)); +#else + int i; + numeric_t out1 = 0.0; + numeric_t out2 = 0.0; + for (i=0; i < n; i++) { + out1 += f1[i]*fBy[i]; + out2 += f2[i]*fBy[i]; + } + return(out1*out2); +#endif +} + +numeric_t vector_sum(/*IN*/numeric_t *f1, int n) { +#ifdef USE_SSE3 + if (n==4) + return(f1[0]+f1[1]+f1[2]+f1[3]); + __m128 sum = _mm_setzero_ps(); + int i; + for (i = 0; i < n; i+=4) { + __m128 a; + a = _mm_load_ps(f1+i); + sum = _mm_add_ps(a, sum); + } + return(mm_sum(sum)); +#else + numeric_t out = 0.0; + int i; + for (i = 0; i < n; i++) + out += f1[i]; + return(out); +#endif +} + +void vector_multiply_by(/*IN/OUT*/numeric_t *f, /*IN*/numeric_t fBy, int n) { + int i; +#ifdef USE_SSE3 + __m128 c = _mm_set1_ps(fBy); + for (i = 0; i < n; i += 4) { + __m128 a, b; + a = _mm_load_ps(f+i); + b = _mm_mul_ps(a,c); + _mm_store_ps(f+i,b); + } +#else + for (i = 0; i < n; i++) + f[i] *= fBy; +#endif +} + +void vector_add_mult(/*IN/OUT*/numeric_t *fTot, /*IN*/numeric_t *fAdd, numeric_t weight, int n) { +#ifdef USE_SSE3 + int i; + __m128 w = _mm_set1_ps(weight); + for (i = 0; i < n; i += 4) { + __m128 tot, add; + tot = _mm_load_ps(fTot+i); + add = _mm_load_ps(fAdd+i); + _mm_store_ps(fTot+i, _mm_add_ps(tot, _mm_mul_ps(add,w))); + } +#else + int i; + for (i = 0; i < n; i++) + fTot[i] += fAdd[i] * weight; +#endif +} + +void matrixt_by_vector4(/*IN*/numeric_t mat[4][MAXCODES], /*IN*/numeric_t vec[4], /*OUT*/numeric_t out[4]) { +#ifdef USE_SSE3 + /*__m128 v = _mm_load_ps(vec);*/ + __m128 o = _mm_setzero_ps(); + int j; + /* result is a sum of vectors: sum(k) v[k] * mat[k][] */ + for (j = 0; j < 4; j++) { + __m128 m = _mm_load_ps(&mat[j][0]); + __m128 vj = _mm_load1_ps(&vec[j]); /* is it faster to shuffle v? 
*/ + o = _mm_add_ps(o, _mm_mul_ps(vj,m)); + } + _mm_store_ps(out, o); +#else + int j,k; + for (j = 0; j < 4; j++) { + double sum = 0; + for (k = 0; k < 4; k++) + sum += vec[k] * mat[k][j]; + out[j] = sum; + } +#endif +} + +transition_matrix_t *ReadAATransitionMatrix(/*IN*/char *filename) { + assert(nCodes==20); + double stat[20]; + static double matrix[MAXCODES][MAXCODES]; + static char buf[BUFFER_SIZE]; + FILE *fp = fopen(filename, "r"); + if (fp == NULL) { + fprintf(stderr, "Cannot read transition matrix file %s\n", filename); + exit(1); + } + char expected[2*MAXCODES+20]; + int posE = 0; + int i, j; + for (i = 0; i < 20; i++) { + expected[posE++] = codesStringAA[i]; + expected[posE++] = '\t'; + } + expected[posE++] = '*'; + expected[posE++] = '\n'; + expected[posE++] = '\0'; + + if (fgets(buf, sizeof(buf), fp) == NULL) { + fprintf(stderr, "Error reading header line from transition matrix file\n"); + exit(1); + } + if (strcmp(buf, expected) != 0) { + fprintf(stderr, "Invalid header line in transition matrix file, it must match:\n%s\n", expected); + exit(1); + } + for (i = 0; i < 20; i++) { + if (fgets(buf, sizeof(buf), fp) == NULL) { + fprintf(stderr, "Error reading matrix line\n"); + exit(1); + } + char *field = strtok(buf,"\t\r\n"); + if (field == NULL || strlen(field) != 1 || field[0] != codesStringAA[i]) { + fprintf(stderr, "Line for amino acid %c does not have the expected beginning\n", codesStringAA[i]); + exit(1); + } + for (j = 0; j < 20; j++) { + field = strtok(NULL, "\t\r\n"); + if (field == NULL) { + fprintf(stderr, "Not enough fields for amino acid %c\n", codesStringAA[i]); + exit(1); + } + matrix[i][j] = atof(field); + } + field = strtok(NULL, "\t\r\n"); + if (field == NULL) { + fprintf(stderr, "Not enough fields for amino acid %c\n", codesStringAA[i]); + exit(1); + } + stat[i] = atof(field); + } + + double tol = 1e-5; + /* Verify that stat is positive and sums to 1 */ + double statTot = 0; + for (i = 0; i < 20; i++) { + if (stat[i] < tol) { + 
fprintf(stderr, "stationary frequency for amino acid %c must be positive\n", codesStringAA[i]); + exit(1); + } + statTot += stat[i]; + } + if (fabs(statTot - 1) > tol) { + fprintf(stderr, "stationary frequencies must sum to 1 -- actual sum is %g\n", statTot); + exit(1); + } + + /* Verify that diagonals are negative and dot product of stat and diagonals is -1 */ + double totRate = 0; + for (i = 0; i < 20; i++) { + double diag = matrix[i][i]; + if (diag > -tol) { + fprintf(stderr, "transition rate(%c,%c) must be negative\n", + codesStringAA[i], codesStringAA[i]); + exit(1); + } + totRate += stat[i] * diag; + } + if (fabs(totRate + 1) > tol) { + fprintf(stderr, "Dot product of matrix diagonal and stationary frequencies must be -1 -- actual dot product is %g\n", + totRate); + exit(1); + } + + /* Verify that each off-diagonal entry is nonnegative and that each column sums to 0 */ + for (j = 0; j < 20; j++) { + double colSum = 0; + for (i = 0; i < 20; i++) { + double value = matrix[i][j]; + colSum += value; + if (i != j && value < 0) { + fprintf(stderr, "Off-diagonal matrix entry for (%c,%c) is negative\n", + codesStringAA[i], codesStringAA[j]); + exit(1); + } + } + if (fabs(colSum) > tol) { + fprintf(stderr, "Sum of column %c must be zero -- actual sum is %g\n", + codesStringAA[j], colSum); + exit(1); + } + } + return CreateTransitionMatrix(matrix, stat); +} + +distance_matrix_t matrixBLOSUM45 = + { + /*distances*/ + { + {0, 1.31097856157468, 1.06573001937323, 1.2682782988532, 0.90471293383305, 1.05855446876905, 1.05232790675508, 0.769574440593014, 1.27579668305679, 0.964604099952603, 0.987178199640556, 1.05007594438157, 1.05464162250736, 1.1985987403937, 0.967404475245526, 0.700490199584332, 0.880060189098976, 1.09748548316685, 1.28141710375267, 0.800038509951648}, + {1.31097856157468, 0, 0.8010890222701, 0.953340718498495, 1.36011107208122, 0.631543775840481, 0.791014908659279, 1.15694899265629, 0.761152570032029, 1.45014917711188, 1.17792001455227, 0.394661075648738, 
0.998807558909651, 1.135143404599, 1.15432562628921, 1.05309036790541, 1.05010474413616, 1.03938321130789, 0.963216908696184, 1.20274751778601}, + {1.06573001937323, 0.8010890222701, 0, 0.488217214273568, 1.10567116937273, 0.814970207038261, 0.810176440932339, 0.746487413974582, 0.61876156253224, 1.17886558630004, 1.52003670190022, 0.808442678243754, 1.2889025816028, 1.16264109995678, 1.18228799147301, 0.679475681649858, 0.853658619686283, 1.68988558988005, 1.24297493464833, 1.55207513886163}, + {1.2682782988532, 0.953340718498495, 0.488217214273568, 0, 1.31581050011876, 0.769778474953791, 0.482077627352988, 0.888361752320536, 0.736360849050364, 1.76756333403346, 1.43574761894039, 0.763612910719347, 1.53386612356483, 1.74323672079854, 0.886347403928663, 0.808614044804528, 1.01590147813779, 1.59617804551619, 1.1740494822217, 1.46600946033173}, + {0.90471293383305, 1.36011107208122, 1.10567116937273, 1.31581050011876, 0, 1.3836789310481, 1.37553994252576, 1.26740695314856, 1.32361065635259, 1.26087264215993, 1.02417540515351, 1.37259631233791, 1.09416720447891, 0.986982088723923, 1.59321190226694, 0.915638787768407, 0.913042853922533, 1.80744143643002, 1.3294417177004, 0.830022143283238}, + {1.05855446876905, 0.631543775840481, 0.814970207038261, 0.769778474953791, 1.3836789310481, 0, 0.506942797642807, 1.17699648087288, 0.614595446514896, 1.17092829494457, 1.19833088638994, 0.637341078675405, 0.806490842729072, 1.83315144709714, 0.932064479113502, 0.850321696813199, 1.06830084665916, 1.05739353225849, 0.979907428113788, 1.5416250309563}, + {1.05232790675508, 0.791014908659279, 0.810176440932339, 0.482077627352988, 1.37553994252576, 0.506942797642807, 0, 1.17007322676118, 0.769786956320484, 1.46659942462342, 1.19128214039009, 0.633592151371708, 1.27269395724349, 1.44641491621774, 0.735428579892476, 0.845319988414402, 1.06201695511881, 1.324395996498, 1.22734387448031, 1.53255698189437}, + {0.769574440593014, 1.15694899265629, 0.746487413974582, 0.888361752320536, 
1.26740695314856, 1.17699648087288, 1.17007322676118, 0, 1.1259007054424, 1.7025415585924, 1.38293205218175, 1.16756929156758, 1.17264582493965, 1.33271035269688, 1.07564768421292, 0.778868281341681, 1.23287107008366, 0.968539655354582, 1.42479529031801, 1.41208067821187}, + {1.27579668305679, 0.761152570032029, 0.61876156253224, 0.736360849050364, 1.32361065635259, 0.614595446514896, 0.769786956320484, 1.1259007054424, 0, 1.4112324673522, 1.14630894167097, 0.967795284542623, 0.771479459384692, 1.10468029976148, 1.12334774065132, 1.02482926701639, 1.28754326478771, 1.27439749294131, 0.468683841672724, 1.47469999960758}, + {0.964604099952603, 1.45014917711188, 1.17886558630004, 1.76756333403346, 1.26087264215993, 1.17092829494457, 1.46659942462342, 1.7025415585924, 1.4112324673522, 0, 0.433350517223017, 1.463460928818, 0.462965544381851, 0.66291968000662, 1.07010201755441, 1.23000200130049, 0.973485453109068, 0.963546200571036, 0.708724769805536, 0.351200119909572}, + {0.987178199640556, 1.17792001455227, 1.52003670190022, 1.43574761894039, 1.02417540515351, 1.19833088638994, 1.19128214039009, 1.38293205218175, 1.14630894167097, 0.433350517223017, 0, 1.49770950074319, 0.473800072611076, 0.538473125003292, 1.37979627224964, 1.5859723170438, 0.996267398224516, 0.986095542821092, 0.725310666139274, 0.570542199221932}, + {1.05007594438157, 0.394661075648738, 0.808442678243754, 0.763612910719347, 1.37259631233791, 0.637341078675405, 0.633592151371708, 1.16756929156758, 0.967795284542623, 1.463460928818, 1.49770950074319, 0, 1.0079761868248, 1.44331961488922, 0.924599080166146, 1.06275728888356, 1.05974425835993, 1.04892430642749, 0.972058829603409, 1.21378822764856}, + {1.05464162250736, 0.998807558909651, 1.2889025816028, 1.53386612356483, 1.09416720447891, 0.806490842729072, 1.27269395724349, 1.17264582493965, 0.771479459384692, 0.462965544381851, 0.473800072611076, 1.0079761868248, 0, 0.72479754849538, 1.1699868662153, 1.34481214251794, 1.06435197383538, 
1.05348497728858, 0.774878150710318, 0.609532859331199}, + {1.1985987403937, 1.135143404599, 1.16264109995678, 1.74323672079854, 0.986982088723923, 1.83315144709714, 1.44641491621774, 1.33271035269688, 1.10468029976148, 0.66291968000662, 0.538473125003292, 1.44331961488922, 0.72479754849538, 0, 1.32968844979665, 1.21307373491949, 0.960087571600877, 0.475142555482979, 0.349485367759138, 0.692733248746636}, + {0.967404475245526, 1.15432562628921, 1.18228799147301, 0.886347403928663, 1.59321190226694, 0.932064479113502, 0.735428579892476, 1.07564768421292, 1.12334774065132, 1.07010201755441, 1.37979627224964, 0.924599080166146, 1.1699868662153, 1.32968844979665, 0, 0.979087429691819, 0.97631161216338, 1.21751652292503, 1.42156458605332, 1.40887880416009}, + {0.700490199584332, 1.05309036790541, 0.679475681649858, 0.808614044804528, 0.915638787768407, 0.850321696813199, 0.845319988414402, 0.778868281341681, 1.02482926701639, 1.23000200130049, 1.5859723170438, 1.06275728888356, 1.34481214251794, 1.21307373491949, 0.979087429691819, 0, 0.56109848274013, 1.76318885009194, 1.29689226231656, 1.02015839286433}, + {0.880060189098976, 1.05010474413616, 0.853658619686283, 1.01590147813779, 0.913042853922533, 1.06830084665916, 1.06201695511881, 1.23287107008366, 1.28754326478771, 0.973485453109068, 0.996267398224516, 1.05974425835993, 1.06435197383538, 0.960087571600877, 0.97631161216338, 0.56109848274013, 0, 1.39547634461879, 1.02642577026706, 0.807404666228614}, + {1.09748548316685, 1.03938321130789, 1.68988558988005, 1.59617804551619, 1.80744143643002, 1.05739353225849, 1.324395996498, 0.968539655354582, 1.27439749294131, 0.963546200571036, 0.986095542821092, 1.04892430642749, 1.05348497728858, 0.475142555482979, 1.21751652292503, 1.76318885009194, 1.39547634461879, 0, 0.320002937404137, 1.268589159299}, + {1.28141710375267, 0.963216908696184, 1.24297493464833, 1.1740494822217, 1.3294417177004, 0.979907428113788, 1.22734387448031, 1.42479529031801, 0.468683841672724, 
0.708724769805536, 0.725310666139274, 0.972058829603409, 0.774878150710318, 0.349485367759138, 1.42156458605332, 1.29689226231656, 1.02642577026706, 0.320002937404137, 0, 0.933095433689795}, + {0.800038509951648, 1.20274751778601, 1.55207513886163, 1.46600946033173, 0.830022143283238, 1.5416250309563, 1.53255698189437, 1.41208067821187, 1.47469999960758, 0.351200119909572, 0.570542199221932, 1.21378822764856, 0.609532859331199, 0.692733248746636, 1.40887880416009, 1.02015839286433, 0.807404666228614, 1.268589159299, 0.933095433689795, 0} + }, + /*eigeninv*/ + { + {-0.216311217101265, -0.215171653035930, -0.217000020881064, -0.232890860601250, -0.25403526530177, -0.211569372858927, -0.218073620637049, -0.240585637190076, -0.214507049619293, -0.228476323330312, -0.223235445346107, -0.216116483840334, -0.206903836810903, -0.223553828183343, -0.236937609127783, -0.217652789023588, -0.211982652566286, -0.245995223308316, -0.206187718714279, -0.227670670439422}, + {-0.0843931919568687, -0.0342164464991033, 0.393702284928246, -0.166018266253027, 0.0500896782860136, -0.262731388032538, 0.030139964190519, -0.253997503551094, -0.0932603349591988, -0.32884667697173, 0.199966846276877, -0.117543453869516, 0.196248237055757, -0.456448703853250, 0.139286961076387, 0.241166801918811, -0.0783508285295053, 0.377438091416498, 0.109499076984234, 0.128581669647144}, + {-0.0690428674271772, 0.0133858672878363, -0.208289917312908, 0.161232925220819, 0.0735806288007248, -0.316269599838174, -0.0640708424745702, -0.117078801507436, 0.360805085405857, 0.336899760384943, 0.0332447078185156, 0.132954055834276, 0.00595209121998118, -0.157755611190327, -0.199839273133436, 0.193688928807663, 0.0970290928040946, 0.374683975138541, -0.478110944870958, -0.243290196936098}, + {0.117284581850481, 0.310399467781876, -0.143513477698805, 0.088808130300351, 0.105747812943691, -0.373871701179853, 0.189069306295134, 0.133258225034741, -0.213043549687694, 0.301303731259140, -0.182085224761849, 
-0.161971915020789, 0.229301173581378, -0.293586313243755, -0.0260480060747498, -0.0217953684540699, 0.0202675755458796, -0.160134624443657, 0.431950096999465, -0.329885160320501}, + {0.256496969244703, 0.0907408349583135, 0.0135731083898029, 0.477557831930769, -0.0727379669280703, 0.101732675207959, -0.147293025369251, -0.348325291603251, -0.255678082078362, -0.187092643740172, -0.177164064346593, -0.225921480146133, 0.422318841046522, 0.319959853469398, -0.0623652546300045, 0.0824203908606883, -0.102057926881110, 0.120728407576411, -0.156845807891241, -0.123528163091204}, + {-0.00906668858975576, -0.0814722888231236, -0.0762715085459023, 0.055819989938286, -0.0540516675257271, -0.0070589302769034, -0.315813159989213, -0.0103527463419808, -0.194634331372293, -0.0185860407566822, 0.50134169352609, 0.384531812730061, -0.0405008616742061, 0.0781033650669525, 0.069334900096687, 0.396455180448549, -0.204065801866462, -0.215272089630713, 0.171046818996465, -0.396393364716348}, + {0.201971098571663, 0.489747667606921, 0.00226258734592836, 0.0969514005747054, 0.0853921636903791, 0.0862068740282345, -0.465412154271164, -0.130516676347786, 0.165513616974634, 0.0712238027886633, 0.140746943067963, -0.325919272273406, -0.421213488261598, -0.163508199065965, 0.269695802810568, -0.110296405171437, -0.106834099902202, 0.00509414588152415, 0.00909215239544615, 0.0500401865589727}, + {0.515854176692456, -0.087468413428258, 0.102796468891449, -0.06046105990993, -0.212014383772414, -0.259853648383794, -0.0997372883043333, -0.109934574535736, 0.284891018406112, -0.250578342940183, 0.142174204994568, 0.210384918947619, 0.118803190788946, -0.0268434355996836, 0.0103721198836548, -0.355555176478458, 0.428042332431476, -0.150610175411631, 0.0464090887952940, -0.140238796382057}, + {-0.239392215229762, -0.315483492656425, 0.100205194952396, 0.197830195325302, 0.40178804665223, 0.195809461460298, -0.407817115321684, 0.0226836686147386, -0.169780276210306, 0.0818161585952184, 
-0.172886230584939, 0.174982644851064, 0.0868786992159535, -0.198450519980824, 0.168581078329968, -0.361514336004068, 0.238668430084722, 0.165494019791904, 0.110437707249228, -0.169592003035203}, + {-0.313151735678025, 0.10757884850664, -0.49249098807229, 0.0993472335619114, -0.148695715250836, 0.0573801136941699, -0.190040373500722, 0.254848437434773, 0.134147888304352, -0.352719341442756, 0.0839609323513986, -0.207904182300122, 0.253940523323376, -0.109832138553288, 0.0980084518687944, 0.209026594443723, 0.406236051871548, -0.0521120230935943, 0.0554108014592302, 0.134681046631955}, + {-0.102905214421384, 0.235803606800009, 0.213414976431981, -0.253606415825635, 0.00945656859370683, 0.259551282655855, 0.159527348902192, 0.083218761193016, -0.286815935191867, 0.0135069477264877, 0.336758103107357, -0.271707359524149, -0.0400009875851839, 0.0871186292716414, -0.171506310409388, -0.0954276577211755, 0.393467571460712, 0.111732846649458, -0.239886066474217, -0.426474828195231}, + {-0.0130795552324104, 0.0758967690968058, -0.165099404017689, -0.46035152559912, 0.409888158016031, -0.0235053940299396, 0.0699393201709723, -0.161320910316996, 0.226111732196825, -0.177811841258496, -0.219073917645916, -0.00703219376737286, 0.162831878334912, 0.271670554900684, 0.451033612762052, 0.0820942662443393, -0.0904983490498446, -0.0587000279313978, -0.0938852980928252, -0.306078621571843}, + {0.345092040577428, -0.257721588971295, -0.301689123771848, -0.0875212184538126, 0.161012613069275, 0.385104899829821, 0.118355290985046, -0.241723794416731, 0.083201920119646, -0.0809095291508749, -0.0820275390511991, -0.115569770103317, -0.250105681098033, -0.164197583037664, -0.299481453795592, 0.255906951902366, 0.129042051416371, 0.203761730442746, 0.347550071284268, -0.109264854744020}, + {0.056345924962239, 0.072536751679082, 0.303127492633681, -0.368877185781648, -0.343024497082421, 0.206879529669083, -0.413012709639426, 0.078538816203612, 0.103382383425097, 0.288319996147499, 
-0.392663258459423, 0.0319588502083897, 0.220316797792669, -0.0563686494606947, -0.0869286063283735, 0.323677017794391, 0.0984875197088935, -0.0303289828821742, 0.0450197853450979, -0.0261771221270139}, + {-0.253701638374729, -0.148922815783583, 0.111794052194159, 0.157313977830326, -0.269846001260543, -0.222989872703583, 0.115441028189268, -0.350456582262355, -0.0409581422905941, 0.174078744248002, -0.130673397086811, -0.123963802708056, -0.351609207081548, 0.281548012920868, 0.340382662112428, 0.180262131025562, 0.3895263830793, 0.0121546812430960, 0.214830943227063, -0.0617782909660214}, + {-0.025854479416026, 0.480654788977767, -0.138024550829229, -0.130191670810919, 0.107816875829919, -0.111243997319276, -0.0679814460571245, -0.183167991080677, -0.363355166018786, -0.183934891092050, -0.216097125080962, 0.520240628803255, -0.179616013606479, 0.0664131536100941, -0.178350708111064, 0.0352047611606709, 0.223857228692892, 0.128363679623513, -0.000403433628490731, 0.224972110977704}, + {0.159207394033448, -0.0371517305736114, -0.294302634912281, -0.0866954375908417, -0.259998567870054, 0.284966673982689, 0.205356416771391, -0.257613708650298, -0.264820519037270, 0.293359248624603, 0.0997476397434102, 0.151390539497369, 0.165571346773648, -0.347569523551258, 0.43792310820533, -0.0723248163210163, 0.0379214984816955, -0.0542758730251438, -0.258020301801603, 0.128680501102363}, + {0.316853842351797, -0.153950010941153, -0.13387065213508, -0.0702971390607613, -0.202558481846057, -0.172941438694837, -0.068882524588574, 0.524738203063889, -0.271670479920716, -0.112864756695310, -0.146831636946145, -0.0352336188578041, -0.211108490884767, 0.097857111349555, 0.276459740956662, 0.0231297536754823, -0.0773173324868396, 0.487208384389438, -0.0734191389266824, -0.113198765573319}, + {-0.274285525741087, 0.227334266052039, -0.0973746625709059, -0.00965256583655389, -0.402438444750043, 0.198586229519026, 0.0958135064575833, -0.108934376958686, 0.253641732094319, 
-0.0551918478254021, 0.0243640218331436, 0.181936272247179, 0.090952738347629, 0.0603352483029044, -0.0043821671755761, -0.347720824658591, -0.267879988539971, 0.403804652116592, 0.337654323971186, -0.241509293972297}, + {-0.0197089518344238, 0.139681034626696, 0.251980475788267, 0.341846624362846, -0.075141195125153, 0.2184951591319, 0.268870823491343, 0.150392399018138, 0.134592404015057, -0.337050200539163, -0.313109373497998, 0.201993318439135, -0.217140733851970, -0.337622749083808, 0.135253284365068, 0.181729249828045, -0.00627813335422765, -0.197218833324039, -0.194060005031698, -0.303055888528004} + }, + /*eigenval*/ + { + 20.29131, 0.5045685, 0.2769945, 0.1551147, 0.03235484, -0.04127639, -0.3516426, -0.469973, -0.5835191, -0.6913107, -0.7207972, -0.7907875, -0.9524307, -1.095310, -1.402153, -1.424179, -1.936704, -2.037965, -3.273561, -5.488734 + }, + /*eigentot and codeFreq left out, these are initialized elsewhere*/ + }; + +/* The JTT92 matrix, D. T. Jones, W. R. Taylor, & J. M. Thorton, CABIOS 8:275 (1992) + Derived from the PhyML source code (models.c) by filling in the other side of the symmetric matrix, + scaling the entries by the stationary rate (to give the rate of a->b not b|a), to set the diagonals + so the rows sum to 0, to rescale the matrix so that the implied rate of evolution is 1. + The resulting matrix is the transpose (I think). +*/ +#if 0 +{ + int i,j; + for (i=0; i<20; i++) for (j=0; jThis App reconstructs a phylogenetic tree from a Multiple Sequence Alignment (MSA) of either nucleotide or protein sequences using FastTree2. FastTree2 can be used to determine evolutionary relationships among aligned sequences. FastTree2 will calculate the distances between proteins in the alignment and build an approximately maximum-likelihood tree. The tree is displayed using ETE3 (v3.0.0b35).

+

This App reconstructs a phylogenetic tree from a Multiple Sequence Alignment (MSA) of either nucleotide or protein sequences using FastTree2. FastTree2 can be used to determine evolutionary relationships among aligned sequences. FastTree2 will calculate the distances between proteins in the alignment and build an approximately maximum-likelihood tree. The tree is displayed using ETE3 (v3.1.2).

We recommend that users review the Build Gene Tree Tutorial to understand the upstream processes required to use this App.

FastTree2 takes a precomputed MSA and, following an evolutionary model for the distance between aligned positions (e.g. the Jones-Taylor-Thornton JTT model), determines the distances between sequences and infers an approximately Maximum Likelihood tree for those distances. FastTree2 is much faster than many methods of comparable quality. The output is a newick formatted tree, which KBase displays using the ETE3 toolkit. A KBase Tree object is generated and stored in the Narrative. The newick file and tree images are available for download. Nucleotide or Protein sequence MSAs may be used, and the method is agnostic to whether it is a GeneTree or a SpeciesTree (but the tree type must be indicated in order to set the type of the output Tree object).

+


+

Tool Source:

+

FastTree v2.1.11 is installed from http://www.microbesonline.org/fasttree/.

+ +


Configuration:

Tree Description: This is used in the output figure and carried in the Tree object.

Input MSA: The MSA from which to generate the tree. You must pre-concatenate MSAs if you wish to make a SpeciesTree from concatenated phylogenetic marker MSAs.

@@ -130,11 +135,31 @@ description : |

Output Tree Image: The Tree is rendered using the ETE3 Toolkit.

Downloadable files: The Newick formatted output tree, as well as rendered PNG and PDF formats, are available for download.

-

FastTree2.1.9 source

-

Team members who implemented App in KBase: Dylan Chivian. For questions, please contact us.

+


+

Team members who implemented App in KBase: Dylan Chivian. For questions, please contact us.

+ + +

Please cite: +

    +
  • Price MN, Dehal PS, Arkin AP. FastTree 2 – Approximately Maximum-Likelihood Trees for Large Alignments. PLOS ONE. 2010;5: e9490. doi:10.1371/journal.pone.0009490 +
+

+ publications : + - + pmid : 20224823 + display-text: | + Price MN, Dehal PS, Arkin AP. FastTree 2 – Approximately Maximum-Likelihood Trees for Large Alignments. PLOS ONE. 2010;5: e9490. doi:10.1371/journal.pone.0009490 + link: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0009490 + + - + pmid : 19377059 + display-text: | + Price MN, Dehal PS, Arkin AP. FastTree: computing large minimum evolution trees with profiles instead of a distance matrix. Mol Biol Evol. 2009;26: 1641–1650. doi:10.1093/molbev/msp077 + link: https://www.ncbi.nlm.nih.gov/pubmed/19377059 + - pmid : 26921390 display-text: | @@ -143,17 +168,11 @@ publications : - display-text: | - ETE3 source: - link: http://etetoolkit.org + FastTree-2 source: + link: http://www.microbesonline.org/fasttree/ - - pmid : 19377059 display-text: | - Price MN, Dehal PS, Arkin AP. FastTree: computing large minimum evolution trees with profiles instead of a distance matrix. Mol Biol Evol. 2009;26: 1641–1650. doi:10.1093/molbev/msp077 - link: https://www.ncbi.nlm.nih.gov/pubmed/19377059 + ETE3 source: + link: http://etetoolkit.org - - - pmid : 20224823 - display-text: | - Price MN, Dehal PS, Arkin AP. FastTree 2 – Approximately Maximum-Likelihood Trees for Large Alignments. PLOS ONE. 2010;5: e9490. doi:10.1371/journal.pone.0009490 - link: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0009490 diff --git a/ui/narrative/methods/run_FastTree/spec.json b/ui/narrative/methods/run_FastTree/spec.json index 4b245e3..f9aa92b 100644 --- a/ui/narrative/methods/run_FastTree/spec.json +++ b/ui/narrative/methods/run_FastTree/spec.json @@ -1,10 +1,10 @@ { - "ver": "1.0.3", + "ver": "1.1.0", "authors": [ "dylan", "psdehal" ], - "contact": "http://kbase.us/contact-us/", + "contact": "http://www.kbase.us/support/", "visible": true, "categories": ["active","comparative_genomics"], "widgets": {