Split script into 2 commands: 1 to extract, 1 to filter tools #18

Merged — 21 commits, Nov 1, 2023
Changes from 16 commits
11 changes: 11 additions & 0 deletions .isort.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[settings]
combine_as_imports=true
force_alphabetical_sort_within_sections=true
# Override force_grid_wrap value from profile=black, but black is still happy
force_grid_wrap=2
# Same line length as for black
line_length=120
no_lines_before=LOCALFOLDER
profile=black
reverse_relative=true
skip_gitignore=true
55 changes: 28 additions & 27 deletions README.md
@@ -38,24 +38,22 @@ Galaxy Tool extractor
$ python3 -m pip install -r requirements.txt
```

# Extract tools for categories in the ToolShed
## Extract all tools

1. Get an API key ([personal token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens)) for GitHub
2. (Optional) Create a text file with ToolShed categories for which tools need to be extracted: 1 ToolShed category per row ([example for microbial data analysis](data/microgalaxy/categories))
3. (Optional) Create a text file with list of tools to exclude: 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_exclude))
4. (Optional) Create a text file with list of tools to really keep (already reviewed): 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_keep))
4. Run the tool extractor script
2. Export the GitHub API key as an environment variable:

```
$ python bin/extract_galaxy_tools.py \
--api <GitHub API key> \
--output <Path to output file> \
[--categories <Path to ToolShed category file>] \
[--exclude <Path to excluded tool file category file>]\
[--keep <Path to to-keep tool file category file>]
$ export GITHUB_API_KEY=<your GitHub API key>
```

3. Run the script

```
$ python bin/extract_all_tools.sh
```

The script will generate a CSV file with each tool found in the list of GitHub repository and several information for these tools:
The script will generate a TSV file with each tool found in the list of GitHub repositories and metadata for these tools:

1. Galaxy wrapper id
2. Description
@@ -73,27 +71,30 @@ The script will generate a CSV file with each tool found in the list of GitHub r
14. Galaxy wrapper version
15. Conda id
16. Conda version
17. Reviewed
18. To keep

## For microbial related tools
## Filter tools based on their categories in the ToolShed

For microGalaxy, a Bash script in `bin` can used by:

1. Exporting the GitHub API key as an environment variable:
1. Run the extraction as explained before
2. (Optional) Create a text file with ToolShed categories for which tools need to be extracted: 1 ToolShed category per row ([example for microbial data analysis](data/microgalaxy/categories))
3. (Optional) Create a text file with list of tools to exclude: 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_exclude))
4. (Optional) Create a text file with list of tools to really keep (already reviewed): 1 tool id per row ([example for microbial data analysis](data/microgalaxy/tools_to_keep))
5. Run the tool filtering script

```
$ export GITHUB_API_KEY=<your GitHub API key>
$ python bin/extract_galaxy_tools.py \
--tools <Path to CSV file with all extracted tools> \
--filtered_tools <Path to output CSV file with filtered tools> \
[--categories <Path to ToolShed category file>] \
[--excluded <Path to excluded tool file category file>]\
[--keep <Path to to-keep tool file category file>]
```

2. Running the script

```
$ bash bin/extract_microgalaxy_tools.sh
```
### Filter tools for microbial data analysis

It will:
1. Update the files in the `data/microgalaxy` folder
2. Export the tools into `microgalaxy_tools.csv`
For microGalaxy, a Bash script in `bin` can be used by running:

```
$ bash bin/extract_microgalaxy_tools.sh
```

It will take the files in the `data/microgalaxy` folder and export the filtered tools into `results/microgalaxy/tools.tsv`
6 changes: 6 additions & 0 deletions bin/extract_all_tools.sh
@@ -0,0 +1,6 @@
#!/usr/bin/env bash

python bin/extract_galaxy_tools.py \
extractools \
--api $GITHUB_API_KEY \
--all_tools 'results/all_tools.tsv'
181 changes: 110 additions & 71 deletions bin/extract_galaxy_tools.py
@@ -5,7 +5,12 @@
import time
import xml.etree.ElementTree as et
from pathlib import Path
from typing import List
from typing import (
Any,
Dict,
List,
Optional,
)

import pandas as pd
import requests
@@ -19,7 +24,7 @@
BIOTOOLS_API_URL = "https://130.226.25.21"


def read_file(filepath):
def read_file(filepath) -> List[str]:
"""
Read an optional file with 1 element per line

@@ -134,29 +139,22 @@ def check_categories(ts_categories, ts_cat):
:param ts_categories: tool ToolShed categories
:param ts_cat: list of ToolShed categories to keep in the extraction
"""
if ts_categories is not None and len(ts_cat) > 0:
to_keep = False
for cat in ts_categories:
if cat in ts_cat:
to_keep = True
return to_keep
return True
if not ts_cat:
return True
if not ts_categories:
return False
ts_cats = ts_categories.split(", ")
return bool(set(ts_cat) & set(ts_cats))


def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools):
def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, Any]]:
"""
Get tool information
- Check the `.shed.yaml` file
- Extract metadata from the `.shed.yaml`
- Filter for specific ToolShed categories
- Extract the requirements in the macros or xml file to get tool version supported in Galaxy
- Extract bio.tools information if available in the macros or xml
Get tool metadata from the .shed.yaml, requirements in the macros or xml
file, bio.tools information if available in the macros or xml, EDAM
annotations using bio.tools API, recent conda version using conda API

:param tool: GitHub ContentFile object
:param repo: GitHub Repository object
:param ts_cat: list of ToolShed categories to keep in the extraction
:param excluded_tools: list of tools to skip
:param keep_tools: list of tools to keep
"""
if tool.type != "dir":
return None
@@ -178,13 +176,7 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools):
"Galaxy wrapper version": None,
"Conda id": None,
"Conda version": None,
"Reviewed": tool.name in keep_tools or tool.name in excluded_tools,
"To keep": "",
}
if tool.name in keep_tools:
metadata["To keep"] = True
elif tool.name in excluded_tools:
metadata["To keep"] = False
# extract .shed.yml information and check macros.xml
try:
shed = repo.get_contents(f"{tool.path}/.shed.yml")
@@ -206,11 +198,10 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools):
metadata["ToolShed categories"] = get_shed_attribute("categories", yaml_content, [])
if metadata["ToolShed categories"] is None:
metadata["ToolShed categories"] = []
# filter ToolShed categories and leave function if not in expected categories
if not check_categories(metadata["ToolShed categories"], ts_cat):
return None
# find and parse macro file
for file in repo.get_contents(tool.path):
file_list = repo.get_contents(tool.path)
assert isinstance(file_list, list)
for file in file_list:
if "macro" in file.name and file.name.endswith("xml"):
file_content = get_string_content(file)
root = et.fromstring(file_content)
@@ -223,9 +214,8 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools):
biotools = get_biotools(child)
if biotools is not None:
metadata["bio.tool id"] = biotools

# parse XML file and get meta data from there, also tool ids
for file in repo.get_contents(tool.path):
for file in file_list:
if file.name.endswith("xml") and "macro" not in file.name:
file_content = get_string_content(file)
try:
@@ -260,7 +250,6 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools):
# tool ids
if "id" in root.attrib:
metadata["Galaxy tool ids"].append(root.attrib["id"])

# get latest conda version and compare to the wrapper version
if metadata["Conda id"] is not None:
r = requests.get(f'https://api.anaconda.org/package/bioconda/{metadata["Conda id"]}')
@@ -272,7 +261,7 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools):
metadata["Status"] = "Up-to-date"
# get bio.tool information
if metadata["bio.tool id"] is not None:
r = requests.get(f'{BIOTOOLS_API_URL}/api/tool/{metadata["bio.tool id"]}/?format=json')
r = requests.get(f'{BIOTOOLS_API_URL}/api/tool/{metadata["bio.tool id"]}/?format=json', verify=False)
if r.status_code == requests.codes.ok:
biotool_info = r.json()
if "function" in biotool_info:
@@ -290,14 +279,11 @@ def get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools):
return metadata


def parse_tools(repo: Repository, ts_cat, excluded_tools, keep_tools):
def parse_tools(repo: Repository) -> List[Dict[str, Any]]:
"""
Parse tools in a GitHub repository to expact
Parse tools in a GitHub repository, extract them and their metadata

:param repo: GitHub Repository object
:param ts_cat: list of ToolShed categories to keep in the extraction
:param excluded_tools: list of tools to skip
:param keep_tools: list of tools to keep
"""
# get tool folders
tool_folders: List[List[ContentFile]] = []
@@ -336,59 +322,112 @@ def parse_tools(repo: Repository, ts_cat, excluded_tools, keep_tools):
file_list = repo.get_contents(tool.path)
assert isinstance(file_list, list)
for content in file_list:
metadata = get_tool_metadata(content, repo, ts_cat, excluded_tools, keep_tools)
metadata = get_tool_metadata(content, repo)
if metadata is not None:
tools.append(metadata)
else:
metadata = get_tool_metadata(tool, repo, ts_cat, excluded_tools, keep_tools)
metadata = get_tool_metadata(tool, repo)
if metadata is not None:
tools.append(metadata)
return tools


def export_tools(tools: list, output_fp: str) -> None:
def format_list_column(col):
"""
Format a column that could be a list before exporting
"""
return col.apply(lambda x: ", ".join([str(i) for i in x]))


def export_tools(tools: List[Dict], output_fp: str, format_list_col=False) -> None:
"""
Export tool metadata to tsv output file

:param tools: list of dictionaries with tool metadata
:param output_fp: path to output file
:param format_list_col: boolean indicating if list columns should be formatted
"""
df = pd.DataFrame(tools)
df["ToolShed categories"] = df["ToolShed categories"].apply(lambda x: ", ".join([str(i) for i in x]))
df["EDAM operation"] = df["EDAM operation"].apply(lambda x: ", ".join([str(i) for i in x]))
df["EDAM topic"] = df["EDAM topic"].apply(lambda x: ", ".join([str(i) for i in x]))
df["Galaxy tool ids"] = df["Galaxy tool ids"].apply(lambda x: ", ".join([str(i) for i in x]))
if format_list_col:
df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
df["EDAM operation"] = format_list_column(df["EDAM operation"])
df["EDAM topic"] = format_list_column(df["EDAM topic"])
df["Galaxy tool ids"] = format_list_column(df["Galaxy tool ids"])
df.to_csv(output_fp, sep="\t", index=False)
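The list-flattening that `export_tools` applies when `format_list_col=True` can be sketched in isolation (`format_list_column` reproduced from the diff; the tool records are toy data):

```python
import pandas as pd


def format_list_column(col: pd.Series) -> pd.Series:
    """Join each list-valued cell into a comma-separated string."""
    return col.apply(lambda x: ", ".join(str(i) for i in x))


tools = [
    {"Galaxy wrapper id": "abricate", "ToolShed categories": ["Sequence Analysis"]},
    {"Galaxy wrapper id": "bakta", "ToolShed categories": ["Genome annotation", "Sequence Analysis"]},
]
df = pd.DataFrame(tools)
df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
print(df["ToolShed categories"].tolist())
# ['Sequence Analysis', 'Genome annotation, Sequence Analysis']
```

Flattening only happens for the raw extraction (`extractools`); by the time `filtertools` re-exports, the TSV columns already hold flattened strings, so the flag is left off.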


def filter_tools(tools: List[Dict], ts_cat: List[str], excluded_tools: List[str], keep_tools: List[str]) -> List[Dict]:
"""
Filter tools for specific ToolShed categories and add information if to keep or to exclude

:param tools: list of dictionaries with tool metadata
:param ts_cat: list of ToolShed categories to keep in the extraction
:param excluded_tools: list of tools to skip
:param keep_tools: list of tools to keep
"""
filtered_tools = []
for tool in tools:
# filter ToolShed categories and leave function if not in expected categories
if check_categories(tool["ToolShed categories"], ts_cat):
name = tool["Galaxy wrapper id"]
tool["Reviewed"] = name in keep_tools or name in excluded_tools
tool["To keep"] = None
if name in keep_tools:
tool["To keep"] = True
elif name in excluded_tools:
tool["To keep"] = False
filtered_tools.append(tool)
return filtered_tools
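A condensed round-trip sketch of the filtering step (both helpers reproduced from the diff; categories arrive as comma-separated strings, as in the TSV read back by `filtertools`, and the tool names are toy data):

```python
from typing import Any, Dict, List


def check_categories(ts_categories: str, ts_cat: List[str]) -> bool:
    """True if the tool's categories intersect the requested ones."""
    if not ts_cat:
        return True
    if not ts_categories:
        return False
    return bool(set(ts_cat) & set(ts_categories.split(", ")))


def filter_tools(
    tools: List[Dict[str, Any]], ts_cat: List[str], excluded_tools: List[str], keep_tools: List[str]
) -> List[Dict[str, Any]]:
    """Keep tools matching the categories and annotate their review status."""
    filtered = []
    for tool in tools:
        if check_categories(tool["ToolShed categories"], ts_cat):
            name = tool["Galaxy wrapper id"]
            tool["Reviewed"] = name in keep_tools or name in excluded_tools
            tool["To keep"] = True if name in keep_tools else (False if name in excluded_tools else None)
            filtered.append(tool)
    return filtered


tools = [
    {"Galaxy wrapper id": "abricate", "ToolShed categories": "Sequence Analysis"},
    {"Galaxy wrapper id": "jbrowse", "ToolShed categories": "Visualization"},
]
out = filter_tools(tools, ["Sequence Analysis"], [], ["abricate"])
print([t["Galaxy wrapper id"] for t in out])  # ['abricate']
```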


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Extract a GitHub project to CSV")
parser.add_argument("--api", "-a", required=True, help="GitHub access token")
parser.add_argument("--output", "-o", required=True, help="Output filepath")
parser.add_argument(
parser = argparse.ArgumentParser(
description="Extract Galaxy tools from GitHub repositories together with biotools and conda metadata"
)
subparser = parser.add_subparsers(dest="command")
# Extract tools
extractools = subparser.add_parser("extractools", help="Extract tools")
extractools.add_argument("--api", "-a", required=True, help="GitHub access token")
extractools.add_argument("--all_tools", "-o", required=True, help="Filepath to TSV with all extracted tools")
# Filter tools
filtertools = subparser.add_parser("filtertools", help="Filter tools")
filtertools.add_argument(
"--tools",
"-t",
required=True,
help="Filepath to TSV with all extracted tools, generated by extractools command",
)
filtertools.add_argument("--filtered_tools", "-f", required=True, help="Filepath to TSV with filtered tools")
filtertools.add_argument(
"--categories", "-c", help="Path to a file with ToolShed category to keep in the extraction (one per line)"
)
parser.add_argument("--exclude", "-e", help="Path to a file with ToolShed ids of tools to exclude (one per line)")
parser.add_argument("--keep", "-ek", help="Path to a file with ToolShed ids of tools to keep (one per line)")
filtertools.add_argument(
"--exclude", "-e", help="Path to a file with ToolShed ids of tools to exclude (one per line)"
)
filtertools.add_argument("--keep", "-k", help="Path to a file with ToolShed ids of tools to keep (one per line)")
args = parser.parse_args()

# connect to GitHub
g = Github(args.api)
# get list of GitHub repositories to parse
repo_list = get_tool_github_repositories(g)

# get categories and tools to exclude
categories = read_file(args.categories)
excl_tools = read_file(args.exclude)
keep_tools = read_file(args.keep)

# parse tools in GitHub repositories to extract metada, filter by TS categories and export to output file
tools = []
for r in repo_list:
print(r)
if "github" not in r:
continue
repo = get_github_repo(r, g)
tools += parse_tools(repo, categories, excl_tools, keep_tools)
export_tools(tools, args.output)
print()
if args.command == "extractools":
# connect to GitHub
g = Github(args.api)
# get list of GitHub repositories to parse
repo_list = get_tool_github_repositories(g)
# parse tools in GitHub repositories to extract metadata and export to output file
tools: List[Dict] = []
for r in repo_list:
print(r)
if "github" not in r:
continue
repo = get_github_repo(r, g)
tools += parse_tools(repo)
export_tools(tools, args.all_tools, format_list_col=True)
print()
elif args.command == "filtertools":
tools = pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records")
# get categories and tools to exclude
categories = read_file(args.categories)
excl_tools = read_file(args.exclude)
keep_tools = read_file(args.keep)
# filter tool lists
filtered_tools = filter_tools(tools, categories, excl_tools, keep_tools)
export_tools(filtered_tools, args.filtered_tools)
bin/extract_microgalaxy_tools.sh
@@ -10,9 +10,12 @@ curl \
"https://docs.google.com/spreadsheets/d/1Nq_g-CPc8t_eC4M1NAS9XFJDflA7yE3b9hfSg3zu9L4/export?format=tsv&gid=672552331" \
-o "data/microgalaxy/tools_to_exclude"

mkdir -p 'results/microgalaxy'

python bin/extract_galaxy_tools.py \
--api $GITHUB_API_KEY \
--output microgalaxy_tools.csv \
filtertools \
--tools 'results/all_tools.tsv' \
--filtered_tools 'results/microgalaxy/tools.tsv' \
--categories "data/microgalaxy/categories" \
--exclude "data/microgalaxy/tools_to_exclude" \
--keep "data/microgalaxy/tools_to_keep"