Merge pull request #2126 from broadinstitute/development
Release 1.79.0
jlchang authored Sep 11, 2024
2 parents 6fc9124 + b7dd9bb commit 699d208
Showing 24 changed files with 483 additions and 106 deletions.
2 changes: 1 addition & 1 deletion app/javascript/components/upload/UploadWizard.jsx
@@ -280,7 +280,7 @@ export function RawUploadWizard({ studyAccession, name }) {
let fileToSave = file
let studyFileId = file._id

if (isAnnDataExperience) {
if (isAnnDataExperience && fileToSave?.file_type === 'AnnData') {
fileToSave = saveAnnDataFileHelper(file, fileToSave)
studyFileId = fileToSave._id
}
6 changes: 5 additions & 1 deletion app/javascript/lib/scp-api.jsx
@@ -438,6 +438,10 @@ export async function deleteAnnDataFragment(studyAccession, fileId, fragId, mock
return response
}

/** Get OAuth token for HTTP requests that require "Authorization: Bearer" header */
export function getOAuthToken() {
return window.SCP.readOnlyToken
}

/**
* Fetches a given resource from a GCP bucket -- this handles adding the
@@ -450,7 +454,7 @@ export async function fetchBucketFile(bucketName, filePath, maxBytes=null, mock=
const init = {
method: 'GET',
headers: {
Authorization: `Bearer ${window.SCP.readOnlyToken}`
Authorization: `Bearer ${getOAuthToken()}`
}
}

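A short usage sketch of the extracted helper. Everything here besides `getOAuthToken` itself is hypothetical: the bucket name, object path, byte range, and the direct GCS JSON API URL are illustrative stand-ins, not code from this commit.

import { getOAuthToken } from '~/lib/scp-api'

/** Hypothetical: read the first kilobyte of a bucket object with the portal's read-only token */
async function fetchObjectHead(bucketName, filePath) {
  const url =
    `https://storage.googleapis.com/download/storage/v1/b/${bucketName}` +
    `/o/${encodeURIComponent(filePath)}?alt=media`
  const response = await fetch(url, {
    method: 'GET',
    headers: {
      Authorization: `Bearer ${getOAuthToken()}`, // reads window.SCP.readOnlyToken
      Range: 'bytes=0-1023'
    }
  })
  return response
}
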
6 changes: 4 additions & 2 deletions app/javascript/lib/validation/shared-validation.js
@@ -3,9 +3,10 @@
*/

// Ultimately sourced from: scp-ingest-pipeline/schemas
import * as data from 'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json';
import * as _schema from 'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json';

export const REQUIRED_CONVENTION_COLUMNS = data.required.filter(c => c !== 'CellID')
export const metadataSchema = _schema
export const REQUIRED_CONVENTION_COLUMNS = metadataSchema.required.filter(c => c !== 'CellID')

/**
* ParseException can be thrown when we encounter an error that prevents us from parsing the file further
@@ -245,6 +246,7 @@ export function validateRequiredMetadataColumns(parsedHeaders, isAnnData=false)
const msg = `File is missing required ${columns}: ${missingCols.join(', ')}`
issues.push(['error', 'format:cap:metadata-missing-column', msg])
}

return issues
}

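For orientation, a small sketch of what the renamed export provides. The `disease` property and its `ontology` field are assumptions based on how `validate-anndata.js` reads the schema later in this diff; the import path mirrors the style used elsewhere in the codebase.

import { metadataSchema, REQUIRED_CONVENTION_COLUMNS } from 'lib/validation/shared-validation'

// 'CellID' is filtered out of the schema's required columns
console.assert(!REQUIRED_CONVENTION_COLUMNS.includes('CellID'))

// Ontology-backed properties expose their accepted ontologies as
// comma-separated OLS URLs (see getAcceptedOntologies in validate-anndata.js)
const diseaseOntologyUrls = metadataSchema.properties['disease'].ontology
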
143 changes: 131 additions & 12 deletions app/javascript/lib/validation/validate-anndata.js
@@ -1,6 +1,42 @@
import {openH5File} from '@single-cell-portal/hdf5-indexed-reader'
import {openH5File} from 'hdf5-indexed-reader'

import { validateUnique, validateRequiredMetadataColumns } from './shared-validation'
import {
validateUnique, validateRequiredMetadataColumns,
metadataSchema, REQUIRED_CONVENTION_COLUMNS
} from './shared-validation'
import { getOAuthToken } from '~/lib/scp-api'

/** Get ontology ID values for key in AnnData file */
async function getOntologyIds(key, hdf5File) {
let ontologyIds = []

const obs = await hdf5File.get('obs')
const obsValues = await Promise.all(obs.values)

// Old versions of the AnnData spec used __categories as an obs.
// However, in new versions (since before 2023-01-23) of AnnData spec,
// categorical arrays are encoded as self-contained groups containing their
// own `categories` and `codes`.
// See e.g. https://github.com/scverse/anndata/issues/879
const internalCategories = obsValues.find(o => o.name.endsWith('__categories'))

let resolvedCategories = obsValues
if (internalCategories) {
resolvedCategories = await Promise.all(internalCategories.values)
}
const group = resolvedCategories.find(o => o.name.endsWith(key))
if (group) {
let categories
if (internalCategories) {
ontologyIds = await group.value
} else {
categories = await group.values[0]
ontologyIds = await categories.value
}
}

return ontologyIds
}
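
// A sketch of the two encodings described above, with hypothetical HDF5 paths
// (illustration only; this function resolves them via the obs group objects):
//   old AnnData spec:  obs/__categories/disease -> ['MONDO_0005109', ...]
//   new AnnData spec:  obs/disease/categories   -> ['MONDO_0005109', ...]
//                      obs/disease/codes        -> [2, 0, 1, ...]  (codes index into categories)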

/** Get annotation headers for a key (e.g. obs) from an HDF5 file */
async function getAnnotationHeaders(key, hdf5File) {
Expand All @@ -20,47 +56,130 @@ function isUrl(fileOrUrl) {
return typeof fileOrUrl === 'string' && fileOrUrl.startsWith('http')
}

/** Get all headers from AnnData file */
export async function getAnnDataHeaders(fileOrUrl) {
/** Load local or remote AnnData file, return stream-parseable HDF5 object */
export async function getHdf5File(fileOrUrl, remoteProps) {
// Jest test uses Node, where file API differs
// TODO (SCP-5770): See if we can smoothen this and do away with `isTest`
const isTest = isUrl(fileOrUrl)

const isRemoteFileObject = !isUrl(fileOrUrl) && fileOrUrl.type === 'application/octet-stream'

// TODO (SCP-5770): Parameterize this, also support URL to remote file
const idType = isTest ? 'url' : 'file'
const idType = isTest || isRemoteFileObject ? 'url' : 'file'

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (isRemoteFileObject) {
return null
fileOrUrl = remoteProps.url
}

const openParams = {}
openParams[idType] = fileOrUrl

if (isRemoteFileObject) {
const oauthToken = getOAuthToken()
openParams.oauthToken = oauthToken
}

const hdf5File = await openH5File(openParams)
return hdf5File
}

/** Get all headers from AnnData file */
export async function getAnnDataHeaders(hdf5File) {
const headers = await getAnnotationHeaders('obs', hdf5File)

// const obsmHeaders = await getAnnotationHeaders('obsm', hdf5File)
// const xHeaders = await getAnnotationHeaders('X', hdf5File)
return headers
}

/**
* Get list of ontology names accepted for key from metadata schema
*
* E.g. "disease" -> ["MONDO", "PATO"]
*/
function getAcceptedOntologies(key, metadataSchema) {
// E.g. "ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/mondo,https://www.ebi.ac.uk/ols/ontologies/pato"
const olsUrls = metadataSchema.properties[key].ontology

const acceptedOntologies =
olsUrls?.split(',').map(url => url.split('/').slice(-1)[0].toUpperCase())

if (acceptedOntologies.includes('NCBITAXON')) {
acceptedOntologies.push('NCBITaxon')
}

return acceptedOntologies
}

/**
* Check format of ontology IDs for key, return updated issues array
*
* TODO (SCP-5791): Move this rule to shared-validation.js, apply to classic as well
*/
export function checkOntologyIdFormat(key, ontologyIds) {
const issues = []

const acceptedOntologies = getAcceptedOntologies(key, metadataSchema)
if (!acceptedOntologies) {return}

ontologyIds.forEach(ontologyId => {
const ontologyShortName = ontologyId.split(/[_:]/)[0]
if (!acceptedOntologies.includes(ontologyShortName)) {
const accepted = acceptedOntologies.join(', ')
const msg =
`Ontology ID "${ontologyId}" ` +
`is not among accepted ontologies (${accepted}) ` +
`for key "${key}"`

// Match "ontology:label-lookup-error" error type used in Ingest Pipeline, per
// https://github.com/broadinstitute/scp-ingest-pipeline/blob/858bb96ea7669f799d8f42d30b0b3131e2091710/ingest/validation/validate_metadata.py
issues.push(['error', 'ontology:label-lookup-error', msg])
}
})

return issues
}

/** Validate ontology IDs for required metadata columns in AnnData file */
async function validateOntologyIdFormat(hdf5File) {
let issues = []

// Validate IDs for species, organ, disease, and library preparation protocol
for (let i = 0; i < REQUIRED_CONVENTION_COLUMNS.length; i++) {
const column = REQUIRED_CONVENTION_COLUMNS[i]
if (!column.endsWith('__ontology_label')) {continue}
const key = column.split('__ontology_label')[0]
const ontologyIds = await getOntologyIds(key, hdf5File)

issues = issues.concat(
checkOntologyIdFormat(key, ontologyIds)
)
}

return issues
}

/** Parse AnnData file, and return an array of issues, along with file parsing info */
export async function parseAnnDataFile(file) {
export async function parseAnnDataFile(fileOrUrl, remoteProps) {
let issues = []

const headers = await getAnnDataHeaders(file)
const hdf5File = await getHdf5File(fileOrUrl, remoteProps)

const headers = await getAnnDataHeaders(hdf5File)

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (!headers) {
return { issues }
}

const requiredMetadataIssues = validateRequiredMetadataColumns([headers], true)
let ontologyIdFormatIssues = []
if (requiredMetadataIssues.length === 0) {
ontologyIdFormatIssues = await validateOntologyIdFormat(hdf5File)
}

issues = issues.concat(
validateUnique(headers),
validateRequiredMetadataColumns([headers], true)
requiredMetadataIssues,
ontologyIdFormatIssues
)

return { issues }
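A hedged usage sketch of the newly exported check. The IDs below are invented inputs, and the accepted-ontology list for "disease" (MONDO, PATO) follows the example given in the code comments above.

import { checkOntologyIdFormat } from 'lib/validation/validate-anndata'

// 'MONDO_0005109' has an accepted prefix for "disease"; 'FOO_0000001' does not,
// so one ['error', 'ontology:label-lookup-error', ...] issue is expected
const issues = checkOntologyIdFormat('disease', ['MONDO_0005109', 'FOO_0000001'])
console.log(issues.length) // 1
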
8 changes: 6 additions & 2 deletions app/javascript/lib/validation/validate-file-content.js
@@ -327,7 +327,10 @@ export async function validateGzipEncoding(file, fileType) {
* @returns {Object} result.issues Array of [category, type, message]
* @returns {Number} result.perfTime How long this function took
*/
async function parseFile(file, fileType, fileOptions={}, sizeProps={}) {
async function parseFile(
file, fileType, fileOptions={},
sizeProps={}, remoteProps={}
) {
const startTime = performance.now()

const fileInfo = {
@@ -371,7 +374,7 @@ async function parseFile(file, fileType, fileOptions={}, sizeProps={}) {
}

if (fileType === 'AnnData') {
const { issues } = await parseAnnDataFile(file)
const { issues } = await parseAnnDataFile(file, remoteProps)
parseResult.issues = parseResult.issues.concat(issues)
} else if (parseFunctions[fileType]) {
let ignoreLastLine = false
@@ -411,6 +414,7 @@
} else {
parseResult.issues.push(['error', 'parse:unhandled', error.message])
}
console.error(error)
}

const perfTime = Math.round(performance.now() - startTime)
8 changes: 7 additions & 1 deletion app/javascript/lib/validation/validate-file.js
@@ -166,8 +166,14 @@ async function validateRemoteFile(

const sizeProps = getSizeProps(contentRange, contentLength, file)

const remoteProps = {
url: response.url
}

// Equivalent block exists in validateFileContent
const parseResults = await ValidateFileContent.parseFile(file, fileType, fileOptions, sizeProps)
const parseResults = await ValidateFileContent.parseFile(
file, fileType, fileOptions, sizeProps, remoteProps
)
fileInfo = parseResults['fileInfo']
issues = parseResults['issues']
perfTime = parseResults['perfTime']
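End to end, a sketch of how `remoteProps` now threads through validation. The comments trace the call chain shown in this diff; the URL value is whatever signed bucket URL the earlier range request resolved to.

// Inside validateRemoteFile, roughly:
const remoteProps = { url: response.url } // e.g. a signed GCS URL for the AnnData file
const { issues } = await ValidateFileContent.parseFile(
  file, 'AnnData', fileOptions, sizeProps, remoteProps
)
// parseFile -> parseAnnDataFile -> getHdf5File, which opens the remote file with
// openH5File({ url: remoteProps.url, oauthToken: getOAuthToken() })
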
1 change: 1 addition & 0 deletions app/lib/differential_expression_service.rb
@@ -151,6 +151,7 @@ def self.run_differential_expression_job(cluster_group, study, user, annotation_
de_params[:barcode_file] = barcode_file.gs_url
elsif raw_matrix.file_type == 'AnnData'
de_params[:matrix_file_type] = 'h5ad'
de_params[:file_size] = raw_matrix.upload_file_size
else
de_params[:matrix_file_type] = 'dense'
end
2 changes: 1 addition & 1 deletion app/lib/file_parse_service.rb
@@ -113,7 +113,7 @@ def self.run_parse_job(study_file, study, user, reparse: false, persist_on_fail:
else
params_object = AnnDataIngestParameters.new(
anndata_file: study_file.gs_url, obsm_keys: study_file.ann_data_file_info.obsm_key_names,
file_size: study_file.upload_file_size
file_size: study_file.upload_file_size, extract_raw_counts: study_file.is_raw_counts_file?
)
end
# TODO extract and parse Raw Exp Data (SCP-4710)
35 changes: 18 additions & 17 deletions app/models/ann_data_ingest_parameters.rb
@@ -3,6 +3,10 @@
class AnnDataIngestParameters
include ActiveModel::Model
include Parameterizable
include ComputeScaling

# RAM scaling coefficient for auto-selecting machine_type
GB_PER_CORE = 1.75

# default values for parameters, also used as control list for attributes hash
# attributes marked as true are passed to the command line as a standalone flag with no value
@@ -13,6 +17,7 @@ class AnnDataIngestParameters
# ingest_anndata: gate primary validation/extraction of AnnData file
# anndata_file: GS URL for AnnData file
# extract: array of values for different file type extractions
# extract_raw_counts: T/F for whether to add raw_counts to extraction
# obsm_keys: data slots containing clustering information
# ingest_cluster: gate ingesting an extracted cluster file
# cluster_file: GS URL for extracted cluster file
@@ -32,7 +37,8 @@
cluster_file: nil,
name: nil,
domain_ranges: nil,
extract: %w[cluster metadata processed_expression raw_counts],
extract: %w[cluster metadata processed_expression],
extract_raw_counts: false,
cell_metadata_file: nil,
ingest_cell_metadata: false,
study_accession: nil,
@@ -46,19 +52,7 @@
}.freeze

# values that are available as methods but not as attributes (and not passed to command line)
NON_ATTRIBUTE_PARAMS = %i[file_size machine_type].freeze

# GCE machine types and file size ranges for handling fragment extraction
# produces a hash with entries like { 'n2-highmem-4' => 0..24.gigabytes }
# adjust (core * n) to n=4 for faster scaling (ie. n2-highmem-4 for 0 to 16G)
NUM_CORES = [4, 8, 16, 32, 48, 64, 80, 96].freeze
RAM_PER_CORE = NUM_CORES.map { |core| (core * 6).gigabytes }.freeze
EXTRACT_MACHINE_TYPES = NUM_CORES.map.with_index do |cores, index|
floor = index == 0 ? 0 : RAM_PER_CORE[index - 1]
limit = index == NUM_CORES.count - 1 ? RAM_PER_CORE[index] * 2 : RAM_PER_CORE[index]
# ranges that use '...' exclude the given end value.
{ "n2d-highmem-#{cores}" => floor...limit }
end.reduce({}, :merge).freeze
NON_ATTRIBUTE_PARAMS = %i[file_size machine_type extract_raw_counts].freeze

attr_accessor(*PARAM_DEFAULTS.keys)

@@ -73,14 +67,21 @@ def initialize(attributes = nil)
# machine_type default is declared here to allow for autoscaling with optional override
# see https://ruby-doc.org/core-3.1.0/Range.html#method-i-3D-3D-3D for range detection doc
if @machine_type.nil?
self.machine_type = EXTRACT_MACHINE_TYPES.detect do |_, mem_range|
mem_range === file_size
end&.first || 'n2d-highmem-4'
self.machine_type = ingest_anndata ? assign_machine_type : default_machine_type
end
append_raw_counts_extract!
end

# get the particular file (either source AnnData or fragment) being processed by this job
def associated_file
anndata_file || cluster_file || cell_metadata_file || matrix_file
end

private

def append_raw_counts_extract!
if @ingest_anndata && @extract_raw_counts && !@extract.include?('raw_counts')
self.extract << 'raw_counts'
end
end
end