Merge pull request #2126 from broadinstitute/development
Release 1.79.0
jlchang authored Sep 11, 2024
2 parents 6fc9124 + b7dd9bb commit 699d208
Showing 24 changed files with 483 additions and 106 deletions.
2 changes: 1 addition & 1 deletion app/javascript/components/upload/UploadWizard.jsx
@@ -280,7 +280,7 @@ export function RawUploadWizard({ studyAccession, name }) {
let fileToSave = file
let studyFileId = file._id

if (isAnnDataExperience) {
if (isAnnDataExperience && fileToSave?.file_type === 'AnnData') {
fileToSave = saveAnnDataFileHelper(file, fileToSave)
studyFileId = fileToSave._id
}
6 changes: 5 additions & 1 deletion app/javascript/lib/scp-api.jsx
@@ -438,6 +438,10 @@ export async function deleteAnnDataFragment(studyAccession, fileId, fragId, mock
return response
}

/** Get OAuth token for HTTP requests that require "Authorization: Bearer" header */
export function getOAuthToken() {
return window.SCP.readOnlyToken
}

/**
* Fetches a given resource from a GCP bucket -- this handles adding the
@@ -450,7 +454,7 @@ export async function fetchBucketFile(bucketName, filePath, maxBytes=null, mock=
const init = {
method: 'GET',
headers: {
Authorization: `Bearer ${window.SCP.readOnlyToken}`
Authorization: `Bearer ${getOAuthToken()}`
}
}

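A short usage sketch of the extracted helper. Everything here besides `getOAuthToken` itself is hypothetical: the bucket name, object path, byte range, and the direct GCS JSON API URL are illustrative stand-ins, not code from this commit.

import { getOAuthToken } from '~/lib/scp-api'

/** Hypothetical: read the first kilobyte of a bucket object with the portal's read-only token */
async function fetchObjectHead(bucketName, filePath) {
  const url =
    `https://storage.googleapis.com/download/storage/v1/b/${bucketName}` +
    `/o/${encodeURIComponent(filePath)}?alt=media`
  const response = await fetch(url, {
    method: 'GET',
    headers: {
      Authorization: `Bearer ${getOAuthToken()}`, // reads window.SCP.readOnlyToken
      Range: 'bytes=0-1023'
    }
  })
  return response
}
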
6 changes: 4 additions & 2 deletions app/javascript/lib/validation/shared-validation.js
@@ -3,9 +3,10 @@
*/

// Ultimately sourced from: scp-ingest-pipeline/schemas
import * as data from 'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json';
import * as _schema from 'lib/assets/metadata_schemas/alexandria_convention/alexandria_convention_schema.json';

export const REQUIRED_CONVENTION_COLUMNS = data.required.filter(c => c !== 'CellID')
export const metadataSchema = _schema
export const REQUIRED_CONVENTION_COLUMNS = metadataSchema.required.filter(c => c !== 'CellID')

/**
* ParseException can be thrown when we encounter an error that prevents us from parsing the file further
@@ -245,6 +246,7 @@ export function validateRequiredMetadataColumns(parsedHeaders, isAnnData=false)
const msg = `File is missing required ${columns}: ${missingCols.join(', ')}`
issues.push(['error', 'format:cap:metadata-missing-column', msg])
}

return issues
}

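For orientation, a small sketch of what the renamed export provides. The `disease` property and its `ontology` field are assumptions based on how `validate-anndata.js` reads the schema later in this diff; the import path mirrors the style used elsewhere in the codebase.

import { metadataSchema, REQUIRED_CONVENTION_COLUMNS } from 'lib/validation/shared-validation'

// 'CellID' is filtered out of the schema's required columns
console.assert(!REQUIRED_CONVENTION_COLUMNS.includes('CellID'))

// Ontology-backed properties expose their accepted ontologies as
// comma-separated OLS URLs (see getAcceptedOntologies in validate-anndata.js)
const diseaseOntologyUrls = metadataSchema.properties['disease'].ontology
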
143 changes: 131 additions & 12 deletions app/javascript/lib/validation/validate-anndata.js
@@ -1,6 +1,42 @@
import {openH5File} from '@single-cell-portal/hdf5-indexed-reader'
import {openH5File} from 'hdf5-indexed-reader'

import { validateUnique, validateRequiredMetadataColumns } from './shared-validation'
import {
validateUnique, validateRequiredMetadataColumns,
metadataSchema, REQUIRED_CONVENTION_COLUMNS
} from './shared-validation'
import { getOAuthToken } from '~/lib/scp-api'

/** Get ontology ID values for key in AnnData file */
async function getOntologyIds(key, hdf5File) {
let ontologyIds = []

const obs = await hdf5File.get('obs')
const obsValues = await Promise.all(obs.values)

// Old versions of the AnnData spec used __categories as an obs.
// However, in new versions (since before 2023-01-23) of AnnData spec,
// categorical arrays are encoded as self-contained groups containing their
// own `categories` and `codes`.
// See e.g. https://github.com/scverse/anndata/issues/879
const internalCategories = obsValues.find(o => o.name.endsWith('__categories'))

let resolvedCategories = obsValues
if (internalCategories) {
resolvedCategories = await Promise.all(internalCategories.values)
}
const group = resolvedCategories.find(o => o.name.endsWith(key))
if (group) {
let categories
if (internalCategories) {
ontologyIds = await group.value
} else {
categories = await group.values[0]
ontologyIds = await categories.value
}
}

return ontologyIds
}
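
// A sketch of the two encodings described above, with hypothetical HDF5 paths
// (illustration only; this function resolves them via the obs group objects):
//   old AnnData spec:  obs/__categories/disease -> ['MONDO_0005109', ...]
//   new AnnData spec:  obs/disease/categories   -> ['MONDO_0005109', ...]
//                      obs/disease/codes        -> [2, 0, 1, ...]  (codes index into categories)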

/** Get annotation headers for a key (e.g. obs) from an HDF5 file */
async function getAnnotationHeaders(key, hdf5File) {
Expand All @@ -20,47 +56,130 @@ function isUrl(fileOrUrl) {
return typeof fileOrUrl === 'string' && fileOrUrl.startsWith('http')
}

/** Get all headers from AnnData file */
export async function getAnnDataHeaders(fileOrUrl) {
/** Load local or remote AnnData file, return stream-parseable HDF5 object */
export async function getHdf5File(fileOrUrl, remoteProps) {
// Jest test uses Node, where file API differs
// TODO (SCP-5770): See if we can smoothen this and do away with `isTest`
const isTest = isUrl(fileOrUrl)

const isRemoteFileObject = !isUrl(fileOrUrl) && fileOrUrl.type === 'application/octet-stream'

// TODO (SCP-5770): Parameterize this, also support URL to remote file
const idType = isTest ? 'url' : 'file'
const idType = isTest || isRemoteFileObject ? 'url' : 'file'

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (isRemoteFileObject) {
return null
fileOrUrl = remoteProps.url
}

const openParams = {}
openParams[idType] = fileOrUrl

if (isRemoteFileObject) {
const oauthToken = getOAuthToken()
openParams.oauthToken = oauthToken
}

const hdf5File = await openH5File(openParams)
return hdf5File
}

/** Get all headers from AnnData file */
export async function getAnnDataHeaders(hdf5File) {
const headers = await getAnnotationHeaders('obs', hdf5File)

// const obsmHeaders = await getAnnotationHeaders('obsm', hdf5File)
// const xHeaders = await getAnnotationHeaders('X', hdf5File)
return headers
}

/**
* Get list of ontology names accepted for key from metadata schema
*
* E.g. "disease" -> ["MONDO", "PATO"]
*/
function getAcceptedOntologies(key, metadataSchema) {
// E.g. "ontology_browser_url": "https://www.ebi.ac.uk/ols/ontologies/mondo,https://www.ebi.ac.uk/ols/ontologies/pato"
const olsUrls = metadataSchema.properties[key].ontology

const acceptedOntologies =
olsUrls?.split(',').map(url => url.split('/').slice(-1)[0].toUpperCase())

if (acceptedOntologies.includes('NCBITAXON')) {
acceptedOntologies.push('NCBITaxon')
}

return acceptedOntologies
}

/**
* Check format of ontology IDs for key, return updated issues array
*
* TODO (SCP-5791): Move this rule to shared-validation.js, apply to classic as well
*/
export function checkOntologyIdFormat(key, ontologyIds) {
const issues = []

const acceptedOntologies = getAcceptedOntologies(key, metadataSchema)
if (!acceptedOntologies) {return}

ontologyIds.forEach(ontologyId => {
const ontologyShortName = ontologyId.split(/[_:]/)[0]
if (!acceptedOntologies.includes(ontologyShortName)) {
const accepted = acceptedOntologies.join(', ')
const msg =
`Ontology ID "${ontologyId}" ` +
`is not among accepted ontologies (${accepted}) ` +
`for key "${key}"`

// Match "ontology:label-lookup-error" error type used in Ingest Pipeline, per
// https://github.com/broadinstitute/scp-ingest-pipeline/blob/858bb96ea7669f799d8f42d30b0b3131e2091710/ingest/validation/validate_metadata.py
issues.push(['error', 'ontology:label-lookup-error', msg])
}
})

return issues
}

/** Validate ontology IDs for required metadata columns in AnnData file */
async function validateOntologyIdFormat(hdf5File) {
let issues = []

// Validate IDs for species, organ, disease, and library preparation protocol
for (let i = 0; i < REQUIRED_CONVENTION_COLUMNS.length; i++) {
const column = REQUIRED_CONVENTION_COLUMNS[i]
if (!column.endsWith('__ontology_label')) {continue}
const key = column.split('__ontology_label')[0]
const ontologyIds = await getOntologyIds(key, hdf5File)

issues = issues.concat(
checkOntologyIdFormat(key, ontologyIds)
)
}

return issues
}

/** Parse AnnData file, and return an array of issues, along with file parsing info */
export async function parseAnnDataFile(file) {
export async function parseAnnDataFile(fileOrUrl, remoteProps) {
let issues = []

const headers = await getAnnDataHeaders(file)
const hdf5File = await getHdf5File(fileOrUrl, remoteProps)

const headers = await getAnnDataHeaders(hdf5File)

// TODO (SCP-5770): Extend AnnData CSFV to remote files, then remove this
if (!headers) {
return { issues }
}

const requiredMetadataIssues = validateRequiredMetadataColumns([headers], true)
let ontologyIdFormatIssues = []
if (requiredMetadataIssues.length === 0) {
ontologyIdFormatIssues = await validateOntologyIdFormat(hdf5File)
}

issues = issues.concat(
validateUnique(headers),
validateRequiredMetadataColumns([headers], true)
requiredMetadataIssues,
ontologyIdFormatIssues
)

return { issues }
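A hedged usage sketch of the newly exported check. The IDs below are invented inputs, and the accepted-ontology list for "disease" (MONDO, PATO) follows the example given in the code comments above.

import { checkOntologyIdFormat } from 'lib/validation/validate-anndata'

// 'MONDO_0005109' has an accepted prefix for "disease"; 'FOO_0000001' does not,
// so one ['error', 'ontology:label-lookup-error', ...] issue is expected
const issues = checkOntologyIdFormat('disease', ['MONDO_0005109', 'FOO_0000001'])
console.log(issues.length) // 1
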
8 changes: 6 additions & 2 deletions app/javascript/lib/validation/validate-file-content.js
@@ -327,7 +327,10 @@ export async function validateGzipEncoding(file, fileType) {
* @returns {Object} result.issues Array of [category, type, message]
* @returns {Number} result.perfTime How long this function took
*/
async function parseFile(file, fileType, fileOptions={}, sizeProps={}) {
async function parseFile(
file, fileType, fileOptions={},
sizeProps={}, remoteProps={}
) {
const startTime = performance.now()

const fileInfo = {
@@ -371,7 +374,7 @@ async function parseFile(file, fileType, fileOptions={}, sizeProps={}) {
}

if (fileType === 'AnnData') {
const { issues } = await parseAnnDataFile(file)
const { issues } = await parseAnnDataFile(file, remoteProps)
parseResult.issues = parseResult.issues.concat(issues)
} else if (parseFunctions[fileType]) {
let ignoreLastLine = false
@@ -411,6 +414,7 @@
} else {
parseResult.issues.push(['error', 'parse:unhandled', error.message])
}
console.error(error)
}

const perfTime = Math.round(performance.now() - startTime)
8 changes: 7 additions & 1 deletion app/javascript/lib/validation/validate-file.js
@@ -166,8 +166,14 @@ async function validateRemoteFile(

const sizeProps = getSizeProps(contentRange, contentLength, file)

const remoteProps = {
url: response.url
}

// Equivalent block exists in validateFileContent
const parseResults = await ValidateFileContent.parseFile(file, fileType, fileOptions, sizeProps)
const parseResults = await ValidateFileContent.parseFile(
file, fileType, fileOptions, sizeProps, remoteProps
)
fileInfo = parseResults['fileInfo']
issues = parseResults['issues']
perfTime = parseResults['perfTime']
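End to end, a sketch of how `remoteProps` now threads through validation. The comments trace the call chain shown in this diff; the URL value is whatever signed bucket URL the earlier range request resolved to.

// Inside validateRemoteFile, roughly:
const remoteProps = { url: response.url } // e.g. a signed GCS URL for the AnnData file
const { issues } = await ValidateFileContent.parseFile(
  file, 'AnnData', fileOptions, sizeProps, remoteProps
)
// parseFile -> parseAnnDataFile -> getHdf5File, which opens the remote file with
// openH5File({ url: remoteProps.url, oauthToken: getOAuthToken() })
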
1 change: 1 addition & 0 deletions app/lib/differential_expression_service.rb
@@ -151,6 +151,7 @@ def self.run_differential_expression_job(cluster_group, study, user, annotation_
de_params[:barcode_file] = barcode_file.gs_url
elsif raw_matrix.file_type == 'AnnData'
de_params[:matrix_file_type] = 'h5ad'
de_params[:file_size] = raw_matrix.upload_file_size
else
de_params[:matrix_file_type] = 'dense'
end
2 changes: 1 addition & 1 deletion app/lib/file_parse_service.rb
@@ -113,7 +113,7 @@ def self.run_parse_job(study_file, study, user, reparse: false, persist_on_fail:
else
params_object = AnnDataIngestParameters.new(
anndata_file: study_file.gs_url, obsm_keys: study_file.ann_data_file_info.obsm_key_names,
file_size: study_file.upload_file_size
file_size: study_file.upload_file_size, extract_raw_counts: study_file.is_raw_counts_file?
)
end
# TODO extract and parse Raw Exp Data (SCP-4710)
35 changes: 18 additions & 17 deletions app/models/ann_data_ingest_parameters.rb
@@ -3,6 +3,10 @@
class AnnDataIngestParameters
include ActiveModel::Model
include Parameterizable
include ComputeScaling

# RAM scaling coefficient for auto-selecting machine_type
GB_PER_CORE = 1.75

# default values for parameters, also used as control list for attributes hash
# attributes marked as true are passed to the command line as a standalone flag with no value
@@ -13,6 +17,7 @@ class AnnDataIngestParameters
# ingest_anndata: gate primary validation/extraction of AnnData file
# anndata_file: GS URL for AnnData file
# extract: array of values for different file type extractions
# extract_raw_counts: T/F for whether to add raw_counts to extraction
# obsm_keys: data slots containing clustering information
# ingest_cluster: gate ingesting an extracted cluster file
# cluster_file: GS URL for extracted cluster file
@@ -32,7 +37,8 @@
cluster_file: nil,
name: nil,
domain_ranges: nil,
extract: %w[cluster metadata processed_expression raw_counts],
extract: %w[cluster metadata processed_expression],
extract_raw_counts: false,
cell_metadata_file: nil,
ingest_cell_metadata: false,
study_accession: nil,
@@ -46,19 +52,7 @@
}.freeze

# values that are available as methods but not as attributes (and not passed to command line)
NON_ATTRIBUTE_PARAMS = %i[file_size machine_type].freeze

# GCE machine types and file size ranges for handling fragment extraction
# produces a hash with entries like { 'n2-highmem-4' => 0..24.gigabytes }
# adjust (core * n) to n=4 for faster scaling (ie. n2-highmem-4 for 0 to 16G)
NUM_CORES = [4, 8, 16, 32, 48, 64, 80, 96].freeze
RAM_PER_CORE = NUM_CORES.map { |core| (core * 6).gigabytes }.freeze
EXTRACT_MACHINE_TYPES = NUM_CORES.map.with_index do |cores, index|
floor = index == 0 ? 0 : RAM_PER_CORE[index - 1]
limit = index == NUM_CORES.count - 1 ? RAM_PER_CORE[index] * 2 : RAM_PER_CORE[index]
# ranges that use '...' exclude the given end value.
{ "n2d-highmem-#{cores}" => floor...limit }
end.reduce({}, :merge).freeze
NON_ATTRIBUTE_PARAMS = %i[file_size machine_type extract_raw_counts].freeze

attr_accessor(*PARAM_DEFAULTS.keys)

@@ -73,14 +67,21 @@ def initialize(attributes = nil)
# machine_type default is declared here to allow for autoscaling with optional override
# see https://ruby-doc.org/core-3.1.0/Range.html#method-i-3D-3D-3D for range detection doc
if @machine_type.nil?
self.machine_type = EXTRACT_MACHINE_TYPES.detect do |_, mem_range|
mem_range === file_size
end&.first || 'n2d-highmem-4'
self.machine_type = ingest_anndata ? assign_machine_type : default_machine_type
end
append_raw_counts_extract!
end

# get the particular file (either source AnnData or fragment) being processed by this job
def associated_file
anndata_file || cluster_file || cell_metadata_file || matrix_file
end

private

def append_raw_counts_extract!
if @ingest_anndata && @extract_raw_counts && !@extract.include?('raw_counts')
self.extract << 'raw_counts'
end
end
end