Skip to content

Commit

Permalink
models: Add new VariantStats fields. #174
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Feb 10, 2020
1 parent a4972ea commit 53089ee
Show file tree
Hide file tree
Showing 8 changed files with 177 additions and 16 deletions.
40 changes: 34 additions & 6 deletions biodata-models/src/main/avro/variant.avdl
Original file line number Diff line number Diff line change
Expand Up @@ -39,51 +39,79 @@ protocol Variants {
* Total number of alleles in called genotypeCounters. Does not include missing alleles
**/
union { null, int } alleleCount;

/**
* Number of reference alleles found in this variant
**/
union { null, int } refAlleleCount;

/**
* Number of main alternate alleles found in this variants. Does not include secondary alternates
**/
union { null, int } altAlleleCount;

/**
* Reference allele frequency calculated from refAlleleCount and alleleCount, in the range (0,1)
**/
union { null, float } refAlleleFreq;

/**
* Alternate allele frequency calculated from altAlleleCount and alleleCount, in the range (0,1)
**/
union { null, float } altAlleleFreq;

/**
* Number of missing alleles
**/
union { null, int } missingAlleleCount;

/**
* Number of missing genotypeCounters
**/
union { null, int } missingGenotypeCount;

/**
* Count for each genotype found
**/
@java-key-class("org.opencb.biodata.models.feature.Genotype") map<int> genotypeCount;
@java-key-class("org.opencb.biodata.models.feature.Genotype")
map<int> genotypeCount = {};

/**
* Genotype frequency for each genotype found
**/
@java-key-class("org.opencb.biodata.models.feature.Genotype") map<float> genotypeFreq;
@java-key-class("org.opencb.biodata.models.feature.Genotype")
map<float> genotypeFreq = {};

/**
* Number of missing alleles
Number of samples with non-missing genotype with that specific filter.
**/
union { null, int } missingAlleleCount;
map<int> filterCount;

/**
* Number of missing genotypeCounters
Frequency of each filter. Count divided by the number of non-missing samples.
**/
union { null, int } missingGenotypeCount;
map<float> filterFreq;

/**
The weighted average of the Quality, computed only for non-missing samples.
**/
union { null, float } qualityAvg;

/**
* Minor allele frequency
**/
union { null, float } maf;

/**
* Minor genotype frequency
**/
union { null, float } mgf;

/**
* Allele with minor frequency
**/
union { null, string } mafAllele;

/**
* Genotype with minor frequency
**/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,9 @@ public VariantStats(org.opencb.biodata.models.variant.avro.VariantStats other) {
public VariantStats(float maf, float mgf, String mafAllele, String mgfGenotype,
int missingAlleleCount, int missingGenotypeCount) {
impl = new org.opencb.biodata.models.variant.avro.VariantStats(-1, -1, -1, -1F, -1F,
new HashMap<>(), new HashMap<>(),
missingAlleleCount, missingGenotypeCount,
new HashMap<>(), new HashMap<>(),
new HashMap<>(), new HashMap<>(), -1F,
maf, mgf, mafAllele, mgfGenotype);
}

Expand Down Expand Up @@ -181,6 +182,30 @@ public VariantStats addGenotype(Genotype g, int addedCount, boolean normalize) {
return this;
}

public java.util.Map<java.lang.String,java.lang.Integer> getFilterCount() {
return impl.getFilterCount();
}

public void setFilterCount(java.util.Map<java.lang.String,java.lang.Integer> value) {
this.impl.setFilterCount(value);
}

public java.util.Map<java.lang.String,java.lang.Float> getFilterFreq() {
return impl.getFilterFreq();
}

public void setFilterFreq(java.util.Map<java.lang.String,java.lang.Float> value) {
this.impl.setFilterFreq(value);
}

public java.lang.Float getQualityAvg() {
return impl.getQualityAvg();
}

public void setQualityAvg(java.lang.Float value) {
this.impl.setQualityAvg(value);
}

private Genotype normalizeGenotypeAlleles(Genotype g) {
// Get alleles sorted in ascending order
int[] sortedAlleles = g.getNormalizedAllelesIdx();
Expand Down
32 changes: 28 additions & 4 deletions biodata-models/src/main/proto/protobuf/opencb/variant.proto
Original file line number Diff line number Diff line change
Expand Up @@ -47,43 +47,67 @@ message VariantStats {
* Total number of alleles in called genotypeCounters. Does not include missing alleles
**/
int32 alleleCount = 1;

/**
* Number of reference alleles found in this variant
**/
int32 refAlleleCount = 2;

/**
* Number of main alternate alleles found in this variants. Does not include secondary alternates
**/
int32 altAlleleCount = 3;

/**
* Reference allele frequency calculated from refAlleleCount and alleleCount, in the range (0,1)
**/
float refAlleleFreq = 4;

/**
* Alternate allele frequency calculated from altAlleleCount and alleleCount, in the range (0,1)
**/
float altAlleleFreq = 5;

/**
* Number of missing alleles
**/
int32 missingAlleleCount = 8;

/**
* Number of missing genotypeCounters
**/
int32 missingGenotypeCount = 9;

/**
* Count for each genotype found
**/
map<string, int32> genotypeCount = 6;

/**
* Genotype frequency for each genotype found
**/
map<string, float> genotypeFreq = 7;

/**
* Number of missing alleles
* Number of samples with non-missing genotype with that specific filter.
**/
int32 missingAlleleCount = 8;
map<string, int32> filterCount = 14;

/**
* Number of missing genotypeCounters
* Frequency of each filter. Count divided by the number of non-missing samples.
**/
int32 missingGenotypeCount = 9;
map<string, float> filterFreq = 15;

/**
* The weighted average of the Quality, computed only for non-missing samples.
**/
float qualityAvg = 16;

/**
* Minor allele frequency
**/
float maf = 10;

/**
* Minor genotype frequency
**/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ protected void parseStats(Variant variant, StudyEntry study, int numAllele, Stri
String splitsGTC[] = info.get("GTC").split(",");
addGenotypeWithGTS(study.getAttributes(), splitsGTC, reference, alternateAlleles, numAllele, stats);
}
calculateFilterQualStats(fileEntry.getAttributes(), stats);

study.setStats(StudyEntry.DEFAULT_COHORT, stats);
}

Expand Down Expand Up @@ -150,6 +152,9 @@ protected void parseMappedStats(Variant variant, StudyEntry studyEntry,
}
// TODO reprocess stats to complete inferable values. A StatsHolder may be needed to keep values not storables in VariantStats
}
for (VariantStats stats : studyEntry.getStats().values()) {
calculateFilterQualStats(info, stats);
}
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ protected void parseStats(Variant variant, StudyEntry studyEntry, int numAllele,
int an = Integer.parseInt(info.get(AN_ADJ));
setMaf(an, acCounts, variant.getReference(), alternateAlleles, stats);
}
calculateFilterQualStats(info, stats);

studyEntry.setStats(StudyEntry.DEFAULT_COHORT, stats);
}
Expand Down Expand Up @@ -170,6 +171,7 @@ protected void parseMappedStats(Variant variant, StudyEntry studyEntry, int numA
for (String cohortName : studyEntry.getStats().keySet()) {
if (ans.containsKey(cohortName)) {
VariantStats cohortStats = studyEntry.getStats(cohortName);
calculateFilterQualStats(info, cohortStats);
Integer alleleNumber = ans.get(cohortName);
addReferenceGenotype(variant, cohortStats, alleleNumber);
setRefAlleleCount(cohortStats, alleleNumber, acs.get(cohortName));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.opencb.biodata.tools.variant.stats;

import org.apache.commons.lang.StringUtils;
import org.opencb.biodata.models.feature.Genotype;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.formats.variant.vcf4.VariantAggregatedVcfFactory;
Expand Down Expand Up @@ -165,11 +166,7 @@ protected void parseMappedStats(Variant variant, StudyEntry file, int numAllele,
String[] tagSplit = opencgaTag.split(DOT);
String cohortName = tagSplit[0];
String statName = tagSplit[1];
Map<String, String> parsedValues = cohortStats.get(cohortName);
if (parsedValues == null) {
parsedValues = new LinkedHashMap<>();
cohortStats.put(cohortName, parsedValues);
}
Map<String, String> parsedValues = cohortStats.computeIfAbsent(cohortName, k -> new LinkedHashMap<>());
parsedValues.put(statName, entry.getValue());
}
}
Expand Down Expand Up @@ -341,6 +338,21 @@ protected void calculate(Variant variant, StudyEntry studyEntry, int numAllele,
VariantStatsCalculator.calculateGenotypeFrequencies(variantStats);
}
}

calculateFilterQualStats(attributes, variantStats);
}

protected void calculateFilterQualStats(Map<String, String> attributes, VariantStats variantStats) {
String filter = attributes.get(StudyEntry.FILTER);
if (StringUtils.isNotEmpty(filter)) {
VariantStatsCalculator.addFileFilter(filter, variantStats.getFilterCount());
VariantStatsCalculator.calculateFilterFreq(variantStats, 1);
}

String qual = attributes.get(StudyEntry.QUAL);
if (StringUtils.isNotEmpty(qual) && !qual.equals(".")) {
variantStats.setQualityAvg(Float.valueOf(qual));
}
}

protected float parseFloat(String s, float missingValue) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@

package org.opencb.biodata.tools.variant.stats;

import org.apache.commons.lang.StringUtils;
import org.opencb.biodata.models.feature.AllelesCode;
import org.opencb.biodata.models.feature.Genotype;
import org.opencb.biodata.models.pedigree.Pedigree;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.stats.VariantStats;

import java.util.*;
Expand Down Expand Up @@ -60,6 +62,24 @@ public static VariantStats calculate(Variant variant, StudyEntry study, Collecti

calculate(gtCount, variantStats, variant.getReference(), variant.getAlternate());

int numFilterFiles = 0;
int numQualFiles = 0;
double qualSum = 0;
for (FileEntry file : study.getFiles()) {
String filter = file.getAttributes().get(StudyEntry.FILTER);
if (StringUtils.isNotEmpty(filter)) {
addFileFilter(filter, variantStats.getFilterCount());
numFilterFiles++;
}
String qual = file.getAttributes().get(StudyEntry.QUAL);
if (StringUtils.isNotEmpty(qual) && !qual.equals(".")) {
qualSum += Double.parseDouble(qual);
numQualFiles++;
}
}
calculateFilterFreq(variantStats, numFilterFiles, variantStats.getFilterCount());
variantStats.setQualityAvg((float) (qualSum / numQualFiles));

// Calculate Hardy-Weinberg statistic //FIXME
// variantStats.getHw().calculate();

Expand Down Expand Up @@ -116,6 +136,33 @@ public static void calculate(Map<Genotype, Integer> genotypeCount, VariantStats
calculate(genotypeCount, variantStats, refAllele, altAllele, true);
}

public static void calculateFilterFreq(VariantStats variantStats, final int numFiles) {
calculateFilterFreq(variantStats, numFiles, variantStats.getFilterCount());
}

public static void calculateFilterFreq(VariantStats variantStats, final int numFiles, Map<String, Integer> filterCount) {
variantStats.setFilterCount(filterCount);
if (numFiles > 0) {
Map<String, Float> filterFreq = variantStats.getFilterFreq();
filterCount.forEach((filter, count) -> filterFreq.put(filter, count / (float) numFiles));
} // else -> do not fill freqs
}

public static void addFileFilter(String filter, Map<String, Integer> filterCount) {
int endIndex = 0;
do {
int startIndex = endIndex;
endIndex = filter.indexOf(";", endIndex);
if (endIndex < 0) {
filterCount.merge(filter.substring(startIndex), 1, Integer::sum);
break;
} else {
filterCount.merge(filter.substring(startIndex, endIndex), 1, Integer::sum);
}
endIndex++;
} while (true);
}

public static void calculate(Map<Genotype, Integer> genotypeCount, VariantStats variantStats,
String refAllele, String altAllele, boolean multiAllelic) {
// Map<String, Genotype> gts = new TreeMap<>(String::compareTo);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,24 @@ public void testCreate_1000g_38_liftover_indel() throws Exception {
// System.out.println(new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(s.getStats()));
}

@Test
public void testAddFilter() {
HashMap<String, Integer> map = new HashMap<>();
VariantStatsCalculator.addFileFilter("PASS", map);
assertEquals("PASS", Integer.valueOf(1), map.get("PASS"));

VariantStatsCalculator.addFileFilter("PASS", map);
assertEquals("PASS", Integer.valueOf(2), map.get("PASS"));

VariantStatsCalculator.addFileFilter("PASS;other", map);
assertEquals("PASS", Integer.valueOf(3), map.get("PASS"));
assertEquals("other", Integer.valueOf(1), map.get("other"));

VariantStatsCalculator.addFileFilter("other", map);
assertEquals("PASS", Integer.valueOf(3), map.get("PASS"));
assertEquals("other", Integer.valueOf(2), map.get("other"));
}

private Properties get1000gLiftOverTagMap() {
Properties tagMap = new Properties();
tagMap.put("ALL.MAF", "MAF");
Expand Down

0 comments on commit 53089ee

Please sign in to comment.