Skip to content

Commit

Permalink
merging branch new scoring
Browse files Browse the repository at this point in the history
  • Loading branch information
aineniamh committed May 27, 2021
2 parents a6e5c46 + 69ad865 commit f2f30b0
Show file tree
Hide file tree
Showing 24 changed files with 842 additions and 979 deletions.
3 changes: 3 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ dependencies:
- python>=3.7
- snakemake-minimal>=5.13
- gofasta
- usher
- pip:
- pandas==1.0.1
- git+https://github.com/cov-lineages/pangoLEARN.git
- git+https://github.com/cov-lineages/scorpio.git
- git+https://github.com/cov-lineages/constellations.git
8 changes: 7 additions & 1 deletion pangolin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
_program = "pangolin"
__version__ = "2.4.2"
__version__ = "3.0"


__all__ = [
"utils"]

from pangolin import *
190 changes: 101 additions & 89 deletions pangolin/command.py

Large diffs are not rendered by default.

17 changes: 0 additions & 17 deletions pangolin/data/config_b.1.1.7.csv

This file was deleted.

6 changes: 0 additions & 6 deletions pangolin/data/config_b.1.214.2.csv

This file was deleted.

9 changes: 0 additions & 9 deletions pangolin/data/config_b.1.351.csv

This file was deleted.

15 changes: 0 additions & 15 deletions pangolin/data/config_p.1.csv

This file was deleted.

5 changes: 0 additions & 5 deletions pangolin/data/config_p.2.csv

This file was deleted.

12 changes: 0 additions & 12 deletions pangolin/data/config_p.3.csv

This file was deleted.

62 changes: 52 additions & 10 deletions pangolin/scripts/pangolearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def parse_args():
referenceSeq = ""
referenceId = "reference"

imputationScores = dict()

def findReferenceSeq():
currentSeq = ""

Expand All @@ -49,23 +51,43 @@ def clean(x, loc):
x = x.upper()

if x == 'T' or x == 'A' or x == 'G' or x == 'C' or x == '-':
return x
return x, False

if x == 'U':
return 'T'
return 'T', False

# replace ambiguity with the reference seq value
return referenceSeq[loc]

if referenceSeq[loc] == x:
return referenceSeq[loc], False

return referenceSeq[loc], True


# generates data line
def encodeSeq(seq, indiciesToKeep):
dataLine = []
imputed = 0
nonimputed = 0

for i in indiciesToKeep:
if i < len(seq):
dataLine.extend(clean(seq[i], i))
cleaned, imputed = clean(seq[i], i)

dataLine.extend(cleaned)

return dataLine
if(imputed):
imputed = imputed + 1
else:
nonimputed = nonimputed + 1


score = 0

if nonimputed != 0 and imputed < nonimputed:
score = 1 - (imputed/nonimputed)

return dataLine, score


# reads in the two data files
Expand All @@ -88,9 +110,13 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
if currentSeq:
# yield sequence as one-hot encoded vector
idList.append(seqid)
seqList.append(encodeSeq(currentSeq, indiciesToKeep))

finalSeq, imputationScore = encodeSeq(currentSeq, indiciesToKeep)
seqList.append(finalSeq)
currentSeq = ""

imputationScores[seqid] = imputationScore

# this is a fasta line designating an id, but we don't want to keep the >
seqid = line.strip('>')

Expand All @@ -104,7 +130,9 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):

# gotta get the last one
idList.append(seqid)
seqList.append(encodeSeq(currentSeq, indiciesToKeep))
finalSeq, imputationScore = encodeSeq(currentSeq, indiciesToKeep)
seqList.append(finalSeq)
imputationScores[seqid] = imputationScore

yield idList, seqList

Expand All @@ -117,14 +145,19 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
# possible nucleotide symbols
categories = ['-','A', 'C', 'G', 'T']
columns = [f"{i}_{c}" for i in indiciesToKeep for c in categories]
refRow = [r==c for r in encodeSeq(referenceSeq, indiciesToKeep) for c in categories]



rs, score = encodeSeq(referenceSeq, indiciesToKeep)

refRow = [r==c for r in rs for c in categories]

print("loading model " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
loaded_model = joblib.load(args.model_file)

# write predictions to a file
f = open(args.outfile, "w")

f.write("taxon,prediction,score,imputation_score,non_zero_ids,non_zero_scores,designated\n")
for idList, seqList in readInAndFormatData(args.sequences_file, indiciesToKeep):
print("processing block of {} sequences {}".format(
len(seqList), datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
Expand All @@ -148,18 +181,27 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
maxScore = 0
maxIndex = -1

nonZeroIds = []
nonZeroScores = []

# get the max probability score and its assosciated index
for i in range(len(predictions[index])):
if predictions[index][i] > maxScore:
maxScore = predictions[index][i]
maxIndex = i

nonZeroScores.append(predictions[index][i])
nonZeroIds.append(loaded_model.classes_[i])

score = maxScore
prediction = loaded_model.classes_[maxIndex]
seqId = idList[index]

nonZeroIds = ";".join(nonZeroIds)
nonZeroScores = ';'.join(str(x) for x in nonZeroScores)

if seqId != referenceId:
f.write(seqId + "," + prediction + "," + str(score) + "\n")
f.write(seqId + "," + prediction + "," + str(score) + "," + str(imputationScores[seqId]) + "," + nonZeroIds + "," + nonZeroScores + "," + "\n")

f.close()

Expand Down
Loading

0 comments on commit f2f30b0

Please sign in to comment.