From cb72c12eb2ab9830b96067c9741fe5f270178a18 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Fri, 26 Apr 2024 08:28:17 +0200 Subject: [PATCH 1/6] refactored English output for clarity --- .github/workflows/documentation.yml | 2 +- docs/_static/style.css | 3 - pom.xml | 2 +- .../model/AgeNotSpecified.java | 5 + .../phenopacket2prompt/model/HpoOnsetAge.java | 10 + .../phenopacket2prompt/model/Iso8601Age.java | 6 + .../model/PhenopacketAge.java | 2 + .../model/PpktIndividual.java | 100 ++++++- .../output/PhenopacketAgeSexGenerator.java | 8 +- .../PpktPhenotypicFeatureGenerator.java | 31 +- .../output/PromptGenerator.java | 38 ++- .../impl/english/EnglishPromptGenerator.java | 77 +---- .../impl/english/PpktAgeSexEnglish.java | 282 +++++++++++++++++- .../english/PpktPhenotypicfeatureEnglish.java | 45 ++- .../impl/spanish/PpktAgeSexSpanish.java | 22 +- .../spanish/PpktPhenotypicfeatureSpanish.java | 12 +- .../impl/spanish/SpanishPromptGenerator.java | 51 +--- .../model/PpktIndividualTest.java | 2 +- 18 files changed, 516 insertions(+), 182 deletions(-) delete mode 100644 docs/_static/style.css diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 48983c6..d8fdef1 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -2,7 +2,7 @@ name: mkdocs-generation on: push: branches: - - main + - [main, develop] permissions: contents: write jobs: diff --git a/docs/_static/style.css b/docs/_static/style.css deleted file mode 100644 index 0a9e3c5..0000000 --- a/docs/_static/style.css +++ /dev/null @@ -1,3 +0,0 @@ -.wy-nav-content { - max-width: 80% !important; -} \ No newline at end of file diff --git a/pom.xml b/pom.xml index 4b871bf..5bd5653 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.monarchinitiative phenopacket2prompt - 0.3.11 + 0.3.12 phenopacket2prompt https://github.com/monarch-initiative/phenopacket2prompt diff --git 
a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/AgeNotSpecified.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/AgeNotSpecified.java index 7987ae1..e94e62e 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/AgeNotSpecified.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/AgeNotSpecified.java @@ -11,6 +11,11 @@ public PhenopacketAgeType ageType() { return PhenopacketAgeType.NOT_SPECIFIED; } + @Override + public boolean isJuvenile() { + return false; + } + @Override public boolean isChild() { return false; diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/HpoOnsetAge.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/HpoOnsetAge.java index 6c7cd41..b8a487f 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/HpoOnsetAge.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/HpoOnsetAge.java @@ -22,6 +22,7 @@ public class HpoOnsetAge implements PhenopacketAge { /** Childhood onset */ private final static TermId childhoodOnset = TermId.of("HP:0011463"); + private final static TermId juvenileOnset = TermId.of("HP:0003621"); /** Infantile onset */ private final static TermId infantileOnset = TermId.of("HP:0003593"); @@ -54,6 +55,11 @@ public PhenopacketAgeType ageType() { return PhenopacketAgeType.HPO_ONSET_AGE_TYPE; } + @Override + public boolean isJuvenile() { + return tid.equals(juvenileOnset); + } + @Override public boolean isChild() { @@ -80,4 +86,8 @@ public boolean isFetus() { public int totalDays() { return totalDays; } + + public TermId getTid() { + return tid; + } } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/Iso8601Age.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/Iso8601Age.java index 1692624..34e5781 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/Iso8601Age.java +++ 
b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/Iso8601Age.java @@ -71,6 +71,12 @@ public PhenopacketAgeType ageType() { return PhenopacketAgeType.ISO8601_AGE_TYPE; } + + @Override + public boolean isJuvenile() { + return years >= 10 && years < 18; + } + @Override public boolean isChild() { return years >= 1 && years < 10; diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PhenopacketAge.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PhenopacketAge.java index a08a1d1..32c0902 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PhenopacketAge.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PhenopacketAge.java @@ -5,6 +5,8 @@ public interface PhenopacketAge { String age(); PhenopacketAgeType ageType(); + boolean isJuvenile(); + boolean isChild(); boolean isInfant(); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java index 24ffece..7d7c02e 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java @@ -96,10 +96,85 @@ public List getDiseases() { return diseases; } - public Map> getPhenotypicFeatures() { + + + public List getPhenotypicFeaturesWithNoSpecifiedAge() { + List unspecifiedFeatures = new ArrayList<>(); + for (var pf : ppkt.getPhenotypicFeaturesList()) { + OntologyClass clz = pf.getType(); + if (clz.getId().isEmpty()) { + System.err.println("Warning, empty ontology term"); + continue; + } + TermId hpoId = TermId.of(pf.getType().getId()); + String label = pf.getType().getLabel(); + boolean excluded = pf.getExcluded(); + if (pf.hasOnset()) { + continue; + } else { + unspecifiedFeatures.add(new OntologyTerm(hpoId, label, excluded)); + } + } + return unspecifiedFeatures; + } + + + private boolean 
agesEqual(PhenopacketAge ageOne, PhenopacketAge ageTwo) { + if (ageOne.ageType().equals(ageTwo.ageType())) { + if (ageOne.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoOne = (Iso8601Age) ageOne; + Iso8601Age isoTwo = (Iso8601Age) ageTwo; + return isoOne.getDays() == isoTwo.getDays() && + isoOne.getMonths() == isoTwo.getMonths() && + isoOne.getYears() == isoTwo.getYears(); + } else if (ageOne.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge onsetOne = (HpoOnsetAge) ageOne; + HpoOnsetAge onsetTwo = (HpoOnsetAge) ageTwo; + return onsetOne.getTid().equals(onsetTwo.getTid()); + } + } + return false; + } + + + public List getPhenotypicFeaturesAtOnset() { + Optional opt = getAgeAtOnset(); + if (opt.isEmpty()) { + return List.of(); // + } + List onsetFeatures = new ArrayList<>(); + PhenopacketAge onsetAge = opt.get(); + + for (var pf : ppkt.getPhenotypicFeaturesList()) { + OntologyClass clz = pf.getType(); + if (clz.getId().isEmpty()) { + System.err.println("Warning, empty ontology term"); + continue; + } + TermId hpoId = TermId.of(pf.getType().getId()); + String label = pf.getType().getLabel(); + boolean excluded = pf.getExcluded(); + if (pf.hasOnset()) { + TimeElement telem = pf.getOnset(); + Optional ageOpt = getAgeFromTimeElement(telem); + if (ageOpt.isPresent()) { + if (agesEqual(onsetAge, ageOpt.get())) { + onsetFeatures.add(new OntologyTerm(hpoId, label, excluded, onsetAge)); + } + } + } + } + return onsetFeatures; + } + + /** + * Get a map of phenotypic features with specified onset after the age of onset + * This does not include features with unspecified onset (for that, use {@code getPhenotypicFeaturesWithNoSpecifiedAge}). 
+ * @return + */ + public Map> getSpecifiedAgePhenotypicFeatures() { Map> ageToFeatureMap = new HashMap<>(); - PhenopacketAge notSpecified = new AgeNotSpecified(); - ageToFeatureMap.put(notSpecified, new ArrayList<>()); + Optional onsetOpt = getAgeAtOnset(); for (var pf : ppkt.getPhenotypicFeaturesList()) { OntologyClass clz = pf.getType(); if (clz.getId().isEmpty()) { @@ -109,16 +184,21 @@ public Map> getPhenotypicFeatures() { TermId hpoId = TermId.of(pf.getType().getId()); String label = pf.getType().getLabel(); boolean excluded = pf.getExcluded(); - Optional opt = Optional.empty(); + Optional ageOpt = Optional.empty(); if (pf.hasOnset()) { TimeElement telem = pf.getOnset(); - opt = getAgeFromTimeElement(telem); + ageOpt = getAgeFromTimeElement(telem); } - if (opt.isPresent()) { - ageToFeatureMap.putIfAbsent(opt.get(), new ArrayList<>()); - ageToFeatureMap.get(opt.get()).add(new OntologyTerm(hpoId, label, excluded, opt.get())); - } else { - ageToFeatureMap.get(notSpecified).add(new OntologyTerm(hpoId, label, excluded)); + // skip features that occur at age of onset + if (ageOpt.isPresent() && onsetOpt.isPresent()) { + if (agesEqual(ageOpt.get(), onsetOpt.get())) { + continue; + } + } + // only add features with specified onset here. 
+ if (ageOpt.isPresent()) { + ageToFeatureMap.putIfAbsent(ageOpt.get(), new ArrayList<>()); + ageToFeatureMap.get(ageOpt.get()).add(new OntologyTerm(hpoId, label, excluded, ageOpt.get())); } } return ageToFeatureMap; diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PhenopacketAgeSexGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PhenopacketAgeSexGenerator.java index 7da409e..606e6e2 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PhenopacketAgeSexGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PhenopacketAgeSexGenerator.java @@ -1,15 +1,19 @@ package org.monarchinitiative.phenopacket2prompt.output; import org.monarchinitiative.phenopacket2prompt.model.PhenopacketAge; +import org.monarchinitiative.phenopacket2prompt.model.PhenopacketSex; import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; public interface PhenopacketAgeSexGenerator { - String individualWithAge(PhenopacketAge ppktAge); + String getIndividualDescription(PpktIndividual individual); + + + String heSheIndividual(PhenopacketSex psex); String atAge(PhenopacketAge ppktAge); - String ppktSex(); + //String ppktSex(); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PpktPhenotypicFeatureGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PpktPhenotypicFeatureGenerator.java index f7a65b5..e3f3443 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PpktPhenotypicFeatureGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PpktPhenotypicFeatureGenerator.java @@ -8,36 +8,7 @@ public interface PpktPhenotypicFeatureGenerator { - String featureList( List ontologyTerms); - - String excludedFeatureList( List ontologyTerms); - - - default boolean hasObservedFeatures( List ontologyTerms) { - return ontologyTerms.stream().anyMatch(Predicate.not(OntologyTerm::isExcluded)); - } - - 
default boolean hasExcludedFeatures( List ontologyTerms) { - return ontologyTerms.stream().anyMatch(OntologyTerm::isExcluded); - } - - default String getOxfordCommaList(List items, String andWord) { - if (items.size() == 2) { - // no comma if we just have two items. - // one item will work with the below code - String andWithSpace = String.format(" %s ", andWord); - return String.join(andWithSpace, items) + "."; - } - StringBuilder sb = new StringBuilder(); - String symList = String.join(", ", items); - int jj = symList.lastIndexOf(", "); - if (jj > 0) { - String andWithSpaceAndComma = String.format(", %s ", andWord); - symList = symList.substring(0, jj) + andWithSpaceAndComma + symList.substring(jj+2); - } - sb.append(symList); - return sb.toString(); - } + String formatFeatures( List ontologyTerms); } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java index 82a0510..6603b38 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PromptGenerator.java @@ -2,10 +2,16 @@ import org.monarchinitiative.phenol.ontology.data.Ontology; import org.monarchinitiative.phenopacket2prompt.international.HpInternational; +import org.monarchinitiative.phenopacket2prompt.model.OntologyTerm; +import org.monarchinitiative.phenopacket2prompt.model.PhenopacketAge; +import org.monarchinitiative.phenopacket2prompt.model.PhenopacketSex; import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; import org.monarchinitiative.phenopacket2prompt.output.impl.english.EnglishPromptGenerator; import org.monarchinitiative.phenopacket2prompt.output.impl.spanish.*; +import java.util.List; +import java.util.Map; + public interface PromptGenerator { @@ -13,12 +19,12 @@ public interface PromptGenerator { String queryHeader(); String 
getIndividualInformation(PpktIndividual ppktIndividual); - String getPhenotypicFeatures(PpktIndividual ppktIndividual); + String formatFeatures( List ontologyTerms); + String getVignetteAtAge(PhenopacketAge page, PhenopacketSex psex, List terms); public static PromptGenerator english(Ontology ontology){ - return new EnglishPromptGenerator(ontology); } @@ -27,11 +33,29 @@ static PromptGenerator spanish(Ontology hpo, HpInternational international) { return new SpanishPromptGenerator(hpo, pfgen); } + /** + * The following structure should work for most other languages, but the function + * can be overridden if necessary. + * @param individual The individual for whom we are creating the prompt + * @return the prompt text + */ default String createPrompt(PpktIndividual individual) { - String sb = queryHeader() + - getIndividualInformation(individual) + - getPhenotypicFeatures(individual); - return sb; + String individualInfo = getIndividualInformation(individual); + List onsetTerms = individual.getPhenotypicFeaturesAtOnset(); + List unspecifiedAgeTerms = individual.getPhenotypicFeaturesWithNoSpecifiedAge(); + Map> pfMap = individual.getSpecifiedAgePhenotypicFeatures(); + // For creating the prompt, we first report the onset and the unspecified terms together, and then + // report the rest + onsetTerms.addAll(unspecifiedAgeTerms); + String onsetFeatures = formatFeatures(onsetTerms); + StringBuilder sb = new StringBuilder(); + sb.append(queryHeader()); + sb.append(individualInfo).append(" ").append(onsetFeatures); + for (var entry: pfMap.entrySet()) { + String vignette = getVignetteAtAge(entry.getKey(), individual.getSex(), entry.getValue()); + sb.append(vignette).append(" "); + } + return sb.toString(); } @@ -39,4 +63,6 @@ default String createPrompt(PpktIndividual individual) { + + } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java 
b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java index d162c0f..1c7f5ec 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java @@ -3,16 +3,17 @@ import org.monarchinitiative.phenol.ontology.data.Ontology; import org.monarchinitiative.phenopacket2prompt.model.OntologyTerm; import org.monarchinitiative.phenopacket2prompt.model.PhenopacketAge; +import org.monarchinitiative.phenopacket2prompt.model.PhenopacketSex; import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; import org.monarchinitiative.phenopacket2prompt.output.*; import java.util.*; +import java.util.function.Predicate; public class EnglishPromptGenerator implements PromptGenerator { private final Ontology hpo; - private final PhenopacketAgeSexGenerator ppktAgeGenerator; private final PhenopacketTextGenerator ppktTextGenerator; @@ -20,7 +21,6 @@ public class EnglishPromptGenerator implements PromptGenerator { private final PpktPhenotypicFeatureGenerator ppktPhenotypicFeatureGenerator; - public EnglishPromptGenerator(Ontology hpo){ this.hpo = hpo; ppktAgeGenerator = new PpktAgeSexEnglish(); @@ -29,75 +29,26 @@ public EnglishPromptGenerator(Ontology hpo){ } - + @Override + public String queryHeader() { + return ppktTextGenerator.QUERY_HEADER(); + } @Override public String getIndividualInformation(PpktIndividual ppktIndividual) { - StringBuilder sb = new StringBuilder(); - /*String sex = sexGenerator.ppktSex(ppktIndividual); - Optional lastAgeOpt = ppktIndividual.getAgeAtLastExamination(); - Optional onsetOpt = ppktIndividual.getAgeAtOnset(); - if (lastAgeOpt.isPresent()) { - PhenopacketAge lastExamAge = lastAgeOpt.get(); - String examAge = ppktAgeGenerator.age(lastExamAge); - sb.append("The proband was a ").append(examAge).append( " ").append(sex).append(". 
"); - } else { - sb.append("The proband was a ").append(sex).append(". "); - } - if (onsetOpt.isPresent()) { - PhenopacketAge onsetAge = onsetOpt.get(); - String onset = ppktAgeGenerator.age(onsetAge); - sb.append("Initial manifestations of disease appeared when the proband was ").append(onset).append(". "); - }*/ - return sb.toString(); + return this.ppktAgeGenerator.getIndividualDescription(ppktIndividual); } @Override - public String getPhenotypicFeatures(PpktIndividual ppktIndividual) { - StringBuilder sb = new StringBuilder(); - Map> termMap = ppktIndividual.getPhenotypicFeatures(); - List ageList = new ArrayList<>(termMap.keySet()); - Collections.sort(ageList,(a, b) -> Integer.compare(a.totalDays(), b.totalDays())); - for (var age: ageList) { - List terms = termMap.get(age); - if (! age.specified()) { - if (termMap.size() > 1) { - // if size is greater than one, there was at least one specified time point - if (ppktPhenotypicFeatureGenerator.hasObservedFeatures(terms)) { - sb.append("Additional features included ").append(ppktPhenotypicFeatureGenerator.featureList(terms)).append(". "); - } - if (ppktPhenotypicFeatureGenerator.hasExcludedFeatures(terms)) { - sb.append("Additional excluded features were ").append(ppktPhenotypicFeatureGenerator.excludedFeatureList(terms)).append(". "); - } - } else { - if (ppktPhenotypicFeatureGenerator.hasObservedFeatures(terms)) { - sb.append("The following clinical manifestations were observed: ").append(ppktPhenotypicFeatureGenerator.featureList(terms)).append(". "); - } - if (ppktPhenotypicFeatureGenerator.hasExcludedFeatures(terms)) { - sb.append("The following clinical manifestations were excluded: ").append(ppktPhenotypicFeatureGenerator.excludedFeatureList(terms)).append(". 
"); - } - } - } else { - String ageString = "";//ppktAgeGenerator.age(age); - - if (ppktPhenotypicFeatureGenerator.hasObservedFeatures(terms)) { - sb.append(ageString).append(", the following clinical manifestations were observed: ").append(ppktPhenotypicFeatureGenerator.featureList(terms)).append(". "); - } - if (ppktPhenotypicFeatureGenerator.hasExcludedFeatures(terms)) { - sb.append(ageString).append(", the following clinical manifestations were excluded: ").append(ppktPhenotypicFeatureGenerator.excludedFeatureList(terms)).append(". "); - } - } - } - - return sb.toString(); + public String formatFeatures(List ontologyTerms) { + return ppktPhenotypicFeatureGenerator.formatFeatures(ontologyTerms); } - - - - @Override - public String queryHeader() { - return ppktTextGenerator.QUERY_HEADER(); + public String getVignetteAtAge(PhenopacketAge page, PhenopacketSex psex, List terms) { + String ageString = this.ppktAgeGenerator.atAge(page); + String features = formatFeatures(terms); + return String.format("%s, %s presented with %s", ageString, ppktAgeGenerator.heSheIndividual(psex), features); } + } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktAgeSexEnglish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktAgeSexEnglish.java index 6363c3f..43e1e0b 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktAgeSexEnglish.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktAgeSexEnglish.java @@ -1,11 +1,11 @@ package org.monarchinitiative.phenopacket2prompt.output.impl.english; -import org.monarchinitiative.phenopacket2prompt.model.PhenopacketAge; -import org.monarchinitiative.phenopacket2prompt.model.PhenopacketAgeType; -import org.monarchinitiative.phenopacket2prompt.model.PhenopacketSex; -import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; +import 
org.monarchinitiative.phenol.base.PhenolRuntimeException; +import org.monarchinitiative.phenopacket2prompt.model.*; import org.monarchinitiative.phenopacket2prompt.output.PhenopacketAgeSexGenerator; +import java.util.ArrayList; +import java.util.List; import java.util.Optional; public class PpktAgeSexEnglish implements PhenopacketAgeSexGenerator { @@ -15,6 +15,271 @@ public PpktAgeSexEnglish() { } + public String getIndividualDescription(PpktIndividual individual) { + Optional lastExamOpt = individual.getAgeAtLastExamination(); + Optional onsetOpt = individual.getAgeAtOnset(); + PhenopacketSex psex = individual.getSex(); + if (lastExamOpt.isPresent() && onsetOpt.isPresent()) { + return onsetAndLastEncounterAvailable(psex, lastExamOpt.get(), onsetOpt.get()); + } else if (lastExamOpt.isPresent()) { + return lastEncounterAvailable(psex, lastExamOpt.get()); + } else if (onsetOpt.isPresent()) { + return onsetAvailable(psex, onsetOpt.get()); + } else { + return ageNotAvailable(psex); + } + } + + @Override + public String heSheIndividual(PhenopacketSex psex) { + return switch (psex) { + case FEMALE -> "she"; + case MALE -> "he"; + default -> "the individual"; + }; + } + + + private String iso8601ToYear(Iso8601Age iso8601Age) { + return String.format("%d-year old", iso8601Age.getYears()); + } + + private String iso8601ToYearMonth(Iso8601Age iso8601Age) { + if (iso8601Age.getMonths() == 0) { + return String.format("%d-year old", iso8601Age.getYears()); + } else { + return String.format("%d-year, %d-month old", iso8601Age.getYears(), iso8601Age.getMonths()); + } + } + + private String iso8601ToMonthDay(Iso8601Age iso8601Age) { + int m = iso8601Age.getMonths(); + int d = iso8601Age.getDays(); + if (m == 0) { + return String.format("%d-day old", d); + } else if (d>0){ + return String.format("%d-month, %d-day old", m, d); + } else { + return String.format("%d-month old", m, d); + } + } + + /** + * Create a phrase such as "at the age of 7 years, 4 months, and 2 days" + * Leave 
out the months and days if they are zero. + * @param isoAge + * @return + */ + private String iso8601AtAgeOf(Iso8601Age isoAge) { + List components = new ArrayList<>(); + + if (isoAge.getYears()>1) { + components.add(String.format("%d years", isoAge.getYears())); + } else if (isoAge.getYears() == 1) { + components.add("1 year"); + } + if (isoAge.getMonths() > 1) { + components.add(String.format("%d months", isoAge.getMonths())); + } else if (isoAge.getMonths() == 1) { + components.add("1 month"); + } + if (isoAge.getDays()>1) { + components.add(String.format("%d days", isoAge.getDays())); + } else if (isoAge.getDays()==1) { + components.add("1 day"); + } + if (components.isEmpty()) { + return "as a newborn"; + } else if (components.size() == 1) { + return "at the age of " + components.get(0); + } else if (components.size() == 2) { + return "at the age of " + components.get(0) + " and " + components.get(1); + } else { + return "at the age of " + components.get(0) + "m " + components.get(1) + + ", and " + components.get(2); + } + } + + private String onsetTermAtAgeOf(HpoOnsetAge hpoOnsetTermAge) { + if (hpoOnsetTermAge.isFetus()) { + return "in the fetal period"; + } else if (hpoOnsetTermAge.isCongenital()) { + return "as a newborn"; + } else if (hpoOnsetTermAge.isInfant()) { + return "as an infant"; + } else if (hpoOnsetTermAge.isChild()) { + return "in childhood"; + } else if (hpoOnsetTermAge.isJuvenile()) { + return "as an adolescent"; + } else { + return "in adulthood"; + } + } + + + private String iso8601individualDescription(PhenopacketSex psex, Iso8601Age iso8601Age) { + int y = iso8601Age.getYears(); + int m = iso8601Age.getMonths(); + int d = iso8601Age.getDays(); + // if older + if (y>17) { + return switch (psex) { + case FEMALE -> String.format("%d-year old woman", y); + case MALE -> String.format("%d-year old man", y); + default -> String.format("%d-year old individual", y); + }; + } else if (y>9) { + return switch (psex) { + case FEMALE -> 
String.format("%d-year old adolescent female", y); + case MALE -> String.format("%d-year old adolescent male", y); + default -> String.format("%d-year old adolescent", y); + }; + } else if (y>0) { + return switch (psex) { + case FEMALE -> String.format("%s girl", iso8601ToYearMonth(iso8601Age)); + case MALE -> String.format("%s boy", iso8601ToYearMonth(iso8601Age)); + default -> String.format("%s child", iso8601ToYearMonth(iso8601Age)); + }; + } else if (m>0 || d> 0) { + return switch (psex) { + case FEMALE -> String.format("%s baby girl", iso8601ToMonthDay(iso8601Age)); + case MALE -> String.format("\"%s baby boy", iso8601ToMonthDay(iso8601Age)); + default -> String.format("%s baby", iso8601ToMonthDay(iso8601Age)); + }; + } else { + return switch (psex) { + case FEMALE -> "newborn girl"; + case MALE -> "newborn boy"; + default -> "newborn"; + }; + } + } + + private String hpoOnsetIndividualDescription(PhenopacketSex psex, HpoOnsetAge hpoOnsetTermAge) { + if (hpoOnsetTermAge.isFetus()) { + return switch (psex) { + case FEMALE -> "female fetus"; + case MALE -> "male fetus"; + default -> "fetus"; + }; + } else if (hpoOnsetTermAge.isCongenital()) { + return switch (psex) { + case FEMALE -> "female newborn"; + case MALE -> "male newborn"; + default -> "newborn"; + }; + } else if (hpoOnsetTermAge.isInfant()) { + return switch (psex) { + case FEMALE -> "female infant"; + case MALE -> "male infant"; + default -> "infant"; + }; + } else if (hpoOnsetTermAge.isChild()) { + return switch (psex) { + case FEMALE -> "girl"; + case MALE -> "boy"; + default -> "child"; + }; + } else if (hpoOnsetTermAge.isJuvenile()) { + return switch (psex) { + case FEMALE -> "female adolescent"; + case MALE -> "male adolescent"; + default -> "adolescent"; + }; + }else { + return switch (psex) { + case FEMALE -> "woman"; + case MALE -> "man"; + default -> "adult"; + }; + } + } + + /** + * A sentence such as The proband was a 39-year old woman who presented at the age of 12 years with + * HPO1, 
HPO2, and HPO3. HPO4 and HPO5 were excluded. This method returns the phrase that ends with "with" + * @param psex + * @param lastExamAge + * @param onsetAge + * @return + */ + private String onsetAndLastEncounterAvailable(PhenopacketSex psex, PhenopacketAge lastExamAge, PhenopacketAge onsetAge) { + String individualDescription; + String onsetDescription; + if (lastExamAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoAge = (Iso8601Age) lastExamAge; + individualDescription = iso8601individualDescription(psex, isoAge); + } else if (lastExamAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge hpoOnsetTermAge = (HpoOnsetAge) lastExamAge; + individualDescription = hpoOnsetIndividualDescription(psex,hpoOnsetTermAge); + } else { + // should never happen + throw new PhenolRuntimeException("Did not recognize last exam age type " + lastExamAge.ageType()); + } + if (onsetAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoAge = (Iso8601Age) onsetAge; + onsetDescription = iso8601AtAgeOf(isoAge); + } else if (onsetAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge hpoOnsetTermAge = (HpoOnsetAge) onsetAge; + onsetDescription = onsetTermAtAgeOf(hpoOnsetTermAge); + } else { + // should never happen + throw new PhenolRuntimeException("Did not recognize onset age type " + onsetAge.ageType()); + } + return String.format("The proband was a %s who presented %s with", individualDescription, onsetDescription); + } + + + /** + * Age at last examination available but age of onset not available + * The proband was a 39-year old woman who presented with HPO1, HPO2, and HPO3. HPO4 and HPO5 were excluded. 
+ * @param psex + * @param lastExamAge + */ + private String lastEncounterAvailable(PhenopacketSex psex, PhenopacketAge lastExamAge) { + String individualDescription; + if (lastExamAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoAge = (Iso8601Age) lastExamAge; + individualDescription = iso8601individualDescription(psex, isoAge); + } else if (lastExamAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge hpoOnsetTermAge = (HpoOnsetAge) lastExamAge; + individualDescription = hpoOnsetIndividualDescription(psex,hpoOnsetTermAge); + } else { + // should never happen + throw new PhenolRuntimeException("Did not recognize last exam age type " + lastExamAge.ageType()); + } + return String.format("The proband was a %s who presented with", individualDescription); + } + + /** + * Age at last examination not available but age of onset available + * The proband presented at the age of 12 years with HPO1, HPO2, and HPO3. HPO4 and HPO5 were excluded. + * @param psex + * @param onsetAge + * @return + */ + private String onsetAvailable(PhenopacketSex psex, PhenopacketAge onsetAge) { + String onsetDescription; + if (onsetAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoAge = (Iso8601Age) onsetAge; + onsetDescription = iso8601AtAgeOf(isoAge); + } else if (onsetAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge hpoOnsetTermAge = (HpoOnsetAge) onsetAge; + onsetDescription = onsetTermAtAgeOf(hpoOnsetTermAge); + } else { + // should never happen + throw new PhenolRuntimeException("Did not recognize onset age type " + onsetAge.ageType()); + } + return String.format("The proband presented %s with", onsetDescription, onsetDescription); + } + + private String ageNotAvailable(PhenopacketSex psex) { + return switch (psex) { + case FEMALE -> "The proband was a female who presented with"; + case MALE -> "The proband was a male who presented with"; + default -> "The proband presented with"; + }; 
+ } private String individualName(PpktIndividual individual) { PhenopacketSex psex = individual.getSex(); @@ -65,7 +330,7 @@ private String individualName(PpktIndividual individual) { - @Override + /* @Override public String individualWithAge(PhenopacketAge ppktAge) { if (ppktAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { return ppktAge.age() + " old"; @@ -82,7 +347,7 @@ public String individualWithAge(PhenopacketAge ppktAge) { } else { return ""; // should never get here } - } + }*/ @Override public String atAge(PhenopacketAge ppktAge) { @@ -103,8 +368,5 @@ public String atAge(PhenopacketAge ppktAge) { } } - @Override - public String ppktSex() { - return ""; - } + } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktPhenotypicfeatureEnglish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktPhenotypicfeatureEnglish.java index 2add8ed..7bc0503 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktPhenotypicfeatureEnglish.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktPhenotypicfeatureEnglish.java @@ -7,19 +7,46 @@ import java.util.function.Predicate; public class PpktPhenotypicfeatureEnglish implements PpktPhenotypicFeatureGenerator { - @Override - public String featureList(List ontologyTerms) { - List labels = ontologyTerms.stream() - .filter(Predicate.not(OntologyTerm::isExcluded)) - .map(OntologyTerm::getLabel).toList(); - return getOxfordCommaList(labels, "and"); + + + private String getOxfordCommaList(List items) { + if (items.size() == 1) { + return items.get(0); + } + if (items.size() == 2) { + // no comma if we just have two items. 
+ // one item will work with the below code + return String.join(" and ", items); + } + String symList = String.join(", ", items); + int jj = symList.lastIndexOf(", "); + if (jj > 0) { + symList = symList.substring(0, jj) + ", and " + symList.substring(jj+2); + } + return symList; } + /** + * format features + * The proband was a 39-year old woman who presented at the age of 12 years with HPO1, HPO2, and HPO3. HPO4 and HPO5 were excluded. + */ @Override - public String excludedFeatureList(List ontologyTerms) { - List labels = ontologyTerms.stream() + public String formatFeatures(List ontologyTerms) { + List observed = ontologyTerms.stream() + .filter(Predicate.not(OntologyTerm::isExcluded)) + .map(OntologyTerm::getLabel).toList(); + List excluded = ontologyTerms.stream() .filter(OntologyTerm::isExcluded) .map(OntologyTerm::getLabel).toList(); - return getOxfordCommaList(labels, "and"); + if (observed.isEmpty() && excluded.isEmpty()) { + return "no phenotypic abnormalities"; // should never happen, actually! + } else if (excluded.isEmpty()) { + return getOxfordCommaList(observed) + ". "; + } else if (observed.isEmpty()) { + return "exclusion of " + getOxfordCommaList(excluded) + "."; + } else { + String exclusion = String.format("%s %s excluded.", getOxfordCommaList(excluded), excluded.size() > 1 ? 
" were" : "was"); + return getOxfordCommaList(observed) + ", whereby " + exclusion; + } } } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktAgeSexSpanish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktAgeSexSpanish.java index d931b32..896f273 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktAgeSexSpanish.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktAgeSexSpanish.java @@ -158,7 +158,7 @@ private String individualName(PpktIndividual individual) { } - @Override + /* @Override public String individualWithAge(PhenopacketAge ppktAge) { if (ppktAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { return ppktAge.age() + " old"; @@ -176,7 +176,7 @@ public String individualWithAge(PhenopacketAge ppktAge) { return ""; // should never get here } } - +*/ private String atIsoAgeExact(PhenopacketAge ppktAge) { Iso8601Age iso8601Age = (Iso8601Age) ppktAge; @@ -202,6 +202,19 @@ private String atIsoAgeExact(PhenopacketAge ppktAge) { } + @Override + public String getIndividualDescription(PpktIndividual individual) { + return ""; + } + + @Override + public String heSheIndividual(PhenopacketSex psex) { + return switch (psex) { + case FEMALE -> "el"; + case MALE -> "ella"; + default -> "la persona"; + }; + } @Override public String atAge(PhenopacketAge ppktAge) { @@ -270,8 +283,5 @@ public String ppktSex(PpktIndividual individual) { } } - @Override - public String ppktSex() { - return ""; - } + } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java index 43e6ce5..f5c9bb8 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java +++ 
b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java @@ -34,19 +34,23 @@ private List getTranslations(List ontologyTerms) { - @Override public String featureList(List ontologyTerms) { List terms = ontologyTerms.stream() .filter(Predicate.not(OntologyTerm::isExcluded)).toList(); List labels = getTranslations(terms); - return getOxfordCommaList(labels, "y"); + return ""; //;//getOxfordCommaList(labels, "y"); } - @Override + public String excludedFeatureList(List ontologyTerms) { List terms = ontologyTerms.stream() .filter(OntologyTerm::isExcluded).toList(); List labels = getTranslations(terms); - return getOxfordCommaList(labels, "y"); + return ""; //;//getOxfordCommaList(labels, "y"); + } + + @Override + public String formatFeatures(List ontologyTerms) { + return ""; } } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/SpanishPromptGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/SpanishPromptGenerator.java index 1d4a053..18f9e75 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/SpanishPromptGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/SpanishPromptGenerator.java @@ -3,6 +3,7 @@ import org.monarchinitiative.phenol.ontology.data.Ontology; import org.monarchinitiative.phenopacket2prompt.model.OntologyTerm; import org.monarchinitiative.phenopacket2prompt.model.PhenopacketAge; +import org.monarchinitiative.phenopacket2prompt.model.PhenopacketSex; import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; import org.monarchinitiative.phenopacket2prompt.output.*; @@ -55,43 +56,21 @@ public String getIndividualInformation(PpktIndividual ppktIndividual) { } @Override - public String getPhenotypicFeatures(PpktIndividual ppktIndividual) { - StringBuilder sb = new StringBuilder(); - Map> termMap = ppktIndividual.getPhenotypicFeatures(); - 
List ageList = new ArrayList<>(termMap.keySet()); - Collections.sort(ageList,(a, b) -> Integer.compare(a.totalDays(), b.totalDays())); - for (var age: ageList) { - List terms = termMap.get(age); - if (! age.specified()) { - if (termMap.size() > 1) { - // if size is greater than one, there was at least one specified time point - if (ppktPhenotypicFeatureGenerator.hasObservedFeatures(terms)) { - sb.append("Características adicionales comprendían").append(ppktPhenotypicFeatureGenerator.featureList(terms)).append(". "); - } - if (ppktPhenotypicFeatureGenerator.hasExcludedFeatures(terms)) { - sb.append("Otras características excluidas fueron ").append(ppktPhenotypicFeatureGenerator.excludedFeatureList(terms)).append(". "); - } - } else { - if (ppktPhenotypicFeatureGenerator.hasObservedFeatures(terms)) { - sb.append("Se observaron las siguientes manifestaciones clínicas: ").append(ppktPhenotypicFeatureGenerator.featureList(terms)).append(". "); - } - if (ppktPhenotypicFeatureGenerator.hasExcludedFeatures(terms)) { - sb.append("Se excluyeron las siguientes manifestaciones clínicas: ").append(ppktPhenotypicFeatureGenerator.excludedFeatureList(terms)).append(". "); - } - } - } else { - String ageString = "";//ppktAgeSexGenerator.age(age); - - if (ppktPhenotypicFeatureGenerator.hasObservedFeatures(terms)) { - sb.append(ageString).append(", se observaron las siguientes manifestaciones clínicas: ").append(ppktPhenotypicFeatureGenerator.featureList(terms)).append(". "); - } - if (ppktPhenotypicFeatureGenerator.hasExcludedFeatures(terms)) { - sb.append(ageString).append(", se excluyeron las siguientes manifestaciones clínicas: ").append(ppktPhenotypicFeatureGenerator.excludedFeatureList(terms)).append(". 
"); - } - } - } + public String formatFeatures(List ontologyTerms) { + return ""; + } - return sb.toString(); + @Override + public String getVignetteAtAge(PhenopacketAge page, PhenopacketSex psex, List terms) { + return ""; + } + + + + + @Override + public String createPrompt(PpktIndividual individual) { + return ""; } diff --git a/src/test/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividualTest.java b/src/test/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividualTest.java index fe562d9..efbb8b6 100644 --- a/src/test/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividualTest.java +++ b/src/test/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividualTest.java @@ -77,7 +77,7 @@ public void testPhenopacketDisease() { @Test public void testPhenotypicFeatures() { - Map> ppktFeatureMap = ppktIndividual.getPhenotypicFeatures(); + Map> ppktFeatureMap = ppktIndividual.getSpecifiedAgePhenotypicFeatures(); assertFalse(ppktFeatureMap.isEmpty()); Predicate termPredicate = term -> term.getLabel().equals("Cerebral atrophy"); List otlist = new ArrayList<>(); From 7e84e3f1b104993040396e57cc927d4d229ab0c2 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Fri, 26 Apr 2024 12:18:44 +0200 Subject: [PATCH 2/6] Spanish v1 --- .../legacy/AdditionalConcept.java | 25 - .../legacy/AdditionalConceptI.java | 29 - .../legacy/AdditionalConceptType.java | 41 -- .../AdditionalReplacementConceptType.java | 29 - .../legacy/TimeSegment.java | 21 - .../legacy/nejm/Dehyphenizer.java | 44 -- .../nejm/NejmCaseReportFromPdfFilterer.java | 267 --------- .../legacy/nejm/NejmCaseReportImporter.java | 142 ----- .../legacy/nejm/NejmCaseReportIngestor.java | 113 ---- .../querygen/PhenopacketFactoryIngestor.java | 48 -- .../querygen/PhenotypicFeatureFilter.java | 87 --- .../legacy/querygen/QueryOutputGenerator.java | 56 -- .../legacy/querygen/QueryOutputType.java | 23 - .../legacy/querygen/QueryPromptFactory.java | 97 ---- .../legacy/querygen/TimePoint.java | 
16 - .../legacy/querygen/TimePointParser.java | 98 ---- .../qfactory/AbstractQueryGenerator.java | 391 ------------- .../qfactory/PhenopacketOnlyQuery.java | 24 - .../querygen/qfactory/QcQueryGenerator.java | 29 - .../TextWithManualAnnotsGenerator.java | 206 ------- .../qfactory/TextWithoutDiscussionQuery.java | 22 - .../model/PpktIndividual.java | 5 +- .../output/IndividualInformation.java | 13 - ...packetIndividualInformationGenerator.java} | 7 +- .../PpktPhenotypicFeatureGenerator.java | 1 - .../impl/english/EnglishPromptGenerator.java | 5 +- ...nglish.java => PpktIndividualEnglish.java} | 6 +- .../impl/spanish/PpktAgeSexSpanish.java | 287 ---------- .../impl/spanish/PpktIndividualSpanish.java | 524 ++++++++++++++++++ .../spanish/PpktPhenotypicfeatureSpanish.java | 62 ++- .../impl/spanish/SpanishPromptGenerator.java | 33 +- 31 files changed, 588 insertions(+), 2163 deletions(-) delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConcept.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConceptI.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConceptType.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalReplacementConceptType.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/TimeSegment.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/Dehyphenizer.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportFromPdfFilterer.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportImporter.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportIngestor.java delete mode 100644 
src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/PhenopacketFactoryIngestor.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/PhenotypicFeatureFilter.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryOutputGenerator.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryOutputType.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryPromptFactory.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/TimePoint.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/TimePointParser.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/AbstractQueryGenerator.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/PhenopacketOnlyQuery.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/QcQueryGenerator.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/TextWithManualAnnotsGenerator.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/TextWithoutDiscussionQuery.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/output/IndividualInformation.java rename src/main/java/org/monarchinitiative/phenopacket2prompt/output/{PhenopacketAgeSexGenerator.java => PhenopacketIndividualInformationGenerator.java} (85%) rename src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/{PpktAgeSexEnglish.java => PpktIndividualEnglish.java} (98%) delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktAgeSexSpanish.java create mode 100644 
src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktIndividualSpanish.java diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConcept.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConcept.java deleted file mode 100644 index 6facd36..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConcept.java +++ /dev/null @@ -1,25 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy; - -public record AdditionalConcept(AdditionalConceptType ctype,String text) implements AdditionalConceptI { - - public static AdditionalConcept of(String concept, String text) { - AdditionalConceptType act = AdditionalConceptType.of(concept); - return new AdditionalConcept(act, text); - } - - - @Override - public String originalText() { - return text; - } - - @Override - public AdditionalConceptType conceptType() { - return ctype; - } - - @Override - public String insertText() { - return text; - } -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConceptI.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConceptI.java deleted file mode 100644 index 084d42e..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConceptI.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy; - - -/** - * This interface represents the concepts used for manual replacements, e.g. - *
- * eye redness:PHENOTYPE
- * conjunctival injection:PHENOTYPE
- * lungs were clear on auscultation:EXCLUDE:Abnormal breath sound
- * (SARS-CoV-2) RNA was negative:DIAGNOSTICS
- * rapid antigen testing for influenza types A and B was negative:DIAGNOSTICS
- * amoxicillin:TREATMENT
- * sputum had streaks of bright red blood:PHENOTYPE:Hemoptysis
- * developmental dysplasia of the hip:PHENOTYPE
- * patchy airspace opacities:DIAGNOSTICS:predominantly lower lung patchy airspace opacities
- * amoxicillin, acetaminophen, ibuprofen, benzonatate, guaifenesin, and dextro-methorphan:TREATMENT
- * the temperature was 38.5°C:PHENOTYPE
- * the heart rate 124 beats per minute:PHENOTYPE:Tachycardia
- * The body-mass index (the weight in kilograms divided by the square of the height in meters) was 35.9:PHENOTYPE:Obesity
- * 
- * For items with two fields, the original text and the text inserted into our query prompt are the same. - * For the items with three fields, the last field is used to replace the original text - */ -public interface AdditionalConceptI { - - String originalText(); - AdditionalConceptType conceptType(); - String insertText(); -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConceptType.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConceptType.java deleted file mode 100644 index 9c03569..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalConceptType.java +++ /dev/null @@ -1,41 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy; - -import org.monarchinitiative.phenol.base.PhenolRuntimeException; - -/** - * The concept type for the strings that we manually match in the original text. - *
    - *
  1. PHENOTYPE: observed phenotypic feature
  2. - *
  3. EXCLUDE: excluded phenotypic feature
  4. - *
  5. DIAGNOSTICS
  6. - *
  7. TREATMENT
  8. - *
  9. PMH: past medical history
  10. - *
  11. VERBATIM - "other", to be just added
  12. - *
- */ -public enum AdditionalConceptType { - PHENOTYPE, - EXCLUDE, - DIAGNOSTICS, - TREATMENT, - PMH, - FAMILY_HISTORY, - VERBATIM; - - - public static AdditionalConceptType of(String s) { - String concept = s.toUpperCase(); - return switch (concept) { - case "PHENOTYPE" -> PHENOTYPE; - case "EXCLUDE" -> EXCLUDE; - case "DIAGNOSTICS" -> DIAGNOSTICS; - case "TREATMENT" -> TREATMENT; - case "PMH" -> PMH; - case "FAMILY_HISTORY" -> FAMILY_HISTORY; - case "VERBATIM" -> VERBATIM; - default -> throw new PhenolRuntimeException("Unrecognised concept \"" + concept + "\""); - }; - } -} - - diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalReplacementConceptType.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalReplacementConceptType.java deleted file mode 100644 index 89f7bbb..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/AdditionalReplacementConceptType.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy; - -public record AdditionalReplacementConceptType(AdditionalConceptType ctype, - String text, - String replacement - ) implements AdditionalConceptI { - - - public static AdditionalReplacementConceptType of(String concept, String text, String replacement) { - AdditionalConceptType act = AdditionalConceptType.of(concept); - return new AdditionalReplacementConceptType(act, text, replacement); - } - - - @Override - public String originalText() { - return text; - } - - @Override - public AdditionalConceptType conceptType() { - return ctype; - } - - @Override - public String insertText() { - return replacement; - } -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/TimeSegment.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/TimeSegment.java deleted file mode 100644 index 548bc2e..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/TimeSegment.java +++ /dev/null @@ -1,21 +0,0 
@@ -package org.monarchinitiative.phenopacket2prompt.legacy; - -public class TimeSegment { - - private final String timeDesgination; - private final String payload; - - public TimeSegment(String timeDesgination, String payload) { - this.timeDesgination = timeDesgination; - this.payload = payload; - } - - - public String getTimeDesgination() { - return timeDesgination; - } - - public String getPayload() { - return payload; - } -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/Dehyphenizer.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/Dehyphenizer.java deleted file mode 100644 index 90fd7df..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/Dehyphenizer.java +++ /dev/null @@ -1,44 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.nejm; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - - -/** - * Some of the lines in the original text end with a hyphen, because a word is spread across two lines - * Here, we fix this by joining such words and ensuring that all lines are trimmed (do not start or end with whitespace). 
- */ -public class Dehyphenizer { - - - public static List dehyphenizeLines(List lines) { - List cleansedLines = new ArrayList<>(); - boolean previousLineHadHyphen = false; - String previousLinePrefix = ""; - for (String line : lines) { - String currentLine; - // get next line and add prefix from previous line if there was one - if (previousLineHadHyphen) { - currentLine = previousLinePrefix + line.strip(); - previousLinePrefix = ""; - } else { - currentLine = line; - } - if (currentLine.endsWith("-")) { - String[] tokens = currentLine.split("\\s+"); - String prefix = tokens[tokens.length - 1]; - tokens = Arrays.copyOf(tokens, tokens.length - 1); - currentLine = String.join(" ", tokens); - // remove hyphen - previousLinePrefix = prefix.substring(0, prefix.length() - 1); - previousLineHadHyphen = true; - } else { - previousLineHadHyphen = false; - } - cleansedLines.add(currentLine.trim()); - } - return cleansedLines; - } - -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportFromPdfFilterer.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportFromPdfFilterer.java deleted file mode 100644 index 692da1a..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportFromPdfFilterer.java +++ /dev/null @@ -1,267 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.nejm; - - -import org.monarchinitiative.phenol.base.PhenolRuntimeException; -import org.monarchinitiative.phenopacket2prompt.legacy.AdditionalConcept; -import org.monarchinitiative.phenopacket2prompt.legacy.AdditionalConceptI; -import org.monarchinitiative.phenopacket2prompt.legacy.AdditionalReplacementConceptType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * This class is responsible for cleaning up the text that was parsed from the NEJM case report PDF files. 
- * It also extracts age and sex. - */ -public class NejmCaseReportFromPdfFilterer { - private final Logger LOGGER = LoggerFactory.getLogger(NejmCaseReportFromPdfFilterer.class); - /** Age of the probad, e.g. P20Y for twenty years old. This must be at the beginning of the - * parsed NEJM file, e.g. - * age: 20 - * sex: M - */ - private final String isoAge; - - private final String phenopacketSex; - private final List caseLines; - - private final List presentationWithoutDiscussionLines; - - private final List allLines; - - private String diagnosis = null; - private boolean inCase = false; - private boolean inDifferentialDiagnosis = false; - private boolean inActualDiagnosis = false; - - /** - * Match phrases such as Dr. Andrea L . Ciaranello’s Diagnosis - */ - private static final Pattern DIAGNOSIS_REGEX = Pattern.compile("Dr. (.*) Diagnosis"); - - - /** - * - * lines such as Case 40-2022 - */ - private static final Pattern CASE_LINE_REGEX = Pattern.compile("Case \\d+-20\\d{2}"); - /** - * Additional contents not picked up by fenomial text parsing, but added manually at the top of the - * input file. - */ - private final Set additionalConcepts; - - public NejmCaseReportFromPdfFilterer(String caseId, List lines) { - isoAge = getIso8601Age(lines.get(0)); - phenopacketSex = getSex(lines.get(1)); - caseLines = new ArrayList<>(); - allLines = new ArrayList<>(); - int index = 2; - boolean in_clinical_vignette = false; - this.additionalConcepts = new HashSet<>(); - LOGGER.trace("Filterer for {}", caseId); - while (! in_clinical_vignette) { - String line = lines.get(index); - index++; - if (! 
line.contains(":")) { - continue; // skip empty lines - } - if (line.equals("begin_vignette:")) { - in_clinical_vignette = true; - break; - } - String [] fields = line.split(":"); - if (fields.length < 2) { - throw new PhenolRuntimeException("Malformed header line: " + line); - } else if (fields.length == 2){ - String payload = fields[0].trim(); - String category = fields[1].trim(); - additionalConcepts.add(AdditionalConcept.of(category, payload)); - } else if (fields.length == 3) { - String payload = fields[0].trim(); - String category = fields[1].trim(); - String replacement = fields[2].trim(); - additionalConcepts.add(AdditionalReplacementConceptType.of(category, payload, replacement)); - } - } - if (! in_clinical_vignette) { - throw new PhenolRuntimeException("Did not find \"begin_vignette:\" line!"); - } - if (index >= lines.size()) { - throw new PhenolRuntimeException("Did not find text after \"begin_vignette:\" line!"); - } - for (String line : lines.subList(index, lines.size())) { - Matcher caseLineMatcher = CASE_LINE_REGEX.matcher(line); - if (caseLineMatcher.find()) { - continue; - } - // skip lines such as - // Michael Levy, M.D., Ph.D., Bart K. Chwalisz, M.D., Benjamin M. Kozak, M.D., Michael K. Yoon, M.D., Helen A. Shih, M.D., and Anna M. Stagner, M.D. 
- int countMD = countMd(line); - if (countMD > 2) { - continue; - } - if (line.contains("Presentation of Case")) { - inCase = true; - } else if (line.startsWith("Differential Diagnosis")) { - inDifferentialDiagnosis = true; - } else if (caseId.equalsIgnoreCase("PMID:34437787") && - line.startsWith("Discussion of Bone Marrow Biopsy Results")) { - inDifferentialDiagnosis = true; - } else if (caseId.equalsIgnoreCase("PMID:36383716") && - line.startsWith("Pathological Diagnosis")){ - inDifferentialDiagnosis = true; - } else if (caseId.equalsIgnoreCase("PMID:33730458") && - line.startsWith("Pathological Discussion")) { - inDifferentialDiagnosis = true; - } else if (caseId.equals("PMID:34437787") && line.startsWith("Dr. Andrew M. Crabbe")) { - inDifferentialDiagnosis = true; - } else { - if (inCase && ! inDifferentialDiagnosis) { - caseLines.add(line); - } - } - Matcher m = DIAGNOSIS_REGEX.matcher(line); - if (m.find()) { - inActualDiagnosis = true; - diagnosis = lines.get(index+1); - } else if ( - line.startsWith("Pathological Diagnosis")) { - inActualDiagnosis = true; - diagnosis = lines.get(index + 1); - } else if (line.strip().startsWith("Final Diagnosis")) { - inActualDiagnosis = true; - diagnosis = lines.get(index+1); - } else if (line.strip().startsWith("Anatomical Diagnosis")) { - inActualDiagnosis = true; - diagnosis = lines.get(index+1); - } - - - // leave out lines after the actual diagnosis line - if (inCase && ! inActualDiagnosis) { - allLines.add(line); - } - - index++; - } - // The purpose of the following lines is the following. - // The case reports start with one doctor's report, e.., - // Dr. Natalie A. Diacovo (Pediatrics): - // After the initial presentation, there is a dicussion amongst - // a group of doctors. The discussion begins with text from - // a second doctor, e.g. - // Dr. Maria G. 
Figueiro Longo: - // We want to extract the text "between the doctors -- - // this is the initial presentation of the case - String caseLinesStr = String.join("\n", caseLines); - var pattern = Pattern.compile("Dr. (.*?):"); - var matcher = pattern.matcher(caseLinesStr); - int n_matched = 0; - int start=-1; - int end=-1; - while(matcher.find()) { - n_matched++; - if (n_matched==1){ - start = matcher.end()+1; - } else if (n_matched==2) { - end = matcher.start() -1; - } - } - if (n_matched==1) { - caseLinesStr = caseLinesStr.substring(start); - } else if (n_matched>1) { - caseLinesStr = caseLinesStr.substring(start, end); - } - var plines = caseLinesStr.split("\\n"); - presentationWithoutDiscussionLines = Arrays.stream(plines).toList(); - } - - - private int countMd(String line) { - String findString = "M.D."; - return line.split(findString, -1).length-1; - } - - /** - * - * @param age a line such as age: 26 - * @return an iso8601 duration string - */ - String getIso8601Age(String age) { - if (! age.startsWith("age:") && (! age.startsWith("Age:"))) { - throw new PhenolRuntimeException("[NejmCaseReportFromPdfFilterer] Malformed age line: " + age); - } - String years = age.substring(4).trim(); - years = years.replace(".", "");// remove stray period - if (years.equalsIgnoreCase("newborn")) { - return "P0Y0M1D"; - } - if (years.contains("12-month-old")) { - return "P1Y"; - } - int y = Integer.parseInt(years); - return String.format("P%dY", y); - } - - /** - * - * @param sex a line such as sex: female - * @return "MALE" or "FEMALE" - */ - String getSex(String sex) { - if (! 
sex.toLowerCase().startsWith("sex:") ) { - throw new PhenolRuntimeException("Malformed sex line: " + sex); - } - String s = sex.substring(4).trim(); - s = s.replace(".",""); - if (s.equalsIgnoreCase("female")) { - return "FEMALE"; - } else if (s.equalsIgnoreCase("male")) { - return "MALE"; - } else if (s.equalsIgnoreCase("boy")) { - return "MALE"; - } else { - throw new PhenolRuntimeException("[NejmCaseReportFromPdfFilterer] Malformed sex line: " + sex); - } - } - - - - - public String getIsoAge() { - return isoAge; - } - - public String getPhenopacketSex() { - return phenopacketSex; - } - - public List getCaseLines() { - return caseLines; - } - - public List getPresentationWithoutDiscussionLines() { - return presentationWithoutDiscussionLines; - } - - public List getAllLines() { - return allLines; - } - - public Optional getDiagnosis() { - return Optional.ofNullable(this.diagnosis); - } - - public Set getAdditionalConcepts() { - return additionalConcepts; - } - - public boolean validParse() { - return this.inCase && this.inDifferentialDiagnosis && inActualDiagnosis; - } -} - diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportImporter.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportImporter.java deleted file mode 100644 index 3d21682..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportImporter.java +++ /dev/null @@ -1,142 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.nejm; - -import org.monarchinitiative.phenol.base.PhenolRuntimeException; - -import java.io.*; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class NejmCaseReportImporter { - private final List cleanedLines; - /** Remove HTML tags */ - 
final Pattern CLEAN_HTML_TAG = Pattern.compile("<.*?>"); - /** Skip line if it begins with one of these tokens */ - final Set STARTS_WTH_FILTER_TOKENS = Set.of("The New England Journal of Medicine", - "Downloaded from nejm.org", - "n engl j med", - "new engl and jour nal", - "N Engl J Med", - "engl j med", - "Copyright", - "Case Records of the Massachusetts", - "at NEJM.org", - "Massachusetts General Hospital", - "my nejm in the journal online", - "Individual subscribers can store articles", - "From the Department", - "Founded by Richard C. Cabot", - "Medical School", - "DOI: ", - "Eric S. Rosenberg", - "Dennis C. Sgroi", - "Emily K. McDonald", - "Miriam B. Barshak", - "The new engl" - ); - final Set EQUALS_FILTER_TOKENS = Set.of("the", "medicine", "Case Records", "of the" ); - - - public NejmCaseReportImporter(File gptFilePath) { - List lines = new ArrayList<>(); - try { - CodingErrorAction codingErrorAction = CodingErrorAction.IGNORE; - Charset charset = Charset.defaultCharset(); - CharsetDecoder charsetDecoder = charset.newDecoder(); - charsetDecoder.onMalformedInput(codingErrorAction); - InputStream is = new FileInputStream(gptFilePath); - InputStreamReader reader = new InputStreamReader(is, charsetDecoder); - BufferedReader br = new BufferedReader(reader); - String line; - // the first two lines contain age and sex, always save the - lines.add(br.readLine()); - lines.add(br.readLine()); - while ((line = br.readLine()) != null) { - if (line.length() < 3) continue; // empty or very short lines should be skipped - String processed = cleanLine(line); - if (processed.contains(" ")) { - throw new PhenolRuntimeException("Double white space found in line"+line); - } - if (isValid(processed)) { - lines.add(processed); - // System.out.println(processed); - } - } - } catch (IOException e) { - throw new PhenolRuntimeException("Could not read Gpt file: " + e.getLocalizedMessage()); - } - // Some lines will end with a hypen and the rest of the word continues on the 
following - // line. our strategy is to extract the prefix with the hypen and add it to the next line. - // We extract the last - this.cleanedLines = Dehyphenizer.dehyphenizeLines(lines); - } - - - - - - private boolean isValid(String line) { - for (String token : STARTS_WTH_FILTER_TOKENS) { - if (line.toLowerCase().startsWith(token.toLowerCase())) - return false; - } - for (String token : EQUALS_FILTER_TOKENS) { - if (line.equalsIgnoreCase(token)) - return false; - } - - // count letter characters. - int n_char = 0; - for (int i=0; i= 4; - } - - private String cleanLine(String line) { - // remove duplicated whitespace - line = line.replaceAll("\\s+", " "); - int lastIndex = 0; - // remove trailing dash, which is a sign that the word at the end of the line was hypenated - // and now is spread over two lines -// if (line.endsWith("-")) { -// line = line.substring(0, line.length()-1); -// } - Matcher htmlCleaner = CLEAN_HTML_TAG.matcher(line); - StringBuilder output = new StringBuilder(); - while (htmlCleaner.find()) { - output.append(line, lastIndex, htmlCleaner.start()); - lastIndex = htmlCleaner.end(); - } - if (lastIndex < line.length()) { - output.append(line, lastIndex, line.length()); - } - String processed = output.toString(); - - processed = processed.replace("-| ", ""); - processed = processed.replace("|", ""); - processed = processed.replaceAll("\\s+", " "); - if (processed.contains(" ")) { - throw new PhenolRuntimeException("Double space in processed: " + processed); - } - return processed.trim(); - } - - - - - - public List getCleanedLines() { - return this.cleanedLines; - } - - -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportIngestor.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportIngestor.java deleted file mode 100644 index 2fa4911..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/nejm/NejmCaseReportIngestor.java +++ /dev/null @@ -1,113 
+0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.nejm; - -import org.monarchinitiative.phenol.base.PhenolRuntimeException; - -import java.io.File; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; -import java.util.stream.Stream; - - -/** - * Parse the text files with the parsed PDF articles. The ingestion code corrects various parsing - * errors using the - */ -public class NejmCaseReportIngestor { - - /** - * The case reports are not valid differential diagnostic exercises - * 34496178: Discussing HIV prophylaxis - * 33730458: primarily imaging, not enough text in initial presentation to be a fair comparison - * PMID:33913642: scant information in case presented by first discussant. Imaging plays prominent role in case. - * PMID:34587390: scant information in case presented by first discussant. Imaging plays prominent role in case. - * PMID:34670047: scant information in case presented by first discussant. Imaging plays prominent role in case. 
- */ - private final Set INVALID_CASE_REPORTS = Set.of("PMID:34496178", "PMID:33730458", "PMID:33913642", - "PMID:34587390", "PMID:34670047"); - - - /** key: identifier of PMID; value - lines of text */ - private final Map> id2lines; - - - public NejmCaseReportIngestor(String nejmDirectory) { - this.id2lines = new HashMap<>(); - init(nejmDirectory); - } - - private void init(String nejmDirectory) { - // raw text from PDF parsing of the NEJM cases - Set nejmCaseReportFiles = listNejmCaseReportFiles(nejmDirectory); - for (String fname : nejmCaseReportFiles) { - File fpath = new File(nejmDirectory + File.separator + fname); - NejmCaseReportImporter importer = new NejmCaseReportImporter(fpath); - List lines = importer.getCleanedLines(); - String caseNameAsPmid = getCaseNameAsPmid(fname); - // we skip five of the 80 raw files for reasons listed above - if (INVALID_CASE_REPORTS.contains(caseNameAsPmid)) { - continue; - } - id2lines.put(caseNameAsPmid, lines); - } - } - - /** - * This function reads the txt files from a directory in which we put - * texts parsed from the PDF files representing the NEJM case reports. - * @param dir directory with txt files - * @return list of file paths - */ - private Set listNejmCaseReportFiles(String dir) { - File dirFile = new File(dir); - if (dirFile.isDirectory()) { - return Stream.of(dirFile.listFiles()) - .filter(file -> !file.isDirectory()) - .filter(file -> file.getAbsolutePath().endsWith(".txt")) - .map(File::getName) - .collect(Collectors.toSet()); - } else { - throw new PhenolRuntimeException("input directory did not point to valid directory"); - } - } - - - /** - * - * @param filePath e.g., /User/rrabbit/data/34644476.txt - * @return e.g. 
PMID:34644476 - */ - private String getCaseNameAsPmid(String filePath) { - File f = new File(filePath); - String bname = f.getName(); - Pattern p = Pattern.compile("\\d+"); - Matcher m = p.matcher(filePath); - if (m.find()) { - return "PMID:" + m.group(); - } - return bname; - } - - public void restrictToTarget(String targetCase) { - if (! id2lines.containsKey(targetCase)) { - throw new PhenolRuntimeException("Invalid target case"); - } - List lines = id2lines.get(targetCase); - id2lines.clear(); - id2lines.put(targetCase, lines); - System.out.printf("[INFO] Restricting analysis to targetCase %s.\n", targetCase); - } - - /** - * - * @return key: identifier of PMID; value - lines of text - */ - public Map> getId2lines() { - return id2lines; - } -} - diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/PhenopacketFactoryIngestor.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/PhenopacketFactoryIngestor.java deleted file mode 100644 index 80fc6b2..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/PhenopacketFactoryIngestor.java +++ /dev/null @@ -1,48 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen; - -import org.monarchinitiative.fenominal.core.TermMiner; -import org.monarchinitiative.phenol.ontology.data.Ontology; -import org.monarchinitiative.phenopacket2prompt.legacy.nejm.NejmCaseReportFromPdfFilterer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -public class PhenopacketFactoryIngestor { - private final Logger LOGGER = LoggerFactory.getLogger(PhenopacketFactoryIngestor.class); - - private final Map id2timeCourseFactory; - - public PhenopacketFactoryIngestor(Map> id2lines, - Ontology hpo) { - int validParsedCases = 0; - this.id2timeCourseFactory = new HashMap<>(); - final TermMiner miner = TermMiner.defaultNonFuzzyMapper(hpo); - for (var entry : 
id2lines.entrySet()) { - String caseNameAsPmid = entry.getKey(); - LOGGER.trace("Creating prompt for {}.", caseNameAsPmid); - try { - NejmCaseReportFromPdfFilterer filterer = new NejmCaseReportFromPdfFilterer(caseNameAsPmid, entry.getValue()); - if (!filterer.validParse()) { - LOGGER.error("NejmCaseReportFromPdfFilterer -- {}: Not Valid.\n", caseNameAsPmid); - System.err.printf("Exiting because of problems with %s. Fix this and come back later\n", caseNameAsPmid); - System.exit(1); - } - QueryPromptFactory factory = new QueryPromptFactory(filterer, caseNameAsPmid, miner, hpo); - id2timeCourseFactory.put(caseNameAsPmid, factory); - } catch (Exception e) { - System.out.printf("Exception with %s: %s.\n", entry.getKey(), e.getMessage()); - System.exit(1); - } - validParsedCases++; - } - System.out.printf("[INFO] Factory map has %d cases.\n", id2timeCourseFactory.size()); - System.out.printf("We parsed %d cases, of which %d were valid.\n", id2lines.entrySet().size(), validParsedCases); - } - - public Map getId2timeCourseFactory() { - return id2timeCourseFactory; - } -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/PhenotypicFeatureFilter.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/PhenotypicFeatureFilter.java deleted file mode 100644 index eb2fece..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/PhenotypicFeatureFilter.java +++ /dev/null @@ -1,87 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen; - - -import org.monarchinitiative.phenol.ontology.data.Ontology; -import org.monarchinitiative.phenol.ontology.data.TermId; -import org.phenopackets.schema.v2.core.OntologyClass; -import org.phenopackets.schema.v2.core.PhenotypicFeature; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.function.Function; -import 
java.util.function.Predicate; -import java.util.stream.Collectors; - -public class PhenotypicFeatureFilter { - private static final Logger LOGGER = LoggerFactory.getLogger(PhenotypicFeatureFilter.class); - final Set finalFeatures; - - - private final static TermId negativism = TermId.of("HP:0410291"); // common mistake (negative) - private final static TermId allergy = TermId.of("HP:0012393"); // Only allow specific allergy terms. Allergy HP:0012393 is getting picked up for questions - - private final static TermId neoplasm = TermId.of("HP:0002664"); // Only allow specific cancer terms.Neoplasm HP:0002664 is only used for family history etc - - private final static TermId asthenia = TermId.of("HP:0025406"); //Asthenia HP:0025406 -- FP call for Weakness - private final static Set termsToAvoid = Set.of(negativism, allergy, neoplasm); - - public PhenotypicFeatureFilter(Set phenotypicFeaturesSet, Ontology ontology) { - // remove terms that are both observed and excluded -- presumably there is some error - Map observed = phenotypicFeaturesSet.stream() - .filter(Predicate.not(PhenotypicFeature::getExcluded)) - .collect(Collectors.toMap(PhenotypicFeature::getType, Function.identity())); - - Map excluded = phenotypicFeaturesSet.stream() - .filter(PhenotypicFeature::getExcluded) - .collect(Collectors.toMap(PhenotypicFeature::getType, Function.identity())); - HashSet termsToExclude = new HashSet<>(); - for (OntologyClass tid : observed.keySet() ) { - if (excluded.containsKey(tid)) { - termsToExclude.add(tid); - LOGGER.info("Excluding {}/{} because it was in both observed and excluded", tid.getId(), tid.getLabel()); - } - } - // now transform survinng terms into TermIds - Map map2 = new HashMap<>(); - for (PhenotypicFeature pf: phenotypicFeaturesSet) { - if (termsToExclude.contains(pf.getType())) { - continue; - } - TermId tid = TermId.of(pf.getType().getId()); - map2.put(tid, pf); - } - // remove the ancestors of any term, keeping only the most specific - Set 
ancestorsToExclude = new HashSet<>(); - for (TermId t : map2.keySet()) { - // get all ancestors, do not include current term - Set ancs = ontology.getAncestorTermIds(t,false); - for (TermId anc : ancs) { - if (!anc.equals(t) && map2.containsKey(anc)) { - ancestorsToExclude.add(anc); - } - } - } - finalFeatures = new HashSet<>(); - for (Map.Entry e : map2.entrySet()) { - if (ancestorsToExclude.contains(e.getKey())) { - LOGGER.info("Skipping ancestor {}", e.getKey().getValue()); - } else if (termsToAvoid.contains(e.getKey())) { - LOGGER.info("Skipping term to exclude {}", e.getKey().getValue()); - } else { - finalFeatures.add(e.getValue()); - } - } - } - - public static boolean isOmittedTerm(TermId tid) { - return termsToAvoid.contains(tid); - } - - public Set getFinalFeatures() { - return finalFeatures; - } -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryOutputGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryOutputGenerator.java deleted file mode 100644 index e975ce7..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryOutputGenerator.java +++ /dev/null @@ -1,56 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen; - -import org.monarchinitiative.phenol.base.PhenolRuntimeException; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.List; - -public class QueryOutputGenerator { - - - private final List outputTypeList; - - private final String outdirPath; - - public QueryOutputGenerator(List outputTypeList, String outdirPath) { - this.outputTypeList = outputTypeList; - this.outdirPath = outdirPath; - createOutputDirectoryIfNeeded(outdirPath); - for (var qtype: outputTypeList) { - String outpath = outdirPath + File.separator + QueryOutputType.outputString(qtype); - createOutputDirectoryIfNeeded(outpath); - } - } - /** CREATE THE OUTPUT DIRECTORIES 
IF NEEDED. */ - private void createOutputDirectoryIfNeeded(String outpath) { - File outdirfile = new File(outpath); - if (! outdirfile.isDirectory()) { - boolean dirCreated = outdirfile.mkdir(); - if (!dirCreated) { - throw new PhenolRuntimeException("Could not create outdirfile directory"); - } - } - } - - - public void outputEntry(String pmidString, QueryPromptFactory timeBasedFactory) { - String pmid = pmidString.replace(":", "_"); // avoid colon in file paths - for (var otype : this.outputTypeList) { - String outputString = QueryOutputType.outputString(otype); - String outpath = this.outdirPath + File.separator + outputString + File.separator + - pmid + "-" + outputString + ".txt"; - try (BufferedWriter writer = new BufferedWriter(new FileWriter(outpath))) { - writer.write(timeBasedFactory.getQuery(otype)); - } catch (IOException e) { - throw new PhenolRuntimeException(e.getMessage()); - } - } - } - - - - -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryOutputType.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryOutputType.java deleted file mode 100644 index 563dd64..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryOutputType.java +++ /dev/null @@ -1,23 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen; - -public enum QueryOutputType { - - TIME_BASED, - QC, - TEXT_WITHOUT_DISCUSSION, - TEXT_PLUS_MANUAL; - - - public static String outputString(QueryOutputType qtype) { - return switch (qtype) { - case TIME_BASED -> "phenopacket_time_based_queries"; - case QC -> "QC"; - case TEXT_WITHOUT_DISCUSSION -> "txt_without_discussion"; - case TEXT_PLUS_MANUAL -> "txt_with_manual_annots"; - }; - } - - - - -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryPromptFactory.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryPromptFactory.java deleted file mode 100644 
index 7d3dc75..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/QueryPromptFactory.java +++ /dev/null @@ -1,97 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen; - -import org.monarchinitiative.fenominal.core.TermMiner; -import org.monarchinitiative.phenol.base.PhenolRuntimeException; -import org.monarchinitiative.phenol.ontology.data.Ontology; -import org.monarchinitiative.phenopacket2prompt.legacy.nejm.NejmCaseReportFromPdfFilterer; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.qfactory.PhenopacketOnlyQuery; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.qfactory.QcQueryGenerator; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.qfactory.TextWithManualAnnotsGenerator; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.qfactory.TextWithoutDiscussionQuery; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class QueryPromptFactory { - private final Logger LOGGER = LoggerFactory.getLogger(QueryPromptFactory.class); - - /** - * If the description segment of a time period is less than 5 characters, skip it. 
- */ - private final static int MIN_DESCRIPTION_LENGTH = 5; - private final NejmCaseReportFromPdfFilterer filterer; - - private final TermMiner miner; - - private final Ontology hpo; - - private final String caseId; - - private final String isoAge; - - private final String phenopacketSex; - - - public QueryPromptFactory(NejmCaseReportFromPdfFilterer filterer, String id, TermMiner miner, Ontology hpo) { - this.filterer = filterer; - this.miner = miner; - this.hpo = hpo; - this.phenopacketSex = filterer.getPhenopacketSex(); - this.isoAge = filterer.getIsoAge(); - this.caseId = id; - } - - public String getQuery(QueryOutputType outputType) { - LOGGER.trace("Getting query for {}", outputType.name()); - switch (outputType) { - - case TIME_BASED -> { - PhenopacketOnlyQuery tbq = new PhenopacketOnlyQuery(filterer, caseId, miner, hpo); - return tbq.getQuery(); - } - case TEXT_WITHOUT_DISCUSSION -> { - TextWithoutDiscussionQuery tbq = new TextWithoutDiscussionQuery(filterer, caseId, miner, hpo); - return tbq.getQuery(); - } - case QC -> { - QcQueryGenerator qcg = new QcQueryGenerator(filterer, caseId, miner, hpo); - return qcg.getQuery(); - } - case TEXT_PLUS_MANUAL -> { - TextWithManualAnnotsGenerator tpm = new TextWithManualAnnotsGenerator(filterer, caseId, miner, hpo); - return tpm.getQuery(); - } - } - // should never happen - throw new PhenolRuntimeException("Could not find query type"); - } - - - - private String get_person_string() { - String sex = this.phenopacketSex.toLowerCase(); - final Pattern AGE_REGEX = Pattern.compile("P(\\d+)Y"); - Matcher m = AGE_REGEX.matcher(this.isoAge); - if (m.find()) { - String years = m.group(1); - return "A " + years + "-year old " + sex; - } - final Pattern DAYS_REGEX = Pattern.compile("P0Y(\\d+)D"); - Matcher m2 = DAYS_REGEX.matcher(isoAge); - if (m2.find()) { - String years = m2.group(1); - return "A " + years + "-day old " + sex + " newborn"; - } - throw new PhenolRuntimeException("Could not extract person"); - } - - - - - - - - 
-} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/TimePoint.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/TimePoint.java deleted file mode 100644 index 5436a18..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/TimePoint.java +++ /dev/null @@ -1,16 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen; - -public record TimePoint(String point, int start, int end) implements Comparable{ - - - @Override - public int compareTo(TimePoint other) { - return this.start - other.start; - } - - - @Override - public String toString() { - return String.format("%s (%d-%d)", point, start, end); - } -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/TimePointParser.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/TimePointParser.java deleted file mode 100644 index 2d22d3b..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/TimePointParser.java +++ /dev/null @@ -1,98 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen; - -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class TimePointParser { - - - - /** gets 2 dats before presentation, Two days before presentation etc.*/ - private final Pattern pattern1 = Pattern.compile("\\b\\w+\\b\\s+(hours?|days?|weeks?|months?|years?|decades?) before (the current )?presentation",Pattern.CASE_INSENSITIVE); - - private final Pattern pattern2 = Pattern.compile("\\b\\w+\\b\\s+(hours?|days?|weeks?|months?|years?|decades?) earlier",Pattern.CASE_INSENSITIVE); - - private final Pattern pattern3 = Pattern.compile("(approximately)?\\s?\\b\\w+\\b (hours?|days?|weeks?|months?|years?|decades?) 
before (the current )?admission",Pattern.CASE_INSENSITIVE); -//Five hours before this admission - private final Pattern pattern3a = Pattern.compile("(approximately)?\\s?\\b\\w+\\b (hours?|days?|weeks?|months?|years?|decades?) before this admission",Pattern.CASE_INSENSITIVE); - - - /** e.g. his ocular history included */ - private final Pattern pattern4 = Pattern.compile("\\b\\w+\\b\\s+\\b\\w+\\b\\s+history included",Pattern.CASE_INSENSITIVE); - - private final Pattern pattern5 = Pattern.compile("During the next \\b\\w+\\b (days|weeks)",Pattern.CASE_INSENSITIVE); - /** - * e.g., After 3 days of fever - */ - private final Pattern pattern6 = Pattern.compile("After \\b\\w+\\b (days|weeks) of \\b\\w+\\b",Pattern.CASE_INSENSITIVE); - - private final Pattern pattern7 = Pattern.compile("\\b\\w+\\b (hours?|days?|weeks?|months?|years?) before (this )?evaluation",Pattern.CASE_INSENSITIVE); - private final Pattern pattern8 = Pattern.compile("\\b\\w+\\b (days?|weeks?|months?|years?) later",Pattern.CASE_INSENSITIVE); - private final Pattern pattern9 = Pattern.compile("After a \\b\\w+\\b[ -](weeks?|days?|months?|years?) 
admission",Pattern.CASE_INSENSITIVE); - private final Pattern pattern10 = Pattern.compile("On admission to (the other|another) hospital",Pattern.CASE_INSENSITIVE); - private final Pattern pattern11 = Pattern.compile("Over the next \\b\\w+\\b (hours?|days?|weeks?|months?|years?)",Pattern.CASE_INSENSITIVE); - private final Pattern pattern12 = Pattern.compile("3.5 years before the current evaluation"); - - /** Note we do all searching in lower case */ - private final Set fixedPatterns = Set.of("in the emergency department", "on examination", "in childhood", "examination was notable for", - "the night before the current evaluation","on arrival at the emergency department"); - - - - private final List patternList; - - public TimePointParser() { - patternList = new ArrayList<>(); - patternList.add(pattern1); - patternList.add(pattern2); - patternList.add(pattern3); - patternList.add(pattern3a); - patternList.add(pattern4); - patternList.add(pattern5); - patternList.add(pattern6); - patternList.add(pattern7); - patternList.add(pattern8); - patternList.add(pattern9); - patternList.add(pattern10); - patternList.add(pattern12); - - } - - public List getTimePoints(String input) { - Set timePointSet = new HashSet<>(); - patternList.forEach(p -> { - Matcher m = p.matcher(input); - while (m.find()) { - int s = m.start(); - int e = m.end(); - String txt = m.group(); - if (txt.startsWith(" ")) { - txt = txt.substring(1); - s = s + 1; - } - /// remove stray whitespace - // txt = txt.replaceAll("\\s+", " "); - timePointSet.add(new TimePoint(txt, s, e)); - } - }); - // simpler method for String matches. 
- for (String item : fixedPatterns) { - int lastIndex = 0; - String inputLower = input.toLowerCase(); - while(lastIndex != -1) { - lastIndex = inputLower.indexOf(item,lastIndex); - if(lastIndex != -1){ - int end = lastIndex + item.length(); - String originalItem = input.substring(lastIndex, end); // original capitalization - timePointSet.add(new TimePoint(originalItem, lastIndex, end)); - lastIndex += 1; - } - } - } - List tpList = new ArrayList<>(timePointSet); - Collections.sort(tpList); - return tpList; - } - -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/AbstractQueryGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/AbstractQueryGenerator.java deleted file mode 100644 index 990b9f9..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/AbstractQueryGenerator.java +++ /dev/null @@ -1,391 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen.qfactory; - -import com.google.protobuf.InvalidProtocolBufferException; -import com.google.protobuf.util.JsonFormat; -import org.monarchinitiative.fenominal.core.TermMiner; -import org.monarchinitiative.fenominal.model.MinedTerm; -import org.monarchinitiative.phenol.base.PhenolRuntimeException; -import org.monarchinitiative.phenol.ontology.algo.OntologyAlgorithm; -import org.monarchinitiative.phenol.ontology.data.Ontology; -import org.monarchinitiative.phenol.ontology.data.TermId; -import org.monarchinitiative.phenopacket2prompt.legacy.TimeSegment; -import org.monarchinitiative.phenopacket2prompt.legacy.nejm.NejmCaseReportFromPdfFilterer; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.PhenotypicFeatureFilter; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.TimePoint; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.TimePointParser; -import org.phenopackets.phenopackettools.builder.PhenopacketBuilder; -import 
org.phenopackets.phenopackettools.builder.builders.IndividualBuilder; -import org.phenopackets.phenopackettools.builder.builders.MetaDataBuilder; -import org.phenopackets.phenopackettools.builder.builders.PhenotypicFeatureBuilder; -import org.phenopackets.phenopackettools.builder.builders.Resources; -import org.phenopackets.schema.v2.Phenopacket; -import org.phenopackets.schema.v2.core.Individual; -import org.phenopackets.schema.v2.core.OntologyClass; -import org.phenopackets.schema.v2.core.PhenotypicFeature; - -import java.util.*; -import java.util.function.Predicate; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -public abstract class AbstractQueryGenerator { - - protected final TermId PHENOTYPIC_ABNORMALITY_ROOT = TermId.of("HP:0000118"); - - - - protected final static String QUERY_HEADER = """ -I am running an experiment on a clinicopathological case conference to see how your diagnoses -compare with those of human experts. I am going to give you part of a medical case. These have -all been published in the New England Journal of Medicine. You are not trying to treat any patients. -As you read the case, you will notice that there are expert discussants giving their thoughts. -In this case, you are “Dr. GPT-4,” an Al language model who is discussing the case along with -human experts. A clinicopathological case conference has several unspoken rules. The first is -that there is most often a single definitive diagnosis (though rarely there may be more than one), -and it is a diagnosis that is known today to exist in humans. The diagnosis is almost always -confirmed by some sort of clinical pathology test or anatomic pathology test, though in -rare cases when such a test does not exist for a diagnosis the diagnosis can instead be -made using validated clinical criteria or very rarely just confirmed by expert opinion. 
-You will be told at the end of the case description whether a diagnostic test/tests are -being ordered, which you can assume will make the diagnosis/diagnoses. After you read the case, -I want you to give two pieces of information. The first piece of information is your most likely -diagnosis/diagnoses. You need to be as specific as possible -- the goal is to get the correct -answer, not a broad category of answers. You do not need to explain your reasoning, just give -the diagnosis/diagnoses. The second piece of information is to give a robust differential diagnosis, -ranked by their probability so that the most likely diagnosis is at the top, and the least likely -is at the bottom. There is no limit to the number of diagnoses on your differential. You can give -as many diagnoses as you think are reasonable. You do not need to explain your reasoning, -just list the diagnoses. Again, the goal is to be as specific as possible with each of the -diagnoses. -Do you have any questions, Dr. GPT-4? - -Here is the case: - -"""; - /** - * If the description segment of a time period is less than 5 characters, skip it. 
- */ - protected final static int MIN_DESCRIPTION_LENGTH = 5; - - public abstract String getQuery(); - - protected final TermMiner miner; - - protected final Ontology hpo; - - protected final NejmCaseReportFromPdfFilterer filterer; - - private final String patientId; - - public AbstractQueryGenerator(NejmCaseReportFromPdfFilterer filterer, String id, TermMiner miner, Ontology hpo) { - this.filterer = filterer; - this.miner = miner; - this.hpo = hpo; - this.patientId = id; - } - - protected String getPersonIntroduction() { - String person_string = get_person_string(filterer.getPhenopacketSex(), filterer.getIsoAge()); - return String.format("%s presented with the following signs and symptoms:\n", person_string); - } - - /** - * Remove sentences that describe the past medical history (PMH) or family history (FH) - * We add these lines back manually - * @param originalSeg original text from case report - * @return segment, with lines about PMH or FH removed - */ - private String stripFamilyHistoryAndPmh(String originalSeg) { - List validLines = new ArrayList<>(); // everything but family history - String [] lines = originalSeg.split("\\."); - for (var line : lines) { - if (line.toLowerCase().contains("family history")) continue; - if (line.toLowerCase().contains("medical history")) continue; - if (line.toLowerCase().contains(" mother ")) continue; // e.g. the patient's mother had - if (line.toLowerCase().contains(" father ")) continue; - validLines.add(line); - } - return String.join(". ", validLines); - } - - /** - * Break up the original vignette, which contains the entire cqse report, into segemnts that represent - * indivudal time points such as "Two months before admission" ... 
- * @param vignette the original vignette - * @param timePointList subvignettes, one per time point - * @return - */ - protected List timeSegments(String vignette, List timePointList) { - Map timeSegments = new LinkedHashMap<>(); // ordered map - List timeSegmentList = new ArrayList<>(); - String nextStart = ""; - int lastEnd = 0; - for (var timePoint: timePointList) { - int s = timePoint.start(); - int e = timePoint.end(); - String seg = nextStart + vignette.substring(lastEnd, s); - seg = stripFamilyHistoryAndPmh(seg); - lastEnd = e + 1; - timeSegments.put(nextStart, seg.strip()); - timeSegmentList.add(new TimeSegment(nextStart, seg.strip())); - nextStart = timePoint.point(); - } - if (lastEnd < vignette.length()) { - String seg = nextStart + vignette.substring(lastEnd); - timeSegments.put(nextStart, seg.strip()); - timeSegmentList.add(new TimeSegment(nextStart, seg.strip())); - } - return timeSegmentList; - } - - /** - * @param items a list of HPO labels, e.g., X and Y and Z - * @return A string formatted as X, Y, and Z. - */ - protected String getOxfordCommaList(Set items) { - if (items.size() == 2) { - // no comma if we just have two items. 
- // one item will work with the below code - return String.join(" and ", items) + "."; - } - StringBuilder sb = new StringBuilder(); - String symList = String.join(", ", items); - int jj = symList.lastIndexOf(", "); - if (jj > 0) { - symList = symList.substring(0, jj) + ", and " + symList.substring(jj+2); - } - sb.append(symList).append("."); - return sb.toString(); - } - - - - List getPhenotypicFeatures(String input) { - List pflist = new ArrayList<>(); - Collection minedTerms = this.miner.mineTerms(input); - for (var mt : minedTerms) { - boolean hpoObserved = mt.isPresent(); - TermId tid = TermId.of(mt.getTermIdAsString()); - if (!OntologyAlgorithm.isSubclass(hpo, tid, PHENOTYPIC_ABNORMALITY_ROOT)) { - continue; - } - Optional labelOpt = hpo.getTermLabel(tid); - if (labelOpt.isEmpty()) continue; - String label = labelOpt.get(); - if (label.equalsIgnoreCase("Negativism")) { - continue; // common false positive, Negative is a synonym for negativism - } - PhenotypicFeatureBuilder builder = PhenotypicFeatureBuilder.builder(tid.getValue(), labelOpt.get()); - if (!hpoObserved) { - builder.excluded(); - } - pflist.add(builder.build()); - - } - return pflist; - } - - protected String getPhenopacketBasedQuerySegment(String presentationTimeDescription, String input) { - List pfeatures = getPhenotypicFeatures(input); - if (pfeatures.isEmpty()) { - return ""; // no features detected for this time period - } - Set observed_terms = pfeatures.stream() - .filter(Predicate.not(PhenotypicFeature::getExcluded)) - .map(PhenotypicFeature::getType) - .map(OntologyClass::getLabel) - .collect(Collectors.toSet()); - Set excluded_terms = pfeatures.stream() - .filter(PhenotypicFeature::getExcluded) - .map(PhenotypicFeature::getType) - .map(OntologyClass::getLabel) - .collect(Collectors.toSet()); - StringBuilder sb = new StringBuilder(); - String capitalizedTimepoint; - if (presentationTimeDescription.equalsIgnoreCase("Examination was notable for")) { - presentationTimeDescription = "On 
examination"; - } - if (presentationTimeDescription.length() < 2) { - capitalizedTimepoint = ""; - } else { - capitalizedTimepoint = presentationTimeDescription.substring(0, 1).toUpperCase() + presentationTimeDescription.substring(1); - } - - sb.append(capitalizedTimepoint); - boolean observedEmpty = true; - if (!observed_terms.isEmpty()) { - observedEmpty = false; - if (capitalizedTimepoint.isEmpty()) { - sb.append("The patient presented with "); - } else if (capitalizedTimepoint.equalsIgnoreCase("Other medical history included")) { - sb.append(" "); // this will output Other medical history included X, Y, and Z - } else { - sb.append(", the patient presented with "); - } - - String observedSymptoms = getOxfordCommaList(observed_terms); - sb.append(observedSymptoms).append(" \n"); - } - if (!excluded_terms.isEmpty()) { - String excludededSymptoms = getOxfordCommaList(excluded_terms); - if (observedEmpty) { - sb.append(", the following signs and symptoms were excluded: "); - } else { - sb.append("The following signs and symptoms were excluded: "); - } - sb.append(excludededSymptoms).append(" "); - } - return sb.toString(); - } - - - protected String get_person_string(String phenopacketSex, String isoAge) { - String sex = phenopacketSex.toLowerCase(); - final Pattern AGE_REGEX = Pattern.compile("P(\\d+)Y"); - Matcher m = AGE_REGEX.matcher(isoAge); - if (m.find()) { - String years = m.group(1); - return "A " + years + "-year old " + sex; - } - final Pattern DAYS_REGEX = Pattern.compile("P0Y(\\d+)D"); - Matcher m2 = DAYS_REGEX.matcher(isoAge); - if (m2.find()) { - String years = m2.group(1); - return "A " + years + "-day old " + sex + " newborn"; - } - throw new PhenolRuntimeException("Could not extract person"); - } - - - protected String getPlainPhenopacketText(NejmCaseReportFromPdfFilterer filterer, String id, TermMiner miner, Ontology hpo) { - TimePointParser timePointParser = new TimePointParser(); - List lines = filterer.getPresentationWithoutDiscussionLines(); 
- String vignette = String.join(" ", lines); - int ii = vignette.indexOf("."); - if (ii < 0) { - throw new PhenolRuntimeException("Malformed vignette without one single period"); - } - String firstSentence = vignette.substring(0, ii + 1).strip(); - vignette = vignette.substring(ii + 1); - List timePointList = timePointParser.getTimePoints(vignette); - - StringBuilder sb = new StringBuilder(); - sb.append(firstSentence).append("\n"); - try { - //Map timeSegments = timeSegments(starts, ends, vignette, start2pointMap); - // Map timeSegments = timeSegments(vignette, timePointList); - List timeSegList = timeSegments(vignette, timePointList); - for (var tseg : timeSegList) { - String timePoint = tseg.getTimeDesgination(); //= entry.getKey(); - String description = tseg.getPayload(); //entry.getValue(); - if (description.equals("Examination was notable for")) { - description = "On examination"; - } - if (description.length() > MIN_DESCRIPTION_LENGTH) { - String output = getPhenopacketBasedQuerySegment(timePoint, description); - if (output.isEmpty()) continue; - sb.append(output).append("\n"); - } - } - } catch (Exception eee) { - System.out.printf("[ERROR(TimeBasedFactory.java] Could not parse time segments for %s because of %s", id, eee.getMessage()); - System.exit(1); - } - return sb.toString(); - } - - /** - * concatenate cases lines and remove the name of the first - * physician to contribute, e.g., - * Dr. Kathy M. Tran (Medicine): - * @param lines the lines repreenting the case parsed from the original file - * @return a single line with all text between the first and the second discussant. - */ - protected String caseLines(List lines) { - if (lines.isEmpty()) { - throw new PhenolRuntimeException("Empty case lines (Should never happen"); - } - final Pattern DR_REGEX = Pattern.compile("Dr\\. 
.*:"); - String line1 = lines.get(0); - Matcher m = DR_REGEX.matcher(line1); - if (m.find()) { - int e = m.end(); - line1 = line1.substring(e+1); - } - return line1 + lines.stream(). - skip(1). - collect(Collectors.joining("\n")); - } - - - public String getPhenopacketJsonString() { - Phenopacket phenopacket = getPhenopacket(); - try { - return JsonFormat.printer().print(phenopacket); - } catch (InvalidProtocolBufferException e) { - throw new RuntimeException("Could not create JSON: " + e.getLocalizedMessage()); - } - } - - - - - public Phenopacket getPhenopacket() { - String version = hpo.getMetaInfo().getOrDefault("data-version", "n/a"); - var metaData = MetaDataBuilder.builder("csv2phenopacket") - .addResource(Resources.hpoVersion(version)) - .build(); - PhenopacketBuilder builder = PhenopacketBuilder.create(patientId, metaData); - IndividualBuilder probandBuilder = IndividualBuilder.builder(patientId); - String phenopacketSex = filterer.getPhenopacketSex(); - if (phenopacketSex.equalsIgnoreCase("male")) { - probandBuilder.male(); - } else if (phenopacketSex.equalsIgnoreCase("female")) { - probandBuilder.female(); - } - probandBuilder.ageAtLastEncounter(filterer.getIsoAge()); - Individual proband = probandBuilder.build(); - builder.individual(proband); - // Use a set to get rid of duplicates - int total = 0; - Set phenotypicFeaturesSet = new HashSet<>(); - List pflist = getPhenotypicFeatures() ; - for (var pf: pflist) { - total++; - phenotypicFeaturesSet.add(pf); - } - PhenotypicFeatureFilter filter = new PhenotypicFeatureFilter(phenotypicFeaturesSet, hpo); - List allFeatures = new ArrayList<>(filter.getFinalFeatures()); - System.out.printf("%s: %d unique and %d total features\n", this.patientId, allFeatures.size(), total); - builder.addPhenotypicFeatures(allFeatures); - - return builder.build(); - - } - - - - List getPhenotypicFeatures() { - List pflist = new ArrayList<>(); - String payload = String.join(" ", 
this.filterer.getPresentationWithoutDiscussionLines()); - Collection minedTerms = this.miner.mineTerms(payload); - for (var mt : minedTerms) { - boolean hpoObserved = mt.isPresent(); - TermId tid = TermId.of(mt.getTermIdAsString()); - if (!OntologyAlgorithm.isSubclass(hpo, tid, PHENOTYPIC_ABNORMALITY_ROOT)) { - continue; - } - Optional labelOpt = hpo.getTermLabel(tid); - if (labelOpt.isPresent()) { - PhenotypicFeatureBuilder builder = PhenotypicFeatureBuilder.builder(tid.getValue(), labelOpt.get()); - if (!hpoObserved) { - builder.excluded(); - } - pflist.add(builder.build()); - } - } - return pflist; - } - -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/PhenopacketOnlyQuery.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/PhenopacketOnlyQuery.java deleted file mode 100644 index ce9cb49..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/PhenopacketOnlyQuery.java +++ /dev/null @@ -1,24 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen.qfactory; - -import org.monarchinitiative.fenominal.core.TermMiner; -import org.monarchinitiative.phenol.ontology.data.Ontology; -import org.monarchinitiative.phenopacket2prompt.legacy.nejm.NejmCaseReportFromPdfFilterer; - -public class PhenopacketOnlyQuery extends AbstractQueryGenerator { - - - private final String promptText; - - public PhenopacketOnlyQuery(NejmCaseReportFromPdfFilterer filterer, String id, TermMiner miner, Ontology hpo) { - super(filterer, id, miner, hpo); - String intro = getPersonIntroduction(); - String phenotext = getPlainPhenopacketText(filterer, id, miner, hpo); - promptText = String.format("%s%s", QUERY_HEADER, phenotext); - } - - - @Override - public String getQuery() { - return this.promptText; - } -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/QcQueryGenerator.java 
b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/QcQueryGenerator.java deleted file mode 100644 index a57e508..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/QcQueryGenerator.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen.qfactory; - -import org.monarchinitiative.fenominal.core.TermMiner; -import org.monarchinitiative.phenol.ontology.data.Ontology; -import org.monarchinitiative.phenopacket2prompt.legacy.nejm.NejmCaseReportFromPdfFilterer; - -import java.util.List; - -public class QcQueryGenerator extends AbstractQueryGenerator { - - private final String promptText; - - - - public QcQueryGenerator(NejmCaseReportFromPdfFilterer filterer, String id, TermMiner miner, Ontology hpo) { - super(filterer, id, miner, hpo); - String phenopacketText = getPlainPhenopacketText(filterer, id, miner, hpo); - List lines = filterer.getPresentationWithoutDiscussionLines(); - String original = caseLines(lines); - promptText = String.format("### Phenopacket-text ###\n\n%s\n\n###Original###\n\n%s", - phenopacketText, original); - } - - - @Override - public String getQuery() { - return promptText; - } -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/TextWithManualAnnotsGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/TextWithManualAnnotsGenerator.java deleted file mode 100644 index a6ab8e6..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/TextWithManualAnnotsGenerator.java +++ /dev/null @@ -1,206 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen.qfactory; - -import org.monarchinitiative.fenominal.core.TermMiner; -import org.monarchinitiative.phenol.base.PhenolRuntimeException; -import org.monarchinitiative.phenol.ontology.data.Ontology; -import 
org.monarchinitiative.phenopacket2prompt.legacy.AdditionalConceptI; -import org.monarchinitiative.phenopacket2prompt.legacy.AdditionalConceptType; -import org.monarchinitiative.phenopacket2prompt.legacy.nejm.NejmCaseReportFromPdfFilterer; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.TimePoint; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.TimePointParser; -import org.phenopackets.schema.v2.core.OntologyClass; -import org.phenopackets.schema.v2.core.PhenotypicFeature; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -public class TextWithManualAnnotsGenerator extends AbstractQueryGenerator { - private final Logger LOGGER = LoggerFactory.getLogger(TextWithManualAnnotsGenerator.class); - private final String promptText; - - private final Set additionalConcepts; - private final Set pmh; - private final Set familyHistory; - - private final List outputLines; - - - public TextWithManualAnnotsGenerator(NejmCaseReportFromPdfFilterer filterer, String id, TermMiner miner, Ontology hpo) { - super(filterer, id, miner, hpo); - this.outputLines = new ArrayList<>(); - this.pmh = new HashSet<>(); - familyHistory = filterer.getAdditionalConcepts().stream() - .filter(a -> a.conceptType() == AdditionalConceptType.FAMILY_HISTORY) - .map(AdditionalConceptI::insertText) - .collect(Collectors.toSet()); - this.additionalConcepts = filterer.getAdditionalConcepts(); - String phenotext = getPhenopacketTextWithManualAdditions(); - promptText = String.format("%s%s", QUERY_HEADER, phenotext); - } - - - - protected String getPhenopacketTextWithManualAdditions() { - TimePointParser timePointParser = new TimePointParser(); - List lines = filterer.getPresentationWithoutDiscussionLines(); - String vignette = String.join(" ", lines); - // the next five lines extract the first sentence from the text -- we include the first sentence verbatim - // in our 
query prompts - int ii = vignette.indexOf("."); - if (ii < 0) { - throw new PhenolRuntimeException("Malformed vignette without one single period"); - } - String firstSentence = vignette.substring(0, ii + 1).strip(); - vignette = vignette.substring(ii + 1); - List timePointList = timePointParser.getTimePoints(vignette); - try { - for (var tseg : timeSegments(vignette, timePointList)) { - String timePoint = tseg.getTimeDesgination(); - String vignette_at_timepoint = tseg.getPayload(); - if (vignette_at_timepoint.equals("Examination was notable for")) { - vignette_at_timepoint = "On examination "; - } - if (vignette_at_timepoint.length() > MIN_DESCRIPTION_LENGTH) { - String output = getPhenopacketBasedQuerySegmentWithAdditions(timePoint, vignette_at_timepoint); - if (output.isEmpty()) continue; - outputLines.add(output.trim()); - } - } - } catch (Exception eee) { - System.out.printf("[ERROR(TextPlusManualGenerator.java] Could not parse time segments for because of %s", eee.getMessage()); - System.exit(1); - } - StringBuilder sb = new StringBuilder(); - sb.append(firstSentence).append("\n"); - if (pmh.size() > 0) { - sb.append("The past medical history was notable for ") - .append(getOxfordCommaList(pmh)) - .append("\n"); - } - if (familyHistory.size() > 0) { - sb.append("The family history was notable for the following. 
"); - for (String item: familyHistory) { - sb.append(item).append("\n"); - } - } - for (var line : outputLines) { - sb.append(line).append("\n"); - } - return sb.toString(); - } - - - protected String getPhenopacketBasedQuerySegmentWithAdditions(String presentationTimeDescription, String vignette_at_timepoint) { - List pfeatures = getPhenotypicFeatures(vignette_at_timepoint); - Set diagnostics = new HashSet<>(); - Set treatment = new HashSet<>(); - Set verbatim = new HashSet<>(); - - Set observed_terms = pfeatures.stream() - .filter(Predicate.not(PhenotypicFeature::getExcluded)) - .map(PhenotypicFeature::getType) - .map(OntologyClass::getLabel) - .collect(Collectors.toSet()); - Set excluded_terms = pfeatures.stream() - .filter(PhenotypicFeature::getExcluded) - .map(PhenotypicFeature::getType) - .map(OntologyClass::getLabel) - .collect(Collectors.toSet()); - for (var addcon : this.additionalConcepts ) { - String x = addcon.insertText(); - // check if the vignette for the current time period includes a text from the - // additional concepts listed at the top of the input file - // e.g. 
if the input file has - // Cough:PHENOTYPTE and we find the string "Cough" in the original_vignette_text, - // then we would add "Cough" to the set observed_terms - // if the input file has - // Aspirin:TREATMENT, then we add "Aspirin" to the set treatment - if (vignette_at_timepoint.contains(addcon.originalText())) { - switch (addcon.conceptType()) { - case PHENOTYPE -> observed_terms.add(addcon.insertText()); - case EXCLUDE -> excluded_terms.add(addcon.insertText()); - case DIAGNOSTICS -> diagnostics.add(addcon.insertText()); - case TREATMENT -> treatment.add(addcon.insertText()); - case VERBATIM -> verbatim.add(addcon.insertText()); - case PMH -> { - // do not repeat the PMH even if the original text mentions it more than once - if (!pmh.contains(addcon.originalText())) { - pmh.add(addcon.insertText()); - } - } - case FAMILY_HISTORY -> familyHistory.add(addcon.insertText()); - } - } - } - StringBuilder sb = new StringBuilder(); - String capitalizedTimepoint; - if (presentationTimeDescription.equalsIgnoreCase("Examination was notable for")) { - presentationTimeDescription = "On examination"; - } - if (presentationTimeDescription.length() < 2) { - capitalizedTimepoint = ""; - } else { - capitalizedTimepoint = presentationTimeDescription.substring(0, 1).toUpperCase() + presentationTimeDescription.substring(1); - } - if (observed_terms.isEmpty() && - excluded_terms.isEmpty() && - treatment.isEmpty() && - diagnostics.isEmpty() && - verbatim.isEmpty()) { - return ""; - } else { - sb.append(capitalizedTimepoint);//.append(" "); - } - boolean observedEmpty = true; - boolean needEmpty = true; - if (!observed_terms.isEmpty()) { - observedEmpty = false; - if (capitalizedTimepoint.isEmpty()) { - sb.append("The patient presented with "); - } else if (capitalizedTimepoint.equalsIgnoreCase("Other medical history included")) { - sb.append(" "); // this will output Other medical history included X, Y, and Z - } else { - sb.append(", the patient presented with "); - } - - String 
observedSymptoms = getOxfordCommaList(observed_terms); - sb.append(observedSymptoms).append(" \n"); - } - if (!excluded_terms.isEmpty()) { - if (needEmpty) { sb.append(" "); needEmpty = false; } - String excludededSymptoms = getOxfordCommaList(excluded_terms); - if (observedEmpty) { - sb.append("The following signs and symptoms were excluded: "); - } else { - sb.append("The following signs and symptoms were excluded: "); - } - sb.append(excludededSymptoms).append("\n"); - } - if (! diagnostics.isEmpty()) { - if (needEmpty) { sb.append(" "); needEmpty = false; } - sb.append("The following diagnostic observations were made: "); - sb.append(getOxfordCommaList(diagnostics)); - sb.append("\n"); - } - if (! treatment.isEmpty()) { - if (needEmpty) { sb.append(" "); needEmpty = false; } - sb.append("The following treatments were administered: "); - sb.append(getOxfordCommaList(treatment)); - sb.append("\n"); - } - if (! verbatim.isEmpty()) { - for (var v : verbatim) - sb.append(v).append("\n"); - } - return sb.toString(); - } - - - @Override - public String getQuery() { - return promptText; - } -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/TextWithoutDiscussionQuery.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/TextWithoutDiscussionQuery.java deleted file mode 100644 index f80654e..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/legacy/querygen/qfactory/TextWithoutDiscussionQuery.java +++ /dev/null @@ -1,22 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.legacy.querygen.qfactory; - -import org.monarchinitiative.fenominal.core.TermMiner; -import org.monarchinitiative.phenol.ontology.data.Ontology; -import org.monarchinitiative.phenopacket2prompt.legacy.nejm.NejmCaseReportFromPdfFilterer; - -import java.util.List; - -public class TextWithoutDiscussionQuery extends AbstractQueryGenerator { - - private final String promptText; - - public 
TextWithoutDiscussionQuery(NejmCaseReportFromPdfFilterer filterer, String id, TermMiner miner, Ontology hpo) { - super(filterer, id, miner, hpo); - List lines = filterer.getPresentationWithoutDiscussionLines(); - promptText = QUERY_HEADER + caseLines(lines); - } - @Override - public String getQuery() { - return promptText; - } -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java index 7d7c02e..03a674b 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java @@ -168,9 +168,8 @@ public List getPhenotypicFeaturesAtOnset() { } /** - * Get a map of phenotypic features with specified onset after the age of onset - * This does not include features with unspecified onset (for that, use {@code getPhenotypicFeaturesWithNoSpecifiedAge}). - * @return + * This code does not include features with unspecified onset (for that, use {@code getPhenotypicFeaturesWithNoSpecifiedAge}) or terms at the age of onset + * @return map of phenotypic features with specified onset after the age of onset */ public Map> getSpecifiedAgePhenotypicFeatures() { Map> ageToFeatureMap = new HashMap<>(); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/IndividualInformation.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/IndividualInformation.java deleted file mode 100644 index 1398043..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/IndividualInformation.java +++ /dev/null @@ -1,13 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.output; - -import org.monarchinitiative.phenopacket2prompt.model.PhenopacketSex; - -public record IndividualInformation(PhenopacketSex psex, - String ageSexAtLastExam) { - - - - - - -} diff --git 
a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PhenopacketAgeSexGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PhenopacketIndividualInformationGenerator.java similarity index 85% rename from src/main/java/org/monarchinitiative/phenopacket2prompt/output/PhenopacketAgeSexGenerator.java rename to src/main/java/org/monarchinitiative/phenopacket2prompt/output/PhenopacketIndividualInformationGenerator.java index 606e6e2..7506bb1 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PhenopacketAgeSexGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PhenopacketIndividualInformationGenerator.java @@ -4,17 +4,12 @@ import org.monarchinitiative.phenopacket2prompt.model.PhenopacketSex; import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; -public interface PhenopacketAgeSexGenerator { +public interface PhenopacketIndividualInformationGenerator { String getIndividualDescription(PpktIndividual individual); - String heSheIndividual(PhenopacketSex psex); String atAge(PhenopacketAge ppktAge); - //String ppktSex(); - - - } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PpktPhenotypicFeatureGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PpktPhenotypicFeatureGenerator.java index e3f3443..3f2b24a 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PpktPhenotypicFeatureGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/PpktPhenotypicFeatureGenerator.java @@ -3,7 +3,6 @@ import org.monarchinitiative.phenopacket2prompt.model.OntologyTerm; import java.util.List; -import java.util.function.Predicate; public interface PpktPhenotypicFeatureGenerator { diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java 
b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java index 1c7f5ec..905bd5e 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/EnglishPromptGenerator.java @@ -8,13 +8,12 @@ import org.monarchinitiative.phenopacket2prompt.output.*; import java.util.*; -import java.util.function.Predicate; public class EnglishPromptGenerator implements PromptGenerator { private final Ontology hpo; - private final PhenopacketAgeSexGenerator ppktAgeGenerator; + private final PhenopacketIndividualInformationGenerator ppktAgeGenerator; private final PhenopacketTextGenerator ppktTextGenerator; @@ -23,7 +22,7 @@ public class EnglishPromptGenerator implements PromptGenerator { public EnglishPromptGenerator(Ontology hpo){ this.hpo = hpo; - ppktAgeGenerator = new PpktAgeSexEnglish(); + ppktAgeGenerator = new PpktIndividualEnglish(); ppktTextGenerator = new PpktTextEnglish(); this.ppktPhenotypicFeatureGenerator = new PpktPhenotypicfeatureEnglish(); } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktAgeSexEnglish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktIndividualEnglish.java similarity index 98% rename from src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktAgeSexEnglish.java rename to src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktIndividualEnglish.java index 43e1e0b..75bac66 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktAgeSexEnglish.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/english/PpktIndividualEnglish.java @@ -2,15 +2,15 @@ import org.monarchinitiative.phenol.base.PhenolRuntimeException; import org.monarchinitiative.phenopacket2prompt.model.*; -import 
org.monarchinitiative.phenopacket2prompt.output.PhenopacketAgeSexGenerator; +import org.monarchinitiative.phenopacket2prompt.output.PhenopacketIndividualInformationGenerator; import java.util.ArrayList; import java.util.List; import java.util.Optional; -public class PpktAgeSexEnglish implements PhenopacketAgeSexGenerator { +public class PpktIndividualEnglish implements PhenopacketIndividualInformationGenerator { - public PpktAgeSexEnglish() { + public PpktIndividualEnglish() { } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktAgeSexSpanish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktAgeSexSpanish.java deleted file mode 100644 index 896f273..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktAgeSexSpanish.java +++ /dev/null @@ -1,287 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.output.impl.spanish; - -import org.monarchinitiative.phenopacket2prompt.model.*; -import org.monarchinitiative.phenopacket2prompt.output.IndividualInformation; -import org.monarchinitiative.phenopacket2prompt.output.PhenopacketAgeSexGenerator; - -import java.util.Optional; - -public class PpktAgeSexSpanish implements PhenopacketAgeSexGenerator { - - - public PpktAgeSexSpanish() { - } - - - - public IndividualInformation getInformation(PpktIndividual individual) { - PhenopacketSex psex = individual.getSex(); - String ageSexAtLastExam = ageAndSexAtLastExamination(individual); - - - - return new IndividualInformation(psex, ageSexAtLastExam); - } - - - /** - * Equivalent of "The clinical - * @param individual - * @return - */ - public String ageAndSexAtOnset(PpktIndividual individual) { - Optional ageOpt = individual.getAgeAtOnset(); - return ""; - } - - - - - public String ageAndSexAtLastExamination(PpktIndividual individual) { - PhenopacketSex psex = individual.getSex(); - Optional ageOpt = individual.getAgeAtLastExamination(); - if (ageOpt.isEmpty()) { - 
ageOpt = individual.getAgeAtOnset(); - } - String sex; - switch (psex) { - case FEMALE -> sex = "una paciente femenina"; - case MALE -> sex = "un paciente masculino"; - default -> sex = "una persona"; - }; - - if (ageOpt.isEmpty()) { - return sex; - } - PhenopacketAge age = ageOpt.get(); - if (age.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { - Iso8601Age isoage = (Iso8601Age) age; - int y = isoage.getYears(); - int m = isoage.getMonths(); - int d = isoage.getDays(); - if (psex.equals(PhenopacketSex.FEMALE)) { - if (y > 17) { - return String.format("una mujer de %d años", y); - } else if (y > 9) { - return String.format("una adolescente de %d años", y); - - } else if (y > 0) { - return String.format("una niña de %d años", y); - } else if (m>0) { - return String.format("una bebe niña de %d meses", m); - } else { - return String.format("una recien nacida %d meses", d); - } - } - } else { - // age is an HPO onset term, we do not have an exact date - } - if (age.isChild()) { - return switch (psex) { - case FEMALE -> "una niña"; - case MALE -> "un niño"; - default -> "un niño"; // difficult to be gender neutral - }; - } else if (age.isCongenital()) { - return switch (psex) { - case FEMALE -> "una recien nacida"; - case MALE -> "un recien nacido"; - default -> "un recien nacido"; - }; - } else if (age.isFetus()) { - return switch (psex) { - case FEMALE -> "un feto femenino"; - case MALE -> "un feto masculino"; - default -> "un feto"; - }; - } else if (age.isInfant()) { - return switch (psex) { - case FEMALE -> "un bebé femenino"; - case MALE -> "un bebé masculino"; - default -> "un bebé"; - }; - } else { - return switch (psex) { - case FEMALE -> "un mujer"; - case MALE -> "un hombre"; - default -> "una persona adulta"; - }; - } - } - - - private String individualName(PpktIndividual individual) { - PhenopacketSex psex = individual.getSex(); - Optional ageOpt = individual.getAgeAtLastExamination(); - if (ageOpt.isEmpty()) { - ageOpt = individual.getAgeAtOnset(); 
- } - if (ageOpt.isEmpty()) { - return switch (psex) { - case FEMALE -> "female"; - case MALE -> "male"; - default -> "individual"; - }; - } - PhenopacketAge age = ageOpt.get();; - if (age.isChild()) { - return switch (psex) { - case FEMALE -> "girl"; - case MALE -> "boy"; - default -> "child"; - }; - } else if (age.isCongenital()) { - return switch (psex) { - case FEMALE -> "female newborn"; - case MALE -> "male newborn"; - default -> "newborn"; - }; - } else if (age.isFetus()) { - return switch (psex) { - case FEMALE -> "female fetus"; - case MALE -> "male fetus"; - default -> "fetus"; - }; - } else if (age.isInfant()) { - return switch (psex) { - case FEMALE -> "female infant"; - case MALE -> "male infant"; - default -> "infant"; - }; - } else { - return switch (psex) { - case FEMALE -> "woman"; - case MALE -> "man"; - default -> "individual"; - }; - } - } - - - /* @Override - public String individualWithAge(PhenopacketAge ppktAge) { - if (ppktAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { - return ppktAge.age() + " old"; - } else if (ppktAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { - String label = ppktAge.age(); // something like "Infantile onset" - return switch (label) { - case "Infantile onset" -> "bebé"; - case "Childhood onset" -> "niño"; - case "Neonatal onset" -> "neonate"; - case "Congenital onset" -> "recién nacido"; - case "Adult onset" -> "adulto"; - default-> String.format("During the %s", label.replace(" onset", "")); - }; - } else { - return ""; // should never get here - } - } -*/ - - private String atIsoAgeExact(PhenopacketAge ppktAge) { - Iso8601Age iso8601Age = (Iso8601Age) ppktAge; - int y = iso8601Age.getYears(); - int m = iso8601Age.getMonths(); - int d = iso8601Age.getDays(); - - if (y > 10) { - return String.format("%d años", y); - } else if (y > 0) { - if (m > 1) { - return String.format("%d años y %d meses", y, m); - } else if (m == 1) { - return String.format("%d años y un mes", y); - } else { - return 
String.format("%d años", y); - } - } else if (m>0) { - return String.format("%d meses y %d días", m, d); - } else { - return String.format("%d días", d); - } - } - - - @Override - public String getIndividualDescription(PpktIndividual individual) { - return ""; - } - - @Override - public String heSheIndividual(PhenopacketSex psex) { - return switch (psex) { - case FEMALE -> "el"; - case MALE -> "ella"; - default -> "la persona"; - }; - } - - @Override - public String atAge(PhenopacketAge ppktAge) { - if (ppktAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { - return "A la edad de " + atIsoAgeExact(ppktAge); - } else if (ppktAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { - String label = ppktAge.age(); // something like "Infantile onset" - return switch (label) { - case "Infantile onset" -> "Durante el periodo infantil"; - case "Childhood onset" -> "Durante la infancia"; - case "Neonatal onset" -> "Durante el periodo neonatal"; - case "Congenital onset" -> "Al nacer"; - case "Adult onset" -> "Como adulto"; - default-> String.format("Durante el %s periodo", label.replace(" onset", "")); - }; - } else { - return ""; // should never get here - } - } - - // @Override - public String ppktSex(PpktIndividual individual) { - PhenopacketSex psex = individual.getSex(); - Optional ageOpt = individual.getAgeAtLastExamination(); - if (ageOpt.isEmpty()) { - ageOpt = individual.getAgeAtOnset(); - } - if (ageOpt.isEmpty()) { - return switch (psex) { - case FEMALE -> "female"; - case MALE -> "male"; - default -> "individual"; - }; - } - PhenopacketAge age = ageOpt.get();; - if (age.isChild()) { - return switch (psex) { - case FEMALE -> "girl"; - case MALE -> "boy"; - default -> "child"; - }; - } else if (age.isCongenital()) { - return switch (psex) { - case FEMALE -> "female newborn"; - case MALE -> "male newborn"; - default -> "newborn"; - }; - } else if (age.isFetus()) { - return switch (psex) { - case FEMALE -> "female fetus"; - case MALE -> "male 
fetus"; - default -> "fetus"; - }; - } else if (age.isInfant()) { - return switch (psex) { - case FEMALE -> "female infant"; - case MALE -> "male infant"; - default -> "infant"; - }; - } else { - return switch (psex) { - case FEMALE -> "woman"; - case MALE -> "man"; - default -> "individual"; - }; - } - } - - -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktIndividualSpanish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktIndividualSpanish.java new file mode 100644 index 0000000..dadeedc --- /dev/null +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktIndividualSpanish.java @@ -0,0 +1,524 @@ +package org.monarchinitiative.phenopacket2prompt.output.impl.spanish; + +import org.monarchinitiative.phenol.base.PhenolRuntimeException; +import org.monarchinitiative.phenopacket2prompt.model.*; +import org.monarchinitiative.phenopacket2prompt.output.PhenopacketIndividualInformationGenerator; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public class PpktIndividualSpanish implements PhenopacketIndividualInformationGenerator { + + + /** + * Equivalent of "The clinical + * @param individual + * @return + */ + public String ageAndSexAtOnset(PpktIndividual individual) { + Optional ageOpt = individual.getAgeAtOnset(); + return ""; + } + + + + + public String ageAndSexAtLastExamination(PpktIndividual individual) { + PhenopacketSex psex = individual.getSex(); + Optional ageOpt = individual.getAgeAtLastExamination(); + if (ageOpt.isEmpty()) { + ageOpt = individual.getAgeAtOnset(); + } + String sex; + switch (psex) { + case FEMALE -> sex = "una paciente femenina"; + case MALE -> sex = "un paciente masculino"; + default -> sex = "una persona"; + }; + + if (ageOpt.isEmpty()) { + return sex; + } + PhenopacketAge age = ageOpt.get(); + if (age.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoage = (Iso8601Age) 
age; + int y = isoage.getYears(); + int m = isoage.getMonths(); + int d = isoage.getDays(); + if (psex.equals(PhenopacketSex.FEMALE)) { + if (y > 17) { + return String.format("una mujer de %d años", y); + } else if (y > 9) { + return String.format("una adolescente de %d años", y); + + } else if (y > 0) { + return String.format("una niña de %d años", y); + } else if (m>0) { + return String.format("una bebe niña de %d meses", m); + } else { + return String.format("una recien nacida %d meses", d); + } + } + } else { + // age is an HPO onset term, we do not have an exact date + } + if (age.isChild()) { + return switch (psex) { + case FEMALE -> "una niña"; + case MALE -> "un niño"; + default -> "un niño"; // difficult to be gender neutral + }; + } else if (age.isCongenital()) { + return switch (psex) { + case FEMALE -> "una recien nacida"; + case MALE -> "un recien nacido"; + default -> "un recien nacido"; + }; + } else if (age.isFetus()) { + return switch (psex) { + case FEMALE -> "un feto femenino"; + case MALE -> "un feto masculino"; + default -> "un feto"; + }; + } else if (age.isInfant()) { + return switch (psex) { + case FEMALE -> "un bebé femenino"; + case MALE -> "un bebé masculino"; + default -> "un bebé"; + }; + } else { + return switch (psex) { + case FEMALE -> "un mujer"; + case MALE -> "un hombre"; + default -> "una persona adulta"; + }; + } + } + + + private String individualName(PpktIndividual individual) { + PhenopacketSex psex = individual.getSex(); + Optional ageOpt = individual.getAgeAtLastExamination(); + if (ageOpt.isEmpty()) { + ageOpt = individual.getAgeAtOnset(); + } + if (ageOpt.isEmpty()) { + return switch (psex) { + case FEMALE -> "female"; + case MALE -> "male"; + default -> "individual"; + }; + } + PhenopacketAge age = ageOpt.get();; + if (age.isChild()) { + return switch (psex) { + case FEMALE -> "girl"; + case MALE -> "boy"; + default -> "child"; + }; + } else if (age.isCongenital()) { + return switch (psex) { + case FEMALE -> "female 
newborn"; + case MALE -> "male newborn"; + default -> "newborn"; + }; + } else if (age.isFetus()) { + return switch (psex) { + case FEMALE -> "female fetus"; + case MALE -> "male fetus"; + default -> "fetus"; + }; + } else if (age.isInfant()) { + return switch (psex) { + case FEMALE -> "female infant"; + case MALE -> "male infant"; + default -> "infant"; + }; + } else { + return switch (psex) { + case FEMALE -> "woman"; + case MALE -> "man"; + default -> "individual"; + }; + } + } + + + /* @Override + public String individualWithAge(PhenopacketAge ppktAge) { + if (ppktAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + return ppktAge.age() + " old"; + } else if (ppktAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + String label = ppktAge.age(); // something like "Infantile onset" + return switch (label) { + case "Infantile onset" -> "bebé"; + case "Childhood onset" -> "niño"; + case "Neonatal onset" -> "neonate"; + case "Congenital onset" -> "recién nacido"; + case "Adult onset" -> "adulto"; + default-> String.format("During the %s", label.replace(" onset", "")); + }; + } else { + return ""; // should never get here + } + } +*/ + + private String atIsoAgeExact(PhenopacketAge ppktAge) { + Iso8601Age iso8601Age = (Iso8601Age) ppktAge; + int y = iso8601Age.getYears(); + int m = iso8601Age.getMonths(); + int d = iso8601Age.getDays(); + + if (y > 10) { + return String.format("%d años", y); + } else if (y > 0) { + if (m > 1) { + return String.format("%d años y %d meses", y, m); + } else if (m == 1) { + return String.format("%d años y un mes", y); + } else { + return String.format("%d años", y); + } + } else if (m>0) { + return String.format("%d meses y %d días", m, d); + } else { + return String.format("%d días", d); + } + } + + + @Override + public String getIndividualDescription(PpktIndividual individual) { + Optional lastExamOpt = individual.getAgeAtLastExamination(); + Optional onsetOpt = individual.getAgeAtOnset(); + PhenopacketSex psex = 
individual.getSex(); + if (lastExamOpt.isPresent() && onsetOpt.isPresent()) { + return onsetAndLastEncounterAvailable(psex, lastExamOpt.get(), onsetOpt.get()); + } else if (lastExamOpt.isPresent()) { + return lastEncounterAvailable(psex, lastExamOpt.get()); + } else if (onsetOpt.isPresent()) { + return onsetAvailable(psex, onsetOpt.get()); + } else { + return ageNotAvailable(psex); + } + } + + + private String iso8601ToYearMonth(Iso8601Age iso8601Age) { + if (iso8601Age.getMonths() == 0) { + return String.format("de %d años", iso8601Age.getYears()); + } else { + return String.format("de %d años y %d meses", iso8601Age.getYears(), iso8601Age.getMonths()); + } + } + + private String iso8601ToMonthDay(Iso8601Age iso8601Age) { + int m = iso8601Age.getMonths(); + int d = iso8601Age.getDays(); + if (m == 0) { + return String.format("de %d dias", d); + } else if (d>0){ + return String.format("de %d meses y %d dias", m, d); + } else { + return String.format("de %d meses", m); + } + } + + /** + * Create a phrase such as "at the age of 7 years, 4 months, and 2 days" + * Leave out the months and days if they are zero. 
+ * @param isoAge + * @return + */ + private String iso8601AtAgeOf(Iso8601Age isoAge) { + List components = new ArrayList<>(); + + if (isoAge.getYears()>1) { + components.add(String.format("%d years", isoAge.getYears())); + } else if (isoAge.getYears() == 1) { + components.add("1 year"); + } + if (isoAge.getMonths() > 1) { + components.add(String.format("%d months", isoAge.getMonths())); + } else if (isoAge.getMonths() == 1) { + components.add("1 month"); + } + if (isoAge.getDays()>1) { + components.add(String.format("%d days", isoAge.getDays())); + } else if (isoAge.getDays()==1) { + components.add("1 day"); + } + if (components.isEmpty()) { + return "as a newborn"; + } else if (components.size() == 1) { + return "at the age of " + components.get(0); + } else if (components.size() == 2) { + return "at the age of " + components.get(0) + " and " + components.get(1); + } else { + return "at the age of " + components.get(0) + "m " + components.get(1) + + ", and " + components.get(2); + } + } + + private String onsetTermAtAgeOf(HpoOnsetAge hpoOnsetTermAge) { + if (hpoOnsetTermAge.isFetus()) { + return "en el periodo fetal"; + } else if (hpoOnsetTermAge.isCongenital()) { + return "en el periodo neonatal"; + } else if (hpoOnsetTermAge.isInfant()) { + return "como un bebe"; + } else if (hpoOnsetTermAge.isChild()) { + return "en la niñez"; + } else if (hpoOnsetTermAge.isJuvenile()) { + return "como adolescente"; + } else { + return "en la edad adulta"; + } + } + + + private String iso8601individualDescription(PhenopacketSex psex, Iso8601Age iso8601Age) { + int y = iso8601Age.getYears(); + int m = iso8601Age.getMonths(); + int d = iso8601Age.getDays(); + // if older + if (y>17) { + return switch (psex) { + case FEMALE -> String.format("mujer de %d años", y); + case MALE -> String.format("hombre de %d años", y); + default -> String.format("persona de %d años", y); + }; + } else if (y>9) { + return switch (psex) { + case FEMALE -> String.format("una adolescente de %d años", 
y); + case MALE -> String.format("un adolescente de %d años", y); + default -> String.format("un adolescente de %d años", y); + }; + } else if (y>0) { + return switch (psex) { + case FEMALE -> String.format("niña %s", iso8601ToYearMonth(iso8601Age)); + case MALE -> String.format("niño %s", iso8601ToYearMonth(iso8601Age)); + default -> String.format("niño %s", iso8601ToYearMonth(iso8601Age)); + }; + } else if (m>0 || d> 0) { + return switch (psex) { + case FEMALE -> String.format("una infante %s", iso8601ToMonthDay(iso8601Age)); + case MALE -> String.format("un infante %s", iso8601ToMonthDay(iso8601Age)); + default -> String.format("un infante %s", iso8601ToMonthDay(iso8601Age)); + }; + } else { + return switch (psex) { + case FEMALE -> "recien nacida girl"; + case MALE -> "recien nacido"; + default -> "recien nacido"; + }; + } + } + + private String hpoOnsetIndividualDescription(PhenopacketSex psex, HpoOnsetAge hpoOnsetTermAge) { + if (hpoOnsetTermAge.isFetus()) { + return switch (psex) { + case FEMALE -> "female fetus"; + case MALE -> "male fetus"; + default -> "fetus"; + }; + } else if (hpoOnsetTermAge.isCongenital()) { + return switch (psex) { + case FEMALE -> "female newborn"; + case MALE -> "male newborn"; + default -> "newborn"; + }; + } else if (hpoOnsetTermAge.isInfant()) { + return switch (psex) { + case FEMALE -> "female infant"; + case MALE -> "male infant"; + default -> "infant"; + }; + } else if (hpoOnsetTermAge.isChild()) { + return switch (psex) { + case FEMALE -> "girl"; + case MALE -> "boy"; + default -> "child"; + }; + } else if (hpoOnsetTermAge.isJuvenile()) { + return switch (psex) { + case FEMALE -> "female adolescent"; + case MALE -> "male adolescent"; + default -> "adolescent"; + }; + }else { + return switch (psex) { + case FEMALE -> "woman"; + case MALE -> "man"; + default -> "adult"; + }; + } + } + + /** + * A sentence such as The proband was a 39-year old woman who presented at the age of 12 years with + * HPO1, HPO2, and HPO3. 
HPO4 and HPO5 were excluded. This method returns the phrase that ends with "with" + * El sujeto era un niño de 1 año y 10 meses que se presentó como recién nacido con un filtrum largo. + * @param psex + * @param lastExamAge + * @param onsetAge + * @return + */ + private String onsetAndLastEncounterAvailable(PhenopacketSex psex, PhenopacketAge lastExamAge, PhenopacketAge onsetAge) { + String individualDescription; + String onsetDescription; + if (lastExamAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoAge = (Iso8601Age) lastExamAge; + individualDescription = iso8601individualDescription(psex, isoAge); + } else if (lastExamAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge hpoOnsetTermAge = (HpoOnsetAge) lastExamAge; + individualDescription = hpoOnsetIndividualDescription(psex,hpoOnsetTermAge); + } else { + // should never happen + throw new PhenolRuntimeException("Did not recognize last exam age type " + lastExamAge.ageType()); + } + if (onsetAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoAge = (Iso8601Age) onsetAge; + onsetDescription = iso8601AtAgeOf(isoAge); + } else if (onsetAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge hpoOnsetTermAge = (HpoOnsetAge) onsetAge; + onsetDescription = onsetTermAtAgeOf(hpoOnsetTermAge); + } else { + // should never happen + throw new PhenolRuntimeException("Did not recognize onset age type " + onsetAge.ageType()); + } + return String.format("El sujeto era %s que se presentó %s con", individualDescription, onsetDescription); + } + + + /** + * Age at last examination available but age of onset not available + * The proband was a 39-year old woman who presented with HPO1, HPO2, and HPO3. HPO4 and HPO5 were excluded. 
+ * @param psex + * @param lastExamAge + */ + private String lastEncounterAvailable(PhenopacketSex psex, PhenopacketAge lastExamAge) { + String individualDescription; + if (lastExamAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoAge = (Iso8601Age) lastExamAge; + individualDescription = iso8601individualDescription(psex, isoAge); + } else if (lastExamAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge hpoOnsetTermAge = (HpoOnsetAge) lastExamAge; + individualDescription = hpoOnsetIndividualDescription(psex,hpoOnsetTermAge); + } else { + // should never happen + throw new PhenolRuntimeException("Did not recognize last exam age type " + lastExamAge.ageType()); + } + return String.format("The proband was a %s who presented with", individualDescription); + } + + /** + * Age at last examination not available but age of onset available + * The proband presented at the age of 12 years with HPO1, HPO2, and HPO3. HPO4 and HPO5 were excluded. + * @param psex + * @param onsetAge + * @return + */ + private String onsetAvailable(PhenopacketSex psex, PhenopacketAge onsetAge) { + String onsetDescription; + if (onsetAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + Iso8601Age isoAge = (Iso8601Age) onsetAge; + onsetDescription = iso8601AtAgeOf(isoAge); + } else if (onsetAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + HpoOnsetAge hpoOnsetTermAge = (HpoOnsetAge) onsetAge; + onsetDescription = onsetTermAtAgeOf(hpoOnsetTermAge); + } else { + // should never happen + throw new PhenolRuntimeException("Did not recognize onset age type " + onsetAge.ageType()); + } + return String.format("The proband presented %s with", onsetDescription, onsetDescription); + } + + private String ageNotAvailable(PhenopacketSex psex) { + return switch (psex) { + case FEMALE -> "The proband was a female who presented with"; + case MALE -> "The proband was a male who presented with"; + default -> "The proband presented with"; + }; 
+ } + + @Override + public String heSheIndividual(PhenopacketSex psex) { + return switch (psex) { + case FEMALE -> "el"; + case MALE -> "ella"; + default -> "la persona"; + }; + } + + @Override + public String atAge(PhenopacketAge ppktAge) { + if (ppktAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { + return "A la edad de " + atIsoAgeExact(ppktAge); + } else if (ppktAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { + String label = ppktAge.age(); // something like "Infantile onset" + return switch (label) { + case "Infantile onset" -> "Durante el periodo infantil"; + case "Childhood onset" -> "Durante la infancia"; + case "Neonatal onset" -> "Durante el periodo neonatal"; + case "Congenital onset" -> "Al nacer"; + case "Adult onset" -> "Como adulto"; + default-> String.format("Durante el %s periodo", label.replace(" onset", "")); + }; + } else { + return ""; // should never get here + } + } + + // @Override + public String ppktSex(PpktIndividual individual) { + PhenopacketSex psex = individual.getSex(); + Optional ageOpt = individual.getAgeAtLastExamination(); + if (ageOpt.isEmpty()) { + ageOpt = individual.getAgeAtOnset(); + } + if (ageOpt.isEmpty()) { + return switch (psex) { + case FEMALE -> "female"; + case MALE -> "male"; + default -> "individual"; + }; + } + PhenopacketAge age = ageOpt.get();; + if (age.isChild()) { + return switch (psex) { + case FEMALE -> "girl"; + case MALE -> "boy"; + default -> "child"; + }; + } else if (age.isCongenital()) { + return switch (psex) { + case FEMALE -> "female newborn"; + case MALE -> "male newborn"; + default -> "newborn"; + }; + } else if (age.isFetus()) { + return switch (psex) { + case FEMALE -> "female fetus"; + case MALE -> "male fetus"; + default -> "fetus"; + }; + } else if (age.isInfant()) { + return switch (psex) { + case FEMALE -> "female infant"; + case MALE -> "male infant"; + default -> "infant"; + }; + } else { + return switch (psex) { + case FEMALE -> "woman"; + case MALE -> "man"; 
+ default -> "individual"; + }; + } + } + + +} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java index f5c9bb8..0be3121 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktPhenotypicfeatureSpanish.java @@ -7,6 +7,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Optional; +import java.util.Set; import java.util.function.Predicate; public class PpktPhenotypicfeatureSpanish implements PpktPhenotypicFeatureGenerator { @@ -33,24 +34,57 @@ private List getTranslations(List ontologyTerms) { } + private final Set vowels = Set.of('A', 'E', 'I', 'O', 'U', 'Y'); - public String featureList(List ontologyTerms) { - List terms = ontologyTerms.stream() - .filter(Predicate.not(OntologyTerm::isExcluded)).toList(); - List labels = getTranslations(terms); - return ""; //;//getOxfordCommaList(labels, "y"); - } - - - public String excludedFeatureList(List ontologyTerms) { - List terms = ontologyTerms.stream() - .filter(OntologyTerm::isExcluded).toList(); - List labels = getTranslations(terms); - return ""; //;//getOxfordCommaList(labels, "y"); + private String getOxfordCommaList(List items) { + if (items.size() == 1) { + return items.get(0); + } + if (items.size() == 2) { + // no comma if we just have two items. 
+ // one item will work with the below code + return String.join(" and ", items); + } + String symList = String.join(", ", items); + int jj = symList.lastIndexOf(", "); + if (jj > 0) { + String end = symList.substring(jj+2); + if (vowels.contains(end.charAt(0))) { + symList = symList.substring(0, jj) + " i " + end; + } else { + symList = symList.substring(0, jj) + " y " + end; + } + } + return symList; } @Override public String formatFeatures(List ontologyTerms) { - return ""; + List observedTerms = ontologyTerms.stream() + .filter(Predicate.not(OntologyTerm::isExcluded)) + .toList(); + List observedLabels = getTranslations(observedTerms); + List excludedTerms = ontologyTerms.stream() + .filter(OntologyTerm::isExcluded).toList(); + List excludedLabels = getTranslations(excludedTerms); + if (observedLabels.isEmpty() && excludedLabels.isEmpty()) { + return "no phenotypic abnormalities"; // should never happen, actually! + } else if (excludedLabels.isEmpty()) { + return getOxfordCommaList(observedLabels) + ". 
"; + } else if (observedLabels.isEmpty()) { + if (excludedLabels.size() > 1) { + return String.format("por lo que se excluyeron %s.", getOxfordCommaList(excludedLabels)); + } else { + return String.format("por lo que %s fue excluido.",excludedLabels.get(0)); + } + } else { + String exclusion; + if (excludedLabels.size() == 1) { + exclusion = String.format(" y se excluyó %s.", getOxfordCommaList(excludedLabels)); + } else { + exclusion = String.format(" y se excluyeron %s.", getOxfordCommaList(excludedLabels)); + } + return getOxfordCommaList(observedLabels) + exclusion; + } } } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/SpanishPromptGenerator.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/SpanishPromptGenerator.java index 18f9e75..ee48aee 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/SpanishPromptGenerator.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/SpanishPromptGenerator.java @@ -14,7 +14,7 @@ public class SpanishPromptGenerator implements PromptGenerator { private final Ontology hpo; - private final PhenopacketAgeSexGenerator ppktAgeSexGenerator; + private final PhenopacketIndividualInformationGenerator ppktAgeSexGenerator; private final PhenopacketTextGenerator ppktTextGenerator; @@ -24,7 +24,7 @@ public class SpanishPromptGenerator implements PromptGenerator { public SpanishPromptGenerator(Ontology hpo, PpktPhenotypicFeatureGenerator pfgen) { this.hpo = hpo; - ppktAgeSexGenerator = new PpktAgeSexSpanish(); + ppktAgeSexGenerator = new PpktIndividualSpanish(); ppktTextGenerator = new PpktTextSpanish(); this.ppktPhenotypicFeatureGenerator = pfgen; } @@ -36,42 +36,25 @@ public String queryHeader() { @Override public String getIndividualInformation(PpktIndividual ppktIndividual) { - StringBuilder sb = new StringBuilder(); - /* String sex = sexGenerator.ppktSex(ppktIndividual); - Optional lastAgeOpt = 
ppktIndividual.getAgeAtLastExamination(); - Optional onsetOpt = ppktIndividual.getAgeAtOnset(); - if (lastAgeOpt.isPresent()) { - PhenopacketAge lastExamAge = lastAgeOpt.get(); - String examAge = ppktAgeSexGenerator.age(lastExamAge); - sb.append("El probando era un ").append(examAge).append( " ").append(sex).append(". "); - } else { - sb.append("El probando era un ").append(sex).append(". "); - } - if (onsetOpt.isPresent()) { - PhenopacketAge onsetAge = onsetOpt.get(); - String onset = ppktAgeSexGenerator.age(onsetAge); - sb.append("Las manifestaciones iniciales de la enfermedad aparecieron cuando el probando era ").append(onset).append(". "); - }*/ - return sb.toString(); + return this.ppktAgeSexGenerator.getIndividualDescription(ppktIndividual); } @Override public String formatFeatures(List ontologyTerms) { - return ""; + return ppktPhenotypicFeatureGenerator.formatFeatures(ontologyTerms); } @Override public String getVignetteAtAge(PhenopacketAge page, PhenopacketSex psex, List terms) { - return ""; + String ageString = this.ppktAgeSexGenerator.atAge(page); + String features = formatFeatures(terms); + return String.format("%s, %s presentó %s", ageString, ppktAgeSexGenerator.heSheIndividual(psex), features); } - @Override - public String createPrompt(PpktIndividual individual) { - return ""; - } + } From 95e49b32525bfb534c54a0b39968e5dcc08986b2 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Fri, 26 Apr 2024 12:21:42 +0200 Subject: [PATCH 3/6] documentation --- .github/workflows/documentation.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index d8fdef1..f5806fc 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -2,7 +2,7 @@ name: mkdocs-generation on: push: branches: - - [main, develop] + - develop permissions: contents: write jobs: @@ -20,6 +20,6 @@ jobs: key: ${{ github.ref }} path: .cache - - run: python3 -m pip 
install .[docs] + - run: python3 -m pip install mkdocs-material[imaging],mkdocs-material-extensions,mkdocstrings[python],pillow,cairosvg - run: mkdocs gh-deploy --force From bc7356d7970b344df889e57c5081c84be3263144 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Fri, 26 Apr 2024 17:22:47 +0200 Subject: [PATCH 4/6] batch phenoopacket creation --- .github/workflows/documentation.yml | 7 +- .../phenopacket2prompt/Main.java | 6 +- .../cmd/GbtTranslateBatchCommand.java | 137 +++++++++++++ .../cmd/GptTranslateCommand.java | 2 +- .../cmd/OntoGptCommand.java | 85 -------- .../model/PpktIndividual.java | 2 +- .../impl/spanish/PpktIndividualSpanish.java | 94 ++------- .../nejm/DehyphenizerTest.java | 39 ---- .../querygen/TimeBasedFatoryTest.java | 29 --- .../querygen/TimePointParserTest.java | 192 ------------------ 10 files changed, 162 insertions(+), 431 deletions(-) create mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java delete mode 100644 src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/OntoGptCommand.java delete mode 100644 src/test/java/org/monarchinitiative/phenopacket2prompt/nejm/DehyphenizerTest.java delete mode 100644 src/test/java/org/monarchinitiative/phenopacket2prompt/querygen/TimeBasedFatoryTest.java delete mode 100644 src/test/java/org/monarchinitiative/phenopacket2prompt/querygen/TimePointParserTest.java diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index f5806fc..36fefba 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -20,6 +20,9 @@ jobs: key: ${{ github.ref }} path: .cache - - run: python3 -m pip install mkdocs-material[imaging],mkdocs-material-extensions,mkdocstrings[python],pillow,cairosvg - + - run: pip install mkdocs-material + - run: pip install mkdocs-material[imaging] + - run: pip install mkdocs-material-extensions + - run: pip install pillow cairosvg + - run: pip install mkdocstrings[python] - run: 
mkdocs gh-deploy --force diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java index e1856b0..9bd95b4 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java @@ -15,8 +15,8 @@ public static void main(String[] args){ args = new String[]{"-h"}; } CommandLine cline = new CommandLine(new Main()) + .addSubcommand("batch", new GbtTranslateBatchCommand()) .addSubcommand("download", new DownloadCommand()) - .addSubcommand("gpt", new OntoGptCommand()) .addSubcommand("translate", new GptTranslateCommand()) ; cline.setToggleBooleanFlags(false); @@ -30,4 +30,8 @@ public Integer call() { // work done in subcommands return 0; } + + + + } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java new file mode 100644 index 0000000..c9c1aba --- /dev/null +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java @@ -0,0 +1,137 @@ +package org.monarchinitiative.phenopacket2prompt.cmd; + + +import org.monarchinitiative.phenol.base.PhenolRuntimeException; +import org.monarchinitiative.phenol.io.OntologyLoader; +import org.monarchinitiative.phenol.ontology.data.Ontology; +import org.monarchinitiative.phenopacket2prompt.model.PhenopacketDisease; +import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; +import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import picocli.CommandLine; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; + 
+@CommandLine.Command(name = "batch", aliases = {"B"}, + mixinStandardHelpOptions = true, + description = "Translate batch of phenopackets and output prompts") +public class GbtTranslateBatchCommand implements Callable { + private final static Logger LOGGER = LoggerFactory.getLogger(GbtTranslateBatchCommand.class); + + + @CommandLine.Option(names = {"--hp"}, + description = "path to HP json file") + private String hpoJsonPath = "data/hp.json"; + + @CommandLine.Option(names = {"--translations"}, + description = "path to translations file") + private String translationsPath = "data/hp-international.obo"; + + @CommandLine.Option(names = {"-d", "--dir"}, description = "Path to directory with JSON phenopacket files", required = true) + private String ppktDir; + + @Override + public Integer call() throws Exception { + File hpJsonFile = new File(hpoJsonPath); + if (! hpJsonFile.isFile()) { + throw new PhenolRuntimeException("Could not find hp.json at " + hpJsonFile.getAbsolutePath()); + } + Ontology hpo = OntologyLoader.loadOntology(hpJsonFile); + LOGGER.info("HPO version {}", hpo.version().orElse("n/a")); + List ppktFiles = getAllPhenopacketJsonFiles(); + createDir("prompts"); + outputPromptsEnglish(ppktFiles, hpo); + return 0; + } + + + + private String getFileName(String phenopacketID) { + return phenopacketID.replaceAll("[^\\w]", phenopacketID).replaceAll("/","_") + "-prompt.txt"; + } + + + private void outputPromptsEnglish(List ppktFiles, Ontology hpo) { + createDir("prompts/en"); + PromptGenerator generator = PromptGenerator.english(hpo); + List diagnosisList = new ArrayList<>(); + for (var f: ppktFiles) { + PpktIndividual individual = new PpktIndividual(f); + List diseaseList = individual.getDiseases(); + if (diseaseList.size() != 1) { + System.err.println(String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId())); + continue; + } + PhenopacketDisease pdisease = diseaseList.get(0); + String promptFileName = getFileName( 
individual.getPhenopacketId()); + String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), promptFileName, f.getAbsolutePath()); + try { + diagnosisList.add(diagnosisLine); + String prompt = generator.createPrompt(individual); + outputPrompt(prompt, promptFileName, "prompts/en"); + } catch (Exception e) { + e.printStackTrace(); + } + } + } + + + + private void outputPrompt(String prompt, String promptFileName, String dir) { + File outpath = new File(dir + File.separator + promptFileName); + try (BufferedWriter bw = new BufferedWriter(new FileWriter(outpath))) { + bw.write(prompt); + } catch (IOException e) { + e.printStackTrace(); + } + System.out.print("."); + } + + + + private void createDir(String path) { + File pathAsFile = new File(path); + if (!Files.exists(Paths.get(path))) { + pathAsFile.mkdir(); + } + } + + + + + + private List getAllPhenopacketJsonFiles() { + List ppktDirectories = new ArrayList<>(); + List ppktFiles = new ArrayList<>(); + File[] items = new File(this.ppktDir).listFiles(); + // We know that all phenopackets are located in the subdirectories + if (!ppktDir.substring(ppktDir.length() - 1).equals("/")) { + ppktDir += "/"; + } + for (File item : items) { + if (item.isDirectory()) + ppktDirectories.add(ppktDir+item.getName()); + } + for (var f: ppktDirectories) { + File subdir = new File(f); + File[] files = subdir.listFiles(); + for (var ff : files) { + if (ff.isFile() && ff.getAbsolutePath().endsWith(".json")) { + ppktFiles.add(ff); + } + } + } + System.out.printf("Retrieved %d files.\n", ppktFiles.size()); + return ppktFiles; + } + +} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java index 68cd748..88e311e 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java +++ 
b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GptTranslateCommand.java @@ -20,7 +20,7 @@ mixinStandardHelpOptions = true, description = "Translate phenopackets and output prompts") public class GptTranslateCommand implements Callable { - Logger LOGGER = LoggerFactory.getLogger(GptTranslateCommand.class); + private final static Logger LOGGER = LoggerFactory.getLogger(GptTranslateCommand.class); @CommandLine.Option(names = {"--hp"}, diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/OntoGptCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/OntoGptCommand.java deleted file mode 100644 index 2cd9ff0..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/OntoGptCommand.java +++ /dev/null @@ -1,85 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.cmd; - - -import org.monarchinitiative.phenol.io.OntologyLoader; -import org.monarchinitiative.phenol.ontology.data.Ontology; -import org.monarchinitiative.phenopacket2prompt.legacy.nejm.NejmCaseReportIngestor; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.PhenopacketFactoryIngestor; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.QueryOutputGenerator; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.QueryOutputType; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.QueryPromptFactory; -import picocli.CommandLine; - -import java.io.*; -import java.util.*; -import java.util.concurrent.Callable; - -import static org.monarchinitiative.phenopacket2prompt.legacy.querygen.QueryOutputType.*; - - -@CommandLine.Command(name = "gpt-time", aliases = {"G"}, - mixinStandardHelpOptions = true, - description = "Create GPT time-course prompt") -public class OntoGptCommand implements Callable { - @CommandLine.Option(names = {"-n", "--nejm"}, - required = true, - description = "path to directory with NEJM text files") - private String nejmDirectoryPath; - - @CommandLine.Option(names = {"--hp"}, - 
description = "path to HP json file") - private String hpoJsonPath = "data/hp.json"; - - @CommandLine.Option(names = {"-o", "--out"}, - description = "path to output dir (created if necessary)") - private String outDir = "gptOut"; - - @CommandLine.Option(names = {"-c", "--case"}, - description = "case ID (just analyze this case)" ) - private String targetCase = null; - - - - - @Override - public Integer call() { - // 1. Ingest the NEJM case report texts. Clean up the original text (PDF parse oddities) - // but otherwise leave the processing for subsequent steps - Ontology hpo = OntologyLoader.loadOntology(new File(hpoJsonPath)); - NejmCaseReportIngestor nejmIngestor = new NejmCaseReportIngestor(this.nejmDirectoryPath); - // If run with the --targetCase argument, just the targetCase is processed. - // if targetCase == null, that means we are processing all files - if (targetCase != null) { - nejmIngestor.restrictToTarget(targetCase); - } - // key: identifier of PMID; value - lines of text - Map> id2lines = nejmIngestor.getId2lines(); - System.out.printf("[INFO] Parsed %d cases.\n", id2lines.size()); - - // 2. Create factory objects from the above lines. The factory objects know how to create - // the various output - PhenopacketFactoryIngestor ppIngestor = new PhenopacketFactoryIngestor(id2lines, hpo); - Map id2timeCourseFactory = ppIngestor.getId2timeCourseFactory(); - System.out.printf("[INFO] Factory map has %d cases.\n", id2timeCourseFactory.size()); - System.out.printf("[INFO] We parsed %d cases.\n", id2lines.entrySet().size()); - - // CREATE THE OUTPUT DIRECTORIES IF NEEDED. - List outputTypes = List.of(TIME_BASED, - QC, - TEXT_WITHOUT_DISCUSSION, - TEXT_PLUS_MANUAL); - // CREATE THE OUTPUT DIRECTORIES IF NEEDED. 
- QueryOutputGenerator outputGenerator = new QueryOutputGenerator(outputTypes, outDir); - // output individual query prompts to the corresponding directories - int n_output = 0; - // OUTOUT THE QUERY FILES - for (var entry : id2timeCourseFactory.entrySet()) { - outputGenerator.outputEntry(entry.getKey(), entry.getValue()); - n_output++; - } - System.out.printf("We output %d cases.\n", n_output); - return 0; - } - - -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java index 03a674b..a00aff6 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/model/PpktIndividual.java @@ -140,7 +140,7 @@ private boolean agesEqual(PhenopacketAge ageOne, PhenopacketAge ageTwo) { public List getPhenotypicFeaturesAtOnset() { Optional opt = getAgeAtOnset(); if (opt.isEmpty()) { - return List.of(); // + return new ArrayList<>(); // } List onsetFeatures = new ArrayList<>(); PhenopacketAge onsetAge = opt.get(); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktIndividualSpanish.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktIndividualSpanish.java index dadeedc..2fd5a6b 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktIndividualSpanish.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/output/impl/spanish/PpktIndividualSpanish.java @@ -97,74 +97,6 @@ public String ageAndSexAtLastExamination(PpktIndividual individual) { } - private String individualName(PpktIndividual individual) { - PhenopacketSex psex = individual.getSex(); - Optional ageOpt = individual.getAgeAtLastExamination(); - if (ageOpt.isEmpty()) { - ageOpt = individual.getAgeAtOnset(); - } - if (ageOpt.isEmpty()) { - return switch (psex) { - case FEMALE -> "female"; - 
case MALE -> "male"; - default -> "individual"; - }; - } - PhenopacketAge age = ageOpt.get();; - if (age.isChild()) { - return switch (psex) { - case FEMALE -> "girl"; - case MALE -> "boy"; - default -> "child"; - }; - } else if (age.isCongenital()) { - return switch (psex) { - case FEMALE -> "female newborn"; - case MALE -> "male newborn"; - default -> "newborn"; - }; - } else if (age.isFetus()) { - return switch (psex) { - case FEMALE -> "female fetus"; - case MALE -> "male fetus"; - default -> "fetus"; - }; - } else if (age.isInfant()) { - return switch (psex) { - case FEMALE -> "female infant"; - case MALE -> "male infant"; - default -> "infant"; - }; - } else { - return switch (psex) { - case FEMALE -> "woman"; - case MALE -> "man"; - default -> "individual"; - }; - } - } - - - /* @Override - public String individualWithAge(PhenopacketAge ppktAge) { - if (ppktAge.ageType().equals(PhenopacketAgeType.ISO8601_AGE_TYPE)) { - return ppktAge.age() + " old"; - } else if (ppktAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { - String label = ppktAge.age(); // something like "Infantile onset" - return switch (label) { - case "Infantile onset" -> "bebé"; - case "Childhood onset" -> "niño"; - case "Neonatal onset" -> "neonate"; - case "Congenital onset" -> "recién nacido"; - case "Adult onset" -> "adulto"; - default-> String.format("During the %s", label.replace(" onset", "")); - }; - } else { - return ""; // should never get here - } - } -*/ - private String atIsoAgeExact(PhenopacketAge ppktAge) { Iso8601Age iso8601Age = (Iso8601Age) ppktAge; int y = iso8601Age.getYears(); @@ -236,29 +168,29 @@ private String iso8601AtAgeOf(Iso8601Age isoAge) { List components = new ArrayList<>(); if (isoAge.getYears()>1) { - components.add(String.format("%d years", isoAge.getYears())); + components.add(String.format("%d años", isoAge.getYears())); } else if (isoAge.getYears() == 1) { - components.add("1 year"); + components.add("1 año"); } if (isoAge.getMonths() > 1) { - 
components.add(String.format("%d months", isoAge.getMonths())); + components.add(String.format("%d meses", isoAge.getMonths())); } else if (isoAge.getMonths() == 1) { - components.add("1 month"); + components.add("1 mes"); } if (isoAge.getDays()>1) { - components.add(String.format("%d days", isoAge.getDays())); + components.add(String.format("%d dias", isoAge.getDays())); } else if (isoAge.getDays()==1) { - components.add("1 day"); + components.add("1 dia"); } if (components.isEmpty()) { return "as a newborn"; } else if (components.size() == 1) { return "at the age of " + components.get(0); } else if (components.size() == 2) { - return "at the age of " + components.get(0) + " and " + components.get(1); + return "a la edad de " + components.get(0) + " y " + components.get(1); } else { - return "at the age of " + components.get(0) + "m " + components.get(1) + - ", and " + components.get(2); + return "a la edad de " + components.get(0) + ", " + components.get(1) + + " y " + components.get(2); } } @@ -310,7 +242,7 @@ private String iso8601individualDescription(PhenopacketSex psex, Iso8601Age iso8 }; } else { return switch (psex) { - case FEMALE -> "recien nacida girl"; + case FEMALE -> "recien nacida"; case MALE -> "recien nacido"; default -> "recien nacido"; }; @@ -460,12 +392,12 @@ public String atAge(PhenopacketAge ppktAge) { } else if (ppktAge.ageType().equals(PhenopacketAgeType.HPO_ONSET_AGE_TYPE)) { String label = ppktAge.age(); // something like "Infantile onset" return switch (label) { - case "Infantile onset" -> "Durante el periodo infantil"; + case "Infantile onset" -> "Durante el período infantil"; case "Childhood onset" -> "Durante la infancia"; - case "Neonatal onset" -> "Durante el periodo neonatal"; + case "Neonatal onset" -> "Durante el período neonatal"; case "Congenital onset" -> "Al nacer"; case "Adult onset" -> "Como adulto"; - default-> String.format("Durante el %s periodo", label.replace(" onset", "")); + default-> String.format("Durante el %s 
período", label.replace(" onset", "")); }; } else { return ""; // should never get here diff --git a/src/test/java/org/monarchinitiative/phenopacket2prompt/nejm/DehyphenizerTest.java b/src/test/java/org/monarchinitiative/phenopacket2prompt/nejm/DehyphenizerTest.java deleted file mode 100644 index 0f2bff2..0000000 --- a/src/test/java/org/monarchinitiative/phenopacket2prompt/nejm/DehyphenizerTest.java +++ /dev/null @@ -1,39 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.nejm; - -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; -import org.monarchinitiative.phenopacket2prompt.legacy.nejm.Dehyphenizer; - -import java.util.ArrayList; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -public class DehyphenizerTest { - - private static List mylines; - - - @BeforeAll - public static void init() { - mylines = new ArrayList<>(); - mylines.add("hospital. On examination, the pulse was 104 beats per minute and the blood pres-"); - mylines.add("sure 128/80 mm Hg. "); - mylines.add("At the recommendation of his physicians, the patient stopped participating in all sports."); - } - - - @Test - public void testRemoveHyphen() { - List cleanedLines = Dehyphenizer.dehyphenizeLines(mylines); - String expectedLine1 = "hospital. 
On examination, the pulse was 104 beats per minute and the blood"; - String expectedLine2 = "pressure 128/80 mm Hg."; - String expectedLine3 = "At the recommendation of his physicians, the patient stopped participating in all sports."; - assertEquals(3, cleanedLines.size()); - assertEquals(expectedLine1, cleanedLines.get(0)); - assertEquals(expectedLine2, cleanedLines.get(1)); - assertEquals(expectedLine3, cleanedLines.get(2)); - } - - -} diff --git a/src/test/java/org/monarchinitiative/phenopacket2prompt/querygen/TimeBasedFatoryTest.java b/src/test/java/org/monarchinitiative/phenopacket2prompt/querygen/TimeBasedFatoryTest.java deleted file mode 100644 index e385a99..0000000 --- a/src/test/java/org/monarchinitiative/phenopacket2prompt/querygen/TimeBasedFatoryTest.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.querygen; - -import org.junit.jupiter.api.Test; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class TimeBasedFatoryTest { - - - private final static String vignette = "clinic of another hospital. On examination, there was {{conjunctival injection:PHENOTYPE}} in " + - "both eyes. The {{lungs were clear on auscultation:EXCLUDE:Abnormal breath sound}}, and the remainder of the physical " + - "examination was reportedly normal. 
Testing of a nasopharyngeal specimen for " + - "severe acute respiratory syndrome coronavirus 2 {{(SARS-CoV-2) RNA was negative:LABORATORY}}."; - - private final static Pattern pattern = Pattern.compile("\\{\\{([^}]*)}}"); - - - @Test - public void testRegex() { - Matcher m = pattern.matcher(vignette); - while (m.find()) { - System.out.println(m.group()); - } - - } - - -} diff --git a/src/test/java/org/monarchinitiative/phenopacket2prompt/querygen/TimePointParserTest.java b/src/test/java/org/monarchinitiative/phenopacket2prompt/querygen/TimePointParserTest.java deleted file mode 100644 index 7007bd9..0000000 --- a/src/test/java/org/monarchinitiative/phenopacket2prompt/querygen/TimePointParserTest.java +++ /dev/null @@ -1,192 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.querygen; - -import org.junit.jupiter.api.Test; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.TimePoint; -import org.monarchinitiative.phenopacket2prompt.legacy.querygen.TimePointParser; - -import java.util.List; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -public class TimePointParserTest { - - private final TimePointParser timePointParser = new TimePointParser(); - - @Test - public void test1() { - String input = "The patient had been well until 3 days before presentation, when pressurelike pain developed " + - "n the left side of the forehead and frontal scalp and the bilateral maxillary sinuses and upper jaws."; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals(32, tp.start()); - assertEquals(58, tp.end()); - } - - @Test - public void test2() { - String input = "Two days before presentation, the patient noted erythema and small reddishbrown skin lesions on " + - "the left side of the forehead "; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("Two days before presentation", 
tp.point()); - assertEquals(0, tp.start()); - assertEquals(28, tp.end()); - } - - @Test - public void test3() { - String input = "had pointed out a similar spot on the top of his scalp 2 weeks earlier."; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("2 weeks earlier", tp.point()); - assertEquals(55, tp.start()); - assertEquals(70, tp.end()); - } - - @Test - public void test4() { - String input = "In the emergency department, the patient reported no ocular or nasal discharge,"; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("In the emergency department", tp.point()); - assertEquals(0, tp.start()); - assertEquals(27, tp.end()); - } - - @Test - public void test5() { - String input = "His ocular history included bilateral mild ptosis; he had undergone bilateral cataract extraction"; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("His ocular history included", tp.point()); - assertEquals(0, tp.start()); - assertEquals(27, tp.end()); - } - - @Test - public void test6() { - String input = "The patient had been in her usual state of health until approximately 4 weeks before admission,"; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("approximately 4 weeks before admission", tp.point()); - assertEquals(56, tp.start()); - assertEquals(94, tp.end()); - } - - @Test - public void test7() { - String input = "During the next 3 days, she had nausea and a poor appetite. 
"; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("During the next 3 days", tp.point()); - assertEquals(0, tp.start()); - assertEquals(22, tp.end()); - } - - @Test - public void test8() { - String input = "After 3 days of fever with a temperature of up to 38.5°C, she began vomiting "; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("After 3 days of fever", tp.point()); - assertEquals(0, tp.start()); - assertEquals(21, tp.end()); - } - - @Test - public void test9() { - String input = "Approximately two decades before the current admission, he was struck on the head "; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("Approximately two decades before the current admission", tp.point()); - assertEquals(0, tp.start()); - assertEquals(54, tp.end()); - } - - @Test - public void test10() { - String input = "Some text (4 years before the current admission) some text"; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("4 years before the current admission", tp.point()); - assertEquals(11, tp.start()); - assertEquals(47, tp.end()); - } - - @Test - public void test11() { - String input = " been well until 20 hours before this admission, when diffuse abdominal pain and nausea developed"; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("20 hours before this admission", tp.point()); - assertEquals(17, tp.start()); - assertEquals(47, tp.end()); - } - - @Test - public void test12() { - String input = "After a 1-week admission, the dyspnea abated, "; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = 
tplist.get(0); - assertEquals("After a 1-week admission", tp.point()); - assertEquals(0, tp.start()); - assertEquals(24, tp.end()); - } - - @Test - public void test13() { - String input = "The patient had been in his usual state of good health until 1 hour before evaluation, some other text."; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("1 hour before evaluation", tp.point()); - assertEquals(61, tp.start()); - assertEquals(85, tp.end()); - } - @Test - public void test14() { - String input = "Approximately 2 weeks before the current admission, she sought evaluation at the emergency department of another hospital."; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("Approximately 2 weeks before the current admission", tp.point()); - assertEquals(0, tp.start()); - assertEquals(50, tp.end()); - } - - @Test - public void test15() { - String input = "he patient had been in her usual state of health until 4 days before admission, when she awoke from sleep with swelling ."; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("4 days before admission", tp.point()); - assertEquals(55, tp.start()); - assertEquals(78, tp.end()); - } - - @Test - public void test16() { - String input = "The patient had been in his usual state of health until 8 days before this admission, when constant aching pain developed in the left eye and the left side of the head."; - List tplist = timePointParser.getTimePoints(input); - assertEquals(1, tplist.size()); - TimePoint tp = tplist.get(0); - assertEquals("8 days before this admission", tp.point()); - assertEquals(56, tp.start()); - assertEquals(84, tp.end()); - } - -//T -} From 9e1eccdb9e7f9acfc19b2ba849523580873c840b Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Fri, 26 Apr 2024 
17:23:26 +0200 Subject: [PATCH 5/6] batch phenoopacket creation --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 5bd5653..cd43cac 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.monarchinitiative phenopacket2prompt - 0.3.12 + 0.3.14 phenopacket2prompt https://github.com/monarch-initiative/phenopacket2prompt From b2c4429fc4ca79b2a6ef05692a31830f7e77ca44 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Fri, 26 Apr 2024 17:34:48 +0200 Subject: [PATCH 6/6] documentation --- docs/setup.md | 27 +++++++++++++ mkdocs.yml | 1 + .../cmd/GbtTranslateBatchCommand.java | 39 +++++++++++++++++++ 3 files changed, 67 insertions(+) create mode 100644 docs/setup.md diff --git a/docs/setup.md b/docs/setup.md new file mode 100644 index 0000000..914214d --- /dev/null +++ b/docs/setup.md @@ -0,0 +1,27 @@ +# Set-up + +TODO -- how to setup Java etc. + +## Download command +Before running the batch command, run the download command to get the necessary files + +``` +java -jar target/phenopacket2prompt.jar download +``` + +## Batch command +To run the batch command, first download the latest release from the +[releases](https://github.com/monarch-initiative/phenopacket-store/releases) section of the phenopacket-store +repository. Unpack either all_phenopackets.tgz or all_phenopackets.zip (the files are identical except for the +method of compression). + +``` +java -jar target/phenopacket2prompt.jar batch -d <path> +``` +Replace `<path>` with the actual path on your system. + +The app should create a folder "prompts", with two subdirectories, "en" and "es" with English and Spanish prompts. +There are some errors that still need to be fixed, but several thousand prompts should appear.
+ +## Todo +also output a file with expected diagnosis diff --git a/mkdocs.yml b/mkdocs.yml index b889f6d..1b7d9af 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -34,6 +34,7 @@ nav: - Languages: - "Template": 'languages.md' - "English": "english.md" + - Setup: "setup.md" plugins: - search diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java index c9c1aba..1546a32 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java @@ -4,6 +4,8 @@ import org.monarchinitiative.phenol.base.PhenolRuntimeException; import org.monarchinitiative.phenol.io.OntologyLoader; import org.monarchinitiative.phenol.ontology.data.Ontology; +import org.monarchinitiative.phenopacket2prompt.international.HpInternational; +import org.monarchinitiative.phenopacket2prompt.international.HpInternationalOboParser; import org.monarchinitiative.phenopacket2prompt.model.PhenopacketDisease; import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator; @@ -19,6 +21,7 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.concurrent.Callable; @CommandLine.Command(name = "batch", aliases = {"B"}, @@ -47,9 +50,19 @@ public Integer call() throws Exception { } Ontology hpo = OntologyLoader.loadOntology(hpJsonFile); LOGGER.info("HPO version {}", hpo.version().orElse("n/a")); + File translationsFile = new File(translationsPath); + if (! translationsFile.isFile()) { + System.err.printf("Could not find translations file at %s. 
Try download command", translationsPath); + return 1; + } + HpInternationalOboParser oboParser = new HpInternationalOboParser(translationsFile); + Map internationalMap = oboParser.getLanguageToInternationalMap(); + LOGGER.info("Got {} translations", internationalMap.size()); List ppktFiles = getAllPhenopacketJsonFiles(); createDir("prompts"); outputPromptsEnglish(ppktFiles, hpo); + PromptGenerator spanish = PromptGenerator.spanish(hpo, internationalMap.get("es")); + outputPromptsInternational(ppktFiles, hpo, "es", spanish); return 0; } @@ -60,6 +73,32 @@ private String getFileName(String phenopacketID) { } + + private void outputPromptsInternational(List ppktFiles, Ontology hpo, String languageCode, PromptGenerator generator) { + String dirpath = String.format("prompts/%s", languageCode); + createDir(dirpath); + List diagnosisList = new ArrayList<>(); + for (var f: ppktFiles) { + PpktIndividual individual = new PpktIndividual(f); + List diseaseList = individual.getDiseases(); + if (diseaseList.size() != 1) { + System.err.println(String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId())); + continue; + } + PhenopacketDisease pdisease = diseaseList.get(0); + String promptFileName = getFileName( individual.getPhenopacketId()); + String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), promptFileName, f.getAbsolutePath()); + try { + diagnosisList.add(diagnosisLine); + String prompt = generator.createPrompt(individual); + outputPrompt(prompt, promptFileName, dirpath); + } catch (Exception e) { + e.printStackTrace(); + } + } + } + + private void outputPromptsEnglish(List ppktFiles, Ontology hpo) { createDir("prompts/en"); PromptGenerator generator = PromptGenerator.english(hpo);