diff --git a/docs/cases/PMID_25163805.txt b/docs/cases/PMID_25163805.txt new file mode 100644 index 0000000..c793237 --- /dev/null +++ b/docs/cases/PMID_25163805.txt @@ -0,0 +1,7 @@ +[source] +pmid = PMID:25163805 +title = Further delineation of Loeys-Dietz syndrome type 4 in a family with mild vascular involvement and a TGFB2 splicing mutation +[diagnosis] +disease_id = OMIM:614816 +disease_label = Loeys-Dietz syndrome 4 +[text] diff --git a/docs/cases/PMID_30249733.txt b/docs/cases/PMID_30249733.txt new file mode 100644 index 0000000..b3ce085 --- /dev/null +++ b/docs/cases/PMID_30249733.txt @@ -0,0 +1,26 @@ +[source] +pmid = PMID:30249733 +title = Novel mutation in the CHST14 gene causing musculocontractural type of Ehlers-Danlos syndrome +[diagnosis] +disease_id = OMIM:601776 +disease_label = Ehlers-Danlos syndrome, musculocontractural type 1 +[text] +A 3-year-old boy, born out of a third-degree consanguineous marriage was referred to us by the paediatric surgeons on +suspicion of an underlying genetic disorder. He was being followed by them for bilateral hydronephrosis with bilateral +pelviureteric junction obstruction (right >left). The child was born by normal vaginal delivery with a birth weight of 2.7kg. +The length and head circumference at birth were not recorded. At birth, he was noted to have bilateral clubfeet. +The child started sitting at around 7–8 months of age however had difficulty in standing and walking. +At the current age of 3 years also, he is able to stand with support only for few minutes. +In the other sectors of development like cognition and language, the child showed appropriate gain and currently is able +to tell short stories and enjoys playing with family members. Anthropometry at the age of 3 years showed weight to be 12.6kg, +length to be 88cm and head circumference of 47cm at 3 years of age. For the initial 1–1.5 years of life, +the parents were mainly concerned about clubfeet in their child and were taking opinion of local practitioners for the same. +During an episode of acute febrile illness, he was coincidently diagnosed to have hydronephrosis and in view of cryptorchidism +noted by the examining physician was referred to our centre for evaluation and management. +On examination, the child had facial dysmorphism in the form of synophrys, hypertelorism, down slanting palpebral fissures, +low set ears, thin upper lip, high arched palate and prominent nasolabial folds (figures 1A and 2A–C). +He had tapering fingers with bilaterally thin and adducted thumbs (figure 1B). The deep palmer creases were absent and +only a few fine creases were seen. Feet showed bilateral talipes equinovarus deformity (figure 1C). +The skin was hyperelastic and hypermobility of fingers, elbow and knee joints was noted. Generalised hypotonia was present. +The child also had bilateral cryptorchidism. No bruises or haematomas were seen and even on repeated asking the parents denied +any bleeding tendency. \ No newline at end of file diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java index 726c6d0..1960bbc 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/Main.java @@ -20,6 +20,7 @@ public static void main(String[] args){ .addSubcommand("download", new DownloadCommand()) .addSubcommand("prompt", new PromptCommand()) .addSubcommand("mine", new TextMineCommand()) + .addSubcommand("batchmine", new TextMineCommand()) .addSubcommand("translate", new GptTranslateCommand()) ; cline.setToggleBooleanFlags(false); diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/AbstractMineCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/AbstractMineCommand.java deleted file mode 100644 index 97748c2..0000000 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/AbstractMineCommand.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.monarchinitiative.phenopacket2prompt.cmd; - -import org.monarchinitiative.phenol.base.PhenolRuntimeException; -import org.monarchinitiative.phenopacket2prompt.mining.Case; -import org.monarchinitiative.phenopacket2prompt.mining.CaseBundle; -import org.monarchinitiative.phenopacket2prompt.mining.CaseParser; -import org.monarchinitiative.phenopacket2prompt.mining.FenominalParser; -import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; -import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator; -import org.phenopackets.schema.v2.Phenopacket; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; - -public class AbstractMineCommand { - private static final Logger LOGGER = LoggerFactory.getLogger(AbstractMineCommand.class); - - - protected List getCaseBundleList(String inputFile, FenominalParser fenominalParser) { - List caseBundleList = new ArrayList<>(); - CaseParser caseParser = new CaseParser(Path.of(inputFile)); - List caseList = caseParser.getCaseList(); - for (Case cs : caseList) { - Phenopacket ppkt = fenominalParser.parse(cs.caseText()); - PpktIndividual individual = new PpktIndividual(ppkt); - caseBundleList.add(new CaseBundle(cs, ppkt, individual)); - } - System.out.printf("Got %d cases.\n", caseBundleList.size()); - return caseBundleList; - } - - - protected void outputPrompt(CaseBundle bundle, String output) { - PpktIndividual individual = bundle.individual(); - PromptGenerator generator = PromptGenerator.english(); - String prompt = generator.createPrompt(individual); - try { - Path path = Path.of(output); - Files.writeString(path, prompt); - } catch (IOException e) { - LOGGER.error("Could not write prompt: {}", e.getMessage()); - throw new PhenolRuntimeException(e); - } - } - - -} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java new file mode 100644 index 0000000..c6e133d --- /dev/null +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/BatchMineCommand.java @@ -0,0 +1,100 @@ +package org.monarchinitiative.phenopacket2prompt.cmd; + + +import org.monarchinitiative.phenol.base.PhenolRuntimeException; +import org.monarchinitiative.phenopacket2prompt.mining.CaseBundle; +import org.monarchinitiative.phenopacket2prompt.mining.FenominalParser; +import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; +import org.monarchinitiative.phenopacket2prompt.output.CorrectResult; +import org.monarchinitiative.phenopacket2prompt.output.PpktCopy; +import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator; +import picocli.CommandLine; + +import java.io.File; +import java.util.List; +import java.util.concurrent.Callable; + +@CommandLine.Command(name = "batchmine", aliases = {"B2"}, + mixinStandardHelpOptions = true, + description = "Batch Text mine, Translate, and Output phenopacket and prompt") +public class BatchMineCommand implements Callable { + @CommandLine.Option(names={"-d","--data"}, description ="directory to download data (default: ${DEFAULT-VALUE})" ) + public String datadir="data"; + + @CommandLine.Option(names={"-i","--inputdir"}, description ="input files (directory)" ) + public String input = "docs/cases/"; // provide path for testing + + @CommandLine.Option(names = { "-o", "--output"}, description = "Path to output file dir(default: ${DEFAULT-VALUE})") + private String output = "mined_out"; + + @CommandLine.Option(names = {"-e", "--exact"}, description = "Use exact matching algorithm") + private boolean useExactMatching = false; + + @CommandLine.Option(names = {"--translations"}, + description = "path to translations file") + private String translationsPath = "data/hp-international.obo"; + + @CommandLine.Option(names = {"--verbose"}, description = "show results in shell (default is to just write to file)") + private boolean verbose; + + + + @Override + public Integer call() throws Exception { + File inDirectory = new File(input); + if (!inDirectory.isDirectory()) { + throw new PhenolRuntimeException("Could not find directory at " + input); + } + File hpoJsonFile = new File(datadir + File.separator + "hp.json"); + if (! hpoJsonFile.isFile()) { + System.out.printf("[ERROR] Could not find hp.json file at %s\nRun download command first\n", hpoJsonFile.getAbsolutePath()); + } + File translationsFile = new File(translationsPath); + if (! translationsFile.isFile()) { + System.err.printf("Could not find translations file at %s. Try download command", translationsPath); + return 1; + } + Utility utility = new Utility(translationsFile); + List individualList = getIndividualsFromTextMining(inDirectory,hpoJsonFile); + PromptGenerator spanish = utility.spanish(); + Utility.outputPromptsInternationalMining(individualList,"es", spanish); + // Dutch + PromptGenerator dutch = utility.dutch(); + Utility.outputPromptsInternationalMining(individualList,"nl", dutch); + // GERMAN + PromptGenerator german = utility.german(); + Utility.outputPromptsInternationalMining(individualList,"de", german); + // ITALIAN + PromptGenerator italian = utility.italian(); + Utility.outputPromptsInternationalMining(individualList,"it", italian); + + // output file with correct diagnosis list + List correctResultList =Utility.outputPromptsEnglishFromIndividuals(individualList); + Utility.outputCorrectTextmined(correctResultList); + return 0; + } + + /** + * Get all of the individual objects by text mining the input files + * @param inDirectory Input directory. Should hold input files formatted for this project (demonstration) + * @param hpoJsonFile File representing hp.json + * @return list of individuals + */ + protected List getIndividualsFromTextMining(File inDirectory, File hpoJsonFile) { + FenominalParser parser = new FenominalParser(hpoJsonFile, useExactMatching); + List caseBundleList = Utility.getAllCaseBundlesFromDirectory(inDirectory, parser); + return caseBundleList.stream().map(CaseBundle::individual).toList(); + } + + + + private void outputTextmined(FenominalParser parser) { + + List caseBundleList = Utility.getCaseBundleList(input, parser); + if (caseBundleList.isEmpty()) { + System.err.println("Could not extract cases from " + input); + } + // for now, just output one case + Utility.outputPromptFromCaseBundle(caseBundleList.getFirst().individual(), output); + } +} diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java index 7dfab15..bc10f09 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/GbtTranslateBatchCommand.java @@ -6,6 +6,8 @@ import org.monarchinitiative.phenol.ontology.data.Ontology; import org.monarchinitiative.phenopacket2prompt.international.HpInternational; import org.monarchinitiative.phenopacket2prompt.international.HpInternationalOboParser; +import org.monarchinitiative.phenopacket2prompt.mining.CaseBundle; +import org.monarchinitiative.phenopacket2prompt.mining.FenominalParser; import org.monarchinitiative.phenopacket2prompt.model.PhenopacketDisease; import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; import org.monarchinitiative.phenopacket2prompt.output.CorrectResult; @@ -33,7 +35,6 @@ public class GbtTranslateBatchCommand implements Callable { private final static Logger LOGGER = LoggerFactory.getLogger(GbtTranslateBatchCommand.class); - @CommandLine.Option(names = {"--hp"}, description = "path to HP json file") private String hpoJsonPath = "data/hp.json"; @@ -55,192 +56,50 @@ public class GbtTranslateBatchCommand implements Callable { @Override public Integer call() throws Exception { File hpJsonFile = new File(hpoJsonPath); + boolean useExactMatching = true; if (! hpJsonFile.isFile()) { throw new PhenolRuntimeException("Could not find hp.json at " + hpJsonFile.getAbsolutePath()); } Ontology hpo = OntologyLoader.loadOntology(hpJsonFile); LOGGER.info("HPO version {}", hpo.version().orElse("n/a")); + FenominalParser parser = new FenominalParser(hpJsonFile, useExactMatching); File translationsFile = new File(translationsPath); if (! translationsFile.isFile()) { System.err.printf("Could not find translations file at %s. Try download command", translationsPath); return 1; } - HpInternationalOboParser oboParser = new HpInternationalOboParser(translationsFile); + Utility utility = new Utility(translationsFile); - Map internationalMap = oboParser.getLanguageToInternationalMap(); - LOGGER.info("Got {} translations", internationalMap.size()); - List ppktFiles = getAllPhenopacketJsonFiles(); - createDir(outdirname); - List correctResultList = outputPromptsEnglish(ppktFiles, hpo); + List ppktFiles = Utility.getAllPhenopacketJsonFiles(ppktDir); + Utility.createDir(outdirname); + List correctResultList = Utility.outputPromptsEnglish(ppktFiles, hpo); // output all non-English languages here - // SPANISH - PromptGenerator spanish = PromptGenerator.spanish(internationalMap.get("es")); - resetOutput("es"); - outputPromptsInternational(ppktFiles, hpo, "es", spanish); - - resetOutput("nl"); - PromptGenerator dutch = PromptGenerator.dutch(internationalMap.get("nl")); - outputPromptsInternational(ppktFiles, hpo, "nl", dutch); + PromptGenerator spanish = utility.spanish(); + Utility.outputPromptsInternational(ppktFiles,"es", spanish); + // Dutch + PromptGenerator dutch = utility.dutch(); + Utility.outputPromptsInternational(ppktFiles,"nl", dutch); // GERMAN - resetOutput("de"); - PromptGenerator german = PromptGenerator.german(internationalMap.get("de")); - outputPromptsInternational(ppktFiles, hpo, "de", german); - + PromptGenerator german = utility.german(); + Utility.outputPromptsInternational(ppktFiles,"de", german); // ITALIAN - resetOutput("it"); - PromptGenerator italian = PromptGenerator.italian(internationalMap.get("it")); - outputPromptsInternational(ppktFiles, hpo, "it", italian); - resetOutput("finished"); + PromptGenerator italian = utility.italian(); + Utility.outputPromptsInternational(ppktFiles,"it", italian); // output original phenopackets PpktCopy pcopy = new PpktCopy(new File(outdirname)); for (var file : ppktFiles) { pcopy.copyFile(file); } - // output file with correct diagnosis list - outputCorrectResults(correctResultList); + Utility.outputCorrectPPKt(correctResultList); return 0; } - private void resetOutput(String es) { - if (currentLanguageCode != null) { - System.out.printf("Finished writing %d phenopackets in %s\n", currentCount, currentLanguageCode); - } - currentLanguageCode = es; - currentCount = 0; - } - - private void outputCorrectResults(List correctResultList) { - File outfile = new File("prompts" + File.separator + "correct_results.tsv"); - try (BufferedWriter bw = new BufferedWriter(new FileWriter(outfile))) { - for (var cres : correctResultList) { - bw.write(String.format("%s\t%s\t%s\n", cres.diseaseLabel(), cres.diseaseId().getValue(), cres.promptFileName())); - } - } catch (IOException e) { - e.printStackTrace(); - } - System.out.printf("[INFO] Output a total of %d prompts in en, es, nl, de, and it.\n", correctResultList.size()); - } - - - private String getFileName(String phenopacketID, String languageCode) { - return phenopacketID.replaceAll("[^\\w]","_") + "_" + languageCode + "-prompt.txt"; - } - - - - private void outputPromptsInternational(List ppktFiles, Ontology hpo, String languageCode, PromptGenerator generator) { - String dirpath = String.format("prompts/%s", languageCode); - createDir(dirpath); - List diagnosisList = new ArrayList<>(); - for (var f: ppktFiles) { - PpktIndividual individual = PpktIndividual.fromFile(f); - List diseaseList = individual.getDiseases(); - if (diseaseList.size() != 1) { - String errmsg = String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId()); - throw new PhenolRuntimeException(errmsg); - } - PhenopacketDisease pdisease = diseaseList.getFirst(); - String promptFileName = getFileName( individual.getPhenopacketId(), languageCode); - String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), promptFileName, f.getAbsolutePath()); - try { - diagnosisList.add(diagnosisLine); - String prompt = generator.createPrompt(individual); - outputPrompt(prompt, promptFileName, dirpath); - } catch (Exception e) { - System.err.printf("[ERROR] Could not process %s: %s\n", promptFileName, e.getMessage()); - //e.printStackTrace(); - } - } - Set missing = generator.getMissingTranslations(); - if (! missing.isEmpty()) { - for (var m : missing) { - System.out.printf("[%s] Missing: %s\n", languageCode, m); - } - } - } - - - private List outputPromptsEnglish(List ppktFiles, Ontology hpo) { - createDir("prompts/en"); - List correctResultList = new ArrayList<>(); - PromptGenerator generator = PromptGenerator.english(); - - for (var f: ppktFiles) { - PpktIndividual individual = PpktIndividual.fromFile(f); - List diseaseList = individual.getDiseases(); - if (diseaseList.size() != 1) { - System.err.printf("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId()); - continue; - } - PhenopacketDisease pdisease = diseaseList.getFirst(); - String promptFileName = getFileName( individual.getPhenopacketId(), "en"); - String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), promptFileName, f.getAbsolutePath()); - try { - String prompt = generator.createPrompt(individual); - outputPrompt(prompt, promptFileName, "prompts/en"); - var cres = new CorrectResult(promptFileName, pdisease.getDiseaseId(), pdisease.getLabel()); - correctResultList.add(cres); - } catch (Exception e) { - e.printStackTrace(); - } - } - return correctResultList; - } - - - - private void outputPrompt(String prompt, String promptFileName, String dir) { - File outpath = new File(dir + File.separator + promptFileName); - try (BufferedWriter bw = new BufferedWriter(new FileWriter(outpath))) { - bw.write(prompt); - } catch (IOException e) { - e.printStackTrace(); - } - System.out.printf("%s %d.\r", currentLanguageCode, currentCount); - currentCount++; - } - - - - private void createDir(String path) { - File pathAsFile = new File(path); - if (!Files.exists(Paths.get(path))) { - pathAsFile.mkdir(); - } - } - private List getAllPhenopacketJsonFiles() { - List ppktDirectories = new ArrayList<>(); - List ppktFiles = new ArrayList<>(); - File[] items = new File(this.ppktDir).listFiles(); - // We know that all phenopackets are located in the subdirectories - if (!ppktDir.substring(ppktDir.length() - 1).equals("/")) { - ppktDir += "/"; - } - for (File item : items) { - if (item.isDirectory()) - ppktDirectories.add(ppktDir+item.getName()); - else if (item.isFile() && item.getName().endsWith(".json")) { - ppktFiles.add(item); - } - } - for (var f: ppktDirectories) { - File subdir = new File(f); - File[] files = subdir.listFiles(); - for (var ff : files) { - if (ff.isFile() && ff.getAbsolutePath().endsWith(".json")) { - ppktFiles.add(ff); - } - } - } - System.out.printf("Retrieved %d files.\n", ppktFiles.size()); - return ppktFiles; - } } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TextMineCommand.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TextMineCommand.java index 18b8bc7..77e1a02 100644 --- a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TextMineCommand.java +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/TextMineCommand.java @@ -2,6 +2,7 @@ import org.monarchinitiative.phenopacket2prompt.mining.CaseBundle; import org.monarchinitiative.phenopacket2prompt.mining.FenominalParser; +import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import picocli.CommandLine; @@ -13,7 +14,7 @@ @CommandLine.Command(name = "mine", aliases = {"M"}, mixinStandardHelpOptions = true, description = "Text mine and output phenopacket and prompt") -public class TextMineCommand extends AbstractMineCommand implements Callable { +public class TextMineCommand implements Callable { private final static Logger LOGGER = LoggerFactory.getLogger(TextMineCommand.class); @CommandLine.Option(names={"-d","--data"}, description ="directory to download data (default: ${DEFAULT-VALUE})" ) @@ -34,25 +35,19 @@ public class TextMineCommand extends AbstractMineCommand implements Callable caseBundleList = getCaseBundleList(input, parser); + List caseBundleList = Utility.getCaseBundleList(input, parser); if (caseBundleList.isEmpty()) { System.err.println("Could not extract cases from " + input); } // for now, just output one case - outputPrompt(caseBundleList.getFirst(), output); - + PpktIndividual individual = caseBundleList.getFirst().individual(); + Utility.outputPromptFromCaseBundle(individual, output); return 0; - - } } diff --git a/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/Utility.java b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/Utility.java new file mode 100644 index 0000000..1c8c085 --- /dev/null +++ b/src/main/java/org/monarchinitiative/phenopacket2prompt/cmd/Utility.java @@ -0,0 +1,291 @@ +package org.monarchinitiative.phenopacket2prompt.cmd; + +import org.monarchinitiative.phenol.base.PhenolRuntimeException; +import org.monarchinitiative.phenol.ontology.data.Ontology; +import org.monarchinitiative.phenopacket2prompt.international.HpInternational; +import org.monarchinitiative.phenopacket2prompt.international.HpInternationalOboParser; +import org.monarchinitiative.phenopacket2prompt.mining.Case; +import org.monarchinitiative.phenopacket2prompt.mining.CaseBundle; +import org.monarchinitiative.phenopacket2prompt.mining.CaseParser; +import org.monarchinitiative.phenopacket2prompt.mining.FenominalParser; +import org.monarchinitiative.phenopacket2prompt.model.PhenopacketDisease; +import org.monarchinitiative.phenopacket2prompt.model.PpktIndividual; +import org.monarchinitiative.phenopacket2prompt.output.CorrectResult; +import org.monarchinitiative.phenopacket2prompt.output.PromptGenerator; +import org.phenopackets.schema.v2.Phenopacket; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; + +/** + * This class provides several utility functions. + */ +public class Utility { + private static final Logger LOGGER = LoggerFactory.getLogger(Utility.class); + + private static final String PROMPT_DIR = "prompts"; + private static final String TEXT_MINED_DIR = "text_mined"; + + + private final Map internationalMap ; + + public Utility(File translationsFile) { + HpInternationalOboParser oboParser = new HpInternationalOboParser(translationsFile); + this.internationalMap = oboParser.getLanguageToInternationalMap(); + LOGGER.info("Got {} translations", internationalMap.size()); + } + + public PromptGenerator german() { + return PromptGenerator.german(internationalMap.get("de")); + } + + public PromptGenerator spanish() { + return PromptGenerator.german(internationalMap.get("es")); + } + + public PromptGenerator dutch() { + return PromptGenerator.german(internationalMap.get("nl")); + } + + public PromptGenerator italian() { + return PromptGenerator.german(internationalMap.get("it")); + } + + + + public static String getFileName(String phenopacketID, String languageCode) { + return phenopacketID.replaceAll("[^\\w]","_") + "_" + languageCode + "-prompt.txt"; + } + + public static void createDir(String path) { + File pathAsFile = new File(path); + if (!Files.exists(Paths.get(path))) { + pathAsFile.mkdir(); + } + } + + + public static void outputCorrectTextmined(List correctResultList) { + outputCorrectResults(correctResultList, TEXT_MINED_DIR); + } + public static void outputCorrectPPKt(List correctResultList) { + outputCorrectResults(correctResultList, PROMPT_DIR); + } + + public static void outputCorrectResults(List correctResultList, String basename) { + File outfile = new File(basename + File.separator + "correct_results.tsv"); + try (BufferedWriter bw = new BufferedWriter(new FileWriter(outfile))) { + for (var cres : correctResultList) { + bw.write(String.format("%s\t%s\t%s\n", cres.diseaseLabel(), cres.diseaseId().getValue(), cres.promptFileName())); + } + } catch (IOException e) { + e.printStackTrace(); + } + System.out.printf("[INFO] Output a total of %d prompts in en, es, nl, de, and it.\n", correctResultList.size()); + } + + + public static void outputPromptFromCaseBundle(String prompt, String promptFileName, String dir) { + File outpath = new File(dir + File.separator + promptFileName); + try (BufferedWriter bw = new BufferedWriter(new FileWriter(outpath))) { + bw.write(prompt); + } catch (IOException e) { + throw new PhenolRuntimeException("Could not output file to " + promptFileName); + } + } + + + + + public static List getAllPhenopacketJsonFiles(String ppktDir) { + List ppktDirectories = new ArrayList<>(); + List ppktFiles = new ArrayList<>(); + File[] items = new File(ppktDir).listFiles(); + // We know that all phenopackets are located in the subdirectories + if (!ppktDir.substring(ppktDir.length() - 1).equals("/")) { + ppktDir += "/"; + } + for (File item : items) { + if (item.isDirectory()) + ppktDirectories.add(ppktDir+item.getName()); + else if (item.isFile() && item.getName().endsWith(".json")) { + ppktFiles.add(item); + } + } + for (var f: ppktDirectories) { + File subdir = new File(f); + File[] files = subdir.listFiles(); + for (var ff : files) { + if (ff.isFile() && ff.getAbsolutePath().endsWith(".json")) { + ppktFiles.add(ff); + } + } + } + System.out.printf("Retrieved %d files.\n", ppktFiles.size()); + return ppktFiles; + } + + + public static void outputPromptsInternationalFromIndividualList(List individualList, + String languageCode, + String baseDir, + PromptGenerator generator) { + String dirpath = String.format("%s%s%s", baseDir, File.separator, languageCode); + Utility.createDir(dirpath); + List diagnosisList = new ArrayList<>(); + for (PpktIndividual individual : individualList) { + List diseaseList = individual.getDiseases(); + if (diseaseList.size() != 1) { + String errmsg = String.format("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId()); + throw new PhenolRuntimeException(errmsg); + } + PhenopacketDisease pdisease = diseaseList.getFirst(); + String promptFileName = Utility.getFileName( individual.getPhenopacketId(), languageCode); + String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), promptFileName, individual.getPhenopacketId()); + try { + diagnosisList.add(diagnosisLine); + String prompt = generator.createPrompt(individual); + Utility.outputPromptFromCaseBundle(prompt, promptFileName, dirpath); + } catch (Exception e) { + System.err.printf("[ERROR] Could not process %s: %s\n", promptFileName, e.getMessage()); + //e.printStackTrace(); + } + } + + } + + + + public static void outputPromptsInternational(List ppktFiles, String languageCode, PromptGenerator generator) { + List individualList = new ArrayList<>(); + for (var f: ppktFiles) { + PpktIndividual individual = PpktIndividual.fromFile(f); + individualList.add(individual); + } + outputPromptsInternationalFromIndividualList(individualList, + PROMPT_DIR, + languageCode, + generator); + } + + public static void outputPromptsInternationalMining(List individualList, + String languageCode, + PromptGenerator generator) { + outputPromptsInternationalFromIndividualList(individualList, + TEXT_MINED_DIR, + languageCode, + generator); + } + + + + public static List outputPromptsEnglish(List ppktFiles, Ontology hpo) { + Utility.createDir("prompts/en"); + List correctResultList = new ArrayList<>(); + PromptGenerator generator = PromptGenerator.english(); + int currentCount = 0; + for (var f: ppktFiles) { + PpktIndividual individual = PpktIndividual.fromFile(f); + List diseaseList = individual.getDiseases(); + if (diseaseList.size() != 1) { + System.err.printf("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId()); + continue; + } + PhenopacketDisease pdisease = diseaseList.getFirst(); + String promptFileName = Utility.getFileName( individual.getPhenopacketId(), "en"); + String diagnosisLine = String.format("%s\t%s\t%s\t%s", pdisease.getDiseaseId(), pdisease.getLabel(), promptFileName, f.getAbsolutePath()); + try { + String prompt = generator.createPrompt(individual); + Utility.outputPromptFromCaseBundle(prompt, promptFileName, "prompts/en"); + System.out.printf("en %d.\r", currentCount); + currentCount++; + var cres = new CorrectResult(promptFileName, pdisease.getDiseaseId(), pdisease.getLabel()); + correctResultList.add(cres); + } catch (Exception e) { + e.printStackTrace(); + } + } + return correctResultList; + } + + + public static List outputPromptsEnglishFromIndividuals(List individualList) { + var outd = TEXT_MINED_DIR + File.separator + "en"; + Utility.createDir(outd); + List correctResultList = new ArrayList<>(); + PromptGenerator generator = PromptGenerator.english(); + int currentCount = 0; + for (PpktIndividual individual: individualList) { + List diseaseList = individual.getDiseases(); + if (diseaseList.size() != 1) { + System.err.printf("[ERROR] Got %d diseases for %s.\n", diseaseList.size(), individual.getPhenopacketId()); + continue; + } + PhenopacketDisease pdisease = diseaseList.getFirst(); + String promptFileName = Utility.getFileName( individual.getPhenopacketId(), "en"); + try { + String prompt = generator.createPrompt(individual); + Utility.outputPromptFromCaseBundle(prompt, promptFileName, "prompts/en"); + System.out.printf("en %d.\r", currentCount); + currentCount++; + var cres = new CorrectResult(promptFileName, pdisease.getDiseaseId(), pdisease.getLabel()); + correctResultList.add(cres); + } catch (Exception e) { + e.printStackTrace(); + } + } + return correctResultList; + } + + + public static void outputPromptFromCaseBundle(PpktIndividual individual, String output) { + + PromptGenerator generator = PromptGenerator.english(); + String prompt = generator.createPrompt(individual); + try { + Path path = Path.of(output); + Files.writeString(path, prompt); + } catch (IOException e) { + LOGGER.error("Could not write prompt: {}", e.getMessage()); + throw new PhenolRuntimeException(e); + } + } + + public static List getCaseBundleList(String inputFile, FenominalParser fenominalParser) { + List caseBundleList = new ArrayList<>(); + CaseParser caseParser = new CaseParser(Path.of(inputFile)); + List caseList = caseParser.getCaseList(); + for (Case cs : caseList) { + Phenopacket ppkt = fenominalParser.parse(cs.caseText()); + PpktIndividual individual = new PpktIndividual(ppkt); + caseBundleList.add(new CaseBundle(cs, ppkt, individual)); + } + System.out.printf("Got %d cases.\n", caseBundleList.size()); + return caseBundleList; + } + + + public static List getAllCaseBundlesFromDirectory(File indir, FenominalParser fenominalParser) { + if (! indir.isDirectory()) { + throw new PhenolRuntimeException("Could not find text mining input directory at " + indir.getAbsolutePath()); + } + List caseBundleList = new ArrayList<>(); + File[] files = indir.listFiles(); + for (File file : files) { + if (! file.getAbsolutePath().contains("PMID") && file.getAbsolutePath().endsWith("txt")) continue; + List bundles = getCaseBundleList(file.getAbsolutePath(), fenominalParser); + caseBundleList.addAll(bundles); + } + return caseBundleList; + } + + +}