diff --git a/Dockerfile b/Dockerfile index 3372a21..a7cd8d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,21 @@ -FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7 +FROM python:3.6.12 -COPY . /app -COPY ./examples/predict_fastapi.py /app/main.py -RUN pip install -e /app +WORKDIR /classification + +# Install transformers +RUN pip3 install transformers +# Install fastAPI +RUN pip3 install -U fastapi[all] +# Install torch +RUN pip3 install torch +# Install mongo client +RUN pip3 install pymongo +# Install numpy/scipy +RUN pip3 install scipy +RUN pip3 install numpy + +COPY . /classification +RUN pip3 install -e . RUN python -m synthesis_classifier.model download + +#uvicorn classification_api:app --port 8051 --reload --host 0.0.0.0 \ No newline at end of file diff --git a/examples/predict_fastapi.py b/classification_api.py similarity index 100% rename from examples/predict_fastapi.py rename to classification_api.py diff --git a/examples/examples.txt b/examples/examples.txt deleted file mode 100644 index 9d3bc5e..0000000 --- a/examples/examples.txt +++ /dev/null @@ -1,4 +0,0 @@ -10.1063/1.3676216: The raw materials were BaCO3, ZnO, Nb2O5, and Ta2O5 powders with purity of more than 99.5%. Ba[Zn1/3 (Nb1−xTax)2/3]O3 (BZNT, x = 0.0, 0.2, 0.4, 0.6, 0.8, 1.0) solid solutions were synthesized by conventional solid-state sintering technique. Oxide compounds were mixed for 12 h in polyethylene jars with zirconia balls and then dried and calcined at 1100 °C for 2 h. After remilling, the powders were dried and pressed into discs of 15 mm × 1 mm and next sintered at 1500 °C for 3 h. -10.1016/j.optmat.2015.01.014: Tm3+/Al3+ co-doped silica glasses were prepared by sol–gel method and subsequent high temperature sintering. Tetraethoxysilane (TEOS), AlCl3·6H2O (Aladdin, 99.99%) and TmCl3·6H2O (Aladdin, 99.99%) were chosen as the precursors of SiO2, Al2O3 and Tm2O3, respectively. Ethanol was used as solvent. Deionized water was added to sustain the hydrolysis reaction. The above-mentioned analytically pure grade chemical reagents were mixed and stirred for 24 h at 30 °C to form homogeneous and clear doping sol. After series of heat treatments, the powder was achieved in which hydroxyl and organics were almost decomposed. Then the powder was sintered into glass at 1750 °C for 3 h in vacuum state to form glass in alumina crucible. The glass was polished to 2 mm thickness chip for the optical property measurements. Tm3+-doped silica glasses with compositions of xTm2O3–15xAl2O3–(100 − 16x) SiO2 (in mol%, x = 0.1, 0.3, 0.5, 0.8 and 1.0 named as TAS1, TAS2, TAS3, TAS4 and TAS5 respectively) were prepared. -10.1021/acs.inorgchem.6b01899: A mixture of Ln(NO3)3·6H2O (0.15 mmol), H3L (0.0166 g, 0.05 mmol), H2O (2 mL), and DMF (2 mL) was sealed in a 25 mL Teflon-lined stainless steel container. After that the vessel cooled to room temperature with a rate of 5 °C h–1, and finally the block crystals of 1-Ln were obtained. -10.1016/j.cej.2015.03.036: Channels in the MSR area were loaded with a granular catalyst Cu18.5CeхAl81.5− x (fraction 0.2–0.3 mm) prepared by coprecipitation of initial components from an aqueous solution [9]. For this purpose, 1 M nitrate solutions of cerium (Ce(NO3)2·6H2O), copper (Cu(NO3)2·3H2O), and aluminum (Al(NO3)2·9H2O) were mixed in required proportions and heated to 60 °С. The precipitant (1 M solution of Na2CO3) was slowly added under vigorous stirring to a solution of metal nitrates until pH = 8–9. The resultant precipitate was aged for 1 h, filtered on a Buchner funnel and washed several times with warm distilled water to remove Na+ cations. The obtained precursor was dried at 100 °С for 12 h. To obtain the catalyst, the dried mixture was calcined in dry air at 700 °С for 4 h. After that, the catalyst was activated in a 10% mixture of hydrogen in argon at 250 °С for 2 h. The specific surface area of prepared catalyst was 128 m2/g, the specific surface area of copper metal was 10.5 m2/g. The detailed characterization of the prepared catalyst including XRD et al. was performed in [10]. \ No newline at end of file diff --git a/examples/predict_json.py b/examples/predict_json.py deleted file mode 100644 index 6810b20..0000000 --- a/examples/predict_json.py +++ /dev/null @@ -1,17 +0,0 @@ -import torch - -from synthesis_classifier import get_model, get_tokenizer, run_batch - -model = get_model() -tokenizer = get_tokenizer() - -with open('examples.txt', 'r') as f: - paragraphs = list(map(str.strip, f)) - -batch_size = 2 -batches = [paragraphs[i:min(i + batch_size, len(paragraphs))] - for i in range(0, len(paragraphs), batch_size)] - -for batch in batches: - result = run_batch(batch, model, tokenizer) - print(result) diff --git a/synpro_scripts/classify_experimental.py b/synpro_scripts/classify_experimental.py deleted file mode 100644 index 7e76f47..0000000 --- a/synpro_scripts/classify_experimental.py +++ /dev/null @@ -1,23 +0,0 @@ -import re - -from synthesis_classifier.model import classifier_version -from synthesis_classifier.multiprocessing_classifier import perform_collection, make_batch -from synthesis_classifier.database.synpro import MetaCollectionIteratorByQuery, SynProDBWriter - - -def experimental_paragraphs(): - query = { - 'path': re.compile('experiment|experimental|preparation|prepare|synthesis|syntheses|material', re.IGNORECASE), - classifier_version: {'$exists': False} - } - - return MetaCollectionIteratorByQuery(query) - - -if __name__ == "__main__": - batch_size = 16 - perform_collection( - SynProDBWriter, - make_batch(experimental_paragraphs(), batch_size), - './job_reclassify.sh' - ) diff --git a/synpro_scripts/compute_embeddings.py b/synpro_scripts/compute_embeddings.py deleted file mode 100644 index 52a1f7e..0000000 --- a/synpro_scripts/compute_embeddings.py +++ /dev/null @@ -1,62 +0,0 @@ -from multiprocessing import get_context -from multiprocessing.queues import Queue - -import numpy - -from synthesis_classifier import classifier_version -from synthesis_classifier.database.synpro import MetaCollectionIteratorByQuery, get_connection -from synthesis_classifier.multiprocessing_classifier import perform_collection, make_batch - -version = classifier_version.split('_')[-1] - - -def not_embedding_paragraphs(): - query = { - ('paragraph_embedding_' + version): {'$exists': False} - } - - return MetaCollectionIteratorByQuery(query) - - -class SynProEmbeddingWriter(object): - def __init__(self): - self.mp_ctx = get_context('spawn') # To be compatible with classifier workers - - self.db_writer_queue = self.mp_ctx.Queue(maxsize=512) - self.process = self.mp_ctx.Process(target=embedding_writer, args=(self.db_writer_queue,)) - self.process.start() - - def __enter__(self): - return self.db_writer_queue - - def __exit__(self, exc_type, exc_val, exc_tb): - self.db_writer_queue.put(None) - self.process.join() - - -def embedding_writer(queue: Queue): - meta = get_connection().Paragraphs_Meta - - while True: - batch_result = queue.get() - if batch_result is None: - break - meta_ids, _, hidden_states = batch_result[:3] - hidden_states = hidden_states[:, 0] - - for meta_id, hs in zip(meta_ids, hidden_states): - meta.update_one( - {'_id': meta_id}, - {'$set': { - ('paragraph_embedding_' + version): hs.tolist(), - }} - ) - print(meta_id, hs.shape) - - -if __name__ == "__main__": - batch_size = 16 - perform_collection( - SynProEmbeddingWriter, - make_batch(not_embedding_paragraphs(), batch_size), - ) diff --git a/synpro_scripts/convert_synpro.js b/synpro_scripts/convert_synpro.js deleted file mode 100644 index b86e15d..0000000 --- a/synpro_scripts/convert_synpro.js +++ /dev/null @@ -1,35 +0,0 @@ -db.getCollection('Paragraphs').find({}, {'path': 1}).forEach(function(doc) { - db.Paragraphs_Meta.update( - {paragraph_id: doc._id}, - {'$set': { - path: doc.path, - }} - ); -}) - -db.getCollection('Paragraphs_Meta').find({bert_classifier_20200904: {$exists: true}}).forEach(function(doc) { - var result = doc.bert_classifier_20200904; - - if(result){ - var cls = result ? Object.keys(result).filter(i => result[i] > 0.5) : []; - db.Paragraphs_Meta.update( - {_id: doc._id}, - {'$set': { - classification: cls.length > 0 ? cls[0] : null, - confidence: cls.length > 0 ? result[cls[0]] : null, - classifier_version: 'bert_classifier_20200904' - }} - ); - } -}) - -db.getCollection('Paragraphs_Meta').find({'bert_classifier_20200803.something_else': {$gte: 0.9}}).forEach(function(doc) { - db.Paragraphs_Meta.update( - {_id: doc._id}, - {'$set': { - classification: 'something_else', - confidence: doc.bert_classifier_20200803.something_else, - classifier_version: 'bert_classifier_20200904' - }} - ); -}) \ No newline at end of file diff --git a/synpro_scripts/patents_batteries.py b/synpro_scripts/patents_batteries.py deleted file mode 100644 index 39eb3ed..0000000 --- a/synpro_scripts/patents_batteries.py +++ /dev/null @@ -1,47 +0,0 @@ -from synthesis_classifier.database.patents import PatentsDBWriter, get_connection -from synthesis_classifier.multiprocessing_classifier import perform_collection, make_batch - - -class PatentBatteryParagraphs(object): - def __init__(self): - self.db = get_connection() - - def __iter__(self): - cursor = self.db.patent_section_battery.aggregate([ - {'$lookup': { - 'from': 'patent_text_section_battery_meta', - 'localField': 'paragraph_id', - 'foreignField': 'paragraph_id', - 'as': 'meta'}}, - {'$match': {'meta': {'$size': 0}}}, - {'$lookup': { - 'from': 'patent_text_section', - 'localField': 'paragraph_id', - 'foreignField': '_id', - 'as': 'p'}}, - ]) - - for item in cursor: - paragraph = item['p'][0]['text'] - if paragraph is not None and paragraph.strip(): - yield item['paragraph_id'], item['p'][0]['text'] - - def __len__(self): - return next(self.db.patent_section_battery.aggregate([ - {'$lookup': { - 'from': 'patent_text_section_battery_meta', - 'localField': 'paragraph_id', - 'foreignField': 'paragraph_id', - 'as': 'meta'}}, - {'$match': {'meta': {'$size': 0}}}, - {'$count': 'total'} - ]))['total'] - - -if __name__ == "__main__": - batch_size = 16 - perform_collection( - PatentsDBWriter(meta_col_name='patent_text_section_battery_meta'), - make_batch(PatentBatteryParagraphs(), batch_size), - './job_patents_batteries.sh' - ) diff --git a/synpro_scripts/patents_examples.py b/synpro_scripts/patents_examples.py deleted file mode 100644 index 82ef8f0..0000000 --- a/synpro_scripts/patents_examples.py +++ /dev/null @@ -1,21 +0,0 @@ -import re - -from synthesis_classifier.multiprocessing_classifier import perform_collection, make_batch -from synthesis_classifier.database.patents import PatentsDBWriter, PatentParagraphsByQuery - - -def example_paragraphs(): - query = { - 'path': re.compile(r'.*example.*', re.IGNORECASE), - } - - return PatentParagraphsByQuery(query) - - -if __name__ == "__main__": - batch_size = 16 - perform_collection( - PatentsDBWriter, - make_batch(example_paragraphs(), batch_size), - './job_patents_examples.sh' - ) diff --git a/synpro_scripts/reclassify.py b/synpro_scripts/reclassify.py deleted file mode 100644 index d07ff69..0000000 --- a/synpro_scripts/reclassify.py +++ /dev/null @@ -1,21 +0,0 @@ -from synthesis_classifier.model import classifier_version -from synthesis_classifier.multiprocessing_classifier import perform_collection, make_batch -from synthesis_classifier.database.synpro import MetaCollectionIteratorByQuery, SynProDBWriter - - -def already_classified(): - query = { - classifier_version: {'$exists': True}, - 'classification': {'$exists': False} - } - - return MetaCollectionIteratorByQuery(query) - - -if __name__ == "__main__": - batch_size = 16 - perform_collection( - SynProDBWriter, - make_batch(already_classified(), batch_size), - './job_reclassify.sh' - )