Merge pull request #265 from 0xTong/main

Upload darwin benchmark
materialsproject · Jan 20, 2024 · 19bb0dd · 19bb0dd
2 parents ed02568 + ce2fbe5
commit 19bb0dd
Show file tree

Hide file tree

Showing 7 changed files with 371 additions and 0 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/benchmarks/.DS_Store b/benchmarks/.DS_Store
diff --git a/benchmarks/matbench_v0.1_darwin/.DS_Store b/benchmarks/matbench_v0.1_darwin/.DS_Store
diff --git a/benchmarks/matbench_v0.1_darwin/info.json b/benchmarks/matbench_v0.1_darwin/info.json
@@ -0,0 +1,22 @@
+{
+  "authors": "Tong Xie, Yuwei Wan, Wei Huang, Zhenyu Yin, Yixuan Liu, Shaozhou Wang, Qingyuan Linghu, Imran Razzak, Bram Hoex, Chunyu Kit, Wenjie Zhang",
+  "algorithm": "Darwin",
+  "algorithm_long": "Fine-tuning DARWIN Natural Science Large Language Model",
+  "bibtex_refs": "@misc{xie2023large,\n title={Large Language Models as Master Key: Unlocking the Secrets of Materials Science with GPT},\n author={Tong Xie and Yuwei Wan and Wei Huang and Yufei Zhou and Yixuan Liu and Qingyuan Linghu and Shaozhou Wang and Chunyu Kit and Clara Grazian and Wenjie Zhang and Bram Hoex},\n year={2023},\n eprint={2304.02213},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}",
+  "notes": "We provide the prompts and call-and-return of our model. The code for evaluating the benchmarks is available at https://github.com/MasterAI-EAM/Darwin-SIT, and our base model is available at https://aigreendynamics-my.sharepoint.com/:f:/g/personal/yuwei_greendynamics_com_au/EvZEghuFSZZCguWrCsbk2QMB_eYqv-BRMM4VLhcK8TT4Zw?e=9bnqWW. Training our model requires at least 4 A100 (80 GB) GPUs.",
+  "requirements": {
+    "python": [
+        "git+https://github.com/MasterAI-EAM/Darwin.git",
+        "matbench==0.1.0",
+        "numpy",
+        "rouge_score",
+        "fire",
+        "openai",
+        "transformers>=4.28.1",
+        "torch",
+        "sentencepiece",
+        "tokenizers>=0.13.3",
+        "wandb"
+    ]
+  }
+}
diff --git a/benchmarks/matbench_v0.1_darwin/preprocessing.py b/benchmarks/matbench_v0.1_darwin/preprocessing.py
@@ -0,0 +1,233 @@
+import pandas as pd
+import random
+import json 
+def convert_gap(train_inputs,train_outputs=None,train=True):
+    """Turn band-gap samples into instruction-style records.
+
+    Args:
+        train_inputs: data passed to ``pd.DataFrame``; the resulting frame
+            must expose a "composition" column.
+        train_outputs: targets joined onto the inputs by index; the joined
+            frame must expose a "gap expt" column. Only read when ``train``
+            is true.
+        train: when true, every record also carries the expected "output".
+
+    Returns:
+        A list of dicts with the keys "instruction" and "input", plus
+        "output" when ``train`` is true, in the row order of the inputs.
+    """
+    input_df = pd.DataFrame(train_inputs)
+    if train:
+        df = input_df.join(pd.DataFrame(train_outputs))
+    else:
+        df = input_df
+
+    # NOTE: the previous if/elif chain also held a fifth phrasing
+    # ("Given composition, what is its band gap? ->") guarded by ran == 4,
+    # a value random.randint(0, 3) can never return.  The dead branch is
+    # dropped and the draw range left alone, so seeded runs keep producing
+    # exactly the same prompts as before.
+    templates = (
+        "What is band gap of given composition? ->",
+        "Write band gap of given composition. ->",
+        "Given composition, write its band gap. ->",
+        "Tell me band gap of given composition. ->",
+    )
+
+    data_list = []
+    for _, row in df.iterrows():
+        record = {
+            "instruction": templates[random.randint(0, 3)],
+            "input": str(row["composition"]) + "\n",
+        }
+        if train:
+            record["output"] = " " + str(row["gap expt"]) + "\n"
+        data_list.append(record)
+
+    return data_list
+
+def convert_metal(train_inputs,train_outputs=None,train=True):
+    """Turn is-metal samples into instruction-style records.
+
+    For training data, extra "distractor" records are appended whose input is
+    a common English noun and whose answer states that it is not a
+    composition; the combined list is then shuffled.
+
+    Args:
+        train_inputs: data passed to ``pd.DataFrame``; the resulting frame
+            must expose a "composition" column of strings.
+        train_outputs: targets joined onto the inputs by index; the joined
+            frame must expose an "is_metal" column (assumed boolean, as each
+            value is only tested for truthiness). Only read when ``train``
+            is true.
+        train: when true, records carry an "output" and distractors are
+            added; when false, records keep the input order and have no
+            "output".
+
+    Returns:
+        A list of dicts with the keys "instruction" and "input", plus
+        "output" when ``train`` is true.
+
+    Raises:
+        OSError: if distractors are needed and '2325_nouns.json' cannot be
+            opened in the working directory.
+    """
+    input_df = pd.DataFrame(train_inputs)
+    if train:
+        df = input_df.join(pd.DataFrame(train_outputs))
+    else:
+        df = input_df
+
+    questions = (
+        "Is composition metal? ->",
+        "Is given composition metal? ->",
+        "Given composition, is it metal? ->",
+    )
+
+    # Plain lists give true positional access.  The previous
+    # df['is_metal'][i] lookup used an integer key on a frame whose index
+    # comes from the caller, which pandas only treats as a position through a
+    # deprecated fallback.
+    answers = df["is_metal"].tolist() if train else None
+
+    data_list = []
+    for pos, element in enumerate(df["composition"]):
+        record = {
+            "instruction": questions[random.randint(0, 2)],
+            "input": " " + element + "\n",
+        }
+        if train:
+            if answers[pos]:
+                record["output"] = f" Yes, {element} is metal.\n"
+            else:
+                record["output"] = f" No, {element} is not metal.\n"
+        data_list.append(record)
+
+    if not train:
+        return data_list
+
+    # One distractor for every 30 real samples.
+    n_distractors = len(data_list) // 30
+    if n_distractors:
+        # https://github.com/psobko/Common-English-Nouns
+        with open('2325_nouns.json', 'r', encoding='utf-8') as f:
+            nouns = json.load(f)
+
+        for noun in random.sample(nouns, n_distractors):
+            question = questions[random.randint(0, 2)]
+            # Always yields "composition"; the draw is kept because it
+            # advances the seeded random stream exactly as before.
+            slot_name = random.sample(["composition"], 1)[0]
+            data_list.append({
+                "instruction": question,
+                "input": " " + noun + "\n",
+                "output": " " + noun + " is not a " + slot_name + " and it is not metal.\n",
+            })
+
+    random.shuffle(data_list)
+    return data_list
+
+def convert_steels(train_inputs,train_outputs=None,train=True):
+    """Turn steel yield-strength samples into instruction-style records.
+
+    For training data two kinds of "no-answer" records are appended before
+    the list is shuffled:
+
+    * real compositions asked without the temperature condition, answered
+      with "Unable to answer due to lack of conditions.";
+    * common English nouns asked the regular question, answered with a
+      statement that the noun is not a composition.
+
+    Args:
+        train_inputs: data passed to ``pd.DataFrame``; the resulting frame
+            must expose a "composition" column of strings.
+        train_outputs: targets joined onto the inputs by index; the joined
+            frame must expose a "yield strength" column. Only read when
+            ``train`` is true.
+        train: when true, records carry an "output" and the no-answer
+            records are added; when false, records keep the input order and
+            have no "output".
+
+    Returns:
+        A list of dicts with the keys "instruction" and "input", plus
+        "output" when ``train`` is true.
+
+    Raises:
+        OSError: if noun records are needed and '2325_nouns.json' cannot be
+            opened in the working directory.
+    """
+    input_df = pd.DataFrame(train_inputs)
+    if train:
+        df = input_df.join(pd.DataFrame(train_outputs))
+    else:
+        df = input_df
+
+    conditioned_questions = (
+        "What will be the yield strength of given composition at 800-1200 °C? ->",
+        "Write a possible yield strength of given composition at 800-1200 °C. ->",
+        "Given composition, write its potential yield strength at 800-1200 °C. ->",
+    )
+    unconditioned_questions = (
+        "What is yield strength of composition? ->",
+        "Write a possible yield strength of given composition. ->",
+        "Given composition, write its potential yield strength. ->",
+    )
+
+    compositions = df["composition"].tolist()
+    # Plain lists give true positional access.  The previous
+    # df['yield strength'][i] lookup used an integer key on a frame whose
+    # index comes from the caller, which pandas only treats as a position
+    # through a deprecated fallback.
+    answers = df["yield strength"].tolist() if train else None
+
+    data_list = []
+    for pos, element in enumerate(compositions):
+        record = {
+            "instruction": conditioned_questions[random.randint(0, 2)],
+            "input": " " + element + "\n",
+        }
+        if train:
+            record["output"] = " " + str(answers[pos]) + "\n"
+        data_list.append(record)
+
+    if not train:
+        return data_list
+
+    # Both counts are derived from the number of real samples only.
+    n_unconditioned = len(data_list) // 50
+    n_nouns = len(data_list) // 30
+
+    for comp in random.sample(compositions, n_unconditioned):
+        question = unconditioned_questions[random.randint(0, 2)]
+        # The result is not used here; the draw is kept because it advances
+        # the seeded random stream exactly as before.
+        random.sample(["composition"], 1)
+        data_list.append({
+            "instruction": question,
+            "input": " " + comp + ".\n",
+            "output": " Unable to answer due to lack of conditions.\n",
+        })
+
+    if n_nouns:
+        # https://github.com/psobko/Common-English-Nouns
+        with open('2325_nouns.json', 'r', encoding='utf-8') as f:
+            nouns = json.load(f)
+
+        for noun in random.sample(nouns, n_nouns):
+            question = conditioned_questions[random.randint(0, 2)]
+            # Always yields "composition"; kept for the same reason as above.
+            slot_name = random.sample(["composition"], 1)[0]
+            data_list.append({
+                "instruction": question,
+                # Unlike the other records, the noun is passed through
+                # without padding, exactly as before.
+                "input": noun,
+                "output": " " + noun + " is not a " + slot_name + " and it does not have yield strength.\n",
+            })
+
+    random.shuffle(data_list)
+    return data_list
+
+def convert_glass(train_inputs,train_outputs=None,train=True):
+    """Turn glass-forming-ability samples into instruction-style records.
+
+    For training data, extra "distractor" records are appended whose input is
+    a common English noun and whose answer states that it is not a
+    composition; the combined list is then shuffled.
+
+    Args:
+        train_inputs: data passed to ``pd.DataFrame``; the resulting frame
+            must expose a "composition" column.
+        train_outputs: targets joined onto the inputs by index; the joined
+            frame must expose a "gfa" column. A row counts as glass-forming
+            only when ``str(value) == "True"``. Only read when ``train`` is
+            true.
+        train: when true, records carry an "output" and distractors are
+            added; when false, records keep the input order and have no
+            "output".
+
+    Returns:
+        A list of dicts with the keys "instruction" and "input", plus
+        "output" when ``train`` is true.
+
+    Raises:
+        OSError: if distractors are needed and '2325_nouns.json' cannot be
+            opened in the working directory.
+    """
+    input_df = pd.DataFrame(train_inputs)
+    if train:
+        df = input_df.join(pd.DataFrame(train_outputs))
+    else:
+        df = input_df
+
+    forms = ("glass formation ability", "glass-forming ability")
+
+    def draw_question():
+        """Draw the question shape first, then the wording of the property."""
+        shape = random.randint(0, 1)
+        form = forms[random.randint(0, 1)]
+        if shape == 0:
+            question = "Does given composition have " + form + "? ->"
+        else:
+            question = "Tell me if given composition has " + form + ". ->"
+        return question, form
+
+    data_list = []
+    for _, row in df.iterrows():
+        question, form = draw_question()
+        composition = str(row["composition"])
+        record = {"instruction": question, "input": " " + composition + "\n"}
+        if train:
+            if str(row["gfa"]) == "True":
+                record["output"] = " Yes, " + composition + " has " + form + ".\n"
+            else:
+                record["output"] = " No, " + composition + " does not have " + form + ".\n"
+        data_list.append(record)
+
+    if not train:
+        return data_list
+
+    # One distractor for every 30 real samples.
+    n_distractors = len(data_list) // 30
+    if n_distractors:
+        # https://github.com/psobko/Common-English-Nouns
+        with open('2325_nouns.json', 'r', encoding='utf-8') as f:
+            nouns = json.load(f)
+
+        for noun in random.sample(nouns, n_distractors):
+            # The question is built by the same helper as above rather than
+            # by reusing a loop variable left over from the sample loop.
+            question, form = draw_question()
+            data_list.append({
+                "instruction": question,
+                "input": " " + noun + "\n",
+                "output": " " + noun + " is not a composition and it has no relation with " + form + ".\n",
+            })
+
+    random.shuffle(data_list)
+    return data_list
diff --git a/benchmarks/matbench_v0.1_darwin/results.json.gz b/benchmarks/matbench_v0.1_darwin/results.json.gz
diff --git a/benchmarks/matbench_v0.1_darwin/run.py b/benchmarks/matbench_v0.1_darwin/run.py
@@ -0,0 +1,116 @@
+"""Prepare Matbench data for Darwin, fine-tune one model per fold and record predictions.
+
+The script expects train.py, evaluate_matbench.py and the ``base_model``
+checkpoint to be reachable from the working directory (see the installation
+notes below).
+"""
+from matbench.bench import MatbenchBenchmark
+from preprocessing import *
+import random
+import json
+import os
+import subprocess
+random.seed(0)
+
+################################ Darwin Installation ################################
+
+'''
+Please clone and install the Darwin package
+
+  1) git clone https://github.com/MasterAI-EAM/Darwin.git
+  2) pip install -r requirements.txt
+  3) download the base Darwin model from https://aigreendynamics-my.sharepoint.com/:f:/g/personal/yuwei_greendynamics_com_au/EvZEghuFSZZCguWrCsbk2QMB_eYqv-BRMM4VLhcK8TT4Zw?e=9bnqWW
+     Our base model is built upon LLaMA-7b, trained with 9 datasets: Chembl, ESOL, MoosaviCp, MoosaviDiversity, NagasawaOPV, OPV, Pei, WaterStability
+'''
+# NOTE(review): the text above says nine datasets but names eight -- confirm.
+
+mb = MatbenchBenchmark(
+    autoload=True,
+    subset=[
+        "matbench_expt_is_metal",
+        "matbench_steels",
+        "matbench_glass",
+        "matbench_expt_gap"
+    ],
+)
+
+data_dir = 'train_test_data'
+# exist_ok lets the script be re-run without failing on existing directories.
+os.makedirs(data_dir, exist_ok=True)
+os.makedirs('matbench_model', exist_ok=True)
+
+# Dataset name -> function that turns samples into natural-language records.
+converters = {
+    'matbench_expt_gap': convert_gap,
+    'matbench_expt_is_metal': convert_metal,
+    'matbench_steels': convert_steels,
+    'matbench_glass': convert_glass,
+}
+
+# Training records of all tasks, pooled per fold.
+fold_data = {0:[],1:[],2:[],3:[],4:[]}
+
+for task in mb.tasks:
+    task.load()
+    convert = converters[task.dataset_name]
+    for fold in task.folds:
+        # prepare the data for Darwin
+        train_inputs, train_outputs = task.get_train_and_val_data(fold)
+        test_inputs, _ = task.get_test_data(fold,include_target=True)
+        # transform data into natural language
+        training_data = convert(train_inputs,train_outputs)
+        test_data = convert(test_inputs,train=False)
+        # mix 4 tasks into a single training set
+        fold_data[fold] += training_data
+
+        # create test dataset
+        test_data_path = f"{data_dir}/matbench_base_fold_{fold}_{task.dataset_name}_test.json"
+        with open(test_data_path,'w') as f:
+            json.dump(test_data,f)
+
+# creating the training dataset, training and evaluating the model
+for fold in fold_data:
+
+    # creating training dataset
+    training_data = fold_data[fold]
+    random.shuffle(training_data)
+    data_path = f"{data_dir}/matbench_base_fold_{fold}_train.json"
+    output_path = f"matbench_model/fold{fold}"
+    with open(data_path,'w') as f:
+        json.dump(training_data,f)
+
+    # train the model
+    # The command is passed as an argument list: the previous shell string
+    # had no space after --data_path and --output_dir, so both options were
+    # fused with their values.  check=True stops the run instead of
+    # evaluating a model whose training failed.
+    subprocess.run(
+        [
+            "torchrun", "--nproc_per_node=8", "--master_port=1212", "train.py",
+            "--model_name_or_path", "base_model",
+            "--data_path", data_path,
+            "--bf16", "True",
+            "--output_dir", output_path,
+            "--num_train_epochs", "5",
+            "--per_device_train_batch_size", "1",
+            "--per_device_eval_batch_size", "1",
+            "--gradient_accumulation_steps", "2",
+            "--evaluation_strategy", "no",
+            "--save_strategy", "steps",
+            "--save_steps", "500",
+            "--save_total_limit", "1",
+            "--learning_rate", "2e-5",
+            "--weight_decay", "0.",
+            "--warmup_ratio", "0.03",
+            "--lr_scheduler_type", "cosine",
+            "--logging_steps", "1",
+            "--fsdp", "full_shard auto_wrap",
+            "--fsdp_transformer_layer_cls_to_wrap", "LlamaDecoderLayer",
+            "--tf32", "False",
+        ],
+        check=True,
+    )
+
+    # evaluate the model
+    for task in mb.tasks:
+        test_inputs, _ = task.get_test_data(fold,include_target=True)
+        test_data_path = f"{data_dir}/matbench_base_fold_{fold}_{task.dataset_name}_test.json"
+        subprocess.run(
+            [
+                "python", "evaluate_matbench.py",
+                "--model_path", output_path,
+                "--data_path", test_data_path,
+                "--dataset", task.dataset_name,
+                "--fold", str(fold),
+            ],
+            check=True,
+        )
+
+        # load the prediction result
+        # NOTE(review): the result file is read from the working directory,
+        # not from data_dir; presumably evaluate_matbench.py writes it there
+        # -- confirm.
+        result_path = f"matbench_base_fold_{fold}_{task.dataset_name}_test_result.json"
+        with open(result_path) as f:
+            results = json.load(f)
+        prediction_by_input = {item['input'].strip(): item['output'] for item in results}
+        # Iterating the inputs walks their values in order (assumes
+        # test_inputs is a pandas Series of composition strings -- confirm).
+        # The previous test_inputs[i] lookup relied on integer keys being
+        # treated as positions.
+        predicted_output = [prediction_by_input[composition] for composition in test_inputs]
+        task.record(fold,predicted_output)
+
+# save the result
+mb.to_file("results.json.gz")