Commit

update global data processing
SinclairCoder committed Jun 23, 2024
1 parent 76804e7 commit 3872549
Showing 33 changed files with 4,521 additions and 0 deletions.
34 changes: 34 additions & 0 deletions src/build_dataset.py
@@ -0,0 +1,34 @@
import datasets
from datasets import load_dataset, Dataset, load_from_disk
from utils import load_data_from_jsonl
import sys, os

# Make the parent directory importable so its `utils` module can be star-imported below.
current_folder = os.path.dirname(os.path.abspath(__file__))
parent_folder = os.path.dirname(current_folder)
sys.path.append(parent_folder)
from utils import *  # expected to provide `paths`: a mapping from source name to JSONL file paths


META_DIR = "./data/filtering/"
TARGET_DIR = "./data-built-on-datasets/filtering"

for source in paths:
    # Only the arXiv source is converted in this pass.
    if source not in ["arXiv"]:
        continue
    for path in paths[source]:
        full_path = os.path.join(META_DIR, path)
        data = load_data_from_jsonl(full_path)
        for item in data:
            item['file_path'] = path
            if "synthetic_textbooks" in path:
                # Drop synthetic-generation metadata fields from `meta`.
                keys = ['outline', 'concepts', 'queries', 'context']
                for key in keys:
                    if key in item['meta']:
                        del item['meta'][key]

        ds = Dataset.from_list(data)
        ds.save_to_disk(os.path.join(TARGET_DIR, os.path.basename(path).replace(".jsonl", "")))
    print(f"{source} done!")


6 changes: 6 additions & 0 deletions src/global_data_processing/README.md
@@ -0,0 +1,6 @@
# Global Data Processing

Step 1. language identification
Step 2. cleaning and filtering
Step 3. deduplication, then leakage detection and removal
- Before this step, convert the data into Hugging Face `datasets` format: `python build_dataset.py` (a quick reload check is sketched below).
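
A minimal sketch (our own addition, not part of the pipeline) of reloading a converted split to check it; the directory name is a hypothetical example of what `build_dataset.py` writes under its `TARGET_DIR`:

```python
# Sketch: reload a converted split and inspect it.
from datasets import load_from_disk

# Hypothetical output directory produced by build_dataset.py (TARGET_DIR + split name).
ds = load_from_disk("./data-built-on-datasets/filtering/math_arXiv_v0.2")
print(ds)                     # schema and number of rows
print(ds[0]["text"][:200])    # peek at the first document
```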
15 changes: 15 additions & 0 deletions src/global_data_processing/dedup/count_common_ngram.sh
@@ -0,0 +1,15 @@



# Count frequent n-grams in the final deduplicated corpus
# (see the --ngram_size and --common_words_num flags below).

TARGET_DIR=../data-built-on-datasets/dedup/


python text_dedup/counting_common_ngram.py \
--path ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_minhashlsh_5gram_9000perm_b45r20_dedup_final_dedup \
--cache_dir "./cache" \
--output "./cache" \
--column "text" \
--local \
--hash_func md5 \
--ngram_size 10 \
--common_words_num 20
1 change: 1 addition & 0 deletions src/global_data_processing/dedup/deduplicate-text-datasets
Submodule deduplicate-text-datasets added at b64556
42 changes: 42 additions & 0 deletions src/global_data_processing/dedup/leakage_detection_remove.sh
@@ -0,0 +1,42 @@

# Line-level exact match for data leakage detection and removal.

# NOTE: TARGET_DIR is referenced below but was not set in this script; the value from
# the other scripts in this directory (e.g. count_common_ngram.sh) is assumed here.
TARGET_DIR=../data-built-on-datasets/dedup/

MATH_BENCH_TEST_SET_PATH=/data1/zzwang/benchmark-datasets/GSM8K_MATH_MMLU-STEM_test_set_concat_for_detect_data_leakage

MATH_BENCH_Q_A_TEST_SET_PATH=/data1/zzwang/benchmark-datasets/GSM8K_MATH_MMLU-STEM_test_set_concat_with_q_a_concat_for_detect_data_leakage


python text_dedup/ccnet_for_data_leakage.py \
--path ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_minhashlsh_5gram_9000perm_b45r20_dedup_final_dedup \
--reference_path ${MATH_BENCH_Q_A_TEST_SET_PATH} \
--cache_dir "./cache" \
--output ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_minhashlsh_5gram_9000perm_b45r20_dedup_final_dedup_after_ccnet_exact_match_benchmark_dedup_final \
--column "text" \
--local \
--hash_func md5



OPENWEBMATH_META_DIR=/data1/zzwang/openwebpath

python text_dedup/ccnet_for_data_leakage.py \
--path ${OPENWEBMATH_META_DIR}/openwebmath-built-on-datasets \
--reference_path ${MATH_BENCH_Q_A_TEST_SET_PATH} \
--cache_dir "./cache" \
--output ${OPENWEBMATH_META_DIR}/openwebmath-built-on-datasets-after_ccnet_exact_line_remove_against_math_bench_q_a_dedup_final \
--column "text" \
--local \
--hash_func md5


six_math_benchmark_path=/data1/zzwang/benchmark-datasets/agieval_math_auqa_asdiv_asdiva_swamp_numglue_mawps_mathqa_test_set_only_q_for_detect_data_leakage


python text_dedup/ccnet_for_data_leakage.py \
--path /data1/zzwang/mathpile/data-built-on-datasets/dedup/textbooks/synthetic_textbooks_markdown_minhashlsh_5gram_9000perm_b45r20_dedup_after_ccnet_exact_match_benchmark_dedup_final \
--reference_path ${six_math_benchmark_path} \
--cache_dir "./cache" \
--output /data1/zzwang/mathpile/data-built-on-datasets/dedup/textbooks/synthetic_textbooks_markdown_minhashlsh_5gram_9000perm_b45r20_dedup_after_ccnet_exact_match_benchmark_dedup_final_after_remove_more_math_benchmarks \
--column "text" \
--local \
--hash_func md5
23 changes: 23 additions & 0 deletions src/global_data_processing/dedup/readme.md
@@ -0,0 +1,23 @@
# Deduplication

`text_dedup` is built on `deduplicate-text-datasets`.

Thanks to the amazing codebases [text_dedup](https://github.com/ChenghaoMou/text-dedup) and [deduplicate-text-datasets](https://github.com/google-research/deduplicate-text-datasets).


We deduplicated with the MinHash algorithm (LSH variant): first within each source, then across sources. A minimal sketch of the idea follows the command below.

```
bash text-dedup.sh
```
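
As a rough illustration of what MinHash-LSH deduplication does, here is our own sketch using the `datasketch` library rather than the `text_dedup/minhash.py` implementation invoked above; the documents, `NUM_PERM`, and `threshold` are toy values:

```python
# Sketch: keep a document only if no previously kept document is a near-duplicate.
from datasketch import MinHash, MinHashLSH

def shingles(text, n=5):
    # Word-level 5-grams, mirroring the --ngram 5 setting used above.
    words = text.split()
    return {" ".join(words[i:i + n]) for i in range(max(1, len(words) - n + 1))}

docs = {
    "d1": "integration by parts lets us rewrite the integral of u dv",
    "d2": "integration by parts lets us rewrite the integral of u dv too",
    "d3": "a completely unrelated combinatorics exercise about binomial coefficients",
}

NUM_PERM = 128  # toy value; the real run uses 9000 permutations
lsh = MinHashLSH(threshold=0.8, num_perm=NUM_PERM)

kept = []
for key, text in docs.items():
    m = MinHash(num_perm=NUM_PERM)
    for gram in shingles(text):
        m.update(gram.encode("utf-8"))
    if not lsh.query(m):        # no near-duplicate already kept
        kept.append(key)
        lsh.insert(key, m)

print(kept)  # likely ['d1', 'd3']: d2 collides with d1
```

In the scripts above, the signature size (`--num_perm 9000`) and banding parameters (`--b 45 --r 20`) control the similarity cutoff instead of an explicit `threshold` argument.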


# Leakage Detection and Removal

We use line-level exact matching against benchmark test sets to detect leaked examples and remove them; a minimal sketch of the idea follows the command below.

```
bash leakage_detection_remove.sh
```
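
And a rough illustration of the line-level exact-match idea, as our own sketch over in-memory strings; the real pipeline runs `text_dedup/ccnet_for_data_leakage.py` over `datasets` saved on disk, with `--hash_func md5`:

```python
# Sketch: drop any corpus line whose MD5 hash matches a line from a benchmark test set.
import hashlib

def md5_line(line: str) -> str:
    return hashlib.md5(line.strip().encode("utf-8")).hexdigest()

benchmark_docs = ["What is 2 + 2?\nAnswer: 4"]  # toy test item
corpus_docs = ["Some exposition about addition.\nWhat is 2 + 2?\nAnswer: 4\nMore exposition."]

leaked = {md5_line(line) for doc in benchmark_docs
          for line in doc.splitlines() if line.strip()}

cleaned = ["\n".join(line for line in doc.splitlines() if md5_line(line) not in leaked)
           for doc in corpus_docs]

print(cleaned[0])  # the leaked question and answer lines are removed
```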

Note that before running these scripts, the data must be converted into Hugging Face `datasets` format (see `build_dataset.py`).
65 changes: 65 additions & 0 deletions src/global_data_processing/dedup/text-dedup.sh
@@ -0,0 +1,65 @@


mkdir cache


## dedup within each source

META_DIR=../data-built-on-datasets/filtering/
TARGET_DIR=../data-built-on-datasets/dedup/

for SOURCE in math_arXiv_v0.2 wikipedia_update stackexchange proofwiki commoncrawl
do
python text_dedup/minhash.py \
--path ${META_DIR}${SOURCE} \
--cache_dir "./cache" \
--output ${TARGET_DIR}${SOURCE}_minhashlsh_5gram_9000perm_b45r20_dedup \
--column "text" \
--local \
--ngram 5 \
--min_length 5 \
--seed 42 \
--num_perm 9000 \
--b 45 \
--r 20
# --maintain_all_docs
done


for SOURCE in textbooks/textbooks_markdown textbooks/textbooks_tex textbooks/synthetic_textbooks_markdown
do
python text_dedup/minhash.py \
--path ${META_DIR}${SOURCE} \
--cache_dir "./cache" \
--output ${TARGET_DIR}${SOURCE}_minhashlsh_5gram_9000perm_b45r20_dedup_raw \
--column "text" \
--local \
--ngram 5 \
--min_length 5 \
--seed 42 \
--num_perm 9000 \
--b 45 \
--r 20
# --maintain_all_docs
done

## dedup across sources

SUFFIX="_minhashlsh_5gram_9000perm_b45r20_dedup"

python text_dedup/minhash.py \
--path ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks${SUFFIX}_concat_before_inter_sources_dedup \
--cache_dir "./cache" \
--output ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_concat_minhashlsh_5gram_9000perm_b45r20_after_inter_sources_dedup_raw \
--column "text" \
--local \
--ngram 5 \
--min_length 5 \
--seed 42 \
--num_perm 9000 \
--b 45 \
--r 20
# --maintain_all_docs



14 changes: 14 additions & 0 deletions src/global_data_processing/dedup/text_dedup/__init__.py
@@ -0,0 +1,14 @@
#!/usr/bin/env python
# @Date : 2021-06-05 12:48:33
# @Author : Chenghao Mou ([email protected])

"""Text deduplication simplified."""

import logging

from rich.logging import RichHandler

logger = logging.getLogger("text_dedup")
logger.setLevel(logging.INFO)
logger.addHandler(RichHandler(rich_tracebacks=True))
logger.propagate = False
95 changes: 95 additions & 0 deletions src/global_data_processing/dedup/text_dedup/bloom_filter.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python
# @Date : 2022-11-05 09:44:48
# @Author : Chenghao Mou ([email protected])
import argparse
from typing import Callable

import datasets
import numpy as np
from datasets.load import load_dataset
from pybloom_live import ScalableBloomFilter
from tqdm import tqdm

from text_dedup import logger
from text_dedup.utils import add_bloom_filter_args
from text_dedup.utils import add_io_args
from text_dedup.utils import add_meta_args
from text_dedup.utils.hashfunc import md5_digest
from text_dedup.utils.hashfunc import sha256_digest
from text_dedup.utils.hashfunc import xxh3_128_digest
from text_dedup.utils.timer import Timer

if __name__ == "__main__":  # pragma: no cover
    parser = argparse.ArgumentParser(
        prog="text_dedup.bloomfilter",
        description="Deduplicate text using Bloom Filter",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser = add_io_args(parser)
    parser = add_meta_args(parser)
    parser = add_bloom_filter_args(parser)
    args = parser.parse_args()

    timer = Timer()
    flags = []

    with timer("Total"):
        with timer("Loading"):
            ds: datasets.Dataset = load_dataset(  # type: ignore
                path=args.path,
                name=args.name,
                data_dir=args.data_dir,
                data_files=args.data_files,
                split=args.split,
                revision=args.revision,
                cache_dir=args.cache_dir,
                token=args.use_auth_token,
                num_proc=args.num_proc,
            )

        hash_func: Callable = {
            "md5": md5_digest,  # type: ignore
            "sha256": sha256_digest,  # type: ignore
            "xxh3": xxh3_128_digest,  # type: ignore
        }[args.hash_func]

        LEN_DATASET = len(ds)

        bf = ScalableBloomFilter(
            initial_capacity=args.initial_capacity,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH,
            error_rate=args.error_rate,
        )
        with timer("Processing"):
            NUM_SHARDS = int(np.ceil(LEN_DATASET / args.batch_size))
            for idx in tqdm(range(0, NUM_SHARDS), desc="Processing..."):
                ds_shard = (
                    ds.shard(num_shards=NUM_SHARDS, index=idx, contiguous=True)
                    # TODO .map(either preprocessing like example.encode("utf-8") or multithreaded)
                )
                for example in tqdm(ds_shard[args.column], leave=False):
                    h = hash_func(example.encode("utf-8"))
                    # True if the element is seen, False otherwise
                    flags.append(bf.add(h))

        with timer("Filtering"):
            ds = ds.filter(
                lambda _, idx: not flags[idx],
                with_indices=True,
                num_proc=args.num_proc,
                desc="Filtering...",
            )

        with timer("Saving"):
            ds.save_to_disk(args.output)

        with timer("Cleaning"):
            if args.clean_cache:
                ds.cleanup_cache_files()

    PAD = 32
    for k, v in timer.elapsed_times.items():
        logger.info(f"{k:<{PAD}}: {v:.2f}s")

    logger.info(f"{'Before':<{PAD}}: {len(flags)}")
    logger.info(f"{'After':<{PAD}}: {len(ds)}")