1 parent 76804e7, commit 3872549
Showing 33 changed files with 4,521 additions and 0 deletions.
@@ -0,0 +1,34 @@
import datasets
from datasets import load_dataset, Dataset, load_from_disk
from utils import load_data_from_jsonl
import sys, os

current_folder = os.path.dirname(os.path.abspath(__file__))
parent_folder = os.path.dirname(current_folder)
sys.path.append(parent_folder)
from utils import *  # NOTE: `paths` below is not defined in this file; it is assumed to come from utils (source name -> list of JSONL paths)


META_DIR = "./data/filtering/"
TARGET_DIR = "./data-built-on-datasets/filtering"

for source in paths:
    # only the arXiv source is converted in this run
    if source not in ["arXiv"]:
        continue
    for path in paths[source]:
        full_path = os.path.join(META_DIR, path)
        data = load_data_from_jsonl(full_path)
        for item in data:
            item['file_path'] = path
            if "synthetic_textbooks" in path:
                # drop bulky generation metadata before building the dataset
                keys = ['outline', 'concepts', 'queries', 'context']
                for key in keys:
                    if key in item['meta']:
                        del item['meta'][key]

        # build a Hugging Face Dataset and save it, named after the source JSONL file
        ds = Dataset.from_list(data)
        ds.save_to_disk(os.path.join(TARGET_DIR, os.path.basename(path).replace(".jsonl", "")))
    print(f"{source} done!")
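As a quick sanity check (not part of this commit), any of the converted splits can be reloaded with `datasets.load_from_disk`; the split name below is illustrative:

```
from datasets import load_from_disk

# illustrative: one of the splits written by the script above
ds = load_from_disk("./data-built-on-datasets/filtering/math_arXiv_v0.2")
print(ds)                    # schema and number of rows
print(ds[0]["text"][:200])   # peek at the first document
```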
@@ -0,0 +1,6 @@
# Global Data Processing

Step 1. Language identification (a small sketch follows below)
Step 2. Cleaning and filtering
Step 3. Deduplication, plus leakage detection and removal
- Before this step, the data needs to be converted into Hugging Face `datasets` format: `python bulid_dataset.py`
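For Step 1, the README does not name a language-identification tool; a minimal sketch using fastText's pretrained `lid.176.bin` model (an assumption for illustration, not necessarily what this pipeline uses) could look like:

```
import fasttext

# assumes the pretrained lid.176.bin model has been downloaded locally
model = fasttext.load_model("lid.176.bin")

def detect_language(text):
    # fastText expects single-line input, so collapse newlines first
    labels, probs = model.predict(text.replace("\n", " "), k=1)
    return labels[0].replace("__label__", ""), float(probs[0])

print(detect_language("Let $f(x) = x^2 + 1$; then f is continuous on the reals."))  # e.g. ('en', 0.9...)
```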
@@ -0,0 +1,15 @@
# report the most frequent 10-grams in the deduplicated corpus (see --ngram_size / --common_words_num)
TARGET_DIR=../data-built-on-datasets/dedup/

python text_dedup/counting_common_ngram.py \
    --path ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_minhashlsh_5gram_9000perm_b45r20_dedup_final_dedup \
    --cache_dir "./cache" \
    --output "./cache" \
    --column "text" \
    --local \
    --hash_func md5 \
    --ngram_size 10 \
    --common_words_num 20
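`counting_common_ngram.py` itself is not part of this commit; conceptually, counting the most frequent word 10-grams over a saved `datasets` split can be sketched as follows (an illustration, not the actual implementation):

```
from collections import Counter
from datasets import load_from_disk

NGRAM_SIZE = 10   # mirrors --ngram_size 10
TOP_K = 20        # mirrors --common_words_num 20

# illustrative path; any dataset with a "text" column works
ds = load_from_disk("../data-built-on-datasets/dedup/some_deduped_split")

counter = Counter()
for text in ds["text"]:
    tokens = text.split()
    for i in range(len(tokens) - NGRAM_SIZE + 1):
        counter[" ".join(tokens[i:i + NGRAM_SIZE])] += 1

for ngram, count in counter.most_common(TOP_K):
    print(count, ngram)
```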
Submodule deduplicate-text-datasets added at b64556
src/global_data_processing/dedup/leakage_detection_remove.sh (42 changes: 42 additions & 0 deletions)
@@ -0,0 +1,42 @@
# line-level exact match for data leakage detection and removal
# NOTE: assumes TARGET_DIR is already set (see text-dedup.sh)

MATH_BENCH_TEST_SET_PATH=/data1/zzwang/benchmark-datasets/GSM8K_MATH_MMLU-STEM_test_set_concat_for_detect_data_leakage
MATH_BENCH_Q_A_TEST_SET_PATH=/data1/zzwang/benchmark-datasets/GSM8K_MATH_MMLU-STEM_test_set_concat_with_q_a_concat_for_detect_data_leakage

# remove lines that exactly match GSM8K / MATH / MMLU-STEM test questions and answers
python text_dedup/ccnet_for_data_leakage.py \
    --path ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_minhashlsh_5gram_9000perm_b45r20_dedup_final_dedup \
    --reference_path ${MATH_BENCH_Q_A_TEST_SET_PATH} \
    --cache_dir "./cache" \
    --output ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_minhashlsh_5gram_9000perm_b45r20_dedup_final_dedup_after_ccnet_exact_match_benchmark_dedup_final \
    --column "text" \
    --local \
    --hash_func md5

# the same check against OpenWebMath
OPENWEBMATH_META_DIR=/data1/zzwang/openwebpath

python text_dedup/ccnet_for_data_leakage.py \
    --path ${OPENWEBMATH_META_DIR}/openwebmath-built-on-datasets \
    --reference_path ${MATH_BENCH_Q_A_TEST_SET_PATH} \
    --cache_dir "./cache" \
    --output ${OPENWEBMATH_META_DIR}/openwebmath-built-on-datasets-after_ccnet_exact_line_remove_against_math_bench_q_a_dedup_final \
    --column "text" \
    --local \
    --hash_func md5

# additional pass for synthetic textbooks against more math benchmarks (questions only)
six_math_benchmark_path=/data1/zzwang/benchmark-datasets/agieval_math_auqa_asdiv_asdiva_swamp_numglue_mawps_mathqa_test_set_only_q_for_detect_data_leakage

python text_dedup/ccnet_for_data_leakage.py \
    --path /data1/zzwang/mathpile/data-built-on-datasets/dedup/textbooks/synthetic_textbooks_markdown_minhashlsh_5gram_9000perm_b45r20_dedup_after_ccnet_exact_match_benchmark_dedup_final \
    --reference_path ${six_math_benchmark_path} \
    --cache_dir "./cache" \
    --output /data1/zzwang/mathpile/data-built-on-datasets/dedup/textbooks/synthetic_textbooks_markdown_minhashlsh_5gram_9000perm_b45r20_dedup_after_ccnet_exact_match_benchmark_dedup_final_after_remove_more_math_benchmarks \
    --column "text" \
    --local \
    --hash_func md5
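`ccnet_for_data_leakage.py` is likewise not included in this commit. The idea behind line-level exact matching can be sketched roughly as below, hashing lines with md5 as the `--hash_func md5` flag suggests (illustrative only; the real script's behaviour, e.g. whether it drops lines or whole documents, may differ):

```
import hashlib
from datasets import load_from_disk

def md5_line(line):
    return hashlib.md5(line.strip().encode("utf-8")).hexdigest()

# hash every non-empty line of the benchmark reference set (illustrative paths)
reference = load_from_disk("/path/to/benchmark_reference")
leaked = {md5_line(l) for text in reference["text"] for l in text.splitlines() if l.strip()}

def remove_leaked_lines(example):
    kept = [l for l in example["text"].splitlines() if md5_line(l) not in leaked]
    return {"text": "\n".join(kept)}

corpus = load_from_disk("/path/to/corpus")
cleaned = corpus.map(remove_leaked_lines)
```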
@@ -0,0 +1,23 @@
# Deduplication

text_dedup is built on deduplicate-text-datasets.

Thanks to the amazing codebases [text_dedup](https://github.com/ChenghaoMou/text-dedup) and [deduplicate-text-datasets](https://github.com/google-research/deduplicate-text-datasets).

We conducted deduplication with the MinHash algorithm (LSH version). Specifically, we deduplicated within each source first and then across sources.

```
bash text-dedup.sh
```
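To make the idea concrete, here is a small sketch of MinHash LSH duplicate detection using the `datasketch` library (an illustration only; the actual pipeline uses ChenghaoMou's text-dedup with 5-gram shingles, `--num_perm 9000`, `--b 45`, `--r 20`):

```
from datasketch import MinHash, MinHashLSH

def shingles(text, n=5):
    tokens = text.lower().split()
    return {" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def minhash(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for s in shingles(text):
        m.update(s.encode("utf-8"))
    return m

docs = {
    "a": "the quick brown fox jumps over the lazy dog near the river bank",
    "b": "the quick brown fox jumps over the lazy dog near the river bank today",
    "c": "completely different content about elliptic curves and modular forms here",
}

lsh = MinHashLSH(threshold=0.8, num_perm=128)
for key, text in docs.items():
    lsh.insert(key, minhash(text))

# keys whose estimated Jaccard similarity with "a" exceeds the threshold
print(lsh.query(minhash(docs["a"])))  # ['a', 'b'] (order may vary)
```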
# Leakage Detection and Removal

We use line-level exact matching to detect leaked benchmark content and remove it.

```
bash leakage_detection_remove.sh
```

Note that before running these scripts, the data needs to be converted into Hugging Face `datasets` format.
@@ -0,0 +1,65 @@
mkdir cache

## source inner dedup

META_DIR=../data-built-on-datasets/filtering/
TARGET_DIR=../data-built-on-datasets/dedup/

for SOURCE in math_arXiv_v0.2 wikipedia_update stackexchange proofwiki commoncrawl
do
    python text_dedup/minhash.py \
        --path ${META_DIR}${SOURCE} \
        --cache_dir "./cache" \
        --output ${TARGET_DIR}${SOURCE}_minhashlsh_5gram_9000perm_b45r20_dedup \
        --column "text" \
        --local \
        --ngram 5 \
        --min_length 5 \
        --seed 42 \
        --num_perm 9000 \
        --b 45 \
        --r 20
        # --maintain_all_docs
done

for SOURCE in textbooks/textbooks_markdown textbooks/textbooks_tex textbooks/synthetic_textbooks_markdown
do
    python text_dedup/minhash.py \
        --path ${META_DIR}${SOURCE} \
        --cache_dir "./cache" \
        --output ${TARGET_DIR}${SOURCE}_minhashlsh_5gram_9000perm_b45r20_dedup_raw \
        --column "text" \
        --local \
        --ngram 5 \
        --min_length 5 \
        --seed 42 \
        --num_perm 9000 \
        --b 45 \
        --r 20
        # --maintain_all_docs
done

## source inter dedup

SUFFIX="_minhashlsh_5gram_9000perm_b45r20_dedup"

python text_dedup/minhash.py \
    --path ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks${SUFFIX}_concat_before_inter_sources_dedup \
    --cache_dir "./cache" \
    --output ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_concat_minhashlsh_5gram_9000perm_b45r20_after_inter_sources_dedup_raw \
    --column "text" \
    --local \
    --ngram 5 \
    --min_length 5 \
    --seed 42 \
    --num_perm 9000 \
    --b 45 \
    --r 20
    # --maintain_all_docs
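The concatenated input for the cross-source pass (`..._concat_before_inter_sources_dedup`) is not produced by this script; presumably the per-source outputs are merged beforehand, roughly along these lines (an illustrative sketch, with only some of the source names spelled out):

```
from datasets import concatenate_datasets, load_from_disk

TARGET_DIR = "../data-built-on-datasets/dedup/"
SUFFIX = "_minhashlsh_5gram_9000perm_b45r20_dedup"

# per-source deduplicated splits from the loops above (textbook splits would be added similarly)
sources = ["math_arXiv_v0.2", "commoncrawl", "stackexchange", "wikipedia_update"]

parts = [load_from_disk(f"{TARGET_DIR}{s}{SUFFIX}") for s in sources]
merged = concatenate_datasets(parts)
merged.save_to_disk(
    f"{TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks{SUFFIX}_concat_before_inter_sources_dedup"
)
```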
@@ -0,0 +1,14 @@
#!/usr/bin/env python
# @Date    : 2021-06-05 12:48:33
# @Author  : Chenghao Mou ([email protected])

"""Text deduplication simplified."""

import logging

from rich.logging import RichHandler

logger = logging.getLogger("text_dedup")
logger.setLevel(logging.INFO)
logger.addHandler(RichHandler(rich_tracebacks=True))
logger.propagate = False
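The other modules in the package share this logger; for example, bloom_filter.py below imports it as shown here (the log message is illustrative):

```
from text_dedup import logger

logger.info("bloom filter deduplication started")
```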
src/global_data_processing/dedup/text_dedup/bloom_filter.py (95 changes: 95 additions & 0 deletions)
@@ -0,0 +1,95 @@
#!/usr/bin/env python
# @Date    : 2022-11-05 09:44:48
# @Author  : Chenghao Mou ([email protected])
import argparse
from typing import Callable

import datasets
import numpy as np
from datasets.load import load_dataset
from pybloom_live import ScalableBloomFilter
from tqdm import tqdm

from text_dedup import logger
from text_dedup.utils import add_bloom_filter_args
from text_dedup.utils import add_io_args
from text_dedup.utils import add_meta_args
from text_dedup.utils.hashfunc import md5_digest
from text_dedup.utils.hashfunc import sha256_digest
from text_dedup.utils.hashfunc import xxh3_128_digest
from text_dedup.utils.timer import Timer

if __name__ == "__main__":  # pragma: no cover
    parser = argparse.ArgumentParser(
        prog="text_dedup.bloomfilter",
        description="Deduplicate text using Bloom Filter",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser = add_io_args(parser)
    parser = add_meta_args(parser)
    parser = add_bloom_filter_args(parser)
    args = parser.parse_args()

    timer = Timer()
    flags = []

    with timer("Total"):
        with timer("Loading"):
            ds: datasets.Dataset = load_dataset(  # type: ignore
                path=args.path,
                name=args.name,
                data_dir=args.data_dir,
                data_files=args.data_files,
                split=args.split,
                revision=args.revision,
                cache_dir=args.cache_dir,
                token=args.use_auth_token,
                num_proc=args.num_proc,
            )

        hash_func: Callable = {
            "md5": md5_digest,  # type: ignore
            "sha256": sha256_digest,  # type: ignore
            "xxh3": xxh3_128_digest,  # type: ignore
        }[args.hash_func]

        LEN_DATASET = len(ds)

        bf = ScalableBloomFilter(
            initial_capacity=args.initial_capacity,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH,
            error_rate=args.error_rate,
        )
        with timer("Processing"):
            NUM_SHARDS = int(np.ceil(LEN_DATASET / args.batch_size))
            for idx in tqdm(range(0, NUM_SHARDS), desc="Processing..."):
                ds_shard = (
                    ds.shard(num_shards=NUM_SHARDS, index=idx, contiguous=True)
                    # TODO .map(either preprocessing like example.encode("utf-8") or multithreaded)
                )
                for example in tqdm(ds_shard[args.column], leave=False):
                    h = hash_func(example.encode("utf-8"))
                    # True if the element is seen, False otherwise
                    flags.append(bf.add(h))

        with timer("Filtering"):
            ds = ds.filter(
                lambda _, idx: not flags[idx],
                with_indices=True,
                num_proc=args.num_proc,
                desc="Filtering...",
            )

        with timer("Saving"):
            ds.save_to_disk(args.output)

        with timer("Cleaning"):
            if args.clean_cache:
                ds.cleanup_cache_files()

    PAD = 32
    for k, v in timer.elapsed_times.items():
        logger.info(f"{k:<{PAD}}: {v:.2f}s")

    logger.info(f"{'Before':<{PAD}}: {len(flags)}")
    logger.info(f"{'After':<{PAD}}: {len(ds)}")
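For reference (not part of the commit), the behaviour the filtering step relies on is that `ScalableBloomFilter.add` returns True when an element has (probably) been seen before, which is what marks a document as a duplicate in `flags`:

```
from pybloom_live import ScalableBloomFilter

bf = ScalableBloomFilter(
    initial_capacity=100,
    error_rate=0.001,
    mode=ScalableBloomFilter.SMALL_SET_GROWTH,
)

print(bf.add(b"some document hash"))  # False: first occurrence, document is kept
print(bf.add(b"some document hash"))  # True: already seen, document is filtered out
```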