Commit

update global data processing
SinclairCoder committed Jun 23, 2024
1 parent 76804e7 commit 3872549
Showing 33 changed files with 4,521 additions and 0 deletions.
34 changes: 34 additions & 0 deletions src/build_dataset.py
@@ -0,0 +1,34 @@
import datasets
from datasets import load_dataset, Dataset, load_from_disk
from utils import load_data_from_jsonl
import sys, os

# Make the parent directory importable so its `utils` module can be star-imported below.
current_folder = os.path.dirname(os.path.abspath(__file__))
parent_folder = os.path.dirname(current_folder)
sys.path.append(parent_folder)
from utils import *  # expected to provide `paths`: a mapping from source name to JSONL file paths


META_DIR = "./data/filtering/"
TARGET_DIR = "./data-built-on-datasets/filtering"

for source in paths:
    # Only the arXiv source is converted in this pass.
    if source not in ["arXiv"]:
        continue
    for path in paths[source]:
        full_path = os.path.join(META_DIR, path)
        data = load_data_from_jsonl(full_path)
        for item in data:
            item['file_path'] = path
            if "synthetic_textbooks" in path:
                # Drop synthetic-generation metadata fields from `meta`.
                keys = ['outline', 'concepts', 'queries', 'context']
                for key in keys:
                    if key in item['meta']:
                        del item['meta'][key]

        ds = Dataset.from_list(data)
        ds.save_to_disk(os.path.join(TARGET_DIR, os.path.basename(path).replace(".jsonl", "")))
    print(f"{source} done!")


6 changes: 6 additions & 0 deletions src/global_data_processing/README.md
@@ -0,0 +1,6 @@
# Global Data Processing

Step 1. language identification
Step 2. cleaning and filtering
Step 3. deduplication, then leakage detection and removal
- Before this step, convert the data into Hugging Face `datasets` format: `python build_dataset.py` (a quick reload check is sketched below).
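
A minimal sketch (our own addition, not part of the pipeline) of reloading a converted split to check it; the directory name is a hypothetical example of what `build_dataset.py` writes under its `TARGET_DIR`:

```python
# Sketch: reload a converted split and inspect it.
from datasets import load_from_disk

# Hypothetical output directory produced by build_dataset.py (TARGET_DIR + split name).
ds = load_from_disk("./data-built-on-datasets/filtering/math_arXiv_v0.2")
print(ds)                     # schema and number of rows
print(ds[0]["text"][:200])    # peek at the first document
```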
15 changes: 15 additions & 0 deletions src/global_data_processing/dedup/count_common_ngram.sh
@@ -0,0 +1,15 @@



# Count frequent n-grams in the final deduplicated corpus
# (see the --ngram_size and --common_words_num flags below).

TARGET_DIR=../data-built-on-datasets/dedup/


python text_dedup/counting_common_ngram.py \
--path ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_minhashlsh_5gram_9000perm_b45r20_dedup_final_dedup \
--cache_dir "./cache" \
--output "./cache" \
--column "text" \
--local \
--hash_func md5 \
--ngram_size 10 \
--common_words_num 20
1 change: 1 addition & 0 deletions src/global_data_processing/dedup/deduplicate-text-datasets
Submodule deduplicate-text-datasets added at b64556
42 changes: 42 additions & 0 deletions src/global_data_processing/dedup/leakage_detection_remove.sh
@@ -0,0 +1,42 @@

# Line-level exact match for data leakage detection and removal.

# NOTE: TARGET_DIR is referenced below but was not set in this script; the value from
# the other scripts in this directory (e.g. count_common_ngram.sh) is assumed here.
TARGET_DIR=../data-built-on-datasets/dedup/

MATH_BENCH_TEST_SET_PATH=/data1/zzwang/benchmark-datasets/GSM8K_MATH_MMLU-STEM_test_set_concat_for_detect_data_leakage

MATH_BENCH_Q_A_TEST_SET_PATH=/data1/zzwang/benchmark-datasets/GSM8K_MATH_MMLU-STEM_test_set_concat_with_q_a_concat_for_detect_data_leakage


python text_dedup/ccnet_for_data_leakage.py \
--path ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_minhashlsh_5gram_9000perm_b45r20_dedup_final_dedup \
--reference_path ${MATH_BENCH_Q_A_TEST_SET_PATH} \
--cache_dir "./cache" \
--output ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_minhashlsh_5gram_9000perm_b45r20_dedup_final_dedup_after_ccnet_exact_match_benchmark_dedup_final \
--column "text" \
--local \
--hash_func md5



OPENWEBMATH_META_DIR=/data1/zzwang/openwebpath

python text_dedup/ccnet_for_data_leakage.py \
--path ${OPENWEBMATH_META_DIR}/openwebmath-built-on-datasets \
--reference_path ${MATH_BENCH_Q_A_TEST_SET_PATH} \
--cache_dir "./cache" \
--output ${OPENWEBMATH_META_DIR}/openwebmath-built-on-datasets-after_ccnet_exact_line_remove_against_math_bench_q_a_dedup_final \
--column "text" \
--local \
--hash_func md5


six_math_benchmark_path=/data1/zzwang/benchmark-datasets/agieval_math_auqa_asdiv_asdiva_swamp_numglue_mawps_mathqa_test_set_only_q_for_detect_data_leakage


python text_dedup/ccnet_for_data_leakage.py \
--path /data1/zzwang/mathpile/data-built-on-datasets/dedup/textbooks/synthetic_textbooks_markdown_minhashlsh_5gram_9000perm_b45r20_dedup_after_ccnet_exact_match_benchmark_dedup_final \
--reference_path ${six_math_benchmark_path} \
--cache_dir "./cache" \
--output /data1/zzwang/mathpile/data-built-on-datasets/dedup/textbooks/synthetic_textbooks_markdown_minhashlsh_5gram_9000perm_b45r20_dedup_after_ccnet_exact_match_benchmark_dedup_final_after_remove_more_math_benchmarks \
--column "text" \
--local \
--hash_func md5
23 changes: 23 additions & 0 deletions src/global_data_processing/dedup/readme.md
@@ -0,0 +1,23 @@
# Deduplication

`text_dedup` is built on `deduplicate-text-datasets`.

Thanks to the amazing codebases [text_dedup](https://github.com/ChenghaoMou/text-dedup) and [deduplicate-text-datasets](https://github.com/google-research/deduplicate-text-datasets).


We deduplicated with the MinHash algorithm (LSH variant): first within each source, then across sources. A minimal sketch of the idea follows the command below.

```
bash text-dedup.sh
```
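
As a rough illustration of what MinHash-LSH deduplication does, here is our own sketch using the `datasketch` library rather than the `text_dedup/minhash.py` implementation invoked above; the documents, `NUM_PERM`, and `threshold` are toy values:

```python
# Sketch: keep a document only if no previously kept document is a near-duplicate.
from datasketch import MinHash, MinHashLSH

def shingles(text, n=5):
    # Word-level 5-grams, mirroring the --ngram 5 setting used above.
    words = text.split()
    return {" ".join(words[i:i + n]) for i in range(max(1, len(words) - n + 1))}

docs = {
    "d1": "integration by parts lets us rewrite the integral of u dv",
    "d2": "integration by parts lets us rewrite the integral of u dv too",
    "d3": "a completely unrelated combinatorics exercise about binomial coefficients",
}

NUM_PERM = 128  # toy value; the real run uses 9000 permutations
lsh = MinHashLSH(threshold=0.8, num_perm=NUM_PERM)

kept = []
for key, text in docs.items():
    m = MinHash(num_perm=NUM_PERM)
    for gram in shingles(text):
        m.update(gram.encode("utf-8"))
    if not lsh.query(m):        # no near-duplicate already kept
        kept.append(key)
        lsh.insert(key, m)

print(kept)  # likely ['d1', 'd3']: d2 collides with d1
```

In the scripts above, the signature size (`--num_perm 9000`) and banding parameters (`--b 45 --r 20`) control the similarity cutoff instead of an explicit `threshold` argument.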


# Leakage Detection and Removal

We use line-level exact matching against benchmark test sets to detect leaked examples and remove them; a minimal sketch of the idea follows the command below.

```
bash leakage_detection_remove.sh
```
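
And a rough illustration of the line-level exact-match idea, as our own sketch over in-memory strings; the real pipeline runs `text_dedup/ccnet_for_data_leakage.py` over `datasets` saved on disk, with `--hash_func md5`:

```python
# Sketch: drop any corpus line whose MD5 hash matches a line from a benchmark test set.
import hashlib

def md5_line(line: str) -> str:
    return hashlib.md5(line.strip().encode("utf-8")).hexdigest()

benchmark_docs = ["What is 2 + 2?\nAnswer: 4"]  # toy test item
corpus_docs = ["Some exposition about addition.\nWhat is 2 + 2?\nAnswer: 4\nMore exposition."]

leaked = {md5_line(line) for doc in benchmark_docs
          for line in doc.splitlines() if line.strip()}

cleaned = ["\n".join(line for line in doc.splitlines() if md5_line(line) not in leaked)
           for doc in corpus_docs]

print(cleaned[0])  # the leaked question and answer lines are removed
```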

Note that before running these scripts, the data must be converted into Hugging Face `datasets` format (see `build_dataset.py`).
65 changes: 65 additions & 0 deletions src/global_data_processing/dedup/text-dedup.sh
@@ -0,0 +1,65 @@


mkdir cache


## dedup within each source

META_DIR=../data-built-on-datasets/filtering/
TARGET_DIR=../data-built-on-datasets/dedup/

for SOURCE in math_arXiv_v0.2 wikipedia_update stackexchange proofwiki commoncrawl
do
python text_dedup/minhash.py \
--path ${META_DIR}${SOURCE} \
--cache_dir "./cache" \
--output ${TARGET_DIR}${SOURCE}_minhashlsh_5gram_9000perm_b45r20_dedup \
--column "text" \
--local \
--ngram 5 \
--min_length 5 \
--seed 42 \
--num_perm 9000 \
--b 45 \
--r 20
# --maintain_all_docs
done


for SOURCE in textbooks/textbooks_markdown textbooks/textbooks_tex textbooks/synthetic_textbooks_markdown
do
python text_dedup/minhash.py \
--path ${META_DIR}${SOURCE} \
--cache_dir "./cache" \
--output ${TARGET_DIR}${SOURCE}_minhashlsh_5gram_9000perm_b45r20_dedup_raw \
--column "text" \
--local \
--ngram 5 \
--min_length 5 \
--seed 42 \
--num_perm 9000 \
--b 45 \
--r 20
# --maintain_all_docs
done

## dedup across sources

SUFFIX="_minhashlsh_5gram_9000perm_b45r20_dedup"

python text_dedup/minhash.py \
--path ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks${SUFFIX}_concat_before_inter_sources_dedup \
--cache_dir "./cache" \
--output ${TARGET_DIR}arxiv_commoncrawl_stackexchange_wikipedia_all_textbooks_concat_minhashlsh_5gram_9000perm_b45r20_after_inter_sources_dedup_raw \
--column "text" \
--local \
--ngram 5 \
--min_length 5 \
--seed 42 \
--num_perm 9000 \
--b 45 \
--r 20
# --maintain_all_docs



14 changes: 14 additions & 0 deletions src/global_data_processing/dedup/text_dedup/__init__.py
@@ -0,0 +1,14 @@
#!/usr/bin/env python
# @Date : 2021-06-05 12:48:33
# @Author : Chenghao Mou ([email protected])

"""Text deduplication simplified."""

import logging

from rich.logging import RichHandler

logger = logging.getLogger("text_dedup")
logger.setLevel(logging.INFO)
logger.addHandler(RichHandler(rich_tracebacks=True))
logger.propagate = False
95 changes: 95 additions & 0 deletions src/global_data_processing/dedup/text_dedup/bloom_filter.py
@@ -0,0 +1,95 @@
#!/usr/bin/env python
# @Date : 2022-11-05 09:44:48
# @Author : Chenghao Mou ([email protected])
import argparse
from typing import Callable

import datasets
import numpy as np
from datasets.load import load_dataset
from pybloom_live import ScalableBloomFilter
from tqdm import tqdm

from text_dedup import logger
from text_dedup.utils import add_bloom_filter_args
from text_dedup.utils import add_io_args
from text_dedup.utils import add_meta_args
from text_dedup.utils.hashfunc import md5_digest
from text_dedup.utils.hashfunc import sha256_digest
from text_dedup.utils.hashfunc import xxh3_128_digest
from text_dedup.utils.timer import Timer

if __name__ == "__main__":  # pragma: no cover
    parser = argparse.ArgumentParser(
        prog="text_dedup.bloomfilter",
        description="Deduplicate text using Bloom Filter",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser = add_io_args(parser)
    parser = add_meta_args(parser)
    parser = add_bloom_filter_args(parser)
    args = parser.parse_args()

    timer = Timer()
    flags = []

    with timer("Total"):
        with timer("Loading"):
            ds: datasets.Dataset = load_dataset(  # type: ignore
                path=args.path,
                name=args.name,
                data_dir=args.data_dir,
                data_files=args.data_files,
                split=args.split,
                revision=args.revision,
                cache_dir=args.cache_dir,
                token=args.use_auth_token,
                num_proc=args.num_proc,
            )

        hash_func: Callable = {
            "md5": md5_digest,  # type: ignore
            "sha256": sha256_digest,  # type: ignore
            "xxh3": xxh3_128_digest,  # type: ignore
        }[args.hash_func]

        LEN_DATASET = len(ds)

        bf = ScalableBloomFilter(
            initial_capacity=args.initial_capacity,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH,
            error_rate=args.error_rate,
        )
        with timer("Processing"):
            NUM_SHARDS = int(np.ceil(LEN_DATASET / args.batch_size))
            for idx in tqdm(range(0, NUM_SHARDS), desc="Processing..."):
                ds_shard = (
                    ds.shard(num_shards=NUM_SHARDS, index=idx, contiguous=True)
                    # TODO .map(either preprocessing like example.encode("utf-8") or multithreaded)
                )
                for example in tqdm(ds_shard[args.column], leave=False):
                    h = hash_func(example.encode("utf-8"))
                    # True if the element is seen, False otherwise
                    flags.append(bf.add(h))

        with timer("Filtering"):
            ds = ds.filter(
                lambda _, idx: not flags[idx],
                with_indices=True,
                num_proc=args.num_proc,
                desc="Filtering...",
            )

        with timer("Saving"):
            ds.save_to_disk(args.output)

        with timer("Cleaning"):
            if args.clean_cache:
                ds.cleanup_cache_files()

    PAD = 32
    for k, v in timer.elapsed_times.items():
        logger.info(f"{k:<{PAD}}: {v:.2f}s")

    logger.info(f"{'Before':<{PAD}}: {len(flags)}")
    logger.info(f"{'After':<{PAD}}: {len(ds)}")