-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathsetup.py
71 lines (56 loc) · 2.18 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
https://github.com/alycialee/beyond-scale-language-data-diversity/tree/main/diversity#quick-start
conda install -c anaconda scikit-learn
conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cudatoolkit=11.3 -c pytorch
conda install -c conda-forge tqdm
conda install -c conda-forge transformers
conda install -c conda-forge datasets
python -c "import uutils; uutils.torch_uu.gpu_test_torch_any_device()"
python -c "import uutils; uutils.torch_uu.gpu_test()"
refs:
- setup tools: https://setuptools.pypa.io/en/latest/userguide/package_discovery.html#using-find-or-find-packages
- https://stackoverflow.com/questions/70295885/how-does-one-install-pytorch-and-related-tools-from-within-the-setup-py-install
"""
from setuptools import setup
from setuptools import find_packages
import os
here = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
long_description = f.read()
setup(
name='beyond-scale-language-data-diversity', # project name
version='0.0.1',
description="Beyond Scale: the Diversity Coefficient as a Data Quality Metric Demonstrates LLMs are Pre-trained on Formally Diverse Data",
long_description=long_description,
long_description_content_type="text/markdown",
url='https://github.com/alycialee/beyond-scale-language-data-diversity',
author='Alycia Lee & Brando Miranda',
author_email='[email protected]',
python_requires='>=3.10.11',
license='Apache 2.0',
# currently
package_dir={'': 'src'},
packages=find_packages(include=['diversity*', 'ginc*']), # imports all modules/folders with __init__.py & python files,
# for pytorch see doc string at the top of file
install_requires=[
'scikit-learn',
'pandas',
'numpy',
'plotly',
'wandb',
'matplotlib',
'seaborn',
'torch',
'torchvision',
'torchaudio',
'fairseq',
'transformers',
'datasets',
'zstandard', # needed for eval of all the pile
'sentencepiece', # needed llama2 tokenizer
# 'trl',
# 'peft',
# 'bitsandbytes',
# 'einops',
]
)