progress on safety chapter

souzatharsis · Dec 15, 2024 · 0631a9b · 0631a9b
1 parent 5df3f4d
commit 0631a9b
Show file tree

Hide file tree

Showing 33 changed files with 1,942 additions and 331 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,7 +18,6 @@ beautifulsoup4 = "^4.12.3"
 tiktoken = "^0.8.0"
 litellm = "^1.52.9"
 pydata-sphinx-theme = "^0.16.0"
-quantecon-book-theme = {git = "https://github.com/QuantEcon/quantecon-book-theme.git"}
 sphinx-multitoc-numbering = "^0.1.3"
 sphinxext-rediraffe = "^0.2.7"
 sphinx-tojupyter = "^0.3.0"

diff --git a/tamingllms/_build/.doctrees/environment.pickle b/tamingllms/_build/.doctrees/environment.pickle
diff --git a/tamingllms/_build/.doctrees/notebooks/alignment.doctree b/tamingllms/_build/.doctrees/notebooks/alignment.doctree
diff --git a/tamingllms/_build/.doctrees/notebooks/evals.doctree b/tamingllms/_build/.doctrees/notebooks/evals.doctree
diff --git a/tamingllms/_build/.doctrees/notebooks/output_size_limit.doctree b/tamingllms/_build/.doctrees/notebooks/output_size_limit.doctree
diff --git a/tamingllms/_build/.doctrees/notebooks/safety.doctree b/tamingllms/_build/.doctrees/notebooks/safety.doctree
diff --git a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree
diff --git a/tamingllms/_build/html/_images/ant_score.png b/tamingllms/_build/html/_images/ant_score.png
diff --git a/tamingllms/_build/html/_images/cai.png b/tamingllms/_build/html/_images/cai.png
diff --git a/tamingllms/_build/html/_images/google_score.png b/tamingllms/_build/html/_images/google_score.png
diff --git a/tamingllms/_build/html/_images/openai_score.png b/tamingllms/_build/html/_images/openai_score.png
diff --git a/tamingllms/_build/html/_sources/notebooks/safety.ipynb b/tamingllms/_build/html/_sources/notebooks/safety.ipynb
diff --git a/tamingllms/_build/html/_static/safety/ant_score.png b/tamingllms/_build/html/_static/safety/ant_score.png
diff --git a/tamingllms/_build/html/_static/safety/cai.png b/tamingllms/_build/html/_static/safety/cai.png
diff --git a/tamingllms/_build/html/_static/safety/google_score.png b/tamingllms/_build/html/_static/safety/google_score.png
diff --git a/tamingllms/_build/html/_static/safety/openai_score.png b/tamingllms/_build/html/_static/safety/openai_score.png
diff --git a/tamingllms/_build/html/notebooks/alignment.html b/tamingllms/_build/html/notebooks/alignment.html
diff --git a/tamingllms/_build/html/notebooks/evals.html b/tamingllms/_build/html/notebooks/evals.html
diff --git a/tamingllms/_build/html/notebooks/output_size_limit.html b/tamingllms/_build/html/notebooks/output_size_limit.html
diff --git a/tamingllms/_build/html/notebooks/safety.html b/tamingllms/_build/html/notebooks/safety.html
diff --git a/tamingllms/_build/html/notebooks/structured_output.html b/tamingllms/_build/html/notebooks/structured_output.html
diff --git a/tamingllms/_build/html/objects.inv b/tamingllms/_build/html/objects.inv
diff --git a/tamingllms/_build/html/searchindex.js b/tamingllms/_build/html/searchindex.js
diff --git a/tamingllms/_build/jupyter_execute/markdown/intro.ipynb b/tamingllms/_build/jupyter_execute/markdown/intro.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "42bee8f4",
+   "id": "d486d55f",
    "metadata": {},
    "source": [
     "(intro)=\n",

diff --git a/tamingllms/_build/jupyter_execute/notebooks/safety.ipynb b/tamingllms/_build/jupyter_execute/notebooks/safety.ipynb
diff --git a/tamingllms/_config.yml b/tamingllms/_config.yml
@@ -45,6 +45,7 @@ parse:
 sphinx:
   extra_extensions:
     - sphinxcontrib.mermaid
+    - sphinxcontrib.bibtex
   config:
     mathjax_path: https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
     bibtex_reference_style: author_year

diff --git a/tamingllms/_static/safety/ant_score.png b/tamingllms/_static/safety/ant_score.png
diff --git a/tamingllms/_static/safety/cai.png b/tamingllms/_static/safety/cai.png
diff --git a/tamingllms/_static/safety/google_score.png b/tamingllms/_static/safety/google_score.png
diff --git a/tamingllms/_static/safety/openai_score.png b/tamingllms/_static/safety/openai_score.png
diff --git a/tamingllms/notebooks/safety.ipynb b/tamingllms/notebooks/safety.ipynb
diff --git a/tamingllms/references.bib b/tamingllms/references.bib
@@ -667,6 +667,24 @@ @misc{rafailov2024directpreferenceoptimizationlanguage
       url={https://arxiv.org/abs/2305.18290}, 
 }
 
+@techreport{ukgov2024airegulation24,
+      title={{AI} Regulation: A Pro-Innovation Approach},
+      author={{UK Government}},
+      year={2024},
+      institution={Department for Science, Innovation and Technology},
+      type={White Paper},
+      url={https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper},
+}
+
+@misc{meta2024llamaguard,
+      title={{LlamaGuard}: {LLM}-based Input-Output Safeguard for Human-{AI} Conversations},
+      author={{Meta AI}},
+      year={2024},
+      howpublished={Meta AI Research Publications},
+      url={https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/},
+}
+
+
 
 @misc{touvron2023llama2openfoundation,
       title={Llama 2: Open Foundation and Fine-Tuned Chat Models}, 
@@ -745,6 +763,17 @@ @misc{neurips2023awards
 }
 
 
+@techreport{finra2024llmguidance24,
+      title={Artificial Intelligence, Including Large Language Models and Generative {AI}},
+      author={{Financial Industry Regulatory Authority}},
+      year={2024},
+      institution={FINRA},
+      type={Regulatory Notice},
+      number={24-09},
+      url={https://www.finra.org/rules-guidance/notices/24-09},
+}
+
+
 
 @misc{huggingface2024trl,
     title={TRL},
@@ -882,3 +911,152 @@ @article{siam2024exploitllms
       number={1},
       url={https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/},
 }
+
+
+@misc{sutton2024stealtheditslargelanguage,
+  author        = {Oliver J. Sutton and Qinghua Zhou and Wei Wang and Desmond J. Higham and Alexander N. Gorban and Alexander Bastounis and Ivan Y. Tyukin},
+  title         = {Stealth edits to large language models},
+  year          = {2024},
+  eprint        = {2406.12670},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.AI},
+  url           = {https://arxiv.org/abs/2406.12670},
+}
+
+@misc{exabeam2024airegulations,
+      title={{AI} Regulations and {LLM} Regulations: Past, Present, and Future},
+      author={{Exabeam}},
+      year={2024},
+      howpublished={Exabeam Blog},
+      url={https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/},
+}
+
+
+@techreport{ema2024llmguidelines,
+  author      = {{European Medicines Agency}},
+  title       = {Guiding principles for the use of large language models in regulatory science and medicines regulatory activities},
+  institution = {European Medicines Agency},
+  type        = {Guidance Document},
+  year        = {2024},
+  url         = {https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf},
+}
+
+
+
+
+@misc{alaga2024gradingrubricaisafety,
+      title={A Grading Rubric for {AI} Safety Frameworks},
+      author={Jide Alaga and Jonas Schuett and Markus Anderljung},
+      year={2024},
+      eprint={2409.08751},
+      archivePrefix={arXiv},
+      primaryClass={cs.CY},
+      url={https://arxiv.org/abs/2409.08751},
+}
+
+@techreport{unicef2024aiguidance,
+      title={Policy Guidance on {AI} for Children},
+      author={{UNICEF}},
+      year={2024},
+      institution={UNICEF Office of Research - Innocenti},
+      type={Policy Report},
+      url={https://www.unicef.org/innocenti/reports/policy-guidance-ai-children},
+}
+
+
+
+
+
+
+
+
+
+
+@article{doi:10.1098/rsos.240197,
+author = {Wachter, Sandra and Mittelstadt, Brent and Russell, Chris},
+title = {Do large language models have a legal duty to tell the truth?},
+journal = {Royal Society Open Science},
+volume = {11},
+number = {8},
+pages = {240197},
+year = {2024},
+doi = {10.1098/rsos.240197},
+url = {https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197}
+}
+
+@misc{china2023generativeai,
+      title={{China}: Generative {AI} Measures Finalized},
+      author={{Library of Congress}},
+      year={2023},
+      month=jul,
+      howpublished={Global Legal Monitor, Law Library of Congress},
+      url={https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/},
+}
+
+@techreport{nist2024riskframework,
+      title={{AI} Risk Management Framework},
+      author={{National Institute of Standards and Technology}},
+      year={2024},
+      institution={National Institute of Standards and Technology},
+      type={Technical Report},
+      url={https://www.nist.gov/itl/ai-risk-management-framework},
+}
+
+
+@techreport{openai2024preparedness,
+      title={{OpenAI} Preparedness Framework},
+      author={{OpenAI}},
+      year={2024},
+      institution={OpenAI},
+      type={Technical Report},
+      url={https://cdn.openai.com/openai-preparedness-framework-beta.pdf},
+}
+
+@techreport{anthropic2024scaling,
+  author      = {{Anthropic}},
+  title       = {Anthropic's Responsible Scaling Policy},
+  institution = {Anthropic},
+  type        = {Technical Report},
+  year        = {2024},
+  url         = {https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf},
+}
+
+@techreport{deepmind2024frontier,
+  author      = {{DeepMind}},
+  title       = {The Frontier Safety Framework},
+  institution = {DeepMind},
+  type        = {Technical Report},
+  year        = {2024},
+  url         = {https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf},
+}
+
+@misc{perez2022redteaminglanguagemodels,
+  author        = {Ethan Perez and Saffron Huang and Francis Song and Trevor Cai and Roman Ring and John Aslanides and Amelia Glaese and Nat McAleese and Geoffrey Irving},
+  title         = {Red Teaming Language Models with Language Models},
+  year          = {2022},
+  eprint        = {2202.03286},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.CL},
+  url           = {https://arxiv.org/abs/2202.03286},
+}
+
+@misc{cambria2024xaimeetsllmssurvey,
+      title={{XAI} meets {LLMs}: A Survey of the Relation between Explainable {AI} and Large Language Models},
+      author={Erik Cambria and Lorenzo Malandri and Fabio Mercorio and Navid Nobani and Andrea Seveso},
+      year={2024},
+      eprint={2407.15248},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2407.15248},
+}
+
+@misc{askell2023constitutionalai,
+      title={Constitutional {AI}: Harmlessness from {AI} Feedback},
+      author={Yuntao Bai and Saurav Kadavath and Sandipan Kundu and Amanda Askell and others},
+      year={2022},
+      eprint={2212.08073},
+      archivePrefix={arXiv},
+      howpublished={Anthropic},
+      url={https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback},
+}