diff --git a/data-analysis/README.md b/data-analysis/README.md new file mode 100644 index 0000000000..b9ab35722c --- /dev/null +++ b/data-analysis/README.md @@ -0,0 +1,15 @@ +# Using Python for Data Analysis + +This folder contains completed notebooks and other files used in the Real Python tutorial on [Using Python for Data Analysis](https://realpython.com/python-for-data-analysis/). + +**The following files are included:** + +- `data_analysis_findings.ipynb` is a Jupyter Notebook containing all the code used in the tutorial. +- `data_analysis_results.ipynb` is a Jupyter Notebook containing the final version of the cleansing and analysis code. +- `james_bond_data.csv` contains the data to be cleansed and analyzed in its original form, in CSV format. +- `james_bond_data.json` contains the data to be cleansed and analyzed in its original form, in JSON format. +- `james_bond_data.parquet` contains the data to be cleansed and analyzed in its original form, in Parquet format. +- `james_bond_data.xlsx` contains the data to be cleansed and analyzed in its original form, in Microsoft Excel format. +- `james_bond_data_cleansed.csv` contains the cleansed data in its final form. + +Note that although you can complete the tutorial in various Python environments, using Jupyter Notebook within JupyterLab is highly recommended. 
diff --git a/data-analysis/data_analysis_findings.ipynb b/data-analysis/data_analysis_findings.ipynb new file mode 100644 index 0000000000..e040d79449 --- /dev/null +++ b/data-analysis/data_analysis_findings.ipynb @@ -0,0 +1,1069 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ade4bd3f-543b-460b-980f-0b41aab2c8b6", + "metadata": {}, + "source": [ + "# Acquiring Your Data" + ] + }, + { + "cell_type": "markdown", + "id": "83ad2114-5ed8-4a90-85fa-adea5eda4392", + "metadata": {}, + "source": [ + "## Reading Data From CSV Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a360772e-7829-4c15-9af9-d4596efc7351", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e52f486-232e-440b-8585-90416e4300c2", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_csv(\"james_bond_data.csv\").convert_dtypes()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "797f69eb-3108-45d3-9a67-58c43593abf1", + "metadata": {}, + "outputs": [], + "source": [ + "james_bond_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "e47c1f9b-b390-4035-956b-622615b57f32", + "metadata": {}, + "source": [ + "## Reading Data From Other Sources" + ] + }, + { + "cell_type": "markdown", + "id": "1d85aee9-cfeb-460b-9fe8-f3c7e7dfb764", + "metadata": {}, + "source": [ + "### Reading JSON" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7465cd11-dad4-4741-9372-f825b28c33d6", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_json(\"james_bond_data.json\").convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "47a0e4a6-0ed9-4253-9833-0ad22c49b968", + "metadata": {}, + "source": [ + "### Reading Excel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"a0364b81-64a0-4098-89fc-e58bd6d68257", + "metadata": {}, + "outputs": [], + "source": [ + "! python -m pip install openpyxl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8302139f-52dc-4f95-aa9a-96040ae5d82b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_excel(\"james_bond_data.xlsx\").convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "be4a1143-c966-4056-8a5e-3bdebe2a9b1f", + "metadata": {}, + "source": [ + "### Reading Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f36ef600-e6ba-4cc6-9ee3-0cbf369a4be2", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pyarrow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c86284a2-9073-4240-b4d5-5e8b0373fc27", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_parquet(\"james_bond_data.parquet\").convert_dtypes()\n", + "\n", + "james_bond_data" + ] + }, + { + "cell_type": "markdown", + "id": "69f884c2-92e8-4db3-bd63-84007f654808", + "metadata": {}, + "source": [ + "### Scraping HTML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b902722d-9648-4124-80b0-64004342170d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install lxml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb2ff9c-3030-4f4a-be30-c2ab68452a21", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data_html = pd.read_html(\n", + " \"https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories\"\n", + ")\n", + "james_bond_tables = james_bond_data_html[1].convert_dtypes()" + ] + }, + { + "cell_type": "markdown", + "id": "31068de2-9864-434a-9652-b115d1131684", + "metadata": {}, + "source": [ + "# Cleansing Your Data With Python" + ] + }, + { + "cell_type": "markdown", + "id": 
"e0dcca3b-6e71-481d-a071-6218012db962", + "metadata": {}, + "source": [ + "## Creating Meaningful Column Names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d70997b9-3c75-4165-b034-8544bd084c04", + "metadata": {}, + "outputs": [], + "source": [ + "new_column_names = {\n", + " \"Release\": \"release_date\",\n", + " \"Movie\": \"movie_title\",\n", + " \"Bond\": \"bond_actor\",\n", + " \"Bond_Car_MFG\": \"car_manufacturer\",\n", + " \"US_Gross\": \"income_usa\",\n", + " \"World_Gross\": \"income_world\",\n", + " \"Budget ($ 000s)\": \"movie_budget\",\n", + " \"Film_Length\": \"film_length\",\n", + " \"Avg_User_IMDB\": \"imdb\",\n", + " \"Avg_User_Rtn_Tom\": \"rotten_tomatoes\",\n", + " \"Martinis\": \"martinis_consumed\",\n", + " \"Kills_Bond\": \"bond_kills\",\n", + "}\n", + "\n", + "data = james_bond_data.rename(columns=new_column_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "937b9121-b7ae-4f7e-800d-bfcc2689c98a", + "metadata": {}, + "outputs": [], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "markdown", + "id": "e432b28e-257b-422b-b2f8-06f41608391b", + "metadata": {}, + "source": [ + "## Dealing With Missing Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d497e64c-aa7e-4d09-8de1-f529939d58f9", + "metadata": {}, + "outputs": [], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b29d5a34-c930-4ce2-898c-b9e8aa7f771d", + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[data.isna().any(axis=\"columns\")]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1db9201a-11c1-4cdd-9625-d70cee736191", + "metadata": {}, + "outputs": [], + "source": [ + "data = james_bond_data.rename(columns=new_column_names).combine_first(\n", + " pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"f4af51fb-fd1f-4570-b16f-6f20e0b65473", + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[data.isna().any(axis=\"columns\")]" + ] + }, + { + "cell_type": "markdown", + "id": "f6297c81-4c63-4eff-95e3-4a944bb5fe03", + "metadata": {}, + "source": [ + "## Handling Financial Columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "916c91b8-7888-40fc-bce7-247837508adf", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"income_usa\", \"income_world\", \"movie_budget\", \"film_length\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "880e4710-1c11-4de2-a2c3-97a9672ce6f7", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + " )\n", + " .assign(\n", + " income_usa=lambda data: (\n", + " data[\"income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae9c1d1b-a620-43c5-a199-eb6a7bff7ce2", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + " )\n", + " .assign(\n", + " income_usa=lambda data: (\n", + " data[\"income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " income_world=lambda data: (\n", + " data[\"income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "38f36603-a797-4094-829f-fcfdbe2e80ed", + "metadata": {}, + 
"source": [ + "## Correcting Invalid Data Types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8331b98e-169f-4d3b-9b88-0ece7ddc8dea", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + " )\n", + " .assign(\n", + " income_usa=lambda data: (\n", + " data[\"income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " income_world=lambda data: (\n", + " data[\"income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6340b1f-3b1c-42e6-9b69-e981f645d77b", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"income_usa\", \"income_world\", \"movie_budget\", \"film_length\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7025fbd2-ce44-4efe-88c9-9f51830776c2", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"income_usa\", \"income_world\", \"movie_budget\", \"film_length\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d45b9b42-7c71-422f-9ddb-ea659e5385c9", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f5dacf7-2f6c-47f4-b875-7d36f2251627", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed0ead0e-7310-4c82-86d5-2480a95f1525", + 
"metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + " )\n", + " .assign(\n", + " income_usa=lambda data: (\n", + " data[\"income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " income_world=lambda data: (\n", + " data[\"income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f147876-7348-43e9-ac6a-3f3df6ee2af9", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\", \"release_year\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47d4868a-94d8-4d36-85b9-b0c9a6203a8a", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"release_date\", \"release_year\"]].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2c7922a-916e-4e01-829b-77cbb2205153", + "metadata": {}, + "outputs": [], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "markdown", + "id": "89653d81-3bcd-4078-83cb-ad4b2fa560e6", + "metadata": {}, + "source": [ + "## Fixing Inconsistencies in Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47a41ef3-751a-41ed-869d-9f2c45509196", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"income_usa\", \"income_world\", 
\"movie_budget\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc483320-7895-4368-a672-b98f8d0c9755", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + " )\n", + " .assign(\n", + " income_usa=lambda data: (\n", + " data[\"income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " income_world=lambda data: (\n", + " data[\"income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6cbd7ea-e168-442e-8dd9-e2955288fa57", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"income_usa\", \"income_world\", \"movie_budget\"]].head()" + ] + }, + { + "cell_type": "markdown", + "id": "8bdaa8b1-9f2e-46a5-b53a-c1ae4c201c99", + "metadata": {}, + "source": [ + "## Correcting Spelling Errors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e442e51a-28fd-42d7-94b0-aaf1abe5d9a8", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"bond_actor\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9863aa7-b5db-4ab1-be63-727ff437b63b", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " 
james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + " )\n", + " .assign(\n", + " income_usa=lambda data: (\n", + " data[\"income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " income_world=lambda data: (\n", + " data[\"income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e313152b-92b4-43a8-8483-637281a1f04d", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"bond_actor\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a26b138d-72e5-4e15-a875-ee65023545d1", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"car_manufacturer\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd4ae142-e339-4601-b0a4-84375eb28c02", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + " )\n", + " .assign(\n", + " income_usa=lambda data: (\n", + " data[\"income_usa\"]\n", + " .replace(\"[$,]\", \"\", 
regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " income_world=lambda data: (\n", + " data[\"income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"].str.removesuffix(\"mins\").astype(\"Int64\")\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c596022b-02a4-40c0-ac5f-d0b0643a7a4a", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"car_manufacturer\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "50c80bc8-fdb9-4c28-af5a-cd6b66c7a01d", + "metadata": {}, + "source": [ + "## Checking for Invalid Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8260f6b1-6d7f-4338-95b7-8946d69a92e2", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"film_length\", \"martinis_consumed\"]].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c73fe06b-5f42-4357-9b0f-2e460bf0dacf", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + " )\n", + " .assign(\n", + " income_usa=lambda data: (\n", + " data[\"income_usa\"]\n", + " 
.replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " income_world=lambda data: (\n", + " data[\"income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"]\n", + " .str.removesuffix(\"mins\")\n", + " .astype(\"Int64\")\n", + " .replace(1200, 120)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n", + " -6, 6\n", + " ),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2abb5b80-48be-4a00-9483-4732b9a5d802", + "metadata": {}, + "outputs": [], + "source": [ + "data[[\"film_length\", \"martinis_consumed\"]].describe()" + ] + }, + { + "cell_type": "markdown", + "id": "3e129b32-5e66-41cb-b938-8fd58bb94116", + "metadata": {}, + "source": [ + "## Removing Duplicate Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be7aad8b-ef3f-48a6-a9a0-de909133921f", + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[data.duplicated(keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c98c7640-1472-4869-9fdd-f070d665ae1d", + "metadata": {}, + "outputs": [], + "source": [ + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " 
pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + " )\n", + " .assign(\n", + " income_usa=lambda data: (\n", + " data[\"income_usa\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " income_world=lambda data: (\n", + " data[\"income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"]\n", + " .str.removesuffix(\"mins\")\n", + " .astype(\"Int64\")\n", + " .replace(1200, 120)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n", + " -6, 6\n", + " ),\n", + " )\n", + " .drop_duplicates(ignore_index=True)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff05e0ae-4f9b-47a7-87f1-fb7630fabddc", + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[data.duplicated(keep=False)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1216a25-4791-4601-83ba-62513e4cc880", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"movie_title\"].value_counts().head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ba17e3f-3ce1-4885-a104-f60d254d9feb", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"bond_actor\"].value_counts()" + ] + }, + { + 
"cell_type": "markdown", + "id": "52db1351-36ed-4104-a999-345ebbc62214", + "metadata": {}, + "source": [ + "## Storing Your Cleansed Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "575a774e-6913-41fb-8ff9-4d786f478007", + "metadata": {}, + "outputs": [], + "source": [ + "data.to_csv(\"james_bond_data_cleansed.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0", + "metadata": {}, + "source": [ + "# Performing Data Analysis Using Python" + ] + }, + { + "cell_type": "markdown", + "id": "86817f68-05a0-4235-a1c8-a5d1f6e9141e", + "metadata": {}, + "source": [ + "## Performing a Regression Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install matplotlib scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27d0a3dd-e71a-4b8a-883c-40cb5c001f7e", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"imdb\"], data[\"rotten_tomatoes\"])\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDB Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n", + "# fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "x = data.loc[:, [\"imdb\"]]\n", + "y = data.loc[:, \"rotten_tomatoes\"]\n", + "\n", + "model = LinearRegression()\n", + "model.fit(x, y)\n", + "\n", + "r_squared = f\"R-Squared: {model.score(x, y):.2f}\"\n", + "best_fit = f\"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}\"\n", + "y_pred = 
model.predict(x)\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(x, y)\n", + "ax.plot(x, y_pred, color=\"red\")\n", + "ax.text(7.25, 5.5, r_squared, fontsize=10)\n", + "ax.text(7.25, 7, best_fit, fontsize=10)\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDb Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n", + "# fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "b38df412-c320-49fb-93ae-e253405537a8", + "metadata": {}, + "source": [ + "## Investigating a Statistical Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938e5942-e57f-4e41-99f1-215cfb37d0df", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "length = data[\"film_length\"].value_counts(bins=7).sort_index()\n", + "length.plot.bar(\n", + " ax=ax,\n", + " title=\"Film Length Distribution\",\n", + " xlabel=\"Time Range (mins)\",\n", + " ylabel=\"Count\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff4e9955-baf4-48eb-b032-fbf55f439194", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"film_length\"].agg([\"min\", \"max\", \"mean\", \"std\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870", + "metadata": {}, + "source": [ + "## Finding No Relationship" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb83374-347f-4cf6-bc21-8180a003371d", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "ax.scatter(data[\"imdb\"], data[\"bond_kills\"])\n", + "ax.set_title(\"Scatter Plot of Kills vs Ratings\")\n", + "ax.set_xlabel(\"Average IMDb Rating\")\n", + "ax.set_ylabel(\"Kills by Bond\")\n", + "# fig.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data-analysis/data_analysis_results.ipynb b/data-analysis/data_analysis_results.ipynb new file mode 100644 index 0000000000..9787f530de --- /dev/null +++ b/data-analysis/data_analysis_results.ipynb @@ -0,0 +1,241 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ade4bd3f-543b-460b-980f-0b41aab2c8b6", + "metadata": {}, + "source": [ + "# Data Acquisition and Cleansing Code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a360772e-7829-4c15-9af9-d4596efc7351", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c98c7640-1472-4869-9fdd-f070d665ae1d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "james_bond_data = pd.read_csv(\"james_bond_data.csv\").convert_dtypes()\n", + "\n", + "new_column_names = {\n", + " \"Release\": \"release_date\",\n", + " \"Movie\": \"movie_title\",\n", + " \"Bond\": \"bond_actor\",\n", + " \"Bond_Car_MFG\": \"car_manufacturer\",\n", + " \"US_Gross\": \"income_usa\",\n", + " \"World_Gross\": \"income_world\",\n", + " \"Budget ($ 000s)\": \"movie_budget\",\n", + " \"Film_Length\": \"film_length\",\n", + " \"Avg_User_IMDB\": \"imdb\",\n", + " \"Avg_User_Rtn_Tom\": \"rotten_tomatoes\",\n", + " \"Martinis\": \"martinis_consumed\",\n", + " \"Kills_Bond\": \"bond_kills\",\n", + "}\n", + "\n", + "data = james_bond_data.rename(columns=new_column_names)\n", + "\n", + "data = (\n", + " james_bond_data.rename(columns=new_column_names)\n", + " .combine_first(\n", + " pd.DataFrame({\"imdb\": {10: 7.1}, \"rotten_tomatoes\": {10: 6.8}})\n", + " )\n", + " .assign(\n", + " income_usa=lambda data: (\n", + " data[\"income_usa\"]\n", + " 
.replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " income_world=lambda data: (\n", + " data[\"income_world\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " ),\n", + " movie_budget=lambda data: (\n", + " data[\"movie_budget\"]\n", + " .replace(\"[$,]\", \"\", regex=True)\n", + " .astype(\"Float64\")\n", + " * 1000\n", + " ),\n", + " film_length=lambda data: (\n", + " data[\"film_length\"]\n", + " .str.removesuffix(\"mins\")\n", + " .astype(\"Int64\")\n", + " .replace(1200, 120)\n", + " ),\n", + " release_date=lambda data: pd.to_datetime(\n", + " data[\"release_date\"], format=\"%B, %Y\"\n", + " ),\n", + " release_year=lambda data: data[\"release_date\"].dt.year.astype(\"Int64\"),\n", + " bond_actor=lambda data: (\n", + " data[\"bond_actor\"]\n", + " .str.replace(\"Shawn\", \"Sean\")\n", + " .str.replace(\"MOORE\", \"Moore\")\n", + " ),\n", + " car_manufacturer=lambda data: data[\"car_manufacturer\"].str.replace(\n", + " \"Astin\", \"Aston\"\n", + " ),\n", + " martinis_consumed=lambda data: data[\"martinis_consumed\"].replace(\n", + " -6, 6\n", + " ),\n", + " )\n", + " .drop_duplicates(ignore_index=True)\n", + ")\n", + "\n", + "data.to_csv(\"james_bond_data_cleansed.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "f50918ee-e61f-46b2-b0c2-1ffa2c62bbc0", + "metadata": {}, + "source": [ + "# Data Analysis Code" + ] + }, + { + "cell_type": "markdown", + "id": "86817f68-05a0-4235-a1c8-a5d1f6e9141e", + "metadata": {}, + "source": [ + "## Performing a Regression Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bee6d6cb-e418-4c1d-8b75-604b9ab2e63d", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install matplotlib scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "669fb9d7-d744-4e6b-899e-a69aebec53ed", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import 
LinearRegression\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "x = data.loc[:, [\"imdb\"]]\n", + "y = data.loc[:, \"rotten_tomatoes\"]\n", + "\n", + "model = LinearRegression()\n", + "model.fit(x, y)\n", + "\n", + "r_squared = f\"R-Squared: {model.score(x, y):.2f}\"\n", + "best_fit = f\"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}\"\n", + "y_pred = model.predict(x)\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(x, y)\n", + "ax.plot(x, y_pred, color=\"red\")\n", + "ax.text(7.25, 5.5, r_squared, fontsize=10)\n", + "ax.text(7.25, 7, best_fit, fontsize=10)\n", + "ax.set_title(\"Scatter Plot of Ratings\")\n", + "ax.set_xlabel(\"Average IMDb Rating\")\n", + "ax.set_ylabel(\"Average Rotten Tomatoes Rating\")\n", + "# fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "b38df412-c320-49fb-93ae-e253405537a8", + "metadata": {}, + "source": [ + "## Investigating a Statistical Distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938e5942-e57f-4e41-99f1-215cfb37d0df", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "fig, ax = plt.subplots()\n", + "length = data[\"film_length\"].value_counts(bins=7).sort_index()\n", + "length.plot.bar(\n", + " ax=ax,\n", + " title=\"Film Length Distribution\",\n", + " xlabel=\"Time Range (mins)\",\n", + " ylabel=\"Count\",\n", + ")\n", + "# fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff4e9955-baf4-48eb-b032-fbf55f439194", + "metadata": {}, + "outputs": [], + "source": [ + "data[\"film_length\"].agg([\"min\", \"max\", \"mean\", \"std\"])" + ] + }, + { + "cell_type": "markdown", + "id": "1b14c433-c3a6-4484-bc0a-26825bd1e870", + "metadata": {}, + "source": [ + "## Finding No Relationship" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bb83374-347f-4cf6-bc21-8180a003371d", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = 
plt.subplots()\n", + "ax.scatter(data[\"imdb\"], data[\"bond_kills\"])\n", + "ax.set_title(\"Scatter Plot of Kills vs Ratings\")\n", + "ax.set_xlabel(\"Average IMDb Rating\")\n", + "ax.set_ylabel(\"Kills by Bond\")\n", + "# fig.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/data-analysis/james_bond_data.csv b/data-analysis/james_bond_data.csv new file mode 100644 index 0000000000..d2f6c8f467 --- /dev/null +++ b/data-analysis/james_bond_data.csv @@ -0,0 +1,28 @@ +Release,Movie,Bond,Bond_Car_MFG,US_Gross,World_Gross,Budget ($ 000s),Film_Length,Avg_User_IMDB,Avg_User_Rtn_Tom,Martinis,Kills_Bond +"June, 1962",Dr. 
No,Sean Connery,Sunbeam," $16,067,035.00 "," $59,567,035.00 "," $1,000.00 ",110 mins,7.3,7.7,2,4 +"August, 1963",From Russia with Love,Sean Connery,Bentley," $24,800,000.00 "," $78,900,000.00 "," $2,000.00 ",115 mins,7.5,8,0,11 +"May, 1964",Goldfinger,Sean Connery,Aston Martin," $51,100,000.00 "," $124,900,000.00 "," $3,000.00 ",110 mins,7.8,8.4,1,9 +"September, 1965",Thunderball,Sean Connery,Aston Martin," $63,600,000.00 "," $141,200,000.00 "," $9,000.00 ",130 mins,7,6.8,0,20 +"November, 1967",You Only Live Twice,Sean Connery,Toyota," $43,100,000.00 "," $111,600,000.00 "," $9,500.00 ",117 mins,6.9,6.3,1,21 +"July, 1969",On Her Majesty's Secret Service,George Lazenby,Mercury," $22,800,000.00 "," $82,000,000.00 "," $8,000.00 ",142 mins,6.8,6.7,1,5 +"March, 1971",Diamonds Are Forever,Shawn Connery,Ford," $43,800,000.00 "," $116,000,000.00 "," $7,200.00 ",1200 mins,6.7,6.3,0,7 +"August, 1973",Live and Let Die,Roger Moore,AMC," $35,400,000.00 "," $161,800,000.00 "," $7,000.00 ",121 mins,6.8,5.9,0,8 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"July, 1974",The Man with the Golden Gun,Roger Moore,AMC," $21,000,000.00 "," $97,600,000.00 "," $7,000.00 ",125 mins,6.7,5.1,0,1 +"April, 1977",The Spy Who Loved Me,Roger Moore,Lotus," $46,800,000.00 "," $185,400,000.00 "," $14,000.00 ",125 mins,,,1,31 +"October, 1979",Moonraker,Roger Moore,Lotus," $70,300,000.00 "," $210,300,000.00 "," $31,000.00 ",126 mins,6.2,5.7,1,12 +"June, 1981",For Your Eyes Only,Roger MOORE,Citroen," $54,800,000.00 "," $195,300,000.00 "," $28,000.00 ",127 mins,6.8,6.3,0,18 +"March, 1983",Octopussy,Roger Moore,Bajaj," $67,900,000.00 "," $187,500,000.00 "," $27,500.00 ",131 mins,6.5,5.3,0,15 +"October, 1985",A View to a Kill,Roger Moore,Rolls Royce," $50,327,960.00 "," $152,627,960.00 "," $30,000.00 ",131 mins,6.2,4.7,0,5 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," 
$40,000.00 ",130 mins,6.7,6.3,2,13 +"May, 1987",The Living Daylights,Timothy Dalton,Rolls Royce," $51,185,000.00 "," $191,200,000.00 "," $40,000.00 ",130 mins,6.7,6.3,2,13 +"January, 1989",License to Kill,Timothy Dalton,Aston Martin," $34,667,015.00 "," $156,167,015.00 "," $42,000.00 ",133 mins,6.5,6,1,10 +"September, 1995",GoldenEye,Pierce Brosnan,BMW," $106,429,941.00 "," $356,429,941.00 "," $60,000.00 ",130 mins,7.2,6.9,1,47 +"July, 1997",Tomorrow Never Dies,Pierce Brosnan,Aston Martin," $125,304,276.00 "," $339,504,276.00 "," $110,000.00 ",119 mins,6.4,6,1,30 +"June, 1999",The World Is Not Enough,Pierce Brosnan,BMW," $126,930,660.00 "," $361,730,660.00 "," $135,000.00 ",128 mins,6.3,5.7,1,27 +"August, 2002",Die Another Day,Pierce Brosnan,Aston Martin," $160,942,139.00 "," $431,942,139.00 "," $142,000.00 ",133 mins,6,6.1,2,31 +"February, 2006",Casino Royale,Daniel Craig,Astin Martin," $167,365,000.00 "," $596,365,000.00 "," $102,000.00 ",144 mins,7.9,7.8,3,11 +"December, 2008",Quantum of Solace,Daniel Craig,Aston Martin," $169,368,427.00 "," $591,692,078.00 "," $230,000.00 ",106 mins,6.7,6.1,-6,16 +"November, 2012",Skyfall,Daniel Craig,Astin Martin," $304,360,277.00 "," $1,108,561,108.00 "," $200,000.00 ",143 mins,7.8,8.2,1,26 +"September, 2015",Spectre,Daniel Craig,Aston Martin," $200,074,175.00 "," $879,620,923.00 "," $245,000.00 ",148 mins,6.8,6.4,1,30 +"November, 2021",No Time to Die,Daniel Craig,Aston Martin," $160,891,007.00 "," $759,959,662.00 "," $275,000.00 ",163 mins,7.3,7.3,1,14 diff --git a/data-analysis/james_bond_data.json b/data-analysis/james_bond_data.json new file mode 100644 index 0000000000..35567038c3 --- /dev/null +++ b/data-analysis/james_bond_data.json @@ -0,0 +1 @@ +{"Release":{"0":"June, 1962","1":"August, 1963","2":"May, 1964","3":"September, 1965","4":"November, 1967","5":"July, 1969","6":"March, 1971","7":"August, 1973","8":"July, 1974","9":"July, 1974","10":"April, 1977","11":"October, 1979","12":"June, 1981","13":"March, 
1983","14":"October, 1985","15":"May, 1987","16":"May, 1987","17":"January, 1989","18":"September, 1995","19":"July, 1997","20":"June, 1999","21":"August, 2002","22":"February, 2006","23":"December, 2008","24":"November, 2012","25":"September, 2015","26":"November, 2021"},"Movie":{"0":"Dr. No","1":"From Russia with Love","2":"Goldfinger","3":"Thunderball","4":"You Only Live Twice","5":"On Her Majesty's Secret Service","6":"Diamonds Are Forever","7":"Live and Let Die","8":"The Man with the Golden Gun","9":"The Man with the Golden Gun","10":"The Spy Who Loved Me","11":"Moonraker","12":"For Your Eyes Only","13":"Octopussy","14":"A View to a Kill","15":"The Living Daylights","16":"The Living Daylights","17":"License to Kill","18":"GoldenEye","19":"Tomorrow Never Dies","20":"The World Is Not Enough","21":"Die Another Day","22":"Casino Royale","23":"Quantum of Solace","24":"Skyfall","25":"Spectre","26":"No Time to Die"},"Bond":{"0":"Sean Connery","1":"Sean Connery","2":"Sean Connery","3":"Sean Connery","4":"Sean Connery","5":"George Lazenby","6":"Shawn Connery","7":"Roger Moore","8":"Roger Moore","9":"Roger Moore","10":"Roger Moore","11":"Roger Moore","12":"Roger MOORE","13":"Roger Moore","14":"Roger Moore","15":"Timothy Dalton","16":"Timothy Dalton","17":"Timothy Dalton","18":"Pierce Brosnan","19":"Pierce Brosnan","20":"Pierce Brosnan","21":"Pierce Brosnan","22":"Daniel Craig","23":"Daniel Craig","24":"Daniel Craig","25":"Daniel Craig","26":"Daniel Craig"},"Bond_Car_MFG":{"0":"Sunbeam","1":"Bentley","2":"Aston Martin","3":"Aston Martin","4":"Toyota","5":"Mercury","6":"Ford","7":"AMC","8":"AMC","9":"AMC","10":"Lotus","11":"Lotus","12":"Citroen","13":"Bajaj","14":"Rolls Royce","15":"Rolls Royce","16":"Rolls Royce","17":"Aston Martin","18":"BMW","19":"Aston Martin","20":"BMW","21":"Aston Martin","22":"Astin Martin","23":"Aston Martin","24":"Astin Martin","25":"Aston Martin","26":"Aston Martin"},"US_Gross":{"0":" $16,067,035.00 ","1":" $24,800,000.00 ","2":" $51,100,000.00 
","3":" $63,600,000.00 ","4":" $43,100,000.00 ","5":" $22,800,000.00 ","6":" $43,800,000.00 ","7":" $35,400,000.00 ","8":" $21,000,000.00 ","9":" $21,000,000.00 ","10":" $46,800,000.00 ","11":" $70,300,000.00 ","12":" $54,800,000.00 ","13":" $67,900,000.00 ","14":" $50,327,960.00 ","15":" $51,185,000.00 ","16":" $51,185,000.00 ","17":" $34,667,015.00 ","18":" $106,429,941.00 ","19":" $125,304,276.00 ","20":" $126,930,660.00 ","21":" $160,942,139.00 ","22":" $167,365,000.00 ","23":" $169,368,427.00 ","24":" $304,360,277.00 ","25":" $200,074,175.00 ","26":" $160,891,007.00 "},"World_Gross":{"0":" $59,567,035.00 ","1":" $78,900,000.00 ","2":" $124,900,000.00 ","3":" $141,200,000.00 ","4":" $111,600,000.00 ","5":" $82,000,000.00 ","6":" $116,000,000.00 ","7":" $161,800,000.00 ","8":" $97,600,000.00 ","9":" $97,600,000.00 ","10":" $185,400,000.00 ","11":" $210,300,000.00 ","12":" $195,300,000.00 ","13":" $187,500,000.00 ","14":" $152,627,960.00 ","15":" $191,200,000.00 ","16":" $191,200,000.00 ","17":" $156,167,015.00 ","18":" $356,429,941.00 ","19":" $339,504,276.00 ","20":" $361,730,660.00 ","21":" $431,942,139.00 ","22":" $596,365,000.00 ","23":" $591,692,078.00 ","24":" $1,108,561,108.00 ","25":" $879,620,923.00 ","26":" $759,959,662.00 "},"Budget ($ 000s)":{"0":" $1,000.00 ","1":" $2,000.00 ","2":" $3,000.00 ","3":" $9,000.00 ","4":" $9,500.00 ","5":" $8,000.00 ","6":" $7,200.00 ","7":" $7,000.00 ","8":" $7,000.00 ","9":" $7,000.00 ","10":" $14,000.00 ","11":" $31,000.00 ","12":" $28,000.00 ","13":" $27,500.00 ","14":" $30,000.00 ","15":" $40,000.00 ","16":" $40,000.00 ","17":" $42,000.00 ","18":" $60,000.00 ","19":" $110,000.00 ","20":" $135,000.00 ","21":" $142,000.00 ","22":" $102,000.00 ","23":" $230,000.00 ","24":" $200,000.00 ","25":" $245,000.00 ","26":" $275,000.00 "},"Film_Length":{"0":"110 mins","1":"115 mins","2":"110 mins","3":"130 mins","4":"117 mins","5":"142 mins","6":"1200 mins","7":"121 mins","8":"125 mins","9":"125 mins","10":"125 mins","11":"126 
mins","12":"127 mins","13":"131 mins","14":"131 mins","15":"130 mins","16":"130 mins","17":"133 mins","18":"130 mins","19":"119 mins","20":"128 mins","21":"133 mins","22":"144 mins","23":"106 mins","24":"143 mins","25":"148 mins","26":"163 mins"},"Avg_User_IMDB":{"0":7.3,"1":7.5,"2":7.8,"3":7.0,"4":6.9,"5":6.8,"6":6.7,"7":6.8,"8":6.7,"9":6.7,"10":null,"11":6.2,"12":6.8,"13":6.5,"14":6.2,"15":6.7,"16":6.7,"17":6.5,"18":7.2,"19":6.4,"20":6.3,"21":6.0,"22":7.9,"23":6.7,"24":7.8,"25":6.8,"26":7.3},"Avg_User_Rtn_Tom":{"0":7.7,"1":8.0,"2":8.4,"3":6.8,"4":6.3,"5":6.7,"6":6.3,"7":5.9,"8":5.1,"9":5.1,"10":null,"11":5.7,"12":6.3,"13":5.3,"14":4.7,"15":6.3,"16":6.3,"17":6.0,"18":6.9,"19":6.0,"20":5.7,"21":6.1,"22":7.8,"23":6.1,"24":8.2,"25":6.4,"26":7.3},"Martinis":{"0":2,"1":0,"2":1,"3":0,"4":1,"5":1,"6":0,"7":0,"8":0,"9":0,"10":1,"11":1,"12":0,"13":0,"14":0,"15":2,"16":2,"17":1,"18":1,"19":1,"20":1,"21":2,"22":3,"23":-6,"24":1,"25":1,"26":1},"Kills_Bond":{"0":4,"1":11,"2":9,"3":20,"4":21,"5":5,"6":7,"7":8,"8":1,"9":1,"10":31,"11":12,"12":18,"13":15,"14":5,"15":13,"16":13,"17":10,"18":47,"19":30,"20":27,"21":31,"22":11,"23":16,"24":26,"25":30,"26":14}} \ No newline at end of file diff --git a/data-analysis/james_bond_data.parquet b/data-analysis/james_bond_data.parquet new file mode 100644 index 0000000000..30cd4033e3 Binary files /dev/null and b/data-analysis/james_bond_data.parquet differ diff --git a/data-analysis/james_bond_data.xlsx b/data-analysis/james_bond_data.xlsx new file mode 100644 index 0000000000..301fde7203 Binary files /dev/null and b/data-analysis/james_bond_data.xlsx differ diff --git a/data-analysis/james_bond_data_cleansed.csv b/data-analysis/james_bond_data_cleansed.csv new file mode 100644 index 0000000000..b01499462a --- /dev/null +++ b/data-analysis/james_bond_data_cleansed.csv @@ -0,0 +1,26 @@ 
+bond_actor,bond_kills,car_manufacturer,film_length,imdb,income_usa,income_world,martinis_consumed,movie_budget,movie_title,release_date,rotten_tomatoes,release_year +Sean Connery,4,Sunbeam,110,7.3,16067035.0,59567035.0,2,1000000.0,Dr. No,1962-06-01,7.7,1962 +Sean Connery,11,Bentley,115,7.5,24800000.0,78900000.0,0,2000000.0,From Russia with Love,1963-08-01,8.0,1963 +Sean Connery,9,Aston Martin,110,7.8,51100000.0,124900000.0,1,3000000.0,Goldfinger,1964-05-01,8.4,1964 +Sean Connery,20,Aston Martin,130,7.0,63600000.0,141200000.0,0,9000000.0,Thunderball,1965-09-01,6.8,1965 +Sean Connery,21,Toyota,117,6.9,43100000.0,111600000.0,1,9500000.0,You Only Live Twice,1967-11-01,6.3,1967 +George Lazenby,5,Mercury,142,6.8,22800000.0,82000000.0,1,8000000.0,On Her Majesty's Secret Service,1969-07-01,6.7,1969 +Sean Connery,7,Ford,120,6.7,43800000.0,116000000.0,0,7200000.0,Diamonds Are Forever,1971-03-01,6.3,1971 +Roger Moore,8,AMC,121,6.8,35400000.0,161800000.0,0,7000000.0,Live and Let Die,1973-08-01,5.9,1973 +Roger Moore,1,AMC,125,6.7,21000000.0,97600000.0,0,7000000.0,The Man with the Golden Gun,1974-07-01,5.1,1974 +Roger Moore,31,Lotus,125,7.1,46800000.0,185400000.0,1,14000000.0,The Spy Who Loved Me,1977-04-01,6.8,1977 +Roger Moore,12,Lotus,126,6.2,70300000.0,210300000.0,1,31000000.0,Moonraker,1979-10-01,5.7,1979 +Roger Moore,18,Citroen,127,6.8,54800000.0,195300000.0,0,28000000.0,For Your Eyes Only,1981-06-01,6.3,1981 +Roger Moore,15,Bajaj,131,6.5,67900000.0,187500000.0,0,27500000.0,Octopussy,1983-03-01,5.3,1983 +Roger Moore,5,Rolls Royce,131,6.2,50327960.0,152627960.0,0,30000000.0,A View to a Kill,1985-10-01,4.7,1985 +Timothy Dalton,13,Rolls Royce,130,6.7,51185000.0,191200000.0,2,40000000.0,The Living Daylights,1987-05-01,6.3,1987 +Timothy Dalton,10,Aston Martin,133,6.5,34667015.0,156167015.0,1,42000000.0,License to Kill,1989-01-01,6.0,1989 +Pierce Brosnan,47,BMW,130,7.2,106429941.0,356429941.0,1,60000000.0,GoldenEye,1995-09-01,6.9,1995 +Pierce Brosnan,30,Aston 
Martin,119,6.4,125304276.0,339504276.0,1,110000000.0,Tomorrow Never Dies,1997-07-01,6.0,1997 +Pierce Brosnan,27,BMW,128,6.3,126930660.0,361730660.0,1,135000000.0,The World Is Not Enough,1999-06-01,5.7,1999 +Pierce Brosnan,31,Aston Martin,133,6.0,160942139.0,431942139.0,2,142000000.0,Die Another Day,2002-08-01,6.1,2002 +Daniel Craig,11,Aston Martin,144,7.9,167365000.0,596365000.0,3,102000000.0,Casino Royale,2006-02-01,7.8,2006 +Daniel Craig,16,Aston Martin,106,6.7,169368427.0,591692078.0,6,230000000.0,Quantum of Solace,2008-12-01,6.1,2008 +Daniel Craig,26,Aston Martin,143,7.8,304360277.0,1108561108.0,1,200000000.0,Skyfall,2012-11-01,8.2,2012 +Daniel Craig,30,Aston Martin,148,6.8,200074175.0,879620923.0,1,245000000.0,Spectre,2015-09-01,6.4,2015 +Daniel Craig,14,Aston Martin,163,7.3,160891007.0,759959662.0,1,275000000.0,No Time to Die,2021-11-01,7.3,2021