JupiterLab_Lyamin/.ipynb_checkpoints/week2_analysis-checkpoint.ipynb

568 lines
21 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 26,
"id": "fa1e3762-fa47-4329-94a6-a0ba89929225",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Датасет скопирован в рабочую директорию: ./datasets\n"
]
}
],
"source": [
"import os\n",
"import shutil\n",
"\n",
"import kagglehub\n",
"import tqdm as notebook_tqdm\n",
"\n",
"# Download the dataset; the returned path points at the cached copy.\n",
"cache_path = kagglehub.dataset_download(\"podsyp/sales-in-craft-beer-bar\")\n",
"\n",
"# Working directory that receives a copy of the dataset files.\n",
"target_directory = \"./datasets\"\n",
"\n",
"# Create the working directory if it is missing, then copy the cached files\n",
"# into it; dirs_exist_ok=True allows copying into an existing directory.\n",
"os.makedirs(target_directory, exist_ok=True)\n",
"shutil.copytree(cache_path, target_directory, dirs_exist_ok=True)\n",
"\n",
"print(f\"Датасет скопирован в рабочую директорию: {target_directory}\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "18c8b086-9293-43af-bb52-3452bbc69f9b",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Load the two CSV files from the ./datasets working directory.\n",
"df_pr = pd.read_csv(\"./datasets/Product_range.csv\")  # product range table\n",
"df_tr = pd.read_csv(\"./datasets/Transactions.csv\")  # transactions table"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "aa213ec0-625f-4811-9c6e-072ca2a61e52",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 5314 entries, 0 to 5313\n",
"Data columns (total 8 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Product_code 5314 non-null int64 \n",
" 1 Vendor_code 4288 non-null object \n",
" 2 Name 5314 non-null object \n",
" 3 Retail_price 4878 non-null float64\n",
" 4 Base_unit 4910 non-null object \n",
" 5 Country_of_Origin 4205 non-null object \n",
" 6 Size 4626 non-null float64\n",
" 7 ABV 4622 non-null float64\n",
"dtypes: float64(3), int64(1), object(4)\n",
"memory usage: 332.3+ KB\n",
"None\n",
" Product_code Retail_price Size ABV\n",
"count 5314.000000 4878.000000 4626.000000 4622.000000\n",
"mean 2690.844750 637.839502 0.626917 7.074273\n",
"std 1543.217814 504.895006 0.953290 2.457970\n",
"min 2.000000 1.000000 0.150000 0.500000\n",
"25% 1357.250000 350.000000 0.330000 5.300000\n",
"50% 2690.500000 520.000000 0.500000 6.500000\n",
"75% 4027.750000 730.000000 1.000000 8.200000\n",
"max 5358.000000 8484.850000 30.000000 21.000000\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Product_code</th>\n",
" <th>Vendor_code</th>\n",
" <th>Name</th>\n",
" <th>Retail_price</th>\n",
" <th>Base_unit</th>\n",
" <th>Country_of_Origin</th>\n",
" <th>Size</th>\n",
" <th>ABV</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5028</td>\n",
" <td>NaN</td>\n",
" <td>1 Symbiotica Apple 0.375</td>\n",
" <td>300.0</td>\n",
" <td>Pieces</td>\n",
" <td>Russia</td>\n",
" <td>0.375</td>\n",
" <td>4.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4846</td>\n",
" <td>NaN</td>\n",
" <td>1 Symbiotica Hard Kombucha Renegade Aronia 0.33</td>\n",
" <td>200.0</td>\n",
" <td>Pieces</td>\n",
" <td>Russia</td>\n",
" <td>0.330</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1340</td>\n",
" <td>NaN</td>\n",
" <td>1000 IBU Imperial IPA Barrel l</td>\n",
" <td>960.0</td>\n",
" <td>Liters</td>\n",
" <td>NaN</td>\n",
" <td>1.000</td>\n",
" <td>9.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4372</td>\n",
" <td>NaN</td>\n",
" <td>18th Street Brewery</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>USA</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4302</td>\n",
" <td>NaN</td>\n",
" <td>18th Street Brewery Deal With The Devil 0.473</td>\n",
" <td>630.0</td>\n",
" <td>Pieces</td>\n",
" <td>USA</td>\n",
" <td>0.473</td>\n",
" <td>8.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5309</th>\n",
" <td>868</td>\n",
" <td>NaN</td>\n",
" <td>Spagetti Vestern</td>\n",
" <td>880.0</td>\n",
" <td>Liters</td>\n",
" <td>NaN</td>\n",
" <td>1.000</td>\n",
" <td>8.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5310</th>\n",
" <td>1861</td>\n",
" <td>NaN</td>\n",
" <td>Stoun Imperial Rashn Stout Barrel l</td>\n",
" <td>1200.0</td>\n",
" <td>Liters</td>\n",
" <td>NaN</td>\n",
" <td>1.000</td>\n",
" <td>10.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5311</th>\n",
" <td>4724</td>\n",
" <td>NaN</td>\n",
" <td>Semjuel Adams Boston Lager Barrel</td>\n",
" <td>720.0</td>\n",
" <td>Liters</td>\n",
" <td>NaN</td>\n",
" <td>1.000</td>\n",
" <td>4.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5312</th>\n",
" <td>822</td>\n",
" <td>Bakunin</td>\n",
" <td>Bakunin Urban Juice</td>\n",
" <td>600.0</td>\n",
" <td>Liters</td>\n",
" <td>Russia</td>\n",
" <td>1.000</td>\n",
" <td>7.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5313</th>\n",
" <td>1682</td>\n",
" <td>NaN</td>\n",
" <td>Hazeljuteli Shoktabulous Barrel l temnoe fil't...</td>\n",
" <td>880.0</td>\n",
" <td>Liters</td>\n",
" <td>NaN</td>\n",
" <td>1.000</td>\n",
" <td>5.7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5314 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" Product_code Vendor_code \\\n",
"0 5028 NaN \n",
"1 4846 NaN \n",
"2 1340 NaN \n",
"3 4372 NaN \n",
"4 4302 NaN \n",
"... ... ... \n",
"5309 868 NaN \n",
"5310 1861 NaN \n",
"5311 4724 NaN \n",
"5312 822 Bakunin \n",
"5313 1682 NaN \n",
"\n",
" Name Retail_price \\\n",
"0 1 Symbiotica Apple 0.375 300.0 \n",
"1 1 Symbiotica Hard Kombucha Renegade Aronia 0.33 200.0 \n",
"2 1000 IBU Imperial IPA Barrel l 960.0 \n",
"3 18th Street Brewery NaN \n",
"4 18th Street Brewery Deal With The Devil 0.473 630.0 \n",
"... ... ... \n",
"5309 Spagetti Vestern 880.0 \n",
"5310 Stoun Imperial Rashn Stout Barrel l 1200.0 \n",
"5311 Semjuel Adams Boston Lager Barrel 720.0 \n",
"5312 Bakunin Urban Juice 600.0 \n",
"5313 Hazeljuteli Shoktabulous Barrel l temnoe fil't... 880.0 \n",
"\n",
" Base_unit Country_of_Origin Size ABV \n",
"0 Pieces Russia 0.375 4.5 \n",
"1 Pieces Russia 0.330 3.0 \n",
"2 Liters NaN 1.000 9.6 \n",
"3 NaN USA NaN NaN \n",
"4 Pieces USA 0.473 8.5 \n",
"... ... ... ... ... \n",
"5309 Liters NaN 1.000 8.7 \n",
"5310 Liters NaN 1.000 10.8 \n",
"5311 Liters NaN 1.000 4.8 \n",
"5312 Liters Russia 1.000 7.2 \n",
"5313 Liters NaN 1.000 5.7 \n",
"\n",
"[5314 rows x 8 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DataFrame.info() prints its report itself and returns None, so it is called\n",
"# directly; wrapping it in print() appended a stray \"None\" line to the output.\n",
"df_pr.info()\n",
"print(df_pr.describe())\n",
"df_pr"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "13968604-1997-4b8b-a250-906fc834a2ab",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 50084 entries, 0 to 50083\n",
"Data columns (total 8 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Date_and_time_of_unloading 50084 non-null object \n",
" 1 Product_code 50084 non-null int64 \n",
" 2 Amount 50084 non-null float64\n",
" 3 Sale_amount 50033 non-null float64\n",
" 4 Discount_amount 39882 non-null float64\n",
" 5 Profit 50070 non-null float64\n",
" 6 Percentage_markup 48145 non-null float64\n",
" 7 Discount_percentage 39882 non-null float64\n",
"dtypes: float64(6), int64(1), object(1)\n",
"memory usage: 3.1+ MB\n",
"None\n",
" Product_code Amount Sale_amount Discount_amount \\\n",
"count 50084.000000 50084.000000 50033.000000 39882.000000 \n",
"mean 2153.160031 3.553146 1429.786677 243.761323 \n",
"std 1367.357705 5.485805 2419.585455 405.902060 \n",
"min 99.000000 0.033000 0.410000 0.010000 \n",
"25% 899.000000 1.000000 330.000000 60.000000 \n",
"50% 2098.000000 2.000000 630.000000 121.760000 \n",
"75% 3059.000000 4.000000 1436.130000 270.615000 \n",
"max 5322.000000 248.000000 58184.070000 20440.630000 \n",
"\n",
" Profit Percentage_markup Discount_percentage \n",
"count 50070.000000 48145.000000 39882.000000 \n",
"mean 705.901987 109.184511 17.251313 \n",
"std 1352.628611 1182.538753 16.608075 \n",
"min -9300.630000 -100.000000 0.000000 \n",
"25% 143.070000 59.850000 7.930000 \n",
"50% 283.000000 84.210000 12.000000 \n",
"75% 687.015000 107.790000 20.210000 \n",
"max 33352.460000 79900.000000 100.000000 \n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Date_and_time_of_unloading</th>\n",
" <th>Product_code</th>\n",
" <th>Amount</th>\n",
" <th>Sale_amount</th>\n",
" <th>Discount_amount</th>\n",
" <th>Profit</th>\n",
" <th>Percentage_markup</th>\n",
" <th>Discount_percentage</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020-01-01 23:00:00</td>\n",
" <td>144</td>\n",
" <td>1.0</td>\n",
" <td>280.00</td>\n",
" <td>NaN</td>\n",
" <td>155.00</td>\n",
" <td>124.00</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2020-01-01 23:00:00</td>\n",
" <td>209</td>\n",
" <td>2.0</td>\n",
" <td>545.73</td>\n",
" <td>294.27</td>\n",
" <td>75.73</td>\n",
" <td>16.11</td>\n",
" <td>35.03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2020-01-01 23:00:00</td>\n",
" <td>213</td>\n",
" <td>2.0</td>\n",
" <td>1265.05</td>\n",
" <td>34.95</td>\n",
" <td>653.05</td>\n",
" <td>106.71</td>\n",
" <td>2.69</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2020-01-01 23:00:00</td>\n",
" <td>217</td>\n",
" <td>1.0</td>\n",
" <td>630.00</td>\n",
" <td>70.00</td>\n",
" <td>220.50</td>\n",
" <td>53.85</td>\n",
" <td>10.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2020-01-01 23:00:00</td>\n",
" <td>222</td>\n",
" <td>2.0</td>\n",
" <td>1104.75</td>\n",
" <td>195.25</td>\n",
" <td>393.75</td>\n",
" <td>55.38</td>\n",
" <td>15.02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50079</th>\n",
" <td>2022-09-18 15:00:00</td>\n",
" <td>5316</td>\n",
" <td>6.0</td>\n",
" <td>1875.95</td>\n",
" <td>104.05</td>\n",
" <td>1095.95</td>\n",
" <td>140.51</td>\n",
" <td>5.26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50080</th>\n",
" <td>2022-09-18 15:00:00</td>\n",
" <td>5317</td>\n",
" <td>2.0</td>\n",
" <td>555.95</td>\n",
" <td>104.05</td>\n",
" <td>315.95</td>\n",
" <td>131.65</td>\n",
" <td>15.77</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50081</th>\n",
" <td>2022-09-18 15:00:00</td>\n",
" <td>5318</td>\n",
" <td>2.0</td>\n",
" <td>572.50</td>\n",
" <td>87.50</td>\n",
" <td>312.50</td>\n",
" <td>120.19</td>\n",
" <td>13.26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50082</th>\n",
" <td>2022-09-18 15:00:00</td>\n",
" <td>5321</td>\n",
" <td>1.0</td>\n",
" <td>300.00</td>\n",
" <td>NaN</td>\n",
" <td>180.00</td>\n",
" <td>150.00</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50083</th>\n",
" <td>2022-09-18 15:00:00</td>\n",
" <td>5322</td>\n",
" <td>2.0</td>\n",
" <td>600.00</td>\n",
" <td>NaN</td>\n",
" <td>340.00</td>\n",
" <td>130.77</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>50084 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" Date_and_time_of_unloading Product_code Amount Sale_amount \\\n",
"0 2020-01-01 23:00:00 144 1.0 280.00 \n",
"1 2020-01-01 23:00:00 209 2.0 545.73 \n",
"2 2020-01-01 23:00:00 213 2.0 1265.05 \n",
"3 2020-01-01 23:00:00 217 1.0 630.00 \n",
"4 2020-01-01 23:00:00 222 2.0 1104.75 \n",
"... ... ... ... ... \n",
"50079 2022-09-18 15:00:00 5316 6.0 1875.95 \n",
"50080 2022-09-18 15:00:00 5317 2.0 555.95 \n",
"50081 2022-09-18 15:00:00 5318 2.0 572.50 \n",
"50082 2022-09-18 15:00:00 5321 1.0 300.00 \n",
"50083 2022-09-18 15:00:00 5322 2.0 600.00 \n",
"\n",
" Discount_amount Profit Percentage_markup Discount_percentage \n",
"0 NaN 155.00 124.00 NaN \n",
"1 294.27 75.73 16.11 35.03 \n",
"2 34.95 653.05 106.71 2.69 \n",
"3 70.00 220.50 53.85 10.00 \n",
"4 195.25 393.75 55.38 15.02 \n",
"... ... ... ... ... \n",
"50079 104.05 1095.95 140.51 5.26 \n",
"50080 104.05 315.95 131.65 15.77 \n",
"50081 87.50 312.50 120.19 13.26 \n",
"50082 NaN 180.00 150.00 NaN \n",
"50083 NaN 340.00 130.77 NaN \n",
"\n",
"[50084 rows x 8 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DataFrame.info() prints its report itself and returns None, so it is called\n",
"# directly; wrapping it in print() appended a stray \"None\" line to the output.\n",
"df_tr.info()\n",
"print(df_tr.describe())\n",
"df_tr"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}