MIT-Emerging-Talent
diff --git a/‎4_data_analysis/MLProject.ipynb‎
Lines changed: 101 additions & 6 deletions b/‎4_data_analysis/MLProject.ipynb‎
Lines changed: 101 additions & 6 deletions
diff --git a/‎4_data_analysis/model_artifacts/country_encoder.pkl‎
941 Bytes b/‎4_data_analysis/model_artifacts/country_encoder.pkl‎
941 Bytes
diff --git a/‎4_data_analysis/model_artifacts/feature_importance.csv‎
Lines changed: 8 additions & 0 deletions b/‎4_data_analysis/model_artifacts/feature_importance.csv‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎4_data_analysis/model_artifacts/linear_regression_model.pkl‎
673 Bytes b/‎4_data_analysis/model_artifacts/linear_regression_model.pkl‎
673 Bytes
diff --git a/‎4_data_analysis/model_artifacts/scaler.pkl‎
1.09 KB b/‎4_data_analysis/model_artifacts/scaler.pkl‎
1.09 KB
diff --git a/‎4_data_analysis/model_artifacts/visitor_type_encoder.pkl‎
517 Bytes b/‎4_data_analysis/model_artifacts/visitor_type_encoder.pkl‎
517 Bytes
@@ -633,12 +633,6 @@
     "print(\"Features scaled using StandardScaler\")\n"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "ab51a910",
-   "metadata": {},
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "id": "87025184",
@@ -1081,6 +1075,107 @@
    "metadata": {},
    "outputs": [],
    "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "23b7b079",
+   "metadata": {},
+   "source": [
+    "## 10. Model Interpretation and Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "fb356334",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== MODEL INTERPRETATION ===\n",
+      "\n",
+      "1. MODEL PERFORMANCE:\n",
+      "   - The model explains approximately 7.1% of variance in tourist numbers\n",
+      "   - Average prediction error (RMSE): 1077 thousand trips\n",
+      "   - Average absolute error (MAE): 701 thousand trips\n",
+      "\n",
+      "2. KEY FINDINGS:\n",
+      "   - Year has a significant impact on tourist numbers\n",
+      "   - Country and visitor type are important predictors\n",
+      "   - COVID-19 period indicator helps capture pandemic effects\n",
+      "\n",
+      "3. MODEL LIMITATIONS:\n",
+      "   - Linear model may not capture complex non-linear relationships\n",
+      "   - Model doesn't account for economic factors or events\n",
+      "   - Predictions for extreme years may be less accurate\n",
+      "\n",
+      "4. RECOMMENDATIONS FOR IMPROVEMENT:\n",
+      "   - Add more features (GDP, flight availability, marketing budget)\n",
+      "   - Try polynomial regression for non-linear relationships\n",
+      "   - Use time series models (ARIMA, Prophet) for temporal patterns\n",
+      "   - Implement ensemble methods (Random Forest, Gradient Boosting)\n",
+      "\n",
+      "=== SAVING MODEL ARTIFACTS ===\n",
+      "Model artifacts saved to 'model_artifacts/' directory\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Step 10.1: Provide model interpretation\n",
+    "print(\"=== MODEL INTERPRETATION ===\")\n",
+    "\n",
+    "# test_r2, test_rmse and test_mae are not defined in this cell; they must\n",
+    "# already exist in the kernel when it runs.\n",
+    "print(\"\\n1. MODEL PERFORMANCE:\")\n",
+    "performance_lines = (\n",
+    "    f\"   - The model explains approximately {test_r2 * 100:.1f}% of variance in tourist numbers\",\n",
+    "    f\"   - Average prediction error (RMSE): {test_rmse:.0f} thousand trips\",\n",
+    "    f\"   - Average absolute error (MAE): {test_mae:.0f} thousand trips\",\n",
+    ")\n",
+    "for performance_line in performance_lines:\n",
+    "    print(performance_line)\n",
+    "\n",
+    "# Static report text: each heading maps to the bullet lines printed under it.\n",
+    "# NOTE(review): these sections are fixed strings, not computed from test_r2 or\n",
+    "# feature_importance -- re-check the wording whenever the model results change.\n",
+    "narrative_sections = {\n",
+    "    \"\\n2. KEY FINDINGS:\": (\n",
+    "        \"   - Year has a significant impact on tourist numbers\",\n",
+    "        \"   - Country and visitor type are important predictors\",\n",
+    "        \"   - COVID-19 period indicator helps capture pandemic effects\",\n",
+    "    ),\n",
+    "    \"\\n3. MODEL LIMITATIONS:\": (\n",
+    "        \"   - Linear model may not capture complex non-linear relationships\",\n",
+    "        \"   - Model doesn't account for economic factors or events\",\n",
+    "        \"   - Predictions for extreme years may be less accurate\",\n",
+    "    ),\n",
+    "    \"\\n4. RECOMMENDATIONS FOR IMPROVEMENT:\": (\n",
+    "        \"   - Add more features (GDP, flight availability, marketing budget)\",\n",
+    "        \"   - Try polynomial regression for non-linear relationships\",\n",
+    "        \"   - Use time series models (ARIMA, Prophet) for temporal patterns\",\n",
+    "        \"   - Implement ensemble methods (Random Forest, Gradient Boosting)\",\n",
+    "    ),\n",
+    "}\n",
+    "for section_heading, section_bullets in narrative_sections.items():\n",
+    "    print(section_heading)\n",
+    "    for bullet_line in section_bullets:\n",
+    "        print(bullet_line)\n",
+    "\n",
+    "# Step 10.2: Save the model (optional)\n",
+    "print(\"\\n=== SAVING MODEL ARTIFACTS ===\")\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "import joblib\n",
+    "\n",
+    "# Make sure the output directory exists (no error if it is already there)\n",
+    "os.makedirs(\"model_artifacts\", exist_ok=True)\n",
+    "\n",
+    "# Model and preprocessing objects to persist, keyed by destination path\n",
+    "artifact_objects = {\n",
+    "    \"model_artifacts/linear_regression_model.pkl\": model,\n",
+    "    \"model_artifacts/scaler.pkl\": scaler,\n",
+    "    \"model_artifacts/country_encoder.pkl\": country_encoder,\n",
+    "    \"model_artifacts/visitor_type_encoder.pkl\": visitor_type_encoder,\n",
+    "}\n",
+    "for artifact_path, fitted_object in artifact_objects.items():\n",
+    "    joblib.dump(fitted_object, artifact_path)\n",
+    "\n",
+    "# Write the feature importance table alongside the pickles, without the index\n",
+    "feature_importance.to_csv(\"model_artifacts/feature_importance.csv\", index=False)\n",
+    "\n",
+    "print(\"Model artifacts saved to 'model_artifacts/' directory\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "62013b02",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
 
@@ -0,0 +1,8 @@
+Feature,Coefficient,Abs_Coefficient
+visitor_type_encoded,283.31148247758387,283.31148247758387
+year,267.66304278214784,267.66304278214784
+covid_period,-206.83550341605488,206.83550341605488
+country_encoded,-40.460696191478945,40.460696191478945
+decade,-24.509425463626023,24.509425463626023
+post_2000,-14.076701401267819,14.076701401267819
+post_2010,-4.197868757249495,4.197868757249495