diff --git a/cs2109s/labs/ps4/housing_data.csv b/cs2109s/labs/ps4/housing_data.csv new file mode 100644 index 0000000..bf37a4c --- /dev/null +++ b/cs2109s/labs/ps4/housing_data.csv @@ -0,0 +1,91 @@ +floor_area_sqm,bedrooms,schools,asking_price +60,2,3,480000.0 +63,2,4,450000.0 +65,2,1,480000.0 +65,2,4,360000.0 +66,2,5,350000.0 +67,2,0,400000.0 +67,2,1,430000.0 +67,2,2,450000.0 +67,2,3,410000.0 +67,2,4,390000.0 +67,2,5,390000.0 +67,2,6,390000.0 +68,2,1,460000.0 +68,2,2,400000.0 +68,2,3,370000.0 +68,2,4,360000.0 +68,2,6,370000.0 +68,2,7,400000.0 +68,2,8,370000.0 +69,2,3,420000.0 +70,2,3,510000.0 +70,2,8,370000.0 +73,2,5,370000.0 +73,2,7,380000.0 +83,2,1,590000.0 +83,3,1,650000.0 +83,3,2,550000.0 +83,3,3,550000.0 +83,3,4,580000.0 +84,3,3,560000.0 +86,3,2,560000.0 +88,3,3,510000.0 +89,3,1,610000.0 +90,3,3,500000.0 +90,3,4,520000.0 +90,3,5,490000.0 +91,3,0,550000.0 +91,3,3,620000.0 +92,3,1,490000.0 +92,3,2,590000.0 +92,3,3,490000.0 +92,3,4,510000.0 +92,3,5,490000.0 +92,3,6,490000.0 +92,3,7,510000.0 +93,3,0,490000.0 +93,3,1,660000.0 +93,3,2,540000.0 +93,3,3,550000.0 +93,3,4,490000.0 +93,3,5,510000.0 +93,3,6,490000.0 +93,3,7,490000.0 +94,3,2,590000.0 +94,3,3,550000.0 +94,3,4,490000.0 +94,3,5,540000.0 +95,3,1,700000.0 +95,3,2,620000.0 +95,3,3,530000.0 +95,3,4,540000.0 +96,3,2,600000.0 +96,3,7,480000.0 +97,3,1,660000.0 +97,3,2,600000.0 +97,3,4,440000.0 +98,3,5,450000.0 +104,3,1,720000.0 +105,3,0,630000.0 +105,3,1,670000.0 +105,3,2,670000.0 +105,4,1,610000.0 +108,3,1,780000.0 +109,3,2,700000.0 +110,3,3,670000.0 +110,3,4,660000.0 +111,3,3,640000.0 +112,3,1,700000.0 +112,3,2,660000.0 +112,3,3,670000.0 +112,3,4,650000.0 +113,3,1,640000.0 +113,3,2,590000.0 +113,3,3,600000.0 +113,3,4,590000.0 +113,3,5,590000.0 +115,3,1,720000.0 +115,3,2,700000.0 +119,4,3,690000.0 +124,3,1,770000.0 diff --git a/cs2109s/labs/ps4/imgs/add_bias.jpeg b/cs2109s/labs/ps4/imgs/add_bias.jpeg new file mode 100644 index 0000000..19b782f Binary files /dev/null and b/cs2109s/labs/ps4/imgs/add_bias.jpeg differ 
diff --git a/cs2109s/labs/ps4/imgs/bias_scatter.ipynb b/cs2109s/labs/ps4/imgs/bias_scatter.ipynb new file mode 100644 index 0000000..b6ce3ea --- /dev/null +++ b/cs2109s/labs/ps4/imgs/bias_scatter.ipynb @@ -0,0 +1,87 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f64907f4", + "metadata": {}, + "outputs": [], + "source": [ + "# CODE TO GENERATE bias_scatter.png\n", + "import numpy as np\n", + "from sklearn.linear_model import LinearRegression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf22eb4a", + "metadata": {}, + "outputs": [], + "source": [ + "X = np.array([1, 2, 3, 4, 5, 6]).reshape((-1, 1))\n", + "y = np.array([6, 7, 8, 8, 9, 11])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b185d8b9", + "metadata": {}, + "outputs": [], + "source": [ + "model_no_bias = LinearRegression(fit_intercept = False).fit(X, y)\n", + "model_with_bias = LinearRegression().fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5076cdc", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "X_with_zero = np.vstack([0, X]) # Added to show the lines passing through Feature=0\n", + "plt.scatter(X, y)\n", + "plt.plot(X_with_zero, model_no_bias.predict(X_with_zero), color = 'b', label=\"Without bias\")\n", + "plt.plot(X_with_zero, model_with_bias.predict(X_with_zero), color = 'r', label=\"With bias\")\n", + "plt.ylim(ymin=0)\n", + "plt.xlim(xmin=0, xmax=8)\n", + "plt.xlabel(\"Feature\")\n", + "plt.ylabel(\"Target\")\n", + "plt.legend(loc=\"center right\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e68dd023", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": 
".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/cs2109s/labs/ps4/imgs/bias_scatter.png b/cs2109s/labs/ps4/imgs/bias_scatter.png new file mode 100644 index 0000000..c7671a9 Binary files /dev/null and b/cs2109s/labs/ps4/imgs/bias_scatter.png differ diff --git a/cs2109s/labs/ps4/imgs/feature_scaling.png b/cs2109s/labs/ps4/imgs/feature_scaling.png new file mode 100644 index 0000000..2145dfc Binary files /dev/null and b/cs2109s/labs/ps4/imgs/feature_scaling.png differ diff --git a/cs2109s/labs/ps4/imgs/grad_desc_algorithm.png b/cs2109s/labs/ps4/imgs/grad_desc_algorithm.png new file mode 100644 index 0000000..11a9eb2 Binary files /dev/null and b/cs2109s/labs/ps4/imgs/grad_desc_algorithm.png differ diff --git a/cs2109s/labs/ps4/imgs/grad_desc_algorithm.tex b/cs2109s/labs/ps4/imgs/grad_desc_algorithm.tex new file mode 100644 index 0000000..7ab2deb --- /dev/null +++ b/cs2109s/labs/ps4/imgs/grad_desc_algorithm.tex @@ -0,0 +1,24 @@ +% CODE TO GENERATE grad_desc_algorithm.tex +\documentclass{article} +\usepackage{algorithm2e} +\usepackage{bm} +\begin{document} +\RestyleAlgo{ruled} +\DontPrintSemicolon +\begin{algorithm}[hbt!] 
+ \caption{Gradient Descent for Linear Regression} + $w_0, w_1, \ldots, w_n \gets 0$\; + \For{$i\gets 1$ \KwTo $N$}{ + $w'_0 \gets w_0 - \alpha\frac{\partial J(\boldsymbol{w})}{\partial w_0}$\; + $w'_1 \gets w_1 - \alpha\frac{\partial J(\boldsymbol{w})}{\partial w_1}$\; + $\vdots$\; + $w'_n \gets w_n - \alpha\frac{\partial J(\boldsymbol{w})}{\partial w_n}$\; + $w_0 \gets w'_0$\; + $w_1 \gets w'_1$\; + $\vdots$\; + $w_n \gets w'_n$\; + $loss \gets J(\boldsymbol{w})$\; + } +\end{algorithm} + +\end{document} \ No newline at end of file diff --git a/cs2109s/labs/ps4/imgs/linear_reg.png b/cs2109s/labs/ps4/imgs/linear_reg.png new file mode 100644 index 0000000..c3e85cd Binary files /dev/null and b/cs2109s/labs/ps4/imgs/linear_reg.png differ diff --git a/cs2109s/labs/ps4/imgs/loss.png b/cs2109s/labs/ps4/imgs/loss.png new file mode 100644 index 0000000..564d877 Binary files /dev/null and b/cs2109s/labs/ps4/imgs/loss.png differ diff --git a/cs2109s/labs/ps4/imgs/mse_plot.png b/cs2109s/labs/ps4/imgs/mse_plot.png new file mode 100644 index 0000000..7abd43f Binary files /dev/null and b/cs2109s/labs/ps4/imgs/mse_plot.png differ diff --git a/cs2109s/labs/ps4/imgs/poly_matrix.png b/cs2109s/labs/ps4/imgs/poly_matrix.png new file mode 100644 index 0000000..144a799 Binary files /dev/null and b/cs2109s/labs/ps4/imgs/poly_matrix.png differ diff --git a/cs2109s/labs/ps4/imgs/poly_reg.png b/cs2109s/labs/ps4/imgs/poly_reg.png new file mode 100644 index 0000000..955f801 Binary files /dev/null and b/cs2109s/labs/ps4/imgs/poly_reg.png differ diff --git a/cs2109s/labs/ps4/imgs/school_price_rel.png b/cs2109s/labs/ps4/imgs/school_price_rel.png new file mode 100644 index 0000000..96c9504 Binary files /dev/null and b/cs2109s/labs/ps4/imgs/school_price_rel.png differ diff --git a/cs2109s/labs/ps4/imgs/school_price_rel_cubicfit.png b/cs2109s/labs/ps4/imgs/school_price_rel_cubicfit.png new file mode 100644 index 0000000..0a1ed44 Binary files /dev/null and 
b/cs2109s/labs/ps4/imgs/school_price_rel_cubicfit.png differ diff --git a/cs2109s/labs/ps4/imgs/school_price_rel_linearfit.png b/cs2109s/labs/ps4/imgs/school_price_rel_linearfit.png new file mode 100644 index 0000000..610a8e1 Binary files /dev/null and b/cs2109s/labs/ps4/imgs/school_price_rel_linearfit.png differ diff --git a/cs2109s/labs/ps4/ps4.ipynb b/cs2109s/labs/ps4/ps4.ipynb new file mode 100644 index 0000000..72f0fe4 --- /dev/null +++ b/cs2109s/labs/ps4/ps4.ipynb @@ -0,0 +1,1278 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Problem Set 4: Predicting House Prices in Singapore\n", + "\n", + "**Release Date:** 13 February 2024\n", + "\n", + "**Due Date:** 23:59, 24 February 2024" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "We have learned how to solve a regression problem using linear regression in class.\n", + "In this problem set, we will apply our knowledge to solve a real-world problem. More\n", + "specifically, we will develop linear regression and polynomial regression models to predict\n", + "house prices in Singapore.\n", + "\n", + "**Required Files**:\n", + "* ps4.ipynb\n", + "* housing_data.csv\n", + "\n", + "**Honour Code**: Note that plagiarism will not be condoned! You may discuss with your classmates and check the internet for references, but you MUST NOT submit code/report that is copied directly from other sources!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Orientation to files\n", + "\n", + "**ps4.ipynb**:\n", + "The template for all your tasks is provided in this file. Some test cases have\n", + "been provided for you to check the output of your algorithm against the expected result. 
The tests are **not** comprehensive, and you are\n", + "encouraged to write your own tests to check for correctness.\n", + "\n", + "**housing_data.csv**:\n", + "There are 90 housing data points. Each data point consists of 3 features:\n", + "* **floor_area_sqm** - size of the house in square meters\n", + "* **bedrooms** - number of bedrooms\n", + "* **schools** - number of primary schools within a 1km radius\n", + "\n", + "Our target value is the **asking_price**, which is the price of the housing unit." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### IMPORTANT\n", + "\n", + "Similar to PS0, your implementation in the following tasks **should not\n", + "involve any iteration, including `map` and `filter`, or recursion**. Instead, please work\n", + "with the operations available in NumPy. Solutions that violate this will be penalised.\n", + "\n", + "There is, however, an exception for **Tasks 2.4, 2.5, and 3.4**. In the pseudo-code for the\n", + "algorithm required, there is an explicit while loop. Hence, **only for these tasks**, you\n", + "may use a **single for/while loop** to iterate for the number of epochs required.\n", + "\n", + " You are allowed to use any mathematical functions, but this **does not mean that you are allowed to\n", + "use *any* NumPy function** (there are NumPy functions that aren’t mathematical functions). For example,\n", + "`np.vectorize` is not allowed since it is iterative. 
If you are in doubt about which functions are allowed, please\n", + "ask in the forum (:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Initial imports and setup\n", + "\n", + "import os\n", + "import numpy as np\n", + "\n", + "###################\n", + "# Helper function #\n", + "###################\n", + "def load_data(filepath):\n", + " \"\"\"\n", + " Load in the given csv filepath as a numpy array\n", + "\n", + " Parameters\n", + " ----------\n", + " filepath (string) : path to csv file\n", + "\n", + " Returns\n", + " -------\n", + " X, y (np.ndarray, np.ndarray) : (m, num_features), (m, 1) numpy matrices\n", + " \"\"\"\n", + " *X, y = np.genfromtxt(\n", + " filepath,\n", + " delimiter=',',\n", + " skip_header=True,\n", + " unpack=True,\n", + " ) # default dtype: float\n", + " X = np.array(X, dtype=float).T # cast features to float type\n", + " return X, y.reshape((-1, 1))\n", + "\n", + "data_filepath = 'housing_data.csv'\n", + "X, y = load_data(data_filepath)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 1: Defining cost functions\n", + "\n", + "We need to define cost functions before creating a linear regression model to calculate\n", + "the error between our prediction and the true values. We will define two cost functions:\n", + "Mean Squared Error (MSE) and Mean Absolute Error (MAE)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 1.1: Mean Squared Error (MSE)\n", + "\n", + "Write the function `mean_squared_error(y_true, y_pred)` that returns a number representing the mean squared error of the predictions.\n", + "\n", + "The formula for Mean Squared Error is as follows:\n", + "$$ MSE(\\boldsymbol{y}, \\boldsymbol{\\hat{y}}) = \\frac{1}{2m} \\sum_{i=1}^{m}(\\hat{y}_i - y_i)^2 $$\n", + "\n", + "where $\\boldsymbol{y}$ is the vector with actual values, $\\boldsymbol{\\hat{y}}$ is the prediction vector, and $m$ is the number of samples in the\n", + "training data.\n", + "\n", + "**Remark**: The formula here follows the lecture slides for consistency. In definitions and implementations elsewhere, the denominator is usually just $m$ instead of $2m$.\n", + "\n", + "**Hint**: Consider using `np.square` or `np.power`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def mean_squared_error(y_true, y_pred):\n", + " \"\"\"\n", + " Calculate mean squared error between y_pred and y_true.\n", + "\n", + " Parameters\n", + " ----------\n", + " y_true (np.ndarray) : (m, 1) numpy matrix consists of true values\n", + " y_pred (np.ndarray) : (m, 1) numpy matrix consists of predictions\n", + " \n", + " Returns\n", + " -------\n", + " The mean squared error value.\n", + " \"\"\"\n", + " return np.mean(np.square(y_true - y_pred))\n", + " \n", + " # TODO: add your solution here and remove `raise NotImplementedError`\n", + " raise NotImplementedError" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "y_true, y_pred = np.array([[3], [5]]), np.array([[12], [15]])\n", + "\n", + "assert mean_squared_error(y_true, y_pred) in [45.25, 90.5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 1.2: Mean Absolute Error (MAE)\n", + "\n", + "Write the function `mean_absolute_error(y_true, 
y_pred)` that returns a number representing the mean absolute error of the predictions.\n", + "\n", + "The formula for Mean Absolute Error is as follows:\n", + "$$ MAE(\\boldsymbol{y}, \\boldsymbol{\\hat{y}}) = \\frac{1}{m} \\sum_{i=1}^{m}|\\hat{y}_i - y_i| $$\n", + "\n", + "where $\\boldsymbol{y}$ is the vector with actual values, $\\boldsymbol{\\hat{y}}$ is the prediction vector, and $m$ is the number of samples in the\n", + "training data.\n", + "\n", + "**Hint**: Consider using `np.abs`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def mean_absolute_error(y_true, y_pred):\n", + " \"\"\"\n", + " Calculate mean absolute error between y_pred and y_true.\n", + "\n", + " Parameters\n", + " ----------\n", + " y_true (np.ndarray) : (m, 1) numpy matrix consists of true values\n", + " y_pred (np.ndarray) : (m, 1) numpy matrix consists of predictions\n", + " \n", + " Returns\n", + " -------\n", + " The mean absolute error value.\n", + " \"\"\"\n", + " return np.mean(np.abs(y_true - y_pred))\n", + " \n", + " # TODO: add your solution here and remove `raise NotImplementedError`\n", + " raise NotImplementedError" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "y_true, y_pred = np.array([[3], [5]]), np.array([[12], [15]])\n", + "\n", + "assert mean_absolute_error(y_true, y_pred) == 9.5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 2: Linear Regression\n", + "\n", + "Now we’re ready to create our own linear regression model. We will try to approximate a linear function, which can be written as follows:\n", + "\n", + "$$ y = w_0 + w_1 x_1 + w_2 x_2 + \\dots + w_n x_n $$\n", + "\n", + "where $y$ is the target value, $x_1, x_2, \\dots, x_n$ are feature values, and $w_0, w_1, \\dots, w_n$ are parameters. 
$w_0$ is meant to represent the bias term, while $w_1, \\dots, w_n$ are the feature weights.\n", + "\n", + "**Bias term**\n", + "\n", + "The bias term ($w_0$) is useful in capturing an inherent offset of the target values from the origin, i.e. they have some non-zero value when all features are zero. The bias term accounts for this scenario in our model. Without a bias term (or bias = 0), our regression lines can only pass through the origin, which might not be appropriate for some data.\n", + "\n", + "Consider the scatter plot below. The blue line is the best fitting line without a bias term, while the red line includes a non-zero bias. Since the blue line starts at the origin, it is unable to capture the offset of the points. In contrast, the red line starts higher (at around 5), and hence is better able to approximate the data.\n", + "\n", + " \n", + "
\n", + "\"bias\n", + "
Figure 0: Example of models with bias vs without bias.
\n", + "
\n", + "\n", + "Usually, we have to explicitly add a bias term into our data when building our models. In the following tasks, you'll explore how to do so and how this choice can affect the accuracy of your models." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 2.1: Adding a bias column\n", + "\n", + "In the lecture, we learned that adding a bias column allows our linear model to be more\n", + "flexible. Write the function `add_bias_column(X)` that takes a NumPy matrix `X` and returns\n", + "a new matrix with an additional column. The additional column should have all of its\n", + "elements set to 1 and is located at the first column of the matrix.\n", + "\n", + "
\n", + "\"adding\n", + "
Figure 1: Example of a matrix before and after adding a bias column.
\n", + "
\n", + "\n", + "**Note**: Your function should work for all kinds of matrix shapes.\n", + "\n", + "**Hint**: Consider using `np.hstack` to add the bias column." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def add_bias_column(X):\n", + " \"\"\"\n", + " Create a bias column and combine it with X.\n", + "\n", + " Parameters\n", + " ----------\n", + " X : (m, n) numpy matrix representing a feature matrix\n", + " \n", + " Returns\n", + " -------\n", + " new_X (np.ndarray):\n", + " A (m, n + 1) numpy matrix with the first column consisting of all 1s\n", + " \"\"\"\n", + " return np.hstack((np.ones((X.shape[0], 1)), X))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "without_bias = np.array([[1, 2], [3, 4]])\n", + "expected = np.array([[1, 1, 2], [1, 3, 4]])\n", + "\n", + "assert np.array_equal(add_bias_column(without_bias), expected)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 2.2: Get best fitting bias and weights\n", + "\n", + "Write the function `get_bias_and_weight(X, y, include_bias)` that returns $w_0$ (bias) and\n", + "$w_1, w_2, \\dots, w_n$ (weights) that will lead to best fitting line.\n", + "\n", + "The `include_bias` argument is used to specify if the model includes a bias term, i.e. has a non-zero bias term. Hence, the function should return $w_0 = 0$ if it is set to `false`. The function should return $w_1, \\dots, w_n$ as a NumPy matrix with shape $(n, 1)$, where $n$ is the number of features (excluding the bias column).\n", + "\n", + "We can use the normal equation to get $w_0, w_1, \\dots, w_n$. 
The normal equation is as\n", + "follows:\n", + "\n", + "$$ \\begin{pmatrix} w_0 \\\\ w_1 \\\\ \\vdots \\\\ w_n \\end{pmatrix} = (X^TX)^{-1}X^T \\boldsymbol{y} $$\n", + "\n", + "where $X$ is the (augmented for bias) feature matrix and $\\boldsymbol{y}$ is the vector of target values.\n", + "\n", + "**Note**: You can use the `add_bias_column` function for this task. (You do not need to re-define the function in Coursemology. However, you are free to do so if you explicitly want to use your own implementation of the function for this task.)\n", + "\n", + "**Hint**: Consider using `numpy.linalg.inv` for the matrix inverse." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "def get_bias_and_weight(X, y, include_bias = True):\n", + " \"\"\"\n", + " Calculate bias and weights that give the best fitting line.\n", + "\n", + " Parameters\n", + " ----------\n", + " X (np.ndarray) : (m, n) numpy matrix representing feature matrix\n", + " y (np.ndarray) : (m, 1) numpy matrix representing target values\n", + " include_bias (boolean) : Specify whether the model should include a bias term\n", + " \n", + " Returns\n", + " -------\n", + " bias (float):\n", + " If include_bias = True, return the bias constant. 
Else,\n", + " return 0\n", + " weights (np.ndarray):\n", + " A (n, 1) numpy matrix representing the weight constant(s).\n", + " \"\"\"\n", + " if include_bias:\n", + " X = add_bias_column(X)\n", + " weights = np.linalg.inv(X.T @ X) @ X.T @ y\n", + " return (weights[0][0], weights[1:]) if include_bias else (0, weights)\n", + " \n", + " # TODO: add your solution here and remove `raise NotImplementedError`\n", + " raise NotImplementedError" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "public_X, public_y = np.array([[1, 3], [2, 3], [3, 4]]), np.arange(4, 7).reshape((-1, 1))\n", + "\n", + "test_1 = (round(get_bias_and_weight(public_X, public_y)[0], 5) == 3)\n", + "test_2 = np.array_equal(np.round(get_bias_and_weight(public_X, public_y)[1], 1), np.array([[1.0], [0.0]]))\n", + "test_3 = np.array_equal(np.round(get_bias_and_weight(public_X, public_y, False)[1], 2), np.round(np.array([[0.49], [1.20]]), 2))\n", + "\n", + "assert test_1 and test_2 and test_3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 2.3: Get the prediction line\n", + "\n", + "Write the function `get_prediction_linear_regression(X, y, include_bias)` that returns `y_pred`,\n", + "a vector of predicted values for the training data.\n", + "\n", + "**Note**: You can use the `get_bias_and_weight` function for this task. (You do not need to re-define the function in Coursemology. 
However, you are free to do so if you explicitly want to use your own implementation of the function for this task.)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "def get_prediction_linear_regression(X, y, include_bias = True):\n", + " \"\"\"\n", + " Calculate the best fitting line.\n", + "\n", + " Parameters\n", + " ----------\n", + " X (np.ndarray) : (m, n) numpy matrix representing feature matrix\n", + " y (np.ndarray) : (m, 1) numpy matrix representing target values\n", + " include_bias (boolean) : Specify whether the model should include a bias term\n", + "\n", + " Returns\n", + " -------\n", + " y_pred (np.ndarray):\n", + " A (m, 1) numpy matrix representing prediction values.\n", + " \"\"\"\n", + " bias, weights = get_bias_and_weight(X, y, include_bias)\n", + " return bias + X @ weights" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1 3]\n", + " [2 3]\n", + " [3 4]] 2.9999999999999645 [[1.00000000e+00]\n", + " [1.77635684e-14]]\n", + "[[4.]\n", + " [5.]\n", + " [6.]]\n" + ] + } + ], + "source": [ + "test_X, test_y = np.array([[1, 3], [2, 3], [3, 4]]), np.arange(4, 7).reshape((-1, 1))\n", + "\n", + "assert round(mean_squared_error(test_y, get_prediction_linear_regression(test_X, test_y)), 5) == 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To check your work, create a linear regression model with **floor_area_sqm** as the only\n", + "feature and **asking_price** as the target value. Plot your prediction line using the code\n", + "snippet below. It should look similar to Figure 2.\n", + "\n", + " \n", + "
\n", + "\"regression\n", + "
Figure 2: Example of linear regression using floor_area_sqm as feature.
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "area = X[:, 0].reshape((-1, 1))\n", + "predicted = get_prediction_linear_regression(area, y)\n", + "plt.scatter(area, y)\n", + "plt.plot(area, predicted, color = 'r')\n", + "plt.xlabel(\"Size in square meter\")\n", + "plt.ylabel(\"Price in SGD\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gradient Descent\n", + "\n", + "We will now learn to use gradient descent to approximate $\\boldsymbol{w} = (w_0, w_1, \\dots, w_n)$.\n", + "\n", + "*Gradient descent* is an algorithm that minimizes the cost function by iteratively trying to\n", + "find the best parameters. In linear regression, we will try to minimize the Mean Squared\n", + "Error. The outline of the algorithm is as follows:\n", + " \n", + "* Start with some $\\boldsymbol{w} = (w_0, \\dots, w_n)$\n", + "* Keep changing $w_0,\\dots, w_n$ to minimize $J(\\boldsymbol{w})$, where $J$ is our cost function\n", + "\n", + "In this problem set, we will initially set $w_0, w_1, \\dots, w_n$ to all be 0s. Then, we will set a\n", + "learning rate $\\alpha$ that will affect the rate of change of $w_0, \\dots, w_n$. Lastly, we will set\n", + "$N$ to specify the number of epochs of gradient descent we want to run.\n", + "\n", + "The pseudo-code of Gradient Descent for linear regression is defined in Algorithm 1.\n", + "\n", + "**Note**: In the following gradient descent-related tasks, calculate the value of the loss function *after* updating the bias and weights.\n", + "\n", + " *The Gradient Descent algorithm is not limited to the linear regression model – it is a general optimisation technique and is also used in many other machine learning models such as Neural Networks.*\n", + "\n", + "
\n", + " \"gradient\n", + " \"gradient\n", + "
\n", + "
Figure 3: Gradient descent tries to find parameters that lead to the lowest MSE.
\n", + "\n", + "For MSE, the partial derivative $\\frac{\\partial J(\\boldsymbol w)}{\\partial w_i}$ with $m$ training samples can be derived as: \n", + "$$\n", + "\\frac{\\partial J(\\boldsymbol{w})}{\\partial w_i} = \\frac 1m\\sum^m_{j=1}(h_w(x^{(j)})-y^{(j)})\\cdot x_i^{(j)} \n", + "$$\n", + "where $h_w$ is our hypothesis and $x_0^{(j)} = 1$ for the bias term $w_0$." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 2.4: Gradient Descent on a single feature\n", + "\n", + "Write the function `gradient_descent_one_variable(x, y, lr, number_of_epochs)` that\n", + "returns:\n", + "\n", + "* $w_0$ - a number representing the bias constant\n", + "* $w_1$ - a number representing the weight constant\n", + "* $loss$ - a list that contains the MSE scores calculated during the gradient descent process.\n", + "\n", + "The default value is $10^{-5}$ for `lr` and $250$ for `number_of_epochs`.\n", + "\n", + "**Note**: You can use the `mean_squared_error` function for this task. (You do not need to re-define the function in Coursemology. 
However, you are free to do so if you explicitly want to use your own implementation of the function for this task.)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "def gradient_descent_one_variable(x, y, lr = 1e-5, number_of_epochs = 250):\n", + " \"\"\"\n", + " Approximate bias and weight that give the best fitting line.\n", + "\n", + " Parameters\n", + " ----------\n", + " x (np.ndarray) : (m, 1) numpy matrix representing a feature column\n", + " y (np.ndarray) : (m, 1) numpy matrix representing target values\n", + " lr (float) : Learning rate\n", + " number_of_epochs (int) : Number of gradient descent epochs\n", + " \n", + " Returns\n", + " -------\n", + " bias (float):\n", + " The bias constant\n", + " weight (float):\n", + " The weight constant\n", + " loss (list):\n", + " A list where the i-th element denotes the MSE score at i-th epoch.\n", + " \"\"\"\n", + " # Do not change\n", + " bias = 0\n", + " weight = 0\n", + " loss = []\n", + "\n", + " for _ in range(number_of_epochs):\n", + " predicted_y = bias + weight * x\n", + "\n", + " weight_gradient = np.mean((predicted_y - y)*x)\n", + " weight -= lr * weight_gradient\n", + "\n", + " bias_gradient = np.mean(predicted_y - y)\n", + " bias -= lr * bias_gradient\n", + "\n", + " loss.append(mean_squared_error(y, predicted_y))\n", + " return bias, weight, loss" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "area = X[:, 0].reshape((-1, 1))\n", + "\n", + "loss_initial = gradient_descent_one_variable(area, y, lr = 1e-5, number_of_epochs = 250)[2][0]\n", + "loss_final = gradient_descent_one_variable(area, y, lr = 1e-5, number_of_epochs = 250)[2][-1]\n", + "\n", + "assert loss_initial > loss_final" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To check your work, plot the `loss` against `number_of_epochs` using the code snippet\n", + "below. 
It should be similar to Figure 4.\n", + "\n", + " \n", + "
\n", + "\"gradient\n", + "
Figure 4: MSE values decrease as the epoch number increases.
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "area = X[:, 0].reshape((-1, 1))\n", + "b, w, loss = gradient_descent_one_variable(area, y, 1e-5, 250)\n", + "plt.plot([i for i in range(len(loss))], loss)\n", + "plt.xlabel('Epoch number')\n", + "plt.ylabel('Loss')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 2.5: Gradient Descent on multiple features\n", + "\n", + "Now, extend the code in Task 2.4 so it is able to find $w_0, w_1, \\dots, w_n$ for multiple features. Write the function `gradient_descent_multi_variable(X, y, lr, number_of_epochs)` that returns:\n", + "\n", + "* $w_0$ - a number representing the bias constant\n", + "* $w_1, w_2, \\dots, w_n$ - $(n,1)$ NumPy matrix, where each element denotes the weight constant of a certain feature\n", + "* $loss$ - a list that contains the MSE scores calculated during the gradient descent process.\n", + "\n", + "**Note**: You can use the `mean_squared_error` function for this task. (You do not need to re-define the function in Coursemology. 
However, you are free to do so if you explicitly want to use your own implementation of the function for this task.)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "def gradient_descent_multi_variable(X, y, lr = 1e-5, number_of_epochs = 250):\n", + " '''\n", + " Approximate bias and weight that gave the best fitting line.\n", + "\n", + " Parameters\n", + " ----------\n", + " X (np.ndarray) : (m, n) numpy matrix representing feature matrix\n", + " y (np.ndarray) : (m, 1) numpy matrix representing target values\n", + " lr (float) : Learning rate\n", + " number_of_epochs (int) : Number of gradient descent epochs\n", + " \n", + " Returns\n", + " -------\n", + " bias (float):\n", + " The bias constant\n", + " weights (np.ndarray):\n", + " A (n, 1) numpy matrix that specifies the weight constants.\n", + " loss (list):\n", + " A list where the i-th element denotes the MSE score at i-th epoch.\n", + " '''\n", + " # Do not change\n", + " bias = 0\n", + " weights = np.full((X.shape[1], 1), 0).astype(float)\n", + " loss = []\n", + "\n", + " for _ in range(number_of_epochs):\n", + " predicted_y = bias + X @ weights\n", + "\n", + " weight_gradient = np.mean((predicted_y - y)*X, axis=0)\n", + " weight_gradient = weight_gradient.reshape((X.shape[1],1))\n", + " weights -= lr * weight_gradient\n", + "\n", + " bias_gradient = np.mean(predicted_y - y)\n", + " bias -= lr * bias_gradient\n", + "\n", + " loss.append(mean_squared_error(y, predicted_y))\n", + " \n", + " return bias, weights, loss" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "_, _, loss = gradient_descent_multi_variable(X, y, lr = 1e-5, number_of_epochs = 250)\n", + "loss_initial = loss[0]\n", + "loss_final = loss[-1]\n", + "\n", + "assert loss_initial > loss_final" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 2.6: Which algorithm should we use for 
Linear Regression?\n", + "\n", + "Compare the pros and cons of using normal equation and gradient descent for linear regression. Specifically:\n", + "\n", + "- Compare the speed of the two algorithms on data with many features. \n", + "- Compare the quality of the solutions obtained by the two algorithms. (i.e. how close to the optimal solution are the solutions obtained by the algorithms)\n", + "- Compare whether feature scaling is necessary for each algorithm to perform well.\n", + "\n", + "Finally, select the algorithm you think is more suitable for this problemset and explain why you chose it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 3: Polynomial Regression\n", + "\n", + "In real-world data, a straight line might not fit the data perfectly. Consider the relation between **schools** and **asking_price**.\n", + "\n", + " \n", + "
\n", + "\"school\n", + "
Figure 5: Schools - Price Relationship.
\n", + "
\n", + "\n", + "Houses with 0 schools nearby tend to be cheaper than houses with 1 school nearby. However, as the number of schools increases, the prices decrease. If we try a linear regression on the data, we obtain the following:\n", + "\n", + " \n", + "
\n", + "\"school\n", + "
Figure 5.1: Schools - Price Relationship With Linear Fit.
\n", + "
\n", + "\n", + "Notice how we lose the detail that houses with 0 schools are actually cheaper than houses with 1 school nearby. A polynomial function can better capture this relationship:\n", + "\n", + " \n", + "
\n", + "\"school\n", + "
Figure 5.2: Schools - Price Relationship With Cubic Fit.
\n", + "
\n", + "\n", + "A polynomial function is written as follows:\n", + "\n", + "$$ y = w_0 + w_1 x + w_2 x^2 + ... + w_n x^n $$\n", + "\n", + "where $y$ is the target value, $x$ is a (*single*) feature value, and $n$ is the degree of the polynomial. $w_0$ is the bias term and $w_1, \\dots, w_n$ are the feature weights. \n", + "\n", + "Notice that if we set $x_1 = x, x_2 = x^2, \\dots, x_n = x^n$, the polynomial function is simply linear regression with $n$ features:\n", + "\n", + "$$ y = w_0 + w_1 x_1 + w_2 x_2 + ... + w_n x_n $$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 3.1 : Create Polynomial Matrix\n", + "\n", + "Write the function `create_polynomial_matrix(X, power)` that takes a $(m, 1)$-matrix and an\n", + "integer, and returns a polynomial matrix with shape $(m, power)$.\n", + "\n", + "$$\n", + "\\left[\\begin{array}{cc} \n", + "1\\\\\n", + "2\\\\\n", + "3\n", + "\\end{array}\\right]\n", + "\\xrightarrow{\\text{create\\_polynomial\\_matrix(3)}}\n", + "\\left[\\begin{array}{cc} \n", + "1 & 1^2 & 1^3\\\\ \n", + "2 & 2^2 & 2^3\\\\\n", + "3 & 3^2 & 3^3\n", + "\\end{array}\\right]\n", + "\\rightarrow\n", + "\\left[\\begin{array}{cc} \n", + "1 & 1 & 1\\\\ \n", + "2 & 4 & 8\\\\\n", + "3 & 9 & 27\n", + "\\end{array}\\right]\n", + "$$\n", + "\n", + "**Hint**: Consider using `np.tile`/`np.repeat` together with `np.cumprod`/`np.power`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "def create_polynomial_matrix(X, power = 2):\n", + " '''\n", + " Create a polynomial matrix.\n", + " \n", + " Parameters\n", + " ----------\n", + " X: (m, 1) numpy matrix\n", + "\n", + " Returns\n", + " -------\n", + " A (m, power) numpy matrix where the i-th column denotes\n", + " X raised to the power of i.\n", + " '''\n", + " result = np.repeat(X, power, axis=1)\n", + " result = np.power(result, np.arange(1, power+1))\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "vector = np.array([[1], [2], [3]])\n", + "poly_matrix = np.array([[1, 1, 1], [2, 4, 8], [3, 9, 27]])\n", + "\n", + "assert np.array_equal(create_polynomial_matrix(vector, 3), poly_matrix)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 3.2: Get the prediction line\n", + "\n", + "Write the function `get_prediction_poly_regression(X, y, power, include_bias)` that returns\n", + "`y_pred`, a vector of predicted values for the training data.\n", + "\n", + "**Note**: You can use the functions `create_polynomial_matrix` and `get_prediction_linear_regression` from before for this task. (You do not need to re-define the functions in Coursemology. 
However, you are free to do so if you explicitly want to use your own implementation of the functions for this task.)" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "def get_prediction_poly_regression(X, y, power = 2, include_bias = True):\n", + " '''\n", + " Calculate the best polynomial line.\n", + "\n", + " Parameters\n", + " ----------\n", + " X (np.ndarray) : (m, 1) numpy matrix representing feature matrix\n", + " y (np.ndarray) : (m, 1) numpy matrix representing target values\n", + " power (int) : Specify the degree of the polynomial\n", + " include_bias (boolean) : Specify whether the model should include a bias term\n", + "\n", + " Returns\n", + " -------\n", + " A (m, 1) numpy matrix representing prediction values.\n", + " '''\n", + " return get_prediction_linear_regression(create_polynomial_matrix(X, power), y, include_bias)" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [], + "source": [ + "test_X, test_y = np.arange(3).reshape((-1, 1)), np.arange(4, 7).reshape((-1, 1))\n", + "pred_y = get_prediction_poly_regression(test_X, test_y, 2)\n", + "\n", + "assert round(mean_squared_error(test_y, pred_y), 5) == 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To check your work, create a polynomial regression model, using `power = 3` and `include_bias = True`, with **schools** as the only feature and **asking_price** as the target value. Plot your prediction line using the code snippet below. It should look similar to Figure 6.\n", + "\n", + " \n", + "
\n", + "\"polynomial\n", + "
Figure 6: Example of polynomial regression using schools as feature.
\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "schools = X[:, 2].reshape((-1, 1))\n", + "predicted = get_prediction_poly_regression(schools, y, 3)\n", + "plt.scatter(schools, y)\n", + "plt.scatter(schools, predicted, color = 'r', s = 100)\n", + "plt.xlabel(\"Number of schools within 1km\")\n", + "plt.ylabel(\"Price in SGD\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 3.3: Feature Scaling\n", + "\n", + "As we create a higher degree polynomial matrix, each column will have a larger scale\n", + "than the previous one. This can lead to poor performance for gradient descent. Here\n", + "is where feature scaling plays an important role. Write the function `feature_scaling(X)`\n", + "that takes a NumPy matrix `X` and returns a mean-normalized matrix.\n", + "\n", + "**Note**: The normalization occurs on the column level (i-th column is normalized by the\n", + "mean and standard deviation of the i-th column). 
That is,\n", + "\n", + "$$\n", + "\\text{If} \\quad \\boldsymbol{v} = \\begin{pmatrix} v_1 \\\\ v_2 \\\\ \\vdots \\\\ v_k \\end{pmatrix} \\\\\n", + "\\boldsymbol{v}_{norm} = \\frac{\\boldsymbol{v} - \\boldsymbol{\\hat{v}}}{\\sigma_{v}}\n", + "$$\n", + "\n", + "where $\\boldsymbol{v}$ is a vector of $k$ elements, $\\boldsymbol{\\hat{v}}$ is its mean, and $\\sigma_{v}$ is its standard deviation.\n", + "\n", + "$$\n", + "\\left[\\begin{array}{cc} \n", + "1 & 133\\\\\n", + "4 & 700\\\\\n", + "5 & 133\\\\\n", + "8 & 700\n", + "\\end{array}\\right]\n", + "\\xrightarrow[]{\\text{feature\\_scaling}}\n", + "\\left[\\begin{array}{cc} \n", + "-1.4 & -1\\\\\n", + "-0.2 & 1\\\\\n", + "0.2 & -1\\\\\n", + "1.4 & 1\n", + "\\end{array}\\right]\n", + "$$ \n", + "\n", + "Focusing on the first feature:\n", + "\n", + "$$\n", + "\\boldsymbol{v} = \\left(\\begin{array}{cc}\n", + "1\\\\\n", + "4\\\\\n", + "5\\\\\n", + "8\n", + "\\end{array}\\right), \\boldsymbol{\\hat{v}} = 4.5, \\sigma_{v} = 2.5$$\n", + "\n", + "$$\\boldsymbol{v}_{norm} = \\frac{\\boldsymbol{v} - 4.5}{2.5} = \\left(\\begin{array}{cc}\n", + "-1.4\\\\\n", + "-0.2\\\\\n", + "0.2\\\\\n", + "1.4\n", + "\\end{array}\\right)\n", + "$$\n", + "\n", + "**Hint**: Consider using `np.mean()` and `np.std()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "def feature_scaling(X):\n", + " '''\n", + " Mean normalized each feature column.\n", + "\n", + " Parameters\n", + " ----------\n", + " X (np.ndarray) : (m, n) numpy matrix representing feature matrix\n", + "\n", + " Returns\n", + " -------\n", + " A (m, n) numpy matrix where each column has been mean-normalized.\n", + " '''\n", + " means = np.mean(X, axis=0)\n", + " stds = np.std(X, axis=0)\n", + " return (X - means) / stds" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [], + "source": [ + "public_X = np.array([[1, 133], [4, 700], [5, 133], [8, 700]])\n", + "expected = np.array([[-1.4, -1], [-0.2, 1], [0.2, -1], [1.4, 1]])\n", + "\n", + "assert np.array_equal(feature_scaling(public_X), expected)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 3.4: Find number of epochs to converge\n", + "\n", + "Fill in the function `find_number_of_epochs(X, y, lr, delta_loss)` that returns:\n", + "\n", + "* $w_0$ - a number representing the bias constant\n", + "* $w_1, w_2, \\dots, w_n$ - $(n, 1)$ NumPy matrix, where each element denotes the weight constant of a certain feature\n", + "* $num\\_of\\_epochs$ - a number representing the number of epochs performed to reach convergence\n", + "\n", + "We define a single epoch as performing gradient descent *once* and calculating the loss. The loss calculation and gradient descent should be performed using MSE.\n", + "\n", + "The definition of convergence is as follows:\n", + "\n", + "$$ |J_{t-1} - J_{t}| < delta\\_loss $$\n", + "\n", + "where $J_{t-1}$ is loss at timestep $t-1$ (previous timestep), $J_{t}$ is loss at timestep $t$ (current timestep), and $delta\\_loss$ is the termination criterion. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [], + "source": [ + "def find_number_of_epochs(X, y, lr, delta_loss):\n", + " '''\n", + " Do gradient descent until convergence and return number of epochs\n", + " required.\n", + "\n", + " Parameters\n", + " ----------\n", + " X (np.ndarray) : (m, n) numpy matrix representing feature matrix\n", + " y (np.ndarray) : (m, 1) numpy matrix representing target values\n", + " lr (float) : Learning rate\n", + " delta_loss (float) : Termination criterion\n", + " \n", + " Returns\n", + " -------\n", + " bias (float):\n", + " The bias constant\n", + " weights (np.ndarray):\n", + " A (n, 1) numpy matrix that specifies the weight constants.\n", + " num_of_epochs (int):\n", + " Number of epochs to reach convergence.\n", + " current_loss (float):\n", + " The loss value obtained after convergence.\n", + " '''\n", + " # Do not change\n", + " bias = 0\n", + " weights = np.full((X.shape[1], 1), 0).astype(float)\n", + " num_of_epochs = 0\n", + " previous_loss = 1e14\n", + " current_loss = -1e14\n", + "\n", + " while abs(previous_loss - current_loss) >= delta_loss:\n", + " previous_loss = current_loss\n", + "\n", + " predicted_y = bias + X @ weights\n", + "\n", + " weight_gradient = np.mean((predicted_y - y)*X, axis=0).reshape((X.shape[1],1))\n", + " weights -= lr * weight_gradient\n", + "\n", + " bias_gradient = np.mean(predicted_y - y)\n", + " bias -= lr * bias_gradient\n", + " num_of_epochs += 1\n", + "\n", + " predicted_y = bias + X @ weights\n", + " current_loss = mean_squared_error(predicted_y, y)\n", + " \n", + " return bias, weights, num_of_epochs, current_loss" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [], + "source": [ + "poly_X = create_polynomial_matrix(X[:, 2].reshape((-1, 1)), 3)\n", + "_, _, num_of_epochs, _ = find_number_of_epochs(poly_X, y, 1e-5, 1e7)\n", + "\n", + "assert num_of_epochs > 0" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### Task 3.5: Analyze the effects of feature scaling on Gradient Descent\n", + "\n", + "In this task, we will examine the influence of feature scaling on the efficiency of gradient descent algorithms. Specifically, we utilize a degree 3 polynomial feature matrix derived from a dataset of schools.\n", + "\n", + "Objective: \n", + "You will compare the convergence patterns of gradient descent with and without the application of feature scaling.\n", + "\n", + "Experiment Procedure: \n", + "Below is a pre-written code segment designed for this analysis. You are not required to modify the code. Instead, use it to focus on two key aspects:\n", + "1. Convergence analysis\n", + " - Observe the number of epochs required to achieve convergence for both the non-normalized (original) and normalized feature matrices. \n", + " This comparison should be conducted across various learning rates.\n", + "2. Loss visualization\n", + " - Analyze and plot the loss values corresponding to both the non-normalized and normalized matrices as a function of epoch count, again considering different learning rates.\n", + "\n", + "##### Instructions:\n", + "Execute the provided code and **draw 2 observations or conclusions** from the experiment results. **Include a screenshot** of the results (image and tabulation) along with your submission. 
These insights should relate to the effects of feature scaling on the efficiency of gradient descent algorithms.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Learning Rate | Epochs to convergence (Not normalized) | Epochs to convergence (Normalized) \n", + "----------------------------------------------------------------------------------------------------\n", + "1.0e-05 | 517177 | 319333 \n", + "5.0e-05 | 161792 | 80692 \n", + "1.0e-04 | 134359 | 44246 \n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "poly_X = create_polynomial_matrix(X[:, 2].reshape((-1, 1)), 3)\n", + "_, _, num_of_epochs, _ = find_number_of_epochs(poly_X, y, 1e-5, 1e7)\n", + "\n", + "assert num_of_epochs > 0 \n", + "poly_X_normalized = feature_scaling(poly_X)\n", + "delta_loss = 1e4\n", + "lrs = [1e-5, 5e-5, 1e-4]\n", + "LIMIT = 50000\n", + "lr_epochs_original = {}\n", + "lr_epochs_normalized = {}\n", + "losses_original = {} # key: lr, value: array of losses\n", + "losses_normalized = {} # key: lr, value: array of losses\n", + "poly_epochs_original = []\n", + "poly_epochs_normalized = []\n", + "print(\"{:<15} | {:<40} | {:<40}\".format(\"Learning Rate\", \"Epochs to convergence (Not normalized)\",\n", + " \"Epochs to convergence (Normalized)\"))\n", + "print(\"-\" * 100)\n", + "for lr in lrs:\n", + " _, _, num_of_epochs, _ = find_number_of_epochs(poly_X, y, lr, delta_loss)\n", + " _, _, num_of_epochs_normalized, _ = find_number_of_epochs(poly_X_normalized, y, lr, delta_loss)\n", + "\n", + " poly_epochs_original.append(num_of_epochs)\n", + " poly_epochs_normalized.append(num_of_epochs_normalized)\n", + "\n", + " lr_epochs_original[lr] = min(num_of_epochs, LIMIT)\n", + " lr_epochs_normalized[lr] = min(num_of_epochs_normalized, LIMIT)\n", + " lr_formatted = \"{:.1e}\".format(lr)\n", + " print(\"{:<15} | {:<40} | {:<40}\".format(lr_formatted, num_of_epochs, num_of_epochs_normalized))\n", + " losses_original[lr] = gradient_descent_multi_variable(poly_X, y, lr, num_of_epochs)[2]\n", + " losses_normalized[lr] = gradient_descent_multi_variable(poly_X_normalized, y, lr, num_of_epochs_normalized)[2]\n", + "\n", + "# Find the global minimum and maximum loss values so that their scales are the same\n", + "all_losses = list(losses_original.values()) + list(losses_normalized.values())\n", + "global_min_loss = min([min(losses) for losses in all_losses])\n", + "global_max_loss = max([max(losses) for losses in 
all_losses])\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "\n", + "# Plot for Original Data\n", + "plt.subplot(1, 2, 1)\n", + "plt.title('Loss vs Epochs (Original Data)')\n", + "for lr in lrs:\n", + " epochs = lr_epochs_original[lr]\n", + " plt.plot(range(epochs), losses_original[lr][:epochs], label=f'LR = {lr}')\n", + "plt.xlabel('Epoch Number')\n", + "plt.ylabel('Loss')\n", + "plt.ylim([global_min_loss, global_max_loss]) # Set common y-axis limits\n", + "plt.legend()\n", + "\n", + "# Plot for Normalized Data\n", + "plt.subplot(1, 2, 2)\n", + "plt.title('Loss vs Epochs (Normalized Data)')\n", + "for lr in lrs:\n", + " epochs = lr_epochs_normalized[lr]\n", + " plt.plot(range(epochs), losses_normalized[lr][:epochs], label=f'LR = {lr}')\n", + "plt.xlabel('Epoch Number')\n", + "plt.ylabel('Loss')\n", + "plt.ylim([global_min_loss, global_max_loss]) # Set common y-axis limits\n", + "plt.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission\n", + "\n", + "Once you are done, please submit your work to Coursemology, by copying the right\n", + "snippets of code into the corresponding box that says “Your answer,” and click “Save.”\n", + "After you save, you can still make changes to your submission.\n", + "\n", + "Once you are satisfied with what you have uploaded, click “Finalize submission.” Note\n", + "that once your submission is finalized, it is considered to be submitted for grading\n", + "and cannot be changed. If you need to undo this action, you will have to email your\n", + "assigned tutor for help. Please do not finalize your submission until you are sure that\n", + "you want to submit your solutions for grading." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.9" + }, + "vscode": { + "interpreter": { + "hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}