maybe good?

2024-04-28 15:58:30 +08:00
parent 1312e694c3
commit d2e87aec97
21 changed files with 3097 additions and 700 deletions


@@ -51,11 +51,7 @@
"**[TODO]**\n",
"\n",
"##### 1. Descriptive Analysis\n",
"First step: Look at the target values. The target values are floats and NAs, which is interesting. NAs in the target data is a bit suspicious. However, despite being floats, the target values are actually ordinal. I'll convert them to ordinal values by just `Y.fillna(-1).astype(int).`. Now, I can do value counts and see that there are only 7 distinct values, including NaN. I will regard this as a classification problem with 7 classes.\n",
"\n",
"Looking at the `X`, I realise each entry in the list is an `n` by 16 by 16 matrix. 16 by 16 matrix, my first idea is to look at them like images. Plotting the images showed no relevant info. `6 <= n <= 10`.\n",
"\n",
"I just realised this is a video dataset. I'll pad all the frames to be of size 10. so that i'll have a 2500 x 10 x 16 x 16 video datset. \n",
"**[TODO]**\n",
"\n",
"##### 2. Detection and Handling of Missing Values\n",
"**[TODO]**\n",
@@ -123,21 +119,19 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 42,
"id": "cded1ed6",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:15:03.602644Z",
"start_time": "2024-04-27T16:15:03.179277Z"
"end_time": "2024-04-28T06:46:52.407375Z",
"start_time": "2024-04-28T06:46:52.405317Z"
}
},
"outputs": [],
"source": [
"import pandas\n",
"import pandas as pd\n",
"import os\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
"import numpy as np"
]
},
{
@@ -162,12 +156,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 43,
"id": "6297e25a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:15:06.411332Z",
"start_time": "2024-04-27T16:15:06.392391Z"
"end_time": "2024-04-28T06:46:52.453152Z",
"start_time": "2024-04-28T06:46:52.428539Z"
}
},
"outputs": [
@@ -186,7 +180,8 @@
" data = np.load(f, allow_pickle=True).item()\n",
" X = data['data']\n",
" y = data['label']\n",
" \n",
"\n",
"\n",
"print('Number of data sample:', len(X))\n",
"print('Shape of the first data sample:', X[0].shape)\n",
"print('Shape of the third data sample:', X[2].shape)"
@@ -200,49 +195,6 @@
"## Data Exploration & Preparation"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"from sklearn.preprocessing import OrdinalEncoder\n",
"# Some Helper Functions\n",
"def show_images(images, n_row=5, n_col=5, figsize=[12,12]):\n",
" _, axs = plt.subplots(n_row, n_col, figsize=figsize)\n",
" axs = axs.flatten()\n",
" for img, ax in zip(images, axs):\n",
" ax.imshow(img, cmap='gray')\n",
" plt.show()\n",
"def nan_columns(X, threshold=0.5):\n",
" count = X.shape[0] * threshold\n",
" nan_columns = X.isna().sum()\n",
" return nan_columns[nan_columns >= count].index\n",
"def zero_columns(X, threshold=0.5):\n",
" count = X.shape[0] * threshold\n",
" zero_cols = (X == 0).sum()\n",
" return zero_cols[zero_cols >= count].index\n",
"\n",
"def object_columns(X):\n",
" return X.dtypes[X.dtypes == 'object'].index\n",
"\n",
"def convert_to_ordinal(X, columns):\n",
" encoder = OrdinalEncoder()\n",
" return encoder.fit_transform(X[columns])\n",
"\n",
"def correlated_columns(X, threshold=0.99):\n",
" corr = X.corr()\n",
" upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))\n",
" return [column for column in upper.columns if any(upper[column] > threshold)]"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-27T16:15:12.963319Z",
"start_time": "2024-04-27T16:15:12.025487Z"
}
},
"id": "f68b8b1c21eae6d6",
"execution_count": 3
},
{
"cell_type": "markdown",
"id": "2f6a464c",
@@ -253,79 +205,52 @@
},
{
"cell_type": "code",
"execution_count": 44,
"id": "3b1f62dd",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-28T06:46:52.472306Z",
"start_time": "2024-04-28T06:46:52.454360Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2250\n"
"ename": "ValueError",
"evalue": "setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2250,) + inhomogeneous part.",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[44], line 10\u001B[0m\n\u001B[1;32m 8\u001B[0m X6 \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39marray([video[:\u001B[38;5;241m6\u001B[39m] \u001B[38;5;28;01mfor\u001B[39;00m video \u001B[38;5;129;01min\u001B[39;00m X])\n\u001B[1;32m 9\u001B[0m \u001B[38;5;66;03m# Now that they are consistent, we can convert them to a numpy array\u001B[39;00m\n\u001B[0;32m---> 10\u001B[0m X6 \u001B[38;5;241m=\u001B[39m \u001B[43mnp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43marray\u001B[49m\u001B[43m(\u001B[49m\u001B[43mX\u001B[49m\u001B[43m)\u001B[49m\n",
"\u001B[0;31mValueError\u001B[0m: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2250,) + inhomogeneous part."
]
}
],
"source": [
"# Remove nans from the input. This needs to be done in the model for training data as well\n",
"not_nan_indices = np.argwhere(~np.isnan(np.array(y))).squeeze()\n",
"print(len(not_nan_indices))\n",
"y_filtered = [y[i] for i in not_nan_indices]\n",
"x_filtered = [X[i] for i in not_nan_indices]\n",
"X = x_filtered\n",
"y = y_filtered\n",
"# show_images(X[0], 2, 5, [16, 16])\n",
"Y = pd.DataFrame(y)\n",
"# show_images(X[0], 1, 10, [10, 1])\n",
"# show_images(X[1], 1, 10, [10, 1])\n",
"# show_images(X[2], 1, 10, [10, 1])\n",
"# show_images(X[3], 1, 10, [10, 1])\n",
"# Y[:10].T\n",
"# print(type(X[0]))"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-27T16:15:14.719386Z",
"start_time": "2024-04-27T16:15:14.712849Z"
}
},
"id": "3b1f62dd",
"execution_count": 4
"y = [y[i] for i in not_nan_indices]\n",
"X = [X[i] for i in not_nan_indices]\n",
"y = np.array(y).astype(int)\n",
"\n",
"# Since each video varies in length, we will take the min length, 6, for each video\n",
"X6 = np.array([video[:6] for video in X])\n",
"# Now that they are consistent, we can convert them to a numpy array\n",
"X6 = np.array(X)\n"
]
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2250, 10, 256)\n"
]
}
],
"outputs": [],
"source": [
"# We can now try to pad the videos to be of size 10\n",
"\n",
"def process_video(video):\n",
" L = video.shape[0]\n",
" if L < 10:\n",
" return np.concatenate([video, np.zeros((10 - L, 16, 16))]).reshape(10, -1)\n",
" return video.reshape(10, -1).astype(np.float32)\n",
"\n",
"L_max = 10\n",
"X_array = np.zeros((len(X), 10, 256))\n",
"for i, video in enumerate(X):\n",
" X_array[i] = process_video(video)\n",
"np.expand_dims(X_array, axis=2).shape\n",
"print(X_array.shape)\n",
"X_array = np.reshape(X_array, (X_array.shape[0], X_array.shape[1], 256)).shape\n",
"# flattened_data = print(flattened_data)"
"pd.DataFrame(y).value_counts()\n",
"# From this, we know that we need to undersample or upsample the data. We will pick understampling as the data is quite large, and understampling will reduce the training time."
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-27T16:15:17.056545Z",
"start_time": "2024-04-27T16:15:17.014489Z"
}
"collapsed": false
},
"id": "558f2d74562bc7c8",
"execution_count": 5
"id": "dd66bb1efa4e602c",
"execution_count": null
},
{
"cell_type": "markdown",
@@ -337,16 +262,13 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "4bb9cdfb",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:12:36.843437Z",
"start_time": "2024-04-27T16:12:36.842009Z"
}
},
"metadata": {},
"outputs": [],
"source": []
"source": [
"np.isnan(X6).sum() # We know that there is quite a few NaNs in the data. However, I will not be figuring out which column / nan has this value. Instead we can just take the average of each image, adn use that as the input to the nan"
]
},
{
"cell_type": "markdown",
@@ -358,16 +280,16 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "ed1c17a1",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:12:36.845318Z",
"start_time": "2024-04-27T16:12:36.843930Z"
}
},
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Check if there are outliers\n",
"# We can check if there are outliers by checking the max and min values of each video\n",
"np.max(X6, axis=3)\n",
"# From this we can see that there are values whic exceed 255, and thus, we can clip that."
]
},
{
"cell_type": "markdown",
@@ -379,16 +301,29 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 83,
"id": "ad3ab20e",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:12:36.847266Z",
"start_time": "2024-04-27T16:12:36.845985Z"
"end_time": "2024-04-28T06:59:16.949196Z",
"start_time": "2024-04-28T06:59:16.943398Z"
}
},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/plain": "0\n0 300\n1 300\n2 300\n3 300\n4 300\n5 300\nName: count, dtype: int64"
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Handling Undersampling\n",
"pd.DataFrame(y).value_counts()\n",
"# There is a class imbalance, and we will need to undersample the data"
]
},
{
"cell_type": "markdown",
@@ -400,12 +335,12 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 44,
"id": "29ddbbcf",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:12:36.849483Z",
"start_time": "2024-04-27T16:12:36.848012Z"
"end_time": "2024-04-28T06:46:52.478781Z",
"start_time": "2024-04-28T06:46:52.477156Z"
}
},
"outputs": [],
@@ -421,12 +356,12 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 44,
"id": "93f82e42",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:12:36.852862Z",
"start_time": "2024-04-27T16:12:36.851617Z"
"end_time": "2024-04-28T06:46:52.483551Z",
"start_time": "2024-04-28T06:46:52.482068Z"
}
},
"outputs": [],
@@ -450,16 +385,76 @@
},
{
"cell_type": "code",
"execution_count": 5,
"id": "19174365",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1800, 6, 16, 16])\n",
"(1800,)\n",
"<class 'numpy.ndarray'>\n"
]
}
],
"source": [
"import torch\n",
"\n",
"# Reduce the data to 6 frames\n",
"X = np.array([video[:6] for video in X])\n",
"tensor_videos = torch.tensor(X, dtype=torch.float32)\n",
"# Clip values to 0 and 255\n",
"tensor_videos = np.clip(tensor_videos, 0, 255)\n",
"# Replace NaNs in each frame, with the average of the frame. This was generated with GPT\n",
"for i in range(tensor_videos.shape[0]):\n",
" for j in range(tensor_videos.shape[1]):\n",
" tensor_videos[i][j][torch.isnan(tensor_videos[i][j])] = torch.mean(tensor_videos[i][j][~torch.isnan(tensor_videos[i][j])])\n",
" \n",
"# Undersample the data for each of the 6 classes. Select max of 300 samples for each class\n",
"# Very much generated with the assitance of chatGPT with some modifications\n",
"# Get the indices of each class\n",
"indices = [np.argwhere(y == i).squeeze(1) for i in range(6)]\n",
"# Get the number of samples to take for each class\n",
"num_samples_to_take = 300\n",
"# Get the indices of the samples to take\n",
"indices_to_take = [np.random.choice(indices[i], num_samples_to_take, replace=True) for i in range(6)]\n",
"# Concatenate the indices\n",
"indices_to_take = np.concatenate(indices_to_take)\n",
"# Select the samples\n",
"tensor_videos = tensor_videos[indices_to_take]\n",
"y = y[indices_to_take]\n"
],
"metadata": {
"collapsed": false
},
"id": "19174365",
"execution_count": 82
},
{
"cell_type": "code",
"outputs": [
{
"data": {
"text/plain": "torch.Size([1800, 1, 6, 16, 16])"
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# This is the extra channel dimention to work with the conv3d\n",
"tensor_videos = tensor_videos.unsqueeze(1)\n",
"tensor_videos.shape"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-27T16:12:36.854714Z",
"start_time": "2024-04-27T16:12:36.853430Z"
"end_time": "2024-04-28T07:01:44.496557Z",
"start_time": "2024-04-28T07:01:44.492973Z"
}
},
"outputs": [],
"source": []
"id": "8b6bcf332c355e9d",
"execution_count": 85
},
{
"cell_type": "markdown",
@@ -471,14 +466,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "a85808bf",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:12:36.856476Z",
"start_time": "2024-04-27T16:12:36.855157Z"
}
},
"metadata": {},
"outputs": [],
"source": []
},
@@ -492,14 +482,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "dbcde626",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:12:36.858522Z",
"start_time": "2024-04-27T16:12:36.857080Z"
}
},
"metadata": {},
"outputs": [],
"source": []
},
@@ -521,18 +506,95 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 230,
"id": "d8dffd7d",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:15:21.497276Z",
"start_time": "2024-04-27T16:15:20.501754Z"
"end_time": "2024-04-28T07:57:09.790124Z",
"start_time": "2024-04-28T07:57:09.780591Z"
}
},
"outputs": [],
"source": [
"import torch\n",
"from torch import nn"
"from torch import nn\n",
"class CNN3D(nn.Module):\n",
" def __init__(self):\n",
" super(CNN3D, self).__init__()\n",
" self.conv1 = nn.Conv3d(1, 12, 2, 1,2)\n",
" self.mp = nn.AvgPool3d(2)\n",
" self.relu = nn.LeakyReLU()\n",
" self.fc1 = nn.Linear(3888, 6)\n",
" self.fc2 = nn.Linear(128, 6)\n",
" self.flatten = nn.Flatten()\n",
" def forward(self, x):\n",
" x = self.conv1(x)\n",
" x = self.mp(x)\n",
" x = self.relu(x)\n",
" \n",
" # print(x.shape)\n",
" \n",
" x = x.view(-1, 3888)\n",
" x = self.fc1(x)\n",
" # x = self.fc2(x)\n",
" return x\n",
" \n",
"def train(model, criterion, optimizer, loader, epochs = 10):\n",
" for epoch in range(epochs):\n",
" for idx, (inputs, labels) in enumerate(loader):\n",
" optimizer.zero_grad()\n",
" outputs = model(inputs)\n",
" loss = criterion(outputs, labels)\n",
" loss.backward()\n",
" optimizer.step()\n",
" print(f'Epoch {epoch}, Loss: {loss.item()}')\n",
" return model\n",
"def process_data(X, y):\n",
" y = np.array(y)\n",
" X = np.array([video[:6] for video in X])\n",
" tensor_videos = torch.tensor(X, dtype=torch.float32)\n",
" # Clip values to 0 and 255\n",
" tensor_videos = np.clip(tensor_videos, 0, 255)\n",
" # Replace NaNs in each frame, with the average of the frame. This was generated with GPT\n",
" for i in range(tensor_videos.shape[0]):\n",
" for j in range(tensor_videos.shape[1]):\n",
" tensor_videos[i][j][torch.isnan(tensor_videos[i][j])] = torch.mean(tensor_videos[i][j][~torch.isnan(tensor_videos[i][j])])\n",
" # Undersample the data for each of the 6 classes. Select max of 300 samples for each class\n",
" # Very much generated with the assitance of chatGPT with some modifications\n",
" # Get the indices of each class\n",
" indices = [np.argwhere(y == i).squeeze(1) for i in range(6)]\n",
" # Get the number of samples to take for each class\n",
" num_samples_to_take = 300\n",
" # Get the indices of the samples to take\n",
" indices_to_take = [np.random.choice(indices[i], num_samples_to_take, replace=True) for i in range(6)]\n",
" # Concatenate the indices\n",
" indices_to_take = np.concatenate(indices_to_take)\n",
" # Select the samples\n",
" tensor_videos = tensor_videos[indices_to_take].unsqueeze(1)\n",
" y = y[indices_to_take]\n",
" return torch.Tensor(tensor_videos), torch.Tensor(y).long()\n",
"class Model():\n",
" def __init__(self):\n",
" self.model = CNN3D()\n",
" self.criterion = nn.CrossEntropyLoss()\n",
" self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)\n",
" def fit(self, X, y):\n",
" X, y = process_data(X, y)\n",
" train_dataset = torch.utils.data.TensorDataset(X, y)\n",
" train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)\n",
" train(self.model, self.criterion, self.optimizer, train_loader)\n",
" def predict(self, X):\n",
" self.model.eval()\n",
"\n",
" X = np.array([video[:6] for video in X])\n",
" tensor_videos = torch.tensor(X, dtype=torch.float32)\n",
" # Clip values to 0 and 255\n",
" tensor_videos = np.clip(tensor_videos, 0, 255)\n",
" # Replace NaNs in each frame, with the average of the frame. This was generated with GPT\n",
" for i in range(tensor_videos.shape[0]):\n",
" for j in range(tensor_videos.shape[1]):\n",
" tensor_videos[i][j][torch.isnan(tensor_videos[i][j])] = torch.mean(tensor_videos[i][j][~torch.isnan(tensor_videos[i][j])])\n",
" X = torch.Tensor(tensor_videos.unsqueeze(1))\n",
" return np.argmax(self.model(X).detach().numpy(), axis=1)\n"
]
},
{
@@ -545,106 +607,32 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 217,
"id": "9245ab47",
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-27T16:19:38.194596Z",
"start_time": "2024-04-27T16:19:37.776094Z"
"end_time": "2024-04-28T07:55:53.563103Z",
"start_time": "2024-04-28T07:55:53.544134Z"
}
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"# Split train and test\n",
"\n",
"with open('data.npy', 'rb') as f:\n",
" data = np.load(f, allow_pickle=True).item()\n",
" X = data['data']\n",
" y = data['label']\n",
"\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)\n",
"X_train = [process_video(video) for video in X_train]\n",
"X_test = [process_video(video) for video in X_test]\n",
"\n",
"y_train = np.array(y_train).astype(np.int64)\n",
"\n",
"X_tensor = torch.tensor(X_train, dtype=torch.float32)\n",
"y_tensor = torch.tensor(y_train, dtype=torch.long)\n",
"\n",
"train_dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)\n",
"train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)"
"not_nan_indices = np.argwhere(~np.isnan(np.array(y_test))).squeeze()\n",
"y_test = [y_test[i] for i in not_nan_indices]\n",
"X_test = [X_test[i] for i in not_nan_indices]\n",
"\n"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"class Model(nn.Module):\n",
" def __init__(self):\n",
" super(Model, self).__init__()\n",
" self.input_size = 256\n",
" self.hidden_layers = 64\n",
" self.num_layers = 1\n",
" self.num_classes = 6\n",
" \n",
" self.lstm = nn.LSTM(self.input_size, self.hidden_layers, self.num_layers, batch_first=True)\n",
" self.fc = nn.Linear(self.hidden_layers, self.num_classes)\n",
" def forward(self, x):\n",
" h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_layers).to(x.device)\n",
" c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_layers).to(x.device)\n",
"\n",
" # Forward propagate LSTM\n",
" out, _ = self.lstm(x, (h0, c0))\n",
" \n",
" out = self.fc(out[:, -1, :])\n",
" return out "
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-27T16:20:46.738811Z",
"start_time": "2024-04-27T16:20:46.734583Z"
}
},
"id": "7396b295037aa70f",
"execution_count": 25
},
{
"cell_type": "code",
"outputs": [],
"source": [],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-27T16:12:37.998501Z",
"start_time": "2024-04-27T16:12:37.997472Z"
}
},
"id": "9057629fbaaa8571",
"execution_count": 8
},
{
"cell_type": "code",
"outputs": [],
"source": [
"def train_model(model, loss_fn, optimizer, train_loader, num_epochs=10):\n",
" model.train()\n",
" for epoch in range(num_epochs):\n",
" running_loss = 0.0\n",
" for inputs, labels in train_loader:\n",
" optimizer.zero_grad()\n",
" outputs = model(inputs)\n",
" loss = loss_fn(outputs, labels)\n",
" loss.backward()\n",
" optimizer.step()\n",
" running_loss += loss.item()\n",
" print(f\"Epoch {epoch + 1}, Loss: {running_loss / len(train_loader)}\")\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-27T16:19:57.048696Z",
"start_time": "2024-04-27T16:19:57.045181Z"
}
},
"id": "c3901cf56e12eade",
"execution_count": 21
},
{
"cell_type": "code",
"outputs": [
@@ -652,34 +640,62 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1, Loss: nan\n",
"Epoch 2, Loss: nan\n",
"Epoch 3, Loss: nan\n",
"Epoch 4, Loss: nan\n",
"Epoch 5, Loss: nan\n",
"Epoch 6, Loss: nan\n",
"Epoch 7, Loss: nan\n",
"Epoch 8, Loss: nan\n",
"Epoch 9, Loss: nan\n",
"Epoch 10, Loss: nan\n"
"Epoch 0, Loss: 4.225716590881348\n",
"Epoch 1, Loss: 0.9198675155639648\n",
"Epoch 2, Loss: 1.7365752458572388\n",
"Epoch 3, Loss: 0.4570190906524658\n",
"Epoch 4, Loss: 0.11014104634523392\n",
"Epoch 5, Loss: 0.24420055747032166\n",
"Epoch 6, Loss: 0.03079795092344284\n",
"Epoch 7, Loss: 0.07790327817201614\n",
"Epoch 8, Loss: 0.07603466510772705\n",
"Epoch 9, Loss: 0.04154537618160248\n",
"F1 Score (macro): 0.51\n"
]
}
],
"source": [
"model = Model()\n",
"lossFn = nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n",
"train_model(model, lossFn, optimizer, train_loader, num_epochs=10)"
"model.fit(X_train, y_train)\n",
"\n",
"from sklearn.metrics import f1_score\n",
"\n",
"y_pred = model.predict(X_test)\n",
"print(\"F1 Score (macro): {0:.2f}\".format(f1_score(y_test, y_pred, average='macro'))) # You may encounter errors, you are expected to figure out what's the issue.\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-27T16:20:49.798810Z",
"start_time": "2024-04-27T16:20:48.477326Z"
"end_time": "2024-04-28T07:57:38.644155Z",
"start_time": "2024-04-28T07:57:35.958882Z"
}
},
"id": "dbb00fef60449a02",
"execution_count": 26
"id": "abb2d957f4a15bd2",
"execution_count": 235
},
{
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"F1 Score (macro): 0.60\n"
]
}
],
"source": [
"\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-04-28T07:57:16.355215Z",
"start_time": "2024-04-28T07:57:16.281540Z"
}
},
"id": "37ff28a8da9dba6c",
"execution_count": 232
},
{
"cell_type": "markdown",