TheAlgorithms · AQadir44 · Oct 28, 2020
diff --git a/machine_learning/Scikit-learn/SCIKIT_LEARN.ipynb b/machine_learning/Scikit-learn/SCIKIT_LEARN.ipynb
@@ -0,0 +1,293 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np # linear algebra\n",
+    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
+    "from sklearn import feature_extraction, linear_model, model_selection, preprocessing"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook uses the dataset from the Kaggle competition \"Real or Not? NLP with Disaster Tweets\".\n",
+    "\n",
+    "Use the Kaggle API to download the dataset: `kaggle competitions download -c nlp-getting-started`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the training and test splits from the current working directory.\n",
+    "train = pd.read_csv(filepath_or_buffer=\"train.csv\")\n",
+    "test = pd.read_csv(filepath_or_buffer=\"test.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'I love fruits'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Second tweet (index 1) among the rows where target == 0.\n",
+    "train.loc[train[\"target\"] == 0, \"text\"].values[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Forest fire near La Ronge Sask. Canada'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Second tweet (index 1) among the rows where target == 1.\n",
+    "train.loc[train[\"target\"] == 1, \"text\"].values[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "\n",
+    "# Use the class imported above so the import is not left unused.\n",
+    "count_vectorizer = CountVectorizer()\n",
+    "\n",
+    "# Fit on the first 5 tweets only, as a small example of the token-count matrix.\n",
+    "example_train_vectors = count_vectorizer.fit_transform(train[\"text\"][0:5])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(1, 54)\n",
+      "[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0\n",
+      "  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# The vectorizer output is sparse (only non-zero entries are stored), so convert the\n",
+    "# first tweet's row to a dense matrix once, then print its shape and its contents.\n",
+    "first_tweet_dense = example_train_vectors[0].todense()\n",
+    "print(first_tweet_dense.shape)\n",
+    "print(first_tweet_dense)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_vectors = count_vectorizer.fit_transform(train[\"text\"])\n",
+    "\n",
+    "# Note: for the test set below we call .transform(), NOT .fit_transform(). Reusing the\n",
+    "# vocabulary fitted on the training tweets ensures that only tokens seen in training are\n",
+    "# mapped, i.e. that the train and test vectors use the same set of tokens.\n",
+    "test_vectors = count_vectorizer.transform(test[\"text\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build the (still unfitted) ridge classifier; alpha is the regularisation strength.\n",
+    "l = linear_model.RidgeClassifier(alpha=100, random_state=600)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0.61117318, 0.5923913 , 0.66873065])"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# 3-fold cross-validation of the classifier on the training vectors, scored with F1.\n",
+    "scores = model_selection.cross_val_score(\n",
+    "    estimator=l,\n",
+    "    X=train_vectors,\n",
+    "    y=train[\"target\"],\n",
+    "    cv=3,\n",
+    "    scoring=\"f1\",\n",
+    ")\n",
+    "scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RidgeClassifier(alpha=100, class_weight=None, copy_X=True, fit_intercept=True,\n",
+       "                max_iter=None, normalize=False, random_state=600, solver='auto',\n",
+       "                tol=0.001)"
+      ]
+     },
+     "execution_count": 43,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Train on the full training set; the value returned by fit() is the cell's displayed output.\n",
+    "l.fit(X=train_vectors, y=train[\"target\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>target</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>3258</td>\n",
+       "      <td>10861</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3259</td>\n",
+       "      <td>10865</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3260</td>\n",
+       "      <td>10868</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3261</td>\n",
+       "      <td>10874</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3262</td>\n",
+       "      <td>10875</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         id  target\n",
+       "3258  10861       0\n",
+       "3259  10865       1\n",
+       "3260  10868       1\n",
+       "3261  10874       1\n",
+       "3262  10875       0"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Load the submission template and overwrite its target column with the predictions\n",
+    "# for the test vectors, then preview the last rows.\n",
+    "sample_submission = pd.read_csv(\"sample_submission.csv\").assign(\n",
+    "    target=l.predict(test_vectors)\n",
+    ")\n",
+    "sample_submission.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the predictions to disk without the DataFrame index column.\n",
+    "sample_submission.to_csv(path_or_buf=\"submission.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}