From 49f8627140754f6c92866d4efb80dfbeb60dc232 Mon Sep 17 00:00:00 2001 From: Aqsa Qadir Date: Wed, 28 Oct 2020 12:55:17 +0500 Subject: [PATCH] Add SCIKIT_LEARN --- .../Scikit-learn/SCIKIT_LEARN.ipynb | 293 ++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 machine_learning/Scikit-learn/SCIKIT_LEARN.ipynb diff --git a/machine_learning/Scikit-learn/SCIKIT_LEARN.ipynb b/machine_learning/Scikit-learn/SCIKIT_LEARN.ipynb new file mode 100644 index 0000000..77f25b8 --- /dev/null +++ b/machine_learning/Scikit-learn/SCIKIT_LEARN.ipynb @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np # linear algebra\n", + "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", + "from sklearn import feature_extraction, linear_model, model_selection, preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "THIS NOTEBOOK USES THE DATASET OF Real or Not? NLP with Disaster Tweets FROM THE KAGGLE COMPETITION \n", + "USE THE KAGGLE API TO DOWNLOAD THE DATASET : kaggle competitions download -c nlp-getting-started" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "train = pd.read_csv(\"train.csv\")\n", + "test = pd.read_csv(\"test.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'I love fruits'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train[train[\"target\"] == 0][\"text\"].values[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Forest fire near La Ronge Sask. 
Canada'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train[train[\"target\"] == 1][\"text\"].values[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "count_vectorizer = feature_extraction.text.CountVectorizer()\n", + "\n", + "## let's get counts for the first 5 tweets in the data\n", + "example_train_vectors = count_vectorizer.fit_transform(train[\"text\"][0:5])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1, 54)\n", + "[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0\n", + " 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]\n" + ] + } + ], + "source": [ + "# we use .todense() here because these vectors are \"sparse\" (only non-zero elements are kept to save space)\n", + "print(example_train_vectors[0].todense().shape)\n", + "print(example_train_vectors[0].todense())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "train_vectors = count_vectorizer.fit_transform(train[\"text\"])\n", + "\n", + "## note that we're NOT using .fit_transform() here. Using just .transform() makes sure\n", + "# that the tokens in the train vectors are the only ones mapped to the test vectors - \n", + "# i.e. 
that the train and test vectors use the same set of tokens.\n", + "test_vectors = count_vectorizer.transform(test[\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cross_decomposition import PLSRegression\n", + "#Fitting training algorithm\n", + "l = linear_model.RidgeClassifier(alpha=100,random_state=600)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.61117318, 0.5923913 , 0.66873065])" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scores = model_selection.cross_val_score(l, train_vectors, train[\"target\"], cv=3, scoring=\"f1\")\n", + "scores" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RidgeClassifier(alpha=100, class_weight=None, copy_X=True, fit_intercept=True,\n", + " max_iter=None, normalize=False, random_state=600, solver='auto',\n", + " tol=0.001)" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l.fit(train_vectors, train[\"target\"])\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtarget
3258108610
3259108651
3260108681
3261108741
3262108750
\n", + "
" + ], + "text/plain": [ + " id target\n", + "3258 10861 0\n", + "3259 10865 1\n", + "3260 10868 1\n", + "3261 10874 1\n", + "3262 10875 0" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_submission = pd.read_csv(\"sample_submission.csv\")\n", + "sample_submission[\"target\"] = l.predict(test_vectors)\n", + "sample_submission.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "sample_submission.to_csv(\"submission.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}