Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SCIKIT_LEARN #39

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
293 changes: 293 additions & 0 deletions machine_learning/Scikit-learn/SCIKIT_LEARN.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np # linear algebra\n",
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
"from sklearn import feature_extraction, linear_model, model_selection, preprocessing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"HAVE DATASET OF Real or Not? NLP with Disaster Tweets FROM KAGGLE COMPITITION \n",
"USE THE KAGGLE API TO DOWNLOAD DATSET : kaggle competitions download -c nlp-getting-started"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv(\"train.csv\")\n",
"test = pd.read_csv(\"test.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'I love fruits'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[train[\"target\"] == 0][\"text\"].values[1]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Forest fire near La Ronge Sask. Canada'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[train[\"target\"] == 1][\"text\"].values[1]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"count_vectorizer = feature_extraction.text.CountVectorizer()\n",
"\n",
"## let's get counts for the first 5 tweets in the data\n",
"example_train_vectors = count_vectorizer.fit_transform(train[\"text\"][0:5])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1, 54)\n",
"[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0\n",
" 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]\n"
]
}
],
"source": [
"# we use .todense() here because these vectors are \"sparse\" (only non-zero elements are kept to save space)\n",
"print(example_train_vectors[0].todense().shape)\n",
"print(example_train_vectors[0].todense())\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"train_vectors = count_vectorizer.fit_transform(train[\"text\"])\n",
"\n",
"## note that we're NOT using .fit_transform() here. Using just .transform() makes sure\n",
"# that the tokens in the train vectors are the only ones mapped to the test vectors - \n",
"# i.e. that the train and test vectors use the same set of tokens.\n",
"test_vectors = count_vectorizer.transform(test[\"text\"])"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.cross_decomposition import PLSRegression\n",
"#Fitting training algorithm\n",
"l = linear_model.RidgeClassifier(alpha=100,random_state=600)\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.61117318, 0.5923913 , 0.66873065])"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scores = model_selection.cross_val_score(l, train_vectors, train[\"target\"], cv=3, scoring=\"f1\")\n",
"scores"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RidgeClassifier(alpha=100, class_weight=None, copy_X=True, fit_intercept=True,\n",
" max_iter=None, normalize=False, random_state=600, solver='auto',\n",
" tol=0.001)"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"l.fit(train_vectors, train[\"target\"])\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>3258</td>\n",
" <td>10861</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3259</td>\n",
" <td>10865</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3260</td>\n",
" <td>10868</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3261</td>\n",
" <td>10874</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3262</td>\n",
" <td>10875</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id target\n",
"3258 10861 0\n",
"3259 10865 1\n",
"3260 10868 1\n",
"3261 10874 1\n",
"3262 10875 0"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_submission = pd.read_csv(\"sample_submission.csv\")\n",
"sample_submission[\"target\"] = l.predict(test_vectors)\n",
"sample_submission.tail()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"sample_submission.to_csv(\"submission.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}