diff --git a/Yasir Abdur Rohman/Project 1/.ipynb_checkpoints/Topic Modelling on Encrypted Twitter Dataset -checkpoint.ipynb b/Yasir Abdur Rohman/Project 1/.ipynb_checkpoints/Topic Modelling on Encrypted Twitter Dataset -checkpoint.ipynb new file mode 100644 index 00000000..7cb2f5af --- /dev/null +++ b/Yasir Abdur Rohman/Project 1/.ipynb_checkpoints/Topic Modelling on Encrypted Twitter Dataset -checkpoint.ipynb @@ -0,0 +1,2773 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Topic Modelling on Encrpyted Twitter Dataset\n", + "*Author: Yasir Abdur Rohman*\n", + "
pandas dataframe .\n",
+ "\n",
+ "The fields from the tweet document that we will obtain are as follows:\n",
+ "- **username:** users\n",
+ "- **created_at:** the date when the tweet created\n",
+ "- **text:** text of the tweet"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# initialize dataframe\n",
+ "df_tweet = pd.DataFrame({'username': [], 'created_at': [], 'text': []})\n",
+ "\n",
+ "# crawling tweet documents\n",
+ "c = tweepy.Cursor(api.search, q=query, tweet_mode='extended').items()\n",
+ "while True:\n",
+ " try:\n",
+ " tweet = c.next()\n",
+ " \n",
+ " # get clean tweet without RT\n",
+ " if (not tweet.retweeted) and ('RT @' not in tweet.full_text):\n",
+ " username = tweet.user.screen_name\n",
+ " created = tweet.created_at\n",
+ " text = tweet.full_text.encode('utf-8', 'ignore').decode('utf-8', 'ignore')\n",
+ "\n",
+ " # get tweet documents before maximum_days\n",
+ " if tweet.created_at < date_max:\n",
+ " break\n",
+ " df_tweet = df_tweet.append({'username': username, 'created_at': created, \n",
+ " 'text': text}, ignore_index=True)\n",
+ " \n",
+ " except tweepy.TweepError as e:\n",
+ " print(e.reason)\n",
+ " time.sleep(15 * 60)\n",
+ " continue\n",
+ " except StopIteration:\n",
+ " break"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Step 2: Exploratory Data Analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "| \n", + " | username | \n", + "created_at | \n", + "text | \n", + "
|---|---|---|---|
| 0 | \n", + "JustNowBulletin | \n", + "2019-08-20 14:20:11 | \n", + "Udacity AWS DeepRacer Scholarship Challenge fo... | \n", + "
| 1 | \n", + "sustaintrain | \n", + "2019-08-20 14:15:10 | \n", + "A new Udacity Green IT training course has bee... | \n", + "
| 2 | \n", + "Upwork | \n", + "2019-08-20 13:20:49 | \n", + "Back to School Isn’t Just For Kids: 20 Courses... | \n", + "
| 3 | \n", + "frengki_17 | \n", + "2019-08-20 13:17:23 | \n", + "@onnowpurbo Belajar tensorflow yang di udacity... | \n", + "
| 4 | \n", + "javaquantum | \n", + "2019-08-20 13:14:13 | \n", + "@xsahil03x @GoogleIndia @udacity Congratulatio... | \n", + "
| 5 | \n", + "RahulSi88662093 | \n", + "2019-08-20 13:08:38 | \n", + "@anandjha_98 @kala83 @Google @udacity Please r... | \n", + "
| 6 | \n", + "MaramAlroshoud | \n", + "2019-08-20 12:49:10 | \n", + "وشو مسك يوداستي؟ — https://t.co/UC0Y4m9Foo htt... | \n", + "
| 7 | \n", + "SafeScholarship | \n", + "2019-08-20 12:29:52 | \n", + "Udacity AWS DeepRacer Scholarship Challenge fo... | \n", + "
| 8 | \n", + "0xAlsharif | \n", + "2019-08-20 12:29:09 | \n", + "Networking for Web Developers. https://t.co/Hf... | \n", + "
| 9 | \n", + "0xAlsharif | \n", + "2019-08-20 12:19:33 | \n", + "تحية وتقدير للمواقع والمصادر التعليمية اللي تج... | \n", + "
EncryptedDataset to encrypt Twitter dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class EncryptedDataset():\n",
+ " \n",
+ " def __init__(self, *owners, max_key_len=8, max_val_len=8):\n",
+ " self.max_key_len = max_key_len\n",
+ " self.max_val_len = max_val_len\n",
+ " \n",
+ " self.keys = list()\n",
+ " self.values = list()\n",
+ " self.owners = owners\n",
+ " \n",
+ " def add_entry(self, key, value):\n",
+ " key = string2one_hot_matrix(key)\n",
+ " key = key.share(*self.owners)\n",
+ " self.keys.append(key)\n",
+ " \n",
+ " value = string2values(value, max_len=self.max_val_len)\n",
+ " value = value.share(*self.owners)\n",
+ " self.values.append(value)\n",
+ " \n",
+ " def query(self, query_str):\n",
+ " query_matrix = string2one_hot_matrix(query_str)\n",
+ " \n",
+ " query_matrix = query_matrix.share(*self.owners)\n",
+ "\n",
+ " key_matches = list()\n",
+ " for key in self.keys:\n",
+ "\n",
+ " key_match = strings_equal(key, query_matrix)\n",
+ " key_matches.append(key_match)\n",
+ "\n",
+ " result = self.values[0] * key_matches[0]\n",
+ "\n",
+ " for i in range(len(self.values) - 1):\n",
+ " result += self.values[i+1] * key_matches[i+1]\n",
+ " \n",
+ " result = result.get()\n",
+ "\n",
+ " return values2string(result).replace(\".\",\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Make workers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "W0820 22:11:36.020632 2788 hook.py:98] Torch was already hooked... skipping hooking process\n"
+ ]
+ }
+ ],
+ "source": [
+ "import syft as sy\n",
+ "import torch as th\n",
+ "hook = sy.TorchHook(th)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "W0820 22:34:05.893684 2788 base.py:628] Worker me already exists. Replacing old worker which could cause unexpected behavior\n",
+ "W0820 22:34:05.894680 2788 base.py:628] Worker me already exists. Replacing old worker which could cause unexpected behavior\n",
+ "W0820 22:34:05.895643 2788 base.py:628] Worker me already exists. Replacing old worker which could cause unexpected behavior\n"
+ ]
+ }
+ ],
+ "source": [
+ "bob = sy.VirtualWorker(hook, id=\"bob\").add_worker(sy.local_worker)\n",
+ "alice = sy.VirtualWorker(hook, id=\"alice\").add_worker(sy.local_worker)\n",
+ "secure_worker = sy.VirtualWorker(hook, id=\"secure_worker\").add_worker(sy.local_worker)\n",
+ "\n",
+ "# clearing objects\n",
+ "bob = bob.clear_objects()\n",
+ "alice = alice.clear_objects()\n",
+ "secure_worker = secure_worker.clear_objects()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Cleaning Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to C:\\Users\\Yasir Abdur\n",
+ "[nltk_data] Rohman\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ }
+ ],
+ "source": [
+ "import nltk\n",
+ "nltk.download('stopwords')\n",
+ "from nltk.corpus import stopwords\n",
+ "from string import punctuation\n",
+ "import string\n",
+ "from unicodedata import normalize\n",
+ "import re\n",
+ "\n",
+ "def remove_url(tweet):\n",
+ " \"\"\"Remove URL link.\n",
+ " Args:\n",
+ " tweet(str): tweet\n",
+ " Returns:\n",
+ " modified tweet\n",
+ " \"\"\"\n",
+ " tweet = re.sub(r\"\\s—\\s\", \"\", tweet)\n",
+ " tweet = re.sub(r\"http\\S+\", \"\", tweet)\n",
+ " return tweet\n",
+ "\n",
+ "def remove_hashtag_mention(tweet):\n",
+ " \"\"\"Remove hastag and mention.\n",
+ " Args:\n",
+ " tweet(str): tweet\n",
+ " Returns:\n",
+ " modified tweet\n",
+ " \"\"\"\n",
+ " result = []\n",
+ " tweet = tweet.split(' ')\n",
+ " for t in tweet:\n",
+ " if t.startswith('#') or t.startswith('@'):\n",
+ " continue\n",
+ " else:\n",
+ " result.append(t)\n",
+ " return ' '.join(result)\n",
+ "\n",
+ "def remove_rt_fav(tweet):\n",
+ " \"\"\"Remove RT and FAV.\n",
+ " Args:\n",
+ " tweet(str): tweet\n",
+ " Returns:\n",
+ " modified tweet\n",
+ " \"\"\"\n",
+ " tweet = re.sub(r'^(RT|FAV)','', tweet)\n",
+ " return tweet\n",
+ "\n",
+ "def remove_punctuation(tweet):\n",
+ " \"\"\"Remove punctuation except '-'.\n",
+ " Args:\n",
+ " tweet(str): tweet\n",
+ " Returns:\n",
+ " modified tweet\n",
+ " \"\"\"\n",
+ " remove = string.punctuation\n",
+ " translator = str.maketrans(remove, ' '*len(remove))\n",
+ " return tweet.translate(translator)\n",
+ " \n",
+ "def remove_ascii_unicode(tweet):\n",
+ " \"\"\"Remove ASCII and Unicode.\n",
+ " Args:\n",
+ " tweet(str): tweet\n",
+ " Returns:\n",
+ " modified tweet\n",
+ " \"\"\"\n",
+ " tweet = normalize('NFKD', tweet)\n",
+ " tweet = tweet.encode('ascii', 'ignore').decode('utf-8')\n",
+ " return tweet\n",
+ "\n",
+ "def remove_newline(tweet):\n",
+ " \"\"\"Remove newline.\n",
+ " Args:\n",
+ " tweet(str): tweet\n",
+ " Returns:\n",
+ " modified tweet\n",
+ " \"\"\"\n",
+ " tweet = tweet.replace('\\n', ' ')\n",
+ " return tweet"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# cleaning tweet documents\n",
+ "\n",
+ "for index, row in df_tweet.iterrows():\n",
+ " clean_tweet = remove_ascii_unicode(row['text'])\n",
+ " clean_tweet = clean_tweet.lower()\n",
+ " clean_tweet = remove_rt_fav(clean_tweet)\n",
+ " clean_tweet = remove_newline(clean_tweet)\n",
+ " clean_tweet = remove_url(clean_tweet)\n",
+ " clean_tweet = remove_hashtag_mention(clean_tweet)\n",
+ " clean_tweet = remove_punctuation(clean_tweet)\n",
+ " \n",
+ " # update dataframe\n",
+ " df_tweet.at[index,'text'] = clean_tweet"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 udacity aws deepracer scholarship challenge fo...\n",
+ "1 a new udacity green it training course has bee...\n",
+ "2 back to school isnt just for kids 20 courses ...\n",
+ "3 belajar tensorflow yang di udacity bagus gak pak \n",
+ "4 congratulations \n",
+ "5 please reach out at support udacity com\n",
+ "6 \n",
+ "7 udacity aws deepracer scholarship challenge fo...\n",
+ "8 networking for web developers \n",
+ "9 \n",
+ "Name: text, dtype: object"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# take a look at the results\n",
+ "df_tweet['text'].head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Encrpyt the dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# initialize class EncrpytedDataset\n",
+ "db = EncryptedDataset(bob, alice, secure_worker, max_val_len=280)\n",
+ "\n",
+ "# add Twitter dataset into db\n",
+ "for index, row in df_tweet.iterrows():\n",
+ " # we can make the key combining index and username\n",
+ " key = str(index) + '@' + row['username']\n",
+ " values = row['text']\n",
+ " \n",
+ " db.add_entry(key, values)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'udacity aws deepracer scholarship challenge for international students '"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# let's testing out our database\n",
+ "db.query(\"0@JustNowBulletin\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "| \n", + " | username | \n", + "created_at | \n", + "text | \n", + "
|---|---|---|---|
| 0 | \n", + "JustNowBulletin | \n", + "2019-08-20 14:20:11 | \n", + "udacity aws deepracer scholarship challenge fo... | \n", + "
| \n", + " | 1 | \n", + "2 | \n", + "3 | \n", + "able | \n", + "added | \n", + "ai | \n", + "also | \n", + "amp | \n", + "android | \n", + "become | \n", + "... | \n", + "time | \n", + "training | \n", + "two | \n", + "u | \n", + "udemy | \n", + "ur | \n", + "via | \n", + "voyage | \n", + "web | \n", + "work | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.003533 | \n", + "0.005734 | \n", + "0.003379 | \n", + "0.000000 | \n", + "0.011932 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.000000 | \n", + "0.00663 | \n", + "0.00338 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.004506 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| 1 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.003571 | \n", + "0.008878 | \n", + "0.003672 | \n", + "... | \n", + "0.000000 | \n", + "0.00000 | \n", + "0.00000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.003455 | \n", + "
| 2 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.003593 | \n", + "0.005869 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.003756 | \n", + "0.00000 | \n", + "0.00000 | \n", + "0.003702 | \n", + "0.005444 | \n", + "0.003332 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| 3 | \n", + "0.007639 | \n", + "0.009043 | \n", + "0.008026 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.004154 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.000000 | \n", + "0.00000 | \n", + "0.00000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.003851 | \n", + "0.000000 | \n", + "0.003543 | \n", + "0.000000 | \n", + "
4 rows × 71 columns
\n", + "In this notebook, I will implement Topic Modelling with Latent Dirichlet on Encrypted Twitter Dataset.
+Twitter is one of the social media platforms that follows the GDPR rules to protect the privacy of each user. However, we can still access tweets from many users by using the Twitter API. Even though there is an agreement to use the Twitter API wisely, collecting tweets can still harm the privacy of users who don't want their tweets to be taken. Therefore, here we will implement an encrypted Twitter dataset to protect users' privacy.
+And after we encrypt the Twitter dataset, we can still implement AI securely. Here, we will apply Topic Modeling to uncover abstract themes based on the underlying tweet documents and words in a corpus of text.
+The project consists of the following steps:
+# import library
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import string
+import time
+
+# import dotenv
+import os
+from dotenv import load_dotenv
+
+# import tweepy
+import tweepy
+
+# datetime
+from datetime import datetime, timedelta
+
+# plotly
+from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
+import plotly as py
+import plotly.graph_objs as go
+
+init_notebook_mode(connected=True) #do not miss this line
+
+# gensim
+from gensim import corpora, models, similarities
+
+import warnings
+warnings.filterwarnings("ignore")
+We need the Twitter API to crawl the Twitter dataset. So, let's set up the Twitter crawler first.
+ +# load env file
+load_dotenv(".env")
+
+# setting API Twitter
+TWITTER_API_KEY = os.getenv('TWITTER_API_KEY')
+TWITTER_API_SECRET_KEY = os.getenv('TWITTER_API_SECRET_KEY')
+TWITTER_ACCESS_TOKEN = os.getenv('TWITTER_ACCESS_TOKEN')
+TWITTER_ACCESS_TOKEN_SECRET = os.getenv('TWITTER_ACCESS_TOKEN_SECRET')
+
+auth = tweepy.OAuthHandler(TWITTER_API_KEY, TWITTER_API_SECRET_KEY)
+auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET)
+api = tweepy.API(auth, wait_on_rate_limit=True)
+In this project, we can set up some parameters as follows:
+# initialize crawler parameters
+
+# date
+maximum_days = 7 # 7 days, so we will get all of the tweet documents from 7 days before
+today = datetime.today()
+date_max = today - timedelta(days=maximum_days)
+
+# query
+query = "#Udacity OR #udacity OR Udacity OR udacity"
+Next, we will save the results of crawling the tweet documents into a pandas dataframe.
The fields from the tweet document that we will obtain are as follows:
+# initialize dataframe
+df_tweet = pd.DataFrame({'username': [], 'created_at': [], 'text': []})
+
+# crawling tweet documents
+c = tweepy.Cursor(api.search, q=query, tweet_mode='extended').items()
+while True:
+ try:
+ tweet = c.next()
+
+ # get clean tweet without RT
+ if (not tweet.retweeted) and ('RT @' not in tweet.full_text):
+ username = tweet.user.screen_name
+ created = tweet.created_at
+ text = tweet.full_text.encode('utf-8', 'ignore').decode('utf-8', 'ignore')
+
+ # get tweet documents before maximum_days
+ if tweet.created_at < date_max:
+ break
+ df_tweet = df_tweet.append({'username': username, 'created_at': created,
+ 'text': text}, ignore_index=True)
+
+ except tweepy.TweepError as e:
+ print(e.reason)
+ time.sleep(15 * 60)
+ continue
+ except StopIteration:
+ break
+# let's see our twitter dataset
+df_tweet.head(10)
+# let's counting how many tweet documents that we got
+print("The number of tweet documents:", len(df_tweet['text']))
+tweetsT = df_tweet['created_at']
+
+trace = go.Histogram(
+ x = tweetsT,
+ marker = dict(
+ color = 'blue'
+ ),
+ opacity = 0.75
+)
+
+layout = go.Layout(
+ title = 'Tweet Activity',
+ height = 450,
+ width = 1000,
+ xaxis = dict(
+ title = 'Date'
+ ),
+ yaxis = dict(
+ title = 'Tweet Quantity'
+ ),
+ bargap = 0.2,
+)
+
+data = [trace]
+fig = go.Figure(data=data, layout=layout)
+py.offline.iplot(fig)
+First, we create dictionaries that map each character to an index, and vice versa.
+ +char2index = {}
+index2char = {}
+
+for i, char in enumerate(' ' + string.ascii_lowercase + '0123456789' + string.punctuation):
+ char2index[char] = i
+ index2char[i] = char
+Let's build some functions to implement the encrypted dataset.
def string2values(str_input, max_len=8):
    """Encode a string as a tensor of character indices.

    The input is cut to ``max_len`` characters, lower-cased, and padded on
    the right with "." when it is shorter than ``max_len``. Each character
    is then looked up in the module-level ``char2index`` table.

    Args:
        str_input(str): string input
        max_len: number of characters to keep / pad to
    Returns:
        long tensor holding one index per character
    Raises:
        KeyError: if a character is not present in ``char2index``
    """
    # ljust only pads when the text is shorter than max_len, like the
    # explicit length check it replaces.
    padded = str_input[:max_len].lower().ljust(max_len, ".")
    return th.tensor([char2index[symbol] for symbol in padded]).long()
+
def values2string(input_values):
    """Decode a sequence of character indices back into a string.

    Args:
        input_values: iterable of index values (each is passed through
            ``int`` before the ``index2char`` lookup)
    Returns:
        the decoded string, one character per input value
    Raises:
        KeyError: if an index is not present in ``index2char``
    """
    return "".join(index2char[int(code)] for code in input_values)
+
def one_hot(index, length):
    """Build a one-hot vector.

    Args:
        index: position that is set to 1
        length: total number of entries in the vector
    Returns:
        long tensor of ``length`` zeros with a single 1 at ``index``
    """
    encoded = th.zeros(length).long()
    encoded[index] = 1
    return encoded
+
def string2one_hot_matrix(str_input, max_len=8):
    """Encode a string as a matrix with one one-hot row per character.

    The input is cut to ``max_len`` characters, lower-cased, and padded on
    the right with "." when shorter than ``max_len``. Every character then
    becomes a one-hot row whose width is the size of ``char2index``.

    Args:
        str_input(str): string input
        max_len: number of characters to keep / pad to
    Returns:
        long tensor with one row per character
    Raises:
        KeyError: if a character is not present in ``char2index``
    """
    padded = str_input[:max_len].lower().ljust(max_len, ".")
    vocab_size = len(char2index)

    # Each row is unsqueezed to shape (1, vocab_size) so the rows can be
    # concatenated along dim 0.
    rows = [
        one_hot(char2index[symbol], vocab_size).unsqueeze(0)
        for symbol in padded
    ]
    return th.cat(rows, dim=0)
+
def strings_equal(str_a, str_b):
    """Compare two encoded strings position by position.

    Both arguments are 2-D tensors of the same shape (in this file they are
    the one-hot matrices built by ``string2one_hot_matrix``). The element-wise
    product summed along dim 1 gives one overlap value per row; multiplying
    those values together yields a single scalar.

    Args:
        str_a: first encoded string
        str_b: second encoded string
    Returns:
        product of the per-row overlaps (1 when every one-hot row matches,
        0 when any row differs)
    """
    overlap = (str_a * str_b).sum(1)  # per-position overlap

    match = overlap[0]
    for position in range(1, overlap.shape[0]):
        match = match * overlap[position]

    return match
+Now, we will create the class EncryptedDataset to encrypt the Twitter dataset.
class EncryptedDataset():
    """Key/value store whose keys and values are kept as shared tensors.

    Keys are encoded with ``string2one_hot_matrix`` and values with
    ``string2values``; both are then handed to ``.share(*owners)`` before
    being stored, so only the shared form is kept on the instance.
    """

    def __init__(self, *owners, max_key_len=8, max_val_len=8):
        """Create an empty store.

        Args:
            *owners: workers passed to ``.share(...)`` for every tensor
            max_key_len: number of characters kept / padded for each key
            max_val_len: number of characters kept / padded for each value
        """
        self.max_key_len = max_key_len
        self.max_val_len = max_val_len

        self.keys = list()
        self.values = list()
        self.owners = owners

    def add_entry(self, key, value):
        """Encode, share and store one key/value pair.

        Args:
            key(str): lookup key (cut / padded to ``max_key_len``)
            value(str): stored text (cut / padded to ``max_val_len``)
        """
        # Honour the configured key length; previously the helper default
        # was always used and max_key_len was silently ignored.
        key = string2one_hot_matrix(key, max_len=self.max_key_len)
        key = key.share(*self.owners)
        self.keys.append(key)

        value = string2values(value, max_len=self.max_val_len)
        value = value.share(*self.owners)
        self.values.append(value)

    def query(self, query_str):
        """Return the stored text whose key equals ``query_str``.

        Every stored value is multiplied by its key-match indicator and the
        products are summed, so the comparison runs over all entries.

        Args:
            query_str(str): key to look up
        Returns:
            the decoded value with the right-hand "." padding removed
        Raises:
            IndexError: if the store holds no entries
        """
        if not self.values:
            # Same exception type the original raised from self.values[0],
            # but with an explicit message.
            raise IndexError("cannot query an empty EncryptedDataset")

        # Must use the same key length as add_entry so shapes line up.
        query_matrix = string2one_hot_matrix(query_str, max_len=self.max_key_len)
        query_matrix = query_matrix.share(*self.owners)

        key_matches = [strings_equal(key, query_matrix) for key in self.keys]

        result = self.values[0] * key_matches[0]
        for value, key_match in zip(self.values[1:], key_matches[1:]):
            result += value * key_match

        result = result.get()

        # Padding is only ever appended on the right, so strip it from the
        # right; replace(".", "") also deleted real periods inside the text.
        return values2string(result).rstrip(".")
+import syft as sy
+import torch as th
+hook = sy.TorchHook(th)
+bob = sy.VirtualWorker(hook, id="bob").add_worker(sy.local_worker)
+alice = sy.VirtualWorker(hook, id="alice").add_worker(sy.local_worker)
+secure_worker = sy.VirtualWorker(hook, id="secure_worker").add_worker(sy.local_worker)
+
+# clearing objects
+bob = bob.clear_objects()
+alice = alice.clear_objects()
+secure_worker = secure_worker.clear_objects()
+import nltk
+nltk.download('stopwords')
+from nltk.corpus import stopwords
+from string import punctuation
+import string
+from unicodedata import normalize
+import re
+
def remove_url(tweet):
    """Strip URLs from a tweet.

    A whitespace-surrounded em dash is deleted first (together with the
    whitespace on both sides), then every run of non-space characters that
    starts with "http" is removed.

    Args:
        tweet(str): tweet
    Returns:
        modified tweet
    """
    without_dash = re.sub(r"\s—\s", "", tweet)
    return re.sub(r"http\S+", "", without_dash)
+
def remove_hashtag_mention(tweet):
    """Drop hashtag and mention tokens from a tweet.

    The tweet is split on single spaces; tokens beginning with '#' or '@'
    are discarded and the remaining tokens are joined with single spaces.

    Args:
        tweet(str): tweet
    Returns:
        modified tweet
    """
    kept = [
        token for token in tweet.split(' ')
        if not token.startswith(('#', '@'))
    ]
    return ' '.join(kept)
+
def remove_rt_fav(tweet):
    """Remove a leading RT or FAV marker from a tweet.

    The match is case-insensitive because the cleaning loop in this file
    lower-cases the tweet before calling this function, so an upper-case-only
    pattern never fired. The word boundary keeps ordinary words that merely
    start with those letters (e.g. "favorite") intact.

    Args:
        tweet(str): tweet
    Returns:
        modified tweet with the leading marker removed, if present
    """
    return re.sub(r'^(RT|FAV)\b', '', tweet, flags=re.IGNORECASE)
+
def remove_punctuation(tweet):
    """Replace every character of ``string.punctuation`` with a space.

    All punctuation characters are affected, including '-'.

    Args:
        tweet(str): tweet
    Returns:
        modified tweet (same length as the input)
    """
    blank_out = {ord(mark): ' ' for mark in string.punctuation}
    return tweet.translate(blank_out)
+
def remove_ascii_unicode(tweet):
    """Drop every non-ASCII character from a tweet.

    The text is NFKD-normalised first, so characters that decompose into
    an ASCII base plus non-ASCII marks keep their ASCII base; whatever
    cannot be encoded as ASCII is then discarded.

    Args:
        tweet(str): tweet
    Returns:
        modified tweet containing ASCII characters only
    """
    decomposed = normalize('NFKD', tweet)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')
+
def remove_newline(tweet):
    """Replace each newline character in a tweet with a single space.

    Args:
        tweet(str): tweet
    Returns:
        modified tweet
    """
    return ' '.join(tweet.split('\n'))
+# cleaning tweet documents
+
+for index, row in df_tweet.iterrows():
+ clean_tweet = remove_ascii_unicode(row['text'])
+ clean_tweet = clean_tweet.lower()
+ clean_tweet = remove_rt_fav(clean_tweet)
+ clean_tweet = remove_newline(clean_tweet)
+ clean_tweet = remove_url(clean_tweet)
+ clean_tweet = remove_hashtag_mention(clean_tweet)
+ clean_tweet = remove_punctuation(clean_tweet)
+
+ # update dataframe
+ df_tweet.at[index,'text'] = clean_tweet
+# take a look at the results
+df_tweet['text'].head(10)
+# initialize class EncrpytedDataset
+db = EncryptedDataset(bob, alice, secure_worker, max_val_len=280)
+
+# add Twitter dataset into db
+for index, row in df_tweet.iterrows():
+ # we can make the key combining index and username
+ key = str(index) + '@' + row['username']
+ values = row['text']
+
+ db.add_entry(key, values)
+# let's testing out our database
+db.query("0@JustNowBulletin")
+# we get the same result
+df_tweet.head(1)
+Now, we will implement Topic Modelling on our encrypted dataset.
+ +# make list of keys
+keys = list()
+
+for index, row in df_tweet.iterrows():
+ key = str(index) + '@' + row['username']
+ keys.append(key)
+
+len(keys)
+We will build the corpus from the encrypted Twitter dataset.
+ +corpus = list()
+a = list()
+
+# for i in range(len(df_tweet['text'])):
+# a = df_tweet['text'][i]
+# corpus.append(a)
+
+for key in keys:
+ a = db.query(key)
+ corpus.append(a)
+# let's see our corpus
+corpus[0:5]
+import gensim
+import logging
+import tempfile
+
+TEMP_FOLDER = tempfile.gettempdir()
+print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))
+
+from gensim import corpora
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+# remove common words and tokenize
+common_words = ['RT','rt', 'udacity']
+stoplist = stopwords.words('english') + list(punctuation) + common_words
+
+texts = [[word for word in str(document).lower().split() if word not in stoplist] for document in corpus]
+dictionary = corpora.Dictionary(texts)
+dictionary.save(os.path.join(TEMP_FOLDER, 'tweet_documents.dict')) # store the dictionary, for future reference
+# print(dictionary)
+
+corpus = [dictionary.doc2bow(text) for text in texts]
+corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'tweet_documents.mm'), corpus) # store to disk, for later use
+TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
+ +# initialize
+tfidf = models.TfidfModel(corpus)
+# use the model to transform vectors
+corpus_tfidf = tfidf[corpus]
+Latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. For example, if observations are words collected into documents, it posits that each document is a mixture of a small number of topics and that each word's presence is attributable to one of the document's topics. LDA is an example of a topic model.
+ +num_of_topics = 4
+
+lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_of_topics)
+corpus_lda = lda[corpus_tfidf]
+# Show first n important word in the topics
+lda.show_topics(num_of_topics, 5)
+# Show probability for each word in each topic
+from collections import OrderedDict
+
+data_lda = {i: OrderedDict(lda.show_topic(i, 25)) for i in range(num_of_topics)}
+
+df_lda = pd.DataFrame(data_lda)
+print(df_lda.shape)
+df_lda = df_lda.fillna(0).T
+print(df_lda.shape)
+
+df_lda.head()
+import seaborn as sns
+import matplotlib.pyplot as plt
+%matplotlib inline
+
+g = sns.clustermap(df_lda.corr(), center=0, cmap="RdBu", metric='cosine', linewidths=.75, figsize=(12, 12))
+plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
+plt.show()
+import pyLDAvis.gensim
+
+pyLDAvis.enable_notebook()
+panel = pyLDAvis.gensim.prepare(lda, corpus_lda, dictionary, mds='tsne')
+panel
+
+
+pandas dataframe .\n",
+ "\n",
+ "The fields from the tweet document that we will obtain are as follows:\n",
+ "- **username:** users\n",
+ "- **created_at:** the date when the tweet created\n",
+ "- **text:** text of the tweet"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# initialize dataframe\n",
+ "df_tweet = pd.DataFrame({'username': [], 'created_at': [], 'text': []})\n",
+ "\n",
+ "# crawling tweet documents\n",
+ "c = tweepy.Cursor(api.search, q=query, tweet_mode='extended').items()\n",
+ "while True:\n",
+ " try:\n",
+ " tweet = c.next()\n",
+ " \n",
+ " # get clean tweet without RT\n",
+ " if (not tweet.retweeted) and ('RT @' not in tweet.full_text):\n",
+ " username = tweet.user.screen_name\n",
+ " created = tweet.created_at\n",
+ " text = tweet.full_text.encode('utf-8', 'ignore').decode('utf-8', 'ignore')\n",
+ "\n",
+ " # get tweet documents before maximum_days\n",
+ " if tweet.created_at < date_max:\n",
+ " break\n",
+ " df_tweet = df_tweet.append({'username': username, 'created_at': created, \n",
+ " 'text': text}, ignore_index=True)\n",
+ " \n",
+ " except tweepy.TweepError as e:\n",
+ " print(e.reason)\n",
+ " time.sleep(15 * 60)\n",
+ " continue\n",
+ " except StopIteration:\n",
+ " break"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Step 2: Exploratory Data Analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "| \n", + " | username | \n", + "created_at | \n", + "text | \n", + "
|---|---|---|---|
| 0 | \n", + "JustNowBulletin | \n", + "2019-08-20 14:20:11 | \n", + "Udacity AWS DeepRacer Scholarship Challenge fo... | \n", + "
| 1 | \n", + "sustaintrain | \n", + "2019-08-20 14:15:10 | \n", + "A new Udacity Green IT training course has bee... | \n", + "
| 2 | \n", + "Upwork | \n", + "2019-08-20 13:20:49 | \n", + "Back to School Isn’t Just For Kids: 20 Courses... | \n", + "
| 3 | \n", + "frengki_17 | \n", + "2019-08-20 13:17:23 | \n", + "@onnowpurbo Belajar tensorflow yang di udacity... | \n", + "
| 4 | \n", + "javaquantum | \n", + "2019-08-20 13:14:13 | \n", + "@xsahil03x @GoogleIndia @udacity Congratulatio... | \n", + "
| 5 | \n", + "RahulSi88662093 | \n", + "2019-08-20 13:08:38 | \n", + "@anandjha_98 @kala83 @Google @udacity Please r... | \n", + "
| 6 | \n", + "MaramAlroshoud | \n", + "2019-08-20 12:49:10 | \n", + "وشو مسك يوداستي؟ — https://t.co/UC0Y4m9Foo htt... | \n", + "
| 7 | \n", + "SafeScholarship | \n", + "2019-08-20 12:29:52 | \n", + "Udacity AWS DeepRacer Scholarship Challenge fo... | \n", + "
| 8 | \n", + "0xAlsharif | \n", + "2019-08-20 12:29:09 | \n", + "Networking for Web Developers. https://t.co/Hf... | \n", + "
| 9 | \n", + "0xAlsharif | \n", + "2019-08-20 12:19:33 | \n", + "تحية وتقدير للمواقع والمصادر التعليمية اللي تج... | \n", + "
EncryptedDataset to encrypt Twitter dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class EncryptedDataset():\n",
+ " \n",
+ " def __init__(self, *owners, max_key_len=8, max_val_len=8):\n",
+ " self.max_key_len = max_key_len\n",
+ " self.max_val_len = max_val_len\n",
+ " \n",
+ " self.keys = list()\n",
+ " self.values = list()\n",
+ " self.owners = owners\n",
+ " \n",
+ " def add_entry(self, key, value):\n",
+ " key = string2one_hot_matrix(key)\n",
+ " key = key.share(*self.owners)\n",
+ " self.keys.append(key)\n",
+ " \n",
+ " value = string2values(value, max_len=self.max_val_len)\n",
+ " value = value.share(*self.owners)\n",
+ " self.values.append(value)\n",
+ " \n",
+ " def query(self, query_str):\n",
+ " query_matrix = string2one_hot_matrix(query_str)\n",
+ " \n",
+ " query_matrix = query_matrix.share(*self.owners)\n",
+ "\n",
+ " key_matches = list()\n",
+ " for key in self.keys:\n",
+ "\n",
+ " key_match = strings_equal(key, query_matrix)\n",
+ " key_matches.append(key_match)\n",
+ "\n",
+ " result = self.values[0] * key_matches[0]\n",
+ "\n",
+ " for i in range(len(self.values) - 1):\n",
+ " result += self.values[i+1] * key_matches[i+1]\n",
+ " \n",
+ " result = result.get()\n",
+ "\n",
+ " return values2string(result).replace(\".\",\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Make workers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "W0820 22:11:36.020632 2788 hook.py:98] Torch was already hooked... skipping hooking process\n"
+ ]
+ }
+ ],
+ "source": [
+ "import syft as sy\n",
+ "import torch as th\n",
+ "hook = sy.TorchHook(th)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "W0820 22:34:05.893684 2788 base.py:628] Worker me already exists. Replacing old worker which could cause unexpected behavior\n",
+ "W0820 22:34:05.894680 2788 base.py:628] Worker me already exists. Replacing old worker which could cause unexpected behavior\n",
+ "W0820 22:34:05.895643 2788 base.py:628] Worker me already exists. Replacing old worker which could cause unexpected behavior\n"
+ ]
+ }
+ ],
+ "source": [
+ "# create three virtual workers and register the local worker with each of them\n",
+ "# NOTE(review): relies on add_worker() returning the worker it was called on - confirm\n",
+ "bob = sy.VirtualWorker(hook, id=\"bob\").add_worker(sy.local_worker)\n",
+ "alice = sy.VirtualWorker(hook, id=\"alice\").add_worker(sy.local_worker)\n",
+ "secure_worker = sy.VirtualWorker(hook, id=\"secure_worker\").add_worker(sy.local_worker)\n",
+ "\n",
+ "# clearing objects\n",
+ "# NOTE(review): each name is rebound to clear_objects()'s return value - presumably the worker itself; confirm\n",
+ "bob = bob.clear_objects()\n",
+ "alice = alice.clear_objects()\n",
+ "secure_worker = secure_worker.clear_objects()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Cleaning Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to C:\\Users\\Yasir Abdur\n",
+ "[nltk_data] Rohman\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ }
+ ],
+ "source": [
+ "import nltk\n",
+ "nltk.download('stopwords')\n",
+ "from nltk.corpus import stopwords\n",
+ "from string import punctuation\n",
+ "import string\n",
+ "from unicodedata import normalize\n",
+ "import re\n",
+ "\n",
+ "def remove_url(tweet):\n",
+ "    \"\"\"Strip links from a tweet.\n",
+ "\n",
+ "    Also deletes any em dash that has whitespace on both sides.\n",
+ "    Args:\n",
+ "        tweet(str): tweet\n",
+ "    Returns:\n",
+ "        tweet with those separators and every 'http...' token removed\n",
+ "    \"\"\"\n",
+ "    without_dash = re.sub(r\"\\s—\\s\", \"\", tweet)\n",
+ "    return re.sub(r\"http\\S+\", \"\", without_dash)\n",
+ "\n",
+ "def remove_hashtag_mention(tweet):\n",
+ "    \"\"\"Remove hashtag and mention tokens.\n",
+ "\n",
+ "    The tweet is split on single spaces; tokens that start with '#' or '@'\n",
+ "    are dropped and the remaining tokens are joined with single spaces.\n",
+ "    Args:\n",
+ "        tweet(str): tweet\n",
+ "    Returns:\n",
+ "        modified tweet\n",
+ "    \"\"\"\n",
+ "    kept = [token for token in tweet.split(' ') if not token.startswith(('#', '@'))]\n",
+ "    return ' '.join(kept)\n",
+ "\n",
+ "def remove_rt_fav(tweet):\n",
+ "    \"\"\"Remove a leading RT / FAV marker.\n",
+ "\n",
+ "    The marker is matched case-insensitively and only as a whole word, so\n",
+ "    already lower-cased tweets are handled and words such as 'favorite'\n",
+ "    are left intact. Only the very start of the string is inspected.\n",
+ "    Args:\n",
+ "        tweet(str): tweet\n",
+ "    Returns:\n",
+ "        modified tweet\n",
+ "    \"\"\"\n",
+ "    # \\b keeps the case-insensitive match from eating the start of ordinary words\n",
+ "    tweet = re.sub(r'^(RT|FAV)\\b', '', tweet, flags=re.IGNORECASE)\n",
+ "    return tweet\n",
+ "\n",
+ "def remove_punctuation(tweet):\n",
+ "    \"\"\"Replace every punctuation character with a space.\n",
+ "\n",
+ "    All characters of ``string.punctuation`` are replaced, '-' included.\n",
+ "    Args:\n",
+ "        tweet(str): tweet\n",
+ "    Returns:\n",
+ "        modified tweet\n",
+ "    \"\"\"\n",
+ "    blank_out = {ord(symbol): ' ' for symbol in string.punctuation}\n",
+ "    return tweet.translate(blank_out)\n",
+ " \n",
+ "def remove_ascii_unicode(tweet):\n",
+ "    \"\"\"Drop every non-ASCII character.\n",
+ "\n",
+ "    The text is NFKD-normalized first, so an accented letter keeps its\n",
+ "    base letter and only the combining mark is lost.\n",
+ "    Args:\n",
+ "        tweet(str): tweet\n",
+ "    Returns:\n",
+ "        modified tweet\n",
+ "    \"\"\"\n",
+ "    decomposed = normalize('NFKD', tweet)\n",
+ "    ascii_bytes = decomposed.encode('ascii', 'ignore')\n",
+ "    return ascii_bytes.decode('utf-8')\n",
+ "\n",
+ "def remove_newline(tweet):\n",
+ "    \"\"\"Turn every newline character into a space.\n",
+ "    Args:\n",
+ "        tweet(str): tweet\n",
+ "    Returns:\n",
+ "        modified tweet\n",
+ "    \"\"\"\n",
+ "    return ' '.join(tweet.split('\\n'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# cleaning tweet documents\n",
+ "\n",
+ "for index, row in df_tweet.iterrows():\n",
+ "    clean_tweet = remove_ascii_unicode(row['text'])\n",
+ "    # strip the RT/FAV marker while the text still has its original casing;\n",
+ "    # the marker pattern is written in upper case\n",
+ "    clean_tweet = remove_rt_fav(clean_tweet)\n",
+ "    clean_tweet = clean_tweet.lower()\n",
+ "    clean_tweet = remove_newline(clean_tweet)\n",
+ "    # URLs go before hashtag/mention and punctuation removal, which would break them apart\n",
+ "    clean_tweet = remove_url(clean_tweet)\n",
+ "    clean_tweet = remove_hashtag_mention(clean_tweet)\n",
+ "    clean_tweet = remove_punctuation(clean_tweet)\n",
+ "\n",
+ "    # update dataframe\n",
+ "    df_tweet.at[index,'text'] = clean_tweet"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 udacity aws deepracer scholarship challenge fo...\n",
+ "1 a new udacity green it training course has bee...\n",
+ "2 back to school isnt just for kids 20 courses ...\n",
+ "3 belajar tensorflow yang di udacity bagus gak pak \n",
+ "4 congratulations \n",
+ "5 please reach out at support udacity com\n",
+ "6 \n",
+ "7 udacity aws deepracer scholarship challenge fo...\n",
+ "8 networking for web developers \n",
+ "9 \n",
+ "Name: text, dtype: object"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# take a look at the results\n",
+ "df_tweet['text'].head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Encrypt the dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# initialize class EncryptedDataset\n",
+ "db = EncryptedDataset(bob, alice, secure_worker, max_val_len=280)\n",
+ "\n",
+ "# add Twitter dataset into db\n",
+ "for index, row in df_tweet.iterrows():\n",
+ " # the key combines the row index and the username, e.g. '0@JustNowBulletin'\n",
+ " key = str(index) + '@' + row['username']\n",
+ " values = row['text']\n",
+ " \n",
+ " db.add_entry(key, values)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'udacity aws deepracer scholarship challenge for international students '"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# let's test out our database by looking up one stored key\n",
+ "db.query(\"0@JustNowBulletin\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "| \n", + " | username | \n", + "created_at | \n", + "text | \n", + "
|---|---|---|---|
| 0 | \n", + "JustNowBulletin | \n", + "2019-08-20 14:20:11 | \n", + "udacity aws deepracer scholarship challenge fo... | \n", + "
| \n", + " | 1 | \n", + "2 | \n", + "3 | \n", + "able | \n", + "added | \n", + "ai | \n", + "also | \n", + "amp | \n", + "android | \n", + "become | \n", + "... | \n", + "time | \n", + "training | \n", + "two | \n", + "u | \n", + "udemy | \n", + "ur | \n", + "via | \n", + "voyage | \n", + "web | \n", + "work | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.003533 | \n", + "0.005734 | \n", + "0.003379 | \n", + "0.000000 | \n", + "0.011932 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.000000 | \n", + "0.00663 | \n", + "0.00338 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.004506 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| 1 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.003571 | \n", + "0.008878 | \n", + "0.003672 | \n", + "... | \n", + "0.000000 | \n", + "0.00000 | \n", + "0.00000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.003455 | \n", + "
| 2 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.003593 | \n", + "0.005869 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.003756 | \n", + "0.00000 | \n", + "0.00000 | \n", + "0.003702 | \n", + "0.005444 | \n", + "0.003332 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "
| 3 | \n", + "0.007639 | \n", + "0.009043 | \n", + "0.008026 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.004154 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.000000 | \n", + "0.00000 | \n", + "0.00000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.003851 | \n", + "0.000000 | \n", + "0.003543 | \n", + "0.000000 | \n", + "
4 rows × 71 columns
\n", + "BatchNorm is still not compatible in Federated Learning as issued in [#2132](https://github.com/OpenMined/PySyft/issues/2132), we cannot using transfer learning from pre-trained model such as Resnet18, VGG16. So for the solution we will make CNN layer manually as follows."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# download resnet 18 model\n",
+ "# resnet18 = models.resnet18(pretrained=True)\n",
+ "\n",
+ "# Freeze training for all \"features\" layers\n",
+ "# for param in resnet18.parameters():\n",
+ "# param.requires_grad = False\n",
+ " \n",
+ "# num_ftrs = resnet18.fc.in_features\n",
+ "\n",
+ "# resnet18.fc = nn.Sequential(nn.Linear(num_ftrs, 256),\n",
+ "# nn.Dropout(0.3),\n",
+ "# nn.Linear(256, 128),\n",
+ "# nn.Dropout(0.3),\n",
+ "# nn.Linear(128, 64),\n",
+ "# nn.Dropout(0.3),\n",
+ "# nn.Linear(64, 10),\n",
+ "# nn.LogSoftmax(dim=1))\n",
+ "# architecture resnet18 modified\n",
+ "# resnet18"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class CNN(nn.Module):\n",
+ "    \"\"\"Small convolutional classifier for single-channel images.\n",
+ "\n",
+ "    Two 5x5 convolutions, each followed by ReLU and 2x2 max-pooling, feed\n",
+ "    two fully connected layers. The flatten to 4*4*50 features fits 28x28\n",
+ "    inputs (28 -> 24 -> 12 -> 8 -> 4).\n",
+ "\n",
+ "    Args:\n",
+ "        num_classes(int): size of the output layer (default 10)\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self,num_classes=10):\n",
+ "        super(CNN, self).__init__()\n",
+ "        self.conv1 = nn.Conv2d(1, 20, 5, 1)\n",
+ "        self.conv2 = nn.Conv2d(20, 50, 5, 1)\n",
+ "        self.fc1 = nn.Linear(4*4*50, 500)\n",
+ "        # honour num_classes instead of a hard-coded 10 outputs\n",
+ "        self.fc2 = nn.Linear(500, num_classes)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        \"\"\"Return log-probabilities of shape (batch, num_classes).\"\"\"\n",
+ "        x = F.relu(self.conv1(x))\n",
+ "        x = F.max_pool2d(x, 2, 2)\n",
+ "        x = F.relu(self.conv2(x))\n",
+ "        x = F.max_pool2d(x, 2, 2)\n",
+ "        # flatten the 50 feature maps of 4x4 into one vector per sample\n",
+ "        x = x.view(-1, 4*4*50)\n",
+ "        x = F.relu(self.fc1(x))\n",
+ "        x = self.fc2(x)\n",
+ "        return F.log_softmax(x, dim=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Train Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train(args, model, device, train_loader, optimizer, epoch):\n",
+ "    \"\"\"Run one training epoch over ``train_loader``.\n",
+ "\n",
+ "    For every batch the model is sent to ``data.location``, one optimizer\n",
+ "    step is taken there, and the model is fetched back with ``model.get()``.\n",
+ "    Args:\n",
+ "        args: settings object; ``log_interval`` and ``batch_size`` are read\n",
+ "        model: network to train\n",
+ "        device: torch device the batches are moved to\n",
+ "        train_loader: iterable of (data, target) batches whose ``data`` has a\n",
+ "            ``location`` attribute (presumably a PySyft federated loader - confirm)\n",
+ "        optimizer: optimizer bound to the model parameters\n",
+ "        epoch(int): epoch number, used only in the log line\n",
+ "    \"\"\"\n",
+ "    model.train()\n",
+ "    # iterate the loader that was passed in, not the notebook-level global\n",
+ "    for batch_idx, (data, target) in enumerate(train_loader):\n",
+ "        model.send(data.location)\n",
+ "        data, target = data.to(device), target.to(device)\n",
+ "        optimizer.zero_grad()\n",
+ "        output = model(data)\n",
+ "        loss = F.nll_loss(output, target)\n",
+ "        loss.backward()\n",
+ "        optimizer.step()\n",
+ "        model.get()\n",
+ "        if batch_idx % args.log_interval == 0:\n",
+ "            # the loss is only fetched when it is actually logged\n",
+ "            loss = loss.get()\n",
+ "            # progress is estimated from batch counts: len(train_loader) * batch_size\n",
+ "            print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n",
+ "                epoch, batch_idx * args.batch_size, len(train_loader) * args.batch_size,\n",
+ "                100. * batch_idx / len(train_loader), loss.item()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def test(args, model, device, test_loader):\n",
+ "    \"\"\"Evaluate ``model`` on ``test_loader`` and print a summary line.\n",
+ "\n",
+ "    Prints the average negative log-likelihood per sample and the accuracy.\n",
+ "    Args:\n",
+ "        args: settings object (not read here)\n",
+ "        model: network returning log-probabilities\n",
+ "        device: torch device the batches are moved to\n",
+ "        test_loader: loader of (data, target) batches with a ``dataset`` attribute\n",
+ "    \"\"\"\n",
+ "    model.eval()\n",
+ "    loss_total = 0\n",
+ "    n_correct = 0\n",
+ "    with torch.no_grad():\n",
+ "        for data, target in test_loader:\n",
+ "            data, target = data.to(device), target.to(device)\n",
+ "            log_probs = model(data)\n",
+ "            # accumulate the summed (not averaged) batch loss\n",
+ "            loss_total += F.nll_loss(log_probs, target, reduction='sum').item()\n",
+ "            # predicted class = index of the largest log-probability\n",
+ "            guess = log_probs.argmax(1, keepdim=True)\n",
+ "            n_correct += guess.eq(target.view_as(guess)).sum().item()\n",
+ "\n",
+ "    n_samples = len(test_loader.dataset)\n",
+ "    avg_loss = loss_total / n_samples\n",
+ "\n",
+ "    print('\\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\\n'.format(\n",
+ "        avg_loss, n_correct, n_samples,\n",
+ "        100. * n_correct / n_samples))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Step 4: Training Neural Network"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train Epoch: 1 [0/60032 (0%)]\tLoss: 2.305633\n",
+ "Train Epoch: 1 [1920/60032 (3%)]\tLoss: 0.740293\n",
+ "Train Epoch: 1 [3840/60032 (6%)]\tLoss: 0.725667\n",
+ "Train Epoch: 1 [5760/60032 (10%)]\tLoss: 0.552224\n",
+ "Train Epoch: 1 [7680/60032 (13%)]\tLoss: 0.808906\n",
+ "Train Epoch: 1 [9600/60032 (16%)]\tLoss: 0.540031\n",
+ "Train Epoch: 1 [11520/60032 (19%)]\tLoss: 0.476487\n",
+ "Train Epoch: 1 [13440/60032 (22%)]\tLoss: 0.433211\n",
+ "Train Epoch: 1 [15360/60032 (26%)]\tLoss: 0.367201\n",
+ "Train Epoch: 1 [17280/60032 (29%)]\tLoss: 0.437580\n",
+ "Train Epoch: 1 [19200/60032 (32%)]\tLoss: 0.465955\n",
+ "Train Epoch: 1 [21120/60032 (35%)]\tLoss: 0.478523\n",
+ "Train Epoch: 1 [23040/60032 (38%)]\tLoss: 0.569970\n",
+ "Train Epoch: 1 [24960/60032 (42%)]\tLoss: 0.323152\n",
+ "Train Epoch: 1 [26880/60032 (45%)]\tLoss: 0.535027\n",
+ "Train Epoch: 1 [28800/60032 (48%)]\tLoss: 0.366544\n"
+ ]
+ },
+ {
+ "ename": "TypeError",
+ "evalue": "add_() takes 1 positional argument but 2 were given",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32mIn this project, we will implement federated learning on Fashion MNIST dataset using CNN. Federated Learning is a machine learning setting where the goal is to train a high-quality centralized model with training data distributed over a large number of clients each with unreliable and relatively slow network connections. This project inspired by an example from Pysyft tutorial on Federated CIFAR10.
+For the better model performance, we will use transfer learning from a pre-trained model.
+The project consists of the following steps:
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision import datasets, models, transforms
+
+# pysyft
+import syft as sy
+
+import numpy as np
+import matplotlib.pyplot as plt
+class Arguments():
+ def __init__(self):
+ self.batch_size = 64
+ self.test_batch_size = 1000
+ self.epochs = 25
+ self.lr = 0.001
+ self.momentum = 0.5
+ self.no_cuda = True
+ self.seed = 1
+ self.log_interval = 30
+ self.save_model = False
+
+args = Arguments()
+
+use_cuda = not args.no_cuda and torch.cuda.is_available()
+
+torch.manual_seed(args.seed)
+
+device = torch.device("cuda" if use_cuda else "cpu")
+
+kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
+hook = sy.TorchHook(torch)
+bob = sy.VirtualWorker(hook, id="bob")
+alice = sy.VirtualWorker(hook, id="alice")
+def load_data():
+
+ '''Load Fashion MNIST dataset from torch vision module distribute to workers using PySyft's Federated Data loader'''
+
+
+ federated_train_loader = sy.FederatedDataLoader(
+ datasets.FashionMNIST('../data', train=True, download=True,
+ transform=transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize((0.5,), (0.5,))
+ ]))
+ .federate((bob, alice)),
+ batch_size=args.batch_size, shuffle=True, **kwargs)
+
+ test_loader = torch.utils.data.DataLoader(
+ datasets.FashionMNIST('../data', train=False, transform=transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize((0.5,), (0.5,))
+ ])),
+ batch_size=args.test_batch_size, shuffle=True, **kwargs)
+
+ return federated_train_loader,test_loader
+federated_train_loader, test_loader = load_data()
+# obtain one batch of training images
+dataiter = iter(test_loader)
+images, labels = dataiter.next()
+images = images.numpy() # convert images to numpy for display
+
+plt.imshow(images[1].squeeze(), cmap='Greys_r')
+# download resnet 18 model
+# resnet18 = models.resnet18(pretrained=True)
+
+# Freeze training for all "features" layers
+# for param in resnet18.parameters():
+# param.requires_grad = False
+
+# num_ftrs = resnet18.fc.in_features
+
+# resnet18.fc = nn.Sequential(nn.Linear(num_ftrs, 256),
+# nn.Dropout(0.3),
+# nn.Linear(256, 128),
+# nn.Dropout(0.3),
+# nn.Linear(128, 64),
+# nn.Dropout(0.3),
+# nn.Linear(64, 10),
+# nn.LogSoftmax(dim=1))
+# architecture resnet18 modified
+# resnet18
+class CNN(nn.Module):
+ def __init__(self,num_classes=10):
+ super(CNN, self).__init__()
+ self.conv1 = nn.Conv2d(1, 20, 5, 1)
+ self.conv2 = nn.Conv2d(20, 50, 5, 1)
+ self.fc1 = nn.Linear(4*4*50, 500)
+ self.fc2 = nn.Linear(500, 10)
+
+ def forward(self, x):
+ x = F.relu(self.conv1(x))
+ x = F.max_pool2d(x, 2, 2)
+ x = F.relu(self.conv2(x))
+ x = F.max_pool2d(x, 2, 2)
+ x = x.view(-1, 4*4*50)
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return F.log_softmax(x, dim=1)
+def train(args, model, device, train_loader, optimizer, epoch):
+ model.train()
+ for batch_idx, (data, target) in enumerate(federated_train_loader):
+ model.send(data.location)
+ data, target = data.to(device), target.to(device)
+ optimizer.zero_grad()
+ output = model(data)
+ loss = F.nll_loss(output, target)
+ loss.backward()
+ optimizer.step()
+ model.get()
+ if batch_idx % args.log_interval == 0:
+ loss = loss.get()
+ print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+ epoch, batch_idx * args.batch_size, len(train_loader) * args.batch_size, #batch_idx * len(data), len(train_loader.dataset),
+ 100. * batch_idx / len(train_loader), loss.item()))
+def test(args, model, device, test_loader):
+ model.eval()
+ test_loss = 0
+ correct = 0
+ with torch.no_grad():
+ for data, target in test_loader:
+ data, target = data.to(device), target.to(device)
+ output = model(data)
+ test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
+ pred = output.argmax(1, keepdim=True) # get the index of the max log-probability
+ correct += pred.eq(target.view_as(pred)).sum().item()
+
+ test_loss /= len(test_loader.dataset)
+
+ print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+ test_loss, correct, len(test_loader.dataset),
+ 100. * correct / len(test_loader.dataset)))
+# resnet18 = resnet18.to(device)
+cnn = CNN().to(device)
+optimizer = optim.Adam(cnn.parameters(), lr=args.lr)
+
+# Train Neural network and validate with test set after completion of training every epoch
+for epoch in range(1, args.epochs + 1):
+ train(args, cnn, device, federated_train_loader, optimizer, epoch)
+ test(args, cnn, device, test_loader)
+
+if (args.save_model):
+ torch.save(cnn.state_dict(), "FashionMNIST_cnn.pt")
+For some reason, when the model is training suddenly stopped at 48%. I don't know if this is a bug or my code has an error. But, so far we already implementing Federated Learning on Fashion MNIST using CNN. For the future, I hope we can doing transfer learning on Federated Learning.
+ +
BatchNorm is still not compatible with Federated Learning, as reported in [#2132](https://github.com/OpenMined/PySyft/issues/2132), so we cannot use transfer learning from pre-trained models such as ResNet18 or VGG16. As a workaround, we will build the CNN layers manually as follows."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# download resnet 18 model\n",
+ "# resnet18 = models.resnet18(pretrained=True)\n",
+ "\n",
+ "# Freeze training for all \"features\" layers\n",
+ "# for param in resnet18.parameters():\n",
+ "# param.requires_grad = False\n",
+ " \n",
+ "# num_ftrs = resnet18.fc.in_features\n",
+ "\n",
+ "# resnet18.fc = nn.Sequential(nn.Linear(num_ftrs, 256),\n",
+ "# nn.Dropout(0.3),\n",
+ "# nn.Linear(256, 128),\n",
+ "# nn.Dropout(0.3),\n",
+ "# nn.Linear(128, 64),\n",
+ "# nn.Dropout(0.3),\n",
+ "# nn.Linear(64, 10),\n",
+ "# nn.LogSoftmax(dim=1))\n",
+ "# architecture resnet18 modified\n",
+ "# resnet18"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class CNN(nn.Module):\n",
+ "    \"\"\"Small convolutional classifier for single-channel images.\n",
+ "\n",
+ "    Two 5x5 convolutions, each followed by ReLU and 2x2 max-pooling, feed\n",
+ "    two fully connected layers. The flatten to 4*4*50 features fits 28x28\n",
+ "    inputs (28 -> 24 -> 12 -> 8 -> 4).\n",
+ "\n",
+ "    Args:\n",
+ "        num_classes(int): size of the output layer (default 10)\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self,num_classes=10):\n",
+ "        super(CNN, self).__init__()\n",
+ "        self.conv1 = nn.Conv2d(1, 20, 5, 1)\n",
+ "        self.conv2 = nn.Conv2d(20, 50, 5, 1)\n",
+ "        self.fc1 = nn.Linear(4*4*50, 500)\n",
+ "        # honour num_classes instead of a hard-coded 10 outputs\n",
+ "        self.fc2 = nn.Linear(500, num_classes)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        \"\"\"Return log-probabilities of shape (batch, num_classes).\"\"\"\n",
+ "        x = F.relu(self.conv1(x))\n",
+ "        x = F.max_pool2d(x, 2, 2)\n",
+ "        x = F.relu(self.conv2(x))\n",
+ "        x = F.max_pool2d(x, 2, 2)\n",
+ "        # flatten the 50 feature maps of 4x4 into one vector per sample\n",
+ "        x = x.view(-1, 4*4*50)\n",
+ "        x = F.relu(self.fc1(x))\n",
+ "        x = self.fc2(x)\n",
+ "        return F.log_softmax(x, dim=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Train Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def train(args, model, device, train_loader, optimizer, epoch):\n",
+ "    \"\"\"Run one training epoch over ``train_loader``.\n",
+ "\n",
+ "    For every batch the model is sent to ``data.location``, one optimizer\n",
+ "    step is taken there, and the model is fetched back with ``model.get()``.\n",
+ "    Args:\n",
+ "        args: settings object; ``log_interval`` and ``batch_size`` are read\n",
+ "        model: network to train\n",
+ "        device: torch device the batches are moved to\n",
+ "        train_loader: iterable of (data, target) batches whose ``data`` has a\n",
+ "            ``location`` attribute (presumably a PySyft federated loader - confirm)\n",
+ "        optimizer: optimizer bound to the model parameters\n",
+ "        epoch(int): epoch number, used only in the log line\n",
+ "    \"\"\"\n",
+ "    model.train()\n",
+ "    # iterate the loader that was passed in, not the notebook-level global\n",
+ "    for batch_idx, (data, target) in enumerate(train_loader):\n",
+ "        model.send(data.location)\n",
+ "        data, target = data.to(device), target.to(device)\n",
+ "        optimizer.zero_grad()\n",
+ "        output = model(data)\n",
+ "        loss = F.nll_loss(output, target)\n",
+ "        loss.backward()\n",
+ "        optimizer.step()\n",
+ "        model.get()\n",
+ "        if batch_idx % args.log_interval == 0:\n",
+ "            # the loss is only fetched when it is actually logged\n",
+ "            loss = loss.get()\n",
+ "            # progress is estimated from batch counts: len(train_loader) * batch_size\n",
+ "            print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'.format(\n",
+ "                epoch, batch_idx * args.batch_size, len(train_loader) * args.batch_size,\n",
+ "                100. * batch_idx / len(train_loader), loss.item()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def test(args, model, device, test_loader):\n",
+ "    \"\"\"Evaluate ``model`` on ``test_loader`` and print a summary line.\n",
+ "\n",
+ "    Prints the average negative log-likelihood per sample and the accuracy.\n",
+ "    Args:\n",
+ "        args: settings object (not read here)\n",
+ "        model: network returning log-probabilities\n",
+ "        device: torch device the batches are moved to\n",
+ "        test_loader: loader of (data, target) batches with a ``dataset`` attribute\n",
+ "    \"\"\"\n",
+ "    model.eval()\n",
+ "    loss_total = 0\n",
+ "    n_correct = 0\n",
+ "    with torch.no_grad():\n",
+ "        for data, target in test_loader:\n",
+ "            data, target = data.to(device), target.to(device)\n",
+ "            log_probs = model(data)\n",
+ "            # accumulate the summed (not averaged) batch loss\n",
+ "            loss_total += F.nll_loss(log_probs, target, reduction='sum').item()\n",
+ "            # predicted class = index of the largest log-probability\n",
+ "            guess = log_probs.argmax(1, keepdim=True)\n",
+ "            n_correct += guess.eq(target.view_as(guess)).sum().item()\n",
+ "\n",
+ "    n_samples = len(test_loader.dataset)\n",
+ "    avg_loss = loss_total / n_samples\n",
+ "\n",
+ "    print('\\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\\n'.format(\n",
+ "        avg_loss, n_correct, n_samples,\n",
+ "        100. * n_correct / n_samples))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Step 4: Training Neural Network"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train Epoch: 1 [0/60032 (0%)]\tLoss: 2.305633\n",
+ "Train Epoch: 1 [1920/60032 (3%)]\tLoss: 0.740293\n",
+ "Train Epoch: 1 [3840/60032 (6%)]\tLoss: 0.725667\n",
+ "Train Epoch: 1 [5760/60032 (10%)]\tLoss: 0.552224\n",
+ "Train Epoch: 1 [7680/60032 (13%)]\tLoss: 0.808906\n",
+ "Train Epoch: 1 [9600/60032 (16%)]\tLoss: 0.540031\n",
+ "Train Epoch: 1 [11520/60032 (19%)]\tLoss: 0.476487\n",
+ "Train Epoch: 1 [13440/60032 (22%)]\tLoss: 0.433211\n",
+ "Train Epoch: 1 [15360/60032 (26%)]\tLoss: 0.367201\n",
+ "Train Epoch: 1 [17280/60032 (29%)]\tLoss: 0.437580\n",
+ "Train Epoch: 1 [19200/60032 (32%)]\tLoss: 0.465955\n",
+ "Train Epoch: 1 [21120/60032 (35%)]\tLoss: 0.478523\n",
+ "Train Epoch: 1 [23040/60032 (38%)]\tLoss: 0.569970\n",
+ "Train Epoch: 1 [24960/60032 (42%)]\tLoss: 0.323152\n",
+ "Train Epoch: 1 [26880/60032 (45%)]\tLoss: 0.535027\n",
+ "Train Epoch: 1 [28800/60032 (48%)]\tLoss: 0.366544\n"
+ ]
+ },
+ {
+ "ename": "TypeError",
+ "evalue": "add_() takes 1 positional argument but 2 were given",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32mlearning rate, batch size, numpy seed.
+- Create 2 workers Alice and Bob
+
+## Step 2: Load data
+- Load Fashion MNIST dataset using torchvision.datasets.
+
+- Create federated_train_loader using FederatedDataLoader. Don't forget to transform the dataset!
+- Create test_loader using torch.utils.data.DataLoader. Don't forget to transform the dataset!
+- Visualize one sample image
+
+
+
+## Step 3: Convolutional Neural Network
+- Create architecture Class CNN.
+```python
+class CNN(nn.Module):
+ def __init__(self,num_classes=10):
+ super(CNN, self).__init__()
+ self.conv1 = nn.Conv2d(1, 20, 5, 1)
+ self.conv2 = nn.Conv2d(20, 50, 5, 1)
+ self.fc1 = nn.Linear(4*4*50, 500)
+ self.fc2 = nn.Linear(500, 10)
+
+ def forward(self, x):
+ x = F.relu(self.conv1(x))
+ x = F.max_pool2d(x, 2, 2)
+ x = F.relu(self.conv2(x))
+ x = F.max_pool2d(x, 2, 2)
+ x = x.view(-1, 4*4*50)
+ x = F.relu(self.fc1(x))
+ x = self.fc2(x)
+ return F.log_softmax(x, dim=1)
+```
+
+## Step 4: Training
+- Train Neural network and validate with test set after completion of training every epoch
+
+## My Notes
+- BatchNorm is still not compatible with Federated Learning, so it is still impossible to use pre-trained models such as ResNet, VGG, etc.
+- For some reason, the training suddenly stopped at 48% of the first epoch with `TypeError: add_() takes 1 positional argument but 2 were given`. I don't know whether this is a bug in the library or an error in my code. Still, we have implemented Federated Learning on Fashion MNIST using a CNN.
diff --git a/Yasir Abdur Rohman/Project 2/assets/fashion-mnist.png b/Yasir Abdur Rohman/Project 2/assets/fashion-mnist.png
new file mode 100644
index 00000000..1ecde6f1
Binary files /dev/null and b/Yasir Abdur Rohman/Project 2/assets/fashion-mnist.png differ
diff --git a/Yasir Abdur Rohman/Project 2/assets/shoes.png b/Yasir Abdur Rohman/Project 2/assets/shoes.png
new file mode 100644
index 00000000..93d760c5
Binary files /dev/null and b/Yasir Abdur Rohman/Project 2/assets/shoes.png differ
diff --git a/Yasir Abdur Rohman/Project 3/.ipynb_checkpoints/Machine Translation with a Sequence to Sequence Network and Attention (English - Indonesian)-checkpoint.ipynb b/Yasir Abdur Rohman/Project 3/.ipynb_checkpoints/Machine Translation with a Sequence to Sequence Network and Attention (English - Indonesian)-checkpoint.ipynb
new file mode 100644
index 00000000..6b3f9039
--- /dev/null
+++ b/Yasir Abdur Rohman/Project 3/.ipynb_checkpoints/Machine Translation with a Sequence to Sequence Network and Attention (English - Indonesian)-checkpoint.ipynb
@@ -0,0 +1,804 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Machine Translation with a Sequence to Sequence Network and Attention (English - Indonesian)\n",
+ "attn_applied in the code) should contain information about that specific part of the input sequence, and thus help the decoder choose the right output words.\n",
+ "\n",
+ "Calculating the attention weights is done with another feed-forward layer attn, using the decoder’s input and hidden state as inputs. Because there are sentences of all sizes in the training data, to actually create and train this layer we have to choose a maximum sentence length (input length, for encoder outputs) that it can apply to. Sentences of the maximum length will use all the attention weights, while shorter sentences will only use the first few."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class AttnDecoderRNN(nn.Module):\n",
+ " def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):\n",
+ " super(AttnDecoderRNN, self).__init__()\n",
+ " self.hidden_size = hidden_size\n",
+ " self.output_size = output_size\n",
+ " self.dropout_p = dropout_p\n",
+ " self.max_length = max_length\n",
+ "\n",
+ " self.embedding = nn.Embedding(self.output_size, self.hidden_size)\n",
+ " self.attn = nn.Linear(self.hidden_size * 2, self.max_length)\n",
+ " self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)\n",
+ " self.dropout = nn.Dropout(self.dropout_p)\n",
+ " self.gru = nn.GRU(self.hidden_size, self.hidden_size)\n",
+ " self.out = nn.Linear(self.hidden_size, self.output_size)\n",
+ "\n",
+ " def forward(self, input, hidden, encoder_outputs):\n",
+ " embedded = self.embedding(input).view(1, 1, -1)\n",
+ " embedded = self.dropout(embedded)\n",
+ "\n",
+ " attn_weights = F.softmax(\n",
+ " self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)\n",
+ " attn_applied = torch.bmm(attn_weights.unsqueeze(0),\n",
+ " encoder_outputs.unsqueeze(0))\n",
+ "\n",
+ " output = torch.cat((embedded[0], attn_applied[0]), 1)\n",
+ " output = self.attn_combine(output).unsqueeze(0)\n",
+ "\n",
+ " output = F.relu(output)\n",
+ " output, hidden = self.gru(output, hidden)\n",
+ "\n",
+ " output = F.log_softmax(self.out(output[0]), dim=1)\n",
+ " return output, hidden, attn_weights\n",
+ "\n",
+ " def initHidden(self):\n",
+ " return torch.zeros(1, 1, self.hidden_size, device=device)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Training"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Preparing Training Data\n",
+ "To train, for each pair we will need an input tensor (indexes of the words in the input sentence) and target tensor (indexes of the words in the target sentence). While creating these vectors we will append the EOS token to both sequences."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def indexesFromSentence(lang, sentence):\n",
+ " return [lang.word2index[word] for word in sentence.split(' ')]\n",
+ "\n",
+ "\n",
+ "def tensorFromSentence(lang, sentence):\n",
+ " indexes = indexesFromSentence(lang, sentence)\n",
+ " indexes.append(EOS_token)\n",
+ " return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)\n",
+ "\n",
+ "\n",
+ "def tensorsFromPair(pair):\n",
+ " input_tensor = tensorFromSentence(input_lang, pair[0])\n",
+ " target_tensor = tensorFromSentence(output_lang, pair[1])\n",
+ " return (input_tensor, target_tensor)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Training The Model\n",
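+ "To train we run the input sentence through the encoder, and keep track of every output and the latest hidden state. Then the decoder is given the `<SOS>` token as its first input, and the last hidden state of the encoder as its first hidden state. With probability `teacher_forcing_ratio` the decoder is fed the real target outputs as its next inputs (teacher forcing) instead of its own predictions; turn `teacher_forcing_ratio` up to use more of it."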
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "teacher_forcing_ratio = 0.5\n",
+ "\n",
+ "\n",
+ "def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):\n",
+ " encoder_hidden = encoder.initHidden()\n",
+ "\n",
+ " encoder_optimizer.zero_grad()\n",
+ " decoder_optimizer.zero_grad()\n",
+ "\n",
+ " input_length = input_tensor.size(0)\n",
+ " target_length = target_tensor.size(0)\n",
+ "\n",
+ " encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)\n",
+ "\n",
+ " loss = 0\n",
+ "\n",
+ " for ei in range(input_length):\n",
+ " encoder_output, encoder_hidden = encoder(\n",
+ " input_tensor[ei], encoder_hidden)\n",
+ " encoder_outputs[ei] = encoder_output[0, 0]\n",
+ "\n",
+ " decoder_input = torch.tensor([[SOS_token]], device=device)\n",
+ "\n",
+ " decoder_hidden = encoder_hidden\n",
+ "\n",
+ " use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False\n",
+ "\n",
+ " if use_teacher_forcing:\n",
+ " # Teacher forcing: Feed the target as the next input\n",
+ " for di in range(target_length):\n",
+ " decoder_output, decoder_hidden, decoder_attention = decoder(\n",
+ " decoder_input, decoder_hidden, encoder_outputs)\n",
+ " loss += criterion(decoder_output, target_tensor[di])\n",
+ " decoder_input = target_tensor[di] # Teacher forcing\n",
+ "\n",
+ " else:\n",
+ " # Without teacher forcing: use its own predictions as the next input\n",
+ " for di in range(target_length):\n",
+ " decoder_output, decoder_hidden, decoder_attention = decoder(\n",
+ " decoder_input, decoder_hidden, encoder_outputs)\n",
+ " topv, topi = decoder_output.topk(1)\n",
+ " decoder_input = topi.squeeze().detach() # detach from history as input\n",
+ "\n",
+ " loss += criterion(decoder_output, target_tensor[di])\n",
+ " if decoder_input.item() == EOS_token:\n",
+ " break\n",
+ "\n",
+ " loss.backward()\n",
+ "\n",
+ " encoder_optimizer.step()\n",
+ " decoder_optimizer.step()\n",
+ "\n",
+ " return loss.item() / target_length"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Helper Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "import math\n",
+ "\n",
+ "\n",
+ "def asMinutes(s):\n",
+ " m = math.floor(s / 60)\n",
+ " s -= m * 60\n",
+ " return '%dm %ds' % (m, s)\n",
+ "\n",
+ "\n",
+ "def timeSince(since, percent):\n",
+ " now = time.time()\n",
+ " s = now - since\n",
+ " es = s / (percent)\n",
+ " rs = es - s\n",
+ " return '%s (- %s)' % (asMinutes(s), asMinutes(rs))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The whole training process looks like this:\n",
+ "\n",
+ "- Start a timer\n",
+ "- Initialize optimizers and criterion\n",
+ "- Create set of training pairs\n",
+ "- Start empty losses array for plotting"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):\n",
+ " start = time.time()\n",
+ " plot_losses = []\n",
+ " print_loss_total = 0 # Reset every print_every\n",
+ " plot_loss_total = 0 # Reset every plot_every\n",
+ "\n",
+ " encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)\n",
+ " decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)\n",
+ " training_pairs = [tensorsFromPair(random.choice(pairs))\n",
+ " for i in range(n_iters)]\n",
+ " criterion = nn.NLLLoss()\n",
+ "\n",
+ " for iter in range(1, n_iters + 1):\n",
+ " training_pair = training_pairs[iter - 1]\n",
+ " input_tensor = training_pair[0]\n",
+ " target_tensor = training_pair[1]\n",
+ "\n",
+ " loss = train(input_tensor, target_tensor, encoder,\n",
+ " decoder, encoder_optimizer, decoder_optimizer, criterion)\n",
+ " print_loss_total += loss\n",
+ " plot_loss_total += loss\n",
+ "\n",
+ " if iter % print_every == 0:\n",
+ " print_loss_avg = print_loss_total / print_every\n",
+ " print_loss_total = 0\n",
+ " print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),\n",
+ " iter, iter / n_iters * 100, print_loss_avg))\n",
+ "\n",
+ " if iter % plot_every == 0:\n",
+ " plot_loss_avg = plot_loss_total / plot_every\n",
+ " plot_losses.append(plot_loss_avg)\n",
+ " plot_loss_total = 0\n",
+ "\n",
+ " showPlot(plot_losses)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Plotting Result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def showPlot(points):\n",
+ " plt.figure()\n",
+ " fig, ax = plt.subplots()\n",
+ " # this locator puts ticks at regular intervals\n",
+ " loc = ticker.MultipleLocator(base=0.2)\n",
+ " ax.yaxis.set_major_locator(loc)\n",
+ " plt.plot(points)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Evaluation\n",
+ "Evaluation is mostly the same as training, but there are no targets so we simply feed the decoder’s predictions back to itself for each step. Every time it predicts a word we add it to the output string, and if it predicts the EOS token we stop there. We also store the decoder’s attention outputs for display later."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):\n",
+ " with torch.no_grad():\n",
+ " input_tensor = tensorFromSentence(input_lang, sentence)\n",
+ " input_length = input_tensor.size()[0]\n",
+ " encoder_hidden = encoder.initHidden()\n",
+ "\n",
+ " encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)\n",
+ "\n",
+ " for ei in range(input_length):\n",
+ " encoder_output, encoder_hidden = encoder(input_tensor[ei],\n",
+ " encoder_hidden)\n",
+ " encoder_outputs[ei] += encoder_output[0, 0]\n",
+ "\n",
+ " decoder_input = torch.tensor([[SOS_token]], device=device) # SOS\n",
+ "\n",
+ " decoder_hidden = encoder_hidden\n",
+ "\n",
+ " decoded_words = []\n",
+ " decoder_attentions = torch.zeros(max_length, max_length)\n",
+ "\n",
+ " for di in range(max_length):\n",
+ " decoder_output, decoder_hidden, decoder_attention = decoder(\n",
+ " decoder_input, decoder_hidden, encoder_outputs)\n",
+ " decoder_attentions[di] = decoder_attention.data\n",
+ " topv, topi = decoder_output.data.topk(1)\n",
+ " if topi.item() == EOS_token:\n",
+ " decoded_words.append('attn_applied in the code) should contain information about that specific part of the input sequence, and thus help the decoder choose the right output words.\n",
+ "\n",
+ "Calculating the attention weights is done with another feed-forward layer attn, using the decoder’s input and hidden state as inputs. Because there are sentences of all sizes in the training data, to actually create and train this layer we have to choose a maximum sentence length (input length, for encoder outputs) that it can apply to. Sentences of the maximum length will use all the attention weights, while shorter sentences will only use the first few."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class AttnDecoderRNN(nn.Module):\n",
+ " def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):\n",
+ " super(AttnDecoderRNN, self).__init__()\n",
+ " self.hidden_size = hidden_size\n",
+ " self.output_size = output_size\n",
+ " self.dropout_p = dropout_p\n",
+ " self.max_length = max_length\n",
+ "\n",
+ " self.embedding = nn.Embedding(self.output_size, self.hidden_size)\n",
+ " self.attn = nn.Linear(self.hidden_size * 2, self.max_length)\n",
+ " self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)\n",
+ " self.dropout = nn.Dropout(self.dropout_p)\n",
+ " self.gru = nn.GRU(self.hidden_size, self.hidden_size)\n",
+ " self.out = nn.Linear(self.hidden_size, self.output_size)\n",
+ "\n",
+ " def forward(self, input, hidden, encoder_outputs):\n",
+ " embedded = self.embedding(input).view(1, 1, -1)\n",
+ " embedded = self.dropout(embedded)\n",
+ "\n",
+ " attn_weights = F.softmax(\n",
+ " self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)\n",
+ " attn_applied = torch.bmm(attn_weights.unsqueeze(0),\n",
+ " encoder_outputs.unsqueeze(0))\n",
+ "\n",
+ " output = torch.cat((embedded[0], attn_applied[0]), 1)\n",
+ " output = self.attn_combine(output).unsqueeze(0)\n",
+ "\n",
+ " output = F.relu(output)\n",
+ " output, hidden = self.gru(output, hidden)\n",
+ "\n",
+ " output = F.log_softmax(self.out(output[0]), dim=1)\n",
+ " return output, hidden, attn_weights\n",
+ "\n",
+ " def initHidden(self):\n",
+ " return torch.zeros(1, 1, self.hidden_size, device=device)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Training"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Preparing Training Data\n",
+ "To train, for each pair we will need an input tensor (indexes of the words in the input sentence) and target tensor (indexes of the words in the target sentence). While creating these vectors we will append the EOS token to both sequences."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def indexesFromSentence(lang, sentence):\n",
+ " return [lang.word2index[word] for word in sentence.split(' ')]\n",
+ "\n",
+ "\n",
+ "def tensorFromSentence(lang, sentence):\n",
+ " indexes = indexesFromSentence(lang, sentence)\n",
+ " indexes.append(EOS_token)\n",
+ " return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)\n",
+ "\n",
+ "\n",
+ "def tensorsFromPair(pair):\n",
+ " input_tensor = tensorFromSentence(input_lang, pair[0])\n",
+ " target_tensor = tensorFromSentence(output_lang, pair[1])\n",
+ " return (input_tensor, target_tensor)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Training The Model\n",
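+ "To train we run the input sentence through the encoder, and keep track of every output and the latest hidden state. Then the decoder is given the `<SOS>` token as its first input, and the last hidden state of the encoder as its first hidden state. With probability `teacher_forcing_ratio` the decoder is fed the real target outputs as its next inputs (teacher forcing) instead of its own predictions; turn `teacher_forcing_ratio` up to use more of it."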
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "teacher_forcing_ratio = 0.5\n",
+ "\n",
+ "\n",
+ "def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):\n",
+ " encoder_hidden = encoder.initHidden()\n",
+ "\n",
+ " encoder_optimizer.zero_grad()\n",
+ " decoder_optimizer.zero_grad()\n",
+ "\n",
+ " input_length = input_tensor.size(0)\n",
+ " target_length = target_tensor.size(0)\n",
+ "\n",
+ " encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)\n",
+ "\n",
+ " loss = 0\n",
+ "\n",
+ " for ei in range(input_length):\n",
+ " encoder_output, encoder_hidden = encoder(\n",
+ " input_tensor[ei], encoder_hidden)\n",
+ " encoder_outputs[ei] = encoder_output[0, 0]\n",
+ "\n",
+ " decoder_input = torch.tensor([[SOS_token]], device=device)\n",
+ "\n",
+ " decoder_hidden = encoder_hidden\n",
+ "\n",
+ " use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False\n",
+ "\n",
+ " if use_teacher_forcing:\n",
+ " # Teacher forcing: Feed the target as the next input\n",
+ " for di in range(target_length):\n",
+ " decoder_output, decoder_hidden, decoder_attention = decoder(\n",
+ " decoder_input, decoder_hidden, encoder_outputs)\n",
+ " loss += criterion(decoder_output, target_tensor[di])\n",
+ " decoder_input = target_tensor[di] # Teacher forcing\n",
+ "\n",
+ " else:\n",
+ " # Without teacher forcing: use its own predictions as the next input\n",
+ " for di in range(target_length):\n",
+ " decoder_output, decoder_hidden, decoder_attention = decoder(\n",
+ " decoder_input, decoder_hidden, encoder_outputs)\n",
+ " topv, topi = decoder_output.topk(1)\n",
+ " decoder_input = topi.squeeze().detach() # detach from history as input\n",
+ "\n",
+ " loss += criterion(decoder_output, target_tensor[di])\n",
+ " if decoder_input.item() == EOS_token:\n",
+ " break\n",
+ "\n",
+ " loss.backward()\n",
+ "\n",
+ " encoder_optimizer.step()\n",
+ " decoder_optimizer.step()\n",
+ "\n",
+ " return loss.item() / target_length"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Helper Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "import math\n",
+ "\n",
+ "\n",
+ "def asMinutes(s):\n",
+ " m = math.floor(s / 60)\n",
+ " s -= m * 60\n",
+ " return '%dm %ds' % (m, s)\n",
+ "\n",
+ "\n",
+ "def timeSince(since, percent):\n",
+ " now = time.time()\n",
+ " s = now - since\n",
+ " es = s / (percent)\n",
+ " rs = es - s\n",
+ " return '%s (- %s)' % (asMinutes(s), asMinutes(rs))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The whole training process looks like this:\n",
+ "\n",
+ "- Start a timer\n",
+ "- Initialize optimizers and criterion\n",
+ "- Create set of training pairs\n",
+ "- Start empty losses array for plotting"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):\n",
+ " start = time.time()\n",
+ " plot_losses = []\n",
+ " print_loss_total = 0 # Reset every print_every\n",
+ " plot_loss_total = 0 # Reset every plot_every\n",
+ "\n",
+ " encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)\n",
+ " decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)\n",
+ " training_pairs = [tensorsFromPair(random.choice(pairs))\n",
+ " for i in range(n_iters)]\n",
+ " criterion = nn.NLLLoss()\n",
+ "\n",
+ " for iter in range(1, n_iters + 1):\n",
+ " training_pair = training_pairs[iter - 1]\n",
+ " input_tensor = training_pair[0]\n",
+ " target_tensor = training_pair[1]\n",
+ "\n",
+ " loss = train(input_tensor, target_tensor, encoder,\n",
+ " decoder, encoder_optimizer, decoder_optimizer, criterion)\n",
+ " print_loss_total += loss\n",
+ " plot_loss_total += loss\n",
+ "\n",
+ " if iter % print_every == 0:\n",
+ " print_loss_avg = print_loss_total / print_every\n",
+ " print_loss_total = 0\n",
+ " print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),\n",
+ " iter, iter / n_iters * 100, print_loss_avg))\n",
+ "\n",
+ " if iter % plot_every == 0:\n",
+ " plot_loss_avg = plot_loss_total / plot_every\n",
+ " plot_losses.append(plot_loss_avg)\n",
+ " plot_loss_total = 0\n",
+ "\n",
+ " showPlot(plot_losses)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Plotting Result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def showPlot(points):\n",
+ " plt.figure()\n",
+ " fig, ax = plt.subplots()\n",
+ " # this locator puts ticks at regular intervals\n",
+ " loc = ticker.MultipleLocator(base=0.2)\n",
+ " ax.yaxis.set_major_locator(loc)\n",
+ " plt.plot(points)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Evaluation\n",
+ "Evaluation is mostly the same as training, but there are no targets so we simply feed the decoder’s predictions back to itself for each step. Every time it predicts a word we add it to the output string, and if it predicts the EOS token we stop there. We also store the decoder’s attention outputs for display later."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):\n",
+ " with torch.no_grad():\n",
+ " input_tensor = tensorFromSentence(input_lang, sentence)\n",
+ " input_length = input_tensor.size()[0]\n",
+ " encoder_hidden = encoder.initHidden()\n",
+ "\n",
+ " encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)\n",
+ "\n",
+ " for ei in range(input_length):\n",
+ " encoder_output, encoder_hidden = encoder(input_tensor[ei],\n",
+ " encoder_hidden)\n",
+ " encoder_outputs[ei] += encoder_output[0, 0]\n",
+ "\n",
+ " decoder_input = torch.tensor([[SOS_token]], device=device) # SOS\n",
+ "\n",
+ " decoder_hidden = encoder_hidden\n",
+ "\n",
+ " decoded_words = []\n",
+ " decoder_attentions = torch.zeros(max_length, max_length)\n",
+ "\n",
+ " for di in range(max_length):\n",
+ " decoder_output, decoder_hidden, decoder_attention = decoder(\n",
+ " decoder_input, decoder_hidden, encoder_outputs)\n",
+ " decoder_attentions[di] = decoder_attention.data\n",
+ " topv, topi = decoder_output.data.topk(1)\n",
+ " if topi.item() == EOS_token:\n",
+ " decoded_words.append('eng->Indonesian.
+- Do preprocessing:
+ - Read text file and split into lines, split lines into pairs
+ - Normalize text, filter by length and content
+ - Make word lists from sentences in pairs
+- Transform dataset into input_lang for encoder, output_lang for decoder, and pairs for training model.
+
+
+## Step 3: The Seq2Seq Model
+- Create EncoderRNN: The encoder of a seq2seq network is an RNN that outputs some value for every word from the input sentence. For every input word the encoder outputs a vector and a hidden state, and uses the hidden state for the next input word.
+
+
+
+- Create DecoderRNN: The decoder is another RNN that takes the encoder output vector(s) and outputs a sequence of words to create the translation
+
+
+
+- Create Attention Decoder: Attention allows the decoder network to “focus” on a different part of the encoder’s outputs for every step of the decoder’s own outputs.
+
+
+
+## Step 4: Training
+- To train we run the input sentence through the encoder, and keep track of every output and the latest hidden state. Then the decoder is given the `<SOS>` token as its first input, and the last hidden state of the encoder as its first hidden state.
+
+## Step 5: Evaluation
+- Evaluation is mostly the same as training, but there are no targets so we simply feed the decoder’s predictions back to itself for each step.
+- The results look as follows:
+```
+> saya malu telah melakukan hal tersebut .
+= i am ashamed of having done so .
+< i am ashamed of having done so . Special thanks to Jassu Ilama for the beautiful 3D modelling and material design of this avatar.
+ +
+
+#### Supported Environments
+
+| | |
+|-------------------------|-----------------------------------------|
+| **Operating systems** | Linux |
+| **Python versions** | Python 3.x (64-bit) |
+| **Distros** | KDE neon, elementary OS, Ubuntu |
+| **Package managers** | APT, pip |
+| **Languages** | English |
+| **System requirements** | preferably a [CUDA supported GPU](https://www.geforce.com/hardware/technology/cuda/supported-gpus), 2GB of free RAM |
+| | |
+
+### Installation
+
+Download the [latest release](https://github.com/DragonComputer/Dragonfire/releases/latest) (the `.deb` file) and:
+
+```Shell
+sudo dpkg -i dragonfire_1.0.0_amd64.deb
+```
+
+or with Docker: `docker pull dragoncomputer/dragonfire`
+
+or simply: `sudo ./install.sh`
+
+To install the dependencies run `sudo apt-get -f install` right after the `dpkg -i` command. The installation will automatically download the pre-trained English model of Mozilla DeepSpeech (1.31 GB download size) and will place it under `/usr/share/dragonfire/deepspeech/models` directory. You can manually [download the model](https://github.com/mozilla/DeepSpeech/releases/download/v0.1.1/deepspeech-0.1.1-models.tar.gz) if you want.
+
+### Usage
+
+```
+usage: dragonfire [-h] [-c] [-s] [-j] [-v] [-g] [--server API_KEY] [-p PORT]
+ [--version]
+
+optional arguments:
+ -h, --help show this help message and exit
+ -c, --cli Command-line interface mode. Give commands to
+ Dragonfire via command-line inputs (keyboard) instead
+ of audio inputs (microphone).
+ -s, --silent Silent mode. Disable Text-to-Speech output. Dragonfire
+ won't generate any audio output.
+ -j, --headless Headless mode. Do not display an avatar animation on
+ the screen. Disable the female head model.
+ -v, --verbose Increase verbosity of log output.
+ -g, --gspeech Instead of using the default speech recognition
+ method(Mozilla DeepSpeech), use Google Speech
+ Recognition service. (more accurate results)
+ --server API_KEY Server mode. Disable any audio functionality, serve a
+ RESTful spaCy API and become a Twitter integrated
+ chatbot.
+ -p PORT, --port PORT Port number for server mode.
+ --version Display the version number of Dragonfire.
+```
+
+or with Docker: `docker run dragonfire [-h] [-c] [-s] [-j] [-v] [-g] [--server API_KEY] [-p PORT]`
+
+or simply start from your Linux application launcher.
+
+
+
+We also have a collective to which you can donate:
+
+
+
+
+
+You may also want to become a [backer](https://opencollective.com/dragonfire) or a [sponsor](https://opencollective.com/dragonfire):
diff --git a/iso/dragonfire-master/debian/changelog b/iso/dragonfire-master/debian/changelog
new file mode 100644
index 00000000..4fb6c649
--- /dev/null
+++ b/iso/dragonfire-master/debian/changelog
@@ -0,0 +1,58 @@
+dragonfire (1.0.0) unstable; urgency=medium
+
+ * Post-installation scripts are fixed
+ * An API implemented
+ * The project is turned into more of a server-side application although it's continuing to preserve the features of its desktop application aspect
+ * The package is now Python 3 only
+ * Universal Python wheel generation disabled
+ * Basic Analyzer (done in dragonfire/__init__.py) is heavily pruned and improved. Now it's purely spaCy based
+ * Omniscient and Learner classes are refactored
+ * Learning ability now can store data (that comes from many different Android users) on MySQL database seamlessly
+ * Gspeech alternative added
+
+ -- Mehmet Mert Yildiran
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+