# Install the required libraries (one-time setup; the "!pip" form is for notebook cells)
# !pip install tweet-preprocessor
# !pip install textblob
# !pip install wordcloud
# !pip install Sastrawi
# !pip install nltk
# !pip install googletrans==4.0.0-rc1
# # c:\programdata\anaconda3\lib\site-packages (3.8.1)
# !pip install scikit-learn
# Announce start-up, then name the CSV dataset that the rest of the script loads.
print('Start...')
NF = 'dataset6.csv'
print(f'Start...{NF}')
#Import Library
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import preprocessor as p
from textblob import TextBlob
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from googletrans import Translator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score
nltk.download('punkt')
from wordcloud import WordCloud, STOPWORDS
# Slang / abbreviation -> standard word table consumed by lemhatizer().
# Every key AND every value is padded with exactly one space on each side, so a
# plain str.replace() only hits whole, space-delimited words and the replacement
# never gets glued onto the neighbouring word.
norm = {
    ' bgt ': ' banget ',
    ' gak ': ' tidak ',
    ' bs ': ' bisa ',
    ' emg ': ' memang ',
    ' yg ': ' yang ',
    ' g ': ' tidak ',      # was ' tidak' (no trailing space): "a g b" became "a tidakb"
    ' udah ': ' sudah ',   # key was ' udah' (no trailing space): matched word prefixes
    ' gini ': ' begini ',
    ' d ': ' di ',
    ' ad ': ' ada ',
    ' bhw ': ' bahwa ',
    ' tp ': ' tetapi ',
    ' sy ': ' saya ',
    ' ga ': ' tidak ',
    ' bkt ': ' bukti ',
    ' jt ': ' juta ',
    ' ajah ': ' saja ',
    ' gw ': ' saya ',
}
def clean_twitter_text(text: str) -> str:
    """Strip Twitter-specific noise from a tweet and return plain text.

    Removes, in this order: ``@mentions``, ``#hashtags``, the retweet marker
    ``RT`` (only as a standalone word followed by whitespace), ``http(s)``
    URLs, and every character that is not an ASCII letter, digit or space.
    Whitespace runs are then collapsed to one space and the ends trimmed.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text; an empty string if nothing is left.
    """
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#\w+', '', text)
    # \b keeps the retweet marker from matching inside ordinary words:
    # without it "START now" was turned into "STAnow".
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)
    return re.sub(r'\s+', ' ', text).strip()
def lemhatizer(str_text: str, mapping=None) -> str:
    """Replace slang/abbreviated words with their standard form.

    The replacement table maps an abbreviation to its expansion. Keys and
    values may be padded with spaces (as in the module-level ``norm`` table);
    the padding is ignored and matching is done on whole words instead, so
    words at the very start or end of the text and runs of adjacent
    abbreviations ("gak gak") are all normalised, and a replacement can never
    be glued onto the following word.

    Args:
        str_text: Text to normalise (matching is case-sensitive).
        mapping: Optional ``{abbreviation: expansion}`` dict. Defaults to the
            module-level ``norm`` table.

    Returns:
        The text with every whole-word abbreviation replaced.
    """
    table = norm if mapping is None else mapping
    replacements = {key.strip(): value.strip() for key, value in table.items()}
    replacements.pop('', None)  # an empty key would match at every position
    if not replacements:
        return str_text
    # Longest alternatives first so a longer abbreviation is always tried
    # before a shorter one that happens to be its prefix.
    alternatives = '|'.join(
        re.escape(word) for word in sorted(replacements, key=len, reverse=True)
    )
    # Look-arounds (rather than consuming the surrounding spaces) let two
    # neighbouring abbreviations share the single space between them.
    pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
    return pattern.sub(lambda match: replacements[match.group(0)], str_text)
def stopword(str_text):
    """Return *str_text* after passing it through the module-level
    ``stop_words_remover_new`` stop-word remover."""
    return stop_words_remover_new.remove(str_text)
# Fixed words that clean_string() always drops (matched case-insensitively).
_WORDS_TO_REMOVE = (
    "jokowi", "hari", "anies", "dgn", "dengan", "atau", "org", "orang", "utk",
    " nya", "pak", "djarot", "aku", "saya", "jkw", "prabowo", "jkt", "jakarta",
    "pks", "pdip", "sama",
)
# Numbers written out as words, also dropped by clean_string().
_NUMBER_WORDS = (
    "satu", "dua", "tiga", "empat", "lima", "enam", "tujuh", "delapan",
    "sembilan", "nol", "ribu", "juta",
)
# Compiled once at import time instead of on every call.
_WORDS_TO_REMOVE_RE = re.compile(
    r'\b(?:' + '|'.join(_WORDS_TO_REMOVE) + r')\b', re.IGNORECASE
)
_NUMBER_WORDS_RE = re.compile(
    r'\b(?:' + '|'.join(_NUMBER_WORDS) + r')\b', re.IGNORECASE
)


def clean_string(text: str) -> str:
    """Remove filler words, numbers and very short tokens from *text*.

    Steps: normalise whitespace, drop the fixed word list and the
    number-words (whole words, case-insensitive), strip all digits, then keep
    only tokens that are at least 3 characters long.

    Args:
        text: Text to clean.

    Returns:
        The remaining tokens joined by single spaces (possibly empty).
    """
    # Trim and collapse whitespace first: the " nya" entry only matches when
    # it is preceded by a literal space.
    text = re.sub(r'\s+', ' ', text.strip())
    text = _WORDS_TO_REMOVE_RE.sub('', text)
    text = _NUMBER_WORDS_RE.sub('', text)
    # Digits are stripped BEFORE the length filter; doing it afterwards let
    # tokens such as "ab12" through the filter and then left the stub "ab".
    text = re.sub(r'\d+', '', text)
    # split()/join also collapses the gaps left by the removals above.
    return ' '.join(word for word in text.split() if len(word) >= 3)
def stemming(text_cleaning):
    """Stem every word of *text_cleaning* and return the cleaned result.

    A Sastrawi stemmer is created, each whitespace-separated word is stemmed
    and then passed through ``stopword()``; words that come back empty are
    dropped. The survivors are joined with single spaces and the string is
    finally run through ``clean_string()``.
    """
    stemmer = StemmerFactory().create_stemmer()
    # Lazily stem + stop-word-filter one word at a time, in input order.
    processed = (stopword(stemmer.stem(token)) for token in text_cleaning.split())
    kept = [token for token in processed if token]
    return clean_string(" ".join(kept))
def plot_cloud(wordcloud):
    """Show *wordcloud* as an image on a 10x8 figure with the axes hidden."""
    _, axes = plt.subplots(figsize=(10, 8))
    axes.imshow(wordcloud, interpolation='bilinear')
    axes.axis("off")
    plt.show()
# Load the dataset named by NF and draw a word cloud from its 'text' column.
dataNorm = pd.read_csv(NF)
# One long string made of every row's text (column is named 'text', not 'full_text').
all_words = ' '.join(dataNorm['text'].astype(str))
cloud_options = dict(
    width=3000,
    height=2000,
    random_state=3,
    background_color='black',
    colormap='RdPu',
    collocations=False,
    stopwords=STOPWORDS,
)
wordcloud = WordCloud(**cloud_options).generate(all_words)
plot_cloud(wordcloud)
# ---------------------------------------------------------------------------
# Naive Bayes: TF-IDF features -> MultinomialNB -> metrics -> confusion matrix
# ---------------------------------------------------------------------------
data = dataNorm
# Make sure the text column is a string column.
data['full_text'] = data['text'].astype(str)  # full_text
# Drop rows whose label is the placeholder "Tidak Ada Data".
data = data[data['klasifikasi'] != 'Tidak Ada Data']
# Pre-processing: turn the text into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['full_text'])
y = data['klasifikasi']
# Split into training and test data (80/20, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# -----------------------------------------------------
# Train the Naive Bayes model.
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)
# Predict the labels of the test data.
y_pred = model_nb.predict(X_test)
# Weighted averages; zero_division=0 scores classes with no predictions as 0
# instead of emitting a warning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print(f"Akurasi: {accuracy * 100:.2f}%")
print(f"Presisi: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
print("\nLaporan Klasifikasi NB:")
print(classification_report(y_test, y_pred, zero_division=0))
# F1 score again after the report. The value computed above is reused: the
# previous re-computation omitted zero_division and could only add a warning.
print(f"F1 Score: {f1 * 100:.2f}%")
# -----------------------------------------------------
# Confusion matrix.
# The matrix only has rows/columns for labels that occur in y_test or y_pred,
# which can be fewer than model_nb.classes_ when a class is missing from the
# test split. Using one explicit label list for both the matrix and the
# display keeps their sizes in sync so the plot cannot fail on a mismatch.
cm_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
print(cm)
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
# Show the confusion matrix as a chart.
cmd.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix NBC')
plt.show()
# ---------------------------------------------------------------------------
# SVM: TF-IDF features -> linear SVC -> metrics -> confusion matrix
# (the data preparation is repeated here so this section is self-contained)
# ---------------------------------------------------------------------------
data = dataNorm
# Make sure the text column is a string column.
data['full_text'] = data['text'].astype(str)  # full_text
# Drop rows whose label is the placeholder "Tidak Ada Data".
data = data[data['klasifikasi'] != 'Tidak Ada Data']
# Pre-processing: turn the text into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['full_text'])
y = data['klasifikasi']
# Split into training and test data (80/20, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# -----------------------------------------------------
# Train the SVM model.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
# Weighted averages; zero_division=0 scores classes with no predictions as 0
# instead of emitting a warning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print(f'Akurasi model SVM: {accuracy * 100:.2f}%')
print(f"Presisi: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
print("\nLaporan Klasifikasi SVM:")
print(classification_report(y_test, y_pred, zero_division=0))
# F1 score printed once more after the report.
print(f"F1 Score: {f1 * 100:.2f}%")
# -----------------------------------------------------
# Confusion matrix.
# The matrix only has rows/columns for labels that occur in y_test or y_pred,
# which can be fewer than svm_model.classes_ when a class is missing from the
# test split. Using one explicit label list for both the matrix and the
# display keeps their sizes in sync so the plot cannot fail on a mismatch.
cm_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
print(cm)
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
# Show the confusion matrix as a chart.
cmd.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix SVM')
plt.show()
# Sentence test for NB and SVM
# Alternative (already stemmed) test sentence, kept for reference:
#   "bhima yudhistira harap kinerja tim ekonomi amin iklim investasi"
new_string0 = "kami berharap akan kinerjanya bertim perekonomian mengaminkan beriklim investasinya"
new_string = clean_twitter_text(new_string0)
new_string = new_string.lower()  # case folding
# Build the Sastrawi stop-word remover; stopword() reads it as a module-level name.
stop_words = StopWordRemoverFactory().get_stop_words()
new_array = ArrayDictionary(stop_words)
stop_words_remover_new = StopWordRemover(new_array)
new_string = lemhatizer(new_string)
# One stop-word pass is enough: stopword() wraps stop_words_remover_new.remove,
# so the former second, direct call repeated exactly the same operation.
new_string = stopword(new_string)
new_string = stemming(new_string)
# Vectorise with the already-fitted TF-IDF vocabulary and classify with both models.
new_string_vector = vectorizer.transform([new_string])
new_prediction1 = model_nb.predict(new_string_vector)
new_prediction2 = svm_model.predict(new_string_vector)
print(new_string0)
print(new_string)
print(f'Kategori untuk Kalimat Uji / NB: {new_prediction1[0]}')
print(f'Kategori untuk Kalimat Uji / SVM: {new_prediction2[0]}')
# (Blog page footer captured together with the code: "Tidak ada komentar:" / "Posting Komentar")