# Selasa, 20 Agustus 2024 (Tuesday, 20 August 2024)
#
# GENERATE TEXTBLOB

#Install Library

# !pip install tweet-preprocessor
# !pip install textblob
# !pip install wordcloud
# !pip  install Sastrawi
# !pip install nltk  
# !pip install tweet-preprocessor
# !pip install googletrans==4.0.0-rc1
# # c:\programdata\anaconda3\lib\site-packages (3.8.1)
#pip install scikit-learn

# Progress marker printed before the (slow) library imports below.
print('Start...')

#Import Library
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import preprocessor as p
from textblob import TextBlob
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from googletrans import Translator
 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
nltk.download('punkt')

import re
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary

# Slang-normalisation dictionary: informal Indonesian abbreviation -> standard
# word. Keys are padded with spaces so that str.replace only hits whole words;
# every value is padded on BOTH sides so the replacement is never glued onto
# the following word (the value for ' g ' used to lack its trailing space,
# turning "makan g enak" into "makan tidakenak").
# NOTE(review): the key ' udah' has no trailing space, so it also matches the
# prefix of longer words (e.g. "udahan"). Left unchanged because it is also the
# only form that matches "udah" at the very end of a stripped string.
norm = {
    ' bgt ': ' banget ', ' gak ': ' tidak ', ' bs ': ' bisa ', ' emg ': ' memang ',
    ' yg ': ' yang ', ' g ': ' tidak ', ' udah': ' sudah ', ' gini ': ' begini ',
    ' d ': ' di ', ' ad ': ' ada ', ' bhw ': ' bahwa ', ' tp ': ' tetapi ',
    ' sy ': ' saya ', ' ga ': ' tidak ', ' bkt ': ' bukti ', ' jt ': ' juta ',
    ' ajah ': ' saja ', ' gw ': ' saya '
}

# Normalisation, stop-word removal and stemming combined in a single function.
def process_text(text):
    """Clean one tweet and return its normalised, stop-word-free, stemmed form.

    Steps, in order:
      1. Coerce non-string input with ``str()``.
      2. Drop @mentions, #hashtags, the retweet marker ``RT`` and http(s) URLs.
      3. Keep only ASCII letters, digits and spaces, collapse whitespace, and
         lower-case the result.
      4. Replace slang using the module-level ``norm`` dictionary.
      5. Remove stop words with the module-level ``stop_words_remover_new``.
      6. Stem with the module-level ``stemmer``.

    Parameters:
        text: the raw tweet; any non-string value is converted with ``str()``.

    Returns:
        The processed text as a string.
    """
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#\w+', '', text)
    # \b keeps the retweet marker from eating the tail of ordinary words
    # ("ALERT banjir" used to become "ALEbanjir").
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    # Turn newlines/tabs into spaces BEFORE the character filter; otherwise the
    # filter deletes them and fuses the words on either side.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)
    text = re.sub(r'\s+', ' ', text).strip().lower()

    # The dictionary keys are space-delimited, so pad the text with one space
    # on each side to let the first and last word match as well.
    padded = f' {text} '
    for key, value in norm.items():
        padded = padded.replace(key, value)
    text = ' '.join(padded.split())

    # Remove the configured stop words, then stem what is left.
    without_stop_words = stop_words_remover_new.remove(text)
    return stemmer.stem(without_stop_words)

# clean_string is applied at the end of the pipeline as an extra clean-up pass.
def clean_string(text):
    """Remove domain-specific noise words, numbers and very short tokens.

    Steps, in order:
      1. Trim and collapse whitespace.
      2. Delete the listed names and filler words (case-insensitive, whole words).
      3. Delete Indonesian number words (case-insensitive, whole words).
      4. Delete every digit, whether stand-alone or embedded in a token.
      5. Drop tokens shorter than three characters.

    Digits are stripped before the length filter so that a token such as
    "ab12" cannot slip through the filter and end up as the two-letter "ab".

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string, tokens separated by single spaces.
    """
    text = re.sub(r'\s+', ' ', text.strip())

    # Names and filler words to drop.
    words_to_remove = ["ahok", "hari", "anies","dgn","dengan","atau","org","orang","utk"," nya","pak","atau","djarot","aku","saya","jkw","prabowo","jkt","jakarta","pks","pdip","sama"]
    pattern = re.compile(r'\b(' + '|'.join(words_to_remove) + r')\b', re.IGNORECASE)
    text = pattern.sub('', text)

    # Numbers written out as words.
    numbers_in_words = ["satu", "dua", "tiga", "empat", "lima", "enam", "tujuh", "delapan", "sembilan", "nol","ribu","juta"]
    pattern = re.compile(r'\b(' + '|'.join(numbers_in_words) + r')\b', re.IGNORECASE)
    text = pattern.sub('', text)

    # All digits, stand-alone or embedded; must happen before the length filter.
    text = re.sub(r'\d+', '', text)

    # Keep only tokens of at least three characters; split/join also
    # normalises the whitespace left behind by the removals above.
    return ' '.join(word for word in text.split() if len(word) >= 3)

# Build the stop-word remover: start from Sastrawi's stop-word list and extend
# it with a few project-specific words (the list below contains repeated
# entries). process_text() reads `stop_words_remover_new` as a module global.
more_stop_words =["tidak","buk","tidak","tidak"]
stop_words = StopWordRemoverFactory().get_stop_words()
stop_words.extend(more_stop_words)
new_array = ArrayDictionary(stop_words)
stop_words_remover_new = StopWordRemover(new_array)

# Build the stemmer once; process_text() reads `stemmer` as a module global.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

#============================================================
# Read the dataset and keep only the 'text' column.
df = pd.read_csv("cthdata.csv")
df = df[['text']]
print(df.head())
print(df.shape)

# Replace NaN with an empty string and force every value to str.
df['text'] = df['text'].fillna('').astype(str)

# Run the whole text pipeline: process_text() first, then clean_string().
df['text'] = df['text'].apply(lambda x: clean_string(process_text(x)))

# Save the result (only the 'text' column, without the index).
df['text'].to_csv("DataStemFilter.csv", index=False)
print('Sukses normalisasi data...')
import time        
from googletrans import Translator
from textblob import TextBlob
import pandas as pd

def translate_with_retry(text, src='id', dest='en', max_retries=3,
                         translator=None, retry_delay=1):
    """Translate *text*, retrying a limited number of times on failure.

    The previous version referenced a global ``translator`` that is never
    defined at module level, so every attempt raised NameError and the
    function always returned None. The translator is now created here when
    the caller does not supply one.

    Parameters:
        text: the text to translate.
        src: source language code.
        dest: destination language code.
        max_retries: maximum number of attempts.
        translator: optional object exposing
            ``translate(text, src=..., dest=...)`` whose result has a ``.text``
            attribute; a new ``Translator()`` is created when omitted.
        retry_delay: seconds to wait between failed attempts.

    Returns:
        The translated text, or None when every attempt failed.
    """
    if translator is None:
        translator = Translator()

    for attempt in range(max_retries):
        try:
            return translator.translate(text, src=src, dest=dest).text
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            # Only wait when another attempt will actually follow.
            if attempt + 1 < max_retries:
                time.sleep(retry_delay)
    return None

def translate_simple(tweet):
    """Translate *tweet* from Indonesian ('id') to English ('en') in one attempt.

    A new Translator is created on every call. Any exception raised by the
    translation is printed and swallowed.

    Returns:
        The translated text, or None when the translation raised.
    """
    client = Translator()
    try:
        result = client.translate(tweet, src='id', dest='en')
        return result.text
    except Exception as err:
        print(f"Terjadi kesalahan dalam proses terjemahan: {err}")
    return None

# Load the data that has already been normalised (written to
# DataStemFilter.csv by the step above).
data = pd.read_csv('DataStemFilter.csv')
data_tweet = list(data['text'])
polaritas = 0          # running sum of all polarity scores
status = []            # sentiment label per row
full_text2 = []        # English translation per row (None when unavailable)
polarity = []          # polarity score per row (0 when unavailable)
total_positif = total_negatif = total_netral = total = 0

for i, tweet in enumerate(data_tweet):
    # Make sure the element is a string: a float NaN (how pandas reads an
    # empty CSV cell) becomes "", any other non-string goes through str().
    if isinstance(tweet, float):
        tweet = str(tweet) if not pd.isna(tweet) else ""
    elif not isinstance(tweet, str):
        tweet = str(tweet)
    
    # Only analyse non-empty tweets.
    if tweet:
        translated_text = translate_simple(tweet)
        # Alternatives that were tried here: calling translator.translate()
        # directly (optionally with timeout=30), or translate_with_retry(tweet).
        
        # If the translation succeeded, score it; otherwise store defaults.
        if translated_text:
            full_text2.append(translated_text)
            # Build a TextBlob from the translated text and read its polarity.
            analysis = TextBlob(str(translated_text))  # tweet
            polar = analysis.polarity
            polaritas += polar
            polarity.append(polar)
            # Label: > 0 is 'Positif', exactly 0 is 'Netral', otherwise 'Negatif'.
            if analysis.sentiment.polarity > 0.0:
                total_positif += 1
                status.append('Positif')
            elif analysis.sentiment.polarity == 0.0:
                total_netral += 1
                status.append('Netral')
            else:
                total_negatif += 1
                status.append('Negatif')
        else:
            # Translation failed (or came back empty): store default values.
            full_text2.append(None)
            polarity.append(0)
            status.append('Tidak Ada Data')
    else:
        # Empty tweet: store default values.
        full_text2.append(None)
        polarity.append(0)
        status.append('Tidak Ada Data')
    
    # Counts every row, including those labelled 'Tidak Ada Data'.
    total += 1

# Make sure every list is as long as the DataFrame. Every loop iteration above
# appends exactly one entry to each list, so this is only a safety net.
if len(full_text2) < len(data):
    missing_entries = len(data) - len(full_text2)
    full_text2.extend([None] * missing_entries)
    polarity.extend([0] * missing_entries)
    status.extend(['Tidak Ada Data'] * missing_entries)

# Add the translation, polarity and label ('klasifikasi') columns.
data['translate'] = full_text2
data['polarity'] = polarity
data['klasifikasi'] = status

# Print the summary and write the labelled dataset.
print(f'Hasil Analisis Data:\nPositif = {total_positif}\nNetral = {total_netral}\nNegatif = {total_negatif}')
print(f'\nTotal Data : {total}')
data.to_csv("DataAutoKlasifikasi.csv", index=False)
print('Sukses update dataset...')


import seaborn as sns
# Apply seaborn's default theme and collect the three category totals counted
# in the labelling loop above ('Tidak Ada Data' rows are not included).
sns.set_theme()
labels = ['Positif', 'Netral', 'Negatif']
counts = [total_positif, total_netral, total_negatif]

def show_bar(labels, counts):
    """Show a seaborn bar plot of *counts* (y axis) against *labels* (x axis).

    Opens a new 8x6 matplotlib figure, titles it 'Distribusi Sentimen' and
    displays it with ``plt.show()``.
    """
    figure_size = (8, 6)

    sns.set_theme()
    plt.figure(figsize=figure_size)
    sns.barplot(x=labels, y=counts)

    # Axis captions and title.
    plt.ylabel('Jumlah')
    plt.xlabel('Kategori Sentimen')
    plt.title('Distribusi Sentimen')

    plt.show()

# Call the function to display the bar plot.
show_bar(labels, counts)


import pandas as pd
# Reload the labelled dataset written above and print it for inspection.
print('Isi dataset yang terbaru....')
dataNorm = pd.read_csv('DataAutoKlasifikasi.csv')
print(dataNorm)

 

# NB KNN NLP

#Install Library

# !pip install tweet-preprocessor
# !pip install textblob
# !pip install wordcloud
# !pip  install Sastrawi
# !pip install nltk  
# !pip install tweet-preprocessor
# !pip install googletrans==4.0.0-rc1
# # c:\programdata\anaconda3\lib\site-packages (3.8.1)
#pip install scikit-learn

# Progress marker, then the name of the labelled CSV used by the classifiers
# below (read later via pd.read_csv(NF)).
print('Start...')
NF='dataset6.csv'
print('Start...'+NF)
#Import Library
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import preprocessor as p
from textblob import TextBlob
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from googletrans import Translator
 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB 
from sklearn.svm import SVC  
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score 
nltk.download('punkt')
from wordcloud import WordCloud, STOPWORDS



# Slang-normalisation dictionary: informal Indonesian abbreviation -> standard
# word. Keys are padded with spaces so that str.replace only hits whole words;
# every value is padded on BOTH sides so the replacement is never glued onto
# the following word (the value for ' g ' used to lack its trailing space,
# turning "makan g enak" into "makan tidakenak").
# NOTE(review): the key ' udah' has no trailing space, so it also matches the
# prefix of longer words (e.g. "udahan"). Left unchanged because it is also the
# only form that matches "udah" at the very end of a stripped string.
norm = {
    ' bgt ': ' banget ',
    ' gak ': ' tidak ',
    ' bs ': ' bisa ',
    ' emg ': ' memang ',
    ' yg ': ' yang ',
    ' g ': ' tidak ',
    ' udah': ' sudah ',
    ' gini ': ' begini ',
    ' d ': ' di ',
    ' ad ': ' ada ',
    ' bhw ': ' bahwa ',
    ' tp ': ' tetapi ',
    ' sy ': ' saya ',
    ' ga ': ' tidak ',
    ' bkt ': ' bukti ',
    ' jt ': ' juta ',
    ' ajah ': ' saja ',
    ' gw ': ' saya ',
}
def clean_twitter_text(text):
    """Strip Twitter-specific noise and punctuation from *text*.

    Removes @mentions, #hashtags, the retweet marker ``RT`` and http(s) URLs,
    then keeps only ASCII letters, digits and single spaces. Case is left
    untouched.

    Parameters:
        text: the raw tweet (must be a str).

    Returns:
        The cleaned text, trimmed, with single spaces between tokens.
    """
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#\w+', '', text)
    # \b keeps the retweet marker from eating the tail of ordinary words
    # ("ALERT banjir" used to become "ALEbanjir").
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    # Turn newlines/tabs into spaces BEFORE the character filter; otherwise the
    # filter deletes them and fuses the words on either side.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)
    return re.sub(r'\s+', ' ', text).strip()
 
def lemhatizer(str_text, mapping=None):
    """Replace slang words in *str_text* using a space-delimited mapping.

    The mapping keys are padded with spaces (e.g. ``' gak '``), so the text is
    padded with one space on each side before replacing; without that, a slang
    word in the first or last position would never match.

    Parameters:
        str_text: the text to normalise.
        mapping: optional dict of ``' slang ' -> ' standard '`` pairs; the
            module-level ``norm`` dictionary is used when omitted.

    Returns:
        The normalised text, trimmed, with single spaces between tokens.
    """
    if mapping is None:
        mapping = norm

    padded = f' {str_text} '
    for slang, standard in mapping.items():
        padded = padded.replace(slang, standard)

    # split/join removes the padding and any doubled spaces left by replacing.
    return ' '.join(padded.split())


def stopword(str_text):
    """Return *str_text* after passing it through the module-level
    ``stop_words_remover_new.remove``."""
    return stop_words_remover_new.remove(str_text)


def clean_string(text):
    """Remove domain-specific noise words, numbers and very short tokens.

    Steps, in order:
      1. Trim and collapse whitespace.
      2. Delete the listed names and filler words (case-insensitive, whole words).
      3. Delete Indonesian number words (case-insensitive, whole words).
      4. Delete every digit, whether stand-alone or embedded in a token.
      5. Drop tokens shorter than three characters.

    Digits are stripped before the length filter so that a token such as
    "ab12" cannot slip through the filter and end up as the two-letter "ab".

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string, tokens separated by single spaces.
    """
    # Trim leading/trailing whitespace and collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text.strip())

    # Names and filler words to drop.
    words_to_remove = ["jokowi", "hari", "anies","dgn","dengan","atau","org","orang","utk"," nya","pak","atau","djarot","aku","saya","jkw","prabowo","jkt","jakarta","pks","pdip","sama"]
    pattern = re.compile(r'\b(' + '|'.join(words_to_remove) + r')\b', re.IGNORECASE)
    text = pattern.sub('', text)

    # Numbers written out as words.
    numbers_in_words = ["satu", "dua", "tiga", "empat", "lima", "enam", "tujuh", "delapan", "sembilan", "nol","ribu","juta"]
    pattern = re.compile(r'\b(' + '|'.join(numbers_in_words) + r')\b', re.IGNORECASE)
    text = pattern.sub('', text)

    # All digits, stand-alone or embedded; must happen before the length filter.
    text = re.sub(r'\d+', '', text)

    # Keep only tokens of at least three characters; split/join also
    # normalises the whitespace left behind by the removals above.
    return ' '.join(word for word in text.split() if len(word) >= 3)
 
def stemming(text_cleaning):
    """Stem *text_cleaning* word by word and return the cleaned result.

    A new Sastrawi stemmer is created on every call. The text is split on
    whitespace; each word is stemmed and then passed through ``stopword()``,
    and words that come back empty are dropped. The surviving words are joined
    with single spaces and run through ``clean_string()``.

    Parameters:
        text_cleaning: the text to stem (assumed to be a str).

    Returns:
        The stemmed, cleaned text.
    """
    word_stemmer = StemmerFactory().create_stemmer()

    kept = []
    for token in text_cleaning.split():
        # Stem first, then let the stop-word remover blank out stop words.
        reduced = stopword(word_stemmer.stem(token))
        if not reduced:
            continue
        kept.append(reduced)

    # Re-join the stemmed words and give the string a final clean-up.
    return clean_string(" ".join(kept))

def plot_cloud(wordcloud):
    """Display *wordcloud* in a new 10x8 matplotlib figure with the axes hidden."""
    figure_size = (10, 8)
    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
# Load the labelled dataset named by NF and join every 'text' value (cast to
# str) into one long string for the word cloud.
dataNorm = pd.read_csv(NF) 
all_words = ' '.join([text for text in dataNorm['text'].astype(str)])
#full_text
# Build the word cloud. random_state is fixed to 3, collocations are switched
# off, and the wordcloud package's built-in STOPWORDS set is used.
wordcloud = WordCloud(
    width=3000,
    height=2000,
    random_state=3,
    background_color='black',
    colormap='RdPu',
    collocations=False,
    stopwords=STOPWORDS
).generate(all_words)

plot_cloud(wordcloud)

# `data` is the same object as `dataNorm` here, so the 'full_text' column
# added below is added to dataNorm as well.
data = dataNorm
######################## 
# Cast the text to str if it is not already.
data['full_text'] = data['text'].astype(str)  #full_text
# Drop the rows labelled "Tidak Ada Data".
data = data[data['klasifikasi'] != 'Tidak Ada Data']
# Preprocessing: turn the text into TF-IDF feature vectors.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so the TF-IDF statistics also see the test rows.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['full_text'])
y = data['klasifikasi']
# Split into training data and test data (80/20, fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#-----------------------------------------------------
# Train the Naive Bayes model.
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

# Predict the labels of the test data and compute weighted-average metrics.
y_pred = model_nb.predict(X_test) 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print(f"Akurasi: {accuracy * 100:.2f}%")
print(f"Presisi: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%") 
print("\nLaporan Klasifikasi NB:")
print(classification_report(y_test, y_pred, zero_division=0))
# Compute the F1 score.
# NOTE(review): this recomputes and reprints the F1 score already printed
# above, this time without zero_division=0.
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1 Score: {f1 * 100:.2f}%")
#-----------------------------------------------------
# Confusion matrix, printed as an array.
cm = confusion_matrix(y_test, y_pred)
print(cm)
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model_nb.classes_)
# Show the confusion matrix as a chart.
cmd.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix NBC')
plt.show() 

# Same preparation as for the Naive Bayes model, repeated from dataNorm: the
# same filter, a fresh TfidfVectorizer and the same split parameters.
data = dataNorm
######################## 
# Cast the text to str if it is not already.
data['full_text'] = data['text'].astype(str)  #full_text
# Drop the rows labelled "Tidak Ada Data".
data = data[data['klasifikasi'] != 'Tidak Ada Data']
# Preprocessing: turn the text into TF-IDF feature vectors.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so the TF-IDF statistics also see the test rows.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['full_text'])
y = data['klasifikasi'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#-----------------------------------------------------

# Train the SVM model (linear kernel).
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict the labels of the test data and compute weighted-average metrics.
y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print(f'Akurasi model SVM: {accuracy * 100:.2f}%')
print(f"Presisi: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%") 
print("\nLaporan Klasifikasi SVM:")
print(classification_report(y_test, y_pred, zero_division=0))
# Print the F1 score again (same value as printed above).
print(f"F1 Score: {f1 * 100:.2f}%")
 
#-----------------------------------------------------

# Confusion matrix, printed as an array.
cm = confusion_matrix(y_test, y_pred)
print(cm)
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=svm_model.classes_)

# Show the confusion matrix as a chart.
cmd.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix SVM')
plt.show()

# Sentence test for the NB and SVM models.
# The first assignment is overwritten immediately; only the second sentence
# is actually classified.
new_string0 = "bhima yudhistira harap kinerja tim ekonomi amin iklim investasi"  
new_string0 = "kami berharap akan kinerjanya bertim perekonomian mengaminkan beriklim investasinya"  
new_string = clean_twitter_text(new_string0)
new_string= new_string.lower() # case folding

# Rebuild the stop-word remover from Sastrawi's stop-word list only (no extra
# words are added here); stopword() and stemming() read it as a module global.
stop_words = StopWordRemoverFactory().get_stop_words() 
new_array = ArrayDictionary(stop_words)
stop_words_remover_new = StopWordRemover(new_array)

# Slang normalisation, stop-word removal (applied twice with the same
# remover: once via stopword(), once directly), then stemming.
new_string=lemhatizer(new_string)
new_string=stopword(new_string)
new_string = stop_words_remover_new.remove(new_string) 
new_string= stemming(new_string)  
# `vectorizer` is the most recently fitted one (from the SVM section); both
# models are asked to classify the same vector.
new_string_vector = vectorizer.transform([new_string]) 
new_prediction1 = model_nb.predict(new_string_vector)
new_prediction2 = svm_model.predict(new_string_vector)

# Show the original sentence, the preprocessed sentence and both predictions.
print(new_string0)
print(new_string)
print(f'Kategori untuk Kalimat Uji / NB: {new_prediction1[0]}')
print(f'Kategori untuk Kalimat Uji / SVM: {new_prediction2[0]}')