बुधवार, 6 मार्च 2024
# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
import lime
import lime.lime_text
# Read the dataset from disk into a DataFrame.
df = pd.read_csv('english_df.csv')
# %%
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')
# %%
df.sample(2)
# %%
# df=df[0:1000]
# %%
# Number of distinct 'closed_by_name' values.
num_n = len(df['closed_by_name'].unique())
# %%
import lime
import sklearn
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
from __future__ import print_function
# %%
from sklearn.preprocessing import LabelEncoder
# Encode the 'closed_by_name' column as integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(df['closed_by_name'])
# 'labels' holds the integer-encoded form of the original names.
labels = label_encoder.transform(df['closed_by_name'])
# inverse_transform maps the integer ids back to the original names.
decoded_labels = label_encoder.inverse_transform(labels)
print(" labels:", labels)
# %%
# Hold out 20% of the raw texts and their labels, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    random_state=42,
)
# %%
len(X_test)
# %%
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Assuming df['text'] holds strings.
# Longest text measured in whitespace-separated words; every sequence is
# padded or truncated to this length.
max_sentence_length = max(len(sentence.split()) for sentence in df['text'])
# Keep the full vocabulary. The vocabulary size is unrelated to the sentence
# length, so capping num_words at max_sentence_length would silently drop all
# but the most frequent words.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['text'])
# Word indices start at 1 (0 is the padding value), so the embedding table
# needs one row more than the number of known words.
vocab_size = len(tokenizer.word_index) + 1
X = tokenizer.texts_to_sequences(df['text'])
# Pad sequences to ensure they all have the same length
X = pad_sequences(X, maxlen=max_sentence_length, padding='post', truncating='post')
# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)
# %%
# Define the neural network model: embedding -> average pooling -> two dense
# layers with dropout -> softmax over the num_n classes.
model = Sequential([
    # input_dim must be (largest token index + 1), i.e. the vocabulary size.
    Embedding(input_dim=vocab_size, output_dim=200, input_length=X.shape[1]),
    GlobalAveragePooling1D(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_n, activation='softmax'),
])
# %%
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Fit on the training split.
model.fit(X_train, y_train, batch_size=4, epochs=500)
# Score the fitted model on the held-out split.
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)
# %%
# Persist the trained weights.
model.save_weights("model_emb3.h5")
# %%
model.evaluate(X_test, y_test)
# %%
# Show the first remaining text. Use positional access: df.dropna() keeps the
# original index labels, so the label 0 may no longer exist and df['text'][0]
# would raise KeyError.
df['text'].iloc[0]
# %%
# Run the model on the test split and take the highest-scoring class per row.
predictions = model.predict(X_test)
predicted_classes = predictions.argmax(axis=1)
# Boolean mask: True where the predicted class equals the true label.
correct_predictions = np.equal(predicted_classes, y_test)
# Positions (within the test split) of the correct predictions.
correct_indices = np.flatnonzero(correct_predictions)
print("Indices of correctly predicted instances:", correct_indices)
# Report each correctly predicted instance with its decoded class name.
for hit in correct_indices:
    predicted_name = label_encoder.inverse_transform([predicted_classes[hit]])[0]
    print(f"Correctly predicted instance at index {hit}:")
    print("Predicted class:", predicted_name)
    # More details about the prediction or the instance can be added here.
# %%
from lime.lime_text import LimeTextExplainer
# %%
# Build the LIME text explainer, passing the label encoder's classes_ as the
# class names.
explainer = LimeTextExplainer(class_names=label_encoder.classes_)
# %%
def predict_proba(texts):
    """Return the model's predictions for a batch of raw texts.

    The texts are converted to token-id sequences with the module-level
    ``tokenizer``, padded/truncated at the end to ``max_sentence_length``,
    and passed to ``model.predict``.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    model_input = pad_sequences(
        token_ids,
        maxlen=max_sentence_length,
        padding='post',
        truncating='post',
    )
    return model.predict(model_input)
# %%
# Choose the position of the instance you want to explain.
# This is a positional index: df.dropna() keeps the original index labels, so
# a label lookup such as df['text'][3] raises KeyError if that row was dropped.
idx = 3  # Example index, choose appropriately
text_to_explain = df['text'].iloc[idx]
# Generate explanation for this text
# NOTE(review): explain_instance is called with its default label selection,
# which presumably explains only class index 1 -- confirm, and consider
# passing top_labels=1 to explain the predicted class instead.
exp = explainer.explain_instance(text_to_explain, predict_proba, num_features=10)
exp.show_in_notebook(text=True)
# %%
exp.show_in_notebook(text=True)
# %%
# np.save(r'M:\iitb\seminar\correctly_predicted_test_instances.npy',correct_indices)
# %%
# import shap
# import numpy as np
# # Prepare the text for the model (tokenize and pad)
# def prepare_text(text):
# sequence = tokenizer.texts_to_sequences([text])
# padded_sequence = pad_sequences(sequence, maxlen=max_sentence_length, padding='post', truncating='post')
# return padded_sequence
# # Predict and check if the prediction is correct
# def predict_and_check(text, true_label):
# prepared_text = prepare_text(text)
# prediction_prob = model.predict(prepared_text)
# predicted_label = np.argmax(prediction_prob, axis=-1)
# predicted_label_decoded = label_encoder.inverse_transform(predicted_label)[0]
# is_correct = predicted_label_decoded == true_label
# return is_correct, predicted_label_decoded, prediction_prob.max()
# # Initialize SHAP Explainer
# # Note: For complex models, consider using a simpler explainer or reducing the dataset size for explanation.
# background = X_train[:100] # Using a subset of the training set as background
# explainer = shap.DeepExplainer(model, background)
# def explain_prediction(text):
# prepared_text = prepare_text(text)
# shap_values = explainer.shap_values(prepared_text)
# # Assuming you want to visualize the explanation for the first class
# shap.initjs() # Initialize JavaScript visualization in Jupyter notebooks
# shap.force_plot(explainer.expected_value[0], shap_values[0][0], feature_names=tokenizer.word_index)
# text = df['text'][0]
# true_label = df['closed_by_name'][0]
# is_correct, predicted_label, confidence = predict_and_check(text, true_label)
# if is_correct:
# print(f"Model correctly predicted label '{predicted_label}' with confidence {confidence:.2f}")
# explain_prediction(text)
# else:
# print(f"Model incorrectly predicted label '{predicted_label}'. Correct label is '{true_label}'.")
# %%
# import numpy as np
# def predict_and_highlight(text):
# # Tokenize and pad the input text
# sequence = tokenizer.texts_to_sequences([text])
# padded_sequence = pad_sequences(sequence, maxlen=max_sentence_length, padding='post', truncating='post')
# # Predict the class
# prediction = model.predict(padded_sequence)
# predicted_class_index = np.argmax(prediction)
# predicted_class = label_encoder.inverse_transform([predicted_class_index])[0]
# # Identify important keywords (simplified for demonstration)
# # This step requires a more complex implementation for meaningful keyword extraction based on model internals.
# words = text.split()
# important_words = words[:min(3, len(words))] # Placeholder for simplicity: Takes the first few words
# print(f"Predicted class: {predicted_class}")
# print("Important words (simplified):", ", ".join(important_words))
# # Example usage:
# text = df['text'][0]
# predict_and_highlight(text)
सदस्यता लें
संदेश (Atom)
# %% import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.feature_extraction.text import...
-
https://youtu.be/0FAp Kr84-aQ प्रयाण गीत प्रयाणगीतम् चन्दनतुल्या भारतभूमिस्तपस्थली ग्रामो ग्रामः | बाला - बाला देवी प्रतिमा वत्सो वत्सः श्...
-
एलोवेरा प्रकृति का एक वरदान हम सभी को उसका सेवन शुद्ध रूप में करना चाहिए। जिसके लिए हम प्रयास भी करते है नर्सरी से पौधा लाते है...
-
Shami ped Ke Fayde शमी पौधे का संबंध भगवान शनि से है। शमी का पौधा भगवान शिव और शनि दोनों को बहुत प्रिय है। शनिवार को घर में शमी का पौधा लगान...