# Wednesday, 6 March 2024
# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
import lime
import lime.lime_text
# Load your DataFrame
df = pd.read_csv('english_df.csv')
# %%
df = df.dropna().reset_index(drop=True)  # reset the index so lookups like df['text'][idx] stay valid after dropping rows
# %%
df.sample(2)
# %%
# df=df[0:1000]
# %%
num_n = df['closed_by_name'].nunique()  # number of distinct target classes
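# %%
# Optional check (a small sketch, not in the original notebook): inspect the class
# distribution, since a heavily imbalanced 'closed_by_name' column would affect training.
df['closed_by_name'].value_counts()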
# %%
import sklearn
import sklearn.ensemble
import sklearn.metrics
# %%
from sklearn.preprocessing import LabelEncoder
# Initialize the LabelEncoder
label_encoder = LabelEncoder()
# Fit and transform the labels
labels = label_encoder.fit_transform(df['closed_by_name'])
# 'labels' now holds the integer-encoded representation of the original class names
# If you need to convert back to the original labels, use inverse_transform
decoded_labels = label_encoder.inverse_transform(labels)
print("Encoded labels:", labels)
# %%
# Initial split on the raw text (this split is redone below on the tokenized, padded sequences)
X_train, X_test, y_train, y_test = train_test_split(df['text'], labels, test_size=0.2, random_state=42)
# %%
X_test.shape[0]
# %%
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Assuming df['text'] is a list of strings
max_sentence_length = max(len(sentence.split()) for sentence in df['text'])
# Fit the tokenizer on the full text column; num_words caps the vocabulary size,
# not the sentence length, so it is left unset here to keep every word
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['text'])
vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
X = tokenizer.texts_to_sequences(df['text'])
# Pad sequences to ensure they all have the same length
X = pad_sequences(X, maxlen=max_sentence_length, padding='post', truncating='post')
# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)
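# %%
# Quick shape/vocabulary check (a small addition, not in the original notebook):
# confirms the padded matrix and the vocabulary size used by the embedding layer.
print("Padded shape:", X.shape)
print("Vocabulary size:", vocab_size)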
# %%
# Define the neural network model with an embedding layer
model = Sequential([
    # input_dim must be the vocabulary size, not the sentence length
    Embedding(input_dim=vocab_size, output_dim=200, input_length=X.shape[1]),
    GlobalAveragePooling1D(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_n, activation='softmax')
])
# %%
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# Train the model
model.fit(X_train, y_train, epochs=500, batch_size=4)
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)
# %%
model.save_weights("model_emb3.h5")
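# %%
# Sketch (not part of the original run): weights saved with save_weights can later be
# restored into a model with the identical architecture via load_weights.
model.load_weights("model_emb3.h5")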
# %%
model.evaluate(X_test, y_test)
# %%
df['text'][0]
# %%
# Predict the classes for X_test
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
# Compare predictions to the true labels
correct_predictions = predicted_classes == y_test
# Find indices of correct predictions
correct_indices = np.where(correct_predictions)[0]
# Print out the indices of correct predictions
print("Indices of correctly predicted instances:", correct_indices)
# If you want to see the correctly predicted instances
# You can loop through correct_indices and print them or analyze them further
for index in correct_indices:
    print(f"Correctly predicted instance at index {index}:")
    print("Predicted class:", label_encoder.inverse_transform([predicted_classes[index]])[0])
    # You can add more details about the prediction or the instance here
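# %%
# Quick consistency check (added sketch): the fraction of correct predictions should
# match the test accuracy reported by model.evaluate above.
print("Fraction correct:", correct_predictions.mean())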
# %%
from lime.lime_text import LimeTextExplainer
# %%
explainer = LimeTextExplainer(class_names=label_encoder.classes_)
# %%
def predict_proba(texts):
    # LIME passes perturbed raw strings; tokenize and pad them the same way as the training data
    seq = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(seq, maxlen=max_sentence_length, padding='post', truncating='post')
    return model.predict(padded)
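# %%
# Sanity check on the wrapper (a small added sketch): LIME expects a (n_samples, n_classes)
# array of probabilities, so each row should sum to roughly 1.
probs = predict_proba([df['text'][0]])
print(probs.shape, probs.sum(axis=1))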
# %%
# Choose an index of the instance you want to explain
idx = 3 # Example index, choose appropriately
text_to_explain = df['text'][idx]
# Generate explanation for this text
exp = explainer.explain_instance(text_to_explain, predict_proba, num_features=10)
exp.show_in_notebook(text=True)
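# %%
# In addition to the notebook widget (an added sketch), the explanation's top weighted
# words can be pulled out programmatically with as_list().
exp.as_list()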
# %%
# np.save(r'M:\iitb\seminar\correctly_predicted_test_instances.npy',correct_indices)
# %%
# import shap
# import numpy as np
# # Prepare the text for the model (tokenize and pad)
# def prepare_text(text):
#     sequence = tokenizer.texts_to_sequences([text])
#     padded_sequence = pad_sequences(sequence, maxlen=max_sentence_length, padding='post', truncating='post')
#     return padded_sequence
# # Predict and check if the prediction is correct
# def predict_and_check(text, true_label):
#     prepared_text = prepare_text(text)
#     prediction_prob = model.predict(prepared_text)
#     predicted_label = np.argmax(prediction_prob, axis=-1)
#     predicted_label_decoded = label_encoder.inverse_transform(predicted_label)[0]
#     is_correct = predicted_label_decoded == true_label
#     return is_correct, predicted_label_decoded, prediction_prob.max()
# # Initialize SHAP Explainer
# # Note: For complex models, consider using a simpler explainer or reducing the dataset size for explanation.
# background = X_train[:100]  # Using a subset of the training set as background
# explainer = shap.DeepExplainer(model, background)
# def explain_prediction(text):
#     prepared_text = prepare_text(text)
#     shap_values = explainer.shap_values(prepared_text)
#     # Assuming you want to visualize the explanation for the first class
#     shap.initjs()  # Initialize JavaScript visualization in Jupyter notebooks
#     shap.force_plot(explainer.expected_value[0], shap_values[0][0], feature_names=tokenizer.word_index)
# text = df['text'][0]
# true_label = df['closed_by_name'][0]
# is_correct, predicted_label, confidence = predict_and_check(text, true_label)
# if is_correct:
#     print(f"Model correctly predicted label '{predicted_label}' with confidence {confidence:.2f}")
#     explain_prediction(text)
# else:
#     print(f"Model incorrectly predicted label '{predicted_label}'. Correct label is '{true_label}'.")
# %%
# import numpy as np
# def predict_and_highlight(text):
#     # Tokenize and pad the input text
#     sequence = tokenizer.texts_to_sequences([text])
#     padded_sequence = pad_sequences(sequence, maxlen=max_sentence_length, padding='post', truncating='post')
#     # Predict the class
#     prediction = model.predict(padded_sequence)
#     predicted_class_index = np.argmax(prediction)
#     predicted_class = label_encoder.inverse_transform([predicted_class_index])[0]
#     # Identify important keywords (simplified for demonstration)
#     # This step requires a more complex implementation for meaningful keyword extraction based on model internals.
#     words = text.split()
#     important_words = words[:min(3, len(words))]  # Placeholder for simplicity: takes the first few words
#     print(f"Predicted class: {predicted_class}")
#     print("Important words (simplified):", ", ".join(important_words))
# # Example usage:
# text = df['text'][0]
# predict_and_highlight(text)