Wednesday, 6 March 2024

# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
import lime
import lime.lime_text

# Load your DataFrame
df = pd.read_csv('english_df.csv')

# %%
df = df.dropna()

# %%
df.sample(2)

# %%
# df = df[0:1000]

# %%
num_n = df['closed_by_name'].unique().shape[0]

# %%
import lime
import sklearn
import numpy as np
import sklearn.ensemble
import sklearn.metrics

# %%
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the labels
labels = label_encoder.fit_transform(df['closed_by_name'])

# 'labels' now contains the encoded representation of the original labels.
# If you need to convert back to the original labels, use inverse_transform.
decoded_labels = label_encoder.inverse_transform(labels)

print("labels:", labels)

# %%
X_train, X_test, y_train, y_test = train_test_split(df['text'], labels, test_size=0.2, random_state=42)

# %%
X_test.shape[0]

# %%
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming df['text'] is a list of strings
max_sentence_length = max(len(sentence.split()) for sentence in df['text'])

# Create the tokenizer; num_words caps the vocabulary at max_sentence_length,
# which keeps every token index below the Embedding layer's input_dim
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_sentence_length)
tokenizer.fit_on_texts(df['text'])
X = tokenizer.texts_to_sequences(df['text'])

# Pad sequences to ensure they all have the same length
X = pad_sequences(X, maxlen=max_sentence_length, padding='post', truncating='post')

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# %%
# Define the neural network model with an embedding layer
model = Sequential([
    Embedding(input_dim=max_sentence_length, output_dim=200, input_length=X.shape[1]),
    GlobalAveragePooling1D(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_n, activation='softmax')
])

# %%
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=500, batch_size=4)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)

# %%
model.save_weights("model_emb3.h5")

# %%
model.evaluate(X_test, y_test)

# %%
df['text'][0]

# %%
# Predict the classes for X_test
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)

# Compare predictions to the true labels
correct_predictions = predicted_classes == y_test

# Find indices of correct predictions
correct_indices = np.where(correct_predictions)[0]

# Print out the indices of correct predictions
print("Indices of correctly predicted instances:", correct_indices)

# To see the correctly predicted instances, loop through correct_indices
# and print or analyze them further
for index in correct_indices:
    print(f"Correctly predicted instance at index {index}:")
    print("Predicted class:", label_encoder.inverse_transform([predicted_classes[index]])[0])
    # You can add more details about the prediction or the instance here
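# %%
# A minimal per-class evaluation sketch: classification_report breaks the test
# accuracy down by class. It assumes `predicted_classes` and `y_test` from the
# cell above and uses the fitted label_encoder for readable class names.
from sklearn.metrics import classification_report

print(classification_report(y_test, predicted_classes,
                            labels=np.arange(num_n),
                            target_names=label_encoder.classes_,
                            zero_division=0))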
# %%
from lime.lime_text import LimeTextExplainer

# %%
explainer = LimeTextExplainer(class_names=label_encoder.classes_)

# %%
def predict_proba(texts):
    seq = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(seq, maxlen=max_sentence_length, padding='post', truncating='post')
    return model.predict(padded)

# %%
# Choose an index of the instance you want to explain
idx = 3  # Example index, choose appropriately
text_to_explain = df['text'][idx]

# Generate explanation for this text
exp = explainer.explain_instance(text_to_explain, predict_proba, num_features=10)
exp.show_in_notebook(text=True)

# %%
exp.show_in_notebook(text=True)

# %%
# np.save(r'M:\iitb\seminar\correctly_predicted_test_instances.npy', correct_indices)

# %%
# import shap
# import numpy as np

# # Prepare the text for the model (tokenize and pad)
# def prepare_text(text):
#     sequence = tokenizer.texts_to_sequences([text])
#     padded_sequence = pad_sequences(sequence, maxlen=max_sentence_length, padding='post', truncating='post')
#     return padded_sequence

# # Predict and check if the prediction is correct
# def predict_and_check(text, true_label):
#     prepared_text = prepare_text(text)
#     prediction_prob = model.predict(prepared_text)
#     predicted_label = np.argmax(prediction_prob, axis=-1)
#     predicted_label_decoded = label_encoder.inverse_transform(predicted_label)[0]
#     is_correct = predicted_label_decoded == true_label
#     return is_correct, predicted_label_decoded, prediction_prob.max()

# # Initialize the SHAP explainer
# # Note: for complex models, consider using a simpler explainer or reducing the dataset size for explanation.
# background = X_train[:100]  # Using a subset of the training set as background
# explainer = shap.DeepExplainer(model, background)

# def explain_prediction(text):
#     prepared_text = prepare_text(text)
#     shap_values = explainer.shap_values(prepared_text)
#     # Assuming you want to visualize the explanation for the first class
#     shap.initjs()  # Initialize JavaScript visualization in Jupyter notebooks
#     shap.force_plot(explainer.expected_value[0], shap_values[0][0], feature_names=tokenizer.word_index)

# text = df['text'][0]
# true_label = df['closed_by_name'][0]
# is_correct, predicted_label, confidence = predict_and_check(text, true_label)

# if is_correct:
#     print(f"Model correctly predicted label '{predicted_label}' with confidence {confidence:.2f}")
#     explain_prediction(text)
# else:
#     print(f"Model incorrectly predicted label '{predicted_label}'. Correct label is '{true_label}'.")

# %%
# import numpy as np

# def predict_and_highlight(text):
#     # Tokenize and pad the input text
#     sequence = tokenizer.texts_to_sequences([text])
#     padded_sequence = pad_sequences(sequence, maxlen=max_sentence_length, padding='post', truncating='post')

#     # Predict the class
#     prediction = model.predict(padded_sequence)
#     predicted_class_index = np.argmax(prediction)
#     predicted_class = label_encoder.inverse_transform([predicted_class_index])[0]

#     # Identify important keywords (simplified for demonstration)
#     # This step requires a more complex implementation for meaningful keyword extraction based on model internals.
#     words = text.split()
#     important_words = words[:min(3, len(words))]  # Placeholder for simplicity: takes the first few words

#     print(f"Predicted class: {predicted_class}")
#     print("Important words (simplified):", ", ".join(important_words))

# # Example usage:
# text = df['text'][0]
# predict_and_highlight(text)
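# %%
# A small follow-up sketch: besides show_in_notebook, the LIME explanation can
# be read programmatically. as_list() returns (word, weight) pairs for the
# explained label, and save_to_file() writes the same visualization to HTML.
# This assumes `exp` from the explain_instance cell above; the output filename
# is just an example.
for word, weight in exp.as_list():
    print(f"{word}: {weight:.4f}")

exp.save_to_file('lime_explanation.html')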

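# %%
# A minimal sketch of reusing the saved weights: save_weights("model_emb3.h5")
# stores only the weights, not the architecture, so a fresh session would need
# to rebuild and compile the same Sequential model (as in the model-definition
# cell above) before calling load_weights.
model.load_weights("model_emb3.h5")
model.evaluate(X_test, y_test)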