Wednesday, 6 March 2024

# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
import lime
import lime.lime_text

# Load your DataFrame
df = pd.read_csv('english_df.csv')

# %%
df = df.dropna()

# %%
df.sample(2)

# %%
# df = df[0:1000]

# %%
num_n = df['closed_by_name'].unique().shape[0]

# %%
import sklearn
import sklearn.ensemble
import sklearn.metrics

# %%
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the labels
labels = label_encoder.fit_transform(df['closed_by_name'])

# 'labels' now contains the encoded representation of the original labels.
# If you need to convert back to the original labels, use inverse_transform.
decoded_labels = label_encoder.inverse_transform(labels)

print("labels:", labels)

# %%
X_train, X_test, y_train, y_test = train_test_split(df['text'], labels, test_size=0.2, random_state=42)

# %%
X_test.shape[0]

# %%
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

# df['text'] is assumed to be a column of strings
max_sentence_length = max(len(sentence.split()) for sentence in df['text'])

# Fit the tokenizer on the full vocabulary. Note that the Embedding layer's
# input_dim must be the vocabulary size, not the maximum sentence length.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['text'])
vocab_size = len(tokenizer.word_index) + 1

X = tokenizer.texts_to_sequences(df['text'])

# Pad sequences to ensure they all have the same length
X = pad_sequences(X, maxlen=max_sentence_length, padding='post', truncating='post')

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# %%
# Define the neural network model with an embedding layer
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=200, input_length=X.shape[1]),
    GlobalAveragePooling1D(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_n, activation='softmax')
])

# %%
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=500, batch_size=4)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)

# %%
model.save_weights("model_emb3.h5")

# %%
model.evaluate(X_test, y_test)

# %%
df['text'][0]

# %%
# Predict the classes for X_test
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)

# Compare predictions to the true labels
correct_predictions = predicted_classes == y_test

# Find indices of correct predictions
correct_indices = np.where(correct_predictions)[0]

# Print out the indices of correct predictions
print("Indices of correctly predicted instances:", correct_indices)

# Loop through correct_indices to print or further analyze the correctly
# predicted instances
for index in correct_indices:
    print(f"Correctly predicted instance at index {index}:")
    print("Predicted class:", label_encoder.inverse_transform([predicted_classes[index]])[0])
    # You can add more details about the prediction or the instance here

# %%
from lime.lime_text import LimeTextExplainer

# %%
explainer = LimeTextExplainer(class_names=label_encoder.classes_)

# %%
def predict_proba(texts):
    seq = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(seq, maxlen=max_sentence_length, padding='post', truncating='post')
    return model.predict(padded)

# %%
# Choose an index of the instance you want to explain
idx = 3  # Example index, choose appropriately
text_to_explain = df['text'][idx]

# Generate explanation for this text
exp = explainer.explain_instance(text_to_explain, predict_proba, num_features=10)
exp.show_in_notebook(text=True)

# %%
exp.show_in_notebook(text=True)

# %%
# np.save(r'M:\iitb\seminar\correctly_predicted_test_instances.npy', correct_indices)

# %%
# import shap
# import numpy as np

# # Prepare the text for the model (tokenize and pad)
# def prepare_text(text):
#     sequence = tokenizer.texts_to_sequences([text])
#     padded_sequence = pad_sequences(sequence, maxlen=max_sentence_length, padding='post', truncating='post')
#     return padded_sequence

# # Predict and check if the prediction is correct
# def predict_and_check(text, true_label):
#     prepared_text = prepare_text(text)
#     prediction_prob = model.predict(prepared_text)
#     predicted_label = np.argmax(prediction_prob, axis=-1)
#     predicted_label_decoded = label_encoder.inverse_transform(predicted_label)[0]
#     is_correct = predicted_label_decoded == true_label
#     return is_correct, predicted_label_decoded, prediction_prob.max()

# # Initialize SHAP Explainer
# # Note: For complex models, consider using a simpler explainer or reducing the dataset size for explanation.
# background = X_train[:100]  # Using a subset of the training set as background
# explainer = shap.DeepExplainer(model, background)

# def explain_prediction(text):
#     prepared_text = prepare_text(text)
#     shap_values = explainer.shap_values(prepared_text)
#     # Assuming you want to visualize the explanation for the first class
#     shap.initjs()  # Initialize JavaScript visualization in Jupyter notebooks
#     shap.force_plot(explainer.expected_value[0], shap_values[0][0], feature_names=tokenizer.word_index)

# text = df['text'][0]
# true_label = df['closed_by_name'][0]
# is_correct, predicted_label, confidence = predict_and_check(text, true_label)
# if is_correct:
#     print(f"Model correctly predicted label '{predicted_label}' with confidence {confidence:.2f}")
#     explain_prediction(text)
# else:
#     print(f"Model incorrectly predicted label '{predicted_label}'. Correct label is '{true_label}'.")

# %%
# import numpy as np

# def predict_and_highlight(text):
#     # Tokenize and pad the input text
#     sequence = tokenizer.texts_to_sequences([text])
#     padded_sequence = pad_sequences(sequence, maxlen=max_sentence_length, padding='post', truncating='post')

#     # Predict the class
#     prediction = model.predict(padded_sequence)
#     predicted_class_index = np.argmax(prediction)
#     predicted_class = label_encoder.inverse_transform([predicted_class_index])[0]

#     # Identify important keywords (simplified for demonstration)
#     # This step requires a more complex implementation for meaningful keyword extraction based on model internals.
#     words = text.split()
#     important_words = words[:min(3, len(words))]  # Placeholder for simplicity: takes the first few words

#     print(f"Predicted class: {predicted_class}")
#     print("Important words (simplified):", ", ".join(important_words))

# # Example usage:
# text = df['text'][0]
# predict_and_highlight(text)
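The notebook only saves the trained parameters with model.save_weights, so here is a minimal sketch of how they could be restored later for inference. This is an assumption about intended usage, not part of the original notebook: save_weights stores parameters only, so the same architecture has to be rebuilt first, and the names restored, vocab_size, max_sentence_length, and num_n reuse the values defined above.

# %%
# Sketch (assumption): rebuild the architecture, then load the saved weights.
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D

restored = Sequential([
    Embedding(input_dim=vocab_size, output_dim=200, input_length=max_sentence_length),
    GlobalAveragePooling1D(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_n, activation='softmax'),
])
# Build the model so the layer weights exist before loading from the HDF5 file
restored.build(input_shape=(None, max_sentence_length))
restored.load_weights("model_emb3.h5")

# The restored model can then be used like the trained one,
# e.g. with the LIME predict_proba wrapper defined above.
print(restored.predict(X_test[:1]))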

Saturday, 4 February 2023

In VS Code, how do I find a code snippet in other code files?

In Visual Studio Code, you can search for code snippets across all files in your workspace with the Search view ("Find in Files", Ctrl+Shift+F or Cmd+Shift+F). Enter a search term to find matching snippets of code; the results are listed in the Search panel, and clicking a result jumps to the corresponding file and location.
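If you want the same kind of workspace-wide snippet search outside the editor (for example from a script), a minimal Python sketch along these lines works; the helper name find_snippet and the extension list are just illustrative choices, not anything VS Code provides.

import pathlib

def find_snippet(root, snippet, exts=(".py", ".js", ".ts")):
    """Print file:line for every line under `root` that contains `snippet`."""
    for path in pathlib.Path(root).rglob("*"):
        if path.suffix not in exts or not path.is_file():
            continue
        try:
            lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
        except OSError:
            continue
        for lineno, line in enumerate(lines, start=1):
            if snippet in line:
                print(f"{path}:{lineno}: {line.strip()}")

# Example: find every place pad_sequences is called in the current workspace
find_snippet(".", "pad_sequences(")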
