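"""Streamlit interface for a pickled TF-IDF + classifier toxicity detector.

The app preprocesses user text with NLTK (tokenization, stopword removal,
POS-aware lemmatization), vectorizes it with a saved TF-IDF vectorizer, and
classifies it as Toxic or Non-toxic with a saved model.
"""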
import pickle
import re
import warnings

import nltk
import streamlit as st
from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from nltk.corpus import stopwords, wordnet
from sklearn.exceptions import InconsistentVersionWarning
from sklearn.feature_extraction.text import TfidfVectorizer
# -------------------------------
# Suppress Specific Warnings (Optional)
# -------------------------------
warnings.filterwarnings(action='ignore', category=InconsistentVersionWarning)
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
# -------------------------------
# Set Page Configuration - Must Be First Streamlit Command
# -------------------------------
st.set_page_config(page_title="🔍 Toxicity Detection App", layout="centered")
# -------------------------------
# Initialize and Download NLTK Resources
# -------------------------------
@st.cache_resource
def initialize_nltk():
    nltk_packages = [
        'punkt_tab',                      # tokenizer data used by word_tokenize
        'omw-1.4',                        # Open Multilingual Wordnet data
        'wordnet',                        # lemmatizer dictionary
        'stopwords',                      # stopword lists
        'averaged_perceptron_tagger_eng'  # tagger data used by pos_tag
    ]
    for package in nltk_packages:
        nltk.download(package, quiet=True)
    return stopwords.words('english')
# Initialize NLTK and get English stopwords
stop_words = initialize_nltk()
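# st.cache_resource runs the download once per server process, so reruns and
# new sessions reuse the cached stopword list instead of re-downloading.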
# -------------------------------
# Define Text Preprocessing Function
# -------------------------------
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(treebank_tag):
    """
    Convert Penn Treebank POS tags to WordNet POS tags for lemmatization.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # WordNet's lemmatizer defaults to nouns
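# Example mappings: 'JJ'/'JJR' -> ADJ, 'VBD'/'VBG' -> VERB, 'RB' -> ADV;
# anything unrecognized (e.g. 'IN', 'CD') falls back to NOUN.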
def prepare_text(text):
    # Keep letters and apostrophes; replace everything else with spaces,
    # then convert to lowercase
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # POS tagging
    tagged_tokens = pos_tag(tokens)
    # Lemmatize each token using its mapped WordNet POS tag
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    ]
    # Join tokens back into a single string
    return ' '.join(lemmatized_tokens)
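# For example, prepare_text("The cats were running!") would typically yield
# "cat run": the stopwords drop out, and POS-aware lemmatization maps
# "cats" -> "cat" and "running" -> "run" (exact output can vary with the
# installed NLTK tagger/tokenizer data).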
# -------------------------------
# Load Saved TF-IDF Vectorizer and Model
# -------------------------------
@st.cache_resource
def load_resources():
    try:
        with open("tf_idf.pkt", "rb") as f:
            vectorizer = pickle.load(f)
    except FileNotFoundError:
        st.error("TF-IDF vectorizer file not found. Please ensure 'tf_idf.pkt' is in the correct directory.")
        st.stop()
    try:
        with open("toxicity_model.pkt", "rb") as f:
            model = pickle.load(f)
    except FileNotFoundError:
        st.error("Toxicity model file not found. Please ensure 'toxicity_model.pkt' is in the correct directory.")
        st.stop()
    return vectorizer, model
vectorizer, model = load_resources()
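# Assumption: both pickles come from the matching training pipeline, i.e. a
# fitted sklearn TfidfVectorizer and a binary classifier trained on its
# features, with class 1 meaning "toxic" (as the display logic below expects).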
# -------------------------------
# Streamlit App Layout
# -------------------------------
st.title("🔍 Toxicity Detection App")
st.write("""
This application predicts the toxicity of a given text.
Simply enter any text below, and the model will classify it as **Toxic** or **Non-toxic**.
""")
# Text input
user_input = st.text_area("📄 Enter Text Below:", "", height=200)
# Prediction button
if st.button("📈 Predict Toxicity"):
    if user_input.strip() == "":
        st.warning("Please enter some text to get a prediction.")
    else:
        # Preprocess the input text
        processed_text = prepare_text(user_input)
        # Transform the text using the loaded TF-IDF vectorizer
        tf_idf_input = vectorizer.transform([processed_text])
        # Predict the probability of class 1 (toxic); not every classifier
        # exposes predict_proba (e.g. sklearn's LinearSVC does not)
        if hasattr(model, "predict_proba"):
            pred_proba = model.predict_proba(tf_idf_input)[0][1]
        else:
            pred_proba = None
        # Predict label
        pred_label = model.predict(tf_idf_input)[0]
        # Display results
        st.subheader("📊 Prediction Results:")
        st.write(f"**Original Text:** {user_input}")
        st.write(f"**Processed Text:** {processed_text}")
        if pred_proba is not None:
            st.write(f"**Toxicity Probability:** {pred_proba:.4f}")
        else:
            st.write("**Toxicity Probability:** Not available.")
        st.write(f"**Predicted Label:** {'🟥 Toxic' if pred_label == 1 else '🟩 Non-toxic'}")