Add Data analysis steps: data-cleaning, data-outlier-detection #30

Open · wants to merge 1 commit into base: master
Empty file added data_analysis/__init__.py
Empty file.
74 changes: 74 additions & 0 deletions data_analysis/data_cleaning.py
@@ -0,0 +1,74 @@
import pandas as pd
import valohai


def main():
rm_duplicates = valohai.parameters("remove_duplicate_rows").value
rm_null = valohai.parameters("remove_null").value
null_look_up_cols = valohai.parameters("null_lookup_columns").value
duplicate_lookup_cols = valohai.parameters("duplicate_lookup_columns").value
clear_fmt = valohai.parameters("clear_formatting").value
output_path = valohai.parameters("output_file_name").value

df = pd.read_csv(valohai.inputs("input-file").path())

    print("null_look_up_cols (raw):", null_look_up_cols)
    print("duplicate_lookup_cols (raw):", duplicate_lookup_cols)
    # Parse the comma-separated column lists; an empty value means all columns
null_look_up_cols = (
null_look_up_cols.split(",")
if isinstance(null_look_up_cols, str) and len(null_look_up_cols) > 0
else None
)
duplicate_lookup_cols = (
duplicate_lookup_cols.split(",")
if isinstance(duplicate_lookup_cols, str) and len(duplicate_lookup_cols) > 0
else None
)
    print("null_look_up_cols (parsed):", null_look_up_cols)
    print("duplicate_lookup_cols (parsed):", duplicate_lookup_cols)

    # Remove records that have missing data in the lookup columns (or in any column)
    if rm_null:
        is_empty_values = df.isnull().values.any()
        num_empty_values = df.isnull().sum().sum()
        print(f"Missing values found: {is_empty_values}")
        print(f"Number of missing values found: {num_empty_values}")

if is_empty_values:
print(
                f"Dropping records where the following columns are null: {null_look_up_cols if null_look_up_cols else 'all'}",
)
df.dropna(
subset=null_look_up_cols if null_look_up_cols else None,
inplace=True,
)

# Remove duplicate records
if rm_duplicates:
duplicated_df = df.duplicated(
subset=duplicate_lookup_cols if duplicate_lookup_cols else None,
keep="first",
)
is_duplicate = duplicated_df.values.any()
num_duplicates = duplicated_df.sum()
print(f"Duplicate records found: {is_duplicate}")
print(f"Number of Duplicate records found: {num_duplicates}")
if is_duplicate:
print(
                f"Dropping duplicate records based on the following columns: {duplicate_lookup_cols if duplicate_lookup_cols else 'all'}",
)
df.drop_duplicates(subset=duplicate_lookup_cols, inplace=True)

# Clear formatting
if clear_fmt:
print("Clear formatting")
        # DataFrame.map requires pandas >= 2.1; use DataFrame.applymap on older versions.
        df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

if not output_path:
output_path = "cleaned_data.csv"

df.to_csv(valohai.outputs().path(output_path), index=False)


if __name__ == "__main__":
main()
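
Note: both scripts read their parameters and inputs through valohai-utils, so each step also needs a matching definition in valohai.yaml, which is not included in this diff. Below is a minimal, illustrative sketch of how the data-cleaning step's defaults could be declared with valohai.prepare(); the step name, image, and default values are assumptions, and only the parameter and input names mirror the script above.

import valohai

valohai.prepare(
    step="data-cleaning",  # hypothetical step name
    image="python:3.10",  # placeholder image
    default_parameters={
        "remove_duplicate_rows": True,
        "remove_null": True,
        "null_lookup_columns": "",
        "duplicate_lookup_columns": "",
        "clear_formatting": True,
        "output_file_name": "cleaned_data.csv",
    },
    default_inputs={
        "input-file": "s3://example-bucket/data.csv",  # placeholder URL
    },
)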
153 changes: 153 additions & 0 deletions data_analysis/outlier_detection.py
@@ -0,0 +1,153 @@
from enum import Enum

import pandas as pd
import valohai

from data_analysis.outlier_detection_methods import (
detect_by_autoencoder,
detect_by_iqr,
detect_by_isolation_forest,
detect_by_zscore,
)
from data_analysis.plots.visualizations import (
visualize_outliers_boxplot,
visualize_outliers_histogram,
visualize_outliers_scatterplot,
)


class OutlierDetectionMethod(Enum):
Z_SCORE = "zscore"
IQR = "iqr"
ISOLATION_FOREST = "isolation-forest"
AUTOENCODER = "autoencoder"

def __str__(self):
return self.value

@classmethod
def cast(cls, value: str):
if isinstance(value, OutlierDetectionMethod):
return value
if not value:
return None
return OutlierDetectionMethod(str(value).lower())


def detect_outliers(
df: pd.DataFrame,
columns: list[str],
method: OutlierDetectionMethod | None,
thresholds: dict[str, float],
):
"""
Detect outliers in a DataFrame.

Parameters:
- df: DataFrame
The input DataFrame.
- columns: list[str]
List of columns to consider for outlier detection. If None, all columns will be considered.
- method: OutlierDetectionMethod
Method for outlier detection.
    - thresholds: dict[str, float]
        Mapping from method key ("zscore", "iqr", "isolation_forest",
        "autoencoder") to that method's threshold value.

Returns:
- DataFrame
DataFrame containing rows identified as outliers.
- DataFrame
DataFrame containing rows not identified as outliers.
"""

if columns is None:
columns = df.columns

outliers = pd.DataFrame()
non_outliers = pd.DataFrame()
match method:
case OutlierDetectionMethod.Z_SCORE:
outliers, non_outliers = detect_by_zscore(
df,
columns=columns,
threshold=thresholds["zscore"],
)
case OutlierDetectionMethod.IQR:
outliers, non_outliers = detect_by_iqr(
df,
columns=columns,
threshold=thresholds["iqr"],
)
case OutlierDetectionMethod.ISOLATION_FOREST:
outliers, non_outliers = detect_by_isolation_forest(
df,
columns=columns,
threshold=thresholds["isolation_forest"],
)
case OutlierDetectionMethod.AUTOENCODER:
outliers, non_outliers = detect_by_autoencoder(
df,
columns=columns,
percentile=thresholds["autoencoder"],
)
return outliers.drop_duplicates(), non_outliers.drop_duplicates()
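
# Illustrative usage of detect_outliers (the column name and threshold value
# below are made-up examples, not part of this change):
#   outliers, inliers = detect_outliers(
#       df,
#       columns=["price"],
#       method=OutlierDetectionMethod.IQR,
#       thresholds={"iqr": 1.5},
#   )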


def main():
input_file_path = valohai.inputs("input-file").path()
column = valohai.parameters("column").value
thresholds = {
"zscore": valohai.parameters("zscore_threshold").value,
"iqr": valohai.parameters("iqr_threshold").value,
"isolation_forest": valohai.parameters("isolation_forest_contamination").value,
"autoencoder": valohai.parameters("autoencoder_percentile").value,
}
x_axis_col_name = valohai.parameters("x_axis_for_visualization").value
output_path = valohai.parameters("output_path").value
is_save_visualizations = valohai.parameters("save_visualizations").value

if input_file_path and input_file_path.endswith(".csv"):
df = pd.read_csv(input_file_path)
all_outliers_from_all_methods = pd.DataFrame()
for method in OutlierDetectionMethod:
outliers, non_outliers = detect_outliers(
df,
columns=[column],
method=method,
thresholds=thresholds,
)
print(f"Number of outliers detected via method ({method}): {len(outliers)}")
if len(outliers) > 0:
                all_outliers_from_all_methods = pd.concat(
                    [all_outliers_from_all_methods, outliers],
                )
outliers.to_csv(valohai.outputs().path(f"{method}/{output_path}"))

all_outliers_from_all_methods.drop_duplicates().to_csv(
valohai.outputs().path(f"possible_outliers_from_column_{column}.csv"),
)
if is_save_visualizations:
visualize_outliers_scatterplot(
df,
x_column=x_axis_col_name,
y_column=column,
output_path=valohai.outputs().path(
f"scatter_plot_{column}.png",
),
)

if is_save_visualizations:
visualize_outliers_histogram(
df,
column,
output_path=valohai.outputs().path(f"histogram_plot_{column}.png"),
)
visualize_outliers_boxplot(
df,
column,
output_path=valohai.outputs().path(f"box_plot_{column}.png"),
)


if __name__ == "__main__":
main()
132 changes: 132 additions & 0 deletions data_analysis/outlier_detection_methods.py
@@ -0,0 +1,132 @@
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

char_index = "0abcdefghijklmnopqrstuvwxyz"
char_index += "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
char_index += "123456789"
char_index += "().,-/+=&$?@#!*:;_[]|%⸏{}\"'" + " " + "\\"

char_to_int = {c: i for i, c in enumerate(char_index)}
int_to_char = dict(enumerate(char_index))
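
# For example, with this index "cat" encodes to [3, 1, 20], and int_to_char
# maps those codes back to the original characters.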


def encode_sequence_list(seqs, feat_n=0):
    from tensorflow.keras.preprocessing.sequence import pad_sequences

encoded_seqs = []
for seq in seqs:
        # Fall back to 0 (the padding value) for characters missing from char_index.
        encoded_seq = [char_to_int.get(c, 0) for c in seq]
encoded_seqs.append(encoded_seq)
if feat_n > 0:
encoded_seqs.append(np.zeros(feat_n))
return pad_sequences(encoded_seqs, padding="post")


def decode_sequence_list(seqs):
decoded_seqs = []
for seq in seqs:
decoded_seq = [int_to_char[i] for i in seq]
decoded_seqs.append(decoded_seq)
return decoded_seqs


def build_autoencoder(input_dim):
input_layer = Input(shape=(input_dim,))
encoded = Dense(64, activation="relu")(input_layer)
encoded = Dense(32, activation="relu")(encoded)
decoded = Dense(64, activation="relu")(encoded)
decoded = Dense(input_dim, activation="sigmoid")(decoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer="adam", loss="mse")

return autoencoder
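
# The 64 -> 32 -> 64 layout forces the data through a narrow bottleneck, so
# rows the network reconstructs poorly (high MSE) become outlier candidates.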


def detect_by_autoencoder(df, columns, percentile=95):
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

outliers = pd.DataFrame()
non_outliers = pd.DataFrame()
scaler = StandardScaler()
for col in columns:
encoded_seqs = df[col].values.reshape(-1, 1)
if df[col].dtype == "object":
encoded_seqs = encode_sequence_list(df[col].values, feat_n=0)
X_scaled = scaler.fit_transform(encoded_seqs)
X_train, X_test = train_test_split(X_scaled, test_size=0.2, random_state=42)
autoencoder = build_autoencoder(X_train.shape[1])
autoencoder.fit(
X_train,
X_train,
epochs=50,
batch_size=32,
shuffle=True,
validation_data=(X_test, X_test),
verbose=0,
)
        # Rows whose reconstruction error exceeds the chosen percentile are
        # treated as outliers.
        reconstructed_data = autoencoder.predict(X_scaled)
        mse = np.mean(np.square(X_scaled - reconstructed_data), axis=1)
        threshold = np.percentile(mse, percentile)
        outliers = pd.concat([outliers, df[mse > threshold]])
        non_outliers = pd.concat([non_outliers, df[mse <= threshold]])

return outliers, non_outliers


def detect_by_zscore(df, columns=None, threshold: float = 2.0):
outliers = pd.DataFrame()
non_outliers = pd.DataFrame()
    for col in columns:
        if df[col].dtype != "object":
            main_col = df[col]
        else:
            # Encode string values and reduce each row to its mean character
            # code (one simple choice) so the z-score stays one value per row.
            encoded = encode_sequence_list(df[col].values, feat_n=0)
            main_col = pd.Series(encoded.mean(axis=1), index=df.index)
        z_scores = (main_col - main_col.mean()) / main_col.std()
        outliers = pd.concat([outliers, df[abs(z_scores) > threshold]])
        non_outliers = pd.concat([non_outliers, df[abs(z_scores) <= threshold]])
return outliers, non_outliers
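
# Worked example: with threshold=2.0 a value is flagged when it lies more than
# two standard deviations from the column mean, e.g. for mean=10 and std=2
# anything below 6 or above 14 counts as an outlier.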


def detect_by_iqr(df, columns=None, threshold=1.5):
outliers = pd.DataFrame()
non_outliers = pd.DataFrame()

for col in columns:
main_col = df[col] if df[col].dtype != "object" else df[col].apply(len)
Q1 = main_col.quantile(0.25)
Q3 = main_col.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - threshold * IQR
upper_bound = Q3 + threshold * IQR
        outliers = pd.concat(
            [outliers, df[(main_col < lower_bound) | (main_col > upper_bound)]],
        )
        non_outliers = pd.concat(
            [non_outliers, df[(main_col >= lower_bound) & (main_col <= upper_bound)]],
        )

return outliers, non_outliers
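
# Worked example: for Q1=10 and Q3=20 the IQR is 10, so with threshold=1.5 the
# bounds are 10 - 15 = -5 and 20 + 15 = 35; rows outside [-5, 35] are outliers.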


def detect_by_isolation_forest(df, columns=None, threshold=0.1):
from sklearn.ensemble import IsolationForest

outliers = pd.DataFrame()
non_outliers = pd.DataFrame()
    # `threshold` is interpreted as the expected share of outliers (contamination).
    clf = IsolationForest(contamination=threshold)
    for col in columns:
        if df[col].dtype != "object":
            values = df[col].values.reshape(-1, 1)
        else:
            # Encode string values so the forest receives numeric features.
            values = encode_sequence_list(df[col].values, feat_n=0)
        results = clf.fit_predict(values)
        outlier_indices = results == -1
        outliers = pd.concat([outliers, df[outlier_indices]])
        non_outliers = pd.concat([non_outliers, df[~outlier_indices]])

return outliers, non_outliers
3 changes: 3 additions & 0 deletions data_analysis/outlier_detection_requirements.txt
@@ -0,0 +1,3 @@
tensorflow
scikit-learn
matplotlib
Empty file added data_analysis/plots/__init__.py
Empty file.