GOOGLE ADS

Donnerstag, 14. April 2022

Der Grund für unterschiedliche Ergebnisse des KNN-Algorithmus aus PYOD- und Sklearn-Paketen

Ergänzend zu diesem Beitrag habe ich mit KNN-Algorithmen experimentiert, indem ich die Pakete sklearn und PyOD für einen unüberwachten Ansatz zur Anomalieerkennung auf Benchmark-Datensätzen verwendet habe, und ich erhalte unterschiedliche Ergebnisse!

*****************************************************************KNN from PYOD lib
/usr/local/lib/python3.7/dist-packages/pyod/models/base.py:413: UserWarning: y should not be presented in unsupervised learning.
"y should not be presented in unsupervised learning.")
Training time: 3.3526198863983154s
precision recall f1-score support
0 0.96 0.90 0.93 16955
1 0.01 0.04 0.02 663
accuracy 0.86 17618
macro avg 0.49 0.47 0.47 17618
weighted avg 0.92 0.86 0.89 17618
*****************************************************************KNN from sklearn lib
Training time: 0.6735050678253174s
precision recall f1-score support
0 1.00 1.00 1.00 16955
1 1.00 1.00 1.00 663
accuracy 1.00 17618
macro avg 1.00 1.00 1.00 17618
weighted avg 1.00 1.00 1.00 17618

Ich habe versucht, verschiedene Argumente für das zweite Paket festzulegen, indem ich contamination und n_neighbors gesetzt habe, habe aber die Ergebnisse des sklearn-Pakets nicht erreicht. Kann mir jemand den Unterschied erklären oder eine Lösung nennen, mit der sich das Problem beheben lässt, sodass die Ergebnisse vergleichbar werden?

Es folgt der vollständige Code für die Vorverarbeitung. Auf eine Normalisierung wurde verzichtet, da die Daten sauber sind und – dieser Antwort folgend – Verzerrungen vermieden werden sollen:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import time
from sklearn import metrics
# Name of the benchmark dataset.
dataset_name = 'http'
from sklearn.datasets import fetch_kddcup99

# Load the "http" subset of KDD Cup 99 (percent10=True) as a pandas DataFrame;
# the data is downloaded first when it is not available locally.
http = fetch_kddcup99(
    subset='http',
    data_home=None,
    shuffle=False,
    random_state=None,
    percent10=True,
    download_if_missing=True,
    return_X_y=False,
    as_frame=True,
)
df = http.frame
name_target = 'labels'

# Cast the three feature columns to float.
for feature in ('duration', 'src_bytes', 'dst_bytes'):
    df[feature] = df[feature].astype(float)

num_row, num_colmn = df.shape

# Distinct classes of the target column, their number, and their frequencies.
classes = df[name_target].unique()
num_class = len(classes)
print(df[name_target].value_counts())
# Determine which class is normal (i.e. not an anomaly): the most frequent
# label is treated as normal, every other label counts as an anomaly.
label = np.array(df[name_target])
a, b = np.unique(label, return_counts=True)
if len(a) < 2:
    # Without a second class there is no anomaly label to collapse into; fail
    # with a clear message instead of a NameError on `unnormal` further down.
    raise ValueError(
        f"column {name_target!r} must contain at least two classes, got {len(a)}"
    )
# On an exact tie np.argmax picks the first of the tied classes.
normal = a[np.argmax(b)]

# Anomaly classes: everything except the normal class.
anomaly_idx = [i for i, cls in enumerate(a) if cls != normal]
anomaly_class = [a[i] for i in anomaly_idx]

# Placeholder label for "anomaly": the rarest non-normal class. Which class is
# chosen does not matter, all anomalies are collapsed into it just below.
unnormal = a[min(anomaly_idx, key=lambda i: b[i])]

# Convert the dataset classes to two classes: normal and unnormal.
label = np.where(label != normal, unnormal, label)
df[name_target] = label
# Count the feature columns by kind; the target column is excluded.
# The dtype helpers accept every integer/float width, whereas comparing a
# dtype with the builtin `int` / `float` only matches the platform default
# width and would count e.g. int32 or float32 columns as categorical.
features = df.drop(columns=[name_target], errors='ignore')
numeric = sum(
    1
    for col_dtype in features.dtypes
    if pd.api.types.is_integer_dtype(col_dtype)
    or pd.api.types.is_float_dtype(col_dtype)
)
categoric = features.shape[1] - numeric
# Binary encoding of the target: 0 for the normal class, 1 for everything else.
df[name_target] = label = np.where(label != normal, 1, 0)
# Null check, column by column:
#   * more than half of the values are null -> the column is dropped;
#   * otherwise, if any value is null       -> nulls are replaced by the
#     column mean.
# The second case is a plain "otherwise": with two strict comparisons a column
# whose null count equals shape[0] // 2 would be neither dropped nor imputed.
half_rows = df.shape[0] // 2
null_counts = df.isnull().sum()  # computed once instead of per comparison
test = []  # positional indices of the columns to drop
for i, n_null in enumerate(null_counts):
    if n_null > half_rows:
        test.append(i)
    elif n_null != 0:
        m = df.iloc[:, i].mean()
        df.iloc[:, i] = df.iloc[:, i].fillna(m)
df = df.drop(columns=df.columns[test])
# Anomaly rate: share of rows labelled 1 among all rows (labels are 0/1 here).
b = df[name_target].value_counts()
Anomaly_rate = b[1] / (b[0] + b[1])
print(Anomaly_rate)
# Keep four decimal places; float() first so the builtin round() is applied
# to a plain Python float.
contamination = round(float(Anomaly_rate), 4)
print(contamination)
# Rename the label column, addressed through name_target rather than a second
# hard-coded copy of the column name.
df = df.rename(columns={name_target: 'binary_target'})
# Optional export of the preprocessed frame (disabled):
#df.to_csv(f'/content/{dataset_name}.csv', index = False)

Der vollständige Code der Implementierung von KNN-Modellen:

!pip install pyod
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import classification_report
import time
from sklearn import metrics
import os
import seaborn as sns
# Cap the contamination estimate at 0.5.
# NOTE(review): presumably because PyOD detectors reject larger values - confirm.
contamination = min(contamination, 0.5)

# Feature matrix and binary target. These must be defined here: the split
# below uses X and y, and no other active statement in this script creates them.
X = df.drop(columns=['binary_target'])
y = df['binary_target']

seed = 120
test_size = 0.3
# Stratified split on y with a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed, stratify=y
)

# Empty results table with one column per reported metric; its index is named
# after the dataset.
df_all = pd.DataFrame(
    columns=[
        "method", 'TP', 'FP', 'TN', 'FN',
        'Accuracy', 'Precision', 'Recall', 'F1_score', 'Training Time(s)',
    ]
)
index = df_all.index
index.name = dataset_name
numb = len(df_all) + 1
#**********************************************************************KNN
print('*****************************************************************KNN from PYOD lib')
from pyod.models.knn import KNN
from sklearn.metrics import f1_score, recall_score, precision_score

model_name_2 = 'KNN'
clf_name = 'KNN'

# Unsupervised kNN outlier detector. The contamination estimated from the data
# is handed to the detector instead of leaving it at the library default, so
# the estimate computed earlier in the script is actually used.
clf = KNN(contamination=contamination)
start = time.time()
# Unsupervised fit: no labels are passed (PyOD emits a UserWarning when y is
# supplied to fit).
clf.fit(X_train)
# Predictions on the test data.
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores_knn = clf.decision_function(X_test)  # outlier scores
stop = time.time()
# NOTE: the measured interval covers fitting and scoring the test set.
train_time_knn = stop - start
print(f"Training time: {train_time_knn}s")

# Binarise at 0.5 so the metrics receive plain 0/1 integers.
predictions = [1 if value > 0.5 else 0 for value in y_test_pred]
predictions_2 = predictions
accuracy = accuracy_score(y_test, predictions)
accuracy_2 = accuracy * 100.0

# Weighted precision / recall / F1, restricted to the labels that were
# actually predicted.
predicted_labels = np.unique(predictions)
precision = precision_score(y_test, predictions, average='weighted', labels=predicted_labels)
recall = recall_score(y_test, predictions, average='weighted', labels=predicted_labels)
# The result gets its own name: rebinding `f1_score` would shadow the imported
# function for every later call.
f1 = f1_score(y_test, predictions, average='weighted', labels=predicted_labels)
f1_score_2 = np.mean(f1)
precision_2 = np.mean(precision)
recall_2 = np.mean(recall)

# Per-class report.
print(classification_report(y_test, predictions_2))

# Confusion matrix, computed once. labels=[0, 1] keeps it 2x2 so the
# four-way unpacking also works when one class is missing.
cf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1])
tn, fp, fn, tp = cf_matrix.ravel()
#**********************************************************************KNN_sklearn
print('*****************************************************************KNN from sklearn lib')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, recall_score, precision_score

model_name_6 = 'KNN_sklearn'

# Supervised kNN classifier: unlike an unsupervised outlier detector it is
# trained on the labels, so its scores are not a like-for-like comparison.
neigh = KNeighborsClassifier()
start = time.time()
neigh.fit(X_train, y_train)
# Predictions on the test data.
y_test_pred_6 = neigh.predict(X_test)
stop = time.time()
# NOTE: the measured interval covers fitting and predicting the test set.
train_time_knn_sklearn = stop - start
print(f"Training time: {train_time_knn_sklearn}s")
#*****************************************************
# Binarise at 0.5 so the metrics receive plain 0/1 integers.
predictions = [1 if value > 0.5 else 0 for value in y_test_pred_6]
predictions_6 = predictions
accuracy = accuracy_score(y_test, predictions)
accuracy_6 = accuracy * 100.0

# Weighted precision / recall / F1, restricted to the labels that were
# actually predicted.
predicted_labels = np.unique(predictions)
precision = precision_score(y_test, predictions, average='weighted', labels=predicted_labels)
recall = recall_score(y_test, predictions, average='weighted', labels=predicted_labels)
# The result gets its own name: rebinding `f1_score` would shadow the imported
# function for every later call.
f1 = f1_score(y_test, predictions, average='weighted', labels=predicted_labels)
f1_score_6 = np.mean(f1)
precision_6 = np.mean(precision)
recall_6 = np.mean(recall)

# Per-class report.
print(classification_report(y_test, predictions_6))

# Confusion matrix, computed once. labels=[0, 1] keeps it 2x2 so the
# four-way unpacking also works when one class is missing.
cf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1])
tn, fp, fn, tp = cf_matrix.ravel()


Lösung des Problems

kNN als überwachter Klassifikator (wie in der Klasse KNeighborsClassifier von sklearn) sagt die Klasse eines Punkts anhand der Klassen seiner k nächsten Nachbarn voraus.

kNN als unüberwachter Ausreißerdetektor (wie in der KNN-Klasse von PyOD) misst die Entfernung zu den k nächsten Nachbarn und sagt voraus, dass ein Punkt ein Ausreißer ist, wenn diese Entfernung "groß" ist.

Es gibt also keinen Grund zu erwarten, dass sie dieselben Vorhersagen liefern: Für ein bestimmtes Problem muss die positive Klasse kein „Ausreißer" im herkömmlichen Sinne sein.

Keine Kommentare:

Kommentar veröffentlichen

Warum werden SCHED_FIFO-Threads derselben physischen CPU zugewiesen, obwohl CPUs im Leerlauf verfügbar sind?

Lösung des Problems Wenn ich das richtig verstehe, versuchen Sie, SCHED_FIFO mit aktiviertem Hyperthreading ("HT") zu verwenden, ...