本文共 8757 字,大约阅读时间需要 29 分钟。
本文使用 Moore 数据集进行网络流量分类,并比较 BP 神经网络、CNN、朴素贝叶斯、决策树、KNN、SVM 等模型的效果。
下载地址 使用moore做流量分类 这里用了BP神经网络,CNN神经网络,朴素贝叶斯,决策树,KNN,SVM导入的模块有import tensorflow as tfimport numpy as npimport osfrom tensorflow.keras.models import Sequentialfrom tensorflow.keras import layersfrom sklearn import metrics, neighborsfrom tensorflow import kerasfrom sklearn.svm import SVCfrom sklearn.naive_bayes import GaussianNBfrom sklearn.neighbors import KNeighborsClassifierfrom sklearn.tree import DecisionTreeClassifierfrom sklearn.metrics import confusion_matrix, roc_curveimport matplotlib.pyplot as pltfrom sklearn.model_selection import train_test_splitimport timefrom sklearn.ensemble import RandomForestClassifier
数据处理
# The 12 traffic classes of the Moore dataset; a row's label is encoded as
# its index in this list.
list_y = ['WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTACK', 'P2P',
          'DATABASE', 'FTP-DATA', 'MULTIMEDIA', 'SERVICES',
          'INTERACTIVE', 'GAMES']


def data_prepross(filenames):
    """Read Moore .arff files and build (features, labels) lists.

    Each data row is parsed into 248 numeric features; 'Y'/'N' become 1/0,
    rows with more than 8 missing ('?') fields are dropped, remaining '?'
    become 0. Eight copies of the row mean are appended (to reach 256
    values) and elementwise Gaussian white noise N(0, 1) is added, so each
    sample comes back as a length-256 numpy array.

    Parameters: filenames -- iterable of .arff file names in the CWD.
    Returns: (X, Y) where X is a list of 256-element arrays and Y a list of
    class indices into list_y.
    """
    X, Y = [], []
    for fname in filenames:
        print(fname)
        with open(os.getcwd() + '/' + fname, 'r') as file:
            # NOTE(review): the first 253 lines are assumed to be the ARFF
            # header; data rows follow — confirm against the dataset files.
            for row in file.readlines()[253:]:
                # 'Y'/'N' -> '1'/'0'. This also corrupts class names that
                # contain 'N' (FTP-CONTROL, INTERACTIVE); repaired below.
                row = row.replace('Y', '1')
                row = row.replace('N', '0')
                fields = row.split(',')
                if fields.count('?') > 8:
                    continue  # too many missing values — drop the row
                row = row.replace('\n', '')
                known = [float(v) for v in row.split(',')[:-1] if v != '?']
                mean_val = sum(known) / len(known)
                # Missing values are filled with 0 (despite the mean being
                # computed above); the mean is instead used as padding.
                row = row.replace('?', str(0))
                # 248 features + 8 mean-padding values = 256, then an
                # ndarray of noise is added elementwise (list + ndarray
                # broadcasts, so x ends up a numpy array).
                x = [float(v) for v in row.split(',')[:-1]] \
                    + [mean_val] * 8 + np.random.normal(0, 1, 256)
                label = row.split(',')[-1]
                # Undo the damage the N->0 substitution did to class names.
                label = label.replace('FTP-CO0TROL', 'FTP-CONTROL')
                label = label.replace('I0TERACTIVE', 'INTERACTIVE')
                X.append(x)
                Y.append(list_y.index(label))
    return X, Y


# Load all ten entry files, split 75/25, convert to tensors and L2-normalize
# each sample row.
total_x, total_y = data_prepross(
    ['entry01.weka.allclass.arff', 'entry02.weka.allclass.arff',
     'entry03.weka.allclass.arff', 'entry04.weka.allclass.arff',
     'entry05.weka.allclass.arff', 'entry09.weka.allclass.arff',
     'entry10.weka.allclass.arff', 'entry07.weka.allclass.arff',
     'entry08.weka.allclass.arff', 'entry06.weka.allclass.arff'])
train_x, test_x, train_y, test_y = train_test_split(
    total_x, total_y, test_size=0.25, random_state=0)
train_x = tf.convert_to_tensor(train_x, dtype=tf.float32)
train_y = tf.convert_to_tensor(train_y, dtype=tf.int32)
test_x = tf.convert_to_tensor(test_x, dtype=tf.float32)
test_y = tf.convert_to_tensor(test_y, dtype=tf.int32)
train_x = tf.keras.utils.normalize(train_x, axis=1)
test_x = tf.keras.utils.normalize(test_x, axis=1)
BP神经网络
num_classes = 12   # one output unit per traffic class in list_y
num_pixels = 256   # flattened 16x16 sample
# BUG FIX: 'batchsize' was referenced in baseline() but never defined
# anywhere in the file (NameError at fit time); 128 matches the CNN's
# batch size.
batchsize = 128


def baseline_model():
    """Build a single-hidden-layer BP (MLP) classifier: 256 -> 256 -> 12."""
    model = Sequential()
    model.add(layers.Dense(num_pixels, input_dim=num_pixels,
                           activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))
    model.summary()
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


def baseline():
    """Train and evaluate the BP network on the module-level split.

    Returns: (scores, predict_y) — Keras evaluate() output and the raw
    softmax predictions on the test set.
    """
    t1 = time.time()
    model = baseline_model()
    X_train = tf.reshape(train_x, [-1, 256])
    X_test = tf.reshape(test_x, [-1, 256])
    # BUG FIX: 'nb_epoch' is the long-removed Keras 1.x spelling; modern
    # tf.keras only accepts 'epochs'.
    history = model.fit(X_train, train_y,
                        validation_data=(X_test, test_y),
                        epochs=20, batch_size=batchsize, verbose=2)
    scores = model.evaluate(X_test, test_y, verbose=0)
    predict_y = model.predict(X_test)
    t2 = time.time()
    print("Baseline Error: %.2f%%" % (100 - scores[1] * 100), t2 - t1)
    print(history.history)
    return scores, predict_y
#CNN
def simple_CNNmodel():
    """Build a small LeNet-style CNN over 16x16x1 inputs, 12-way softmax."""
    model = keras.models.Sequential([
        layers.Conv2D(filters=8, kernel_size=(3, 3), padding='same',
                      input_shape=(16, 16, 1), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2), padding='same'),
        layers.Conv2D(filters=16, kernel_size=(3, 3), padding='same',
                      activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2), padding='same'),
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(12, activation='softmax'),
    ])
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


def simple_CNN():
    """Train and evaluate the CNN; samples are reshaped to 16x16x1 images.

    Returns: (scores, pred_y) — evaluate() output and raw test predictions.
    """
    t1 = time.time()
    model = simple_CNNmodel()
    X_train = tf.reshape(train_x, [-1, 16, 16, 1])
    X_test = tf.reshape(test_x, [-1, 16, 16, 1])
    model.summary()
    # BUG FIX: 'nb_epoch' is the long-removed Keras 1.x spelling; modern
    # tf.keras only accepts 'epochs'.
    history = model.fit(X_train, train_y,
                        validation_data=(X_test, test_y),
                        epochs=25, batch_size=128, verbose=2)
    scores = model.evaluate(X_test, test_y, verbose=0)
    t2 = time.time()
    pred_y = model.predict(X_test)
    print(scores)
    print("Baseline Error: %.2f%%" % (100 - scores[1] * 100), t2 - t1)
    print(history.history)
    return scores, pred_y
朴素贝叶斯
def Bayes(trainData, trainLable, testData, testLable):
    """Fit a Gaussian naive-Bayes model, then print the fit+predict time,
    the confusion matrix, and the test-set accuracy."""
    started = time.time()
    classifier = GaussianNB()
    classifier.fit(trainData, trainLable)
    predictions = classifier.predict(testData)
    finished = time.time()
    print(finished - started)
    print(confusion_matrix(testLable, predictions))
    print('The Accuracy of Naive Bayes Classifier is:',
          classifier.score(testData, testLable))
决策树
def DecisionTr(trainData, trainLable, testData, testLable):
    """Fit a decision tree, then print (elapsed seconds, accuracy) and the
    confusion matrix on the test set."""
    started = time.time()
    tree = DecisionTreeClassifier()
    tree.fit(trainData, trainLable)
    guesses = tree.predict(testData)
    accuracy = metrics.accuracy_score(testLable, guesses)
    finished = time.time()
    print(finished - started, accuracy)
    print(confusion_matrix(testLable, guesses))
SVM支持向量机
def SVM(trainData, trainLable, testData, testLable):
    """Fit an RBF-kernel SVC with default hyperparameters and print
    (elapsed seconds, test accuracy)."""
    started = time.time()
    machine = SVC()
    machine.fit(trainData, trainLable)
    guesses = machine.predict(testData)
    accuracy = metrics.accuracy_score(testLable, guesses)
    finished = time.time()
    print(finished - started, accuracy)
KNN
def Knn(trainData, trainLable, testData, testLable):
    """Fit a k-nearest-neighbours classifier (default k), then print
    (elapsed seconds, accuracy) and the test-set confusion matrix."""
    started = time.time()
    neighbours = KNeighborsClassifier()
    neighbours.fit(trainData, trainLable)
    guesses = neighbours.predict(testData)
    accuracy = metrics.accuracy_score(testLable, guesses)
    finished = time.time()
    print(finished - started, accuracy)
    print(confusion_matrix(testLable, guesses))


# Run KNN on the module-level split (tensors converted back to ndarrays).
Knn(train_x.numpy(), train_y.numpy(), test_x.numpy(), test_y.numpy())
灰度图片绘图
def plt_image(trainx, trainy):
    """Show one 16x16 grayscale sample image for each of the 12 classes.

    Parameters: trainx -- samples reshapeable to (16, 16);
    trainy -- numpy array of class indices (0..11, order of list_y).
    Raises IndexError if some class has no sample in trainy.
    """
    # Panel titles keep the original spellings ('classffication', 'ATTCK')
    # so the rendered output is unchanged.
    titles = ['WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTCK', 'P2P',
              'DATABASE', 'FTP-DATA', 'MULTIMEDIA', 'SERVICES',
              'INTERACTIVE', 'GAMES']
    plt.figure(num='classffication', figsize=(6, 12))
    for cls, title in enumerate(titles):
        # First occurrence of this class in the label array.
        idx = np.where(trainy == cls)[0][0]
        plt.subplot(3, 4, cls + 1)
        plt.title(title)
        # BUG FIX: the original read the global train_x instead of the
        # trainx parameter for the DATABASE, MULTIMEDIA and INTERACTIVE
        # panels; all panels now use the argument.
        plt.imshow(np.reshape(trainx[idx], (16, 16)))
    plt.show()
混淆矩阵绘图
def plot_confusion_matrix(title, pred_y):
    """Plot a row-normalized confusion matrix for the module-level test set.

    Parameters: title -- figure title; pred_y -- per-class probability
    matrix (argmax over axis 1 gives the predicted class).
    """
    # BUG FIX: the original line began with a stray markdown backtick
    # ("`def ..."), which is a SyntaxError; removed.
    cm = confusion_matrix(test_y, np.argmax(pred_y, 1))
    labels_name = list_y
    # Normalize each row so every true class sums to 1.
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest')
    plt.title(title)
    plt.colorbar()
    num_local = np.array(range(len(labels_name)))
    plt.xticks(num_local, labels_name, rotation=90)  # class names on x axis
    plt.yticks(num_local, labels_name)               # class names on y axis
    plt.ylabel('True')
    plt.xlabel('Predicted')
    plt.show()
转载地址:http://raben.baihongyu.com/