中文分词 : 利用开源的分词工具jieba进行分词处理
在进行垃圾邮件过滤的时候,既需要注意垃圾邮件的拦截率(召回率),也需要注意正常邮件被当成垃圾邮件的错判率(精确率), 在当前项目中,我们主要考虑召回率这个指标。
$$P(\text{召回率}) = \frac{A}{A+C}$$
步骤1.处理索引文件,处理结果如:'/028/239': '0', '/028/240': '0', '/028/241': '1',
3.将邮件对应标签添加到文件内容后 文件内容,0 文件内容,1
import os


def read_index_file(file_path):
    """Parse the corpus index file into a ``{relative_path: label}`` dict.

    Each usable line has the form ``"<spam|ham> ../data/<dir>/<file>"``.
    The ``../data`` prefix and the trailing newline are stripped, so keys
    look like ``"/028/239"``. The label is mapped to ``"1"`` for spam and
    ``"0"`` for ham. Lines that do not split into exactly two
    space-separated fields are skipped.

    Raises:
        KeyError: if a two-field line carries a label other than spam/ham.
    """
    type_dict = {"spam": "1", "ham": "0"}
    index_dict = {}
    with open(file_path) as index_file:
        for line in index_file:
            arr = line.split(" ")
            if len(arr) != 2:
                continue
            key, value = arr
            value = value.replace("../data", "").replace("\n", "")
            index_dict[value] = type_dict[key.lower()]
    return index_dict


def read_file(file_path):
    """Read one mail file and return its header fields and body.

    The returned dict may contain the keys ``"from"``, ``"to"``, ``"data"``
    (the ``Date:`` header; the key spelling is kept because
    ``process_file`` reads it) and ``"content"``. Header values keep
    whatever follows the ``From:`` / ``To:`` / ``Date:`` prefix. The body
    is every stripped line after the first blank line, concatenated
    without separators; ``"content"`` is absent when the file has no blank
    line.
    """
    content_dict = {}
    body_parts = []
    is_content = False
    with open(file_path, "r", encoding="gb2312", errors="ignore") as mail:
        for line in mail:
            line = line.strip()
            if is_content:
                # Once the body has started, header-looking lines (e.g. a
                # quoted "From:" in a forwarded mail) are body text and
                # must not overwrite the real header fields.
                body_parts.append(line)
            elif line.startswith("From:"):
                content_dict["from"] = line[5:]
            elif line.startswith("To:"):
                content_dict["to"] = line[3:]
            elif line.startswith("Date:"):
                content_dict["data"] = line[5:]
            elif not line:
                # The first blank line separates headers from the body.
                is_content = True
    if is_content:
        content_dict["content"] = "".join(body_parts)
    return content_dict


def process_file(file_path):
    """Return one mail as a ``from,to,date,content`` comma-separated record.

    Commas inside each field are removed so the record always has exactly
    four fields. Missing fields are written as ``"unkown"`` (spelling kept
    so newly generated files match previously generated ones).
    """
    content_dict = read_file(file_path)
    fields = (
        content_dict.get(key, "unkown").replace(",", "").strip()
        for key in ("from", "to", "data", "content")
    )
    return ",".join(fields)


def main():
    """Label every mail under ./data/data and merge the per-folder results."""
    index_dict = read_index_file('./data/full/index')
    list0 = os.listdir('./data/data')

    # One intermediate file per sub-folder: "<from>,<to>,<date>,<content>,<label>".
    for l1 in list0:
        l1_path = './data/data/' + l1
        print('开始处理文件夹:' + l1_path)
        list1 = os.listdir(l1_path)
        print(list1)
        write_file_path = './data/process01_' + l1
        with open(write_file_path, "w", encoding='utf-8') as writer:
            for l2 in list1:
                l2_path = l1_path + "/" + l2
                index_key = "/" + l1 + "/" + l2
                print(index_key)
                # Mails without an index entry have no label and are skipped.
                if index_key in index_dict:
                    content_str = process_file(l2_path)
                    writer.write(content_str + "," + index_dict[index_key] + "\n")

    # Concatenate the intermediate files into the final data set.
    with open('./data/result_process01', 'w', encoding='utf-8') as writer:
        for l1 in list0:
            file_path = './data/process01_' + l1
            print("开始合并文件:" + file_path)
            with open(file_path, encoding='utf-8') as part:
                writer.writelines(part)


if __name__ == "__main__":
    main()
map函数与lambda 匿名函数用法
import re
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Use a CJK-capable font so Chinese labels in plots are not garbled.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False

# 1. Load the merged "from,to,date,content,label" records.
df = pd.read_csv("./data/result_process01", sep=",", header=None,
                 names=["from", "to", "date", "content", "label"])


# 2(1). Feature engineering: mail-server domain of the sender and recipient.
def extract_email_server_address(str1):
    """Return the first mail-server domain found in *str1*.

    The value is converted with ``str()`` first, so non-string input
    (e.g. NaN for a missing header) is accepted. When no ``@domain.tld``
    pattern is present the string ``"unknown"`` is returned. If several
    addresses occur, only the first one is used.
    """
    matches = re.findall(r"@([A-Za-z0-9]*\.[A-Za-z0-9\.]+)", str(str1))
    return matches[0] if matches else "unknown"


df["to_address"] = df["to"].map(extract_email_server_address)
df["from_address"] = df["from"].map(extract_email_server_address)

# 2(2). Feature engineering: how many distinct mail servers are there?
print("=================to address================")
print(df.to_address.value_counts().head(10))
print("总邮件接收服务器类别数量为:" + str(df.to_address.unique().shape))
print("=================from address================")
print(df.from_address.value_counts().head(5))
print("总邮件发送服务器类别数量为:" + str(df.from_address.unique().shape))

# Filter the counts Series directly: the column name produced by
# value_counts().to_frame() differs between pandas versions.
from_address_counts = df.from_address.value_counts()
len_less_10_from_adderss_count = from_address_counts[from_address_counts <= 10].shape
print("发送邮件数量小于10封的服务器数量为:" + str(len_less_10_from_adderss_count))
# For the five most frequent sender servers, compare the counts over all
# mail, over normal mail (label 0) and over spam (label 1).
top_sender_reports = (
    ("所有发送邮件情况", df),
    ("所有的正常邮件的发送情况", df[df.label == 0.0]),
    ("所有的异常邮件的发送情况", df[df.label == 1.0]),
)
for report_title, report_frame in top_sender_reports:
    print(report_title)
    print(report_frame.from_address.value_counts().head(5))
# Based on the statistics above (only a sample was shown; a full analysis
# should look at every sender), mail sent from 163.com, 126.com, tom.com or
# 12.com (assumed) is very likely spam, while mail sent from
# mail.tsinghua.edu.cn / mails.tsinghua.edu.cn / cernet.com is very likely
# normal mail. Build one 0/1 indicator column per sender group.
sender_groups = {
    'from_12': ('12.com',),
    'from_163': ('163.com',),
    'from_126': ('126.com',),
    'from_tom': ('tom.com',),
    'from_unknown': ('unknown',),
    # Both Tsinghua hosts count; the second entry used to repeat the first.
    'from_tsinghua': ('mail.tsinghua.edu.cn', 'mails.tsinghua.edu.cn'),
    'from_cernet': ('cernet.com',),
}
for flag_column, domains in sender_groups.items():
    df[flag_column] = df['from_address'].isin(domains).astype('int64')
df.head(2)
0 Thu 1 Aug 1996 02:09:47 -0500
1 Tue 20 Aug 1996 11:43:05 -0400
2 Wed 21 Aug 1996 15:14:11 +1000
3 Wed 21 Aug 1996 15:14:14 +1000
4 Thu 19 Sep 1996 19:21:19 -0400 (EDT)
5 Tue 10 Sep 1996 12:47:21 +0700
6 Tue 17 Sep 1996 15:45:15 -0400
7 Fri 20 Sep 1996 18:59:32 -0540
8 Sat 21 Sep 1996 09:59:46 -0400 (EDT)
9 Mon 23 Sep 1996 02:56:50 -0540
10 Tue 24 Sep 1996 23:39:23 +0700
11 Thu 03 Oct 1996 13:47:49 -0500 (CDT)
# Inspect which string lengths occur in the raw date column and which
# distinct raw values belong to each length.
dates = np.unique([str(raw).strip() for raw in df['date']])
date_lengths = np.unique([len(text) for text in dates])
print("各个字符串长度:")
print(date_lengths)
print("各个长度对应的时间格式:")
for length in date_lengths:
    same_length_values = [
        raw for raw in df['date'] if len(str(raw).strip()) == length
    ]
    print(np.unique(same_length_values))
def extract_email_date(str1):
    """Derive ``(week, hour, time_quantum)`` string features from a date value.

    Non-string input is converted with ``str()`` first. The result depends
    on the length of the string:

    * shorter than 10 characters: all three fields are ``"unknown"``;
    * exactly 16, 19 or 21 characters: fixed values are returned
      (NOTE(review): these look like hand-assigned special cases for
      malformed dates in this data set - confirm);
    * otherwise the weekday abbreviation and the hour are parsed with a
      regular expression, and the hour is bucketed into a time quantum:
      ``"3"`` for hour < 8, ``"0"`` for < 13, ``"1"`` for < 19, else
      ``"2"``. If the pattern does not match exactly once, all fields are
      ``"unknown"``.

    Returns:
        tuple[str, str, str]: lower-cased ``(week, hour, time_quantum)``.
    """
    if not isinstance(str1, str):
        str1 = str(str1)
    str_len = len(str1)
    unknown = ("unknown", "unknown", "unknown")

    if str_len < 10:
        return unknown
    if str_len == 16:
        it = re.findall(r"(\d{2}):\d{2}", str1)
        hour = it[0] if len(it) == 1 else "unknown"
        return ("fri", hour, "0")
    if str_len == 19:
        return ("sep", "01", "3")
    if str_len == 21:
        return ("wed", "17", "1")

    it = re.findall(r"([A-Za-z]+\d?[A-Za-z]*) .*?(\d{2}):\d{2}:\d{2}.*", str1)
    if len(it) != 1 or len(it[0]) != 2:
        return unknown

    # Keep only the last three letters of the leading word as the weekday.
    week = it[0][0][-3:]
    hour = it[0][1]
    int_hour = int(hour)
    if int_hour < 8:
        time_quantum = "3"
    elif int_hour < 13:
        time_quantum = "0"
    elif int_hour < 19:
        time_quantum = "1"
    else:
        time_quantum = "2"
    return (week.lower(), hour, time_quantum)


# Apply the extraction to every row and store the three derived columns.
data_time_extract_result = [extract_email_date(st) for st in df["date"]]
df["date_week"] = [t[0] for t in data_time_extract_result]
df["date_hour"] = [t[1] for t in data_time_extract_result]
df["date_time_quantum"] = [t[2] for t in data_time_extract_result]
print(df.head(2))

print("=======星期属性字段描述======")
print(df.date_week.value_counts().head(3))
print(df[["date_week", "label"]].groupby(["date_week", "label"])["label"].count())

print("=======小时属性字段描述======")
print(df.date_hour.value_counts().head(3))
print(df[['date_hour', 'label']].groupby(['date_hour', 'label'])['label'].count())

print("=======时间段属性字段描述======")
# Report the time-quantum column here (this section used to repeat the
# hour counts).
print(df.date_time_quantum.value_counts().head(3))
print(df[["date_time_quantum", "label"]].groupby(["date_time_quantum", "label"])["label"].count())

# 1 when a weekday could be extracted from the date, 0 otherwise.
df["has_date"] = (df["date_week"] != "unknown").astype("int64")
print(df.head(2))
import bisect

import jieba  # required by jieba.cut below; it was used without being imported

# Segment the mail body into space-separated words.
df['content'] = df['content'].astype('str')
df['jieba_cut_content'] = [" ".join(jieba.cut(st)) for st in df['content']]

# Inclusive upper bounds of the content-length buckets 0..13; anything
# longer than the last bound falls into bucket 14.
_CONTENT_LENGTH_BOUNDS = (
    10, 100, 500, 1000, 1500, 2000, 2500,
    3000, 4000, 5000, 10000, 20000, 30000, 50000,
)


def precess_content_length(lg):
    """Map a content length to its bucket index (0-14).

    Bucket ``i`` holds lengths ``<= _CONTENT_LENGTH_BOUNDS[i]`` that are
    greater than the previous bound; lengths above 50000 map to 14.
    """
    return bisect.bisect_left(_CONTENT_LENGTH_BOUNDS, lg)


df['content_length'] = df['content'].map(len)
df['content_length_type'] = df['content_length'].map(precess_content_length)

# Number of normal / spam mails per length bucket.
df21 = df.groupby(['content_length_type', 'label'])['label'].agg(['count'])
print(df21)
content_length_type label
0 0 11
1 16
1 0 46
1 115
2 0 545
1 1035
3 0 490
1 924
4 0 452
1 757
5 0 242
1 366
6 0 129
1 247
7 0 97
1 188
8 0 187
1 354
9 0 54
1 105
10 0 149
1 258
11 0 23
1 57
12 0 19
1 24
13 0 2
1 5
14 0 7
1 13
df2 = df21.reset_index()
# Spam counts per length bucket.
df3 = df2[df2.label == 1][['content_length_type', 'count']].rename(columns={'count': 'c1'})
# Normal-mail counts per length bucket.
df4 = df2[df2.label == 0][['content_length_type', 'count']].rename(columns={'count': 'c2'})
# Join both counts on the bucket id (the only shared column).
df5 = pd.merge(df3, df4, on='content_length_type')

# Share of spam, share of normal mail, and the normal/spam ratio per bucket.
bucket_total = df5['c1'] + df5['c2']
df5['c1_rage'] = df5['c1'] / bucket_total
df5['c2_rage'] = df5['c2'] / bucket_total
df5['c3_rage'] = df5['c2'] / df5['c1']
print(df5.head(10))

# Plot the three ratios against the bucket id (printed and drawn once; the
# section used to be duplicated verbatim).
plt.plot(df5['content_length_type'], df5['c1_rage'], label=u'垃圾邮件比例')
plt.plot(df5['content_length_type'], df5['c2_rage'], label=u'正常邮件比例')
plt.plot(df5['content_length_type'], df5['c3_rage'], label=u'正常邮件/垃圾邮件')
plt.grid(True)
plt.legend(loc=0)
plt.show()
结果:content_length_type c1 c2 c1_rage c2_rage c3_rage
0 0 16 11 0.592593 0.407407 0.687500
1 1 115 46 0.714286 0.285714 0.400000
2 2 1035 545 0.655063 0.344937 0.526570
3 3 924 490 0.653465 0.346535 0.530303
4 4 757 452 0.626137 0.373863 0.597094
5 5 366 242 0.601974 0.398026 0.661202
6 6 247 129 0.656915 0.343085 0.522267
7 7 188 97 0.659649 0.340351 0.515957
8 8 354 187 0.654344 0.345656 0.528249
9 9 105 54 0.660377 0.339623 0.514286
def precess_content_sema(x):
    """Turn a content length *x* into a smooth numeric "signal" feature.

    The value is ``0.5 / exp(log10(x) - log10(500)) + log(|x - 500| + 1) + 1``;
    for ``x > 10000`` the term ``log(|x - 10000|)`` is additionally
    subtracted.
    """
    score = 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1)
    if x > 10000:
        score = score - np.log(abs(x - 10000))
    return score + 1


df["content_sema"] = [precess_content_sema(length) for length in df["content_length"]]
print(df.head(2))
=特征筛选 写入文件
通过df.info ()查看当前列信息
RangeIndex: 6917 entries, 0 to 6916
Data columns (total 22 columns):
from 6916 non-null object
to 6889 non-null object
date 6914 non-null object
content 6917 non-null object
label 6917 non-null int64
to_address 6917 non-null object
from_address 6917 non-null object
from_12 6917 non-null int64
from_163 6917 non-null int64
from_126 6917 non-null int64
from_tom 6917 non-null int64
from_unknown 6917 non-null int64
from_tsinghua 6917 non-null int64
from_cernet 6917 non-null int64
date_week 6917 non-null object
date_hour 6917 non-null object
date_time_quantum 6917 non-null object
has_date 6917 non-null int64
jieba_cut_content 6917 non-null object
content_length 6917 non-null int64
content_length_type 6917 non-null int64
content_sema 6917 non-null float64
dtypes: float64(1), int64(11), object(10)
memory usage: 1.2+ MB
# Remove the raw and intermediate columns in place, then show what is left.
columns_to_remove = [
    "from",
    "to",
    "date",
    "content",
    "to_address",
    "from_address",
    "date_week",
    "date_hour",
    "date_time_quantum",
]
df.drop(columns=columns_to_remove, inplace=True)
df.info()
RangeIndex: 6917 entries, 0 to 6916
Data columns (total 13 columns):
label 6917 non-null int64
from_12 6917 non-null int64
from_163 6917 non-null int64
from_126 6917 non-null int64
from_tom 6917 non-null int64
from_unknown 6917 non-null int64
from_tsinghua 6917 non-null int64
from_cernet 6917 non-null int64
has_date 6917 non-null int64
jieba_cut_content 6917 non-null object
content_length 6917 non-null int64
content_length_type 6917 non-null int64
content_sema 6917 non-null float64
dtypes: float64(1), int64(11), object(1)
memory usage: 702.6+ KB
# Write the remaining feature columns to CSV (UTF-8, without the index
# column); the model-training code reads this same path back.
df.to_csv("../data/result_process02" ,encoding="utf-8" ,index=False )
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Use a CJK-capable font so Chinese labels in plots are not garbled.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False

# 1. Load the engineered features and drop rows with any missing value.
df = pd.read_csv("../data/result_process02", encoding="utf-8", sep=",")
df.dropna(axis=0, how="any", inplace=True)
print(df.head())
df.info()  # info() prints by itself and returns None

# 2. Hold out 20% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(
    df[["has_date", "jieba_cut_content", "content_sema"]],
    df["label"], test_size=0.2, random_state=0)
print("训练数据集大小:%d" % x_train.shape[0])
print("测试数据集大小:%d" % x_test.shape[0])
print(x_train.head())

# 3. Text features: TF-IDF on the segmented content, reduced to 20
#    components with truncated SVD. Both are fitted on the training set only.
transformer = TfidfVectorizer(norm="l2", use_idf=True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train["jieba_cut_content"].astype("str"))
transformer_model = transformer.fit(jieba_cut_content)
df1 = transformer_model.transform(jieba_cut_content)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)

# String column names: recent scikit-learn versions reject frames that mix
# integer and string column names.
svd_columns = ["svd_%d" % i for i in range(df2.shape[1])]
data = pd.DataFrame(df2, columns=svd_columns)
print(data.head())
data.info()

# 4. Append the non-text features (positional, hence the list() conversion).
data["has_date"] = list(x_train["has_date"])
data["content_sema"] = list(x_train["content_sema"])
print("========数据合并后的data信息========")
print(data.head())
data.info()

# 5. Fit the Bernoulli naive Bayes model and time the fit.
t1 = time.time()
nb = BernoulliNB(alpha=1.0, binarize=0.0005)
model = nb.fit(data, y_train)
t = time.time() - t1
print("贝叶斯模型构建时间为:%.5f ms" % (t * 1000))

# 6. Build the test matrix with the already-fitted transformers.
jieba_cut_content_test = list(x_test["jieba_cut_content"].astype("str"))
data_test = pd.DataFrame(
    svd_model.transform(transformer_model.transform(jieba_cut_content_test)),
    columns=svd_columns)
data_test["has_date"] = list(x_test["has_date"])
data_test["content_sema"] = list(x_test["content_sema"])
print(data_test.head())
data_test.info()

# 7. Evaluate. precision_score is precision (精确率), not accuracy (准确率).
y_predict = model.predict(data_test)
print("精确率为:%.5f" % precision_score(y_test, y_predict))
print("召回率为:%.5f" % recall_score(y_test, y_predict))
print("F1值为:%.5f" % f1_score(y_test, y_predict))
# Build the Bernoulli naive Bayes classifier and fit it on the training
# matrix. The fit call sits on its own line so the comment cannot swallow it.
nb = BernoulliNB(alpha=1.0, binarize=0.0005)
model = nb.fit(data, y_train)
from sklearn.svm import SVC  # SVC is not imported anywhere else in the file

# Support-vector classifier with an RBF kernel, fitted on the training
# matrix. NOTE(review): `degree` only matters for the polynomial kernel
# according to the scikit-learn docs - confirm it can be dropped.
svc = SVC(C=1, kernel='rbf', degree=3, gamma=0.001)
model = svc.fit(data, y_train)
%%time forest = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=3, random_state=0) model = forest.fit(data, y_train)
from sklearn.neighbors import KNeighborsClassifier  # not imported elsewhere in the file

# k-nearest-neighbours classifier (k = 2), fitted on the training matrix.
knn = KNeighborsClassifier(n_neighbors=2)
model = knn.fit(data, y_train)
%%time gb = GradientBoostingClassifier(learning_rate=0.01, n_estimators=100, max_depth=3, min_samples_split=50, loss='deviance', random_state=0) model = gb.fit(data, y_train)
%%time tree = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=0) model = tree.fit(data, y_train)