awk 文本处理

清华大佬耗费三个月吐血整理的几百G的资源,免费分享!....>>>

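############################# Program notes #############################
#1. Output goes to a "result" folder created in the parent of the current working directory.
#2. Inside "result", create four folders: url_data, url_result, app_result, app_data.
#3. url_data    - URL records still to be analysed.
#4. url_result  - URL records already classified as noise.
#5. app_result  - records already classified as APP traffic.
#6. app_data    - APP records still to be processed.
#7. result/log_out.log - start/end timestamps of each run.
#8. Input field mapping:
#   $1:IMSI  $2:MDN  $3:MEID  $4:DestinationIP  $5:DestinationPort  $6:SourceIP
#   $7:SourcePort  $8:ProtocolID  $17:ServiceType  $10:StartTime  $11:EndTime
#   $12:Duration  $13:InputOctets  $14:OutputOctets  $26:DestinationURL
#   NOTE(review): the script below tests $16 as the protocol type, which is not
#   listed in this mapping - confirm against the real record layout.
########################################################################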
#!/bin/sh
awk -F '|' '
# Splits pipe-delimited traffic records (*.txt in the current directory) into
# several comma-separated output files under ../result/:
#   url_data/<host>[_<NR>]_url.txt         HTTP/WAP records with a usable URL
#   url_result/<host>[_<NR>]_noice.txt     noise (static resources, incomplete records)
#   app_result/<host>[_<NR>]_app_a19.txt   protocol 3/4/5 records with service 399/499/599
#   app_result/<host>_app_result.txt       protocol 6/8 records with service 699/899
#   app_data/<host>_app_data.txt           protocol 7 records with service 799
# Start and end timestamps are appended to ../result/log_out.log.
# Requires an awk that provides strftime() (for example gawk).

# Append the current record to the noise file.
function emit_noise() {
	print fruit_num, $26, $16, $17, 1, is_null >> "../result/url_result/" noice_filename;
}

BEGIN {
	# The host name is used as a per-machine prefix so that outputs produced on
	# different machines do not collide.
	"hostname" | getline file_name_everyone;
	close("hostname");

	OFS = ",";
	is_null = "";
	app_flag = "a19";
	http_prefix = "http://";

	url_filename        = file_name_everyone "_url.txt";
	noice_filename      = file_name_everyone "_noice.txt";
	app_a19_filename    = file_name_everyone "_app_a19.txt";
	app_result_filename = file_name_everyone "_app_result.txt";
	app_data_filename   = file_name_everyone "_app_data.txt";

	print "......解析文件开始........"  strftime("%Y-%m-%d %H:%M:%S") >> "../result/log_out.log";
}

END {
	print "......解析文件结束........"  strftime("%Y-%m-%d %H:%M:%S") >> "../result/log_out.log";
}

{
	# Output rotation: every 3200000 input records (roughly 130 MB; 57000000 is
	# roughly 2 GB) switch to new file names carrying the record number.
	# The previous files are closed first so open handles do not pile up.
	# NOTE(review): app_result and app_data outputs are not rotated, as in the
	# original script - confirm this is intended.
	if (NR % 3200000 == 0) {
		close("../result/url_data/" url_filename);
		close("../result/url_result/" noice_filename);
		close("../result/app_result/" app_a19_filename);

		url_filename     = file_name_everyone "_" NR "_url.txt";
		noice_filename   = file_name_everyone "_" NR "_noice.txt";
		app_a19_filename = file_name_everyone "_" NR "_app_a19.txt";
	}

	# Every output row is keyed by field 2.
	fruit_num = $2;

	# A record is noise when its URL references a static or binary resource.
	# ".js" must be followed by a non-word character or the end of the URL so
	# that ".jsp" and similar are not caught. (In awk regexps "\b" means
	# backspace, not word boundary, hence the explicit alternation.)
	is_static = (tolower($26) ~ /\.js([^[:alnum:]_]|$)|\.img|\.inf|\.dat|\.dwr|\.fla|\.mp4|\.cmr|\.asm|\.cfg|\.amr|\.war|\.tdz|\.md5|\.jar|\.cmd|\.gif|\.png|\.jpeg|\.bmp|\.def|\.jpg|\.css|\.ico|\.cur|\.swf|\.txt|\.avi|\.xml|\.zip|\.cab|\.crl|\.mp3|\.tpt|\.fcg|\.lrc|\.action|\.rar|\.m4a|\.idx|\.exe|\.dll|\.ini|\.vbs|\.doc|\.flv/);

	# Fields 5, 6, 7, 8 and 16 must all be non-empty for a record to be usable.
	is_complete = (length($5) > 0 && length($6) > 0 && length($7) > 0 && length($8) > 0 && length($16) > 0);

	if (is_static || !is_complete) {
		emit_noise();
		next;
	}

	if ($16 == 1 || $16 == 2) {
		# Protocol type 1 or 2: HTTP or WAP.
		if (length($26) == 0) {
			emit_noise();
			next;
		}

		# Split the URL on "/" and remember how many parts it has.
		part_count = split($26, url, "/");
		# value[1] is everything before the first "?"; the query string is
		# whatever follows that "?".
		split($26, value, "?");
		query = substr($26, length(value[1]) + 2);

		# With a scheme the parts are "http:", "", host, path1, path2, ...
		# so the host sits two positions further right than without one.
		scheme = tolower(url[1]);
		if (scheme == "http:" || scheme == "https:") {
			offset = 2;
			full_url = $26;
		} else {
			offset = 0;
			full_url = http_prefix $26;
		}

		# When the URL has exactly three parts after the scheme, the third
		# column is left empty; otherwise it carries the third part (which is
		# empty anyway when the URL is shorter).
		if (part_count == offset + 3) {
			third = is_null;
		} else {
			third = url[offset + 3];
		}

		print fruit_num, $16, $17, full_url, url[offset + 1], url[offset + 2], third, query >> "../result/url_data/" url_filename;

	} else if ($16 == 3 || $16 == 4 || $16 == 5) {
		# Protocol 3: SMTP, 4: POP3, 5: IMAP4.
		if ($17 == 399 || $17 == 499 || $17 == 599) {
			print fruit_num, $26, $16, $17, app_flag, is_null, is_null >> "../result/app_result/" app_a19_filename;
		}

	} else if ($16 == 6 || $16 == 8) {
		# Protocol 6: FTP, 8: MMS.
		if ($17 == 699 || $17 == 899) {
			print fruit_num, $26, $16, $17, 0, -1, is_null >> "../result/app_result/" app_result_filename;
		}

	} else if ($16 == 7) {
		# Protocol 7: RTSP.
		if ($17 == 799) {
			print fruit_num, $5, $6, $16, $17 >> "../result/app_data/" app_data_filename;
		}
	}
}' *.txt