前一段时间因为要开发一个新项目,需要一个IP库。由于公司原本没有这样一个库,所以急需从文件中匹配出IP地址然后入库。下面是我写的一段程序,可能效率不是很高,代码也没有做过多的优化,有些地方比较繁琐,但这样处理以后脏数据相对来说会少一点,不过或多或少还是存在一定的脏数据。由于这个文件有40多万行,我采取的办法是按行数将这个相对来说很大的文件拆分成N多很小的文件,分布式分析入库的。当然你也可以通过多进程处理。一个问题有很多解决办法,我分享一下我的想法。当然如果您有更好的办法请多多指正,笔者虚心求教;我下面贴出的代码您也可以去优化,然后提出您的优化建议,我们一起探讨,共同进步。
我当时采用的是浏览器运行php文件来执行的,在这里我建议应该在linux环境下采用php命令行执行,这样效率会更高,而且不存在超时的问题。
- <?php
- /*
-  * Author: 马荣财
-  * Date: 2011/8/2 14:25:28
-  *
-  * Purpose: match the lines of an IP list file with regular expressions and
-  * import the resulting ranges into the database.
-  * The file to import is passed to parseAndImport() as $filename
-  * (full path of the IP list file, including the directory).
-  */
- /*
-  * Inspecting the file shows these main forms of location text:
-  * A: **省        e.g. 甘肃省
-  * B: **市**区    e.g. 北京市海淀区 or 浙江省宁波市北仑区
-  * C: ****大学    e.g. 清华大学
-  * D: ****市      e.g. 新疆, 宁夏, 西藏, 内蒙古 followed by **市
-  * E: everything else
-  */
- // Alternative to the ini_set() below that removes the limit entirely:
- //set_time_limit(0);
- ini_set('max_execution_time', '3000');
- $conn = mysql_connect('localhost','root','') or die('MYSQL 连接错误!');
- mysql_query("set names utf8");
- // Stop here when the database cannot be selected: every later query would fail anyway.
- mysql_select_db('test',$conn) or die('MYSQL 选择数据库错误: '.mysql_error());
- $result = mysql_query('select `cid`,`pid`,`name` from `tbl_city`');
- // mysql_query() returns false on failure; passing that to mysql_fetch_array()
- // would only raise warnings and leave $citys silently empty.
- if($result === false) {
-     die('MYSQL 查询错误: '.mysql_error());
- }
- // City table rows indexed by their `name` column, so a place name found in
- // the IP file can be resolved to its `pid` / `cid` with one array access.
- $citys = array();
- while($row=mysql_fetch_array($result)) {
-     $citys[$row['name']] = $row;
- }
- /**
-  * Return the province id and city id stored in the city table for a place name.
-  *
-  * @param array  $citys city rows keyed by name; the 'pid' and 'cid' entries of a row are read
-  * @param string $name  place name to look up
-  * @return array array($pid, $cid); an entry is '' when the name is unknown or the stored value is empty
-  */
- function lookupCityIds($citys, $name) {
-     $pid = empty($citys[$name]['pid']) ? '' : $citys[$name]['pid'];
-     $cid = empty($citys[$name]['cid']) ? '' : $citys[$name]['cid'];
-     return array($pid, $cid);
- }
- /**
-  * Classify the first word of a location description into country, province and city.
-  *
-  * Rules, in the order they are applied:
-  *   A. "<province>省"                          -> province only
-  *   C. contains 大学/学院 and neither 市 nor 州 -> not handled here, null is returned
-  *   D. starts with 内蒙古/西藏/新疆/宁夏/香港/澳门 -> that region, plus the text before 市 as city
-  *   B. contains 市 or 州                        -> "<province>省<city>" or "<province>市<city>区"
-  *   E. anything else                           -> the whole word is stored as the country
-  * Rule D is tested before rule B: with B first, every region name followed by a
-  * "...市" part was taken by B and the city handling of D could never run.
-  *
-  * @param string $cityStr first space-separated word of the description
-  * @param array  $citys   city rows keyed by name (see lookupCityIds())
-  * @return array|null array with keys country, province, city, pid, cid; null for rule C
-  */
- function classifyLocation($cityStr, $citys) {
-     $province = '';
-     $city = '';
-     $m = array();
-     if(preg_match('|(.*)省$|', $cityStr, $m)) {
-         // Rule A.
-         $province = $m[1];
-     } else {
-         $hasCityMarker = preg_match('|(.*?)市(.*)|', $cityStr, $m) || preg_match('|(.*?)州(.*)|', $cityStr, $m);
-         // Rule C: same strings that reached the university branch before (no 市/州 present).
-         if(!$hasCityMarker && (preg_match('|(.*)大学(.*)|', $cityStr) || preg_match('|(.*)学院(.*)|', $cityStr))) {
-             return null;
-         }
-         // Rule D: regions whose names do not end in 省.
-         $regionRest = null;
-         foreach(array('内蒙古', '西藏', '新疆', '宁夏', '香港', '澳门') as $region) {
-             if(strpos($cityStr, $region) === 0) {
-                 $province = $region;
-                 $regionRest = substr($cityStr, strlen($region));
-                 break;
-             }
-         }
-         if($regionRest !== null) {
-             if(mb_strpos($regionRest, '市') > 0) {
-                 $parts = explode('市', $regionRest);
-                 $city = $parts[0];
-             }
-         } elseif($hasCityMarker) {
-             // Rule B: $m[1] is the text before the first 市/州, $m[2] the text after it.
-             if(mb_strpos($m[1], '省') > 0) {
-                 $parts = explode('省', $m[1]);
-                 $province = $parts[0];
-                 $city = $parts[1];
-             } else {
-                 $province = $m[1];
-                 if(mb_strpos($m[2], '区') > 0) {
-                     // explode() takes the delimiter first; the arguments used to be
-                     // swapped, which made the city always come out as '区'.
-                     $parts = explode('区', $m[2]);
-                     $city = $parts[0];
-                 }
-             }
-         } else {
-             // Rule E.
-             return array('country' => $cityStr, 'province' => '', 'city' => '', 'pid' => '', 'cid' => '');
-         }
-     }
-     // Ids come from the city row when a city was recognised, otherwise from the province row.
-     list($pid, $cid) = lookupCityIds($citys, $city !== '' ? $city : $province);
-     return array('country' => '中国', 'province' => $province, 'city' => $city, 'pid' => $pid, 'cid' => $cid);
- }
- /**
-  * Parse an IP list file and insert one `city_ip` row per recognised line.
-  *
-  * Each line is expected to look like "<start ip> <end ip> <description>". The two
-  * addresses are stored as unsigned integers, the full description goes to `ser`,
-  * and its first word is classified by classifyLocation(). After a successful
-  * insert the running line number is echoed and the line is removed from the
-  * file with delLine().
-  *
-  * Lines that are left in the file and not inserted: lines that do not match the
-  * format, lines with an address ip2long() rejects, university/college lines
-  * (their handling is not part of this listing) and lines whose insert failed.
-  *
-  * Uses the default connection of the mysql extension, which must already be open.
-  *
-  * @param string $filename path of the IP list file; it is rewritten while importing
-  * @param array  $citys    city rows keyed by name, each with 'pid' and 'cid' entries
-  * @return void
-  */
- function parseAndImport($filename,$citys) {
-     // Read the whole IP list into an array of lines (line endings kept).
-     $fileArr = file($filename);
-     if($fileArr === false) {
-         // file() has already raised a warning; there is nothing to import.
-         return;
-     }
-     // Line counter, echoed after every successful insert.
-     $i = 0;
-     foreach($fileArr as $lineText) {
-         $i++;
-         // Drop the line ending first so a trailing "\r" cannot end up in the
-         // description and defeat the end-anchored 省 rule.
-         $cleanLine = rtrim($lineText, "\r\n");
-         if(!preg_match('|(\d+\.\d+\.\d+\.\d+) +(\d+\.\d+\.\d+\.\d+) +(.*)|', $cleanLine, $lineArr)) {
-             // Not an IP range line: skip it instead of inserting the previous line's values again.
-             continue;
-         }
-         // Convert both addresses to their unsigned integer form.
-         $long1 = ip2long($lineArr[1]);
-         $long2 = ip2long($lineArr[2]);
-         if($long1 === false || $long2 === false) {
-             // e.g. an octet above 255; would otherwise be stored as 0.
-             continue;
-         }
-         $ip1 = sprintf('%u', $long1);
-         $ip2 = sprintf('%u', $long2);
-         // The first space-separated word of the description carries the location.
-         $ser = $lineArr[3];
-         $dataArr = explode(' ', $ser);
-         $location = classifyLocation($dataArr[0], $citys);
-         if($location === null) {
-             // University/college line: left untouched for separate handling.
-             continue;
-         }
-         // The text fields come straight from the file, so they are escaped before
-         // being placed in the statement; $ip1/$ip2 consist of digits only.
-         $sql = sprintf(
-             "insert into `city_ip`(`ip1`,`ip2`,`country`,`ser`,`province_id`,`province_name`,`city_id`,`city_name`) values ('%s','%s','%s','%s','%s','%s','%s','%s')",
-             $ip1,
-             $ip2,
-             mysql_real_escape_string($location['country']),
-             mysql_real_escape_string($ser),
-             mysql_real_escape_string($location['pid']),
-             mysql_real_escape_string($location['province']),
-             mysql_real_escape_string($location['cid']),
-             mysql_real_escape_string($location['city'])
-         );
-         if(mysql_query($sql)) {
-             echo $i.'<br/>';
-             // Remove the line only once it is stored, so a failed insert does not lose it.
-             // NOTE(review): delLine() rewrites the whole file on every call, which makes
-             // the import quadratic in the number of lines; kept because removing handled
-             // lines is presumably what allows an interrupted run to continue - confirm.
-             delLine($filename, $lineText);
-         }
-     }
- }
- /**
-  * Remove every line that is exactly equal to $text from a file.
-  *
-  * The remaining lines are copied to a temporary file created in the same
-  * directory as $filename, which is then renamed over $filename. When a step
-  * fails, $filename is not replaced and the temporary file is removed.
-  *
-  * @param string $filename path of the file to rewrite
-  * @param string $text     line to remove, including its line ending as returned by fgets()/file()
-  * @return bool true when the file was rewritten, false on failure
-  */
- function delLine($filename,$text) {
-     $f1 = fopen($filename, 'r');
-     if($f1 === false) {
-         // Without this check feof(false) below never becomes true and the loop never ends.
-         return false;
-     }
-     // Create the temporary file next to the target rather than in a fixed
-     // directory, so it exists on any machine and the rename stays on one filesystem.
-     $tmp = tempnam(dirname($filename), 'newip.txt');
-     if($tmp === false) {
-         fclose($f1);
-         return false;
-     }
-     $f2 = fopen($tmp, 'w');
-     if($f2 === false) {
-         fclose($f1);
-         unlink($tmp);
-         return false;
-     }
-     // fgets() returns false at end of file; stop there instead of writing that value out.
-     while(($line = fgets($f1)) !== false) {
-         // Strict comparison: both sides are strings and must match byte for byte.
-         if($line !== $text) {
-             fputs($f2, $line);
-         }
-     }
-     fclose($f1);
-     fclose($f2);
-     if(!rename($tmp, $filename)) {
-         unlink($tmp);
-         return false;
-     }
-     return true;
- }
- parseAndImport('ip1.txt',$citys); // Entry point: import ip1.txt (path relative to the working directory) using the $citys lookup table.
- ?>