前一段时间因为要开发一个新项目,需要一个ip库,由于公司原本没有这样一个库,所以急需从文件中匹配ip地址然后入库。下面是我写的一段程序,可能效率不是很高,代码也没有做过多的优化,有些地方比较繁琐,这样处理之后脏数据相对来说会少一点,但或多或少还是存在一定的脏数据。由于这个文件有40多行,我采取的办法是按行数将这个相对来说很大的文件拆分成N多很小的文件,分布式分析入库。当然你也可以通过多进程处理。一个问题有很多解决办法,我在这里分享一下我的想法。如果您有更好的办法请多多指正,笔者虚心求教;我下面贴出的代码您也可以去优化,然后提出您的优化建议,我们一起探讨,共同进步。

    我当时采用的是浏览器运行php文件来执行的,在这里我建议应该在linux环境下采用php命令行执行,这样效率会更高,而且不存在超时的问题。

    

 
  <?php

  /*
   * Author: 马荣财
   * Date:   2011/8/2 14:25:28
   *
   * Purpose: match the lines of an IP list file with regular expressions and
   *          import them into the database.
   * @param $filename full path (including directory) of the IP list file
   */

  /*
   * Analysis of the list file showed these main location formats:
   * A:  **省          e.g. 甘肃省
   * B:  **市**区      e.g. 北京市海淀区 or 浙江省宁波市北仑区
   * C:  ****大学      e.g. 清华大学
   * D:  ****市        regions without a 省 suffix, e.g. 新疆, 宁夏, 西藏, 内蒙古 followed by **市
   * E:  everything else is treated as one category
   */
  //set_time_limit(0);
  ini_set('max_execution_time', '3000');

  // NOTE(review): ext/mysql was removed in PHP 7. Moving to mysqli/PDO has to be
  // done for the whole file at once, because every query relies on this link.
  $conn = mysql_connect('localhost', 'root', '') or die('MYSQL 连接错误!');
  mysql_query("set names utf8");
  mysql_select_db('test', $conn) or die('MYSQL 选择数据库错误: ' . mysql_error());

  // Load the city table once and index it by name, so the importer can resolve
  // province/city ids without issuing a query per line.
  $result = mysql_query('select `cid`,`pid`,`name` from `tbl_city`');
  if ($result === false) {
      // Without this check a failed query would leave $citys empty and every
      // imported row would silently get blank ids.
      die('MYSQL 查询错误: ' . mysql_error());
  }
  $citys = array();
  while ($row = mysql_fetch_array($result)) {
      $citys[$row['name']] = $row;
  }
  /**
   * Parse an IP list file and insert one `city_ip` row per recognised line.
   *
   * A recognised line has the form "<start ip> <end ip> <location text>". The
   * first space-separated token of the location text is classified by
   * ipImportLocate(); the whole location text is stored in the `ser` column.
   *
   * Differences from the earlier listing, all deliberate:
   *  - the leftover debug `echo $ip2;exit` statements are gone (they stopped
   *    the script on the first line);
   *  - lines that do not match, or whose addresses ip2long() rejects, are
   *    skipped instead of being inserted with values left over from the
   *    previous line;
   *  - university/college lines (type C) are skipped and left in the file,
   *    because their parsing rules are not part of this listing;
   *  - every value is escaped before it is put into the SQL text;
   *  - a line is removed from the file only after its INSERT succeeded, so a
   *    failed row can be retried on the next run.
   *
   * @param string $filename path of the IP list file; imported lines are
   *                         removed from it through delLine()
   * @param array  $citys    map of place name => row holding 'pid' and 'cid'
   * @return int number of rows inserted (0 when the file cannot be read)
   */
  function parseAndImport($filename, $citys)
  {
      $lines = file($filename);
      if ($lines === false) {
          // file() has already raised a warning describing the problem.
          return 0;
      }

      $columns = '`ip1`,`ip2`,`country`,`ser`,`province_id`,`province_name`,`city_id`,`city_name`';
      $lineNo = 0;
      $inserted = 0;

      foreach ($lines as $lineText) {
          $lineNo++;

          if (!preg_match('|(\d+\.\d+\.\d+\.\d+) +(\d+\.\d+\.\d+\.\d+) +(.*)|', $lineText, $lineArr)) {
              continue;
          }

          $start = ip2long($lineArr[1]);
          $end = ip2long($lineArr[2]);
          if ($start === false || $end === false) {
              // e.g. an octet above 255; sprintf('%u', false) would store 0.
              continue;
          }

          // "%u" keeps addresses above 127.255.255.255 positive on 32-bit builds.
          $ip1 = sprintf('%u', $start);
          $ip2 = sprintf('%u', $end);

          // Drop a trailing CR or blanks so the end-anchored 省 rule still
          // matches lines with Windows line endings.
          $ser = rtrim($lineArr[3]);
          $dataArr = explode(' ', $ser);

          $place = ipImportLocate($dataArr[0], $citys);
          if ($place === null) {
              continue;
          }

          $values = array(
              $ip1,
              $ip2,
              $place['country'],
              $ser,
              $place['pid'],
              $place['province'],
              $place['cid'],
              $place['city'],
          );
          $quoted = array();
          foreach ($values as $value) {
              // The location text comes straight from the file; escape it.
              $quoted[] = '"' . mysql_real_escape_string($value) . '"';
          }
          $sql = 'insert into `city_ip`(' . $columns . ') values (' . implode(',', $quoted) . ')';

          if (mysql_query($sql)) {
              $inserted++;
              echo $lineNo . '<br/>';
              // NOTE(review): delLine() rewrites the whole file on every call,
              // which is why large inputs have to be split into small files.
              delLine($filename, $lineText);
          }
      }

      return $inserted;
  }

  /**
   * Classify the first token of a location description.
   *
   * Rules are tried in this order:
   *  1. "...省"                  -> province only
   *  2. autonomous region / SAR  -> fixed province, optional "...市" city
   *  3. "...市..." or "...州..." -> province and/or city
   *  4. "...大学" / "...学院"    -> not handled here, null is returned
   *  5. anything else            -> the token is stored as the country
   *
   * Rule 2 runs before rule 3 on purpose: a name such as 内蒙古呼和浩特市
   * contains 市, so with the generic rule first the region-specific city
   * handling could never be reached.
   *
   * @param string $cityStr first space-separated token of the location text
   * @param array  $citys   map of place name => row holding 'pid' and 'cid'
   * @return array|null keys country, province, city, pid, cid; null for rule 4
   */
  function ipImportLocate($cityStr, $citys)
  {
      $country = '中国';
      $province = '';
      $city = '';

      if (preg_match('|(.*)省$|', $cityStr, $m)) {
          $province = $m[1];
      } elseif (preg_match('/^(内蒙古|西藏|新疆|宁夏|香港|澳门)(.*)/', $cityStr, $m)) {
          $province = $m[1];
          $cut = strpos($m[2], '市');
          if ($cut > 0) {
              // Text between the region name and the first 市.
              $city = substr($m[2], 0, $cut);
          }
      } elseif (preg_match('|(.*?)市(.*)|', $cityStr, $m) || preg_match('|(.*?)州(.*)|', $cityStr, $m)) {
          if (strpos($m[1], '省') > 0) {
              // e.g. 浙江省宁波市北仑区 -> province 浙江, city 宁波
              $parts = explode('省', $m[1]);
              $province = $parts[0];
              $city = $parts[1];
          } else {
              // e.g. 北京市海淀区 -> province 北京, city 海淀
              $province = $m[1];
              if (strpos($m[2], '区') > 0) {
                  // explode() takes the delimiter first; the earlier listing
                  // had the two arguments swapped.
                  $parts = explode('区', $m[2]);
                  $city = $parts[0];
              }
          }
      } elseif (preg_match('|(.*)大学(.*)|', $cityStr) || preg_match('|(.*)学院(.*)|', $cityStr)) {
          // TODO: the university rules live in a separate attachment that is
          // not part of this listing.
          return null;
      } else {
          return array(
              'country' => $cityStr,
              'province' => '',
              'city' => '',
              'pid' => '',
              'cid' => '',
          );
      }

      // Ids are looked up by city when one was found, otherwise by province.
      $ids = ipImportCityIds($citys, $city !== '' ? $city : $province);

      return array(
          'country' => $country,
          'province' => $province,
          'city' => $city,
          'pid' => $ids[0],
          'cid' => $ids[1],
      );
  }

  /**
   * Look up the ids stored for a place name.
   *
   * @param array  $citys map of place name => row holding 'pid' and 'cid'
   * @param string $name  place name to look up
   * @return array array(pid, cid); '' for each id that is missing or empty
   */
  function ipImportCityIds($citys, $name)
  {
      return array(
          empty($citys[$name]['pid']) ? '' : $citys[$name]['pid'],
          empty($citys[$name]['cid']) ? '' : $citys[$name]['cid'],
      );
  }
  274.  
  275.  
  /**
   * Remove every line equal to $text from a file.
   *
   * The file is copied line by line into a temporary file, leaving out the
   * matching lines, and the temporary file is then renamed over the original.
   *
   * @param string $filename file to rewrite
   * @param string $text     exact line to drop, including its line ending
   * @return bool true when the file was rewritten, false on any I/O failure
   */
  function delLine($filename, $text)
  {
      $in = fopen($filename, 'r');
      if ($in === false) {
          return false;
      }

      // Create the temporary file next to the target instead of in a fixed
      // e:\web\www\ directory, so this works on any machine and the final
      // rename() stays on one filesystem.
      $tmp = tempnam(dirname($filename), 'newip');
      $out = ($tmp === false) ? false : fopen($tmp, 'w');
      if ($out === false) {
          fclose($in);
          return false;
      }

      // Looping on the fgets() result avoids comparing and writing the false
      // that fgets() returns once the end of the file is reached.
      while (($line = fgets($in)) !== false) {
          if ($line !== $text) {
              fputs($out, $line);
          }
      }

      fclose($in);
      fclose($out);

      return rename($tmp, $filename);
  }
  // Entry point: import the list file. The relative path is resolved against
  // the current working directory.
  $ipListFile = 'ip1.txt';
  if (is_readable($ipListFile)) {
      parseAndImport($ipListFile, $citys);
  } else {
      echo 'IP list file not readable: ' . $ipListFile;
  }