Commits

Bi Haicheng committed 94d6f4e

clean code

Comments (0)

Files changed (1)

 
     public $base_url='yyk.39.net';
     /**
-     * 
-     *
-     * undocumented function
-     *
-     * @return void
-     * @access public
-     * @author Bi Haicheng
-     **/
-    public function matchChinese($string)
-    {
-        /*
-        preg_match('/<a\shref=".*?">[^\x00-\xff]*<\/a>/',$string,$match);
-            preg_match('/<a\shref=".*?".*?>[u4e00-u9fa5]*.*?<\/a>/',$string,$match);
-            preg_match('/<a.*?>[\x00-\xff]*<\/a>/',$string,$match);
-            preg_match('/<a.*?>[\x00-\xff]*<\/a>/',$string,$match);
-            preg_match('/<a\shref=".*?".*?>[^u4e00-u9fa5]<\/a>/',$string,$match);
-         */
-//GBK匹配中文
-/*
-            preg_match('/<a\shref=".*?".*?>[\x80-\xff]*<\/a>/',$string,$match);
- */
-        //UTF8匹配中文
-            preg_match('/<a\shref=".*?".*?>[^u4e00-u9fa5]*<\/a>/',$string,$match);
-            return $match;
-    }
-    /**
-     * 
-     *
-     * undocumented function
-     *
-     * @return void
-     * @access public
-     * @author Bi Haicheng
-     **/
-    public function getChinese($string)
-    {
-            //preg_match('/[u4e00-u9fa5]+/',$string,$match);
-            //匹配中文
-            preg_match_all('/[\x80-\xff]+/',$string,$match);
-            return $match;
-    }
-    /**
-     * 从代码中获取医院的URL
-     *
-     * getUrl method
-     *
-     * @return void
-     * @access public
-     * @author Bi Haicheng
-     **/
-    public function getUrl()
-    {
-
-    }
-    /**
-     * 获得医院名称
-     *
-     * 根据URL获取医院名称switch case
-     *
-     * getHosName method
-     *
-     * @return void
-     * @access public
-     * @author Bi Haicheng
-     **/
-    public function getHosName()
-    {
-    }
-    /**
      * 获得医院列表代码
      *
      * getHosLi method
         $area_hos_url=array();
         $area_hos_code=$this->getHosLiCode($area,$reg);
         for ($i = 0; $i < count($area_hos_code); $i++) {
-                /*
-                preg_match_all('/<a\shref=".*?".*?>[\x80-\xff]+<\/a>/',$area_hos_code[$i],$area_hos_url[$i]);
-                preg_match_all('/<a.*?>.*?<\/a>/',$area_hos_code[$i][0],$area_hos_url);
-                 */
             preg_match_all('/<span\sclass="yy_title">.*?<a.*?>[\x80-\xff]+<\/a>.*?<span.*?>.*?<\/span>.*?<\/span>/s',$area_hos_code[$i][0],$area_hos_url[$i]);
             }
-        //减维
         $new_arr=array();
         for ($i = 0; $i < count($area_hos_url); $i++) {
             for ($j = 0; $j < count($area_hos_url[$i]); $j++) {
                 }
             }
         }
-       // var_dump($new_arr);
-       // exit;
         return $new_arr;
     }
     /**
-     * 
+     * get hospital name  list and url
      *
-     * undocumented function
+     * getHosLiUrl method
      *
      * @return void
      * @access public
               $area_hos_name[$i][]);
            }
         }
-        /*
-        echo '<pre>';
-        print_r($area_hos_name);
-        echo '</pre>';
-        exit;
-         */
-         //减维
         $new_arr=array();
         for ($i = 0; $i < count($area_hos_name); $i++) {
             for ($j = 0; $j < count($area_hos_name[$i]); $j++) {
               $area_urls[$new_arr[$i][$j]]);
            }
         }
-        /*
-        var_dump($area_a_arr);
-        echo '<pre>';
-        print_r($area_urls);
-        echo '</pre>';
-        exit;
-         */
-       //减维
         $new_arr=array();
         foreach ($area_urls as $name =>$urlarr) {
             for ($i = 0; $i < count($urlarr); $i++) {
                 $new_arr[$name]=$urlarr[$i];
-                //$new_arr[/*mb_convert_encoding(*/$name /*,"UTF-8")*/]=$urlarr[$i];
             }
         }
-        //医院名称和医院的URL
-        //return $area_urls;
         return $new_arr;
     }
     /**
      * 遍历文件夹中的每个文件得到相关的文件信息
      *
-     * undocumented function
+     * iteratorDir method
      *
      * @return void
      * @access public
     public function iteratorDir()
     {
         $dir_files=array();
-        //step controller
         foreach(new DirectoryIterator($this->base_dir) as $file){
-            //print $file->getPathname()."\n";
             if ($file->isDir()&&!$file->isDot()) {
-                //文件夹名称
-                //内层目录
-                //返回数组,作为一个单独的函数,也可以。后面这样
                 $inner_dir=$this->base_dir.$file->getFilename()."/";
                 foreach (new DirectoryIterator($inner_dir) as $inner_file)
                 {
         return $dir_files;
     }
     /**
-     * 得到文件夹从文件名称
-     *
-     * 用此名称创建子目录
-     *
-     * undocumented function
-     *
-     * @return void
-     * @access public
-     * @author Bi Haicheng
-     **/
-    public  function getFileNames($file)
-    {
-        stat($file);
-    }
-    /**
-     * 创建子目录和生成相应文件
-     *
-     * undocumented function
-     *
-     * @return void
-     * @access public
-     * @author Bi Haicheng
-     **/
-    public function createInnerDirAndPage()
-    {
-        //read dir get one page
-        //get hos list code
-        //
-    }
-    /**
      * 获得某一城市医院列表代码
      *
-     * undocumented function
+     * getHosLiCode method
      *
-     * @return void
+     * @return array
      * @access public
      * @author Bi Haicheng
      **/
     public function getHosLiCode($area,$reg)
     {
-        //$this->iteratorDir();
         $code_arr=array();
         $pagecodes=$this->getPageCode($area,$reg);
-        //var_dump($pagecodes[2]);
         for ($i = 0; $i < count($pagecodes); $i++) {
             preg_match('/<div\sclass="yy_tabcontent2">.*?<ul.*?>.*?<\/ul>.*?<div\sclass="page">.*?<\/div>.*?<\/div>/s',$pagecodes[$i],$code_arr[$i]);
         }
      **/
     public function getPagiUrl($firstpage)
     {
-/*
-         <div class="page">
-         <a href="/area-hp1-0-0-1-1-2.shtml" target="_self">2</a> 
-         <a href="/area-hp1-0-0-1-1-3.shtml" target="_self">3</a> 
-         <a\shref=".*?".*?>[1-9]+<\/a>
-*/
       $pagiurls=array();
-      preg_match('/<div\sclass="page".*?>.*?<\/div>/s',$firstpage,$pagicode);
-   //   var_dump($pagicode[0]);
+       preg_match('/<div\sclass="page".*?>.*?<\/div>/s',$firstpage,$pagicode);
        preg_match_all('/<a\shref=".*?".*?>[0-9]+<\/a>/',$pagicode[0],$pagiurl);
-   //   print_r($pagiurl);
        for ($i = 0; $i < count($pagiurl); $i++) {
            for ($j = 0; $j < count($pagiurl[$i]); $j++) {
            preg_match(
               $pagiurls[$j]);
          }
        }
-    //   print_r($pagiurls);
        $pagiurl_arr=array();
        for ($n = 0; $n < count($pagiurls); $n++) {
         $pagiurl_arr[$n]=$pagiurls[$n][0];
        return $pagiurl_arr;
     }
     /**
-     * 
-     *
-     * undocumented function
-     *
-     * @return void
-     * @access public
-     * @author Bi Haicheng
-     **/
-    public function getPagiArea($firstpage)
-    {
-       preg_match('/<div\sclass="page".*?>.*?<\/div>/s',$firstpage,$pagicode);
-       preg_match_all('/<a\shref=".*?".*?>[0-9]+<\/a>/',$pagicode[0],$pagiurl);
-       for ($i = 0; $i < count($pagiurl); $i++) {
-           for ($j = 0; $j < count($pagiurl[$i]); $j++) {
-           preg_match(
-              '/area-[a-zA-Z0-9]*-[0-9]*-[0-9]*-[0-9]*-[0-9]*-[0-9]*\.shtml/',
-              $pagiurl[$i][$j],
-              $pagiurls);
-         }
-       }
-       return ;
-    }
-    /**
      * 存储医院详细页面文件
      *
-     * undocumented function
+     * storeHosPage method
      *
-     * @return void
+     * @return boolean
      * @access public
      * @author Bi Haicheng
      **/
         $hospital_num=count($hos_name_url);
         $excute_count=0;
         foreach ($hos_name_url as $name=>$url) {
-           //分割URL,获取页面
            $path=substr($url,17);
-           //在相应目录生成静态文件
-           //how much time it excuted.
-           //有多少个URL
-           //echo mb_check_encoding($name,'UTF-8');
-           //echo mb_convert_encoding($name,'UTF-8','GBK');
-           //exit;
-           //var_dump($path);
-           //exit;
            if (!file_exists($this->base_dir.$area.'/'.$reg.'/'.mb_convert_encoding($name,'UTF-8','GBK').'.html')) {
-               //echo $this->base_dir.$area.'/'.$reg.'/'.mb_convert_encoding($name,'UTF-8','GBK').'.html';
-               //echo mb_convert_encoding($name,'UTF-8','GBK').'完成'."\n\r";
                 echo '页面生成中...'."\n\r";
                 $this->createPageByUrl($this->base_url,$path,$area.'/'.$reg.'/',mb_convert_encoding($name,'UTF-8','GBK').'.html');
                 echo mb_convert_encoding($name,'UTF-8','GBK').'完成'."\n\r";
                 sleep(1);
                 $excute_count++;
-                //recorde the complete hospital page
-                //$this->logWhichCom(mb_convert_encoding($name,'UTF-8','GBK'));
            }else{
+               echo '文件已存在,生成下一个页面'."\n\r";
                $excute_count++;
-                echo '文件已存在,生成下一个页面'."\n\r";
            }
            if ($hospital_num!=$excute_num) {
                $this->storeHosPage($area,$reg);
            }else {
                echo '此地区所有医院全部生成。';
+               return true;
            }
         }
     }
     public function logWhichCom($comfile)
     {
         $file_path=$this->base_dir;
-        //put the fetching completed file into logfile
-      //  file_put_contents($file_path.'/fetchingWebPage.log',mb_convert_encoding($comfile,"UTF-8","GBK"),FILE_APPEND|LOCK_EX);
         file_put_contents($file_path.'/fetchingWebPage.log',$comfile,FILE_APPEND|LOCK_EX);
         file_put_contents($file_path.'/fetchingWebPage.log',"\r\n",FILE_APPEND|LOCK_EX);
     }
     /**
      * restart fetching web page
+     *
      * return the last fetching complete file
      *
      * reStart method
          fseek($f, $cursor--, SEEK_END);
          $char = fgetc($f);
         }
-        //echo $last_line;
         return $last_line;
     }
     /**
      **/
     public function getPageCode($area,$reg)
     {
-        //读取目录文件,用文件目录名字生成子目录,
-        //将新生成的子文件放进入子目录
         $page_code=array();
-    //    var_dump($this->iteratorDir());
-   //     foreach($this->iteratorDir() as $reg_files){
         $areareg= $this->iteratorDir();
         $fn=$areareg[$area][$reg];
-           $page_code[0]=file_get_contents($fn);
-
-           //用从当前页面读取的页面作为起点
-           $other_url=$this->getPagiUrl($page_code[0]);
-          // var_dump($other_url);
-          // exit;
-     //   for ($i = 0; $i < count($other_url); $i++) {
-             for ($p = 1; $p <= count($other_url); $p++) {
-             //第一页的Code...N页。代码
+        $page_code[0]=file_get_contents($fn);
+        $other_url=$this->getPagiUrl($page_code[0]);
+        for ($p = 1; $p <= count($other_url); $p++) {
              $page_code[$p]=$this->url_get_contents($this->base_url,'/'.$other_url[$p-1]);
-             //生成还是不生成目录呢?
-             //Todo 如果已经本地已经存在,就读取本地
-    //    }
         }
-      //  }
         return $page_code;
-        //当前页面
-        //下一页面
-        //扑获分页的页面URL
-        //
     }
-} // END 读取静态文件
+} // END
 $hos=new fetchHospital();
-//print_r($hos->matchChinese('<a href="http://yyk.39.net/sh/zonghe/4ea31.html">上海交通大学医学院附属仁济医院</a>'));
-//print_r($hos->getChinese('<a href="http://yyk.39.net/sh/zonghe/4ea31.html">ss上海交sss通大学医学院附属仁济医院</a>'));
-//print_r($hos->getChinese('<a href="http://yyk.39.net/sh/zonghe/4ea31.html">我是</a>'));
-//得到区域所有页面代码没问题
-//$codes=$hos->getPageCode('上海','黄埔');
-//print_r($codes[2]);
-//$lc=$hos->getHosLiCode('上海','黄埔');
-//print_r($lc);
-/*
-echo '<pre>';
-print_r($hos->getHosLiUrl('上海','黄埔'));
-echo count($hos->getHosLiUrl('上海','黄埔'));
-echo '</pre>';
- */
-//print_r($hos->getHosLi('上海','黄埔'));
-//最想知道的是生成了多少个页面
-/*
 foreach ($hos->iteratorDir() as $area=>$reg) {
     foreach ($reg as $regname =>$regurl) {
-       //$hos->logWhichCom($area.'->'.$regname);
-       //$area_reg=explode('->',$hos->reStart());
-       //var_dump($area_reg);
         $area_reg=explode('->',$hos->reStart());
         if(is_array($area_reg)&&!empty($area_reg)&&$area==$area_reg[0]&&$regname==$area_reg[1]){
+        echo '执行中...'."\n\r";
         $hos->storeHosPage($area,$regname);
         echo $area.$regname.'地区已完成,接着进行下一地区'."\n\r";
         $hos->logWhichCom($area.'->'.$regname);
-       //每次循环延迟2秒
-        echo '执行中...'."\n\r";
-        sleep(2);
         }
        else{
+        echo '执行中...'."\n\r";
         $hos->storeHosPage($area,$regname);
         echo $area.$regname.'地区已经完成'."\n\r";
         $hos->logWhichCom($area.'->'.$regname);
-       //每次循环延迟2秒
-        echo '执行中...'."\n\r";
-        sleep(2);
        }
     }
 }
- */
 //$hos->storeHosPage('陕西','咸阳');
-//echo '<pre>';
-//print_r($hos->iteratorDir());
-//echo '</pre>';
 ?>