网站日志蜘蛛抓取分析查看系统

2017年11月22日11:47:33 发表评论 186 views

网站上线后,我们经常会跑到服务器里面通过各种命令查看日志,看今天网站有多少蜘蛛过来爬行我们的网站,爬行过哪些页面,什么时候来过等问题。为了减少工作量,能否有一个系统能够每天查看网站的日志并统计起来?下面就分享下我们自己搭建的网站日志系统,支持多个网站日志分析,功能强大。

查看全部,这里只展示一个网站,日志系统支持多站点

网站日志蜘蛛抓取分析查看系统

只查看最近一天的

网站日志蜘蛛抓取分析查看系统

查看哪些页面被蜘蛛爬过,并统计蜘蛛IP

网站日志蜘蛛抓取分析查看系统

 

实现流程

第一步 日志切割

  1. #!/bin/bash
  2. PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:~/bin
  3. export PATH
  4. #!/bin/bash
  5. PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:~/bin
  6. export PATH
  7. #!/bin/bash
  8. cd /www/wwwlogs  #日志路径自己修改
  9. log_dir="/www/wwwlogs"  #日志路径自己修改
  10. time=`date +%Y-%m-%d`
  11. nginx_dir="/www/server/nginx"   #路径自己根据站点修改
  12. save_days=15
  13. #日志分割,按天分类
  14. website=`ls $log_dir/* | xargs -n 1`
  15. for i in $website
  16. do
  17.   file=`basename $i .log`
  18.   mkdir -p $log_dir/backup/$file
  19.   mv $log_dir/$file.log $log_dir/backup/$file/$file_$time.log
  20. done
  21. $nginx_dir/sbin/nginx -s reload
  22. #删除所有超过15天日志。
  23. if [ "`date +%a`" = "Sun" ]; then
  24.   all_list=`ls $log_dir/backup | xargs -n 1`
  25.   for del in $all_list
  26.   do
  27.     let results=$time-$del
  28.     if [ $results -gt save_days ]; then
  29.       rm -fr $log_dir/backup/$del
  30.     fi
  31.   done
  32. fi
  33. echo "----------------------------------------------------------------------------"
  34. endDate=`date +"%Y-%m-%d %H:%M:%S"`
  35. echo "★[$endDate] 任务执行成功"
  36. echo "----------------------------------------------------------------------------"
  37. echo "----------------------------------------------------------------------------"
  38. endDate=`date +"%Y-%m-%d %H:%M:%S"`
  39. echo "★[$endDate] 任务执行成功"
  40. echo "----------------------------------------------------------------------------"

第二步 日志分析

  1. #!/bin/bash
  2. PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:~/bin
  3. export PATH
  4. #!/bin/bash
  5. PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:~/bin
  6. export PATH
  7. /www/server/php/56/bin/php /www/wwwlogs/backup/logs.php
  8. echo "----------------------------------------------------------------------------"
  9. endDate=`date +"%Y-%m-%d %H:%M:%S"`
  10. echo "★[$endDate] 任务执行成功"
  11. echo "----------------------------------------------------------------------------"
  12. echo "----------------------------------------------------------------------------"
  13. endDate=`date +"%Y-%m-%d %H:%M:%S"`
  14. echo "★[$endDate] 任务执行成功"
  15. echo "----------------------------------------------------------------------------"

第三步 迁移日志

  1. #!/bin/bash
  2. PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:~/bin
  3. export PATH
  4. #!/bin/bash
  5. PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:~/bin
  6. export PATH
  7. rm -r /www/wwwroot/www.tanhuibiao.com/logs/
  8. cp -r /www/wwwlogs/backup/logs/ /www/wwwroot/www.tanhuibiao.com/
  9. echo "----------------------------------------------------------------------------"
  10. endDate=`date +"%Y-%m-%d %H:%M:%S"`
  11. echo "★[$endDate] 任务执行成功"
  12. echo "----------------------------------------------------------------------------"
  13. echo "----------------------------------------------------------------------------"
  14. endDate=`date +"%Y-%m-%d %H:%M:%S"`
  15. echo "★[$endDate] 任务执行成功"
  16. echo "----------------------------------------------------------------------------"

第四步 把logs.php放到backup目录下

  1. <?php
  2. /*
  3.  * 该步骤为初步处理数据
  4.  * 指标1:爬虫来访,抓取总量,抓取时间
  5.  * 指标2、目录抓取比率
  6.  * 指标3、单个页面是否被抓取
  7.  * */
  8. class analysis_logs
  9. {
  10.     public $domain_list = array('www.dabiaoseo.com',
  11.     );
  12.     public $mon_dict = array(
  13.         "Jan"=>"01""Feb"=>"02""Mar"=>"03""Apr"=>"04""May"=>"05",
  14.         "Jun"=>"06""Jul"=>"07""Aug"=>"08""Sep"=>"09""Oct"=>"10",
  15.         "Nov"=>"11""Dec"=>"12",
  16.     );
  17.     //public $logPath = 'E:/Spiders/Log_download/test/';
  18.     public $logPath = '/www/wwwlogs/backup/';
  19.     public $rootPath = '/www/wwwroot/';
  20.     public $total_file = array();
  21.     public function file_name()
  22.     {
  23.         foreach ($this->domain_list as $domain)
  24.         {
  25.             if(is_dir($this->logPath.$domain))    // 如果域名文件存在
  26.             {
  27.                 $file_list = glob($this->logPath.$domain.'/'.date("Y-m-d").'.log'); //获取当天的日志文件
  28.                 foreach ($file_list as $file)   //www/wwwlogs/backup/028webs.com/*.log
  29.                 {
  30.                     $this->total_file[] = $file;  // 将log文件完整路径传入计算函数
  31.                 }
  32.             }
  33.         }
  34.         //$this->analysis_log($this->total_file[0]);
  35.         foreach ($this->total_file as $putfile){
  36.             $this->analysis_log($putfile);
  37.         }
  38.     }
  39.     public function analysis_log($file)
  40.     {
  41.         //单log解析计算
  42.         $logdata = array();
  43.         $log = new SplFileObject($file);
  44.         foreach ($log as $line) {
  45.             if($line){
  46.                 $record_list = explode(' ', $line);
  47.                 if ($record_list[3]) {
  48.                     $ip = $record_list[0];
  49.                     $date = $record_list[3];
  50.                     preg_match('/(\d{2}:\d{2}:\d{2})/',$date,$time);
  51.                     preg_match('/^(20\d\d)/i', explode('/', $date)[2], $year);
  52.                     $month = $this->mon_dict[explode('/', $date)[1]];
  53.                     $day = str_replace('[', '', explode('/', $date)[0]);
  54.                     $day_format = $year[0] . '-' . $month . '-' . $day;
  55.                     $analysis_format = strtotime("{$year[0]}/{$month}/{$day} {$time[0]}");
  56.                     $url = $record_list[6];
  57.                     $status = $record_list[8];
  58.                 }
  59.                 if (!array_key_exists($day_format, $logdata)){
  60.                     $logdata[$day_format] = array('Baidu'=>0,'360'=>0,'sm'=>0, 'Sogou'=>0,'Google'=>0,'crawl_data'=>array(
  61.                         'Baidu'=>array('IP'=>array()),
  62.                         '360'=>array('IP'=>array()),
  63.                         'sm'=>array('IP'=>array()),
  64.                         'Sogou'=>array('IP'=>array()),
  65.                         'Google'=>array('IP'=>array()),
  66.                     ));
  67.                 }
  68.                 else {
  69.                     if (preg_match('/(Baiduspider)/i', $line))   # 查找百度Spider
  70.                     {
  71.                         $logdata[$day_format]['Baidu']++;
  72.                         $logdata[$day_format]['crawl_data']['Baidu'][$status][] = $url;
  73.                         $logdata[$day_format]['crawl_data']['Baidu']['IP'][$ip][] = $analysis_format;
  74.                     }
  75.                     elseif (preg_match('/(HaosouSpider|360Spider)/i', $line))
  76.                     {
  77.                         $logdata[$day_format]['360']++;
  78.                         $logdata[$day_format]['crawl_data']['360'][$status][] = $url;
  79.                         $logdata[$day_format]['crawl_data']['360']['IP'][$ip][] = $analysis_format;
  80.                     }
  81.                     elseif (preg_match('/(YisouSpider)/i', $line))
  82.                     {
  83.                         $logdata[$day_format]['sm']++;
  84.                         $logdata[$day_format]['crawl_data']['sm'][$status][] = $url;
  85.                         $logdata[$day_format]['crawl_data']['sm']['IP'][$ip][] = $analysis_format;
  86.                     }
  87.                     elseif (preg_match('/(Sogou web spider)/i', $line)) {
  88.                         $logdata[$day_format]['Sogou']++;
  89.                         $logdata[$day_format]['crawl_data']['Sogou'][$status][] = $url;
  90.                         $logdata[$day_format]['crawl_data']['Sogou']['IP'][$ip][] = $analysis_format;
  91.                     }
  92.                     elseif (preg_match('/(Googlebot)/',$line)){
  93.                         $logdata[$day_format]['Google']++;
  94.                         $logdata[$day_format]['crawl_data']['Google'][$status][] = $url;
  95.                         $logdata[$day_format]['crawl_data']['Google']['IP'][$ip][] = $analysis_format;
  96.                     }
  97.                     else
  98.                         continue;
  99.                 }
  100.             }
  101.         }
  102.         $domain = explode('/', $file)[4];
  103.         $is_m = 0;
  104.         if(strstr($domain,'m.'))
  105.         {
  106.             $domain = str_replace('m.', '', $domain);
  107.             $is_m = 1;
  108.         }
  109.         $domain_logs = $this->logPath . 'logs/';
  110.         if(!file_exists($domain_logs))
  111.             mkdir($domain_logs,0777);
  112.         if($is_m)
  113.             $put_file = $domain_logs .$domain .'_'. date("Y-m-d") . '_m_analysis.json';
  114.         else
  115.             $put_file = $domain_logs .$domain .'_'. date("Y-m-d") . '_analysis.json';
  116.         $f = fopen($put_file,'a');
  117.         fwrite($f, json_encode($logdata));
  118.         fclose($f);
  119.     }
  120.     public function analysis_time($logdata){
  121.     }
  122. }
  123. $analysis = new analysis_logs();
  124. $analysis->file_name();
  • 我的微信
  • 这是我的微信扫一扫
  • weinxin
  • 我的微信公众号
  • 我的微信公众号扫一扫
  • weinxin

发表评论

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen: