Source: 北京SEO  Date: 2019-04-21
A previous post covered a Python script for monitoring Baidu PC rankings for multiple sites by keyword category. The same kind of monitoring can be done on Baidu MO (mobile).
 
The difference is that Baidu MO's encrypted urls are not 301-redirected to the real url. Instead, the server returns a normal page, and the browser executes that page's js to jump to the real url. Capturing a request to an encrypted url with Fiddler shows that the returned page contains a js redirect of the form window.location.replace("<real url>").
 
Parsing that page directly yields the real url.
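A minimal sketch of that parsing step, assuming the page embeds a window.location.replace("…") call as described above (the sample html below is hypothetical):

import re

# stripped-down, hypothetical version of the page an encrypted url returns
html_proxy = '<script>window.location.replace("https://www.example.com/page");</script>'

# escape the literal parentheses so group(1) captures only the url
match = re.search(r'window\.location\.replace\("(.*?)"\)', html_proxy, re.S | re.I)
real_url = match.group(1) if match else None
print(real_url)  # https://www.example.com/page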
 
So the PC and MO monitoring scripts differ only in how they obtain the real url. The full code is below:
# -*- coding: utf-8 -*-
"""
提取的是自然排名url
不含百家號 百度知道 百度貼吧  但是含有百科
百家號 百度知道 百度貼吧 類名是c-container  
事先準備excel文件,每個sheet存儲一類關鍵詞,sheet名字即關鍵詞分類
"""

from openpyxl import load_workbook
import requests
from pyquery import PyQuery as pq
import threading
import queue
import time
import re


class bdmoMonitor(threading.Thread):

    def __init__(self):
        threading.Thread.__init__(self)

    # Read the excel file and build the keyword queue, grouped by sheet
    @staticmethod
    def read_excel(filepath):
        q = queue.Queue()
        group_list = []
        wb_kwd = load_workbook(filepath)
        for sheet_obj in wb_kwd:
            sheet_name = sheet_obj.title
            group_list.append(sheet_name)
            col_a = sheet_obj['A']
            for cell in col_a:
                kwd = cell.value
                if kwd:  # skip empty cells
                    q.put({kwd: sheet_name})
        return q, group_list

    # Initialize the result dict: one counter per domain per keyword group
    @staticmethod
    def result_init(group_list):
        for domain in target_domain:
            result[domain] = {}
            for group in group_list:
                result[domain][group] = 0
        print("Result dict initialized")

    # Fetch the serp html for a url, retrying on failure
    def get_html(self, url, retry=2):
        try:
            r = requests.get(url=url, headers=user_agent, timeout=5)
        except Exception as e:
            print('Failed to fetch html', url, e)
            if retry > 0:
                return self.get_html(url, retry - 1)
        else:
            return r.text

    # Collect all encrypted urls of organic results on a serp
    def get_encrypt_urls(self, html):
        encrypt_url_list = []
        if html and '百度' in html:
            doc = pq(html)
            try:
                a_list = doc('#results .c-result-content h3').parents('a').items()
            except Exception as e:
                print('Failed to extract encrypted urls from the serp', e)
            else:
                for a in a_list:
                    encrypt_url = a.attr('href')
                    encrypt_url_list.append(encrypt_url)
        return encrypt_url_list

    # Fetch an encrypted url and extract the real url from the returned page
    def decrypt_url(self, encrypt_url):
        html_proxy = self.get_html(encrypt_url)
        if html_proxy:
            # the page redirects via window.location.replace("<real url>");
            # the parentheses are escaped so group(1) captures only the url
            real_url = re.search(r'window\.location\.replace\("(.*?)"\)', html_proxy, re.S | re.I)
            # 'baidu' is a harmless placeholder when no redirect is found
            return real_url.group(1) if real_url else 'baidu'

    # Resolve each encrypted url on a serp to its real url
    def get_real_urls(self,encrypt_url_list):
        if encrypt_url_list:
            real_url_list = [self.decrypt_url(encrypt_url) for encrypt_url in encrypt_url_list]
            return real_url_list
        else:
            return []

    # Count, per domain, how many keywords it ranks for
    def run(self):
        global success_num
        while 1:
            kwd_dict = q.get()
            for kwd, group in kwd_dict.items():
                url = "https://m.baidu.com/s?ie=utf-8&wd={0}".format(kwd)
                html = self.get_html(url)
                encrypt_url_list = self.get_encrypt_urls(html)
                print(kwd)
                real_urls = self.get_real_urls(encrypt_url_list)
                if real_urls:
                    # decrypt_url may return None on failure; drop None
                    # (and duplicates) so the join below cannot fail
                    real_urls = [i for i in set(real_urls) if i]
                    # merge the ~10 real urls on the serp into one string
                    domain_str = ''.join(real_urls)
                    try:
                        threadLock.acquire()
                        success_num += 1
                        for domain in target_domain:
                            if domain in domain_str:
                                result[domain][group] += 1
                        print('{0} keywords queried successfully'.format(success_num))
                    except Exception as e:
                        print(e)
                    finally:
                        threadLock.release()
            q.task_done()

    # Write the results to a tab-separated text file
    @staticmethod
    def save():
        print('Saving results.....')
        with open('bdmo_result.txt', 'w', encoding="utf-8") as f:
            for domain, data_dict in result.items():
                for key, value in data_dict.items():
                    f.write(date + '\t' + domain + '\t' + key + '\t' + str(value) + '\n')


if __name__ == "__main__":
    start = time.time()

    # Global: the list of domains to monitor
    target_domain = ['m.renrenche.com','www.renrenche.com','m.guazi.com','www.guazi.com',
                     'm.che168.com','www.che168.com','m.iautos.cn','so.iautos.cn','www.iautos.cn',
                     'm.hx2car.com','www.hx2car.com','58.com','m.taoche.com','www.taoche.com',
                     'm.51auto.com','www.51auto.com','m.xin.com','www.xin.com']
    user_agent = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 8.1.0; ALP-AL00 Build/HUAWEIALP-AL00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/63.0.3239.83 Mobile Safari/537.36 T7/10.13 baiduboxapp/10.13.0.11 (Baidu; P1 8.1.0)'}
    threadLock = threading.Lock()  # lock protecting the shared counters
    result = {}   # results dict
    success_num = 0  # number of successful queries
    date = time.strftime("%Y-%m-%d", time.localtime())  # query date

    q, group_list = bdmoMonitor.read_excel('kwd.xlsx')
    bdmoMonitor.result_init(group_list)
    all_num = q.qsize()

    # Start the worker threads
    for i in range(5):
        t = bdmoMonitor()
        t.daemon = True  # let the process exit even if workers block on q.get()
        t.start()
    q.join()

    bdmoMonitor.save()
    end = time.time()
    print('\n{0} keywords total, {1} queried successfully, took {2} min'.format(all_num, success_num, (end - start) / 60))
    print('Results:\n', result)
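Each line of bdmo_result.txt is then a tab-separated record of query date, domain, keyword group, and the count of keywords that domain ranks for in that group, e.g. (hypothetical sheet name and counts):

2019-04-21	m.guazi.com	brand	23
2019-04-21	www.guazi.com	brand	11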


