來源:北京SEO 時間:2019-04-11
  判斷一個行業哪些站做的好,有一個大眾比較認可的思路。找一批行業詞,查詢每個詞百度排名前10的url,然后提取下來,最后統計下哪個域名出現次數多。出現次數多的幾個域名就是這個行業的優質站點。
 
  具體到實際情況,查詢一批詞,
 
  1)可以統計url的出現次數,然后最終做計算。(一個關鍵詞serp上同一個域名出現N個url排名計算N次)
 
  2)也可以只計算這批詞排在首頁詞數來篩選域名。(一個關鍵詞serp上同一個域名出現N個url排名計算1次,相當于計算首頁詞數)
 
  以下是第2種方式。
 
# -*- coding: utf-8 -*-
"""
一個關鍵詞serp上同一個域名出現N個url排名 計算1次,相當于計算首頁詞數
"""
import requests
from pyquery import PyQuery as pq
import threading
import queue
import time
from urllib.parse import urlparse

class bdpcCover(threading.Thread):
    """Worker thread: pull keywords from the shared queue, fetch the Baidu
    desktop SERP for each, decrypt the redirect URLs, and tally each ranking
    domain at most once per keyword into the shared ``result`` dict.

    Relies on module-level shared state: ``q`` (keyword queue), ``user_agent``
    (request headers), ``threadLock``, ``result`` and ``success_num``.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    # Read keywords (one per line, UTF-8) from a text file into a queue.
    @staticmethod
    def read_file(filepath):
        """Return a queue.Queue filled with the stripped, non-empty lines of
        *filepath*."""
        kwd_queue = queue.Queue()
        # fix: use a context manager so the file handle is always closed
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                kwd = line.strip()
                if kwd:  # skip blank lines so workers never query an empty term
                    kwd_queue.put(kwd)
        return kwd_queue

    # Fetch the SERP HTML for one URL.
    def get_html(self, url, retry=2):
        """Return the page HTML, retrying up to *retry* more times on failure;
        returns None when every attempt fails."""
        try:
            r = requests.get(url=url, headers=user_agent, timeout=5)
        except Exception as e:
            print('獲取源碼失敗', e)
            if retry > 0:
                # fix: original recursed without `return`, so retries always
                # produced None even when the retry succeeded
                return self.get_html(url, retry - 1)
            return None
        else:
            return r.text

    # Extract the encrypted (baidu.com/link?url=...) organic-result URLs.
    def get_encrpt_urls(self, html):
        """Return the list of Baidu redirect URLs found on a SERP page.

        An empty list is returned when *html* is falsy or does not look like a
        Baidu result page.
        """
        encrypt_url_list = []
        if html and '_百度搜索' in html:
            doc = pq(html)
            try:
                a_list = doc('.t a').items()
            except Exception as e:
                # fix: original also printed `url`, a name undefined in this
                # scope, which raised NameError inside the handler
                print('未提取到serp上的解密url', e)
            else:
                for a in a_list:
                    encrypt_url = a.attr('href')
                    # fix: href can be absent (None) — guard before .find()
                    if encrypt_url and encrypt_url.find('http://www.baidu.com/link?url=') == 0:
                        encrypt_url_list.append(encrypt_url)
        return encrypt_url_list

    # Decrypt one redirect URL via a HEAD request (Baidu answers 302).
    def decrypt_url(self, encrypt_url, retry=1):
        """Return the real URL behind a Baidu redirect link, or None when the
        request fails or no Location header is present."""
        try:
            encrypt_url = encrypt_url.replace('http://', 'https://')
            # fix: added a timeout so a stalled HEAD cannot hang the worker
            r = requests.head(encrypt_url, headers=user_agent, timeout=5)
        except Exception as e:
            print(encrypt_url, '解密失敗', e)
            if retry > 0:
                # fix: original recursed without `return` (result discarded)
                return self.decrypt_url(encrypt_url, retry - 1)
            return None
        else:
            # fix: .get() instead of ['Location'] — header may be missing
            return r.headers.get('Location')

    # Decrypt every redirect URL collected for one keyword.
    def get_real_urls(self, encrypt_url_list):
        """Return the real ranking URLs; entries that failed to decrypt are
        dropped instead of polluting the counts as None/"xxx"."""
        decrypted = (self.decrypt_url(e) for e in encrypt_url_list)
        return [real for real in decrypted if real]

    # Extract the netloc (domain) portion of one URL.
    def get_domain(self, real_url):
        """Return the domain of *real_url*, or the sentinel "xxx" when the URL
        cannot be parsed."""
        try:
            res = urlparse(real_url)
        except Exception as e:
            print(e, real_url)
            domain = "xxx"
        else:
            domain = res.netloc
        return domain

    # Map the ranking URLs of one keyword to a deduplicated set of domains.
    def get_domains(self, real_url_list):
        """Return the set of domains — a domain with several ranking URLs on
        one SERP is counted only once."""
        return {self.get_domain(real_url) for real_url in real_url_list}

    # Thread main loop: consume keywords until the process exits.
    def run(self):
        global success_num
        while 1:
            kwd = q.get()
            url = "https://www.baidu.com/s?ie=utf-8&wd={0}".format(kwd)
            html = self.get_html(url)
            encrypt_url_list = self.get_encrpt_urls(html)
            real_url_list = self.get_real_urls(encrypt_url_list)
            domain_set = self.get_domains(real_url_list)
            if domain_set:
                try:
                    # `with` guarantees the lock is released even on error
                    with threadLock:
                        for domain in domain_set:
                            result[domain] = result.get(domain, 0) + 1
                        success_num += 1
                        print('查詢成功{0}個'.format(success_num))
                except Exception as e:
                    print(e)
                finally:
                    print(kwd, '查詢結束')
            q.task_done()

    # Persist the tallied domains, most frequent first.
    @staticmethod
    def save():
        """Write ``domain<TAB>count`` lines to result2.txt, sorted by count
        descending."""
        print('開始save.....')
        res_sort = sorted(result.items(), key=lambda s: s[1], reverse=True)
        print(res_sort)
        with open('result2.txt', 'w', encoding="utf-8") as f:
            for domain, value in res_sort:
                f.write(str(domain) + '\t' + str(value) + '\n')


if __name__ == "__main__":
    start = time.time()

    user_agent = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'}
    threadLock = threading.Lock()  # 鎖
    result = {}   # 初始結果保存字典
    success_num = 0  # 查詢成功個數
    q = bdpcCover.read_file('kwd.txt')
    all_num = q.qsize() #總詞數

    # 設置線程數
    for i in list(range(5)):
        t = bdpcCover()
        t.setDaemon(True)
        t.start()
    q.join()

    # 結果保存文件
    bdpcCover.save()
    end = time.time()
    print('\n關鍵詞共{0}個,查詢成功{1}個,耗時{2}min'.format(all_num,success_num,(end-start)/60) )