Source: 北京SEO  Date: 2019-04-11
  A URL can bring in search traffic only after it has been both included and indexed; a URL that is included but not indexed cannot be found through any query. The common way to check inclusion is to search for the URL itself and see whether it appears in the results. The common way to check indexing is to search for the URL's complete title: if the URL can be retrieved through its full title, it is considered indexed.
 
  That is the widely accepted approach. Personally, I would argue that being retrievable through the URL alone already means the URL is indexed, since a URL is itself a kind of search term, and the title query fails only because the page's weight is too low. But there is no point splitting hairs, so the script below follows the conventional method.
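  Both checks boil down to building a Baidu SERP query URL, one for the target URL and one for its title. A minimal sketch of the two query patterns the script uses (example.com and the title string are hypothetical placeholders):

from urllib.parse import quote

target_url = 'https://example.com/page'  # hypothetical URL to check
page_title = 'Example Page Title'        # hypothetical title fetched from that page

# Inclusion check: search for the URL itself
inclusion_query = 'https://www.baidu.com/s?ie=utf-8&wd={0}'.format(quote(target_url))
# Index check: search for the page's full title
index_query = 'https://www.baidu.com/s?ie=utf-8&wd={0}'.format(quote(page_title))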
 
  The script below batch-checks whether a list of links is included and indexed in Baidu PC search.
 
  Prepare the file of URLs to check (url.txt, one URL per line).
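  A hypothetical url.txt (remember that http/https and trailing-slash variants count as different URLs):

https://example.com/
https://example.com/page-1
http://example.com/page-2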
 
  Results are saved to bdpc_index.txt.
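  Each output line is the URL, a tab, and its status, so a hypothetical bdpc_index.txt could look like:

https://example.com/	included and indexed
https://example.com/page-1	included but not indexed
http://example.com/page-2	not included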
 
# -*- coding: utf-8 -*-
"""
準備url.txt,一行一個url,必須帶http或https
區分https或者http
區分https://aaa/bbb和https://aaa/bbb/
查詢某個url是否收錄,有收錄再訪問該url獲取title(模擬百度蜘蛛UA獲取title),查詢通過搜索title該url是否有排名
可能的坑:因為是訪問url獲取title,如果短時間查詢同一個域名下多條url,可能被封造成獲取title錯誤影響最終結果。
"""

import requests
from pyquery import PyQuery as pq
import threading
import queue


class BdpcIndex(threading.Thread):

    def __init__(self):
        threading.Thread.__init__(self)

    # Read the txt file and queue up the URLs to check
    @staticmethod
    def read_txt(filepath):
        q = queue.Queue()
        with open(filepath, encoding='utf-8') as url_file:
            for url in url_file:
                url = url.strip()
                if url:  # skip blank lines
                    q.put(url)
        return q

    # Fetch the title of a URL being checked
    def get_title(self, url, user_agent):
        html = self.get_html(url, user_agent)
        if html:
            doc = pq(html)
            title = doc('title').text()
            return title

    # Fetch the HTML of a URL (a SERP or a target page)
    def get_html(self, url, user_agent, retry=2):
        try:
            r = requests.get(url=url, headers=user_agent, timeout=5)
        except Exception as e:
            print('failed to fetch html', url, e)
            if retry > 0:
                # return the retry's result; without return the caller would get None
                return self.get_html(url, user_agent, retry - 1)
        else:
            return r.text

    # Extract the encrypted URLs of all organic results from the SERP HTML
    def get_encrpt_urls(self, html):
        encrypt_url_list = []
        if html and '_百度搜索' in html:  # Baidu SERP titles end with '_百度搜索'
            doc = pq(html)
            try:
                a_list = doc('.t a').items()
            except Exception as e:
                print('no encrypted urls found, link not included', e)
            else:
                for a in a_list:
                    encrypt_url = a.attr('href')
                    # keep only links that go through Baidu's redirect service
                    if encrypt_url and encrypt_url.startswith('http://www.baidu.com/link?url='):
                        encrypt_url_list.append(encrypt_url)
        return encrypt_url_list

    # Decrypt one encrypted URL: the redirect's Location header is the real URL
    def decrypt_url(self, encrypt_url, retry=1):
        try:
            encrypt_url = encrypt_url.replace('http://', 'https://')
            # requests.head does not follow redirects by default
            r = requests.head(encrypt_url, headers=user_agent)
        except Exception as e:
            print(encrypt_url, 'decryption failed', e)
            if retry > 0:
                return self.decrypt_url(encrypt_url, retry - 1)
        else:
            return r.headers.get('Location')

    # Decrypt the whole list of encrypted first-page URLs
    def get_real_urls(self, encrypt_url_list):
        if encrypt_url_list:
            real_url_list = [self.decrypt_url(encrypt_url) for encrypt_url in encrypt_url_list]
            return real_url_list
        else:
            return []

    # Check whether the target URL appears among the real URLs
    def check_include(self, url, real_urls):
        if url in real_urls:
            return 1
        else:
            return 0

    # Worker thread loop
    def run(self):
        while 1:
            target_url = q.get()
            # Inclusion check: search for target_url itself
            url = "https://www.baidu.com/s?ie=utf-8&wd={0}".format(target_url)
            html = self.get_html(url, user_agent)
            encrypt_url_list = self.get_encrpt_urls(html)
            real_urls = self.get_real_urls(encrypt_url_list)
            num_target_url = self.check_include(target_url, real_urls)
            # If included, check whether it is indexed
            if num_target_url == 1:
                # Fetch the title of target_url, then search that title
                title = self.get_title(target_url, baidu_ua)
                if title:
                    url = "https://www.baidu.com/s?ie=utf-8&wd={0}".format(title)
                    html = self.get_html(url, user_agent)
                    encrypt_url_list = self.get_encrpt_urls(html)
                    real_urls = self.get_real_urls(encrypt_url_list)
                    num_title = self.check_include(target_url, real_urls)
                    if num_title == 1:
                        print(target_url, "included and indexed")
                        f.write(target_url + '\t' + 'included and indexed' + '\n')
                    elif num_title == 0:
                        print(target_url, "included but not indexed")
                        f.write(target_url + '\t' + 'included but not indexed' + '\n')
                else:
                    print(target_url, 'title not fetched')
                    f.write(target_url + '\t' + 'title not fetched' + '\n')
            elif num_target_url == 0:
                print(target_url, "not included")
                f.write(target_url + '\t' + 'not included' + '\n')
            q.task_done()


if __name__ == "__main__":

    user_agent = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    baidu_ua = {'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'}
    q = BdpcIndex.read_txt('url.txt')
    f = open('bdpc_index.txt', 'w', encoding='utf-8')
    # Number of worker threads
    for i in range(6):
        t = BdpcIndex()
        t.daemon = True  # daemon threads exit once the main thread finishes
        t.start()
    q.join()
    f.flush()
    f.close()
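
For reference, the "decryption" step can be reproduced in isolation: Baidu wraps organic results in redirect links under baidu.com/link?url=..., and since requests.head does not follow redirects by default, the real landing URL shows up in the Location response header. A minimal sketch (the token below is a placeholder, not a real SERP link):

import requests

encrypt_url = 'https://www.baidu.com/link?url=XXXX'  # placeholder token, not a real link
r = requests.head(encrypt_url, timeout=5)            # HEAD is not redirected by default
print(r.headers.get('Location'))                     # the real URL, if Baidu returns a redirect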

  