本文共 2906 字,大约阅读时间需要 9 分钟。
随机User-Agent scrapy
# The fake_useragent library: disguising the request headers.
from fake_useragent import UserAgent

ua = UserAgent()

# Print one User-Agent string per browser family, in this order:
# IE, Opera, Chrome, Firefox, Safari.
for browser in ("ie", "opera", "chrome", "firefox", "safari"):
    print(getattr(ua, browser))

# The most commonly used form. For a crawler the most practical thing is
# being able to vary the headers at will, and they must be random:
# `ua.random` supports generating a random request header.
for _ in range(3):
    print(ua.random)
获取发出请求的ip地址
def get_local_ip():
    """Fetch ``http://ip.hahado.cn/ip`` through the proxy and print the body.

    The request carries a random User-Agent from ``fake_useragent`` and is
    routed through the proxy configured below. Per the surrounding article,
    the printed text is the IP address the request was sent from.

    Returns:
        None. The response body is printed to stdout.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    headers = {"User-Agent": UserAgent().random}
    url = 'http://ip.hahado.cn/ip'
    # Abuyun paid proxy.
    # NOTE(review): the proxy credentials are hard-coded in source; consider
    # loading them from configuration or the environment instead.
    proxy = {
        'http': 'http://HC9XY1E5IT9P:884F63FA3A@http-pro.abuyun.com:9010'}
    # requests has no default timeout, so without one an unresponsive proxy
    # would block this call indefinitely.
    response = requests.get(url=url, headers=headers, proxies=proxy,
                            timeout=10)
    print(response.text)
from fake_useragent import UserAgent


class RandomUserAgentMiddlware(object):
    """Scrapy middleware that gives each request a User-Agent header.

    The User-Agent is read from a ``fake_useragent.UserAgent`` instance using
    the attribute named by the ``RANDOM_UA_TYPE`` setting ("random" when the
    setting is absent).
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the given crawler."""
        return cls(crawler)

    def __init__(self, crawler):
        """Create the User-Agent source and read ``RANDOM_UA_TYPE``."""
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    def process_request(self, request, spider):
        """Set the request's User-Agent header if it is not already set.

        NOTE(review): ``setdefault`` leaves an existing User-Agent untouched,
        so confirm no earlier middleware has already filled the header in.
        """
        # Look the attribute up on every request so each one gets a fresh
        # value (e.g. ``ua.random``).
        user_agent = getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', user_agent)
代理ip
# Prerequisite: first save the working free proxy IPs to a file.
class GetIP(object):
    """Pick a usable proxy from a CSV file of proxy addresses.

    NOTE(review): ``xici_ip3.csv`` is read without a header row, taking the
    IP from column 1, the port from column 2 and the protocol from column 3
    ('HTTP' selects http, anything else is treated as https) — confirm this
    matches the file that was saved.
    """

    def delete_ip(self):
        """Load ``xici_ip.csv`` and print its first column.

        NOTE(review): despite the name, nothing is deleted here, and the file
        read differs from the one used by ``get_random_ip`` — confirm intent.
        """
        dd = pd.read_csv('xici_ip.csv', header=None)
        print(dd[0])

    def judge_ip(self, ip, port, http):
        """Check whether a proxy can be used to fetch Baidu's home page.

        Args:
            ip: proxy host.
            port: proxy port.
            http: protocol label; 'HTTP' means an http proxy, any other
                value is treated as https.

        Returns:
            True if the request through the proxy returns a 2xx status,
            False on any error or on a non-2xx status.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
        proxy_url = "{0}:{1}".format(ip, port)
        scheme = 'http' if http == 'HTTP' else 'https'
        # requests selects the proxy by the scheme of the *target* URL, so
        # the test URL must use the same scheme as the proxies key. Testing
        # an https proxy against an http:// URL would bypass the proxy and
        # only check the direct connection.
        test_url = scheme + "://www.baidu.com"
        # NOTE(review): the proxy URL keeps the original 'https://' prefix
        # for https entries — confirm that is what these proxies expect.
        proxy_dict = {scheme: scheme + '://' + proxy_url}
        try:
            response = requests.get(test_url, proxies=proxy_dict, timeout=5,
                                    headers=headers)
            print(response)
        except Exception:
            # Deliberately broad: any failure means the proxy is unusable.
            print('无效')
            return False
        code = response.status_code
        if 200 <= code < 300:
            print('有效')
            return True
        print('wuxiao')
        return False

    def get_random_ip(self, max_attempts=None):
        """Return the URL of a randomly chosen proxy that passes ``judge_ip``.

        Args:
            max_attempts: optional limit on how many proxies are tried.
                ``None`` (the default) keeps trying until one works.

        Returns:
            "http://ip:port" or "https://ip:port" for the first sampled proxy
            that ``judge_ip`` accepts.

        Raises:
            RuntimeError: if ``max_attempts`` proxies were tried without
                finding a working one.
        """
        # Read the file once; retrying in a loop instead of recursing avoids
        # a RecursionError when many proxies in a row are dead.
        dd = pd.read_csv('xici_ip3.csv', header=None)
        attempts = 0
        while max_attempts is None or attempts < max_attempts:
            attempts += 1
            row = dd.sample(n=1, random_state=None)
            ip = row.iloc[0, 1]
            port = row.iloc[0, 2]
            http = row.iloc[0, 3]
            if self.judge_ip(ip, port, http):
                scheme = "http" if http == 'HTTP' else "https"
                return "{0}://{1}:{2}".format(scheme, ip, port)
        raise RuntimeError(
            "no working proxy found after {0} attempts".format(attempts))
转载地址:http://cmvii.baihongyu.com/