Python:使用代理proxy爬虫

Python:使用代理proxy爬虫

代理我就不介绍了..代理简介和类型可以参考proxy代理类型:透明代理 匿名代理 混淆代理和高匿代理. 这里写一些python爬虫使用代理的知识, 还有一个代理池的类. 附带一些看到的帖子中用urllib构建的示例.

如果要测试代理是否成功, 抓http://icanhazip.com 这个网站看内容就知道了.


urllib 模块使用代理

urllib/urllib2使用代理比较麻烦, 需要先构建一个ProxyHandler的类, 随后将该类用于构建网页打开的opener的类,再在request中安装该opener.

代理格式是"http://112.25.41.136:80",如果要账号密码是"http://user:password@112.25.41.136:80".

# Proxy URL; with credentials it would be "http://user:password@host:port"
proxy = "http://112.25.41.136:80"
# A ProxyHandler routes requests of the given scheme through the proxy
proxy_support = urllib.request.ProxyHandler({'http': proxy})
# Bake the handler into an opener ...
opener = urllib.request.build_opener(proxy_support)
# ... and install that opener globally for urllib.request
urllib.request.install_opener(opener)
# Every subsequent urlopen() call now goes through the proxy
r = urllib.request.urlopen('http://icanhazip.com', timeout=1000)

requests 模块 使用代理

requests使用代理要比urllib简单多了…这里以单次代理为例. 多次的话可以用session一类构建.

如果需要使用代理,你可以通过为任意请求方法提供 proxies 参数来配置单个请求:

import requests

# http and https traffic can each be routed through its own proxy
proxies = {
  "http": "http://10.10.1.10:3128",
  "https": "http://10.10.1.10:1080",
}

r = requests.get("http://icanhazip.com", proxies=proxies)
# print as a function call so the snippet also runs on Python 3
# (the article's urllib examples already use Python 3's urllib.request)
print(r.text)

你也可以通过环境变量 HTTP_PROXY 和 HTTPS_PROXY 来配置代理。

export HTTP_PROXY="http://10.10.1.10:3128"
export HTTPS_PROXY="http://10.10.1.10:1080"
python
>>> import requests
>>> r=requests.get("http://icanhazip.com")
>>> print(r.text)

若你的代理需要使用HTTP Basic Auth,可以使用 http://user:password@host/ 语法:

# HTTP Basic Auth credentials are embedded directly in the proxy URL
proxies = {"http": "http://user:pass@10.10.1.10:3128/"}

示例脚本

这里以gatherproxy的高匿代理为例构建一个代理池的类.别的如西刺代理同理构建.

#! /usr/bin/env python
# -*- coding: utf-8 -*-

__author__="Platinhom"
__date__="2016.1.29 23:30"

import re,requests,random

header={'headers':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

class GatherProxy(object):
	'''Scrape elite (high-anonymity) proxies from http://gatherproxy.com/.'''
	url='http://gatherproxy.com/proxylist'
	# one <tr>...</tr> table row; newlines matched explicitly instead of re.S
	pre1=re.compile(r'<tr.*?>(?:.|\n)*?</tr>')
	# payload of the page's javascript document.write('...') wrappers
	pre2=re.compile(r"(?<=\(\').+?(?=\'\))")

	def getelite(self,pages=1,uptime=70,fast=True):
		'''Return a set of "ip:port" strings for elite anonymous proxies.

		pages:  how many result pages to fetch
		uptime: minimum uptime (L/D) percentage to request
		fast:   keep only proxies the site marks as fast (short response time)
		'''
		proxies=set()
		for i in range(1,pages+1):
			params={"Type":"elite","PageIdx":str(i),"Uptime":str(uptime)}
			r=requests.post(self.url+"/anonymity/t=Elite",params=params,headers=header)
			for td in self.pre1.findall(r.text):
				if fast and 'center fast' not in td:
					continue
				try:
					tmp=self.pre2.findall(str(td))
					if(len(tmp)==2):
						# the site obfuscates the port as a hex string
						proxies.add(tmp[0]+":"+str(int('0x'+tmp[1],16)))
				except ValueError:
					# malformed hex port -> skip this row; the original bare
					# `except:` would also have hidden genuine bugs
					pass
		return proxies

class ProxyPool(object):
	'''Pool of "ip:port" proxies, refilled from GatherProxy on demand.'''

	# shared scraper instance used to refill the pool
	gatherproxy=GatherProxy()

	def __init__(self):
		self.pool=set()

	def updateGatherProxy(self,pages=1,uptime=70,fast=True):
		'''Refill the pool with proxies scraped by GatherProxy.'''
		self.pool.update(self.gatherproxy.getelite(pages=pages,uptime=uptime,fast=fast))

	def removeproxy(self,proxy):
		'''Remove a proxy from the pool (no-op when it is absent).'''
		self.pool.discard(proxy)

	def randomchoose(self):
		'''Return a random proxy, refilling the pool first if it is empty.

		Raises ValueError if the pool is still empty after the refill.'''
		if not self.pool:
			self.updateGatherProxy()
		# sample from a tuple: random.sample() no longer accepts sets (3.11+)
		return random.sample(tuple(self.pool),1)[0]

	def getproxy(self):
		'''Return a requests-style proxies dict for a proxy that answered a probe.

		Dead proxies are dropped from the pool.  Implemented as a loop rather
		than the original self-recursion, which could exhaust the call stack
		on a long run of dead proxies.'''
		while True:
			proxy=self.randomchoose()
			proxies={'http':'http://'+proxy,'https':'https://'+proxy}
			try:
				r=requests.get('http://dx.doi.org',proxies=proxies,timeout=1)
				if (r.status_code == 200 ):
					return proxies
			except Exception:
				pass	# probe failed -> treat the proxy as dead
			self.removeproxy(proxy)

实例2: urllib代理刷CSDN博客(转载)

转载自FadeTrack 的 Python爬虫入门 《下》. 使用的是西刺代理作为代理的源.

# 刷 CSDN 博客访问量
import urllib.request
import re,random
from multiprocessing.dummy import Pool as ThreadPool 
time_out = 3 # global request timeout in seconds (NB: value is 3, the original comment claimed 10)
count = 0 # visits per post; set from user input in __main__
proxies = [None] # proxy pool; the None entry means "direct connection, no proxy"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'} # browser-like UA for the proxy-list site
def get_proxy():
    '''Scrape HTTP proxies from xicidaili.com into the global `proxies` list.'''
    # mutate the module-level pool in place
    global proxies
    try:
        req = urllib.request.Request('http://www.xicidaili.com/',None,headers)
        # urlopen()/read() are where network errors actually occur; the
        # original try covered only the Request() constructor, so failures
        # escaped the intended error message below
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')
    except Exception:
        print('无法获取代理信息!')
        return
    p = re.compile(r'''<tr\sclass[^>]*>\s+
                                    <td>.+</td>\s+
                                    <td>(.*)?</td>\s+
                                    <td>(.*)?</td>\s+
                                    <td>(.*)?</td>\s+
                                    <td>(.*)?</td>\s+
                                    <td>(.*)?</td>\s+
                                    <td>(.*)?</td>\s+
                                </tr>''',re.VERBOSE)
    proxy_list = p.findall(html)
    # skip the first row (table header); keep only plain-HTTP proxies
    for each_proxy in proxy_list[1:]:
        if each_proxy[4] == 'HTTP':
            proxies.append(each_proxy[0]+':'+each_proxy[1])
def change_proxy():
    '''Install a randomly chosen proxy (or a direct connection) into urllib.'''
    proxy = random.choice(proxies)
    # identity test with `is None` is the correct idiom (was `== None`)
    if proxy is None:
        proxy_support = urllib.request.ProxyHandler({})   # direct connection
    else:
        proxy_support = urllib.request.ProxyHandler({'http':proxy})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent',headers['User-Agent'])]
    # install globally: every later urlopen() uses this proxy
    urllib.request.install_opener(opener)
    print('智能切换代理:%s' % ('本机' if proxy is None else proxy))
def get_req(url):
    '''Build a urllib Request for *url* carrying forged browser headers.'''
    forged = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36',
        'Host':'blog.csdn.net',
        'Referer':'http://blog.csdn.net/',
        'GET':url,
    }
    return urllib.request.Request(url, headers=forged)
# 访问 博客
def look_blog(url):
    '''Visit *url* once through a freshly selected proxy (best effort).'''
    change_proxy()  # rotate the proxy before every visit
    try:
        urllib.request.urlopen(get_req(url), timeout=time_out)
    except:
        return  # any failure just skips this visit
    print('访问成功!')
# 迭代访问
def click_blog(url):
    '''Visit *url* `count` times (count is the module-level setting).'''
    # range(0, count) never yields i == count, so the original early-break
    # check inside the loop was unreachable and has been removed
    for i in range(count):
        print('当前访问 Blog %s 第 %d 次' % (url,i))
        look_blog(url)
# 获取博客的文章链表
def get_blog_list(url):
    '''Return the list of article hrefs found on the blog index *url*,
    or None when the page cannot be fetched.'''
    req = get_req(url)
    try:
        response = urllib.request.urlopen(req,timeout = time_out)
    except Exception:
        print('无法挽回的错误')
        return None
    # decode the bytes properly: the original applied str() to the raw bytes,
    # which yields the "b'...'" repr and leaves \x.. escape sequences in the
    # text being matched
    html = response.read().decode('utf-8')
    # article links on the index page
    regx = '<span class="link_title"><a href="(.+?)">'
    pat = re.compile(regx)
    blog_list = re.findall(pat,html)
    return blog_list
if __name__ == '__main__':
    # NOTE: the original `global count` here was a no-op -- module-level code
    # already rebinds module globals directly -- so it was removed.
    # 基本参数初始化
    # 获取代理
    get_proxy()
    print('有效代理个数为 : %d' % len(proxies))
    blogurl = input('输入blog链接:')
    # fall back to a default blog when nothing is entered
    if len(blogurl) == 0:
        blogurl = 'http://blog.csdn.net/bkxiaoc/'
    print('博客地址是:%s' % blogurl)
    try:
        count = int(input('输入次数:'))
    except ValueError:
        print('参数错误')
        quit()
    # reject non-positive counts too (the original accepted negatives)
    if count <= 0 or count > 999:
        print('次数过大或过小')
        quit()
    print('次数确认为 %d' % count)
    # 获取博文列表 (one page only)
    blog_list = get_blog_list(blogurl + '?viewmode=contents')
    # get_blog_list returns None on failure; len(None) would crash here
    if not blog_list:
        print('未找到Blog列表')
        quit()
    print('启动!!!!!!!!!!!!!!!!!!!!')
    # make the relative hrefs absolute
    blog_list = ['http://blog.csdn.net' + link for link in blog_list]
    # half as many threads as posts, but at least one: ThreadPool(0) raises
    pool = ThreadPool(max(1, len(blog_list) // 2))
    results = pool.map(click_blog, blog_list)
    pool.close()
    pool.join()
    print('完成任务!!!!!!!!!!!!!!!!!!!!')

 

发表评论