Preface
A Python crawler, or web spider, fetches pages from a website and then parses the responses to extract the data of interest.
The basic modules are urllib, urllib2, re, and so on. (All of the examples below are written for Python 2.)
(I) Basic usage and examples
(1) Make a basic GET request to fetch the HTML of a web page
#!coding=utf-8
import urllib2

url = 'http://www.baidu.com/'
# Build a GET request
request = urllib2.Request(url)
try:
    # Send the request and obtain the response
    response = urllib2.urlopen(request)
except urllib2.HTTPError, e:
    if hasattr(e, 'reason'):
        print e.reason
else:
    # Read the response body
    html = response.read()
    # Read the response headers
    headers = response.info()
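If the request needs query parameters, urllib.urlencode can build the query string before it is appended to the URL. Here is a minimal sketch; the httpbin.org endpoint and the parameter names are only illustrations:

#!coding=utf-8
import urllib
import urllib2

base_url = 'http://httpbin.org/get'   # placeholder endpoint
# Encode the parameters into a query string such as 'wd=python&pn=0'
params = urllib.urlencode({'wd': 'python', 'pn': 0})
# A GET request carries its parameters in the URL itself
response = urllib2.urlopen(base_url + '?' + params)
print response.getcode()      # HTTP status, e.g. 200
print response.read()[:200]   # first part of the body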
(2) Form submission (POST)
#!coding=utf-8
import urllib
import urllib2

post_url = ''   # fill in the form's action URL
# Encode the form fields as application/x-www-form-urlencoded
post_data = urllib.urlencode({
    'username': 'username',
    'password': 'password',
})
post_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
}
# Passing data makes urllib2 send a POST instead of a GET
request = urllib2.Request(
    url=post_url,
    data=post_data,
    headers=post_headers,
)
response = urllib2.urlopen(request)
html = response.read()
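After a POST it is often worth checking where you ended up, because a successful login is usually followed by a redirect. A small hedged sketch (the httpbin.org URL and the form fields are placeholders):

#!coding=utf-8
import urllib
import urllib2

request = urllib2.Request(
    url='http://httpbin.org/post',   # placeholder form endpoint
    data=urllib.urlencode({'username': 'u', 'password': 'p'}),
)
try:
    response = urllib2.urlopen(request)
except urllib2.URLError, e:          # URLError also catches HTTPError
    print 'request failed:', e
else:
    print response.getcode()   # status code after redirects
    print response.geturl()    # final URL; differs from the post URL if redirected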
(3) Scrape post contents from a Baidu Tieba thread
#!coding=utf-8
import re
import sys
import urllib2

reload(sys)
sys.setdefaultencoding('utf-8')

page_num = 1
# see_lz=1 restricts the thread to the original poster's posts
url = 'http://tieba.baidu.com/p/3238280985?see_lz=1&pn=' + str(page_num)
myPage = urllib2.urlopen(url).read().decode('gbk')

# Non-greedy match on each post's content <div>
myRe = re.compile(r'class="d_post_content j_d_post_content ">(.*?)</div>', re.DOTALL)
items = myRe.findall(myPage)

f = open('baidu.txt', 'a+')
i = 0
for item in items:
    i += 1
    print i
    # Strip the markup and whitespace left over from the HTML
    text = item.replace('<br>', '')
    text = text.replace('\n', '').replace(' ', '') + '\n'
    print text
    f.write(text)
f.close()
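The snippet above only fetches page 1, even though page_num is a variable. A hedged sketch of extending it to walk several pages (the page count of 3 is arbitrary; substitute the thread's real page count):

#!coding=utf-8
import re
import urllib2

myRe = re.compile(r'class="d_post_content j_d_post_content ">(.*?)</div>', re.DOTALL)
f = open('baidu.txt', 'a+')
for page_num in range(1, 4):   # pages 1..3
    url = 'http://tieba.baidu.com/p/3238280985?see_lz=1&pn=' + str(page_num)
    myPage = urllib2.urlopen(url).read().decode('gbk')
    for item in myRe.findall(myPage):
        text = item.replace('<br>', '').replace('\n', '').replace(' ', '')
        # item was decoded from gbk, so encode explicitly before writing
        f.write(text.encode('utf-8') + '\n')
f.close()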
(4) Log in to 163 Mail and download mail contents
#coding:utf-8
'''
Log in to 163 Mail and download mail contents
'''
import urllib
import urllib2
import cookielib
import re
import time
import json

class Email163:
    header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    user = ''
    cookie = None
    sid = None
    mailBaseUrl = 'http://twebmail.mail.163.com'

    def __init__(self):
        # Install a global opener that keeps the session cookies
        self.cookie = cookielib.CookieJar()
        cookiePro = urllib2.HTTPCookieProcessor(self.cookie)
        urllib2.install_opener(urllib2.build_opener(cookiePro))

    def login(self, user, pwd):
        '''
        Sign in
        '''
        postdata = urllib.urlencode({
            'username': user,
            'password': pwd,
            'type': 1
        })
        # Note that the login URL differs between webmail versions
        req = urllib2.Request(
            url='https://ssl.mail.163.com/entry/coremail/fcg/ntesdoor2?funcid=loginone&language=-1&passtype=1&iframe=1&product=mail163&from=web&df=email163&race=-2_45_-2_hz&module=&uid=' + user + '&style=10&net=t&skinid=null',
            data=postdata,
            headers=self.header,
        )
        res = str(urllib2.urlopen(req).read())
        #print res
        patt = re.compile('sid=([^"]+)', re.I)
        patt = patt.search(res)

        uname = user.split('@')[0]
        self.user = user
        if patt:
            self.sid = patt.group(1).strip()
            #print self.sid
            print '%s Login successful.....' % (uname)
        else:
            print '%s Login failed....' % (uname)

    def getInBox(self):
        '''
        Get the inbox list
        '''
        print '\nGet mail lists.....\n'
        sid = self.sid
        url = self.mailBaseUrl + '/jy3/list/list.do?sid=' + sid + '&fid=1&fr=folder'
        res = urllib2.urlopen(url).read()
        # Pull the sender, URL and subject out of each inbox row
        mailList = []
        patt = re.compile('<div\s+class="tdLike Ibx_Td_From"[^>]+>.*?href="([^"]+)"[^>]+>(.*?)<\/a>.*?<div\s+class="tdLike Ibx_Td_Subject"[^>]+>.*?href="[^>]+>(.*?)<\/a>', re.I | re.S)
        patt = patt.findall(res)
        if patt == None:
            return mailList
        for i in patt:
            line = {
                'from': i[1].decode('utf8'),
                'url': self.mailBaseUrl + i[0],
                'subject': i[2].decode('utf8')
            }
            mailList.append(line)
        return mailList

    def getMailMsg(self, url):
        '''
        Download the mail content
        '''
        content = ''
        print '\n Download.....%s\n' % (url)
        res = urllib2.urlopen(url).read()
        patt = re.compile('contentURL:"([^"]+)"', re.I)
        patt = patt.search(res)
        if patt == None:
            return content
        url = '%s%s' % (self.mailBaseUrl, patt.group(1))
        time.sleep(1)
        res = urllib2.urlopen(url).read()
        Djson = json.JSONDecoder(encoding='utf8')
        jsonRes = Djson.decode(res)
        if 'resultVar' in jsonRes:
            content = jsonRes['resultVar']
        time.sleep(3)
        return content

'''
Demo
'''
# Initialization
mail163 = Email163()
# Sign in
mail163.login('lpe234@163.com', '944898186')
time.sleep(2)
# Get the inbox
elist = mail163.getInBox()
# Get each message's content
for i in elist:
    print 'Subject: %s From: %s Content:\n%s' % (
        i['subject'].encode('utf8'),
        i['from'].encode('utf8'),
        mail163.getMailMsg(i['url']).encode('utf8'))
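The CookieJar above lives only in memory, so every run has to log in again. A hedged sketch of persisting the session with cookielib.MozillaCookieJar instead (the cookies.txt filename is arbitrary):

#!coding=utf-8
import cookielib
import urllib2

COOKIE_FILE = 'cookies.txt'   # arbitrary path for the saved session
cookie = cookielib.MozillaCookieJar(COOKIE_FILE)
try:
    # Reuse cookies saved by a previous run, if any
    cookie.load(ignore_discard=True, ignore_expires=True)
except IOError:
    pass   # first run: no cookie file yet
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
urllib2.install_opener(opener)

# ... log in as above, then save the session for next time
cookie.save(ignore_discard=True, ignore_expires=True)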
(5) Login scenarios: cookies, proxies, forms, and headers
# 1. Cookie handling
import urllib2, cookielib

cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()

# 2. Using a proxy together with cookies
proxy_support = urllib2.ProxyHandler({'http': 'http://XX.XX.XX.XX:XXXX'})  # proxy address placeholder
opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)

# 3. Form processing
import urllib
postdata = urllib.urlencode({
    'username': 'XXXXX',
    'password': 'XXXXX',
    'continueURI': 'http://www.verycd.com/',
    'fk': fk,   # fk must be scraped from the login page beforehand
    'login_submit': 'Sign in'
})
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata
)
result = urllib2.urlopen(req).read()

# 4. Disguising the crawler as a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata,
    headers=headers
)

# 5. Defeating anti-hotlinking checks: send a Referer header
headers = {
    'Referer': 'http://www.cnbeta.com/articles'
}
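These pieces compose: handlers stack in a single opener, and default headers can be attached to it so every request carries them. A hedged sketch combining cookies, a User-Agent and a Referer (all addresses are placeholders, and the proxy line is optional):

#!coding=utf-8
import cookielib
import urllib2

cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
# proxy_support = urllib2.ProxyHandler({'http': 'http://XX.XX.XX.XX:XXXX'})
opener = urllib2.build_opener(cookie_support)   # add proxy_support here if needed
# Headers sent with every request made through this opener
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) Gecko/20091201 Firefox/3.5.6'),
    ('Referer', 'http://www.cnbeta.com/articles'),
]
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()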
(6) Multithreading
from threading import Thread
from Queue import Queue
from time import sleep

# q is the task queue
# NUM is the number of concurrent worker threads
# JOBS is the total number of tasks
q = Queue()
NUM = 2
JOBS = 10

# The handler function, responsible for processing a single task
def do_somthing_using(arguments):
    print arguments

# The worker loop: keep pulling tasks off the queue and processing them
def working():
    while True:
        arguments = q.get()
        do_somthing_using(arguments)
        sleep(1)
        q.task_done()

# Fork NUM daemon threads waiting on the queue
for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

# Queue up the JOBS
for i in range(JOBS):
    q.put(i)

# Wait for all JOBS to complete
q.join()
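The same worker-queue pattern applies directly to crawling: put URLs on the queue and let the workers fetch them concurrently. A hedged sketch; the URL list and thread count are placeholders:

#!coding=utf-8
import urllib2
from threading import Thread
from Queue import Queue

q = Queue()
NUM = 4   # number of fetcher threads; tune to taste

# Worker: fetch one URL per task, report size or failure
def fetch():
    while True:
        url = q.get()
        try:
            html = urllib2.urlopen(url, timeout=10).read()
            print '%s -> %d bytes' % (url, len(html))
        except urllib2.URLError, e:
            print '%s failed: %s' % (url, e)
        q.task_done()

for i in range(NUM):
    t = Thread(target=fetch)
    t.setDaemon(True)
    t.start()

# Placeholder URLs; substitute the pages you actually want to crawl
for url in ['http://www.baidu.com/'] * 10:
    q.put(url)

q.join()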