Python crawler: the basic modules you must understand!

Posted by mcatalf0221 on Sat, 30 Nov 2019 15:05:25 +0100

Preface
A Python crawler (web spider) fetches pages from a website and then parses and extracts the data it needs.

The basic modules are urllib, urllib2, re, etc. (These are Python 2 modules; in Python 3 their functionality lives in urllib.request and urllib.parse.)

(I) Basic usage and examples

(1) Make a basic GET request to fetch a page's HTML

#!coding=utf-8
import urllib
import urllib2

url = 'http://www.baidu.com/'
# Build the GET request
request = urllib2.Request(url)
try:
    # Send the request and get the response
    response = urllib2.urlopen(request)
except urllib2.URLError, e:
    # URLError (and its subclass HTTPError) carries the failure reason
    if hasattr(e, 'reason'):
        print e.reason
else:
    # Read the response body
    html = response.read()
    # Read the response headers
    headers = response.info()
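
To pass query parameters with a GET request, the usual pattern is to URL-encode them with urllib.urlencode and append them to the URL (a minimal sketch; the 'wd' search parameter is just illustrative):

#!coding=utf-8
import urllib
import urllib2

# Illustrative query parameter for a Baidu search
params = urllib.urlencode({'wd': 'python'})
response = urllib2.urlopen('http://www.baidu.com/s?' + params)
print response.getcode()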

(2) Submit a form (POST request)


#!coding=utf-8
import urllib2
import urllib

post_url = ''  # URL of the form handler (placeholder)

post_data = urllib.urlencode({
    'username': 'username',
    'password': 'password',
})

post_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
}

request = urllib2.Request(
    url=post_url,
    data=post_data,
    headers=post_headers,
)

response = urllib2.urlopen(request)

html = response.read()
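
Once urlopen returns, the response can be sanity-checked before parsing. getcode(), geturl() and info() are standard on urllib2 responses; a short sketch continuing from the response above:

# HTTP status code, e.g. 200
print response.getcode()
# Final URL after any redirects the opener followed
print response.geturl()
# Read a single header from the response
print response.info().getheader('Content-Type')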

(3) Scrape a Baidu Tieba thread with a regular expression


#!coding=utf-8

import sys
import urllib2
import re

# Python 2 hack: make utf-8 the default encoding so the decoded text
# can be written to the file without explicit re-encoding
reload(sys)
sys.setdefaultencoding('utf-8')

page_num = 1
url = 'http://tieba.baidu.com/p/3238280985?see_lz=1&pn=' + str(page_num)
myPage = urllib2.urlopen(url).read().decode('gbk')

# Capture the body of every post in the thread
myRe = re.compile(r'class="d_post_content j_d_post_content ">(.*?)</div>', re.DOTALL)
items = myRe.findall(myPage)

f = open('baidu.txt', 'a+')

i = 0
for item in items:
    i += 1
    print i
    # Strip <br> tags, newlines and spaces left over from the HTML
    text = item.replace('<br>', '').replace('\n', '').replace(' ', '') + '\n'
    print text
    f.write(text)

f.close()
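
Regex captures like the one above often keep stray tags or entities. A small cleanup helper can normalize the fragments before writing them out (a sketch; clean_fragment is a hypothetical name, not part of the original script):

import re

def clean_fragment(fragment):
    # Drop any HTML tags that survived inside the captured post body
    text = re.sub(r'<[^>]+>', '', fragment)
    # Collapse runs of whitespace into single spaces
    return re.sub(r'\s+', ' ', text).strip()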

(4) Log in to a 163 mailbox and download mail

#coding:utf-8
'''
    Log in to a 163 mailbox and download mail content
'''
import urllib
import urllib2
import cookielib
import re
import time
import json

class Email163:
    header = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    user = ''
    cookie = None
    sid = None
    mailBaseUrl='http://twebmail.mail.163.com'

    def __init__(self):
        self.cookie = cookielib.CookieJar()
        cookiePro = urllib2.HTTPCookieProcessor(self.cookie)
        urllib2.install_opener(urllib2.build_opener(cookiePro))

    def login(self,user,pwd):
        '''
            Log in
        '''
        postdata = urllib.urlencode({
                'username':user,
                'password':pwd,
                'type':1
            })
        # Note: the login URL differs between webmail versions
        req = urllib2.Request(
                url='https://ssl.mail.163.com/entry/coremail/fcg/ntesdoor2?funcid=loginone&language=-1&passtype=1&iframe=1&product=mail163&from=web&df=email163&race=-2_45_-2_hz&module=&uid='+user+'&style=10&net=t&skinid=null',
                data=postdata,
                headers=self.header,
            )
        res = str(urllib2.urlopen(req).read())
        #print res
        patt = re.compile('sid=([^"]+)',re.I)
        patt = patt.search(res)

        uname = user.split('@')[0]
        self.user = user
        if patt:
            self.sid = patt.group(1).strip()
            #print self.sid
            print '%s Login Successful.....'%(uname)
        else:
            print '%s Login failed....'%(uname)


    def getInBox(self):
        '''
            Fetch the inbox list
        '''
        print '\nGetting mail list.....\n'
        sid = self.sid
        url = self.mailBaseUrl+'/jy3/list/list.do?sid='+sid+'&fid=1&fr=folder'
        res = urllib2.urlopen(url).read()
        #Get mailing list
        mailList = []
        patt = re.compile('<div\s+class="tdLike Ibx_Td_From"[^>]+>.*?href="([^"]+)"[^>]+>(.*?)<\/a>.*?<div\s+class="tdLike Ibx_Td_Subject"[^>]+>.*?href="[^>]+>(.*?)<\/a>',re.I|re.S)
        patt = patt.findall(res)
        if not patt:
            return mailList

        for i in patt:
            line =  {
                    'from':i[1].decode('utf8'),
                     'url':self.mailBaseUrl+i[0],
                     'subject':i[2].decode('utf8')
                     }
            mailList.append(line)

        return mailList


    def getMailMsg(self,url):
        '''
            Download mail content
        '''
        content = ''
        print '\nDownloading.....%s\n' % (url)
        res = urllib2.urlopen(url).read()

        patt = re.compile('contentURL:"([^"]+)"',re.I)
        patt = patt.search(res)
        if patt is None:
            return content
        url = '%s%s'%(self.mailBaseUrl,patt.group(1))
        time.sleep(1)
        res = urllib2.urlopen(url).read()
        Djson = json.JSONDecoder(encoding='utf8')
        jsonRes = Djson.decode(res)
        if 'resultVar' in jsonRes:
            content = jsonRes['resultVar']
        time.sleep(3)
        return content


'''
    Demo
'''
#Initialization
mail163 = Email163()
#Sign in
mail163.login('username@163.com', 'password')  # replace with your own credentials
time.sleep(2)

#Get inbox
elist = mail163.getInBox()

#Get message content
for i in elist:
    print 'Subject:%s   From:%s  Content:\n%s' % (
        i['subject'].encode('utf8'),
        i['from'].encode('utf8'),
        mail163.getMailMsg(i['url']).encode('utf8'),
    )

(5) Handling sites that require login

#1 cookie processing
 
import urllib2, cookielib
cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()
 
#2 use a proxy together with cookies

proxy_support = urllib2.ProxyHandler({'http': 'http://XX.XX.XX.XX:XXXX'})
opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
 
#3 form processing
 
import urllib
postdata=urllib.urlencode({
    'username':'XXXXX',
    'password':'XXXXX',
    'continueURI':'http://www.verycd.com/',
    'fk':fk,  # fk is a dynamic token that must first be scraped from the login page
    'login_submit':'Sign in'
})
 
req = urllib2.Request(
    url = 'http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data = postdata
)
result = urllib2.urlopen(req).read()
 
#4 disguise the request as a browser
 
headers = {
    'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
req = urllib2.Request(
    url = 'http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data = postdata,
    headers = headers
)
 
#5 defeat anti-hotlinking (some servers only answer when the Referer points back to their own pages)
 
headers = {
    'Referer':'http://www.cnbeta.com/articles'
}
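
These techniques compose: a single opener can carry the cookie jar, a proxy and the disguise headers at once (a sketch; the proxy address is a placeholder):

import urllib2
import cookielib

cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
# Placeholder proxy address
proxy_support = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8087'})
opener = urllib2.build_opener(proxy_support, cookie_support)
# Default headers sent with every request made through this opener
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'),
    ('Referer', 'http://www.cnbeta.com/articles'),
]
urllib2.install_opener(opener)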

(6) Multithreading

from threading import Thread
from Queue import Queue
from time import sleep
#q is the task queue
#NUM is the number of concurrent worker threads
#JOBS is the total number of tasks
q = Queue()
NUM = 2
JOBS = 10
#Task function: processes a single item
def do_something_using(arguments):
    print arguments
#Worker loop: keeps pulling items off the queue and processing them
def working():
    while True:
        arguments = q.get()
        do_something_using(arguments)
        sleep(1)
        q.task_done()
#Start NUM daemon worker threads that wait on the queue
for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()
#Queue JOBS
for i in range(JOBS):
    q.put(i)
#Wait for all JOBS to complete
q.join()
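
To turn this skeleton into a concurrent downloader, only the task function and the items put on the queue need to change (a sketch; the URL list is illustrative):

import urllib2

# Replaces the placeholder task function above
def do_something_using(url):
    html = urllib2.urlopen(url).read()
    print '%s -> %d bytes' % (url, len(html))

# Queue URLs instead of integers
for u in ['http://www.baidu.com/'] * JOBS:
    q.put(u)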

Topics: Python Windows Firefox JSON Ubuntu