Python 3 Web Crawler

1. Fetching a web page with urllib and writing it to a file

import urllib.request
#encoding="utf-8"

# Send a request to the given URL and return the server's response (a file-like object)
response = urllib.request.urlopen("http://www.baidu.com")

# Get the data; decode("utf-8") converts the bytes into a utf-8 string
#data = response.read().decode("utf-8")
# Read the entire content
#data = response.read()
#print(data)
#print(type(data))

# Write the crawled page to a file; read() returns the whole page as a single value
#with open(r"D:\py_work\grep\爬虫\file\file1.html", "wb") as f:
#    f.write(data)

# response attributes
# info(): information about the current response (headers)
print(response.info())

'''
# getcode(): the HTTP status code
print(response.getcode())
if response.getcode() == 200 or response.getcode() == 304:
    # process the page
    pass
'''
# geturl(): the URL currently being crawled
print(response.geturl())

# Example of a GET URL with percent-encoded query parameters (not used below)
url = "https://www.baidu.com/s?wd=%E9%95%BF%E5%9F%8E&rsv_spt=1&rsv_iqid=0xe262cf6300014620&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_dl=tb&rsv_sug2=0&rsv_btype=i&inputT=4262&rsv_sug4=6162"

# Read a single line
#data = response.readline()

# Read all of the content; readlines() returns a list of byte strings, one per line
data = response.readlines()
print(data)
print(type(data))
print(len(data))
print(type(data[100].decode("utf-8")))

2. Simulating a browser

import urllib.request
import random

url = "http://www.baidu.com"
'''
# Fake request headers
header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36",
}

# A more complete header
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36",
    "Content-Type": "text/html;charset=utf-8"
}

# Build a Request object carrying the headers
req = urllib.request.Request(url, headers=header)

# Send the request
response = urllib.request.urlopen(req)
data = response.read().decode("utf-8")

print(data)
'''

# A pool of User-Agent strings
agentsList = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
]

# Pick a User-Agent at random
agentStr = random.choice(agentsList)

req = urllib.request.Request(url)
# Attach the User-Agent to the request
req.add_header("User-Agent", agentStr)

response = urllib.request.urlopen(req)

data = response.read().decode("utf-8")
print(data)

3. Setting a timeout

import urllib.request

# If the page does not respond for too long, the request is judged to have
# timed out and the page cannot be crawled
for i in range(1, 100):
    try:
        response = urllib.request.urlopen("http://www.baidu.com", timeout=0.5)
        print(len(response.read().decode("utf-8")))
    except:
        print("Request timed out, stopping the crawl")
        break
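
A variant of the same loop that catches the timeout explicitly instead of using a bare except is sketched below (an assumption-based sketch, not part of the original notes): urlopen raises urllib.error.URLError when the connection times out, and socket.timeout can surface during read(), so catching those two keeps other errors visible.

import socket
import urllib.error
import urllib.request

for i in range(1, 100):
    try:
        response = urllib.request.urlopen("http://www.baidu.com", timeout=0.5)
        print(len(response.read().decode("utf-8")))
    except (urllib.error.URLError, socket.timeout):
        # only timeout/connection errors land here; other bugs still surface
        print("Request timed out, stopping the crawl")
        break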

4. HTTP requests

'''
When to use: for passing messages between the client and the server.

GET: passes information through the URL; the data is appended directly to the URL
POST: submits data to the server; a common and relatively safer way to transfer data
PUT: asks the server to store a resource, usually at a specified location
DELETE: asks the server to delete a resource
HEAD: requests only the HTTP headers of the response
OPTIONS: queries which request methods the URL supports
'''
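
As a quick illustration of the methods listed above (a minimal sketch, not part of the original notes), urllib.request.Request accepts a method argument, so a HEAD request that fetches only the response headers looks like this:

import urllib.request

# HEAD: ask the server for the headers only, without downloading the page body
req = urllib.request.Request("http://www.baidu.com", method="HEAD")
response = urllib.request.urlopen(req)
print(response.getcode())
print(response.info())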

1. GET requests

# GET request
'''
Characteristics: the data is appended to the request path and passed to the server
Advantage: fast
Disadvantages: carries only a small amount of data, and it is not secure
'''

import urllib.request

url = "https://yichenxiu.com/html/wenzi/"

response = urllib.request.urlopen(url)

data = response.read().decode("utf-8")
print(data)
print(type(data))
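
Because a GET request carries its parameters in the URL (as in the percent-encoded Baidu search URL from section 1), the query string can be built with urllib.parse instead of hand-encoding it. A small sketch:

import urllib.parse

# urlencode percent-encodes the Chinese keyword, e.g. 长城 -> %E9%95%BF%E5%9F%8E
params = urllib.parse.urlencode({"wd": "长城"})
url = "https://www.baidu.com/s?" + params
print(url)
# the resulting url can then be passed to urllib.request.urlopen as usual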

2. Parsing JSON data

'''
Concept: a format for storing data.
Use: JSON can be saved to a local file or sent over the wire; it is usually
described as a lightweight data-interchange format.

Building blocks of a JSON document
{}  an object (dict)
[]  a list
:   separates a key from its value
,   separates two items
'''
import json

jsonStr = '''{"name":"yichen秀","age":18,
"hobby":["money","power","english"],"parames":{"a":1,"b":2}}'''

# Convert a JSON string into a Python object
jsonData = json.loads(jsonStr)
print(jsonData)
print(type(jsonData))
print(jsonData["hobby"])

# Convert a Python object into a JSON string
jsonData2 = {"name": "yichen秀", "age": 18,
             "hobby": ["money", "power", "english"], "parames": {"a": 1, "b": 2}}

jsonStr2 = json.dumps(jsonData2)
print(jsonStr2)
print(type(jsonStr2))


# Read a local JSON file
path1 = r"D:\py_work\grep\爬虫\file\config.json"

with open(path1, "rb") as f:
    data = json.load(f)
    print(data)
    # dict
    print(type(data))

# Write a local JSON file
path2 = r"D:\py_work\grep\爬虫\file\config.json"
jsonData3 = {"name": "yichen秀", "age": 18,
             "hobby": ["money", "power", "english"], "parames": {"a": 1, "b": 2}}

with open(path2, "w") as f:
    json.dump(jsonData3, f)
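
One detail worth noting when dumping data that contains Chinese text (a small sketch, not part of the original notes): json.dumps escapes non-ASCII characters as \uXXXX by default, and ensure_ascii=False keeps them readable.

import json

data = {"name": "yichen秀", "hobby": ["money", "power", "english"]}
# ensure_ascii=False keeps the Chinese characters, indent=4 pretty-prints the result
print(json.dumps(data, ensure_ascii=False, indent=4))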

3. POST requests

# POST request
'''
Characteristics: the parameters are packed up and sent separately from the URL

Advantages: carries a large amount of data and is safer
(recommended whenever the request modifies data on the server)

Disadvantage: slower
'''

import urllib.request
import urllib.parse

url = "https://yichenxiu.com/look/login.php?referer=https%3A%2F%2Fyichenxiu.com%2Flook%2Fwelcome.php"

# Collect the data to send into a dict.
# The keys come from the page, usually the name attribute of the input tags.
data = {
    "name": "yichen",
    "password": "****!"
}

# Encode the data for sending
postData = urllib.parse.urlencode(data).encode("utf-8")

header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
}


# Build the Request object
req = urllib.request.Request(url, data=postData, headers=header)

#req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36")

# Send the request
response = urllib.request.urlopen(req)

data1 = response.read().decode("utf-8")

print(data1)

# Save the page to a file
with open(r"D:\py_work\grep\爬虫\file\file2.html", "w", encoding="utf-8") as f:
    f.write(data1)

5. Crawling data loaded by Ajax requests

import urllib.request
import ssl
import json, time


def ajaxCrawler(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
    }

    req = urllib.request.Request(url, headers=headers)
    # Use ssl to create an unverified context (skip certificate checks for https)
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)

    jsonStr = response.read().decode("utf-8")
    jsonData = json.loads(jsonStr)

    return jsonData

'''
url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=40&limit=20"
info = ajaxCrawler(url)
print(info)
'''

for i in range(1, 11):
    url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=" + str(i*20) + "&limit=20"
    info = ajaxCrawler(url)
    print(len(info))

    #print(info)
    time.sleep(1)
    with open(r"D:\py_work\grep\爬虫\file\file3.html", "a", encoding="utf-8") as f:
        f.write(str(info))
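
Since the Douban endpoint already returns JSON, the pages could just as well be collected and written out with json.dump instead of str() into an .html file. A sketch reusing ajaxCrawler, json and time from the block above (the .json output path is only illustrative):

allMovies = []
for i in range(1, 11):
    url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=" + str(i*20) + "&limit=20"
    allMovies.extend(ajaxCrawler(url))
    time.sleep(1)

# write every page into a single valid JSON file
with open(r"D:\py_work\grep\爬虫\file\file3.json", "w", encoding="utf-8") as f:
    json.dump(allMovies, f, ensure_ascii=False)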

6. Qiushibaike (糗事百科) joke crawler

import urllib.request
import re


def jokeCrawler(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
    }
    req = urllib.request.Request(url, headers=headers)

    response = urllib.request.urlopen(req)
    Html = response.read().decode("utf-8")
    #Html = str(response.read())
    #print(Html)

    # Regex: grab each post block between the author div and the vote counter
    pat = r'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">'

    divsList = re.findall(pat, Html, re.S)
    #print(divsList)
    #print(len(divsList))
    dic = {}
    for div in divsList:
        # username
        re_u = re.compile(r"<h2>(.*?)</h2>", re.S)
        username = re_u.findall(div)
        #print(username[0])
        username = username[0]
        #print(type(username))

        # joke text
        re_d = re.compile(r'<div class="content">\n<span>(.*?)</span>', re.S)
        duanzi = re_d.findall(div)
        #print(duanzi[0])
        dic[username] = duanzi

    return dic


url = r"https://www.qiushibaike.com/text/page/2/"
info = jokeCrawler(url)

print(info)
for k, v in info.items():
    print(k + " says:\n", v)

with open(r"D:\py_work\grep\爬虫\file\file5.html", "w", encoding="utf-8") as f:
    f.write(str(info))

7. Image-crawling exercise

import urllib.request, re, os


def imageCrawler(url, toPath):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    HtmlStr = response.read().decode("utf-8")
    # with open(r"D:\py_work\grep\爬虫\file\image\yhd.html", "wb") as f:
    #     f.write(HtmlStr)

    # Match image URLs in the returned markup
    pat = r'<img src=(.*?) alt=""'
    re_image = re.compile(pat)
    imageList = re_image.findall(HtmlStr)
    print(imageList)
    print(len(imageList))
    num = 1
    for imageUrl in imageList:
        path = os.path.join(toPath, str(num) + ".jpg")
        num += 1
        # Download the image to local storage
        urllib.request.urlretrieve("http://" + imageUrl, filename=path)


url = r"https://qianggou.yhd.com/ajax/ajaxActivityProduct.do?pageNo=1&siteType=&provinceId=2&grouponId=558169034&_=1590000642329"
toPath = r"D:\py_work\grep\爬虫\file\image"

imageCrawler(url, toPath)

8. Crawling QQ numbers from the web

import urllib.request
import ssl
import os
import re
from collections import deque


def writeFileByte(htmlBytes, toPath):
    with open(toPath, "wb") as f:
        f.write(htmlBytes)


def writeFileStr(htmlBytes, toPath):
    with open(toPath, "w") as f:
        f.write(str(htmlBytes))


def getHtmlBytes(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    # build the request with headers
    req = urllib.request.Request(url, headers=headers)
    context = ssl._create_unverified_context()
    # send the request
    response = urllib.request.urlopen(req, context=context)

    return response.read()


# Crawl one page, save it, and append the QQ numbers it contains to the output file
def qqCrawler(url, toPath):
    htmlBytes = getHtmlBytes(url)
    writeFileByte(htmlBytes, r"D:\py_work\grep\爬虫\file\image\file1.html")
    writeFileStr(htmlBytes, r"D:\py_work\grep\爬虫\file\image\file2.txt")
    htmlStr = str(htmlBytes)

    # Find QQ numbers (5 to 10 digits, not starting with 0)
    #pat = r'</li>\n<li class="d_name" data-field=(.*?)}'
    pat = r"[1-9]\d{4,9}"
    re_qq = re.compile(pat)

    qqList = re_qq.findall(htmlStr)
    # deduplicate
    qqList = list(set(qqList))

    f = open(toPath, "a")
    for qqStr in qqList:
        f.write(qqStr + "\n")
    f.close()

    # Find URLs linked from this page
    pat1 = r'(((http|ftp|https?)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?)'
    re_url = re.compile(pat1)
    urlsList = re_url.findall(htmlStr)
    # deduplicate
    urlsList = list(set(urlsList))

    return urlsList


# Breadth-first crawl: process a URL, then enqueue the URLs found on that page
def center(url, toPath):
    queue = deque()

    queue.append(url)

    while len(queue) != 0:
        targetUrl = queue.popleft()
        urlList = qqCrawler(targetUrl, toPath)
        for item in urlList:
            tempUrl = item[0]
            queue.append(tempUrl)


# Start URL for the crawl
url = "https://tieba.baidu.com/p/2"

# Output file for the QQ numbers
toPath = r"D:\py_work\grep\爬虫\file\image\qqFile.txt"
center(url, toPath)
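
One caveat about the breadth-first loop in center: crawled pages link back to each other, so the queue keeps growing and the same URL can be fetched over and over. A sketch of the same function with a visited set (same assumptions as above, only adding URL deduplication):

def center(url, toPath):
    queue = deque()
    visited = set()          # URLs that have already been crawled
    queue.append(url)

    while len(queue) != 0:
        targetUrl = queue.popleft()
        if targetUrl in visited:
            continue
        visited.add(targetUrl)
        urlList = qqCrawler(targetUrl, toPath)
        for item in urlList:
            tempUrl = item[0]
            if tempUrl not in visited:
                queue.append(tempUrl)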
