Python 3 Web Crawler

1. Fetching a web page with urllib and writing it to a file

import urllib.request
#encoding="utf-8"

# Send a request to the given URL and return the server's response (a file-like object)
response = urllib.request.urlopen("http://www.baidu.com")

# Get the data; decode("utf-8") converts the bytes into a utf-8 string
#data = response.read().decode("utf-8")
# Read the entire content
#data = response.read()
#print(data)
#print(type(data))

# Write the crawled page to a file; read() returns the whole page as a single value
#with open(r"D:\py_work\grep\爬虫\file\file1.html", "wb") as f:
#    f.write(data)

# response attributes
# info(): information about the current response (headers)
print(response.info())

'''
# getcode(): the HTTP status code
print(response.getcode())
if response.getcode() == 200 or response.getcode() == 304:
    # process the page
    pass
'''
# geturl(): the URL currently being crawled
print(response.geturl())

# Example of a GET URL with percent-encoded query parameters (not used below)
url = "https://www.baidu.com/s?wd=%E9%95%BF%E5%9F%8E&rsv_spt=1&rsv_iqid=0xe262cf6300014620&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_dl=tb&rsv_sug2=0&rsv_btype=i&inputT=4262&rsv_sug4=6162"

# Read a single line
#data = response.readline()

# Read all of the content; readlines() returns a list of byte strings, one per line
data = response.readlines()
print(data)
print(type(data))
print(len(data))
print(type(data[100].decode("utf-8")))

2. Simulating a browser

import urllib.request
import random

url = "http://www.baidu.com"
'''
# Fake request headers
header = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36",
}

# A more complete header
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36",
    "Content-Type": "text/html;charset=utf-8"
}

# Build a Request object carrying the headers
req = urllib.request.Request(url, headers=header)

# Send the request
response = urllib.request.urlopen(req)
data = response.read().decode("utf-8")

print(data)
'''

# A pool of User-Agent strings
agentsList = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
]

# Pick a User-Agent at random
agentStr = random.choice(agentsList)

req = urllib.request.Request(url)
# Attach the User-Agent to the request
req.add_header("User-Agent", agentStr)

response = urllib.request.urlopen(req)

data = response.read().decode("utf-8")
print(data)

3. Setting a timeout

import urllib.request

# If the page does not respond for too long, the request is judged to have
# timed out and the page cannot be crawled
for i in range(1, 100):
    try:
        response = urllib.request.urlopen("http://www.baidu.com", timeout=0.5)
        print(len(response.read().decode("utf-8")))
    except:
        print("Request timed out, stopping the crawl")
        break
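
A variant of the same loop that catches the timeout explicitly instead of using a bare except is sketched below (an assumption-based sketch, not part of the original notes): urlopen raises urllib.error.URLError when the connection times out, and socket.timeout can surface during read(), so catching those two keeps other errors visible.

import socket
import urllib.error
import urllib.request

for i in range(1, 100):
    try:
        response = urllib.request.urlopen("http://www.baidu.com", timeout=0.5)
        print(len(response.read().decode("utf-8")))
    except (urllib.error.URLError, socket.timeout):
        # only timeout/connection errors land here; other bugs still surface
        print("Request timed out, stopping the crawl")
        break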

4. HTTP requests

'''
When to use: for passing messages between the client and the server.

GET: passes information through the URL; the data is appended directly to the URL
POST: submits data to the server; a common and relatively safer way to transfer data
PUT: asks the server to store a resource, usually at a specified location
DELETE: asks the server to delete a resource
HEAD: requests only the HTTP headers of the response
OPTIONS: queries which request methods the URL supports
'''
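
As a quick illustration of the methods listed above (a minimal sketch, not part of the original notes), urllib.request.Request accepts a method argument, so a HEAD request that fetches only the response headers looks like this:

import urllib.request

# HEAD: ask the server for the headers only, without downloading the page body
req = urllib.request.Request("http://www.baidu.com", method="HEAD")
response = urllib.request.urlopen(req)
print(response.getcode())
print(response.info())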

1. GET requests

# GET request
'''
Characteristics: the data is appended to the request path and passed to the server
Advantage: fast
Disadvantages: carries only a small amount of data, and it is not secure
'''

import urllib.request

url = "https://yichenxiu.com/html/wenzi/"

response = urllib.request.urlopen(url)

data = response.read().decode("utf-8")
print(data)
print(type(data))
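
Because a GET request carries its parameters in the URL (as in the percent-encoded Baidu search URL from section 1), the query string can be built with urllib.parse instead of hand-encoding it. A small sketch:

import urllib.parse

# urlencode percent-encodes the Chinese keyword, e.g. 长城 -> %E9%95%BF%E5%9F%8E
params = urllib.parse.urlencode({"wd": "长城"})
url = "https://www.baidu.com/s?" + params
print(url)
# the resulting url can then be passed to urllib.request.urlopen as usual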

2. Parsing JSON data

'''
Concept: a format for storing data.
Use: JSON can be saved to a local file or sent over the wire; it is usually
described as a lightweight data-interchange format.

Building blocks of a JSON document
{}  an object (dict)
[]  a list
:   separates a key from its value
,   separates two items
'''
import json

jsonStr = '''{"name":"yichen秀","age":18,
"hobby":["money","power","english"],"parames":{"a":1,"b":2}}'''

# Convert a JSON string into a Python object
jsonData = json.loads(jsonStr)
print(jsonData)
print(type(jsonData))
print(jsonData["hobby"])

# Convert a Python object into a JSON string
jsonData2 = {"name": "yichen秀", "age": 18,
             "hobby": ["money", "power", "english"], "parames": {"a": 1, "b": 2}}

jsonStr2 = json.dumps(jsonData2)
print(jsonStr2)
print(type(jsonStr2))


# Read a local JSON file
path1 = r"D:\py_work\grep\爬虫\file\config.json"

with open(path1, "rb") as f:
    data = json.load(f)
    print(data)
    # dict
    print(type(data))

# Write a local JSON file
path2 = r"D:\py_work\grep\爬虫\file\config.json"
jsonData3 = {"name": "yichen秀", "age": 18,
             "hobby": ["money", "power", "english"], "parames": {"a": 1, "b": 2}}

with open(path2, "w") as f:
    json.dump(jsonData3, f)
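
One detail worth noting when dumping data that contains Chinese text (a small sketch, not part of the original notes): json.dumps escapes non-ASCII characters as \uXXXX by default, and ensure_ascii=False keeps them readable.

import json

data = {"name": "yichen秀", "hobby": ["money", "power", "english"]}
# ensure_ascii=False keeps the Chinese characters, indent=4 pretty-prints the result
print(json.dumps(data, ensure_ascii=False, indent=4))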

3. POST requests

# POST request
'''
Characteristics: the parameters are packed up and sent separately from the URL

Advantages: carries a large amount of data and is safer
(recommended whenever the request modifies data on the server)

Disadvantage: slower
'''

import urllib.request
import urllib.parse

url = "https://yichenxiu.com/look/login.php?referer=https%3A%2F%2Fyichenxiu.com%2Flook%2Fwelcome.php"

# Collect the data to send into a dict.
# The keys come from the page, usually the name attribute of the input tags.
data = {
    "name": "yichen",
    "password": "****!"
}

# Encode the data for sending
postData = urllib.parse.urlencode(data).encode("utf-8")

header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
}


# Build the Request object
req = urllib.request.Request(url, data=postData, headers=header)

#req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36")

# Send the request
response = urllib.request.urlopen(req)

data1 = response.read().decode("utf-8")

print(data1)

# Save the page to a file
with open(r"D:\py_work\grep\爬虫\file\file2.html", "w", encoding="utf-8") as f:
    f.write(data1)

5. Crawling data loaded by Ajax requests

import urllib.request
import ssl
import json, time


def ajaxCrawler(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
    }

    req = urllib.request.Request(url, headers=headers)
    # Use ssl to create an unverified context (skip certificate checks for https)
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)

    jsonStr = response.read().decode("utf-8")
    jsonData = json.loads(jsonStr)

    return jsonData

'''
url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=40&limit=20"
info = ajaxCrawler(url)
print(info)
'''

for i in range(1, 11):
    url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=" + str(i*20) + "&limit=20"
    info = ajaxCrawler(url)
    print(len(info))

    #print(info)
    time.sleep(1)
    with open(r"D:\py_work\grep\爬虫\file\file3.html", "a", encoding="utf-8") as f:
        f.write(str(info))
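
Since the Douban endpoint already returns JSON, the pages could just as well be collected and written out with json.dump instead of str() into an .html file. A sketch reusing ajaxCrawler, json and time from the block above (the .json output path is only illustrative):

allMovies = []
for i in range(1, 11):
    url = "https://movie.douban.com/j/chart/top_list?type=17&interval_id=100%3A90&action=&start=" + str(i*20) + "&limit=20"
    allMovies.extend(ajaxCrawler(url))
    time.sleep(1)

# write every page into a single valid JSON file
with open(r"D:\py_work\grep\爬虫\file\file3.json", "w", encoding="utf-8") as f:
    json.dump(allMovies, f, ensure_ascii=False)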

6. Qiushibaike (糗事百科) joke crawler

import urllib.request
import re


def jokeCrawler(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
    }
    req = urllib.request.Request(url, headers=headers)

    response = urllib.request.urlopen(req)
    Html = response.read().decode("utf-8")
    #Html = str(response.read())
    #print(Html)

    # Regex: grab each post block between the author div and the vote counter
    pat = r'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">'

    divsList = re.findall(pat, Html, re.S)
    #print(divsList)
    #print(len(divsList))
    dic = {}
    for div in divsList:
        # username
        re_u = re.compile(r"<h2>(.*?)</h2>", re.S)
        username = re_u.findall(div)
        #print(username[0])
        username = username[0]
        #print(type(username))

        # joke text
        re_d = re.compile(r'<div class="content">\n<span>(.*?)</span>', re.S)
        duanzi = re_d.findall(div)
        #print(duanzi[0])
        dic[username] = duanzi

    return dic


url = r"https://www.qiushibaike.com/text/page/2/"
info = jokeCrawler(url)

print(info)
for k, v in info.items():
    print(k + " says:\n", v)

with open(r"D:\py_work\grep\爬虫\file\file5.html", "w", encoding="utf-8") as f:
    f.write(str(info))

7. Image-crawling exercise

import urllib.request, re, os


def imageCrawler(url, toPath):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    HtmlStr = response.read().decode("utf-8")
    # with open(r"D:\py_work\grep\爬虫\file\image\yhd.html", "wb") as f:
    #     f.write(HtmlStr)

    # Match image URLs in the returned markup
    pat = r'<img src=(.*?) alt=""'
    re_image = re.compile(pat)
    imageList = re_image.findall(HtmlStr)
    print(imageList)
    print(len(imageList))
    num = 1
    for imageUrl in imageList:
        path = os.path.join(toPath, str(num) + ".jpg")
        num += 1
        # Download the image to local storage
        urllib.request.urlretrieve("http://" + imageUrl, filename=path)


url = r"https://qianggou.yhd.com/ajax/ajaxActivityProduct.do?pageNo=1&siteType=&provinceId=2&grouponId=558169034&_=1590000642329"
toPath = r"D:\py_work\grep\爬虫\file\image"

imageCrawler(url, toPath)

8. Crawling QQ numbers from the web

import urllib.request
import ssl
import os
import re
from collections import deque


def writeFileByte(htmlBytes, toPath):
    with open(toPath, "wb") as f:
        f.write(htmlBytes)


def writeFileStr(htmlBytes, toPath):
    with open(toPath, "w") as f:
        f.write(str(htmlBytes))


def getHtmlBytes(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    # build the request with headers
    req = urllib.request.Request(url, headers=headers)
    context = ssl._create_unverified_context()
    # send the request
    response = urllib.request.urlopen(req, context=context)

    return response.read()


# Crawl one page, save it, and append the QQ numbers it contains to the output file
def qqCrawler(url, toPath):
    htmlBytes = getHtmlBytes(url)
    writeFileByte(htmlBytes, r"D:\py_work\grep\爬虫\file\image\file1.html")
    writeFileStr(htmlBytes, r"D:\py_work\grep\爬虫\file\image\file2.txt")
    htmlStr = str(htmlBytes)

    # Find QQ numbers (5 to 10 digits, not starting with 0)
    #pat = r'</li>\n<li class="d_name" data-field=(.*?)}'
    pat = r"[1-9]\d{4,9}"
    re_qq = re.compile(pat)

    qqList = re_qq.findall(htmlStr)
    # deduplicate
    qqList = list(set(qqList))

    f = open(toPath, "a")
    for qqStr in qqList:
        f.write(qqStr + "\n")
    f.close()

    # Find URLs linked from this page
    pat1 = r'(((http|ftp|https?)://)(([a-zA-Z0-9\._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\&%_\./-~-]*)?)'
    re_url = re.compile(pat1)
    urlsList = re_url.findall(htmlStr)
    # deduplicate
    urlsList = list(set(urlsList))

    return urlsList


# Breadth-first crawl: process a URL, then enqueue the URLs found on that page
def center(url, toPath):
    queue = deque()

    queue.append(url)

    while len(queue) != 0:
        targetUrl = queue.popleft()
        urlList = qqCrawler(targetUrl, toPath)
        for item in urlList:
            tempUrl = item[0]
            queue.append(tempUrl)


# Start URL for the crawl
url = "https://tieba.baidu.com/p/2"

# Output file for the QQ numbers
toPath = r"D:\py_work\grep\爬虫\file\image\qqFile.txt"
center(url, toPath)
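
One caveat about the breadth-first loop in center: crawled pages link back to each other, so the queue keeps growing and the same URL can be fetched over and over. A sketch of the same function with a visited set (same assumptions as above, only adding URL deduplication):

def center(url, toPath):
    queue = deque()
    visited = set()          # URLs that have already been crawled
    queue.append(url)

    while len(queue) != 0:
        targetUrl = queue.popleft()
        if targetUrl in visited:
            continue
        visited.add(targetUrl)
        urlList = qqCrawler(targetUrl, toPath)
        for item in urlList:
            tempUrl = item[0]
            if tempUrl not in visited:
                queue.append(tempUrl)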
