Python网络爬虫学习
更新于:A.D.2022.2.13
按模块学习
urllib库
json库
按功能学习
下载HTML源代码
from urllib.request import urlopen
import sys
# Download the page and decode the body as UTF-8. The with-block closes the
# HTTP response as soon as the body has been read.
with urlopen(
    "url"
) as response:
    html = response.read().decode('utf-8')

# Same encoding choice as before: gb2312 only when the interpreter's default
# encoding is ASCII, UTF-8 otherwise.
encoding = 'gb2312' if sys.getdefaultencoding() == 'ascii' else 'utf-8'

# Write directly to the output file instead of rebinding sys.stdout, so the
# file is flushed and closed deterministically and the console stays usable.
with open("2.输出html.html", "w", encoding=encoding) as outFile:
    print(html, file=outFile)
下载网页图片
from urllib.request import urlopen
import urllib.parse
import sys
import re
import os
def getHtml(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address passed unchanged to ``urlopen``.

    Returns:
        The decoded body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urlopen`` propagates unchanged.
    """
    # The with-block closes the response even if read() or decode() fails.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def printImg(imgList, url):
    r"""Download every image in *imgList* and save it to disk.

    Args:
        imgList: Iterable of image ``src`` values taken from the page. They
            may be relative, root-relative or absolute URLs.
        url: Address of the page the ``src`` values came from; used as the
            base for resolving relative values.

    Each file is written to the path formed by prefixing the last ``/``
    separated component of the ``src`` value with the literal ``\image\``
    (unchanged from the original code).
    """
    for imgSrc in imgList:
        imgName = imgSrc.split("/")[-1]
        savePath = '\\image\\' + imgName

        # Create the target directory when the path has one. On platforms
        # where the backslash is not a separator dirname() is empty and the
        # file simply lands in the current directory, as before.
        saveDir = os.path.dirname(savePath)
        if saveDir:
            os.makedirs(saveDir, exist_ok=True)

        # urljoin resolves the src against the page URL, so absolute and
        # root-relative values no longer produce a malformed address as plain
        # string concatenation did.
        with urlopen(urllib.parse.urljoin(url, imgSrc)) as response:
            imgData = response.read()

        # Write only after the download succeeded; the with-block guarantees
        # the file is closed.
        with open(savePath, 'wb') as imgFile:
            imgFile.write(imgData)
def downloadImg(url):
    """Download the PNG and JPG images referenced by the page at *url*.

    The page is fetched once with getHtml(). ``<img src="...">`` values
    ending in ``.png`` are handed to printImg() first, then those ending in
    ``.jpg``.
    """
    page = getHtml(url)
    patterns = (
        r'<img src="([^"]+\.png)"',
        r'<img src="([^"]+\.jpg)"',
    )
    for pattern in patterns:
        printImg(re.findall(pattern, page), url)
# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    # "url" is a placeholder for the address of the page to scrape.
    url = "url"
    downloadImg(url)
下载通知
from urllib.request import urlopen
import sys
import re
def getHtml(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address passed unchanged to ``urlopen``.

    Returns:
        The decoded body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urlopen`` propagates unchanged.
    """
    # The with-block closes the response even if read() or decode() fails.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def printList(list_1, list_2, list_3):
    """Write the scraped notices to ``4.爬取通知.txt``.

    Args:
        list_1: Link paths; each is written after the literal prefix 'url'.
        list_2: Notice titles.
        list_3: Notice dates.

    The three sequences are consumed in lockstep. If they differ in length
    only the entries present in all three are written, instead of failing
    with IndexError part-way through the file.
    """
    # Same encoding choice as before: gb2312 only when the interpreter's
    # default encoding is ASCII, UTF-8 otherwise.
    encoding = 'gb2312' if sys.getdefaultencoding() == 'ascii' else 'utf-8'

    # Print to the file explicitly instead of rebinding sys.stdout, so the
    # file is closed on return and console output keeps working afterwards.
    with open("4.爬取通知.txt", "w", encoding=encoding) as outFile:
        entries = zip(list_1, list_2, list_3)
        for number, (link, title, date) in enumerate(entries, start=1):
            print('#%d' % number, file=outFile)
            print('标题:' + title, file=outFile)
            print('日期:' + date, file=outFile)
            print('网址:' + 'url' + link, file=outFile)
            print('', file=outFile)
def download(url):
    """Scrape the notice list at *url* and pass the results to printList().

    Three independent regex passes over the page collect the ``.htm`` link
    targets, the anchor texts and the contents of the
    ``<font color="#3a6399">`` elements.
    """
    page = getHtml(url)
    # NOTE(review): the three patterns are matched independently, so the
    # resulting lists are not guaranteed to have the same length.
    links = re.findall(r'<a target="_blank" href="([^"]+\.htm)">', page)
    titles = re.findall(r'<a target="_blank" href=.*?>(.*?)</a>', page)
    dates = re.findall(r'<font color="#3a6399"> (.*?)</font>', page)
    printList(links, titles, dates)
# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    # "url" is a placeholder for the address of the notice list page.
    url = "url"
    download(url)
下载任意页数通知
from urllib.request import urlopen
import sys
import re
def getHtml(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address passed unchanged to ``urlopen``.

    Returns:
        The decoded body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urlopen`` propagates unchanged.
    """
    # The with-block closes the response even if read() or decode() fails.
    with urlopen(url) as response:
        return response.read().decode('utf-8')
def printList(list_1, list_2, list_3):
    """Write the scraped notices to ``5.爬取任意页数通知.txt``.

    Args:
        list_1: Link paths; each is written after the literal prefix 'url'.
        list_2: Notice titles.
        list_3: Notice dates.

    The first line of the file reports the page number, which is read from
    the module-level name ``pageNumber`` (bound by the script's entry point),
    not from a parameter.

    The three sequences are consumed in lockstep. If they differ in length
    only the entries present in all three are written, instead of failing
    with IndexError part-way through the file.
    """
    # Same encoding choice as before: gb2312 only when the interpreter's
    # default encoding is ASCII, UTF-8 otherwise.
    encoding = 'gb2312' if sys.getdefaultencoding() == 'ascii' else 'utf-8'

    # Print to the file explicitly instead of rebinding sys.stdout, so the
    # file is closed on return and console output keeps working afterwards.
    with open("5.爬取任意页数通知.txt", "w", encoding=encoding) as outFile:
        print('通知页数:第' + pageNumber + '页', file=outFile)
        print('', file=outFile)
        entries = zip(list_1, list_2, list_3)
        for number, (link, title, date) in enumerate(entries, start=1):
            print('#%d' % number, file=outFile)
            print('标题:' + title, file=outFile)
            print('日期:' + date, file=outFile)
            print('网址:' + 'url' + link, file=outFile)
            print('', file=outFile)
def download(url):
    """Scrape the notice list at *url* and pass the results to printList().

    Three independent regex passes over the page collect the ``.htm`` link
    targets, the anchor texts and the contents of the
    ``<font color="#3a6399">`` elements.
    """
    page = getHtml(url)
    # NOTE(review): the three patterns are matched independently, so the
    # resulting lists are not guaranteed to have the same length.
    links = re.findall(r'<a target="_blank" href="([^"]+\.htm)">', page)
    titles = re.findall(r'<a target="_blank" href=.*?>(.*?)</a>', page)
    dates = re.findall(r'<font color="#3a6399"> (.*?)</font>', page)
    printList(links, titles, dates)
# Script entry point: asks for a page number and scrapes that page.
if __name__ == '__main__':
    print('请输入需要爬取的通知页数:')
    # pageNumber must remain a module-level name: printList() reads it as a
    # global when writing the header line of the output file.
    pageNumber = input()
    # "url" is a placeholder prefix; the page number and ".html" are appended.
    url = "".join(("url", pageNumber, ".html"))
    download(url)
Bilibili-Favorites-Download
用于下载B站个人收藏夹信息
https://github.com/ShioKiri/Bilibili-Favorites-Download
- 由于查询失效视频时调用了一个外部网站的API,且代码没有处理请求失败的情况,而该API目前存在一些问题,因此程序不能稳定运行;可以自行删去处理失效视频的代码后使用
- 没有使用数据库和图形界面,直接保存为TXT或者CSV文件
生成TXT文件
按照一定格式保存为TXT,推荐使用编辑器打开TXT避免速度过慢
from urllib.request import urlopen
import json
import re
import time
import sys
import logging
# -*- coding: UTF-8 -*-
# UID of the Bilibili account to export; assigned by initUid() from user input.
bilibili_uid = 0
# Running index of the video inside the favorites folder currently being
# processed; reset to 0 by getFavListVideo() for every folder.
aFavVideoCnt = 0
# Number of videos processed so far across all folders (never reset).
totVideoCnt = 0
# Expected total number of videos: the sum of every folder's media_count,
# accumulated by getFavoriteList().
totVideoNum = 0
def initUid():
    """Read one line from standard input and store it as the account UID.

    The text is converted with int() and assigned to the module-level
    ``bilibili_uid``. No prompt is printed.
    """
    global bilibili_uid
    rawUid = input()
    bilibili_uid = int(rawUid)
def getJsonUrl(url):
    """Fetch *url* and return its body parsed as JSON.

    Args:
        url: Address passed unchanged to ``urlopen``.

    Returns:
        The object produced by ``json.loads`` for the UTF-8 decoded body.

    Raises:
        ValueError: If the body is not valid UTF-8 or not valid JSON.
        Any exception raised by ``urlopen`` propagates unchanged.
    """
    # The with-block closes the response even if reading or decoding fails.
    with urlopen(url) as response:
        body = response.read().decode('utf-8')
    return json.loads(body)
def getFavoriteList(uid):
    """Print every favorites folder of user *uid* together with its videos.

    The folder list is requested once. Each folder's ``media_count`` is first
    added to the module-level ``totVideoNum``; afterwards a header line is
    printed per folder and getFavListVideo() is called for its ``fid``.
    """
    global totVideoNum
    listUrl = 'https://api.bilibili.com/medialist/gateway/base/created?pn=1&ps=100&up_mid={uid}&is_space=0&jsonp=jsonp'.format(uid=uid)
    folders = getJsonUrl(listUrl)['data']['list']

    # The grand total is completed before any folder is processed, because
    # the progress log in printVideoInfo() compares against it.
    for folder in folders:
        totVideoNum += folder['media_count']

    for number, folder in enumerate(folders, start=1):
        print('收藏夹编号:#%d 名称:%s 视频数量:%d\n' % (number, folder['title'], folder['media_count']))
        getFavListVideo(uid, folder['fid'])
def getFavListVideo(uid, fid):
    """Print all videos of favorites folder *fid* belonging to user *uid*.

    Page 1 is fetched and printed first; its ``pagecount`` field then decides
    how many further pages (2..pagecount) are fetched and printed. The
    per-folder counter ``aFavVideoCnt`` is reset to 0 before the first page.
    """
    global aFavVideoCnt
    aFavVideoCnt = 0

    def fetchPage(page):
        # Returns the 'data' object of one result page (30 entries per page).
        pageUrl = 'https://api.bilibili.com/x/space/fav/arc?vmid={uid}&ps=30&fid={fid}&tid=0&keyword=&pn={page}&order=fav_time&jsonp=jsonp'.format(uid=uid, fid=fid, page=page)
        return getJsonUrl(pageUrl)['data']

    firstPage = fetchPage(1)
    printVideoInfo(firstPage['archives'])
    for page in range(2, firstPage['pagecount'] + 1):
        printVideoInfo(fetchPage(page)['archives'])
def printInfo(Info):
    """Print the detail lines shared by every video entry.

    Args:
        Info: Mapping for one video. Reads the keys ``pubdate``, ``desc``,
            ``tname``, ``dynamic``, ``owner`` (``name``, ``mid``), ``stat``
            (``view``, ``danmaku``, ``reply``, ``favorite``, ``coin``,
            ``share``, ``like``) and ``fav_at``.

    ``pubdate`` and ``fav_at`` are passed to time.gmtime(), so the printed
    times are in UTC. The entry is terminated by an empty line.
    """
    def formatTime(timestamp):
        return time.strftime("%Y-%m-%d-%H:%M:%S", time.gmtime(timestamp))

    print('投稿时间:' + formatTime(Info['pubdate']))
    print('描述:%s' % Info['desc'])
    print('分区:%s' % Info['tname'])
    print('标签:%s' % Info['dynamic'])

    owner = Info['owner']
    print('up主用户名:%s up主uid:%s' % (owner['name'], owner['mid']))

    stat = Info['stat']
    statKeys = ('view', 'danmaku', 'reply', 'favorite', 'coin', 'share', 'like')
    print('播放:%s 弹幕:%s 回复:%s 收藏:%s 硬币:%s 分享:%s 喜欢:%s' % tuple(stat[key] for key in statKeys))

    print('收藏时间:' + formatTime(Info['fav_at']))
    print()
def printVideoInfo(aPageInfo):
    """Print every video of one result page.

    Args:
        aPageInfo: Sequence of video mappings (one page of a folder).

    For each video the per-folder and overall counters are incremented, and
    progress is logged every 10th video and on the last expected one. Videos
    whose title is '已失效视频' are delegated to getInvalidVideoInfo(); all
    others are printed directly.
    """
    global aFavVideoCnt, totVideoCnt, totVideoNum
    for Info in aPageInfo:
        aFavVideoCnt += 1
        totVideoCnt += 1

        isTenth = totVideoCnt % 10 == 0
        if isTenth or totVideoCnt == totVideoNum:
            logging.info('已经完成了{num1}/{num2}个视频...'.format(num1=totVideoCnt, num2=totVideoNum))

        video_id = Info['aid']
        if Info['title'] == '已失效视频':
            getInvalidVideoInfo(video_id, Info)
            continue

        print('视频编号:#%d AV号:%d 视频标题:%s' % (aFavVideoCnt, video_id, Info['title']))
        print('封面图片:%s' % Info['pic'])
        printInfo(Info)
# Number of requests issued to the Biliplus API so far; incremented by
# getInvalidVideoInfo() before every request.
biliplusApiCnt = 0
# time.time() value recorded by start(); origin of the elapsed time used for
# the Biliplus request-rate calculation.
runTime = 0
def getInvalidVideoInfo(video_id, Info):
    """Print an invalidated video, recovering title and cover from Biliplus.

    Args:
        video_id: AV number of the video, used in the Biliplus query.
        Info: Mapping for the video as returned by the favorites API.

    The Biliplus request is throttled to fewer than 5 requests per minute,
    measured from ``runTime``. If the request fails, or the answer is not a
    JSON object, or it contains a ``code`` key, the video is printed as
    "data missing" with the cover from *Info*. Otherwise the recovered title
    and cover are printed.
    """
    global aFavVideoCnt, biliplusApiCnt, runTime
    biliplusApiCnt += 1

    # Same limit as "requests / elapsed minutes >= 5", but written as a
    # product so that a zero elapsed time cannot raise ZeroDivisionError.
    while biliplusApiCnt >= 5 * ((time.time() - runTime) / 60.0):
        time.sleep(1)

    url = 'https://www.biliplus.com/api/view?id={vid}'.format(vid=video_id)
    try:
        InvVideoInfo = getJsonUrl(url)
    except (OSError, ValueError) as err:
        # OSError covers network/HTTP failures raised by urlopen; ValueError
        # covers undecodable or non-JSON bodies. The external API being down
        # must not abort the whole export, so fall back to "data missing".
        logging.warning('BiliplusApi请求失败,AV号:%s,原因:%s', video_id, err)
        InvVideoInfo = None

    if not isinstance(InvVideoInfo, dict) or 'code' in InvVideoInfo:
        print('视频编号:#%d [已失效][BiliplusApi数据缺失] AV号:%d 视频标题:已失效视频' % (aFavVideoCnt, video_id))
        print('封面图片:%s' % (Info['pic']))
    else:
        print('视频编号:#%d [已失效][BiliplusApi获得标题与封面图片] AV号:%d 视频标题:%s' % (aFavVideoCnt, video_id, InvVideoInfo['title']))
        print('封面图片:%s' % (InvVideoInfo['pic']))
    printInfo(Info)
def start():  # for cmd
    """Run the export from the command line into ``FavoriteVideoList.txt``.

    Standard output is redirected to the output file while the export runs,
    because the helper functions write their results with print(). The
    redirection is undone and the file closed when the export finishes or
    fails. The UID is read from standard input by initUid().
    """
    global totVideoNum, runTime
    logging.basicConfig(level=logging.DEBUG)

    # Same encoding choice as before: gb2312 only when the interpreter's
    # default encoding is ASCII, UTF-8 otherwise.
    encoding = 'gb2312' if sys.getdefaultencoding() == 'ascii' else 'utf-8'

    originalStdout = sys.stdout
    with open("FavoriteVideoList.txt", "w", encoding=encoding) as outFile:
        sys.stdout = outFile
        try:
            initUid()
            runTime = time.time()
            getFavoriteList(bilibili_uid)
            print('你的收藏夹共有%d个视频' % (totVideoNum))
        finally:
            # Always give the console back, even if the export raised.
            sys.stdout = originalStdout
    logging.info('已完成!')
# TODO: windowMain(Uid, filePath), an entry point for a window UI, is not
# implemented yet.
# Command-line entry point.
if __name__ == '__main__':
    start()
生成CSV文件
生成可以用Excel打开的逗号分隔值文件。因为Unicode(UTF-8)文件头没有BOM编码,直接打开会导致编码错误(后期会处理);可以使用Notepad++将文件由Unicode(UTF-8)修改为ANSI,或者修改为带BOM编码格式的Unicode(UTF-8)文件。
from urllib.request import urlopen
import json
import re
import time
import sys
import logging
# -*- coding: UTF-8 -*-
# UID of the Bilibili account to export; assigned by initUid() from user input.
bilibili_uid = 0
# Running index of the video inside the favorites folder currently being
# processed; reset to 0 by getFavListVideo() for every folder.
aFavVideoCnt = 0
# Number of videos processed so far across all folders (never reset).
totVideoCnt = 0
# Expected total number of videos: the sum of every folder's media_count,
# accumulated by getFavoriteList().
totVideoNum = 0
def initUid():
    """Read one line from standard input and store it as the account UID.

    The text is converted with int() and assigned to the module-level
    ``bilibili_uid``. No prompt is printed.
    """
    global bilibili_uid
    rawUid = input()
    bilibili_uid = int(rawUid)
def getJsonUrl(url):
    """Fetch *url* and return its body parsed as JSON.

    Args:
        url: Address passed unchanged to ``urlopen``.

    Returns:
        The object produced by ``json.loads`` for the UTF-8 decoded body.

    Raises:
        ValueError: If the body is not valid UTF-8 or not valid JSON.
        Any exception raised by ``urlopen`` propagates unchanged.
    """
    # The with-block closes the response even if reading or decoding fails.
    with urlopen(url) as response:
        body = response.read().decode('utf-8')
    return json.loads(body)
def getFavoriteList(uid):
    """Print the CSV header and every favorites folder of user *uid*.

    The folder list is requested once. Each folder's ``media_count`` is first
    added to the module-level ``totVideoNum``; afterwards getFavListVideo()
    is called per folder with the three leading CSV columns (folder number,
    folder title, video count) pre-formatted.
    """
    global totVideoNum
    print('收藏夹编号,收藏夹名称,收藏夹视频数量,视频编号,AV号,视频状态,视频标题,封面图片,投稿时间,描述,分区,标签,up主用户名,up主uid,播放,弹幕,回复,收藏,硬币,分享,喜欢,收藏时间,')
    favoriteListUrl = 'https://api.bilibili.com/medialist/gateway/base/created?pn=1&ps=100&up_mid={uid}&is_space=0&jsonp=jsonp'.format(uid=uid)
    listInfo = getJsonUrl(favoriteListUrl)['data']['list']

    # The grand total is completed before any folder is processed, because
    # the progress log in printVideoInfo() compares against it.
    for folder in listInfo:
        totVideoNum += folder['media_count']

    for number, folder in enumerate(listInfo, start=1):
        # A folder title containing a comma, quote or line break would shift
        # or break the CSV columns, so such titles are quoted (with embedded
        # quotes doubled). Plain titles are written unchanged.
        title = str(folder['title'])
        if any(ch in title for ch in ',"\r\n'):
            title = '"' + title.replace('"', '""') + '"'
        favInformation = '#{x1},{x2},{x3}'.format(x1=number, x2=title, x3=folder['media_count'])
        getFavListVideo(uid, folder['fid'], favInformation)
def printInfo(Info):
    """Print the trailing CSV columns shared by every video row.

    Args:
        Info: Mapping for one video. Reads the keys ``pubdate``, ``desc``,
            ``tname``, ``dynamic``, ``owner`` (``name``, ``mid``), ``stat``
            (``view``, ``danmaku``, ``reply``, ``favorite``, ``coin``,
            ``share``, ``like``) and ``fav_at``.

    ``pubdate`` and ``fav_at`` are passed to time.gmtime(), so the printed
    times are in UTC. Every column, including the last, is followed by a
    comma, and the row is terminated by a newline.
    """
    def formatTime(timestamp):
        return time.strftime("%Y-%m-%d-%H:%M:%S", time.gmtime(timestamp))

    def quoted(value):
        # Inside a quoted CSV field a double quote must be doubled; without
        # this a description containing '"' corrupts the row.
        return '"%s"' % str(value).replace('"', '""')

    print(formatTime(Info['pubdate']), end=',')
    print(quoted(Info['desc']), end=',')
    print(quoted(Info['tname']), end=',')
    print(quoted(Info['dynamic']), end=',')
    print('%s,%s' % (quoted(Info['owner']['name']), quoted(Info['owner']['mid'])), end=',')
    print('%s,%s,%s,%s,%s,%s,%s' % (Info['stat']['view'], Info['stat']['danmaku'], Info['stat']['reply'], Info['stat']['favorite'], Info['stat']['coin'], Info['stat']['share'], Info['stat']['like'],), end=',')
    print(formatTime(Info['fav_at']), end=',')
    print()
def getFavListVideo(uid, fid, favInfo):
    """Print all videos of favorites folder *fid* belonging to user *uid*.

    Page 1 is fetched and printed first; its ``pagecount`` field then decides
    how many further pages (2..pagecount) are fetched and printed. *favInfo*
    is forwarded unchanged to printVideoInfo() for every page. The per-folder
    counter ``aFavVideoCnt`` is reset to 0 before the first page.
    """
    global aFavVideoCnt
    aFavVideoCnt = 0

    def fetchPage(page):
        # Returns the 'data' object of one result page (30 entries per page).
        pageUrl = 'https://api.bilibili.com/x/space/fav/arc?vmid={uid}&ps=30&fid={fid}&tid=0&keyword=&pn={page}&order=fav_time&jsonp=jsonp'.format(uid=uid, fid=fid, page=page)
        return getJsonUrl(pageUrl)['data']

    firstPage = fetchPage(1)
    printVideoInfo(firstPage['archives'], favInfo)
    for page in range(2, firstPage['pagecount'] + 1):
        printVideoInfo(fetchPage(page)['archives'], favInfo)
def printVideoInfo(aPageInfo, favInfo):
    """Print one CSV row per video of a result page.

    Args:
        aPageInfo: Sequence of video mappings (one page of a folder).
        favInfo: Pre-formatted folder columns written at the start of each
            row.

    For each video the per-folder and overall counters are incremented, and
    progress is logged every 10th video and on the last expected one. Videos
    whose title is '已失效视频' are delegated to getInvalidVideoInfo(); all
    others are written directly.
    """
    global aFavVideoCnt, totVideoCnt, totVideoNum
    for Info in aPageInfo:
        print(favInfo, end=',')
        aFavVideoCnt += 1
        totVideoCnt += 1
        if totVideoCnt % 10 == 0 or totVideoCnt == totVideoNum:
            logging.info('已经完成了{num1}/{num2}个视频...'.format(num1=totVideoCnt, num2=totVideoNum))
        video_id = Info['aid']
        if Info['title'] == '已失效视频':
            getInvalidVideoInfo(video_id, Info)
        else:
            # Double any embedded quote so the quoted CSV fields stay valid.
            title = str(Info['title']).replace('"', '""')
            pic = str(Info['pic']).replace('"', '""')
            print('#%d,%d,[有效],"%s","%s"' % (aFavVideoCnt, video_id, title, pic), end=',')
            printInfo(Info)
# Number of requests issued to the Biliplus API so far; incremented by
# getInvalidVideoInfo() before every request.
biliplusApiCnt = 0
# time.time() value recorded by start(); origin of the elapsed time used for
# the Biliplus request-rate calculation.
runTime = 0
def getInvalidVideoInfo(video_id, Info):
    """Write the CSV columns of an invalidated video, using Biliplus data.

    Args:
        video_id: AV number of the video, used in the Biliplus query.
        Info: Mapping for the video as returned by the favorites API.

    The Biliplus request is throttled to fewer than 5 requests per minute,
    measured from ``runTime``. If the request fails, or the answer is not a
    JSON object, or it contains a ``code`` key, the row is written as
    "data missing". Otherwise the recovered title and cover are written.
    """
    global aFavVideoCnt, biliplusApiCnt, runTime
    biliplusApiCnt += 1

    # Same limit as "requests / elapsed minutes >= 5", but written as a
    # product so that a zero elapsed time cannot raise ZeroDivisionError.
    while biliplusApiCnt >= 5 * ((time.time() - runTime) / 60.0):
        time.sleep(1)

    url = 'https://www.biliplus.com/api/view?id={vid}'.format(vid=video_id)
    try:
        InvVideoInfo = getJsonUrl(url)
    except (OSError, ValueError) as err:
        # OSError covers network/HTTP failures raised by urlopen; ValueError
        # covers undecodable or non-JSON bodies. The external API being down
        # must not abort the whole export, so fall back to "data missing".
        logging.warning('BiliplusApi请求失败,AV号:%s,原因:%s', video_id, err)
        InvVideoInfo = None

    if not isinstance(InvVideoInfo, dict) or 'code' in InvVideoInfo:
        print('#%d,%d,[失效][BiliplusApi数据缺失],已失效视频,(·w·)' % (aFavVideoCnt, Info['aid']), end=',')
    else:
        # Double any embedded quote so the quoted CSV fields stay valid.
        title = str(InvVideoInfo['title']).replace('"', '""')
        pic = str(InvVideoInfo['pic']).replace('"', '""')
        # The video is still invalid on Bilibili even though its title and
        # cover were recovered, so the status column says so instead of the
        # former bare "有效".
        print('#%d,%d,[失效][BiliplusApi获得标题与封面图片],"%s","%s"' % (aFavVideoCnt, Info['aid'], title, pic), end=',')
    printInfo(Info)
def start():  # for cmd
    """Run the export from the command line into ``FavoriteVideoList.csv``.

    Standard output is redirected to the output file while the export runs,
    because the helper functions write their results with print(). The
    redirection is undone and the file closed when the export finishes or
    fails. The UID is read from standard input by initUid().
    """
    global totVideoNum, runTime
    logging.basicConfig(level=logging.DEBUG)

    # gb2312 only when the interpreter's default encoding is ASCII, as
    # before. Otherwise UTF-8 *with* a BOM ('utf-8-sig'): Excel needs the BOM
    # to recognise the file as UTF-8 instead of showing garbled text.
    encoding = 'gb2312' if sys.getdefaultencoding() == 'ascii' else 'utf-8-sig'

    originalStdout = sys.stdout
    with open("FavoriteVideoList.csv", "w", encoding=encoding) as outFile:
        sys.stdout = outFile
        try:
            initUid()
            runTime = time.time()
            getFavoriteList(bilibili_uid)
            print('你的收藏夹共有%d个视频' % (totVideoNum))
        finally:
            # Always give the console back, even if the export raised.
            sys.stdout = originalStdout
    logging.info('已完成!')
# TODO: windowMain(Uid, filePath), an entry point for a window UI, is not
# implemented yet.
# Command-line entry point.
if __name__ == '__main__':
    start()
本文由 落影汐雾 原创,采用 保留署名-非商业性使用-禁止演绎 4.0-国际许可协议
本文链接:https://x.lyxw.xyz/2022/python_web_crawler/