Python网络爬虫学习

Author Avatar
落影汐雾 1月 12, 2022
  • 在其它设备中阅读本文章

更新于:A.D.2022.2.13


按模块学习

urllib库

json库

按功能学习

下载HTML源代码

from urllib.request import urlopen

import sys

# Fetch the page source and save it to a local HTML file.
# NOTE(review): "url" is a tutorial placeholder — substitute a real URL.
html = urlopen(
    "url"
).read().decode('utf-8')

# sys.getdefaultencoding() is always 'utf-8' on Python 3, so the gb2312
# branch only matters on legacy interpreters. Writing through a context
# manager replaces the original's sys.stdout hijack, which leaked the
# file handle and silenced all later console output.
encoding = 'gb2312' if sys.getdefaultencoding() == 'ascii' else 'utf-8'
with open("2.输出html.html", "w", encoding=encoding) as out:
    print(html, file=out)

下载网页图片

from urllib.request import urlopen
import urllib.parse
import sys
import re
import os

def getHtml(url):
    """Return the UTF-8 decoded page source found at ``url``."""
    return urlopen(url).read().decode('utf-8')

def printImg(imgList, url):
    """Download every image path in ``imgList`` (relative to ``url``) into
    the local ``\\image\\`` directory, named by the path's basename.

    Fixes: the original leaked the output file handle when the write
    failed and crashed with FileNotFoundError when ``\\image`` did not
    exist; both the HTTP response and the file are now context-managed.
    """
    os.makedirs('\\image', exist_ok=True)  # original assumed the dir existed
    for path in imgList:
        name = path.split("/")[-1]
        # Close both the response and the file even if the write raises.
        with urlopen(url + path) as resp, open('\\image\\' + name, 'wb') as out:
            out.write(resp.read())

def downloadImg(url):
    """Scrape ``url`` and download every .png and .jpg referenced by <img> tags."""
    page = getHtml(url)
    # png matches are downloaded before jpg matches, as in the original.
    for pattern in (r'<img src="([^"]+\.png)"', r'<img src="([^"]+\.jpg)"'):
        printImg(re.findall(pattern, page), url)

if __name__ == '__main__':
    # "url" is a tutorial placeholder for the page to scrape.
    url = "url"
    downloadImg(url)

下载通知

from urllib.request import urlopen

import sys
import re

def getHtml(url):
    """Fetch ``url`` and return its body decoded as UTF-8 text."""
    body = urlopen(url).read()
    return body.decode('utf-8')

def printList(list_1, list_2, list_3):
    """Write the scraped notices (links, titles, dates) to 4.爬取通知.txt.

    Fixes: the output file is now closed via ``with`` instead of hijacking
    sys.stdout (which leaked the handle and silenced later console output),
    and the three lists are zipped so a length mismatch no longer raises
    IndexError (extra entries are ignored).
    """
    # getdefaultencoding() is always 'utf-8' on Python 3; gb2312 is legacy.
    enc = 'gb2312' if sys.getdefaultencoding() == 'ascii' else 'utf-8'
    with open("4.爬取通知.txt", "w", encoding=enc) as out:
        for i, (link, title, date) in enumerate(zip(list_1, list_2, list_3), 1):
            print('#%d' % i, file=out)
            print('标题:' + title, file=out)
            print('日期:' + date, file=out)
            print('网址:' + 'url' + link, file=out)
            print('', file=out)

def download(url):
    """Scrape one notice page and hand link/title/date lists to printList."""
    page = getHtml(url)
    links = re.findall(r'<a target="_blank" href="([^"]+\.htm)">', page)
    titles = re.findall(r'<a target="_blank" href=.*?>(.*?)</a>', page)
    dates = re.findall(r'<font color="#3a6399">&nbsp;&nbsp;(.*?)</font>', page)
    printList(links, titles, dates)

if __name__ == '__main__':
    # "url" is a tutorial placeholder for the notice-list page.
    url = "url"
    download(url)

下载任意页数通知

from urllib.request import urlopen

import sys
import re

def getHtml(url):
    """Return the UTF-8 decoded page source at ``url``."""
    return urlopen(url).read().decode('utf-8')

def printList(list_1, list_2, list_3):
    """Write one page of scraped notices to 5.爬取任意页数通知.txt.

    NOTE(review): reads the module-level ``pageNumber`` that the __main__
    block sets from user input; calling this before that raises NameError.
    Fixes: the output file is now closed via ``with`` instead of hijacking
    sys.stdout, and the three lists are zipped so a length mismatch no
    longer raises IndexError (extra entries are ignored).
    """
    # getdefaultencoding() is always 'utf-8' on Python 3; gb2312 is legacy.
    enc = 'gb2312' if sys.getdefaultencoding() == 'ascii' else 'utf-8'
    with open("5.爬取任意页数通知.txt", "w", encoding=enc) as out:
        print('通知页数:第' + pageNumber + '页', file=out)
        print('', file=out)
        for i, (link, title, date) in enumerate(zip(list_1, list_2, list_3), 1):
            print('#%d' % i, file=out)
            print('标题:' + title, file=out)
            print('日期:' + date, file=out)
            print('网址:' + 'url' + link, file=out)
            print('', file=out)

def download(url):
    """Scrape one notice page and hand link/title/date lists to printList."""
    page = getHtml(url)
    links = re.findall(r'<a target="_blank" href="([^"]+\.htm)">', page)
    titles = re.findall(r'<a target="_blank" href=.*?>(.*?)</a>', page)
    dates = re.findall(r'<font color="#3a6399">&nbsp;&nbsp;(.*?)</font>', page)
    printList(links, titles, dates)

if __name__ == '__main__':
    print('请输入需要爬取的通知页数:')
    # pageNumber stays module-level: printList reads it as a global.
    pageNumber = input()
    download("url" + pageNumber + ".html")

Bilibili-Favorites-Download

用于下载B站个人收藏夹信息

https://github.com/ShioKiri/Bilibili-Favorites-Download

  • 因为调用一个外部网站API查询了失效视频且没有处理失败情况,目前因为API有一定问题,不能稳定运行,可以自行删去处理失效视频的代码使用
  • 没有使用数据库和图形界面,直接保存为TXT或者CSV文件

生成TXT文件

按照一定格式保存为TXT,推荐使用编辑器打开TXT避免速度过慢

from urllib.request import urlopen
import json
import re
import time
import sys
import logging
# -*- coding: UTF-8 -*-

# Crawl-wide mutable state shared by the functions below.
bilibili_uid = 0  # uid of the account being dumped (set by initUid)
aFavVideoCnt = 0  # per-folder running video index (reset in getFavListVideo)
totVideoCnt = 0  # videos processed so far across all folders
totVideoNum = 0  # total videos expected, summed from folder metadata

def initUid():
    """Read the target account's uid from stdin into the module global."""
    global bilibili_uid
    bilibili_uid = int(input())

def getJsonUrl(url):
    """Fetch ``url`` and return its body parsed as JSON."""
    raw = urlopen(url).read().decode('utf-8')
    return json.loads(raw)

def getFavoriteList(uid):
    """List every favourites folder of ``uid`` and dump all videos in each."""
    global totVideoNum
    listUrl = ('https://api.bilibili.com/medialist/gateway/base/created'
               '?pn=1&ps=100&up_mid={uid}&is_space=0&jsonp=jsonp').format(uid=uid)
    folders = getJsonUrl(listUrl)['data']['list']
    # Total first, so progress logging can report x/total from the start.
    totVideoNum += sum(f['media_count'] for f in folders)
    for idx, folder in enumerate(folders, 1):
        print('收藏夹编号:#%d 名称:%s 视频数量:%d\n' % (idx, folder['title'], folder['media_count']))
        getFavListVideo(uid, folder['fid'])

def getFavListVideo(uid, fid):
    """Walk every API page of folder ``fid`` and print its videos."""
    global aFavVideoCnt, totVideoCnt, totVideoNum
    aFavVideoCnt = 0
    template = ('https://api.bilibili.com/x/space/fav/arc'
                '?vmid={uid}&ps=30&fid={fid}&tid=0&keyword=&pn={page}&order=fav_time&jsonp=jsonp')
    # Page 1 also tells us how many pages there are in total.
    first = getJsonUrl(template.format(uid=uid, fid=fid, page=1))
    printVideoInfo(first['data']['archives'])
    for page in range(2, first['data']['pagecount'] + 1):
        data = getJsonUrl(template.format(uid=uid, fid=fid, page=page))
        printVideoInfo(data['data']['archives'])

def printInfo(Info):
    """Print the per-video metadata lines shared by valid and dead videos."""
    fmt = "%Y-%m-%d-%H:%M:%S"
    stat = Info['stat']
    print('投稿时间:' + time.strftime(fmt, time.gmtime(Info['pubdate'])))
    print('描述:%s' % Info['desc'])
    print('分区:%s' % Info['tname'])
    print('标签:%s' % Info['dynamic'])
    print('up主用户名:%s  up主uid:%s' % (Info['owner']['name'], Info['owner']['mid']))
    print('播放:%s  弹幕:%s  回复:%s  收藏:%s  硬币:%s  分享:%s  喜欢:%s'
          % (stat['view'], stat['danmaku'], stat['reply'], stat['favorite'],
             stat['coin'], stat['share'], stat['like']))
    print('收藏时间:' + time.strftime(fmt, time.gmtime(Info['fav_at'])))
    print()

def printVideoInfo(aPageInfo):
    """Print every video of one API result page; dead videos go via Biliplus."""
    global aFavVideoCnt, totVideoCnt, totVideoNum
    for Info in aPageInfo:
        aFavVideoCnt += 1
        totVideoCnt += 1
        if totVideoCnt % 10 == 0 or totVideoCnt == totVideoNum:
            logging.info('已经完成了{num1}/{num2}个视频...'.format(num1=totVideoCnt, num2=totVideoNum))
        if Info['title'] == '已失效视频':
            getInvalidVideoInfo(Info['aid'], Info)
        else:
            print('视频编号:#%d  AV号:%d  视频标题:%s' % (aFavVideoCnt, Info['aid'], Info['title']))
            print('封面图片:%s' % Info['pic'])
            printInfo(Info)

# Biliplus rate-limit bookkeeping (see getInvalidVideoInfo).
biliplusApiCnt = 0  # number of Biliplus API calls made so far
runTime = 0  # epoch seconds when start() began; 0 until start() runs

def getInvalidVideoInfo(video_id, Info):
    """Look up a dead video on the Biliplus API and print what it returns.

    Fixes a latent ZeroDivisionError: the rate-limit expression divided by
    the elapsed time since ``runTime``, which is zero if this runs in the
    same instant start() set it — now guarded with ``max(..., 1e-9)``.
    """
    global aFavVideoCnt, biliplusApiCnt, runTime
    biliplusApiCnt += 1
    # Throttle to fewer than 5 Biliplus calls per minute on average.
    while biliplusApiCnt / max((time.time() - runTime) / 60.0, 1e-9) >= 5:
        time.sleep(1)
    url = 'https://www.biliplus.com/api/view?id={vid}'.format(vid=video_id)
    InvVideoInfo = getJsonUrl(url)
    if 'code' in InvVideoInfo:
        # Biliplus has no record either: keep the placeholder title/cover.
        print('视频编号:#%d  [已失效][BiliplusApi数据缺失]  AV号:%d  视频标题:已失效视频' % (aFavVideoCnt, video_id))
        print('封面图片:%s' % (Info['pic']))
        printInfo(Info)
    else:
        print('视频编号:#%d  [已失效][BiliplusApi获得标题与封面图片]  AV号:%d  视频标题:%s' % (aFavVideoCnt, video_id, InvVideoInfo['title']))
        print('封面图片:%s' % (InvVideoInfo['pic']))
        printInfo(Info)

def start(): # for cmd
    """Command-line entry: redirect stdout to the TXT output file and crawl."""
    global totVideoNum, runTime
    logging.basicConfig(level=logging.DEBUG)
    # getdefaultencoding() is always 'utf-8' on Python 3; gb2312 is legacy.
    enc = 'gb2312' if sys.getdefaultencoding() == 'ascii' else 'utf-8'
    sys.stdout = open("FavoriteVideoList.txt", "w", encoding=enc)
    initUid()
    runTime = time.time()
    getFavoriteList(bilibili_uid)
    print('你的收藏夹共有%d个视频' % totVideoNum)
    logging.info('已完成!')

# def windowMain(Uid, filePath): # for window ui

if __name__ == '__main__': # command-line entry point
    start()

生成CSV文件

生成可以用Excel打开的逗号分隔值文件。因为Unicode(UTF-8)文件头没有BOM,在Excel中打开会导致编码错误(后期会处理),可以使用Notepad++将编码由Unicode(UTF-8)修改为ANSI,或者修改为带BOM编码格式的Unicode(UTF-8)文件

from urllib.request import urlopen
import json
import re
import time
import sys
import logging
# -*- coding: UTF-8 -*-

# Crawl-wide mutable state shared by the functions below.
bilibili_uid = 0  # uid of the account being dumped (set by initUid)
aFavVideoCnt = 0  # per-folder running video index (reset in getFavListVideo)
totVideoCnt = 0  # videos processed so far across all folders
totVideoNum = 0  # total videos expected, summed from folder metadata

def initUid():
    """Read the target account's uid from stdin into the module global."""
    global bilibili_uid
    bilibili_uid = int(input())

def getJsonUrl(url):
    """Fetch ``url`` and return its body parsed as JSON."""
    raw = urlopen(url).read().decode('utf-8')
    return json.loads(raw)

def getFavoriteList(uid):
    """Emit the CSV header, then dump every favourites folder of ``uid``."""
    global totVideoNum
    print('收藏夹编号,收藏夹名称,收藏夹视频数量,视频编号,AV号,视频状态,视频标题,封面图片,投稿时间,描述,分区,标签,up主用户名,up主uid,播放,弹幕,回复,收藏,硬币,分享,喜欢,收藏时间,')
    listUrl = ('https://api.bilibili.com/medialist/gateway/base/created'
               '?pn=1&ps=100&up_mid={uid}&is_space=0&jsonp=jsonp').format(uid=uid)
    folders = getJsonUrl(listUrl)['data']['list']
    # Total first, so progress logging can report x/total from the start.
    totVideoNum += sum(f['media_count'] for f in folders)
    for idx, folder in enumerate(folders, 1):
        prefix = '#{x1},{x2},{x3}'.format(x1=idx, x2=folder['title'], x3=folder['media_count'])
        getFavListVideo(uid, folder['fid'], prefix)

def printInfo(Info):
    """Append the shared CSV columns (upload time .. favourite time) to the row."""
    fmt = "%Y-%m-%d-%H:%M:%S"
    stat = Info['stat']
    print(time.strftime(fmt, time.gmtime(Info['pubdate'])), end=',')
    print('"%s"' % Info['desc'], end=',')
    print('"%s"' % Info['tname'], end=',')
    print('"%s"' % Info['dynamic'], end=',')
    print('"%s","%s"' % (Info['owner']['name'], Info['owner']['mid']), end=',')
    print('%s,%s,%s,%s,%s,%s,%s' % (stat['view'], stat['danmaku'], stat['reply'],
                                    stat['favorite'], stat['coin'], stat['share'],
                                    stat['like']), end=',')
    print(time.strftime(fmt, time.gmtime(Info['fav_at'])), end=',')
    print()  # terminate the CSV row

def getFavListVideo(uid, fid, favInfo):
    """Walk every API page of folder ``fid``, printing one CSV row per video."""
    global aFavVideoCnt, totVideoCnt, totVideoNum
    aFavVideoCnt = 0
    template = ('https://api.bilibili.com/x/space/fav/arc'
                '?vmid={uid}&ps=30&fid={fid}&tid=0&keyword=&pn={page}&order=fav_time&jsonp=jsonp')
    # Page 1 also tells us how many pages there are in total.
    first = getJsonUrl(template.format(uid=uid, fid=fid, page=1))
    printVideoInfo(first['data']['archives'], favInfo)
    for page in range(2, first['data']['pagecount'] + 1):
        data = getJsonUrl(template.format(uid=uid, fid=fid, page=page))
        printVideoInfo(data['data']['archives'], favInfo)

def printVideoInfo(aPageInfo, favInfo):
    """Print one CSV row per video of an API page, prefixed with folder info."""
    global aFavVideoCnt, totVideoCnt, totVideoNum
    for Info in aPageInfo:
        print(favInfo, end=',')
        aFavVideoCnt += 1
        totVideoCnt += 1
        if totVideoCnt % 10 == 0 or totVideoCnt == totVideoNum:
            logging.info('已经完成了{num1}/{num2}个视频...'.format(num1=totVideoCnt, num2=totVideoNum))
        if Info['title'] == '已失效视频':
            getInvalidVideoInfo(Info['aid'], Info)
        else:
            print('#%d,%d,[有效],"%s","%s"' % (aFavVideoCnt, Info['aid'], Info['title'], Info['pic']), end=',')
            printInfo(Info)

# Biliplus rate-limit bookkeeping (see getInvalidVideoInfo).
biliplusApiCnt = 0  # number of Biliplus API calls made so far
runTime = 0  # epoch seconds when start() began; 0 until start() runs

def getInvalidVideoInfo(video_id, Info):
    """Look up a dead video on the Biliplus API and print its CSV row.

    Fixes two defects:
    * the rate-limit expression divided by the elapsed time since
      ``runTime``, which can be zero (ZeroDivisionError) — guarded with
      ``max(..., 1e-9)``;
    * a dead video recovered through Biliplus was labelled 有效 (valid)
      in the 视频状态 column; it now carries the same
      [失效][BiliplusApi获得标题与封面图片] label as the TXT version.
    """
    global aFavVideoCnt, biliplusApiCnt, runTime
    biliplusApiCnt += 1
    # Throttle to fewer than 5 Biliplus calls per minute on average.
    while biliplusApiCnt / max((time.time() - runTime) / 60.0, 1e-9) >= 5:
        time.sleep(1)
    url = 'https://www.biliplus.com/api/view?id={vid}'.format(vid=video_id)
    InvVideoInfo = getJsonUrl(url)
    if 'code' in InvVideoInfo:
        # Biliplus has no record either: keep the placeholder title/cover.
        print('#%d,%d,[失效][BiliplusApi数据缺失],已失效视频,(·w·)' % (aFavVideoCnt, Info['aid']), end=',')
        printInfo(Info)
    else:
        print('#%d,%d,[失效][BiliplusApi获得标题与封面图片],"%s","%s"' % (aFavVideoCnt, Info['aid'], InvVideoInfo['title'], InvVideoInfo['pic']), end=',')
        printInfo(Info)

def start(): # for cmd
    """Command-line entry: redirect stdout to the CSV output file and crawl."""
    global totVideoNum, runTime
    logging.basicConfig(level=logging.DEBUG)
    # getdefaultencoding() is always 'utf-8' on Python 3; gb2312 is legacy.
    enc = 'gb2312' if sys.getdefaultencoding() == 'ascii' else 'utf-8'
    sys.stdout = open("FavoriteVideoList.csv", "w", encoding=enc)
    initUid()
    runTime = time.time()
    getFavoriteList(bilibili_uid)
    print('你的收藏夹共有%d个视频' % totVideoNum)
    logging.info('已完成!')

# def windowMain(Uid, filePath): # for window ui

if __name__ == '__main__': # command-line entry point
    start()

本文由 落影汐雾 原创,采用 保留署名-非商业性使用-禁止演绎 4.0-国际许可协议
本文链接:https://x.lyxw.xyz/2022/python_web_crawler/