python spider learning
This article records what I learned about Python web scraping; it doubles as a set of Python notes.
I learned roughly enough to scrape images, videos, and songs: a simple introduction, picked up mostly out of curiosity. Still worthwhile, and the payoff comes much faster than with binary work, haha. Later I should still write up the hook and anti-debugging articles; organizing them takes too much time, but it has to be done...
Usage of the requests library
A GET request in one line:
r = requests.get('https://api.github.com/events')
A POST request in one line:
r = requests.post('https://httpbin.org/post', data = {'key':'value'})
Getting the text of the server's response:
import requests
r = requests.get('https://api.github.com/events')
r.text
Pretending to be a browser:
url = 'https://api.github.com/some/endpoint'
headers = {'user-agent': 'my-app/0.0.1'}
r = requests.get(url, headers=headers)
Getting the response status code:
r = requests.get('https://httpbin.org/get')
r.status_code
The re library (regular expressions) is used to pull the information we need out of long strings.
re.S makes . match newlines as well, which matters when the target spans multiple lines.
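For example, without re.S a dot will not cross a line break:

import re

s = 'a\nb'
print(re.findall('a.b', s))        # [] because '.' does not match the newline
print(re.findall('a.b', s, re.S))  # ['a\nb']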
Greedy versus non-greedy matching, that is .* versus .*?:
import re
s="s_#aa&"
res = re.findall('.*',s,re.S)
print(res)
res = re.findall('.*?',s,re.S)
print(res)
# ['s_#aa&', '']
# ['', 's', '', '_', '', '#', '', 'a', '', 'a', '', '&', '']
As the output shows, .* greedily grabs the whole string in one go, while .*? matches as little as possible, producing one character (plus empty matches) at a time.
.*? is the one used most often to match an arbitrary stretch of text, and it is honestly great: give it a head and a tail and it will pull out the string you want in between.
Here is an example (the same pattern will be used again later); it should make things clear right away.
import re

s = '''
<li>
    <div class="list_num ">21.</div>
    <div class="pic"><a href="http://product.dangdang.com/28541936.html" target="_blank"><img src="http://img3m6.ddimg.cn/38/25/28541936-1_l_9.jpg" alt="男孩的学习力" title="男孩的学习力"/></a></div>
    <div class="name"><a href="http://product.dangdang.com/28541936.html" target="_blank" title="男孩的学习力">男孩的学习力</a></div>
'''

res = re.findall('<li>.*?>(\d+).*?', s, re.S)
print(res)
res = re.findall('<img src="(.*?)"', s, re.S)
print(res)
res = re.findall('title="(.*?)".*?class="name"', s, re.S)
print(res)
The first spider: scraping book data from Dangdang.
It breaks down into three parts:
Use requests to GET the page and receive the server's response.
Process the response with regular expressions to keep only what we want.
Organize the useful data and write it to a file.
The GET request:
def request_juger(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        print(e)
    return None
Searching with a regular expression:
import re

s = '''
<li>
    <div class="list_num ">22.</div>
    <div class="pic"><a href="http://product.dangdang.com/28541936.html" target="_blank"><img src="http://img3m6.ddimg.cn/38/25/28541936-1_l_9.jpg" alt="男孩的学习力" title="男孩的学习力"/></a></div>
    <div class="name"><a href="http://product.dangdang.com/28541936.html" target="_blank" title="男孩的学习力">男孩的学习力</a></div>
    <div class="star"><span class="level"><span style="width: 97.2%;"></span></span><a href="http://product.dangdang.com/28541936.html?point=comment_point" target="_blank">123849条评论</a><span class="tuijian">100%推荐</span></div>
    <div class="publisher_info">[日]<a href="http://search.dangdang.com/?key=富永雄辅" title="[日]富永雄辅 著,吴一红 译,酷威文化 出品" target="_blank">富永雄辅</a> 著,<a href="http://search.dangdang.com/?key=吴一红" title="[日]富永雄辅 著,吴一红 译,酷威文化 出品" target="_blank">吴一红</a> 译,<a href="http://search.dangdang.com/?key=酷威文化" title="[日]富永雄辅 著,吴一红 译,酷威文化 出品" target="_blank">酷威文化</a> 出品</div>
    <div class="publisher_info"><span>2020-06-01</span> <a href="http://search.dangdang.com/?key=四川文艺出版社" target="_blank">四川文艺出版社</a></div>
    <div class="biaosheng">五星评分:<span>91796次</span></div>
    <div class="price">
        <p><span class="price_n">¥17.90</span> <span class="price_r">¥39.80</span>(<span class="price_s">4.5折</span>) </p>
        <p class="price_e">电子书:<span class="price_n">¥7.99</span></p>
        <div class="buy_button">
            <a ddname="加入购物车" name="" href="javascript:AddToShoppingCart('28541936');" class="listbtn_buy">加入购物车</a>
            <a name="" href="http://product.dangdang.com/1901212680.html" class="listbtn_buydz" target="_blank">购买电子书</a>
            <a ddname="加入收藏" id="addto_favorlist_28541936" name="" href="javascript:showMsgBox('addto_favorlist_28541936',encodeURIComponent('28541936&platform=3'), 'http://myhome.dangdang.com/addFavoritepop');" class="listbtn_collect">收藏</a>
        </div>
'''

cmp = re.compile('<li>.*?>(\d+).*?</div>.*?<img src="(.*?)".*?title="(.*?)".*?class="name".*?class="tuijian">(.*?)</span>.*?target="_blank">(.*?)</a>.*?<div class="biaosheng">(.*?)<span>.*?"price_n">(.*?)</span>', re.S)
res = re.findall(cmp, s)
print(res)
Writing to a file:
def write_to_file(t):
with open('book.txt', 'a', encoding='UTF-8') as f:
f.write(json.dumps(t, ensure_ascii=False) + '\n')
Note the json.dumps() call here.
json.dumps converts a Python data structure into a JSON string:
import json
data = {
'name' : 'myname',
'age' : 100,
}
json_str = json.dumps(data)
ensure_ascii=True is the default and escapes everything to ASCII; change it to False and Chinese characters are written out directly.
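For example, continuing from the snippet above:

print(json.dumps({'city': '北京'}))                      # {"city": "\u5317\u4eac"}
print(json.dumps({'city': '北京'}, ensure_ascii=False))  # {"city": "北京"}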
The complete code. The yield part confused me at first (why did iterating seem to give back everything rather than one group?): find_useful is a generator, so it really does hand back one dict per iteration, and the for loop in main simply consumes them all, one at a time. A small demo follows the code.
import re
import requests
import json

def request_juger(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        print(e)
    return None

def find_useful(html):
    cmp = re.compile(
        '<li>.*?>(\d+).*?</div>.*?<img src="(.*?)".*?title="(.*?)".*?class="name".*?class="tuijian">(.*?)</span>.*?target="_blank">(.*?)</a>.*?<div class="biaosheng">(.*?)<span>.*?"price_n">(.*?)</span>',
        re.S)
    texts = re.findall(cmp, html)
    for text in texts:
        yield {
            'range': text[0],
            'image': text[1],
            'title': text[2],
            'recommend': text[3],
            'author': text[4],
            'times': text[5],
            'price': text[6]
        }

def write_to_file(t):
    with open('book.txt', 'a', encoding='UTF-8') as f:
        f.write(json.dumps(t, ensure_ascii=False) + '\n')

def main(i):
    url = "http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-" + str(i)
    texts = request_juger(url)
    text = find_useful(texts)
    for i in text:
        write_to_file(i)

if __name__ == "__main__":
    for i in range(1, 26):
        main(i)
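A minimal demo of that generator behavior: calling a function that contains yield returns a generator object without running its body, and every iteration resumes it up to the next yield.

def gen():
    for i in range(3):
        yield i            # pause here and hand one value back

g = gen()
print(g)          # <generator object gen at ...>, nothing has run yet
print(next(g))    # 0
for x in g:       # the for loop keeps calling next() until the generator is exhausted
    print(x)      # 1, then 2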
BeautifulSoup is a Python library that can do some of the same jobs as re: grab a particular string, the title, hyperlinks, and so on.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>学习python的正确姿势</title></head>
<body>
<p class="title"><b>小帅b的故事</b></p>

<p class="story">有一天,小帅b想给大家讲两个笑话
<a href="http://example.com/1" class="sister" id="link1">一个笑话长</a>,
<a href="http://example.com/2" class="sister" id="link2">一个笑话短</a> ,
他问大家,想听长的还是短的?</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

print(soup.title.string)
print(soup.p.string)
print(soup.title.parent.name)
print(soup.a)
print(soup.find_all('a'))
print(soup.find(id="link2"))
print(soup.get_text())
The second spider: scrape the details of Douban's top 250 movies and write them to Excel. What differs from the previous spider is the use of the BeautifulSoup library.
Target: url=https://movie.douban.com/top250?start=0&filter=; the only thing that changes between pages is start = 25 * page number.
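So all ten page URLs can be built up front:

# start = 25 * page number; pages 0..9 cover the top 250
urls = ['https://movie.douban.com/top250?start={}&filter='.format(i * 25) for i in range(10)]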
The basic HTML structure:
<ol class="grid_view">
  <li>
    <div class="item">
      <div class="pic">
        <em class="">1</em>
        <a href="https://movie.douban.com/subject/1292052/">
          <img width="100" alt="肖申克的救赎" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
        </a>
      </div>
      <div class="info">
        <div class="hd">
          <a href="https://movie.douban.com/subject/1292052/" class="">
            <span class="title">肖申克的救赎</span>
            <span class="title"> / The Shawshank Redemption</span>
            <span class="other"> / 月黑高飞(港) / 刺激1995(台)</span>
          </a>
          <span class="playable">[可播放]</span>
        </div>
        <div class="bd">
          <p class="">
            导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /...<br>
            1994 / 美国 / 犯罪 剧情
          </p>
          <div class="star">
            <span class="rating5-t"></span>
            <span class="rating_num" property="v:average">9.7</span>
            <span property="v:best" content="10.0"></span>
            <span>2477393人评价</span>
          </div>
          <p class="quote">
            <span class="inq">希望让人自由。</span>
          </p>
        </div>
      </div>
    </div>
  </li>
Everything sits inside class="grid_view". We need the rank, the image URL, the name, the author, the rating, and the one-line quote, and then write them into an xls file.
First, a quick look at xlwt, the library that writes data into Excel:
import xlwt

workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('My Worksheet')
worksheet.write(1, 0, label='this is test')
The complete Python code:
import requests
import json
from bs4 import BeautifulSoup
import xlwt

workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
worksheet = workbook.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
worksheet.write(0, 0, '名称')
worksheet.write(0, 1, '图片')
worksheet.write(0, 2, '排名')
worksheet.write(0, 3, '评分')
worksheet.write(0, 4, '作者')
worksheet.write(0, 5, '简介')
n = 1

def request_juger(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/95.0.4638.69 Safari/537.36',
    }
    try:
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        print(e)
    return None

def find_useful_xlwt(html):
    soup = BeautifulSoup(html, 'lxml')
    list = soup.find(class_='grid_view').find_all('li')
    for item in list:
        item_index = item.find(class_='').string
        item_name = item.find(class_='title').string
        item_picture = item.find('a').find('img').get('src')
        item_author = item.find('p').text.replace("\n", '').replace(" ", '')[0:20]
        item_score = item.find(class_='rating_num').string
        item_intr = ''   # some movies have no one-line quote
        if item.find(class_="inq") is not None:
            item_intr = item.find(class_="inq").string
        global n
        worksheet.write(n, 0, item_name)
        worksheet.write(n, 1, item_picture)
        worksheet.write(n, 2, item_index)
        worksheet.write(n, 3, item_score)
        worksheet.write(n, 4, item_author)
        worksheet.write(n, 5, item_intr)
        n = n + 1

def main(i):
    url = 'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
    html = request_juger(url)
    find_useful_xlwt(html)

if __name__ == "__main__":
    for i in range(0, 10):
        main(i)
    workbook.save(u'豆瓣最受欢迎的250部电影.xls')
Multithreading. An example from the Runoob tutorial: create a thread class that inherits from threading.Thread.
import threading
import time

exitFlag = 0

class myThread(threading.Thread):
    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        print("开始线程:" + self.name)
        print_time(self.name, self.counter, 10)
        print("退出线程:" + self.name)

def print_time(threadName, delay, counter):
    while counter:
        if exitFlag:
            threadName.exit()
        time.sleep(delay)
        print("%s: %s" % (threadName, time.ctime(time.time())))
        counter -= 1

thread1 = myThread(1, "Thread-1", 1)
thread2 = myThread(2, "Thread-2", 2)

thread1.start()
thread2.start()
thread1.join()
thread2.join()
print("退出主线程")
A thread lock guarantees that shared data is touched by only one thread at a time.
def run(self):
    print("开启线程: " + self.name)
    threadLock.acquire()      # take the lock before touching shared data
    print_time(self.name, self.counter, 3)
    threadLock.release()      # release it so the next thread can proceed

threadLock = threading.Lock()
The third spider: scraping images from a certain website, heh heh.
A small preliminary step to check whether an image can be downloaded at all:
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/95.0.4638.69 Safari/537.36',
    "Referer": "https://s2.baozimh.com"
}

filename = '111.jpg'
with open(filename, 'wb') as f:
    img = requests.get('https://s2.baozimh.com/scomic/douluodalu-fengxuandongman/0/9-htxl/2.jpg', headers=headers).content
    f.write(img)
Below is an example from a project I found; I wrote my own version along the same lines, and it works nicely.
import requests
import json
from bs4 import BeautifulSoup
import xlwt
import os

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/95.0.4638.69 Safari/537.36',
    "Referer": "https://www.mzitu.com/all/"
}

def request_juger(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      'Chrome/95.0.4638.69 Safari/537.36',
        "Referer": "https://www.mzitu.com/all/"
    }
    try:
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        print(e)
    return None

def get_page_urls():
    base_url = 'https://www.mzitu.com/page/'
    for i in range(4, 5):
        url = base_url + str(i)
        html = request_juger(url)
        soup = BeautifulSoup(html, 'lxml')
        list = soup.find(class_='postlist').find_all('li')
        meizi_url = []
        for i in list:
            meizi_url.append(i.find('span').find('a').get('href'))
        return meizi_url

def download_Pic(title, image_list):
    os.mkdir(title)
    j = 1
    for item in image_list:
        filename = '%s/%s.jpg' % (title, str(j))
        print('downloading....%s : NO.%s' % (title, str(j)))
        with open(filename, 'wb') as f:
            img = requests.get(url=item, headers=header).content
            f.write(img)
        j += 1

def download_images(url):
    pages = []
    html = request_juger(url)
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('h2').string
    page = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
    image_list = []
    for i in range(1, int(page)):
        html = request_juger(url + '/%s' % i)
        soup = BeautifulSoup(html, 'lxml')
        img_url = soup.find('img').get('src')
        image_list.append(img_url)
    print(image_list)
    download_Pic(title, image_list)

def main():
    urls = get_page_urls()
    for url in urls:
        download_images(url)

if __name__ == "__main__":
    main()
Following the same recipe, I went and scraped some manhua: Douluo Dalu.
import requests
import json
from bs4 import BeautifulSoup
import xlwt
import os

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/95.0.4638.69 Safari/537.36',
    "Referer": "https://s2.baozimh.com"
}

def request_juger(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/95.0.4638.69 Safari/537.36',
        "Referer": "https://s2.baozimh.com"
    }
    try:
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        print(e)
    return None

def get_page_urls():
    base_url = 'https://cn.webmota.com/comic/chapter/douluodalu-fengxuandongman/0_'
    manhua_url = []
    for i in range(0, 10):
        url = base_url + str(i) + '.html'
        manhua_url.append(url)
    return manhua_url

def download_Pic(title, image_list):
    os.mkdir(title)
    j = 1
    for item in image_list:
        filename = '%s/%s.jpg' % (title, str(j))
        print('downloading....%s : NO.%s' % (title, str(j)))
        with open(filename, 'wb') as f:
            img = requests.get(url=item, headers=header).content
            f.write(img)
        j += 1

def download_images(url):
    html = request_juger(url)
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('head').find('title').string
    list = []
    page = soup.find(class_='comic-text__amp').find('em').string.replace('\n', '').replace(' ', '')[-1]
    for i in range(int(page)):
        list.append(soup.find_all('img')[i].get('src'))
    download_Pic(title, list)

def main():
    urls = get_page_urls()
    print(urls)
    for url in urls:
        download_images(url)

if __name__ == "__main__":
    main()
To crawl with a pool of workers instead, just import the corresponding concurrency module and arrange it like this:
import concurrent.futures

with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
    for url in urls:
        executor.submit(download_images, url)
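Strictly speaking, ProcessPoolExecutor runs the jobs in separate processes; the thread-based pool has the same interface, so a genuinely multithreaded sketch (reusing the same urls and download_images from above) would just swap the class:

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    for url in urls:
        executor.submit(download_images, url)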
selenium. I had wanted to jump straight into scraping video, but it turned out the links contained no mp4 at all, so I came back to selenium to grab the rendered page source.
First install it with pip:
pip install selenium
Next, set up the Chrome driver: download chromedriver from the official site, add it to the PATH environment variable, and after a reboot it is ready to use.
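If you would rather not touch PATH, newer Selenium releases (4.x) also accept an explicit driver path; a minimal sketch, assuming chromedriver.exe was unpacked to the hypothetical path below:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service(r'C:\tools\chromedriver.exe')  # hypothetical location of the downloaded driver
driver = webdriver.Chrome(service=service)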
A quick test:
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://new.iskcd.com/20211106/HhseFdeO/index.m3u8")
text = driver.page_source
print(text)
The fourth spider: scraping video. This one took a lot of fumbling on my own, and scraping video turns out to be very different from scraping images. Many sites use m3u8 to break an mp4 into many small .ts segments. The approach: first get index.m3u8, request the link it contains, extract all the .ts URLs from the response, download them, and finally stitch them back into an mp4 with ffmpeg.
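For reference, a generic way to pull the segment entries out of a playlist (a sketch not tied to this particular site, assuming the playlist text has already been fetched into m3u8_text) is simply to keep every non-comment line:

# in an m3u8 playlist, lines starting with '#' are tags; the remaining lines are segment URIs
ts_urls = [line.strip() for line in m3u8_text.splitlines()
           if line.strip() and not line.startswith('#')]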
The code: single-threaded and fairly slow.
import requests
from bs4 import BeautifulSoup
import re
import subprocess

num = 0
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/95.0.4638.69 Safari/537.36',
}

def request_juger(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/95.0.4638.69 Safari/537.36',
    }
    try:
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        print(e)
    return None

def get_page_urls():
    base_url = 'https://www.great-elec.com/video/924-0-'
    manhua_url = []
    for i in range(7, 11):
        url = base_url + str(i) + '.html'
        print(url)
        manhua_url.append(url)
    return manhua_url

def download_mp4(title, ts_urls):
    global num
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/95.0.4638.69 Safari/537.36',
    }
    print(num)
    for ts_url in ts_urls:
        name = ts_url.split('/')[-1]
        res = requests.get(url=ts_url, headers=header)
        with open("D:/learning record/学术报告/python_spider/爬虫/mp4/file_list{}.txt".format(num), "a+") as f:
            f.write("file '{}'\n".format(name))
        with open('D:\\learning record\\学术报告\\python_spider\\爬虫\\mp4\\{}'.format(name), 'wb') as f:
            f.write(res.content)
    cmd = 'ffmpeg -f concat -i "D:/learning record/学术报告/python_spider/爬虫/mp4/file_list{0}.txt" -c copy "D:/learning record/学术报告/python_spider/爬虫/mp4/vidoe/output{1}.mp4"'.format(num, num)
    print(cmd)
    subprocess.Popen(cmd, shell=True)
    num += 1

def download(url):
    title = 'aa'
    html = request_juger(url)
    soup = BeautifulSoup(html, 'lxml')
    a = soup.find(class_='box').find('p').find('script')
    cmp1 = re.compile('<script>.*?now="(.*?)";.*?', re.S)
    texts = re.findall(cmp1, html)
    m3m8_url = texts[0]
    print(m3m8_url[:40])
    m3u8_html = requests.get(url=m3m8_url, headers=header).text
    print(m3u8_html[-22:-1])
    new_m3m8_url = m3m8_url[:40] + m3u8_html[-22:-1]
    print(new_m3m8_url)
    new_m3m8_html = requests.get(url=new_m3m8_url, headers=header).text
    ts_urls = re.findall(re.compile(',\n(.*?.ts)\n#'), new_m3m8_html)
    download_mp4(title, ts_urls)

def main():
    urls = get_page_urls()
    for url in urls:
        download(url)

if __name__ == "__main__":
    main()
When I later did the multithreaded version I found a remaining problem: splicing new_m3m8_url out of fixed character offsets can fail to produce a valid URL. One could split on / first and then rejoin, since the slashes are fixed, but I was too lazy to do it.
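A sketch of a more robust join, using urllib.parse.urljoin instead of fixed slicing, assuming the nested playlist reference is the last non-comment line of the first .m3u8 (m3m8_url and m3u8_html as in download() above):

from urllib.parse import urljoin

relative_path = [line for line in m3u8_html.splitlines()
                 if line.strip() and not line.startswith('#')][-1]
new_m3m8_url = urljoin(m3m8_url, relative_path)  # resolves both relative and absolute references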
The multithreaded version; in practice it is not much faster.
import threading
import requests
from bs4 import BeautifulSoup
import re
import subprocess

exitFlag = 0

class myThread(threading.Thread):
    def __init__(self, star, name, end):
        threading.Thread.__init__(self)
        self.star = star
        self.name = name
        self.end = end

    def run(self):
        print("开始线程:" + self.name)
        func(self.star, self.end)
        print("退出线程:" + self.name)

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/95.0.4638.69 Safari/537.36',
}

def request_juger(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/95.0.4638.69 Safari/537.36',
    }
    try:
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        print(e)
    return None

def get_page_urls(star, end):
    base_url = 'https://www.great-elec.com/video/924-0-'
    manhua_url = []
    for i in range(star, end):
        url = base_url + str(i) + '.html'
        print(url)
        manhua_url.append(url)
    return manhua_url

def download_mp4(title, ts_urls, num):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/95.0.4638.69 Safari/537.36',
    }
    for ts_url in ts_urls:
        name = ts_url.split('/')[-1]
        res = requests.get(url=ts_url, headers=header)
        with open("D:/learning record/学术报告/python_spider/爬虫/mp4/file_list{}.txt".format(num), "a+") as f:
            f.write("file '{}'\n".format(name))
        with open('D:\\learning record\\学术报告\\python_spider\\爬虫\\mp4\\{}'.format(name), 'wb') as f:
            f.write(res.content)
    cmd = 'ffmpeg -f concat -i "D:/learning record/学术报告/python_spider/爬虫/mp4/file_list{0}.txt" -c copy "D:/learning record/学术报告/python_spider/爬虫/mp4/vidoe/output{1}.mp4"'.format(num, num)
    print(cmd)
    subprocess.Popen(cmd, shell=True)
    num += 1

def download(url, num):
    title = 'aa'
    html = request_juger(url)
    soup = BeautifulSoup(html, 'lxml')
    a = soup.find(class_='box').find('p').find('script')
    cmp1 = re.compile('<script>.*?now="(.*?)";.*?', re.S)
    texts = re.findall(cmp1, html)
    m3m8_url = texts[0]
    print(m3m8_url[:43])
    m3u8_html = requests.get(url=m3m8_url, headers=header).text
    print(m3u8_html[-22:-1])
    new_m3m8_url = m3m8_url[:43] + m3u8_html[-22:-1]
    print(new_m3m8_url)
    new_m3m8_html = requests.get(url=new_m3m8_url, headers=header).text
    ts_urls = re.findall(re.compile(',\n(.*?.ts)\n#'), new_m3m8_html)
    print(ts_urls)
    download_mp4(title, ts_urls, num)

def func(star, end):
    urls = get_page_urls(star, end)
    for url in urls:
        download(url, star)

if __name__ == "__main__":
    thread1 = myThread(13, "Thread-1", 14)
    thread2 = myThread(16, "Thread-2", 18)
    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()
    print("退出主线程")
The fifth spider: downloading songs. Again it mostly comes down to finding the right links, but along the way I did pick up some json: a parsed response is basically nested dicts and lists, which is very convenient.
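A tiny illustration of that (the response shape here is made up for the example, not Kugou's actual schema):

import json

text = '{"data": {"song": [{"hash": "abc", "AlbumID": "1"}]}}'
res = json.loads(text)                   # str -> nested dicts and lists
print(res['data']['song'][0]['hash'])    # abc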
It lets you type in any name and download every song shown in the results, though only some of them actually have a playable preview URL.
import requests
import re
import os
import json

def download_song(title, song_list, song_name):
    os.mkdir(title)
    j = 0
    for item in song_list:
        filename = '%s/%s.mp3' % (title, song_name[j])
        print('downloading....%s : NO.%s' % (title, song_name[j]))
        with open(filename, 'wb') as f:
            mp3 = requests.get(item).content
            f.write(mp3)
        j += 1

def request_juger(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/95.0.4638.69 Safari/537.36',
        "Referer": "https://www.kugou.com/"
    }
    try:
        response = requests.get(url=url, headers=header)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        print(e)
    return None

def main():
    name = input()
    base_url = 'https://searchrecommend.kugou.com/get/complex?callback=jQuery112403385798993366811_1636390150231&word=%s&_=1636390150232' % (name)
    text = request_juger(base_url)
    useful = re.match(".*?({.*}).*", text, re.S)
    res = json.loads(useful.group(1))
    list = res['data']['song']
    song_list = []
    song_name = []
    for i in list:
        AlbumID = i['AlbumID']
        hash = i['hash']
        song_url = 'https://wwwapi.kugou.com/yy/index.php?r=play/getdata&callback=jQuery19108914882384086649_1636392409637&hash=%s&dfid=4FIh0T1FDOGg2mC8cp3BaW48&appid=1014&mid=96e0f9a5a8a4d183f0034aa8ab27c2c9&platid=4&album_id=%s&_=1636392409638' % (hash, AlbumID)
        song_text = request_juger(song_url)
        song_useful = re.match(".*?({.*}).*", song_text, re.S)
        song_res = json.loads(song_useful.group(1))
        if (song_res['data']['play_url'] == ''):
            continue
        song_list.append(song_res['data']['play_url'])
        song_name.append(song_res['data']['song_name'])
    print(song_list)
    download_song(name, song_list, song_name)

if __name__ == "__main__":
    main()