在内网隔离的环境下需要安装软件包时,除了自建软件源之外,一个简单可行的方法就是直接在其他设备上下载所需的软件包。但是诸如 CentOS CBS 这样的软件包站点并不提供一键下载整个目录的功能,这个爬虫脚本就是用于解决这个问题的:直接将 url 更换为需要下载的目录页面地址即可使用。

import time

import requests
from bs4 import BeautifulSoup

# Browser-like request headers sent with every request.
# - No 'Host' entry: requests derives it from the target URL, so changing
#   ``url`` to another server keeps working instead of sending a stale
#   'cbs.centos.org' Host header.
# - 'br' is not advertised in Accept-Encoding: requests can only decode
#   Brotli when an optional third-party package is installed, whereas gzip
#   and deflate are always decodable.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.5,zh-HK;q=0.3,en;q=0.2',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:79.0) Gecko/20100101 Firefox/79.0',
}
# Assign url to the address of the directory page whose files you need to download
url = 'https://cbs.centos.org/kojifiles/packages/ceph/12.2.5/0.el7/aarch64/'


def get_file_list(skip=13):
    """Return the link targets found on the directory page at ``url``.

    The page is fetched through the module-level ``session`` (created by the
    script entry point), so that name must exist before this is called.

    Args:
        skip: Number of leading ``<a>`` elements to ignore. The default of 13
            is the value the script has always used for the configured page
            (presumably its navigation/header links -- verify when pointing
            ``url`` at a different site).

    Returns:
        A list with the ``href`` value of every remaining anchor, in page
        order. Anchors that carry no ``href`` attribute are left out.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = session.get(url)
    # Fail loudly on an HTTP error instead of parsing the error page as if
    # it were a directory listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # print(soup)
    anchors = soup.find_all('a')[skip:]
    # Named anchors (<a name="...">) have no href; indexing them would raise
    # KeyError, so they are skipped.
    return [a['href'] for a in anchors if a.get('href') is not None]


def format_float(num):
    """Render ``num`` as fixed-point text with exactly two decimal places."""
    two_decimals = '.2f'
    return format(num, two_decimals)

def download_file(name, u, s):
    """Stream the resource at ``u`` into the local file ``name``.

    While downloading, a progress line is printed about every two seconds:
    the percentage done when the server reports a size (otherwise the amount
    received so far) followed by the transfer speed.

    Args:
        name: Path of the local file to write; created or overwritten.
        u: URL of the resource to download.
        s: Object offering a requests-style ``get(url, stream=True)``, such
            as a ``requests.Session``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status; in
            that case no local file is created.
    """
    report_interval = 2  # seconds between two progress lines

    # Both the response and the file are context-managed so the connection
    # and the file handle are released even if the transfer fails midway.
    with s.get(u, stream=True) as r:
        # Check the status before opening the file so an error page is never
        # saved under the package's name.
        r.raise_for_status()

        # Chunked responses carry no Content-Length; treat the size as
        # unknown instead of failing with KeyError.
        raw_length = r.headers.get('content-length')
        length = float(raw_length) if raw_length else 0.0

        count = 0        # bytes written so far
        count_tmp = 0    # bytes written at the time of the previous report
        time1 = time.time()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=64 * 1024):
                if not chunk:
                    continue
                f.write(chunk)
                count += len(chunk)

                elapsed = time.time() - time1
                if elapsed <= report_interval:
                    continue

                # Divide by the time that really passed; a stalled read can
                # make the interval much longer than two seconds.
                speed = (count - count_tmp) / 1024 / 1024 / elapsed
                count_tmp = count
                if length > 0:
                    progress = format_float(count / length * 100) + '%'
                else:
                    progress = format_float(count / 1024 / 1024) + 'M'
                print(name + ': ' + progress + ' Speed: ' + format_float(speed) + 'M/S')
                time1 = time.time()

if __name__ == '__main__':
    # Script entry point: list the directory page at ``url`` and download
    # every linked file into the current working directory.
    import os
    from urllib.parse import unquote, urljoin

    # Relative links resolve against a directory URL, which needs a trailing
    # slash; add it if the configured ``url`` was written without one.
    base_url = url if url.endswith('/') else url + '/'

    # ``session`` has to be a module-level name because get_file_list() reads
    # it as a global. The ``with`` block closes its connections at the end.
    with requests.Session() as session:
        session.headers.update(headers)
        file_list = get_file_list()
        for i in file_list:
            # The href comes from a remote page: decode percent-escapes and
            # keep only the final path component so that a link such as
            # '../x' cannot write outside the working directory.
            local_name = os.path.basename(unquote(i))
            if not local_name:
                # Links ending in '/' are sub-directories, not files.
                continue
            download_file(local_name, urljoin(base_url, i), session)