在内网隔离的环境下安装软件包时,除了自建软件源之外,一个简单可行的办法是先在其他能联网的设备上下载所需的软件包。但是 CentOS CBS 等站点的软件包目录并不提供一键下载整个目录的功能,这个爬虫脚本就是用来解决这个问题的:只需把 url 换成目标目录的地址即可使用。
import time
import requests
from bs4 import BeautifulSoup
# Browser-like request headers applied to the session used for all requests.
#
# * No 'Host' entry: the HTTP stack derives it from the URL being fetched, so
#   the script keeps working when `url` is pointed at a different server.
# * 'Accept-Encoding' does not advertise 'br': requests can only decode Brotli
#   when an optional Brotli package is installed, and an undecodable listing
#   page would yield no usable links.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.5,zh-HK;q=0.3,en;q=0.2',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:79.0) Gecko/20100101 Firefox/79.0',
}

# Directory listing page to mirror; replace it with the page you need.
# Keep the trailing slash: each file name is appended directly to this string.
url = 'https://cbs.centos.org/kojifiles/packages/ceph/12.2.5/0.el7/aarch64/'
def get_file_list():
    """Return the href of every file link on the listing page at ``url``.

    Uses the module-level ``session`` to fetch the module-level ``url`` and
    parses the returned HTML with BeautifulSoup.

    Returns:
        list: href strings of the anchors after the leading ones, in page
        order. Anchors with a missing or empty href are skipped.

    Raises:
        requests.HTTPError: if the listing page answers with an error status.
    """
    # The first 13 anchors on the page are not treated as files.
    # NOTE(review): presumably these are navigation / sort / parent-directory
    # links of this particular index layout -- re-check when changing `url`.
    leading_links = 13

    response = session.get(url)
    # Fail loudly rather than scraping "file names" out of an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.find_all('a')[leading_links:]
    # An anchor without a usable href cannot be turned into a download URL.
    return [a['href'] for a in anchors if a.get('href')]
def format_float(num):
    """Render ``num`` as fixed-point text with exactly two decimal places."""
    return format(num, '.2f')
def download_file(name, u, s):
    """Stream the resource at ``u`` into the local file ``name``.

    While downloading, a progress line (percentage and transfer speed) is
    printed roughly every two seconds.

    Args:
        name: Path of the local file to write (opened in binary write mode).
        u: URL to download.
        s: Session-like object whose ``get(u, stream=True)`` returns the
            response to stream.

    Raises:
        requests.HTTPError: if the server answers with an error status; in
            that case no local file is created.
    """
    report_interval = 2  # seconds between two progress lines

    # Both the response and the file are context-managed so the connection
    # and the file handle are released even if the transfer fails midway.
    with s.get(u, stream=True) as r:
        # Do not save an HTTP error body under the package's file name.
        r.raise_for_status()

        # The header is absent for chunked responses; the percentage is then
        # unknown, but the download itself can still proceed.
        try:
            length = float(r.headers['content-length'])
        except (KeyError, ValueError):
            length = 0.0

        count = 0       # bytes written so far
        count_tmp = 0   # bytes written as of the previous progress line
        time1 = time.time()
        with open(name, 'wb') as f:
            # 64 KiB chunks: far fewer loop iterations than tiny chunks.
            for chunk in r.iter_content(chunk_size=64 * 1024):
                if not chunk:
                    continue
                f.write(chunk)
                count += len(chunk)

                elapsed = time.time() - time1
                if elapsed > report_interval:
                    if length > 0:
                        percent = format_float(count / length * 100) + '%'
                    else:
                        percent = '?%'
                    # Divide by the time that really passed, which can be
                    # noticeably more than the interval on a slow link.
                    speed = (count - count_tmp) / 1024 / 1024 / elapsed
                    count_tmp = count
                    print(name + ': ' + percent + ' Speed: ' + format_float(speed) + 'M/S')
                    time1 = time.time()
if __name__ == '__main__':
    # `session` is deliberately bound at module level: get_file_list() reads
    # it as a global. The `with` block closes the session when the run ends,
    # including when a download raises.
    with requests.Session() as session:
        # update() keeps the session's own case-insensitive header mapping
        # instead of replacing it with a plain dict.
        session.headers.update(headers)
        file_list = get_file_list()
        for i in file_list:
            # Each href is used both as the local file name and as the URL
            # suffix appended to the listing URL.
            download_file(i, url + i, session)