feat: 爬取漏洞依赖并对版本信息格式进行转换

This commit is contained in:
dqy 2024-04-22 17:06:12 +08:00
parent 5993a14368
commit 2c844c8ed1
4 changed files with 3317 additions and 0 deletions

62
crawler/crawler.py Normal file
View File

@ -0,0 +1,62 @@
import requests
from bs4 import BeautifulSoup
def fetch_html(url, timeout=10):
    """Fetch the HTML body served at ``url``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, timeout, or any other request failure).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Report the failure and let the caller treat it like a missing page
        # instead of aborting the whole crawl with a traceback.
        print(f"Request to {url} failed: {exc}")
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_html(html):
    """Extract package names and version ranges from a vulnerability table.

    Looks up the table with id ``sortable-table`` and, for every row that has
    at least two ``<td>`` cells, reads the second cell: the stripped text of
    each ``<a>`` tag is paired with the list of stripped texts of all
    ``<span>`` tags in that same cell.

    Args:
        html: HTML document as a string.

    Returns:
        A list of ``(link_text, span_texts)`` tuples; empty when the table is
        not present or no row qualifies.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id="sortable-table")
    if table is None:
        return []

    # The markup is not guaranteed to contain an explicit <tbody>; search the
    # table itself in that case instead of failing on ``None.find_all``.
    body = table.find("tbody")
    if body is None:
        body = table

    results = []
    for row in body.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            # Header rows or malformed rows carry no package cell.
            continue
        package_cell = cells[1]
        spans = [span.text.strip() for span in package_cell.find_all("span")]
        results.extend(
            (link.text.strip(), spans) for link in package_cell.find_all("a")
        )
    return results
def save_results_to_file(results, filename):
    """Append extracted records to a UTF-8 text file.

    Each record is written as a ``Package Name`` line, a ``Version Ranges``
    line (ranges joined with ", ") and a 50-dash separator line.

    Args:
        results: Iterable of ``(package_name, version_ranges)`` pairs, where
            ``version_ranges`` is an iterable of strings.
        filename: Path of the file to append to; it is created if missing.
    """
    divider = "-" * 50
    # Append mode: existing content of the file is kept across calls.
    with open(filename, "a", encoding="utf-8") as out:
        for package_name, version_ranges in results:
            joined_ranges = ", ".join(version_ranges)
            out.write(
                f"Package Name: {package_name}\n"
                f"Version Ranges: {joined_ranges}\n"
                f"{divider}\n"
            )
def main():
    """Crawl the numbered listing pages and append their data to a file.

    Pages are requested starting from 1. The loop ends at the first page that
    cannot be fetched or that yields no extracted rows.
    """
    output_file = "extracted_data.txt"
    base_url = "https://security.snyk.io/vuln/pip/"
    page_number = 1
    finished = False
    while not finished:
        url = f"{base_url}{page_number}"
        print(f"Fetching data from {url}")
        html_content = fetch_html(url)
        if html_content:
            extracted_data = parse_html(html_content)
            if extracted_data:
                save_results_to_file(extracted_data, output_file)
                page_number += 1
            else:
                print("No relevant data found on page.")
                finished = True
        else:
            print("No more data found or failed to fetch.")
            finished = True
    print("Results have been saved to 'extracted_data.txt'.")


if __name__ == "__main__":
    main()

2700
crawler/extracted_data.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,507 @@
Package Name: apache-airflow
Version Ranges: <2.6.1,>=2.3.0
--------------------------------------------------
Package Name: mlflow
Version Ranges: <2.10.0
--------------------------------------------------
Package Name: torch
Version Ranges: <1.10.0
--------------------------------------------------
Package Name: aiohttp
Version Ranges: <3.9.2,>=1.0.5
--------------------------------------------------
Package Name: keras
Version Ranges: <2.13.1rc0
--------------------------------------------------
Package Name: llama-index
Version Ranges: <0.10.24
--------------------------------------------------
Package Name: zenml
Version Ranges: <0.42.2,<0.43.1,<0.44.4,<0.47.0,>=0.43.0,>=0.44.0,>=0.46.0
--------------------------------------------------
Package Name: gradio
Version Ranges: <4.9.0
--------------------------------------------------
Package Name: bentoml
Version Ranges: <1.2.5
--------------------------------------------------
Package Name: langchain
Version Ranges: <0.0.353
--------------------------------------------------
Package Name: scrapy
Version Ranges: <1.8.4,<2.11.1,>=2.0.0
--------------------------------------------------
Package Name: sqlparse
Version Ranges: <0.5.0
--------------------------------------------------
Package Name: gunicorn
Version Ranges: <22.0.0
--------------------------------------------------
Package Name: magnum
Version Ranges: <14.1.2,<15.0.2,<16.0.2,<17.0.2,>=15.0.0.0rc1,>=16.0.0.0rc1,>=17.0.0.0rc1
--------------------------------------------------
Package Name: nicegui
Version Ranges: <1.4.21,>=1.4.6
--------------------------------------------------
Package Name: idna
Version Ranges: <3.7
--------------------------------------------------
Package Name: llama-index-core
Version Ranges: <0.10.24
--------------------------------------------------
Package Name: litellm
Version Ranges: <1.34.42
--------------------------------------------------
Package Name: roundup
Version Ranges: <1.2.1,<1.4.6,>=1.2.0,>=1.4.0
--------------------------------------------------
Package Name: transformers
Version Ranges: <4.37.0
--------------------------------------------------
Package Name: dirac
Version Ranges: <8.0.37,<9.0.0a22,>=8.0.0,>=8.1.0a1
--------------------------------------------------
Package Name: yt-dlp
Version Ranges: <2024.4.9,>=2021.4.11
--------------------------------------------------
Package Name: rafcon
Version Ranges: <0.15.4
--------------------------------------------------
Package Name: radicale
Version Ranges: <3.0.0
--------------------------------------------------
Package Name: pcaspy
Version Ranges: <0.7.1
--------------------------------------------------
Package Name: holidays
Version Ranges: <0.45
--------------------------------------------------
Package Name: evennia
Version Ranges: <4.0.0
--------------------------------------------------
Package Name: django-json-widget
Version Ranges: <2.0.0
--------------------------------------------------
Package Name: avocado-framework
Version Ranges: <104.0
--------------------------------------------------
Package Name: arrendatools.plantillas
Version Ranges: <0.4.3
--------------------------------------------------
Package Name: amazon-product-details-scraper
Version Ranges: <1.0.4
--------------------------------------------------
Package Name: aiopioneer
Version Ranges: <0.1.5
--------------------------------------------------
Package Name: pgadmin4
Version Ranges: <8.4
--------------------------------------------------
Package Name: pymongo
Version Ranges: <4.6.3
--------------------------------------------------
Package Name: voila
Version Ranges: <0.2.17,<0.3.8,<0.4.4,<0.5.6,>=0.0.2,>=0.3.0a0,>=0.4.0a0,>=0.5.0a0
--------------------------------------------------
Package Name: piccolo-admin
Version Ranges: <1.3.2
--------------------------------------------------
Package Name: cryptoauthlib
Version Ranges: <20200912
--------------------------------------------------
Package Name: mosaicml
Version Ranges: <0.5.0
--------------------------------------------------
Package Name: mlrun
Version Ranges: <1.7.0rc5
--------------------------------------------------
Package Name: eventlet
Version Ranges: <0.34.3
--------------------------------------------------
Package Name: salt
Version Ranges: <3005.5
--------------------------------------------------
Package Name: django-two-factor-auth
Version Ranges: <1.13
--------------------------------------------------
Package Name: pillow
Version Ranges: <10.2.0
--------------------------------------------------
Package Name: ipywidgets
Version Ranges: <5.2.0,>=5.0.0
--------------------------------------------------
Package Name: pylint
Version Ranges: <2.6.1
--------------------------------------------------
Package Name: pytest-cov
Version Ranges: <2.0.0
--------------------------------------------------
Package Name: jupyterhub
Version Ranges: <4.1.0
--------------------------------------------------
Package Name: geonode
Version Ranges: <4.1.0
--------------------------------------------------
Package Name: langchain-core
Version Ranges: <0.1.7
--------------------------------------------------
Package Name: lektor
Version Ranges: <3.3.11
--------------------------------------------------
Package Name: ansys-geometry-core
Version Ranges: <0.3.3,<0.4.12,>=0.3.0,>=0.4.0
--------------------------------------------------
Package Name: nautobot
Version Ranges: <1.6.10,<2.1.2,>=2.0.0
--------------------------------------------------
Package Name: mobsfscan
Version Ranges: <0.3.8
--------------------------------------------------
Package Name: esphome
Version Ranges: <2024.2.1
--------------------------------------------------
Package Name: qiskit-ibm-runtime
Version Ranges: <0.21.2,>=0.1.0
--------------------------------------------------
Package Name: jupyter-server-proxy
Version Ranges: <3.2.3,<4.1.1,>=4.0.0
--------------------------------------------------
Package Name: oauthenticator
Version Ranges: <16.3.0
--------------------------------------------------
Package Name: octoprint
Version Ranges: <1.10.0rc1
--------------------------------------------------
Package Name: wiki
Version Ranges: <0.10.1
--------------------------------------------------
Package Name: astropy
Version Ranges: <5.3.3
--------------------------------------------------
Package Name: yaql
Version Ranges: <3.0.0
--------------------------------------------------
Package Name: black
Version Ranges: <24.3.0
--------------------------------------------------
Package Name: fgr
Version Ranges: <0.4.0
--------------------------------------------------
Package Name: vantage6
Version Ranges: <4.2.0
--------------------------------------------------
Package Name: paddlepaddle
Version Ranges: <2.6.0
--------------------------------------------------
Package Name: mssql-django
Version Ranges: <1.4.1
--------------------------------------------------
Package Name: aiosmtpd
Version Ranges: <1.4.5
--------------------------------------------------
Package Name: ckan
Version Ranges: <2.10.1,<2.9.9,>=2.10.0
--------------------------------------------------
Package Name: langchain-community
Version Ranges: <0.0.27
--------------------------------------------------
Package Name: libosdp
Version Ranges: <3.0.0
--------------------------------------------------
Package Name: weasyprint
Version Ranges: <61.2,>=61.0
--------------------------------------------------
Package Name: apache-superset
Version Ranges: <3.0.3
--------------------------------------------------
Package Name: jwcrypto
Version Ranges: <1.5.6,>=0.5.0
--------------------------------------------------
Package Name: paho-mqtt
Version Ranges: <1.1
--------------------------------------------------
Package Name: rq
Version Ranges: <0.7.1
--------------------------------------------------
Package Name: eth-abi
Version Ranges: <5.0.1
--------------------------------------------------
Package Name: prefect
Version Ranges: <2.15.0
--------------------------------------------------
Package Name: django-treenode
Version Ranges: <0.20.0
--------------------------------------------------
Package Name: hypercorn
Version Ranges: <0.16.0
--------------------------------------------------
Package Name: streamlink
Version Ranges: <5.3.0
--------------------------------------------------
Package Name: kedro
Version Ranges: <0.19.3
--------------------------------------------------
Package Name: pyccel
Version Ranges: <1.9.0
--------------------------------------------------
Package Name: django
Version Ranges: <3.2.24,<4.2.10,<5.0.2,>=3.2,>=4.2,>=5.0
--------------------------------------------------
Package Name: videomass
Version Ranges: <5.0.4
--------------------------------------------------
Package Name: ultralytics
Version Ranges: <8.1.0
--------------------------------------------------
Package Name: intel-extension-for-transformers
Version Ranges: <1.2.2
--------------------------------------------------
Package Name: labgrid
Version Ranges: <23.0.2
--------------------------------------------------
Package Name: docassemble.webapp
Version Ranges: <1.4.97
--------------------------------------------------
Package Name: docassemble.base
Version Ranges: <1.4.97,>=1.4.53
--------------------------------------------------
Package Name: docassemble
Version Ranges: <1.4.97
--------------------------------------------------
Package Name: langchain-experimental
Version Ranges: <0.0.52
--------------------------------------------------
Package Name: label-studio
Version Ranges: <1.10.1
--------------------------------------------------
Package Name: rpyc
Version Ranges: <5.2.1
--------------------------------------------------
Package Name: peewee
Version Ranges: <3.17.1
--------------------------------------------------
Package Name: urllib3-future
Version Ranges: <2.4.902
--------------------------------------------------
Package Name: flask-appbuilder
Version Ranges: <4.3.11
--------------------------------------------------
Package Name: pretix
Version Ranges: <2024.1.1
--------------------------------------------------
Package Name: orjson
Version Ranges: <3.9.15
--------------------------------------------------
Package Name: pypqc
Version Ranges: <0.0.6.1
--------------------------------------------------
Package Name: mjml
Version Ranges: <0.11.0
--------------------------------------------------
Package Name: onnx
Version Ranges: <1.16.0
--------------------------------------------------
Package Name: fastecdsa
Version Ranges: <2.3.2
--------------------------------------------------
Package Name: pymatgen
Version Ranges: <2024.2.20
--------------------------------------------------
Package Name: cryptography
Version Ranges: <42.0.2,>=35.0.0
--------------------------------------------------
Package Name: apache-airflow-providers-mongo
Version Ranges: <4.0.0,>=1.0.0
--------------------------------------------------
Package Name: cbor2
Version Ranges: <5.6.0
--------------------------------------------------
Package Name: intel-extension-for-tensorflow
Version Ranges: <2.13.0.0
--------------------------------------------------
Package Name: tuf
Version Ranges: <3.1.1,>=2.0.0
--------------------------------------------------
Package Name: zpywallet
Version Ranges: <0.6.2
--------------------------------------------------
Package Name: dipdup
Version Ranges: <3.0.2
--------------------------------------------------
Package Name: clip-retrieval
Version Ranges: <2.23.1
--------------------------------------------------
Package Name: procrastinate
Version Ranges: <0.11.0
--------------------------------------------------
Package Name: embedchain
Version Ranges: <0.1.57
--------------------------------------------------
Package Name: miarec-ftpfs
Version Ranges: <2024.1.2
--------------------------------------------------
Package Name: miarec-sshfs
Version Ranges: <2024.1.5
--------------------------------------------------
Package Name: linkml
Version Ranges: <1.5.2
--------------------------------------------------
Package Name: toodledo
Version Ranges: <1.5.0
--------------------------------------------------
Package Name: renku
Version Ranges: <1.11.0
--------------------------------------------------
Package Name: vunnel
Version Ranges: <0.18.0
--------------------------------------------------
Package Name: panda3d
Version Ranges: <1.9.4
--------------------------------------------------
Package Name: ludwig
Version Ranges: <0.7
--------------------------------------------------
Package Name: ethyca-fides
Version Ranges: <2.1.0
--------------------------------------------------
Package Name: hiddifypanel
Version Ranges: <9.0.0.dev30
--------------------------------------------------
Package Name: dgl
Version Ranges: <0.9.0
--------------------------------------------------
Package Name: deephaven-core
Version Ranges: <0.30.0
--------------------------------------------------
Package Name: borgmatic
Version Ranges: <1.8.7
--------------------------------------------------
Package Name: cg
Version Ranges: <26.0.4
--------------------------------------------------
Package Name: ccryptofeed
Version Ranges: <2.2.3
--------------------------------------------------
Package Name: c2cgeoform
Version Ranges: <2.1.26
--------------------------------------------------
Package Name: appfl
Version Ranges: <0.4.0
--------------------------------------------------
Package Name: nonebot2
Version Ranges: <2.2.0,>=2.0.0a16
--------------------------------------------------
Package Name: acryl-datahub
Version Ranges: <0.8.45
--------------------------------------------------
Package Name: bullmq
Version Ranges: <1.15.0
--------------------------------------------------
Package Name: aiobotocore
Version Ranges: <2.9.1
--------------------------------------------------
Package Name: diffoscope
Version Ranges: <256
--------------------------------------------------
Package Name: kinto-attachment
Version Ranges: <6.4.0
--------------------------------------------------
Package Name: bandit
Version Ranges: <1.7.7
--------------------------------------------------
Package Name: dnspython
Version Ranges: <2.6.1
--------------------------------------------------
Package Name: products.sqlalchemyda
Version Ranges: <2.2
--------------------------------------------------
Package Name: clearml
Version Ranges: <1.14.2
--------------------------------------------------
Package Name: tensorflow
Version Ranges: <1.7.1
--------------------------------------------------
Package Name: pyload-ng
Version Ranges: <0.5.0b3.dev78
--------------------------------------------------
Package Name: fastapi
Version Ranges: <0.109.1
--------------------------------------------------
Package Name: python-multipart
Version Ranges: <0.0.7
--------------------------------------------------
Package Name: kinto
Version Ranges: <6.1.0
--------------------------------------------------
Package Name: cupy
Version Ranges: <13.0.0
--------------------------------------------------
Package Name: llama-hub
Version Ranges: <0.0.67
--------------------------------------------------
Package Name: borgbackup
Version Ranges: <1.0.7
--------------------------------------------------
Package Name: snakemake
Version Ranges: <7.9.0
--------------------------------------------------
Package Name: lief
Version Ranges: <0.12.3
--------------------------------------------------
Package Name: checkov
Version Ranges: <2.0.1029
--------------------------------------------------
Package Name: dash-html-components
Version Ranges: <2.0.0
--------------------------------------------------
Package Name: dash
Version Ranges: <2.15.0
--------------------------------------------------
Package Name: dash-core-components
Version Ranges: <2.0.0
--------------------------------------------------
Package Name: glance-store
Version Ranges: <4.3.3,<4.7.0,>=4.4.0
--------------------------------------------------
Package Name: dagster
Version Ranges: <1.1.10
--------------------------------------------------
Package Name: wagtail
Version Ranges: <5.2rc1
--------------------------------------------------
Package Name: pycryptodome
Version Ranges: <3.19.1
--------------------------------------------------
Package Name: celery
Version Ranges: <4.4.0rc5
--------------------------------------------------
Package Name: vantage6-server
Version Ranges: <4.2.0
--------------------------------------------------
Package Name: tuitse-tsusin
Version Ranges: <1.3.2
--------------------------------------------------
Package Name: apache-airflow-providers-cncf-kubernetes
Version Ranges: <7.0.0,>=5.2.0
--------------------------------------------------
Package Name: whoogle-search
Version Ranges: <0.8.4
--------------------------------------------------
Package Name: jupyterlab-lsp
Version Ranges: <5.0.2
--------------------------------------------------
Package Name: changedetection.io
Version Ranges: <0.45.13
--------------------------------------------------
Package Name: jupyterlab
Version Ranges: <4.0.11,>=4.0.0
--------------------------------------------------
Package Name: ansible-core
Version Ranges: <2.14.14,<2.15.9,<2.16.3,>=2.15.0,>=2.16.0
--------------------------------------------------
Package Name: readthedocs-sphinx-search
Version Ranges: <0.3.2
--------------------------------------------------
Package Name: zodb3
Version Ranges: <3.8.3,<3.9.0c2,>=3.8.0a1,>=3.9.0
--------------------------------------------------

48
crawler/transfer.py Normal file
View File

@ -0,0 +1,48 @@
"""转换原有的漏洞文件格式"""
import re
from packaging.specifiers import SpecifierSet
def load_vulnerable_packages(filename):
    """Load vulnerable-package records and convert ranges to specifiers.

    The file is split on the 50-dash separator line. In every block that has
    a ``Package Name:`` line, each ``[start, end)`` range is rewritten as
    ``>=start`` and/or ``<end`` (an empty bound is omitted), and all resulting
    clauses are joined with commas into one ``SpecifierSet``.

    NOTE(review): all ranges of a block end up in a single comma-joined
    specifier set; if the ranges are meant as alternatives, confirm that the
    consumer of this mapping does not interpret the set as a conjunction.
    NOTE(review): a later block with the same package name replaces the
    earlier entry in the returned dict — verify this is intended.

    Args:
        filename: Path to the UTF-8 text file to read.

    Returns:
        Dict mapping package name to its ``SpecifierSet``. Blocks without any
        usable range are left out.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        raw_text = handle.read()

    name_pattern = re.compile(r"Package Name: (.+)")
    interval_pattern = re.compile(r"\[(.*?),\s*(.*?)\)")

    converted = {}
    for record in raw_text.split("--------------------------------------------------"):
        found = name_pattern.search(record)
        if found is None:
            continue

        clauses = []
        for lower, upper in interval_pattern.findall(record):
            bounds = []
            if lower:
                bounds.append(f">={lower}")
            if upper:
                bounds.append(f"<{upper}")
            if bounds:
                # Both bounds of one range stay adjacent: ">=lower,<upper".
                clauses.append(",".join(bounds))

        if clauses:
            converted[found.group(1).strip()] = SpecifierSet(",".join(clauses))
    return converted
def save_vulnerabilities_to_file(vuln_packages, filename):
    """Write the package-to-specifier mapping to a UTF-8 text file.

    The file is overwritten. Each entry becomes a ``Package Name`` line, a
    ``Version Ranges`` line and a 50-dash separator line.

    Args:
        vuln_packages: Mapping of package name to its version specifiers;
            values are rendered with their string form.
        filename: Path of the file to (re)create.
    """
    divider = "-" * 50
    with open(filename, "w", encoding="utf-8") as out:
        for name, spec in vuln_packages.items():
            out.write(
                f"Package Name: {name}\n"
                f"Version Ranges: {spec}\n"
                f"{divider}\n"
            )
def main():
    """Convert ``extracted_data.txt`` into ``trans_extracted_data.txt``."""
    source_path = "extracted_data.txt"
    target_path = "trans_extracted_data.txt"
    save_vulnerabilities_to_file(load_vulnerable_packages(source_path), target_path)


if __name__ == "__main__":
    main()