feat: 爬取漏洞依赖并对版本信息格式进行转换

This commit is contained in:
dqy
2024-04-22 17:06:12 +08:00
parent 5993a14368
commit 2c844c8ed1
4 changed files with 3317 additions and 0 deletions

62
crawler/crawler.py Normal file
View File

@@ -0,0 +1,62 @@
import requests
from bs4 import BeautifulSoup
def fetch_html(url):
    """Fetch the HTML content of *url*.

    Returns the response body as text when the server answers with
    HTTP 200, and ``None`` on any other status code or on a network
    error (connection failure, timeout, ...) so the caller's existing
    "no content -> stop" handling keeps working.
    """
    try:
        # A timeout keeps the crawler from hanging forever on a stalled
        # connection; 30s is generous for a single listing page.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Treat network-level failures the same as a bad status code.
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_html(html):
    """Parse *html* and collect, for each row of the vulnerability table,
    the text of every ``<a>`` tag in the second ``<td>`` paired with the
    list of all ``<span>`` texts from that same cell.

    Returns a list of ``(link_text, [span_texts])`` tuples; empty when
    the expected table is absent.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id="sortable-table")
    results = []
    if table:
        # Guard: some pages render the table without an explicit <tbody>,
        # in which case the original code raised AttributeError on None.
        body = table.find("tbody")
        rows = body.find_all("tr") if body else table.find_all("tr")
        for row in rows:
            tds = row.find_all("td")
            if len(tds) >= 2:
                a_tags = tds[1].find_all("a")
                span_tags = tds[1].find_all("span")
                spans = [span.text.strip() for span in span_tags]
                for a_tag in a_tags:
                    # Each link in the cell shares the cell's span texts
                    # (package name paired with its version ranges).
                    results.append((a_tag.text.strip(), spans))
    return results
def save_results_to_file(results, filename):
    """Append extracted (package, version-ranges) records to a TXT file.

    Each record is written as a "Package Name:" line, a "Version Ranges:"
    line (comma-joined), and a dashed separator line.
    """
    separator = "-" * 50
    lines = []
    for package_name, version_ranges in results:
        lines.append(f"Package Name: {package_name}\n")
        lines.append("Version Ranges: " + ", ".join(version_ranges) + "\n")
        lines.append(separator + "\n")
    # Append mode so repeated crawler runs accumulate instead of overwrite.
    with open(filename, "a", encoding="utf-8") as file:
        file.writelines(lines)
def main():
    """Crawl consecutive Snyk pip-vulnerability listing pages.

    Walks page 1, 2, 3, ... until a page fails to fetch or yields no
    rows, appending every extracted record to 'extracted_data.txt'.
    """
    base_url = "https://security.snyk.io/vuln/pip/"
    page = 1
    while True:
        page_url = f"{base_url}{page}"
        print(f"Fetching data from {page_url}")
        page_html = fetch_html(page_url)
        if not page_html:
            # Either the site ran out of pages or the request failed.
            print("No more data found or failed to fetch.")
            break
        records = parse_html(page_html)
        if not records:
            print("No relevant data found on page.")
            break
        save_results_to_file(records, "extracted_data.txt")
        page += 1
    print("Results have been saved to 'extracted_data.txt'.")


if __name__ == "__main__":
    main()