feat: crawl vulnerability dependencies and convert the version info format
crawler/crawler.py | 62 | Normal file
@@ -0,0 +1,62 @@
import requests
from bs4 import BeautifulSoup


def fetch_html(url):
    """Fetch the HTML content from the given URL."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        return None


def parse_html(html):
    """Parse the HTML and collect the text of every a and span tag inside the second td of each tr."""
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id="sortable-table")
    results = []
    if table:
        rows = table.find("tbody").find_all("tr")
        for row in rows:
            tds = row.find_all("td")
            if len(tds) >= 2:
                a_tags = tds[1].find_all("a")
                span_tags = tds[1].find_all("span")
                spans = [span.text.strip() for span in span_tags]
                for a_tag in a_tags:
                    results.append((a_tag.text.strip(), spans))
    return results


def save_results_to_file(results, filename):
    """Save the extracted data to a TXT file."""
    with open(filename, "a", encoding="utf-8") as file:  # Append mode
        for data in results:
            package_name, version_ranges = data
            file.write(f"Package Name: {package_name}\n")
            file.write("Version Ranges: " + ", ".join(version_ranges) + "\n")
            file.write("-" * 50 + "\n")  # Adds a separator for clarity


def main():
    base_url = "https://security.snyk.io/vuln/pip/"
    page_number = 1
    while True:
        url = f"{base_url}{page_number}"
        print(f"Fetching data from {url}")
        html_content = fetch_html(url)
        if not html_content:
            print("No more data found or failed to fetch.")
            break
        extracted_data = parse_html(html_content)
        if not extracted_data:
            print("No relevant data found on page.")
            break
        save_results_to_file(extracted_data, "extracted_data.txt")
        page_number += 1
    print("Results have been saved to 'extracted_data.txt'.")


if __name__ == "__main__":
    main()
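For reference, below is a minimal sketch (not part of this commit) of how a downstream conversion step might read the records that save_results_to_file appends to extracted_data.txt back into a dict keyed by package name. The parse_extracted_file name is hypothetical, and the split on "," assumes individual version ranges contain no embedded commas.

# Hypothetical helper (not in this commit): load the records written by
# save_results_to_file back into {package_name: [version_range, ...]}.
def parse_extracted_file(filename="extracted_data.txt"):
    packages = {}
    current_name = None
    with open(filename, encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line.startswith("Package Name: "):
                current_name = line[len("Package Name: "):]
            elif line.startswith("Version Ranges: ") and current_name is not None:
                # Assumes individual ranges contain no commas; interval-style
                # ranges such as "[1.0,2.0)" would need a smarter split.
                raw = line[len("Version Ranges: "):]
                ranges = [r.strip() for r in raw.split(",") if r.strip()]
                packages.setdefault(current_name, []).extend(ranges)
            # "-----" separator lines simply end the current record.
    return packages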