# BackDoorBuster/crawler/crawler.py

import requests
from bs4 import BeautifulSoup
def fetch_html(url, timeout=10):
    """Fetch the HTML content at the given URL.

    Args:
        url: The URL to request.
        timeout: Seconds to wait for the server before giving up
            (default 10; prevents the crawler from hanging forever).

    Returns:
        The response body as text on HTTP 200, otherwise None
        (including on timeouts and network errors).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat network failures the same as a missing page: the caller
        # (main) interprets None as "stop crawling" rather than crashing.
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_html(html):
    """Extract link/span text pairs from the vulnerability listing table.

    For each <tr> in the table with id "sortable-table", takes the second
    <td> and pairs the text of every <a> tag in that cell with the list of
    texts of all <span> tags in the same cell.

    Args:
        html: Raw HTML document as a string.

    Returns:
        A list of (a_text, [span_texts, ...]) tuples; empty when the
        expected table is absent or has no qualifying rows.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id="sortable-table")
    results = []
    if table:
        # Some pages may omit <tbody>: table.find("tbody") would be None
        # and .find_all on it would raise AttributeError. Fall back to
        # scanning the table element directly in that case.
        body = table.find("tbody") or table
        for row in body.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) >= 2:
                a_tags = tds[1].find_all("a")
                span_tags = tds[1].find_all("span")
                spans = [span.text.strip() for span in span_tags]
                for a_tag in a_tags:
                    results.append((a_tag.text.strip(), spans))
    return results
def save_results_to_file(results, filename):
    """Append extracted package records to a TXT file.

    Args:
        results: Iterable of (package_name, version_ranges) tuples, where
            version_ranges is a list of strings.
        filename: Path of the output file; opened in append mode so data
            from successive pages accumulates.
    """
    with open(filename, "a", encoding="utf-8") as out:
        for package_name, version_ranges in results:
            out.write(f"Package Name: {package_name}\n")
            out.write("Version Ranges: " + ", ".join(version_ranges) + "\n")
            # Separator line between records for readability.
            out.write("-" * 50 + "\n")
def main():
    """Crawl consecutive Snyk pip vulnerability listing pages.

    Starting at page 1, fetches each page, extracts package/version data,
    and appends it to 'extracted_data.txt'. Stops when a page fails to
    fetch or yields no relevant data.
    """
    base_url = "https://security.snyk.io/vuln/pip/"
    page = 1
    crawling = True
    while crawling:
        url = f"{base_url}{page}"
        print(f"Fetching data from {url}")
        html_content = fetch_html(url)
        if not html_content:
            print("No more data found or failed to fetch.")
            crawling = False
        else:
            extracted_data = parse_html(html_content)
            if not extracted_data:
                print("No relevant data found on page.")
                crawling = False
            else:
                save_results_to_file(extracted_data, "extracted_data.txt")
                page += 1
    print("Results have been saved to 'extracted_data.txt'.")


if __name__ == "__main__":
    main()