import requests
from bs4 import BeautifulSoup


def fetch_html(url):
    """Fetch the HTML content of the given URL, or return None on failure."""
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_html(html):
    """Parse the HTML and collect, for each tr, the text of every a and
    span tag inside the second td."""
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id="sortable-table")
    results = []
    if table:
        tbody = table.find("tbody")
        rows = tbody.find_all("tr") if tbody else []
        for row in rows:
            tds = row.find_all("td")
            if len(tds) >= 2:
                a_tags = tds[1].find_all("a")
                span_tags = tds[1].find_all("span")
                spans = [span.text.strip() for span in span_tags]
                for a_tag in a_tags:
                    results.append((a_tag.text.strip(), spans))
    return results


def save_results_to_file(results, filename):
    """Append the extracted data to a TXT file."""
    with open(filename, "a", encoding="utf-8") as file:  # append mode: data accumulates across pages and runs
        for package_name, version_ranges in results:
            file.write(f"Package Name: {package_name}\n")
            file.write("Version Ranges: " + ", ".join(version_ranges) + "\n")
            file.write("-" * 50 + "\n")  # separator between entries for clarity


def main():
    base_url = "https://security.snyk.io/vuln/pip/"
    page_number = 1
    while True:
        url = f"{base_url}{page_number}"
        print(f"Fetching data from {url}")
        html_content = fetch_html(url)
        if not html_content:
            print("No more data found or failed to fetch.")
            break
        extracted_data = parse_html(html_content)
        if not extracted_data:
            print("No relevant data found on page.")
            break
        save_results_to_file(extracted_data, "extracted_data.txt")
        page_number += 1
    print("Results have been saved to 'extracted_data.txt'.")


if __name__ == "__main__":
    main()
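

# --- Offline parsing check (hypothetical markup) -----------------------------
# A minimal sketch for exercising parse_html without network access. The
# snippet below is invented to mimic the layout the scraper assumes (a
# table with id "sortable-table" whose second td holds a/span tags); the
# live Snyk page may differ. Call _demo() in place of main() to try it.
_SAMPLE_HTML = """
<table id="sortable-table"><tbody>
  <tr><td>1</td><td><a> requests </a><span> &lt;2.20.0 </span></td></tr>
</tbody></table>
"""


def _demo():
    assert parse_html(_SAMPLE_HTML) == [("requests", ["<2.20.0"])]
    print("parse_html demo passed.")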