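"""Scrape Snyk's paginated vulnerability listing for pip packages and append
each package name with its reported version ranges to a TXT file."""
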
import requests
from bs4 import BeautifulSoup


def fetch_html(url):
    """Fetch the HTML content from the given URL."""
    # Some sites reject the default requests User-Agent; pass browser-like
    # headers here if responses start failing.
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        return response.text
    return None


def parse_html(html):
    """Parse the HTML and collect the text of every a and span tag inside
    the second td of each tr."""
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id="sortable-table")
    results = []
    if table:
        body = table.find("tbody") or table  # tolerate a missing tbody
        for row in body.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) >= 2:
                a_tags = tds[1].find_all("a")
                span_tags = tds[1].find_all("span")
                spans = [span.text.strip() for span in span_tags]
                for a_tag in a_tags:
                    results.append((a_tag.text.strip(), spans))
    return results


def save_results_to_file(results, filename):
    """Append the extracted data to a TXT file."""
    with open(filename, "a", encoding="utf-8") as file:  # append mode
        for package_name, version_ranges in results:
            file.write(f"Package Name: {package_name}\n")
            file.write("Version Ranges: " + ", ".join(version_ranges) + "\n")
            file.write("-" * 50 + "\n")  # separator between entries


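# Note: main() below assumes Snyk paginates pip vulnerabilities at
# /vuln/pip/<page-number> and that the first failed or empty page marks the end.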
def main():
    base_url = "https://security.snyk.io/vuln/pip/"
    page_number = 1
    while True:
        url = f"{base_url}{page_number}"
        print(f"Fetching data from {url}")
        html_content = fetch_html(url)
        if not html_content:
            print("No more data found or failed to fetch.")
            break
        extracted_data = parse_html(html_content)
        if not extracted_data:
            print("No relevant data found on page.")
            break
        save_results_to_file(extracted_data, "extracted_data.txt")
        page_number += 1
    print("Results have been saved to 'extracted_data.txt'.")


if __name__ == "__main__":
    main()
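

# Illustrative layout of extracted_data.txt (hypothetical values):
# Package Name: example-package
# Version Ranges: <1.2.3, >=2.0.0 <2.1.1
# --------------------------------------------------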