981 字
5 分钟
爬虫数据 | WTO争端案件数据
这是一段基于 Selenium 库的爬虫代码:它根据 WTO 争端案件页面的网址规律和网页代码结构逐页抓取内容,运行时需要安装谷歌 Chrome 浏览器。
"""Scrape WTO dispute-settlement case summaries into an Excel workbook.

Each case page (``ds<N>_e.htm``) is loaded in Chrome through Selenium, the
summary fields are pulled out of the page HTML with regular expressions, and
one row per case is written to ``WTO_cases.xlsx`` inside
``project_directory``.
"""
import html
import os
import re
from typing import Dict, Iterable, List, Optional

import pandas as pd

# Output directory. The value is the article's placeholder ("set this to your
# own path"); replace it before running.
project_directory = "设置为你的路径"

# Column order of the exported sheet.
columns = ["DS", "Title", "Consultations requested", "Complainant", "Respondent", "Agreements cited", "Third Parties (original proceedings)"]

# Inclusive range of case numbers to scrape. Lower LAST_CASE (e.g. to 10) for
# a quick trial run.
FIRST_CASE = 1
LAST_CASE = 644

CASE_URL_TEMPLATE = "https://www.wto.org/english/tratop_e/dispu_e/cases_e/ds{case_number}_e.htm"

# One pattern per exported field, compiled once instead of once per page.
# The Complainant/Respondent labels accept optional whitespace or "&nbsp;"
# before the closing tag, so both label variants used by the original
# patterns ("Complainant: " and "Respondent:") keep matching.
_FIELD_PATTERNS = {
    "Title": re.compile(r"<h1>(.*?)</h1>", re.DOTALL),
    "Consultations requested": re.compile(r"<td>.*?Consultations requested.*?</td><td>(.*?)</td>"),
    "Complainant": re.compile(r"<td>Complainant:(?:\s|&nbsp;)*</td><td>(.*?)</td>"),
    "Respondent": re.compile(r"<td>Respondent:(?:\s|&nbsp;)*</td><td>(.*?)</td>"),
    "Agreements cited": re.compile(r"<td>Agreements cited:.*?</td><td>(.*?)</td>", re.DOTALL),
    "Third Parties (original proceedings)": re.compile(r"<td>Third Parties.*?original proceedings.*?</td><td>(.*?)</td>"),
}

# Markup removed from a captured fragment before it is cleaned: the DS-number
# badge in the title, and every tag in the agreements cell.
_FIELD_MARKUP = {
    "Title": re.compile(r'<span class="dsnumber">DS.*?:</span>'),
    "Agreements cited": re.compile(r'<[^>]+>'),
}


def clean_cell(fragment: str) -> str:
    """Return *fragment* with HTML entities decoded and whitespace tidied.

    Entities such as ``&nbsp;`` and ``&amp;`` are decoded, non-breaking
    spaces become ordinary spaces, and leading/trailing whitespace is removed.
    """
    return html.unescape(fragment).replace("\xa0", " ").strip()


def extract_case_row(page_source: str, case_number: int) -> Dict[str, Optional[str]]:
    """Build one output row from the HTML of a single case page.

    Args:
        page_source: Full HTML of the case page.
        case_number: Numeric dispute id; stored in the row as ``DS<number>``.

    Returns:
        A dict keyed like ``columns``. A field whose pattern does not match
        the page is ``None``.
    """
    row: Dict[str, Optional[str]] = {"DS": f"DS{case_number}"}
    for field, pattern in _FIELD_PATTERNS.items():
        match = pattern.search(page_source)
        if match is None:
            row[field] = None
            continue
        fragment = match.group(1)
        markup = _FIELD_MARKUP.get(field)
        if markup is not None:
            # Strip tags before decoding entities so a decoded "&lt;" is
            # never mistaken for markup.
            fragment = markup.sub("", fragment)
        row[field] = clean_cell(fragment)
    return row


def scrape_cases(case_numbers: Iterable[int]) -> List[Dict[str, Optional[str]]]:
    """Load every case page in Chrome and return the extracted rows.

    The browser is always closed, even when a page load raises; in that case
    the exception propagates to the caller.
    """
    # Imported lazily so the parsing helpers above can be imported and tested
    # on a machine without Selenium or a browser installed.
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By  # noqa: F401  (kept from the original import list; unused here)
    from webdriver_manager.chrome import ChromeDriverManager

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    rows: List[Dict[str, Optional[str]]] = []
    try:
        # The implicit wait is a session-wide setting for element lookups, so
        # it is configured once instead of on every iteration. It does not
        # delay page_source; driver.get() itself waits for the page to load
        # under the default page-load strategy.
        driver.implicitly_wait(20)
        for case_number in case_numbers:
            driver.get(CASE_URL_TEMPLATE.format(case_number=case_number))
            rows.append(extract_case_row(driver.page_source, case_number))
    finally:
        driver.quit()
    return rows


def main() -> None:
    """Scrape the configured case range and save it as an Excel file."""
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(project_directory, exist_ok=True)

    rows = scrape_cases(range(FIRST_CASE, LAST_CASE + 1))

    # Build the frame once from all rows; concatenating inside the loop
    # copies the whole frame on every iteration.
    df = pd.DataFrame(rows, columns=columns)

    excel_path = os.path.join(project_directory, 'WTO_cases.xlsx')
    df.to_excel(excel_path, index=False)
    print(f"Data has been saved to {excel_path}")


if __name__ == "__main__":
    main()

# Structure of the resulting Excel sheet (sample rows follow):
| DS | Title | Consultations requested | Complainant | Respondent | Agreements cited | Third Parties |
|---|---|---|---|---|---|---|
| DS1 | Malaysia — Prohibition of Imports of Polyethylene and Polypropylene | 10 January 1995 | Singapore | Malaysia | | - |
| DS2 | United States — Standards for Reformulated and Conventional Gasoline | 24 January 1995 | Venezuela, Bolivarian Republic of | United States | | Australia; Canada; European Communities; Norway |
| DS3 | Korea — Measures Concerning the Testing and Inspection of Agricultural Products | 4 April 1995 | United States | Korea, Republic of | | - |
| DS4 | United States — Standards for Reformulated and Conventional Gasoline | 10 April 1995 | Brazil | United States | | - |
| DS5 | Korea — Measures Concerning the Shelf-Life of Products | 3 May 1995 | United States | Korea, Republic of | | - |
| DS6 | United States — Imposition of Import Duties on Automobiles from Japan under Sections 301 and 304 of the Trade act of 1974 | 17 May 1995 | Japan | United States | | - |
| DS7 | European Communities — Trade Description of Scallops | 19 May 1995 | Canada | European Communities | | Australia; Chile; Iceland; Japan; Peru; United States |
| DS8 | Japan — Taxes on Alcoholic Beverages | 21 June 1995 | European Communities | Japan | | - |
| DS9 | European Communities — Duties on Imports of Cereals | 30 June 1995 | Canada | European Communities | | - |
| DS10 | Japan — Taxes on Alcoholic Beverages | 7 July 1995 | Canada | Japan | | - |