# 2024-04-10 12:00:00
# 2024-04-10 12:00:00
# https://apnews.com/article/trump-hush-money-trial-jury-selection-4b8a0a0a0a0a0a0a0a0a0a0a0a0a0a0
"""Fetch a news article page and print its title and paragraph text as JSON."""

import json

import requests
from bs4 import BeautifulSoup


def scrape_article(url, *, timeout=10):
    """Download *url* and extract the article title and paragraph text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A dict with the keys ``'title'``, ``'article_text'`` and ``'url'``,
        or ``None`` if fetching or parsing failed (the error is printed).
        ``'title'`` is the text of the first ``<h1>`` element, or
        ``'No title found'`` when the page has none. ``'article_text'`` is
        the text of every ``<p>`` element, one paragraph per line.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.

        soup = BeautifulSoup(response.text, 'html.parser')

        # Look the heading up once and reuse the result.
        heading = soup.find('h1')
        title = heading.get_text() if heading else 'No title found'

        # NOTE: collecting every <p> is a coarse selector; it may need
        # adjustment based on the actual HTML of the article.
        paragraphs = [paragraph.get_text() for paragraph in soup.find_all('p')]

        # Separate paragraphs with a real newline (the escape is "\n";
        # "/n" would insert a literal slash followed by the letter n).
        article_text = '\n'.join(paragraphs)

        return {
            'title': title,
            'article_text': article_text,
            'url': url,
        }
    except Exception as e:
        # Deliberate best-effort boundary: callers receive None on any
        # failure instead of an exception.
        print(f"Error scraping {url}: {e}")
        return None


# Example usage
if __name__ == "__main__":
    article_url = "https://apnews.com/article/trump-hush-money-trial-jury-selection-4b8a0a0a0a0a0a0a0a0a0a0a0a0a0a0"
    article_data = scrape_article(article_url)
    if article_data:
        print(json.dumps(article_data, indent=2))

# Reuters