Let’s build a simple web scraper that extracts headlines from The New York Times and uses a RoBERTa-based sentiment model from Hugging Face to detect the sentiment of each headline.
from beam import App, Image, Runtime

# Container image: the Python version and the pip packages to install.
image = Image(
    python_version="python3.8",
    python_packages=["bs4", "transformers", "torch"],
)

# Compute resources requested for the app, bundled with the image above.
runtime = Runtime(cpu=1, memory="8Gi", image=image)

# The app object whose `run()` decorator is applied to the scraper function.
app = App(name="web-scraper", runtime=runtime)
@app.run()
# Decorate the scraper entry point with `app.run()`; the function body is
# elided (`...`) in this snippet.
@app.run()
def scrape_nyt(): ...
Show Code
import time
from functools import lru_cache

import requests
from beam import App, Runtime, Image
from bs4 import BeautifulSoup
from transformers import pipeline

# NOTE(review): `requests` is imported above but not listed in
# `python_packages`; it presumably arrives as a transitive dependency of
# `transformers` -- confirm, or list it explicitly.
app = App(
    name="web-scraper",
    runtime=Runtime(
        cpu=1,
        memory="8Gi",
        image=Image(
            python_version="python3.8",
            python_packages=["bs4", "transformers", "torch"],
        ),
    ),
)


@app.run()
def scrape_nyt():
    """Scrape the NYT front page and print the sentiment of every headline.

    Prints each headline followed by its ``{label: score}`` sentiment
    mapping, then a final summary line with the number of headlines whose
    NEGATIVE score exceeds their POSITIVE score.

    Raises:
        requests.RequestException: if the page cannot be fetched, the
            request times out, or the server answers with an error status.
    """
    # A timeout keeps the task from hanging forever on a stalled connection;
    # raise_for_status() avoids silently parsing an error page as "0 headlines".
    res = requests.get("https://www.nytimes.com", timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, "html.parser")

    # Grab all headlines. `string=True` is the current spelling of the
    # deprecated `text=True` filter.
    headlines = soup.find_all("h3", class_="indicate-hover", string=True)
    total_headlines = len(headlines)
    negative_headlines = 0

    # Iterate through each headline
    for h in headlines:
        title = h.get_text()
        print(title)

        sentiment = predict_sentiment(title)
        print(sentiment)

        # Default missing labels to 0.0 so a one-label result cannot trigger
        # a None-vs-float comparison error.
        if sentiment.get("NEGATIVE", 0.0) > sentiment.get("POSITIVE", 0.0):
            negative_headlines += 1

    print(f"{negative_headlines} negative headlines / {total_headlines} total")


@lru_cache(maxsize=1)
def _load_sentiment_model():
    """Build the sentiment-analysis pipeline once and reuse it on later calls."""
    return pipeline(
        "sentiment-analysis", model="siebert/sentiment-roberta-large-english"
    )


def predict_sentiment(title):
    """Return a ``{label: score}`` mapping for the given headline text.

    Args:
        title: The headline string to classify.

    Returns:
        A dict built from the pipeline's ``label``/``score`` entries
        (up to two, because ``top_k=2`` is requested).
    """
    # The pipeline is cached: constructing it per headline would reload the
    # model for every single prediction.
    model = _load_sentiment_model()
    result = model(title, truncation=True, top_k=2)
    return {item["label"]: item["score"] for item in result}
beam run your_file.py:scrape_nyt
(.venv) beta9@MacBook-Air-2 web-scraping % beam-stage run app.py:scrape_nyt i Using cached image. ✓ App initialized. ✓ Container scheduled, logs will appear below. Starting app... Loading handler in 'app.py:scrape_nyt'... Running task: c021040d-aea7-4406-9b5e-79d898f7592a This Hummus Holds Up After 800 Years {'POSITIVE': 0.9985199570655823, 'NEGATIVE': 0.0014800893841311336} Task complete: c021040d-aea7-4406-9b5e-79d898f7592a, duration: 177.36207103729248s
Was this page helpful?