Beam includes an optional `loader` argument that you can add to a REST API, Task Queue, or Scheduled Job. A loader is a function that runs exactly once when your container first starts.

app.py
# The loader function runs once when the container starts
@app.rest_api(
    loader=your_loader_function,
)
def your_handler(**inputs):
    ...

Anything returned from the loader can be retrieved in your handler through the `context` key of the inputs dictionary.

run.py
from beam import App, Runtime
from transformers import LlamaForCausalLM, LlamaTokenizer

app = App(
    name="loader-example",
    runtime=Runtime(),
)

# This function runs once when the container first starts
def load_models():
    model_id = "psmathur/orca_mini_3b"

    tokenizer = LlamaTokenizer.from_pretrained(model_id)
    model = LlamaForCausalLM.from_pretrained(model_id)

    return tokenizer, model


@app.rest_api(loader=load_models)
def generate(**inputs):
    # The loaded tokenizer and model are passed in through the `context` field
    tokenizer, model = inputs["context"]

    # Generate a completion (this assumes the request body includes a `prompt` field)
    input_ids = tokenizer(inputs["prompt"], return_tensors="pt").input_ids
    output = model.generate(input_ids, max_new_tokens=128)

    return {"generated_text": tokenizer.decode(output[0], skip_special_tokens=True)}
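
For contrast, here's a sketch of the same endpoint without a loader (the `generate_slow` handler is hypothetical, shown only for illustration). Since there is no loader, the model weights would be re-loaded on every request, adding significant latency each time:

@app.rest_api()
def generate_slow(**inputs):
    # Without a loader, these lines run on every request,
    # re-loading multi-gigabyte weights each time
    tokenizer = LlamaTokenizer.from_pretrained("psmathur/orca_mini_3b")
    model = LlamaForCausalLM.from_pretrained("psmathur/orca_mini_3b")
    ...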

Example: Caching a Hugging Face Model

import os

from beam import App, Runtime, Image, Volume
from transformers import pipeline


app = App(
    name="sentiment-analysis",
    runtime=Runtime(
        cpu=1,
        memory="8Gi",
        image=Image(
            python_version="python3.9",
            python_packages=["transformers", "torch"],
        ),
    ),
    volumes=[Volume(name="cached_models", path="./cached_models")],
)

# Cache models in a Beam storage volume
os.environ["TRANSFORMERS_CACHE"] = "./cached_models"

# This function runs once when the container boots
def load_models():
    model = pipeline(
        "sentiment-analysis", model="siebert/sentiment-roberta-large-english"
    )

    return model


@app.rest_api(loader=load_models)
def predict(**inputs):
    # Retrieve cached model from loader
    model = inputs["context"]

    # Inference
    result = model(inputs["text"], truncation=True, top_k=1)
    prediction = {i["label"]: i["score"] for i in result}

    print(prediction)

    return {"prediction": prediction}

Using Loaders with Multiple Workers

If you are scaling out horizontally with multiple workers, the loader function will run once for each worker that starts up.
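
Each worker is a separate process, so each one gets its own copy of whatever the loader returns. Here is a minimal sketch (using the same API as the examples above, with a hypothetical app name) that makes this visible by logging the process ID:

import os

from beam import App, Runtime

app = App(name="multi-worker-example", runtime=Runtime())

# Runs once per worker process at startup
def load():
    print(f"Loader ran in worker process {os.getpid()}")
    return os.getpid()

@app.rest_api(loader=load)
def handler(**inputs):
    # Each worker reports its own PID, confirming one loader run per worker
    return {"worker_pid": inputs["context"]}

Keep this in mind when provisioning memory: anything returned from a loader, such as model weights, is held separately by each worker.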