Application Real-Time Monitoring Service (ARMS) provides a self-developed agent for Python that collects application telemetry data. The agent automatically instruments your code based on the OpenTelemetry specification, giving you insight into the observability data of your large language model (LLM) applications.
Background information
LLMs are trained on massive datasets to generate human-like responses. However, the following factors can make LLM outputs unpredictable:
Performance drift between training and production environments
Data distribution shifts leading to degraded performance
Outdated training data
Unreliable external data sources
These challenges highlight the need to proactively monitor your LLM applications for output quality degradation.
ARMS monitors your LLM applications through automatic instrumentation provided by the ARMS agent for Python. Its comprehensive trace view gives you deeper insight into metrics such as the inputs and outputs of different operation types and token consumption. For more information, see LLM Trace Explorer.
Supported frameworks
| Framework | PyPI/GitHub repository address | Minimum version | Maximum version |
| --- | --- | --- | --- |
| OpenAI | openai | V1.0.0 | No limits |
| DashScope | dashscope | V1.0.0 | No limits |
| LlamaIndex | llama-index | V0.10.5 | V0.11.0 |
| LangChain | langchain | V0.1.0 | No limits |
| Dify | | V0.8.3 | No limits |
Procedure
Step 1: Install an ARMS agent for Python
Install the agent using the method that matches the environment where your LLM application is deployed.
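For example, if your application runs directly on a host such as an ECS instance, the agent can typically be installed with pip. The following commands are only a sketch of the standard installation flow; see the installation documentation for the exact steps for your environment:

# Install the aliyun-bootstrap tool, then use it to install the ARMS agent for Python.
pip3 install aliyun-bootstrap
aliyun-bootstrap -a install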
Step 2: Start the application with the agent
Run the following command to start your LLM application with the agent attached:
aliyun-instrument python llm_app.py
Replace llm_app.py with the entry file of your application.
If you do not have an LLM application, you can try one of the sample applications in the Demos section.
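If you deploy the application as a container image, you can also bake the agent into the image and start the application through the agent. The following Dockerfile is only a minimal sketch that assumes the pip-based installation described in Step 1; adjust the base image and installation method to match your environment:

FROM python:3.10-slim
WORKDIR /app

# Install the application dependencies.
COPY requirements.txt .
RUN pip3 install -r requirements.txt

# Install the ARMS agent for Python (assumes the pip-based installation from Step 1).
RUN pip3 install aliyun-bootstrap && aliyun-bootstrap -a install

COPY llm_app.py .

# Start the application through the agent so that it is instrumented automatically.
CMD ["aliyun-instrument", "python", "llm_app.py"]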
Step 3: Verify the results
After about one minute, log on to the ARMS console. In the left-side navigation pane, go to the Application List page. If your application appears on the Application List page, it is being monitored.
Demos
OpenAI
llm_app.py
import openai
from fastapi import FastAPI
import uvicorn
app = FastAPI()
@app.get("/")
def call_openai():
client = openai.OpenAI(api_key="sk-xxx')
response = client.chat.completions.create(
model="gpt-4",
messages=[{"role": "user", "content": "Write a haiku."}],
max_tokens=20,
)
return {"data": f"{response}"}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt
fastapi
uvicorn
openai >= 1.0.0
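To try this demo, install the dependencies, replace sk-xxx with a valid OpenAI API key, and start the application with the agent. The following commands are a sketch that assumes the agent from Step 1 is already installed:

pip3 install -r requirements.txt
aliyun-instrument python llm_app.py
# In another terminal, trigger an LLM call so that trace data is generated.
curl http://localhost:8000/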
DashScope
llm_app.py
from http import HTTPStatus
import dashscope
from dashscope import Generation
from fastapi import FastAPI
import uvicorn
app = FastAPI()
@app.get("/")
def call_dashscope():
dashscope.api_key = 'YOUR-DASHSCOPE-API-KEY'
responses = Generation.call(model=Generation.Models.qwen_turbo,
prompt='How is the weather today?')
resp = ""
if responses.status_code == HTTPStatus.OK:
resp = f"Result is: {responses.output}"
else:
resp = f"Failed request_id: {responses.request_id}, status_code: {responses.status_code}, code: {responses.code}, message: {responses.message})
return {"data": f"{resp}"}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt
fastapi
uvicorn
dashscope >= 1.0.0
LlamaIndex
Store your knowledge base documents (such as PDF, TXT, or DOC files) in the ./data directory.
llm_app.py
import time
from fastapi import FastAPI
import uvicorn
import aiohttp
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.dashscope import DashScopeEmbedding
import chromadb
import dashscope
import os
from dotenv import load_dotenv
from llama_index.core.llms import ChatMessage
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.dashscope import DashScope, DashScopeGenerationModels
import random
load_dotenv()
os.environ["DASHSCOPE_API_KEY"] = 'sk-xxxxxx'
dashscope.api_key = 'sk-xxxxxxx'
api_key = 'sk-xxxxxxxx'
llm = DashScope(model_name=DashScopeGenerationModels.QWEN_MAX, api_key=api_key)
# create client and a new collection
chroma_client = chromadb.EphemeralClient()
chroma_collection = chroma_client.create_collection("chapters")
# define embedding function
embed_model = DashScopeEmbedding(model_name="text-embedding-v1", api_key=api_key)
# load documents
filename_fn = lambda filename: {"file_name": filename}
# automatically sets the metadata of each document according to filename_fn
documents = SimpleDirectoryReader(
    "./data/", file_metadata=filename_fn
).load_data()
# set up ChromaVectorStore and load in data
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, embed_model=embed_model
)
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=4,
    verbose=True
)
# configure response synthesizer
response_synthesizer = get_response_synthesizer(llm=llm, response_mode="refine")
# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)
SYSTEM_PROMPT = """
You are a general knowledge chatbot for children. Your task is to respond to questions using information from your knowledge base. Please avoid answering subjective questions.
"""
# Initialize the conversation with a system message
messages = [ChatMessage(role="system", content=SYSTEM_PROMPT)]
app = FastAPI()
async def fetch(question):
    # Call an external URL (LLM_INFRA_URL if set, otherwise a default) to simulate an external dependency.
    url = "https://www.aliyun.com"
    call_url = os.environ.get("LLM_INFRA_URL")
    if call_url is None or call_url == "":
        call_url = url
    else:
        call_url = f"{call_url}?question={question}"
    print(call_url)
    async with aiohttp.ClientSession() as session:
        async with session.get(call_url) as response:
            print(f"GET Status: {response.status}")
            data = await response.text()
            print(f"GET Response JSON: {data}")
            return data

@app.get("/heartbeat")
def heartbeat():
    return {"msg": "ok"}
cnt = 0
@app.get("/query")
async def call(question: str = None):
global cnt
cnt += 1
if cnt == 20:
cnt = 0
raise BaseException("query is over limit,20 ", 401)
# Add user message to the conversation history
message = ChatMessage(role="user", content=question)
# Convert messages into a string
message_string = f"{message.role}:{message.content}"
search = await fetch(question)
print(f"search:{search}")
resp = query_engine.query(message_string)
print(resp)
return {"data": f"{resp}".encode('utf-8').decode('utf-8')}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt
fastapi
uvicorn
numpy==1.23.5
llama-index==0.10.62
llama-index-core==0.10.28
llama-index-embeddings-dashscope==0.1.3
llama-index-llms-dashscope==0.1.2
llama-index-vector-stores-chroma==0.1.6
aiohttp
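To try this demo, place your documents in the ./data directory, replace the placeholder DashScope API keys in llm_app.py, and start the application with the agent. The following commands are a sketch; the question is only an example:

pip3 install -r requirements.txt
aliyun-instrument python llm_app.py
# In another terminal, check that the service is up, then send a query to generate trace data.
curl http://localhost:8000/heartbeat
curl "http://localhost:8000/query?question=Why%20is%20the%20sky%20blue"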
LangChain
llm_app.py
from fastapi import FastAPI
from langchain.llms.fake import FakeListLLM
import uvicorn
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
app = FastAPI()
llm = FakeListLLM(responses=["I'll callback later.", "You 'console' them!"])
template = """Question: {question}
Answer: Let's think step by step."""
prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)
question = "What NFL team won the Super Bowl in the year Justin Beiber was born?"
@app.get("/")
def call_langchain():
res = llm_chain.run(question)
return {"data": res}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt
fastapi
uvicorn
langchain
langchain_community
Dify
Use Dify to create a customized AI-powered Q&A assistant for a website.