{
  "steward": "api-telemetry-steward",
  "project": "Azure-AI-RAG-CSharp-Semantic-Kernel-Functions",
  "runDate": "2026-03-21",
  "runId": "2026-03-21T00-00-00",
  "findings": [
    {
      "id": "ATEL-COVER-001",
      "title": "No telemetry on the chat completion flow",
      "severity": "critical",
      "category": "COVER",
      "file": "src/ChatAPI/Services/ChatService.cs",
      "description": "ChatService.GetResponseAsync is the core business operation. It has no try/catch, no span creation, no timing, and no structured event for success or failure. If the LLM call fails, there is no business-level record of the failure.",
      "recommendation": "Wrap the LLM call in a try/catch. Emit a structured event on success (with latency, token counts, session ID) and failure (with error type and session ID). Add an ActivitySource span scoped to GetResponseAsync.",
      "status": "open"
    },
    {
      "id": "ATEL-COVER-002",
      "title": "LLM response content logged in full — potential PII exposure",
      "severity": "critical",
      "category": "COVER",
      "file": "src/ChatAPI/Services/ChatService.cs",
      "line": 55,
      "description": "ChatService.cs:55 logs the full LLM assistant reply and ChatController.cs:26 logs the same serialized JSON result. If a user asks about personal or sensitive information, that content is written verbatim to Application Insights, creating a PII risk in production.",
      "recommendation": "Remove or truncate log statements that emit full response content. Log only metadata: response length, session ID, request ID, and whether the response contained a function call.",
      "status": "open"
    },
    {
      "id": "ATEL-SKTEL-001",
      "title": "Semantic Kernel OTel diagnostics not enabled — no AI operation spans",
      "severity": "critical",
      "category": "SKTEL",
      "file": "src/ChatAPI/Program.cs",
      "line": 65,
      "description": "SK 1.31.0 ships built-in OpenTelemetry support, but the project does not set the SEMANTICKERNEL_EXPERIMENTAL_GENAI_ENABLE_OTEL_DIAGNOSTICS env var and does not register the SK ActivitySource with the OTel pipeline. No AI spans appear in Application Insights.",
      "recommendation": "Register the Semantic Kernel activity sources by calling AddSource(\"Microsoft.SemanticKernel*\") inside the .WithTracing(...) callback of the OTel pipeline, and set SEMANTICKERNEL_EXPERIMENTAL_GENAI_ENABLE_OTEL_DIAGNOSTICS=true.",
      "status": "open"
    },
    {
      "id": "ATEL-COVER-003",
      "title": "No telemetry on session creation",
      "severity": "notable",
      "category": "COVER",
      "file": "src/ChatAPI/Controllers/SessionController.cs",
      "description": "SessionController.GetSession generates a session ID but emits no event. Session creation volume, rate anomalies, and session-to-conversation correlation are not observable.",
      "recommendation": "Emit a session.created structured event with a hashed session ID and timestamp.",
      "status": "open"
    },
    {
      "id": "ATEL-COVER-004",
      "title": "No telemetry on RAG search execution",
      "severity": "notable",
      "category": "COVER",
      "file": "src/ChatAPI/Data/AISearchData.cs",
      "description": "AISearchData.RetrieveDocumentationAsync executes a hybrid semantic + vector search but emits nothing. Result count, search latency, and failure conditions are not recorded.",
      "recommendation": "Emit a search.query.executed event after each search, including result count, latency, and index used.",
      "status": "open"
    },
    {
      "id": "ATEL-SKTEL-002",
      "title": "Token usage not tracked — no cost visibility",
      "severity": "notable",
      "category": "SKTEL",
      "file": "src/ChatAPI/Services/ChatService.cs",
      "line": 49,
      "description": "IChatCompletionService.GetChatMessageContentAsync returns ChatMessageContent whose InnerContent exposes token usage counts. This data is never read. There is no visibility into AI API cost or ability to alert on runaway token consumption.",
      "recommendation": "After the LLM call, extract token usage from response.InnerContent (cast to OpenAI.Chat.ChatCompletion) and log or record as a metric.",
      "status": "open"
    },
    {
      "id": "ATEL-EVENT-001",
      "title": "Error paths use LogInformation instead of LogError",
      "severity": "notable",
      "category": "EVENT",
      "file": "src/ChatAPI/Plugins/AISearchDataPlugin.cs",
      "line": 46,
      "description": "AISearchDataPlugin.ResourceLookup catches exceptions and calls _logger.LogInformation(ex, ...) before re-throwing. Using LogInformation for exception paths means these errors are not surfaced in Application Insights as failures and do not trigger error-rate alerts.",
      "recommendation": "Change all exception-path log calls from LogInformation to LogError in both AISearchDataPlugin and ProductDataPlugin.",
      "status": "open"
    },
    {
      "id": "ATEL-COVER-005",
      "title": "No error telemetry on chat completion failure path",
      "severity": "notable",
      "category": "COVER",
      "file": "src/ChatAPI/Services/ChatService.cs",
      "description": "ChatService.GetResponseAsync has no try/catch block. If the LLM call throws, the exception propagates unhandled. Azure Monitor captures the HTTP 500 but no business-level error event records the session, error type, or context.",
      "recommendation": "Add try/catch to GetResponseAsync. On failure, emit a structured error event with session ID and error type (not user question content).",
      "status": "open"
    },
    {
      "id": "ATEL-EVENT-002",
      "title": "Session ID never tagged on logs or spans",
      "severity": "minor",
      "category": "EVENT",
      "file": "src/ChatAPI/Services/ChatService.cs",
      "description": "sessionId is passed into GetResponseAsync but never added to a logging scope or OTel span attribute. All log statements in ChatService and downstream are disconnected from session context, making conversation-level correlation impossible.",
      "recommendation": "Wrap the body of GetResponseAsync in using (_logger.BeginScope(new Dictionary<string, object> { [\"SessionId\"] = sessionId })) so session context is attached to every log statement inside it and the scope is disposed on exit. Also tag the current span, e.g. Activity.Current?.SetTag(\"session.id\", sessionId), so traces carry the same context.",
      "status": "open"
    },
    {
      "id": "ATEL-METRIC-001",
      "title": "No custom business metrics defined",
      "severity": "minor",
      "category": "METRIC",
      "description": "No Meter, Counter, or Histogram is defined anywhere in the ChatAPI. Business metrics for chat volume, error rates, token usage, and search latency are entirely absent.",
      "recommendation": "Define a ChatMetrics class using System.Diagnostics.Metrics.Meter with counters for chat requests and errors, and histograms for LLM and search latency. Register as a singleton.",
      "status": "open"
    },
    {
      "id": "ATEL-EVENT-003",
      "title": "Log messages use vague, non-semantic names",
      "severity": "minor",
      "category": "EVENT",
      "description": "Existing log messages ('Chat History Count {count}', 'Init Chat History', 'Response {response}') are diagnostic and do not follow a semantic, dot-delimited noun-verb naming convention with a past-tense verb (for example 'chat.response.completed'). They cannot serve as structured business events.",
      "recommendation": "Standardize log message names on the dot-delimited noun-verb pattern with a past-tense verb: 'chat.history.initialized', 'chat.response.completed', 'product.lookup.completed'.",
      "status": "open"
    },
    {
      "id": "ATEL-INFRA-001",
      "title": "Azure Monitor / OpenTelemetry infrastructure correctly wired",
      "severity": "info",
      "category": "INFRA",
      "file": "src/ChatAPI/Program.cs",
      "line": 65,
      "description": "Program.cs registers Azure.Monitor.OpenTelemetry.AspNetCore via UseAzureMonitor() with DefaultAzureCredential. This provides automatic HTTP request traces, Azure SDK dependency spans, and exception telemetry. The OTel foundation is sound.",
      "recommendation": "No action required. Build business event telemetry on top of this foundation.",
      "status": "open"
    },
    {
      "id": "ATEL-SKTEL-003",
      "title": "SK 1.31.0 supports OTel diagnostics via experimental feature flag",
      "severity": "info",
      "category": "SKTEL",
      "file": "src/ChatAPI/ChatAPI.csproj",
      "description": "Semantic Kernel 1.31.0 includes built-in OpenTelemetry support under the Microsoft.SemanticKernel.Diagnostics activity source. Enabling it provides automatic spans for chat completion, function calls, and prompt rendering at low implementation cost.",
      "recommendation": "Enable by setting SEMANTICKERNEL_EXPERIMENTAL_GENAI_ENABLE_OTEL_DIAGNOSTICS=true and registering the SK activity source (see ATEL-SKTEL-001 for implementation details).",
      "status": "open"
    }
  ],
  "summary": {
    "critical": 3,
    "notable": 5,
    "minor": 3,
    "info": 2,
    "total": 13
  }
}