{
  "steward": "api-resilience-steward",
  "project": "Azure-AI-RAG-CSharp-Semantic-Kernel-Functions",
  "runDate": "2026-03-21",
  "runId": "2026-03-21T00-00-00",
  "findings": [
    {
      "id": "RESL-TIMEOUT-001",
      "title": "No timeout configured for Azure OpenAI calls",
      "severity": "critical",
      "category": "TIMEOUT",
      "file": "src/ChatAPI/Services/ChatService.cs",
      "line": 49,
      "description": "AzureOpenAIClient is registered with no AzureOpenAIClientOptions specifying a timeout. GetChatMessageContentAsync is called with no enclosing timeout guard or CancellationTokenSource. A slow or hung Azure OpenAI response will block the ASP.NET Core request thread indefinitely, causing thread pool exhaustion under load.",
      "recommendation": "Configure a custom HttpClient with an explicit Timeout when constructing AzureOpenAIClient via AzureOpenAIClientOptions, or wrap GetChatMessageContentAsync with a CancellationTokenSource using a configured timeout (e.g., 30 seconds). Prefer propagating HttpContext.RequestAborted through the stack.",
      "status": "open"
    },
    {
      "id": "RESL-TIMEOUT-002",
      "title": "No timeout configured for Azure AI Search calls",
      "severity": "critical",
      "category": "TIMEOUT",
      "file": "src/ChatAPI/Data/AISearchData.cs",
      "line": 56,
      "description": "SearchClient is registered as a singleton with no SearchClientOptions specifying a network timeout. RetrieveDocumentationAsync calls SearchAsync with no timeout guard. A degraded Azure AI Search service will hold all threads calling this method indefinitely.",
      "recommendation": "Pass SearchClientOptions with Retry.NetworkTimeout configured (e.g., 15 seconds) when constructing SearchClient in Program.cs.",
      "status": "open"
    },
    {
      "id": "RESL-CANCEL-001",
      "title": "CancellationToken not propagated anywhere in the service layer",
      "severity": "critical",
      "category": "CANCEL",
      "file": "src/ChatAPI/Controllers/ChatController.cs",
      "line": 20,
      "description": "ChatController.Post does not accept or use CancellationToken. ChatService.GetResponseAsync has no CancellationToken parameter. No downstream async operation receives a cancellation token. Client disconnections do not abort in-flight Azure OpenAI, Cosmos DB, or AI Search operations, causing unnecessary resource consumption and Azure spend.",
      "recommendation": "Add a CancellationToken cancellationToken = default parameter to ChatController.Post, ChatService.GetResponseAsync, and all Data layer async methods. ASP.NET Core binds a CancellationToken action parameter to HttpContext.RequestAborted, so pass that token from the controller to the service and propagate it to all SDK async calls.",
      "status": "open"
    },
    {
      "id": "RESL-RETRY-001",
      "title": "No retry policy on Azure OpenAI chat completion calls",
      "severity": "notable",
      "category": "RETRY",
      "file": "src/ChatAPI/Services/ChatService.cs",
      "line": 49,
      "description": "GetChatMessageContentAsync is called with no retry policy. Azure OpenAI is subject to 429 rate limiting and transient 5xx errors. A single transient failure fails the entire chat request with no recovery attempt.",
      "recommendation": "Apply a retry policy with exponential backoff and jitter (3-5 attempts) targeting 429 and 5xx responses. This can be configured at the AzureOpenAIClient HTTP pipeline level via AzureOpenAIClientOptions.Transport or via Microsoft.Extensions.Http.Resilience.",
      "status": "open"
    },
    {
      "id": "RESL-RETRY-002",
      "title": "No retry policy on Azure OpenAI embedding generation calls",
      "severity": "notable",
      "category": "RETRY",
      "file": "src/ChatAPI/Plugins/AISearchDataPlugin.cs",
      "line": 36,
      "description": "GenerateEmbeddingAsync is called with no retry policy. Transient embedding failures surface as unhandled exceptions from the kernel function, failing the entire RAG lookup.",
      "recommendation": "Apply the same retry policy as RESL-RETRY-001. Both chat completion and embedding share the same AzureOpenAIClient instance, so a single client-level retry configuration covers both.",
      "status": "open"
    },
    {
      "id": "RESL-RETRY-003",
      "title": "No retry policy on Azure AI Search calls",
      "severity": "notable",
      "category": "RETRY",
      "file": "src/ChatAPI/Data/AISearchData.cs",
      "line": 56,
      "description": "SearchAsync is called with no retry policy. Transient Azure AI Search failures (network errors, 429, 503) surface directly to the kernel function caller.",
      "recommendation": "Pass SearchClientOptions with Retry configured (MaxRetries = 3, Mode = RetryMode.Exponential) when constructing SearchClient in Program.cs.",
      "status": "open"
    },
    {
      "id": "RESL-CB-001",
      "title": "No circuit breaker on any critical external dependency",
      "severity": "notable",
      "category": "CB",
      "file": "src/ChatAPI/Program.cs",
      "description": "No circuit breaker is applied to Azure OpenAI, Azure AI Search, or Cosmos DB. During a sustained outage, every incoming request will attempt the failing call, exhausting threads and wasting client budget with no failure isolation.",
      "recommendation": "Add the Microsoft.Extensions.Http.Resilience package and apply AddStandardResilienceHandler() to the Azure OpenAI and AI Search client HTTP pipelines. This provides retry, timeout, and circuit breaker in a single configured pipeline.",
      "status": "open"
    },
    {
      "id": "RESL-CANCEL-002",
      "title": "Azure AI Search SearchAsync called without CancellationToken",
      "severity": "minor",
      "category": "CANCEL",
      "file": "src/ChatAPI/Data/AISearchData.cs",
      "line": 56,
      "description": "SearchAsync and GetResultsAsync are called without passing a CancellationToken. The SDK overloads accept one. This is a minor gap — the root issue is covered by RESL-CANCEL-001.",
      "recommendation": "Add CancellationToken cancellationToken = default to RetrieveDocumentationAsync and pass it to SearchAsync and to the async enumeration of GetResultsAsync (e.g., by calling WithCancellation(cancellationToken) in the await foreach).",
      "status": "open"
    },
    {
      "id": "RESL-CANCEL-003",
      "title": "Cosmos DB operations called without CancellationToken",
      "severity": "minor",
      "category": "CANCEL",
      "file": "src/ChatAPI/Data/ChatHistoryData.cs",
      "description": "CreateItemAsync, ReadItemAsync, and ReadNextAsync calls in ChatHistoryData and ProductData do not receive a CancellationToken. The Cosmos DB SDK supports cancellation on all these calls.",
      "recommendation": "Thread CancellationToken through all Cosmos DB data layer methods and pass it to SDK calls.",
      "status": "open"
    },
    {
      "id": "RESL-HTTPCLIENT-001",
      "title": "No raw new HttpClient() usage detected",
      "severity": "info",
      "category": "HTTPCLIENT",
      "description": "The project uses Azure SDK clients (AzureOpenAIClient, SearchClient, CosmosClient) and Semantic Kernel abstractions throughout. No new HttpClient() instantiation was found, so there is no socket exhaustion risk from direct HttpClient misuse.",
      "status": "open"
    },
    {
      "id": "RESL-COSMOS-001",
      "title": "Cosmos DB SDK built-in retry active but not explicitly tuned",
      "severity": "info",
      "category": "RETRY",
      "file": "src/ChatAPI/Program.cs",
      "line": 23,
      "description": "CosmosClient is registered with only a connection string and no CosmosClientOptions. The SDK defaults to up to 9 retries for rate-limited (HTTP 429) requests, with a cumulative maximum retry wait time of 30 seconds, providing a baseline. Explicit retry tuning is not present. Configuration values are deferred to the CosmosDB Steward and API Config Steward.",
      "status": "open"
    }
  ],
  "summary": {
    "critical": 3,
    "notable": 4,
    "minor": 2,
    "info": 2,
    "total": 11
  }
}
