Fix AI timeout and token limits for thinking model (kimi-k2.5)

- Increase max_tokens from 4096 to 16384 to accommodate reasoning tokens
- Increase timeout from 90s to 180s for thinking model latency
- Add logging for response diagnostics (content length, reasoning, finish reason)
- Better error message when model exhausts tokens on reasoning
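The content/reasoning distinction the last two bullets describe can be sketched as a standalone helper. This is a minimal illustration, not code from the diff: it assumes an OpenAI-compatible chat completion shape, where thinking models such as kimi-k2.5 may put internal reasoning in a `reasoning_content` field and leave `content` null when the token budget runs out.

```typescript
// Assumed OpenAI-compatible response shape (illustrative, not from the diff)
interface ChatChoice {
  message?: { content?: string | null; reasoning_content?: string | null };
  finish_reason?: string;
}

// Hypothetical helper: pull out usable content, and if there is none,
// report whether tokens were spent on reasoning instead of output
function extractContent(choice: ChatChoice | undefined): string {
  const msg = choice?.message;
  const content = msg?.content || null;
  if (!content) {
    const hint = msg?.reasoning_content
      ? 'model exhausted tokens on reasoning; increase max_tokens or use a non-thinking model'
      : 'empty response from AI API';
    throw new Error(`AI model returned empty content: ${hint}`);
  }
  return content;
}
```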

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 15:50:53 -05:00
parent 18c7989983
commit 36271585d9


@@ -426,9 +426,11 @@ Based on this complete financial picture, provide your investment recommendation
         model,
         messages,
         temperature: 0.3,
-        max_tokens: 4096,
+        // High token limit to accommodate thinking models (e.g. kimi-k2.5)
+        // which use tokens for internal reasoning before generating output
+        max_tokens: 16384,
       }),
-      signal: AbortSignal.timeout(90000), // 90 second timeout
+      signal: AbortSignal.timeout(180000), // 3 minute timeout for thinking models
     });
     if (!response.ok) {
@@ -438,10 +440,15 @@ Based on this complete financial picture, provide your investment recommendation
     }
     const data = await response.json() as any;
-    const content = data.choices?.[0]?.message?.content;
+    const msg = data.choices?.[0]?.message;
+    // Thinking models (kimi-k2.5) may return content in 'content' or
+    // spend all tokens on 'reasoning_content' with content=null
+    const content = msg?.content || null;
+    this.logger.log(`AI response: content=${content ? content.length + ' chars' : 'null'}, reasoning=${msg?.reasoning_content ? 'yes' : 'no'}, finish=${data.choices?.[0]?.finish_reason}`);
     if (!content) {
-      throw new Error('Empty response from AI API');
+      throw new Error('AI model returned empty content — it may have exhausted tokens on reasoning. Try a non-thinking model or increase max_tokens.');
     }
     // Parse the JSON response — handle potential markdown code fences
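The parsing step noted in the final comment above (handling markdown code fences before `JSON.parse`) could look like the following sketch. The helper name and regexes are illustrative assumptions, not part of the commit:

```typescript
// Hypothetical helper: models sometimes wrap JSON output in triple-backtick
// fences (with or without a "json" language tag); strip them before parsing
function parseModelJson(raw: string): unknown {
  const unfenced = raw
    .trim()
    .replace(/^```[a-zA-Z]*\s*/, '') // leading fence, e.g. ```json
    .replace(/\s*```$/, '');         // trailing fence
  return JSON.parse(unfenced);
}
```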