Refactor thinking and effort control: per-request opt-in (#4515)

bhavyaus · web-flow · commit be24308a3a6e · 2026-03-20T00:46:24.000Z
* Refactor thinking and effort control: make both per-request opt-in via enableThinking and reasoningEffort

- Add reasoning_effort to IChatModelCapabilities from CAPI model list
- Add supportsReasoningEffort on ChatEndpoint/IChatEndpoint
- Add enableThinking and reasoningEffort to IMakeChatRequestOptions
- Build configurationSchema on VS Code LM API models for model picker effort dropdown
- Remove disableThinking, AnthropicThinkingEffort, ResponsesApiReasoningEffort configs
- Thinking is off by default; callers opt in with enableThinking: true
  - Agent mode (toolCallingLoop): enables thinking, passes reasoningEffort from modelConfiguration
  - ResponsesProxy / MessagesProxy: enables thinking
  - Inline chat, utility requests, LM wrapper: thinking off (default)
- Effort level driven by configurationSchema in model picker (no default, user must choose)
- BYOK Anthropic provider reads effort from options.modelConfiguration

* refactor: Improve reasoningEffort handling across multiple components

* Fix tests: add enableThinking: true to Agent location tests, restore maxThinkingBudget cap

* Add defaultReasoningEffort, thread enableThinking/reasoningEffort to subagent loops and proxy endpoints

- Add defaultReasoningEffort to IChatEndpoint (computed per model family: high for Anthropic/Gemini, medium for OpenAI)
- Use defaultReasoningEffort as fallback in responsesApi, messagesApi, and configurationSchema
- Delegate supportsReasoningEffort/defaultReasoningEffort in pass-through endpoints
- Thread enableThinking/reasoningEffort through execution and search subagent loops
- Add enableThinking: true to oaiLanguageModelServer and claudeLanguageModelServer
- Restore maxThinkingBudget cap in customizeCapiBody

* refactor: Adjust thinking budget calculation to use endpoint's maxThinkingBudget

* Address PR feedback: fix comment, validate effort, remove defaultReasoningEffort

- Fix misleading comment in messagesApi (thinking gated by enableThinking, not reasoningEffort)
- Validate reasoningEffort against known values before sending to Messages API
- Remove defaultReasoningEffort from IChatEndpoint and ChatEndpoint
- Compute picker default locally in buildConfigurationSchema (UI concern only)
- Remove effort fallbacks from messagesApi and responsesApi (pure caller control)

* Address PR feedback round 2: validate effort, conditional schema default, location-gated thinking in fetch

- Validate reasoningEffort against known values in messagesApi before sending
- Fix comment to reflect enableThinking gating (not reasoningEffort)
- Remove defaultReasoningEffort from endpoint (picker default is UI-only concern)
- Compute picker default locally in buildConfigurationSchema
- Gate thinking by location in DefaultToolCallingLoop.fetch() (Agent/MessagesProxy only)
- Remove enableThinking from IToolCallingLoopOptions (decision made at fetch level)
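- Validate effort in BYOK anthropicProvider (string type check only; the value is cast to 'low' | 'medium' | 'high' without being checked against that set)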

* refactor: Enable effort picker only for Claude and GPT models in configuration schema
diff --git a/package.json b/package.json
@@ -3224,19 +3224,6 @@
 							"onExp"
 						]
 					},
-					"github.copilot.chat.anthropic.thinking.effort": {
-						"type": "string",
-						"markdownDescription": "%github.copilot.config.anthropic.thinking.effort%",
-						"enum": [
-							"low",
-							"medium",
-							"high"
-						],
-						"default": "high",
-						"tags": [
-							"preview"
-						]
-					},
 					"github.copilot.chat.anthropic.thinking.forceExtendedThinking": {
 						"type": "boolean",
 						"markdownDescription": "%github.copilot.config.anthropic.thinking.forceExtendedThinking%",
@@ -3790,22 +3777,6 @@
 							"split"
 						]
 					},
-					"github.copilot.chat.responsesApiReasoningEffort": {
-						"type": "string",
-						"default": "default",
-						"markdownDescription": "%github.copilot.config.responsesApiReasoningEffort%",
-						"tags": [
-							"experimental",
-							"onExp"
-						],
-						"enum": [
-							"low",
-							"medium",
-							"high",
-							"xhigh",
-							"default"
-						]
-					},
 					"github.copilot.chat.responsesApiReasoningSummary": {
 						"type": "string",
 						"default": "detailed",
diff --git a/package.nls.json b/package.nls.json
@@ -343,12 +343,10 @@
 	"github.copilot.config.anthropic.toolSearchTool.enabled": "Enable tool search tool for Anthropic models. When enabled, tools are dynamically discovered and loaded on-demand using natural language search, reducing context window usage when many tools are available.",
 	"github.copilot.config.anthropic.toolSearchTool.mode": "Controls how tool search works for Anthropic models. 'server' uses Anthropic's built-in regex-based tool search. 'client' uses local embeddings-based semantic search for more accurate tool discovery.",
 	"github.copilot.config.useResponsesApi": "Use the Responses API instead of the Chat Completions API when supported. Enables reasoning and reasoning summaries.\n\n**Note**: This is an experimental feature that is not yet activated for all users.\n\n**Important**: URL API path resolution for custom OpenAI-compatible and Azure models is independent of this setting and fully determined by `url` property of `#github.copilot.chat.customOAIModels#` or `#github.copilot.chat.azureModels#` respectively.",
-	"github.copilot.config.responsesApiReasoningEffort": "Sets the reasoning effort used for the Responses API. Requires `#github.copilot.chat.useResponsesApi#`.",
 	"github.copilot.config.responsesApiReasoningSummary": "Sets the reasoning summary style used for the Responses API. Requires `#github.copilot.chat.useResponsesApi#`.",
 	"github.copilot.config.responsesApiContextManagement.enabled": "Enables context management for the Responses API. Requires `#github.copilot.chat.useResponsesApi#`.",
 	"github.copilot.config.updated53CodexPrompt.enabled": "Enables the updated prompt for gpt-5.3-codex model.",
 	"github.copilot.config.anthropic.thinking.budgetTokens": "Maximum number of tokens to allocate for extended thinking in Anthropic models. Setting this value enables extended thinking. Valid range is `1,024` to `max_tokens-1`.",
-	"github.copilot.config.anthropic.thinking.effort": "Controls how much thinking Claude does for models that support adaptive thinking. `high` (default) provides deep reasoning, `medium` offers a balance of speed and quality, `low` minimizes thinking for simpler tasks.",
 	"github.copilot.config.anthropic.thinking.forceExtendedThinking": "Force extended thinking for models that support adaptive thinking (e.g., Sonnet 4.6, Opus 4.6). When enabled, uses explicit token budgets instead of adaptive thinking.",
 	"github.copilot.config.anthropic.promptCaching.extendedTtl": "Enable extended prompt cache TTL for Anthropic models.",
 	"github.copilot.config.anthropic.tools.websearch.enabled": "Enable Anthropic's native web search tool for BYOK Claude models. When enabled, allows Claude to search the web for current information. \n\n**Note**: This is an experimental feature only available for BYOK Anthropic Claude models.",
diff --git a/src/extension/byok/node/test/openAIEndpoint.spec.ts b/src/extension/byok/node/test/openAIEndpoint.spec.ts
@@ -141,7 +141,6 @@ describe('OpenAIEndpoint - Reasoning Properties', () => {
 
 	describe('Responses API mode (useResponsesApi = true)', () => {
 		it('should preserve reasoning object when thinking is supported', () => {
-			accessor.get(IConfigurationService).setConfig(ConfigKey.ResponsesApiReasoningEffort, 'medium');
 			accessor.get(IConfigurationService).setConfig(ConfigKey.ResponsesApiReasoningSummary, 'detailed');
 			const endpoint = instaService.createInstance(OpenAIEndpoint,
 				modelMetadata,
@@ -171,7 +170,6 @@ describe('OpenAIEndpoint - Reasoning Properties', () => {
 				}
 			};
 
-			accessor.get(IConfigurationService).setConfig(ConfigKey.ResponsesApiReasoningEffort, 'medium');
 			accessor.get(IConfigurationService).setConfig(ConfigKey.ResponsesApiReasoningSummary, 'detailed');
 			const endpoint = instaService.createInstance(OpenAIEndpoint,
 				modelWithoutThinking,
diff --git a/src/extension/byok/vscode-node/anthropicProvider.ts b/src/extension/byok/vscode-node/anthropicProvider.ts
@@ -263,8 +263,9 @@ export class AnthropicLMProvider extends AbstractLanguageModelChatProvider {
 				betas.push('advanced-tool-use-2025-11-20');
 			}
 
-			const effort = supportsAdaptiveThinking
-				? this._configurationService.getConfig(ConfigKey.AnthropicThinkingEffort)
+			const rawEffort = options.modelConfiguration?.reasoningEffort;
+			const effort = supportsAdaptiveThinking && typeof rawEffort === 'string'
+				? rawEffort as 'low' | 'medium' | 'high'
 				: undefined;
 
 			const params: Anthropic.Beta.Messages.MessageCreateParamsStreaming = {
diff --git a/src/extension/chatSessions/claude/node/claudeLanguageModelServer.ts b/src/extension/chatSessions/claude/node/claudeLanguageModelServer.ts
@@ -230,6 +230,7 @@ export class ClaudeLanguageModelServer extends Disposable {
 				messages: messagesForLogging,
 				finishedCb: async () => undefined,
 				location: ChatLocation.MessagesProxy,
+				enableThinking: true,
 				userInitiatedRequest: isUserInitiatedMessage
 			}, tokenSource.token);
 
@@ -615,6 +616,10 @@ class ClaudeStreamingPassThroughEndpoint implements IChatEndpoint {
 		return this.base.maxThinkingBudget;
 	}
 
+	public get supportsReasoningEffort(): string[] | undefined {
+		return this.base.supportsReasoningEffort;
+	}
+
 	public get supportsToolCalls(): boolean {
 		return this.base.supportsToolCalls;
 	}
diff --git a/src/extension/conversation/vscode-node/languageModelAccess.ts b/src/extension/conversation/vscode-node/languageModelAccess.ts
@@ -44,6 +44,50 @@ import { PromptRenderer } from '../../prompts/node/base/promptRenderer';
 import { isImageDataPart } from '../common/languageModelChatMessageHelpers';
 import { LanguageModelAccessPrompt } from './languageModelAccessPrompt';
 
+/**
+ * Builds the `configurationSchema` fragment for the model picker from the endpoint's `supportsReasoningEffort` levels; returns `{}` when none are reported or the family is not Claude/GPT.
+ * Eligible models get a "Thinking Effort" string enum (`reasoningEffort`) whose default is `high` for Claude and `medium` for GPT, left undefined when the endpoint does not list that level.
+ */
+function buildConfigurationSchema(endpoint: IChatEndpoint): { configurationSchema?: vscode.LanguageModelConfigurationSchema } {
+	const effortLevels = endpoint.supportsReasoningEffort;
+	if (!effortLevels || effortLevels.length === 0) {
+		return {};
+	}
+
+	// Only enable effort picker for Claude and GPT models (family starts with `claude` or `gpt-`, compared in lower case)
+	const family = endpoint.family.toLowerCase();
+	if (!family.startsWith('claude') && !family.startsWith('gpt-')) {
+		return {};
+	}
+
+	const preferred = family.startsWith('claude') ? 'high' : 'medium';
+	const defaultEffort = effortLevels.includes(preferred) ? preferred : undefined; // no default when the preferred level is not offered
+
+	return {
+		configurationSchema: {
+			properties: {
+				reasoningEffort: {
+					type: 'string',
+					title: vscode.l10n.t('Thinking Effort'),
+					enum: effortLevels,
+					enumItemLabels: effortLevels.map(level => level.charAt(0).toUpperCase() + level.slice(1)), // capitalize the first letter of each level for display
+					enumDescriptions: effortLevels.map(level => {
+						switch (level) {
+							case 'none': return vscode.l10n.t('No reasoning applied');
+							case 'low': return vscode.l10n.t('Faster responses with less reasoning');
+							case 'medium': return vscode.l10n.t('Balanced reasoning and speed');
+							case 'high': return vscode.l10n.t('Maximum reasoning depth');
+							default: return level; // levels without a dedicated description show the raw level string
+						}
+					}),
+					default: defaultEffort,
+					group: 'navigation',
+				}
+			}
+		}
+	};
+}
+
 /**
  * Returns a description of the model's capabilities and intended use cases.
  * This is shown in the rich hover when selecting models.
@@ -291,7 +335,8 @@ export class LanguageModelAccess extends Disposable implements IExtensionContrib
 				capabilities: {
 					imageInput: endpoint instanceof AutoChatEndpoint ? true : endpoint.supportsVision,
 					toolCalling: endpoint.supportsToolCalls,
-				}
+				},
+				...buildConfigurationSchema(endpoint),
 			};
 
 			models.push(model);
@@ -566,7 +611,17 @@ export class CopilotLanguageModelWrapper extends Disposable {
 		// This links the wrapper's chat span back to the original invoke_agent trace.
 		const parentTraceContext = (_options as { modelOptions?: OTelModelOptions }).modelOptions?._otelTraceContext ?? undefined;
 
-		const makeRequest = () => endpoint.makeChatRequest('copilotLanguageModelWrapper', messages, callback, token, ChatLocation.Other, { extensionId }, options, !!extensionId, telemetryProperties);
+		const makeRequest = () => endpoint.makeChatRequest2({
+			debugName: 'copilotLanguageModelWrapper',
+			messages,
+			finishedCb: callback,
+			location: ChatLocation.Other,
+			source: { extensionId },
+			requestOptions: options,
+			userInitiatedRequest: !!extensionId,
+			telemetryProperties,
+			reasoningEffort: typeof _options.modelConfiguration?.reasoningEffort === 'string' ? _options.modelConfiguration.reasoningEffort : undefined,
+		}, token);
 
 		// Run request within the parent OTel context (no extra span) so chat spans in chatMLFetcher inherit the agent trace
 		const wrappedRequest = parentTraceContext
diff --git a/src/extension/externalAgents/node/oaiLanguageModelServer.ts b/src/extension/externalAgents/node/oaiLanguageModelServer.ts
@@ -205,6 +205,7 @@ export class OpenAILanguageModelServer extends Disposable {
 				messages: messagesForLogging,
 				finishedCb: async () => undefined,
 				location: ChatLocation.ResponsesProxy,
+				enableThinking: true,
 				userInitiatedRequest: isUserInitiatedMessage
 			}, tokenSource.token);
 
@@ -420,6 +421,10 @@ class StreamingPassThroughEndpoint implements IChatEndpoint {
 		return this.base.maxThinkingBudget;
 	}
 
+	public get supportsReasoningEffort(): string[] | undefined {
+		return this.base.supportsReasoningEffort;
+	}
+
 	public get supportsToolCalls(): boolean {
 		return this.base.supportsToolCalls;
 	}
diff --git a/src/extension/intents/node/toolCallingLoop.ts b/src/extension/intents/node/toolCallingLoop.ts
@@ -100,7 +100,7 @@ export interface IToolCallingBuiltPromptEvent {
 	tools: LanguageModelToolInformation[];
 }
 
-export type ToolCallingLoopFetchOptions = Required<Pick<IMakeChatRequestOptions, 'messages' | 'finishedCb' | 'requestOptions' | 'userInitiatedRequest' | 'turnId'>> & Pick<IMakeChatRequestOptions, 'disableThinking'>;
+export type ToolCallingLoopFetchOptions = Required<Pick<IMakeChatRequestOptions, 'messages' | 'finishedCb' | 'requestOptions' | 'userInitiatedRequest' | 'turnId'>> & Pick<IMakeChatRequestOptions, 'enableThinking' | 'reasoningEffort'>;
 
 interface StartHookResult {
 	/**
@@ -1139,7 +1139,10 @@ export abstract class ToolCallingLoop<TOptions extends IToolCallingLoopOptions =
 		let statefulMarker: string | undefined;
 		const toolCalls: IToolCall[] = [];
 		let thinkingItem: ThinkingDataItem | undefined;
-		const disableThinking = isContinuation && isAnthropicFamily(endpoint) && !ToolCallingLoop.messagesContainThinking(effectiveBuildPromptResult.messages);
+		const rawEffort = this.options.request.modelConfiguration?.reasoningEffort;
+		const reasoningEffort = typeof rawEffort === 'string' ? rawEffort : undefined;
+		const shouldDisableThinking = isContinuation && isAnthropicFamily(endpoint) && !ToolCallingLoop.messagesContainThinking(effectiveBuildPromptResult.messages);
+		const enableThinking = !shouldDisableThinking;
 		let phase: string | undefined;
 		let compaction: OpenAIContextManagementResponse | undefined;
 		const fetchResult = await this.fetch({
@@ -1187,7 +1190,8 @@ export abstract class ToolCallingLoop<TOptions extends IToolCallingLoopOptions =
 				})),
 			},
 			userInitiatedRequest: (iterationNumber === 0 && !isContinuation && !this.options.request.subAgentInvocationId) || this.stopHookUserInitiated,
-			disableThinking,
+			enableThinking,
+			reasoningEffort,
 		}, token).finally(() => {
 			this.stopHookUserInitiated = false;
 		});
diff --git a/src/extension/prompt/node/defaultIntentRequestHandler.ts b/src/extension/prompt/node/defaultIntentRequestHandler.ts
@@ -694,16 +694,19 @@ class DefaultToolCallingLoop extends ToolCallingLoop<IDefaultToolLoopOptions> {
 		const debugName = this.options.request.subAgentInvocationId ?
 			`tool/runSubagent${this.options.request.subAgentName ? `-${this.options.request.subAgentName}` : ''}` :
 			`${ChatLocation.toStringShorter(this.options.location)}/${this.options.intent?.id}`;
+		const location = this.options.overrideRequestLocation ?? this.options.location;
+		const isThinkingLocation = location === ChatLocation.Agent || location === ChatLocation.MessagesProxy;
 		return this.options.invocation.endpoint.makeChatRequest2({
 			...opts,
+			enableThinking: isThinkingLocation && opts.enableThinking,
 			debugName,
 			conversationId: this.options.conversation.sessionId,
 			turnId: opts.turnId,
 			finishedCb: (text, index, delta) => {
 				this.telemetry.markReceivedToken();
 				return opts.finishedCb!(text, index, delta);
 			},
-			location: this.options.overrideRequestLocation ?? this.options.location,
+			location,
 			requestOptions: {
 				...opts.requestOptions,
 				tools: normalizeToolSchema(
diff --git a/src/extension/prompt/node/executionSubagentToolCallingLoop.ts b/src/extension/prompt/node/executionSubagentToolCallingLoop.ts
@@ -121,13 +121,15 @@ export class ExecutionSubagentToolCallingLoop extends ToolCallingLoop<IExecution
 		return allTools.filter(tool => allowedExecutionTools.has(tool.name as ToolName));
 	}
 
-	protected async fetch({ messages, finishedCb, requestOptions }: ToolCallingLoopFetchOptions, token: CancellationToken): Promise<ChatResponse> {
+	protected async fetch({ messages, finishedCb, requestOptions, enableThinking, reasoningEffort }: ToolCallingLoopFetchOptions, token: CancellationToken): Promise<ChatResponse> {
 		const endpoint = await this.getEndpoint();
 		return endpoint.makeChatRequest2({
 			debugName: ExecutionSubagentToolCallingLoop.ID,
 			messages,
 			finishedCb,
 			location: this.options.location,
+			enableThinking,
+			reasoningEffort,
 			requestOptions: {
 				...(requestOptions ?? {}),
 				temperature: 0
diff --git a/src/extension/prompt/node/searchSubagentToolCallingLoop.ts b/src/extension/prompt/node/searchSubagentToolCallingLoop.ts
@@ -135,13 +135,15 @@ export class SearchSubagentToolCallingLoop extends ToolCallingLoop<ISearchSubage
 		return allTools.filter(tool => allowedSearchTools.has(tool.name as ToolName));
 	}
 
-	protected async fetch({ messages, finishedCb, requestOptions }: ToolCallingLoopFetchOptions, token: CancellationToken): Promise<ChatResponse> {
+	protected async fetch({ messages, finishedCb, requestOptions, enableThinking, reasoningEffort }: ToolCallingLoopFetchOptions, token: CancellationToken): Promise<ChatResponse> {
 		const endpoint = await this.getEndpoint();
 		return endpoint.makeChatRequest2({
 			debugName: SearchSubagentToolCallingLoop.ID,
 			messages,
 			finishedCb,
 			location: this.options.location,
+			enableThinking,
+			reasoningEffort,
 			requestOptions: {
 				...requestOptions,
 				temperature: 0
diff --git a/src/platform/configuration/common/configurationService.ts b/src/platform/configuration/common/configurationService.ts
@@ -873,8 +873,6 @@ export namespace ConfigKey {
 	export const AnthropicToolSearchMode = defineSetting<'server' | 'client'>('chat.anthropic.toolSearchTool.mode', ConfigType.ExperimentBased, 'server');
 	/** Prompt optimization mode for Claude 4.6 models. 'control' uses the current prompt, 'combined' uses a single optimized prompt, 'split' uses separate Opus/Sonnet prompts. */
 	export const AnthropicPromptOptimization = defineSetting<'control' | 'combined' | 'split'>('chat.anthropic.promptOptimization', ConfigType.ExperimentBased, 'control');
-	/** Configure reasoning effort sent to Responses API */
-	export const ResponsesApiReasoningEffort = defineSetting<'low' | 'medium' | 'high' | 'xhigh' | 'default'>('chat.responsesApiReasoningEffort', ConfigType.ExperimentBased, 'default');
 	/** Configure reasoning summary style sent to Responses API */
 	export const ResponsesApiReasoningSummary = defineSetting<'off' | 'detailed'>('chat.responsesApiReasoningSummary', ConfigType.ExperimentBased, 'detailed');
 	/** Enable context_management sent to Responses API */
@@ -884,8 +882,6 @@ export namespace ConfigKey {
 	export const EnableChatImageUpload = defineSetting<boolean>('chat.imageUpload.enabled', ConfigType.ExperimentBased, true);
 	/** Thinking token budget for Anthropic extended thinking. If set, enables extended thinking. */
 	export const AnthropicThinkingBudget = defineSetting<number>('chat.anthropic.thinking.budgetTokens', ConfigType.ExperimentBased, 16000);
-	/** Effort level for Anthropic adaptive thinking models. Controls how much thinking Claude does. */
-	export const AnthropicThinkingEffort = defineSetting<'low' | 'medium' | 'high'>('chat.anthropic.thinking.effort', ConfigType.Simple, 'high');
 	/** Force extended thinking (with explicit token budgets) even on models that support adaptive thinking. */
 	export const AnthropicForceExtendedThinking = defineSetting<boolean>('chat.anthropic.thinking.forceExtendedThinking', ConfigType.ExperimentBased, false);
 	/** Enable Anthropic web search tool for BYOK Claude models */
diff --git a/src/platform/endpoint/common/endpointProvider.ts b/src/platform/endpoint/common/endpointProvider.ts
@@ -53,6 +53,7 @@ export type IChatModelCapabilities = {
 		adaptive_thinking?: boolean;
 		max_thinking_budget?: number;
 		min_thinking_budget?: number;
+		reasoning_effort?: string[];
 	};
 };
 
diff --git a/src/platform/endpoint/node/chatEndpoint.ts b/src/platform/endpoint/node/chatEndpoint.ts
diff --git a/src/platform/endpoint/node/messagesApi.ts b/src/platform/endpoint/node/messagesApi.ts
diff --git a/src/platform/endpoint/node/responsesApi.ts b/src/platform/endpoint/node/responsesApi.ts
diff --git a/src/platform/endpoint/node/test/copilotChatEndpoint.spec.ts b/src/platform/endpoint/node/test/copilotChatEndpoint.spec.ts
diff --git a/src/platform/networking/common/networking.ts b/src/platform/networking/common/networking.ts