diff --git a/app/src/ai/blocklist/controller/input_context.rs b/app/src/ai/blocklist/controller/input_context.rs index fc5e9acb9d..8faa432c3d 100644 --- a/app/src/ai/blocklist/controller/input_context.rs +++ b/app/src/ai/blocklist/controller/input_context.rs @@ -73,7 +73,7 @@ pub(super) fn input_context_for_request( { let session_context = SessionContext::from_session(active_session, app); if session_context.is_remote() { - add_remote_codebase_context(&mut context, app); + add_remote_codebase_context(&mut context, &session_context, app); } else { add_local_codebase_context(&mut context, app); } @@ -118,8 +118,15 @@ fn add_local_codebase_context(context: &mut Vec, app: &AppContex } #[cfg(not(target_family = "wasm"))] -fn add_remote_codebase_context(context: &mut Vec, app: &AppContext) { - for codebase in RemoteCodebaseIndexModel::as_ref(app).codebases_for_agent_context() { +fn add_remote_codebase_context( + context: &mut Vec, + session_context: &SessionContext, + app: &AppContext, +) { + let Some(host_id) = session_context.host_id() else { + return; + }; + for codebase in RemoteCodebaseIndexModel::as_ref(app).codebases_for_agent_context(host_id) { context.push(AIAgentContext::Codebase { name: codebase.name, path: codebase.path, @@ -128,7 +135,12 @@ fn add_remote_codebase_context(context: &mut Vec, app: &AppConte } #[cfg(target_family = "wasm")] -fn add_remote_codebase_context(_context: &mut Vec, _app: &AppContext) {} +fn add_remote_codebase_context( + _context: &mut Vec, + _session_context: &SessionContext, + _app: &AppContext, +) { +} /// Parses context reference strings like from the user query and returns /// a map of reference strings to AIAgentAttachment objects. diff --git a/app/src/ai/codebase_auto_indexing.rs b/app/src/ai/codebase_auto_indexing.rs index fb9fc589b1..6d2b4e2121 100644 --- a/app/src/ai/codebase_auto_indexing.rs +++ b/app/src/ai/codebase_auto_indexing.rs @@ -22,6 +22,16 @@ impl CodebaseAutoIndexingSurface { } } +pub(crate) fn should_use_codebase_indexing( + surface: CodebaseAutoIndexingSurface, + ctx: &AppContext, +) -> bool { + codebase_indexing_enabled( + surface, + UserWorkspaces::as_ref(ctx).is_codebase_context_enabled(ctx), + ) +} + pub(crate) fn should_auto_index_codebase( surface: CodebaseAutoIndexingSurface, ctx: &AppContext, @@ -33,15 +43,21 @@ pub(crate) fn should_auto_index_codebase( ) } -pub(crate) fn codebase_auto_indexing_enabled( +fn codebase_indexing_enabled( surface: CodebaseAutoIndexingSurface, codebase_context_enabled: bool, - auto_indexing_enabled: bool, ) -> bool { FeatureFlag::FullSourceCodeEmbedding.is_enabled() && surface.required_feature_enabled() && codebase_context_enabled - && auto_indexing_enabled +} + +pub(crate) fn codebase_auto_indexing_enabled( + surface: CodebaseAutoIndexingSurface, + codebase_context_enabled: bool, + auto_indexing_enabled: bool, +) -> bool { + codebase_indexing_enabled(surface, codebase_context_enabled) && auto_indexing_enabled } pub(crate) fn auto_index_candidate_roots( diff --git a/app/src/ai/get_relevant_files/remote_search/native.rs b/app/src/ai/get_relevant_files/remote_search/native.rs index ec8df55f63..2dafb54062 100644 --- a/app/src/ai/get_relevant_files/remote_search/native.rs +++ b/app/src/ai/get_relevant_files/remote_search/native.rs @@ -1,12 +1,20 @@ use ::ai::index::full_source_code_embedding::{ - store_client::StoreClient, ContentHash, Fragment, RepoMetadata, + search_shaping::{build_fragments_from_file_contents, fragments_to_context_locations}, + store_client::StoreClient, + ContentHash, FragmentMetadata as AiFragmentMetadata, FragmentMetadataLocation, RepoMetadata, }; +use ::ai::index::locations::CodeContextLocation; use itertools::Itertools; use remote_server::proto::{ - file_context_proto, FragmentMetadata, LineRange, ReadFileContextFile, ReadFileContextRequest, - ReadFileContextResponse, + file_context_proto, FragmentMetadata as ProtoFragmentMetadata, LineRange, ReadFileContextFile, + ReadFileContextRequest, ReadFileContextResponse, +}; +use std::{ + collections::{HashMap, HashSet}, + path::PathBuf, + str::FromStr, + sync::Arc, }; -use std::{collections::HashMap, path::PathBuf, str::FromStr, sync::Arc}; use string_offset::ByteOffset; use warpui::{AppContext, ModelContext, SingletonEntity}; @@ -14,8 +22,8 @@ use crate::{ ai::{ agent::{AnyFileContent, FileContext, SearchCodebaseFailureReason, SearchCodebaseResult}, blocklist::SessionContext, + codebase_auto_indexing::{should_use_codebase_indexing, CodebaseAutoIndexingSurface}, }, - features::FeatureFlag, remote_server::codebase_index_model::{ RemoteCodebaseIndexModel, RemoteCodebaseSearchAvailability, RemoteCodebaseSearchContext, }, @@ -53,7 +61,7 @@ pub(super) fn send_request( action_id: crate::ai::agent::AIAgentActionId, ctx: &mut ModelContext, ) -> RemoteSearchRequest { - if !FeatureFlag::RemoteCodebaseIndexing.is_enabled() { + if !should_use_codebase_indexing(CodebaseAutoIndexingSurface::Remote, ctx) { return RemoteSearchRequest::Ready(SearchCodebaseResult::Failed { reason: SearchCodebaseFailureReason::CodebaseNotIndexed, message: "Remote codebase search is not enabled.".to_string(), @@ -106,13 +114,6 @@ pub(super) fn send_request( RemoteSearchRequest::Pending(abort_handle) } availability @ RemoteCodebaseSearchAvailability::NotIndexed { .. } => { - RemoteCodebaseIndexModel::handle(ctx).update(ctx, |model, ctx| { - model.request_active_repo_index( - &session_context, - requested_codebase_path.as_deref(), - ctx, - ); - }); RemoteSearchRequest::Ready(remote_availability_failure(availability)) } RemoteCodebaseSearchAvailability::NoConnectedHost @@ -124,6 +125,51 @@ pub(super) fn send_request( } } +#[cfg(test)] +mod tests { + use remote_server::proto::{file_context_proto, FileContextProto}; + use std::path::PathBuf; + + use super::file_contents_from_response; + + #[test] + fn file_contents_from_response_keeps_only_whole_text_files() { + let response = remote_server::proto::ReadFileContextResponse { + file_contexts: vec![ + FileContextProto { + file_name: "/repo/src/lib.rs".to_string(), + content: Some(file_context_proto::Content::TextContent( + "content".to_string(), + )), + line_range_start: None, + line_range_end: None, + last_modified_epoch_millis: None, + line_count: 1, + }, + FileContextProto { + file_name: "/repo/src/fragment.rs".to_string(), + content: Some(file_context_proto::Content::TextContent( + "fragment".to_string(), + )), + line_range_start: Some(1), + line_range_end: Some(2), + last_modified_epoch_millis: None, + line_count: 1, + }, + ], + failed_files: vec![], + }; + + let file_contents = file_contents_from_response(response); + + assert_eq!(file_contents.len(), 1); + assert_eq!( + file_contents.get(&PathBuf::from("/repo/src/lib.rs")), + Some(&"content".to_string()) + ); + } +} + // The controller owns request lifecycle concerns like cancellation, pending request tracking, and // result emission. This function only contains the remote-specific pipeline: store content hashes // -> daemon fragment metadata -> remote file reads -> fragment reranking. @@ -176,20 +222,37 @@ async fn execute_remote_codebase_search( repo_path ); } - let mut metadata = metadata_response.fragments; + let mut remote_fragments = metadata_response.fragments; if let Some(partial_paths) = partial_paths { - metadata.retain(|fragment| { + remote_fragments.retain(|fragment| { partial_paths .iter() .any(|partial_path| fragment.path.contains(partial_path)) }); } - if metadata.is_empty() { + if remote_fragments.is_empty() { + return Ok(SearchCodebaseResult::Success { files: vec![] }); + } + + // Convert daemon protobuf metadata into the shared search-shaping metadata format so + // remote search reuses the same fragment reconstruction and context expansion path as + // local search. + let parsed_fragment_metadata = remote_fragments + .into_iter() + .filter_map(|fragment| match remote_fragment_metadata(fragment) { + Ok(metadata) => Some(metadata), + Err(err) => { + log::warn!("Failed to parse remote codebase fragment metadata: {err:?}"); + None + } + }) + .collect_vec(); + if parsed_fragment_metadata.is_empty() { return Ok(SearchCodebaseResult::Success { files: vec![] }); } let response = client - .read_file_context(read_fragment_metadata_request(&metadata)) + .read_file_context(read_full_fragment_files_request(&parsed_fragment_metadata)) .await?; if !response.failed_files.is_empty() && response.file_contexts.is_empty() { let failed = response @@ -209,42 +272,98 @@ async fn execute_remote_codebase_search( message: format!("Failed to read remote search result files: {failed}"), }); } - - let (fragments, mut file_contexts_by_identity) = - remote_fragments_and_file_contexts(response, &metadata)?; + let file_contents = file_contents_from_response(response); + let read_fragment_result = build_fragments_from_file_contents( + parsed_fragment_metadata.iter().cloned(), + &file_contents, + ); + if !read_fragment_result.fail_to_read_path.is_empty() { + log::warn!( + "Remote codebase search failed to read {} fragment file(s)", + read_fragment_result.fail_to_read_path.len() + ); + } + let fragments = read_fragment_result.successfully_read; if fragments.is_empty() { return Ok(SearchCodebaseResult::Success { files: vec![] }); } let reranked_fragments = store_client.rerank_fragments(query, fragments).await?; - let files = reranked_fragments + let metadata_by_hash = fragment_metadata_by_hash(&parsed_fragment_metadata); + let locations = fragments_to_context_locations( + reranked_fragments, + |hash| metadata_by_hash.get(hash).map(Vec::as_slice), + RETRIEVE_FRAGMENT_CONTEXT_LENGTH, + ); + if locations.is_empty() { + return Ok(SearchCodebaseResult::Success { files: vec![] }); + } + + let response = client + .read_file_context(read_context_locations_request(&locations)) + .await?; + if !response.failed_files.is_empty() && response.file_contexts.is_empty() { + let failed = response + .failed_files + .iter() + .map(|file| { + let reason = file + .error + .as_ref() + .map(|error| error.message.as_str()) + .unwrap_or("unknown error"); + format!("{}: {reason}", file.path) + }) + .join(", "); + return Ok(SearchCodebaseResult::Failed { + reason: SearchCodebaseFailureReason::InvalidFilePaths, + message: format!("Failed to read remote search result files: {failed}"), + }); + } + let files = response + .file_contexts .into_iter() - .filter_map(|fragment| { - file_contexts_by_identity.remove(&RemoteFragmentIdentity::from_fragment(&fragment)) - }) + .filter_map(proto_file_context_to_file_context) .collect_vec(); Ok(SearchCodebaseResult::Success { files }) } -fn read_fragment_metadata_request(metadata: &[FragmentMetadata]) -> ReadFileContextRequest { +const RETRIEVE_FRAGMENT_CONTEXT_LENGTH: usize = 0; + +fn remote_fragment_metadata( + fragment: ProtoFragmentMetadata, +) -> anyhow::Result<(ContentHash, AiFragmentMetadata)> { + let content_hash = ContentHash::from_str(&fragment.content_hash)?; + Ok(( + content_hash, + AiFragmentMetadata { + absolute_path: PathBuf::from(fragment.path), + location: FragmentMetadataLocation { + start_line: fragment.start_line as usize, + end_line: fragment.end_line as usize, + byte_range: ByteOffset::from(fragment.byte_start as usize) + ..ByteOffset::from(fragment.byte_end as usize), + }, + }, + )) +} + +fn read_full_fragment_files_request( + metadata: &[(ContentHash, AiFragmentMetadata)], +) -> ReadFileContextRequest { + let mut seen_paths = HashSet::new(); ReadFileContextRequest { files: metadata .iter() - .map(|fragment| { - let line_ranges = - if fragment.start_line > 0 && fragment.end_line >= fragment.start_line { - vec![LineRange { - start: fragment.start_line, - end: fragment.end_line.saturating_add(1), - }] - } else { - vec![] - }; - ReadFileContextFile { - path: fragment.path.clone(), - line_ranges, - } + .filter_map(|(_, fragment)| { + let path = fragment.absolute_path.to_string_lossy().to_string(); + seen_paths + .insert(path.clone()) + .then_some(ReadFileContextFile { + path, + line_ranges: vec![], + }) }) .collect(), max_file_bytes: None, @@ -252,109 +371,59 @@ fn read_fragment_metadata_request(metadata: &[FragmentMetadata]) -> ReadFileCont } } -#[derive(Clone, Debug, Eq, PartialEq, Hash)] -struct RemoteFragmentIdentity { - content_hash: String, - path: String, - byte_start: u64, - byte_end: u64, -} - -impl RemoteFragmentIdentity { - fn from_metadata(metadata: &FragmentMetadata) -> Self { - Self { - content_hash: metadata.content_hash.clone(), - path: metadata.path.clone(), - byte_start: metadata.byte_start, - byte_end: metadata.byte_end, - } - } - - fn from_fragment(fragment: &Fragment) -> Self { - let byte_range = fragment.byte_range(); - Self { - content_hash: fragment.content_hash().to_string(), - path: fragment.absolute_path().to_string_lossy().to_string(), - byte_start: byte_range.start.as_usize() as u64, - byte_end: byte_range.end.as_usize() as u64, - } - } -} - -#[derive(Clone, Debug, Eq, PartialEq, Hash)] -struct RemoteReadContextKey { - path: String, - line_range_start: Option, - line_range_end: Option, -} - -impl RemoteReadContextKey { - fn from_metadata(metadata: &FragmentMetadata) -> Self { - let line_range = (metadata.start_line > 0 && metadata.end_line >= metadata.start_line) - .then_some((metadata.start_line, metadata.end_line.saturating_add(1))); - Self { - path: metadata.path.clone(), - line_range_start: line_range.map(|(start, _)| start), - line_range_end: line_range.map(|(_, end)| end), +fn file_contents_from_response(response: ReadFileContextResponse) -> HashMap { + let mut file_contents = HashMap::new(); + for file_context in response.file_contexts { + if file_context.line_range_start.is_some() || file_context.line_range_end.is_some() { + continue; } - } - - fn from_file_context(file_context: &remote_server::proto::FileContextProto) -> Self { - Self { - path: file_context.file_name.clone(), - line_range_start: file_context.line_range_start, - line_range_end: file_context.line_range_end, + if let Some(file_context_proto::Content::TextContent(content)) = file_context.content { + file_contents.insert(PathBuf::from(file_context.file_name), content); } } + file_contents } -fn remote_fragments_and_file_contexts( - response: ReadFileContextResponse, - metadata: &[FragmentMetadata], -) -> anyhow::Result<(Vec, HashMap)> { - let mut fragments = Vec::new(); - let mut file_contexts_by_read_key: HashMap> = - HashMap::new(); - for file_context in response.file_contexts { - let read_key = RemoteReadContextKey::from_file_context(&file_context); - let Some(file_context) = proto_file_context_to_file_context(file_context) else { - continue; - }; - file_contexts_by_read_key - .entry(read_key) +fn fragment_metadata_by_hash( + metadata: &[(ContentHash, AiFragmentMetadata)], +) -> HashMap> { + let mut metadata_by_hash: HashMap> = HashMap::new(); + for (content_hash, metadata) in metadata { + metadata_by_hash + .entry(content_hash.clone()) .or_default() - .push(file_context); + .push(metadata.clone()); } + metadata_by_hash +} - let mut file_contexts_by_identity = HashMap::new(); - - for fragment_metadata in metadata { - let read_key = RemoteReadContextKey::from_metadata(fragment_metadata); - let Some(file_context) = file_contexts_by_read_key - .get(&read_key) - .and_then(|file_contexts| file_contexts.last()) - .cloned() - else { - continue; - }; - let AnyFileContent::StringContent(content) = &file_context.content else { - continue; - }; - let content_hash = ContentHash::from_str(&fragment_metadata.content_hash)?; - fragments.push(Fragment::from_byte_range( - content.clone(), - content_hash, - PathBuf::from(fragment_metadata.path.clone()), - ByteOffset::from(fragment_metadata.byte_start as usize) - ..ByteOffset::from(fragment_metadata.byte_end as usize), - )); - file_contexts_by_identity.insert( - RemoteFragmentIdentity::from_metadata(fragment_metadata), - file_context, - ); +fn read_context_locations_request( + locations: &HashSet, +) -> ReadFileContextRequest { + ReadFileContextRequest { + files: locations + .iter() + .map(|location| match location { + CodeContextLocation::WholeFile(path) => ReadFileContextFile { + path: path.to_string_lossy().to_string(), + line_ranges: vec![], + }, + CodeContextLocation::Fragment(fragment) => ReadFileContextFile { + path: fragment.path.to_string_lossy().to_string(), + line_ranges: fragment + .line_ranges + .iter() + .map(|range| LineRange { + start: range.start as u32, + end: range.end as u32, + }) + .collect(), + }, + }) + .collect(), + max_file_bytes: None, + max_batch_bytes: None, } - - Ok((fragments, file_contexts_by_identity)) } // Keep this conversion at the AI boundary: `FileContext` lives in the `ai` crate, so the @@ -401,7 +470,7 @@ fn remote_availability_failure( SearchCodebaseResult::Failed { reason: SearchCodebaseFailureReason::CodebaseNotIndexed, message: format!( - "The remote codebase at {} is not indexed yet. Indexing has been requested; try again after it finishes.", + "The remote codebase at {} is not indexed yet.", remote_path.path.as_str() ), } diff --git a/app/src/ai/persisted_workspace.rs b/app/src/ai/persisted_workspace.rs index a46f6f43f8..484ae21658 100644 --- a/app/src/ai/persisted_workspace.rs +++ b/app/src/ai/persisted_workspace.rs @@ -675,16 +675,13 @@ impl PersistedWorkspace { let _ = model.index_and_store_rules(directory_path.clone(), ctx); }); - if FeatureFlag::FullSourceCodeEmbedding.is_enabled() { - let auto_indexing_enabled = UserWorkspaces::as_ref(ctx) - .is_codebase_context_enabled(ctx) - && *CodeSettings::as_ref(ctx).auto_indexing_enabled; - - if auto_indexing_enabled { - CodebaseIndexManager::handle(ctx).update(ctx, |manager, ctx| { - manager.index_directory(directory_path, ctx); - }); - } + if FeatureFlag::FullSourceCodeEmbedding.is_enabled() + && UserWorkspaces::as_ref(ctx).is_codebase_context_enabled(ctx) + && *CodeSettings::as_ref(ctx).auto_indexing_enabled + { + CodebaseIndexManager::handle(ctx).update(ctx, |manager, ctx| { + manager.index_directory(directory_path, ctx); + }); } } diff --git a/app/src/remote_server/codebase_index_model.rs b/app/src/remote_server/codebase_index_model.rs index 2d3e3295dd..489cf272ef 100644 --- a/app/src/remote_server/codebase_index_model.rs +++ b/app/src/remote_server/codebase_index_model.rs @@ -10,7 +10,8 @@ use warpui::{Entity, ModelContext, SingletonEntity}; use crate::ai::blocklist::SessionContext; use crate::ai::codebase_auto_indexing::{ - auto_index_candidate_roots, should_auto_index_codebase, CodebaseAutoIndexingSurface, + auto_index_candidate_roots, should_auto_index_codebase, should_use_codebase_indexing, + CodebaseAutoIndexingSurface, }; use crate::send_telemetry_from_ctx; use crate::server::telemetry::{ @@ -189,6 +190,9 @@ impl RemoteCodebaseIndexModel { explicit_repo_path: Option<&str>, ctx: &mut ModelContext, ) -> bool { + if !should_use_codebase_indexing(CodebaseAutoIndexingSurface::Remote, ctx) { + return false; + } let Some(host_id) = session_context.host_id() else { return false; }; @@ -212,12 +216,13 @@ impl RemoteCodebaseIndexModel { true } - pub fn codebases_for_agent_context(&self) -> Vec { + pub fn codebases_for_agent_context(&self, host_id: &HostId) -> Vec { let mut entries = self .statuses .iter() .filter(|&(remote_path, status)| { - search_availability_for_status(status, remote_path.clone()).is_ready() + remote_path.host_id == *host_id + && search_availability_for_status(status, remote_path.clone()).is_ready() }) .map(|(remote_path, _)| { let path = remote_path.path.as_str().to_string(); @@ -232,6 +237,9 @@ impl RemoteCodebaseIndexModel { } pub fn request_index(&self, remote_path: RemotePath, ctx: &mut ModelContext) { + if !should_use_codebase_indexing(CodebaseAutoIndexingSurface::Remote, ctx) { + return; + } RemoteServerManager::handle(ctx).update(ctx, |manager, ctx| { manager.ensure_codebase_indexed( remote_path, @@ -244,6 +252,9 @@ impl RemoteCodebaseIndexModel { } pub fn resync_index(&self, remote_path: RemotePath, ctx: &mut ModelContext) { + if !should_use_codebase_indexing(CodebaseAutoIndexingSurface::Remote, ctx) { + return; + } RemoteServerManager::handle(ctx).update(ctx, |manager, ctx| { manager.resync_codebase(remote_path, ctx); }); @@ -283,6 +294,9 @@ impl RemoteCodebaseIndexModel { ) { match event { RemoteServerManagerEvent::CodebaseIndexStatusesSnapshot { host_id, statuses } => { + if !should_use_codebase_indexing(CodebaseAutoIndexingSurface::Remote, ctx) { + return; + } let (changed, telemetry_updates) = self.apply_statuses_snapshot_with_telemetry(host_id, statuses); for update in telemetry_updates { @@ -303,6 +317,9 @@ impl RemoteCodebaseIndexModel { mutation_kind, session_id: _, } => { + if !should_use_codebase_indexing(CodebaseAutoIndexingSurface::Remote, ctx) { + return; + } if let Some(update) = self.apply_status_update_with_telemetry(remote_path.clone(), status.clone()) { @@ -321,6 +338,8 @@ impl RemoteCodebaseIndexModel { is_git, } => { self.record_navigated_directory(*session_id, remote_path, *is_git); + // Remote manual indexing can target non-git folders, but automatic indexing should + // match local behavior and only index directories resolved by repo detection. if *is_git && should_auto_index_codebase(CodebaseAutoIndexingSurface::Remote, ctx) && self.should_request_auto_index_for_navigated_git_repo(remote_path) @@ -387,27 +406,28 @@ impl RemoteCodebaseIndexModel { | RemoteServerManagerEvent::ServerMessageDecodingError { .. } => {} } } - fn should_request_auto_index_for_navigated_git_repo(&self, remote_path: &RemotePath) -> bool { - let Some(status) = self.status_for_repo(remote_path) else { - return true; - }; - - match search_availability_for_status(status, remote_path.clone()) { - RemoteCodebaseSearchAvailability::Ready(_) - | RemoteCodebaseSearchAvailability::Indexing { .. } => false, - RemoteCodebaseSearchAvailability::NoConnectedHost - | RemoteCodebaseSearchAvailability::NoActiveRepo - | RemoteCodebaseSearchAvailability::NotIndexed { .. } - | RemoteCodebaseSearchAvailability::Unavailable { .. } => true, - } - } fn handle_codebase_context_enablement_changed(&mut self, ctx: &mut ModelContext) { - if !should_auto_index_codebase(CodebaseAutoIndexingSurface::Remote, ctx) { + if !should_use_codebase_indexing(CodebaseAutoIndexingSurface::Remote, ctx) { + let remote_paths = self.clear_remote_codebase_indexing_state(); + if !remote_paths.is_empty() { + ctx.emit(RemoteCodebaseIndexModelEvent::SettingsEntriesChanged); + } + for remote_path in remote_paths { + RemoteServerManager::handle(ctx).update(ctx, |manager, ctx| { + manager.drop_codebase_index(remote_path, ctx); + }); + } return; } let remote_paths = self.active_git_repo_paths_needing_auto_index(); + if remote_paths.is_empty() + || !should_auto_index_codebase(CodebaseAutoIndexingSurface::Remote, ctx) + { + return; + } + emit_auto_index_requested_telemetry( RemoteCodebaseAutoIndexTrigger::CodebaseContextEnablementChanged, remote_paths.len(), @@ -427,6 +447,11 @@ impl RemoteCodebaseIndexModel { } } + fn clear_remote_codebase_indexing_state(&mut self) -> Vec { + let statuses = std::mem::take(&mut self.statuses); + statuses.into_keys().collect() + } + fn active_git_repo_paths_needing_auto_index(&self) -> Vec { auto_index_candidate_roots( self.active_git_repos_by_session.values().cloned(), @@ -434,6 +459,21 @@ impl RemoteCodebaseIndexModel { ) } + fn should_request_auto_index_for_navigated_git_repo(&self, remote_path: &RemotePath) -> bool { + let Some(status) = self.status_for_repo(remote_path) else { + return true; + }; + + match search_availability_for_status(status, remote_path.clone()) { + RemoteCodebaseSearchAvailability::Ready(_) + | RemoteCodebaseSearchAvailability::Indexing { .. } => false, + RemoteCodebaseSearchAvailability::NoConnectedHost + | RemoteCodebaseSearchAvailability::NoActiveRepo + | RemoteCodebaseSearchAvailability::NotIndexed { .. } + | RemoteCodebaseSearchAvailability::Unavailable { .. } => true, + } + } + fn apply_statuses_snapshot( &mut self, host_id: &HostId, diff --git a/app/src/remote_server/codebase_index_model_tests.rs b/app/src/remote_server/codebase_index_model_tests.rs index 8bf7d14752..4f61569058 100644 --- a/app/src/remote_server/codebase_index_model_tests.rs +++ b/app/src/remote_server/codebase_index_model_tests.rs @@ -290,7 +290,7 @@ fn codebases_for_agent_context_includes_searchable_remote_paths() { status_with_state("/workspaces/stale", RemoteCodebaseIndexState::Stale), ); - let entries = model.codebases_for_agent_context(); + let entries = model.codebases_for_agent_context(&host()); assert_eq!( entries, @@ -325,7 +325,60 @@ fn codebases_for_agent_context_skips_unsearchable_remote_paths() { status_with_state("/workspaces/failed", RemoteCodebaseIndexState::Failed), ); - assert!(model.codebases_for_agent_context().is_empty()); + assert!(model.codebases_for_agent_context(&host()).is_empty()); +} + +#[test] +fn codebases_for_agent_context_only_includes_active_host_paths() { + let mut model = RemoteCodebaseIndexModel::default(); + let active_host = host_with_name("active-host"); + let other_host = host_with_name("other-host"); + model.apply_status_update( + remote_path_for_host(&active_host, "/workspaces/active"), + ready_status("/workspaces/active"), + ); + model.apply_status_update( + remote_path_for_host(&other_host, "/workspaces/other"), + ready_status("/workspaces/other"), + ); + + let entries = model.codebases_for_agent_context(&active_host); + + assert_eq!( + entries, + vec![RemoteCodebaseContextEntry { + name: "active".to_string(), + path: "/workspaces/active".to_string(), + }] + ); +} + +#[test] +fn clear_remote_codebase_indexing_state_returns_paths_and_preserves_active_repo() { + let mut model = RemoteCodebaseIndexModel::default(); + let host = host(); + model.apply_status_update( + remote_path("/workspaces/warp"), + ready_status("/workspaces/warp"), + ); + model.record_navigated_directory(session(1), &remote_path("/workspaces/warp"), true); + + let remote_paths = model.clear_remote_codebase_indexing_state(); + + assert_eq!(remote_paths, vec![remote_path("/workspaces/warp")]); + assert!(model.entries_for_settings().is_empty()); + assert_eq!( + model.active_repos_by_host.get(&host), + Some(&remote_path("/workspaces/warp")) + ); + assert_eq!( + model.active_git_repo_paths_needing_auto_index(), + vec![remote_path("/workspaces/warp")] + ); + assert!(matches!( + model.availability_for_remote(&host, Some("/workspaces/warp"), None), + RemoteCodebaseSearchAvailability::NotIndexed { .. } + )); } #[test] diff --git a/crates/ai/src/index/full_source_code_embedding/codebase_index.rs b/crates/ai/src/index/full_source_code_embedding/codebase_index.rs index 7f9dc2c9cf..ab2d550672 100644 --- a/crates/ai/src/index/full_source_code_embedding/codebase_index.rs +++ b/crates/ai/src/index/full_source_code_embedding/codebase_index.rs @@ -17,19 +17,21 @@ use super::{ RetrieveFileError, }, merkle_tree::{MerkleTree, SerializedCodebaseIndex}, + search_shaping::{ + build_fragments_from_file_contents, fragments_to_context_locations, ReadFragmentResult, + }, store_client::StoreClient, sync_client::{FlushFragmentResult, SyncOperationError}, CodebaseContextConfig, ContentHash, EmbeddingConfig, Error, Fragment, NodeHash, RepoMetadata, }; use crate::{ - index::locations::{CodeContextLocation, FileFragmentLocation}, + index::locations::CodeContextLocation, telemetry::{AITelemetryEvent, CodebaseContextSyncType}, workspace::{WorkspaceMetadata, WorkspaceMetadataEvent}, }; use instant::Instant; use std::{ collections::{HashMap, HashSet}, - ops::Range, path::PathBuf, sync::atomic::{AtomicUsize, Ordering}, time::Duration, @@ -47,7 +49,6 @@ cfg_if::cfg_if! { Entry, matches_gitignores, full_source_code_embedding::sync_client::CodebaseIndexSyncOperation, - full_source_code_embedding::FragmentLocation }; use warp_core::send_telemetry_from_ctx; use warp_core::interval_timer::IntervalTimer; @@ -1673,82 +1674,21 @@ impl CodebaseIndex { } } - // Convert fragments into CodeContextLocations. This function groups and dedupes fragments in the same file. - // It also allows the caller to define a context line number surrounding the relevant fragment. fn process_fragments( &self, fragments: Vec, context_lines: usize, ) -> HashSet { - // Map to collect fragments by file path - let mut fragments_by_path: HashMap<&PathBuf, Vec>> = HashMap::new(); - let mut whole_files = HashSet::new(); - - // First pass - collect all fragments and their line ranges by file path - for fragment in &fragments { - if let Some(metadata) = self - .fragment_metadatas_from_hash(&fragment.content_hash) - .and_then(|metadatas| { - metadatas.iter().find(|m| { - m.absolute_path == fragment.location.absolute_path - && m.location.byte_range == fragment.location.byte_range - }) - }) - { - // Add line range with context to the appropriate file's collection - let path = &fragment.location.absolute_path; - let start = metadata.location.start_line.saturating_sub(context_lines); - let end = metadata.location.end_line + 1 + context_lines; // Make the range inclusive on both ends - - fragments_by_path.entry(path).or_default().push(start..end); - } else { - // Fallback to whole file if metadata not found - whole_files.insert(fragment.location.absolute_path.clone()); - } - } - - // Second pass - process each file's fragments - let mut result = HashSet::new(); - - // Process each file's fragments - for (path, mut line_ranges) in fragments_by_path { - if line_ranges.is_empty() { - continue; - } - - // We can skip the fragments if the entire file is already included in the context. - if whole_files.contains(path) { - continue; - } - - // Sort ranges by start position - line_ranges.sort_by_key(|range| range.start); - - // Merge overlapping or adjacent ranges - let mut merged_ranges: Vec> = Vec::new(); - for range in line_ranges { - if let Some(last) = merged_ranges.last_mut() { - // If current range overlaps or is adjacent to the last one, merge them - if range.start <= last.end { - last.end = last.end.max(range.end); - } else { - merged_ranges.push(range); - } - } else { - merged_ranges.push(range); - } - } - - // Add file fragment location with all merged ranges - result.insert(CodeContextLocation::Fragment(FileFragmentLocation { - path: path.clone(), - line_ranges: merged_ranges, - })); - } - - // Add whole files to the result set - result.extend(whole_files.into_iter().map(CodeContextLocation::WholeFile)); - result + // Keep local and remote search aligned by using the same fragment-to-context expansion + // helper for range merging, deduping, and context-line handling. + fragments_to_context_locations( + fragments, + |content_hash| { + self.fragment_metadatas_from_hash(content_hash) + .map(Vec::as_slice) + }, + context_lines, + ) } /// A new index built from a snapshot. This constructor builds the index and starts @@ -2371,98 +2311,22 @@ impl CodebaseIndex { } } -#[derive(Default)] -pub struct ReadFragmentResult { - pub successfully_read: Vec, - pub fail_to_read: Vec, - pub fail_to_read_path: Vec, -} - #[cfg(feature = "local_fs")] pub(super) async fn build_fragments_from_metadata( metadatas: impl IntoIterator, ) -> ReadFragmentResult { - let mut fragments = Vec::new(); - let mut fail_to_read = Vec::new(); - let mut fail_to_read_path = Vec::new(); - - // Group fragments by file path - let mut fragments_by_path: HashMap<_, Vec<_>> = HashMap::new(); - for (content_hash, metadata) in metadatas { - fragments_by_path - .entry(metadata.absolute_path) - .or_default() - .push((content_hash, metadata.location.byte_range)); - } - - // Process each file and its fragments - for (file_path, file_fragments) in fragments_by_path { - let mut has_failed_to_read_fragments = false; - // Read the file content once - if let Ok(file_content) = async_fs::read_to_string(&file_path).await { - // Process all fragments for this file - for (content_hash, fragment_ranges) in file_fragments { - let start_idx = fragment_ranges.start.as_usize(); - let end_idx = fragment_ranges.end.as_usize(); - - if start_idx <= end_idx - && end_idx <= file_content.len() - && file_content.is_char_boundary(start_idx) - && file_content.is_char_boundary(end_idx) - { - let content = file_content[start_idx..end_idx].to_string(); - if content.is_empty() { - log::trace!( - "Fragment for {:?} with range {:?} is empty", - file_path.display(), - fragment_ranges - ); - fail_to_read.push(content_hash); - has_failed_to_read_fragments = true; - } else if ContentHash::from_content(&content) != content_hash { - log::trace!( - "Fragment for {:?} with range {:?} does not match its content hash", - file_path.display(), - fragment_ranges - ); - fail_to_read.push(content_hash); - has_failed_to_read_fragments = true; - } else { - fragments.push(Fragment { - content, - content_hash, - location: FragmentLocation { - absolute_path: file_path.clone(), - byte_range: fragment_ranges, - }, - }); - } - } else { - log::trace!("Invalid byte range {fragment_ranges:?} for file: {file_path:?}"); - fail_to_read.push(content_hash); - has_failed_to_read_fragments = true; - } - } - } else { - log::trace!("Failed to read file: {file_path:?}"); - fail_to_read.extend( - file_fragments - .into_iter() - .map(|(content_hash, _)| content_hash), - ); - has_failed_to_read_fragments = true; - } - - if has_failed_to_read_fragments { - fail_to_read_path.push(file_path); + let metadatas = metadatas.into_iter().collect::>(); + let mut file_contents = HashMap::new(); + for path in metadatas + .iter() + .map(|(_, metadata)| metadata.absolute_path.clone()) + .collect::>() + { + if let Ok(file_content) = async_fs::read_to_string(&path).await { + file_contents.insert(path, file_content); } } - - ReadFragmentResult { - successfully_read: fragments, - fail_to_read, - fail_to_read_path, - } + build_fragments_from_file_contents(metadatas, &file_contents) } #[cfg(not(feature = "local_fs"))] diff --git a/crates/ai/src/index/full_source_code_embedding/mod.rs b/crates/ai/src/index/full_source_code_embedding/mod.rs index 9b4b8d1b46..7dab1499c1 100644 --- a/crates/ai/src/index/full_source_code_embedding/mod.rs +++ b/crates/ai/src/index/full_source_code_embedding/mod.rs @@ -5,6 +5,7 @@ mod fragment_metadata; pub mod manager; mod merkle_tree; mod priority_queue; +pub mod search_shaping; mod snapshot; pub mod store_client; mod sync_client; @@ -20,7 +21,7 @@ pub use codebase_index::{CodebaseIndex, RetrievalID, SyncProgress}; pub use merkle_tree::{ContentHash, NodeHash}; pub use snapshot::SnapshotStorage; -pub use fragment_metadata::FragmentMetadata; +pub use fragment_metadata::{FragmentLocation as FragmentMetadataLocation, FragmentMetadata}; use string_offset::ByteOffset; use thiserror::Error; use warp_graphql::queries::rerank_fragments::FragmentLocationInput; diff --git a/crates/ai/src/index/full_source_code_embedding/search_shaping.rs b/crates/ai/src/index/full_source_code_embedding/search_shaping.rs new file mode 100644 index 0000000000..47a0de915b --- /dev/null +++ b/crates/ai/src/index/full_source_code_embedding/search_shaping.rs @@ -0,0 +1,316 @@ +use std::{ + collections::{HashMap, HashSet}, + ops::Range, + path::PathBuf, +}; + +use crate::index::locations::{CodeContextLocation, FileFragmentLocation}; + +use super::{ContentHash, Fragment, FragmentLocation, FragmentMetadata}; + +#[derive(Default)] +pub struct ReadFragmentResult { + pub successfully_read: Vec, + pub fail_to_read: Vec, + pub fail_to_read_path: Vec, +} + +pub fn build_fragments_from_file_contents( + metadatas: impl IntoIterator, + file_contents: &HashMap, +) -> ReadFragmentResult { + let mut fragments = Vec::new(); + let mut fail_to_read = Vec::new(); + let mut fail_to_read_path = Vec::new(); + + // Group fragments by file path. + let mut fragments_by_path: HashMap<_, Vec<_>> = HashMap::new(); + for (content_hash, metadata) in metadatas { + fragments_by_path + .entry(metadata.absolute_path) + .or_default() + .push((content_hash, metadata.location.byte_range)); + } + + // Process each file and its fragments. + for (file_path, file_fragments) in fragments_by_path { + let mut has_failed_to_read_fragments = false; + if let Some(file_content) = file_contents.get(&file_path) { + // Process all fragments for this file. + for (content_hash, fragment_ranges) in file_fragments { + let start_idx = fragment_ranges.start.as_usize(); + let end_idx = fragment_ranges.end.as_usize(); + + if start_idx <= end_idx + && end_idx <= file_content.len() + && file_content.is_char_boundary(start_idx) + && file_content.is_char_boundary(end_idx) + { + let content = file_content[start_idx..end_idx].to_string(); + if content.is_empty() { + log::trace!( + "Fragment for {:?} with range {:?} is empty", + file_path.display(), + fragment_ranges + ); + fail_to_read.push(content_hash); + has_failed_to_read_fragments = true; + } else if ContentHash::from_content(&content) != content_hash { + log::trace!( + "Fragment for {:?} with range {:?} does not match its content hash", + file_path.display(), + fragment_ranges + ); + fail_to_read.push(content_hash); + has_failed_to_read_fragments = true; + } else { + fragments.push(Fragment { + content, + content_hash, + location: FragmentLocation { + absolute_path: file_path.clone(), + byte_range: fragment_ranges, + }, + }); + } + } else { + log::trace!("Invalid byte range {fragment_ranges:?} for file: {file_path:?}"); + fail_to_read.push(content_hash); + has_failed_to_read_fragments = true; + } + } + } else { + log::trace!("Failed to read file: {file_path:?}"); + fail_to_read.extend( + file_fragments + .into_iter() + .map(|(content_hash, _)| content_hash), + ); + has_failed_to_read_fragments = true; + } + + if has_failed_to_read_fragments { + fail_to_read_path.push(file_path); + } + } + + ReadFragmentResult { + successfully_read: fragments, + fail_to_read, + fail_to_read_path, + } +} + +// Convert fragments into CodeContextLocations. This function groups and dedupes fragments in the same file. +// It also allows the caller to define a context line number surrounding the relevant fragment. +pub fn fragments_to_context_locations<'a>( + fragments: Vec, + metadata_for_hash: impl Fn(&ContentHash) -> Option<&'a [FragmentMetadata]>, + context_lines: usize, +) -> HashSet { + // Map to collect fragments by file path. + let mut fragments_by_path: HashMap<&PathBuf, Vec>> = HashMap::new(); + let mut whole_files = HashSet::new(); + + // First pass - collect all fragments and their line ranges by file path. + for fragment in &fragments { + if let Some(metadata) = metadata_for_hash(&fragment.content_hash).and_then(|metadatas| { + metadatas.iter().find(|m| { + m.absolute_path == fragment.location.absolute_path + && m.location.byte_range == fragment.location.byte_range + }) + }) { + // Add line range with context to the appropriate file's collection. + let path = &fragment.location.absolute_path; + let start = metadata.location.start_line.saturating_sub(context_lines); + let end = metadata.location.end_line + 1 + context_lines; + + fragments_by_path.entry(path).or_default().push(start..end); + } else { + // Fallback to whole file if metadata not found. + whole_files.insert(fragment.location.absolute_path.clone()); + } + } + + // Second pass - process each file's fragments. + let mut result = HashSet::new(); + + // Process each file's fragments. + for (path, mut line_ranges) in fragments_by_path { + if line_ranges.is_empty() { + continue; + } + + // We can skip the fragments if the entire file is already included in the context. + if whole_files.contains(path) { + continue; + } + + // Sort ranges by start position. + line_ranges.sort_by_key(|range| range.start); + + // Merge overlapping or adjacent ranges. + let mut merged_ranges: Vec> = Vec::new(); + for range in line_ranges { + if let Some(last) = merged_ranges.last_mut() { + // If current range overlaps or is adjacent to the last one, merge them. + if range.start <= last.end { + last.end = last.end.max(range.end); + } else { + merged_ranges.push(range); + } + } else { + merged_ranges.push(range); + } + } + + // Add file fragment location with all merged ranges. + result.insert(CodeContextLocation::Fragment(FileFragmentLocation { + path: path.clone(), + line_ranges: merged_ranges, + })); + } + + // Add whole files to the result set. + result.extend(whole_files.into_iter().map(CodeContextLocation::WholeFile)); + result +} + +#[cfg(test)] +mod tests { + use std::{ + collections::{HashMap, HashSet}, + ops::Range, + path::PathBuf, + }; + + use string_offset::ByteOffset; + + use super::super::{ContentHash, Fragment, FragmentLocation, FragmentMetadata}; + use super::{build_fragments_from_file_contents, fragments_to_context_locations}; + use crate::index::locations::{CodeContextLocation, FileFragmentLocation}; + + fn metadata( + path: &str, + byte_range: Range, + start_line: usize, + end_line: usize, + ) -> FragmentMetadata { + FragmentMetadata { + absolute_path: PathBuf::from(path), + location: super::super::fragment_metadata::FragmentLocation { + start_line, + end_line, + byte_range, + }, + } + } + + fn fragment(content: &str, path: &str, byte_range: Range) -> Fragment { + Fragment { + content: content.to_string(), + content_hash: ContentHash::from_content(content), + location: FragmentLocation { + absolute_path: PathBuf::from(path), + byte_range, + }, + } + } + + #[test] + fn builds_fragments_from_exact_byte_ranges() { + let path = PathBuf::from("/repo/src/lib.rs"); + let content = "before\nneedle\nπ-after".to_string(); + let fragment_content = "needle"; + let start = content.find(fragment_content).unwrap(); + let end = start + fragment_content.len(); + let content_hash = ContentHash::from_content(fragment_content); + let metadata = metadata( + path.to_string_lossy().as_ref(), + ByteOffset::from(start)..ByteOffset::from(end), + 2, + 2, + ); + + let result = build_fragments_from_file_contents( + [(content_hash.clone(), metadata)], + &HashMap::from([(path.clone(), content)]), + ); + + assert_eq!(result.fail_to_read.len(), 0); + assert_eq!(result.successfully_read.len(), 1); + let fragment = &result.successfully_read[0]; + assert_eq!(fragment.content, fragment_content); + assert_eq!(fragment.content_hash, content_hash); + assert_eq!(fragment.location.absolute_path, path); + } + + #[test] + fn rejects_invalid_hashes_and_byte_ranges() { + let path = PathBuf::from("/repo/src/lib.rs"); + let content = "abcπdef".to_string(); + let bad_hash_metadata = metadata( + path.to_string_lossy().as_ref(), + ByteOffset::from(0)..ByteOffset::from(3), + 1, + 1, + ); + let invalid_boundary_metadata = metadata( + path.to_string_lossy().as_ref(), + ByteOffset::from(4)..ByteOffset::from(5), + 1, + 1, + ); + + let result = build_fragments_from_file_contents( + [ + (ContentHash::from_content("not abc"), bad_hash_metadata), + (ContentHash::from_content("π"), invalid_boundary_metadata), + ], + &HashMap::from([(path.clone(), content)]), + ); + + assert!(result.successfully_read.is_empty()); + assert_eq!(result.fail_to_read.len(), 2); + assert_eq!(result.fail_to_read_path, vec![path]); + } + + #[test] + fn shapes_fragments_into_merged_context_locations() { + let path = "/repo/src/lib.rs"; + let fragment_a = fragment("a", path, ByteOffset::from(0)..ByteOffset::from(1)); + let fragment_b = fragment("b", path, ByteOffset::from(2)..ByteOffset::from(3)); + let metadata_a = metadata(path, ByteOffset::from(0)..ByteOffset::from(1), 10, 12); + let metadata_b = metadata(path, ByteOffset::from(2)..ByteOffset::from(3), 15, 17); + let metadata_by_hash = HashMap::from([ + (fragment_a.content_hash.clone(), vec![metadata_a]), + (fragment_b.content_hash.clone(), vec![metadata_b]), + ]); + + let result = fragments_to_context_locations( + vec![fragment_a, fragment_b], + |hash| metadata_by_hash.get(hash).map(Vec::as_slice), + 2, + ); + + assert_eq!( + result, + HashSet::from([CodeContextLocation::Fragment(FileFragmentLocation { + path: PathBuf::from(path), + line_ranges: vec![8..20], + })]) + ); + } + + #[test] + fn falls_back_to_whole_file_when_metadata_is_missing() { + let path = "/repo/src/lib.rs"; + let fragment = fragment("a", path, ByteOffset::from(0)..ByteOffset::from(1)); + let result = fragments_to_context_locations(vec![fragment], |_| None, 2); + + assert_eq!( + result, + HashSet::from([CodeContextLocation::WholeFile(PathBuf::from(path))]) + ); + } +}