AirLibrary/Indexing/Scan/
ScanFile.rs

1//! # ScanFile
2//!
3//! ## File: Indexing/Scan/ScanFile.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides individual file scanning functionality for the File Indexer
8//! service, handling reading, metadata extraction, and categorization of files
9//! for indexing.
10//!
11//! ## Primary Responsibility
12//!
13//! Scan individual files to extract metadata, content, and prepare them for
14//! indexing operations.
15//!
16//! ## Secondary Responsibilities
17//!
18//! - File access validation and permission checking
19//! - Encoding detection for text files
20//! - Language detection for code files
21//! - File size validation
22//! - Symbolic link detection
23//!
24//! ## Dependencies
25//!
26//! **External Crates:**
27//! - `tokio` - Async file I/O operations
28//! - `sha2` - Checksum calculation for file integrity
29//!
30//! **Internal Modules:**
31//! - `crate::Result` - Error handling type
32//! - `crate::AirError` - Error types
33//! - `crate::Configuration::IndexingConfig` - Indexing configuration
34//! - `super::super::State::CreateState` - State structure definitions
35//! - `super::Process::ProcessContent` - Content processing operations
36//!
37//! ## Dependents
38//!
39//! - `Indexing::Scan::ScanDirectory` - Batch file processing
40//! - `Indexing::Watch::WatchFile` - Individual file change handling
41//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
42//!
43//! ## VSCode Pattern Reference
44//!
45//! Inspired by VSCode's file scanning in
46//! `src/vs/workbench/services/files/`
47//!
48//! ## Security Considerations
49//!
50//! - Path canonicalization before access
51//! - File size limits enforced
52//! - Timeout protection for I/O operations
53//! - Permission checking before reads
54//!
55//! ## Performance Considerations
56//!
57//! - Asynchronous file reading
58//! - Batch processing operations
59//! - Memory-efficient streaming for large files
60//! - Cached metadata when available
61//!
62//! ## Error Handling Strategy
63//!
64//! File scanning returns Results with detailed error messages about
65//! why a file cannot be scanned or accessed. Errors are logged and
66//! individual file failures don't halt batch operations.
67//!
68//! ## Thread Safety
69//!
//! File scanning operations are designed for parallel execution and
//! produce results that can be safely merged into shared state.
72use std::{
73	path::PathBuf,
74	time::{Duration, Instant},
75};
76
77use tokio::sync::RwLock;
78
79use crate::{
80	AirError,
81	Configuration::IndexingConfig,
82	Indexing::{
83		Process::{
84			ExtractSymbols::ExtractSymbols,
85			ProcessContent::{DetectEncoding, DetectLanguage, DetectMimeType},
86		},
87		State::CreateState::{FileMetadata, SymbolInfo, SymbolLocation},
88	},
89	Result,
90};
91
92/// Index a single file internally with comprehensive validation
93///
94/// This function is called by parallel tasks during directory scanning
95/// and includes:
96/// - File metadata extraction
97/// - Size validation
98/// - SHA-256 checksum calculation
99/// - Encoding detection
100/// - MIME type detection
101/// - Language detection
102/// - Symbol extraction for code files
103pub async fn IndexFileInternal(
104	file_path:&PathBuf,
105	config:&IndexingConfig,
106	_index_ref:&RwLock<crate::Indexing::State::CreateState::FileIndex>,
107	_patterns:&[String],
108) -> Result<(FileMetadata, Vec<SymbolInfo>)> {
109	let start_time = Instant::now();
110
111	// Get file metadata with error handling
112	let metadata = std::fs::metadata(file_path)
113		.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
114
115	// Get modified time
116	let modified = metadata
117		.modified()
118		.map_err(|e| AirError::FileSystem(format!("Failed to get modification time: {}", e)))?;
119
120	let modified_time = chrono::DateTime::<chrono::Utc>::from(modified);
121
122	// Check if file size exceeds limit
123	let file_size = metadata.len();
124	if file_size > config.MaxFileSizeMb as u64 * 1024 * 1024 {
125		return Err(AirError::FileSystem(format!(
126			"File size {} exceeds limit {} MB",
127			file_size, config.MaxFileSizeMb
128		)));
129	}
130
131	// File read with timeout protection
132	let content = tokio::time::timeout(Duration::from_secs(30), tokio::fs::read(file_path))
133		.await
134		.map_err(|_| AirError::FileSystem(format!("Timeout reading file: {} (30s limit)", file_path.display())))?
135		.map_err(|e| AirError::FileSystem(format!("Failed to read file: {}", e)))?;
136
137	// Check for symbolic link
138	let is_symlink = std::fs::symlink_metadata(file_path)
139		.map(|m| m.file_type().is_symlink())
140		.unwrap_or(false);
141
142	// Calculate SHA-256 checksum
143	let checksum = CalculateChecksum(&content);
144
145	// Detect file encoding
146	let encoding = DetectEncoding(&content);
147
148	// Detect MIME type
149	let mime_type = DetectMimeType(file_path, &content);
150
151	// Detect programming language
152	let language = DetectLanguage(file_path);
153
154	// Count lines for text files
155	let line_count = if mime_type.starts_with("text/") {
156		Some(content.iter().filter(|&&b| b == b'\n').count() as u32 + 1)
157	} else {
158		None
159	};
160
161	// Extract symbols from code for VSCode Outline View
162	let symbols = if let Some(lang) = &language {
163		ExtractSymbols(file_path, &content, lang).await?
164	} else {
165		Vec::new()
166	};
167
168	let permissions = GetPermissionsString(&metadata);
169
170	let elapsed = start_time.elapsed();
171
172	log::trace!(
173		"[ScanFile] Indexed {} in {}ms ({} symbols)",
174		file_path.display(),
175		elapsed.as_millis(),
176		symbols.len()
177	);
178
179	Ok((
180		FileMetadata {
181			path:file_path.clone(),
182			size:file_size,
183			modified:modified_time,
184			mime_type,
185			language,
186			line_count,
187			checksum,
188			is_symlink,
189			permissions,
190			encoding,
191			indexed_at:chrono::Utc::now(),
192			symbol_count:symbols.len() as u32,
193		},
194		symbols,
195	))
196}
197
198/// Validate file access and permissions before scanning
199pub async fn ValidateFileAccess(file_path:&PathBuf) -> bool {
200	tokio::task::spawn_blocking({
201		let file_path = file_path.to_path_buf();
202		move || {
203			// Try to read file metadata
204			let can_access = std::fs::metadata(&file_path).is_ok();
205			if can_access {
206				// Try to open file for reading
207				std::fs::File::open(&file_path).is_ok()
208			} else {
209				false
210			}
211		}
212	})
213	.await
214	.unwrap_or(false)
215}
216
217/// Calculate SHA-256 checksum for file content
218pub fn CalculateChecksum(content:&[u8]) -> String {
219	use sha2::{Digest, Sha256};
220	let mut hasher = Sha256::new();
221	hasher.update(content);
222	format!("{:x}", hasher.finalize())
223}
224
/// Render the owner/group/other permission bits of `metadata` as a
/// nine-character `rwxrwxrwx`-style string, using `-` for every unset bit.
#[cfg(unix)]
pub fn GetPermissionsString(metadata:&std::fs::Metadata) -> String {
	use std::os::unix::fs::PermissionsExt;

	// Bit masks paired with the symbol shown when the bit is set, in output
	// order: owner, group, other.
	const PERMISSION_BITS:[(u32, char); 9] = [
		(0o400, 'r'),
		(0o200, 'w'),
		(0o100, 'x'),
		(0o040, 'r'),
		(0o020, 'w'),
		(0o010, 'x'),
		(0o004, 'r'),
		(0o002, 'w'),
		(0o001, 'x'),
	];

	let mode_bits = metadata.permissions().mode();

	PERMISSION_BITS
		.iter()
		.map(|&(mask, symbol)| if mode_bits & mask != 0 { symbol } else { '-' })
		.collect()
}
247
/// Get file permissions as string for non-Unix systems.
///
/// Permission bits are not inspected on these platforms; a fixed placeholder
/// is returned instead. The placeholder has nine characters so that it has
/// the same width as the `rwxrwxrwx`-style string produced on Unix.
#[cfg(not(unix))]
pub fn GetPermissionsString(_metadata:&std::fs::Metadata) -> String { "---------".to_string() }
251
252/// Scan file and return just the metadata (without symbols)
253pub async fn ScanFileMetadata(file_path:&PathBuf) -> Result<FileMetadata> {
254	let metadata = std::fs::metadata(file_path)
255		.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
256
257	let modified = metadata
258		.modified()
259		.map_err(|e| AirError::FileSystem(format!("Failed to get modification time: {}", e)))?;
260
261	let modified_time = chrono::DateTime::<chrono::Utc>::from(modified);
262
263	Ok(FileMetadata {
264		path:file_path.clone(),
265		size:metadata.len(),
266		modified:modified_time,
267		mime_type:"application/octet-stream".to_string(),
268		language:None,
269		line_count:None,
270		checksum:String::new(),
271		is_symlink:metadata.file_type().is_symlink(),
272		permissions:GetPermissionsString(&metadata),
273		encoding:None,
274		indexed_at:chrono::Utc::now(),
275		symbol_count:0,
276	})
277}
278
279/// Check if file has been modified since last indexed
280pub fn FileModifiedSince(file_path:&PathBuf, last_indexed:chrono::DateTime<chrono::Utc>) -> Result<bool> {
281	let metadata = std::fs::metadata(file_path)
282		.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
283
284	let modified = metadata
285		.modified()
286		.map_err(|e| AirError::FileSystem(format!("Failed to get modification time: {}", e)))?;
287
288	let modified_time = chrono::DateTime::<chrono::Utc>::from(modified);
289
290	Ok(modified_time > last_indexed)
291}
292
293/// Get file size with error handling
294pub async fn GetFileSize(file_path:&PathBuf) -> Result<u64> {
295	tokio::task::spawn_blocking({
296		let file_path = file_path.to_path_buf();
297		move || {
298			let metadata = std::fs::metadata(&file_path)
299				.map_err(|e| AirError::FileSystem(format!("Failed to get file metadata: {}", e)))?;
300			Ok(metadata.len())
301		}
302	})
303	.await?
304}
305
306/// Check if file is text-based (likely to be code or documentation)
307pub fn IsTextFile(metadata:&FileMetadata) -> bool {
308	metadata.mime_type.starts_with("text/")
309		|| metadata.mime_type.contains("json")
310		|| metadata.mime_type.contains("xml")
311		|| metadata.mime_type.contains("yaml")
312		|| metadata.mime_type.contains("toml")
313		|| metadata.language.is_some()
314}
315
316/// Check if file is binary (not suitable for indexing)
317pub fn IsBinaryFile(metadata:&FileMetadata) -> bool {
318	!IsTextFile(metadata)
319		|| metadata.mime_type == "application/octet-stream"
320		|| metadata.mime_type == "application/zip"
321		|| metadata.mime_type == "application/x-tar"
322		|| metadata.mime_type == "application/x-gzip"
323		|| metadata.mime_type == "application/x-bzip2"
324}