AirLibrary/Indexing/State/
CreateState.rs

1//! # CreateState
2//!
3//! ## File: Indexing/State/CreateState.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides state creation functions for the File Indexer service, including
8//! the construction of index entries, symbols, and related data structures
9//! used throughout the indexing system.
10//!
11//! ## Primary Responsibility
12//!
13//! Create and initialize index state structures including FileIndex,
14//! FileMetadata, SymbolInfo, and related types.
15//!
16//! ## Secondary Responsibilities
17//!
18//! - Generate index version strings
19//! - Calculate index checksums for integrity verification
20//! - Create new empty indexes
21//! - Backup corrupted indexes
22//!
23//! ## Dependencies
24//!
25//! **External Crates:**
26//! - `chrono` - Timestamp generation for index metadata
27//! - `sha2` - Checksum calculation for index integrity
28//! - `serde` - Serialization/deserialization of index structures
29//!
30//! **Internal Modules:**
31//! - `crate::Result` - Error handling type
32//! - `crate::AirError` - Error types
33//!
34//! ## Dependents
35//!
36//! - `Indexing::Store::StoreEntry` - Creates entries for index storage
37//! - `Indexing::Store::UpdateIndex` - Updates index state
38//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
39//!
40//! ## VSCode Pattern Reference
41//!
42//! Inspired by VSCode's indexer state creation in
43//! `src/vs/workbench/services/search/common/`
44//!
45//! ## Security Considerations
46//!
47//! - Checksums prevent tampering with index data
48//! - Version tracking enables corruption detection
49//! - Path traversal protection applied during validation
50//!
51//! ## Performance Considerations
52//!
53//! - Lightweight state creation operations
54//! - Hash calculations are amortized across index operations
55//! - Memory-efficient data structures for large indexes
56//!
57//! ## Error Handling Strategy
58//!
59//! State creation operations use result types and propagate errors up
60//! with clear messages about what failed during creation or validation.
61//!
62//! ## Thread Safety
63//!
//! State structures are designed to be moved into `Arc<RwLock<>>` for
65//! thread-safe shared access across indexing and search operations.
66
67use std::{
68	collections::{HashMap, HashSet},
69	path::PathBuf,
70};
71#[cfg(unix)]
72use std::os::unix::fs::PermissionsExt;
73
74use serde::{Deserialize, Serialize};
75use sha2::{Digest, Sha256};
76
77use crate::{AirError, Result};
78
/// Maximum file size allowed for indexing (100MB, i.e. 100 * 1024 * 1024
/// bytes).
///
/// `ValidateFileSize` treats this as an inclusive upper bound: a file of
/// exactly this size is accepted, anything larger is rejected.
pub const MAX_FILE_SIZE_BYTES:u64 = 100 * 1024 * 1024;
81
/// Symbol information extracted from files for VSCode Outline View
///
/// Values of this type are stored per file in `FileIndex::file_symbols` and
/// are embedded in every [`SymbolLocation`] held by `FileIndex::symbol_index`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolInfo {
	/// Symbol name (function, class, variable, etc.)
	pub name:String,
	/// Symbol kind (function, class, struct, interface, etc.)
	pub kind:SymbolKind,
	/// Line number where symbol is defined
	///
	/// NOTE(review): whether this is 0- or 1-based is not established in this
	/// file — confirm against the symbol extractor.
	pub line:u32,
	/// Column number of the definition (same origin caveat as `line`)
	pub column:u32,
	/// Fully qualified path
	///
	/// NOTE(review): presumably the qualified symbol path rather than a
	/// filesystem path (it is a `String`, not a `PathBuf`) — confirm.
	pub full_path:String,
}
96
/// Symbol kind for VSCode compatibility
///
/// Every variant carries an explicit discriminant in the range `0..=25`.
/// NOTE(review): the numbering presumably mirrors VSCode's own `SymbolKind`
/// enumeration — confirm before relying on the numeric values.
///
/// NOTE(review): with a plain `serde` derive, unit variants are normally
/// serialized by name rather than by discriminant — verify if numeric wire
/// values are expected by any consumer.
#[derive(Debug, Clone, Serialize, Deserialize, Hash, Eq, PartialEq)]
pub enum SymbolKind {
	File = 0,
	Module = 1,
	Namespace = 2,
	Package = 3,
	Class = 4,
	Method = 5,
	Property = 6,
	Field = 7,
	Constructor = 8,
	Enum = 9,
	Interface = 10,
	Function = 11,
	Variable = 12,
	Constant = 13,
	String = 14,
	Number = 15,
	Boolean = 16,
	Array = 17,
	Object = 18,
	Key = 19,
	Null = 20,
	EnumMember = 21,
	Struct = 22,
	Event = 23,
	Operator = 24,
	TypeParameter = 25,
}
127
/// Symbol location for cross-referencing
///
/// Pairs a [`SymbolInfo`] with the file it was found in. These entries are the
/// values of `FileIndex::symbol_index`, keyed by symbol name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolLocation {
	/// File containing the symbol
	pub file_path:PathBuf,
	/// Line number
	///
	/// Stored independently of `symbol.line`; nothing in this file enforces
	/// that the two values agree.
	pub line:u32,
	/// Symbol information
	pub symbol:SymbolInfo,
}
138
/// File metadata with comprehensive information
///
/// One record per indexed file; stored as the values of `FileIndex::files`.
/// `CreateFileMetadata` builds these and stamps `indexed_at` itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileMetadata {
	/// File path
	pub path:PathBuf,
	/// File size in bytes
	pub size:u64,
	/// Last modification timestamp (UTC)
	pub modified:chrono::DateTime<chrono::Utc>,
	/// MIME type
	pub mime_type:String,
	/// Detected programming language (`None` when no language was detected)
	pub language:Option<String>,
	/// Line count for text files
	pub line_count:Option<u32>,
	/// SHA-256 checksum for change detection
	pub checksum:String,
	/// Whether file is a symbolic link
	pub is_symlink:bool,
	/// File permissions (format: "rwxrwxrwx")
	///
	/// This is the nine-character owner/group/other layout that the Unix
	/// variant of `GetPermissionsString` produces.
	pub permissions:String,
	/// File encoding (UTF-8, ASCII, etc.)
	pub encoding:Option<String>,
	/// Last indexed timestamp (UTC)
	pub indexed_at:chrono::DateTime<chrono::Utc>,
	/// Number of symbols extracted
	pub symbol_count:u32,
}
167
/// File index structure with comprehensive metadata
///
/// Aggregates the per-file metadata together with the content and symbol
/// lookup tables. `CreateNewIndex` returns an instance with every map empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileIndex {
	/// Indexed files with complete metadata
	pub files:HashMap<PathBuf, FileMetadata>,
	/// Content index for fast text search
	/// Maps words/tokens to file paths where they appear
	pub content_index:HashMap<String, Vec<PathBuf>>,
	/// Symbol index for VSCode Outline View and Go to Symbol
	/// Maps symbol names to their definitions
	pub symbol_index:HashMap<String, Vec<SymbolLocation>>,
	/// Reverse symbol index for cross-referencing
	/// Maps each file path to the symbols extracted from that file; this is
	/// the map `ValidateIndexSize` sums over to obtain the total symbol count
	pub file_symbols:HashMap<PathBuf, Vec<SymbolInfo>>,
	/// Last update timestamp for all indexes
	pub last_updated:chrono::DateTime<chrono::Utc>,
	/// Index version for corruption detection
	/// `GenerateIndexVersion` produces values of the form
	/// `<crate version>-<unix seconds>`
	pub index_version:String,
	/// Index checksum for integrity verification
	/// Left as an empty string by `CreateNewIndex`.
	/// NOTE(review): presumably filled in later from `CalculateIndexChecksum`
	/// — confirm at the call sites.
	pub index_checksum:String,
}
188
189/// Create a new empty file index
190pub fn CreateNewIndex() -> FileIndex {
191	FileIndex {
192		files:HashMap::new(),
193		content_index:HashMap::new(),
194		symbol_index:HashMap::new(),
195		file_symbols:HashMap::new(),
196		last_updated:chrono::Utc::now(),
197		index_version:GenerateIndexVersion(),
198		index_checksum:String::new(),
199	}
200}
201
202/// Generate index version string
203pub fn GenerateIndexVersion() -> String { format!("{}-{}", env!("CARGO_PKG_VERSION"), chrono::Utc::now().timestamp()) }
204
205/// Calculate index checksum for integrity verification
206pub fn CalculateIndexChecksum(index:&FileIndex) -> Result<String> {
207	let checksum_input = format!(
208		"{}:{}:{}:{}",
209		index.files.len(),
210		index.content_index.len(),
211		index.symbol_index.len(),
212		index.last_updated.timestamp()
213	);
214
215	let mut hasher = Sha256::new();
216	hasher.update(checksum_input.as_bytes());
217	Ok(format!("{:x}", hasher.finalize()))
218}
219
220/// Create file metadata from raw information
221pub fn CreateFileMetadata(
222	path:PathBuf,
223	size:u64,
224	modified:chrono::DateTime<chrono::Utc>,
225	mime_type:String,
226	language:Option<String>,
227	line_count:Option<u32>,
228	checksum:String,
229	is_symlink:bool,
230	permissions:String,
231	encoding:Option<String>,
232	symbol_count:u32,
233) -> FileMetadata {
234	FileMetadata {
235		path,
236		size,
237		modified,
238		mime_type,
239		language,
240		line_count,
241		checksum,
242		is_symlink,
243		permissions,
244		encoding,
245		indexed_at:chrono::Utc::now(),
246		symbol_count,
247	}
248}
249
250/// Create symbol info with validation
251pub fn CreateSymbolInfo(name:String, kind:SymbolKind, line:u32, column:u32, full_path:String) -> SymbolInfo {
252	SymbolInfo { name, kind, line, column, full_path }
253}
254
255/// Create symbol location for cross-referencing
256pub fn CreateSymbolLocation(file_path:PathBuf, line:u32, symbol:SymbolInfo) -> SymbolLocation {
257	SymbolLocation { file_path, line, symbol }
258}
259
/// Render the Unix permission bits of `metadata` as a nine-character string
/// such as `"rwxr-xr--"`.
///
/// The characters are, in order, read/write/execute for the owner, then the
/// group, then everyone else. A set bit yields its letter and a cleared bit
/// yields `'-'`. Only the nine low permission bits are inspected; any other
/// bits in the mode are ignored.
#[cfg(unix)]
pub fn GetPermissionsString(metadata:&std::fs::Metadata) -> String {
	// (mask, letter) pairs ordered from the most significant permission bit
	// (owner read) down to the least significant (other execute).
	const PERMISSION_BITS:[(u32, char); 9] = [
		(0o400, 'r'),
		(0o200, 'w'),
		(0o100, 'x'),
		(0o040, 'r'),
		(0o020, 'w'),
		(0o010, 'x'),
		(0o004, 'r'),
		(0o002, 'w'),
		(0o001, 'x'),
	];

	let mode = metadata.permissions().mode();

	PERMISSION_BITS
		.iter()
		.map(|&(mask, letter)| if mode & mask != 0 { letter } else { '-' })
		.collect()
}
281
/// Get file permissions as string for non-Unix systems
///
/// Permission bits are not inspected on these platforms, so the metadata
/// argument is ignored and a fixed placeholder is returned. The placeholder is
/// nine dashes so that it has the same length as the `"rwxrwxrwx"` format
/// documented on `FileMetadata::permissions` and produced by the Unix variant
/// of this function.
#[cfg(not(unix))]
pub fn GetPermissionsString(_metadata:&std::fs::Metadata) -> String { String::from("---------") }
285
286/// Validate file size against maximum allowed
287pub fn ValidateFileSize(size:u64) -> Result<()> {
288	if size > MAX_FILE_SIZE_BYTES {
289		return Err(AirError::FileSystem(format!(
290			"File size {} exceeds maximum allowed size of {} bytes",
291			size, MAX_FILE_SIZE_BYTES
292		)));
293	}
294	Ok(())
295}
296
297/// Check if index size is within sane limits
298pub fn ValidateIndexSize(index:&FileIndex) -> Result<()> {
299	const MAX_INDEXED_FILES:usize = 1_000_000;
300	const MAX_SYMBOLS:usize = 10_000_000;
301
302	if index.files.len() > MAX_INDEXED_FILES {
303		return Err(AirError::Internal(format!(
304			"Index exceeds maximum file count: {} > {}",
305			index.files.len(),
306			MAX_INDEXED_FILES
307		)));
308	}
309
310	let total_symbols:usize = index.file_symbols.values().map(|v| v.len()).sum();
311	if total_symbols > MAX_SYMBOLS {
312		return Err(AirError::Internal(format!(
313			"Index exceeds maximum symbol count: {} > {}",
314			total_symbols, MAX_SYMBOLS
315		)));
316	}
317
318	Ok(())
319}