Skip to main content

AirLibrary/Indexing/State/
CreateState.rs

1//! # CreateState
2//!
3//! ## File: Indexing/State/CreateState.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides state creation functions for the File Indexer service, including
8//! the construction of index entries, symbols, and related data structures
9//! used throughout the indexing system.
10//!
11//! ## Primary Responsibility
12//!
13//! Create and initialize index state structures including FileIndex,
14//! FileMetadata, SymbolInfo, and related types.
15//!
16//! ## Secondary Responsibilities
17//!
18//! - Generate index version strings
19//! - Calculate index checksums for integrity verification
20//! - Create new empty indexes
//! - Back up corrupted indexes
22//!
23//! ## Dependencies
24//!
25//! **External Crates:**
26//! - `chrono` - Timestamp generation for index metadata
27//! - `sha2` - Checksum calculation for index integrity
28//! - `serde` - Serialization/deserialization of index structures
29//!
30//! **Internal Modules:**
31//! - `crate::Result` - Error handling type
32//! - `crate::AirError` - Error types
33//!
34//! ## Dependents
35//!
36//! - `Indexing::Store::StoreEntry` - Creates entries for index storage
37//! - `Indexing::Store::UpdateIndex` - Updates index state
38//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
39//!
40//! ## VSCode Pattern Reference
41//!
42//! Inspired by VSCode's indexer state creation in
43//! `src/vs/workbench/services/search/common/`
44//!
45//! ## Security Considerations
46//!
//! - Checksums detect accidental corruption of index data (the hash is
//!   unkeyed, so it is not a defence against deliberate tampering)
48//! - Version tracking enables corruption detection
49//! - Path traversal protection applied during validation
50//!
51//! ## Performance Considerations
52//!
53//! - Lightweight state creation operations
54//! - Hash calculations are amortized across index operations
55//! - Memory-efficient data structures for large indexes
56//!
57//! ## Error Handling Strategy
58//!
59//! State creation operations use result types and propagate errors up
60//! with clear messages about what failed during creation or validation.
61//!
62//! ## Thread Safety
63//!
64//! State structures are designed to be moved into Arc<RwLock<>> for
65//! thread-safe shared access across indexing and search operations.
66
67use std::{collections::HashMap, path::PathBuf};
68#[cfg(unix)]
69use std::os::unix::fs::PermissionsExt;
70
71use serde::{Deserialize, Serialize};
72use sha2::{Digest, Sha256};
73
74use crate::{AirError, Result};
75
/// Largest file, in bytes, that [`ValidateFileSize`] accepts for indexing:
/// 100 MiB (104,857,600 bytes).
pub const MAX_FILE_SIZE_BYTES:u64 = {
	const BYTES_PER_MIB:u64 = 1024 * 1024;

	100 * BYTES_PER_MIB
};
78
79/// Symbol information extracted from files for VSCode Outline View
80#[derive(Debug, Clone, Serialize, Deserialize)]
81pub struct SymbolInfo {
82	/// Symbol name (function, class, variable, etc.)
83	pub name:String,
84
85	/// Symbol kind (function, class, struct, interface, etc.)
86	pub kind:SymbolKind,
87
88	/// Line number where symbol is defined
89	pub line:u32,
90
91	/// Column number
92	pub column:u32,
93
94	/// Full qualified path
95	pub full_path:String,
96}
97
98/// Symbol kind for VSCode compatibility
99#[derive(Debug, Clone, Serialize, Deserialize, Hash, Eq, PartialEq)]
100pub enum SymbolKind {
101	File = 0,
102
103	Module = 1,
104
105	Namespace = 2,
106
107	Package = 3,
108
109	Class = 4,
110
111	Method = 5,
112
113	Property = 6,
114
115	Field = 7,
116
117	Constructor = 8,
118
119	Enum = 9,
120
121	Interface = 10,
122
123	Function = 11,
124
125	Variable = 12,
126
127	Constant = 13,
128
129	String = 14,
130
131	Number = 15,
132
133	Boolean = 16,
134
135	Array = 17,
136
137	Object = 18,
138
139	Key = 19,
140
141	Null = 20,
142
143	EnumMember = 21,
144
145	Struct = 22,
146
147	Event = 23,
148
149	Operator = 24,
150
151	TypeParameter = 25,
152}
153
154/// Symbol location for cross-referencing
155#[derive(Debug, Clone, Serialize, Deserialize)]
156pub struct SymbolLocation {
157	/// File containing the symbol
158	pub file_path:PathBuf,
159
160	/// Line number
161	pub line:u32,
162
163	/// Symbol information
164	pub symbol:SymbolInfo,
165}
166
167/// File metadata with comprehensive information
168#[derive(Debug, Clone, Serialize, Deserialize)]
169pub struct FileMetadata {
170	/// File path
171	pub path:PathBuf,
172
173	/// File size in bytes
174	pub size:u64,
175
176	/// Last modification timestamp
177	pub modified:chrono::DateTime<chrono::Utc>,
178
179	/// MIME type
180	pub mime_type:String,
181
182	/// Detected programming language
183	pub language:Option<String>,
184
185	/// Line count for text files
186	pub line_count:Option<u32>,
187
188	/// SHA-256 checksum for change detection
189	pub checksum:String,
190
191	/// Whether file is a symbolic link
192	pub is_symlink:bool,
193
194	/// File permissions (format: "rwxrwxrwx")
195	pub permissions:String,
196
197	/// File encoding (UTF-8, ASCII, etc.)
198	pub encoding:Option<String>,
199
200	/// Last indexed timestamp
201	pub indexed_at:chrono::DateTime<chrono::Utc>,
202
203	/// Number of symbols extracted
204	pub symbol_count:u32,
205}
206
207/// File index structure with comprehensive metadata
208#[derive(Debug, Clone, Serialize, Deserialize)]
209pub struct FileIndex {
210	/// Indexed files with complete metadata
211	pub files:HashMap<PathBuf, FileMetadata>,
212
213	/// Content index for fast text search
214	/// Maps words/tokens to file paths where they appear
215	pub content_index:HashMap<String, Vec<PathBuf>>,
216
217	/// Symbol index for VSCode Outline View and Go to Symbol
218	/// Maps symbol names to their definitions
219	pub symbol_index:HashMap<String, Vec<SymbolLocation>>,
220
221	/// Reverse symbol index for cross-referencing
222	pub file_symbols:HashMap<PathBuf, Vec<SymbolInfo>>,
223
224	/// Last update timestamp for all indexes
225	pub last_updated:chrono::DateTime<chrono::Utc>,
226
227	/// Index version for corruption detection
228	pub index_version:String,
229
230	/// Index checksum for integrity verification
231	pub index_checksum:String,
232}
233
234/// Create a new empty file index
235pub fn CreateNewIndex() -> FileIndex {
236	FileIndex {
237		files:HashMap::new(),
238
239		content_index:HashMap::new(),
240
241		symbol_index:HashMap::new(),
242
243		file_symbols:HashMap::new(),
244
245		last_updated:chrono::Utc::now(),
246
247		index_version:GenerateIndexVersion(),
248
249		index_checksum:String::new(),
250	}
251}
252
253/// Generate index version string
254pub fn GenerateIndexVersion() -> String { format!("{}-{}", env!("CARGO_PKG_VERSION"), chrono::Utc::now().timestamp()) }
255
256/// Calculate index checksum for integrity verification
257pub fn CalculateIndexChecksum(index:&FileIndex) -> Result<String> {
258	let checksum_input = format!(
259		"{}:{}:{}:{}",
260		index.files.len(),
261		index.content_index.len(),
262		index.symbol_index.len(),
263		index.last_updated.timestamp()
264	);
265
266	let mut hasher = Sha256::new();
267
268	hasher.update(checksum_input.as_bytes());
269
270	// sha2 0.11: digest output is `hybrid_array::Array` which has no
271	// `LowerHex` impl; `hex::encode` is the 1:1 replacement.
272	Ok(hex::encode(hasher.finalize()))
273}
274
275/// Create file metadata from raw information
276pub fn CreateFileMetadata(
277	path:PathBuf,
278
279	size:u64,
280
281	modified:chrono::DateTime<chrono::Utc>,
282
283	mime_type:String,
284
285	language:Option<String>,
286
287	line_count:Option<u32>,
288
289	checksum:String,
290
291	is_symlink:bool,
292
293	permissions:String,
294
295	encoding:Option<String>,
296
297	symbol_count:u32,
298) -> FileMetadata {
299	FileMetadata {
300		path,
301
302		size,
303
304		modified,
305
306		mime_type,
307
308		language,
309
310		line_count,
311
312		checksum,
313
314		is_symlink,
315
316		permissions,
317
318		encoding,
319
320		indexed_at:chrono::Utc::now(),
321
322		symbol_count,
323	}
324}
325
326/// Create symbol info with validation
327pub fn CreateSymbolInfo(name:String, kind:SymbolKind, line:u32, column:u32, full_path:String) -> SymbolInfo {
328	SymbolInfo { name, kind, line, column, full_path }
329}
330
331/// Create symbol location for cross-referencing
332pub fn CreateSymbolLocation(file_path:PathBuf, line:u32, symbol:SymbolInfo) -> SymbolLocation {
333	SymbolLocation { file_path, line, symbol }
334}
335
/// Render the permission bits of `metadata` as a nine-character string such
/// as `rwxr-xr--` (Unix only).
///
/// The characters are, in order, read/write/execute for the owner, the group
/// and others. A set bit yields its letter and a cleared bit yields `-`.
/// Only the nine low permission bits of the mode are inspected.
#[cfg(unix)]
pub fn GetPermissionsString(metadata:&std::fs::Metadata) -> String {
	// Mask and letter for each output position, from left to right.
	const PERMISSION_BITS:[(u32, char); 9] = [
		// Owner
		(0o400, 'r'),
		(0o200, 'w'),
		(0o100, 'x'),
		// Group
		(0o040, 'r'),
		(0o020, 'w'),
		(0o010, 'x'),
		// Others
		(0o004, 'r'),
		(0o002, 'w'),
		(0o001, 'x'),
	];

	let mode = metadata.permissions().mode();

	PERMISSION_BITS
		.iter()
		.map(|&(mask, letter)| if mode & mask != 0 { letter } else { '-' })
		.collect()
}
368
/// Placeholder permission string for non-Unix systems.
///
/// `_metadata` is ignored. The result is always nine dashes (`---------`),
/// one for each position of the `rwxrwxrwx` format, so the string has the
/// same length on every platform.
#[cfg(not(unix))]
pub fn GetPermissionsString(_metadata:&std::fs::Metadata) -> String { "---------".to_string() }
372
373/// Validate file size against maximum allowed
374pub fn ValidateFileSize(size:u64) -> Result<()> {
375	if size > MAX_FILE_SIZE_BYTES {
376		return Err(AirError::FileSystem(format!(
377			"File size {} exceeds maximum allowed size of {} bytes",
378			size, MAX_FILE_SIZE_BYTES
379		)));
380	}
381
382	Ok(())
383}
384
385/// Check if index size is within sane limits
386pub fn ValidateIndexSize(index:&FileIndex) -> Result<()> {
387	const MAX_INDEXED_FILES:usize = 1_000_000;
388
389	const MAX_SYMBOLS:usize = 10_000_000;
390
391	if index.files.len() > MAX_INDEXED_FILES {
392		return Err(AirError::Internal(format!(
393			"Index exceeds maximum file count: {} > {}",
394			index.files.len(),
395			MAX_INDEXED_FILES
396		)));
397	}
398
399	let total_symbols:usize = index.file_symbols.values().map(|v| v.len()).sum();
400
401	if total_symbols > MAX_SYMBOLS {
402		return Err(AirError::Internal(format!(
403			"Index exceeds maximum symbol count: {} > {}",
404			total_symbols, MAX_SYMBOLS
405		)));
406	}
407
408	Ok(())
409}