Skip to main content

AirLibrary/Indexing/Process/
ProcessContent.rs

1//! # ProcessContent
2//!
3//! ## File: Indexing/Process/ProcessContent.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides content processing functionality for the File Indexer service,
8//! handling encoding detection, MIME type detection, and content tokenization.
9//!
10//! ## Primary Responsibility
11//!
12//! Process file content for indexing by detecting encoding, mime types, and
13//! tokenizing text for search operations.
14//!
15//! ## Secondary Responsibilities
16//!
17//! - File encoding detection (UTF-8, UTF-16, ASCII)
18//! - MIME type detection from extensions and content
19//! - Content tokenization for search indexing
20//! - Language detection for code analysis
21//!
22//! ## Dependencies
23//!
24//! **External Crates:**
25//! - None (uses std library)
26//!
27//! **Internal Modules:**
28//! - `crate::Result` - Error handling type
29//!
30//! ## Dependents
31//!
32//! - `Indexing::Scan::ScanFile` - Content processing during file scan
33//! - `Indexing::Store::StoreEntry` - Index storage operations
34//!
35//! ## VSCode Pattern Reference
36//!
37//! Inspired by VSCode's content processing in
38//! `src/vs/base/node/encoding/`
39//!
40//! ## Security Considerations
41//!
42//! - Safe BOM marker detection
43//! - Null byte filtering
44//! - Length limits on processed content
45//!
46//! ## Performance Considerations
47//!
48//! - Efficient tokenization with minimal allocations
49//! - Early termination for binary files
50//! - Lazy content evaluation
51//!
52//! ## Error Handling Strategy
53//!
54//! Content processing functions return Option or safe defaults when
55//! detection fails, rather than errors, to allow indexing to continue.
56//!
57//! ## Thread Safety
58//!
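//! Content processing functions hold no shared state and are safe to call
//! from parallel indexing tasks. All of them are pure except
//! `DetectLanguage`, which reads the file from disk when the path has no
//! extension.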
61
62use std::path::PathBuf;
63
64use crate::Result;
65
/// Detect file encoding (simplified detection).
///
/// Looks for a byte-order mark at the start of `content` and falls back to a
/// plain ASCII / UTF-8 guess when none is present.
///
/// # Returns
///
/// - `None` when `content` is empty.
/// - `Some("UTF-32 (BE)")`, `Some("UTF-32 (LE)")`, `Some("UTF-8 (BOM)")`,
///   `Some("UTF-16 (BE)")` or `Some("UTF-16 (LE)")` when the matching BOM is
///   found.
/// - `Some("ASCII")` when every byte is in the ASCII range.
/// - `Some("UTF-8")` otherwise (assumed, not validated).
///
/// Note: a UTF-16 LE stream whose first character is U+0000 is
/// indistinguishable from a UTF-32 LE BOM and is reported as UTF-32 (LE).
pub fn DetectEncoding(content:&[u8]) -> Option<String> {
	if content.is_empty() {
		return None;
	}

	// The 4-byte UTF-32 BOMs must be tested before the 2-byte UTF-16 ones:
	// the UTF-32 LE BOM (FF FE 00 00) begins with the UTF-16 LE BOM (FF FE),
	// so checking UTF-16 first would make the UTF-32 LE case unreachable.
	if content.starts_with(&[0x00, 0x00, 0xFE, 0xFF]) {
		return Some("UTF-32 (BE)".to_string());
	}

	if content.starts_with(&[0xFF, 0xFE, 0x00, 0x00]) {
		return Some("UTF-32 (LE)".to_string());
	}

	if content.starts_with(&[0xEF, 0xBB, 0xBF]) {
		return Some("UTF-8 (BOM)".to_string());
	}

	if content.starts_with(&[0xFE, 0xFF]) {
		return Some("UTF-16 (BE)".to_string());
	}

	if content.starts_with(&[0xFF, 0xFE]) {
		return Some("UTF-16 (LE)".to_string());
	}

	// No BOM: pure 7-bit content is reported as ASCII.
	if content.is_ascii() {
		return Some("ASCII".to_string());
	}

	// Assume UTF-8 for other cases
	Some("UTF-8".to_string())
}
101
102/// Detect MIME type with comprehensive file type detection
103pub fn DetectMimeType(file_path:&PathBuf, content:&[u8]) -> String {
104	if let Some(extension) = file_path.extension() {
105		match extension.to_string_lossy().to_lowercase().as_str() {
106			"rs" => "text/x-rust".to_string(),
107
108			"ts" => "text/x-typescript".to_string(),
109
110			"tsx" => "text/typescript-jsx".to_string(),
111
112			"js" => "text/javascript".to_string(),
113
114			"jsx" => "text/javascript-jsx".to_string(),
115
116			"mjs" => "text/javascript".to_string(),
117
118			"cjs" => "text/javascript".to_string(),
119
120			"json" => "application/json".to_string(),
121
122			"jsonc" => "application/json+comments".to_string(),
123
124			"toml" => "text/x-toml".to_string(),
125
126			"yaml" | "yml" => "text/x-yaml".to_string(),
127
128			"md" => "text/markdown".to_string(),
129
130			"mdx" => "text/markdown-jsx".to_string(),
131
132			"txt" => "text/plain".to_string(),
133
134			"html" | "htm" => "text/html".to_string(),
135
136			"css" => "text/css".to_string(),
137
138			"scss" => "text/x-scss".to_string(),
139
140			"sass" => "text/x-sass".to_string(),
141
142			"less" => "text/x-less".to_string(),
143
144			"xml" => "application/xml".to_string(),
145
146			"py" => "text/x-python".to_string(),
147
148			"java" => "text/x-java".to_string(),
149
150			"go" => "text/x-go".to_string(),
151
152			"sh" => "text/x-shellscript".to_string(),
153
154			"bash" => "text/x-shellscript".to_string(),
155
156			"zsh" => "text/x-shellscript".to_string(),
157
158			"fish" => "text/x-shellscript".to_string(),
159
160			"rb" => "text/x-ruby".to_string(),
161
162			"php" => "text/x-php".to_string(),
163
164			"swift" => "text/x-swift".to_string(),
165
166			"kt" | "kts" => "text/x-kotlin".to_string(),
167
168			"scala" => "text/x-scala".to_string(),
169
170			"cs" => "text/x-csharp".to_string(),
171
172			"vb" => "text/x-vbnet".to_string(),
173
174			"f#" => "text/x-fsharp".to_string(),
175
176			"r" => "text/x-r".to_string(),
177
178			"lua" => "text/x-lua".to_string(),
179
180			"pl" => "text/x-perl".to_string(),
181
182			"ps1" => "text/x-powershell".to_string(),
183
184			"sql" => "text/x-sql".to_string(),
185
186			"graphql" | "gql" => "application/graphql".to_string(),
187
188			"graphqls" => "application/graphql".to_string(),
189
190			"proto" => "text/x-protobuf".to_string(),
191
192			"wasm" => "application/wasm".to_string(),
193
194			"wat" => "text/x-wat".to_string(),
195
196			"lock" => "application/json".to_string(),
197
198			"graphqlconfig" => "application/json".to_string(),
199
200			"graphqlrc" => "application/json".to_string(),
201
202			"graphqlconfig.yaml" | "graphqlrc.yaml" => "text/x-yaml".to_string(),
203
204			"graphqlrc.yml" => "text/x-yaml".to_string(),
205
206			"graphqlconfig.json" | "graphqlrc.json" => "application/json".to_string(),
207
208			"graphqlconfig.js" | "graphqlrc.js" => "text/javascript".to_string(),
209
210			"graphqlconfig.ts" | "graphqlrc.ts" => "text/x-typescript".to_string(),
211
212			"graphqlconfig.toml" | "graphqlrc.toml" => "text/x-toml".to_string(),
213
214			_ => {
215				// Use content-based detection
216				DetectMimeTypeFromContent(content)
217			},
218		}
219	} else {
220		// No extension, try content-based detection
221		DetectMimeTypeFromContent(content)
222	}
223}
224
/// Detect MIME type from content (leading magic bytes).
///
/// The checks run in order and the first match wins:
///
/// 1. empty input -> `application/octet-stream`
/// 2. starts with `{` or `[` -> `application/json`
/// 3. starts with `#!` -> `text/x-shellscript`
/// 4. starts with `<?xml` -> `application/xml`
/// 5. starts with `<!DOCTYPE` or `<html` -> `text/html`
/// 6. starts with `---` -> `text/x-yaml`
/// 7. ASCII only and free of NUL bytes -> `text/plain`
/// 8. anything else -> `application/octet-stream`
///
/// These are prefix heuristics only; the content is not parsed.
fn DetectMimeTypeFromContent(content:&[u8]) -> String {
	let mime = if content.is_empty() {
		"application/octet-stream"
	} else if content.starts_with(b"{") || content.starts_with(b"[") {
		"application/json"
	} else if content.starts_with(b"#!") {
		"text/x-shellscript"
	} else if content.starts_with(b"<?xml") {
		"application/xml"
	} else if content.starts_with(b"<!DOCTYPE") || content.starts_with(b"<html") {
		"text/html"
	} else if content.starts_with(b"---") {
		"text/x-yaml"
	} else if content.is_ascii() && !content.contains(&0u8) {
		// NUL is itself an ASCII byte, so it has to be excluded explicitly.
		// Scanning every byte also covers inputs shorter than four bytes and
		// NULs in the final three positions, which a 4-byte sliding window
		// would never inspect.
		"text/plain"
	} else {
		"application/octet-stream"
	};

	mime.to_string()
}
247
/// Detect programming language from file extension and shebang.
///
/// When `file_path` has an extension, the language is looked up from the
/// lower-cased extension alone and the file is never opened; an unrecognised
/// extension yields `None`.
///
/// When the path has no extension, the file is read as UTF-8 text and its
/// first line is inspected for a `#!` interpreter directive. Both the direct
/// form (`#!/bin/bash`, optionally followed by arguments) and the `env` form
/// (`#!/usr/bin/env python3`, `#!/usr/bin/env -S node --flag`) are
/// understood.
///
/// # Returns
///
/// `Some(language_id)` on success; `None` when the extension or interpreter
/// is unknown, the file cannot be read as UTF-8, or it has no shebang line.
///
/// NOTE(review): the whole file is read just to look at its first line; a
/// bounded read may be preferable for large extension-less files.
pub fn DetectLanguage(file_path:&PathBuf) -> Option<String> {
	if let Some(extension) = file_path.extension() {
		let normalized = extension.to_string_lossy().to_lowercase();

		return LanguageFromExtension(&normalized).map(String::from);
	}

	// Try to detect from shebang
	let content = std::fs::read_to_string(file_path).ok()?;

	let first_line = content.lines().next()?;

	LanguageFromShebang(first_line).map(String::from)
}

/// Map a lower-cased file extension (without the dot) to a language id.
///
/// `Path::extension` never contains a `.`, so only single-segment extensions
/// are listed.
fn LanguageFromExtension(extension:&str) -> Option<&'static str> {
	let lang = match extension {
		"rs" => "rust",

		"ts" | "tsx" => "typescript",

		"js" | "jsx" | "mjs" | "cjs" => "javascript",

		"json" | "jsonc" | "graphqlconfig" | "graphqlrc" | "lock" => "json",

		"toml" => "toml",

		"yaml" | "yml" => "yaml",

		"md" | "mdx" => "markdown",

		"txt" => "plaintext",

		"html" | "htm" => "html",

		"css" => "css",

		"scss" => "scss",

		"sass" => "sass",

		"less" => "less",

		"xml" => "xml",

		"py" => "python",

		"java" => "java",

		"go" => "go",

		"sh" | "bash" | "zsh" => "shellscript",

		"fish" => "fish",

		"rb" => "ruby",

		"php" => "php",

		"swift" => "swift",

		"kt" | "kts" => "kotlin",

		"scala" => "scala",

		"cpp" | "cc" | "cxx" | "hpp" | "hxx" => "cpp",

		"c" | "h" => "c",

		"cs" => "csharp",

		"vb" => "vb",

		"f#" | "fs" | "fsi" | "fsx" => "fsharp",

		"r" | "rmd" => "r",

		"jl" => "julia",

		"lua" => "lua",

		"pl" => "perl",

		"ps1" | "psm1" | "psd1" => "powershell",

		"sql" => "sql",

		"graphql" | "gql" | "graphqls" => "graphql",

		"proto" => "protobuf",

		"wasm" => "wasm",

		"wat" => "wat",

		"clj" | "cljs" | "cljc" | "edn" => "clojure",

		"hs" | "lhs" => "haskell",

		"erl" | "hrl" => "erlang",

		"ex" | "exs" => "elixir",

		"dart" => "dart",

		"nim" => "nim",

		"v" => "v",

		"zig" => "zig",

		"odin" => "odin",

		"mojo" => "mojo",

		_ => return None,
	};

	Some(lang)
}

/// Map the first line of a script to a language id via its `#!` directive.
///
/// Returns `None` when the line is not a shebang or names an interpreter that
/// is not in the table.
fn LanguageFromShebang(first_line:&str) -> Option<&'static str> {
	let command = first_line.strip_prefix("#!")?;

	let mut words = command.split_whitespace();

	// The first word is the interpreter path; only its final component matters.
	let mut interpreter = words.next()?.rsplit('/').next().unwrap_or("");

	if interpreter == "env" {
		// `env` launches the next word that is neither an option (`-S`, `-i`,
		// ...) nor a `NAME=value` assignment.
		interpreter = words
			.find(|word| !word.starts_with('-') && !word.contains('='))?
			.rsplit('/')
			.next()
			.unwrap_or("");
	}

	let lang = match interpreter {
		"bash" | "sh" | "zsh" => "shellscript",

		"fish" => "fish",

		"python" | "python2" | "python3" => "python",

		"node" => "javascript",

		"ruby" => "ruby",

		"perl" => "perl",

		"php" => "php",

		"lua" => "lua",

		"r" | "Rscript" => "r",

		"julia" => "julia",

		"rust" | "rustc" => "rust",

		"go" => "go",

		"java" => "java",

		"scala" | "scalac" => "scala",

		"kotlin" | "kotlinc" => "kotlin",

		"swift" => "swift",

		_ => return None,
	};

	Some(lang)
}
411
/// Split `content` into lower-cased search tokens.
///
/// A token is a maximal run of alphanumeric characters (Unicode-aware, per
/// `char::is_alphanumeric`) and underscores. Every other character acts as a
/// separator and is discarded. Tokens are returned in order of appearance,
/// duplicates included.
pub fn TokenizeContent(content:&str) -> Vec<String> {
	content
		// Anything that is not a word character ends the current token.
		.split(|c:char| !(c.is_alphanumeric() || c == '_'))
		// Adjacent separators produce empty pieces; those are not tokens.
		.filter(|piece| !piece.is_empty())
		.map(str::to_lowercase)
		.collect()
}
442
/// Strip NUL and every other control character from `content`.
///
/// "Control character" follows `char::is_control`, so line breaks (`\n`,
/// `\r`) and tabs are removed as well, not just NUL bytes.
pub fn SanitizeContent(content:&str) -> String {
	let mut cleaned = String::with_capacity(content.len());

	for ch in content.chars() {
		if ch == '\0' || ch.is_control() {
			continue;
		}

		cleaned.push(ch);
	}

	cleaned
}
445
446/// Convert content to UTF-8 string with error handling
447pub fn ContentToString(content:&[u8]) -> Result<String> {
448	String::from_utf8(content.to_vec())
449		.map_err(|e| crate::AirError::FileSystem(format!("Invalid UTF-8 content: {}", e)))
450}
451
/// Heuristically decide whether `content` looks like binary data.
///
/// Only the first `BINARY_SCAN_LIMIT` bytes are examined. The content is
/// reported as binary when that prefix holds more than `MAX_NULL_BYTES` NUL
/// bytes, or when fewer than 70% of its bytes are text-like (printable ASCII,
/// ASCII whitespace, or any byte >= 0x80). Empty input is reported as not
/// binary.
pub fn IsBinaryContent(content:&[u8]) -> bool {
	const MAX_NULL_BYTES:usize = 10;

	const BINARY_SCAN_LIMIT:usize = 8000;

	let head = &content[..content.len().min(BINARY_SCAN_LIMIT)];

	// Single pass: tally NUL bytes and text-like bytes together. A NUL is
	// never text-like, so the two counters are disjoint.
	let mut nulls = 0usize;

	let mut textual = 0usize;

	for &byte in head {
		if byte == 0 {
			nulls += 1;
		} else if byte.is_ascii_graphic() || byte.is_ascii_whitespace() || byte >= 0x80 {
			textual += 1;
		}
	}

	if nulls > MAX_NULL_BYTES {
		return true;
	}

	// For an empty prefix this is 0.0 / 0.0 = NaN, and `NaN < 0.7` is false.
	let text_ratio = textual as f64 / head.len() as f64;

	text_ratio < 0.7
}
478
/// Count the lines in `content` as defined by `str::lines`.
///
/// A trailing line terminator does not start an extra line, and an empty
/// string has zero lines. The count is narrowed to `u32` with `as`.
pub fn GetLineCount(content:&str) -> u32 {
	// `str::lines` yields nothing for "", so no separate empty check is needed.
	let total = content.lines().count();

	total as u32
}
487
/// Count the Unicode scalar values (`char`s) in `content`.
///
/// This is the number of `char`s, not the UTF-8 byte length.
pub fn GetCharCount(content:&str) -> usize {
	let scalar_values = content.chars();

	scalar_values.count()
}
490
/// Truncate content to at most `max_chars` characters.
///
/// The limit is counted in Unicode scalar values (`char`s), so the cut never
/// lands inside a multi-byte UTF-8 sequence. Content that is already short
/// enough is returned unchanged.
///
/// # Parameters
///
/// - `content`: text to truncate.
/// - `max_chars`: maximum number of `char`s to keep; `0` yields an empty
///   string.
pub fn TruncateContent(content:&str, max_chars:usize) -> String {
	// The byte offset of the first character past the limit is where the kept
	// prefix ends. Slicing there avoids building an intermediate `Vec<char>`.
	let kept = match content.char_indices().nth(max_chars) {
		Some((byte_index, _)) => &content[..byte_index],

		None => content,
	};

	kept.to_string()
}