AirLibrary/Indexing/Process/
ProcessContent.rs1use std::path::PathBuf;
63
64use crate::Result;
65
/// Guesses the text encoding of `content` from its leading bytes.
///
/// Byte-order marks are checked first. The UTF-32 marks are tested before the
/// UTF-16 ones because the UTF-32 (LE) mark `FF FE 00 00` begins with the
/// UTF-16 (LE) mark `FF FE`; testing UTF-16 first would make UTF-32 (LE)
/// impossible to report.
///
/// # Returns
///
/// * `None` when `content` is empty.
/// * `Some(label)` otherwise, where `label` is one of `"UTF-8 (BOM)"`,
///   `"UTF-32 (BE)"`, `"UTF-32 (LE)"`, `"UTF-16 (BE)"`, `"UTF-16 (LE)"`,
///   `"ASCII"` (every byte is below `0x80`) or `"UTF-8"`.
///
/// The final `"UTF-8"` label is a fallback only: the bytes are not validated
/// as well-formed UTF-8.
pub fn DetectEncoding(content:&[u8]) -> Option<String> {
    // Ordered so that a mark which is a prefix of another comes after it.
    const BYTE_ORDER_MARKS:[(&[u8], &str); 5] = [
        (&[0xEF, 0xBB, 0xBF], "UTF-8 (BOM)"),
        (&[0x00, 0x00, 0xFE, 0xFF], "UTF-32 (BE)"),
        (&[0xFF, 0xFE, 0x00, 0x00], "UTF-32 (LE)"),
        (&[0xFE, 0xFF], "UTF-16 (BE)"),
        (&[0xFF, 0xFE], "UTF-16 (LE)"),
    ];

    if content.is_empty() {
        return None;
    }

    for &(mark, label) in BYTE_ORDER_MARKS.iter() {
        if content.starts_with(mark) {
            return Some(label.to_string());
        }
    }

    let label = if content.is_ascii() { "ASCII" } else { "UTF-8" };

    Some(label.to_string())
}
101
102pub fn DetectMimeType(file_path:&PathBuf, content:&[u8]) -> String {
104 if let Some(extension) = file_path.extension() {
105 match extension.to_string_lossy().to_lowercase().as_str() {
106 "rs" => "text/x-rust".to_string(),
107
108 "ts" => "text/x-typescript".to_string(),
109
110 "tsx" => "text/typescript-jsx".to_string(),
111
112 "js" => "text/javascript".to_string(),
113
114 "jsx" => "text/javascript-jsx".to_string(),
115
116 "mjs" => "text/javascript".to_string(),
117
118 "cjs" => "text/javascript".to_string(),
119
120 "json" => "application/json".to_string(),
121
122 "jsonc" => "application/json+comments".to_string(),
123
124 "toml" => "text/x-toml".to_string(),
125
126 "yaml" | "yml" => "text/x-yaml".to_string(),
127
128 "md" => "text/markdown".to_string(),
129
130 "mdx" => "text/markdown-jsx".to_string(),
131
132 "txt" => "text/plain".to_string(),
133
134 "html" | "htm" => "text/html".to_string(),
135
136 "css" => "text/css".to_string(),
137
138 "scss" => "text/x-scss".to_string(),
139
140 "sass" => "text/x-sass".to_string(),
141
142 "less" => "text/x-less".to_string(),
143
144 "xml" => "application/xml".to_string(),
145
146 "py" => "text/x-python".to_string(),
147
148 "java" => "text/x-java".to_string(),
149
150 "go" => "text/x-go".to_string(),
151
152 "sh" => "text/x-shellscript".to_string(),
153
154 "bash" => "text/x-shellscript".to_string(),
155
156 "zsh" => "text/x-shellscript".to_string(),
157
158 "fish" => "text/x-shellscript".to_string(),
159
160 "rb" => "text/x-ruby".to_string(),
161
162 "php" => "text/x-php".to_string(),
163
164 "swift" => "text/x-swift".to_string(),
165
166 "kt" | "kts" => "text/x-kotlin".to_string(),
167
168 "scala" => "text/x-scala".to_string(),
169
170 "cs" => "text/x-csharp".to_string(),
171
172 "vb" => "text/x-vbnet".to_string(),
173
174 "f#" => "text/x-fsharp".to_string(),
175
176 "r" => "text/x-r".to_string(),
177
178 "lua" => "text/x-lua".to_string(),
179
180 "pl" => "text/x-perl".to_string(),
181
182 "ps1" => "text/x-powershell".to_string(),
183
184 "sql" => "text/x-sql".to_string(),
185
186 "graphql" | "gql" => "application/graphql".to_string(),
187
188 "graphqls" => "application/graphql".to_string(),
189
190 "proto" => "text/x-protobuf".to_string(),
191
192 "wasm" => "application/wasm".to_string(),
193
194 "wat" => "text/x-wat".to_string(),
195
196 "lock" => "application/json".to_string(),
197
198 "graphqlconfig" => "application/json".to_string(),
199
200 "graphqlrc" => "application/json".to_string(),
201
202 "graphqlconfig.yaml" | "graphqlrc.yaml" => "text/x-yaml".to_string(),
203
204 "graphqlrc.yml" => "text/x-yaml".to_string(),
205
206 "graphqlconfig.json" | "graphqlrc.json" => "application/json".to_string(),
207
208 "graphqlconfig.js" | "graphqlrc.js" => "text/javascript".to_string(),
209
210 "graphqlconfig.ts" | "graphqlrc.ts" => "text/x-typescript".to_string(),
211
212 "graphqlconfig.toml" | "graphqlrc.toml" => "text/x-toml".to_string(),
213
214 _ => {
215 DetectMimeTypeFromContent(content)
217 },
218 }
219 } else {
220 DetectMimeTypeFromContent(content)
222 }
223}
224
/// Sniffs a MIME type from the leading bytes of `content`.
///
/// Checks are applied in order and the first match wins:
///
/// 1. empty input -> `application/octet-stream`
/// 2. starts with `{` or `[` -> `application/json`
/// 3. starts with `#!` -> `text/x-shellscript`
/// 4. starts with `<?xml` -> `application/xml`
/// 5. starts with `<!DOCTYPE` or `<html` -> `text/html`
/// 6. starts with `---` -> `text/x-yaml`
/// 7. pure ASCII without any NUL byte -> `text/plain`
/// 8. anything else -> `application/octet-stream`
///
/// The prefixes are matched at byte 0 exactly; leading whitespace or a
/// byte-order mark is not skipped.
fn DetectMimeTypeFromContent(content:&[u8]) -> String {
    let mime = if content.is_empty() {
        "application/octet-stream"
    } else if content.starts_with(b"{") || content.starts_with(b"[") {
        "application/json"
    } else if content.starts_with(b"#!") {
        "text/x-shellscript"
    } else if content.starts_with(b"<?xml") {
        "application/xml"
    } else if content.starts_with(b"<!DOCTYPE") || content.starts_with(b"<html") {
        "text/html"
    } else if content.starts_with(b"---") {
        "text/x-yaml"
    } else if content.is_ascii() && !content.contains(&0) {
        // NUL counts as ASCII, so it has to be excluded explicitly. Every byte
        // is inspected, including the last three and inputs shorter than four
        // bytes.
        "text/plain"
    } else {
        "application/octet-stream"
    };

    mime.to_string()
}
247
/// Identifies the programming language of the file at `file_path`.
///
/// * When the path has an extension, only the extension decides (compared
///   case-insensitively); an unknown extension yields `None` and the file is
///   never opened.
/// * When the path has no extension, the beginning of the file is read and its
///   first line is interpreted as a `#!` interpreter line.
///
/// # Returns
///
/// `Some(language_id)` on a match, `None` when the language is unknown or the
/// file cannot be read.
pub fn DetectLanguage(file_path:&PathBuf) -> Option<String> {
    let language = match file_path.extension() {
        Some(extension) => LanguageFromExtension(&extension.to_string_lossy().to_lowercase()),
        None => LanguageFromShebang(&ReadFirstLine(file_path)?),
    };

    language.map(String::from)
}

/// Maps a lower-cased file extension (without the dot) to a language id.
///
/// `Path::extension` never contains a `.`, so only single-component keys are
/// listed.
fn LanguageFromExtension(extension:&str) -> Option<&'static str> {
    let language = match extension {
        "rs" => "rust",
        "ts" | "tsx" => "typescript",
        "js" | "jsx" | "mjs" | "cjs" => "javascript",
        "json" | "jsonc" | "graphqlconfig" | "graphqlrc" | "lock" => "json",
        "toml" => "toml",
        "yaml" | "yml" => "yaml",
        "md" | "mdx" => "markdown",
        "txt" => "plaintext",
        "html" | "htm" => "html",
        "css" => "css",
        "scss" => "scss",
        "sass" => "sass",
        "less" => "less",
        "xml" => "xml",
        "py" => "python",
        "java" => "java",
        "go" => "go",
        "sh" | "bash" | "zsh" => "shellscript",
        "fish" => "fish",
        "rb" => "ruby",
        "php" => "php",
        "swift" => "swift",
        "kt" | "kts" => "kotlin",
        "scala" => "scala",
        "cpp" | "cc" | "cxx" | "hpp" | "hxx" => "cpp",
        "c" | "h" => "c",
        "cs" => "csharp",
        "vb" => "vb",
        "f#" | "fs" | "fsi" | "fsx" => "fsharp",
        "r" | "rmd" => "r",
        "jl" => "julia",
        "lua" => "lua",
        "pl" => "perl",
        "ps1" | "psm1" | "psd1" => "powershell",
        "sql" => "sql",
        "graphql" | "gql" | "graphqls" => "graphql",
        "proto" => "protobuf",
        "wasm" => "wasm",
        "wat" => "wat",
        "clj" | "cljs" | "cljc" | "edn" => "clojure",
        "hs" | "lhs" => "haskell",
        "erl" | "hrl" => "erlang",
        "ex" | "exs" => "elixir",
        "dart" => "dart",
        "nim" => "nim",
        "v" => "v",
        "zig" => "zig",
        "odin" => "odin",
        "mojo" => "mojo",
        _ => return None,
    };

    Some(language)
}

/// Reads the first line of the file at `file_path`.
///
/// At most `HEAD_LIMIT` bytes are read, so a large file is not loaded just to
/// look at its first line. Bytes that are not valid UTF-8 are replaced rather
/// than causing a failure. Returns `None` if the file cannot be opened or read.
fn ReadFirstLine(file_path:&PathBuf) -> Option<String> {
    const HEAD_LIMIT:u64 = 1024;

    let file = std::fs::File::open(file_path).ok()?;

    let mut head = Vec::new();

    std::io::Read::read_to_end(&mut std::io::Read::take(file, HEAD_LIMIT), &mut head).ok()?;

    // `split` always yields at least one (possibly empty) slice.
    let first_line = head.split(|&byte| byte == b'\n').next()?;

    Some(String::from_utf8_lossy(first_line).into_owned())
}

/// Maps a `#!` interpreter line to a language id.
///
/// Accepts both the direct form (`#!/bin/bash`, `#! /usr/bin/perl -w`) and the
/// `env` form (`#!/usr/bin/env python3`, `#!/usr/bin/env -S node --flag`).
/// Only the final path component of the interpreter is compared.
fn LanguageFromShebang(first_line:&str) -> Option<&'static str> {
    let command = first_line.strip_prefix("#!")?;

    let mut words = command.split_whitespace();

    let mut interpreter = words.next()?;

    if interpreter.rsplit('/').next() == Some("env") {
        // With `env`, the interpreter is the first argument that is neither an
        // option (`-S`) nor a variable assignment (`NAME=value`).
        interpreter = words.find(|word| !word.starts_with('-') && !word.contains('='))?;
    }

    let language = match interpreter.rsplit('/').next().unwrap_or(interpreter) {
        "bash" | "sh" | "zsh" => "shellscript",
        "fish" => "fish",
        "python" | "python2" | "python3" => "python",
        "node" => "javascript",
        "ruby" => "ruby",
        "perl" => "perl",
        "php" => "php",
        "lua" => "lua",
        "r" | "Rscript" => "r",
        "julia" => "julia",
        "rust" | "rustc" => "rust",
        "go" => "go",
        "java" => "java",
        "scala" | "scalac" => "scala",
        "kotlin" | "kotlinc" => "kotlin",
        "swift" => "swift",
        _ => return None,
    };

    Some(language)
}
411
/// Splits `content` into lower-cased word tokens.
///
/// A token is a maximal run of alphanumeric characters (Unicode-aware) and
/// underscores; every other character acts as a separator and is dropped.
/// Tokens are returned in order of appearance, duplicates included.
pub fn TokenizeContent(content:&str) -> Vec<String> {
    content
        .split(|ch:char| !(ch.is_alphanumeric() || ch == '_'))
        // Adjacent separators produce empty pieces; they are not tokens.
        .filter(|piece| !piece.is_empty())
        .map(str::to_lowercase)
        .collect()
}
442
/// Returns `content` with every control character removed.
///
/// "Control character" follows `char::is_control`, so besides NUL this also
/// strips line feeds, carriage returns and tabs; text on separate lines ends
/// up joined without a separator.
pub fn SanitizeContent(content:&str) -> String {
    let mut cleaned = String::new();

    for ch in content.chars() {
        // NUL is itself a control character, so one test covers both cases.
        if !ch.is_control() {
            cleaned.push(ch);
        }
    }

    cleaned
}
445
446pub fn ContentToString(content:&[u8]) -> Result<String> {
448 String::from_utf8(content.to_vec())
449 .map_err(|e| crate::AirError::FileSystem(format!("Invalid UTF-8 content: {}", e)))
450}
451
/// Heuristically decides whether `content` is binary data.
///
/// Only the first `BINARY_SCAN_LIMIT` bytes are examined. The sample is
/// considered binary when it holds more than `MAX_NULL_BYTES` NUL bytes, or
/// when fewer than 70% of its bytes are "text-like" (printable ASCII, ASCII
/// whitespace, or any byte `>= 0x80`).
///
/// Empty input is reported as not binary.
pub fn IsBinaryContent(content:&[u8]) -> bool {
    const MAX_NULL_BYTES:usize = 10;

    const BINARY_SCAN_LIMIT:usize = 8000;

    const MIN_TEXT_RATIO:f64 = 0.7;

    let sample = &content[..content.len().min(BINARY_SCAN_LIMIT)];

    let mut nul_bytes = 0usize;

    let mut text_bytes = 0usize;

    // One pass gathers both counters; NUL never qualifies as text-like.
    for &byte in sample {
        if byte == 0 {
            nul_bytes += 1;
        } else if byte.is_ascii_graphic() || byte.is_ascii_whitespace() || byte >= 0x80 {
            text_bytes += 1;
        }
    }

    if nul_bytes > MAX_NULL_BYTES {
        return true;
    }

    // For an empty sample this is 0.0 / 0.0 = NaN, and `NaN < x` is false.
    let text_ratio = text_bytes as f64 / sample.len() as f64;

    text_ratio < MIN_TEXT_RATIO
}
478
/// Counts the lines in `content` as defined by `str::lines`.
///
/// An empty string has zero lines, and a trailing line terminator does not
/// start an additional line. The count is narrowed to `u32` with an `as` cast.
pub fn GetLineCount(content:&str) -> u32 {
    match content {
        "" => 0,
        text => text.lines().count() as u32,
    }
}
487
/// Returns the number of Unicode scalar values (`char`s) in `content`.
///
/// This differs from `content.len()`, which counts bytes.
pub fn GetCharCount(content:&str) -> usize {
    let scalars = content.chars();

    scalars.count()
}
490
/// Returns the first `max_chars` characters of `content` as a new `String`.
///
/// Characters are Unicode scalar values, so a multi-byte character is never
/// split. When `content` has `max_chars` characters or fewer, the whole string
/// is returned.
pub fn TruncateContent(content:&str, max_chars:usize) -> String {
    // Byte offset of the first character that must be dropped, or the full
    // length when the content is short enough. `char_indices` only yields
    // character boundaries, so the slice below cannot panic.
    let cut = content
        .char_indices()
        .nth(max_chars)
        .map_or(content.len(), |(offset, _)| offset);

    content[..cut].to_string()
}