git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
Enhance text file detection logic for file attachments (#16199)
author Aleksander Grygier <redacted>
Fri, 26 Sep 2025 17:25:29 +0000 (19:25 +0200)
committer GitHub <redacted>
Fri, 26 Sep 2025 17:25:29 +0000 (19:25 +0200)
* feat: Enhances text file detection logic

* chore: Build static `webui` output

* chore: update webui build output

tools/server/public/index.html.gz
tools/server/webui/src/lib/constants/binary-detection.ts [new file with mode: 0644]
tools/server/webui/src/lib/constants/supported-file-types.ts
tools/server/webui/src/lib/enums/files.ts
tools/server/webui/src/lib/utils/text-files.ts

index 53c6a9b5cfb525d99241072c8cbbd511a02f6793..d3efa63e408ee06b58ec74f9b7425b6188a061fa 100644 (file)
Binary files a/tools/server/public/index.html.gz and b/tools/server/public/index.html.gz differ
diff --git a/tools/server/webui/src/lib/constants/binary-detection.ts b/tools/server/webui/src/lib/constants/binary-detection.ts
new file mode 100644 (file)
index 0000000..a4440fd
--- /dev/null
@@ -0,0 +1,14 @@
+export interface BinaryDetectionOptions {
+       /** Number of leading characters of the content to inspect (a character count, not a byte count) */
+       prefixLength: number;
+       /** Maximum fraction of inspected characters that may be suspicious (0.0 to 1.0) before content is treated as binary */
+       suspiciousCharThresholdRatio: number;
+       /** Maximum number of null characters tolerated in the inspected prefix; any more marks the content as binary */
+       maxAbsoluteNullBytes: number;
+}
+
+export const DEFAULT_BINARY_DETECTION_OPTIONS: BinaryDetectionOptions = {
+       prefixLength: 1024 * 10, // Inspect the first 10,240 characters (UTF-16 code units, not bytes) of the string
+       suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars
+       maxAbsoluteNullBytes: 2 // Tolerate at most 2 null characters in the inspected prefix
+};
index f53b55c476ff05268365aa27e15076a88baf0a04..f6c5d2dc18fe1cd09f053b97b447fd5aad7ce100 100644 (file)
@@ -176,5 +176,13 @@ export const TEXT_FILE_TYPES = {
        [FileTypeText.SVELTE]: {
                extensions: [FileExtensionText.SVELTE],
                mimeTypes: [MimeTypeText.SVELTE]
+       },
+       [FileTypeText.LATEX]: {
+               extensions: [FileExtensionText.TEX],
+               mimeTypes: [MimeTypeText.LATEX]
+       },
+       [FileTypeText.BIBTEX]: {
+               extensions: [FileExtensionText.BIB],
+               mimeTypes: [MimeTypeText.BIBTEX]
        }
 } as const;
index 5aec9e7e9f7d30d436774dcb25fe998bbc9bcde0..19b79d32de3c438edb2b895ef6cd2490f808e0da 100644 (file)
@@ -59,7 +59,9 @@ export enum FileTypeText {
        SWIFT = 'swift',
        DART = 'dart',
        VUE = 'vue',
-       SVELTE = 'svelte'
+       SVELTE = 'svelte',
+       LATEX = 'latex',
+       BIBTEX = 'bibtex'
 }
 
 // File extension enums
@@ -115,7 +117,9 @@ export enum FileExtensionText {
        SWIFT = '.swift',
        DART = '.dart',
        VUE = '.vue',
-       SVELTE = '.svelte'
+       SVELTE = '.svelte',
+       TEX = '.tex',
+       BIB = '.bib'
 }
 
 // MIME type enums
@@ -174,5 +178,7 @@ export enum MimeTypeText {
        SWIFT = 'text/x-swift',
        DART = 'text/x-dart',
        VUE = 'text/x-vue',
-       SVELTE = 'text/x-svelte'
+       SVELTE = 'text/x-svelte',
+       LATEX = 'text/x-tex',
+       BIBTEX = 'text/x-bibtex'
 }
index 496f5c79e47b3a788f226d4061177191b3f316a0..d882e3c2d39b76c9a0c6e4eedd8dc37ff3f9674d 100644 (file)
@@ -3,6 +3,10 @@
  * Handles text file detection, reading, and validation
  */
 
+import {
+       DEFAULT_BINARY_DETECTION_OPTIONS,
+       type BinaryDetectionOptions
+} from '$lib/constants/binary-detection';
 import { FileExtensionText } from '$lib/enums/files';
 
 /**
@@ -43,41 +47,51 @@ export async function readFileAsText(file: File): Promise<string> {
  * Heuristic check to determine if content is likely from a text file
  * Detects binary files by counting suspicious characters and null bytes
  * @param content - The file content to analyze
+ * @param options - Optional configuration for detection parameters
  * @returns True if the content appears to be text-based
  */
-export function isLikelyTextFile(content: string): boolean {
+export function isLikelyTextFile(
+       content: string,
+       options: Partial<BinaryDetectionOptions> = {}
+): boolean {
        if (!content) return true;
 
-       const sample = content.substring(0, 1000);
+       const config = { ...DEFAULT_BINARY_DETECTION_OPTIONS, ...options };
+       const sample = content.substring(0, config.prefixLength);
 
-       let suspiciousCount = 0;
        let nullCount = 0;
+       let suspiciousControlCount = 0;
 
        for (let i = 0; i < sample.length; i++) {
                const charCode = sample.charCodeAt(i);
 
-               // Count null bytes
+               // Count null characters on their own (not in the suspicious tally) - strong binary indicators
                if (charCode === 0) {
                        nullCount++;
-                       suspiciousCount++;
 
                        continue;
                }
 
-               // Count suspicious control characters (excluding common ones like tab, newline, carriage return)
+               // Count suspicious control characters (codes below 32)
+               // Allow common whitespace characters: tab (9), newline (10), carriage return (13)
                if (charCode < 32 && charCode !== 9 && charCode !== 10 && charCode !== 13) {
-                       suspiciousCount++;
+                       // Only codes 1-7 and 14-26 are counted; 8, 11, 12 and 27-31 are tolerated
+                       if (charCode < 8 || (charCode > 13 && charCode < 27)) {
+                               suspiciousControlCount++;
+                       }
                }
 
                // Count replacement characters (indicates encoding issues)
                if (charCode === 0xfffd) {
-                       suspiciousCount++;
+                       suspiciousControlCount++;
                }
        }
 
-       // Reject if too many null bytes or suspicious characters
-       if (nullCount > 2) return false;
-       if (suspiciousCount / sample.length > 0.1) return false;
+       // Reject if the sample holds more null characters than the configured maximum
+       if (nullCount > config.maxAbsoluteNullBytes) return false;
+
+       // Reject if the suspicious share of the sample exceeds the configured ratio
+       if (suspiciousControlCount / sample.length > config.suspiciousCharThresholdRatio) return false;
 
        return true;
 }