--- /dev/null
/**
 * Tunable thresholds for the heuristic that tells text content apart from
 * binary content by inspecting a prefix of a string.
 */
export interface BinaryDetectionOptions {
	/** Number of characters (UTF-16 code units) to check from the beginning of the content */
	prefixLength: number;
	/** Maximum ratio of suspicious characters allowed in the checked prefix (0.0 to 1.0) */
	suspiciousCharThresholdRatio: number;
	/** Maximum absolute number of null characters (code 0) allowed in the checked prefix */
	maxAbsoluteNullBytes: number;
}

/**
 * Default thresholds used when a caller does not override a field.
 *
 * The object is frozen because it is shared by every importer: an accidental
 * write to it would otherwise silently change detection for all callers.
 * Spread it into a new object to derive a customized configuration.
 */
export const DEFAULT_BINARY_DETECTION_OPTIONS: Readonly<BinaryDetectionOptions> = Object.freeze({
	prefixLength: 1024 * 10, // Check the first 10,240 characters of the string
	suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars
	maxAbsoluteNullBytes: 2 // More than 2 null characters marks the content as binary
});
* Handles text file detection, reading, and validation
*/
+import {
+ DEFAULT_BINARY_DETECTION_OPTIONS,
+ type BinaryDetectionOptions
+} from '$lib/constants/binary-detection';
import { FileExtensionText } from '$lib/enums/files';
/**
 * Heuristic check to determine if content is likely from a text file.
 *
 * Only the first `prefixLength` characters (UTF-16 code units) are inspected.
 * Content is treated as binary when that prefix contains more null characters
 * than `maxAbsoluteNullBytes`, or when the share of suspicious characters in it
 * exceeds `suspiciousCharThresholdRatio`. Suspicious characters are the control
 * codes 1-7 and 14-26 plus the Unicode replacement character (U+FFFD); every
 * other control code, including tab, newline, carriage return and ESC, is
 * tolerated. Empty content is considered text.
 *
 * @param content - The file content to analyze
 * @param options - Optional overrides for the detection parameters; fields that
 *   are omitted or `undefined` fall back to DEFAULT_BINARY_DETECTION_OPTIONS
 * @returns True if the content appears to be text-based
 */
export function isLikelyTextFile(
	content: string,
	options: Partial<BinaryDetectionOptions> = {}
): boolean {
	if (!content) return true;

	// Resolve each field on its own: spreading `options` over the defaults would
	// let an explicit `undefined` replace a default and silently disable a check.
	const prefixLength = options.prefixLength ?? DEFAULT_BINARY_DETECTION_OPTIONS.prefixLength;
	const suspiciousCharThresholdRatio =
		options.suspiciousCharThresholdRatio ??
		DEFAULT_BINARY_DETECTION_OPTIONS.suspiciousCharThresholdRatio;
	const maxAbsoluteNullBytes =
		options.maxAbsoluteNullBytes ?? DEFAULT_BINARY_DETECTION_OPTIONS.maxAbsoluteNullBytes;

	const sample = content.substring(0, prefixLength);

	let nullCount = 0;
	let suspiciousControlCount = 0;

	for (let i = 0; i < sample.length; i++) {
		const charCode = sample.charCodeAt(i);

		// Null characters are strong indicators of binary files and are tracked
		// separately from the ratio-based check.
		if (charCode === 0) {
			nullCount++;
			// The final verdict cannot change once the limit is exceeded.
			if (nullCount > maxAbsoluteNullBytes) return false;
			continue;
		}

		// Codes 1-7 and 14-26 are counted; 8-13 and 27-31 are deliberately ignored.
		const isSuspiciousControl = charCode < 8 || (charCode > 13 && charCode < 27);
		// U+FFFD indicates the content could not be decoded cleanly.
		const isReplacementChar = charCode === 0xfffd;

		if (isSuspiciousControl || isReplacementChar) {
			suspiciousControlCount++;
		}
	}

	// Still needed when the loop never ran or the limit is negative.
	if (nullCount > maxAbsoluteNullBytes) return false;

	// An empty sample (non-positive prefix length) has no ratio to evaluate.
	if (sample.length === 0) return true;

	return suspiciousControlCount / sample.length <= suspiciousCharThresholdRatio;
}