; [Forum post residue] Trong - Posted 8 hours ago (edited) - S: expandcollapse popup
#Region Example Usage
; Demonstrates _GetFileDataType() on one binary file and one text file.
; @error and @extended are copied into variables immediately after the call,
; because any later function call resets them.

; Example 1: Check a binary executable file
Local $filePathBinary = @WindowsDir & "\explorer.exe"
Local $fileInfoBinary = _GetFileDataType($filePathBinary)
Local $iErrBinary = @error, $iExtBinary = @extended
If $iErrBinary Then
	MsgBox(4096, "Error", "Could not determine file type: " & $fileInfoBinary)
Else
	Local $sClassBinary = ($iExtBinary = 1) ? "Binary" : "Text"
	MsgBox(4096, "Result (Binary)", "Path: " & $filePathBinary & @CRLF & _
			"File Type: " & $fileInfoBinary & @CRLF & "Classification: " & $sClassBinary)
EndIf

; Example 2: Check a text file (this AutoIt script)
Local $filePathText = @ScriptFullPath
Local $fileInfoText = _GetFileDataType($filePathText)
Local $iErrText = @error, $iExtText = @extended
If $iErrText Then
	MsgBox(4096, "Error", "Could not determine file type: " & $fileInfoText)
Else
	; Separate variable name: the original re-declared $isBinary in the same scope.
	Local $sClassText = ($iExtText = 1) ? "Binary" : "Text"
	MsgBox(4096, "Result (Text)", "Path: " & $filePathText & @CRLF & _
			"File Type: " & $fileInfoText & @CRLF & "Classification: " & $sClassText)
EndIf
#EndRegion Example Usage

; =================================================================================================
; Function: _CheckDataType
; Purpose:  Determine the data type code of any AutoIt variable or string.
; Parameters:
;   $vInput     - Any AutoIt variable or literal to examine.
;   $bShowDebug - Optional. When True, debug messages are printed to the console.
; Returns:  Integer type code:
;              1 = String
;              2 = Int32
;              3 = Int64
;              4 = Double/Float
;              5 = Binary/Hex
;              6 = Boolean
;              7 = Array
;              8 = Map
;              9 = Pointer
;             10 = DLL Struct
;             11 = Window Handle
;             12 = Object
;             13 = Keyword
;             14 = Function
;             15 = UserFunction
;             16 = Unknown
; Remarks:  Strings are analysed further: a string that looks like hex, a number or a
;           boolean is reported with the matching code (5, 2/3/4, 6) instead of 1.
;           Nothing is written to the console unless $bShowDebug is True.
; =================================================================================================
Func _CheckDataType($vInput, $bShowDebug = False)
	Local $iCode = 16 ; 16 = Unknown until one of the checks below matches
	Local $sLabel = "" ; only filled by the string analysis; native types use GetDataTypeName()
	Local $sType = VarGetType($vInput)

	; 1) AutoIt-specific objects
	If IsArray($vInput) Then
		$iCode = 7
	ElseIf IsMap($vInput) Then
		$iCode = 8
	ElseIf IsHWnd($vInput) Then
		; Tested before IsPtr(): a window handle is also a pointer, so testing
		; IsPtr() first made code 11 unreachable.
		$iCode = 11
	ElseIf IsPtr($vInput) Then
		$iCode = 9
	ElseIf IsDllStruct($vInput) Then
		$iCode = 10
	ElseIf IsObj($vInput) Then
		$iCode = 12
	ElseIf IsKeyword($vInput) Then
		$iCode = 13
	ElseIf IsFunc($vInput) Then
		$iCode = ($sType = "UserFunction") ? 15 : 14

	; 2) Native numeric
	ElseIf $sType = "Int32" Then
		$iCode = 2
	ElseIf $sType = "Int64" Then
		$iCode = 3
	ElseIf $sType = "Double" Then
		$iCode = 4

	; 3) Native binary & boolean
	ElseIf IsBinary($vInput) Then
		$iCode = 5
	ElseIf IsBool($vInput) Then
		$iCode = 6

	; 4) String analysis - strict-hex first, then numeric/boolean-like, then fallback
	ElseIf IsString($vInput) Then
		$iCode = __CheckDataType_String($vInput, $bShowDebug, $sLabel)
	EndIf

	; Debug output is opt-in. The original routed every return through
	; _DebugResult(), which printed even when $bShowDebug was False.
	If $bShowDebug Then
		If $sLabel = "" Then $sLabel = GetDataTypeName($iCode)
		_DebugResult($iCode, $sLabel, $vInput)
	EndIf
	Return $iCode
EndFunc   ;==>_CheckDataType

; -------------------------------------------------------------------------------------------------
; Internal helper for _CheckDataType: classify the content of a string.
;   $sInput     - The string to analyse (all whitespace is stripped before testing).
;   $bShowDebug - When True, intermediate decisions are written to the console.
;   $sLabel     - [ByRef] receives the description text for the returned code.
; Returns the type code: 5 (hex), 2/3/4 (numeric), 6 (boolean-like) or 1 (plain string).
; -------------------------------------------------------------------------------------------------
Func __CheckDataType_String($sInput, $bShowDebug, ByRef $sLabel)
	If $bShowDebug Then ConsoleWrite("- [DEBUG] String analysis..." & @CRLF)
	Local $s = StringStripWS($sInput, 8)

	; 4.1) Hex with 0x prefix, even-length body
	If StringRegExp($s, "^0[xX][0-9A-Fa-f]+$") Then
		Local $hexBody = StringMid($s, 3)
		If Mod(StringLen($hexBody), 2) = 0 Then
			$sLabel = "Hex string (0x prefix, strict)"
			Return 5
		EndIf
		If $bShowDebug Then ConsoleWrite("- [DEBUG] Prefix hex odd length, skip" & @CRLF)
	EndIf

	; 4.2) Bare hex (no prefix), even length,
	;      but skip if purely digits (prioritize numeric)
	If StringRegExp($s, "^[0-9A-Fa-f]+$") Then
		If StringRegExp($s, "^[0-9]+$") Then
			; Only digits (no A-F letters): treat as numeric below
			If $bShowDebug Then ConsoleWrite("- [DEBUG] Bare digits only, defer to numeric parse" & @CRLF)
		ElseIf Mod(StringLen($s), 2) = 0 Then
			$sLabel = "Hex string (bare, strict)"
			Return 5
		ElseIf $bShowDebug Then
			ConsoleWrite("- [DEBUG] Bare hex odd length, skip" & @CRLF)
		EndIf
	EndIf

	; 4.3) Numeric parse (int32/int64/double)
	Local $numInfo = _ParseNumeric($s, $bShowDebug)
	If $numInfo[0] Then
		$sLabel = GetDataTypeName($numInfo[1]) & " (string)"
		Return $numInfo[1]
	EndIf

	; 4.4) Boolean-like strings
	Local $sl = StringLower($s)
	If $sl = "true" Or $sl = "false" Then
		$sLabel = "Boolean-like string"
		Return 6
	EndIf

	; 4.5) Fallback string
	$sLabel = "String"
	Return 1
EndFunc   ;==>__CheckDataType_String

; =================================================================================================
; Function: _DebugResult
; Purpose:  Internal helper to print debug messages and return a code.
; Parameters:
;   $iCode - Integer code to return.
;   $sText - Description text to display in debug.
; Returns:  $iCode
; =================================================================================================
Func _DebugResult($iCode, $sText, $sInput)
	; $sInput is the examined value; it is shown in the message with CRLF made visible.
	; This function always writes to the console, so callers decide whether to call it.
	Local $sShown = StringReplace($sInput, @CRLF, '@CRLF')
	ConsoleWrite(StringFormat("- [DEBUG] Input '%s' (Len: %d) is: %s (Type=%d)" & @CRLF, $sShown, StringLen($sShown), $sText, $iCode))
	Return $iCode
EndFunc   ;==>_DebugResult

; =================================================================================================
; Function: _ParseNumeric
; Purpose:  Internal numeric parser supporting plain, US and EU formats.
; Parameters:
;   $sInput     - String to parse for numeric patterns.
;   $bShowDebug - Optional. When True, the matched format is printed to the console.
; Returns:  A two-element array: [0] = True/False if numeric,
;                                [1] = type code (2=Int32, 3=Int64, 4=Double), 0 if not numeric
; Remarks:  - An unsigned string with a leading zero followed by more digits ("007")
;             is reported as not numeric.
;           - Formats are tried in this order; the first match wins:
;               plain        12345      1234.56
;               US           1,234.56
;               EU           1.234,56   1.234.567
;               thousands    mixed , and . group separators only
;               simple EU    123,45
; =================================================================================================
Func _ParseNumeric($sInput, $bShowDebug = False)
	Local $aRes[2] = [False, 0]
	If $sInput = "" Then Return $aRes
	If StringRegExp($sInput, "^0[0-9]+$") Then Return $aRes

	Local $patPlain = "^[+-]?[0-9]+(\.[0-9]+)?$"
	Local $patUS = "^[+-]?[0-9]{1,3}(,[0-9]{3})*(\.[0-9]+)?$"
	Local $patEU = "^[+-]?[0-9]{1,3}(\.[0-9]{3})*(,[0-9]+)?$"
	Local $patThousand = "^[+-]?[0-9]{1,3}([,\.][0-9]{3})+$"
	Local $patSimpleEU = "^[+-]?[0-9]+,[0-9]+$"

	Local $sNum = "" ; input normalised to "digits[.digits]" for Number()
	Local $bHasDec = False ; True when the input carries a decimal part
	Local $sFormat = ""

	If StringRegExp($sInput, $patPlain) Then
		; No separators at all. Without this case "12345" and "1234.56" matched
		; none of the grouped patterns and were reported as not numeric.
		$sFormat = "plain"
		$sNum = $sInput
		$bHasDec = (StringInStr($sInput, ".") > 0)
	ElseIf StringRegExp($sInput, $patUS) Then
		; US format: 1,234.56
		$sFormat = "US"
		$sNum = StringReplace($sInput, ",", "")
		$bHasDec = (StringInStr($sInput, ".") > 0)
	ElseIf StringRegExp($sInput, $patEU) Then
		; EU format: 1.234,56 - only a comma marks a decimal part, so
		; "1.234.567" stays an integer (the original always flagged it as decimal).
		$sFormat = "EU"
		$sNum = StringReplace(StringReplace($sInput, ".", ""), ",", ".")
		$bHasDec = (StringInStr($sInput, ",") > 0)
	ElseIf StringRegExp($sInput, $patThousand) Then
		; Thousand separators only
		$sFormat = "thousands"
		$sNum = StringReplace(StringReplace($sInput, ",", ""), ".", "")
	ElseIf StringRegExp($sInput, $patSimpleEU) Then
		; Simple EU decimals: 123,45
		$sFormat = "simple EU"
		$sNum = StringReplace($sInput, ",", ".")
		$bHasDec = True
	Else
		Return $aRes
	EndIf

	Local $val = Number($sNum)
	$aRes[0] = True
	If $bHasDec Or $val <> Int($val) Then
		$aRes[1] = 4 ; Double
	ElseIf $val >= -2147483648 And $val <= 2147483647 Then
		$aRes[1] = 2 ; Int32
	Else
		$aRes[1] = 3 ; Int64
	EndIf

	If $bShowDebug Then ConsoleWrite("- [DEBUG] Numeric string (" & $sFormat & " format), type code " & $aRes[1] & @CRLF)
	Return $aRes
EndFunc   ;==>_ParseNumeric

; =================================================================================================
; Function: GetDataTypeName
; Purpose:  Map a numeric code to its human-readable type name.
; Parameters:
;   $i - Integer type code (1-16).
; Returns:  String name corresponding to that code; "Unknown" for 16 and for any other value.
; =================================================================================================
Func GetDataTypeName($i)
	; Index = type code. Slot 0 is a placeholder so codes can be used as indexes directly.
	Local $aNames[16] = ["Unknown", "String", "Int32", "Int64", "Double/Float", "Binary/Hex", _
			"Boolean", "Array", "Map", "Pointer", "DLL Struct", "Window Handle", _
			"Object", "Keyword", "Function", "UserFunction"]

	; Only whole-number codes 1..15 have a dedicated name.
	If $i >= 1 And $i <= 15 And $i = Int($i) Then Return $aNames[Int($i)]
	Return "Unknown"
EndFunc   ;==>GetDataTypeName

; -------------------------------------------------------------------
; Helper: is this ANY numeric type?
; (native or string)
; -------------------------------------------------------------------
Func _IsNumber($v)
	; Codes 2, 3 and 4 are Int32, Int64 and Double/Float (native values or numeric strings).
	Switch _CheckDataType($v, False)
		Case 2, 3, 4
			Return True
	EndSwitch
	Return False
EndFunc   ;==>_IsNumber

;===============================================================================
; Function:    _GetFileDataType - Determine File Types by Hex Signature
; Description: Determines file type & classification (binary/text) by:
;                - Reading a file's first 1KB and converting it to hex
;                - Checking RIFF, ZIP, MP4/MOV, TAR and PE sub-patterns
;                - Matching against the built-in signature database
;                - Applying advanced text analysis heuristics
; Parameters:  $sInput - full file path OR raw hex signature string
; Return:      On success: file type string; @extended=1 if binary else 0
;              On failure: SetError(1), @extended=0, returns error message
; Author:      Dao Van Trong - TRONG.PRO
; Example:
;   Local $sType = _GetFileDataType("C:\Windows\explorer.exe")
;   Local $iErr = @error, $iBinary = @extended
;   If $iErr Then
;       ConsoleWrite("Error: " & $sType & @CRLF)
;   Else
;       ConsoleWrite("Type: " & $sType & ", Binary? " & $iBinary & @CRLF)
;   EndIf
;===============================================================================
Func _GetFileDataType($sInput)
	; The database is built once and cached for later calls.
	Local Static $aMaps = _CreateSignatureDatabase()
	Local Static $mSignatures = $aMaps[0]
	Local Static $mBinaryFlags = $aMaps[1]

	Local $sHexData = ""

	; Handle input: an existing file path, or a raw hex string
	If FileExists($sInput) Then
		$sHexData = _ReadFileSignature($sInput)
		If @error Then Return SetError(1, 0, "Cannot read file: " & $sInput)
	ElseIf (StringLen($sInput) > 1) And (_CheckDataType($sInput, False) = 5) Then
		$sHexData = StringUpper(StringStripWS($sInput, 8))
	Else
		Return SetError(1, 0, "Invalid input")
	EndIf
	$sHexData = _RemoveHexPrefix($sHexData)

	; 1) Container/sub-type patterns first. They start with a generic magic (RIFF, PK,
	;    MZ, ftyp) that is also in the database, so looking the database up first
	;    always returned the generic entry and never reached this refinement.
	Local $sSpecialResult = _CheckSpecialPatterns($sHexData)
	If $sSpecialResult <> "" Then
		Local $iSpecialBinary = StringInStr($sSpecialResult, "|BIN|") ? 1 : 0
		Return SetError(0, $iSpecialBinary, StringReplace($sSpecialResult, "|BIN|", ""))
	EndIf

	; 2) Signature database, longest (most specific) prefix first. Every even length
	;    from 64 down to 4 hex characters is tried; the former fixed list skipped 14,
	;    22 and 30, which made several database keys impossible to match.
	Local $iHexLen = StringLen($sHexData)
	For $iLen = 64 To 4 Step -2
		If $iHexLen < $iLen Then ContinueLoop
		Local $sTestSig = StringLeft($sHexData, $iLen)
		If MapExists($mSignatures, $sTestSig) Then
			Return SetError(0, $mBinaryFlags[$sTestSig], $mSignatures[$sTestSig])
		EndIf
	Next

	; 3) Advanced text content analysis
	Local $sTextType = _AdvancedTextAnalysis($sHexData)
	If $sTextType <> "" Then
		Local $iHeuristicBinary = ($sTextType = "Binary Data") ? 1 : 0
		Return SetError(0, $iHeuristicBinary, $sTextType)
	EndIf

	Return SetError(1, 0, "Could not determine file type")
EndFunc   ;==>_GetFileDataType

; Returns $sHex without a leading "0x"/"0X"; other strings are returned unchanged.
Func _RemoveHexPrefix($sHex)
	; "=" compares strings case-insensitively, so this covers both "0x" and "0X".
	If StringLeft($sHex, 2) = "0x" Then Return StringTrimLeft($sHex, 2)
	Return $sHex
EndFunc   ;==>_RemoveHexPrefix

; Returns True when $str starts with "0x" or "0X", False otherwise.
Func _CheckHexPrefix($str)
	If StringLen($str) < 2 Then Return False
	; Declared Local: the original assigned an undeclared $prefix.
	Local $sPrefix = StringUpper(StringLeft($str, 2))
	Return ($sPrefix == "0X")
EndFunc   ;==>_CheckHexPrefix

;===============================================================================
; Function:    _ReadFileSignature
; Description: Reads first 1KB of a file in binary mode and returns hex string
; Parameters:  $sFilePath - full path to file
; Return:      On success: uppercase hex string of data read ("" if nothing was read)
;              On failure: SetError(1), return ""
;===============================================================================
Func _ReadFileSignature($sFilePath)
	Local $hFile = FileOpen($sFilePath, 16) ; Open in binary mode
	If $hFile = -1 Then Return SetError(1, 0, "")

	; Read the first 1KB for comprehensive analysis
	Local $dData = FileRead($hFile, 1024)
	Local $iReadError = @error ; must be captured before FileClose() resets @error
	FileClose($hFile)

	; Only a positive @error is treated as a read failure; a negative value
	; (end of file) is handled by the empty-data guard below.
	If $iReadError > 0 Then Return SetError(1, 0, "")
	If Not IsBinary($dData) Or BinaryLen($dData) = 0 Then Return SetError(0, 0, "")

	Return StringUpper(Hex($dData))
EndFunc   ;==>_ReadFileSignature

;================================================================================
; Helper Function: Create the signature database
; Returns a two-element array: [0] = Map of hex signature -> file type name,
;                              [1] = Map of hex signature -> 1 (binary) or 0 (text).
; Keys are uppercase hex strings anchored at offset 0 of the file. Each key is listed
; once: a Map keeps only the last value assigned to a key, so duplicates silently
; replaced earlier entries.
;================================================================================
Func _CreateSignatureDatabase()
	Local $mSignatures[], $mBinaryFlags[]

	; === ARCHIVE & COMPRESSION FORMATS ===
	; ZIP Family
	__AddSignature($mSignatures, $mBinaryFlags, "504B0304", "ZIP Archive")
	__AddSignature($mSignatures, $mBinaryFlags, "504B0506", "ZIP Archive (Empty)")
	__AddSignature($mSignatures, $mBinaryFlags, "504B0708", "ZIP Archive (Spanned)")
	; RAR Family
	__AddSignature($mSignatures, $mBinaryFlags, "526172211A0700", "RAR Archive v1.5+")
	__AddSignature($mSignatures, $mBinaryFlags, "526172211A070100", "RAR Archive v5.0+")
	; 7-Zip Family
	__AddSignature($mSignatures, $mBinaryFlags, "377ABCAF271C", "7-Zip Archive")
	; Compression Formats
	__AddSignature($mSignatures, $mBinaryFlags, "1F8B", "GZIP Compressed")
	__AddSignature($mSignatures, $mBinaryFlags, "425A68", "BZIP2 Compressed")
	__AddSignature($mSignatures, $mBinaryFlags, "FD377A585A00", "XZ Compressed")
	__AddSignature($mSignatures, $mBinaryFlags, "5D00", "LZMA Compressed")
	__AddSignature($mSignatures, $mBinaryFlags, "28B52FFD", "Zstandard Compressed")
	__AddSignature($mSignatures, $mBinaryFlags, "04224D18", "LZ4 Compressed")
	; Archive Formats
	__AddSignature($mSignatures, $mBinaryFlags, "213C617263683E", "Unix Archive")
	; EDABEEDB is the RPM lead magic (it was labelled "Debian Package").
	__AddSignature($mSignatures, $mBinaryFlags, "EDABEEDB", "RPM Package")
	__AddSignature($mSignatures, $mBinaryFlags, "43414220", "Microsoft Cabinet")
	__AddSignature($mSignatures, $mBinaryFlags, "4D534346", "Microsoft Cabinet")
	__AddSignature($mSignatures, $mBinaryFlags, "495321", "Inno Setup")
	; NOTE(review): 4E45534D1A is "NESM" + 0x1A, which looks like the NSF sound format
	; rather than an NES ROM image - label kept as in the original, please confirm.
	__AddSignature($mSignatures, $mBinaryFlags, "4E45534D1A01", "Nintendo NES ROM")

	; === IMAGE FORMATS ===
	; JPEG Family
	__AddSignature($mSignatures, $mBinaryFlags, "FFD8FFE0", "JPEG Image (JFIF)")
	__AddSignature($mSignatures, $mBinaryFlags, "FFD8FFE1", "JPEG Image (EXIF)")
	__AddSignature($mSignatures, $mBinaryFlags, "FFD8FFE2", "JPEG Image (Canon)")
	__AddSignature($mSignatures, $mBinaryFlags, "FFD8FFE3", "JPEG Image (Samsung)")
	__AddSignature($mSignatures, $mBinaryFlags, "FFD8FFE8", "JPEG Image (SPIFF)")
	__AddSignature($mSignatures, $mBinaryFlags, "FFD8FFDB", "JPEG Image")
	__AddSignature($mSignatures, $mBinaryFlags, "FFD8FFEE", "JPEG Image (Adobe)")
	; PNG Family
	__AddSignature($mSignatures, $mBinaryFlags, "89504E470D0A1A0A", "PNG Image")
	; GIF Family
	__AddSignature($mSignatures, $mBinaryFlags, "474946383761", "GIF Image (87a)")
	__AddSignature($mSignatures, $mBinaryFlags, "474946383961", "GIF Image (89a)")
	; TIFF Family
	__AddSignature($mSignatures, $mBinaryFlags, "49492A00", "TIFF Image (Little Endian)")
	__AddSignature($mSignatures, $mBinaryFlags, "4D4D002A", "TIFF Image (Big Endian)")
	; Bitmap Family
	__AddSignature($mSignatures, $mBinaryFlags, "424D", "BMP Image")
	; RIFF container. WAV, AVI and WebP all share this magic and are told apart in
	; _CheckSpecialPatterns; this single entry only covers other RIFF sub-types.
	; (The key used to be assigned three times, so every RIFF file became "WAV Audio".)
	__AddSignature($mSignatures, $mBinaryFlags, "52494646", "RIFF Container")
	; Icon Formats
	__AddSignature($mSignatures, $mBinaryFlags, "00000100", "Windows Icon (ICO)")
	__AddSignature($mSignatures, $mBinaryFlags, "00000200", "Windows Cursor (CUR)")
	; Advanced Image Formats
	__AddSignature($mSignatures, $mBinaryFlags, "38425053", "Photoshop PSD")
	__AddSignature($mSignatures, $mBinaryFlags, "00000C6A502020", "JPEG2000")
	__AddSignature($mSignatures, $mBinaryFlags, "FF4FFF51", "JPEG2000")
	__AddSignature($mSignatures, $mBinaryFlags, "000000186674797068656963", "HEIF/HEIC Image")
	; 00 00 00 20 "ftyp" "avif" - the original key had a stray trailing nibble
	; (25 hex characters), which can never equal a prefix of whole bytes.
	__AddSignature($mSignatures, $mBinaryFlags, "000000206674797061766966", "AVIF Image")
	; RAW Image Formats
	__AddSignature($mSignatures, $mBinaryFlags, "49494E31", "Nikon NEF")
	__AddSignature($mSignatures, $mBinaryFlags, "4352", "Canon CR2")
	__AddSignature($mSignatures, $mBinaryFlags, "49494944", "Adobe DNG")
	__AddSignature($mSignatures, $mBinaryFlags, "46554A49", "Fuji RAF")
	__AddSignature($mSignatures, $mBinaryFlags, "41524159", "Sony ARW")

	; === VIDEO FORMATS ===
	; MP4 Family (checked more deeply in _CheckSpecialPatterns)
	__AddSignature($mSignatures, $mBinaryFlags, "0000002066747970", "MP4 Video")
	__AddSignature($mSignatures, $mBinaryFlags, "000000186674797033677035", "MP4 Video (3GP5)")
	; QuickTime (checked more deeply in _CheckSpecialPatterns)
	__AddSignature($mSignatures, $mBinaryFlags, "0000001466747970717420", "QuickTime MOV")
	__AddSignature($mSignatures, $mBinaryFlags, "6D6F6F76", "QuickTime MOV")
	; Matroska
	__AddSignature($mSignatures, $mBinaryFlags, "1A45DFA3", "Matroska MKV")
	; Flash Video
	__AddSignature($mSignatures, $mBinaryFlags, "464C5601", "Flash FLV")
	; Windows Media
	__AddSignature($mSignatures, $mBinaryFlags, "3026B2758E66CF11", "Windows Media (ASF/WMV)")
	; MPEG
	__AddSignature($mSignatures, $mBinaryFlags, "000001BA", "MPEG Program Stream")
	__AddSignature($mSignatures, $mBinaryFlags, "000001B3", "MPEG Elementary Stream")
	; Kept for reference only: _GetFileDataType never tests prefixes shorter than
	; 4 hex characters, so this one-byte signature (ASCII "G") is not matched.
	__AddSignature($mSignatures, $mBinaryFlags, "47", "MPEG Transport Stream")

	; === AUDIO FORMATS ===
	; MP3 Family
	__AddSignature($mSignatures, $mBinaryFlags, "494433", "MP3 Audio (ID3v2)")
	__AddSignature($mSignatures, $mBinaryFlags, "FFFB", "MP3 Audio (MPEG-1 Layer 3)")
	__AddSignature($mSignatures, $mBinaryFlags, "FFF3", "MP3 Audio (MPEG-2 Layer 3)")
	; FLAC
	__AddSignature($mSignatures, $mBinaryFlags, "664C6143", "FLAC Audio")
	; OGG
	__AddSignature($mSignatures, $mBinaryFlags, "4F676753", "OGG Audio/Video")
	; AAC
	__AddSignature($mSignatures, $mBinaryFlags, "FFF1", "AAC Audio (ADTS)")
	; Apple Audio
	__AddSignature($mSignatures, $mBinaryFlags, "4D344120", "M4A Audio")
	; Other Audio
	__AddSignature($mSignatures, $mBinaryFlags, "4D546864", "MIDI Audio")

	; === DOCUMENT FORMATS ===
	; PDF
	__AddSignature($mSignatures, $mBinaryFlags, "25504446", "PDF Document")
	; Microsoft Office Legacy
	__AddSignature($mSignatures, $mBinaryFlags, "D0CF11E0A1B11AE1", "Microsoft Office Document (Legacy)")
	; Microsoft Office Modern (Office Open XML)
	__AddSignature($mSignatures, $mBinaryFlags, "504B030414000600", "Office Open XML (DOCX/XLSX/PPTX)")
	; PostScript
	__AddSignature($mSignatures, $mBinaryFlags, "25215053", "PostScript Document", 0)

	; === EXECUTABLE FORMATS ===
	; Windows
	__AddSignature($mSignatures, $mBinaryFlags, "4D5A", "Windows Executable (PE)")
	; Linux
	__AddSignature($mSignatures, $mBinaryFlags, "7F454C46", "Linux ELF Executable")
	; macOS
	__AddSignature($mSignatures, $mBinaryFlags, "FEEDFACE", "macOS Mach-O 32-bit")
	__AddSignature($mSignatures, $mBinaryFlags, "FEEDFACF", "macOS Mach-O 64-bit")
	; Java
	__AddSignature($mSignatures, $mBinaryFlags, "CAFEBABE", "Java Class File")

	; === DATABASE FORMATS ===
	__AddSignature($mSignatures, $mBinaryFlags, "53514C69746520666F726D61742033", "SQLite Database")
	; 00 01 00 00 "Standard Jet DB". The bare 00010000 key was shared with TrueType
	; below, which overwrote this entry; the longer key keeps both detectable.
	__AddSignature($mSignatures, $mBinaryFlags, "000100005374616E64617264204A6574204442", "Microsoft Access Database")

	; === FONT FORMATS ===
	__AddSignature($mSignatures, $mBinaryFlags, "00010000", "TrueType Font")
	__AddSignature($mSignatures, $mBinaryFlags, "4F54544F", "OpenType Font")
	__AddSignature($mSignatures, $mBinaryFlags, "774F4646", "Web Open Font (WOFF)")
	__AddSignature($mSignatures, $mBinaryFlags, "774F4632", "Web Open Font 2 (WOFF2)")

	; === DISK IMAGES ===
	__AddSignature($mSignatures, $mBinaryFlags, "4344303031", "ISO 9660 CD Image")

	; === TEXT FORMATS WITH SIGNATURES ===
	; RTF
	__AddSignature($mSignatures, $mBinaryFlags, "7B5C72746631", "Rich Text Format", 0)
	; XML/HTML variants
	__AddSignature($mSignatures, $mBinaryFlags, "3C3F786D6C", "XML Document", 0)
	__AddSignature($mSignatures, $mBinaryFlags, "3C21444F43545950", "HTML Document", 0)
	; Script files with shebang
	__AddSignature($mSignatures, $mBinaryFlags, "2321", "Shell/Script File", 0)

	; === ENCODING SIGNATURES (BOMs) ===
	__AddSignature($mSignatures, $mBinaryFlags, "EFBBBF", "UTF-8 with BOM", 0)
	__AddSignature($mSignatures, $mBinaryFlags, "FFFE", "UTF-16 Little Endian", 0)
	__AddSignature($mSignatures, $mBinaryFlags, "FEFF", "UTF-16 Big Endian", 0)
	__AddSignature($mSignatures, $mBinaryFlags, "FFFE0000", "UTF-32 Little Endian", 0)
	__AddSignature($mSignatures, $mBinaryFlags, "0000FEFF", "UTF-32 Big Endian", 0)

	; === SECURITY & CERTIFICATES ===
	__AddSignature($mSignatures, $mBinaryFlags, "2D2D2D2D2D424547494E", "PEM Certificate", 0)
	; 30 82 = ASN.1 SEQUENCE with a two-byte length. The original key "308" had an odd
	; number of hex digits and could never be matched.
	__AddSignature($mSignatures, $mBinaryFlags, "3082", "DER Certificate")

	Local $aResult[2] = [$mSignatures, $mBinaryFlags]
	Return $aResult
EndFunc   ;==>_CreateSignatureDatabase

; Internal helper for _CreateSignatureDatabase: registers one signature in both maps.
;   $mTypes / $mFlags - [ByRef] the name map and the binary-flag map
;   $sHex             - uppercase hex signature (the map key)
;   $sTypeName        - file type name returned to the caller
;   $iBinary          - 1 = binary format (default), 0 = text format
Func __AddSignature(ByRef $mTypes, ByRef $mFlags, $sHex, $sTypeName, $iBinary = 1)
	$mTypes[$sHex] = $sTypeName
	$mFlags[$sHex] = $iBinary
EndFunc   ;==>__AddSignature

;================================================================================
; Helper Function: Check for special patterns (not starting at offset 0)
; $sHexData is an uppercase hex string, so byte offset N starts at character 2*N + 1.
; Returns "" when nothing matches, otherwise the type name prefixed with "|BIN|"
; (the caller strips the marker and reports the file as binary).
;================================================================================
Func _CheckSpecialPatterns($sHexData)
	Local $sMagic4 = StringLeft($sHexData, 8) ; first four bytes

	; RIFF-based formats carry their sub-type at byte offset 8
	If $sMagic4 = "52494646" Then ; RIFF
		Switch StringMid($sHexData, 17, 8)
			Case "57415645" ; WAVE
				Return "|BIN|WAV Audio"
			Case "41564920" ; AVI
				Return "|BIN|AVI Video"
			Case "57454250" ; WEBP
				Return "|BIN|WebP Image"
		EndSwitch
	EndIf

	; QuickTime/MP4 - "ftyp" at byte offset 4, brand at byte offset 8
	If StringMid($sHexData, 9, 8) = "66747970" Then ; ftyp
		Switch StringMid($sHexData, 17, 8)
			Case "69736F6D", "6D703431", "6D703432" ; isom, mp41, mp42
				Return "|BIN|MP4 Video"
			Case "71742020" ; qt
				Return "|BIN|QuickTime MOV"
			Case "33677035", "33677034" ; 3gp5, 3gp4
				Return "|BIN|3GPP Video"
		EndSwitch
	EndIf

	; ZIP-based formats: look for well-known member names in the data that was read
	If $sMagic4 = "504B0304" Then ; PK..
		If StringInStr($sHexData, "776F72642F") Then ; word/
			Return "|BIN|Microsoft Word (DOCX)"
		ElseIf StringInStr($sHexData, "786C2F") Then ; xl/
			Return "|BIN|Microsoft Excel (XLSX)"
		ElseIf StringInStr($sHexData, "7070742F") Then ; ppt/
			Return "|BIN|Microsoft PowerPoint (PPTX)"
		ElseIf StringInStr($sHexData, "636F6E74656E742E786D6C") Then ; content.xml
			Return "|BIN|OpenOffice Document"
		ElseIf StringInStr($sHexData, "4D4554412D494E462F") Then ; META-INF/
			Return "|BIN|Java JAR Archive"
		ElseIf StringInStr($sHexData, "416E64726F69644D616E69666573742E786D6C") Then ; AndroidManifest.xml
			Return "|BIN|Android APK"
		EndIf
	EndIf

	; TAR: "ustar" magic at byte offset 257 -> hex character 2 * 257 + 1 = 515.
	; (The original read from character 513, i.e. byte offset 256.)
	Local $sTarMagic = StringMid($sHexData, 515, 12)
	If $sTarMagic = "757374617200" Or $sTarMagic = "757374617220" Then
		Return "|BIN|TAR Archive"
	EndIf

	; EXE sub-type detection: MZ header followed somewhere by the "PE\0\0" signature
	If StringLeft($sHexData, 4) = "4D5A" Then
		If StringInStr($sHexData, "50450000") Then Return "|BIN|Windows PE Executable"
	EndIf

	Return ""
EndFunc   ;==>_CheckSpecialPatterns

;================================================================================
; Helper Function: Convert Hex to Binary (takes a portion for testing)
; Returns a string holding one character per hex pair of the input.
;================================================================================
Func _HexToBinary($sHexData)
	; Only take the first 1024 hex characters (512 bytes) for analysis
	Local $sTestHex = StringLeft($sHexData, 1024)
	Local $sBinary = ""

	; Convert each hex pair into a character; a trailing single digit is ignored
	For $i = 1 To StringLen($sTestHex) Step 2
		Local $sHexByte = StringMid($sTestHex, $i, 2)
		If StringLen($sHexByte) = 2 Then $sBinary &= Chr(Dec($sHexByte))
	Next

	Return $sBinary
EndFunc   ;==>_HexToBinary

;================================================================================
; Helper Function: Advanced text analysis
; Runs the encoding, content and heuristic detectors in that order and returns the
; first non-empty answer, or "" when none of them recognises the data.
;================================================================================
Func _AdvancedTextAnalysis($sHexData)
	Local $sBinaryData = _HexToBinary($sHexData)
	If $sBinaryData = "" Then Return ""

	; === BOM and Encoding Detection ===
	Local $sEncoding = _DetectEncoding($sHexData)
	If $sEncoding <> "" Then Return $sEncoding

	; === Content-based Detection ===
	Local $sContentType = _DetectTextContent($sBinaryData)
	If $sContentType <> "" Then Return $sContentType

	; === Heuristic Analysis ===
	Local $sHeuristicType = _HeuristicTextAnalysis($sBinaryData)
	If $sHeuristicType <> "" Then Return $sHeuristicType

	Return ""
EndFunc   ;==>_AdvancedTextAnalysis

;================================================================================
; Helper Function: Simple Min function
;================================================================================
Func __Min($a, $b)
	Return ($a < $b) ? $a : $b
EndFunc   ;==>__Min

;================================================================================
; Helper Function: Detect encoding and text variants
;================================================================================
Func _DetectEncoding($sHexData)
	; BOMs are already handled in the main database, this function can be expanded later
	; if more complex logic is needed to guess encoding without a BOM.
	Return ""
EndFunc   ;==>_DetectEncoding

;================================================================================
; Helper Function: Detect content type based on patterns
; The checks run in a fixed order and the first match wins, so the order below is
; part of the behaviour.
;================================================================================
Func _DetectTextContent($sContent)
	; === Configuration Files ===
	If StringRegExp($sContent, '(?m)^\s*\[[^\]]+\]') Then Return "INI Configuration File"
	If StringRegExp($sContent, '(?i)^server\s*=|^host\s*=|^port\s*=') Then Return "Configuration File"

	; === Data Formats ===
	If _IsAdvancedJSON($sContent) Then Return "JSON Data File"
	If _IsAdvancedCSV($sContent) Then Return "CSV Data File"
	If _IsTSV($sContent) Then Return "TSV (Tab Separated Values)"
	If _IsYAML($sContent) Then Return "YAML Configuration"
	If _IsTOML($sContent) Then Return "TOML Configuration"

	; === Log Files ===
	Local $sLogType = _DetectLogFormat($sContent)
	If $sLogType <> "" Then Return $sLogType

	; === Programming Languages ===
	Local $sCodeType = _DetectProgrammingLanguage($sContent)
	If $sCodeType <> "" Then Return $sCodeType

	; === Markup Languages ===
	Local $sMarkupType = _DetectMarkupLanguage($sContent)
	If $sMarkupType <> "" Then Return $sMarkupType

	; === Specialized Text Formats ===
	Local $sSpecialType = _DetectSpecializedFormats($sContent)
	If $sSpecialType <> "" Then Return $sSpecialType

	Return ""
EndFunc   ;==>_DetectTextContent

;================================================================================
; Helper Function: Advanced JSON detection
; True when the trimmed content is wrapped in {...} or [...] and at least two of
; the JSON-looking patterns below occur in it.
;================================================================================
Func _IsAdvancedJSON($sContent)
	Local $sTrimmed = StringStripWS($sContent, 3)
	If $sTrimmed = "" Then Return False

	Local $sFirst = StringLeft($sTrimmed, 1), $sLast = StringRight($sTrimmed, 1)
	Local $bHasStructure = ($sFirst = "{" And $sLast = "}") Or ($sFirst = "[" And $sLast = "]")
	If Not $bHasStructure Then Return False

	Local $aJSONPatterns[6] = [ _
			'"[^"]*"\s*:\s*"[^"]*"', _ ; "key": "value"
			'"[^"]*"\s*:\s*\d+', _ ; "key": 123
			'"[^"]*"\s*:\s*(true|false|null)', _ ; "key": true
			'"[^"]*"\s*:\s*\[', _ ; "key": [
			'"[^"]*"\s*:\s*\{', _ ; "key": {
			'\[\s*("|\d|\{|\[)' _ ; Array starts with valid element
			]

	Local $iMatches = 0
	For $sPattern In $aJSONPatterns
		If StringRegExp($sContent, $sPattern) Then $iMatches += 1
	Next

	Return ($iMatches >= 2)
EndFunc   ;==>_IsAdvancedJSON

;================================================================================
; Helper Function: Advanced CSV detection
; Looks at up to the first 10 lines: at least half of them must contain a comma and
; at least 70% must have the same column count as the first comma line.
; Blank lines and lines starting with "#" are skipped but still count in the ratios.
;================================================================================
Func _IsAdvancedCSV($sContent)
	Local $aLines = StringSplit(StringStripCR($sContent), @LF, 1)
	If $aLines[0] < 2 Then Return False

	Local $iCommaLines = 0
	Local $iConsistentColumns = 0
	Local $iPrevColCount = -1
	Local $iLinesToCheck = __Min(10, $aLines[0])

	For $i = 1 To $iLinesToCheck
		Local $sLine = StringStripWS($aLines[$i], 3)
		If $sLine = "" Or StringLeft($sLine, 1) = "#" Then ContinueLoop

		; Number of commas = length difference after removing them
		Local $iCommas = StringLen($sLine) - StringLen(StringReplace($sLine, ",", ""))
		If $iCommas >= 1 Then
			$iCommaLines += 1
			Local $iCurrentCols = $iCommas + 1
			If $iPrevColCount = -1 Then
				$iPrevColCount = $iCurrentCols
				$iConsistentColumns = 1
			ElseIf $iPrevColCount = $iCurrentCols Then
				$iConsistentColumns += 1
			EndIf
		EndIf
	Next

	If $iLinesToCheck = 0 Then Return False
	Local $fCommaRatio = $iCommaLines / $iLinesToCheck
	Local $fConsistencyRatio = $iConsistentColumns / $iLinesToCheck

	Return ($fCommaRatio >= 0.5 And $fConsistencyRatio >= 0.7)
EndFunc   ;==>_IsAdvancedCSV

;================================================================================
; Helper Function: TSV detection
; True when at least 60% of the first (up to) 5 lines contain a tab character.
;================================================================================
Func _IsTSV($sContent)
	Local $aLines = StringSplit(StringStripCR($sContent), @LF, 1)
	If $aLines[0] < 2 Then Return False

	Local $iTabLines = 0
	Local $iLinesToCheck = __Min(5, $aLines[0])

	For $i = 1 To $iLinesToCheck
		Local $sLine = $aLines[$i]
		Local $iTabs = StringLen($sLine) - StringLen(StringReplace($sLine, @TAB, ""))
		If $iTabs >= 1 Then $iTabLines += 1
	Next

	Return ($iLinesToCheck > 0 And $iTabLines / $iLinesToCheck >= 0.6)
EndFunc   ;==>_IsTSV

;================================================================================
; Helper Function: YAML detection
; True when at least two of the patterns below occur in the content.
;================================================================================
Func _IsYAML($sContent)
	Local $aYAMLPatterns[5] = [ _
			'(?m)^---', _ ; Document separator
			'(?m)^\w+:\s+[^|\>]', _ ; Key with simple value
			'(?m)^\s+-\s+\w+', _ ; List items
			'(?m)^\s+\w+:\s+', _ ; Indented key-value
			'!!?\w+' _ ; Type tags
			]

	Local $iMatches = 0
	For $sPattern In $aYAMLPatterns
		If StringRegExp($sContent, $sPattern) Then $iMatches += 1
	Next

	Return ($iMatches >= 2)
EndFunc   ;==>_IsYAML

;================================================================================
; Helper Function: TOML detection
; True when at least two of the patterns below occur in the content.
;================================================================================
Func _IsTOML($sContent)
	Local $aTOMLPatterns[5] = [ _
			'(?m)^\[[\w\.\-]+\]', _ ; Sections
			'(?m)^\w+\s*=\s*"[^"]*"', _ ; String values
			'(?m)^\w+\s*=\s*\d+', _ ; Number values
			'(?m)^\w+\s*=\s*(true|false)', _ ; Boolean values
			'(?m)^\w+\s*=\s*\[\s*' _ ; Arrays
			]

	Local $iMatches = 0
	For $sPattern In $aTOMLPatterns
		If StringRegExp($sContent, $sPattern) Then $iMatches += 1
	Next

	Return ($iMatches >= 2)
EndFunc   ;==>_IsTOML

;================================================================================
; Helper Function: Detect log file formats
; Returns the name of the first matching log format, or "".
;================================================================================
Func _DetectLogFormat($sContent)
	If StringRegExp($sContent, '\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+-\s+-\s+\[') Then Return "Apache Access Log"
	If StringRegExp($sContent, '\[.*\]\s+\[(error|warn|info)\]\s+\[client') Then Return "Apache Error Log"
	If StringRegExp($sContent, '\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\s+\w+\s+\w+') Then Return "IIS Web Log"
	If StringInStr($sContent, "Event Type:") And StringInStr($sContent, "Event ID:") Then Return "Windows Event Log"
	If StringRegExp($sContent, '(?i)\d{4}[-/]\d{2}[-/]\d{2}.*?(ERROR|WARN|INFO|DEBUG|TRACE)') Then Return "Application Log"
	Return ""
EndFunc   ;==>_DetectLogFormat

;================================================================================
; Helper Function: Detect programming languages
; Returns the name of the first matching language, or "". The checks are loose
; keyword heuristics and order-dependent.
; NOTE(review): the Python rule (import\s+\w+) runs before the Java rule and also
; matches "import java....", so Java sources with imports are likely reported as
; Python - confirm whether the order should change.
;================================================================================
Func _DetectProgrammingLanguage($sContent)
	If StringRegExp($sContent, '(?i)<\?php') Then Return "PHP Script"
	If StringRegExp($sContent, '(?i)(import.*from|export\s+(default\s+)?|(const|let)\s+\w+\s*=)') Then Return "JavaScript (ES6+)"
	If StringRegExp($sContent, '(?i)(def\s+\w+\s*\(|import\s+\w+|from\s+\w+\s+import)') Then Return "Python Script"
	If StringRegExp($sContent, '(?i)#include\s*<.*>|int\s+main\s*\(') Then
		If StringRegExp($sContent, '(?i)(class\s+\w+|public:|private:|protected:|\w+::\w+)') Then Return "C++ Source Code"
		Return "C Source Code"
	EndIf
	If StringRegExp($sContent, '(?i)(using\s+System|namespace\s+\w+|public\s+class\s+\w+)') Then Return "C# Source Code"
	If StringRegExp($sContent, '(?i)(import\s+java\.|public\s+class\s+\w+|package\s+[\w\.]+)') Then Return "Java Source Code"
	If StringRegExp($sContent, '(?i)^#!/bin/(bash|sh|zsh)') Then Return "Shell Script"

	; AutoIt is checked before PowerShell because both use "$" variable syntax
	If StringRegExp($sContent, '(?i)(#include|Func\s+\w+\s*\(|Local\s+\$\w+|\$\w+\s*=)') Then Return "AutoIt Script"

	; PowerShell
	If StringRegExp($sContent, '(?i)(param\s*\(|\$\w+\s*=|Get-\w+|Set-\w+)') Then Return "PowerShell Script"

	If StringRegExp($sContent, '(?i)(@echo\s+(off|on)|set\s+\w+=|goto\s+\w+)') Then Return "Windows Batch File"
	If StringRegExp($sContent, '(?i)(SELECT\s+.*\s+FROM|CREATE\s+TABLE|INSERT\s+INTO)') Then Return "SQL Script"
	Return ""
EndFunc   ;==>_DetectProgrammingLanguage
;================================================================================ ; Helper Function: Detect markup languages ;================================================================================ Func _DetectMarkupLanguage($sContent) If _IsAdvancedMarkdown($sContent) Then Return "Markdown Document" If StringRegExp($sContent, '(?i)\\documentclass\{|\\begin\{document\}') Then Return "LaTeX Document" If StringRegExp($sContent, '<\?xml.*\?>') Then If StringInStr($sContent, "<rss") Then Return "RSS Feed" If StringInStr($sContent, "<feed") Then Return "Atom Feed" Return "XML Document" EndIf If StringRegExp($sContent, '(?i)<!DOCTYPE\s+html|<html') Then Return "HTML Document" If StringRegExp($sContent, '(?i)<svg[^>]*>|xmlns="http://www\.w3\.org/2000/svg"') Then Return "SVG Vector Graphics" Return "" EndFunc ;==>_DetectMarkupLanguage ;================================================================================ ; Helper Function: Advanced Markdown detection ;================================================================================ Func _IsAdvancedMarkdown($sContent) Local $aMarkdownPatterns[8] = [ _ '(?m)^#{1,6}\s+.+', _ ; Headers '(?m)^\s*[-*+]\s+', _ ; Unordered lists '(?m)^\s*\d+\.\s+', _ ; Ordered lists '\*{1,2}[^*]+\*{1,2}', _ ; Emphasis '`[^`]+`', _ ; Inline code '(?m)^```', _ ; Fenced code blocks '\[.+\]\(.+\)', _ ; Links '(?m)^>\s+' _ ; Blockquotes ] Local $iMatches = 0 For $sPattern In $aMarkdownPatterns If StringRegExp($sContent, $sPattern) Then $iMatches += 1 Next Return ($iMatches >= 3) EndFunc ;==>_IsAdvancedMarkdown ;================================================================================ ; Helper Function: Detect specialized text formats ;================================================================================ Func _DetectSpecializedFormats($sContent) If StringRegExp($sContent, '(?m)^(From|To|Subject|Date|Message-ID):\s+') Then Return "Email Message (EML)" If StringRegExp($sContent, '(?i)BEGIN:VCARD') Then Return "vCard 
Contact" If StringRegExp($sContent, '(?i)BEGIN:VCALENDAR') Then Return "iCalendar Event" If StringRegExp($sContent, '(?m)^(diff |--- |\+\+\+ |@@ )') Then Return "Diff/Patch File" If StringRegExp($sContent, '(?i)(MIT License|Apache License|GNU General Public License)') Then Return "Software License" If StringRegExp($sContent, '(?i)("name"\s*:\s*"[^"]+"|"version"\s*:\s*"[^"]+"|"dependencies")') Then Return "package.json (Node.js)" If StringRegExp($sContent, '(?i)^(FROM\s+|RUN\s+|COPY\s+|WORKDIR\s+|EXPOSE\s+)') Then Return "Dockerfile" If StringRegExp($sContent, '(?i)(version:\s*["\' & "']3\.|services:|volumes:|networks:)") Then Return "Docker Compose YAML" If StringRegExp($sContent, '(?i)(apiVersion:\s*|kind:\s*(Deployment|Service|Pod))') Then Return "Kubernetes Manifest" If StringRegExp($sContent, '(?m)^\w+:\s*$|^\t') Then Return "Makefile" If StringRegExp($sContent, '(?m)^\*\..*|^!.*|^#.*ignore') Then Return "Ignore File (.gitignore style)" If StringRegExp($sContent, '(?m)^\d+\.\d+\.\d+\.\d+\s+\w+') Then Return "Hosts File" If StringRegExp($sContent, '(?m)^[A-Z0-9_]+\s*=\s*.+') Then Return "Environment File (.env)" If StringRegExp($sContent, '(?m)^[a-zA-Z0-9\._-]+\s*=\s*.+') Then Return "Java Properties File" If StringRegExp($sContent, "(?m)^WEBVTT") Then Return "WebVTT Subtitles" If StringRegExp($sContent, '(?m)^\d+\R\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}') Then Return "SubRip Subtitles (.srt)" Return "" EndFunc ;==>_DetectSpecializedFormats ;================================================================================ ; Helper Function: Heuristic analysis to distinguish between Text and Binary ;================================================================================ Func _HeuristicTextAnalysis($sBinaryData) Local $iLen = StringLen($sBinaryData) If $iLen = 0 Then Return "" Local $iNonPrintable = 0 Local $iNulls = 0 ; Analyze the bytes in the string For $i = 1 To $iLen Local $iChar = Asc(StringMid($sBinaryData, $i, 1)) ; Count null bytes If 
$iChar = 0 Then $iNulls += 1 EndIf ; Count non-printable characters (outside ASCII 32-126 and common whitespace) If ($iChar < 32 And $iChar <> 9 And $iChar <> 10 And $iChar <> 13) Or ($iChar > 126 And $iChar < 160) Then $iNonPrintable += 1 EndIf Next ; --- HEURISTIC RULES --- ; Rule 1: If there are null bytes, it's almost certainly a binary file. ; Text files rarely contain NUL characters, except for UTF-16/32 (which are handled by BOM). If $iNulls > 0 Then Return "Binary Data" EndIf ; Rule 2: If the ratio of non-printable characters is too high, it's likely a binary file. ; Text files consist mainly of printable characters. Local $fNonPrintableRatio = $iNonPrintable / $iLen If $fNonPrintableRatio > 0.15 Then ; 15% threshold Return "Binary Data" EndIf ; If it passes the above checks, assume it's a plain text document. Return "Plain Text Document" EndFunc ;==>_HeuristicTextAnalysis ! Edited 5 hours ago by Trong argumentum 1 Regards,
Recommended Posts
Create an account or sign in to comment
You need to be a member in order to leave a comment
Create an account
Sign up for a new account in our community. It's easy!
Register a new account Sign in
Already have an account? Sign in here.
Sign In Now