Jump to content

Recommended Posts

Posted (edited)

S:

#Region Example Usage
; Example 1: classify a binary executable (Windows Explorer).
Local $filePathBinary = @WindowsDir & "\explorer.exe"
Local $fileInfoBinary = _GetFileDataType($filePathBinary)

If Not @error Then
    ; No function has been called since _GetFileDataType returned, so @extended
    ; still carries its binary flag (1 = binary, anything else = text).
    Local $isBinary = "Text"
    If @extended = 1 Then $isBinary = "Binary"
    MsgBox(4096, "Result (Binary)", _
            "Path: " & $filePathBinary & @CRLF & _
            "File Type: " & $fileInfoBinary & @CRLF & _
            "Classification: " & $isBinary)
Else
    ; On failure the return value is the error message.
    MsgBox(4096, "Error", "Could not determine file type: " & $fileInfoBinary)
EndIf

; Example 2: classify a text file (this very script).
Local $filePathText = @ScriptFullPath
Local $fileInfoText = _GetFileDataType($filePathText)

If Not @error Then
    Local $isBinary = "Text"
    If @extended = 1 Then $isBinary = "Binary"
    MsgBox(4096, "Result (Text)", _
            "Path: " & $filePathText & @CRLF & _
            "File Type: " & $fileInfoText & @CRLF & _
            "Classification: " & $isBinary)
Else
    MsgBox(4096, "Error", "Could not determine file type: " & $fileInfoText)
EndIf


#EndRegion Example Usage

; =================================================================================================
; Function:    _CheckDataType
; Purpose:     Determine the data type code of any AutoIt variable or string.
; Parameters:
;     $vInput     - Any AutoIt variable or literal to examine.
;     $bShowDebug - Optional. When True, debug messages are printed to the console.
;                   When False (default) the function writes nothing to the console.
; Returns:     Integer type code:
;                 1 = String
;                 2 = Int32
;                 3 = Int64
;                 4 = Double/Float
;                 5 = Binary/Hex
;                 6 = Boolean
;                 7 = Array
;                 8 = Map
;                 9 = Pointer
;                10 = DLL Struct
;                11 = Window Handle
;                12 = Object
;                13 = Keyword
;                14 = Function
;                15 = UserFunction
;                16 = Unknown
; Remarks:     Strings are classified by their content (whitespace stripped first):
;              hex-looking strings return 5, numeric-looking strings return 2/3/4,
;              "true"/"false" return 6, everything else returns 1.
; =================================================================================================
Func _CheckDataType($vInput, $bShowDebug = False)
    ; 1) AutoIt-specific objects
    If IsArray($vInput) Then Return _DebugResult(7, "Array", $vInput, $bShowDebug)
    If IsMap($vInput) Then Return _DebugResult(8, "Map", $vInput, $bShowDebug)
    ; Window handles are tested BEFORE generic pointers: a handle is itself a
    ; pointer-based value, so testing IsPtr first would make code 11 unreachable.
    If IsHWnd($vInput) Then Return _DebugResult(11, "Window Handle", $vInput, $bShowDebug)
    If IsPtr($vInput) Then Return _DebugResult(9, "Pointer", $vInput, $bShowDebug)
    If IsDllStruct($vInput) Then Return _DebugResult(10, "DLL Struct", $vInput, $bShowDebug)
    If IsObj($vInput) Then Return _DebugResult(12, "Object", $vInput, $bShowDebug)
    If IsKeyword($vInput) Then Return _DebugResult(13, "Keyword", $vInput, $bShowDebug)

    Local $sType = VarGetType($vInput)
    If IsFunc($vInput) Then
        If $sType = "UserFunction" Then
            Return _DebugResult(15, "UserFunction", $vInput, $bShowDebug)
        Else
            Return _DebugResult(14, "Function", $vInput, $bShowDebug)
        EndIf
    EndIf

    ; 2) Native numeric
    If $sType = "Int32" Then Return _DebugResult(2, "Int32", $vInput, $bShowDebug)
    If $sType = "Int64" Then Return _DebugResult(3, "Int64", $vInput, $bShowDebug)
    If $sType = "Double" Then Return _DebugResult(4, "Double/Float", $vInput, $bShowDebug)

    ; 3) Native binary & boolean
    If IsBinary($vInput) Then Return _DebugResult(5, "Binary/Hex", $vInput, $bShowDebug)
    If IsBool($vInput) Then Return _DebugResult(6, "Boolean", $vInput, $bShowDebug)

    ; 4) String analysis - strict-hex first, then numeric/boolean-like, then fallback
    If IsString($vInput) Then
        If $bShowDebug Then ConsoleWrite("- [DEBUG] String analysis..." & @CRLF)
        Local $s = StringStripWS($vInput, 8) ; flag 8 = strip ALL whitespace

        ; 4.1) Hex with 0x prefix, even-length body
        If StringRegExp($s, "^0[xX][0-9A-Fa-f]+$") Then
            Local $hexBody = StringMid($s, 3)
            If Mod(StringLen($hexBody), 2) = 0 Then
                Return _DebugResult(5, "Hex string (0x prefix, strict)", $vInput, $bShowDebug)
            ElseIf $bShowDebug Then
                ConsoleWrite("- [DEBUG] Prefix hex odd length, skip" & @CRLF)
            EndIf
        EndIf

        ; 4.2) Bare hex (no prefix), even length,
        ;       but skip if purely digits (prioritize numeric)
        If StringRegExp($s, "^[0-9A-Fa-f]+$") Then
            ; if only digits (no A-F letters), treat as numeric
            If StringRegExp($s, "^[0-9]+$") Then
                If $bShowDebug Then
                    ConsoleWrite("- [DEBUG] Bare digits only, defer to numeric parse" & @CRLF)
                EndIf
            ElseIf Mod(StringLen($s), 2) = 0 Then
                Return _DebugResult(5, "Hex string (bare, strict)", $vInput, $bShowDebug)
            ElseIf $bShowDebug Then
                ConsoleWrite("- [DEBUG] Bare hex odd length, skip" & @CRLF)
            EndIf
        EndIf

        ; 4.3) Numeric parse (int32/int64/double)
        Local $numInfo = _ParseNumeric($s, $bShowDebug)
        If $numInfo[0] Then
            Return _DebugResult($numInfo[1], GetDataTypeName($numInfo[1]) & " (string)", $vInput, $bShowDebug)
        EndIf

        ; 4.4) Boolean-like strings
        Local $sl = StringLower($s)
        If $sl = "true" Or $sl = "false" Then
            Return _DebugResult(6, "Boolean-like string", $vInput, $bShowDebug)
        EndIf

        ; 4.5) Fallback string
        Return _DebugResult(1, "String", $vInput, $bShowDebug)
    EndIf

    ; 5) Unknown
    Return _DebugResult(16, "Unknown", $vInput, $bShowDebug)
EndFunc   ;==>_CheckDataType


; =================================================================================================
; Function:    _DebugResult
; Purpose:     Internal helper to optionally print a debug message and return a code.
; Parameters:
;     $iCode      - Integer code to return.
;     $sText      - Description text to display in debug.
;     $sInput     - The examined value; it is shown in the message with @CRLF
;                   sequences replaced by the literal text '@CRLF'.
;     $bShowDebug - Optional. When False nothing is written to the console.
;                   Defaults to True so existing three-argument calls keep printing.
; Returns:     $iCode
; =================================================================================================
Func _DebugResult($iCode, $sText, $sInput, $bShowDebug = True)
    If $bShowDebug Then
        $sInput = StringReplace($sInput, @CRLF, '@CRLF')
        ConsoleWrite(StringFormat("- [DEBUG] Input '%s' (Len: %d) is: %s (Type=%d)" & @CRLF, $sInput, StringLen($sInput), $sText, $iCode))
    EndIf
    Return $iCode
EndFunc   ;==>_DebugResult


; =================================================================================================
; Function:    _ParseNumeric
; Purpose:     Internal numeric parser supporting plain, US and EU number formats.
; Parameters:
;     $sInput     - String to parse for numeric patterns.
;     $bShowDebug - Optional. When True, the matched format is printed to the console.
; Returns:     A two-element array: [0] = True/False if numeric,
;                                       [1] = type code (2=Int32,3=Int64,4=Double), 0 if not numeric
; Remarks:     Formats are tried in this order and the first match wins:
;                 plain       12345      1234.56
;                 US          1,234.56
;                 EU          1.234,56   1.234.567
;                 thousands   mixed ',' / '.' group separators only
;                 simple EU   123,45
;              Unsigned digit strings with a leading zero ("0123") are rejected.
; =================================================================================================
Func _ParseNumeric($sInput, $bShowDebug = False)
    Local $res[2] = [False, 0]
    If $sInput = "" Then Return $res
    If StringRegExp($sInput, "^0[0-9]+$") Then Return $res

    ; Plain numbers without any group separator. Without this pattern a digit run
    ; longer than three characters ("12345") would match none of the formats below.
    Local $patPlain = "^[+-]?(0|[1-9][0-9]*)(\.[0-9]+)?$"
    Local $patUS = "^[+-]?[0-9]{1,3}(,[0-9]{3})*(\.[0-9]+)?$"
    Local $patEU = "^[+-]?[0-9]{1,3}(\.[0-9]{3})*(,[0-9]+)?$"
    Local $patThousand = "^[+-]?[0-9]{1,3}([,\.][0-9]{3})+$"
    Local $patSimpleEU = "^[+-]?[0-9]+,[0-9]+$"

    Local $sFormat = ""
    Local $numStr = $sInput
    Local $hasDec = False

    If StringRegExp($sInput, $patPlain) Then
        ; Plain: 12345 or 1234.56
        $sFormat = "plain"
        $hasDec = StringInStr($sInput, ".", 0) > 0
    ElseIf StringRegExp($sInput, $patUS) Then
        ; US format: 1,234.56
        $sFormat = "US"
        $numStr = StringReplace($sInput, ",", "")
        $hasDec = StringInStr($sInput, ".", 0) > 0
    ElseIf StringRegExp($sInput, $patEU) Then
        ; EU format: 1.234,56 - only a comma marks a decimal part here, so
        ; "1.234.567" (group separators only) stays an integer.
        $sFormat = "EU"
        $numStr = StringReplace(StringReplace($sInput, ".", ""), ",", ".")
        $hasDec = StringInStr($sInput, ",", 0) > 0
    ElseIf StringRegExp($sInput, $patThousand) Then
        ; Thousand separators only (possibly mixed ',' and '.')
        $sFormat = "thousands"
        $numStr = StringReplace(StringReplace($sInput, ",", ""), ".", "")
    ElseIf StringRegExp($sInput, $patSimpleEU) Then
        ; Simple EU decimals: 123,45
        $sFormat = "simple EU"
        $numStr = StringReplace($sInput, ",", ".")
        $hasDec = True
    Else
        Return $res
    EndIf

    If $bShowDebug Then ConsoleWrite("- [DEBUG] Numeric format: " & $sFormat & @CRLF)

    Local $val = Number($numStr)
    $res[0] = True
    If $hasDec Or $val <> Int($val) Then
        $res[1] = 4 ; Double
    ElseIf $val >= -2147483648 And $val <= 2147483647 Then
        $res[1] = 2 ; fits in Int32
    Else
        $res[1] = 3 ; Int64
    EndIf
    Return $res
EndFunc   ;==>_ParseNumeric


; =================================================================================================
; Function:    GetDataTypeName
; Purpose:     Translate a type code into its display name.
; Parameters:
;     $i - Type code; 1 to 15 have dedicated names.
; Returns:     The name for codes 1-15, "Unknown" for anything else.
; =================================================================================================
Func GetDataTypeName($i)
    ; Index n-1 holds the name for code n.
    Local Const $aNames[15] = ["String", "Int32", "Int64", "Double/Float", "Binary/Hex", _
            "Boolean", "Array", "Map", "Pointer", "DLL Struct", _
            "Window Handle", "Object", "Keyword", "Function", "UserFunction"]

    ; Codes are compared one by one with "=", in ascending order.
    For $iCode = 1 To 15
        If $i = $iCode Then Return $aNames[$iCode - 1]
    Next
    Return "Unknown"
EndFunc   ;==>GetDataTypeName

; -------------------------------------------------------------------
; Helper: True when _CheckDataType classifies $v as Int32 (2),
; Int64 (3) or Double (4) - native values and numeric strings alike.
; -------------------------------------------------------------------
Func _IsNumber($v)
    Local $iTypeCode = _CheckDataType($v, False)
    If $iTypeCode = 2 Then Return True
    If $iTypeCode = 3 Then Return True
    Return ($iTypeCode = 4)
EndFunc   ;==>_IsNumber


;===============================================================================
; Function:    _GetFileDataType - Determine File Types by Hex Signature
; Description: Determines file type & classification (binary/text) by:
;              - Reading a file's first 1KB and converting it to hex
;              - Checking the container sub-tables (RIFF, ZIP, MP4/MOV, TAR, PE)
;              - Matching the leading bytes against the signature database,
;                longest signature first
;              - Applying text analysis heuristics as a last resort
; Parameters:  $sInput - full file path OR raw hex signature string
;                        (optional "0x" prefix, whitespace ignored, whole bytes only)
; Return:      On success: file type string; @extended=1 if binary else 0
;              On failure: SetError(1), @extended=0, returns error message
; Author:      Dao Van Trong - TRONG.PRO
; Example:
;    Local $sType = _GetFileDataType("C:\Windows\explorer.exe")
;    If @error Then
;        ConsoleWrite("Error: " & $sType & @CRLF)
;    Else
;        ConsoleWrite("Type: " & $sType & ", Binary? " & @extended & @CRLF)
;    EndIf
;===============================================================================
Func _GetFileDataType($sInput)
    ; The database is built once and cached for later calls.
    Local Static $aMaps = _CreateSignatureDatabase()
    Local Static $mSignatures = $aMaps[0]
    Local Static $mBinaryFlags = $aMaps[1]
    Local $sHexData = ""

    ; Handle input
    If FileExists($sInput) Then
        $sHexData = _ReadFileSignature($sInput)
        If @error Then Return SetError(1, 0, "Cannot read file: " & $sInput)
    Else
        ; Raw hex signature. It is validated here directly because a generic type
        ; check treats digit-only signatures such as "25504446" (PDF) as numbers.
        $sHexData = _RemoveHexPrefix(StringUpper(StringStripWS($sInput, 8)))
        If Not StringRegExp($sHexData, "^(?:[0-9A-F]{2})+$") Then
            Return SetError(1, 0, "Invalid input")
        EndIf
    EndIf

    ; Container formats first: their generic magic numbers (RIFF, PK, MZ, ftyp) are
    ; also database keys, so a database hit would otherwise hide the specific sub-type.
    Local $sSpecialResult = _CheckSpecialPatterns($sHexData)
    If $sSpecialResult <> "" Then
        Return SetError(0, StringInStr($sSpecialResult, "|BIN|") ? 1 : 0, StringReplace($sSpecialResult, "|BIN|", ""))
    EndIf

    ; Signature lookup, longest (most specific) first. Every whole-byte length from
    ; 32 bytes down to 2 bytes is tried so that no database key length is skipped.
    Local $iHexLen = StringLen($sHexData)
    For $iLen = 64 To 4 Step -2
        If $iHexLen < $iLen Then ContinueLoop
        Local $sTestSig = StringLeft($sHexData, $iLen)
        If MapExists($mSignatures, $sTestSig) Then
            Return SetError(0, $mBinaryFlags[$sTestSig], $mSignatures[$sTestSig])
        EndIf
    Next

    ; Advanced text content analysis
    Local $sTextType = _AdvancedTextAnalysis($sHexData)
    If $sTextType <> "" Then
        Return SetError(0, ($sTextType = "Binary Data") ? 1 : 0, $sTextType)
    EndIf

    Return SetError(1, 0, "Could not determine file type")
EndFunc   ;==>_GetFileDataType

Func _RemoveHexPrefix($sHex)
    ; Strip a leading "0x"; strings without that prefix are returned unchanged.
    ; "=" compares strings case-insensitively, so this single test also covers "0X".
    If Not (StringLeft($sHex, 2) = "0x") Then Return $sHex
    Return StringTrimLeft($sHex, 2)
EndFunc   ;==>_RemoveHexPrefix


; Returns True when $str starts with "0x" or "0X", otherwise False.
; Strings shorter than two characters never have the prefix.
Func _CheckHexPrefix($str)
    If StringLen($str) < 2 Then Return False
    ; Declared Local so that a global variable of the same name is never overwritten.
    Local $sPrefix = StringUpper(StringLeft($str, 2))
    Return ($sPrefix == "0X")
EndFunc   ;==>_CheckHexPrefix

;===============================================================================
; Function:    _ReadFileSignature
; Description: Reads up to the first 1KB of a file in binary mode and returns it
;              as a hex string.
; Parameters:  $sFilePath - full path to file
; Return:      On success: uppercase hex string of the data read
;                          ("" when nothing could be read from an empty file)
;              On failure: SetError(1), return ""
;===============================================================================
Func _ReadFileSignature($sFilePath)
    Local $hFile = FileOpen($sFilePath, 16) ; Open in binary mode
    If $hFile = -1 Then Return SetError(1, 0, "")

    ; Read the first 1KB for comprehensive analysis
    Local $dData = FileRead($hFile, 1024)
    ; Save the read status now: the FileClose call below overwrites @error.
    Local $iReadError = @error
    FileClose($hFile)

    ; A positive status is a genuine read failure.
    If $iReadError > 0 Then Return SetError(1, 0, "")
    ; A negative status means end-of-file was hit; with no bytes there is no signature.
    If $iReadError < 0 Or BinaryLen($dData) = 0 Then Return SetError(0, 0, "")

    Return StringUpper(Hex($dData))
EndFunc   ;==>_ReadFileSignature

;================================================================================
; Helper Function: Create the signature database
; Return: two-element array:
;           [0] = Map: uppercase hex signature -> file type name
;           [1] = Map: uppercase hex signature -> 1 (binary) or 0 (text)
; Notes:  A Map holds one value per key, so a magic number shared by several
;         formats is listed once, with the label that previously won by being
;         assigned last. Keys are matched against the leading bytes of the data,
;         so they are written as whole bytes (even number of hex digits).
;================================================================================
Func _CreateSignatureDatabase()
    Local $mSignatures[], $mBinaryFlags[]

    ; === ARCHIVE & COMPRESSION FORMATS ===
    ; ZIP Family
    __SigDB_Add($mSignatures, $mBinaryFlags, "504B0304", "ZIP Archive", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "504B0506", "ZIP Archive (Empty)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "504B0708", "ZIP Archive (Spanned)", 1)

    ; RAR Family
    __SigDB_Add($mSignatures, $mBinaryFlags, "526172211A0700", "RAR Archive v1.5+", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "526172211A070100", "RAR Archive v5.0+", 1)

    ; 7-Zip Family
    __SigDB_Add($mSignatures, $mBinaryFlags, "377ABCAF271C", "7-Zip Archive", 1)

    ; Compression Formats
    __SigDB_Add($mSignatures, $mBinaryFlags, "1F8B", "GZIP Compressed", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "425A68", "BZIP2 Compressed", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FD377A585A00", "XZ Compressed", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "5D00", "LZMA Compressed", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "28B52FFD", "Zstandard Compressed", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "04224D18", "LZ4 Compressed", 1)

    ; Archive Formats
    __SigDB_Add($mSignatures, $mBinaryFlags, "213C617263683E", "Unix Archive", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "EDABEEDB", "Debian Package", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "43414220", "Microsoft Cabinet", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "4D534346", "Microsoft Cabinet", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "495321", "Inno Setup", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "4E45534D1A01", "Nintendo NES ROM", 1)

    ; === IMAGE FORMATS ===
    ; JPEG Family
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFD8FFE0", "JPEG Image (JFIF)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFD8FFE1", "JPEG Image (EXIF)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFD8FFE2", "JPEG Image (Canon)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFD8FFE3", "JPEG Image (Samsung)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFD8FFE8", "JPEG Image (SPIFF)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFD8FFDB", "JPEG Image", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFD8FFEE", "JPEG Image (Adobe)", 1)

    ; PNG Family
    __SigDB_Add($mSignatures, $mBinaryFlags, "89504E470D0A1A0A", "PNG Image", 1)

    ; GIF Family
    __SigDB_Add($mSignatures, $mBinaryFlags, "474946383761", "GIF Image (87a)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "474946383961", "GIF Image (89a)", 1)

    ; TIFF Family
    __SigDB_Add($mSignatures, $mBinaryFlags, "49492A00", "TIFF Image (Little Endian)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "4D4D002A", "TIFF Image (Big Endian)", 1)

    ; Bitmap Family
    __SigDB_Add($mSignatures, $mBinaryFlags, "424D", "BMP Image", 1)

    ; RIFF container ("RIFF"). WebP, AVI and WAV all start with this magic number;
    ; the sub-type is told apart in _CheckSpecialPatterns. The single label kept
    ; here is the one that was effective before ("WAV Audio" was assigned last).
    __SigDB_Add($mSignatures, $mBinaryFlags, "52494646", "WAV Audio", 1)

    ; Icon Formats
    __SigDB_Add($mSignatures, $mBinaryFlags, "00000100", "Windows Icon (ICO)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "00000200", "Windows Cursor (CUR)", 1)

    ; Advanced Image Formats
    __SigDB_Add($mSignatures, $mBinaryFlags, "38425053", "Photoshop PSD", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "00000C6A502020", "JPEG2000", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FF4FFF51", "JPEG2000", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "000000186674797068656963", "HEIF/HEIC Image", 1)
    ; 00 00 00 20 "ftyp" "avif" - twelve whole bytes (a stray trailing digit made
    ; the previous key 25 characters long, which can never equal a byte prefix).
    __SigDB_Add($mSignatures, $mBinaryFlags, "000000206674797061766966", "AVIF Image", 1)

    ; RAW Image Formats
    __SigDB_Add($mSignatures, $mBinaryFlags, "49494E31", "Nikon NEF", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "4352", "Canon CR2", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "49494944", "Adobe DNG", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "46554A49", "Fuji RAF", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "41524159", "Sony ARW", 1)

    ; === VIDEO FORMATS ===
    ; MP4 Family (checked more deeply in _CheckSpecialPatterns)
    __SigDB_Add($mSignatures, $mBinaryFlags, "0000002066747970", "MP4 Video", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "000000186674797033677035", "MP4 Video (3GP5)", 1)

    ; QuickTime (checked more deeply in _CheckSpecialPatterns)
    __SigDB_Add($mSignatures, $mBinaryFlags, "0000001466747970717420", "QuickTime MOV", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "6D6F6F76", "QuickTime MOV", 1)

    ; Matroska
    __SigDB_Add($mSignatures, $mBinaryFlags, "1A45DFA3", "Matroska MKV", 1)

    ; Flash Video
    __SigDB_Add($mSignatures, $mBinaryFlags, "464C5601", "Flash FLV", 1)

    ; Windows Media
    __SigDB_Add($mSignatures, $mBinaryFlags, "3026B2758E66CF11", "Windows Media (ASF/WMV)", 1)

    ; MPEG
    __SigDB_Add($mSignatures, $mBinaryFlags, "000001BA", "MPEG Program Stream", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "000001B3", "MPEG Elementary Stream", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "47", "MPEG Transport Stream", 1)

    ; === AUDIO FORMATS ===
    ; MP3 Family
    __SigDB_Add($mSignatures, $mBinaryFlags, "494433", "MP3 Audio (ID3v2)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFFB", "MP3 Audio (MPEG-1 Layer 3)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFF3", "MP3 Audio (MPEG-2 Layer 3)", 1)

    ; FLAC
    __SigDB_Add($mSignatures, $mBinaryFlags, "664C6143", "FLAC Audio", 1)

    ; OGG
    __SigDB_Add($mSignatures, $mBinaryFlags, "4F676753", "OGG Audio/Video", 1)

    ; AAC
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFF1", "AAC Audio (ADTS)", 1)

    ; Apple Audio
    __SigDB_Add($mSignatures, $mBinaryFlags, "4D344120", "M4A Audio", 1)

    ; Other Audio
    __SigDB_Add($mSignatures, $mBinaryFlags, "4D546864", "MIDI Audio", 1)

    ; === DOCUMENT FORMATS ===
    ; PDF
    __SigDB_Add($mSignatures, $mBinaryFlags, "25504446", "PDF Document", 1)

    ; Microsoft Office Legacy
    __SigDB_Add($mSignatures, $mBinaryFlags, "D0CF11E0A1B11AE1", "Microsoft Office Document (Legacy)", 1)

    ; Microsoft Office Modern (Office Open XML)
    __SigDB_Add($mSignatures, $mBinaryFlags, "504B030414000600", "Office Open XML (DOCX/XLSX/PPTX)", 1)

    ; PostScript
    __SigDB_Add($mSignatures, $mBinaryFlags, "25215053", "PostScript Document", 0)

    ; === EXECUTABLE FORMATS ===
    ; Windows
    __SigDB_Add($mSignatures, $mBinaryFlags, "4D5A", "Windows Executable (PE)", 1)

    ; Linux
    __SigDB_Add($mSignatures, $mBinaryFlags, "7F454C46", "Linux ELF Executable", 1)

    ; macOS
    __SigDB_Add($mSignatures, $mBinaryFlags, "FEEDFACE", "macOS Mach-O 32-bit", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FEEDFACF", "macOS Mach-O 64-bit", 1)

    ; Java
    __SigDB_Add($mSignatures, $mBinaryFlags, "CAFEBABE", "Java Class File", 1)

    ; === DATABASE FORMATS ===
    __SigDB_Add($mSignatures, $mBinaryFlags, "53514C69746520666F726D61742033", "SQLite Database", 1)
    ; Access files share their first four bytes (00 01 00 00) with TrueType fonts, so
    ; the key is extended with the text that follows: "Standard Jet" / "Standard ACE".
    __SigDB_Add($mSignatures, $mBinaryFlags, "000100005374616E64617264204A6574", "Microsoft Access Database", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "000100005374616E6461726420414345", "Microsoft Access Database", 1)

    ; === FONT FORMATS ===
    __SigDB_Add($mSignatures, $mBinaryFlags, "00010000", "TrueType Font", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "4F54544F", "OpenType Font", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "774F4646", "Web Open Font (WOFF)", 1)
    __SigDB_Add($mSignatures, $mBinaryFlags, "774F4632", "Web Open Font 2 (WOFF2)", 1)

    ; === DISK IMAGES ===
    __SigDB_Add($mSignatures, $mBinaryFlags, "4344303031", "ISO 9660 CD Image", 1)

    ; === TEXT FORMATS WITH SIGNATURES ===
    ; RTF
    __SigDB_Add($mSignatures, $mBinaryFlags, "7B5C72746631", "Rich Text Format", 0)

    ; XML/HTML variants
    __SigDB_Add($mSignatures, $mBinaryFlags, "3C3F786D6C", "XML Document", 0)
    __SigDB_Add($mSignatures, $mBinaryFlags, "3C21444F43545950", "HTML Document", 0)

    ; Script files with shebang
    __SigDB_Add($mSignatures, $mBinaryFlags, "2321", "Shell/Script File", 0)

    ; === ENCODING SIGNATURES (BOMs) ===
    __SigDB_Add($mSignatures, $mBinaryFlags, "EFBBBF", "UTF-8 with BOM", 0)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFFE", "UTF-16 Little Endian", 0)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FEFF", "UTF-16 Big Endian", 0)
    __SigDB_Add($mSignatures, $mBinaryFlags, "FFFE0000", "UTF-32 Little Endian", 0)
    __SigDB_Add($mSignatures, $mBinaryFlags, "0000FEFF", "UTF-32 Big Endian", 0)

    ; === SECURITY & CERTIFICATES ===
    __SigDB_Add($mSignatures, $mBinaryFlags, "2D2D2D2D2D424547494E", "PEM Certificate", 0)
    ; DER: SEQUENCE tag 0x30 followed by 0x82 (two-byte length form). The previous
    ; three-digit key "308" ended in the middle of a byte.
    __SigDB_Add($mSignatures, $mBinaryFlags, "3082", "DER Certificate", 1)

    Local $aResult[2] = [$mSignatures, $mBinaryFlags]
    Return $aResult
EndFunc   ;==>_CreateSignatureDatabase

; Internal helper: store one signature in both maps under the same key, so a type
; name can never exist without its binary flag (or the other way round).
Func __SigDB_Add(ByRef $mSignatures, ByRef $mBinaryFlags, $sHexSignature, $sFileType, $iBinaryFlag)
    $mSignatures[$sHexSignature] = $sFileType
    $mBinaryFlags[$sHexSignature] = $iBinaryFlag
EndFunc   ;==>__SigDB_Add

;================================================================================
; Helper Function: Check for special patterns (not starting at offset 0)
; Parameters: $sHexData - hex string (two characters per byte) of the leading data
; Return:     "|BIN|" followed by the detected type name, or "" when nothing matched
; Note:       Byte offset N corresponds to hex character position 2*N + 1.
;================================================================================
Func _CheckSpecialPatterns($sHexData)
    ; RIFF-based formats carry their form type at byte offset 8
    If StringLeft($sHexData, 8) = "52494646" Then ; RIFF
        Local $sSubType = StringMid($sHexData, 17, 8) ; Offset 8
        Switch $sSubType
            Case "57415645" ; WAVE
                Return "|BIN|WAV Audio"
            Case "41564920" ; AVI
                Return "|BIN|AVI Video"
            Case "57454250" ; WEBP
                Return "|BIN|WebP Image"
        EndSwitch
    EndIf

    ; QuickTime/MP4 - check ftyp at offset 4
    If StringMid($sHexData, 9, 8) = "66747970" Then ; ftyp
        Local $sBrand = StringMid($sHexData, 17, 8) ; Brand (offset 8)
        Switch $sBrand
            Case "69736F6D", "6D703431", "6D703432" ; isom, mp41, mp42
                Return "|BIN|MP4 Video"
            Case "71742020" ; qt
                Return "|BIN|QuickTime MOV"
            Case "33677035", "33677034" ; 3gp5, 3gp4
                Return "|BIN|3GPP Video"
        EndSwitch
    EndIf

    ; ZIP-based formats
    If StringLeft($sHexData, 8) = "504B0304" Then ; PK..
        ; Look for specific entry names in the ZIP data to identify the format
        If __HexContainsBytes($sHexData, "776F72642F") Then ; word/
            Return "|BIN|Microsoft Word (DOCX)"
        ElseIf __HexContainsBytes($sHexData, "786C2F") Then ; xl/
            Return "|BIN|Microsoft Excel (XLSX)"
        ElseIf __HexContainsBytes($sHexData, "7070742F") Then ; ppt/
            Return "|BIN|Microsoft PowerPoint (PPTX)"
        ElseIf __HexContainsBytes($sHexData, "636F6E74656E742E786D6C") Then ; content.xml
            Return "|BIN|OpenOffice Document"
        ElseIf __HexContainsBytes($sHexData, "4D4554412D494E462F") Then ; META-INF/
            Return "|BIN|Java JAR Archive"
        ElseIf __HexContainsBytes($sHexData, "416E64726F69644D616E69666573742E786D6C") Then ; AndroidManifest.xml
            Return "|BIN|Android APK"
        EndIf
    EndIf

    ; TAR detection: "ustar" magic at byte offset 257 -> hex position 2*257 + 1 = 515
    Local $sTarMagic = StringMid($sHexData, 515, 12)
    If $sTarMagic = "757374617200" Or $sTarMagic = "757374617220" Then
        Return "|BIN|TAR Archive"
    EndIf

    ; EXE sub-type detection
    If StringLeft($sHexData, 4) = "4D5A" Then ; MZ header
        ; Look for the "PE\0\0" signature
        If __HexContainsBytes($sHexData, "50450000") Then
            Return "|BIN|Windows PE Executable"
        EndIf
    EndIf

    Return ""
EndFunc   ;==>_CheckSpecialPatterns

; Internal helper: True when $sPattern occurs in $sHexData starting on a byte
; boundary (odd character position). A plain substring search could also match
; across two half-bytes, which does not correspond to any real byte sequence.
Func __HexContainsBytes($sHexData, $sPattern)
    Local $iStart = 1
    While True
        Local $iPos = StringInStr($sHexData, $sPattern, 0, 1, $iStart)
        If $iPos = 0 Then Return False
        If Mod($iPos, 2) = 1 Then Return True
        $iStart = $iPos + 1 ; misaligned hit: continue right after it
    WEnd
EndFunc   ;==>__HexContainsBytes

;================================================================================
; Helper Function: Decode the leading part of a hex string into characters
; Only the first 1024 hex characters (512 bytes) are decoded; a trailing unpaired
; hex digit is ignored. Each pair becomes one character via Dec() and Chr().
;================================================================================
Func _HexToBinary($sHexData)
    Local Const $iMaxHexChars = 1024
    Local $sWindow = StringLeft($sHexData, $iMaxHexChars)
    Local $iPairCount = Int(StringLen($sWindow) / 2) ; complete pairs only
    Local $sDecoded = ""

    For $iPair = 0 To $iPairCount - 1
        $sDecoded &= Chr(Dec(StringMid($sWindow, 2 * $iPair + 1, 2)))
    Next

    Return $sDecoded
EndFunc   ;==>_HexToBinary

;================================================================================
; Helper Function: Advanced text analysis
; Decodes the hex data and asks three detectors in turn - encoding, content
; patterns, heuristics - returning the first non-empty answer, or "" if none.
;================================================================================
Func _AdvancedTextAnalysis($sHexData)
    Local $sDecoded = _HexToBinary($sHexData)
    If $sDecoded = "" Then Return ""

    ; Each later detector runs only while the verdict is still empty.
    Local $sVerdict = _DetectEncoding($sHexData)
    If $sVerdict = "" Then $sVerdict = _DetectTextContent($sDecoded)
    If $sVerdict = "" Then $sVerdict = _HeuristicTextAnalysis($sDecoded)

    If $sVerdict = "" Then Return ""
    Return $sVerdict
EndFunc   ;==>_AdvancedTextAnalysis

;================================================================================
; Helper Function: Smaller of two values ($b when they compare equal)
;================================================================================
Func __Min($a, $b)
    If $a < $b Then Return $a
    Return $b
EndFunc   ;==>__Min

;================================================================================
; Helper Function: Detect encoding and text variants
; Placeholder: always returns "" (no verdict). BOM signatures are matched through
; the signature database; logic for guessing an encoding without a BOM can be
; added here later.
;================================================================================
Func _DetectEncoding($sHexData)
    #forceref $sHexData
    Local Const $sNoVerdict = ""
    Return $sNoVerdict
EndFunc   ;==>_DetectEncoding

;================================================================================
; Helper Function: Detect content type based on patterns
; Parameters: $sContent - decoded text to classify
; Return:     Name of the first matching category, or "" when nothing matched.
;             Checks run in a fixed order: configuration files, data formats,
;             log files, programming languages, markup, specialized formats.
;================================================================================
Func _DetectTextContent($sContent)
    ; === Configuration Files ===
    If StringRegExp($sContent, '(?m)^\s*\[[^\]]+\]') Then Return "INI Configuration File"
    ; (?m) lets ^ match at the start of every line, as in the INI test above, so a
    ; "server=" / "host=" / "port=" key is found on any line, not just the first.
    If StringRegExp($sContent, '(?im)^(?:server|host|port)\s*=') Then Return "Configuration File"

    ; === Data Formats ===
    If _IsAdvancedJSON($sContent) Then Return "JSON Data File"
    If _IsAdvancedCSV($sContent) Then Return "CSV Data File"
    If _IsTSV($sContent) Then Return "TSV (Tab Separated Values)"
    If _IsYAML($sContent) Then Return "YAML Configuration"
    If _IsTOML($sContent) Then Return "TOML Configuration"

    ; === Log Files ===
    Local $sLogType = _DetectLogFormat($sContent)
    If $sLogType <> "" Then Return $sLogType

    ; === Programming Languages ===
    Local $sCodeType = _DetectProgrammingLanguage($sContent)
    If $sCodeType <> "" Then Return $sCodeType

    ; === Markup Languages ===
    Local $sMarkupType = _DetectMarkupLanguage($sContent)
    If $sMarkupType <> "" Then Return $sMarkupType

    ; === Specialized Text Formats ===
    Local $sSpecialType = _DetectSpecializedFormats($sContent)
    If $sSpecialType <> "" Then Return $sSpecialType

    Return ""
EndFunc   ;==>_DetectTextContent

;================================================================================
; Helper Function: Advanced JSON detection
; True when the trimmed text is wrapped in {...} or [...] AND at least two of six
; typical JSON fragments occur somewhere in the (untrimmed) content.
;================================================================================
Func _IsAdvancedJSON($sContent)
    Local $sBody = StringStripWS($sContent, 3) ; flag 3 = leading + trailing whitespace
    If $sBody = "" Then Return False

    ; The outermost characters must form a matching object or array wrapper.
    Local $sOpen = StringLeft($sBody, 1)
    Local $sClose = StringRight($sBody, 1)
    Local $bObject = ($sOpen = "{" And $sClose = "}")
    Local $bArray = ($sOpen = "[" And $sClose = "]")
    If Not ($bObject Or $bArray) Then Return False

    Local $aProbes[6] = [ _
            '"[^"]*"\s*:\s*"[^"]*"', _     ; string member
            '"[^"]*"\s*:\s*\d+', _         ; numeric member
            '"[^"]*"\s*:\s*(true|false|null)', _ ; literal member
            '"[^"]*"\s*:\s*\[', _          ; member holding an array
            '"[^"]*"\s*:\s*\{', _          ; member holding an object
            '\[\s*("|\d|\{|\[)' _          ; array opening with a valid element
            ]

    Local $iHits = 0
    For $iProbe = 0 To UBound($aProbes) - 1
        If StringRegExp($sContent, $aProbes[$iProbe]) Then $iHits += 1
    Next
    Return ($iHits >= 2)
EndFunc   ;==>_IsAdvancedJSON

;================================================================================
; Helper Function: Advanced CSV detection
; Samples up to the first 10 lines. Blank lines and lines starting with "#" are
; not inspected but still count in the sample size. True when at least 50% of the
; sampled lines contain a comma and at least 70% share the column count of the
; first comma line.
;================================================================================
Func _IsAdvancedCSV($sContent)
    Local $aRows = StringSplit(StringStripCR($sContent), @LF, 1)
    Local $iRowTotal = $aRows[0]
    If $iRowTotal < 2 Then Return False

    ; Sample size: 10 lines, or all of them when there are fewer.
    Local $iSample = 10
    If $iRowTotal <= $iSample Then $iSample = $iRowTotal

    Local $iRowsWithComma = 0
    Local $iRowsMatchingWidth = 0
    Local $iExpectedWidth = -1 ; column count of the first comma line, -1 = not seen yet
    Local $sRow, $iCommaCount

    For $iRow = 1 To $iSample
        $sRow = StringStripWS($aRows[$iRow], 3)
        If $sRow = "" Then ContinueLoop
        If StringLeft($sRow, 1) = "#" Then ContinueLoop

        ; StringReplace reports the number of replacements in @extended,
        ; which here is the number of commas on the line.
        StringReplace($sRow, ",", "")
        $iCommaCount = @extended
        If $iCommaCount < 1 Then ContinueLoop

        $iRowsWithComma += 1
        If $iExpectedWidth = -1 Then
            $iExpectedWidth = $iCommaCount + 1
            $iRowsMatchingWidth = 1
        ElseIf $iExpectedWidth = $iCommaCount + 1 Then
            $iRowsMatchingWidth += 1
        EndIf
    Next

    If $iSample = 0 Then Return False
    Return ($iRowsWithComma / $iSample >= 0.5 And $iRowsMatchingWidth / $iSample >= 0.7)
EndFunc   ;==>_IsAdvancedCSV

;================================================================================
; Helper Function: TSV detection
;================================================================================
Func _IsTSV($sContent)
    ; Purpose:  Heuristically decide whether $sContent looks like tab-separated data.
    ; Params:   $sContent - text to inspect (CR characters are stripped before splitting).
    ; Returns:  True when at least 60% of the sampled lines (up to the first 5)
    ;           contain a TAB character; False otherwise or when the text has
    ;           fewer than two lines.
    Local $aRows = StringSplit(StringStripCR($sContent), @LF, 1)
    Local $iRowCount = $aRows[0]
    If $iRowCount < 2 Then Return False

    Local $iSample = __Min(5, $iRowCount)
    Local $iRowsWithTab = 0
    For $iRow = 1 To $iSample
        ; Only presence matters, not how many tabs the row holds.
        If StringInStr($aRows[$iRow], @TAB) > 0 Then $iRowsWithTab += 1
    Next

    If $iSample <= 0 Then Return False
    Return (($iRowsWithTab / $iSample) >= 0.6)
EndFunc   ;==>_IsTSV

;================================================================================
; Helper Function: YAML detection
;================================================================================
Func _IsYAML($sContent)
    ; Purpose:  Heuristically decide whether $sContent looks like YAML.
    ; Params:   $sContent - text to inspect.
    ; Returns:  True when at least two of the five indicator patterns below are
    ;           found somewhere in the text, False otherwise.
    Local Const $iRequiredHits = 2
    Local $aIndicators[5] = [ _
            '(?m)^---', _                  ; Document separator
            '(?m)^\w+:\s+[^|\>]', _        ; Key with simple value
            '(?m)^\s+-\s+\w+', _           ; List items
            '(?m)^\s+\w+:\s+', _           ; Indented key-value
            '!!?\w+' _                     ; Type tags
            ]

    ; One point per indicator kind, regardless of how often it occurs.
    Local $iHits = 0
    For $iIdx = 0 To UBound($aIndicators) - 1
        $iHits += (StringRegExp($sContent, $aIndicators[$iIdx]) ? 1 : 0)
    Next
    Return ($iHits >= $iRequiredHits)
EndFunc   ;==>_IsYAML

;================================================================================
; Helper Function: TOML detection
;================================================================================
Func _IsTOML($sContent)
    ; Purpose:  Heuristically decide whether $sContent looks like TOML.
    ; Params:   $sContent - text to inspect.
    ; Returns:  True as soon as two different line shapes from the table below
    ;           have been found in the text, False when fewer than two occur.
    Local $aLineShapes[5] = [ _
            '(?m)^\[[\w\.\-]+\]', _        ; Sections
            '(?m)^\w+\s*=\s*"[^"]*"', _   ; String values
            '(?m)^\w+\s*=\s*\d+', _       ; Number values
            '(?m)^\w+\s*=\s*(true|false)', _ ; Boolean values
            '(?m)^\w+\s*=\s*\[\s*' _      ; Arrays
            ]

    Local $iSeen = 0
    For $iIdx = 0 To UBound($aLineShapes) - 1
        If Not StringRegExp($sContent, $aLineShapes[$iIdx]) Then ContinueLoop
        $iSeen += 1
        ; Two distinct shapes are enough; no need to test the rest.
        If $iSeen >= 2 Then Return True
    Next
    Return False
EndFunc   ;==>_IsTOML

;================================================================================
; Helper Function: Detect log file formats
;================================================================================
Func _DetectLogFormat($sContent)
    ; Purpose:  Name the log format that $sContent appears to be written in.
    ; Params:   $sContent - text to inspect.
    ; Returns:  One of "Apache Access Log", "Apache Error Log", "IIS Web Log",
    ;           "Windows Event Log", "Application Log", or "" when nothing matches.
    ; Notes:    Cases are evaluated top to bottom and the first hit wins. The IIS
    ;           timestamp pattern is tried before the generic application-log
    ;           pattern, so text containing "YYYY-MM-DD HH:MM:SS word word" is
    ;           reported as IIS even when it also carries a log-level keyword.
    Select
        Case StringRegExp($sContent, '\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+-\s+-\s+\[')
            Return "Apache Access Log"

        Case StringRegExp($sContent, '\[.*\]\s+\[(error|warn|info)\]\s+\[client')
            Return "Apache Error Log"

        Case StringRegExp($sContent, '\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}\s+\w+\s+\w+')
            Return "IIS Web Log"

        Case StringInStr($sContent, "Event Type:") And StringInStr($sContent, "Event ID:")
            Return "Windows Event Log"

        Case StringRegExp($sContent, '(?i)\d{4}[-/]\d{2}[-/]\d{2}.*?(ERROR|WARN|INFO|DEBUG|TRACE)')
            Return "Application Log"
    EndSelect

    Return ""
EndFunc   ;==>_DetectLogFormat

;================================================================================
; Helper Function: Detect programming languages
;================================================================================
Func _DetectProgrammingLanguage($sContent)
    ; Purpose:  Guess which programming/scripting language $sContent is written in.
    ; Params:   $sContent - source text to inspect.
    ; Returns:  A descriptive label such as "Python Script" or "C++ Source Code",
    ;           or "" when no pattern matches.
    ; Notes:    Checks run top to bottom and the first hit wins, so the order below
    ;           is part of the behaviour. Unambiguous markers are tested first.

    ; --- Unambiguous markers -------------------------------------------------
    ; A shebang on the very first line is definitive. It has to be tested before
    ; the JavaScript check, whose "export\s+" alternative would otherwise claim
    ; any shell script that contains an "export VAR=..." line.
    If StringRegExp($sContent, '(?i)^#!/bin/(bash|sh|zsh)') Then Return "Shell Script"
    If StringRegExp($sContent, '(?i)<\?php') Then Return "PHP Script"
    ; An #include of an .au3 file can only be AutoIt. It has to be tested before
    ; the C/C++ check, which would otherwise accept '#include <Array.au3>' as C.
    If StringRegExp($sContent, '(?i)#include\s*(<[^>\r\n]+\.au3>|"[^"\r\n]+\.au3")') Then Return "AutoIt Script"

    ; --- Keyword heuristics --------------------------------------------------
    If StringRegExp($sContent, '(?i)(import.*from|export\s+(default\s+)?|(const|let)\s+\w+\s*=)') Then Return "JavaScript (ES6+)"
    If StringRegExp($sContent, '(?i)(def\s+\w+\s*\(|import\s+\w+|from\s+\w+\s+import)') Then Return "Python Script"
    If StringRegExp($sContent, '(?i)#include\s*<.*>|int\s+main\s*\(') Then
        ; Class syntax, access specifiers or the scope operator promote C to C++.
        If StringRegExp($sContent, '(?i)(class\s+\w+|public:|private:|protected:|\w+::\w+)') Then Return "C++ Source Code"
        Return "C Source Code"
    EndIf
    If StringRegExp($sContent, '(?i)(using\s+System|namespace\s+\w+|public\s+class\s+\w+)') Then Return "C# Source Code"
    If StringRegExp($sContent, '(?i)(import\s+java\.|public\s+class\s+\w+|package\s+[\w\.]+)') Then Return "Java Source Code"

    ; AutoIt is checked before PowerShell because both use '$'-prefixed variables.
    ; As a consequence the '\$\w+\s*=' alternative of the PowerShell pattern can
    ; never be the deciding match: the AutoIt pattern returns on it first.
    If StringRegExp($sContent, '(?i)(#include|Func\s+\w+\s*\(|Local\s+\$\w+|\$\w+\s*=)') Then Return "AutoIt Script"
    If StringRegExp($sContent, '(?i)(param\s*\(|\$\w+\s*=|Get-\w+|Set-\w+)') Then Return "PowerShell Script"

    If StringRegExp($sContent, '(?i)(@echo\s+(off|on)|set\s+\w+=|goto\s+\w+)') Then Return "Windows Batch File"
    If StringRegExp($sContent, '(?i)(SELECT\s+.*\s+FROM|CREATE\s+TABLE|INSERT\s+INTO)') Then Return "SQL Script"
    Return ""
EndFunc   ;==>_DetectProgrammingLanguage

;================================================================================
; Helper Function: Detect markup languages
;================================================================================
Func _DetectMarkupLanguage($sContent)
    ; Purpose:  Name the markup language that $sContent appears to be written in.
    ; Params:   $sContent - text to inspect.
    ; Returns:  "Markdown Document", "LaTeX Document", "RSS Feed", "Atom Feed",
    ;           "SVG Vector Graphics", "XML Document", "HTML Document", or ""
    ;           when nothing matches. The first matching check wins.
    If _IsAdvancedMarkdown($sContent) Then Return "Markdown Document"
    If StringRegExp($sContent, '(?i)\\documentclass\{|\\begin\{document\}') Then Return "LaTeX Document"

    ; Shared by the XML branch and the stand-alone SVG check further down.
    Local Const $sSvgPattern = '(?i)<svg[^>]*>|xmlns="http://www\.w3\.org/2000/svg"'

    If StringRegExp($sContent, '<\?xml.*\?>') Then
        ; Refine generic XML into the dialects we know about.
        If StringInStr($sContent, "<rss") Then Return "RSS Feed"
        If StringInStr($sContent, "<feed") Then Return "Atom Feed"
        ; SVG files that carry an XML declaration must be recognised here;
        ; otherwise this branch returns "XML Document" before the SVG check
        ; below is ever reached.
        If StringRegExp($sContent, $sSvgPattern) Then Return "SVG Vector Graphics"
        Return "XML Document"
    EndIf

    ; HTML is tested before bare SVG so that pages embedding inline <svg>
    ; elements are still reported as HTML.
    If StringRegExp($sContent, '(?i)<!DOCTYPE\s+html|<html') Then Return "HTML Document"
    If StringRegExp($sContent, $sSvgPattern) Then Return "SVG Vector Graphics"
    Return ""
EndFunc   ;==>_DetectMarkupLanguage

;================================================================================
; Helper Function: Advanced Markdown detection
;================================================================================
Func _IsAdvancedMarkdown($sContent)
    ; Purpose:  Heuristically decide whether $sContent looks like Markdown.
    ; Params:   $sContent - text to inspect.
    ; Returns:  True once three different Markdown constructs from the table
    ;           below have been found in the text, False when fewer occur.
    Local Const $iRequiredKinds = 3
    Local $aConstructs[8] = [ _
            '(?m)^#{1,6}\s+.+', _          ; Headers
            '(?m)^\s*[-*+]\s+', _          ; Unordered lists
            '(?m)^\s*\d+\.\s+', _          ; Ordered lists
            '\*{1,2}[^*]+\*{1,2}', _       ; Emphasis
            '`[^`]+`', _                   ; Inline code
            '(?m)^```', _                  ; Fenced code blocks
            '\[.+\]\(.+\)', _              ; Links
            '(?m)^>\s+' _                  ; Blockquotes
            ]

    Local $iKindsFound = 0
    For $iIdx = 0 To UBound($aConstructs) - 1
        If Not StringRegExp($sContent, $aConstructs[$iIdx]) Then ContinueLoop
        $iKindsFound += 1
        ; Enough evidence collected; the remaining patterns cannot change the answer.
        If $iKindsFound >= $iRequiredKinds Then Return True
    Next
    Return False
EndFunc   ;==>_IsAdvancedMarkdown

;================================================================================
; Helper Function: Detect specialized text formats
;================================================================================
Func _DetectSpecializedFormats($sContent)
    ; Purpose:  Recognise a number of special-purpose text formats.
    ; Params:   $sContent - text to inspect.
    ; Returns:  The label of the first rule whose pattern matches, or "" when no
    ;           rule matches.
    ; Notes:    Rules are evaluated strictly in table order, so earlier (often
    ;           broader) rules take precedence over later ones. Do not reorder
    ;           the table without re-checking which formats shadow which.
    Local $aRules[16][2] = [ _
            ['(?m)^(From|To|Subject|Date|Message-ID):\s+', "Email Message (EML)"], _
            ['(?i)BEGIN:VCARD', "vCard Contact"], _
            ['(?i)BEGIN:VCALENDAR', "iCalendar Event"], _
            ['(?m)^(diff |--- |\+\+\+ |@@ )', "Diff/Patch File"], _
            ['(?i)(MIT License|Apache License|GNU General Public License)', "Software License"], _
            ['(?i)("name"\s*:\s*"[^"]+"|"version"\s*:\s*"[^"]+"|"dependencies")', "package.json (Node.js)"], _
            ['(?i)^(FROM\s+|RUN\s+|COPY\s+|WORKDIR\s+|EXPOSE\s+)', "Dockerfile"], _
            ['(?i)(version:\s*["\' & "']3\.|services:|volumes:|networks:)", "Docker Compose YAML"], _
            ['(?i)(apiVersion:\s*|kind:\s*(Deployment|Service|Pod))', "Kubernetes Manifest"], _
            ['(?m)^\w+:\s*$|^\t', "Makefile"], _
            ['(?m)^\*\..*|^!.*|^#.*ignore', "Ignore File (.gitignore style)"], _
            ['(?m)^\d+\.\d+\.\d+\.\d+\s+\w+', "Hosts File"], _
            ['(?m)^[A-Z0-9_]+\s*=\s*.+', "Environment File (.env)"], _
            ['(?m)^[a-zA-Z0-9\._-]+\s*=\s*.+', "Java Properties File"], _
            ["(?m)^WEBVTT", "WebVTT Subtitles"], _
            ['(?m)^\d+\R\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}', "SubRip Subtitles (.srt)"] _
            ]

    ; Column 0 holds the regular expression, column 1 the label to report.
    For $iRule = 0 To UBound($aRules, 1) - 1
        If StringRegExp($sContent, $aRules[$iRule][0]) Then Return $aRules[$iRule][1]
    Next
    Return ""
EndFunc   ;==>_DetectSpecializedFormats

;================================================================================
; Helper Function: Heuristic analysis to distinguish between Text and Binary
;================================================================================
Func _HeuristicTextAnalysis($sBinaryData)
    ; Purpose:  Classify a chunk of data as text or binary by looking at its
    ;           character codes.
    ; Params:   $sBinaryData - the data to classify, passed as a string.
    ; Returns:  ""                    - empty input
    ;           "Binary Data"         - a NUL character is present, or more than
    ;                                   15% of the characters are control codes
    ;           "Plain Text Document" - everything else
    Local $iTotal = StringLen($sBinaryData)
    If $iTotal = 0 Then Return ""

    Local $iControlChars = 0
    For $iPos = 1 To $iTotal
        Switch Asc(StringMid($sBinaryData, $iPos, 1))
            Case 0
                ; Rule 1: a single NUL settles the question, so stop scanning.
                Return "Binary Data"
            Case 9, 10, 13
                ; TAB, LF and CR are ordinary whitespace in text files.
                ContinueLoop
            Case 1 To 31, 127 To 159
                ; Remaining control codes, DEL, and the 128-159 range.
                $iControlChars += 1
        EndSwitch
    Next

    ; Rule 2: text consists mainly of printable characters; tolerate up to 15%.
    If ($iControlChars / $iTotal) > 0.15 Then Return "Binary Data"

    Return "Plain Text Document"
EndFunc   ;==>_HeuristicTextAnalysis

!

 

Edited by Trong

Regards,
 

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now
  • Recently Browsing   0 members

    • No registered users viewing this page.
×
×
  • Create New...