pixelsearch Posted March 30, 2022 Share Posted March 30, 2022 (edited) Hi everybody. A few years ago, @Jon made public his C# and C++ code to detect the encoding of a text file, at this link. It's the detection method used by AutoIt (thanks to @TheDcoder for providing the link a few days ago). As I'm a C++ newbie, I focused on the interesting detection functions, trying to understand them, then compiled the C++ code (to an exe file) and tested it on a few files. Everything worked fine and the files I tested had their content correctly detected, but (if I'm not wrong) the code works, by design, for 1 file only : int wmain(int argc, wchar_t* argv[]) { if (argc != 2) return 1 ... So I thought : why not try to compile the code as a dll, allowing us to check all files from a chosen folder, by running an AutoIt script that calls the dll ? Jon's C++ code is mainly composed of 3 parts, named : 1) text_encoding_detect.h 2) text_encoding_detect.cpp 3) main.cpp Below is main.cpp, reworked for 2 reasons : * it has to be compiled as a dll (and no longer as an exe) * the string messages indicating the detection result have all been replaced by integer values, to avoid string returns from the dll (as discussed yesterday in another thread). No worries, the string messages will all be displayed in AutoIt. The code may look a bit messy because I kept the (now) unused original lines, commented out, so everybody can see what has been altered in main.cpp : // // Copyright 2015-2016 Jonathan Bennett <jon@autoitscript.com> // // https://www.autoitscript.com // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include <stdio.h> #include <tchar.h> #include "text_encoding_detect.h" using namespace AutoIt::Common; // int wmain(int argc, wchar_t* argv[]) extern "C" __declspec(dllexport) int text_encoding_detect (wchar_t* sfilename) { /* if (argc != 2) { wprintf(L"\nUsage: %s filename.", argv[0]); // comment out all wprintf lines return 1; } */ // Open file in binary mode // FILE *file = _wfopen(argv[1], L"rb"); FILE *file = _wfopen(sfilename, L"rb"); if (file == NULL) { // wprintf(L"\nCould not open file.\n"); return 1; } // Get file size fseek(file, 0, SEEK_END); long fsize = ftell(file); fseek(file, 0, SEEK_SET); // Read it all in unsigned char *buffer = new unsigned char[fsize]; fread(buffer, fsize, 1, file); fclose(file); // Detect the encoding TextEncodingDetect textDetect; TextEncodingDetect::Encoding encoding = textDetect.DetectEncoding(buffer, fsize); int iret = 0; // added line. 
Also added all following iret lines // wprintf(L"\nEncoding: "); if (encoding == TextEncodingDetect::None) // wprintf(L"Binary"); iret = 10; else if (encoding == TextEncodingDetect::ASCII) // wprintf(L"ASCII (chars in the 0-127 range)"); iret = 11; else if (encoding == TextEncodingDetect::ANSI) // wprintf(L"ANSI (chars in the range 0-255 range)"); iret = 12; else if (encoding == TextEncodingDetect::UTF8_BOM || encoding == TextEncodingDetect::UTF8_NOBOM) // wprintf(L"UTF-8"); iret = 13; else if (encoding == TextEncodingDetect::UTF16_LE_BOM || encoding == TextEncodingDetect::UTF16_LE_NOBOM) // wprintf(L"UTF-16 Little Endian"); iret = 14; else if (encoding == TextEncodingDetect::UTF16_BE_BOM || encoding == TextEncodingDetect::UTF16_BE_NOBOM) // wprintf(L"UTF-16 Big Endian"); iret = 15; // Free up delete[] buffer; // return 0; return iret; } AutoIt code for using the dll : expandcollapse popup#include <Array.au3> #include <File.au3> #include <MsgBoxConstants.au3> Opt("MustDeclareVars", 1) Local $hDLL = DllOpen(@ScriptDir & "\text_encoding_detect.dll") If $hDLL = - 1 Then Exit Msgbox($MB_TOPMOST, "DllOpen", "error occured") Global $bAbort = False ; user can press Esc to abort while detecting (wrong folder ?) 
Local $sPath = FileSelectFolder("Text encoding detect : choose folder", "", $FSF_NEWDIALOG)
If @error Then ; Cancel, Red X or Esc key in FileSelectFolder()
	; MsgBox(BitOr($MB_TOPMOST, $MB_ICONWARNING), "Error", "No folder selected ")
	DllClose($hDLL)
	Exit
EndIf
; Make sure the folder ends with a backslash : "C:\" stays "C:\", "C:\Temp" becomes "C:\Temp\"
If StringRight($sPath, 1) <> "\" Then $sPath &= "\"

Local $aFileList = _FileListToArray($sPath, Default, $FLTA_FILES) ; return files only
Local $iKeep_error = @error, $sMsg = ""
If $iKeep_error Then
	; Translate the _FileListToArray error number into a readable message, then quit
	Switch $iKeep_error
		Case 1
			$sMsg = "Invalid Folder"
		Case 2
			$sMsg = "Invalid Filter"
		Case 3
			$sMsg = "Invalid Flag"
		Case 4
			$sMsg = "No File(s) Found"
		Case Else
			$sMsg = "Not documented"
	EndSwitch
	MsgBox($MB_TOPMOST, "_FileListToArray", _
			"Path : " & $sPath & @CRLF & _
			"error #" & $iKeep_error & " : " & $sMsg)
	DllClose($hDLL)
	Exit
EndIf
; _ArrayDisplay($aFileList, "$aFileList")

; From here on there is at least 1 file : element 0 of the list holds the file count
Local $sFullName, $aCall, $iCount = $aFileList[0], $aResult[$iCount][2]

HotKeySet("{ESC}", "_Abort")

For $i = 1 To $iCount
	_SplashOn("File : " & $i & " / " & $iCount)
	$sFullName = $sPath & $aFileList[$i]

	$aCall = DllCall($hDLL, "int:cdecl", "text_encoding_detect", "wstr", $sFullName)
	If @error Then
		$iKeep_error = @error
		HotKeySet("{ESC}")
		SplashOff()
		Msgbox($MB_TOPMOST, "DllCall", "error " & $iKeep_error & @CRLF & $sFullName)
		DllClose($hDLL)
		Exit
	EndIf

	; Column 0 = file name, column 1 = detection result as text
	$aResult[$i - 1][0] = $aFileList[$i]
	$aResult[$i - 1][1] = _EncodingMessage($aCall[0])

	If $bAbort Then ; user keyed Esc : keep only the rows filled so far
		Redim $aResult[$i][2]
		ExitLoop
	EndIf
Next

HotKeySet("{ESC}")
SplashOff()

_ArrayDisplay($aResult, ($bAbort ? "PARTIAL - " : "") & $sPath, _
		Default, Default, Default, "File name|Encoding")

DllClose($hDLL)

;==========================================================
; Hotkey handler : unregisters the Esc hotkey and raises the abort flag tested in the main loop
Func _Abort()
	HotKeySet("{ESC}")
	$bAbort = True
EndFunc   ;==>_Abort

;==========================================================
; Converts the integer returned by the dll into Jon's original string messages (C++ main.cpp)
Func _EncodingMessage($iCode)
	Switch $iCode
		Case 1
			Return "Could not open file"
		Case 10
			Return "Binary"
		Case 11
			Return "ASCII (chars in the 0-127 range)"
		Case 12
			Return "ANSI (chars in the range 0-255 range)"
		Case 13
			Return "UTF-8"
		Case 14
			Return "UTF-16 Little Endian"
		Case 15
			Return "UTF-16 Big Endian"
	EndSwitch
	Return "Not documented"
EndFunc   ;==>_EncodingMessage

;==========================================================
; Shows a small, title-less splash window with two vertically centered lines of text
Func _SplashOn($sFirstLine, $sSecondLine = "please wait...")
	SplashTextOn("", $sFirstLine & @CRLF & $sSecondLine, _
			250, 50, -1, -1, $DLG_NOTITLE + $DLG_TEXTVCENTER)
EndFunc   ;==>_SplashOn

2nd file name is a non-ANSI name correctly passed to the dll : "b" & ChrW(1034) A few ideas I'll probably work on : * add a file extension column in the final ArrayDisplay (it will allow to sort by extension) * add another column in ArrayDisplay to indicate the position (offset) in the file where the detection has been made. The goal is not to upload a dll here, but if Jon or an AutoIt developer think it may interest users, then the decision is theirs to compile and upload it as a dll, especially their compilers are certainly better than the free one I use. Meanwhile, any interested user having the tools to compile could be interested too. If you guys think something should be corrected in the script, please indicate it in this thread. Thanks Edited April 5, 2022 by pixelsearch typo TheDcoder and Musashi 2 Link to comment Share on other sites More sharing options...
Recommended Posts
Create an account or sign in to comment
You need to be a member in order to leave a comment
Create an account
Sign up for a new account in our community. It's easy!
Register a new accountSign in
Already have an account? Sign in here.
Sign In Now