From 9dd478697ae3a8950db85901acfb3e5637a7fec4 Mon Sep 17 00:00:00 2001
From: saint <saint@saint.li>
Date: Mon, 9 Sep 2024 20:45:05 +1000
Subject: [PATCH] v0.6 Further attempt to get this reverse double encoding
 right now

---
 goss-correctencoding.rb | 69 ++++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 21 deletions(-)

diff --git a/goss-correctencoding.rb b/goss-correctencoding.rb
index 4d2dec2..f1e2ef6 100644
--- a/goss-correctencoding.rb
+++ b/goss-correctencoding.rb
@@ -31,30 +31,57 @@ class GossamerForumsCorrectEncoding < ImportScripts::Base
   end
 
   # Method to detect and fix text encoding
-  def fix_text_encoding(broken_content)
-    # Step 1: Try to detect encoding of the corrupted (double-encoded) content
-    detection = CharlockHolmes::EncodingDetector.detect(broken_content)
-    original_encoding = detection[:encoding]
-    puts "Original encoding detected: #{original_encoding}"
-  
-    # Step 2: First decode the double-encoded content
-    begin
-      # Convert the content assuming it was double-encoded, so decode twice
-        # First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8
-      first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8')
-      
-      # Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8
-      fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8')
-      
-    rescue => e
-      puts "Error during encoding fix: #{e.message}"
-      puts e.backtrace.join("\n")  # Print the full stack trace
-
-      fixed_content = broken_content # Fall back to the broken content if decoding fails
+  def fix_text_encoding(content)
+    # Detect if content is already UTF-8 (should be the target encoding)
+    if content.encoding == Encoding::UTF_8 && content.valid_encoding?
+      # Return as-is if it is already properly encoded
+      return content
     end
   
-    return fixed_content
+    # Step 1: Assume the bytes are ISO-8859-1 (Latin-1; note this is not identical to Windows-1252) and need converting to UTF-8
+    begin
+      # Convert from ISO-8859-1 to UTF-8:
+      # relabel the bytes as ISO-8859-1 (force_encoding changes `content` in place), then transcode to UTF-8
+      fixed_content = content.force_encoding("ISO-8859-1").encode("UTF-8")
+    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+      # If conversion fails, leave it as is
+      return content
+    end
+  
+    # Step 2: After reversing the encoding once, check if the result is valid UTF-8
+    # If it's still not valid UTF-8, force it back and try again
+    unless fixed_content.valid_encoding?
+      fixed_content = fixed_content.force_encoding("ISO-8859-1").encode("UTF-8")
+    end
+
+  # Return the properly decoded content
+  return fixed_content
 end
+    
+    
+#    # Step 1: Try to detect encoding of the corrupted (double-encoded) content
+#    detection = CharlockHolmes::EncodingDetector.detect(broken_content)
+#    original_encoding = detection[:encoding]
+#    puts "Original encoding detected: #{original_encoding}"
+#  
+#    # Step 2: First decode the double-encoded content
+#    begin
+#      # Convert the content assuming it was double-encoded, so decode twice
+#        # First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8
+#      first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8')
+#      
+#      # Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8
+#      fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8')
+#      
+#    rescue => e
+#      puts "Error during encoding fix: #{e.message}"
+#      puts e.backtrace.join("\n")  # Print the full stack trace
+#
+#      fixed_content = broken_content # Fall back to the broken content if decoding fails
+#    end
+#  
+#    return fixed_content
+#end
 
 #    # Detect encoding
 #    detection = CharlockHolmes::EncodingDetector.detect(raw_content)