v0.6 Further attempt to get this reverse double encoding right now

This commit is contained in:
David Sainty 2024-09-09 22:15:26 +10:00
parent 535adfebba
commit 3cb70bb799

View File

@ -44,26 +44,6 @@ class GossamerForumsCorrectEncoding < ImportScripts::Base
corrected_content corrected_content
end end
# Ensure the content is treated as UTF-8 (even if incorrectly encoded)
content.force_encoding('UTF-8')
# Continue decoding until no more invalid sequences are found
previous_content = ""
while previous_content != content
previous_content = content.dup
# Step 1: First attempt to convert from ISO-8859-1 to UTF-8
if content.valid_encoding?
# Decode from ISO-8859-1 (or Windows-1252) to UTF-8
content = content.force_encoding('ISO-8859-1').encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
else
content = previous_content # Stop if invalid encoding issues arise
end
end
return content
end
# # Step 1: Try to detect encoding of the corrupted (double-encoded) content # # Step 1: Try to detect encoding of the corrupted (double-encoded) content
# detection = CharlockHolmes::EncodingDetector.detect(broken_content) # detection = CharlockHolmes::EncodingDetector.detect(broken_content)
# original_encoding = detection[:encoding] # original_encoding = detection[:encoding]