v0.4 - Add debugging

This commit is contained in:
David Sainty 2024-09-09 17:43:30 +10:00
parent 655a2619f8
commit fae7ef730a

View File

@ -31,28 +31,43 @@ class GossamerForumsCorrectEncoding < ImportScripts::Base
# Method to detect and fix text encoding # Method to detect and fix text encoding
# Detects the encoding of +text+ with CharlockHolmes and converts it to UTF-8.
#
# Prints the detected encoding, and the error message if the conversion fails.
#
# @param text [String] raw text whose encoding is unknown
# @return [String] the text converted to UTF-8, or +text+ unchanged when no
#   encoding was detected or the conversion raised
def fix_text_encoding(text)
  detection = CharlockHolmes::EncodingDetector.detect(text)
  # Guard the lookup so a nil detection result falls through to the
  # "no encoding" path instead of raising NoMethodError.
  # NOTE(review): presumably detect returns nil when nothing matches - confirm.
  original_encoding = detection && detection[:encoding]
  puts "Original encoding detected: #{original_encoding}"

  # Nothing to convert from when detection produced no encoding.
  return text unless original_encoding

  begin
    # Convert the method's own argument; the previous code referenced
    # `raw_content`, which is not a parameter or local of this method.
    CharlockHolmes::Converter.convert(text, original_encoding, 'UTF-8')
  rescue StandardError => e
    puts "Error during encoding conversion: #{e.message}"
    text # Fall back to the raw input if the conversion fails
  end
end
# rescue StandardError => e
# puts "Error during encoding conversion: #{e.message}"
# puts e.backtrace.join("\n") # Print the full stack trace
# text
# end
# end
# Method to fix encoding issues in post content # Method to fix encoding issues in post content
def fix_encoding def fix_encoding
offset = 0 offset = 0