v0.6 Further attempt to get this reverse double encoding right now
This commit is contained in:
		| @@ -31,31 +31,58 @@ class GossamerForumsCorrectEncoding < ImportScripts::Base | ||||
|   end | ||||
|  | ||||
|   # Method to detect and fix text encoding | ||||
|   def fix_text_encoding(broken_content) | ||||
|     # Step 1: Try to detect encoding of the corrupted (double-encoded) content | ||||
|     detection = CharlockHolmes::EncodingDetector.detect(broken_content) | ||||
|     original_encoding = detection[:encoding] | ||||
|     puts "Original encoding detected: #{original_encoding}" | ||||
|    | ||||
|     # Step 2: First decode the double-encoded content | ||||
|     begin | ||||
|       # Convert the content assuming it was double-encoded, so decode twice | ||||
|         # First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8 | ||||
|       first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8') | ||||
|        | ||||
|       # Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8 | ||||
|       fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8') | ||||
|        | ||||
|     rescue => e | ||||
|       puts "Error during encoding fix: #{e.message}" | ||||
|       puts e.backtrace.join("\n")  # Print the full stack trace | ||||
|  | ||||
|       fixed_content = broken_content # Fall back to the broken content if decoding fails | ||||
|   def fix_text_encoding(content) | ||||
|     # Detect if content is already UTF-8 (should be the target encoding) | ||||
|     if content.encoding == Encoding::UTF_8 && content.valid_encoding? | ||||
|       # Return as-is if it is already properly encoded | ||||
|       return content | ||||
|     end | ||||
|    | ||||
|     return fixed_content | ||||
|     # Step 1: Assume the content was incorrectly encoded as ISO-8859-1 (Windows-1252) and needs to be corrected | ||||
|     begin | ||||
|       # Convert from ISO-8859-1 (or Windows-1252) back to UTF-8 | ||||
|       # Force the encoding to ISO-8859-1 first, then encode to UTF-8 properly | ||||
|       fixed_content = content.force_encoding("ISO-8859-1").encode("UTF-8") | ||||
|     rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError | ||||
|       # If conversion fails, leave it as is | ||||
|       return content | ||||
|     end | ||||
|    | ||||
|     # Step 2: After reversing the encoding once, check if the result is valid UTF-8 | ||||
|     # If it's still not valid UTF-8, force it back and try again | ||||
|     unless fixed_content.valid_encoding? | ||||
|       fixed_content = fixed_content.force_encoding("ISO-8859-1").encode("UTF-8") | ||||
|     end | ||||
|  | ||||
|   # Return the properly decoded content | ||||
|   return fixed_content | ||||
| end | ||||
|      | ||||
|      | ||||
| #    # Step 1: Try to detect encoding of the corrupted (double-encoded) content | ||||
| #    detection = CharlockHolmes::EncodingDetector.detect(broken_content) | ||||
| #    original_encoding = detection[:encoding] | ||||
| #    puts "Original encoding detected: #{original_encoding}" | ||||
| #   | ||||
| #    # Step 2: First decode the double-encoded content | ||||
| #    begin | ||||
| #      # Convert the content assuming it was double-encoded, so decode twice | ||||
| #        # First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8 | ||||
| #      first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8') | ||||
| #       | ||||
| #      # Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8 | ||||
| #      fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8') | ||||
| #       | ||||
| #    rescue => e | ||||
| #      puts "Error during encoding fix: #{e.message}" | ||||
| #      puts e.backtrace.join("\n")  # Print the full stack trace | ||||
| # | ||||
| #      fixed_content = broken_content # Fall back to the broken content if decoding fails | ||||
| #    end | ||||
| #   | ||||
| #    return fixed_content | ||||
| #end | ||||
|  | ||||
| #    # Detect encoding | ||||
| #    detection = CharlockHolmes::EncodingDetector.detect(raw_content) | ||||
| #    original_encoding = detection[:encoding] | ||||
|   | ||||
		Reference in New Issue
	
	Block a user