174 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			174 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
| # Federated Computer, Inc.
 | |
| # David Sainty <saint@federated.computer>  2024 A.D.
 | |
| # Gossamer Threads to Discourse -- Correct Encoding
 | |
| # v0.6 Further attempt to get this reverse double encoding right now
 | |
| 
 | |
| require 'mysql2'
 | |
| require 'active_record'
 | |
| require 'charlock_holmes'
 | |
| 
 | |
| # require 'concurrent-ruby'
 | |
| require File.expand_path("../../../../config/environment", __FILE__)
 | |
| require File.expand_path("../../../../script/import_scripts/base", __FILE__)
 | |
| 
 | |
| class GossamerForumsCorrectEncoding < ImportScripts::Base
 | |
|   def initialize
 | |
|     super
 | |
|       begin
 | |
|         # Initialize MySQL client to connect to Gossamer Forums database
 | |
|         @mysql_client = Mysql2::Client.new(
 | |
|           host: "slowtwitch.northend.network",
 | |
|           username: "admin",
 | |
|           password: "yxnh93Ybbz2Nm8#mp28zCVv",
 | |
|           database: "slowtwitch"
 | |
|         )
 | |
|       rescue Mysql2::Error => e
 | |
|         puts "Error connecting to MySQL: #{e.message}"
 | |
|         puts e.backtrace.join("\n")  # Print the full stack trace
 | |
|         exit 1
 | |
|       end
 | |
|       @batch_size = 1000   # Number of posts to process in each batch
 | |
|   end
 | |
| 
 | |
|   # Method to detect and fix text encoding
 | |
|   def fix_text_encoding(content)
 | |
|     begin
 | |
|       # Treat as Windows-1252 (cp1252) and then decode into UTF-8
 | |
|       corrected_content = content.encode('CP1252').force_encoding('UTF-8')
 | |
|     rescue Encoding::UndefinedConversionError => e
 | |
|       puts "Error during encoding conversion: #{e.message}"
 | |
|       puts e.backtrace.join("\n")  # Print the full stack trace
 | |
|       return content # Return the original content if conversion fails
 | |
|     end
 | |
| 
 | |
|     corrected_content
 | |
|   end
 | |
| 
 | |
|     # Ensure the content is treated as UTF-8 (even if incorrectly encoded)
 | |
|     content.force_encoding('UTF-8')
 | |
|   
 | |
|     # Continue decoding until no more invalid sequences are found
 | |
|     previous_content = ""
 | |
|     while previous_content != content
 | |
|       previous_content = content.dup
 | |
|   
 | |
|       # Step 1: First attempt to convert from ISO-8859-1 to UTF-8
 | |
|       if content.valid_encoding?
 | |
|         # Decode from ISO-8859-1 (or Windows-1252) to UTF-8
 | |
|         content = content.force_encoding('ISO-8859-1').encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
 | |
|       else
 | |
|         content = previous_content # Stop if invalid encoding issues arise
 | |
|       end
 | |
|     end
 | |
| 
 | |
|   return content
 | |
| end
 | |
|     
 | |
| #    # Step 1: Try to detect encoding of the corrupted (double-encoded) content
 | |
| #    detection = CharlockHolmes::EncodingDetector.detect(broken_content)
 | |
| #    original_encoding = detection[:encoding]
 | |
| #    puts "Original encoding detected: #{original_encoding}"
 | |
| #  
 | |
| #    # Step 2: First decode the double-encoded content
 | |
| #    begin
 | |
| #      # Convert the content assuming it was double-encoded, so decode twice
 | |
| #        # First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8
 | |
| #      first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8')
 | |
| #      
 | |
| #      # Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8
 | |
| #      fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8')
 | |
| #      
 | |
| #    rescue => e
 | |
| #      puts "Error during encoding fix: #{e.message}"
 | |
| #      puts e.backtrace.join("\n")  # Print the full stack trace
 | |
| #
 | |
| #      fixed_content = broken_content # Fall back to the broken content if decoding fails
 | |
| #    end
 | |
| #  
 | |
| #    return fixed_content
 | |
| #end
 | |
| 
 | |
| #    # Detect encoding
 | |
| #    detection = CharlockHolmes::EncodingDetector.detect(raw_content)
 | |
| #    original_encoding = detection[:encoding]
 | |
| #    puts "Original encoding detected: #{original_encoding}"
 | |
|   
 | |
| #      # Force the encoding to the detected one, then convert to UTF-8
 | |
| #      if original_encoding == 'ISO-8859-1' || original_encoding == 'windows-1252'
 | |
| #        # For Windows-1252 or ISO-8859-1, force the encoding and convert to UTF-8
 | |
| #        # text.force_encoding('ISO-8859-1').encode('UTF-8')
 | |
| #        text.force_encoding(original_encoding).encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
 | |
| #      else
 | |
| #        # Try to convert from detected encoding to UTF-8
 | |
| #        text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
 | |
| #      end
 | |
| 
 | |
| #    if original_encoding
 | |
| #      begin
 | |
| #        decoded_content = CharlockHolmes::Converter.convert(raw_content, original_encoding, 'UTF-8')
 | |
| #      rescue => e
 | |
| #        puts "Error during encoding conversion: #{e.message}"
 | |
| #        decoded_content = raw_content # Fall back to raw content if decoding fails
 | |
| #      end
 | |
| #    else
 | |
| #      decoded_content = raw_content # Fallback if encoding detection fails
 | |
| #    end
 | |
| #
 | |
| #    # Step 3: Ensure the content is now correctly in UTF-8 (no need to encode again)
 | |
| #    return decoded_content
 | |
| #  end
 | |
|         
 | |
| #    rescue StandardError => e
 | |
| #      puts "Error during encoding conversion: #{e.message}"
 | |
| #      puts e.backtrace.join("\n")  # Print the full stack trace
 | |
| #      text
 | |
| #    end
 | |
| #  end
 | |
| 
 | |
|     # Method to fix encoding issues in post content
 | |
|   def fix_encoding
 | |
|     offset = 0
 | |
| 
 | |
|     loop do
 | |
|       puts "OFFSET:  #{offset}"
 | |
|       begin
 | |
|         posts = Post.limit(@batch_size).offset(offset)
 | |
|         break if posts.empty?
 | |
| 
 | |
|         posts.each do |post|
 | |
|           raw_content = post.raw
 | |
|           puts "--> NEXT POST:   post.id:  #{post.id}"
 | |
|           fixed_content = fix_text_encoding(raw_content)
 | |
|           if fixed_content != raw_content
 | |
|             puts "Updating post #{post.id}"
 | |
|             puts "------- raw_content:\n#{raw_content}"
 | |
|             puts "+++++++ fixed_content:\n#{fixed_content}"
 | |
|             puts "---------------------------------------------------------------------------------------------"
 | |
|             # post.update(raw: fixed_content)
 | |
| #            post.raw = fixed_content
 | |
| #            if post.save
 | |
| #              puts "Post ##{post.id} updated successfully."
 | |
| #            else
 | |
| #              puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"
 | |
| #            end
 | |
|           end
 | |
|         end
 | |
|       rescue
 | |
|         puts "Error:  #{e.message}"
 | |
|         puts e.backtrace.join("\n")  # Print the full stack trace
 | |
|       end
 | |
| 
 | |
|       offset += @batch_size
 | |
|     end
 | |
|   end
 | |
| 
 | |
|   def perform_encoding_correction
 | |
|     puts "Encoding Correction beginning!"
 | |
|     fix_encoding
 | |
|     puts "Encoding Correction complete!"
 | |
|   end
 | |
| end
 | |
| 
 | |
# Run the encoding correction as soon as this file is loaded.
corrector = GossamerForumsCorrectEncoding.new
corrector.perform_encoding_correction
 | |
| 
 |