Class: XMLCleaner

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
public/app/lib/xml_cleaner.rb

Overview

SAX document class used to process (potentially flawed) HTML on its way to becoming PDFs. It handles character entities and namespaces.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file) ⇒ XMLCleaner

Returns a new instance of XMLCleaner.



6
7
8
# File 'public/app/lib/xml_cleaner.rb', line 6

# Creates a cleaner that remembers +file+ as its output sink.
#
# @param file [#<<] object the SAX callbacks in this class append markup to
#   with `<<` (presumably a File or String -- confirm against the caller)
def initialize(file)
  @file = file
end

Instance Attribute Details

#file ⇒ Object

Returns the value of attribute file



4
5
6
# File 'public/app/lib/xml_cleaner.rb', line 4

# Reader for the output sink.
#
# @return [Object] the current value of @file, as assigned in #initialize
def file
  @file
end

Instance Method Details

#characters(chars) ⇒ Object



36
37
38
# File 'public/app/lib/xml_cleaner.rb', line 36

# SAX callback for a run of character data: escapes the text and appends
# it to the output sink.
#
# @param chars [String] text content; passed through #entity_gsub!, which
#   modifies it in place
# @return [Object] whatever `@file << ...` returns
def characters(chars)
  escaped = entity_gsub!(chars)
  @file << escaped
end

#end_element_namespace(name, prefix = nil, uri = nil) ⇒ Object



40
41
42
43
# File 'public/app/lib/xml_cleaner.rb', line 40

# SAX callback for a closing tag: writes `</name>` to the output sink.
# An element named 'ridiculous_wrapper_element' is skipped, so nothing is
# written for it.
#
# @param name [String] element name to close
# @param prefix [String, nil] namespace prefix (not used here)
# @param uri [String, nil] namespace URI (not used here)
# @return [Object, nil] result of the append, or nil when the element is skipped
def end_element_namespace(name, prefix = nil, uri = nil)
  @file << "</#{name}>" unless name == 'ridiculous_wrapper_element'
end

#entity_gsub!(chars) ⇒ Object

Replaces all potentially problematic characters with entity references.



11
12
13
14
15
16
17
18
19
20
21
22
# File 'public/app/lib/xml_cleaner.rb', line 11

def entity_gsub!(chars)
  mapping = {
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
    "'" => '&apos;'
  }
  re = /&(?!amp;)|[<>'"]/
  chars.gsub!(re, mapping)
  chars
end

#start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = []) ⇒ Object



25
26
27
28
29
30
31
32
33
34
# File 'public/app/lib/xml_cleaner.rb', line 25

# SAX callback for an opening tag: writes `<name attr="value" ...>` to the
# output sink. An element named 'ridiculous_wrapper_element' is skipped, so
# nothing is written for it.
#
# Each attribute is written under its local name only, and its value is run
# through #entity_gsub! (which modifies the value in place) before being
# placed between double quotes.
#
# @param name [String] element name
# @param attrs [Array] attribute objects responding to #localname and #value
# @param prefix [String, nil] namespace prefix (not used here)
# @param uri [String, nil] namespace URI (not used here)
# @param ns [Array] namespace declarations (not used here)
# @return [Object, nil] result of the final append, or nil when skipped
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
  return if name == 'ridiculous_wrapper_element'

  @file << "<#{name}"
  # Iterating an empty collection is a no-op, so no emptiness check is needed.
  attrs.each do |attribute|
    attr_name = attribute.localname
    attr_value = entity_gsub!(attribute.value)
    @file << " #{attr_name}=\"#{attr_value}\""
  end
  @file << '>'
end