# Robot Framework Integrated Development Environment (RIDE)
# htmlreader.py
# (Listing taken from the generated documentation page of this file.)
# Copyright 2008-2015 Nokia Networks
# Copyright 2016-     Robot Framework Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

from robotide.lib.robot.output import LOGGER
from robotide.lib.robot.utils import PY2

# The HTML parser and the entity table live in different modules on
# Python 2 and Python 3; import whichever matches the running interpreter.
if PY2:
    from htmlentitydefs import entitydefs
    from HTMLParser import HTMLParser
else:
    from html.entities import entitydefs
    from html.parser import HTMLParser

    # Python 3 has no ``unichr`` builtin; alias it to ``chr`` so the code
    # below can call ``unichr`` on both interpreter lines.
    unichr = chr


# Replaced with a regular space in cell data by HtmlReader.handle_data.
NON_BREAKING_SPACE = u'\xA0'


class HtmlReader(HTMLParser):
    """HTML parser that forwards the rows of HTML tables to a populator.

    The reader tracks a small state machine per table:

    * ``IGNORE``  -- outside a table, or inside a table the populator
      rejected; cell data is discarded.
    * ``INITIAL`` -- a table has started; its first row is offered to
      ``populator.start_table`` to decide whether the table is accepted.
    * ``PROCESS`` -- the table was accepted; every completed row is passed
      to ``populator.add``.

    The byte encoding used to decode input lines starts as ISO-8859-1 and is
    updated when a ``<meta>`` tag or an ``<?xml ... ?>`` processing
    instruction declares another one.
    """

    IGNORE = 0
    INITIAL = 1
    PROCESS = 2

    def __init__(self, spaces=2):
        """Create a reader.

        :param spaces: stored as ``self._spaces``; not used by this class
            itself.
        """
        self._spaces = spaces
        # Explicit base-class call (instead of ``super()``) keeps this
        # working with the Python 2 ``HTMLParser`` import as well.
        HTMLParser.__init__(self)
        self._encoding = 'ISO-8859-1'
        # Maps '<tag>_start' / '<tag>_end' to the method handling that
        # event. <th> cells are treated exactly like <td> cells.
        self._handlers = {
            'table_start': self.table_start,
            'table_end': self.table_end,
            'tr_start': self.tr_start,
            'tr_end': self.tr_end,
            'td_start': self.td_start,
            'td_end': self.td_end,
            'th_start': self.td_start,
            'th_end': self.td_end,
            'br_start': self.br_start,
            'meta_start': self.meta_start,
        }

    def read(self, htmlfile, populator, path=None):
        """Parse ``htmlfile`` and feed its table rows to ``populator``.

        :param htmlfile: object with ``readlines()`` yielding byte lines;
            its ``name`` attribute is used in the warning when ``path`` is
            not given.
        :param populator: object providing ``start_table(row)``,
            ``add(row)`` and ``eof()``.
        :param path: optional path shown in the deprecation warning.
        """
        self.populator = populator
        self.state = self.IGNORE
        self.current_row = None
        self.current_cell = None

        # Compiled once instead of on every line. The pattern is matched
        # against the raw (undecoded) bytes of each line.
        report_html_pattern = re.compile(
            b'<meta content="Robot Framework .*" name="Generator">')

        is_report_html = False
        for line in htmlfile.readlines():
            # Decoding happens per line so that an encoding declared in an
            # earlier line is already in effect for the following ones.
            self.feed(self._decode(line))
            if report_html_pattern.match(line):
                is_report_html = True
        # Calling close is required by the HTMLParser but may cause problems
        # if the same instance of our HtmlParser is reused. Currently it's
        # used only once so there's no problem.
        self.close()
        if self.populator.eof() and not is_report_html:
            # Only warn when the html file is not report html
            LOGGER.warn("Using test data in HTML format is deprecated. "
                        "Convert '%s' to plain text format."
                        % (path or htmlfile.name))

    def _decode(self, line):
        """Decode a byte line using the currently active encoding."""
        return line.decode(self._encoding)

    def handle_starttag(self, tag, attrs):
        """Dispatch a start tag to its handler; unknown tags are ignored."""
        handler = self._handlers.get(tag + '_start')
        if handler is not None:
            handler(attrs)

    def handle_endtag(self, tag):
        """Dispatch an end tag to its handler; unknown tags are ignored."""
        handler = self._handlers.get(tag + '_end')
        if handler is not None:
            handler()

    def handle_data(self, data):
        """Collect text into the current cell, normalising NBSP to space."""
        if self.state == self.IGNORE or self.current_cell is None:
            return
        if NON_BREAKING_SPACE in data:
            data = data.replace(NON_BREAKING_SPACE, ' ')
        self.current_cell.append(data)

    def handle_entityref(self, name):
        """Resolve a named entity (``&name;``) and treat it as cell data."""
        value = self._handle_entityref(name)
        self.handle_data(value)

    def _handle_entityref(self, name):
        """Return the text for entity ``name``.

        Unknown entities are returned verbatim as ``&name;``.
        """
        if name == 'apos':  # missing from entitydefs
            return "'"
        try:
            value = entitydefs[name]
        except KeyError:
            return '&' + name + ';'
        if value.startswith('&#'):
            # Entity table entry given as a numeric reference ('&#NNN;').
            return unichr(int(value[2:-1]))
        if PY2:
            return value.decode('ISO-8859-1')
        return value

    def handle_charref(self, number):
        """Resolve a numeric reference (``&#N;``) and treat it as data."""
        value = self._handle_charref(number)
        self.handle_data(value)

    def _handle_charref(self, number):
        """Return the character for a decimal or hex (``x``-prefixed) ref.

        References that are not valid numbers or are outside the range
        accepted by ``unichr`` are returned verbatim as ``&#...;``.
        """
        if number.startswith(('x', 'X')):
            base = 16
            digits = number[1:]
        else:
            base = 10
            digits = number
        try:
            codepoint = int(digits, base)
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Echo the reference exactly as written (including any 'x'
            # prefix). OverflowError covers absurdly large numbers.
            return '&#' + number + ';'

    def unknown_decl(self, data):
        """Silently accept unknown declarations."""
        # Ignore everything even if it's invalid. This kind of stuff comes
        # at least from MS Excel
        pass

    def table_start(self, attrs=None):
        """Begin a new table; its first row decides whether it is used."""
        self.state = self.INITIAL
        self.current_row = None
        self.current_cell = None

    def table_end(self):
        """Finish the table, flushing a row left open by a missing </tr>."""
        if self.current_row is not None:
            self.tr_end()
        self.state = self.IGNORE

    def tr_start(self, attrs=None):
        """Begin a new row, first closing a row that was left open."""
        if self.current_row is not None:
            self.tr_end()
        self.current_row = []

    def tr_end(self):
        """Finish the current row and hand it to the populator.

        In ``INITIAL`` state the row is offered to
        ``populator.start_table``; the result selects ``PROCESS`` or
        ``IGNORE`` for the rest of the table. In ``PROCESS`` state the row
        is passed to ``populator.add``.
        """
        if self.current_row is None:
            return
        if self.current_cell is not None:
            self.td_end()
        if self.state == self.INITIAL:
            accepted = self.populator.start_table(self.current_row)
            self.state = self.PROCESS if accepted else self.IGNORE
        elif self.state == self.PROCESS:
            self.populator.add(self.current_row)
        self.current_row = None

    def td_start(self, attrs=None):
        """Begin a new cell, closing an open cell and opening a row if needed."""
        if self.current_cell is not None:
            self.td_end()
        if self.current_row is None:
            self.tr_start()
        self.current_cell = []

    def td_end(self):
        """Finish the current cell and append its joined text to the row."""
        if self.current_cell is not None and self.state != self.IGNORE:
            cell = ''.join(self.current_cell)
            self.current_row.append(cell)
        self.current_cell = None

    def br_start(self, attrs=None):
        """Treat ``<br>`` as a newline inside the current cell."""
        self.handle_data('\n')

    def meta_start(self, attrs):
        """Switch the decoding encoding if a ``<meta>`` tag declares one."""
        encoding = self._get_encoding_from_meta(attrs)
        if encoding:
            self._encoding = encoding

    def _get_encoding_from_meta(self, attrs):
        """Extract an encoding from ``<meta>`` attributes, or return None.

        Supports ``charset="..."`` and the
        ``http-equiv="Content-Type" content="...; charset=..."`` form. The
        ``content`` attribute only counts when the ``http-equiv`` attribute
        is present with the value ``content-type`` (case-insensitive).
        """
        valid_http_equiv = False
        encoding = None
        for name, value in attrs:
            name = name.lower()
            if name == 'charset':  # html5
                return value
            if value is None:
                # HTMLParser reports valueless attributes as None; they
                # cannot carry an encoding and would break .lower()/.split().
                continue
            if name == 'http-equiv' and value.lower() == 'content-type':
                valid_http_equiv = True
            if name == 'content':
                encoding = self._get_encoding_from_content_attr(value)
        return encoding if valid_http_equiv else None

    def _get_encoding_from_content_attr(self, value):
        """Return the ``charset=`` part of a content attribute, or None."""
        for token in value.split(';'):
            token = token.strip()
            if token.lower().startswith('charset='):
                return token[8:]
        return None

    def handle_pi(self, data):
        """Switch the decoding encoding if an XML declaration names one."""
        encoding = self._get_encoding_from_pi(data)
        if encoding:
            self._encoding = encoding

    def _get_encoding_from_pi(self, data):
        """Extract ``encoding`` from an ``xml`` processing instruction.

        Returns None when ``data`` is not an ``xml`` instruction or has no
        ``encoding=`` token.
        """
        data = data.strip()
        if not data.lower().startswith('xml '):
            return None
        if data.endswith('?'):
            # Drop the trailing '?' of the '<?xml ... ?>' form.
            data = data[:-1]
        for token in data.split():
            if token.lower().startswith('encoding='):
                encoding = token[9:]
                if encoding.startswith(("'", '"')):
                    # Strip the surrounding quotes.
                    encoding = encoding[1:-1]
                return encoding
        return None
# Cross-reference from the generated listing:
# HtmlReader.read(self, htmlfile, populator, path=None) -- definition at htmlreader.py:53