Robot Framework
tokenizer.py
# Copyright 2008-2015 Nokia Networks
# Copyright 2016- Robot Framework Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re

from .tokens import Token

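# Tokenizer splits plain-text Robot Framework data into Token objects,
# yielding one list of tokens per logical statement: a line normally
# starts a new statement, but lines continued with '...' are merged
# into the statement they continue.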
class Tokenizer:
    # Two or more consecutive spaces, or a tab, separate cells in the
    # space-separated format.
    _space_splitter = re.compile(r'(\s{2,}|\t)', re.UNICODE)
    # A pipe surrounded by whitespace (or at the start/end of the line)
    # separates cells in the pipe-separated format.
    _pipe_splitter = re.compile(r'((?:\A|\s+)\|(?:\s+|\Z))', re.UNICODE)

    def tokenize(self, data, data_only=False):
        current = []
        for lineno, line in enumerate(data.splitlines(not data_only), start=1):
            tokens = self._tokenize_line(line, lineno, not data_only)
            tokens, starts_new = self._cleanup_tokens(tokens, data_only)
            if starts_new:
                if current:
                    yield current
                current = tokens
            else:
                current.extend(tokens)
        # Flush the final statement once all lines are consumed.
        yield current

    def _tokenize_line(self, line, lineno, include_separators=True):
        # Performance-optimized code.
        tokens = []
        append = tokens.append
        offset = 0
        # A line starting with '|' followed by whitespace (or a lone '|')
        # is pipe-separated; everything else is space-separated.
        if line[:1] == '|' and line[:2].strip() == '|':
            splitter = self._split_from_pipes
        else:
            splitter = self._split_from_spaces
        for value, is_data in splitter(line.rstrip()):
            if is_data:
                append(Token(None, value, lineno, offset))
            elif include_separators:
                append(Token(Token.SEPARATOR, value, lineno, offset))
            offset += len(value)
        if include_separators:
            trailing_whitespace = line[len(line.rstrip()):]
            append(Token(Token.EOL, trailing_whitespace, lineno, offset))
        return tokens

    def _split_from_spaces(self, line):
        is_data = True
        # re.split with a capturing group returns data and separator
        # values alternately, starting with data.
        for value in self._space_splitter.split(line):
            yield value, is_data
            is_data = not is_data

    def _split_from_pipes(self, line):
        splitter = self._pipe_splitter
        # Splitting on the first pipe yields an empty string before the
        # leading pipe, the pipe itself, and the rest of the line.
        _, separator, rest = splitter.split(line, 1)
        yield separator, False
        while splitter.search(rest):
            token, separator, rest = splitter.split(rest, 1)
            yield token, True
            yield separator, False
        yield rest, True

    def _cleanup_tokens(self, tokens, data_only):
        has_data, continues = self._handle_comments_and_continuation(tokens)
        self._remove_trailing_empty(tokens)
        if continues:
            self._remove_leading_empty(tokens)
            if not has_data:
                self._ensure_data_after_continuation(tokens)
            starts_new = False
        else:
            starts_new = has_data
        if data_only:
            tokens = self._remove_non_data(tokens)
        return tokens, starts_new

    def _handle_comments_and_continuation(self, tokens):
        has_data = False
        continues = False
        commented = False
        for token in tokens:
            if token.type is None:
                # lstrip needed to strip possible leading space from first token.
                # Other leading/trailing spaces have been consumed as separators.
                value = token.value.lstrip()
                if commented:
                    token.type = Token.COMMENT
                elif value:
                    if value[0] == '#':
                        token.type = Token.COMMENT
                        commented = True
                    elif not has_data:
                        if value == '...' and not continues:
                            token.type = Token.CONTINUATION
                            continues = True
                        else:
                            has_data = True
        return has_data, continues

    def _remove_trailing_empty(self, tokens):
        for token in reversed(tokens):
            if not token.value and token.type != Token.EOL:
                tokens.remove(token)
            elif token.type is None:
                break

    def _remove_leading_empty(self, tokens):
        data_or_continuation = (None, Token.CONTINUATION)
        for token in list(tokens):
            if not token.value:
                tokens.remove(token)
            elif token.type in data_or_continuation:
                break

    def _ensure_data_after_continuation(self, tokens):
        cont = self._find_continuation(tokens)
        token = Token(lineno=cont.lineno, col_offset=cont.end_col_offset)
        tokens.insert(tokens.index(cont) + 1, token)

    def _find_continuation(self, tokens):
        for token in tokens:
            if token.type == Token.CONTINUATION:
                return token

    def _remove_non_data(self, tokens):
        return [t for t in tokens if t.type is None]
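
A minimal usage sketch, not part of the file above: it assumes this module is importable as robot.parsing.lexer.tokenizer, which is where it lives in Robot Framework 3.2+ (the exact path may vary between versions). With data_only=True, tokenize() drops separator, EOL, comment, and continuation tokens and yields one list of tokens per logical statement, merging '...' continuation lines into the statement they continue.

# Usage sketch; the import path is an assumption (Robot Framework 3.2+).
from robot.parsing.lexer.tokenizer import Tokenizer

data = (
    '*** Test Cases ***\n'
    'Example\n'
    '    Log    Hello    # comment\n'
    '    ...    console=True\n'
)

for statement in Tokenizer().tokenize(data, data_only=True):
    # Each item is a list of Token objects for one statement; note that
    # indented lines keep a leading empty data token.
    print([t.value for t in statement])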
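
With the default data_only=False, separator and end-of-line tokens preserve every character of the input, so concatenating the values of all yielded tokens should reconstruct the original data exactly. A quick sketch of that round-trip property, under the same import assumption:

from robot.parsing.lexer.tokenizer import Tokenizer

data = (
    '| Keyword | arg |\n'
    'Log    message\n'
)

tokens = [t for line in Tokenizer().tokenize(data) for t in line]
# Every input character lands in some token's value (data, SEPARATOR,
# or EOL), so joining them should round-trip the input unchanged.
assert ''.join(t.value for t in tokens) == data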