Orcus
sax_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef INCLUDED_ORCUS_SAX_PARSER_HPP
9 #define INCLUDED_ORCUS_SAX_PARSER_HPP
10 
11 #include "sax_parser_base.hpp"
12 
13 #include <string_view>
14 
15 namespace orcus {
16 
/**
 * Default configuration type for sax_parser.  It is used as the default
 * value of the parser's ConfigT template parameter.
 */
struct sax_parser_default_config
{
    /**
     * Baseline XML version expressed as an integer.  The parser's header()
     * only requires a leading '<?xml ...?>' declaration when this value is
     * 11 or greater; with the default of 10 the declaration is optional.
     */
    static constexpr uint8_t baseline_version = 10;
};
26 
28 {
29 public:
36  {
37  (void)dtd;
38  }
39 
47  void start_declaration(std::string_view decl)
48  {
49  (void)decl;
50  }
51 
57  void end_declaration(std::string_view decl)
58  {
59  (void)decl;
60  }
61 
68  {
69  (void)elem;
70  }
71 
78  {
79  (void)elem;
80  }
81 
96  void characters(std::string_view val, bool transient)
97  {
98  (void)val; (void)transient;
99  }
100 
110  {
111  (void)attr;
112  }
113 };
114 
130 template<typename HandlerT, typename ConfigT = sax_parser_default_config>
132 {
133 public:
134  typedef HandlerT handler_type;
135  typedef ConfigT config_type;
136 
137  sax_parser(std::string_view content, handler_type& handler);
138  ~sax_parser() = default;
139 
140  void parse();
141 
142 private:
143 
148  void header();
149  void body();
150  void element();
151  void element_open(std::ptrdiff_t begin_pos);
152  void element_close(std::ptrdiff_t begin_pos);
153  void special_tag();
154  void declaration(const char* name_check);
155  void cdata();
156  void doctype();
157  void characters();
158  void attribute();
159 
160 private:
161  handler_type& m_handler;
162 };
163 
164 template<typename HandlerT, typename ConfigT>
165 sax_parser<HandlerT,ConfigT>::sax_parser(std::string_view content, handler_type& handler) :
166  sax::parser_base(content.data(), content.size()),
167  m_handler(handler)
168 {
169 }
170 
171 template<typename HandlerT, typename ConfigT>
172 void sax_parser<HandlerT,ConfigT>::parse()
173 {
174  m_nest_level = 0;
175  mp_char = mp_begin;
176  header();
177  skip_space_and_control();
178  body();
179 
180  assert(m_buffer_pos == 0);
181 }
182 
183 template<typename HandlerT, typename ConfigT>
184 void sax_parser<HandlerT,ConfigT>::header()
185 {
186  // we don't handle multi byte encodings so we can just skip bom entry if exists.
187  skip_bom();
188 
189  // Allow leading whitespace in the XML stream.
190  // TODO : Make this configurable since strictly speaking such an XML
191  // sttream is invalid.
192  skip_space_and_control();
193 
194  if (!has_char() || cur_char() != '<')
195  throw malformed_xml_error("xml file must begin with '<'.", offset());
196 
197  if (config_type::baseline_version >= 11)
198  {
199  // XML version 1.1 requires a header declaration whereas in 1.0 it's
200  // optional.
201  if (next_char_checked() != '?')
202  throw malformed_xml_error("xml file must begin with '<?'.", offset());
203 
204  declaration("xml");
205  }
206 }
207 
208 template<typename HandlerT, typename ConfigT>
209 void sax_parser<HandlerT,ConfigT>::body()
210 {
211  while (has_char())
212  {
213  if (cur_char() == '<')
214  {
215  element();
216  if (!m_root_elem_open)
217  // Root element closed. Stop parsing.
218  return;
219  }
220  else if (m_nest_level)
221  // Call characters only when in xml hierarchy.
222  characters();
223  else
224  next();
225  }
226 }
227 
228 template<typename HandlerT, typename ConfigT>
229 void sax_parser<HandlerT,ConfigT>::element()
230 {
231  assert(cur_char() == '<');
232  std::ptrdiff_t pos = offset();
233  char c = next_char_checked();
234  switch (c)
235  {
236  case '/':
237  element_close(pos);
238  return;
239  case '!':
240  special_tag();
241  return;
242  case '?':
243  declaration(nullptr);
244  return;
245  }
246 
247  element_open(pos);
248 }
249 
250 template<typename HandlerT, typename ConfigT>
251 void sax_parser<HandlerT,ConfigT>::element_open(std::ptrdiff_t begin_pos)
252 {
253  sax::parser_element elem;
254  element_name(elem, begin_pos);
255 
256  while (true)
257  {
258  skip_space_and_control();
259  char c = cur_char_checked();
260  if (c == '/')
261  {
262  // Self-closing element: <element/>
263  if (next_and_char() != '>')
264  throw malformed_xml_error("expected '/>' to self-close the element.", offset());
265  next();
266  elem.end_pos = offset();
267  m_handler.start_element(elem);
268  reset_buffer_pos();
269  m_handler.end_element(elem);
270  if (!m_nest_level)
271  m_root_elem_open = false;
272 #if ORCUS_DEBUG_SAX_PARSER
273  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
274 #endif
275  return;
276  }
277  else if (c == '>')
278  {
279  // End of opening element: <element>
280  next();
281  elem.end_pos = offset();
282  nest_up();
283  m_handler.start_element(elem);
284  reset_buffer_pos();
285 #if ORCUS_DEBUG_SAX_PARSER
286  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
287 #endif
288  return;
289  }
290  else
291  attribute();
292  }
293 }
294 
295 template<typename HandlerT, typename ConfigT>
296 void sax_parser<HandlerT,ConfigT>::element_close(std::ptrdiff_t begin_pos)
297 {
298  assert(cur_char() == '/');
299  nest_down();
300  next_check();
301  sax::parser_element elem;
302  element_name(elem, begin_pos);
303 
304  if (cur_char() != '>')
305  throw malformed_xml_error("expected '>' to close the element.", offset());
306  next();
307  elem.end_pos = offset();
308 
309  m_handler.end_element(elem);
310 #if ORCUS_DEBUG_SAX_PARSER
311  cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
312 #endif
313  if (!m_nest_level)
314  m_root_elem_open = false;
315 }
316 
317 template<typename HandlerT, typename ConfigT>
318 void sax_parser<HandlerT,ConfigT>::special_tag()
319 {
320  assert(cur_char() == '!');
321  // This can be either <![CDATA, <!--, or <!DOCTYPE.
322  size_t len = available_size();
323  if (len < 2)
324  throw malformed_xml_error("special tag too short.", offset());
325 
326  switch (next_and_char())
327  {
328  case '-':
329  {
330  // Possibly comment.
331  if (next_and_char() != '-')
332  throw malformed_xml_error("comment expected.", offset());
333 
334  len -= 2;
335  if (len < 3)
336  throw malformed_xml_error("malformed comment.", offset());
337 
338  next();
339  comment();
340  }
341  break;
342  case '[':
343  {
344  // Possibly a CDATA.
345  expects_next("CDATA[", 6);
346  if (has_char())
347  cdata();
348  }
349  break;
350  case 'D':
351  {
352  // check if this is a DOCTYPE.
353  expects_next("OCTYPE", 6);
354  skip_space_and_control();
355  if (has_char())
356  doctype();
357  }
358  break;
359  default:
360  throw malformed_xml_error("failed to parse special tag.", offset());
361  }
362 }
363 
364 template<typename HandlerT, typename ConfigT>
365 void sax_parser<HandlerT,ConfigT>::declaration(const char* name_check)
366 {
367  assert(cur_char() == '?');
368  next_check();
369 
370  // Get the declaration name first.
371  std::string_view decl_name;
372  name(decl_name);
373 #if ORCUS_DEBUG_SAX_PARSER
374  cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
375 #endif
376 
377  if (name_check && decl_name != name_check)
378  {
379  std::ostringstream os;
380  os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
381  throw malformed_xml_error(os.str(), offset());
382  }
383 
384  m_handler.start_declaration(decl_name);
385  skip_space_and_control();
386 
387  // Parse the attributes.
388  while (cur_char_checked() != '?')
389  {
390  attribute();
391  skip_space_and_control();
392  }
393  if (next_char_checked() != '>')
394  throw malformed_xml_error("declaration must end with '?>'.", offset());
395 
396  m_handler.end_declaration(decl_name);
397  reset_buffer_pos();
398  next();
399 #if ORCUS_DEBUG_SAX_PARSER
400  cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
401 #endif
402 }
403 
404 template<typename HandlerT, typename ConfigT>
405 void sax_parser<HandlerT,ConfigT>::cdata()
406 {
407  size_t len = available_size();
408  assert(len > 3);
409 
410  // Parse until we reach ']]>'.
411  const char* p0 = mp_char;
412  size_t i = 0, match = 0;
413  for (char c = cur_char(); i < len; ++i, c = next_and_char())
414  {
415  if (c == ']')
416  {
417  // Be aware that we may encounter a series of more than two ']'
418  // characters, in which case we'll only count the last two.
419 
420  if (match == 0)
421  // First ']'
422  ++match;
423  else if (match == 1)
424  // Second ']'
425  ++match;
426  }
427  else if (c == '>' && match == 2)
428  {
429  // Found ']]>'.
430  size_t cdata_len = i - 2;
431  m_handler.characters(std::string_view(p0, cdata_len), false);
432  next();
433  return;
434  }
435  else
436  match = 0;
437  }
438  throw malformed_xml_error("malformed CDATA section.", offset());
439 }
440 
441 template<typename HandlerT, typename ConfigT>
442 void sax_parser<HandlerT,ConfigT>::doctype()
443 {
444  // Parse the root element first.
445  sax::doctype_declaration param;
446  name(param.root_element);
447  skip_space_and_control();
448 
449  // Either PUBLIC or SYSTEM.
450  size_t len = available_size();
451  if (len < 6)
452  throw malformed_xml_error("DOCTYPE section too short.", offset());
453 
454  param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
455  char c = cur_char();
456  if (c == 'P')
457  {
458  if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
459  throw malformed_xml_error("malformed DOCTYPE section.", offset());
460 
461  param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
462  }
463  else if (c == 'S')
464  {
465  if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
466  throw malformed_xml_error("malformed DOCTYPE section.", offset());
467  }
468 
469  next_check();
470  skip_space_and_control();
471 
472  // Parse FPI.
473  value(param.fpi, false);
474 
475  has_char_throw("DOCTYPE section too short.");
476  skip_space_and_control();
477  has_char_throw("DOCTYPE section too short.");
478 
479  if (cur_char() == '>')
480  {
481  // Optional URI not given. Exit.
482 #if ORCUS_DEBUG_SAX_PARSER
483  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
484 #endif
485  m_handler.doctype(param);
486  next();
487  return;
488  }
489 
490  // Parse optional URI.
491  value(param.uri, false);
492 
493  has_char_throw("DOCTYPE section too short.");
494  skip_space_and_control();
495  has_char_throw("DOCTYPE section too short.");
496 
497  if (cur_char() != '>')
498  throw malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
499 
500 #if ORCUS_DEBUG_SAX_PARSER
501  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
502 #endif
503  m_handler.doctype(param);
504  next();
505 }
506 
507 template<typename HandlerT, typename ConfigT>
508 void sax_parser<HandlerT,ConfigT>::characters()
509 {
510  const char* p0 = mp_char;
511  for (; has_char(); next())
512  {
513  if (cur_char() == '<')
514  break;
515 
516  if (cur_char() == '&')
517  {
518  // Text span with one or more encoded characters. Parse using cell buffer.
519  cell_buffer& buf = get_cell_buffer();
520  buf.reset();
521  buf.append(p0, mp_char-p0);
522  characters_with_encoded_char(buf);
523  if (buf.empty())
524  m_handler.characters(std::string_view{}, false);
525  else
526  m_handler.characters(buf.str(), true);
527  return;
528  }
529  }
530 
531  if (mp_char > p0)
532  {
533  std::string_view val(p0, mp_char-p0);
534  m_handler.characters(val, false);
535  }
536 }
537 
538 template<typename HandlerT, typename ConfigT>
539 void sax_parser<HandlerT,ConfigT>::attribute()
540 {
541  sax::parser_attribute attr;
542  attribute_name(attr.ns, attr.name);
543 
544 #if ORCUS_DEBUG_SAX_PARSER
545  cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl;
546 #endif
547 
548  skip_space_and_control();
549 
550  char c = cur_char_checked();
551  if (c != '=')
552  {
553  std::ostringstream os;
554  os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
555  throw malformed_xml_error(os.str(), offset());
556  }
557 
558  next_check(); // skip the '='.
559  skip_space_and_control();
560 
561  attr.transient = value(attr.value, true);
562  if (attr.transient)
563  // Value is stored in a temporary buffer. Push a new buffer.
564  inc_buffer_pos();
565 
566 #if ORCUS_DEBUG_SAX_PARSER
567  cout << "sax_parser::attribute: value='" << attr.value << "'" << endl;
568 #endif
569 
570  m_handler.attribute(attr);
571 }
572 
573 }
574 
575 #endif
576 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: parser_base.hpp:23
Definition: sax_parser_base.hpp:108
Definition: sax_parser.hpp:28
void end_declaration(std::string_view decl)
Definition: sax_parser.hpp:57
void doctype(const orcus::sax::doctype_declaration &dtd)
Definition: sax_parser.hpp:35
void attribute(const orcus::sax::parser_attribute &attr)
Definition: sax_parser.hpp:109
void characters(std::string_view val, bool transient)
Definition: sax_parser.hpp:96
void start_declaration(std::string_view decl)
Definition: sax_parser.hpp:47
void end_element(const orcus::sax::parser_element &elem)
Definition: sax_parser.hpp:77
void start_element(const orcus::sax::parser_element &elem)
Definition: sax_parser.hpp:67
Definition: sax_parser.hpp:132
Definition: sax_parser_base.hpp:37
Definition: sax_parser_base.hpp:96
Definition: sax_parser_base.hpp:77
Definition: sax_parser.hpp:18
static constexpr uint8_t baseline_version
Definition: sax_parser.hpp:24