Orcus
csv_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef ORCUS_CSV_PARSER_HPP
9 #define ORCUS_CSV_PARSER_HPP
10 
11 #include "csv_parser_base.hpp"
12 
13 namespace orcus {
14 
16 {
17 public:
21  void begin_parse() {}
22 
26  void end_parse() {}
27 
31  void begin_row() {}
32 
36  void end_row() {}
37 
49  void cell(std::string_view value, bool transient)
50  {
51  (void)value; (void)transient;
52  }
53 };
54 
61 template<typename HandlerT>
63 {
64 public:
65  typedef HandlerT handler_type;
66 
67  csv_parser(std::string_view content, handler_type& hdl, const csv::parser_config& config);
68  void parse();
69 
70 private:
71 
72  // handlers
73  void row();
74  void cell();
75  void quoted_cell();
76 
77  void parse_cell_with_quote(const char* p0, size_t len0);
78 
82  void push_cell_value(const char* p, size_t n);
83 
84 private:
85  handler_type& m_handler;
86 };
87 
88 template<typename _Handler>
90  std::string_view content, handler_type& hdl, const csv::parser_config& config) :
91  csv::parser_base(content, config), m_handler(hdl) {}
92 
93 template<typename _Handler>
94 void csv_parser<_Handler>::parse()
95 {
96 #if ORCUS_DEBUG_CSV
97  for (const char* p = mp_begin; p < mp_end; ++p)
98  std::cout << *p;
99  std::cout << std::endl;
100 #endif
101 
102  m_handler.begin_parse();
103  while (has_char())
104  row();
105  m_handler.end_parse();
106 }
107 
108 template<typename _Handler>
109 void csv_parser<_Handler>::row()
110 {
111  m_handler.begin_row();
112  while (true)
113  {
114  if (is_text_qualifier(cur_char()))
115  quoted_cell();
116  else
117  cell();
118 
119  if (!has_char())
120  {
121  m_handler.end_row();
122  return;
123  }
124 
125  char c = cur_char();
126  if (c == '\n')
127  {
128  next();
129 #if ORCUS_DEBUG_CSV
130  cout << "(LF)" << endl;
131 #endif
132  m_handler.end_row();
133  return;
134  }
135 
136  if (!is_delim(c))
137  throw orcus::parse_error("expected a delimiter", offset());
138 
139  next();
140 
141  if (m_config.trim_cell_value)
142  skip_blanks();
143 
144  if (!has_char())
145  {
146  m_handler.end_row();
147  return;
148  }
149  }
150 }
151 
152 template<typename _Handler>
153 void csv_parser<_Handler>::cell()
154 {
155  const char* p = mp_char;
156  size_t len = 0;
157  char c = cur_char();
158  while (c != '\n' && !is_delim(c))
159  {
160  ++len;
161  next();
162  if (!has_char())
163  break;
164  c = cur_char();
165  }
166 
167  if (!len)
168  p = nullptr;
169 
170  push_cell_value(p, len);
171 }
172 
173 template<typename _Handler>
174 void csv_parser<_Handler>::quoted_cell()
175 {
176 #if ORCUS_DEBUG_CSV
177  cout << "--- quoted cell" << endl;
178 #endif
179  char c = cur_char();
180  assert(is_text_qualifier(c));
181  next(); // Skip the opening quote.
182  if (!has_char())
183  return;
184 
185  const char* p0 = mp_char;
186  size_t len = 1;
187  for (; has_char(); next(), ++len)
188  {
189  c = cur_char();
190 #if ORCUS_DEBUG_CSV
191  cout << "'" << c << "'" << endl;
192 #endif
193  if (!is_text_qualifier(c))
194  continue;
195 
196  // current char is a quote. Check if the next char is also a text
197  // qualifier.
198 
199  if (has_next() && is_text_qualifier(peek_char()))
200  {
201  next();
202  parse_cell_with_quote(p0, len);
203  return;
204  }
205 
206  // Closing quote.
207  m_handler.cell({p0, len-1}, false);
208  next();
209  skip_blanks();
210  return;
211  }
212 
213  // Stream ended prematurely. Handle it gracefully.
214  m_handler.cell({p0, len}, false);
215 }
216 
217 template<typename _Handler>
218 void csv_parser<_Handler>::parse_cell_with_quote(const char* p0, size_t len0)
219 {
220 #if ORCUS_DEBUG_CSV
221  using namespace std;
222  cout << "--- parse cell with quote" << endl;
223 #endif
224  assert(is_text_qualifier(cur_char()));
225 
226  // Push the preceding chars to the temp buffer.
227  m_cell_buf.reset();
228  m_cell_buf.append(p0, len0);
229 
230  // Parse the rest, until the closing quote.
231  next();
232  const char* p_cur = mp_char;
233  size_t cur_len = 0;
234  for (; has_char(); next(), ++cur_len)
235  {
236  char c = cur_char();
237 #if ORCUS_DEBUG_CSV
238  cout << "'" << c << "'" << endl;
239 #endif
240  if (!is_text_qualifier(c))
241  continue;
242 
243  if (has_next() && is_text_qualifier(peek_char()))
244  {
245  // double quotation. Copy the current segment to the cell buffer.
246  m_cell_buf.append(p_cur, cur_len);
247 
248  next(); // to the 2nd quote.
249  p_cur = mp_char;
250  cur_len = 0;
251  continue;
252  }
253 
254  // closing quote. Flush the current segment to the cell
255  // buffer, push the value to the handler, and exit normally.
256  m_cell_buf.append(p_cur, cur_len);
257 
258  m_handler.cell(m_cell_buf.str(), true);
259  next();
260  skip_blanks();
261  return;
262  }
263 
264  // Stream ended prematurely.
265  throw parse_error("stream ended prematurely while parsing quoted cell.", offset());
266 }
267 
268 template<typename _Handler>
269 void csv_parser<_Handler>::push_cell_value(const char* p, size_t n)
270 {
271  size_t len = n;
272 
273  if (m_config.trim_cell_value)
274  {
275  // Trim any leading blanks.
276  for (size_t i = 0; i < n; ++i, --len, ++p)
277  {
278  if (!is_blank(*p))
279  break;
280  }
281 
282  // Trim any trailing blanks.
283  if (len)
284  {
285  const char* p_end = p + (len-1);
286  for (; p != p_end; --p_end, --len)
287  {
288  if (!is_blank(*p_end))
289  break;
290  }
291  }
292  }
293 
294  m_handler.cell({p, len}, false);
295 #if ORCUS_DEBUG_CSV
296  if (len)
297  cout << "(cell:'" << std::string(p, len) << "')" << endl;
298  else
299  cout << "(cell:'')" << endl;
300 #endif
301 }
302 
303 }
304 
305 #endif
306 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: csv_parser_base.hpp:58
Definition: csv_parser.hpp:16
void end_row()
Definition: csv_parser.hpp:36
void end_parse()
Definition: csv_parser.hpp:26
void begin_row()
Definition: csv_parser.hpp:31
void begin_parse()
Definition: csv_parser.hpp:21
void cell(std::string_view value, bool transient)
Definition: csv_parser.hpp:49
Definition: csv_parser.hpp:63
Definition: exception.hpp:94
Definition: parser_base.hpp:23
Definition: config.hpp:20
Definition: csv_parser_base.hpp:37