JSON Voorhees
Killer JSON for C++
parse.hpp
Go to the documentation of this file.
1 /** \file jsonv/parse.hpp
2  *
3  * Copyright (c) 2012-2014 by Travis Gockel. All rights reserved.
4  *
5  * This program is free software: you can redistribute it and/or modify it under the terms of the Apache License
6  * as published by the Apache Software Foundation, either version 2 of the License, or (at your option) any later
7  * version.
8  *
9  * \author Travis Gockel (travis@gockelhut.com)
10 **/
11 #ifndef __JSONV_PARSE_HPP_INCLUDED__
12 #define __JSONV_PARSE_HPP_INCLUDED__
13 
14 #include <jsonv/config.hpp>
15 #include <jsonv/string_view.hpp>
16 #include <jsonv/value.hpp>
17 
18 #include <cstddef>
19 #include <deque>
20 #include <stdexcept>
21 
22 namespace jsonv
23 {
24 
25 class tokenizer;
26 
27 /** An error encountered when parsing.
28  *
29  * \see parse
30 **/
32  public std::runtime_error
33 {
34 public:
35  typedef std::size_t size_type;
36 
37  /** Description of a single parsing problem. **/
38  struct problem
39  {
40  public:
41  problem(size_type line, size_type column, size_type character, std::string message);
42 
43  /** The line of input this error was encountered on. A new "line" is determined by carriage return or line feed.
44  * If you are in Windows and line breaks are two characters, the line number of the error will appear to be
45  * twice as high as you would think.
46  **/
47  size_type line() const
48  {
49  return _line;
50  }
51 
52  /** The character index on the current line this error was encountered on. **/
53  size_type column() const
54  {
55  return _column;
56  }
57 
58  /** The character index into the entire input this error was encountered on. **/
59  size_type character() const
60  {
61  return _character;
62  }
63 
64  /** A message from the parser which has user-readable details about the encountered problem. **/
65  const std::string& message() const
66  {
67  return _message;
68  }
69 
70  private:
71  size_type _line;
72  size_type _column;
73  size_type _character;
74  std::string _message;
75  };
76 
77  typedef std::deque<problem> problem_list;
78 
79 public:
80  parse_error(problem_list, value partial_result);
81 
82  virtual ~parse_error() noexcept;
83 
84  /** The list of problems which ultimately contributed to this \c parse_error. There will always be at least one
85  * \c problem in this list.
86  **/
87  const problem_list& problems() const;
88 
89  /** Get the partial result of parsing. There is no guarantee this value even resembles the input JSON as the input
90  * JSON was malformed.
91  **/
92  const value& partial_result() const;
93 
94 private:
95  problem_list _problems;
96  value _partial_result;
97 };
98 
99 /** Get a string representation of a problem. **/
100 JSONV_PUBLIC std::ostream& operator<<(std::ostream& os, const parse_error::problem& p);
101 
102 /** Get a string representation of a problem. **/
103 JSONV_PUBLIC std::string to_string(const parse_error::problem& p);
104 
105 /** Get a string representation of a \c parse_error. **/
106 JSONV_PUBLIC std::ostream& operator<<(std::ostream& os, const parse_error& p);
107 
108 /** Get a string representation of a \c parse_error. **/
109 JSONV_PUBLIC std::string to_string(const parse_error& p);
110 
111 /** Configuration for various parsing options. All parse functions should take in a \c parse_options as a parameter and
112  * should respect your settings.
113 **/
115 {
116 public:
117  using size_type = value::size_type;
118 
119  /** When a parse error is encountered, what should the parser do? **/
120  enum class on_error
121  {
122  /** Immediately throw a \c parse_error -- do not attempt to construct a partial result. **/
123  fail_immediately,
124  /** Attempt to continue parsing and constructing a result. **/
125  collect_all,
126  /** Ignore all errors and pretend to be successful. This is not recommended unless you are 100% certain the
127  * JSON you are attempting to parse is valid. Using this failure mode does not improve parser performance.
128  **/
129  ignore,
130  };
131 
132  /** The encoding format for strings. **/
133  enum class encoding
134  {
135  /** Use UTF-8 like a sane library should.
136  *
137  * \see http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf#G7404
138  **/
139  utf8,
140  /** Like \c utf8, but check that there are no unprintable characters in the input stream (see \c std::isprint).
141  * To contrast this with \c utf8, this mode will reject things such as the \c tab and \c newline characters,
142  * while \c utf8 will accept them.
143  **/
144  utf8_strict,
145  /** Use the CESU-8 Compatibility Encoding Scheme for UTF-16? It is generally not recommended unless your
146  * processing environment requires binary collation with UTF-16. If you do not know you need this, you probably
147  * do not.
148  *
149  * \see http://www.unicode.org/reports/tr26/
150  **/
151  cesu8,
152  };
153 
154  /** When dealing with comma separators, how should extra commas be treated? **/
155  enum class commas
156  {
157  /** Do not allow any extra commas anywhere -- require valid JSON. **/
158  strict,
159  /** Allow a single trailing comma at the end of an array or object (similar to C++ \c enum definitions). **/
160  allow_trailing,
161  };
162 
163  /** How should numbers be dealt with? **/
164  enum class numbers
165  {
166  /** Parse \e all forms of decimal input that we can. To contrast this with \c strict, \c decimal does
167  * allow leading zeros on numbers.
168  **/
169  decimal,
170  /** Strictly comply with the JSON specification for numbers -- no leading zeros! **/
171  strict,
172  };
173 
174 public:
175  /** Create an instance with the default options. **/
176  parse_options();
177 
178  ~parse_options() noexcept;
179 
180  /** Create a parser with the default options -- this is the same result as the default constructor, but might be
181  * helpful if you like to be more explicit.
182  **/
183  static parse_options create_default();
184 
185  /** Create a strict parser. In general, these options are meant to fail on anything that is not a 100% valid JSON
186  * document. More specifically:
187  *
188  * \code
189  * failure_mode() == on_error::fail_immediately
190  * string_encoding() == encoding::utf8_strict
191  * number_encoding() == numbers::strict
192  * comma_policy() == commas::strict
193  * max_structure_depth() == 20
194  * require_document() == true
195  * complete_parse() == true
196  * comments() == false
197  * \endcode
198  **/
199  static parse_options create_strict();
200 
201  /** See \c on_error. The default failure mode is \c fail_immediately. **/
202  on_error failure_mode() const;
203  parse_options& failure_mode(on_error mode);
204 
205  /** The maximum allowed parsing failures the parser can encounter before throwing an error. This is only applicable
206  * if the \c failure_mode is not \c on_error::fail_immediately. By default, this value is 10.
207  *
208  * You should probably not set this value to an unreasonably high number, as each parse error encountered must be
209  * stored in memory for some period of time.
210  **/
211  std::size_t max_failures() const;
212  parse_options& max_failures(std::size_t limit);
213 
214  /** The output encoding for multi-byte characters in strings. The default value is UTF-8 because UTF-8 is best. Keep
215  * in mind this changes the output encoding for \e all decoded strings. If you need mixed encodings, you must
216  * handle that in your application.
217  **/
218  encoding string_encoding() const;
219  parse_options& string_encoding(encoding);
220 
221  /** How should a parser interpret numbers? By default, this is \c numbers::decimal, which allows any form of decimal
222  * input.
223  **/
224  numbers number_encoding() const;
225  parse_options& number_encoding(numbers);
226 
227  /** How should extra commas be treated? By default, this is \c commas::allow_trailing. **/
228  commas comma_policy() const;
229  parse_options& comma_policy(commas);
230 
231  /** The maximum allowed nesting depth of any structure in the JSON document. The JSON specification technically
232  * limits the depth to 20, but very few implementations actually conform to this, so it is fairly dangerous to set
233  * this value. By default, the value is 0, which means we should not do any depth checking.
234  **/
235  size_type max_structure_depth() const;
236  parse_options& max_structure_depth(size_type depth);
237 
238  /** If set to true, the result of a parse is required to have \c kind of \c kind::object or \c kind::array. By
239  * default, this is turned off, which will allow \c parse to return incomplete documents.
240  **/
241  bool require_document() const;
242  parse_options& require_document(bool);
243 
244  /** Should the input be completely parsed to consider the parsing a success? This is on by default. Disabling this
245  * option can be useful for situations where JSON input is coming from some stream and you wish to process distinct
246  * objects separately (this technique is used to great effect in jq: http://stedolan.github.io/jq/).
247  *
248  * \warning
249  * When using this option, it is best to construct a \c tokenizer for your input stream and reuse that. The
250  * \c parse functions all internally buffer your \c istream and while they \e attempt to use \c putback to re-put
251  * characters back into the \c istream, they are not necessarily successful at doing so.
252  **/
253  bool complete_parse() const;
254  parse_options& complete_parse(bool);
255 
256  /** Are JSON comments allowed?
257  *
258  * \warning
259  * There is no "official" syntax for JSON comments, but this system allows them when this option is enabled.
260  **/
261  bool comments() const;
262  parse_options& comments(bool);
263 
264 private:
265  // For the purposes of ABI compliance, most modifications to the variables in this class should bump the minor
266  // version number.
267  on_error _failure_mode = on_error::fail_immediately;
268  std::size_t _max_failures = 10;
269  encoding _string_encoding = encoding::utf8;
270  numbers _number_encoding = numbers::decimal;
271  commas _comma_policy = commas::allow_trailing;
272  size_type _max_struct_depth = 0;
273  bool _require_document = false;
274  bool _complete_parse = true;
275  bool _comments = true;
276 };
277 
278 /** Reads a JSON value from the input stream.
279  *
280  * \note
281  * This function is \e not intended for verifying if the input is valid JSON, as it will intentionally correctly parse
282  * invalid JSON (so long as it resembles valid JSON). See \c parse_options::create_strict for a strict-mode parse.
283  *
284  * \example "parse(std::istream&, const parse_options&)"
285  * Parse JSON from some file.
286  * \code
287  * std::ifstream file("file.json");
288  * jsonv::value out = parse(file);
289  * \endcode
290  *
291  * \throws parse_error if an error is found in the JSON. If the \a input terminates unexpectedly, a \c parse_error will
292  * still be thrown with a message like "Unexpected end: unmatched {...". If you suspect the input of going bad, you
293  * can check the state flags or set the exception mask of the stream (exceptions thrown by \a input while processing
294  * will be propagated out).
295 **/
296 value JSONV_PUBLIC parse(std::istream& input, const parse_options& = parse_options());
297 
298 /** Construct a JSON value from the given input.
299  *
300  * \throws parse_error if an error is found in the JSON.
301 **/
302 value JSONV_PUBLIC parse(const string_view& input, const parse_options& = parse_options());
303 
304 /** Construct a JSON value from the given input in `[begin, end)`.
305  *
306  * \throws parse_error if an error is found in the JSON.
307 **/
308 value JSONV_PUBLIC parse(const char* begin, const char* end, const parse_options& = parse_options());
309 
310 /** Reads a JSON value from a buffered \c tokenizer. This less convenient function is useful when setting
311  * \c parse_options::complete_parse to \c false.
312  *
313  * \see parse(std::istream&, const parse_options&)
314  *
315  * \example "parse(tokenizer&, const parse_options&)"
316  * \code
317  * tcp_stream input(get_network_stream());
318  * jsonv::tokenizer buffered(input);
319  * jsonv::parse_options options = jsonv::parse_options().complete_parse(false);
320  * jsonv::value x = parse(buffered, options);
321  * jsonv::value y = parse(buffered, options);
322  * \endcode
323 **/
324 value JSONV_PUBLIC parse(tokenizer& input, const parse_options& = parse_options());
325 
326 }
327 
328 #endif/*__JSONV_PARSE_HPP_INCLUDED__*/
Configuration for various parsing options.
Definition: parse.hpp:114
size_type line() const
The line of input this error was encountered on.
Definition: parse.hpp:47
An error encountered when parsing.
Definition: parse.hpp:31
const std::string & message() const
A message from the parser which has user-readable details about the encountered problem.
Definition: parse.hpp:65
Copyright (c) 2014-2019 by Travis Gockel.
Description of a single parsing problem.
Definition: parse.hpp:38
The existing extractor or serializer should be kept, but no exception should be thrown.
commas
When dealing with comma separators, how should extra commas be treated?
Definition: parse.hpp:155
numbers
How should numbers be dealt with?
Definition: parse.hpp:164
Splits input into tokens, allowing traversal of JSON without verification.
Definition: tokenizer.hpp:99
JSONV_PUBLIC std::string to_string(const parse_error::problem &p)
Get a string representation of a problem.
size_type column() const
The character index on the current line this error was encountered on.
Definition: parse.hpp:53
encoding
The encoding format for strings.
Definition: parse.hpp:133
Pulls in an implementation of string_view.
on_error
When a parse error is encountered, what should the parser do?
Definition: parse.hpp:120
size_type character() const
The character index into the entire input this error was encountered on.
Definition: parse.hpp:59
#define JSONV_PUBLIC
This function or class is part of the public API for JsonVoorhees.
Definition: config.hpp:104
JSONV_STRING_VIEW_TYPE string_view
A non-owning reference to a string.
Definition: string_view.hpp:52
Represents a single JSON value, which can be any one of a potential kind, each behaving slightly diff...
Definition: value.hpp:131
Copyright (c) 2012-2018 by Travis Gockel.