JSON Voorhees
Killer JSON for C++
tokenizer.hpp
Go to the documentation of this file.
1 /** \file jsonv/tokenizer.hpp
2  * A stream-based tokenizer meant to help with creating custom parsers. If you are happy with the JSON Voorhees AST
3  * (\c value and friends), it is probably easier to use the functions in \c jsonv/parse.hpp.
4  *
5  * Copyright (c) 2014-2018 by Travis Gockel. All rights reserved.
6  *
7  * This program is free software: you can redistribute it and/or modify it under the terms of the Apache License
8  * as published by the Apache Software Foundation, either version 2 of the License, or (at your option) any later
9  * version.
10  *
11  * \author Travis Gockel (travis@gockelhut.com)
12 **/
13 #ifndef __JSONV_TOKENIZER_INCLUDED__
14 #define __JSONV_TOKENIZER_INCLUDED__
15 
16 #include <jsonv/config.hpp>
17 #include <jsonv/string_view.hpp>
18 
19 #include <iosfwd>
20 #include <memory>
21 #include <vector>
22 
23 namespace jsonv
24 {
25 
26 /** The kind of token that was encountered in a \c tokenizer. The tokenizer is parsing this information anyway, so
27  * it is easy to expose. Every enumerator other than \c unknown occupies its own bit, so kinds can be used as flags.
28 **/
29 enum class token_kind : unsigned int
30 {
31  /** Unknown value...either uninitialized or a parse error. **/
32  unknown = 0x00000,
33  /** The beginning of an array: \c [. **/
34  array_begin = 0x00001,
35  /** The end of an array: \c ]. **/
36  array_end = 0x00002,
37  /** A boolean: \c true or \c false. **/
38  boolean = 0x00004,
39  /** The literal \c null. **/
40  null = 0x00008,
41  /** A number -- in either integer or decimal type. **/
42  number = 0x00010,
43  /** A separator was encountered: \c ,. **/
44  separator = 0x00020,
45  /** A string was encountered. It could be the key of an object, but it is not the responsibility of the \c tokenizer
46  * to track this.
47  **/
48  string = 0x00040,
49  /** The beginning of an object: \c {. **/
50  object_begin = 0x00080,
51  /** The delimiter between an object key and value: \c :. **/
52  object_key_delimiter = 0x00100,
53  /** The end of an object: \c }. **/
54  object_end = 0x00200,
55  /** The whitespace in between things. **/
56  whitespace = 0x00400,
57  /** A JSON comment block. **/
58  comment = 0x00800,
59  /** Indicates that a parse error happened. **/
60  parse_error_indicator = 0x10000,
61 };
62 
63 /** Combine multiple flag values. **/
64 constexpr token_kind operator|(token_kind a, token_kind b)
65 {
66  return token_kind(static_cast<unsigned int>(a) | static_cast<unsigned int>(b));
67 }
68 
69 /** Filter out flag values. **/
70 constexpr token_kind operator&(token_kind a, token_kind b)
71 {
72  return token_kind(static_cast<unsigned int>(a) & static_cast<unsigned int>(b));
73 }
74 
75 /** Invert flag values. **/
76 constexpr token_kind operator~(token_kind a)
77 {
78  return token_kind(~static_cast<unsigned int>(a));
79 }
80 
81 /** Output the given \c token_kind to the \c std::ostream. **/
82 JSONV_PUBLIC std::ostream& operator<<(std::ostream&, const token_kind&);
83 
84 /** Convert the given \c token_kind to an \c std::string. **/
85 JSONV_PUBLIC std::string to_string(const token_kind&);
86 
87 /** Splits input into tokens, allowing traversal of JSON without verification. This is the basis for JSON parsers.
88  *
89  * An important thing to remember is a \c tokenizer does not perform any real validation of any kind beyond emitting
90  * a \c token_kind::unknown when it encounters complete garbage. What does this mean? Given the input string:
91  *
92  * \code
93  * [{]{{{{{{}}{{]]]]][][]]][[[[]]]"fdsadf"]]]}}}}}}}}]]]
94  * \endcode
95  *
96  * A \c tokenizer will emit \c token_kind::array_begin, \c token_kind::object_begin, \c token_kind::array_end and so
97  * on, even though it is illegal JSON. It is up to a higher-level construct to detect such failures.
98 **/
99 class JSONV_PUBLIC tokenizer
100 {
101 public:
102  using size_type = std::vector<char>::size_type;
103 
104  /// \deprecated
105  /// See \ref buffer_reserve.
106  static size_type min_buffer_size();
107 
108  /// \deprecated
109  /// See \ref buffer_reserve.
110  static void set_min_buffer_size(size_type sz);
111 
112  /** A representation of what this tokenizer has. **/
113  struct token
114  {
115  string_view text;
116  token_kind kind;
117 
118  operator std::pair<string_view, token_kind>()
119  {
120  return { text, kind };
121  }
122  };
123 
124 public:
125  /// Construct a tokenizer to read the given non-owned \a input.
126  explicit tokenizer(string_view input);
127 
128  /// Construct a tokenizer from the provided \a input.
129  explicit tokenizer(std::istream& input);
130 
131  ~tokenizer() noexcept;
132 
133  /// Get the input this instance is reading from.
134  const string_view& input() const;
135 
136  /** Attempt to go to the next token in the input stream. The contents of \c current will be cleared.
137  *
138  * \returns \c true if another token was obtained; \c false if we reached EOF or an I/O failure. Check \c input to
139  * see which.
140  **/
141  bool next();
142 
143  /** Get the current token and its associated \c token_kind.
144  *
145  * \returns The current token.
146  * \throws std::logic_error if \c next has not been called or if it returned \c false.
147  **/
148  const token& current() const;
149 
150  /// \deprecated
151  /// Calling this function has no effect and will be removed in 2.0.
152  void buffer_reserve(size_type sz);
153 
154 private:
155  explicit tokenizer(std::shared_ptr<std::string> input);
156 
157 private:
158  string_view _input;
159  const char* _position;
160  token _current; //!< The current token
161  std::shared_ptr<void> _track; //!< Used to track input data when needed (\c std::istream constructor)
162 };
163 
164 }
165 
166 #endif/*__JSONV_TOKENIZER_INCLUDED__*/
constexpr token_kind operator~(token_kind a)
Invert flag values.
Definition: tokenizer.hpp:76
Copyright (c) 2014-2019 by Travis Gockel.
token_kind
The kind of token that was encountered in a tokenizer.
Definition: tokenizer.hpp:29
JSONV_PUBLIC const value null
An instance with kind::null.
Splits input into tokens, allowing traversal of JSON without verification.
Definition: tokenizer.hpp:99
constexpr token_kind operator|(token_kind a, token_kind b)
Combine multiple flag values.
Definition: tokenizer.hpp:64
constexpr token_kind operator&(token_kind a, token_kind b)
Filter out flag values.
Definition: tokenizer.hpp:70
JSONV_PUBLIC std::string to_string(const parse_error::problem &p)
Get a string representation of a problem.
kind
Describes the kind of data a value holds.
Definition: value.hpp:69
Pulls in an implementation of string_view.
A representation of what this tokenizer has.
Definition: tokenizer.hpp:113
#define JSONV_PUBLIC
This function or class is part of the public API for JsonVoorhees.
Definition: config.hpp:104
JSONV_STRING_VIEW_TYPE string_view
A non-owning reference to a string.
Definition: string_view.hpp:52