diff options
Diffstat (limited to 'boost/wave/cpplexer/re2clex/cpp_re2c_lexer.hpp')
-rw-r--r-- | boost/wave/cpplexer/re2clex/cpp_re2c_lexer.hpp | 429 |
1 files changed, 429 insertions, 0 deletions
diff --git a/boost/wave/cpplexer/re2clex/cpp_re2c_lexer.hpp b/boost/wave/cpplexer/re2clex/cpp_re2c_lexer.hpp new file mode 100644 index 0000000000..748738544b --- /dev/null +++ b/boost/wave/cpplexer/re2clex/cpp_re2c_lexer.hpp @@ -0,0 +1,429 @@ +/*============================================================================= + Boost.Wave: A Standard compliant C++ preprocessor library + + Re2C based C++ lexer + + http://www.boost.org/ + + Copyright (c) 2001-2011 Hartmut Kaiser. Distributed under the Boost + Software License, Version 1.0. (See accompanying file + LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +=============================================================================*/ + +#if !defined(CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED) +#define CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED + +#include <string> +#include <cstdio> +#include <cstdarg> +#if defined(BOOST_SPIRIT_DEBUG) +#include <iostream> +#endif // defined(BOOST_SPIRIT_DEBUG) + +#include <boost/concept_check.hpp> +#include <boost/assert.hpp> +#include <boost/spirit/include/classic_core.hpp> + +#include <boost/wave/wave_config.hpp> +#include <boost/wave/language_support.hpp> +#include <boost/wave/token_ids.hpp> +#include <boost/wave/util/file_position.hpp> +#include <boost/wave/cpplexer/validate_universal_char.hpp> +#include <boost/wave/cpplexer/cpplexer_exceptions.hpp> +#include <boost/wave/cpplexer/token_cache.hpp> +#include <boost/wave/cpplexer/convert_trigraphs.hpp> + +#include <boost/wave/cpplexer/cpp_lex_interface.hpp> +#include <boost/wave/cpplexer/re2clex/scanner.hpp> +#include <boost/wave/cpplexer/re2clex/cpp_re.hpp> +#if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0 +#include <boost/wave/cpplexer/detect_include_guards.hpp> +#endif + +#include <boost/wave/cpplexer/cpp_lex_interface_generator.hpp> + +// this must occur after all of the includes and before any code appears +#ifdef BOOST_HAS_ABI_HEADERS +#include BOOST_ABI_PREFIX +#endif + +/////////////////////////////////////////////////////////////////////////////// +namespace boost { +namespace wave { +namespace cpplexer { +namespace re2clex { + +/////////////////////////////////////////////////////////////////////////////// +// +// encapsulation of the re2c based cpp lexer +// +/////////////////////////////////////////////////////////////////////////////// + +template <typename IteratorT, + typename PositionT = boost::wave::util::file_position_type, + typename TokenT = lex_token<PositionT> > +class lexer +{ +public: + typedef TokenT token_type; + typedef typename token_type::string_type string_type; + + lexer(IteratorT const &first, IteratorT const &last, + PositionT const &pos, boost::wave::language_support language_); + ~lexer(); + + token_type& get(token_type&); + void set_position(PositionT const &pos) + { + // set position has to change the file name and line number only + filename = pos.get_file(); + scanner.line = pos.get_line(); +// scanner.column = scanner.curr_column = pos.get_column(); + scanner.file_name = filename.c_str(); + } +#if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0 + bool has_include_guards(std::string& guard_name) const + { + return guards.detected(guard_name); + } +#endif + +// error reporting from the re2c generated lexer + static int report_error(Scanner const* s, int code, char const *, ...); + +private: + static char const *tok_names[]; + + Scanner scanner; + string_type filename; + string_type value; + bool at_eof; + boost::wave::language_support language; +#if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0 + include_guards<token_type> guards; +#endif + +#if BOOST_WAVE_SUPPORT_THREADING == 0 + static token_cache<string_type> const cache; +#else + token_cache<string_type> const cache; +#endif +}; + +/////////////////////////////////////////////////////////////////////////////// +// initialize cpp lexer +template <typename IteratorT, typename PositionT, typename TokenT> +inline +lexer<IteratorT, PositionT, TokenT>::lexer(IteratorT const &first, + IteratorT const &last, PositionT const &pos, + boost::wave::language_support language_) + : filename(pos.get_file()), at_eof(false), language(language_) +#if BOOST_WAVE_SUPPORT_THREADING != 0 + , cache() +#endif +{ + using namespace std; // some systems have memset in std + memset(&scanner, '\0', sizeof(Scanner)); + scanner.eol_offsets = aq_create(); + if (first != last) { + scanner.first = scanner.act = (uchar *)&(*first); + scanner.last = scanner.first + std::distance(first, last); + } + scanner.line = pos.get_line(); + scanner.column = scanner.curr_column = pos.get_column(); + scanner.error_proc = report_error; + scanner.file_name = filename.c_str(); + +#if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0 + scanner.enable_ms_extensions = true; +#else + scanner.enable_ms_extensions = false; +#endif + +#if BOOST_WAVE_SUPPORT_VARIADICS_PLACEMARKERS != 0 + scanner.act_in_c99_mode = boost::wave::need_c99(language_); +#endif + +#if BOOST_WAVE_SUPPORT_IMPORT_KEYWORD != 0 + scanner.enable_import_keyword = !boost::wave::need_c99(language_); +#else + scanner.enable_import_keyword = false; +#endif + + scanner.detect_pp_numbers = boost::wave::need_prefer_pp_numbers(language_); + scanner.single_line_only = boost::wave::need_single_line(language_); + +#if BOOST_WAVE_SUPPORT_CPP0X != 0 + scanner.act_in_cpp0x_mode = boost::wave::need_cpp0x(language_); +#else + scanner.act_in_cpp0x_mode = false; +#endif +} + +template <typename IteratorT, typename PositionT, typename TokenT> +inline +lexer<IteratorT, PositionT, TokenT>::~lexer() +{ + using namespace std; // some systems have free in std + aq_terminate(scanner.eol_offsets); + free(scanner.bot); +} + +/////////////////////////////////////////////////////////////////////////////// +// get the next token from the input stream +template <typename IteratorT, typename PositionT, typename TokenT> +inline TokenT& +lexer<IteratorT, PositionT, TokenT>::get(TokenT& result) +{ + if (at_eof) + return result = token_type(); // return T_EOI + + unsigned int actline = scanner.line; + token_id id = token_id(scan(&scanner)); + + switch (static_cast<unsigned int>(id)) { + case T_IDENTIFIER: + // test identifier characters for validity (throws if invalid chars found) + value = string_type((char const *)scanner.tok, + scanner.cur-scanner.tok); + if (!boost::wave::need_no_character_validation(language)) + impl::validate_identifier_name(value, actline, scanner.column, filename); + break; + + case T_STRINGLIT: + case T_CHARLIT: + case T_RAWSTRINGLIT: + // test literal characters for validity (throws if invalid chars found) + value = string_type((char const *)scanner.tok, + scanner.cur-scanner.tok); + if (boost::wave::need_convert_trigraphs(language)) + value = impl::convert_trigraphs(value); + if (!boost::wave::need_no_character_validation(language)) + impl::validate_literal(value, actline, scanner.column, filename); + break; + +#if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0 + case T_PP_HHEADER: + case T_PP_QHEADER: + case T_PP_INCLUDE: + // convert to the corresponding ..._next token, if appropriate + { + value = string_type((char const *)scanner.tok, + scanner.cur-scanner.tok); + + // Skip '#' and whitespace and see whether we find an 'include_next' here. + typename string_type::size_type start = value.find("include"); + if (value.compare(start, 12, "include_next", 12) == 0) + id = token_id(id | AltTokenType); + break; + } +#endif + + case T_LONGINTLIT: // supported in C++0x, C99 and long_long mode + value = string_type((char const *)scanner.tok, + scanner.cur-scanner.tok); + if (!boost::wave::need_long_long(language)) { + // syntax error: not allowed in C++ mode + BOOST_WAVE_LEXER_THROW(lexing_exception, invalid_long_long_literal, + value.c_str(), actline, scanner.column, filename.c_str()); + } + break; + + case T_OCTALINT: + case T_DECIMALINT: + case T_HEXAINT: + case T_INTLIT: + case T_FLOATLIT: + case T_FIXEDPOINTLIT: + case T_CCOMMENT: + case T_CPPCOMMENT: + case T_SPACE: + case T_SPACE2: + case T_ANY: + case T_PP_NUMBER: + value = string_type((char const *)scanner.tok, + scanner.cur-scanner.tok); + break; + + case T_EOF: + // T_EOF is returned as a valid token, the next call will return T_EOI, + // i.e. the actual end of input + at_eof = true; + value.clear(); + break; + + case T_OR_TRIGRAPH: + case T_XOR_TRIGRAPH: + case T_LEFTBRACE_TRIGRAPH: + case T_RIGHTBRACE_TRIGRAPH: + case T_LEFTBRACKET_TRIGRAPH: + case T_RIGHTBRACKET_TRIGRAPH: + case T_COMPL_TRIGRAPH: + case T_POUND_TRIGRAPH: + if (boost::wave::need_convert_trigraphs(language)) { + value = cache.get_token_value(BASEID_FROM_TOKEN(id)); + } + else { + value = string_type((char const *)scanner.tok, + scanner.cur-scanner.tok); + } + break; + + case T_ANY_TRIGRAPH: + if (boost::wave::need_convert_trigraphs(language)) { + value = impl::convert_trigraph( + string_type((char const *)scanner.tok)); + } + else { + value = string_type((char const *)scanner.tok, + scanner.cur-scanner.tok); + } + break; + + default: + if (CATEGORY_FROM_TOKEN(id) != EXTCATEGORY_FROM_TOKEN(id) || + IS_CATEGORY(id, UnknownTokenType)) + { + value = string_type((char const *)scanner.tok, + scanner.cur-scanner.tok); + } + else { + value = cache.get_token_value(id); + } + break; + } + +// std::cerr << boost::wave::get_token_name(id) << ": " << value << std::endl; + + // the re2c lexer reports the new line number for newline tokens + result = token_type(id, value, PositionT(filename, actline, scanner.column)); + +#if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0 + return guards.detect_guard(result); +#else + return result; +#endif +} + +template <typename IteratorT, typename PositionT, typename TokenT> +inline int +lexer<IteratorT, PositionT, TokenT>::report_error(Scanner const *s, int errcode, + char const *msg, ...) +{ + BOOST_ASSERT(0 != s); + BOOST_ASSERT(0 != msg); + + using namespace std; // some system have vsprintf in namespace std + + char buffer[200]; // should be large enough + va_list params; + va_start(params, msg); + vsprintf(buffer, msg, params); + va_end(params); + + BOOST_WAVE_LEXER_THROW_VAR(lexing_exception, errcode, buffer, s->line, + s->column, s->file_name); +// BOOST_UNREACHABLE_RETURN(0); + return 0; +} + +/////////////////////////////////////////////////////////////////////////////// +// +// lex_functor +// +/////////////////////////////////////////////////////////////////////////////// + +template <typename IteratorT, + typename PositionT = boost::wave::util::file_position_type, + typename TokenT = typename lexer<IteratorT, PositionT>::token_type> +class lex_functor +: public lex_input_interface_generator<TokenT> +{ +public: + typedef TokenT token_type; + + lex_functor(IteratorT const &first, IteratorT const &last, + PositionT const &pos, boost::wave::language_support language) + : re2c_lexer(first, last, pos, language) + {} + virtual ~lex_functor() {} + +// get the next token from the input stream + token_type& get(token_type& result) { return re2c_lexer.get(result); } + void set_position(PositionT const &pos) { re2c_lexer.set_position(pos); } +#if BOOST_WAVE_SUPPORT_PRAGMA_ONCE != 0 + bool has_include_guards(std::string& guard_name) const + { return re2c_lexer.has_include_guards(guard_name); } +#endif + +private: + lexer<IteratorT, PositionT, TokenT> re2c_lexer; +}; + +#if BOOST_WAVE_SUPPORT_THREADING == 0 +/////////////////////////////////////////////////////////////////////////////// +template <typename IteratorT, typename PositionT, typename TokenT> +token_cache<typename lexer<IteratorT, PositionT, TokenT>::string_type> const + lexer<IteratorT, PositionT, TokenT>::cache = + token_cache<typename lexer<IteratorT, PositionT, TokenT>::string_type>(); +#endif + +} // namespace re2clex + +/////////////////////////////////////////////////////////////////////////////// +// +// The new_lexer_gen<>::new_lexer function (declared in cpp_lex_interface.hpp) +// should be defined inline, if the lex_functor shouldn't be instantiated +// separately from the lex_iterator. +// +// Separate (explicit) instantiation helps to reduce compilation time. +// +/////////////////////////////////////////////////////////////////////////////// + +#if BOOST_WAVE_SEPARATE_LEXER_INSTANTIATION != 0 +#define BOOST_WAVE_RE2C_NEW_LEXER_INLINE +#else +#define BOOST_WAVE_RE2C_NEW_LEXER_INLINE inline +#endif + +/////////////////////////////////////////////////////////////////////////////// +// +// The 'new_lexer' function allows the opaque generation of a new lexer object. +// It is coupled to the iterator type to allow to decouple the lexer/iterator +// configurations at compile time. +// +// This function is declared inside the cpp_lex_token.hpp file, which is +// referenced by the source file calling the lexer and the source file, which +// instantiates the lex_functor. But is is defined here, so it will be +// instantiated only while compiling the source file, which instantiates the +// lex_functor. While the cpp_re2c_token.hpp file may be included everywhere, +// this file (cpp_re2c_lexer.hpp) should be included only once. This allows +// to decouple the lexer interface from the lexer implementation and reduces +// compilation time. +// +/////////////////////////////////////////////////////////////////////////////// + +template <typename IteratorT, typename PositionT, typename TokenT> +BOOST_WAVE_RE2C_NEW_LEXER_INLINE +lex_input_interface<TokenT> * +new_lexer_gen<IteratorT, PositionT, TokenT>::new_lexer(IteratorT const &first, + IteratorT const &last, PositionT const &pos, + boost::wave::language_support language) +{ + using re2clex::lex_functor; + return new lex_functor<IteratorT, PositionT, TokenT>(first, last, pos, language); +} + +#undef BOOST_WAVE_RE2C_NEW_LEXER_INLINE + +/////////////////////////////////////////////////////////////////////////////// +} // namespace cpplexer +} // namespace wave +} // namespace boost + +// the suffix header occurs after all of the code +#ifdef BOOST_HAS_ABI_HEADERS +#include BOOST_ABI_SUFFIX +#endif + +#endif // !defined(CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED) |