From bdda4e1daa86fb053ce6b2a00ffa411bab3b8c11 Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Wed, 9 Feb 2022 08:13:59 +0900 Subject: Imported Upstream version 2.0.2 --- ...c_a_lexer_generator_based_on_lookahead_tdfa.tex | 387 +++++++++++++++++++++ doc/papers/2020_simpa/Makefile | 2 +- ...c_a_lexer_generator_based_on_lookahead_tdfa.tex | 335 ------------------ 3 files changed, 388 insertions(+), 336 deletions(-) create mode 100644 doc/papers/2020_simpa/2020_trofimovich_re2c_a_lexer_generator_based_on_lookahead_tdfa.tex delete mode 100644 doc/papers/2020_simpa/re2c_a_lexer_generator_based_on_lookahead_tdfa.tex (limited to 'doc/papers') diff --git a/doc/papers/2020_simpa/2020_trofimovich_re2c_a_lexer_generator_based_on_lookahead_tdfa.tex b/doc/papers/2020_simpa/2020_trofimovich_re2c_a_lexer_generator_based_on_lookahead_tdfa.tex new file mode 100644 index 00000000..18329b20 --- /dev/null +++ b/doc/papers/2020_simpa/2020_trofimovich_re2c_a_lexer_generator_based_on_lookahead_tdfa.tex @@ -0,0 +1,387 @@ +%% +%% Copyright 2007, 2008, 2009 Elsevier Ltd +%% +%% This file is part of the 'Elsarticle Bundle'. +%% --------------------------------------------- +%% +%% It may be distributed under the conditions of the LaTeX Project Public +%% License, either version 1.2 of this license or (at your option) any +%% later version. The latest version of this license is in +%% http://www.latex-project.org/lppl.txt +%% and version 1.2 or later is part of all distributions of LaTeX +%% version 1999/12/01 or later. +%% +%% The list of all files belonging to the 'Elsarticle Bundle' is +%% given in the file `manifest.txt'. +%% + +%% Template article for Elsevier's document class `elsarticle' +%% with numbered style bibliographic references +%% SP 2008/03/01 + +\documentclass[preprint,12pt, a4paper]{elsarticle} + +%% Use the option review to obtain double line spacing +%% \documentclass[authoryear,preprint,review,12pt]{elsarticle} + +%% For including figures, graphicx.sty has been loaded in +%% elsarticle.cls. If you prefer to use the old commands +%% please give \usepackage{epsfig} + +%% The amssymb package provides various useful mathematical symbols +\usepackage{amssymb} +\usepackage{hyperref} +%% The amsthm package provides extended theorem environments +%% \usepackage{amsthm} + +%% The lineno packages adds line numbers. Start line numbering with +%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on +%% for the whole article with \linenumbers. +%\usepackage{lineno} + +\journal{Software Impacts} + +\begin{document} + +\begin{frontmatter} + +%% Title, authors and addresses + +%% use the tnoteref command within \title for footnotes; +%% use the tnotetext command for theassociated footnote; +%% use the fnref command within \author or \address for footnotes; +%% use the fntext command for theassociated footnote; +%% use the corref command within \author for corresponding author footnotes; +%% use the cortext command for theassociated footnote; +%% use the ead command for the email address, +%% and the form \ead[url] for the home page: +%% \title{Title\tnoteref{label1}} +%% \tnotetext[label1]{} +%% \author{Name\corref{cor1}\fnref{label2}} +%% \ead{email address} +%\ead[url]{https://re2c.org} +%% \fntext[label2]{} +%% \cortext[cor1]{} +%% \address{Address\fnref{label3}} +%% \fntext[label3]{} + +\title{RE2C: a lexer generator based on lookahead-TDFA} + +%% use optional labels to link authors explicitly to addresses: +%% \author[label1,label2]{} +%% \address[label1]{} +%% \address[label2]{} + +\author{U. Trofimovich} +\ead{skvadrik@gmail.com} + +\begin{abstract} +RE2C is a regular expression compiler: +it transforms regular expressions into finite state machines +and encodes them as programs in the target language. +At the core of RE2C is the lookahead-TDFA algorithm +that allows it to perform fast and lightweight submatch extraction. +This article describes the algorithm used in RE2C and gives an example of TDFA construction. +\end{abstract} + +\begin{keyword} +Lexical analysis \sep Regular expressions \sep Finite automata + +%% PACS codes here, in the form: \PACS code \sep code + +%% MSC codes here, in the form: \MSC code \sep code +%% or \MSC[2008] code \sep code (2000 is the default) + +\end{keyword} + +\end{frontmatter} + +%\linenumbers + +\section{Introduction} + +%\item A short description of the high-level functionality and purpose of the software for a diverse, non-specialist audience + +\noindent +Regular expression engines can be divided in two categories: run-time libraries and lexer generators. +Run-time libraries perform interpretation or just-in-time compilation of regular expressions. +They use a variety of algorithms ranging from recursive backtracking to automata, string searching, or some combination of the above. +Lexer generators, on the other hand, perform ahead-of-time compilation. +They use algorithms based on deterministic finite automata (DFA) and spend considerable time on compilation and optimization in order to emit better code. +Consequently, lexer generators usually do not support features that cannot be implemented on vanilla DFA. +One such feature is submatch extraction --- the ability to find correspondence between parts of the regular expression and parts of the input string. +Submatch extraction is a special case of the parsing problem: +in addition to solving the recognition problem it has to find the derivation of the input string in the grammar defined by the regular expression. +%Submatch extraction differs from simple matching in the same way as parsing differs from recognition: +%not only one has to decide if the input string belongs to the language defined by the regular expression, but also to find its derivation in the regular grammar. +Unlike full parsing, submatch extraction needs only a partial derivation. %: it is of no interest how each individual character is derived. +Therefore it would be wasteful to perform full parsing, +and a more specialized algorithm is needed that would have overhead proportional to submatch detalization. +For an optimizing lexer generator like RE2C \cite{RE2C} \cite{RE2Cx} it is important that the generated code is at least as fast and memory-efficient as hand-written code, +and there is zero overhead if the submatch feature is not used. + +\section{Tagged DFA} + +\noindent +The above requirements place tight constraints on the submatch extraction algorithm: +it should be a one-pass DFA-based algorithm that works in constant memory independent of the input length. +Such an algorithm was invented by Ville Laurikari \cite{Lau00}. +It works as follows. +First, the regular expression is converted to a nondeterministic finite automaton with tagged transitions (TNFA). +Tags are submatch markers that can be placed anywhere in the regular expression; +for example, a capturing group can be represented with a pair of tags for the opening and closing parentheses. +%It is also possible to use standalone tags anywhere in the regular expression. +TNFA is in essence a nondeterministic finite state transducer that rewrites symbolic strings into tagged strings. +The crucial part of Laurikari algorithm is TNFA determinization. +Its basic principle is the same as the powerset construction that converts NFA to DFA: +the NFA is simulated on all possible input strings, maintaining a set of states at each step. +If the current state-set is new, it becomes a new DFA state, otherwise it is mapped to an existing DFA state. +In the case of TNFA state-sets are augmented with tag information: +each substate has an associated vector of registers that hold tag values, +so that the whole state-set is a matrix indexed by TNFA states and tags. +Registers are needed because different substates are reached by different TNFA paths, +and the tags along one path may disagree with tags along another path. +If a tag is updated on TNFA path, its value is stored into a new register. +The augmented state-sets cannot be mapped in the same way as ordinary DFA states, +because they may have identical substates, but different registers. +The key insight is that mapping of such states is still possible if there is a bijection between their registers: +the bijective transformation can be encoded in the form of register reordering operations on TDFA transitions. +After determinization all the information in TDFA state-sets is erased, +and the resulting TDFA is like ordinary DFA extended with a fixed number of registers that hold tag values, +and transitions that update and reorder register values. +Algorithms like minimization are applicable to TDFA, +but they need to differentiate between transitions with different register operations. + +\section{Lookahead TDFA} + +\begin{figure}[t!] +\includegraphics[width=\linewidth]{example.pdf} +%\vspace{-1em} +\caption{ +Lookahead-TDFA construction for regular expression $a^* \, t_1 (t_2 \, b)^* c \;$ with tags $t_1$ and $t_2$ +that matches strings $a \dots a b \dots b c$ (tag $t_1$ marks the boundary between $a$s and $b$s, and tag $t_2$ marks the last $b$). +Top: TNFA ($\epsilon/\epsilon$ labels are omitted, and negative tags indicate no-match). +Middle: lookahead-TDFA under construction with state-sets, register matrices and lookahead tags +($p$ is the current position, and $\varnothing$ is no-match). +Bottom left: lookahead-TDFA after construction. +Bottom right: lookahead-TDFA after optimization (the number of registers is minimized and some register operations are removed). +}\label{fig:example} +\end{figure} + +\noindent +One improvement that allows to greatly reduce the number of register operations is the use of the lookahead symbol \cite{Tro17}. +The idea is to delay the application of register operations until the lookahead symbol is known, +and attach the operations to the outgoing transition on that symbol instead of the preceding incoming transition. +The insight is that some of TNFA paths that have reached the current TDFA state are canceled after the application of lookahead symbol, +and all register operations associated with these paths can be canceled as well. +In other words, delaying for one step allows one to split register operations on the lookahead symbol, +and instead of doing all of them, do only the relevant part. +This requires a modification to the underlying TDFA state-sets: +in addition to registers, each substate needs to have an associated list of lookahead tags. +The additional information is erased after determinization, +and the resulting lookahead-TDFA is faster than ordinary TDFA and better suited for further transformations like minimization and register allocation. +On the whole, this idea is similar to the improvement of LALR parsers over LR parsers: +the use of lookahead information in states greatly reduces the number of conflicts +(for TDFA a conflict is not an error, but it means extra registers and register operations). +An example of lookahead-TDFA construction can be seen on figure \ref{fig:example}. + + +\section{Ambiguity resolution} + +\noindent +One of the challenges that makes parsing harder than recognition is ambiguity: +there may be several ways to parse the input string. +Which way to prefer is usually defined by a disambiguation policy. +TDFA algorithm can be parameterized over different policies; +for example, RE2C supports both the Perl leftmost-greedy policy and the POSIX longest-match policy \cite{BorTro19}. +Disambiguation happens at the time of determinization (in $\epsilon$-closure construction), +and the resulting TDFA does not perform any disambiguation at run-time: ambiguity-resolving decisions are embedded into its structure. + +\section{Impact} + +\noindent +RE2C is a lexer generator of choice for projects that need \emph{fast} lexical analyzers. +It has a flexible user interface that allows one to customize the generated code for a particular environment and input model. +Submatch extraction is yet another feature that makes RE2C a good alternative to other lexer generators. +Notable projects that use RE2C are the following: + +\begin{itemize} +\item Ninja, a build system with a focus on speed. \cite{Ninja} + Ninja is used in a great number of open-source projects, + and it is a necessary building block in many operating systems and platforms. + +\item PHP, a popular general-purpose scripting language. \cite{PHP} + +\item BRL-CAD, a constructive solid geometry solid modeling computer-aided design system. \cite{BRLCAD} + +\item STEPCode, an implementation of ISO 10303 standard. \cite{STEPCode} + +\item Apache SpamAssassin, a program for e-mail spam filtering. \cite{SpamAssassin} + +\item Yasm, the Modular Assembler Project. \cite{Yasm} + +\item Wake, the SiFive build tool. \cite{Wake} +\end{itemize} + +\noindent +RE2C has a permissive license, which allows it to be used in proprietary software as well. +The project has been ported to many operating systems; +according to Repology \cite{Repology}, it is packaged in more than 60 distributions. +Besides being a useful practical tool, RE2C is a research playground for the development of new algorithms in the field of automata and formal languages. + +\section{Conclusion and future work} + +\noindent +Submatch extraction algorithm implemented in RE2C +is both an important theoretical development and a useful practical improvement. +It allows one to program lexical analyzers capable of submatch extraction +without resorting to manual post-processing or the use of string-searching functions. +The overhead on submatch extraction is proportional to submatch detalization: +there is zero overhead if no submatch information is needed, +and even for large submatch-heavy regular expressions like RFC-compliant URI and HTTP parsers the overhead is modest +(about 1.25x compared to simple recognition on DFA \cite{Tro17}). +% +Future work may relate TDFA to other parsing automata, such as DSST \cite{Gra15} and sta-DFA \cite{Cho18}, +and investigate the possibility of using tagged counter automata for counted repetition \cite{Bec09}. + + +\section*{Acknowledgments} +\label{} +\noindent +I want to thank my parents Vladimir and Elina, +my friend and fellow researcher Angelo Borsotti, +my school math teachers Tatyana Leonidovna and Demian Vladimirovich, +and, most of all, Sergei. + + +\begin{thebibliography}{00} + +\bibitem{RE2C} + RE2C, a lexer generator for C, C++ and Go. + Website: \url{https://re2c.org}, + source code: \url{https://github.com/skvadrik/re2c}. + +\bibitem{RE2Cx} + Peter Bumbulis, + Donald D. Cowan, + \textit{RE2C: A more versatile scanner generator}, + ACM Letters on Programming Languages and Systems (LOPLAS), + vol. 2, 1-4, + pp. 70-84, + 1993. + +\bibitem{Lau00} + Ville Laurikari, + \textit{NFAs with tagged transitions, their conversion to deterministic automata and application to regular expressions}, + Proceedings Seventh International Symposium on String Processing and Information Retrieval, 2000. SPIRE 2000, + pp. 181-187, + URL: \url{http://laurikari.net/ville/spire2000-tnfa.pdf}. + +\bibitem{Tro17} + Ulya Trofimovich, + \textit{Tagged Deterministic Finite Automata with Lookahead}, + arXiv:1907.08837 [cs.FL], + 2017. + +\bibitem{BorTro19} + Angelo Borsotti, Ulya Trofimovich, + \textit{Efficient POSIX Submatch Extraction on NFA}, + preprint, 2019, + URL: \url{https://re2c.org/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf}. + +\bibitem{Ninja} + Ninja build system, + URL: \url{https://ninja-build.org}, + build files that use RE2C: \url{https://ninja-build.org/build.ninja.html}. + +\bibitem{PHP} + PHP Internals Book, + chapter \textit{Building PHP}, + URL: \url{http://www.phpinternalsbook.com/php7/build_system/building_php.html}. + +\bibitem{BRLCAD} + BRL-CAD: Tools, + URL: \url{http://sourceforge.net/p/brlcad/code/HEAD/tree/brlcad/trunk/misc/tools/re2c}. + +\bibitem{STEPCode} + STEPCode: Build Process, + URL: \url{https://stepcode.github.io/docs/build_process}. + +\bibitem{SpamAssassin} + SpamAssassin (sa-compile), + URL: \url{https://spamassassin.apache.org/full/3.2.x/doc/sa-compile.html}. + +\bibitem{Yasm} + Yasm, + URL: \url{https://yasm.tortall.net}. + +\bibitem{Wake} + Wake, + URL: \url{https://github.com/sifive/wake}. + +\bibitem{Repology} + Repology: RE2C information + URL: \url{https://repology.org/project/re2c/information}. + +\bibitem{Gra15} + Niels Bj{\o}rn Bugge Grathwohl, + \textit{Parsing with Regular Expressions \& Extensions to Kleene Algebra}, + DIKU, University of Copenhagen, + 2015. + +\bibitem{Cho18} + Mohammad Imran Chowdhury, + \textit{staDFA: An Efficient Subexpression Matching Method}, + Master thesis, + Florida State University, + 2018. + +\bibitem{Bec09} + Michela Becchi, + \textit{Data structures, algorithms and architectures for efficient regular expression evaluation}, + Washington University In St. Louis, School of Engineering and Applied Science, + 2009. + +\end{thebibliography} + +\vfill\null +\clearpage + +\section*{Current code version} +\label{} + +\begin{table}[!h] +\begin{tabular}{|l|p{5.7cm}|p{7.3cm}|} +%\begin{tabular}{|l|p{6.5cm}|p{6.5cm}|} +\hline +\textbf{Nr.} & \textbf{Code metadata description} & \textbf{Please fill in this column} \\ +\hline +C1 & Current code version & 2.0 \\ +\hline +C2 & Permanent link to code/repository used for this code version & + \textit{https://github.com/skvadrik/re2c} \\ +\hline +C3 & Permanent link to Reproducible Capsule & + \textit{https://codeocean.com/capsule/7270e8e2-5b1b-4ee4-941e-6269cf09d542/} \\ +\hline +C4 & Legal Code License & Public domain \\ +\hline +C5 & Code versioning system used & Git \\ +\hline +C6 & Software code languages, tools, and services used & C++, Bison, RE2C (self-hosting) \\ +\hline +C7 & Compilation requirements, operating environments \& dependencies & + OS: Linux, BSD, Nix/Guix, GNU Hurd, OS X, Windows. + Dependencies: C++ compiler, Bash, CMake or Autotools, optional Bison and Docutils. \\ +\hline +C8 & If available Link to developer documentation/manual & \textit{https://re2c.org} \\ +\hline +C9 & Support email for questions & \textit{re2c-general@lists.sourceforge.net} \\ +\hline +\end{tabular} +\caption{Code metadata (mandatory)} +\label{} +\end{table} + +\end{document} +\endinput diff --git a/doc/papers/2020_simpa/Makefile b/doc/papers/2020_simpa/Makefile index ad5b5972..ef070291 100644 --- a/doc/papers/2020_simpa/Makefile +++ b/doc/papers/2020_simpa/Makefile @@ -1,4 +1,4 @@ -re2c_a_lexer_generator_based_on_lookahead_tdfa.pdf : re2c_a_lexer_generator_based_on_lookahead_tdfa.tex example.pdf +2020_trofimovich_re2c_a_lexer_generator_based_on_lookahead_tdfa.pdf : 2020_trofimovich_re2c_a_lexer_generator_based_on_lookahead_tdfa.tex example.pdf pdflatex -shell-escape $< $<.log example.pdf : example.tex diff --git a/doc/papers/2020_simpa/re2c_a_lexer_generator_based_on_lookahead_tdfa.tex b/doc/papers/2020_simpa/re2c_a_lexer_generator_based_on_lookahead_tdfa.tex deleted file mode 100644 index 546c56a3..00000000 --- a/doc/papers/2020_simpa/re2c_a_lexer_generator_based_on_lookahead_tdfa.tex +++ /dev/null @@ -1,335 +0,0 @@ -%% -%% Copyright 2007, 2008, 2009 Elsevier Ltd -%% -%% This file is part of the 'Elsarticle Bundle'. -%% --------------------------------------------- -%% -%% It may be distributed under the conditions of the LaTeX Project Public -%% License, either version 1.2 of this license or (at your option) any -%% later version. The latest version of this license is in -%% http://www.latex-project.org/lppl.txt -%% and version 1.2 or later is part of all distributions of LaTeX -%% version 1999/12/01 or later. -%% -%% The list of all files belonging to the 'Elsarticle Bundle' is -%% given in the file `manifest.txt'. -%% - -%% Template article for Elsevier's document class `elsarticle' -%% with numbered style bibliographic references -%% SP 2008/03/01 - -\documentclass[preprint,12pt, a4paper]{elsarticle} - -%% Use the option review to obtain double line spacing -%% \documentclass[authoryear,preprint,review,12pt]{elsarticle} - -%% For including figures, graphicx.sty has been loaded in -%% elsarticle.cls. If you prefer to use the old commands -%% please give \usepackage{epsfig} - -%% The amssymb package provides various useful mathematical symbols -\usepackage{amssymb} -\usepackage{hyperref} -%% The amsthm package provides extended theorem environments -%% \usepackage{amsthm} - -%% The lineno packages adds line numbers. Start line numbering with -%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on -%% for the whole article with \linenumbers. -%\usepackage{lineno} - -\journal{Software Impacts} - -\begin{document} - -\begin{frontmatter} - -%% Title, authors and addresses - -%% use the tnoteref command within \title for footnotes; -%% use the tnotetext command for theassociated footnote; -%% use the fnref command within \author or \address for footnotes; -%% use the fntext command for theassociated footnote; -%% use the corref command within \author for corresponding author footnotes; -%% use the cortext command for theassociated footnote; -%% use the ead command for the email address, -%% and the form \ead[url] for the home page: -%% \title{Title\tnoteref{label1}} -%% \tnotetext[label1]{} -%% \author{Name\corref{cor1}\fnref{label2}} -%% \ead{email address} -%\ead[url]{https://re2c.org} -%% \fntext[label2]{} -%% \cortext[cor1]{} -%% \address{Address\fnref{label3}} -%% \fntext[label3]{} - -\title{RE2C: a lexer generator based on lookahead-TDFA} - -%% use optional labels to link authors explicitly to addresses: -%% \author[label1,label2]{} -%% \address[label1]{} -%% \address[label2]{} - -\author{U. Trofimovich} -\ead{skvadrik@gmail.com} - -\begin{abstract} -RE2C is a regular expression compiler: -it transforms regular expressions into finite state machines -and encodes them as programs in the target language. -At the core of RE2C is the lookahead-TDFA algorithm -that allows it to perform fast and lightweight submatch extraction. -This article describes the algorithm used in RE2C and gives an example of TDFA construction. -\end{abstract} - -\begin{keyword} -Lexical analysis \sep Regular expressions \sep Finite automata - -%% PACS codes here, in the form: \PACS code \sep code - -%% MSC codes here, in the form: \MSC code \sep code -%% or \MSC[2008] code \sep code (2000 is the default) - -\end{keyword} - -\end{frontmatter} - -%\linenumbers - -\section{Introduction} - -%\item A short description of the high-level functionality and purpose of the software for a diverse, non-specialist audience - -\noindent -Regular expression engines can be divided in two categories: run-time libraries and lexer generators. -Run-time libraries perform interpretation or just-in-time compilation of regular expressions. -They use a variety of algorithms ranging from recursive backtracking to automata, string searching, or some combination of the above. -Lexer generators, on the other hand, perform ahead-of-time compilation. -They use algorithms based on deterministic finite automata (DFA) and spend considerable time on compilation and optimization in order to emit better code. -Consequently, lexer generators usually do not support features that cannot be implemented on vanilla DFA. -One such feature is submatch extraction --- the ability to find correspondence between parts of the regular expression and parts of the input string. -Submatch extraction is a special case of the parsing problem: -in addition to solving the recognition problem it has to find the derivation of the input string in the grammar defined by the regular expression. -%Submatch extraction differs from simple matching in the same way as parsing differs from recognition: -%not only one has to decide if the input string belongs to the language defined by the regular expression, but also to find its derivation in the regular grammar. -Unlike full parsing, submatch extraction needs only a partial derivation. %: it is of no interest how each individual character is derived. -Therefore it would be wasteful to perform full parsing, -and a more specialized algorithm is needed that would have overhead proportional to submatch detalization. -For an optimizing lexer generator like RE2C \cite{RE2C} \cite{RE2Cx} it is important that the generated code is at least as fast and memory-efficient as hand-written code, -and there is zero overhead if the submatch feature is not used. - -\section{Tagged DFA} - -\noindent -The above requirements place tight constraints on the submatch extraction algorithm: -it should be a one-pass DFA-based algorithm that works in constant memory independent of the input length. -Such an algorithm was invented by Ville Laurikari \cite{Lau00}. -It works as follows. -First, the regular expression is converted to a nondeterministic finite automaton with tagged transitions (TNFA). -Tags are submatch markers that can be placed anywhere in the regular expression; -for example, a capturing group can be represented with a pair of tags for the opening and closing parentheses. -%It is also possible to use standalone tags anywhere in the regular expression. -TNFA is in essence a nondeterministic finite state transducer that rewrites symbolic strings into tagged strings. -The crucial part of Laurikari algorithm is TNFA determinization. -Its basic principle is the same as the powerset construction that converts NFA to DFA: -the NFA is simulated on all possible input strings, maintaining a set of states at each step. -If the current state-set is new, it becomes a new DFA state, otherwise it is mapped to an existing DFA state. -In the case of TNFA state-sets are augmented with tag information: -each substate has an associated vector of registers that hold tag values, -so that the whole state-set is a matrix indexed by TNFA states and tags. -Registers are needed because different substates are reached by different TNFA paths, -and the tags along one path may disagree with tags along another path. -If a tag is updated on TNFA path, its value is stored into a new register. -The augmented state-sets cannot be mapped in the same way as ordinary DFA states, -because they may have identical substates, but different registers. -The key insight is that mapping of such states is still possible if there is a bijection between their registers: -the bijective transformation can be encoded in the form of register reordering operations on TDFA transitions. -After determinization all the information in TDFA state-sets is erased, -and the resulting TDFA is like ordinary DFA extended with a fixed number of registers that hold tag values, -and transitions that update and reorder register values. -Algorithms like minimization are applicable to TDFA, -but they need to differentiate between transitions with different register operations. - -\section{Lookahead TDFA} - -\begin{figure}[t!] -\includegraphics[width=\linewidth]{example.pdf} -%\vspace{-1em} -\caption{ -Lookahead-TDFA construction for regular expression $a^* \, t_1 (t_2 \, b)^* c \;$ with tags $t_1$ and $t_2$ -that matches strings $a \dots a b \dots b c$ (tag $t_1$ marks the boundary between $a$s and $b$s, and tag $t_2$ marks the last $b$). -Top: TNFA ($\epsilon/\epsilon$ labels are omitted, and negative tags indicate no-match). -Middle: lookahead-TDFA under construction with state-sets, register matrices and lookahead tags -($p$ is the current position, and $\varnothing$ is no-match). -Bottom left: lookahead-TDFA after construction. -Bottom right: lookahead-TDFA after optimization (the number of registers is minimized and some register operations are removed). -}\label{fig:example} -\end{figure} - -\noindent -One improvement that allows to greatly reduce the number of register operations is the use of the lookahead symbol \cite{Tro17}. -The idea is to delay the application of register operations until the lookahead symbol is known, -and attach the operations to the outgoing transition on that symbol instead of the preceding incoming transition. -The insight is that some of TNFA paths that have reached the current TDFA state are canceled after the application of lookahead symbol, -and all register operations associated with these paths can be canceled as well. -In other words, delaying for one step allows one to split register operations on the lookahead symbol, -and instead of doing all of them, do only the relevant part. -This requires a modification to the underlying TDFA state-sets: -in addition to registers, each substate needs to have an associated list of lookahead tags. -The additional information is erased after determinization, -and the resulting lookahead-TDFA is faster than ordinary TDFA and better suited for further transformations like minimization and register allocation. -On the whole, this idea is similar to the improvement of LALR parsers over LR parsers: -the use of lookahead information in states greatly reduces the number of conflicts -(for TDFA a conflict is not an error, but it means extra registers and register operations). -An example of lookahead-TDFA construction can be seen on figure \ref{fig:example}. - - -\section{Ambiguity resolution} - -\noindent -One of the challenges that makes parsing harder than recognition is ambiguity: -there may be several ways to parse the input string. -Which way to prefer is usually defined by a disambiguation policy. -TDFA algorithm can be parameterized over different policies; -for example, RE2C supports both the Perl leftmost-greedy policy and the POSIX longest-match policy \cite{BorTro19}. -Disambiguation happens at the time of determinization (in $\epsilon$-closure construction), -and the resulting TDFA does not perform any disambiguation at run-time: ambiguity-resolving decisions are embedded into its structure. - -\section{Conclusion and future work} - -\noindent -RE2C is a tool used in open-source projects such as Ninja \cite{Ninja}, PHP \cite{PHP} and others; -it is often chosen for its fast lexers and flexible user interface. -% -Submatch extraction algorithm implemented in RE2C -is both an important theoretical development and a useful practical improvement. -It allows one to program lexical analyzers capable of submatch extraction -without resorting to manual post-processing or the use of string-searching functions. -The overhead on submatch extraction is proportional to submatch detalization: -there is zero overhead if no submatch information is needed, -and even for large submatch-heavy regular expressions like RFC-compliant URI and HTTP parsers the overhead is modest -(about 1.25x compared to simple recognition on DFA \cite{Tro17}). -% -Future work may relate TDFA to other parsing automata, such as DSST \cite{Gra15} and sta-DFA \cite{Cho18}, -and investigate the possibility of using tagged counter automata for counted repetition \cite{Bec09}. - - -\section*{Acknowledgments} -\label{} -\noindent -I want to thank my parents Vladimir and Elina, -my fellow researcher and friend Angelo Borsotti, -my school math teachers Tatyana Leonidovna and Demian Vladimirovich, -and, most of all, Sergei. - - -\begin{thebibliography}{00} - -\bibitem{RE2C} - RE2C, a lexer generator for C, C++ and Go. - Website: \url{https://re2c.org}, - source code: \url{https://github.com/skvadrik/re2c}. - -\bibitem{RE2Cx} - Peter Bumbulis, - Donald D. Cowan, - \textit{RE2C: A more versatile scanner generator}, - ACM Letters on Programming Languages and Systems (LOPLAS), - vol. 2, 1-4, - pp. 70-84, - 1993. - -\bibitem{Lau00} - Ville Laurikari, - \textit{NFAs with tagged transitions, their conversion to deterministic automata and application to regular expressions}, - Proceedings Seventh International Symposium on String Processing and Information Retrieval, 2000. SPIRE 2000, - pp. 181-187, - URL: \url{http://laurikari.net/ville/spire2000-tnfa.pdf}. - -\bibitem{Tro17} - Ulya Trofimovich, - \textit{Tagged Deterministic Finite Automata with Lookahead}, - arXiv:1907.08837 [cs.FL], - 2017. - -\bibitem{BorTro19} - Angelo Borsotti, Ulya Trofimovich, - \textit{Efficient POSIX Submatch Extraction on NFA}, - preprint, 2019, - URL: \url{https://re2c.org/2019_borsotti_trofimovich_efficient_posix_submatch_extraction_on_nfa.pdf}. - -\bibitem{Ninja} - Ninja build system. - Website: \url{https://ninja-build.org}, - build files that use RE2C: \url{https://ninja-build.org/build.ninja.html}. - -\bibitem{PHP} - PHP programming language. - \textit{Building PHP}, - PHP Internals Book, - URL: \url{http://www.phpinternalsbook.com/php5/build_system/building_php.html}. - -\bibitem{Gra15} - Niels Bj{\o}rn Bugge Grathwohl, - \textit{Parsing with Regular Expressions \& Extensions to Kleene Algebra}, - DIKU, University of Copenhagen, - 2015. - -\bibitem{Cho18} - Mohammad Imran Chowdhury, - \textit{staDFA: An Efficient Subexpression Matching Method}, - Master thesis, - Florida State University, - 2018. - -\bibitem{Bec09} - Michela Becchi, - \textit{Data structures, algorithms and architectures for efficient regular expression evaluation}, - Washington University In St. Louis, School of Engineering and Applied Science, - 2009. - -\end{thebibliography} - -\vfill\null -\clearpage - -\section*{Current code version} -\label{} - -\begin{table}[!h] -\begin{tabular}{|l|p{6.5cm}|p{6.5cm}|} -\hline -\textbf{Nr.} & \textbf{Code metadata description} & \textbf{Please fill in this column} \\ -\hline -C1 & Current code version & 2.0 \\ -\hline -C2 & Permanent link to code/repository used for this code version & - \textit{https://github.com/skvadrik/re2c} \\ -\hline -C3 & Permanent link to Reproducible Capsule & \\ -\hline -C4 & Legal Code License & Public domain \\ -\hline -C5 & Code versioning system used & Git \\ -\hline -C6 & Software code languages, tools, and services used & - Source code is written in C++. - Supports code generation into C and Go. \\ -\hline -C7 & Compilation requirements, operating environments \& dependencies & - Supported operating systems: Linux, BSD, Nix/Guix, GNU Hurd, OS X, Windows; ports exist for IBM z/OS and old versions of Minix. - Dependencies: C++ compiler supporting C++98, Bash, optionally CMake or Autotools (including Libtool), optional Bison, optional rst2man (for documentation). \\ -\hline -C8 & If available Link to developer documentation/manual & \textit{https://re2c.org} \\ -\hline -C9 & Support email for questions & \textit{skvadrik@gmail.com} \\ -\hline -\end{tabular} -\caption{Code metadata (mandatory)} -\label{} -\end{table} - -\end{document} -\endinput -- cgit v1.2.3