diff options
-rw-r--r-- | .cargo_vcs_info.json | 5 | ||||
-rw-r--r-- | .github/workflows/ci.yml | 31 | ||||
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | Cargo.toml | 29 | ||||
-rw-r--r-- | Cargo.toml.orig | 19 | ||||
-rw-r--r-- | LICENSE-APACHE | 201 | ||||
-rw-r--r-- | LICENSE-MIT | 25 | ||||
-rw-r--r-- | bors.toml | 3 | ||||
-rw-r--r-- | rustfmt.toml | 1 | ||||
-rw-r--r-- | src/expr.rs | 610 | ||||
-rw-r--r-- | src/lib.rs | 149 | ||||
-rw-r--r-- | src/literal.rs | 361 | ||||
-rw-r--r-- | src/token.rs | 44 | ||||
-rw-r--r-- | tests/clang.rs | 339 | ||||
-rw-r--r-- | tests/input/chars.h | 3 | ||||
-rw-r--r-- | tests/input/fail.h | 9 | ||||
-rw-r--r-- | tests/input/floats.h | 8 | ||||
-rw-r--r-- | tests/input/int_signed.h | 3 | ||||
-rw-r--r-- | tests/input/int_unsigned.h | 29 | ||||
-rw-r--r-- | tests/input/strings.h | 17 | ||||
-rw-r--r-- | tests/input/test_llvm_bug_9069.h | 4 |
21 files changed, 1892 insertions, 0 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json new file mode 100644 index 0000000..72d29e1 --- /dev/null +++ b/.cargo_vcs_info.json @@ -0,0 +1,5 @@ +{ + "git": { + "sha1": "c7ccdfbc37b508cfda1171ab4f89afaeb72e82f3" + } +} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..8af3b70 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,31 @@ +name: CI +on: + push: + branches: + - master + pull_request: + branches: + - master + + +jobs: + build_and_test: + name: Build and Test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Install LLVM and Clang + uses: KyleMayes/install-llvm-action@v1 + with: + version: "11.0" + directory: ${{ runner.temp }}/llvm-11.0 + + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + + - uses: actions-rs/cargo@v1 + with: + command: test + args: --verbose --all diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a9d37c5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..4956001 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,29 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. 
+ +[package] +edition = "2018" +name = "cexpr" +version = "0.6.0" +authors = ["Jethro Beekman <jethro@jbeekman.nl>"] +description = "A C expression parser and evaluator" +documentation = "https://docs.rs/cexpr/" +keywords = ["C", "expression", "parser"] +license = "Apache-2.0/MIT" +repository = "https://github.com/jethrogb/rust-cexpr" +[dependencies.nom] +version = "7" +features = ["std"] +default-features = false +[dev-dependencies.clang-sys] +version = ">= 0.13.0, < 0.29.0" +[badges.travis-ci] +repository = "jethrogb/rust-cexpr" diff --git a/Cargo.toml.orig b/Cargo.toml.orig new file mode 100644 index 0000000..6de1e89 --- /dev/null +++ b/Cargo.toml.orig @@ -0,0 +1,19 @@ +[package] +name = "cexpr" +version = "0.6.0" +edition = "2018" +authors = ["Jethro Beekman <jethro@jbeekman.nl>"] +license = "Apache-2.0/MIT" +description = "A C expression parser and evaluator" +documentation = "https://docs.rs/cexpr/" +repository = "https://github.com/jethrogb/rust-cexpr" +keywords = ["C","expression","parser"] + +[badges] +travis-ci = { repository = "jethrogb/rust-cexpr" } + +[dependencies] +nom = { version = "7", default-features = false, features = ["std"] } + +[dev-dependencies] +clang-sys = ">= 0.13.0, < 0.29.0" diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..16fe87b --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..ed958e7 --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,25 @@ +(C) Copyright 2016 Jethro G. Beekman + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/bors.toml b/bors.toml new file mode 100644 index 0000000..ca08e81 --- /dev/null +++ b/bors.toml @@ -0,0 +1,3 @@ +status = [ + "continuous-integration/travis-ci/push", +] diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..32a9786 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1 @@ +edition = "2018" diff --git a/src/expr.rs b/src/expr.rs new file mode 100644 index 0000000..7f7e458 --- /dev/null +++ b/src/expr.rs @@ -0,0 +1,610 @@ +// (C) Copyright 2016 Jethro G. Beekman +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +//! Evaluating C expressions from tokens. +//! +//! Numerical operators are supported. All numerical values are treated as +//! `i64` or `f64`. Type casting is not supported. `i64` are converted to +//! `f64` when used in conjunction with a `f64`. Right shifts are always +//! arithmetic shifts. +//! +//! The `sizeof` operator is not supported. +//! +//! String concatenation is supported, but width prefixes are ignored; all +//! strings are treated as narrow strings. +//! +//! Use the `IdentifierParser` to substitute identifiers found in expressions. + +use std::collections::HashMap; +use std::num::Wrapping; +use std::ops::{ + AddAssign, BitAndAssign, BitOrAssign, BitXorAssign, DivAssign, MulAssign, RemAssign, ShlAssign, + ShrAssign, SubAssign, +}; + +use crate::literal::{self, CChar}; +use crate::token::{Kind as TokenKind, Token}; +use crate::ToCexprResult; +use nom::branch::alt; +use nom::combinator::{complete, map, map_opt}; +use nom::multi::{fold_many0, many0, separated_list0}; +use nom::sequence::{delimited, pair, preceded}; +use nom::*; + +/// Expression parser/evaluator that supports identifiers. 
+#[derive(Debug)] +pub struct IdentifierParser<'ident> { + identifiers: &'ident HashMap<Vec<u8>, EvalResult>, +} +#[derive(Copy, Clone)] +struct PRef<'a>(&'a IdentifierParser<'a>); + +/// A shorthand for the type of cexpr expression evaluation results. +pub type CResult<'a, R> = IResult<&'a [Token], R, crate::Error<&'a [Token]>>; + +/// The result of parsing a literal or evaluating an expression. +#[derive(Debug, Clone, PartialEq)] +#[allow(missing_docs)] +pub enum EvalResult { + Int(Wrapping<i64>), + Float(f64), + Char(CChar), + Str(Vec<u8>), + Invalid, +} + +macro_rules! result_opt ( + (fn $n:ident: $e:ident -> $t:ty) => ( + #[allow(dead_code)] + #[allow(clippy::wrong_self_convention)] + fn $n(self) -> Option<$t> { + if let EvalResult::$e(v) = self { + Some(v) + } else { + None + } + } + ); +); + +impl EvalResult { + result_opt!(fn as_int: Int -> Wrapping<i64>); + result_opt!(fn as_float: Float -> f64); + result_opt!(fn as_char: Char -> CChar); + result_opt!(fn as_str: Str -> Vec<u8>); + + #[allow(clippy::wrong_self_convention)] + fn as_numeric(self) -> Option<EvalResult> { + match self { + EvalResult::Int(_) | EvalResult::Float(_) => Some(self), + _ => None, + } + } +} + +impl From<Vec<u8>> for EvalResult { + fn from(s: Vec<u8>) -> EvalResult { + EvalResult::Str(s) + } +} + +// =========================================== +// ============= Clang tokens ================ +// =========================================== + +macro_rules! 
exact_token ( + ($k:ident, $c:expr) => ({ + move |input: &[Token]| { + if input.is_empty() { + let res: CResult<'_, &[u8]> = Err(crate::nom::Err::Incomplete(Needed::new($c.len()))); + res + } else { + if input[0].kind==TokenKind::$k && &input[0].raw[..]==$c { + Ok((&input[1..], &input[0].raw[..])) + } else { + Err(crate::nom::Err::Error((input, crate::ErrorKind::ExactToken(TokenKind::$k,$c)).into())) + } + } + } + }); +); + +fn identifier_token(input: &[Token]) -> CResult<'_, &[u8]> { + if input.is_empty() { + let res: CResult<'_, &[u8]> = Err(nom::Err::Incomplete(Needed::new(1))); + res + } else { + if input[0].kind == TokenKind::Identifier { + Ok((&input[1..], &input[0].raw[..])) + } else { + Err(crate::nom::Err::Error((input, crate::ErrorKind::TypedToken(TokenKind::Identifier)).into())) + } + } +} + +fn p(c: &'static str) -> impl Fn(&[Token]) -> CResult<'_, &[u8]> { + exact_token!(Punctuation, c.as_bytes()) +} + +fn one_of_punctuation(c: &'static [&'static str]) -> impl Fn(&[Token]) -> CResult<'_, &[u8]> { + move |input| { + if input.is_empty() { + let min = c + .iter() + .map(|opt| opt.len()) + .min() + .expect("at least one option"); + Err(crate::nom::Err::Incomplete(Needed::new(min))) + } else if input[0].kind == TokenKind::Punctuation + && c.iter().any(|opt| opt.as_bytes() == &input[0].raw[..]) + { + Ok((&input[1..], &input[0].raw[..])) + } else { + Err(crate::nom::Err::Error( + ( + input, + crate::ErrorKind::ExactTokens(TokenKind::Punctuation, c), + ) + .into(), + )) + } + } +} + +// ================================================== +// ============= Numeric expressions ================ +// ================================================== + +impl<'a> AddAssign<&'a EvalResult> for EvalResult { + fn add_assign(&mut self, rhs: &'a EvalResult) { + use self::EvalResult::*; + *self = match (&*self, rhs) { + (&Int(a), &Int(b)) => Int(a + b), + (&Float(a), &Int(b)) => Float(a + (b.0 as f64)), + (&Int(a), &Float(b)) => Float(a.0 as f64 + b), + (&Float(a), 
&Float(b)) => Float(a + b), + _ => Invalid, + }; + } +} +impl<'a> BitAndAssign<&'a EvalResult> for EvalResult { + fn bitand_assign(&mut self, rhs: &'a EvalResult) { + use self::EvalResult::*; + *self = match (&*self, rhs) { + (&Int(a), &Int(b)) => Int(a & b), + _ => Invalid, + }; + } +} +impl<'a> BitOrAssign<&'a EvalResult> for EvalResult { + fn bitor_assign(&mut self, rhs: &'a EvalResult) { + use self::EvalResult::*; + *self = match (&*self, rhs) { + (&Int(a), &Int(b)) => Int(a | b), + _ => Invalid, + }; + } +} +impl<'a> BitXorAssign<&'a EvalResult> for EvalResult { + fn bitxor_assign(&mut self, rhs: &'a EvalResult) { + use self::EvalResult::*; + *self = match (&*self, rhs) { + (&Int(a), &Int(b)) => Int(a ^ b), + _ => Invalid, + }; + } +} +impl<'a> DivAssign<&'a EvalResult> for EvalResult { + fn div_assign(&mut self, rhs: &'a EvalResult) { + use self::EvalResult::*; + *self = match (&*self, rhs) { + (&Int(a), &Int(b)) => Int(a / b), + (&Float(a), &Int(b)) => Float(a / (b.0 as f64)), + (&Int(a), &Float(b)) => Float(a.0 as f64 / b), + (&Float(a), &Float(b)) => Float(a / b), + _ => Invalid, + }; + } +} +impl<'a> MulAssign<&'a EvalResult> for EvalResult { + fn mul_assign(&mut self, rhs: &'a EvalResult) { + use self::EvalResult::*; + *self = match (&*self, rhs) { + (&Int(a), &Int(b)) => Int(a * b), + (&Float(a), &Int(b)) => Float(a * (b.0 as f64)), + (&Int(a), &Float(b)) => Float(a.0 as f64 * b), + (&Float(a), &Float(b)) => Float(a * b), + _ => Invalid, + }; + } +} +impl<'a> RemAssign<&'a EvalResult> for EvalResult { + fn rem_assign(&mut self, rhs: &'a EvalResult) { + use self::EvalResult::*; + *self = match (&*self, rhs) { + (&Int(a), &Int(b)) => Int(a % b), + (&Float(a), &Int(b)) => Float(a % (b.0 as f64)), + (&Int(a), &Float(b)) => Float(a.0 as f64 % b), + (&Float(a), &Float(b)) => Float(a % b), + _ => Invalid, + }; + } +} +impl<'a> ShlAssign<&'a EvalResult> for EvalResult { + fn shl_assign(&mut self, rhs: &'a EvalResult) { + use self::EvalResult::*; + *self = match 
(&*self, rhs) { + (&Int(a), &Int(b)) => Int(a << (b.0 as usize)), + _ => Invalid, + }; + } +} +impl<'a> ShrAssign<&'a EvalResult> for EvalResult { + fn shr_assign(&mut self, rhs: &'a EvalResult) { + use self::EvalResult::*; + *self = match (&*self, rhs) { + (&Int(a), &Int(b)) => Int(a >> (b.0 as usize)), + _ => Invalid, + }; + } +} +impl<'a> SubAssign<&'a EvalResult> for EvalResult { + fn sub_assign(&mut self, rhs: &'a EvalResult) { + use self::EvalResult::*; + *self = match (&*self, rhs) { + (&Int(a), &Int(b)) => Int(a - b), + (&Float(a), &Int(b)) => Float(a - (b.0 as f64)), + (&Int(a), &Float(b)) => Float(a.0 as f64 - b), + (&Float(a), &Float(b)) => Float(a - b), + _ => Invalid, + }; + } +} + +fn unary_op(input: (&[u8], EvalResult)) -> Option<EvalResult> { + use self::EvalResult::*; + assert_eq!(input.0.len(), 1); + match (input.0[0], input.1) { + (b'+', i) => Some(i), + (b'-', Int(i)) => Some(Int(Wrapping(i.0.wrapping_neg()))), // impl Neg for Wrapping not until rust 1.10... + (b'-', Float(i)) => Some(Float(-i)), + (b'-', _) => unreachable!("non-numeric unary op"), + (b'~', Int(i)) => Some(Int(!i)), + (b'~', Float(_)) => None, + (b'~', _) => unreachable!("non-numeric unary op"), + _ => unreachable!("invalid unary op"), + } +} + +fn numeric<I: Clone, E: nom::error::ParseError<I>, F>( + f: F, +) -> impl FnMut(I) -> nom::IResult<I, EvalResult, E> +where + F: FnMut(I) -> nom::IResult<I, EvalResult, E>, +{ + nom::combinator::map_opt(f, EvalResult::as_numeric) +} + +impl<'a> PRef<'a> { + fn unary(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + alt(( + delimited(p("("), |i| self.numeric_expr(i), p(")")), + numeric(|i| self.literal(i)), + numeric(|i| self.identifier(i)), + map_opt( + pair(one_of_punctuation(&["+", "-", "~"][..]), |i| self.unary(i)), + unary_op, + ), + ))(input) + } + + fn mul_div_rem(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + let (input, acc) = self.unary(input)?; + fold_many0( + pair(complete(one_of_punctuation(&["*", "/", 
"%"][..])), |i| { + self.unary(i) + }), + move || acc.clone(), + |mut acc, (op, val): (&[u8], EvalResult)| { + match op[0] as char { + '*' => acc *= &val, + '/' => acc /= &val, + '%' => acc %= &val, + _ => unreachable!(), + }; + acc + }, + )(input) + } + + fn add_sub(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + let (input, acc) = self.mul_div_rem(input)?; + fold_many0( + pair(complete(one_of_punctuation(&["+", "-"][..])), |i| { + self.mul_div_rem(i) + }), + move || acc.clone(), + |mut acc, (op, val): (&[u8], EvalResult)| { + match op[0] as char { + '+' => acc += &val, + '-' => acc -= &val, + _ => unreachable!(), + }; + acc + }, + )(input) + } + + fn shl_shr(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + let (input, acc) = self.add_sub(input)?; + numeric(fold_many0( + pair(complete(one_of_punctuation(&["<<", ">>"][..])), |i| { + self.add_sub(i) + }), + move || acc.clone(), + |mut acc, (op, val): (&[u8], EvalResult)| { + match op { + b"<<" => acc <<= &val, + b">>" => acc >>= &val, + _ => unreachable!(), + }; + acc + }, + ))(input) + } + + fn and(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + let (input, acc) = self.shl_shr(input)?; + numeric(fold_many0( + preceded(complete(p("&")), |i| self.shl_shr(i)), + move || acc.clone(), + |mut acc, val: EvalResult| { + acc &= &val; + acc + }, + ))(input) + } + + fn xor(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + let (input, acc) = self.and(input)?; + numeric(fold_many0( + preceded(complete(p("^")), |i| self.and(i)), + move || acc.clone(), + |mut acc, val: EvalResult| { + acc ^= &val; + acc + }, + ))(input) + } + + fn or(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + let (input, acc) = self.xor(input)?; + numeric(fold_many0( + preceded(complete(p("|")), |i| self.xor(i)), + move || acc.clone(), + |mut acc, val: EvalResult| { + acc |= &val; + acc + }, + ))(input) + } + + #[inline(always)] + fn numeric_expr(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + self.or(input) 
+ } +} + +// ======================================================= +// ============= Literals and identifiers ================ +// ======================================================= + +impl<'a> PRef<'a> { + fn identifier(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + match input.split_first() { + None => Err(Err::Incomplete(Needed::new(1))), + Some(( + &Token { + kind: TokenKind::Identifier, + ref raw, + }, + rest, + )) => { + if let Some(r) = self.identifiers.get(&raw[..]) { + Ok((rest, r.clone())) + } else { + Err(Err::Error( + (input, crate::ErrorKind::UnknownIdentifier).into(), + )) + } + } + Some(_) => Err(Err::Error( + (input, crate::ErrorKind::TypedToken(TokenKind::Identifier)).into(), + )), + } + } + + fn literal(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + match input.split_first() { + None => Err(Err::Incomplete(Needed::new(1))), + Some(( + &Token { + kind: TokenKind::Literal, + ref raw, + }, + rest, + )) => match literal::parse(raw) { + Ok((_, result)) => Ok((rest, result)), + _ => Err(Err::Error((input, crate::ErrorKind::InvalidLiteral).into())), + }, + Some(_) => Err(Err::Error( + (input, crate::ErrorKind::TypedToken(TokenKind::Literal)).into(), + )), + } + } + + fn string(self, input: &'_ [Token]) -> CResult<'_, Vec<u8>> { + alt(( + map_opt(|i| self.literal(i), EvalResult::as_str), + map_opt(|i| self.identifier(i), EvalResult::as_str), + ))(input) + .to_cexpr_result() + } + + // "string1" "string2" etc... 
+ fn concat_str(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + map( + pair(|i| self.string(i), many0(complete(|i| self.string(i)))), + |(first, v)| { + Vec::into_iter(v) + .fold(first, |mut s, elem| { + Vec::extend_from_slice(&mut s, Vec::<u8>::as_slice(&elem)); + s + }) + .into() + }, + )(input) + .to_cexpr_result() + } + + fn expr(self, input: &'_ [Token]) -> CResult<'_, EvalResult> { + alt(( + |i| self.numeric_expr(i), + delimited(p("("), |i| self.expr(i), p(")")), + |i| self.concat_str(i), + |i| self.literal(i), + |i| self.identifier(i), + ))(input) + .to_cexpr_result() + } + + fn macro_definition(self, input: &'_ [Token]) -> CResult<'_, (&'_ [u8], EvalResult)> { + pair(identifier_token, |i| self.expr(i))(input) + } +} + +impl<'a> ::std::ops::Deref for PRef<'a> { + type Target = IdentifierParser<'a>; + fn deref(&self) -> &IdentifierParser<'a> { + self.0 + } +} + +impl<'ident> IdentifierParser<'ident> { + fn as_ref(&self) -> PRef<'_> { + PRef(self) + } + + /// Create a new `IdentifierParser` with a set of known identifiers. When + /// a known identifier is encountered during parsing, it is substituted + /// for the value specified. + pub fn new(identifiers: &HashMap<Vec<u8>, EvalResult>) -> IdentifierParser<'_> { + IdentifierParser { identifiers } + } + + /// Parse and evaluate an expression of a list of tokens. + /// + /// Returns an error if the input is not a valid expression or if the token + /// stream contains comments, keywords or unknown identifiers. + pub fn expr<'a>(&self, input: &'a [Token]) -> CResult<'a, EvalResult> { + self.as_ref().expr(input) + } + + /// Parse and evaluate a macro definition from a list of tokens. + /// + /// Returns the identifier for the macro and its replacement evaluated as an + /// expression. The input should not include `#define`. 
+ /// + /// Returns an error if the replacement is not a valid expression, if called + /// on most function-like macros, or if the token stream contains comments, + /// keywords or unknown identifiers. + /// + /// N.B. This is intended to fail on function-like macros, but if it the + /// macro takes a single argument, the argument name is defined as an + /// identifier, and the macro otherwise parses as an expression, it will + /// return a result even on function-like macros. + /// + /// ```c + /// // will evaluate into IDENTIFIER + /// #define DELETE(IDENTIFIER) + /// // will evaluate into IDENTIFIER-3 + /// #define NEGATIVE_THREE(IDENTIFIER) -3 + /// ``` + pub fn macro_definition<'a>(&self, input: &'a [Token]) -> CResult<'a, (&'a [u8], EvalResult)> { + crate::assert_full_parse(self.as_ref().macro_definition(input)) + } +} + +/// Parse and evaluate an expression of a list of tokens. +/// +/// Returns an error if the input is not a valid expression or if the token +/// stream contains comments, keywords or identifiers. +pub fn expr(input: &[Token]) -> CResult<'_, EvalResult> { + IdentifierParser::new(&HashMap::new()).expr(input) +} + +/// Parse and evaluate a macro definition from a list of tokens. +/// +/// Returns the identifier for the macro and its replacement evaluated as an +/// expression. The input should not include `#define`. +/// +/// Returns an error if the replacement is not a valid expression, if called +/// on a function-like macro, or if the token stream contains comments, +/// keywords or identifiers. +pub fn macro_definition(input: &[Token]) -> CResult<'_, (&'_ [u8], EvalResult)> { + IdentifierParser::new(&HashMap::new()).macro_definition(input) +} + +/// Parse a functional macro declaration from a list of tokens. +/// +/// Returns the identifier for the macro and the argument list (in order). The +/// input should not include `#define`. The actual definition is not parsed and +/// may be obtained from the unparsed data returned. 
+/// +/// Returns an error if the input is not a functional macro or if the token +/// stream contains comments. +/// +/// # Example +/// ``` +/// use cexpr::expr::{IdentifierParser, EvalResult, fn_macro_declaration}; +/// use cexpr::assert_full_parse; +/// use cexpr::token::Kind::*; +/// use cexpr::token::Token; +/// +/// // #define SUFFIX(arg) arg "suffix" +/// let tokens = vec![ +/// (Identifier, &b"SUFFIX"[..]).into(), +/// (Punctuation, &b"("[..]).into(), +/// (Identifier, &b"arg"[..]).into(), +/// (Punctuation, &b")"[..]).into(), +/// (Identifier, &b"arg"[..]).into(), +/// (Literal, &br#""suffix""#[..]).into(), +/// ]; +/// +/// // Try to parse the functional part +/// let (expr, (ident, args)) = fn_macro_declaration(&tokens).unwrap(); +/// assert_eq!(ident, b"SUFFIX"); +/// +/// // Create dummy arguments +/// let idents = args.into_iter().map(|arg| +/// (arg.to_owned(), EvalResult::Str(b"test".to_vec())) +/// ).collect(); +/// +/// // Evaluate the macro +/// let (_, evaluated) = assert_full_parse(IdentifierParser::new(&idents).expr(expr)).unwrap(); +/// assert_eq!(evaluated, EvalResult::Str(b"testsuffix".to_vec())); +/// ``` +pub fn fn_macro_declaration(input: &[Token]) -> CResult<'_, (&[u8], Vec<&[u8]>)> { + pair( + identifier_token, + delimited( + p("("), + separated_list0(p(","), identifier_token), + p(")"), + ), + )(input) +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..5170f97 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,149 @@ +// (C) Copyright 2016 Jethro G. Beekman +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +//! A C expression parser and evaluator. +//! +//! This crate provides methods for parsing and evaluating simple C expressions. In general, the +//! 
crate can handle most arithmetic expressions that would appear in macros or the definition of +//! constants, as well as string and character constants. +//! +//! The main entry point is [`token::parse`], which parses a byte string and returns its +//! evaluated value. +#![warn(rust_2018_idioms)] +#![warn(missing_docs)] +#![allow(deprecated)] + +pub mod nom { + //! nom's result types, re-exported. + pub use nom::{error::ErrorKind, error::Error, Err, IResult, Needed}; +} +pub mod expr; +pub mod literal; +pub mod token; + +/// Parsing errors specific to C parsing +#[derive(Debug)] +pub enum ErrorKind { + /// Expected the specified token + ExactToken(token::Kind, &'static [u8]), + /// Expected one of the specified tokens + ExactTokens(token::Kind, &'static [&'static str]), + /// Expected a token of the specified kind + TypedToken(token::Kind), + /// An unknown identifier was encountered + UnknownIdentifier, + /// An invalid literal was encountered. + /// + /// When encountered, this generally means a bug exists in the data that + /// was passed in or the parsing logic. + InvalidLiteral, + /// A full parse was requested, but data was left over after parsing finished. + Partial, + /// An error occurred in an underlying nom parser. + Parser(nom::ErrorKind), +} + +impl From<nom::ErrorKind> for ErrorKind { + fn from(k: nom::ErrorKind) -> Self { + ErrorKind::Parser(k) + } +} + +impl From<u32> for ErrorKind { + fn from(_: u32) -> Self { + ErrorKind::InvalidLiteral + } +} + +/// Parsing errors specific to C parsing. +/// +/// This is a superset of `(I, nom::ErrorKind)` that includes the additional errors specified by +/// [`ErrorKind`]. +#[derive(Debug)] +pub struct Error<I> { + /// The remainder of the input stream at the time of the error. + pub input: I, + /// The error that occurred. 
+ pub error: ErrorKind, +} + +impl<I> From<(I, nom::ErrorKind)> for Error<I> { + fn from(e: (I, nom::ErrorKind)) -> Self { + Self::from((e.0, ErrorKind::from(e.1))) + } +} + +impl<I> From<(I, ErrorKind)> for Error<I> { + fn from(e: (I, ErrorKind)) -> Self { + Self { + input: e.0, + error: e.1, + } + } +} + +impl<I> From<::nom::error::Error<I>> for Error<I> { + fn from(e: ::nom::error::Error<I>) -> Self { + Self { + input: e.input, + error: e.code.into(), + } + } +} + +impl<I> ::nom::error::ParseError<I> for Error<I> { + fn from_error_kind(input: I, kind: nom::ErrorKind) -> Self { + Self { + input, + error: kind.into(), + } + } + + fn append(_: I, _: nom::ErrorKind, other: Self) -> Self { + other + } +} + +// in lieu of https://github.com/Geal/nom/issues/1010 +trait ToCexprResult<I, O> { + fn to_cexpr_result(self) -> nom::IResult<I, O, Error<I>>; +} +impl<I, O, E> ToCexprResult<I, O> for nom::IResult<I, O, E> +where + Error<I>: From<E>, +{ + fn to_cexpr_result(self) -> nom::IResult<I, O, Error<I>> { + match self { + Ok(v) => Ok(v), + Err(nom::Err::Incomplete(n)) => Err(nom::Err::Incomplete(n)), + Err(nom::Err::Error(e)) => Err(nom::Err::Error(e.into())), + Err(nom::Err::Failure(e)) => Err(nom::Err::Failure(e.into())), + } + } +} + +/// If the input result indicates a successful parse, but there is data left, +/// return an `ErrorKind::Partial` error instead. 
+pub fn assert_full_parse<'i, I: 'i, O, E>( + result: nom::IResult<&'i [I], O, E>, +) -> nom::IResult<&'i [I], O, Error<&'i [I]>> +where + Error<&'i [I]>: From<E>, +{ + match result.to_cexpr_result() { + Ok((rem, output)) => { + if rem.is_empty() { + Ok((rem, output)) + } else { + Err(nom::Err::Error((rem, ErrorKind::Partial).into())) + } + } + Err(nom::Err::Incomplete(n)) => Err(nom::Err::Incomplete(n)), + Err(nom::Err::Failure(e)) => Err(nom::Err::Failure(e)), + Err(nom::Err::Error(e)) => Err(nom::Err::Error(e)), + } +} diff --git a/src/literal.rs b/src/literal.rs new file mode 100644 index 0000000..68e85c7 --- /dev/null +++ b/src/literal.rs @@ -0,0 +1,361 @@ +// (C) Copyright 2016 Jethro G. Beekman +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +//! Parsing C literals from byte slices. +//! +//! This will parse a representation of a C literal into a Rust type. +//! +//! # characters +//! Character literals are stored into the `CChar` type, which can hold values +//! that are not valid Unicode code points. ASCII characters are represented as +//! `char`, literal bytes with the high byte set are converted into the raw +//! representation. Escape sequences are supported. If hex and octal escapes +//! map to an ASCII character, that is used, otherwise, the raw encoding is +//! used, including for values over 255. Unicode escapes are checked for +//! validity and mapped to `char`. Character sequences are not supported. Width +//! prefixes are ignored. +//! +//! # strings +//! Strings are interpreted as byte vectors. Escape sequences are supported. If +//! hex and octal escapes map onto multi-byte characters, they are truncated to +//! one 8-bit character. Unicode escapes are converted into their UTF-8 +//! 
encoding. Width prefixes are ignored. +//! +//! # integers +//! Integers are read into `i64`. Binary, octal, decimal and hexadecimal are +//! all supported. If the literal value is between `i64::MAX` and `u64::MAX`, +//! it is bit-cast to `i64`. Values over `u64::MAX` cannot be parsed. Width and +//! sign suffixes are ignored. Sign prefixes are not supported. +//! +//! # real numbers +//! Reals are read into `f64`. Width suffixes are ignored. Sign prefixes are +//! not supported in the significand. Hexadecimal floating points are not +//! supported. + +use std::char; +use std::str::{self, FromStr}; + +use nom::branch::alt; +use nom::bytes::complete::is_not; +use nom::bytes::complete::tag; +use nom::character::complete::{char, one_of}; +use nom::combinator::{complete, map, map_opt, opt, recognize}; +use nom::multi::{fold_many0, many0, many1, many_m_n}; +use nom::sequence::{delimited, pair, preceded, terminated, tuple}; +use nom::*; + +use crate::expr::EvalResult; +use crate::ToCexprResult; + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +/// Representation of a C character +pub enum CChar { + /// A character that can be represented as a `char` + Char(char), + /// Any other character (8-bit characters, unicode surrogates, etc.) + Raw(u64), +} + +impl From<u8> for CChar { + fn from(i: u8) -> CChar { + match i { + 0..=0x7f => CChar::Char(i as u8 as char), + _ => CChar::Raw(i as u64), + } + } +} + +// A non-allocating version of this would be nice... 
+impl std::convert::Into<Vec<u8>> for CChar { + fn into(self) -> Vec<u8> { + match self { + CChar::Char(c) => { + let mut s = String::with_capacity(4); + s.extend(&[c]); + s.into_bytes() + } + CChar::Raw(i) => { + let mut v = Vec::with_capacity(1); + v.push(i as u8); + v + } + } + } +} + +/// ensures the child parser consumes the whole input +pub fn full<I: Clone, O, F>( + f: F, +) -> impl Fn(I) -> nom::IResult<I, O> +where + I: nom::InputLength, + F: Fn(I) -> nom::IResult<I, O>, +{ + move |input| { + let res = f(input); + match res { + Ok((i, o)) => { + if i.input_len() == 0 { + Ok((i, o)) + } else { + Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::Complete))) + } + } + r => r, + } + } +} + +// ================================= +// ======== matching digits ======== +// ================================= + +macro_rules! byte { + ($($p: pat)|* ) => {{ + fn parser(i: &[u8]) -> crate::nom::IResult<&[u8], u8> { + match i.split_first() { + $(Some((&c @ $p,rest)))|* => Ok((rest,c)), + Some(_) => Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::OneOf))), + None => Err(nom::Err::Incomplete(Needed::new(1))), + } + } + + parser + }} +} + +fn binary(i: &[u8]) -> nom::IResult<&[u8], u8> { + byte!(b'0'..=b'1')(i) +} + +fn octal(i: &[u8]) -> nom::IResult<&[u8], u8> { + byte!(b'0'..=b'7')(i) +} + +fn decimal(i: &[u8]) -> nom::IResult<&[u8], u8> { + byte!(b'0'..=b'9')(i) +} + +fn hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8> { + byte!(b'0' ..= b'9' | b'a' ..= b'f' | b'A' ..= b'F')(i) +} + +// ======================================== +// ======== characters and strings ======== +// ======================================== + +fn escape2char(c: char) -> CChar { + CChar::Char(match c { + 'a' => '\x07', + 'b' => '\x08', + 'f' => '\x0c', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'v' => '\x0b', + _ => unreachable!("invalid escape {}", c), + }) +} + +fn c_raw_escape(n: Vec<u8>, radix: u32) -> Option<CChar> { + str::from_utf8(&n) + .ok() + 
.and_then(|i| u64::from_str_radix(i, radix).ok()) + .map(|i| match i { + 0..=0x7f => CChar::Char(i as u8 as char), + _ => CChar::Raw(i), + }) +} + +fn c_unicode_escape(n: Vec<u8>) -> Option<CChar> { + str::from_utf8(&n) + .ok() + .and_then(|i| u32::from_str_radix(i, 16).ok()) + .and_then(char::from_u32) + .map(CChar::Char) +} + +fn escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { + preceded( + char('\\'), + alt(( + map(one_of(r#"'"?\"#), CChar::Char), + map(one_of("abfnrtv"), escape2char), + map_opt(many_m_n(1, 3, octal), |v| c_raw_escape(v, 8)), + map_opt(preceded(char('x'), many1(hexadecimal)), |v| { + c_raw_escape(v, 16) + }), + map_opt( + preceded(char('u'), many_m_n(4, 4, hexadecimal)), + c_unicode_escape, + ), + map_opt( + preceded(char('U'), many_m_n(8, 8, hexadecimal)), + c_unicode_escape, + ), + )), + )(i) +} + +fn c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]> { + alt((tag("u8"), tag("u"), tag("U"), tag("L")))(i) +} + +fn c_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { + delimited( + terminated(opt(c_width_prefix), char('\'')), + alt(( + escaped_char, + map(byte!(0 ..= 91 /* \=92 */ | 93 ..= 255), CChar::from), + )), + char('\''), + )(i) +} + +fn c_string(i: &[u8]) -> nom::IResult<&[u8], Vec<u8>> { + delimited( + alt((preceded(c_width_prefix, char('"')), char('"'))), + fold_many0( + alt(( + map(escaped_char, |c: CChar| c.into()), + map(is_not([b'\\', b'"']), |c: &[u8]| c.into()), + )), + Vec::new, + |mut v: Vec<u8>, res: Vec<u8>| { + v.extend_from_slice(&res); + v + }, + ), + char('"'), + )(i) +} + +// ================================ +// ======== parse integers ======== +// ================================ + +fn c_int_radix(n: Vec<u8>, radix: u32) -> Option<u64> { + str::from_utf8(&n) + .ok() + .and_then(|i| u64::from_str_radix(i, radix).ok()) +} + +fn take_ul(input: &[u8]) -> IResult<&[u8], &[u8]> { + let r = input.split_at_position(|c| c != b'u' && c != b'U' && c != b'l' && c != b'L'); + match r { + Err(Err::Incomplete(_)) => 
Ok((&input[input.len()..], input)), + res => res, + } +} + +fn c_int(i: &[u8]) -> nom::IResult<&[u8], i64> { + map( + terminated( + alt(( + map_opt(preceded(tag("0x"), many1(complete(hexadecimal))), |v| { + c_int_radix(v, 16) + }), + map_opt(preceded(tag("0X"), many1(complete(hexadecimal))), |v| { + c_int_radix(v, 16) + }), + map_opt(preceded(tag("0b"), many1(complete(binary))), |v| { + c_int_radix(v, 2) + }), + map_opt(preceded(tag("0B"), many1(complete(binary))), |v| { + c_int_radix(v, 2) + }), + map_opt(preceded(char('0'), many1(complete(octal))), |v| { + c_int_radix(v, 8) + }), + map_opt(many1(complete(decimal)), |v| c_int_radix(v, 10)), + |input| Err(crate::nom::Err::Error(nom::error::Error::new(input, crate::nom::ErrorKind::Fix))), + )), + opt(take_ul), + ), + |i| i as i64, + )(i) +} + +// ============================== +// ======== parse floats ======== +// ============================== + +fn float_width(i: &[u8]) -> nom::IResult<&[u8], u8> { + nom::combinator::complete(byte!(b'f' | b'l' | b'F' | b'L'))(i) +} + +fn float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option<u8>, Vec<u8>)> { + preceded( + byte!(b'e' | b'E'), + pair(opt(byte!(b'-' | b'+')), many1(complete(decimal))), + )(i) +} + +fn c_float(i: &[u8]) -> nom::IResult<&[u8], f64> { + map_opt( + alt(( + terminated( + recognize(tuple(( + many1(complete(decimal)), + byte!(b'.'), + many0(complete(decimal)), + ))), + opt(float_width), + ), + terminated( + recognize(tuple(( + many0(complete(decimal)), + byte!(b'.'), + many1(complete(decimal)), + ))), + opt(float_width), + ), + terminated( + recognize(tuple(( + many0(complete(decimal)), + opt(byte!(b'.')), + many1(complete(decimal)), + float_exp, + ))), + opt(float_width), + ), + terminated( + recognize(tuple(( + many1(complete(decimal)), + opt(byte!(b'.')), + many0(complete(decimal)), + float_exp, + ))), + opt(float_width), + ), + terminated(recognize(many1(complete(decimal))), float_width), + )), + |v| str::from_utf8(v).ok().and_then(|i| 
f64::from_str(i).ok()), + )(i) +} + +// ================================ +// ======== main interface ======== +// ================================ + +fn one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>> { + alt(( + map(full(c_char), EvalResult::Char), + map(full(c_int), |i| EvalResult::Int(::std::num::Wrapping(i))), + map(full(c_float), EvalResult::Float), + map(full(c_string), EvalResult::Str), + ))(input) + .to_cexpr_result() +} + +/// Parse a C literal. +/// +/// The input must contain exactly the representation of a single literal +/// token, and in particular no whitespace or sign prefixes. +pub fn parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>> { + crate::assert_full_parse(one_literal(input)) +} diff --git a/src/token.rs b/src/token.rs new file mode 100644 index 0000000..dbc5949 --- /dev/null +++ b/src/token.rs @@ -0,0 +1,44 @@ +// (C) Copyright 2016 Jethro G. Beekman +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +//! Representation of a C token +//! +//! This is designed to map onto a libclang CXToken. + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[allow(missing_docs)] +pub enum Kind { + Punctuation, + Keyword, + Identifier, + Literal, + Comment, +} + +/// A single token in a C expression. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Token { + /// The type of this token. + pub kind: Kind, + /// The bytes that make up the token. 
+ pub raw: Box<[u8]>, +} + +impl<'a> From<(Kind, &'a [u8])> for Token { + fn from((kind, value): (Kind, &'a [u8])) -> Token { + Token { + kind, + raw: value.to_owned().into_boxed_slice(), + } + } +} + +/// Remove all comment tokens from a vector of tokens +pub fn remove_comments(v: &mut Vec<Token>) -> &mut Vec<Token> { + v.retain(|t| t.kind != Kind::Comment); + v +} diff --git a/tests/clang.rs b/tests/clang.rs new file mode 100644 index 0000000..b2484f0 --- /dev/null +++ b/tests/clang.rs @@ -0,0 +1,339 @@ +// (C) Copyright 2016 Jethro G. Beekman +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. +extern crate cexpr; +extern crate clang_sys; + +use std::collections::HashMap; +use std::io::Write; +use std::str::{self, FromStr}; +use std::{char, ffi, mem, ptr, slice}; + +use cexpr::assert_full_parse; +use cexpr::expr::{fn_macro_declaration, EvalResult, IdentifierParser}; +use cexpr::literal::CChar; +use cexpr::token::Token; +use clang_sys::*; + +// main testing routine +fn test_definition( + ident: Vec<u8>, + tokens: &[Token], + idents: &mut HashMap<Vec<u8>, EvalResult>, +) -> bool { + fn bytes_to_int(value: &[u8]) -> Option<EvalResult> { + str::from_utf8(value) + .ok() + .map(|s| s.replace("n", "-")) + .map(|s| s.replace("_", "")) + .and_then(|v| i64::from_str(&v).ok()) + .map(::std::num::Wrapping) + .map(Int) + } + + use cexpr::expr::EvalResult::*; + + let display_name = String::from_utf8_lossy(&ident).into_owned(); + + let functional; + let test = { + // Split name such as Str_test_string into (Str,test_string) + let pos = ident + .iter() + .position(|c| *c == b'_') + .expect(&format!("Invalid definition in testcase: {}", display_name)); + let mut expected = &ident[..pos]; + let mut value = &ident[(pos + 1)..]; 
+ + functional = expected == b"Fn"; + + if functional { + let ident = value; + let pos = ident + .iter() + .position(|c| *c == b'_') + .expect(&format!("Invalid definition in testcase: {}", display_name)); + expected = &ident[..pos]; + value = &ident[(pos + 1)..]; + } + + if expected == b"Str" { + let mut splits = value.split(|c| *c == b'U'); + let mut s = Vec::with_capacity(value.len()); + s.extend_from_slice(splits.next().unwrap()); + for split in splits { + let (chr, rest) = split.split_at(6); + let chr = u32::from_str_radix(str::from_utf8(chr).unwrap(), 16).unwrap(); + write!(s, "{}", char::from_u32(chr).unwrap()).unwrap(); + s.extend_from_slice(rest); + } + Some(Str(s)) + } else if expected == b"Int" { + bytes_to_int(value) + } else if expected == b"Float" { + str::from_utf8(value) + .ok() + .map(|s| s.replace("n", "-").replace("p", ".")) + .and_then(|v| f64::from_str(&v).ok()) + .map(Float) + } else if expected == b"CharRaw" { + str::from_utf8(value) + .ok() + .and_then(|v| u64::from_str(v).ok()) + .map(CChar::Raw) + .map(Char) + } else if expected == b"CharChar" { + str::from_utf8(value) + .ok() + .and_then(|v| u32::from_str(v).ok()) + .and_then(char::from_u32) + .map(CChar::Char) + .map(Char) + } else { + Some(Invalid) + } + .expect(&format!("Invalid definition in testcase: {}", display_name)) + }; + + let result = if functional { + let mut fnidents; + let expr_tokens; + match fn_macro_declaration(&tokens) { + Ok((rest, (_, args))) => { + fnidents = idents.clone(); + expr_tokens = rest; + for arg in args { + let val = match test { + Int(_) => bytes_to_int(&arg), + Str(_) => Some(Str(arg.to_owned())), + _ => unimplemented!(), + } + .expect(&format!( + "Invalid argument in functional macro testcase: {}", + display_name + )); + fnidents.insert(arg.to_owned(), val); + } + } + e => { + println!( + "Failed test for {}, unable to parse functional macro declaration: {:?}", + display_name, e + ); + return false; + } + } + 
assert_full_parse(IdentifierParser::new(&fnidents).expr(&expr_tokens)) + } else { + IdentifierParser::new(idents) + .macro_definition(&tokens) + .map(|(i, (_, val))| (i, val)) + }; + + match result { + Ok((_, val)) => { + if val == test { + if let Some(_) = idents.insert(ident, val) { + panic!("Duplicate definition for testcase: {}", display_name); + } + true + } else { + println!( + "Failed test for {}, expected {:?}, got {:?}", + display_name, test, val + ); + false + } + } + e => { + if test == Invalid { + true + } else { + println!( + "Failed test for {}, expected {:?}, got {:?}", + display_name, test, e + ); + false + } + } + } +} + +// support code for the clang lexer +unsafe fn clang_str_to_vec(s: CXString) -> Vec<u8> { + let vec = ffi::CStr::from_ptr(clang_getCString(s)) + .to_bytes() + .to_owned(); + clang_disposeString(s); + vec +} + +#[allow(non_upper_case_globals)] +unsafe fn token_clang_to_cexpr(tu: CXTranslationUnit, orig: &CXToken) -> Token { + Token { + kind: match clang_getTokenKind(*orig) { + CXToken_Comment => cexpr::token::Kind::Comment, + CXToken_Identifier => cexpr::token::Kind::Identifier, + CXToken_Keyword => cexpr::token::Kind::Keyword, + CXToken_Literal => cexpr::token::Kind::Literal, + CXToken_Punctuation => cexpr::token::Kind::Punctuation, + _ => panic!("invalid token kind: {:?}", *orig), + }, + raw: clang_str_to_vec(clang_getTokenSpelling(tu, *orig)).into_boxed_slice(), + } +} + +extern "C" fn visit_children_thunk<F>( + cur: CXCursor, + parent: CXCursor, + closure: CXClientData, +) -> CXChildVisitResult +where + F: FnMut(CXCursor, CXCursor) -> CXChildVisitResult, +{ + unsafe { (&mut *(closure as *mut F))(cur, parent) } +} + +unsafe fn visit_children<F>(cursor: CXCursor, mut f: F) +where + F: FnMut(CXCursor, CXCursor) -> CXChildVisitResult, +{ + clang_visitChildren( + cursor, + visit_children_thunk::<F> as _, + &mut f as *mut F as CXClientData, + ); +} + +unsafe fn location_in_scope(r: CXSourceRange) -> bool { + let start = 
clang_getRangeStart(r); + let mut file = ptr::null_mut(); + clang_getSpellingLocation( + start, + &mut file, + ptr::null_mut(), + ptr::null_mut(), + ptr::null_mut(), + ); + clang_Location_isFromMainFile(start) != 0 + && clang_Location_isInSystemHeader(start) == 0 + && file != ptr::null_mut() +} + +/// tokenize_range_adjust can be used to work around LLVM bug 9069 +/// https://bugs.llvm.org//show_bug.cgi?id=9069 +fn file_visit_macros<F: FnMut(Vec<u8>, Vec<Token>)>( + file: &str, + tokenize_range_adjust: bool, + mut visitor: F, +) { + unsafe { + let tu = { + let index = clang_createIndex(true as _, false as _); + let cfile = ffi::CString::new(file).unwrap(); + let mut tu = mem::MaybeUninit::uninit(); + assert!( + clang_parseTranslationUnit2( + index, + cfile.as_ptr(), + [b"-std=c11\0".as_ptr() as *const ::std::os::raw::c_char].as_ptr(), + 1, + ptr::null_mut(), + 0, + CXTranslationUnit_DetailedPreprocessingRecord, + &mut *tu.as_mut_ptr() + ) == CXError_Success, + "Failure reading test case {}", + file + ); + tu.assume_init() + }; + visit_children(clang_getTranslationUnitCursor(tu), |cur, _parent| { + if cur.kind == CXCursor_MacroDefinition { + let mut range = clang_getCursorExtent(cur); + if !location_in_scope(range) { + return CXChildVisit_Continue; + } + range.end_int_data -= if tokenize_range_adjust { 1 } else { 0 }; + let mut token_ptr = ptr::null_mut(); + let mut num = 0; + clang_tokenize(tu, range, &mut token_ptr, &mut num); + if token_ptr != ptr::null_mut() { + let tokens = slice::from_raw_parts(token_ptr, num as usize); + let tokens: Vec<_> = tokens + .iter() + .filter_map(|t| { + if clang_getTokenKind(*t) != CXToken_Comment { + Some(token_clang_to_cexpr(tu, t)) + } else { + None + } + }) + .collect(); + clang_disposeTokens(tu, token_ptr, num); + visitor(clang_str_to_vec(clang_getCursorSpelling(cur)), tokens) + } + } + CXChildVisit_Continue + }); + clang_disposeTranslationUnit(tu); + }; +} + +fn test_file(file: &str) -> bool { + let mut idents = 
HashMap::new(); + let mut all_succeeded = true; + file_visit_macros(file, fix_bug_9069(), |ident, tokens| { + all_succeeded &= test_definition(ident, &tokens, &mut idents) + }); + all_succeeded +} + +fn fix_bug_9069() -> bool { + fn check_bug_9069() -> bool { + let mut token_sets = vec![]; + file_visit_macros( + "tests/input/test_llvm_bug_9069.h", + false, + |ident, tokens| { + assert_eq!(&ident, b"A"); + token_sets.push(tokens); + }, + ); + assert_eq!(token_sets.len(), 2); + token_sets[0] != token_sets[1] + } + + use std::sync::atomic::{AtomicBool, Ordering}; + use std::sync::Once; + + static CHECK_FIX: Once = Once::new(); + static FIX: AtomicBool = AtomicBool::new(false); + + CHECK_FIX.call_once(|| FIX.store(check_bug_9069(), Ordering::SeqCst)); + + FIX.load(Ordering::SeqCst) +} + +macro_rules! test_file { + ($f:ident) => { + #[test] + fn $f() { + assert!( + test_file(concat!("tests/input/", stringify!($f), ".h")), + "test_file" + ) + } + }; +} + +test_file!(floats); +test_file!(chars); +test_file!(strings); +test_file!(int_signed); +test_file!(int_unsigned); +test_file!(fail); diff --git a/tests/input/chars.h b/tests/input/chars.h new file mode 100644 index 0000000..45351d3 --- /dev/null +++ b/tests/input/chars.h @@ -0,0 +1,3 @@ +#define CharChar_65 'A' +#define CharChar_127849 '\U0001f369' // 🍩 +#define CharRaw_255 U'\xff' diff --git a/tests/input/fail.h b/tests/input/fail.h new file mode 100644 index 0000000..fd416bc --- /dev/null +++ b/tests/input/fail.h @@ -0,0 +1,9 @@ +#define FAIL_function_like(x) 3 +#define FAIL_empty +#define FAIL_invalid_for_radix 0b2 +#define FAIL_shift_by_float 3<<1f +#define FAIL_unknown_identifier UNKNOWN +#define Int_0 0 +#define Str_str "str" +#define FAIL_concat_integer "test" Str_str Int_0 +#define FAIL_too_large_int 18446744073709551616 diff --git a/tests/input/floats.h b/tests/input/floats.h new file mode 100644 index 0000000..61942cf --- /dev/null +++ b/tests/input/floats.h @@ -0,0 +1,8 @@ +#define Float_0 0. 
+#define Float_1 1f +#define Float_p1 .1 +#define Float_2 2.0 +#define Float_1000 1e3 +#define Float_2000 2e+3 +#define Float_p001 1e-3 +#define Float_80 10.0*(1<<3) diff --git a/tests/input/int_signed.h b/tests/input/int_signed.h new file mode 100644 index 0000000..65854a6 --- /dev/null +++ b/tests/input/int_signed.h @@ -0,0 +1,3 @@ +#define Int_n3 -(-(-3)) +#define Int_n5 -3-2 +#define Int_n9223372036854775808 -9223372036854775808 diff --git a/tests/input/int_unsigned.h b/tests/input/int_unsigned.h new file mode 100644 index 0000000..6663dda --- /dev/null +++ b/tests/input/int_unsigned.h @@ -0,0 +1,29 @@ +#define Int_456 456 +#define Int_0 0 +#define Int_1 0b1 +#define Int_2 0x2 +#define Int_3 3L +#define Int_4 0X4 +#define Int_5 0B101 +#define Int_63 077 +#define Int_123 123 +#define Int_124 124u +#define Int_125 125uL +#define Int_126 126LuL +#define Int_16 (((1)<<4ULL))/*comment*/ +#define Int_13 1|8^6&2<<1 + +#define Int_47 32|15 +#define Int_38 (32|15)^9 +#define Int_6 ((32|15)^9)&7 +#define Int_12 (((32|15)^9)&7)<<1 +#define Int_17 ((((32|15)^9)&7)<<1)+5 +#define Int_15 (((((32|15)^9)&7)<<1)+5)-2 +#define Int_60 ((((((32|15)^9)&7)<<1)+5)-2)*4 +#define Int_30 (((((((32|15)^9)&7)<<1)+5)-2)*4)/2 +#define Int_39 32|15^9&7<<1+5-2*4/2 + +#define Int_n1 18446744073709551615 /*2^64-1*/ +#define Int_n9223372036854775808 9223372036854775808 + +#define Fn_Int_9(_3) _3*3 diff --git a/tests/input/strings.h b/tests/input/strings.h new file mode 100644 index 0000000..d01d409 --- /dev/null +++ b/tests/input/strings.h @@ -0,0 +1,17 @@ +#define Str_ "" +#define Str_str "str" +#define Str_unicode u"unicode" +#define Str_long L"long" +#define Str_concat u"con" L"cat" +#define Str_concat_parens ("concat" U"_parens") +#define Str_concat_identifier (Str_concat L"_identifier") +#define Str_hex_escape_all "\x68\x65\x78\x5f\x65\x73\x63\x61\x70\x65\x5f\x61\x6c\x6c" +#define Str_hex_escape_hex "h\x65x_\x65s\x63\x61p\x65_h\x65x" +#define Str_quote_U000022_escape "quote_\"_escape" 
+#define Str_Fly_away_in_my_space_U01F680_You_no_need_put_U01F4B5_in_my_pocket \ + u8"Fly_away_in_my_space_🚀_You_no_need_put_💵_in_my_pocket" +#define Fn_Str_no_args() "no_args" +#define Fn_Str_no_args_concat() "no_args_" Str_concat +#define Fn_Str_prepend_arg(arg) "prepend_" arg +#define Fn_Str_two_args(two, args) two "_" args +#define Fn_Str_three_args(three, _, args) three _ args diff --git a/tests/input/test_llvm_bug_9069.h b/tests/input/test_llvm_bug_9069.h new file mode 100644 index 0000000..a92374e --- /dev/null +++ b/tests/input/test_llvm_bug_9069.h @@ -0,0 +1,4 @@ +// The following two definitions should yield the same list of tokens. +// If https://bugs.llvm.org//show_bug.cgi?id=9069 is not fixed, they don't. +#define A 1 +#define A 1 |