/* * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCH_H_ #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCH_H_ #include #include #include "annotator/types.h" #include "utils/grammar/types.h" #include "utils/strings/stringpiece.h" namespace libtextclassifier3::grammar { // Represents a single match that was found for a particular nonterminal. // Instances should be created by calling Matcher::AllocateMatch(). // This uses an arena to allocate matches (and subclasses thereof). struct Match { static constexpr int16 kUnknownType = 0; static constexpr int16 kTokenType = -1; static constexpr int16 kDigitsType = -2; static constexpr int16 kBreakType = -3; static constexpr int16 kAssertionMatch = -4; static constexpr int16 kMappingMatch = -5; static constexpr int16 kExclusionMatch = -6; static constexpr int16 kAnnotationMatch = -7; void Init(const Nonterm arg_lhs, const CodepointSpan arg_codepoint_span, const int arg_match_offset, const int arg_type = kUnknownType) { lhs = arg_lhs; codepoint_span = arg_codepoint_span; match_offset = arg_match_offset; type = arg_type; rhs1 = nullptr; rhs2 = nullptr; } void Init(const Match& other) { *this = other; } // For binary rule matches: rhs1 != NULL and rhs2 != NULL // unary rule matches: rhs1 == NULL and rhs2 != NULL // terminal rule matches: rhs1 != NULL and rhs2 == NULL // custom leaves: rhs1 == NULL and rhs2 == NULL bool IsInteriorNode() const { return rhs2 != nullptr; } bool IsLeaf() const { return !rhs2; } bool IsBinaryRule() const { return rhs1 && rhs2; } bool IsUnaryRule() const { return !rhs1 && rhs2; } bool IsTerminalRule() const { return rhs1 && !rhs2; } bool HasLeadingWhitespace() const { return codepoint_span.first != match_offset; } const Match* unary_rule_rhs() const { return rhs2; } // Used in singly-linked queue of matches for processing. Match* next = nullptr; // Nonterminal we found a match for. Nonterm lhs = kUnassignedNonterm; // Type of the match. int16 type = kUnknownType; // The span in codepoints. CodepointSpan codepoint_span; // The begin codepoint offset used during matching. // This is usually including any prefix whitespace. int match_offset; union { // The first sub match for binary rules. const Match* rhs1 = nullptr; // The terminal, for terminal rules. const char* terminal; }; // First or second sub-match for interior nodes. const Match* rhs2 = nullptr; }; // Match type to keep track of associated values. struct MappingMatch : public Match { // The associated id or value. int64 id; }; // Match type to keep track of assertions. struct AssertionMatch : public Match { // If true, the assertion is negative and will be valid if the input doesn't // match. bool negative; }; // Match type to define exclusions. struct ExclusionMatch : public Match { // The nonterminal that denotes matches to exclude from a successful match. // So the match is only valid if there is no match of `exclusion_nonterm` // spanning the same text range. Nonterm exclusion_nonterm; }; // Match to represent an annotator annotated span in the grammar. struct AnnotationMatch : public Match { const ClassificationResult* annotation; }; // Utility functions for parse tree traversal. // Does a preorder traversal, calling `node_fn` on each node. // `node_fn` is expected to return whether to continue expanding a node. void Traverse(const Match* root, const std::function& node_fn); // Does a preorder traversal, calling `pred_fn` and returns the first node // on which `pred_fn` returns true. const Match* SelectFirst(const Match* root, const std::function& pred_fn); // Does a preorder traversal, selecting all nodes where `pred_fn` returns true. std::vector SelectAll( const Match* root, const std::function& pred_fn); // Selects all terminals from a parse tree. inline std::vector SelectTerminals(const Match* root) { return SelectAll(root, &Match::IsTerminalRule); } // Selects all leaves from a parse tree. inline std::vector SelectLeaves(const Match* root) { return SelectAll(root, &Match::IsLeaf); } // Retrieves the first child node of a given type. template const T* SelectFirstOfType(const Match* root, const int16 type) { return static_cast(SelectFirst( root, [type](const Match* node) { return node->type == type; })); } // Retrieves all nodes of a given type. template const std::vector SelectAllOfType(const Match* root, const int16 type) { std::vector result; Traverse(root, [&result, type](const Match* node) { if (node->type == type) { result.push_back(static_cast(node)); } return true; }); return result; } } // namespace libtextclassifier3::grammar #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_MATCH_H_