/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// This is a lexer that runs off the tokenizer and outputs the tokens to a
// grammar matcher. The tokens it forwards are the same as the ones produced
// by the tokenizer, but possibly further split and normalized (downcased).
// Examples:
//
//    - single character tokens for punctuation (e.g., AddTerminal("?"))
//
//    - a string of letters (e.g., "Foo" -- it calls AddTerminal() on "foo")
//
//    - a string of digits (e.g., AddTerminal("37"))
//
// In addition to the terminal tokens above, it also outputs certain
// special nonterminals:
//
//    - a <token> nonterminal, which it outputs in addition to the
//      regular AddTerminal() call for every token
//
//    - a <digits> nonterminal, which it outputs in addition to
//      the regular AddTerminal() call for each string of digits
//
//    - <N_digits> nonterminals, where N is the length of the string of
//      digits. By default the maximum N that will be output is 20. This
//      may be changed at compile time by kMaxNDigitsLength. For instance,
//      "123" will produce a <3_digits> nonterminal, "1234567" will produce
//      a <7_digits> nonterminal.
//
// It does not output any whitespace. Instead, whitespace gets absorbed into
// the token that follows it in the text.
// For example, if the text contains:
//
//      ...hello                       there        world...
//              |                      |            |
//              offset=16              39           52
//
// then the output will be:
//
//      "hello" [?, 16)
//      "there" [16, 44)      <-- note "16" NOT "39"
//      "world" [44, ?)
<-- note "44" NOT "52" // // This makes it appear to the Matcher as if the tokens are adjacent -- so // whitespace is simply ignored. // // A minor optimization: We don't bother to output nonterminals if the grammar // rules don't reference them. #ifndef LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_ #define LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_ #include "annotator/types.h" #include "utils/grammar/matcher.h" #include "utils/grammar/rules_generated.h" #include "utils/grammar/types.h" #include "utils/strings/stringpiece.h" #include "utils/utf8/unicodetext.h" #include "utils/utf8/unilib.h" namespace libtextclassifier3::grammar { class Lexer { public: explicit Lexer(const UniLib* unilib, const RulesSet* rules); // Processes a tokenized text. Classifies the tokens and feeds them to the // matcher. // The provided annotations will be fed to the matcher alongside the tokens. // NOTE: The `annotations` need to outlive any dependent processing. void Process(const UnicodeText& text, const std::vector& tokens, const std::vector* annotations, Matcher* matcher) const; void Process(const UnicodeText& text, const std::vector::const_iterator& begin, const std::vector::const_iterator& end, const std::vector* annotations, Matcher* matcher) const; private: // A lexical symbol with an identified meaning that represents raw tokens, // token categories or predefined text matches. // It is the unit fed to the grammar matcher. struct Symbol { // The type of the lexical symbol. enum class Type { // A raw token. TYPE_TERM, // A symbol representing a string of digits. TYPE_DIGITS, // Punctuation characters. TYPE_PUNCTUATION, // A predefined match. TYPE_MATCH }; explicit Symbol() = default; // Constructs a symbol of a given type with an anchor in the text. 
Symbol(const Type type, const CodepointSpan codepoint_span, const int match_offset, StringPiece lexeme) : type(type), codepoint_span(codepoint_span), match_offset(match_offset), lexeme(lexeme) {} // Constructs a symbol from a pre-defined match. explicit Symbol(Match* match) : type(Type::TYPE_MATCH), codepoint_span(match->codepoint_span), match_offset(match->match_offset), match(match) {} // The type of the symbole. Type type; // The span in the text as codepoint offsets. CodepointSpan codepoint_span; // The match start offset (including preceding whitespace) as codepoint // offset. int match_offset; // The symbol text value. StringPiece lexeme; // The predefined match. Match* match; }; // Processes a single token: the token is split and classified into symbols. void ProcessToken(const StringPiece value, const int prev_token_end, const CodepointSpan codepoint_span, std::vector* symbols) const; // Emits a token to the matcher. void Emit(const Symbol& symbol, const RulesSet_::Nonterminals* nonterms, Matcher* matcher) const; // Gets the type of a character. Symbol::Type GetSymbolType(const UnicodeText::const_iterator& it) const; private: struct RegexAnnotator { std::unique_ptr pattern; Nonterm nonterm; }; // Uncompress and build the defined regex annotators. std::vector BuildRegexAnnotator(const UniLib& unilib, const RulesSet* rules) const; const UniLib& unilib_; const RulesSet* rules_; std::vector regex_annotators_; }; } // namespace libtextclassifier3::grammar #endif // LIBTEXTCLASSIFIER_UTILS_GRAMMAR_LEXER_H_