//
// Copyright (C) 2018 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

include "annotator/entity-data.fbs";
include "annotator/experimental/experimental.fbs";
include "annotator/grammar/dates/dates.fbs";
include "utils/codepoint-range.fbs";
include "utils/flatbuffers.fbs";
include "utils/grammar/rules.fbs";
include "utils/intents/intent-config.fbs";
include "utils/normalization.fbs";
include "utils/resources.fbs";
include "utils/tokenizer.fbs";
include "utils/zlib/buffer.fbs";

file_identifier "TC2 ";

// All declarations below, up to the next `namespace` statement, belong to
// libtextclassifier3.
namespace libtextclassifier3;

// The possible model modes, encoded as a bit field: ANNOTATION (1),
// CLASSIFICATION (2) and SELECTION (4) are the single bits, the remaining
// values are their combinations.
enum ModeFlag : int {
  NONE = 0,
  ANNOTATION = 1,
  CLASSIFICATION = 2,
  ANNOTATION_AND_CLASSIFICATION = 3,
  SELECTION = 4,
  ANNOTATION_AND_SELECTION = 5,
  CLASSIFICATION_AND_SELECTION = 6,
  ALL = 7,
}

// Selects the annotation usecase.
enum AnnotationUsecase : int {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,

  // Results are optimized for using TextClassifier as an infrastructure
  // that annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
}

enum DatetimeExtractorType : int {
  UNKNOWN_DATETIME_EXTRACTOR_TYPE = 0,
  AM = 1,
  PM = 2,
  JANUARY = 3,
  FEBRUARY = 4,
  MARCH = 5,
  APRIL = 6,
  MAY = 7,
  JUNE = 8,
  JULY = 9,
  AUGUST = 10,
  SEPTEMBER = 11,
  OCTOBER = 12,
  NOVEMBER = 13,
  DECEMBER = 14,
  NEXT = 15,
  NEXT_OR_SAME = 16,
  LAST = 17,
  NOW = 18,
  TOMORROW = 19,
  YESTERDAY = 20,
  PAST = 21,
  FUTURE = 22,
  DAY = 23,
  WEEK = 24,
  MONTH = 25,
  YEAR = 26,
  MONDAY = 27,
  TUESDAY = 28,
  WEDNESDAY = 29,
  THURSDAY = 30,
  FRIDAY = 31,
  SATURDAY = 32,
  SUNDAY = 33,
  DAYS = 34,
  WEEKS = 35,
  MONTHS = 36,

  // TODO(zilka): Make the following 3 values singular for consistency.
  HOURS = 37,
  MINUTES = 38,
  SECONDS = 39,
  YEARS = 40,
  DIGITS = 41,
  SIGNEDDIGITS = 42,
  ZERO = 43,
  ONE = 44,
  TWO = 45,
  THREE = 46,
  FOUR = 47,
  FIVE = 48,
  SIX = 49,
  SEVEN = 50,
  EIGHT = 51,
  NINE = 52,
  TEN = 53,
  ELEVEN = 54,
  TWELVE = 55,
  THIRTEEN = 56,
  FOURTEEN = 57,
  FIFTEEN = 58,
  SIXTEEN = 59,
  SEVENTEEN = 60,
  EIGHTEEN = 61,
  NINETEEN = 62,
  TWENTY = 63,
  THIRTY = 64,
  FORTY = 65,
  FIFTY = 66,
  SIXTY = 67,
  SEVENTY = 68,
  EIGHTY = 69,
  NINETY = 70,
  HUNDRED = 71,
  THOUSAND = 72,
}

enum DatetimeGroupType : int {
  GROUP_UNKNOWN = 0,
  GROUP_UNUSED = 1,
  GROUP_YEAR = 2,
  GROUP_MONTH = 3,
  GROUP_DAY = 4,
  GROUP_HOUR = 5,
  GROUP_MINUTE = 6,
  GROUP_SECOND = 7,
  GROUP_AMPM = 8,
  GROUP_RELATIONDISTANCE = 9,
  GROUP_RELATION = 10,
  GROUP_RELATIONTYPE = 11,

  // The dummy groups only inflate the selection, e.g. when more text should
  // be selected than is covered by the envelope of all extractor spans.
  GROUP_DUMMY1 = 12,
  GROUP_DUMMY2 = 13,
}

// Options for the model that predicts text selection.
table SelectionModelOptions {
  // If true, unpaired brackets contained in the predicted selection are
  // stripped from both ends of the selection before it is returned.
  // The bracket codepoints are defined in the Unicode standard:
  // http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
  strip_unpaired_brackets:bool = true;

  // How many hypothetical click positions on either side of the actual
  // click are considered in order to enforce symmetry.
  symmetry_context_size:int;

  // How many examples are bundled into one inference batch.
  batch_size:int = 1024;

  // Whether a suggested selection is always classified, or only on demand.
  always_classify_suggested_selection:bool = false;
}

// Options for the model that classifies a text selection.
table ClassificationModelOptions {
  // Limits for phone numbers.
  phone_min_num_digits:int = 7;
  phone_max_num_digits:int = 15;

  // Limits for addresses.
  address_min_num_tokens:int;

  // Maximum number of tokens for which a classification is attempted
  // (-1 is unlimited).
  max_num_tokens:int = -1;
}

// Options for post-checks, checksums and verification to apply on a match.
table VerificationOptions {
  verify_luhn_checksum:bool = false;

  // Lua verifier to use, given as the index of the lua verifier in the
  // model.
  lua_verifier:int = -1;
}

// Behaviour of rule capturing groups.
// Describes how the text and span of a capturing group (in a regular
// expression, or from a capturing match in a grammar rule) are handled.
table CapturingGroup {
  // If true, the span of the capturing group is used to extend the
  // selection.
  extend_selection:bool = true;

  // If set, the text of the capturing group is used to set a field in the
  // classification result entity data.
  entity_field_path:FlatbufferFieldPath;

  // If set, the flatbuffer entity data is merged with the classification
  // result entity data.
  serialized_entity_data:string (shared);

  // If set, the normalization to apply before the text is used in entity
  // data.
  normalization_options:NormalizationOptions;

  entity_data:EntityData;
}

// List of regular expression matchers to check.
namespace libtextclassifier3.RegexModel_;

table Pattern {
  // Name of the collection assigned to a match.
  collection_name:string (shared);

  // The pattern to check.
  pattern:string (shared);

  // The modes for which the pattern is applied.
  enabled_modes:ModeFlag = ALL;

  // Final score assigned to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // If true, an approximate matching implementation built on Find() is used
  // instead of the true Match(). The approximate matching takes the first
  // Find() result and then checks that it spans the whole input.
  use_approximate_matching:bool = false;

  compressed_pattern:CompressedBuffer;

  // Verification to apply on a match.
  verification_options:VerificationOptions;

  capturing_group:[CapturingGroup];

  // Entity data to set for a match.
  serialized_entity_data:string (shared);

  entity_data:EntityData;
}

namespace libtextclassifier3;

table RegexModel {
  patterns:[RegexModel_.Pattern];

  // If true, the regexes are compiled only on first use.
  lazy_regex_compilation:bool = true;

  // Lua scripts for match verification.
  // The verifier can access:
  // * `context`: The context as a string.
  // * `match`: The groups of the regex match as an array, each group gives
  //     * `begin`: span start
  //     * `end`: span end
  //     * `text`: the text
  // The verifier is expected to return a boolean that indicates whether the
  // verification succeeded or not.
  lua_verifier:[string];
}

// List of regex patterns.
namespace libtextclassifier3.DatetimeModelPattern_;

table Regex {
  pattern:string (shared);

  // Entry i gives the type of the i-th capturing group; it decides how the
  // matched content has to be parsed.
  groups:[DatetimeGroupType];

  compressed_pattern:CompressedBuffer;
}

// The remaining tables of this section all live in libtextclassifier3.
namespace libtextclassifier3;

table DatetimeModelPattern {
  regexes:[DatetimeModelPattern_.Regex];

  // Indices of the locales in DatetimeModel for which these patterns should
  // be used. If empty, the patterns can be used for all locales.
  locales:[int];

  // Final score assigned to the results of this pattern.
  target_classification_score:float = 1;

  // Priority score used for conflict resolution with the other models.
  priority_score:float = 0;

  // The modes for which the patterns are applied.
  enabled_modes:ModeFlag = ALL;

  // The annotation usecases for which the patterns are applied.
  // This is a flag field for values of AnnotationUsecase; the default has
  // all 32 bits set.
  enabled_annotation_usecases:uint = 4294967295;
}

table DatetimeModelExtractor {
  extractor:DatetimeExtractorType;
  pattern:string (shared);
  locales:[int];
  compressed_pattern:CompressedBuffer;
}

table DatetimeModel {
  // BCP 47 locale strings for all locales supported by the model. The
  // individual patterns refer back to them by index.
  locales:[string];

  patterns:[DatetimeModelPattern];
  extractors:[DatetimeModelExtractor];

  // If true, the extractors determine the match location, as opposed to
  // the location where the global pattern matched.
  use_extractors_for_locating:bool = true;

  // Locale ids whose rules are always run, after the requested ones.
  default_locales:[int];

  // If true, alternative interpretations are generated for ambiguous
  // datetime expressions.
  generate_alternative_interpretations_when_ambiguous:bool = false;

  // If true, the regexes are compiled only on first use.
  lazy_regex_compilation:bool = true;

  // If true, only future dates are given (when the day is not specified).
  prefer_future_for_unspecified_date:bool = false;
}

// Configuration for the tokenizer.
namespace libtextclassifier3; table GrammarTokenizerOptions { tokenization_type:TokenizationType = ICU; // If true, white space tokens will be kept when using the icu tokenizer. icu_preserve_whitespace_tokens:bool = false; // Codepoint ranges that determine what role the different codepoints play // during tokenized. The ranges must not overlap. tokenization_codepoint_config:[TokenizationCodepointRange]; // A set of codepoint ranges to use in the mixed tokenization mode to identify // stretches of tokens to re-tokenize using the internal tokenizer. internal_tokenizer_codepoint_ranges:[CodepointRange]; // If true, tokens will be also split when the codepoint's script_id changes // as defined in TokenizationCodepointRange. tokenize_on_script_change:bool = false; } // Options for grammar date/datetime/date range annotations. namespace libtextclassifier3.GrammarDatetimeModel_; table AnnotationOptions { // If enabled, extract special day offset like today, yesterday, etc. enable_special_day_offset:bool = true; // If true, merge the adjacent day of week, time and date. e.g. // "20/2/2016 at 8pm" is extracted as a single instance instead of two // instance: "20/2/2016" and "8pm". merge_adjacent_components:bool = true; // List the extra id of requested dates. extra_requested_dates:[string]; // If true, try to include preposition to the extracted annotation. e.g. // "at 6pm". if it's false, only 6pm is included. offline-actions has // special requirements to include preposition. include_preposition:bool = true; // If enabled, extract range in date annotator. // input: Monday, 5-6pm // If the flag is true, The extracted annotation only contains 1 range // instance which is from Monday 5pm to 6pm. // If the flag is false, The extracted annotation contains two date // instance: "Monday" and "6pm". enable_date_range:bool = true; reserved_6:int16 (deprecated); // If enabled, the rule priority score is used to set the priority score of // the annotation. 
// In case of false the annotation priority score is set from // GrammarDatetimeModel's priority_score use_rule_priority_score:bool = false; // If enabled, annotator will try to resolve the ambiguity by generating // possible alternative interpretations of the input text // e.g. '9:45' will be resolved to '9:45 AM' and '9:45 PM'. generate_alternative_interpretations_when_ambiguous:bool; // List of spans which grammar will ignore during the match e.g. if // “@” is in the allowed span list and input is “12 March @ 12PM” then “@” // will be ignored and 12 March @ 12PM will be translate to // {Day:12 Month: March Hour: 12 MERIDIAN: PM}. // This can also be achieved by adding additional rules e.g. //