/* * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_GRAMMAR_DATES_UTILS_DATE_MATCH_H_ #define LIBTEXTCLASSIFIER_ANNOTATOR_GRAMMAR_DATES_UTILS_DATE_MATCH_H_ #include #include #include #include #include "annotator/grammar/dates/dates_generated.h" #include "annotator/grammar/dates/timezone-code_generated.h" #include "utils/grammar/match.h" namespace libtextclassifier3 { namespace dates { static constexpr int NO_VAL = -1; // POD match data structure. struct MatchBase : public grammar::Match { void Reset() { type = MatchType::MatchType_UNKNOWN; } }; struct ExtractionMatch : public MatchBase { const ExtractionRuleParameter* extraction_rule; void Reset() { MatchBase::Reset(); type = MatchType::MatchType_DATETIME_RULE; extraction_rule = nullptr; } }; struct TermValueMatch : public MatchBase { const TermValue* term_value; void Reset() { MatchBase::Reset(); type = MatchType::MatchType_TERM_VALUE; term_value = nullptr; } }; struct NonterminalMatch : public MatchBase { const NonterminalValue* nonterminal; void Reset() { MatchBase::Reset(); type = MatchType::MatchType_NONTERMINAL; nonterminal = nullptr; } }; struct IntegerMatch : public NonterminalMatch { int value; int8 count_of_digits; // When expression is in digits format. bool is_zero_prefixed; // When expression is in digits format. void Reset() { NonterminalMatch::Reset(); value = NO_VAL; count_of_digits = 0; is_zero_prefixed = false; } }; struct DigitsMatch : public IntegerMatch { void Reset() { IntegerMatch::Reset(); type = MatchType::MatchType_DIGITS; } static bool IsValid(int x) { return true; } }; struct YearMatch : public IntegerMatch { void Reset() { IntegerMatch::Reset(); type = MatchType::MatchType_YEAR; } static bool IsValid(int x) { return x >= 1; } }; struct MonthMatch : public IntegerMatch { void Reset() { IntegerMatch::Reset(); type = MatchType::MatchType_MONTH; } static bool IsValid(int x) { return (x >= 1 && x <= 12); } }; struct DayMatch : public IntegerMatch { void Reset() { IntegerMatch::Reset(); type = MatchType::MatchType_DAY; } static bool IsValid(int x) { return (x >= 1 && x <= 31); } }; struct HourMatch : public IntegerMatch { void Reset() { IntegerMatch::Reset(); type = MatchType::MatchType_HOUR; } static bool IsValid(int x) { return (x >= 0 && x <= 24); } }; struct MinuteMatch : public IntegerMatch { void Reset() { IntegerMatch::Reset(); type = MatchType::MatchType_MINUTE; } static bool IsValid(int x) { return (x >= 0 && x <= 59); } }; struct SecondMatch : public IntegerMatch { void Reset() { IntegerMatch::Reset(); type = MatchType::MatchType_SECOND; } static bool IsValid(int x) { return (x >= 0 && x <= 60); } }; struct DecimalMatch : public NonterminalMatch { double value; int8 count_of_digits; // When expression is in digits format. void Reset() { NonterminalMatch::Reset(); value = NO_VAL; count_of_digits = 0; } }; struct FractionSecondMatch : public DecimalMatch { void Reset() { DecimalMatch::Reset(); type = MatchType::MatchType_FRACTION_SECOND; } static bool IsValid(double x) { return (x >= 0.0 && x < 1.0); } }; // CombinedIntegersMatch is used for expressions containing multiple (up // to N) matches of integers without delimeters between them (because // CFG-grammar is based on tokenizer, it could not split a token into several // pieces like using regular-expression). For example, "1130" contains "11" // and "30" meaning November 30. template struct CombinedIntegersMatch : public NonterminalMatch { enum { SIZE = N, }; int values[SIZE]; int8 count_of_digits; // When expression is in digits format. bool is_zero_prefixed; // When expression is in digits format. void Reset() { NonterminalMatch::Reset(); for (int i = 0; i < SIZE; ++i) { values[i] = NO_VAL; } count_of_digits = 0; is_zero_prefixed = false; } }; struct CombinedDigitsMatch : public CombinedIntegersMatch<6> { enum Index { INDEX_YEAR = 0, INDEX_MONTH = 1, INDEX_DAY = 2, INDEX_HOUR = 3, INDEX_MINUTE = 4, INDEX_SECOND = 5, }; bool HasYear() const { return values[INDEX_YEAR] != NO_VAL; } bool HasMonth() const { return values[INDEX_MONTH] != NO_VAL; } bool HasDay() const { return values[INDEX_DAY] != NO_VAL; } bool HasHour() const { return values[INDEX_HOUR] != NO_VAL; } bool HasMinute() const { return values[INDEX_MINUTE] != NO_VAL; } bool HasSecond() const { return values[INDEX_SECOND] != NO_VAL; } int GetYear() const { return values[INDEX_YEAR]; } int GetMonth() const { return values[INDEX_MONTH]; } int GetDay() const { return values[INDEX_DAY]; } int GetHour() const { return values[INDEX_HOUR]; } int GetMinute() const { return values[INDEX_MINUTE]; } int GetSecond() const { return values[INDEX_SECOND]; } void Reset() { CombinedIntegersMatch::Reset(); type = MatchType::MatchType_COMBINED_DIGITS; } static bool IsValid(int i, int x) { switch (i) { case INDEX_YEAR: return YearMatch::IsValid(x); case INDEX_MONTH: return MonthMatch::IsValid(x); case INDEX_DAY: return DayMatch::IsValid(x); case INDEX_HOUR: return HourMatch::IsValid(x); case INDEX_MINUTE: return MinuteMatch::IsValid(x); case INDEX_SECOND: return SecondMatch::IsValid(x); default: return false; } } }; struct TimeValueMatch : public NonterminalMatch { const HourMatch* hour_match; const MinuteMatch* minute_match; const SecondMatch* second_match; const FractionSecondMatch* fraction_second_match; bool is_hour_zero_prefixed : 1; bool is_minute_one_digit : 1; bool is_second_one_digit : 1; int8 hour; int8 minute; int8 second; double fraction_second; void Reset() { NonterminalMatch::Reset(); type = MatchType::MatchType_TIME_VALUE; hour_match = nullptr; minute_match = nullptr; second_match = nullptr; fraction_second_match = nullptr; is_hour_zero_prefixed = false; is_minute_one_digit = false; is_second_one_digit = false; hour = NO_VAL; minute = NO_VAL; second = NO_VAL; fraction_second = NO_VAL; } }; struct TimeSpanMatch : public NonterminalMatch { const TimeSpanSpec* time_span_spec; TimespanCode time_span_code; void Reset() { NonterminalMatch::Reset(); type = MatchType::MatchType_TIME_SPAN; time_span_spec = nullptr; time_span_code = TimespanCode_TIMESPAN_CODE_NONE; } }; struct TimeZoneNameMatch : public NonterminalMatch { const TimeZoneNameSpec* time_zone_name_spec; TimezoneCode time_zone_code; void Reset() { NonterminalMatch::Reset(); type = MatchType::MatchType_TIME_ZONE_NAME; time_zone_name_spec = nullptr; time_zone_code = TimezoneCode_TIMEZONE_CODE_NONE; } }; struct TimeZoneOffsetMatch : public NonterminalMatch { const TimeZoneOffsetParameter* time_zone_offset_param; int16 time_zone_offset; void Reset() { NonterminalMatch::Reset(); type = MatchType::MatchType_TIME_ZONE_OFFSET; time_zone_offset_param = nullptr; time_zone_offset = 0; } }; struct DayOfWeekMatch : public IntegerMatch { void Reset() { IntegerMatch::Reset(); type = MatchType::MatchType_DAY_OF_WEEK; } static bool IsValid(int x) { return (x > DayOfWeek_DOW_NONE && x <= DayOfWeek_MAX); } }; struct TimePeriodMatch : public NonterminalMatch { int value; void Reset() { NonterminalMatch::Reset(); type = MatchType::MatchType_TIME_PERIOD; value = NO_VAL; } }; struct RelativeMatch : public NonterminalMatch { enum { HAS_NONE = 0, HAS_YEAR = 1 << 0, HAS_MONTH = 1 << 1, HAS_DAY = 1 << 2, HAS_WEEK = 1 << 3, HAS_HOUR = 1 << 4, HAS_MINUTE = 1 << 5, HAS_SECOND = 1 << 6, HAS_DAY_OF_WEEK = 1 << 7, HAS_IS_FUTURE = 1 << 31, }; uint32 existing; int year; int month; int day; int week; int hour; int minute; int second; const NonterminalValue* day_of_week_nonterminal; int8 day_of_week; bool is_future_date; bool HasDay() const { return existing & HAS_DAY; } bool HasDayFields() const { return existing & (HAS_DAY | HAS_DAY_OF_WEEK); } bool HasTimeValueFields() const { return existing & (HAS_HOUR | HAS_MINUTE | HAS_SECOND); } bool IsStandaloneRelativeDayOfWeek() const { return (existing & HAS_DAY_OF_WEEK) && (existing & ~HAS_DAY_OF_WEEK) == 0; } void Reset() { NonterminalMatch::Reset(); type = MatchType::MatchType_RELATIVE_DATE; existing = HAS_NONE; year = NO_VAL; month = NO_VAL; day = NO_VAL; week = NO_VAL; hour = NO_VAL; minute = NO_VAL; second = NO_VAL; day_of_week = NO_VAL; is_future_date = false; } }; // This is not necessarily POD, it is used to keep the final matched result. struct DateMatch { // Sub-matches in the date match. const YearMatch* year_match = nullptr; const MonthMatch* month_match = nullptr; const DayMatch* day_match = nullptr; const DayOfWeekMatch* day_of_week_match = nullptr; const TimeValueMatch* time_value_match = nullptr; const TimeSpanMatch* time_span_match = nullptr; const TimeZoneNameMatch* time_zone_name_match = nullptr; const TimeZoneOffsetMatch* time_zone_offset_match = nullptr; const RelativeMatch* relative_match = nullptr; const CombinedDigitsMatch* combined_digits_match = nullptr; // [begin, end) indicates the Document position where the date or date range // was found. int begin = -1; int end = -1; int priority = 0; float annotator_priority_score = 0.0; int year = NO_VAL; int8 month = NO_VAL; int8 day = NO_VAL; DayOfWeek day_of_week = DayOfWeek_DOW_NONE; BCAD bc_ad = BCAD_BCAD_NONE; int8 hour = NO_VAL; int8 minute = NO_VAL; int8 second = NO_VAL; double fraction_second = NO_VAL; TimespanCode time_span_code = TimespanCode_TIMESPAN_CODE_NONE; int time_zone_code = TimezoneCode_TIMEZONE_CODE_NONE; int16 time_zone_offset = std::numeric_limits::min(); // Fields about ambiguous hours. These fields are used to interpret the // possible values of ambiguous hours. Since all kinds of known ambiguities // are in the form of arithmetic progression (starting from .hour field), // we can use "ambiguous_hour_count" to denote the count of ambiguous hours, // and use "ambiguous_hour_interval" to denote the distance between a pair // of adjacent possible hours. Values in the arithmetic progression are // shrunk into [0, 23] (MOD 24). One can use the GetPossibleHourValues() // method for the complete list of possible hours. uint8 ambiguous_hour_count = 0; uint8 ambiguous_hour_interval = 0; bool is_inferred = false; // This field is set in function PerformRefinements to remove some DateMatch // like overlapped, duplicated, etc. bool is_removed = false; std::string DebugString() const; bool HasYear() const { return year != NO_VAL; } bool HasMonth() const { return month != NO_VAL; } bool HasDay() const { return day != NO_VAL; } bool HasDayOfWeek() const { return day_of_week != DayOfWeek_DOW_NONE; } bool HasBcAd() const { return bc_ad != BCAD_BCAD_NONE; } bool HasHour() const { return hour != NO_VAL; } bool HasMinute() const { return minute != NO_VAL; } bool HasSecond() const { return second != NO_VAL; } bool HasFractionSecond() const { return fraction_second != NO_VAL; } bool HasTimeSpanCode() const { return time_span_code != TimespanCode_TIMESPAN_CODE_NONE; } bool HasTimeZoneCode() const { return time_zone_code != TimezoneCode_TIMEZONE_CODE_NONE; } bool HasTimeZoneOffset() const { return time_zone_offset != std::numeric_limits::min(); } bool HasRelativeDate() const { return relative_match != nullptr; } bool IsHourAmbiguous() const { return ambiguous_hour_count >= 2; } bool IsStandaloneTime() const { return (HasHour() || HasMinute()) && !HasDayOfWeek() && !HasDay() && !HasMonth() && !HasYear(); } void SetAmbiguousHourProperties(uint8 count, uint8 interval) { ambiguous_hour_count = count; ambiguous_hour_interval = interval; } // Outputs all the possible hour values. If current DateMatch does not // contain an hour, nothing will be output. If the hour is not ambiguous, // only one value (= .hour) will be output. This method clears the vector // "values" first, and it is not guaranteed that the values in the vector // are in a sorted order. void GetPossibleHourValues(std::vector* values) const; int GetPriority() const { return priority; } float GetAnnotatorPriorityScore() const { return annotator_priority_score; } bool IsStandaloneRelativeDayOfWeek() const { return (HasRelativeDate() && relative_match->IsStandaloneRelativeDayOfWeek() && !HasDateFields() && !HasTimeFields() && !HasTimeSpanCode()); } bool HasDateFields() const { return (HasYear() || HasMonth() || HasDay() || HasDayOfWeek() || HasBcAd()); } bool HasTimeValueFields() const { return (HasHour() || HasMinute() || HasSecond() || HasFractionSecond()); } bool HasTimeSpanFields() const { return HasTimeSpanCode(); } bool HasTimeZoneFields() const { return (HasTimeZoneCode() || HasTimeZoneOffset()); } bool HasTimeFields() const { return (HasTimeValueFields() || HasTimeSpanFields() || HasTimeZoneFields()); } bool IsValid() const; // Overall relative qualifier of the DateMatch e.g. 2 year ago is 'PAST' and // next week is 'FUTURE'. DatetimeComponent::RelativeQualifier GetRelativeQualifier() const; // Getter method to get the 'DatetimeComponent' of given 'ComponentType'. Optional GetDatetimeComponent( const DatetimeComponent::ComponentType& component_type) const; void FillDatetimeComponents( std::vector* datetime_component) const; }; // Represent a matched date range which includes the from and to matched date. struct DateRangeMatch { int begin = -1; int end = -1; DateMatch from; DateMatch to; std::string DebugString() const; int GetPriority() const { return std::max(from.GetPriority(), to.GetPriority()); } float GetAnnotatorPriorityScore() const { return std::max(from.GetAnnotatorPriorityScore(), to.GetAnnotatorPriorityScore()); } }; } // namespace dates } // namespace libtextclassifier3 #endif // LIBTEXTCLASSIFIER_ANNOTATOR_GRAMMAR_DATES_UTILS_DATE_MATCH_H_