/* * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "annotator/grammar/dates/parser.h" #include "annotator/grammar/dates/extractor.h" #include "annotator/grammar/dates/utils/date-match.h" #include "annotator/grammar/dates/utils/date-utils.h" #include "utils/base/integral_types.h" #include "utils/base/logging.h" #include "utils/base/macros.h" #include "utils/grammar/lexer.h" #include "utils/grammar/matcher.h" #include "utils/grammar/rules_generated.h" #include "utils/grammar/types.h" #include "utils/strings/split.h" #include "utils/strings/stringpiece.h" namespace libtextclassifier3::dates { namespace { // Helper methods to validate individual components from a date match. // Checks the validation requirement of a rule against a match. // For example if the rule asks for `SPELLED_MONTH`, then we check that the // match has the right flag. bool CheckMatchValidationAndFlag( const grammar::Match* match, const ExtractionRuleParameter* rule, const ExtractionRuleParameter_::ExtractionValidation validation, const NonterminalParameter_::Flag flag) { if (rule == nullptr || (rule->validation() & validation) == 0) { // No validation requirement. return true; } const NonterminalParameter* nonterminal_parameter = static_cast(match) ->nonterminal->nonterminal_parameter(); return (nonterminal_parameter != nullptr && (nonterminal_parameter->flag() & flag) != 0); } bool GenerateDate(const ExtractionRuleParameter* rule, const grammar::Match* match, DateMatch* date) { bool is_valid = true; // Post check and assign date components. grammar::Traverse(match, [rule, date, &is_valid](const grammar::Match* node) { switch (node->type) { case MatchType_YEAR: { if (CheckMatchValidationAndFlag( node, rule, ExtractionRuleParameter_::ExtractionValidation_SPELLED_YEAR, NonterminalParameter_::Flag_IS_SPELLED)) { date->year_match = static_cast(node); date->year = date->year_match->value; } else { is_valid = false; } break; } case MatchType_MONTH: { if (CheckMatchValidationAndFlag( node, rule, ExtractionRuleParameter_::ExtractionValidation_SPELLED_MONTH, NonterminalParameter_::Flag_IS_SPELLED)) { date->month_match = static_cast(node); date->month = date->month_match->value; } else { is_valid = false; } break; } case MatchType_DAY: { if (CheckMatchValidationAndFlag( node, rule, ExtractionRuleParameter_::ExtractionValidation_SPELLED_DAY, NonterminalParameter_::Flag_IS_SPELLED)) { date->day_match = static_cast(node); date->day = date->day_match->value; } else { is_valid = false; } break; } case MatchType_DAY_OF_WEEK: { date->day_of_week_match = static_cast(node); date->day_of_week = static_cast(date->day_of_week_match->value); break; } case MatchType_TIME_VALUE: { date->time_value_match = static_cast(node); date->hour = date->time_value_match->hour; date->minute = date->time_value_match->minute; date->second = date->time_value_match->second; date->fraction_second = date->time_value_match->fraction_second; return false; } case MatchType_TIME_SPAN: { date->time_span_match = static_cast(node); date->time_span_code = date->time_span_match->time_span_code; return false; } case MatchType_TIME_ZONE_NAME: { date->time_zone_name_match = static_cast(node); date->time_zone_code = date->time_zone_name_match->time_zone_code; return false; } case MatchType_TIME_ZONE_OFFSET: { date->time_zone_offset_match = static_cast(node); date->time_zone_offset = date->time_zone_offset_match->time_zone_offset; return false; } case MatchType_RELATIVE_DATE: { date->relative_match = static_cast(node); return false; } case MatchType_COMBINED_DIGITS: { date->combined_digits_match = static_cast(node); if (date->combined_digits_match->HasYear()) { date->year = date->combined_digits_match->GetYear(); } if (date->combined_digits_match->HasMonth()) { date->month = date->combined_digits_match->GetMonth(); } if (date->combined_digits_match->HasDay()) { date->day = date->combined_digits_match->GetDay(); } if (date->combined_digits_match->HasHour()) { date->hour = date->combined_digits_match->GetHour(); } if (date->combined_digits_match->HasMinute()) { date->minute = date->combined_digits_match->GetMinute(); } if (date->combined_digits_match->HasSecond()) { date->second = date->combined_digits_match->GetSecond(); } return false; } default: // Expand node further. return true; } return false; }); if (is_valid) { date->begin = match->codepoint_span.first; date->end = match->codepoint_span.second; date->priority = rule ? rule->priority_delta() : 0; date->annotator_priority_score = rule ? rule->annotator_priority_score() : 0.0; } return is_valid; } bool GenerateFromOrToDateRange(const grammar::Match* match, DateMatch* date) { return GenerateDate( /*rule=*/( match->type == MatchType_DATETIME ? static_cast(match)->extraction_rule : nullptr), match, date); } bool GenerateDateRange(const grammar::Match* match, const grammar::Match* from, const grammar::Match* to, DateRangeMatch* date_range) { if (!GenerateFromOrToDateRange(from, &date_range->from)) { TC3_LOG(WARNING) << "Failed to generate date for `from`."; return false; } if (!GenerateFromOrToDateRange(to, &date_range->to)) { TC3_LOG(WARNING) << "Failed to generate date for `to`."; return false; } date_range->begin = match->codepoint_span.first; date_range->end = match->codepoint_span.second; return true; } bool NormalizeHour(DateMatch* date) { if (date->time_span_match == nullptr) { // Nothing to do. return true; } return NormalizeHourByTimeSpan(date->time_span_match->time_span_spec, date); } void CheckAndSetAmbiguousHour(DateMatch* date) { if (date->HasHour()) { // Use am-pm ambiguity as default. if (!date->HasTimeSpanCode() && date->hour >= 1 && date->hour <= 12 && !(date->time_value_match != nullptr && date->time_value_match->hour_match != nullptr && date->time_value_match->hour_match->is_zero_prefixed)) { date->SetAmbiguousHourProperties(2, 12); } } } // Normalizes a date candidate. // Returns whether the candidate was successfully normalized. bool NormalizeDate(DateMatch* date) { // Normalize hour. if (!NormalizeHour(date)) { TC3_VLOG(ERROR) << "Hour normalization (according to time-span) failed." << date->DebugString(); return false; } CheckAndSetAmbiguousHour(date); if (!date->IsValid()) { TC3_VLOG(ERROR) << "Fields inside date instance are ill-formed " << date->DebugString(); } return true; } // Copies the field from one DateMatch to another whose field is null. for // example: if the from is "May 1, 8pm", and the to is "9pm", "May 1" will be // copied to "to". Now we only copy fields for date range requirement.fv void CopyFieldsForDateMatch(const DateMatch& from, DateMatch* to) { if (from.time_span_match != nullptr && to->time_span_match == nullptr) { to->time_span_match = from.time_span_match; to->time_span_code = from.time_span_code; } if (from.month_match != nullptr && to->month_match == nullptr) { to->month_match = from.month_match; to->month = from.month; } } // Normalizes a date range candidate. // Returns whether the date range was successfully normalized. bool NormalizeDateRange(DateRangeMatch* date_range) { CopyFieldsForDateMatch(date_range->from, &date_range->to); CopyFieldsForDateMatch(date_range->to, &date_range->from); return (NormalizeDate(&date_range->from) && NormalizeDate(&date_range->to)); } bool CheckDate(const DateMatch& date, const ExtractionRuleParameter* rule) { // It's possible that "time_zone_name_match == NULL" when // "HasTimeZoneCode() == true", or "time_zone_offset_match == NULL" when // "HasTimeZoneOffset() == true" due to inference between endpoints, so we // must check if they really exist before using them. if (date.HasTimeZoneOffset()) { if (date.HasTimeZoneCode()) { if (date.time_zone_name_match != nullptr) { TC3_CHECK(date.time_zone_name_match->time_zone_name_spec != nullptr); const TimeZoneNameSpec* spec = date.time_zone_name_match->time_zone_name_spec; if (!spec->is_utc()) { return false; } if (!spec->is_abbreviation()) { return false; } } } else if (date.time_zone_offset_match != nullptr) { TC3_CHECK(date.time_zone_offset_match->time_zone_offset_param != nullptr); const TimeZoneOffsetParameter* param = date.time_zone_offset_match->time_zone_offset_param; if (param->format() == TimeZoneOffsetParameter_::Format_FORMAT_H || param->format() == TimeZoneOffsetParameter_::Format_FORMAT_HH) { return false; } if (!(rule->validation() & ExtractionRuleParameter_:: ExtractionValidation_ALLOW_UNCONFIDENT_TIME_ZONE)) { if (param->format() == TimeZoneOffsetParameter_::Format_FORMAT_H_MM || param->format() == TimeZoneOffsetParameter_::Format_FORMAT_HH_MM || param->format() == TimeZoneOffsetParameter_::Format_FORMAT_HMM) { return false; } } } } // Case: 1 April could be extracted as year 1, month april. // We simply remove this case. if (!date.HasBcAd() && date.year_match != nullptr && date.year < 1000) { // We allow case like 11/5/01 if (date.HasMonth() && date.HasDay() && date.year_match->count_of_digits == 2) { } else { return false; } } // Ignore the date if the year is larger than 9999 (The maximum number of 4 // digits). if (date.year_match != nullptr && date.year > 9999) { TC3_VLOG(ERROR) << "Year is greater than 9999."; return false; } // Case: spelled may could be month 5, it also used very common as modal // verbs. We ignore spelled may as month. if ((rule->validation() & ExtractionRuleParameter_::ExtractionValidation_SPELLED_MONTH) && date.month == 5 && !date.HasYear() && !date.HasDay()) { return false; } return true; } bool CheckContext(const std::vector& text, const DateExtractor::Output& output) { const uint32 validation = output.rule->validation(); // Nothing to check if we don't have any validation requirements for the // span boundaries. if ((validation & (ExtractionRuleParameter_::ExtractionValidation_LEFT_BOUND | ExtractionRuleParameter_::ExtractionValidation_RIGHT_BOUND)) == 0) { return true; } const int begin = output.match->codepoint_span.first; const int end = output.match->codepoint_span.second; // So far, we only check that the adjacent character cannot be a separator, // like /, - or . if ((validation & ExtractionRuleParameter_::ExtractionValidation_LEFT_BOUND) != 0) { if (begin > 0 && (*text[begin - 1] == '/' || *text[begin - 1] == '-' || *text[begin - 1] == ':')) { return false; } } if ((validation & ExtractionRuleParameter_::ExtractionValidation_RIGHT_BOUND) != 0) { // Last valid codepoint is at text.size() - 2 as we added the end position // of text for easier span extraction. if (end < text.size() - 1 && (*text[end] == '/' || *text[end] == '-' || *text[end] == ':')) { return false; } } return true; } // Validates a date match. Returns true if the candidate is valid. bool ValidateDate(const std::vector& text, const DateExtractor::Output& output, const DateMatch& date) { if (!CheckDate(date, output.rule)) { return false; } if (!CheckContext(text, output)) { return false; } return true; } // Builds matched date instances from the grammar output. std::vector BuildDateMatches( const std::vector& text, const std::vector& outputs) { std::vector result; for (const DateExtractor::Output& output : outputs) { DateMatch date; if (GenerateDate(output.rule, output.match, &date)) { if (!NormalizeDate(&date)) { continue; } if (!ValidateDate(text, output, date)) { continue; } result.push_back(date); } } return result; } // Builds matched date range instances from the grammar output. std::vector BuildDateRangeMatches( const std::vector& text, const std::vector& range_outputs) { std::vector result; for (const DateExtractor::RangeOutput& range_output : range_outputs) { DateRangeMatch date_range; if (GenerateDateRange(range_output.match, range_output.from, range_output.to, &date_range)) { if (!NormalizeDateRange(&date_range)) { continue; } result.push_back(date_range); } } return result; } template void RemoveDeletedMatches(const std::vector& removed, std::vector* matches) { int input = 0; for (int next = 0; next < matches->size(); ++next) { if (removed[next]) { continue; } if (input != next) { (*matches)[input] = (*matches)[next]; } input++; } matches->resize(input); } // Removes duplicated date or date range instances. // Overlapping date and date ranges are not considered here. template void RemoveDuplicatedDates(std::vector* matches) { // Assumption: matches are sorted ascending by (begin, end). std::vector removed(matches->size(), false); for (int i = 0; i < matches->size(); i++) { if (removed[i]) { continue; } const T& candidate = matches->at(i); for (int j = i + 1; j < matches->size(); j++) { if (removed[j]) { continue; } const T& next = matches->at(j); // Not overlapping. if (next.begin >= candidate.end) { break; } // If matching the same span of text, then check the priority. if (candidate.begin == next.begin && candidate.end == next.end) { if (candidate.GetPriority() < next.GetPriority()) { removed[i] = true; break; } else { removed[j] = true; continue; } } // Checks if `next` is fully covered by fields of `candidate`. if (next.end <= candidate.end) { removed[j] = true; continue; } // Checks whether `candidate`/`next` is a refinement. if (IsRefinement(candidate, next)) { removed[j] = true; continue; } else if (IsRefinement(next, candidate)) { removed[i] = true; break; } } } RemoveDeletedMatches(removed, matches); } // Filters out simple overtriggering simple matches. bool IsBlacklistedDate(const UniLib& unilib, const std::vector& text, const DateMatch& match) { const int begin = match.begin; const int end = match.end; if (end - begin != 3) { return false; } std::string text_lower = unilib .ToLowerText( UTF8ToUnicodeText(text[begin].utf8_data(), text[end].utf8_data() - text[begin].utf8_data(), /*do_copy=*/false)) .ToUTF8String(); // "sun" is not a good abbreviation for a standalone day of the week. if (match.IsStandaloneRelativeDayOfWeek() && (text_lower == "sun" || text_lower == "mon")) { return true; } // "mar" is not a good abbreviation for single month. if (match.HasMonth() && text_lower == "mar") { return true; } return false; } // Checks if two date matches are adjacent and mergeable. bool AreDateMatchesAdjacentAndMergeable( const UniLib& unilib, const std::vector& text, const std::vector& ignored_spans, const DateMatch& prev, const DateMatch& next) { // Check the context between the two matches. if (next.begin <= prev.end) { // The two matches are not adjacent. return false; } UnicodeText span; for (int i = prev.end; i < next.begin; i++) { const char32 codepoint = *text[i]; if (unilib.IsWhitespace(codepoint)) { continue; } span.push_back(unilib.ToLower(codepoint)); } if (span.empty()) { return true; } const std::string span_text = span.ToUTF8String(); bool matched = false; for (const std::string& ignored_span : ignored_spans) { if (span_text == ignored_span) { matched = true; break; } } if (!matched) { return false; } return IsDateMatchMergeable(prev, next); } // Merges adjacent date and date range. // For e.g. Monday, 5-10pm, the date "Monday" and the time range "5-10pm" will // be merged void MergeDateRangeAndDate(const UniLib& unilib, const std::vector& text, const std::vector& ignored_spans, const std::vector& dates, std::vector* date_ranges) { // For each range, check the date before or after the it to see if they could // be merged. Both the range and date array are sorted, so we only need to // scan the date array once. int next_date = 0; for (int i = 0; i < date_ranges->size(); i++) { DateRangeMatch* date_range = &date_ranges->at(i); // So far we only merge time range with a date. if (!date_range->from.HasHour()) { continue; } for (; next_date < dates.size(); next_date++) { const DateMatch& date = dates[next_date]; // If the range is before the date, we check whether `date_range->to` can // be merged with the date. if (date_range->end <= date.begin) { DateMatch merged_date = date; if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans, date_range->to, date)) { MergeDateMatch(date_range->to, &merged_date, /*update_span=*/true); date_range->to = merged_date; date_range->end = date_range->to.end; MergeDateMatch(date, &date_range->from, /*update_span=*/false); next_date++; // Check the second date after the range to see if it could be merged // further. For example: 10-11pm, Monday, May 15. 10-11pm is merged // with Monday and then we check that it could be merged with May 15 // as well. if (next_date < dates.size()) { DateMatch next_match = dates[next_date]; if (AreDateMatchesAdjacentAndMergeable( unilib, text, ignored_spans, date_range->to, next_match)) { MergeDateMatch(date_range->to, &next_match, /*update_span=*/true); date_range->to = next_match; date_range->end = date_range->to.end; MergeDateMatch(dates[next_date], &date_range->from, /*update_span=*/false); next_date++; } } } // Since the range is before the date, we try to check if the next range // could be merged with the current date. break; } else if (date_range->end > date.end && date_range->begin > date.begin) { // If the range is after the date, we check if `date_range.from` can be // merged with the date. Here is a special case, the date before range // could be partially overlapped. This is because the range.from could // be extracted as year in date. For example: March 3, 10-11pm is // extracted as date March 3, 2010 and the range 10-11pm. In this // case, we simply clear the year from date. DateMatch merged_date = date; if (date.HasYear() && date.year_match->codepoint_span.second > date_range->begin) { merged_date.year_match = nullptr; merged_date.year = NO_VAL; merged_date.end = date.year_match->match_offset; } // Check and merge the range and the date before the range. if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans, merged_date, date_range->from)) { MergeDateMatch(merged_date, &date_range->from, /*update_span=*/true); date_range->begin = date_range->from.begin; MergeDateMatch(merged_date, &date_range->to, /*update_span=*/false); // Check if the second date before the range can be merged as well. if (next_date > 0) { DateMatch prev_match = dates[next_date - 1]; if (prev_match.end <= date_range->from.begin) { if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans, prev_match, date_range->from)) { MergeDateMatch(prev_match, &date_range->from, /*update_span=*/true); date_range->begin = date_range->from.begin; MergeDateMatch(prev_match, &date_range->to, /*update_span=*/false); } } } next_date++; break; } else { // Since the date is before the date range, we move to the next date // to check if it could be merged with the current range. continue; } } else { // The date is either fully overlapped by the date range or the date // span end is after the date range. Move to the next date in both // cases. } } } } // Removes the dates which are part of a range. e.g. in "May 1 - 3", the date // "May 1" is fully contained in the range. void RemoveOverlappedDateByRange(const std::vector& ranges, std::vector* dates) { int next_date = 0; std::vector removed(dates->size(), false); for (int i = 0; i < ranges.size(); ++i) { const auto& range = ranges[i]; for (; next_date < dates->size(); ++next_date) { const auto& date = dates->at(next_date); // So far we don't touch the partially overlapped case. if (date.begin >= range.begin && date.end <= range.end) { // Fully contained. removed[next_date] = true; } else if (date.end <= range.begin) { continue; // date is behind range, go to next date } else if (date.begin >= range.end) { break; // range is behind date, go to next range } } } RemoveDeletedMatches(removed, dates); } // Converts candidate dates and date ranges. void FillDateInstances( const UniLib& unilib, const std::vector& text, const DateAnnotationOptions& options, std::vector* date_matches, std::vector* datetime_parse_result_spans) { int i = 0; for (int j = 1; j < date_matches->size(); j++) { if (options.merge_adjacent_components && AreDateMatchesAdjacentAndMergeable(unilib, text, options.ignored_spans, date_matches->at(i), date_matches->at(j))) { MergeDateMatch(date_matches->at(i), &date_matches->at(j), true); } else { if (!IsBlacklistedDate(unilib, text, date_matches->at(i))) { DatetimeParseResultSpan datetime_parse_result_span; FillDateInstance(date_matches->at(i), &datetime_parse_result_span); datetime_parse_result_spans->push_back(datetime_parse_result_span); } } i = j; } if (!IsBlacklistedDate(unilib, text, date_matches->at(i))) { DatetimeParseResultSpan datetime_parse_result_span; FillDateInstance(date_matches->at(i), &datetime_parse_result_span); datetime_parse_result_spans->push_back(datetime_parse_result_span); } } void FillDateRangeInstances( const std::vector& date_range_matches, std::vector* datetime_parse_result_spans) { for (const DateRangeMatch& date_range_match : date_range_matches) { DatetimeParseResultSpan datetime_parse_result_span; FillDateRangeInstance(date_range_match, &datetime_parse_result_span); datetime_parse_result_spans->push_back(datetime_parse_result_span); } } // Fills `DatetimeParseResultSpan` from `DateMatch` and `DateRangeMatch` // instances. std::vector GetOutputAsAnnotationList( const UniLib& unilib, const DateExtractor& extractor, const std::vector& text, const DateAnnotationOptions& options) { std::vector datetime_parse_result_spans; std::vector date_matches = BuildDateMatches(text, extractor.output()); std::sort( date_matches.begin(), date_matches.end(), // Order by increasing begin, and decreasing end (decreasing length). [](const DateMatch& a, const DateMatch& b) { return (a.begin < b.begin || (a.begin == b.begin && a.end > b.end)); }); if (!date_matches.empty()) { RemoveDuplicatedDates(&date_matches); } if (options.enable_date_range) { std::vector date_range_matches = BuildDateRangeMatches(text, extractor.range_output()); if (!date_range_matches.empty()) { std::sort( date_range_matches.begin(), date_range_matches.end(), // Order by increasing begin, and decreasing end (decreasing length). [](const DateRangeMatch& a, const DateRangeMatch& b) { return (a.begin < b.begin || (a.begin == b.begin && a.end > b.end)); }); RemoveDuplicatedDates(&date_range_matches); } if (!date_matches.empty()) { MergeDateRangeAndDate(unilib, text, options.ignored_spans, date_matches, &date_range_matches); RemoveOverlappedDateByRange(date_range_matches, &date_matches); } FillDateRangeInstances(date_range_matches, &datetime_parse_result_spans); } if (!date_matches.empty()) { FillDateInstances(unilib, text, options, &date_matches, &datetime_parse_result_spans); } return datetime_parse_result_spans; } } // namespace std::vector DateParser::Parse( StringPiece text, const std::vector& tokens, const std::vector& locales, const DateAnnotationOptions& options) const { std::vector codepoint_offsets; const UnicodeText text_unicode = UTF8ToUnicodeText(text, /*do_copy=*/false); for (auto it = text_unicode.begin(); it != text_unicode.end(); it++) { codepoint_offsets.push_back(it); } codepoint_offsets.push_back(text_unicode.end()); DateExtractor extractor(codepoint_offsets, options, datetime_rules_); // Select locale matching rules. // Only use a shard if locales match or the shard doesn't specify a locale // restriction. std::vector locale_rules = SelectLocaleMatchingShards(datetime_rules_->rules(), rules_locales_, locales); if (locale_rules.empty()) { return {}; } grammar::Matcher matcher(&unilib_, datetime_rules_->rules(), locale_rules, &extractor); lexer_.Process(text_unicode, tokens, /*annotations=*/nullptr, &matcher); return GetOutputAsAnnotationList(unilib_, extractor, codepoint_offsets, options); } } // namespace libtextclassifier3::dates