795 lines
29 KiB
C++
795 lines
29 KiB
C++
/*
|
|
* Copyright (C) 2018 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "annotator/grammar/dates/parser.h"

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

#include "annotator/grammar/dates/extractor.h"
#include "annotator/grammar/dates/utils/date-match.h"
#include "annotator/grammar/dates/utils/date-utils.h"
#include "utils/base/integral_types.h"
#include "utils/base/logging.h"
#include "utils/base/macros.h"
#include "utils/grammar/lexer.h"
#include "utils/grammar/matcher.h"
#include "utils/grammar/rules_generated.h"
#include "utils/grammar/types.h"
#include "utils/strings/split.h"
#include "utils/strings/stringpiece.h"
|
|
|
|
namespace libtextclassifier3::dates {
|
|
namespace {
|
|
|
|
// Helper methods to validate individual components from a date match.
|
|
|
|
// Checks the validation requirement of a rule against a match.
|
|
// For example if the rule asks for `SPELLED_MONTH`, then we check that the
|
|
// match has the right flag.
|
|
bool CheckMatchValidationAndFlag(
|
|
const grammar::Match* match, const ExtractionRuleParameter* rule,
|
|
const ExtractionRuleParameter_::ExtractionValidation validation,
|
|
const NonterminalParameter_::Flag flag) {
|
|
if (rule == nullptr || (rule->validation() & validation) == 0) {
|
|
// No validation requirement.
|
|
return true;
|
|
}
|
|
const NonterminalParameter* nonterminal_parameter =
|
|
static_cast<const NonterminalMatch*>(match)
|
|
->nonterminal->nonterminal_parameter();
|
|
return (nonterminal_parameter != nullptr &&
|
|
(nonterminal_parameter->flag() & flag) != 0);
|
|
}
|
|
|
|
bool GenerateDate(const ExtractionRuleParameter* rule,
|
|
const grammar::Match* match, DateMatch* date) {
|
|
bool is_valid = true;
|
|
|
|
// Post check and assign date components.
|
|
grammar::Traverse(match, [rule, date, &is_valid](const grammar::Match* node) {
|
|
switch (node->type) {
|
|
case MatchType_YEAR: {
|
|
if (CheckMatchValidationAndFlag(
|
|
node, rule,
|
|
ExtractionRuleParameter_::ExtractionValidation_SPELLED_YEAR,
|
|
NonterminalParameter_::Flag_IS_SPELLED)) {
|
|
date->year_match = static_cast<const YearMatch*>(node);
|
|
date->year = date->year_match->value;
|
|
} else {
|
|
is_valid = false;
|
|
}
|
|
break;
|
|
}
|
|
case MatchType_MONTH: {
|
|
if (CheckMatchValidationAndFlag(
|
|
node, rule,
|
|
ExtractionRuleParameter_::ExtractionValidation_SPELLED_MONTH,
|
|
NonterminalParameter_::Flag_IS_SPELLED)) {
|
|
date->month_match = static_cast<const MonthMatch*>(node);
|
|
date->month = date->month_match->value;
|
|
} else {
|
|
is_valid = false;
|
|
}
|
|
break;
|
|
}
|
|
case MatchType_DAY: {
|
|
if (CheckMatchValidationAndFlag(
|
|
node, rule,
|
|
ExtractionRuleParameter_::ExtractionValidation_SPELLED_DAY,
|
|
NonterminalParameter_::Flag_IS_SPELLED)) {
|
|
date->day_match = static_cast<const DayMatch*>(node);
|
|
date->day = date->day_match->value;
|
|
} else {
|
|
is_valid = false;
|
|
}
|
|
break;
|
|
}
|
|
case MatchType_DAY_OF_WEEK: {
|
|
date->day_of_week_match = static_cast<const DayOfWeekMatch*>(node);
|
|
date->day_of_week =
|
|
static_cast<DayOfWeek>(date->day_of_week_match->value);
|
|
break;
|
|
}
|
|
case MatchType_TIME_VALUE: {
|
|
date->time_value_match = static_cast<const TimeValueMatch*>(node);
|
|
date->hour = date->time_value_match->hour;
|
|
date->minute = date->time_value_match->minute;
|
|
date->second = date->time_value_match->second;
|
|
date->fraction_second = date->time_value_match->fraction_second;
|
|
return false;
|
|
}
|
|
case MatchType_TIME_SPAN: {
|
|
date->time_span_match = static_cast<const TimeSpanMatch*>(node);
|
|
date->time_span_code = date->time_span_match->time_span_code;
|
|
return false;
|
|
}
|
|
case MatchType_TIME_ZONE_NAME: {
|
|
date->time_zone_name_match =
|
|
static_cast<const TimeZoneNameMatch*>(node);
|
|
date->time_zone_code = date->time_zone_name_match->time_zone_code;
|
|
return false;
|
|
}
|
|
case MatchType_TIME_ZONE_OFFSET: {
|
|
date->time_zone_offset_match =
|
|
static_cast<const TimeZoneOffsetMatch*>(node);
|
|
date->time_zone_offset = date->time_zone_offset_match->time_zone_offset;
|
|
return false;
|
|
}
|
|
case MatchType_RELATIVE_DATE: {
|
|
date->relative_match = static_cast<const RelativeMatch*>(node);
|
|
return false;
|
|
}
|
|
case MatchType_COMBINED_DIGITS: {
|
|
date->combined_digits_match =
|
|
static_cast<const CombinedDigitsMatch*>(node);
|
|
if (date->combined_digits_match->HasYear()) {
|
|
date->year = date->combined_digits_match->GetYear();
|
|
}
|
|
if (date->combined_digits_match->HasMonth()) {
|
|
date->month = date->combined_digits_match->GetMonth();
|
|
}
|
|
if (date->combined_digits_match->HasDay()) {
|
|
date->day = date->combined_digits_match->GetDay();
|
|
}
|
|
if (date->combined_digits_match->HasHour()) {
|
|
date->hour = date->combined_digits_match->GetHour();
|
|
}
|
|
if (date->combined_digits_match->HasMinute()) {
|
|
date->minute = date->combined_digits_match->GetMinute();
|
|
}
|
|
if (date->combined_digits_match->HasSecond()) {
|
|
date->second = date->combined_digits_match->GetSecond();
|
|
}
|
|
return false;
|
|
}
|
|
default:
|
|
// Expand node further.
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
});
|
|
|
|
if (is_valid) {
|
|
date->begin = match->codepoint_span.first;
|
|
date->end = match->codepoint_span.second;
|
|
date->priority = rule ? rule->priority_delta() : 0;
|
|
date->annotator_priority_score =
|
|
rule ? rule->annotator_priority_score() : 0.0;
|
|
}
|
|
return is_valid;
|
|
}
|
|
|
|
// Generates one endpoint (`from` or `to`) of a date range from `match`.
// The extraction rule is only taken from the match when it is a
// `MatchType_DATETIME` match; any other endpoint is generated without a rule.
bool GenerateFromOrToDateRange(const grammar::Match* match, DateMatch* date) {
  const ExtractionRuleParameter* rule = nullptr;
  if (match->type == MatchType_DATETIME) {
    rule = static_cast<const ExtractionMatch*>(match)->extraction_rule;
  }
  return GenerateDate(rule, match, date);
}
|
|
|
|
// Builds `date_range` from the range match `match` and its two endpoint
// matches `from` and `to`.
// The endpoints are generated in that order; generation stops with a warning
// and returns false as soon as one of them fails. On success the range takes
// the codepoint span of `match`.
bool GenerateDateRange(const grammar::Match* match, const grammar::Match* from,
                       const grammar::Match* to, DateRangeMatch* date_range) {
  const bool from_generated =
      GenerateFromOrToDateRange(from, &date_range->from);
  if (!from_generated) {
    TC3_LOG(WARNING) << "Failed to generate date for `from`.";
    return false;
  }

  const bool to_generated = GenerateFromOrToDateRange(to, &date_range->to);
  if (!to_generated) {
    TC3_LOG(WARNING) << "Failed to generate date for `to`.";
    return false;
  }

  const auto& span = match->codepoint_span;
  date_range->begin = span.first;
  date_range->end = span.second;
  return true;
}
|
|
|
|
// Normalizes the hour of `date` according to its time span match.
// A date without a time span match needs no normalization and is reported as
// successful; otherwise the result of `NormalizeHourByTimeSpan` is returned.
bool NormalizeHour(DateMatch* date) {
  const auto* time_span = date->time_span_match;
  return time_span == nullptr ||
         NormalizeHourByTimeSpan(time_span->time_span_spec, date);
}
|
|
|
|
// Marks the hour of `date` as am/pm-ambiguous where applicable.
// The ambiguity is set when the date has an hour in [1, 12], has no time span
// code, and the hour was not matched with a zero prefix.
void CheckAndSetAmbiguousHour(DateMatch* date) {
  if (!date->HasHour() || date->HasTimeSpanCode()) {
    return;
  }
  if (date->hour < 1 || date->hour > 12) {
    return;
  }

  const auto* time_value = date->time_value_match;
  const bool is_zero_prefixed = time_value != nullptr &&
                                time_value->hour_match != nullptr &&
                                time_value->hour_match->is_zero_prefixed;
  if (is_zero_prefixed) {
    return;
  }

  // Use am-pm ambiguity as default.
  date->SetAmbiguousHourProperties(2, 12);
}
|
|
|
|
// Normalizes a date candidate.
|
|
// Returns whether the candidate was successfully normalized.
|
|
bool NormalizeDate(DateMatch* date) {
|
|
// Normalize hour.
|
|
if (!NormalizeHour(date)) {
|
|
TC3_VLOG(ERROR) << "Hour normalization (according to time-span) failed."
|
|
<< date->DebugString();
|
|
return false;
|
|
}
|
|
CheckAndSetAmbiguousHour(date);
|
|
if (!date->IsValid()) {
|
|
TC3_VLOG(ERROR) << "Fields inside date instance are ill-formed "
|
|
<< date->DebugString();
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Copies the field from one DateMatch to another whose field is null. for
|
|
// example: if the from is "May 1, 8pm", and the to is "9pm", "May 1" will be
|
|
// copied to "to". Now we only copy fields for date range requirement.fv
|
|
void CopyFieldsForDateMatch(const DateMatch& from, DateMatch* to) {
|
|
if (from.time_span_match != nullptr && to->time_span_match == nullptr) {
|
|
to->time_span_match = from.time_span_match;
|
|
to->time_span_code = from.time_span_code;
|
|
}
|
|
if (from.month_match != nullptr && to->month_match == nullptr) {
|
|
to->month_match = from.month_match;
|
|
to->month = from.month;
|
|
}
|
|
}
|
|
|
|
// Normalizes a date range candidate.
|
|
// Returns whether the date range was successfully normalized.
|
|
bool NormalizeDateRange(DateRangeMatch* date_range) {
|
|
CopyFieldsForDateMatch(date_range->from, &date_range->to);
|
|
CopyFieldsForDateMatch(date_range->to, &date_range->from);
|
|
return (NormalizeDate(&date_range->from) && NormalizeDate(&date_range->to));
|
|
}
|
|
|
|
bool CheckDate(const DateMatch& date, const ExtractionRuleParameter* rule) {
|
|
// It's possible that "time_zone_name_match == NULL" when
|
|
// "HasTimeZoneCode() == true", or "time_zone_offset_match == NULL" when
|
|
// "HasTimeZoneOffset() == true" due to inference between endpoints, so we
|
|
// must check if they really exist before using them.
|
|
if (date.HasTimeZoneOffset()) {
|
|
if (date.HasTimeZoneCode()) {
|
|
if (date.time_zone_name_match != nullptr) {
|
|
TC3_CHECK(date.time_zone_name_match->time_zone_name_spec != nullptr);
|
|
const TimeZoneNameSpec* spec =
|
|
date.time_zone_name_match->time_zone_name_spec;
|
|
if (!spec->is_utc()) {
|
|
return false;
|
|
}
|
|
if (!spec->is_abbreviation()) {
|
|
return false;
|
|
}
|
|
}
|
|
} else if (date.time_zone_offset_match != nullptr) {
|
|
TC3_CHECK(date.time_zone_offset_match->time_zone_offset_param != nullptr);
|
|
const TimeZoneOffsetParameter* param =
|
|
date.time_zone_offset_match->time_zone_offset_param;
|
|
if (param->format() == TimeZoneOffsetParameter_::Format_FORMAT_H ||
|
|
param->format() == TimeZoneOffsetParameter_::Format_FORMAT_HH) {
|
|
return false;
|
|
}
|
|
if (!(rule->validation() &
|
|
ExtractionRuleParameter_::
|
|
ExtractionValidation_ALLOW_UNCONFIDENT_TIME_ZONE)) {
|
|
if (param->format() == TimeZoneOffsetParameter_::Format_FORMAT_H_MM ||
|
|
param->format() == TimeZoneOffsetParameter_::Format_FORMAT_HH_MM ||
|
|
param->format() == TimeZoneOffsetParameter_::Format_FORMAT_HMM) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Case: 1 April could be extracted as year 1, month april.
|
|
// We simply remove this case.
|
|
if (!date.HasBcAd() && date.year_match != nullptr && date.year < 1000) {
|
|
// We allow case like 11/5/01
|
|
if (date.HasMonth() && date.HasDay() &&
|
|
date.year_match->count_of_digits == 2) {
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Ignore the date if the year is larger than 9999 (The maximum number of 4
|
|
// digits).
|
|
if (date.year_match != nullptr && date.year > 9999) {
|
|
TC3_VLOG(ERROR) << "Year is greater than 9999.";
|
|
return false;
|
|
}
|
|
|
|
// Case: spelled may could be month 5, it also used very common as modal
|
|
// verbs. We ignore spelled may as month.
|
|
if ((rule->validation() &
|
|
ExtractionRuleParameter_::ExtractionValidation_SPELLED_MONTH) &&
|
|
date.month == 5 && !date.HasYear() && !date.HasDay()) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// Checks the characters directly surrounding the matched span.
// `text` holds one iterator per codepoint plus a trailing end position.
// If the rule of `output` asks for `LEFT_BOUND` / `RIGHT_BOUND` validation,
// the codepoint right before / right after the span must not be one of the
// separators '/', '-' or ':'.
// Returns true if the context is acceptable, which includes the cases where
// `output` has no rule or the rule has no boundary requirement.
bool CheckContext(const std::vector<UnicodeText::const_iterator>& text,
                  const DateExtractor::Output& output) {
  if (output.rule == nullptr) {
    // No rule, hence no boundary requirement.
    return true;
  }
  const uint32 validation = output.rule->validation();
  const bool check_left =
      (validation &
       ExtractionRuleParameter_::ExtractionValidation_LEFT_BOUND) != 0;
  const bool check_right =
      (validation &
       ExtractionRuleParameter_::ExtractionValidation_RIGHT_BOUND) != 0;

  // Nothing to check if we don't have any validation requirements for the
  // span boundaries.
  if (!check_left && !check_right) {
    return true;
  }

  // So far, we only check that the adjacent character cannot be a separator,
  // like /, - or :
  const auto is_separator = [](const char32 codepoint) {
    return codepoint == '/' || codepoint == '-' || codepoint == ':';
  };

  // Signed count, so that the bound checks below cannot wrap around when
  // `text` is empty.
  const int num_offsets = static_cast<int>(text.size());
  const int begin = output.match->codepoint_span.first;
  const int end = output.match->codepoint_span.second;

  // `text[begin - 1]` must refer to an actual codepoint, i.e. not to the
  // trailing end position.
  if (check_left && begin > 0 && begin < num_offsets &&
      is_separator(*text[begin - 1])) {
    return false;
  }

  // Last valid codepoint is at text.size() - 2 as we added the end position
  // of text for easier span extraction.
  if (check_right && end >= 0 && end < num_offsets - 1 &&
      is_separator(*text[end])) {
    return false;
  }

  return true;
}
|
|
|
|
// Validates a date match. Returns true if the candidate is valid.
|
|
bool ValidateDate(const std::vector<UnicodeText::const_iterator>& text,
|
|
const DateExtractor::Output& output, const DateMatch& date) {
|
|
if (!CheckDate(date, output.rule)) {
|
|
return false;
|
|
}
|
|
if (!CheckContext(text, output)) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Builds matched date instances from the grammar output.
|
|
std::vector<DateMatch> BuildDateMatches(
|
|
const std::vector<UnicodeText::const_iterator>& text,
|
|
const std::vector<DateExtractor::Output>& outputs) {
|
|
std::vector<DateMatch> result;
|
|
for (const DateExtractor::Output& output : outputs) {
|
|
DateMatch date;
|
|
if (GenerateDate(output.rule, output.match, &date)) {
|
|
if (!NormalizeDate(&date)) {
|
|
continue;
|
|
}
|
|
if (!ValidateDate(text, output, date)) {
|
|
continue;
|
|
}
|
|
result.push_back(date);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// Builds matched date range instances from the grammar output.
|
|
std::vector<DateRangeMatch> BuildDateRangeMatches(
|
|
const std::vector<UnicodeText::const_iterator>& text,
|
|
const std::vector<DateExtractor::RangeOutput>& range_outputs) {
|
|
std::vector<DateRangeMatch> result;
|
|
for (const DateExtractor::RangeOutput& range_output : range_outputs) {
|
|
DateRangeMatch date_range;
|
|
if (GenerateDateRange(range_output.match, range_output.from,
|
|
range_output.to, &date_range)) {
|
|
if (!NormalizeDateRange(&date_range)) {
|
|
continue;
|
|
}
|
|
result.push_back(date_range);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// Compacts `matches` in place, dropping every element whose flag in `removed`
// is set. The relative order of the kept elements is preserved.
// `removed[i]` refers to `(*matches)[i]`; elements without a corresponding
// flag (i.e. when `removed` is shorter than `matches`) are kept.
template <typename T>
void RemoveDeletedMatches(const std::vector<bool>& removed,
                          std::vector<T>* matches) {
  using Index = typename std::vector<T>::size_type;
  // Position the next kept element is written to.
  Index kept = 0;
  for (Index next = 0; next < matches->size(); ++next) {
    if (next < removed.size() && removed[next]) {
      continue;
    }
    if (kept != next) {
      // The source slot is either overwritten later or erased below, so its
      // content can be moved out.
      (*matches)[kept] = std::move((*matches)[next]);
    }
    ++kept;
  }
  matches->erase(matches->begin() + kept, matches->end());
}
|
|
|
|
// Removes duplicated date or date range instances from `matches`.
// Assumption: matches are sorted ascending by (begin, end).
// Every remaining candidate is compared with the following matches that start
// before the candidate ends:
//  * same span: the one with the lower priority is dropped (the later one on
//    equal priority),
//  * later match ends within the candidate: the later match is dropped,
//  * otherwise, if one of the two is a refinement of the other according to
//    `IsRefinement`, the second argument of the successful `IsRefinement` call
//    is dropped.
// Overlapping dates and date ranges are not handled against each other here.
template <typename T>
void RemoveDuplicatedDates(std::vector<T>* matches) {
  std::vector<bool> is_dropped(matches->size(), false);
  for (int current = 0; current < matches->size(); ++current) {
    if (is_dropped[current]) {
      continue;
    }
    const T& candidate = (*matches)[current];
    for (int other = current + 1; other < matches->size(); ++other) {
      if (is_dropped[other]) {
        continue;
      }
      const T& next = (*matches)[other];

      // No overlap anymore: nothing after `next` can overlap either.
      if (next.begin >= candidate.end) {
        break;
      }

      // Decide which of the two, if any, has to go.
      bool drop_candidate = false;
      bool drop_next = false;
      if (candidate.begin == next.begin && candidate.end == next.end) {
        // Same span of text: the priority decides.
        drop_candidate = candidate.GetPriority() < next.GetPriority();
        drop_next = !drop_candidate;
      } else if (next.end <= candidate.end) {
        // `next` is fully covered by `candidate`.
        drop_next = true;
      } else if (IsRefinement(candidate, next)) {
        drop_next = true;
      } else if (IsRefinement(next, candidate)) {
        drop_candidate = true;
      }

      if (drop_candidate) {
        // The candidate is gone, continue with the next candidate.
        is_dropped[current] = true;
        break;
      }
      if (drop_next) {
        is_dropped[other] = true;
      }
    }
  }
  RemoveDeletedMatches(is_dropped, matches);
}
|
|
|
|
// Filters out simple overtriggering simple matches.
|
|
bool IsBlacklistedDate(const UniLib& unilib,
|
|
const std::vector<UnicodeText::const_iterator>& text,
|
|
const DateMatch& match) {
|
|
const int begin = match.begin;
|
|
const int end = match.end;
|
|
if (end - begin != 3) {
|
|
return false;
|
|
}
|
|
|
|
std::string text_lower =
|
|
unilib
|
|
.ToLowerText(
|
|
UTF8ToUnicodeText(text[begin].utf8_data(),
|
|
text[end].utf8_data() - text[begin].utf8_data(),
|
|
/*do_copy=*/false))
|
|
.ToUTF8String();
|
|
|
|
// "sun" is not a good abbreviation for a standalone day of the week.
|
|
if (match.IsStandaloneRelativeDayOfWeek() &&
|
|
(text_lower == "sun" || text_lower == "mon")) {
|
|
return true;
|
|
}
|
|
|
|
// "mar" is not a good abbreviation for single month.
|
|
if (match.HasMonth() && text_lower == "mar") {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Checks if two date matches are adjacent and mergeable.
|
|
bool AreDateMatchesAdjacentAndMergeable(
|
|
const UniLib& unilib, const std::vector<UnicodeText::const_iterator>& text,
|
|
const std::vector<std::string>& ignored_spans, const DateMatch& prev,
|
|
const DateMatch& next) {
|
|
// Check the context between the two matches.
|
|
if (next.begin <= prev.end) {
|
|
// The two matches are not adjacent.
|
|
return false;
|
|
}
|
|
UnicodeText span;
|
|
for (int i = prev.end; i < next.begin; i++) {
|
|
const char32 codepoint = *text[i];
|
|
if (unilib.IsWhitespace(codepoint)) {
|
|
continue;
|
|
}
|
|
span.push_back(unilib.ToLower(codepoint));
|
|
}
|
|
if (span.empty()) {
|
|
return true;
|
|
}
|
|
const std::string span_text = span.ToUTF8String();
|
|
bool matched = false;
|
|
for (const std::string& ignored_span : ignored_spans) {
|
|
if (span_text == ignored_span) {
|
|
matched = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!matched) {
|
|
return false;
|
|
}
|
|
return IsDateMatchMergeable(prev, next);
|
|
}
|
|
|
|
// Merges adjacent date and date range.
|
|
// For e.g. Monday, 5-10pm, the date "Monday" and the time range "5-10pm" will
|
|
// be merged
|
|
void MergeDateRangeAndDate(const UniLib& unilib,
|
|
const std::vector<UnicodeText::const_iterator>& text,
|
|
const std::vector<std::string>& ignored_spans,
|
|
const std::vector<DateMatch>& dates,
|
|
std::vector<DateRangeMatch>* date_ranges) {
|
|
// For each range, check the date before or after the it to see if they could
|
|
// be merged. Both the range and date array are sorted, so we only need to
|
|
// scan the date array once.
|
|
int next_date = 0;
|
|
for (int i = 0; i < date_ranges->size(); i++) {
|
|
DateRangeMatch* date_range = &date_ranges->at(i);
|
|
// So far we only merge time range with a date.
|
|
if (!date_range->from.HasHour()) {
|
|
continue;
|
|
}
|
|
|
|
for (; next_date < dates.size(); next_date++) {
|
|
const DateMatch& date = dates[next_date];
|
|
|
|
// If the range is before the date, we check whether `date_range->to` can
|
|
// be merged with the date.
|
|
if (date_range->end <= date.begin) {
|
|
DateMatch merged_date = date;
|
|
if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans,
|
|
date_range->to, date)) {
|
|
MergeDateMatch(date_range->to, &merged_date, /*update_span=*/true);
|
|
date_range->to = merged_date;
|
|
date_range->end = date_range->to.end;
|
|
MergeDateMatch(date, &date_range->from, /*update_span=*/false);
|
|
next_date++;
|
|
|
|
// Check the second date after the range to see if it could be merged
|
|
// further. For example: 10-11pm, Monday, May 15. 10-11pm is merged
|
|
// with Monday and then we check that it could be merged with May 15
|
|
// as well.
|
|
if (next_date < dates.size()) {
|
|
DateMatch next_match = dates[next_date];
|
|
if (AreDateMatchesAdjacentAndMergeable(
|
|
unilib, text, ignored_spans, date_range->to, next_match)) {
|
|
MergeDateMatch(date_range->to, &next_match, /*update_span=*/true);
|
|
date_range->to = next_match;
|
|
date_range->end = date_range->to.end;
|
|
MergeDateMatch(dates[next_date], &date_range->from,
|
|
/*update_span=*/false);
|
|
next_date++;
|
|
}
|
|
}
|
|
}
|
|
// Since the range is before the date, we try to check if the next range
|
|
// could be merged with the current date.
|
|
break;
|
|
} else if (date_range->end > date.end && date_range->begin > date.begin) {
|
|
// If the range is after the date, we check if `date_range.from` can be
|
|
// merged with the date. Here is a special case, the date before range
|
|
// could be partially overlapped. This is because the range.from could
|
|
// be extracted as year in date. For example: March 3, 10-11pm is
|
|
// extracted as date March 3, 2010 and the range 10-11pm. In this
|
|
// case, we simply clear the year from date.
|
|
DateMatch merged_date = date;
|
|
if (date.HasYear() &&
|
|
date.year_match->codepoint_span.second > date_range->begin) {
|
|
merged_date.year_match = nullptr;
|
|
merged_date.year = NO_VAL;
|
|
merged_date.end = date.year_match->match_offset;
|
|
}
|
|
// Check and merge the range and the date before the range.
|
|
if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans,
|
|
merged_date, date_range->from)) {
|
|
MergeDateMatch(merged_date, &date_range->from, /*update_span=*/true);
|
|
date_range->begin = date_range->from.begin;
|
|
MergeDateMatch(merged_date, &date_range->to, /*update_span=*/false);
|
|
|
|
// Check if the second date before the range can be merged as well.
|
|
if (next_date > 0) {
|
|
DateMatch prev_match = dates[next_date - 1];
|
|
if (prev_match.end <= date_range->from.begin) {
|
|
if (AreDateMatchesAdjacentAndMergeable(unilib, text,
|
|
ignored_spans, prev_match,
|
|
date_range->from)) {
|
|
MergeDateMatch(prev_match, &date_range->from,
|
|
/*update_span=*/true);
|
|
date_range->begin = date_range->from.begin;
|
|
MergeDateMatch(prev_match, &date_range->to,
|
|
/*update_span=*/false);
|
|
}
|
|
}
|
|
}
|
|
next_date++;
|
|
break;
|
|
} else {
|
|
// Since the date is before the date range, we move to the next date
|
|
// to check if it could be merged with the current range.
|
|
continue;
|
|
}
|
|
} else {
|
|
// The date is either fully overlapped by the date range or the date
|
|
// span end is after the date range. Move to the next date in both
|
|
// cases.
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Removes the dates which are part of a range. e.g. in "May 1 - 3", the date
|
|
// "May 1" is fully contained in the range.
|
|
void RemoveOverlappedDateByRange(const std::vector<DateRangeMatch>& ranges,
|
|
std::vector<DateMatch>* dates) {
|
|
int next_date = 0;
|
|
std::vector<bool> removed(dates->size(), false);
|
|
for (int i = 0; i < ranges.size(); ++i) {
|
|
const auto& range = ranges[i];
|
|
for (; next_date < dates->size(); ++next_date) {
|
|
const auto& date = dates->at(next_date);
|
|
// So far we don't touch the partially overlapped case.
|
|
if (date.begin >= range.begin && date.end <= range.end) {
|
|
// Fully contained.
|
|
removed[next_date] = true;
|
|
} else if (date.end <= range.begin) {
|
|
continue; // date is behind range, go to next date
|
|
} else if (date.begin >= range.end) {
|
|
break; // range is behind date, go to next range
|
|
}
|
|
}
|
|
}
|
|
RemoveDeletedMatches(removed, dates);
|
|
}
|
|
|
|
// Converts candidate dates and date ranges.
|
|
void FillDateInstances(
|
|
const UniLib& unilib, const std::vector<UnicodeText::const_iterator>& text,
|
|
const DateAnnotationOptions& options, std::vector<DateMatch>* date_matches,
|
|
std::vector<DatetimeParseResultSpan>* datetime_parse_result_spans) {
|
|
int i = 0;
|
|
for (int j = 1; j < date_matches->size(); j++) {
|
|
if (options.merge_adjacent_components &&
|
|
AreDateMatchesAdjacentAndMergeable(unilib, text, options.ignored_spans,
|
|
date_matches->at(i),
|
|
date_matches->at(j))) {
|
|
MergeDateMatch(date_matches->at(i), &date_matches->at(j), true);
|
|
} else {
|
|
if (!IsBlacklistedDate(unilib, text, date_matches->at(i))) {
|
|
DatetimeParseResultSpan datetime_parse_result_span;
|
|
FillDateInstance(date_matches->at(i), &datetime_parse_result_span);
|
|
datetime_parse_result_spans->push_back(datetime_parse_result_span);
|
|
}
|
|
}
|
|
i = j;
|
|
}
|
|
if (!IsBlacklistedDate(unilib, text, date_matches->at(i))) {
|
|
DatetimeParseResultSpan datetime_parse_result_span;
|
|
FillDateInstance(date_matches->at(i), &datetime_parse_result_span);
|
|
datetime_parse_result_spans->push_back(datetime_parse_result_span);
|
|
}
|
|
}
|
|
|
|
void FillDateRangeInstances(
|
|
const std::vector<DateRangeMatch>& date_range_matches,
|
|
std::vector<DatetimeParseResultSpan>* datetime_parse_result_spans) {
|
|
for (const DateRangeMatch& date_range_match : date_range_matches) {
|
|
DatetimeParseResultSpan datetime_parse_result_span;
|
|
FillDateRangeInstance(date_range_match, &datetime_parse_result_span);
|
|
datetime_parse_result_spans->push_back(datetime_parse_result_span);
|
|
}
|
|
}
|
|
|
|
// Fills `DatetimeParseResultSpan` from `DateMatch` and `DateRangeMatch`
|
|
// instances.
|
|
std::vector<DatetimeParseResultSpan> GetOutputAsAnnotationList(
|
|
const UniLib& unilib, const DateExtractor& extractor,
|
|
const std::vector<UnicodeText::const_iterator>& text,
|
|
const DateAnnotationOptions& options) {
|
|
std::vector<DatetimeParseResultSpan> datetime_parse_result_spans;
|
|
std::vector<DateMatch> date_matches =
|
|
BuildDateMatches(text, extractor.output());
|
|
|
|
std::sort(
|
|
date_matches.begin(), date_matches.end(),
|
|
// Order by increasing begin, and decreasing end (decreasing length).
|
|
[](const DateMatch& a, const DateMatch& b) {
|
|
return (a.begin < b.begin || (a.begin == b.begin && a.end > b.end));
|
|
});
|
|
|
|
if (!date_matches.empty()) {
|
|
RemoveDuplicatedDates(&date_matches);
|
|
}
|
|
|
|
if (options.enable_date_range) {
|
|
std::vector<DateRangeMatch> date_range_matches =
|
|
BuildDateRangeMatches(text, extractor.range_output());
|
|
|
|
if (!date_range_matches.empty()) {
|
|
std::sort(
|
|
date_range_matches.begin(), date_range_matches.end(),
|
|
// Order by increasing begin, and decreasing end (decreasing length).
|
|
[](const DateRangeMatch& a, const DateRangeMatch& b) {
|
|
return (a.begin < b.begin || (a.begin == b.begin && a.end > b.end));
|
|
});
|
|
RemoveDuplicatedDates(&date_range_matches);
|
|
}
|
|
|
|
if (!date_matches.empty()) {
|
|
MergeDateRangeAndDate(unilib, text, options.ignored_spans, date_matches,
|
|
&date_range_matches);
|
|
RemoveOverlappedDateByRange(date_range_matches, &date_matches);
|
|
}
|
|
FillDateRangeInstances(date_range_matches, &datetime_parse_result_spans);
|
|
}
|
|
|
|
if (!date_matches.empty()) {
|
|
FillDateInstances(unilib, text, options, &date_matches,
|
|
&datetime_parse_result_spans);
|
|
}
|
|
return datetime_parse_result_spans;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
// Parses dates and date ranges from `text`.
// The text is tokenized by the caller (`tokens`); the grammar rules are
// restricted to the shards matching `locales`. Returns an empty result if no
// rule shard matches the locales, otherwise the annotations derived from the
// matches the extractor collected while the lexer processed the text.
std::vector<DatetimeParseResultSpan> DateParser::Parse(
    StringPiece text, const std::vector<Token>& tokens,
    const std::vector<Locale>& locales,
    const DateAnnotationOptions& options) const {
  const UnicodeText text_unicode = UTF8ToUnicodeText(text,
                                                     /*do_copy=*/false);

  // One iterator per codepoint, followed by the end position of the text.
  std::vector<UnicodeText::const_iterator> codepoint_offsets;
  auto cursor = text_unicode.begin();
  while (cursor != text_unicode.end()) {
    codepoint_offsets.push_back(cursor);
    ++cursor;
  }
  codepoint_offsets.push_back(text_unicode.end());

  DateExtractor extractor(codepoint_offsets, options, datetime_rules_);

  // Select locale matching rules.
  // Only use a shard if locales match or the shard doesn't specify a locale
  // restriction.
  std::vector<const grammar::RulesSet_::Rules*> locale_rules =
      SelectLocaleMatchingShards(datetime_rules_->rules(), rules_locales_,
                                 locales);
  if (locale_rules.empty()) {
    return {};
  }

  // The matcher reports its matches to the extractor while the lexer feeds it.
  grammar::Matcher matcher(&unilib_, datetime_rules_->rules(), locale_rules,
                           &extractor);
  lexer_.Process(text_unicode, tokens, /*annotations=*/nullptr, &matcher);

  return GetOutputAsAnnotationList(unilib_, extractor, codepoint_offsets,
                                   options);
}
|
|
|
|
} // namespace libtextclassifier3::dates
|