914 lines
31 KiB
C++
914 lines
31 KiB
C++
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
#include "annotator/grammar/dates/extractor.h"
|
|
|
|
#include <initializer_list>
|
|
#include <map>
|
|
|
|
#include "annotator/grammar/dates/utils/date-match.h"
|
|
#include "annotator/grammar/dates/utils/date-utils.h"
|
|
#include "utils/base/casts.h"
|
|
#include "utils/base/logging.h"
|
|
#include "utils/strings/numbers.h"
|
|
|
|
namespace libtextclassifier3::dates {
|
|
namespace {
|
|
|
|
// Helper struct for time-related components.
|
|
// Extracts all subnodes of a specified type.
|
|
struct MatchComponents {
|
|
MatchComponents(const grammar::Match* root,
|
|
std::initializer_list<int16> types)
|
|
: root(root),
|
|
components(grammar::SelectAll(
|
|
root, [root, &types](const grammar::Match* node) {
|
|
if (node == root || node->type == grammar::Match::kUnknownType) {
|
|
return false;
|
|
}
|
|
for (const int64 type : types) {
|
|
if (node->type == type) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
})) {}
|
|
|
|
// Returns the index of the first submatch of the specified type or -1 if not
|
|
// found.
|
|
int IndexOf(const int16 type, const int start_index = 0) const {
|
|
for (int i = start_index; i < components.size(); i++) {
|
|
if (components[i]->type == type) {
|
|
return i;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
// Returns the first submatch of the specified type, or nullptr if not found.
|
|
template <typename T>
|
|
const T* SubmatchOf(const int16 type, const int start_index = 0) const {
|
|
return SubmatchAt<T>(IndexOf(type, start_index));
|
|
}
|
|
|
|
template <typename T>
|
|
const T* SubmatchAt(const int index) const {
|
|
if (index < 0) {
|
|
return nullptr;
|
|
}
|
|
return static_cast<const T*>(components[index]);
|
|
}
|
|
|
|
const grammar::Match* root;
|
|
std::vector<const grammar::Match*> components;
|
|
};
|
|
|
|
// Helper method to check whether a time value has valid components.
|
|
bool IsValidTimeValue(const TimeValueMatch& time_value) {
|
|
// Can only specify seconds if minutes are present.
|
|
if (time_value.minute == NO_VAL && time_value.second != NO_VAL) {
|
|
return false;
|
|
}
|
|
// Can only specify fraction of seconds if seconds are present.
|
|
if (time_value.second == NO_VAL && time_value.fraction_second >= 0.0) {
|
|
return false;
|
|
}
|
|
|
|
const int8 h = time_value.hour;
|
|
const int8 m = (time_value.minute < 0 ? 0 : time_value.minute);
|
|
const int8 s = (time_value.second < 0 ? 0 : time_value.second);
|
|
const double f =
|
|
(time_value.fraction_second < 0.0 ? 0.0 : time_value.fraction_second);
|
|
|
|
// Check value bounds.
|
|
if (h == NO_VAL || h > 24 || m > 59 || s > 60) {
|
|
return false;
|
|
}
|
|
if (h == 24 && (m != 0 || s != 0 || f > 0.0)) {
|
|
return false;
|
|
}
|
|
if (s == 60 && m != 59) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
int ParseLeadingDec32Value(const char* c_str) {
|
|
int value;
|
|
if (ParseInt32(c_str, &value)) {
|
|
return value;
|
|
}
|
|
return NO_VAL;
|
|
}
|
|
|
|
double ParseLeadingDoubleValue(const char* c_str) {
|
|
double value;
|
|
if (ParseDouble(c_str, &value)) {
|
|
return value;
|
|
}
|
|
return NO_VAL;
|
|
}
|
|
|
|
// Extracts digits as an integer and adds a typed match accordingly.
|
|
template <typename T>
|
|
void CheckDigits(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal, StringPiece match_text,
|
|
grammar::Matcher* matcher) {
|
|
TC3_CHECK(match->IsUnaryRule());
|
|
const int value = ParseLeadingDec32Value(match_text.ToString().c_str());
|
|
if (!T::IsValid(value)) {
|
|
return;
|
|
}
|
|
const int num_digits = match_text.size();
|
|
T* result = matcher->AllocateAndInitMatch<T>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
result->Reset();
|
|
result->nonterminal = nonterminal;
|
|
result->value = value;
|
|
result->count_of_digits = num_digits;
|
|
result->is_zero_prefixed = (num_digits >= 2 && match_text[0] == '0');
|
|
matcher->AddMatch(result);
|
|
}
|
|
|
|
// Extracts digits as a decimal (as fraction, as if a "0." is prefixed) and
|
|
// adds a typed match to the `er accordingly.
|
|
template <typename T>
|
|
void CheckDigitsAsFraction(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
StringPiece match_text, grammar::Matcher* matcher) {
|
|
TC3_CHECK(match->IsUnaryRule());
|
|
// TODO(smillius): Should should be achievable in a more straight-forward way.
|
|
const double value =
|
|
ParseLeadingDoubleValue(("0." + match_text.ToString()).data());
|
|
if (!T::IsValid(value)) {
|
|
return;
|
|
}
|
|
T* result = matcher->AllocateAndInitMatch<T>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
result->Reset();
|
|
result->nonterminal = nonterminal;
|
|
result->value = value;
|
|
result->count_of_digits = match_text.size();
|
|
matcher->AddMatch(result);
|
|
}
|
|
|
|
// Extracts consecutive digits as multiple integers according to a format and
|
|
// adds a type match to the matcher accordingly.
|
|
template <typename T>
|
|
void CheckCombinedDigits(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
StringPiece match_text, grammar::Matcher* matcher) {
|
|
TC3_CHECK(match->IsUnaryRule());
|
|
const std::string& format =
|
|
nonterminal->nonterminal_parameter()->combined_digits_format()->str();
|
|
if (match_text.size() != format.size()) {
|
|
return;
|
|
}
|
|
|
|
static std::map<char, CombinedDigitsMatch::Index>& kCombinedDigitsMatchIndex =
|
|
*[]() {
|
|
return new std::map<char, CombinedDigitsMatch::Index>{
|
|
{'Y', CombinedDigitsMatch::INDEX_YEAR},
|
|
{'M', CombinedDigitsMatch::INDEX_MONTH},
|
|
{'D', CombinedDigitsMatch::INDEX_DAY},
|
|
{'h', CombinedDigitsMatch::INDEX_HOUR},
|
|
{'m', CombinedDigitsMatch::INDEX_MINUTE},
|
|
{'s', CombinedDigitsMatch::INDEX_SECOND}};
|
|
}();
|
|
|
|
struct Segment {
|
|
const int index;
|
|
const int length;
|
|
const int value;
|
|
};
|
|
std::vector<Segment> segments;
|
|
int slice_start = 0;
|
|
while (slice_start < format.size()) {
|
|
int slice_end = slice_start + 1;
|
|
// Advace right as long as we have the same format character.
|
|
while (slice_end < format.size() &&
|
|
format[slice_start] == format[slice_end]) {
|
|
slice_end++;
|
|
}
|
|
|
|
const int slice_length = slice_end - slice_start;
|
|
const int value = ParseLeadingDec32Value(
|
|
std::string(match_text.data() + slice_start, slice_length).c_str());
|
|
|
|
auto index = kCombinedDigitsMatchIndex.find(format[slice_start]);
|
|
if (index == kCombinedDigitsMatchIndex.end()) {
|
|
return;
|
|
}
|
|
if (!T::IsValid(index->second, value)) {
|
|
return;
|
|
}
|
|
segments.push_back(Segment{index->second, slice_length, value});
|
|
slice_start = slice_end;
|
|
}
|
|
T* result = matcher->AllocateAndInitMatch<T>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
result->Reset();
|
|
result->nonterminal = nonterminal;
|
|
for (const Segment& segment : segments) {
|
|
result->values[segment.index] = segment.value;
|
|
}
|
|
result->count_of_digits = match_text.size();
|
|
result->is_zero_prefixed =
|
|
(match_text[0] == '0' && segments.front().length >= 2);
|
|
matcher->AddMatch(result);
|
|
}
|
|
|
|
// Retrieves the corresponding value from an associated term-value mapping for
|
|
// the nonterminal and adds a typed match to the matcher accordingly.
|
|
template <typename T>
|
|
void CheckMappedValue(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
const TermValueMatch* term =
|
|
grammar::SelectFirstOfType<TermValueMatch>(match, MatchType_TERM_VALUE);
|
|
if (term == nullptr) {
|
|
return;
|
|
}
|
|
const int value = term->term_value->value();
|
|
if (!T::IsValid(value)) {
|
|
return;
|
|
}
|
|
T* result = matcher->AllocateAndInitMatch<T>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
result->Reset();
|
|
result->nonterminal = nonterminal;
|
|
result->value = value;
|
|
matcher->AddMatch(result);
|
|
}
|
|
|
|
// Checks if there is an associated value in the corresponding nonterminal and
|
|
// adds a typed match to the matcher accordingly.
|
|
template <typename T>
|
|
void CheckDirectValue(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
const int value = nonterminal->value()->value();
|
|
if (!T::IsValid(value)) {
|
|
return;
|
|
}
|
|
T* result = matcher->AllocateAndInitMatch<T>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
result->Reset();
|
|
result->nonterminal = nonterminal;
|
|
result->value = value;
|
|
matcher->AddMatch(result);
|
|
}
|
|
|
|
template <typename T>
|
|
void CheckAndAddDirectOrMappedValue(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
if (nonterminal->value() != nullptr) {
|
|
CheckDirectValue<T>(match, nonterminal, matcher);
|
|
} else {
|
|
CheckMappedValue<T>(match, nonterminal, matcher);
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
void CheckAndAddNumericValue(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
StringPiece match_text,
|
|
grammar::Matcher* matcher) {
|
|
if (nonterminal->nonterminal_parameter() != nullptr &&
|
|
nonterminal->nonterminal_parameter()->flag() &
|
|
NonterminalParameter_::Flag_IS_SPELLED) {
|
|
CheckMappedValue<T>(match, nonterminal, matcher);
|
|
} else {
|
|
CheckDigits<T>(match, nonterminal, match_text, matcher);
|
|
}
|
|
}
|
|
|
|
// Tries to parse as digital time value.
|
|
bool ParseDigitalTimeValue(const std::vector<UnicodeText::const_iterator>& text,
|
|
const MatchComponents& components,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
// Required fields.
|
|
const HourMatch* hour = components.SubmatchOf<HourMatch>(MatchType_HOUR);
|
|
if (hour == nullptr || hour->count_of_digits == 0) {
|
|
return false;
|
|
}
|
|
|
|
// Optional fields.
|
|
const MinuteMatch* minute =
|
|
components.SubmatchOf<MinuteMatch>(MatchType_MINUTE);
|
|
if (minute != nullptr && minute->count_of_digits == 0) {
|
|
return false;
|
|
}
|
|
const SecondMatch* second =
|
|
components.SubmatchOf<SecondMatch>(MatchType_SECOND);
|
|
if (second != nullptr && second->count_of_digits == 0) {
|
|
return false;
|
|
}
|
|
const FractionSecondMatch* fraction_second =
|
|
components.SubmatchOf<FractionSecondMatch>(MatchType_FRACTION_SECOND);
|
|
if (fraction_second != nullptr && fraction_second->count_of_digits == 0) {
|
|
return false;
|
|
}
|
|
|
|
// Validation.
|
|
uint32 validation = nonterminal->time_value_parameter()->validation();
|
|
const grammar::Match* end = hour;
|
|
if (minute != nullptr) {
|
|
if (second != nullptr) {
|
|
if (fraction_second != nullptr) {
|
|
end = fraction_second;
|
|
} else {
|
|
end = second;
|
|
}
|
|
} else {
|
|
end = minute;
|
|
}
|
|
}
|
|
|
|
// Check if there is any extra space between h m s f.
|
|
if ((validation &
|
|
TimeValueParameter_::TimeValueValidation_ALLOW_EXTRA_SPACE) == 0) {
|
|
// Check whether there is whitespace between token.
|
|
if (minute != nullptr && minute->HasLeadingWhitespace()) {
|
|
return false;
|
|
}
|
|
if (second != nullptr && second->HasLeadingWhitespace()) {
|
|
return false;
|
|
}
|
|
if (fraction_second != nullptr && fraction_second->HasLeadingWhitespace()) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Check if there is any ':' or '.' as a prefix or suffix.
|
|
if (validation &
|
|
TimeValueParameter_::TimeValueValidation_DISALLOW_COLON_DOT_CONTEXT) {
|
|
const int begin_pos = hour->codepoint_span.first;
|
|
const int end_pos = end->codepoint_span.second;
|
|
if (begin_pos > 1 &&
|
|
(*text[begin_pos - 1] == ':' || *text[begin_pos - 1] == '.') &&
|
|
isdigit(*text[begin_pos - 2])) {
|
|
return false;
|
|
}
|
|
// Last valid codepoint is at text.size() - 2 as we added the end position
|
|
// of text for easier span extraction.
|
|
if (end_pos < text.size() - 2 &&
|
|
(*text[end_pos] == ':' || *text[end_pos] == '.') &&
|
|
isdigit(*text[end_pos + 1])) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
TimeValueMatch time_value;
|
|
time_value.Init(components.root->lhs, components.root->codepoint_span,
|
|
components.root->match_offset);
|
|
time_value.Reset();
|
|
time_value.hour_match = hour;
|
|
time_value.minute_match = minute;
|
|
time_value.second_match = second;
|
|
time_value.fraction_second_match = fraction_second;
|
|
time_value.is_hour_zero_prefixed = hour->is_zero_prefixed;
|
|
time_value.is_minute_one_digit =
|
|
(minute != nullptr && minute->count_of_digits == 1);
|
|
time_value.is_second_one_digit =
|
|
(second != nullptr && second->count_of_digits == 1);
|
|
time_value.hour = hour->value;
|
|
time_value.minute = (minute != nullptr ? minute->value : NO_VAL);
|
|
time_value.second = (second != nullptr ? second->value : NO_VAL);
|
|
time_value.fraction_second =
|
|
(fraction_second != nullptr ? fraction_second->value : NO_VAL);
|
|
|
|
if (!IsValidTimeValue(time_value)) {
|
|
return false;
|
|
}
|
|
|
|
TimeValueMatch* result = matcher->AllocateMatch<TimeValueMatch>();
|
|
*result = time_value;
|
|
matcher->AddMatch(result);
|
|
return true;
|
|
}
|
|
|
|
// Tries to parsing a time from spelled out time components.
|
|
bool ParseSpelledTimeValue(const MatchComponents& components,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
// Required fields.
|
|
const HourMatch* hour = components.SubmatchOf<HourMatch>(MatchType_HOUR);
|
|
if (hour == nullptr || hour->count_of_digits != 0) {
|
|
return false;
|
|
}
|
|
// Optional fields.
|
|
const MinuteMatch* minute =
|
|
components.SubmatchOf<MinuteMatch>(MatchType_MINUTE);
|
|
if (minute != nullptr && minute->count_of_digits != 0) {
|
|
return false;
|
|
}
|
|
const SecondMatch* second =
|
|
components.SubmatchOf<SecondMatch>(MatchType_SECOND);
|
|
if (second != nullptr && second->count_of_digits != 0) {
|
|
return false;
|
|
}
|
|
|
|
uint32 validation = nonterminal->time_value_parameter()->validation();
|
|
// Check if there is any extra space between h m s.
|
|
if ((validation &
|
|
TimeValueParameter_::TimeValueValidation_ALLOW_EXTRA_SPACE) == 0) {
|
|
// Check whether there is whitespace between token.
|
|
if (minute != nullptr && minute->HasLeadingWhitespace()) {
|
|
return false;
|
|
}
|
|
if (second != nullptr && second->HasLeadingWhitespace()) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
TimeValueMatch time_value;
|
|
time_value.Init(components.root->lhs, components.root->codepoint_span,
|
|
components.root->match_offset);
|
|
time_value.Reset();
|
|
time_value.hour_match = hour;
|
|
time_value.minute_match = minute;
|
|
time_value.second_match = second;
|
|
time_value.is_hour_zero_prefixed = hour->is_zero_prefixed;
|
|
time_value.is_minute_one_digit =
|
|
(minute != nullptr && minute->count_of_digits == 1);
|
|
time_value.is_second_one_digit =
|
|
(second != nullptr && second->count_of_digits == 1);
|
|
time_value.hour = hour->value;
|
|
time_value.minute = (minute != nullptr ? minute->value : NO_VAL);
|
|
time_value.second = (second != nullptr ? second->value : NO_VAL);
|
|
|
|
if (!IsValidTimeValue(time_value)) {
|
|
return false;
|
|
}
|
|
|
|
TimeValueMatch* result = matcher->AllocateMatch<TimeValueMatch>();
|
|
*result = time_value;
|
|
matcher->AddMatch(result);
|
|
return true;
|
|
}
|
|
|
|
// Reconstructs and validates a time value from a match.
|
|
void CheckTimeValue(const std::vector<UnicodeText::const_iterator>& text,
|
|
const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
MatchComponents components(
|
|
match, {MatchType_HOUR, MatchType_MINUTE, MatchType_SECOND,
|
|
MatchType_FRACTION_SECOND});
|
|
if (ParseDigitalTimeValue(text, components, nonterminal, matcher)) {
|
|
return;
|
|
}
|
|
if (ParseSpelledTimeValue(components, nonterminal, matcher)) {
|
|
return;
|
|
}
|
|
}
|
|
|
|
// Validates a time span match.
|
|
void CheckTimeSpan(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
const TermValueMatch* ts_name =
|
|
grammar::SelectFirstOfType<TermValueMatch>(match, MatchType_TERM_VALUE);
|
|
const TermValue* term_value = ts_name->term_value;
|
|
TC3_CHECK(term_value != nullptr);
|
|
TC3_CHECK(term_value->time_span_spec() != nullptr);
|
|
const TimeSpanSpec* ts_spec = term_value->time_span_spec();
|
|
TimeSpanMatch* time_span = matcher->AllocateAndInitMatch<TimeSpanMatch>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
time_span->Reset();
|
|
time_span->nonterminal = nonterminal;
|
|
time_span->time_span_spec = ts_spec;
|
|
time_span->time_span_code = ts_spec->code();
|
|
matcher->AddMatch(time_span);
|
|
}
|
|
|
|
// Validates a time period match.
|
|
void CheckTimePeriod(const std::vector<UnicodeText::const_iterator>& text,
|
|
const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
int period_value = NO_VAL;
|
|
|
|
// If a value mapping exists, use it.
|
|
if (nonterminal->value() != nullptr) {
|
|
period_value = nonterminal->value()->value();
|
|
} else if (const TermValueMatch* term =
|
|
grammar::SelectFirstOfType<TermValueMatch>(
|
|
match, MatchType_TERM_VALUE)) {
|
|
period_value = term->term_value->value();
|
|
} else if (const grammar::Match* digits =
|
|
grammar::SelectFirstOfType<grammar::Match>(
|
|
match, grammar::Match::kDigitsType)) {
|
|
period_value = ParseLeadingDec32Value(
|
|
std::string(text[digits->codepoint_span.first].utf8_data(),
|
|
text[digits->codepoint_span.second].utf8_data() -
|
|
text[digits->codepoint_span.first].utf8_data())
|
|
.c_str());
|
|
}
|
|
|
|
if (period_value <= NO_VAL) {
|
|
return;
|
|
}
|
|
|
|
TimePeriodMatch* result = matcher->AllocateAndInitMatch<TimePeriodMatch>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
result->Reset();
|
|
result->nonterminal = nonterminal;
|
|
result->value = period_value;
|
|
matcher->AddMatch(result);
|
|
}
|
|
|
|
// Reconstructs a date from a relative date rule match.
|
|
void CheckRelativeDate(const DateAnnotationOptions& options,
|
|
const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
if (!options.enable_special_day_offset &&
|
|
grammar::SelectFirstOfType<TermValueMatch>(match, MatchType_TERM_VALUE) !=
|
|
nullptr) {
|
|
// Special day offsets, like "Today", "Tomorrow" etc. are not enabled.
|
|
return;
|
|
}
|
|
|
|
RelativeMatch* relative_match = matcher->AllocateAndInitMatch<RelativeMatch>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
relative_match->Reset();
|
|
relative_match->nonterminal = nonterminal;
|
|
|
|
// Fill relative date information from individual components.
|
|
grammar::Traverse(match, [match, relative_match](const grammar::Match* node) {
|
|
// Ignore the current match.
|
|
if (node == match || node->type == grammar::Match::kUnknownType) {
|
|
return true;
|
|
}
|
|
|
|
if (node->type == MatchType_TERM_VALUE) {
|
|
const int value =
|
|
static_cast<const TermValueMatch*>(node)->term_value->value();
|
|
relative_match->day = abs(value);
|
|
if (value >= 0) {
|
|
// Marks "today" as in the future.
|
|
relative_match->is_future_date = true;
|
|
}
|
|
relative_match->existing |=
|
|
(RelativeMatch::HAS_DAY | RelativeMatch::HAS_IS_FUTURE);
|
|
return false;
|
|
}
|
|
|
|
// Parse info from nonterminal.
|
|
const NonterminalValue* nonterminal =
|
|
static_cast<const NonterminalMatch*>(node)->nonterminal;
|
|
if (nonterminal != nullptr &&
|
|
nonterminal->relative_parameter() != nullptr) {
|
|
const RelativeParameter* relative_parameter =
|
|
nonterminal->relative_parameter();
|
|
if (relative_parameter->period() !=
|
|
RelativeParameter_::Period_PERIOD_UNKNOWN) {
|
|
relative_match->is_future_date =
|
|
(relative_parameter->period() ==
|
|
RelativeParameter_::Period_PERIOD_FUTURE);
|
|
relative_match->existing |= RelativeMatch::HAS_IS_FUTURE;
|
|
}
|
|
if (relative_parameter->day_of_week_interpretation() != nullptr) {
|
|
relative_match->day_of_week_nonterminal = nonterminal;
|
|
relative_match->existing |= RelativeMatch::HAS_DAY_OF_WEEK;
|
|
}
|
|
}
|
|
|
|
// Relative day of week.
|
|
if (node->type == MatchType_DAY_OF_WEEK) {
|
|
relative_match->day_of_week =
|
|
static_cast<const DayOfWeekMatch*>(node)->value;
|
|
return false;
|
|
}
|
|
|
|
if (node->type != MatchType_TIME_PERIOD) {
|
|
return true;
|
|
}
|
|
|
|
const TimePeriodMatch* period = static_cast<const TimePeriodMatch*>(node);
|
|
switch (nonterminal->relative_parameter()->type()) {
|
|
case RelativeParameter_::RelativeType_YEAR: {
|
|
relative_match->year = period->value;
|
|
relative_match->existing |= RelativeMatch::HAS_YEAR;
|
|
break;
|
|
}
|
|
case RelativeParameter_::RelativeType_MONTH: {
|
|
relative_match->month = period->value;
|
|
relative_match->existing |= RelativeMatch::HAS_MONTH;
|
|
break;
|
|
}
|
|
case RelativeParameter_::RelativeType_WEEK: {
|
|
relative_match->week = period->value;
|
|
relative_match->existing |= RelativeMatch::HAS_WEEK;
|
|
break;
|
|
}
|
|
case RelativeParameter_::RelativeType_DAY: {
|
|
relative_match->day = period->value;
|
|
relative_match->existing |= RelativeMatch::HAS_DAY;
|
|
break;
|
|
}
|
|
case RelativeParameter_::RelativeType_HOUR: {
|
|
relative_match->hour = period->value;
|
|
relative_match->existing |= RelativeMatch::HAS_HOUR;
|
|
break;
|
|
}
|
|
case RelativeParameter_::RelativeType_MINUTE: {
|
|
relative_match->minute = period->value;
|
|
relative_match->existing |= RelativeMatch::HAS_MINUTE;
|
|
break;
|
|
}
|
|
case RelativeParameter_::RelativeType_SECOND: {
|
|
relative_match->second = period->value;
|
|
relative_match->existing |= RelativeMatch::HAS_SECOND;
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return true;
|
|
});
|
|
matcher->AddMatch(relative_match);
|
|
}
|
|
|
|
// Returns true iff `time_zone_offset` lies within [-720, 840] and is a
// multiple of 15. The caller in this file computes the offset as
// hours * 60 + minutes, i.e. the bounds correspond to -12:00 and +14:00.
bool IsValidTimeZoneOffset(const int time_zone_offset) {
  constexpr int kMinOffset = -720;
  constexpr int kMaxOffset = 840;
  constexpr int kOffsetGranularity = 15;
  if (time_zone_offset < kMinOffset || time_zone_offset > kMaxOffset) {
    return false;
  }
  return time_zone_offset % kOffsetGranularity == 0;
}
|
|
|
|
// Parses, validates and adds a time zone offset match.
|
|
void CheckTimeZoneOffset(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
MatchComponents components(
|
|
match, {MatchType_DIGITS, MatchType_TERM_VALUE, MatchType_NONTERMINAL});
|
|
const TermValueMatch* tz_sign =
|
|
components.SubmatchOf<TermValueMatch>(MatchType_TERM_VALUE);
|
|
if (tz_sign == nullptr) {
|
|
return;
|
|
}
|
|
const int sign = tz_sign->term_value->value();
|
|
TC3_CHECK(sign == -1 || sign == 1);
|
|
|
|
const int tz_digits_index = components.IndexOf(MatchType_DIGITS);
|
|
if (tz_digits_index < 0) {
|
|
return;
|
|
}
|
|
const DigitsMatch* tz_digits =
|
|
components.SubmatchAt<DigitsMatch>(tz_digits_index);
|
|
if (tz_digits == nullptr) {
|
|
return;
|
|
}
|
|
|
|
int offset;
|
|
if (tz_digits->count_of_digits >= 3) {
|
|
offset = (tz_digits->value / 100) * 60 + (tz_digits->value % 100);
|
|
} else {
|
|
offset = tz_digits->value * 60;
|
|
if (const DigitsMatch* tz_digits_extra = components.SubmatchOf<DigitsMatch>(
|
|
MatchType_DIGITS, /*start_index=*/tz_digits_index + 1)) {
|
|
offset += tz_digits_extra->value;
|
|
}
|
|
}
|
|
|
|
const NonterminalMatch* tz_offset =
|
|
components.SubmatchOf<NonterminalMatch>(MatchType_NONTERMINAL);
|
|
if (tz_offset == nullptr) {
|
|
return;
|
|
}
|
|
|
|
const int time_zone_offset = sign * offset;
|
|
if (!IsValidTimeZoneOffset(time_zone_offset)) {
|
|
return;
|
|
}
|
|
|
|
TimeZoneOffsetMatch* result =
|
|
matcher->AllocateAndInitMatch<TimeZoneOffsetMatch>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
result->Reset();
|
|
result->nonterminal = nonterminal;
|
|
result->time_zone_offset_param =
|
|
tz_offset->nonterminal->time_zone_offset_parameter();
|
|
result->time_zone_offset = time_zone_offset;
|
|
matcher->AddMatch(result);
|
|
}
|
|
|
|
// Validates and adds a time zone name match.
|
|
void CheckTimeZoneName(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
TC3_CHECK(match->IsUnaryRule());
|
|
const TermValueMatch* tz_name =
|
|
static_cast<const TermValueMatch*>(match->unary_rule_rhs());
|
|
if (tz_name == nullptr) {
|
|
return;
|
|
}
|
|
const TimeZoneNameSpec* tz_name_spec =
|
|
tz_name->term_value->time_zone_name_spec();
|
|
TimeZoneNameMatch* result = matcher->AllocateAndInitMatch<TimeZoneNameMatch>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
result->Reset();
|
|
result->nonterminal = nonterminal;
|
|
result->time_zone_name_spec = tz_name_spec;
|
|
result->time_zone_code = tz_name_spec->code();
|
|
matcher->AddMatch(result);
|
|
}
|
|
|
|
// Adds a mapped term value match containing its value.
|
|
void AddTermValue(const grammar::Match* match, const TermValue* term_value,
|
|
grammar::Matcher* matcher) {
|
|
TermValueMatch* term_match = matcher->AllocateAndInitMatch<TermValueMatch>(
|
|
match->lhs, match->codepoint_span, match->match_offset);
|
|
term_match->Reset();
|
|
term_match->term_value = term_value;
|
|
matcher->AddMatch(term_match);
|
|
}
|
|
|
|
// Adds a match for a nonterminal.
|
|
void AddNonterminal(const grammar::Match* match,
|
|
const NonterminalValue* nonterminal,
|
|
grammar::Matcher* matcher) {
|
|
NonterminalMatch* result =
|
|
matcher->AllocateAndInitMatch<NonterminalMatch>(*match);
|
|
result->Reset();
|
|
result->nonterminal = nonterminal;
|
|
matcher->AddMatch(result);
|
|
}
|
|
|
|
// Adds a match for an extraction rule that is potentially used in a date range
|
|
// rule.
|
|
void AddExtractionRuleMatch(const grammar::Match* match,
|
|
const ExtractionRuleParameter* rule,
|
|
grammar::Matcher* matcher) {
|
|
ExtractionMatch* result =
|
|
matcher->AllocateAndInitMatch<ExtractionMatch>(*match);
|
|
result->Reset();
|
|
result->extraction_rule = rule;
|
|
matcher->AddMatch(result);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
void DateExtractor::HandleExtractionRuleMatch(
|
|
const ExtractionRuleParameter* rule, const grammar::Match* match,
|
|
grammar::Matcher* matcher) {
|
|
if (rule->id() != nullptr) {
|
|
const std::string rule_id = rule->id()->str();
|
|
bool keep = false;
|
|
for (const std::string& extra_requested_dates_id :
|
|
options_.extra_requested_dates) {
|
|
if (extra_requested_dates_id == rule_id) {
|
|
keep = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!keep) {
|
|
return;
|
|
}
|
|
}
|
|
output_.push_back(
|
|
Output{rule, matcher->AllocateAndInitMatch<grammar::Match>(*match)});
|
|
}
|
|
|
|
// Records a date range match in `range_output_`. The range must consist of
// exactly two typed sub-matches, which become its `from` and `to` parts.
void DateExtractor::HandleRangeExtractionRuleMatch(const grammar::Match* match,
                                                   grammar::Matcher* matcher) {
  // Collect the two datetime roots that make up the range.
  std::vector<const grammar::Match*> range_parts;
  grammar::Traverse(match, [match, &range_parts](const grammar::Match* node) {
    // The range match itself and untyped nodes are only passed through.
    const bool keep_descending =
        (node == match || node->type == grammar::Match::kUnknownType);
    if (!keep_descending) {
      // Collect, but don't expand the individual datetime nodes.
      range_parts.push_back(node);
    }
    return keep_descending;
  });
  TC3_CHECK_EQ(range_parts.size(), 2);
  range_output_.push_back(
      RangeOutput{matcher->AllocateAndInitMatch<grammar::Match>(*match),
                  /*from=*/range_parts[0], /*to=*/range_parts[1]});
}
|
|
|
|
// Callback for a grammar match. `type` selects the handling and `value` is an
// index into the matching table of `datetime_rules_`: the extraction rules for
// datetime (range) rule callbacks, the term values for mapped terms, and the
// nonterminal values for everything else.
void DateExtractor::MatchFound(const grammar::Match* match,
                               const grammar::CallbackId type,
                               const int64 value, grammar::Matcher* matcher) {
  // Callbacks that do not refer to a nonterminal value are handled first.
  if (type == MatchType_DATETIME_RULE) {
    HandleExtractionRuleMatch(
        /*rule=*/datetime_rules_->extraction_rule()->Get(value), match,
        matcher);
    return;
  }
  if (type == MatchType_DATETIME_RANGE_RULE) {
    HandleRangeExtractionRuleMatch(match, matcher);
    return;
  }
  if (type == MatchType_DATETIME) {
    // If an extraction rule is also part of a range extraction rule, then the
    // extraction rule is treated as a rule match and nonterminal match.
    // This type is used to match the rule as non terminal.
    AddExtractionRuleMatch(
        match, datetime_rules_->extraction_rule()->Get(value), matcher);
    return;
  }
  if (type == MatchType_TERM_VALUE) {
    // Handle mapped terms.
    AddTermValue(match, datetime_rules_->term_value()->Get(value), matcher);
    return;
  }

  // Handle non-terminals.
  const NonterminalValue* nonterminal =
      datetime_rules_->nonterminal_value()->Get(value);
  // The UTF-8 bytes covered by the match's codepoint span.
  const char* text_begin = text_[match->codepoint_span.first].utf8_data();
  const char* text_end = text_[match->codepoint_span.second].utf8_data();
  const StringPiece match_text(text_begin, text_end - text_begin);

  switch (type) {
    case MatchType_NONTERMINAL:
      AddNonterminal(match, nonterminal, matcher);
      break;
    case MatchType_DIGITS:
      CheckDigits<DigitsMatch>(match, nonterminal, match_text, matcher);
      break;
    case MatchType_YEAR:
      CheckDigits<YearMatch>(match, nonterminal, match_text, matcher);
      break;
    case MatchType_MONTH:
      CheckAndAddNumericValue<MonthMatch>(match, nonterminal, match_text,
                                          matcher);
      break;
    case MatchType_DAY:
      CheckAndAddNumericValue<DayMatch>(match, nonterminal, match_text,
                                        matcher);
      break;
    case MatchType_DAY_OF_WEEK:
      CheckAndAddDirectOrMappedValue<DayOfWeekMatch>(match, nonterminal,
                                                     matcher);
      break;
    case MatchType_HOUR:
      CheckAndAddNumericValue<HourMatch>(match, nonterminal, match_text,
                                         matcher);
      break;
    case MatchType_MINUTE:
      CheckAndAddNumericValue<MinuteMatch>(match, nonterminal, match_text,
                                           matcher);
      break;
    case MatchType_SECOND:
      CheckAndAddNumericValue<SecondMatch>(match, nonterminal, match_text,
                                           matcher);
      break;
    case MatchType_FRACTION_SECOND:
      CheckDigitsAsFraction<FractionSecondMatch>(match, nonterminal,
                                                 match_text, matcher);
      break;
    case MatchType_TIME_VALUE:
      CheckTimeValue(text_, match, nonterminal, matcher);
      break;
    case MatchType_TIME_SPAN:
      CheckTimeSpan(match, nonterminal, matcher);
      break;
    case MatchType_TIME_ZONE_NAME:
      CheckTimeZoneName(match, nonterminal, matcher);
      break;
    case MatchType_TIME_ZONE_OFFSET:
      CheckTimeZoneOffset(match, nonterminal, matcher);
      break;
    case MatchType_TIME_PERIOD:
      CheckTimePeriod(text_, match, nonterminal, matcher);
      break;
    case MatchType_RELATIVE_DATE:
      CheckRelativeDate(options_, match, nonterminal, matcher);
      break;
    case MatchType_COMBINED_DIGITS:
      CheckCombinedDigits<CombinedDigitsMatch>(match, nonterminal, match_text,
                                               matcher);
      break;
    default:
      TC3_VLOG(ERROR) << "Unhandled match type: " << type;
      break;
  }
}
|
|
|
|
} // namespace libtextclassifier3::dates
|