Android11/external/libtextclassifier/native/annotator/grammar/dates/parser.cc

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "annotator/grammar/dates/parser.h"

#include "annotator/grammar/dates/extractor.h"
#include "annotator/grammar/dates/utils/date-match.h"
#include "annotator/grammar/dates/utils/date-utils.h"
#include "utils/base/integral_types.h"
#include "utils/base/logging.h"
#include "utils/base/macros.h"
#include "utils/grammar/lexer.h"
#include "utils/grammar/matcher.h"
#include "utils/grammar/rules_generated.h"
#include "utils/grammar/types.h"
#include "utils/strings/split.h"
#include "utils/strings/stringpiece.h"

namespace libtextclassifier3::dates {
namespace {

// Helper methods to validate individual components from a date match.

// Verifies one validation requirement of `rule` against `match`.
// If `rule` is null or does not request `validation`, there is nothing to
// verify and the match is accepted. Otherwise the nonterminal that produced
// `match` must carry `flag`; e.g. a rule asking for `SPELLED_MONTH` only
// accepts a match whose nonterminal is flagged `IS_SPELLED`.
bool CheckMatchValidationAndFlag(
    const grammar::Match* match, const ExtractionRuleParameter* rule,
    const ExtractionRuleParameter_::ExtractionValidation validation,
    const NonterminalParameter_::Flag flag) {
  const bool validation_requested =
      rule != nullptr && (rule->validation() & validation) != 0;
  if (!validation_requested) {
    return true;
  }

  // `match` is treated as a nonterminal match from here on.
  const auto* nonterminal_match = static_cast<const NonterminalMatch*>(match);
  const NonterminalParameter* parameter =
      nonterminal_match->nonterminal->nonterminal_parameter();
  if (parameter == nullptr) {
    // A requirement exists but the nonterminal carries no parameters at all.
    return false;
  }
  return (parameter->flag() & flag) != 0;
}

// Fills `date` from the parse tree rooted at `match`.
//
// The tree is traversed and every date/time component node that is found is
// recorded in `date`, both as a pointer to the node and as the extracted
// value(s). Year, month and day nodes are additionally checked against the
// "spelled" validation requirements of `rule` (a null `rule` imposes none).
//
// Returns false if any year/month/day component failed that check. Span and
// priority information is only written to `date` on success; component fields
// may already have been modified when false is returned.
bool GenerateDate(const ExtractionRuleParameter* rule,
                  const grammar::Match* match, DateMatch* date) {
  bool components_ok = true;

  // The callback returns true to expand `node` further and false to stop at
  // `node`.
  grammar::Traverse(
      match, [rule, date, &components_ok](const grammar::Match* node) {
        switch (node->type) {
          case MatchType_YEAR: {
            if (!CheckMatchValidationAndFlag(
                    node, rule,
                    ExtractionRuleParameter_::ExtractionValidation_SPELLED_YEAR,
                    NonterminalParameter_::Flag_IS_SPELLED)) {
              components_ok = false;
              return false;
            }
            const auto* year = static_cast<const YearMatch*>(node);
            date->year_match = year;
            date->year = year->value;
            return false;
          }
          case MatchType_MONTH: {
            if (!CheckMatchValidationAndFlag(
                    node, rule,
                    ExtractionRuleParameter_::
                        ExtractionValidation_SPELLED_MONTH,
                    NonterminalParameter_::Flag_IS_SPELLED)) {
              components_ok = false;
              return false;
            }
            const auto* month = static_cast<const MonthMatch*>(node);
            date->month_match = month;
            date->month = month->value;
            return false;
          }
          case MatchType_DAY: {
            if (!CheckMatchValidationAndFlag(
                    node, rule,
                    ExtractionRuleParameter_::ExtractionValidation_SPELLED_DAY,
                    NonterminalParameter_::Flag_IS_SPELLED)) {
              components_ok = false;
              return false;
            }
            const auto* day = static_cast<const DayMatch*>(node);
            date->day_match = day;
            date->day = day->value;
            return false;
          }
          case MatchType_DAY_OF_WEEK: {
            const auto* day_of_week = static_cast<const DayOfWeekMatch*>(node);
            date->day_of_week_match = day_of_week;
            date->day_of_week = static_cast<DayOfWeek>(day_of_week->value);
            return false;
          }
          case MatchType_TIME_VALUE: {
            const auto* time_value = static_cast<const TimeValueMatch*>(node);
            date->time_value_match = time_value;
            date->hour = time_value->hour;
            date->minute = time_value->minute;
            date->second = time_value->second;
            date->fraction_second = time_value->fraction_second;
            return false;
          }
          case MatchType_TIME_SPAN: {
            const auto* time_span = static_cast<const TimeSpanMatch*>(node);
            date->time_span_match = time_span;
            date->time_span_code = time_span->time_span_code;
            return false;
          }
          case MatchType_TIME_ZONE_NAME: {
            const auto* zone_name = static_cast<const TimeZoneNameMatch*>(node);
            date->time_zone_name_match = zone_name;
            date->time_zone_code = zone_name->time_zone_code;
            return false;
          }
          case MatchType_TIME_ZONE_OFFSET: {
            const auto* zone_offset =
                static_cast<const TimeZoneOffsetMatch*>(node);
            date->time_zone_offset_match = zone_offset;
            date->time_zone_offset = zone_offset->time_zone_offset;
            return false;
          }
          case MatchType_RELATIVE_DATE: {
            date->relative_match = static_cast<const RelativeMatch*>(node);
            return false;
          }
          case MatchType_COMBINED_DIGITS: {
            // A combined-digits node may supply any subset of the fields
            // below; only the ones it reports are copied.
            const auto* digits = static_cast<const CombinedDigitsMatch*>(node);
            date->combined_digits_match = digits;
            if (digits->HasYear()) {
              date->year = digits->GetYear();
            }
            if (digits->HasMonth()) {
              date->month = digits->GetMonth();
            }
            if (digits->HasDay()) {
              date->day = digits->GetDay();
            }
            if (digits->HasHour()) {
              date->hour = digits->GetHour();
            }
            if (digits->HasMinute()) {
              date->minute = digits->GetMinute();
            }
            if (digits->HasSecond()) {
              date->second = digits->GetSecond();
            }
            return false;
          }
          default:
            // Not a component node: expand it further.
            return true;
        }
      });

  if (!components_ok) {
    return false;
  }

  date->begin = match->codepoint_span.first;
  date->end = match->codepoint_span.second;
  // Without a rule the priorities fall back to zero.
  date->priority = rule ? rule->priority_delta() : 0;
  date->annotator_priority_score =
      rule ? rule->annotator_priority_score() : 0.0;
  return true;
}

// Generates one endpoint (`from` or `to`) of a date range from `match`.
// Only a `DATETIME` match supplies an extraction rule; any other match type
// is generated without a rule, i.e. without validation requirements.
bool GenerateFromOrToDateRange(const grammar::Match* match, DateMatch* date) {
  const ExtractionRuleParameter* endpoint_rule = nullptr;
  if (match->type == MatchType_DATETIME) {
    endpoint_rule = static_cast<const ExtractionMatch*>(match)->extraction_rule;
  }
  return GenerateDate(endpoint_rule, match, date);
}

// Builds `date_range` from a range match.
// `from` and `to` are the matches of the two endpoints and `match` is the
// match covering the whole range, whose codepoint span becomes the span of
// `date_range`. Returns false (after logging a warning) if either endpoint
// cannot be generated; `to` is not attempted when `from` fails.
bool GenerateDateRange(const grammar::Match* match, const grammar::Match* from,
                       const grammar::Match* to, DateRangeMatch* date_range) {
  const bool from_generated =
      GenerateFromOrToDateRange(from, &date_range->from);
  if (!from_generated) {
    TC3_LOG(WARNING) << "Failed to generate date for `from`.";
    return false;
  }

  const bool to_generated = GenerateFromOrToDateRange(to, &date_range->to);
  if (!to_generated) {
    TC3_LOG(WARNING) << "Failed to generate date for `to`.";
    return false;
  }

  const auto& whole_span = match->codepoint_span;
  date_range->begin = whole_span.first;
  date_range->end = whole_span.second;
  return true;
}

// Normalizes the hour of `date` according to its time-span match.
// A date without a time-span match needs no normalization and trivially
// succeeds; otherwise the result of `NormalizeHourByTimeSpan` is returned.
bool NormalizeHour(DateMatch* date) {
  const auto* time_span = date->time_span_match;
  return time_span == nullptr ||
         NormalizeHourByTimeSpan(time_span->time_span_spec, date);
}

// Marks the hour of `date` as ambiguous (am/pm ambiguity is the default)
// when all of the following hold:
//   - the date has an hour but no time-span code,
//   - the hour lies in [1, 12],
//   - the hour was not matched with a zero prefix.
void CheckAndSetAmbiguousHour(DateMatch* date) {
  if (!date->HasHour()) {
    return;
  }
  if (date->HasTimeSpanCode()) {
    return;
  }
  if (date->hour < 1 || date->hour > 12) {
    return;
  }

  const auto* time_value = date->time_value_match;
  const bool hour_is_zero_prefixed =
      time_value != nullptr && time_value->hour_match != nullptr &&
      time_value->hour_match->is_zero_prefixed;
  if (hour_is_zero_prefixed) {
    return;
  }

  date->SetAmbiguousHourProperties(2, 12);
}

// Normalizes a date candidate: normalizes its hour and flags am/pm
// ambiguity.
// Returns false only if the hour normalization fails. A candidate whose
// fields are ill-formed according to `IsValid()` is merely logged and still
// reported as normalized.
bool NormalizeDate(DateMatch* date) {
  const bool hour_normalized = NormalizeHour(date);
  if (!hour_normalized) {
    TC3_VLOG(ERROR) << "Hour normalization (according to time-span) failed."
                    << date->DebugString();
    return false;
  }

  CheckAndSetAmbiguousHour(date);

  const bool well_formed = date->IsValid();
  if (!well_formed) {
    TC3_VLOG(ERROR) << "Fields inside date instance are ill-formed "
                    << date->DebugString();
  }
  return true;
}

// Copies fields from one DateMatch into another that lacks them. For
// example, if `from` is "May 1, 8pm" and `to` is "9pm", the month "May" is
// copied into `to`. Only the fields needed for date ranges are handled so
// far: the time span and the month.
void CopyFieldsForDateMatch(const DateMatch& from, DateMatch* to) {
  const bool inherit_time_span =
      to->time_span_match == nullptr && from.time_span_match != nullptr;
  if (inherit_time_span) {
    to->time_span_match = from.time_span_match;
    to->time_span_code = from.time_span_code;
  }

  const bool inherit_month =
      to->month_match == nullptr && from.month_match != nullptr;
  if (inherit_month) {
    to->month_match = from.month_match;
    to->month = from.month;
  }
}

// Normalizes a date range candidate.
// The endpoints first share missing fields with each other (see
// `CopyFieldsForDateMatch`), then each endpoint is normalized as a date.
// Returns whether both endpoints were successfully normalized; `to` is not
// normalized when `from` fails.
bool NormalizeDateRange(DateRangeMatch* date_range) {
  DateMatch* const range_start = &date_range->from;
  DateMatch* const range_end = &date_range->to;

  CopyFieldsForDateMatch(*range_start, range_end);
  CopyFieldsForDateMatch(*range_end, range_start);

  if (!NormalizeDate(range_start)) {
    return false;
  }
  return NormalizeDate(range_end);
}

// Checks the plausibility of the fields of `date` under `rule`.
// Returns false for candidates that should be dropped:
//   - a time zone name combined with an offset, unless the name is a UTC
//     abbreviation,
//   - a standalone time zone offset in an hour-only format, or in an
//     hour/minute format when `rule` does not allow unconfident time zones,
//   - a year below 1000 without BC/AD, except a two-digit year in a date
//     that also has month and day,
//   - a year above 9999,
//   - a lone month 5 ("may") under a rule validating spelled months.
// `rule` is dereferenced without a null check.
bool CheckDate(const DateMatch& date, const ExtractionRuleParameter* rule) {
  // Due to inference between endpoints, `HasTimeZoneCode()` may be true while
  // `time_zone_name_match` is null, and `HasTimeZoneOffset()` may be true
  // while `time_zone_offset_match` is null. The match pointers are therefore
  // checked before they are used.
  if (date.HasTimeZoneOffset()) {
    if (date.HasTimeZoneCode()) {
      const auto* name_match = date.time_zone_name_match;
      if (name_match != nullptr) {
        const TimeZoneNameSpec* spec = name_match->time_zone_name_spec;
        TC3_CHECK(spec != nullptr);
        if (!spec->is_utc() || !spec->is_abbreviation()) {
          return false;
        }
      }
    } else if (date.time_zone_offset_match != nullptr) {
      const TimeZoneOffsetParameter* param =
          date.time_zone_offset_match->time_zone_offset_param;
      TC3_CHECK(param != nullptr);
      const auto format = param->format();

      const bool hours_only =
          format == TimeZoneOffsetParameter_::Format_FORMAT_H ||
          format == TimeZoneOffsetParameter_::Format_FORMAT_HH;
      if (hours_only) {
        return false;
      }

      const bool unconfident_allowed =
          (rule->validation() &
           ExtractionRuleParameter_::
               ExtractionValidation_ALLOW_UNCONFIDENT_TIME_ZONE) != 0;
      if (!unconfident_allowed) {
        const bool hours_and_minutes =
            format == TimeZoneOffsetParameter_::Format_FORMAT_H_MM ||
            format == TimeZoneOffsetParameter_::Format_FORMAT_HH_MM ||
            format == TimeZoneOffsetParameter_::Format_FORMAT_HMM;
        if (hours_and_minutes) {
          return false;
        }
      }
    }
  }

  // "1 April" could be extracted as year 1, month April; such small years
  // are rejected. A case like 11/5/01 (two-digit year in a full date) is
  // still allowed.
  if (!date.HasBcAd() && date.year_match != nullptr && date.year < 1000) {
    const bool two_digit_year_in_full_date =
        date.HasMonth() && date.HasDay() &&
        date.year_match->count_of_digits == 2;
    if (!two_digit_year_in_full_date) {
      return false;
    }
  }

  // Years beyond four digits are ignored.
  if (date.year_match != nullptr && date.year > 9999) {
    TC3_VLOG(ERROR) << "Year is greater than 9999.";
    return false;
  }

  // A spelled "may" is far more often the modal verb than the month, so a
  // month 5 without year and day is ignored.
  const bool validates_spelled_month =
      (rule->validation() &
       ExtractionRuleParameter_::ExtractionValidation_SPELLED_MONTH) != 0;
  if (validates_spelled_month && date.month == 5 && !date.HasYear() &&
      !date.HasDay()) {
    return false;
  }

  return true;
}

// Checks the characters surrounding the match of `output`.
// If the rule requests `LEFT_BOUND` validation, the codepoint right before
// the match must not be one of '/', '-' or ':'; if it requests `RIGHT_BOUND`
// validation, the same holds for the codepoint right after the match.
// `text` holds one iterator per codepoint plus the end position of the text.
bool CheckContext(const std::vector<UnicodeText::const_iterator>& text,
                  const DateExtractor::Output& output) {
  const uint32 validation = output.rule->validation();
  const bool check_left =
      (validation &
       ExtractionRuleParameter_::ExtractionValidation_LEFT_BOUND) != 0;
  const bool check_right =
      (validation &
       ExtractionRuleParameter_::ExtractionValidation_RIGHT_BOUND) != 0;

  // No requirements on the span boundaries: nothing to check.
  if (!check_left && !check_right) {
    return true;
  }

  // So far, the only check is that the adjacent character is not one of
  // these separators.
  const auto is_separator = [](const auto codepoint) {
    return codepoint == '/' || codepoint == '-' || codepoint == ':';
  };

  const int begin = output.match->codepoint_span.first;
  const int end = output.match->codepoint_span.second;

  if (check_left && begin > 0 && is_separator(*text[begin - 1])) {
    return false;
  }

  // The last valid codepoint is at text.size() - 2, because the end position
  // of the text was appended for easier span extraction.
  if (check_right && end < text.size() - 1 && is_separator(*text[end])) {
    return false;
  }

  return true;
}

// Validates a date match: its fields must pass `CheckDate` and its
// surrounding text must pass `CheckContext` (checked in that order).
// Returns true if the candidate is valid.
bool ValidateDate(const std::vector<UnicodeText::const_iterator>& text,
                  const DateExtractor::Output& output, const DateMatch& date) {
  return CheckDate(date, output.rule) && CheckContext(text, output);
}

// Builds matched date instances from the grammar output.
// An output only yields a date if it can be generated, normalized and
// validated, in that order; outputs failing any step are skipped.
std::vector<DateMatch> BuildDateMatches(
    const std::vector<UnicodeText::const_iterator>& text,
    const std::vector<DateExtractor::Output>& outputs) {
  std::vector<DateMatch> date_matches;
  for (const DateExtractor::Output& output : outputs) {
    DateMatch candidate;
    const bool keep = GenerateDate(output.rule, output.match, &candidate) &&
                      NormalizeDate(&candidate) &&
                      ValidateDate(text, output, candidate);
    if (keep) {
      date_matches.push_back(candidate);
    }
  }
  return date_matches;
}

// Builds matched date range instances from the grammar output.
// A range output only yields a date range if it can be generated and then
// normalized; outputs failing either step are skipped. `text` is not used.
std::vector<DateRangeMatch> BuildDateRangeMatches(
    const std::vector<UnicodeText::const_iterator>& text,
    const std::vector<DateExtractor::RangeOutput>& range_outputs) {
  std::vector<DateRangeMatch> date_ranges;
  for (const DateExtractor::RangeOutput& range_output : range_outputs) {
    DateRangeMatch candidate;
    const bool keep =
        GenerateDateRange(range_output.match, range_output.from,
                          range_output.to, &candidate) &&
        NormalizeDateRange(&candidate);
    if (keep) {
      date_ranges.push_back(candidate);
    }
  }
  return date_ranges;
}

// Compacts `matches` in place, dropping every element whose index is flagged
// in `removed`. The relative order of the surviving elements is preserved.
// `removed` is indexed with every index of `matches`.
template <typename T>
void RemoveDeletedMatches(const std::vector<bool>& removed,
                          std::vector<T>* matches) {
  std::vector<T>& items = *matches;
  const int count = items.size();

  // `write_pos` is the slot for the next surviving element.
  int write_pos = 0;
  for (int read_pos = 0; read_pos < count; ++read_pos) {
    if (!removed[read_pos]) {
      if (write_pos != read_pos) {
        items[write_pos] = items[read_pos];
      }
      ++write_pos;
    }
  }
  items.resize(write_pos);
}

// Removes duplicated date or date range instances from `matches`, in place.
// `T` must expose `begin`, `end` and `GetPriority()`, and `IsRefinement` must
// be callable on two `T`s. Because a single vector of one type is processed,
// overlaps between dates and date ranges are not considered here.
//
// For every pair of overlapping matches (candidate, next), with `next` later
// in the vector:
//   - same span: the one with the lower priority is removed (`next` on a tie),
//   - `next` ends within `candidate`: `next` is removed,
//   - otherwise whichever of the two `IsRefinement` identifies is removed (see
//     the call sites below); if neither call returns true, both are kept.
template <typename T>
void RemoveDuplicatedDates(std::vector<T>* matches) {
  // Assumption: matches are sorted by position. The caller in this file sorts
  // by increasing begin and, for equal begin, by decreasing end. The early
  // `break` on the first non-overlapping match relies on the ordering by
  // begin.
  std::vector<bool> removed(matches->size(), false);
  for (int i = 0; i < matches->size(); i++) {
    // A match that was already discarded cannot discard others.
    if (removed[i]) {
      continue;
    }
    const T& candidate = matches->at(i);
    for (int j = i + 1; j < matches->size(); j++) {
      if (removed[j]) {
        continue;
      }
      const T& next = matches->at(j);

      // Not overlapping. No later match is examined for this candidate.
      if (next.begin >= candidate.end) {
        break;
      }

      // If matching the same span of text, then check the priority.
      if (candidate.begin == next.begin && candidate.end == next.end) {
        if (candidate.GetPriority() < next.GetPriority()) {
          // The candidate loses; stop comparing it against further matches.
          removed[i] = true;
          break;
        } else {
          // Equal priority keeps the earlier match (the candidate).
          removed[j] = true;
          continue;
        }
      }

      // Checks if `next` is fully covered by fields of `candidate`.
      if (next.end <= candidate.end) {
        removed[j] = true;
        continue;
      }

      // Checks whether `candidate`/`next` is a refinement.
      // When `IsRefinement(a, b)` holds, `b` is the one that gets removed.
      if (IsRefinement(candidate, next)) {
        removed[j] = true;
        continue;
      } else if (IsRefinement(next, candidate)) {
        removed[i] = true;
        break;
      }
    }
  }
  // Physically drop everything that was flagged above.
  RemoveDeletedMatches(removed, matches);
}

// Filters out simple, overtriggering matches.
// Only matches spanning exactly three codepoints are considered. Their
// lowercased text is blacklisted if it is
//   - "sun" or "mon" for a standalone relative day of the week, or
//   - "mar" for a match that has a month.
bool IsBlacklistedDate(const UniLib& unilib,
                       const std::vector<UnicodeText::const_iterator>& text,
                       const DateMatch& match) {
  if (match.end - match.begin != 3) {
    return false;
  }

  // Lowercase the text covered by the match without copying the input.
  const auto span_start = text[match.begin].utf8_data();
  const auto span_end = text[match.end].utf8_data();
  std::string lowercased =
      unilib
          .ToLowerText(UTF8ToUnicodeText(span_start, span_end - span_start,
                                         /*do_copy=*/false))
          .ToUTF8String();

  // "sun" and "mon" are not good abbreviations for a standalone day of the
  // week.
  if (match.IsStandaloneRelativeDayOfWeek() &&
      (lowercased == "sun" || lowercased == "mon")) {
    return true;
  }

  // "mar" is not a good abbreviation for a single month.
  return match.HasMonth() && lowercased == "mar";
}

// Checks if two date matches are adjacent and mergeable.
// `next` must start strictly after the end of `prev`. The text between the
// two, lowercased and stripped of whitespace, decides the outcome:
//   - empty: the matches are adjacent and reported as mergeable,
//   - equal to one of `ignored_spans`: the result of `IsDateMatchMergeable`,
//   - anything else: not mergeable.
bool AreDateMatchesAdjacentAndMergeable(
    const UniLib& unilib, const std::vector<UnicodeText::const_iterator>& text,
    const std::vector<std::string>& ignored_spans, const DateMatch& prev,
    const DateMatch& next) {
  if (prev.end >= next.begin) {
    // The two matches are not adjacent.
    return false;
  }

  // Collect the non-whitespace context between the matches, lowercased.
  UnicodeText gap;
  for (int position = prev.end; position < next.begin; ++position) {
    const char32 codepoint = *text[position];
    if (!unilib.IsWhitespace(codepoint)) {
      gap.push_back(unilib.ToLower(codepoint));
    }
  }
  if (gap.empty()) {
    return true;
  }

  const std::string gap_text = gap.ToUTF8String();
  for (const std::string& ignored_span : ignored_spans) {
    if (gap_text == ignored_span) {
      return IsDateMatchMergeable(prev, next);
    }
  }
  return false;
}

// Merges adjacent date and date range.
// For e.g. Monday, 5-10pm, the date "Monday" and the time range "5-10pm" will
// be merged.
//
// `dates` is only read; the merged information is written into the entries of
// `date_ranges` (their `from`/`to` endpoints and their `begin`/`end` span).
// Only ranges whose `from` endpoint has an hour are considered. `text` and
// `ignored_spans` are forwarded to `AreDateMatchesAdjacentAndMergeable` to
// decide whether a date and a range endpoint may be merged.
void MergeDateRangeAndDate(const UniLib& unilib,
                           const std::vector<UnicodeText::const_iterator>& text,
                           const std::vector<std::string>& ignored_spans,
                           const std::vector<DateMatch>& dates,
                           std::vector<DateRangeMatch>* date_ranges) {
  // For each range, check the date before or after it to see if they could
  // be merged. Both the range and date array are sorted, so we only need to
  // scan the date array once.
  // `next_date` is shared by all ranges and never moves backwards.
  int next_date = 0;
  for (int i = 0; i < date_ranges->size(); i++) {
    DateRangeMatch* date_range = &date_ranges->at(i);
    // So far we only merge time range with a date.
    if (!date_range->from.HasHour()) {
      continue;
    }

    for (; next_date < dates.size(); next_date++) {
      const DateMatch& date = dates[next_date];

      // If the range is before the date, we check whether `date_range->to` can
      // be merged with the date.
      if (date_range->end <= date.begin) {
        // Work on a copy so that `dates` stays untouched.
        DateMatch merged_date = date;
        if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans,
                                               date_range->to, date)) {
          // The merged copy becomes the new `to` endpoint and the range end
          // follows it. `from` is merged with the date as well, but with
          // `update_span` disabled.
          MergeDateMatch(date_range->to, &merged_date, /*update_span=*/true);
          date_range->to = merged_date;
          date_range->end = date_range->to.end;
          MergeDateMatch(date, &date_range->from, /*update_span=*/false);
          next_date++;

          // Check the second date after the range to see if it could be merged
          // further. For example: 10-11pm, Monday, May 15. 10-11pm is merged
          // with Monday and then we check that it could be merged with May 15
          // as well.
          if (next_date < dates.size()) {
            DateMatch next_match = dates[next_date];
            if (AreDateMatchesAdjacentAndMergeable(
                    unilib, text, ignored_spans, date_range->to, next_match)) {
              MergeDateMatch(date_range->to, &next_match, /*update_span=*/true);
              date_range->to = next_match;
              date_range->end = date_range->to.end;
              MergeDateMatch(dates[next_date], &date_range->from,
                             /*update_span=*/false);
              next_date++;
            }
          }
        }
        // Since the range is before the date, we try to check if the next range
        // could be merged with the current date.
        break;
      } else if (date_range->end > date.end && date_range->begin > date.begin) {
        // If the range is after the date, we check if `date_range.from` can be
        // merged with the date. Here is a special case, the date before range
        // could be partially overlapped. This is because the range.from could
        // be extracted as year in date. For example: March 3, 10-11pm is
        // extracted as date March 3, 2010 and the range 10-11pm. In this
        // case, we simply clear the year from date.
        DateMatch merged_date = date;
        if (date.HasYear() &&
            date.year_match->codepoint_span.second > date_range->begin) {
          // The year reaches into the range: drop it from the copy and cut the
          // copy's end back to the year's `match_offset`.
          merged_date.year_match = nullptr;
          merged_date.year = NO_VAL;
          merged_date.end = date.year_match->match_offset;
        }
        // Check and merge the range and the date before the range.
        if (AreDateMatchesAdjacentAndMergeable(unilib, text, ignored_spans,
                                               merged_date, date_range->from)) {
          // `from` takes over the date and the range begin follows it. `to`
          // is merged with the date as well, but with `update_span` disabled.
          MergeDateMatch(merged_date, &date_range->from, /*update_span=*/true);
          date_range->begin = date_range->from.begin;
          MergeDateMatch(merged_date, &date_range->to, /*update_span=*/false);

          // Check if the second date before the range can be merged as well.
          if (next_date > 0) {
            DateMatch prev_match = dates[next_date - 1];
            if (prev_match.end <= date_range->from.begin) {
              if (AreDateMatchesAdjacentAndMergeable(unilib, text,
                                                     ignored_spans, prev_match,
                                                     date_range->from)) {
                MergeDateMatch(prev_match, &date_range->from,
                               /*update_span=*/true);
                date_range->begin = date_range->from.begin;
                MergeDateMatch(prev_match, &date_range->to,
                               /*update_span=*/false);
              }
            }
          }
          // The date has been consumed by this range; continue with the next
          // range and the next date.
          next_date++;
          break;
        } else {
          // Since the date is before the date range, we move to the next date
          // to check if it could be merged with the current range.
          continue;
        }
      } else {
        // The date is either fully overlapped by the date range or the date
        // span end is after the date range. Move to the next date in both
        // cases.
      }
    }
  }
}

// Removes the dates which are part of a range. e.g. in "May 1 - 3", the date
// "May 1" is fully contained in the range.
void RemoveOverlappedDateByRange(const std::vector<DateRangeMatch>& ranges,
                                 std::vector<DateMatch>* dates) {
  int next_date = 0;
  std::vector<bool> removed(dates->size(), false);
  for (int i = 0; i < ranges.size(); ++i) {
    const auto& range = ranges[i];
    for (; next_date < dates->size(); ++next_date) {
      const auto& date = dates->at(next_date);
      // So far we don't touch the partially overlapped case.
      if (date.begin >= range.begin && date.end <= range.end) {
        // Fully contained.
        removed[next_date] = true;
      } else if (date.end <= range.begin) {
        continue;  // date is behind range, go to next date
      } else if (date.begin >= range.end) {
        break;  // range is behind date, go to next range
      }
    }
  }
  RemoveDeletedMatches(removed, dates);
}

// Converts the candidate dates in `date_matches` into
// `DatetimeParseResultSpan`s and appends them to
// `datetime_parse_result_spans`.
//
// Consecutive candidates are visited in order. If
// `options.merge_adjacent_components` is set and a candidate is adjacent to
// and mergeable with its successor, it is merged into the successor
// (modifying `date_matches` in place) and not emitted on its own. Otherwise
// the candidate is emitted unless it is blacklisted. The last candidate, which
// by then carries everything merged into it, is emitted under the same rule.
//
// An empty `date_matches` produces no output.
void FillDateInstances(
    const UniLib& unilib, const std::vector<UnicodeText::const_iterator>& text,
    const DateAnnotationOptions& options, std::vector<DateMatch>* date_matches,
    std::vector<DatetimeParseResultSpan>* datetime_parse_result_spans) {
  // Without this guard the final emit below would access a non-existent
  // element when called with an empty vector.
  if (date_matches->empty()) {
    return;
  }

  // Appends the span for `candidate` unless the candidate is blacklisted.
  const auto emit_unless_blacklisted = [&](const DateMatch& candidate) {
    if (IsBlacklistedDate(unilib, text, candidate)) {
      return;
    }
    DatetimeParseResultSpan datetime_parse_result_span;
    FillDateInstance(candidate, &datetime_parse_result_span);
    datetime_parse_result_spans->push_back(datetime_parse_result_span);
  };

  for (size_t j = 1; j < date_matches->size(); ++j) {
    DateMatch& previous = (*date_matches)[j - 1];
    DateMatch& current = (*date_matches)[j];
    if (options.merge_adjacent_components &&
        AreDateMatchesAdjacentAndMergeable(unilib, text, options.ignored_spans,
                                           previous, current)) {
      // `previous` lives on inside `current`; it is not emitted separately.
      MergeDateMatch(previous, &current, true);
    } else {
      emit_unless_blacklisted(previous);
    }
  }
  emit_unless_blacklisted(date_matches->back());
}

void FillDateRangeInstances(
    const std::vector<DateRangeMatch>& date_range_matches,
    std::vector<DatetimeParseResultSpan>* datetime_parse_result_spans) {
  for (const DateRangeMatch& date_range_match : date_range_matches) {
    DatetimeParseResultSpan datetime_parse_result_span;
    FillDateRangeInstance(date_range_match, &datetime_parse_result_span);
    datetime_parse_result_spans->push_back(datetime_parse_result_span);
  }
}

// Fills `DatetimeParseResultSpan`s from the `DateMatch` and `DateRangeMatch`
// instances produced by `extractor`.
//
// Dates are built, sorted and de-duplicated. If `options.enable_date_range`
// is set, date ranges go through the same steps, are merged with adjacent
// dates, and dates fully contained in a range are dropped; range spans are
// emitted before date spans.
std::vector<DatetimeParseResultSpan> GetOutputAsAnnotationList(
    const UniLib& unilib, const DateExtractor& extractor,
    const std::vector<UnicodeText::const_iterator>& text,
    const DateAnnotationOptions& options) {
  // Order by increasing begin, and decreasing end (decreasing length).
  const auto by_begin_then_longest = [](const auto& lhs, const auto& rhs) {
    if (lhs.begin != rhs.begin) {
      return lhs.begin < rhs.begin;
    }
    return lhs.end > rhs.end;
  };

  std::vector<DatetimeParseResultSpan> result_spans;

  std::vector<DateMatch> dates = BuildDateMatches(text, extractor.output());
  std::sort(dates.begin(), dates.end(), by_begin_then_longest);
  if (!dates.empty()) {
    RemoveDuplicatedDates(&dates);
  }

  if (options.enable_date_range) {
    std::vector<DateRangeMatch> ranges =
        BuildDateRangeMatches(text, extractor.range_output());
    if (!ranges.empty()) {
      std::sort(ranges.begin(), ranges.end(), by_begin_then_longest);
      RemoveDuplicatedDates(&ranges);
    }

    if (!dates.empty()) {
      MergeDateRangeAndDate(unilib, text, options.ignored_spans, dates,
                            &ranges);
      RemoveOverlappedDateByRange(ranges, &dates);
    }
    FillDateRangeInstances(ranges, &result_spans);
  }

  if (!dates.empty()) {
    FillDateInstances(unilib, text, options, &dates, &result_spans);
  }
  return result_spans;
}

}  // namespace

// Parses dates and date ranges in `text`.
// The text is run through the lexer and the grammar matcher, restricted to
// the rule shards selected for `locales`; the extractor output is then
// converted into `DatetimeParseResultSpan`s. Returns an empty result if no
// rule shard is selected for the given locales.
std::vector<DatetimeParseResultSpan> DateParser::Parse(
    StringPiece text, const std::vector<Token>& tokens,
    const std::vector<Locale>& locales,
    const DateAnnotationOptions& options) const {
  std::vector<UnicodeText::const_iterator> codepoint_offsets;
  const UnicodeText text_unicode = UTF8ToUnicodeText(text,
                                                     /*do_copy=*/false);

  // One iterator per codepoint, followed by the end position of the text.
  auto codepoint_it = text_unicode.begin();
  while (codepoint_it != text_unicode.end()) {
    codepoint_offsets.push_back(codepoint_it);
    ++codepoint_it;
  }
  codepoint_offsets.push_back(text_unicode.end());

  DateExtractor extractor(codepoint_offsets, options, datetime_rules_);

  // Select locale matching rules.
  // Only use a shard if locales match or the shard doesn't specify a locale
  // restriction.
  std::vector<const grammar::RulesSet_::Rules*> matching_shards =
      SelectLocaleMatchingShards(datetime_rules_->rules(), rules_locales_,
                                 locales);
  if (matching_shards.empty()) {
    return {};
  }

  grammar::Matcher matcher(&unilib_, datetime_rules_->rules(), matching_shards,
                           &extractor);
  lexer_.Process(text_unicode, tokens, /*annotations=*/nullptr, &matcher);

  return GetOutputAsAnnotationList(unilib_, extractor, codepoint_offsets,
                                   options);
}

}  // namespace libtextclassifier3::dates