124 lines
3.9 KiB
C++
124 lines
3.9 KiB
C++
/*
|
|
* Copyright (C) 2018 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "utils/grammar/rules-utils.h"
|
|
|
|
namespace libtextclassifier3::grammar {
|
|
|
|
std::vector<std::vector<Locale>> ParseRulesLocales(const RulesSet* rules) {
|
|
if (rules == nullptr || rules->rules() == nullptr) {
|
|
return {};
|
|
}
|
|
std::vector<std::vector<Locale>> locales(rules->rules()->size());
|
|
for (int i = 0; i < rules->rules()->size(); i++) {
|
|
const grammar::RulesSet_::Rules* rules_shard = rules->rules()->Get(i);
|
|
if (rules_shard->locale() == nullptr) {
|
|
continue;
|
|
}
|
|
for (const LanguageTag* tag : *rules_shard->locale()) {
|
|
locales[i].push_back(Locale::FromLanguageTag(tag));
|
|
}
|
|
}
|
|
return locales;
|
|
}
|
|
|
|
std::vector<const grammar::RulesSet_::Rules*> SelectLocaleMatchingShards(
|
|
const RulesSet* rules,
|
|
const std::vector<std::vector<Locale>>& shard_locales,
|
|
const std::vector<Locale>& locales) {
|
|
std::vector<const grammar::RulesSet_::Rules*> shards;
|
|
if (rules->rules() == nullptr) {
|
|
return shards;
|
|
}
|
|
for (int i = 0; i < shard_locales.size(); i++) {
|
|
if (shard_locales[i].empty() ||
|
|
Locale::IsAnyLocaleSupported(locales,
|
|
/*supported_locales=*/shard_locales[i],
|
|
/*default_value=*/false)) {
|
|
shards.push_back(rules->rules()->Get(i));
|
|
}
|
|
}
|
|
return shards;
|
|
}
|
|
|
|
std::vector<Derivation> DeduplicateDerivations(
|
|
const std::vector<Derivation>& derivations) {
|
|
std::vector<Derivation> sorted_candidates = derivations;
|
|
std::stable_sort(
|
|
sorted_candidates.begin(), sorted_candidates.end(),
|
|
[](const Derivation& a, const Derivation& b) {
|
|
// Sort by id.
|
|
if (a.rule_id != b.rule_id) {
|
|
return a.rule_id < b.rule_id;
|
|
}
|
|
|
|
// Sort by increasing start.
|
|
if (a.match->codepoint_span.first != b.match->codepoint_span.first) {
|
|
return a.match->codepoint_span.first < b.match->codepoint_span.first;
|
|
}
|
|
|
|
// Sort by decreasing end.
|
|
return a.match->codepoint_span.second > b.match->codepoint_span.second;
|
|
});
|
|
|
|
// Deduplicate by overlap.
|
|
std::vector<Derivation> result;
|
|
for (int i = 0; i < sorted_candidates.size(); i++) {
|
|
const Derivation& candidate = sorted_candidates[i];
|
|
bool eliminated = false;
|
|
|
|
// Due to the sorting above, the candidate can only be completely
|
|
// intersected by a match before it in the sorted order.
|
|
for (int j = i - 1; j >= 0; j--) {
|
|
if (sorted_candidates[j].rule_id != candidate.rule_id) {
|
|
break;
|
|
}
|
|
if (sorted_candidates[j].match->codepoint_span.first <=
|
|
candidate.match->codepoint_span.first &&
|
|
sorted_candidates[j].match->codepoint_span.second >=
|
|
candidate.match->codepoint_span.second) {
|
|
eliminated = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!eliminated) {
|
|
result.push_back(candidate);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
bool VerifyAssertions(const Match* match) {
|
|
bool result = true;
|
|
grammar::Traverse(match, [&result](const Match* node) {
|
|
if (node->type != Match::kAssertionMatch) {
|
|
// Only validation if all checks so far passed.
|
|
return result;
|
|
}
|
|
|
|
// Positive assertions are by definition fulfilled,
|
|
// fail if the assertion is negative.
|
|
if (static_cast<const AssertionMatch*>(node)->negative) {
|
|
result = false;
|
|
}
|
|
return result;
|
|
});
|
|
return result;
|
|
}
|
|
|
|
} // namespace libtextclassifier3::grammar
|