Add support for performing NFD and NFKD on strings
This commit is contained in:
parent
2675f3e9e7
commit
3d720c6c09
@ -450,6 +450,97 @@ fn next(&mut self) -> Option<&'self str> {
|
||||
}
|
||||
}
|
||||
|
||||
// Helper functions used for Unicode normalization
|
||||
fn canonical_sort(comb: &mut [(char, u8)]) {
|
||||
use iterator::range;
|
||||
use tuple::CopyableTuple;
|
||||
|
||||
let len = comb.len();
|
||||
for i in range(0, len) {
|
||||
let mut swapped = false;
|
||||
for j in range(1, len-i) {
|
||||
let classA = comb[j-1].second();
|
||||
let classB = comb[j].second();
|
||||
if classA != 0 && classB != 0 && classA > classB {
|
||||
comb.swap(j-1, j);
|
||||
swapped = true;
|
||||
}
|
||||
}
|
||||
if !swapped { break; }
|
||||
}
|
||||
}
|
||||
|
||||
#[deriving(Clone)]
|
||||
enum NormalizationForm {
|
||||
NFD,
|
||||
NFKD
|
||||
}
|
||||
|
||||
/// External iterator for a string's normalization's characters.
|
||||
/// Use with the `std::iterator` module.
|
||||
#[deriving(Clone)]
|
||||
struct NormalizationIterator<'self> {
|
||||
priv kind: NormalizationForm,
|
||||
priv index: uint,
|
||||
priv string: &'self str,
|
||||
priv buffer: ~[(char, u8)],
|
||||
priv sorted: bool
|
||||
}
|
||||
|
||||
impl<'self> Iterator<char> for NormalizationIterator<'self> {
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<char> {
|
||||
use unicode::decompose::canonical_combining_class;
|
||||
|
||||
match self.buffer.head_opt() {
|
||||
Some(&(c, 0)) => {
|
||||
self.sorted = false;
|
||||
self.buffer.shift();
|
||||
return Some(c);
|
||||
}
|
||||
Some(&(c, _)) if self.sorted => {
|
||||
self.buffer.shift();
|
||||
return Some(c);
|
||||
}
|
||||
_ => self.sorted = false
|
||||
}
|
||||
|
||||
let decomposer = match self.kind {
|
||||
NFD => char::decompose_canonical,
|
||||
NFKD => char::decompose_compatible
|
||||
};
|
||||
|
||||
while !self.sorted && self.index < self.string.len() {
|
||||
let CharRange {ch, next} = self.string.char_range_at(self.index);
|
||||
self.index = next;
|
||||
do decomposer(ch) |d| {
|
||||
let class = canonical_combining_class(d);
|
||||
if class == 0 && !self.sorted {
|
||||
canonical_sort(self.buffer);
|
||||
self.sorted = true;
|
||||
}
|
||||
self.buffer.push((d, class));
|
||||
}
|
||||
}
|
||||
|
||||
if !self.sorted {
|
||||
canonical_sort(self.buffer);
|
||||
self.sorted = true;
|
||||
}
|
||||
|
||||
match self.buffer.shift_opt() {
|
||||
Some((c, 0)) => {
|
||||
self.sorted = false;
|
||||
Some(c)
|
||||
}
|
||||
Some((c, _)) => Some(c),
|
||||
None => None
|
||||
}
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (uint, Option<uint>) { (self.string.len(), None) }
|
||||
}
|
||||
|
||||
/// Replace all occurrences of one string with another
|
||||
///
|
||||
/// # Arguments
|
||||
@ -1128,6 +1219,8 @@ fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_
|
||||
fn line_iter(&self) -> CharSplitIterator<'self, char>;
|
||||
fn any_line_iter(&self) -> AnyLineIterator<'self>;
|
||||
fn word_iter(&self) -> WordIterator<'self>;
|
||||
fn nfd_iter(&self) -> NormalizationIterator<'self>;
|
||||
fn nfkd_iter(&self) -> NormalizationIterator<'self>;
|
||||
fn ends_with(&self, needle: &str) -> bool;
|
||||
fn is_whitespace(&self) -> bool;
|
||||
fn is_alphanumeric(&self) -> bool;
|
||||
@ -1343,6 +1436,28 @@ fn word_iter(&self) -> WordIterator<'self> {
|
||||
self.split_iter(char::is_whitespace).filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
/// Returns the string in Unicode Normalization Form D (canonical decomposition)
|
||||
fn nfd_iter(&self) -> NormalizationIterator<'self> {
|
||||
NormalizationIterator {
|
||||
index: 0,
|
||||
string: *self,
|
||||
buffer: ~[],
|
||||
sorted: false,
|
||||
kind: NFD
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the string in Unicode Normalization Form KD (compatibility decomposition)
|
||||
fn nfkd_iter(&self) -> NormalizationIterator<'self> {
|
||||
NormalizationIterator {
|
||||
index: 0,
|
||||
string: *self,
|
||||
buffer: ~[],
|
||||
sorted: false,
|
||||
kind: NFKD
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the string contains only whitespace
|
||||
///
|
||||
/// Whitespace characters are determined by `char::is_whitespace`
|
||||
@ -3217,6 +3332,34 @@ fn test_word_iter() {
|
||||
assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_nfd_iter() {
|
||||
assert_eq!("abc".nfd_iter().collect::<~str>(), ~"abc");
|
||||
assert_eq!("\u1e0b\u01c4".nfd_iter().collect::<~str>(), ~"d\u0307\u01c4");
|
||||
assert_eq!("\u2026".nfd_iter().collect::<~str>(), ~"\u2026");
|
||||
assert_eq!("\u2126".nfd_iter().collect::<~str>(), ~"\u03a9");
|
||||
assert_eq!("\u1e0b\u0323".nfd_iter().collect::<~str>(), ~"d\u0323\u0307");
|
||||
assert_eq!("\u1e0d\u0307".nfd_iter().collect::<~str>(), ~"d\u0323\u0307");
|
||||
assert_eq!("a\u0301".nfd_iter().collect::<~str>(), ~"a\u0301");
|
||||
assert_eq!("\u0301a".nfd_iter().collect::<~str>(), ~"\u0301a");
|
||||
assert_eq!("\ud4db".nfd_iter().collect::<~str>(), ~"\u1111\u1171\u11b6");
|
||||
assert_eq!("\uac1c".nfd_iter().collect::<~str>(), ~"\u1100\u1162");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_nfkd_iter() {
|
||||
assert_eq!("abc".nfkd_iter().collect::<~str>(), ~"abc");
|
||||
assert_eq!("\u1e0b\u01c4".nfkd_iter().collect::<~str>(), ~"d\u0307DZ\u030c");
|
||||
assert_eq!("\u2026".nfkd_iter().collect::<~str>(), ~"...");
|
||||
assert_eq!("\u2126".nfkd_iter().collect::<~str>(), ~"\u03a9");
|
||||
assert_eq!("\u1e0b\u0323".nfkd_iter().collect::<~str>(), ~"d\u0323\u0307");
|
||||
assert_eq!("\u1e0d\u0307".nfkd_iter().collect::<~str>(), ~"d\u0323\u0307");
|
||||
assert_eq!("a\u0301".nfkd_iter().collect::<~str>(), ~"a\u0301");
|
||||
assert_eq!("\u0301a".nfkd_iter().collect::<~str>(), ~"\u0301a");
|
||||
assert_eq!("\ud4db".nfkd_iter().collect::<~str>(), ~"\u1111\u1171\u11b6");
|
||||
assert_eq!("\uac1c".nfkd_iter().collect::<~str>(), ~"\u1100\u1162");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_line_iter() {
|
||||
let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
|
||||
|
Loading…
Reference in New Issue
Block a user