1 // Copyright 2015 The Servo Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! This crate implements the [Unicode Bidirectional Algorithm][tr9] for display of mixed
11 //! right-to-left and left-to-right text.  It is written in safe Rust, compatible with the
12 //! current stable release.
13 //!
14 //! ## Example
15 //!
16 //! ```rust
17 //! # #[cfg(feature = "hardcoded-data")] {
18 //! use unicode_bidi::BidiInfo;
19 //!
20 //! // This example text is defined using `concat!` because some browsers
21 //! // and text editors have trouble displaying bidi strings.
22 //! let text = concat![
23 //!   "א",
24 //!   "ב",
25 //!   "ג",
26 //!   "a",
27 //!   "b",
28 //!   "c",
29 //! ];
30 //!
31 //! // Resolve embedding levels within the text.  Pass `None` to detect the
32 //! // paragraph level automatically.
33 //! let bidi_info = BidiInfo::new(&text, None);
34 //!
35 //! // This paragraph has embedding level 1 because its first strong character is RTL.
36 //! assert_eq!(bidi_info.paragraphs.len(), 1);
37 //! let para = &bidi_info.paragraphs[0];
38 //! assert_eq!(para.level.number(), 1);
39 //! assert_eq!(para.level.is_rtl(), true);
40 //!
41 //! // Re-ordering is done after wrapping each paragraph into a sequence of
42 //! // lines. For this example, I'll just use a single line that spans the
43 //! // entire paragraph.
44 //! let line = para.range.clone();
45 //!
46 //! let display = bidi_info.reorder_line(para, line);
47 //! assert_eq!(display, concat![
48 //!   "a",
49 //!   "b",
50 //!   "c",
51 //!   "ג",
52 //!   "ב",
53 //!   "א",
54 //! ]);
55 //! # } // feature = "hardcoded-data"
56 //! ```
57 //!
58 //! # Features
59 //!
60 //! - `std`: Enabled by default, but can be disabled to make `unicode_bidi`
61 //!   `#![no_std]` + `alloc` compatible.
62 //! - `hardcoded-data`: Enabled by default. Includes hardcoded Unicode bidi data and more convenient APIs.
63 //! - `serde`: Adds [`serde::Serialize`] and [`serde::Deserialize`]
64 //!   implementations to relevant types.
65 //!
66 //! [tr9]: <http://www.unicode.org/reports/tr9/>
67 
68 #![no_std]
69 // We need to link to std to make doc tests work on older Rust versions
70 #[cfg(feature = "std")]
71 extern crate std;
72 #[macro_use]
73 extern crate alloc;
74 
75 pub mod data_source;
76 pub mod deprecated;
77 pub mod format_chars;
78 pub mod level;
79 pub mod utf16;
80 
81 mod char_data;
82 mod explicit;
83 mod implicit;
84 mod prepare;
85 
86 pub use crate::char_data::{BidiClass, UNICODE_VERSION};
87 pub use crate::data_source::BidiDataSource;
88 pub use crate::level::{Level, LTR_LEVEL, RTL_LEVEL};
89 pub use crate::prepare::LevelRun;
90 
91 #[cfg(feature = "hardcoded-data")]
92 pub use crate::char_data::{bidi_class, HardcodedBidiData};
93 
94 use alloc::borrow::Cow;
95 use alloc::string::String;
96 use alloc::vec::Vec;
97 use core::char;
98 use core::cmp;
99 use core::iter::repeat;
100 use core::ops::Range;
101 use core::str::CharIndices;
102 
103 use crate::format_chars as chars;
104 use crate::BidiClass::*;
105 
/// Trait that abstracts over a text source for use by the bidi algorithms.
///
/// We implement this for `str` (UTF-8) and for `[u16]` (UTF-16, native-endian).
/// (For internal unicode-bidi use; API may be unstable.)
///
/// Every index and length exchanged through this trait is measured in code units
/// of the underlying encoding, not in characters.
///
/// This trait is sealed and cannot be implemented for types outside this crate.
pub trait TextSource<'text>: private::Sealed {
    /// Iterator type returned by `chars`.
    type CharIter: Iterator<Item = char>;
    /// Iterator type returned by `char_indices`.
    type CharIndexIter: Iterator<Item = (usize, char)>;
    /// Iterator type returned by `indices_lengths`.
    type IndexLenIter: Iterator<Item = (usize, usize)>;

    /// Return the length of the text in code units.
    #[doc(hidden)]
    fn len(&self) -> usize;

    /// Get the character at a given code unit index, along with its length in code units.
    ///
    /// Returns `None` if `index` is out of range, or points inside a multi-code-unit character.
    /// Returns REPLACEMENT_CHARACTER for any unpaired surrogates in UTF-16.
    #[doc(hidden)]
    fn char_at(&self, index: usize) -> Option<(char, usize)>;

    /// Return a subrange of the text, indexed by code units.
    /// (We don't implement all of the Index trait, just the minimum we use.)
    #[doc(hidden)]
    fn subrange(&self, range: Range<usize>) -> &Self;

    /// An iterator over the text returning Unicode characters,
    /// REPLACEMENT_CHAR for invalid code units.
    #[doc(hidden)]
    fn chars(&'text self) -> Self::CharIter;

    /// An iterator over the text returning (index, char) tuples,
    /// where index is the starting code-unit index of the character,
    /// and char is its Unicode value (or REPLACEMENT_CHAR if invalid).
    #[doc(hidden)]
    fn char_indices(&'text self) -> Self::CharIndexIter;

    /// An iterator over the text returning (index, length) tuples,
    /// where index is the starting code-unit index of the character,
    /// and length is its length in code units.
    #[doc(hidden)]
    fn indices_lengths(&'text self) -> Self::IndexLenIter;

    /// Number of code units the given character uses.
    ///
    /// This is an associated function (no `self`): the answer depends only on the
    /// implementing type's encoding and on `ch`.
    #[doc(hidden)]
    fn char_len(ch: char) -> usize;
}
151 
// Private module backing the sealing of `TextSource`: `Sealed` is a supertrait of
// `TextSource`, and because this module is not public, code outside the crate
// cannot name `Sealed` and therefore cannot implement `TextSource`.
mod private {
    /// Marker supertrait required by `TextSource`.
    pub trait Sealed {}

    // Implement for str and [u16] only.
    impl Sealed for str {}
    impl Sealed for [u16] {}
}
159 
/// Direction classification with three possible outcomes.
///
/// NOTE(review): the code that produces or consumes this enum is not visible in
/// this part of the file; presumably `Mixed` means that both left-to-right and
/// right-to-left content is present — confirm against the users of this type.
///
/// The type is a plain field-less enum, so it is cheap to copy and supports full
/// equality comparison.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Direction {
    /// Left-to-right.
    Ltr,
    /// Right-to-left.
    Rtl,
    /// Neither purely left-to-right nor purely right-to-left.
    Mixed,
}
166 
/// Bidi information about a single paragraph
#[derive(Clone, Debug, PartialEq)]
pub struct ParagraphInfo {
    /// The paragraph's boundaries within the text, as byte indices.
    ///
    /// The range is half-open: `range.start` is the first index of the paragraph and
    /// `range.end` is one past its last index.
    ///
    /// TODO: Shrink this to only include the starting index?
    pub range: Range<usize>,

    /// The paragraph embedding level.
    ///
    /// <http://www.unicode.org/reports/tr9/#BD4>
    pub level: Level,
}
180 
181 impl ParagraphInfo {
182     /// Gets the length of the paragraph in the source text.
len(&self) -> usize183     pub fn len(&self) -> usize {
184         self.range.end - self.range.start
185     }
186 }
187 
/// Initial bidi information of the text.
///
/// Contains the text paragraphs and `BidiClass` of its characters.
/// No embedding levels are stored here apart from each paragraph's own level.
#[derive(PartialEq, Debug)]
pub struct InitialInfo<'text> {
    /// The text
    pub text: &'text str,

    /// The BidiClass of the character at each byte in the text.
    /// If a character is multiple bytes, its class will appear multiple times in the vector.
    pub original_classes: Vec<BidiClass>,

    /// The boundaries and level of each paragraph within the text.
    pub paragraphs: Vec<ParagraphInfo>,
}
203 
204 impl<'text> InitialInfo<'text> {
205     /// Find the paragraphs and BidiClasses in a string of text.
206     ///
207     /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
208     ///
209     /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
210     /// character is found before the matching PDI.  If no strong character is found, the class will
211     /// remain FSI, and it's up to later stages to treat these as LRI when needed.
212     ///
213     /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
214     #[cfg_attr(feature = "flame_it", flamer::flame)]
215     #[cfg(feature = "hardcoded-data")]
new(text: &str, default_para_level: Option<Level>) -> InitialInfo<'_>216     pub fn new(text: &str, default_para_level: Option<Level>) -> InitialInfo<'_> {
217         Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
218     }
219 
220     /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`]
221     /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`]
222     /// instead (enabled with tbe default `hardcoded-data` Cargo feature)
223     ///
224     /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
225     ///
226     /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
227     /// character is found before the matching PDI.  If no strong character is found, the class will
228     /// remain FSI, and it's up to later stages to treat these as LRI when needed.
229     #[cfg_attr(feature = "flame_it", flamer::flame)]
new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a str, default_para_level: Option<Level>, ) -> InitialInfo<'a>230     pub fn new_with_data_source<'a, D: BidiDataSource>(
231         data_source: &D,
232         text: &'a str,
233         default_para_level: Option<Level>,
234     ) -> InitialInfo<'a> {
235         InitialInfoExt::new_with_data_source(data_source, text, default_para_level).base
236     }
237 }
238 
/// Extended version of InitialInfo (not public API).
///
/// Wraps an [`InitialInfo`] together with one extra flag per paragraph.
#[derive(PartialEq, Debug)]
struct InitialInfoExt<'text> {
    /// The base InitialInfo for the text, recording its paragraphs and bidi classes.
    base: InitialInfo<'text>,

    /// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that
    /// requires no further bidi processing (i.e. there are no RTL characters or bidi
    /// control codes present).
    pure_ltr: Vec<bool>,
}
250 
251 impl<'text> InitialInfoExt<'text> {
252     /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`]
253     /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`]
254     /// instead (enabled with tbe default `hardcoded-data` Cargo feature)
255     ///
256     /// <http://www.unicode.org/reports/tr9/#The_Paragraph_Level>
257     ///
258     /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong
259     /// character is found before the matching PDI.  If no strong character is found, the class will
260     /// remain FSI, and it's up to later stages to treat these as LRI when needed.
261     #[cfg_attr(feature = "flame_it", flamer::flame)]
new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a str, default_para_level: Option<Level>, ) -> InitialInfoExt<'a>262     pub fn new_with_data_source<'a, D: BidiDataSource>(
263         data_source: &D,
264         text: &'a str,
265         default_para_level: Option<Level>,
266     ) -> InitialInfoExt<'a> {
267         let mut paragraphs = Vec::<ParagraphInfo>::new();
268         let mut pure_ltr = Vec::<bool>::new();
269         let (original_classes, _, _) = compute_initial_info(
270             data_source,
271             text,
272             default_para_level,
273             Some((&mut paragraphs, &mut pure_ltr)),
274         );
275 
276         InitialInfoExt {
277             base: InitialInfo {
278                 text,
279                 original_classes,
280                 paragraphs,
281             },
282             pure_ltr,
283         }
284     }
285 }
286 
287 /// Implementation of initial-info computation for both BidiInfo and ParagraphBidiInfo.
288 /// To treat the text as (potentially) multiple paragraphs, the caller should pass the
289 /// pair of optional outparam arrays to receive the ParagraphInfo and pure-ltr flags
290 /// for each paragraph. Passing None for split_paragraphs will ignore any paragraph-
291 /// separator characters in the text, treating it just as a single paragraph.
292 /// Returns the array of BidiClass values for each code unit of the text, along with
293 /// the embedding level and pure-ltr flag for the *last* (or only) paragraph.
compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( data_source: &D, text: &'a T, default_para_level: Option<Level>, mut split_paragraphs: Option<(&mut Vec<ParagraphInfo>, &mut Vec<bool>)>, ) -> (Vec<BidiClass>, Level, bool)294 fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
295     data_source: &D,
296     text: &'a T,
297     default_para_level: Option<Level>,
298     mut split_paragraphs: Option<(&mut Vec<ParagraphInfo>, &mut Vec<bool>)>,
299 ) -> (Vec<BidiClass>, Level, bool) {
300     let mut original_classes = Vec::with_capacity(text.len());
301 
302     // The stack contains the starting code unit index for each nested isolate we're inside.
303     let mut isolate_stack = Vec::new();
304 
305     debug_assert!(
306         if let Some((ref paragraphs, ref pure_ltr)) = split_paragraphs {
307             paragraphs.is_empty() && pure_ltr.is_empty()
308         } else {
309             true
310         }
311     );
312 
313     let mut para_start = 0;
314     let mut para_level = default_para_level;
315 
316     // Per-paragraph flag: can subsequent processing be skipped? Set to false if any
317     // RTL characters or bidi control characters are encountered in the paragraph.
318     let mut is_pure_ltr = true;
319 
320     #[cfg(feature = "flame_it")]
321     flame::start("compute_initial_info(): iter text.char_indices()");
322 
323     for (i, c) in text.char_indices() {
324         let class = data_source.bidi_class(c);
325 
326         #[cfg(feature = "flame_it")]
327         flame::start("original_classes.extend()");
328 
329         let len = T::char_len(c);
330         original_classes.extend(repeat(class).take(len));
331 
332         #[cfg(feature = "flame_it")]
333         flame::end("original_classes.extend()");
334 
335         match class {
336             B => {
337                 if let Some((ref mut paragraphs, ref mut pure_ltr)) = split_paragraphs {
338                     // P1. Split the text into separate paragraphs. The paragraph separator is kept
339                     // with the previous paragraph.
340                     let para_end = i + len;
341                     paragraphs.push(ParagraphInfo {
342                         range: para_start..para_end,
343                         // P3. If no character is found in p2, set the paragraph level to zero.
344                         level: para_level.unwrap_or(LTR_LEVEL),
345                     });
346                     pure_ltr.push(is_pure_ltr);
347                     // Reset state for the start of the next paragraph.
348                     para_start = para_end;
349                     // TODO: Support defaulting to direction of previous paragraph
350                     //
351                     // <http://www.unicode.org/reports/tr9/#HL1>
352                     para_level = default_para_level;
353                     is_pure_ltr = true;
354                     isolate_stack.clear();
355                 }
356             }
357 
358             L | R | AL => {
359                 if class != L {
360                     is_pure_ltr = false;
361                 }
362                 match isolate_stack.last() {
363                     Some(&start) => {
364                         if original_classes[start] == FSI {
365                             // X5c. If the first strong character between FSI and its matching
366                             // PDI is R or AL, treat it as RLI. Otherwise, treat it as LRI.
367                             for j in 0..T::char_len(chars::FSI) {
368                                 original_classes[start + j] = if class == L { LRI } else { RLI };
369                             }
370                         }
371                     }
372 
373                     None => {
374                         if para_level.is_none() {
375                             // P2. Find the first character of type L, AL, or R, while skipping
376                             // any characters between an isolate initiator and its matching
377                             // PDI.
378                             para_level = Some(if class != L { RTL_LEVEL } else { LTR_LEVEL });
379                         }
380                     }
381                 }
382             }
383 
384             AN | LRE | RLE | LRO | RLO => {
385                 is_pure_ltr = false;
386             }
387 
388             RLI | LRI | FSI => {
389                 is_pure_ltr = false;
390                 isolate_stack.push(i);
391             }
392 
393             PDI => {
394                 isolate_stack.pop();
395             }
396 
397             _ => {}
398         }
399     }
400 
401     if let Some((paragraphs, pure_ltr)) = split_paragraphs {
402         if para_start < text.len() {
403             paragraphs.push(ParagraphInfo {
404                 range: para_start..text.len(),
405                 level: para_level.unwrap_or(LTR_LEVEL),
406             });
407             pure_ltr.push(is_pure_ltr);
408         }
409         debug_assert_eq!(paragraphs.len(), pure_ltr.len());
410     }
411     debug_assert_eq!(original_classes.len(), text.len());
412 
413     #[cfg(feature = "flame_it")]
414     flame::end("compute_initial_info(): iter text.char_indices()");
415 
416     (
417         original_classes,
418         para_level.unwrap_or(LTR_LEVEL),
419         is_pure_ltr,
420     )
421 }
422 
/// Bidi information of the text.
///
/// The `original_classes` and `levels` vectors are indexed by byte offsets into the text.  If a
/// character is multiple bytes wide, then its class and level will appear multiple times in these
/// vectors.
// TODO: Impl `struct StringProperty<T> { values: Vec<T> }` and use instead of Vec<T>
#[derive(Debug, PartialEq)]
pub struct BidiInfo<'text> {
    /// The text
    pub text: &'text str,

    /// The BidiClass of the character at each byte in the text.
    pub original_classes: Vec<BidiClass>,

    /// The directional embedding level of each byte in the text.
    pub levels: Vec<Level>,

    /// The boundaries and paragraph embedding level of each paragraph within the text.
    ///
    /// TODO: Use SmallVec or similar to avoid overhead when there are only one or two paragraphs?
    /// Or just don't include the first paragraph, which always starts at 0?
    pub paragraphs: Vec<ParagraphInfo>,
}
446 
447 impl<'text> BidiInfo<'text> {
448     /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph.
449     ///
450     ///
451     /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
452     ///
453     /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
454     /// text that is entirely LTR.  See the `nsBidi` class from Gecko for comparison.
455     ///
456     /// TODO: Support auto-RTL base direction
457     #[cfg_attr(feature = "flame_it", flamer::flame)]
458     #[cfg(feature = "hardcoded-data")]
459     #[inline]
new(text: &str, default_para_level: Option<Level>) -> BidiInfo<'_>460     pub fn new(text: &str, default_para_level: Option<Level>) -> BidiInfo<'_> {
461         Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
462     }
463 
464     /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph, with a custom [`BidiDataSource`]
465     /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`]
466     /// instead (enabled with tbe default `hardcoded-data` Cargo feature).
467     ///
468     /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
469     /// text that is entirely LTR.  See the `nsBidi` class from Gecko for comparison.
470     ///
471     /// TODO: Support auto-RTL base direction
472     #[cfg_attr(feature = "flame_it", flamer::flame)]
new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a str, default_para_level: Option<Level>, ) -> BidiInfo<'a>473     pub fn new_with_data_source<'a, D: BidiDataSource>(
474         data_source: &D,
475         text: &'a str,
476         default_para_level: Option<Level>,
477     ) -> BidiInfo<'a> {
478         let InitialInfoExt { base, pure_ltr, .. } =
479             InitialInfoExt::new_with_data_source(data_source, text, default_para_level);
480 
481         let mut levels = Vec::<Level>::with_capacity(text.len());
482         let mut processing_classes = base.original_classes.clone();
483 
484         for (para, is_pure_ltr) in base.paragraphs.iter().zip(pure_ltr.iter()) {
485             let text = &text[para.range.clone()];
486             let original_classes = &base.original_classes[para.range.clone()];
487 
488             compute_bidi_info_for_para(
489                 data_source,
490                 para,
491                 *is_pure_ltr,
492                 text,
493                 original_classes,
494                 &mut processing_classes,
495                 &mut levels,
496             );
497         }
498 
499         BidiInfo {
500             text,
501             original_classes: base.original_classes,
502             paragraphs: base.paragraphs,
503             levels,
504         }
505     }
506 
507     /// Produce the levels for this paragraph as needed for reordering, one level per *byte*
508     /// in the paragraph. The returned vector includes bytes that are not included
509     /// in the `line`, but will not adjust them.
510     ///
511     /// This runs [Rule L1], you can run
512     /// [Rule L2] by calling [`Self::reorder_visual()`].
513     /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
514     /// to avoid non-byte indices.
515     ///
516     /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
517     ///
518     /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
519     /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
520     #[cfg_attr(feature = "flame_it", flamer::flame)]
reordered_levels(&self, para: &ParagraphInfo, line: Range<usize>) -> Vec<Level>521     pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range<usize>) -> Vec<Level> {
522         assert!(line.start <= self.levels.len());
523         assert!(line.end <= self.levels.len());
524 
525         let mut levels = self.levels.clone();
526         let line_classes = &self.original_classes[line.clone()];
527         let line_levels = &mut levels[line.clone()];
528 
529         reorder_levels(
530             line_classes,
531             line_levels,
532             self.text.subrange(line),
533             para.level,
534         );
535 
536         levels
537     }
538 
539     /// Produce the levels for this paragraph as needed for reordering, one level per *character*
540     /// in the paragraph. The returned vector includes characters that are not included
541     /// in the `line`, but will not adjust them.
542     ///
543     /// This runs [Rule L1], you can run
544     /// [Rule L2] by calling [`Self::reorder_visual()`].
545     /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
546     /// to avoid non-byte indices.
547     ///
548     /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
549     ///
550     /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
551     /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
552     #[cfg_attr(feature = "flame_it", flamer::flame)]
reordered_levels_per_char( &self, para: &ParagraphInfo, line: Range<usize>, ) -> Vec<Level>553     pub fn reordered_levels_per_char(
554         &self,
555         para: &ParagraphInfo,
556         line: Range<usize>,
557     ) -> Vec<Level> {
558         let levels = self.reordered_levels(para, line);
559         self.text.char_indices().map(|(i, _)| levels[i]).collect()
560     }
561 
562     /// Re-order a line based on resolved levels and return the line in display order.
563     ///
564     /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
565     ///
566     /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
567     /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
568     #[cfg_attr(feature = "flame_it", flamer::flame)]
reorder_line(&self, para: &ParagraphInfo, line: Range<usize>) -> Cow<'text, str>569     pub fn reorder_line(&self, para: &ParagraphInfo, line: Range<usize>) -> Cow<'text, str> {
570         if !level::has_rtl(&self.levels[line.clone()]) {
571             return self.text[line].into();
572         }
573         let (levels, runs) = self.visual_runs(para, line.clone());
574         reorder_line(self.text, line, levels, runs)
575     }
576 
577     /// Reorders pre-calculated levels of a sequence of characters.
578     ///
579     /// NOTE: This is a convenience method that does not use a `Paragraph`  object. It is
580     /// intended to be used when an application has determined the levels of the objects (character sequences)
581     /// and just needs to have them reordered.
582     ///
583     /// the index map will result in `indexMap[visualIndex]==logicalIndex`.
584     ///
585     /// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have
586     /// information about the actual text.
587     ///
588     /// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be
589     /// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level`
590     /// is for a single code point.
591     ///
592     ///
593     ///   # # Example
594     /// ```
595     /// use unicode_bidi::BidiInfo;
596     /// use unicode_bidi::Level;
597     ///
598     /// let l0 = Level::from(0);
599     /// let l1 = Level::from(1);
600     /// let l2 = Level::from(2);
601     ///
602     /// let levels = vec![l0, l0, l0, l0];
603     /// let index_map = BidiInfo::reorder_visual(&levels);
604     /// assert_eq!(levels.len(), index_map.len());
605     /// assert_eq!(index_map, [0, 1, 2, 3]);
606     ///
607     /// let levels: Vec<Level> = vec![l0, l0, l0, l1, l1, l1, l2, l2];
608     /// let index_map = BidiInfo::reorder_visual(&levels);
609     /// assert_eq!(levels.len(), index_map.len());
610     /// assert_eq!(index_map, [0, 1, 2, 6, 7, 5, 4, 3]);
611     /// ```
612     #[cfg_attr(feature = "flame_it", flamer::flame)]
613     #[inline]
reorder_visual(levels: &[Level]) -> Vec<usize>614     pub fn reorder_visual(levels: &[Level]) -> Vec<usize> {
615         reorder_visual(levels)
616     }
617 
618     /// Find the level runs within a line and return them in visual order.
619     ///
620     /// `line` is a range of bytes indices within `levels`.
621     ///
622     /// The first return value is a vector of levels used by the reordering algorithm,
623     /// i.e. the result of [Rule L1]. The second return value is a vector of level runs,
624     /// the result of [Rule L2], showing the visual order that each level run (a run of text with the
625     /// same level) should be displayed. Within each run, the display order can be checked
626     /// against the Level vector.
627     ///
628     /// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring),
629     /// as that should be handled by the engine using this API.
630     ///
631     /// Conceptually, this is the same as running [`Self::reordered_levels()`] followed by
632     /// [`Self::reorder_visual()`], however it returns the result as a list of level runs instead
633     /// of producing a level map, since one may wish to deal with the fact that this is operating on
634     /// byte rather than character indices.
635     ///
636     /// <http://www.unicode.org/reports/tr9/#Reordering_Resolved_Levels>
637     ///
638     /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
639     /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
640     /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
641     /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
642     #[cfg_attr(feature = "flame_it", flamer::flame)]
643     #[inline]
visual_runs( &self, para: &ParagraphInfo, line: Range<usize>, ) -> (Vec<Level>, Vec<LevelRun>)644     pub fn visual_runs(
645         &self,
646         para: &ParagraphInfo,
647         line: Range<usize>,
648     ) -> (Vec<Level>, Vec<LevelRun>) {
649         let levels = self.reordered_levels(para, line.clone());
650         visual_runs_for_line(levels, &line)
651     }
652 
653     /// If processed text has any computed RTL levels
654     ///
655     /// This information is usually used to skip re-ordering of text when no RTL level is present
656     #[inline]
has_rtl(&self) -> bool657     pub fn has_rtl(&self) -> bool {
658         level::has_rtl(&self.levels)
659     }
660 }
661 
/// Bidi information of text treated as a single paragraph.
///
/// The `original_classes` and `levels` vectors are indexed by byte offsets into the text.  If a
/// character is multiple bytes wide, then its class and level will appear multiple times in these
/// vectors.
#[derive(Debug, PartialEq)]
pub struct ParagraphBidiInfo<'text> {
    /// The text
    pub text: &'text str,

    /// The BidiClass of the character at each byte in the text.
    pub original_classes: Vec<BidiClass>,

    /// The directional embedding level of each byte in the text.
    pub levels: Vec<Level>,

    /// The paragraph embedding level.
    pub paragraph_level: Level,

    /// Whether the paragraph is purely LTR, i.e. no RTL characters or bidi control
    /// characters were encountered while scanning the text.
    pub is_pure_ltr: bool,
}
684 
685 impl<'text> ParagraphBidiInfo<'text> {
686     /// Determine the bidi embedding level.
687     ///
688     ///
689     /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this.
690     ///
691     /// TODO: In early steps, check for special cases that allow later steps to be skipped. like
692     /// text that is entirely LTR.  See the `nsBidi` class from Gecko for comparison.
693     ///
694     /// TODO: Support auto-RTL base direction
695     #[cfg_attr(feature = "flame_it", flamer::flame)]
696     #[cfg(feature = "hardcoded-data")]
697     #[inline]
new(text: &str, default_para_level: Option<Level>) -> ParagraphBidiInfo<'_>698     pub fn new(text: &str, default_para_level: Option<Level>) -> ParagraphBidiInfo<'_> {
699         Self::new_with_data_source(&HardcodedBidiData, text, default_para_level)
700     }
701 
702     /// Determine the bidi embedding level, with a custom [`BidiDataSource`]
703     /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`]
704     /// instead (enabled with tbe default `hardcoded-data` Cargo feature).
705     ///
706     /// (This is the single-paragraph equivalent of BidiInfo::new_with_data_source,
707     /// and should be kept in sync with it.
708     #[cfg_attr(feature = "flame_it", flamer::flame)]
new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a str, default_para_level: Option<Level>, ) -> ParagraphBidiInfo<'a>709     pub fn new_with_data_source<'a, D: BidiDataSource>(
710         data_source: &D,
711         text: &'a str,
712         default_para_level: Option<Level>,
713     ) -> ParagraphBidiInfo<'a> {
714         // Here we could create a ParagraphInitialInfo struct to parallel the one
715         // used by BidiInfo, but there doesn't seem any compelling reason for it.
716         let (original_classes, paragraph_level, is_pure_ltr) =
717             compute_initial_info(data_source, text, default_para_level, None);
718 
719         let mut levels = Vec::<Level>::with_capacity(text.len());
720         let mut processing_classes = original_classes.clone();
721 
722         let para_info = ParagraphInfo {
723             range: Range {
724                 start: 0,
725                 end: text.len(),
726             },
727             level: paragraph_level,
728         };
729 
730         compute_bidi_info_for_para(
731             data_source,
732             &para_info,
733             is_pure_ltr,
734             text,
735             &original_classes,
736             &mut processing_classes,
737             &mut levels,
738         );
739 
740         ParagraphBidiInfo {
741             text,
742             original_classes,
743             levels,
744             paragraph_level,
745             is_pure_ltr,
746         }
747     }
748 
749     /// Produce the levels for this paragraph as needed for reordering, one level per *byte*
750     /// in the paragraph. The returned vector includes bytes that are not included
751     /// in the `line`, but will not adjust them.
752     ///
753     /// See BidiInfo::reordered_levels for details.
754     ///
755     /// (This should be kept in sync with BidiInfo::reordered_levels.)
756     #[cfg_attr(feature = "flame_it", flamer::flame)]
reordered_levels(&self, line: Range<usize>) -> Vec<Level>757     pub fn reordered_levels(&self, line: Range<usize>) -> Vec<Level> {
758         assert!(line.start <= self.levels.len());
759         assert!(line.end <= self.levels.len());
760 
761         let mut levels = self.levels.clone();
762         let line_classes = &self.original_classes[line.clone()];
763         let line_levels = &mut levels[line.clone()];
764 
765         reorder_levels(
766             line_classes,
767             line_levels,
768             self.text.subrange(line),
769             self.paragraph_level,
770         );
771 
772         levels
773     }
774 
775     /// Produce the levels for this paragraph as needed for reordering, one level per *character*
776     /// in the paragraph. The returned vector includes characters that are not included
777     /// in the `line`, but will not adjust them.
778     ///
779     /// See BidiInfo::reordered_levels_per_char for details.
780     ///
781     /// (This should be kept in sync with BidiInfo::reordered_levels_per_char.)
782     #[cfg_attr(feature = "flame_it", flamer::flame)]
reordered_levels_per_char(&self, line: Range<usize>) -> Vec<Level>783     pub fn reordered_levels_per_char(&self, line: Range<usize>) -> Vec<Level> {
784         let levels = self.reordered_levels(line);
785         self.text.char_indices().map(|(i, _)| levels[i]).collect()
786     }
787 
788     /// Re-order a line based on resolved levels and return the line in display order.
789     ///
790     /// See BidiInfo::reorder_line for details.
791     ///
792     /// (This should be kept in sync with BidiInfo::reorder_line.)
793     #[cfg_attr(feature = "flame_it", flamer::flame)]
reorder_line(&self, line: Range<usize>) -> Cow<'text, str>794     pub fn reorder_line(&self, line: Range<usize>) -> Cow<'text, str> {
795         if !level::has_rtl(&self.levels[line.clone()]) {
796             return self.text[line].into();
797         }
798 
799         let (levels, runs) = self.visual_runs(line.clone());
800 
801         reorder_line(self.text, line, levels, runs)
802     }
803 
804     /// Reorders pre-calculated levels of a sequence of characters.
805     ///
806     /// See BidiInfo::reorder_visual for details.
807     #[cfg_attr(feature = "flame_it", flamer::flame)]
808     #[inline]
reorder_visual(levels: &[Level]) -> Vec<usize>809     pub fn reorder_visual(levels: &[Level]) -> Vec<usize> {
810         reorder_visual(levels)
811     }
812 
813     /// Find the level runs within a line and return them in visual order.
814     ///
815     /// `line` is a range of bytes indices within `levels`.
816     ///
817     /// See BidiInfo::visual_runs for details.
818     ///
819     /// (This should be kept in sync with BidiInfo::visual_runs.)
820     #[cfg_attr(feature = "flame_it", flamer::flame)]
821     #[inline]
visual_runs(&self, line: Range<usize>) -> (Vec<Level>, Vec<LevelRun>)822     pub fn visual_runs(&self, line: Range<usize>) -> (Vec<Level>, Vec<LevelRun>) {
823         let levels = self.reordered_levels(line.clone());
824         visual_runs_for_line(levels, &line)
825     }
826 
827     /// If processed text has any computed RTL levels
828     ///
829     /// This information is usually used to skip re-ordering of text when no RTL level is present
830     #[inline]
has_rtl(&self) -> bool831     pub fn has_rtl(&self) -> bool {
832         !self.is_pure_ltr
833     }
834 
835     /// Return the paragraph's Direction (Ltr, Rtl, or Mixed) based on its levels.
836     #[inline]
direction(&self) -> Direction837     pub fn direction(&self) -> Direction {
838         para_direction(&self.levels)
839     }
840 }
841 
842 /// Return a line of the text in display order based on resolved levels.
843 ///
844 /// `text`   the full text passed to the `BidiInfo` or `ParagraphBidiInfo` for analysis
845 /// `line`   a range of byte indices within `text` corresponding to one line
846 /// `levels` array of `Level` values, with `line`'s levels reordered into visual order
847 /// `runs`   array of `LevelRun`s in visual order
848 ///
849 /// (`levels` and `runs` are the result of calling `BidiInfo::visual_runs()` or
850 /// `ParagraphBidiInfo::visual_runs()` for the line of interest.)
851 ///
852 /// Returns: the reordered text of the line.
853 ///
854 /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
855 ///
856 /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
857 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
reorder_line<'text>( text: &'text str, line: Range<usize>, levels: Vec<Level>, runs: Vec<LevelRun>, ) -> Cow<'text, str>858 fn reorder_line<'text>(
859     text: &'text str,
860     line: Range<usize>,
861     levels: Vec<Level>,
862     runs: Vec<LevelRun>,
863 ) -> Cow<'text, str> {
864     // If all isolating run sequences are LTR, no reordering is needed
865     if runs.iter().all(|run| levels[run.start].is_ltr()) {
866         return text[line].into();
867     }
868 
869     let mut result = String::with_capacity(line.len());
870     for run in runs {
871         if levels[run.start].is_rtl() {
872             result.extend(text[run].chars().rev());
873         } else {
874             result.push_str(&text[run]);
875         }
876     }
877     result.into()
878 }
879 
880 /// Find the level runs within a line and return them in visual order.
881 ///
882 /// `line` is a range of code-unit indices within `levels`.
883 ///
884 /// The first return value is a vector of levels used by the reordering algorithm,
885 /// i.e. the result of [Rule L1]. The second return value is a vector of level runs,
886 /// the result of [Rule L2], showing the visual order that each level run (a run of text with the
887 /// same level) should be displayed. Within each run, the display order can be checked
888 /// against the Level vector.
889 ///
890 /// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring),
891 /// as that should be handled by the engine using this API.
892 ///
893 /// Conceptually, this is the same as running [`reordered_levels()`] followed by
894 /// [`reorder_visual()`], however it returns the result as a list of level runs instead
895 /// of producing a level map, since one may wish to deal with the fact that this is operating on
896 /// byte rather than character indices.
897 ///
898 /// <http://www.unicode.org/reports/tr9/#Reordering_Resolved_Levels>
899 ///
900 /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
901 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
902 /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
903 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
visual_runs_for_line(levels: Vec<Level>, line: &Range<usize>) -> (Vec<Level>, Vec<LevelRun>)904 fn visual_runs_for_line(levels: Vec<Level>, line: &Range<usize>) -> (Vec<Level>, Vec<LevelRun>) {
905     // Find consecutive level runs.
906     let mut runs = Vec::new();
907     let mut start = line.start;
908     let mut run_level = levels[start];
909     let mut min_level = run_level;
910     let mut max_level = run_level;
911 
912     for (i, &new_level) in levels.iter().enumerate().take(line.end).skip(start + 1) {
913         if new_level != run_level {
914             // End of the previous run, start of a new one.
915             runs.push(start..i);
916             start = i;
917             run_level = new_level;
918             min_level = cmp::min(run_level, min_level);
919             max_level = cmp::max(run_level, max_level);
920         }
921     }
922     runs.push(start..line.end);
923 
924     let run_count = runs.len();
925 
926     // Re-order the odd runs.
927     // <http://www.unicode.org/reports/tr9/#L2>
928 
929     // Stop at the lowest *odd* level.
930     min_level = min_level.new_lowest_ge_rtl().expect("Level error");
931     // This loop goes through contiguous chunks of level runs that have a level
932     // ≥ max_level and reverses their contents, reducing max_level by 1 each time.
933     while max_level >= min_level {
934         // Look for the start of a sequence of consecutive runs of max_level or higher.
935         let mut seq_start = 0;
936         while seq_start < run_count {
937             if levels[runs[seq_start].start] < max_level {
938                 seq_start += 1;
939                 continue;
940             }
941 
942             // Found the start of a sequence. Now find the end.
943             let mut seq_end = seq_start + 1;
944             while seq_end < run_count {
945                 if levels[runs[seq_end].start] < max_level {
946                     break;
947                 }
948                 seq_end += 1;
949             }
950             // Reverse the runs within this sequence.
951             runs[seq_start..seq_end].reverse();
952 
953             seq_start = seq_end;
954         }
955         max_level
956             .lower(1)
957             .expect("Lowering embedding level below zero");
958     }
959     (levels, runs)
960 }
961 
962 /// Reorders pre-calculated levels of a sequence of characters.
963 ///
964 /// NOTE: This is a convenience method that does not use a `Paragraph`  object. It is
965 /// intended to be used when an application has determined the levels of the objects (character sequences)
966 /// and just needs to have them reordered.
967 ///
968 /// the index map will result in `indexMap[visualIndex]==logicalIndex`.
969 ///
970 /// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have
971 /// information about the actual text.
972 ///
973 /// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be
974 /// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level`
975 /// is for a single code point.
reorder_visual(levels: &[Level]) -> Vec<usize>976 fn reorder_visual(levels: &[Level]) -> Vec<usize> {
977     // Gets the next range of characters after start_index with a level greater
978     // than or equal to `max`
979     fn next_range(levels: &[level::Level], mut start_index: usize, max: Level) -> Range<usize> {
980         if levels.is_empty() || start_index >= levels.len() {
981             return start_index..start_index;
982         }
983         while let Some(l) = levels.get(start_index) {
984             if *l >= max {
985                 break;
986             }
987             start_index += 1;
988         }
989 
990         if levels.get(start_index).is_none() {
991             // If at the end of the array, adding one will
992             // produce an out-of-range end element
993             return start_index..start_index;
994         }
995 
996         let mut end_index = start_index + 1;
997         while let Some(l) = levels.get(end_index) {
998             if *l < max {
999                 return start_index..end_index;
1000             }
1001             end_index += 1;
1002         }
1003 
1004         start_index..end_index
1005     }
1006 
1007     // This implementation is similar to the L2 implementation in `visual_runs()`
1008     // but it cannot benefit from a precalculated LevelRun vector so needs to be different.
1009 
1010     if levels.is_empty() {
1011         return vec![];
1012     }
1013 
1014     // Get the min and max levels
1015     let (mut min, mut max) = levels
1016         .iter()
1017         .fold((levels[0], levels[0]), |(min, max), &l| {
1018             (cmp::min(min, l), cmp::max(max, l))
1019         });
1020 
1021     // Initialize an index map
1022     let mut result: Vec<usize> = (0..levels.len()).collect();
1023 
1024     if min == max && min.is_ltr() {
1025         // Everything is LTR and at the same level, do nothing
1026         return result;
1027     }
1028 
1029     // Stop at the lowest *odd* level, since everything below that
1030     // is LTR and does not need further reordering
1031     min = min.new_lowest_ge_rtl().expect("Level error");
1032 
1033     // For each max level, take all contiguous chunks of
1034     // levels ≥ max and reverse them
1035     //
1036     // We can do this check with the original levels instead of checking reorderings because all
1037     // prior reorderings will have been for contiguous chunks of levels >> max, which will
1038     // be a subset of these chunks anyway.
1039     while min <= max {
1040         let mut range = 0..0;
1041         loop {
1042             range = next_range(levels, range.end, max);
1043             result[range.clone()].reverse();
1044 
1045             if range.end >= levels.len() {
1046                 break;
1047             }
1048         }
1049 
1050         max.lower(1).expect("Level error");
1051     }
1052 
1053     result
1054 }
1055 
1056 /// The core of BidiInfo initialization, factored out into a function that both
1057 /// the utf-8 and utf-16 versions of BidiInfo can use.
compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( data_source: &D, para: &ParagraphInfo, is_pure_ltr: bool, text: &'a T, original_classes: &[BidiClass], processing_classes: &mut [BidiClass], levels: &mut Vec<Level>, )1058 fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
1059     data_source: &D,
1060     para: &ParagraphInfo,
1061     is_pure_ltr: bool,
1062     text: &'a T,
1063     original_classes: &[BidiClass],
1064     processing_classes: &mut [BidiClass],
1065     levels: &mut Vec<Level>,
1066 ) {
1067     let new_len = levels.len() + para.range.len();
1068     levels.resize(new_len, para.level);
1069     if para.level == LTR_LEVEL && is_pure_ltr {
1070         return;
1071     }
1072 
1073     let processing_classes = &mut processing_classes[para.range.clone()];
1074     let levels = &mut levels[para.range.clone()];
1075 
1076     explicit::compute(
1077         text,
1078         para.level,
1079         original_classes,
1080         levels,
1081         processing_classes,
1082     );
1083 
1084     let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels);
1085     for sequence in &sequences {
1086         implicit::resolve_weak(text, sequence, processing_classes);
1087         implicit::resolve_neutral(
1088             text,
1089             data_source,
1090             sequence,
1091             levels,
1092             original_classes,
1093             processing_classes,
1094         );
1095     }
1096     implicit::resolve_levels(processing_classes, levels);
1097 
1098     assign_levels_to_removed_chars(para.level, original_classes, levels);
1099 }
1100 
1101 /// Produce the levels for this paragraph as needed for reordering, one level per *code unit*
1102 /// in the paragraph. The returned vector includes code units that are not included
1103 /// in the `line`, but will not adjust them.
1104 ///
1105 /// This runs [Rule L1]
1106 ///
1107 /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
reorder_levels<'a, T: TextSource<'a> + ?Sized>( line_classes: &[BidiClass], line_levels: &mut [Level], line_text: &'a T, para_level: Level, )1108 fn reorder_levels<'a, T: TextSource<'a> + ?Sized>(
1109     line_classes: &[BidiClass],
1110     line_levels: &mut [Level],
1111     line_text: &'a T,
1112     para_level: Level,
1113 ) {
1114     // Reset some whitespace chars to paragraph level.
1115     // <http://www.unicode.org/reports/tr9/#L1>
1116     let mut reset_from: Option<usize> = Some(0);
1117     let mut reset_to: Option<usize> = None;
1118     let mut prev_level = para_level;
1119     for (i, c) in line_text.char_indices() {
1120         match line_classes[i] {
1121             // Segment separator, Paragraph separator
1122             B | S => {
1123                 assert_eq!(reset_to, None);
1124                 reset_to = Some(i + T::char_len(c));
1125                 if reset_from == None {
1126                     reset_from = Some(i);
1127                 }
1128             }
1129             // Whitespace, isolate formatting
1130             WS | FSI | LRI | RLI | PDI => {
1131                 if reset_from == None {
1132                     reset_from = Some(i);
1133                 }
1134             }
1135             // <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
1136             // same as above + set the level
1137             RLE | LRE | RLO | LRO | PDF | BN => {
1138                 if reset_from == None {
1139                     reset_from = Some(i);
1140                 }
1141                 // also set the level to previous
1142                 line_levels[i] = prev_level;
1143             }
1144             _ => {
1145                 reset_from = None;
1146             }
1147         }
1148         if let (Some(from), Some(to)) = (reset_from, reset_to) {
1149             for level in &mut line_levels[from..to] {
1150                 *level = para_level;
1151             }
1152             reset_from = None;
1153             reset_to = None;
1154         }
1155         prev_level = line_levels[i];
1156     }
1157     if let Some(from) = reset_from {
1158         for level in &mut line_levels[from..] {
1159             *level = para_level;
1160         }
1161     }
1162 }
1163 
1164 /// Contains a reference of `BidiInfo` and one of its `paragraphs`.
1165 /// And it supports all operation in the `Paragraph` that needs also its
1166 /// `BidiInfo` such as `direction`.
1167 #[derive(Debug)]
1168 pub struct Paragraph<'a, 'text> {
1169     pub info: &'a BidiInfo<'text>,
1170     pub para: &'a ParagraphInfo,
1171 }
1172 
1173 impl<'a, 'text> Paragraph<'a, 'text> {
1174     #[inline]
new(info: &'a BidiInfo<'text>, para: &'a ParagraphInfo) -> Paragraph<'a, 'text>1175     pub fn new(info: &'a BidiInfo<'text>, para: &'a ParagraphInfo) -> Paragraph<'a, 'text> {
1176         Paragraph { info, para }
1177     }
1178 
1179     /// Returns if the paragraph is Left direction, right direction or mixed.
1180     #[inline]
direction(&self) -> Direction1181     pub fn direction(&self) -> Direction {
1182         para_direction(&self.info.levels[self.para.range.clone()])
1183     }
1184 
1185     /// Returns the `Level` of a certain character in the paragraph.
1186     #[inline]
level_at(&self, pos: usize) -> Level1187     pub fn level_at(&self, pos: usize) -> Level {
1188         let actual_position = self.para.range.start + pos;
1189         self.info.levels[actual_position]
1190     }
1191 }
1192 
1193 /// Return the directionality of the paragraph (Left, Right or Mixed) from its levels.
1194 #[cfg_attr(feature = "flame_it", flamer::flame)]
para_direction(levels: &[Level]) -> Direction1195 fn para_direction(levels: &[Level]) -> Direction {
1196     let mut ltr = false;
1197     let mut rtl = false;
1198     for level in levels {
1199         if level.is_ltr() {
1200             ltr = true;
1201             if rtl {
1202                 return Direction::Mixed;
1203             }
1204         }
1205 
1206         if level.is_rtl() {
1207             rtl = true;
1208             if ltr {
1209                 return Direction::Mixed;
1210             }
1211         }
1212     }
1213 
1214     if ltr {
1215         return Direction::Ltr;
1216     }
1217 
1218     Direction::Rtl
1219 }
1220 
1221 /// Assign levels to characters removed by rule X9.
1222 ///
1223 /// The levels assigned to these characters are not specified by the algorithm.  This function
1224 /// assigns each one the level of the previous character, to avoid breaking level runs.
1225 #[cfg_attr(feature = "flame_it", flamer::flame)]
assign_levels_to_removed_chars(para_level: Level, classes: &[BidiClass], levels: &mut [Level])1226 fn assign_levels_to_removed_chars(para_level: Level, classes: &[BidiClass], levels: &mut [Level]) {
1227     for i in 0..levels.len() {
1228         if prepare::removed_by_x9(classes[i]) {
1229             levels[i] = if i > 0 { levels[i - 1] } else { para_level };
1230         }
1231     }
1232 }
1233 
/// Get the base direction of the text provided according to the Unicode Bidirectional Algorithm.
///
/// See rules P2 and P3.
///
/// The base direction is derived from the first character in the string with bidi character
/// type L, R, or AL. If that character has type L, `Direction::Ltr` is returned; if it has
/// type R or AL, `Direction::Rtl` is returned.
///
/// If the string does not contain any character of these types (outside of embedded isolate
/// runs), then `Direction::Mixed` is returned (but should be considered as meaning "neutral"
/// or "unknown", not in fact mixed directions).
///
/// This is a lightweight function for use when only the base direction is needed and no
/// further bidi processing of the text is needed.
///
/// If the text contains paragraph separators, this function considers only the first
/// paragraph.
///
/// This forwards to `get_base_direction_with_data_source` with `HardcodedBidiData`.
#[cfg(feature = "hardcoded-data")]
#[inline]
pub fn get_base_direction<'a, T: TextSource<'a> + ?Sized>(text: &'a T) -> Direction {
    let data_source = &HardcodedBidiData;
    get_base_direction_with_data_source(data_source, text)
}
1255 
/// Get the base direction of the text provided according to the Unicode Bidirectional Algorithm,
/// considering the full text if the first paragraph is all-neutral.
///
/// This is the same as `get_base_direction` except that it does not stop at the first block
/// separator, but just resets the isolate nesting count and continues to look for a
/// strongly-directional character. So the result will be the base direction of the first
/// paragraph that is not purely neutral characters.
///
/// This forwards to `get_base_direction_full_with_data_source` with `HardcodedBidiData`.
#[cfg(feature = "hardcoded-data")]
#[inline]
pub fn get_base_direction_full<'a, T: TextSource<'a> + ?Sized>(text: &'a T) -> Direction {
    let data_source = &HardcodedBidiData;
    get_base_direction_full_with_data_source(data_source, text)
}
1268 
1269 #[inline]
get_base_direction_with_data_source<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( data_source: &D, text: &'a T, ) -> Direction1270 pub fn get_base_direction_with_data_source<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
1271     data_source: &D,
1272     text: &'a T,
1273 ) -> Direction {
1274     get_base_direction_impl(data_source, text, false)
1275 }
1276 
1277 #[inline]
get_base_direction_full_with_data_source< 'a, D: BidiDataSource, T: TextSource<'a> + ?Sized, >( data_source: &D, text: &'a T, ) -> Direction1278 pub fn get_base_direction_full_with_data_source<
1279     'a,
1280     D: BidiDataSource,
1281     T: TextSource<'a> + ?Sized,
1282 >(
1283     data_source: &D,
1284     text: &'a T,
1285 ) -> Direction {
1286     get_base_direction_impl(data_source, text, true)
1287 }
1288 
get_base_direction_impl<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( data_source: &D, text: &'a T, use_full_text: bool, ) -> Direction1289 fn get_base_direction_impl<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
1290     data_source: &D,
1291     text: &'a T,
1292     use_full_text: bool,
1293 ) -> Direction {
1294     let mut isolate_level = 0;
1295     for c in text.chars() {
1296         match data_source.bidi_class(c) {
1297             LRI | RLI | FSI => isolate_level = isolate_level + 1,
1298             PDI if isolate_level > 0 => isolate_level = isolate_level - 1,
1299             L if isolate_level == 0 => return Direction::Ltr,
1300             R | AL if isolate_level == 0 => return Direction::Rtl,
1301             B if !use_full_text => break,
1302             B if use_full_text => isolate_level = 0,
1303             _ => (),
1304         }
1305     }
1306     // If no strong char was found, return Mixed. Normally this will be treated as Ltr by callers
1307     // (see rule P3), but we don't map this to Ltr here so that a caller that wants to apply other
1308     // heuristics to an all-neutral paragraph can tell the difference.
1309     Direction::Mixed
1310 }
1311 
1312 /// Implementation of TextSource for UTF-8 text (a string slice).
1313 impl<'text> TextSource<'text> for str {
1314     type CharIter = core::str::Chars<'text>;
1315     type CharIndexIter = core::str::CharIndices<'text>;
1316     type IndexLenIter = Utf8IndexLenIter<'text>;
1317 
1318     #[inline]
len(&self) -> usize1319     fn len(&self) -> usize {
1320         (self as &str).len()
1321     }
1322     #[inline]
char_at(&self, index: usize) -> Option<(char, usize)>1323     fn char_at(&self, index: usize) -> Option<(char, usize)> {
1324         if let Some(slice) = self.get(index..) {
1325             if let Some(ch) = slice.chars().next() {
1326                 return Some((ch, ch.len_utf8()));
1327             }
1328         }
1329         None
1330     }
1331     #[inline]
subrange(&self, range: Range<usize>) -> &Self1332     fn subrange(&self, range: Range<usize>) -> &Self {
1333         &(self as &str)[range]
1334     }
1335     #[inline]
chars(&'text self) -> Self::CharIter1336     fn chars(&'text self) -> Self::CharIter {
1337         (self as &str).chars()
1338     }
1339     #[inline]
char_indices(&'text self) -> Self::CharIndexIter1340     fn char_indices(&'text self) -> Self::CharIndexIter {
1341         (self as &str).char_indices()
1342     }
1343     #[inline]
indices_lengths(&'text self) -> Self::IndexLenIter1344     fn indices_lengths(&'text self) -> Self::IndexLenIter {
1345         Utf8IndexLenIter::new(&self)
1346     }
1347     #[inline]
char_len(ch: char) -> usize1348     fn char_len(ch: char) -> usize {
1349         ch.len_utf8()
1350     }
1351 }
1352 
/// Iterator over a (UTF-8) string slice that yields, for every character, a
/// `(byte index, UTF-8 length)` tuple.
#[derive(Debug)]
pub struct Utf8IndexLenIter<'text> {
    // Underlying `(byte index, char)` iterator; each char is mapped to its encoded length.
    iter: CharIndices<'text>,
}

impl<'text> Utf8IndexLenIter<'text> {
    /// Creates an iterator positioned at the start of `text`.
    #[inline]
    pub fn new(text: &'text str) -> Self {
        let iter = text.char_indices();
        Utf8IndexLenIter { iter }
    }
}

impl Iterator for Utf8IndexLenIter<'_> {
    type Item = (usize, usize);

    /// Advances to the next character and reports where it starts and how many bytes it
    /// occupies.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.iter.next().map(|(pos, ch)| (pos, ch.len_utf8()))
    }
}
1379 
/// Test helper: encodes `s` as a vector of UTF-16 code units.
#[cfg(test)]
fn to_utf16(s: &str) -> Vec<u16> {
    let mut units = Vec::new();
    units.extend(s.encode_utf16());
    units
}
1384 
1385 #[cfg(test)]
1386 #[cfg(feature = "hardcoded-data")]
1387 mod tests {
1388     use super::*;
1389 
1390     use utf16::{
1391         BidiInfo as BidiInfoU16, InitialInfo as InitialInfoU16, Paragraph as ParagraphU16,
1392         ParagraphBidiInfo as ParagraphBidiInfoU16,
1393     };
1394 
1395     #[test]
test_utf16_text_source()1396     fn test_utf16_text_source() {
1397         let text: &[u16] =
1398             &[0x41, 0xD801, 0xDC01, 0x20, 0xD800, 0x20, 0xDFFF, 0x20, 0xDC00, 0xD800];
1399         assert_eq!(text.char_at(0), Some(('A', 1)));
1400         assert_eq!(text.char_at(1), Some(('\u{10401}', 2)));
1401         assert_eq!(text.char_at(2), None);
1402         assert_eq!(text.char_at(3), Some((' ', 1)));
1403         assert_eq!(text.char_at(4), Some((char::REPLACEMENT_CHARACTER, 1)));
1404         assert_eq!(text.char_at(5), Some((' ', 1)));
1405         assert_eq!(text.char_at(6), Some((char::REPLACEMENT_CHARACTER, 1)));
1406         assert_eq!(text.char_at(7), Some((' ', 1)));
1407         assert_eq!(text.char_at(8), Some((char::REPLACEMENT_CHARACTER, 1)));
1408         assert_eq!(text.char_at(9), Some((char::REPLACEMENT_CHARACTER, 1)));
1409         assert_eq!(text.char_at(10), None);
1410     }
1411 
1412     #[test]
test_utf16_char_iter()1413     fn test_utf16_char_iter() {
1414         let text: &[u16] =
1415             &[0x41, 0xD801, 0xDC01, 0x20, 0xD800, 0x20, 0xDFFF, 0x20, 0xDC00, 0xD800];
1416         assert_eq!(text.len(), 10);
1417         assert_eq!(text.chars().count(), 9);
1418         let mut chars = text.chars();
1419         assert_eq!(chars.next(), Some('A'));
1420         assert_eq!(chars.next(), Some('\u{10401}'));
1421         assert_eq!(chars.next(), Some(' '));
1422         assert_eq!(chars.next(), Some('\u{FFFD}'));
1423         assert_eq!(chars.next(), Some(' '));
1424         assert_eq!(chars.next(), Some('\u{FFFD}'));
1425         assert_eq!(chars.next(), Some(' '));
1426         assert_eq!(chars.next(), Some('\u{FFFD}'));
1427         assert_eq!(chars.next(), Some('\u{FFFD}'));
1428         assert_eq!(chars.next(), None);
1429     }
1430 
1431     #[test]
test_initial_text_info()1432     fn test_initial_text_info() {
1433         let tests = vec![
1434             (
1435                 // text
1436                 "a1",
1437                 // expected bidi classes per utf-8 byte
1438                 vec![L, EN],
1439                 // expected paragraph-info for utf-8
1440                 vec![ParagraphInfo {
1441                     range: 0..2,
1442                     level: LTR_LEVEL,
1443                 }],
1444                 // expected bidi classes per utf-16 code unit
1445                 vec![L, EN],
1446                 // expected paragraph-info for utf-16
1447                 vec![ParagraphInfo {
1448                     range: 0..2,
1449                     level: LTR_LEVEL,
1450                 }],
1451             ),
1452             (
1453                 // Arabic, space, Hebrew
1454                 "\u{0639} \u{05D0}",
1455                 vec![AL, AL, WS, R, R],
1456                 vec![ParagraphInfo {
1457                     range: 0..5,
1458                     level: RTL_LEVEL,
1459                 }],
1460                 vec![AL, WS, R],
1461                 vec![ParagraphInfo {
1462                     range: 0..3,
1463                     level: RTL_LEVEL,
1464                 }],
1465             ),
1466             (
1467                 // SMP characters from Kharoshthi, Cuneiform, Adlam:
1468                 "\u{10A00}\u{12000}\u{1E900}",
1469                 vec![R, R, R, R, L, L, L, L, R, R, R, R],
1470                 vec![ParagraphInfo {
1471                     range: 0..12,
1472                     level: RTL_LEVEL,
1473                 }],
1474                 vec![R, R, L, L, R, R],
1475                 vec![ParagraphInfo {
1476                     range: 0..6,
1477                     level: RTL_LEVEL,
1478                 }],
1479             ),
1480             (
1481                 "a\u{2029}b",
1482                 vec![L, B, B, B, L],
1483                 vec![
1484                     ParagraphInfo {
1485                         range: 0..4,
1486                         level: LTR_LEVEL,
1487                     },
1488                     ParagraphInfo {
1489                         range: 4..5,
1490                         level: LTR_LEVEL,
1491                     },
1492                 ],
1493                 vec![L, B, L],
1494                 vec![
1495                     ParagraphInfo {
1496                         range: 0..2,
1497                         level: LTR_LEVEL,
1498                     },
1499                     ParagraphInfo {
1500                         range: 2..3,
1501                         level: LTR_LEVEL,
1502                     },
1503                 ],
1504             ),
1505             (
1506                 "\u{2068}א\u{2069}a", // U+2068 FSI, U+2069 PDI
1507                 vec![RLI, RLI, RLI, R, R, PDI, PDI, PDI, L],
1508                 vec![ParagraphInfo {
1509                     range: 0..9,
1510                     level: LTR_LEVEL,
1511                 }],
1512                 vec![RLI, R, PDI, L],
1513                 vec![ParagraphInfo {
1514                     range: 0..4,
1515                     level: LTR_LEVEL,
1516                 }],
1517             ),
1518         ];
1519 
1520         for t in tests {
1521             assert_eq!(
1522                 InitialInfo::new(t.0, None),
1523                 InitialInfo {
1524                     text: t.0,
1525                     original_classes: t.1,
1526                     paragraphs: t.2,
1527                 }
1528             );
1529             let text = &to_utf16(t.0);
1530             assert_eq!(
1531                 InitialInfoU16::new(text, None),
1532                 InitialInfoU16 {
1533                     text,
1534                     original_classes: t.3,
1535                     paragraphs: t.4,
1536                 }
1537             );
1538         }
1539     }
1540 
1541     #[test]
1542     #[cfg(feature = "hardcoded-data")]
test_process_text()1543     fn test_process_text() {
1544         let tests = vec![
1545             (
1546                 // text
1547                 "abc123",
1548                 // base level
1549                 Some(LTR_LEVEL),
1550                 // levels
1551                 Level::vec(&[0, 0, 0, 0, 0, 0]),
1552                 // original_classes
1553                 vec![L, L, L, EN, EN, EN],
1554                 // paragraphs
1555                 vec![ParagraphInfo {
1556                     range: 0..6,
1557                     level: LTR_LEVEL,
1558                 }],
1559                 // levels_u16
1560                 Level::vec(&[0, 0, 0, 0, 0, 0]),
1561                 // original_classes_u16
1562                 vec![L, L, L, EN, EN, EN],
1563                 // paragraphs_u16
1564                 vec![ParagraphInfo {
1565                     range: 0..6,
1566                     level: LTR_LEVEL,
1567                 }],
1568             ),
1569             (
1570                 "abc \u{05D0}\u{05D1}\u{05D2}",
1571                 Some(LTR_LEVEL),
1572                 Level::vec(&[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]),
1573                 vec![L, L, L, WS, R, R, R, R, R, R],
1574                 vec![ParagraphInfo {
1575                     range: 0..10,
1576                     level: LTR_LEVEL,
1577                 }],
1578                 Level::vec(&[0, 0, 0, 0, 1, 1, 1]),
1579                 vec![L, L, L, WS, R, R, R],
1580                 vec![ParagraphInfo {
1581                     range: 0..7,
1582                     level: LTR_LEVEL,
1583                 }],
1584             ),
1585             (
1586                 "abc \u{05D0}\u{05D1}\u{05D2}",
1587                 Some(RTL_LEVEL),
1588                 Level::vec(&[2, 2, 2, 1, 1, 1, 1, 1, 1, 1]),
1589                 vec![L, L, L, WS, R, R, R, R, R, R],
1590                 vec![ParagraphInfo {
1591                     range: 0..10,
1592                     level: RTL_LEVEL,
1593                 }],
1594                 Level::vec(&[2, 2, 2, 1, 1, 1, 1]),
1595                 vec![L, L, L, WS, R, R, R],
1596                 vec![ParagraphInfo {
1597                     range: 0..7,
1598                     level: RTL_LEVEL,
1599                 }],
1600             ),
1601             (
1602                 "\u{05D0}\u{05D1}\u{05D2} abc",
1603                 Some(LTR_LEVEL),
1604                 Level::vec(&[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]),
1605                 vec![R, R, R, R, R, R, WS, L, L, L],
1606                 vec![ParagraphInfo {
1607                     range: 0..10,
1608                     level: LTR_LEVEL,
1609                 }],
1610                 Level::vec(&[1, 1, 1, 0, 0, 0, 0]),
1611                 vec![R, R, R, WS, L, L, L],
1612                 vec![ParagraphInfo {
1613                     range: 0..7,
1614                     level: LTR_LEVEL,
1615                 }],
1616             ),
1617             (
1618                 "\u{05D0}\u{05D1}\u{05D2} abc",
1619                 None,
1620                 Level::vec(&[1, 1, 1, 1, 1, 1, 1, 2, 2, 2]),
1621                 vec![R, R, R, R, R, R, WS, L, L, L],
1622                 vec![ParagraphInfo {
1623                     range: 0..10,
1624                     level: RTL_LEVEL,
1625                 }],
1626                 Level::vec(&[1, 1, 1, 1, 2, 2, 2]),
1627                 vec![R, R, R, WS, L, L, L],
1628                 vec![ParagraphInfo {
1629                     range: 0..7,
1630                     level: RTL_LEVEL,
1631                 }],
1632             ),
1633             (
1634                 "\u{063A}2\u{0638} \u{05D0}2\u{05D2}",
1635                 Some(LTR_LEVEL),
1636                 Level::vec(&[1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1]),
1637                 vec![AL, AL, EN, AL, AL, WS, R, R, EN, R, R],
1638                 vec![ParagraphInfo {
1639                     range: 0..11,
1640                     level: LTR_LEVEL,
1641                 }],
1642                 Level::vec(&[1, 2, 1, 1, 1, 2, 1]),
1643                 vec![AL, EN, AL, WS, R, EN, R],
1644                 vec![ParagraphInfo {
1645                     range: 0..7,
1646                     level: LTR_LEVEL,
1647                 }],
1648             ),
1649             (
1650                 "a א.\nג",
1651                 None,
1652                 Level::vec(&[0, 0, 1, 1, 0, 0, 1, 1]),
1653                 vec![L, WS, R, R, CS, B, R, R],
1654                 vec![
1655                     ParagraphInfo {
1656                         range: 0..6,
1657                         level: LTR_LEVEL,
1658                     },
1659                     ParagraphInfo {
1660                         range: 6..8,
1661                         level: RTL_LEVEL,
1662                     },
1663                 ],
1664                 Level::vec(&[0, 0, 1, 0, 0, 1]),
1665                 vec![L, WS, R, CS, B, R],
1666                 vec![
1667                     ParagraphInfo {
1668                         range: 0..5,
1669                         level: LTR_LEVEL,
1670                     },
1671                     ParagraphInfo {
1672                         range: 5..6,
1673                         level: RTL_LEVEL,
1674                     },
1675                 ],
1676             ),
1677             // BidiTest:69635 (AL ET EN)
1678             (
1679                 "\u{060B}\u{20CF}\u{06F9}",
1680                 None,
1681                 Level::vec(&[1, 1, 1, 1, 1, 2, 2]),
1682                 vec![AL, AL, ET, ET, ET, EN, EN],
1683                 vec![ParagraphInfo {
1684                     range: 0..7,
1685                     level: RTL_LEVEL,
1686                 }],
1687                 Level::vec(&[1, 1, 2]),
1688                 vec![AL, ET, EN],
1689                 vec![ParagraphInfo {
1690                     range: 0..3,
1691                     level: RTL_LEVEL,
1692                 }],
1693             ),
1694         ];
1695 
1696         for t in tests {
1697             assert_eq!(
1698                 BidiInfo::new(t.0, t.1),
1699                 BidiInfo {
1700                     text: t.0,
1701                     levels: t.2.clone(),
1702                     original_classes: t.3.clone(),
1703                     paragraphs: t.4.clone(),
1704                 }
1705             );
1706             // If it was a single paragraph, also test ParagraphBidiInfo.
1707             if t.4.len() == 1 {
1708                 assert_eq!(
1709                     ParagraphBidiInfo::new(t.0, t.1),
1710                     ParagraphBidiInfo {
1711                         text: t.0,
1712                         original_classes: t.3,
1713                         levels: t.2.clone(),
1714                         paragraph_level: t.4[0].level,
1715                         is_pure_ltr: !level::has_rtl(&t.2),
1716                     }
1717                 )
1718             }
1719             let text = &to_utf16(t.0);
1720             assert_eq!(
1721                 BidiInfoU16::new(text, t.1),
1722                 BidiInfoU16 {
1723                     text,
1724                     levels: t.5.clone(),
1725                     original_classes: t.6.clone(),
1726                     paragraphs: t.7.clone(),
1727                 }
1728             );
1729             if t.7.len() == 1 {
1730                 assert_eq!(
1731                     ParagraphBidiInfoU16::new(text, t.1),
1732                     ParagraphBidiInfoU16 {
1733                         text: text,
1734                         original_classes: t.6.clone(),
1735                         levels: t.5.clone(),
1736                         paragraph_level: t.7[0].level,
1737                         is_pure_ltr: !level::has_rtl(&t.5),
1738                     }
1739                 )
1740             }
1741         }
1742     }
1743 
1744     #[test]
1745     #[cfg(feature = "hardcoded-data")]
test_paragraph_bidi_info()1746     fn test_paragraph_bidi_info() {
1747         // Passing text that includes a paragraph break to the ParagraphBidiInfo API:
1748         // this is a misuse of the API by the client, but our behavior is safe &
1749         // consistent. The embedded paragraph break acts like a separator (tab) would.
1750         let tests = vec![
1751             (
1752                 "a א.\nג",
1753                 None,
1754                 // utf-8 results:
1755                 vec![L, WS, R, R, CS, B, R, R],
1756                 Level::vec(&[0, 0, 1, 1, 1, 1, 1, 1]),
1757                 // utf-16 results:
1758                 vec![L, WS, R, CS, B, R],
1759                 Level::vec(&[0, 0, 1, 1, 1, 1]),
1760                 // paragraph level; is_pure_ltr
1761                 LTR_LEVEL,
1762                 false,
1763             ),
1764             (
1765                 "\u{5d1} a.\nb.",
1766                 None,
1767                 // utf-8 results:
1768                 vec![R, R, WS, L, CS, B, L, CS],
1769                 Level::vec(&[1, 1, 1, 2, 2, 2, 2, 1]),
1770                 // utf-16 results:
1771                 vec![R, WS, L, CS, B, L, CS],
1772                 Level::vec(&[1, 1, 2, 2, 2, 2, 1]),
1773                 // paragraph level; is_pure_ltr
1774                 RTL_LEVEL,
1775                 false,
1776             ),
1777             (
1778                 "a א.\tג",
1779                 None,
1780                 // utf-8 results:
1781                 vec![L, WS, R, R, CS, S, R, R],
1782                 Level::vec(&[0, 0, 1, 1, 1, 1, 1, 1]),
1783                 // utf-16 results:
1784                 vec![L, WS, R, CS, S, R],
1785                 Level::vec(&[0, 0, 1, 1, 1, 1]),
1786                 // paragraph level; is_pure_ltr
1787                 LTR_LEVEL,
1788                 false,
1789             ),
1790             (
1791                 "\u{5d1} a.\tb.",
1792                 None,
1793                 // utf-8 results:
1794                 vec![R, R, WS, L, CS, S, L, CS],
1795                 Level::vec(&[1, 1, 1, 2, 2, 2, 2, 1]),
1796                 // utf-16 results:
1797                 vec![R, WS, L, CS, S, L, CS],
1798                 Level::vec(&[1, 1, 2, 2, 2, 2, 1]),
1799                 // paragraph level; is_pure_ltr
1800                 RTL_LEVEL,
1801                 false,
1802             ),
1803         ];
1804 
1805         for t in tests {
1806             assert_eq!(
1807                 ParagraphBidiInfo::new(t.0, t.1),
1808                 ParagraphBidiInfo {
1809                     text: t.0,
1810                     original_classes: t.2,
1811                     levels: t.3,
1812                     paragraph_level: t.6,
1813                     is_pure_ltr: t.7,
1814                 }
1815             );
1816             let text = &to_utf16(t.0);
1817             assert_eq!(
1818                 ParagraphBidiInfoU16::new(text, t.1),
1819                 ParagraphBidiInfoU16 {
1820                     text: text,
1821                     original_classes: t.4,
1822                     levels: t.5,
1823                     paragraph_level: t.6,
1824                     is_pure_ltr: t.7,
1825                 }
1826             );
1827         }
1828     }
1829 
1830     #[test]
1831     #[cfg(feature = "hardcoded-data")]
test_bidi_info_has_rtl()1832     fn test_bidi_info_has_rtl() {
1833         let tests = vec![
1834             // ASCII only
1835             ("123", None, false),
1836             ("123", Some(LTR_LEVEL), false),
1837             ("123", Some(RTL_LEVEL), false),
1838             ("abc", None, false),
1839             ("abc", Some(LTR_LEVEL), false),
1840             ("abc", Some(RTL_LEVEL), false),
1841             ("abc 123", None, false),
1842             ("abc\n123", None, false),
1843             // With Hebrew
1844             ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true),
1845             ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", Some(LTR_LEVEL), true),
1846             ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", Some(RTL_LEVEL), true),
1847             ("abc \u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true),
1848             ("abc\n\u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true),
1849             ("\u{05D0}\u{05D1}\u{05BC}\u{05D2} abc", None, true),
1850             ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}\nabc", None, true),
1851             ("\u{05D0}\u{05D1}\u{05BC}\u{05D2} 123", None, true),
1852             ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}\n123", None, true),
1853         ];
1854 
1855         for t in tests {
1856             assert_eq!(BidiInfo::new(t.0, t.1).has_rtl(), t.2);
1857             assert_eq!(BidiInfoU16::new(&to_utf16(t.0), t.1).has_rtl(), t.2);
1858         }
1859     }
1860 
1861     #[cfg(feature = "hardcoded-data")]
reorder_paras(text: &str) -> Vec<Cow<'_, str>>1862     fn reorder_paras(text: &str) -> Vec<Cow<'_, str>> {
1863         let bidi_info = BidiInfo::new(text, None);
1864         bidi_info
1865             .paragraphs
1866             .iter()
1867             .map(|para| bidi_info.reorder_line(para, para.range.clone()))
1868             .collect()
1869     }
1870 
1871     #[cfg(feature = "hardcoded-data")]
reorder_paras_u16(text: &[u16]) -> Vec<Cow<'_, [u16]>>1872     fn reorder_paras_u16(text: &[u16]) -> Vec<Cow<'_, [u16]>> {
1873         let bidi_info = BidiInfoU16::new(text, None);
1874         bidi_info
1875             .paragraphs
1876             .iter()
1877             .map(|para| bidi_info.reorder_line(para, para.range.clone()))
1878             .collect()
1879     }
1880 
1881     #[test]
1882     #[cfg(feature = "hardcoded-data")]
test_reorder_line()1883     fn test_reorder_line() {
1884         let tests = vec![
1885             // Bidi_Class: L L L B L L L B L L L
1886             ("abc\ndef\nghi", vec!["abc\n", "def\n", "ghi"]),
1887             // Bidi_Class: L L EN B L L EN B L L EN
1888             ("ab1\nde2\ngh3", vec!["ab1\n", "de2\n", "gh3"]),
1889             // Bidi_Class: L L L B AL AL AL
1890             ("abc\nابج", vec!["abc\n", "جبا"]),
1891             // Bidi_Class: AL AL AL B L L L
1892             (
1893                 "\u{0627}\u{0628}\u{062C}\nabc",
1894                 vec!["\n\u{062C}\u{0628}\u{0627}", "abc"],
1895             ),
1896             ("1.-2", vec!["1.-2"]),
1897             ("1-.2", vec!["1-.2"]),
1898             ("abc אבג", vec!["abc גבא"]),
1899             // Numbers being weak LTR characters, cannot reorder strong RTL
1900             ("123 \u{05D0}\u{05D1}\u{05D2}", vec!["גבא 123"]),
1901             ("abc\u{202A}def", vec!["abc\u{202A}def"]),
1902             (
1903                 "abc\u{202A}def\u{202C}ghi",
1904                 vec!["abc\u{202A}def\u{202C}ghi"],
1905             ),
1906             (
1907                 "abc\u{2066}def\u{2069}ghi",
1908                 vec!["abc\u{2066}def\u{2069}ghi"],
1909             ),
1910             // Testing for RLE Character
1911             ("\u{202B}abc אבג\u{202C}", vec!["\u{202b}גבא abc\u{202c}"]),
1912             // Testing neutral characters
1913             ("\u{05D0}בג? אבג", vec!["גבא ?גבא"]),
1914             // Testing neutral characters with special case
1915             ("A אבג?", vec!["A גבא?"]),
1916             // Testing neutral characters with Implicit RTL Marker
1917             ("A אבג?\u{200F}", vec!["A \u{200F}?גבא"]),
1918             ("\u{05D0}בג abc", vec!["abc גבא"]),
1919             ("abc\u{2067}.-\u{2069}ghi", vec!["abc\u{2067}-.\u{2069}ghi"]),
1920             (
1921                 "Hello, \u{2068}\u{202E}world\u{202C}\u{2069}!",
1922                 vec!["Hello, \u{2068}\u{202E}\u{202C}dlrow\u{2069}!"],
1923             ),
1924             // With mirrorable characters in RTL run
1925             ("\u{05D0}(ב)ג.", vec![".ג)ב(א"]),
1926             // With mirrorable characters on level boundary
1927             ("\u{05D0}ב(גד[&ef].)gh", vec!["gh).]ef&[דג(בא"]),
1928         ];
1929 
1930         for t in tests {
1931             assert_eq!(reorder_paras(t.0), t.1);
1932             let expect_utf16 = t.1.iter().map(|v| to_utf16(v)).collect::<Vec<_>>();
1933             assert_eq!(reorder_paras_u16(&to_utf16(t.0)), expect_utf16);
1934         }
1935     }
1936 
reordered_levels_for_paras(text: &str) -> Vec<Vec<Level>>1937     fn reordered_levels_for_paras(text: &str) -> Vec<Vec<Level>> {
1938         let bidi_info = BidiInfo::new(text, None);
1939         bidi_info
1940             .paragraphs
1941             .iter()
1942             .map(|para| bidi_info.reordered_levels(para, para.range.clone()))
1943             .collect()
1944     }
1945 
reordered_levels_per_char_for_paras(text: &str) -> Vec<Vec<Level>>1946     fn reordered_levels_per_char_for_paras(text: &str) -> Vec<Vec<Level>> {
1947         let bidi_info = BidiInfo::new(text, None);
1948         bidi_info
1949             .paragraphs
1950             .iter()
1951             .map(|para| bidi_info.reordered_levels_per_char(para, para.range.clone()))
1952             .collect()
1953     }
1954 
reordered_levels_for_paras_u16(text: &[u16]) -> Vec<Vec<Level>>1955     fn reordered_levels_for_paras_u16(text: &[u16]) -> Vec<Vec<Level>> {
1956         let bidi_info = BidiInfoU16::new(text, None);
1957         bidi_info
1958             .paragraphs
1959             .iter()
1960             .map(|para| bidi_info.reordered_levels(para, para.range.clone()))
1961             .collect()
1962     }
1963 
reordered_levels_per_char_for_paras_u16(text: &[u16]) -> Vec<Vec<Level>>1964     fn reordered_levels_per_char_for_paras_u16(text: &[u16]) -> Vec<Vec<Level>> {
1965         let bidi_info = BidiInfoU16::new(text, None);
1966         bidi_info
1967             .paragraphs
1968             .iter()
1969             .map(|para| bidi_info.reordered_levels_per_char(para, para.range.clone()))
1970             .collect()
1971     }
1972 
1973     #[test]
1974     #[cfg(feature = "hardcoded-data")]
test_reordered_levels()1975     fn test_reordered_levels() {
1976         let tests = vec![
1977             // BidiTest:946 (LRI PDI)
1978             (
1979                 "\u{2067}\u{2069}",
1980                 vec![Level::vec(&[0, 0, 0, 0, 0, 0])],
1981                 vec![Level::vec(&[0, 0])],
1982                 vec![Level::vec(&[0, 0])],
1983             ),
1984             // BidiTest:69635 (AL ET EN)
1985             (
1986                 "\u{060B}\u{20CF}\u{06F9}",
1987                 vec![Level::vec(&[1, 1, 1, 1, 1, 2, 2])],
1988                 vec![Level::vec(&[1, 1, 2])],
1989                 vec![Level::vec(&[1, 1, 2])],
1990             ),
1991         ];
1992 
1993         for t in tests {
1994             assert_eq!(reordered_levels_for_paras(t.0), t.1);
1995             assert_eq!(reordered_levels_per_char_for_paras(t.0), t.2);
1996             let text = &to_utf16(t.0);
1997             assert_eq!(reordered_levels_for_paras_u16(text), t.3);
1998             assert_eq!(reordered_levels_per_char_for_paras_u16(text), t.2);
1999         }
2000 
2001         let tests = vec![
2002             // BidiTest:291284 (AN RLI PDF R)
2003             (
2004                 "\u{0605}\u{2067}\u{202C}\u{0590}",
2005                 vec![&["2", "2", "0", "0", "0", "x", "x", "x", "1", "1"]],
2006                 vec![&["2", "0", "x", "1"]],
2007                 vec![&["2", "0", "x", "1"]],
2008             ),
2009         ];
2010 
2011         for t in tests {
2012             assert_eq!(reordered_levels_for_paras(t.0), t.1);
2013             assert_eq!(reordered_levels_per_char_for_paras(t.0), t.2);
2014             let text = &to_utf16(t.0);
2015             assert_eq!(reordered_levels_for_paras_u16(text), t.3);
2016             assert_eq!(reordered_levels_per_char_for_paras_u16(text), t.2);
2017         }
2018 
2019         let text = "aa טֶ";
2020         let bidi_info = BidiInfo::new(text, None);
2021         assert_eq!(
2022             bidi_info.reordered_levels(&bidi_info.paragraphs[0], 3..7),
2023             Level::vec(&[0, 0, 0, 1, 1, 1, 1]),
2024         );
2025 
2026         let text = &to_utf16(text);
2027         let bidi_info = BidiInfoU16::new(text, None);
2028         assert_eq!(
2029             bidi_info.reordered_levels(&bidi_info.paragraphs[0], 1..4),
2030             Level::vec(&[0, 0, 0, 1, 1]),
2031         );
2032     }
2033 
2034     #[test]
test_paragraph_info_len()2035     fn test_paragraph_info_len() {
2036         let text = "hello world";
2037         let bidi_info = BidiInfo::new(text, None);
2038         assert_eq!(bidi_info.paragraphs.len(), 1);
2039         assert_eq!(bidi_info.paragraphs[0].len(), text.len());
2040 
2041         let text2 = "How are you";
2042         let whole_text = format!("{}\n{}", text, text2);
2043         let bidi_info = BidiInfo::new(&whole_text, None);
2044         assert_eq!(bidi_info.paragraphs.len(), 2);
2045 
2046         // The first paragraph include the paragraph separator.
2047         // TODO: investigate if the paragraph separator character
2048         // should not be part of any paragraph.
2049         assert_eq!(bidi_info.paragraphs[0].len(), text.len() + 1);
2050         assert_eq!(bidi_info.paragraphs[1].len(), text2.len());
2051 
2052         let text = &to_utf16(text);
2053         let bidi_info = BidiInfoU16::new(text, None);
2054         assert_eq!(bidi_info.paragraphs.len(), 1);
2055         assert_eq!(bidi_info.paragraphs[0].len(), text.len());
2056 
2057         let text2 = &to_utf16(text2);
2058         let whole_text = &to_utf16(&whole_text);
2059         let bidi_info = BidiInfoU16::new(&whole_text, None);
2060         assert_eq!(bidi_info.paragraphs.len(), 2);
2061 
2062         assert_eq!(bidi_info.paragraphs[0].len(), text.len() + 1);
2063         assert_eq!(bidi_info.paragraphs[1].len(), text2.len());
2064     }
2065 
2066     #[test]
test_direction()2067     fn test_direction() {
2068         let ltr_text = "hello world";
2069         let rtl_text = "أهلا بكم";
2070         let all_paragraphs = format!("{}\n{}\n{}{}", ltr_text, rtl_text, ltr_text, rtl_text);
2071         let bidi_info = BidiInfo::new(&all_paragraphs, None);
2072         assert_eq!(bidi_info.paragraphs.len(), 3);
2073         let p_ltr = Paragraph::new(&bidi_info, &bidi_info.paragraphs[0]);
2074         let p_rtl = Paragraph::new(&bidi_info, &bidi_info.paragraphs[1]);
2075         let p_mixed = Paragraph::new(&bidi_info, &bidi_info.paragraphs[2]);
2076         assert_eq!(p_ltr.direction(), Direction::Ltr);
2077         assert_eq!(p_rtl.direction(), Direction::Rtl);
2078         assert_eq!(p_mixed.direction(), Direction::Mixed);
2079 
2080         let all_paragraphs = &to_utf16(&all_paragraphs);
2081         let bidi_info = BidiInfoU16::new(&all_paragraphs, None);
2082         assert_eq!(bidi_info.paragraphs.len(), 3);
2083         let p_ltr = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[0]);
2084         let p_rtl = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[1]);
2085         let p_mixed = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[2]);
2086         assert_eq!(p_ltr.direction(), Direction::Ltr);
2087         assert_eq!(p_rtl.direction(), Direction::Rtl);
2088         assert_eq!(p_mixed.direction(), Direction::Mixed);
2089     }
2090 
2091     #[test]
test_edge_cases_direction()2092     fn test_edge_cases_direction() {
2093         // No paragraphs for empty text.
2094         let empty = "";
2095         let bidi_info = BidiInfo::new(empty, Option::from(RTL_LEVEL));
2096         assert_eq!(bidi_info.paragraphs.len(), 0);
2097 
2098         let empty = &to_utf16(empty);
2099         let bidi_info = BidiInfoU16::new(empty, Option::from(RTL_LEVEL));
2100         assert_eq!(bidi_info.paragraphs.len(), 0);
2101 
2102         let tests = vec![
2103             // The paragraph separator will take the value of the default direction
2104             // which is left to right.
2105             ("\n", None, Direction::Ltr),
2106             // The paragraph separator will take the value of the given initial direction
2107             // which is left to right.
2108             ("\n", Option::from(LTR_LEVEL), Direction::Ltr),
2109             // The paragraph separator will take the value of the given initial direction
2110             // which is right to left.
2111             ("\n", Option::from(RTL_LEVEL), Direction::Rtl),
2112         ];
2113 
2114         for t in tests {
2115             let bidi_info = BidiInfo::new(t.0, t.1);
2116             assert_eq!(bidi_info.paragraphs.len(), 1);
2117             let p = Paragraph::new(&bidi_info, &bidi_info.paragraphs[0]);
2118             assert_eq!(p.direction(), t.2);
2119             let text = &to_utf16(t.0);
2120             let bidi_info = BidiInfoU16::new(text, t.1);
2121             let p = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[0]);
2122             assert_eq!(p.direction(), t.2);
2123         }
2124     }
2125 
2126     #[test]
test_level_at()2127     fn test_level_at() {
2128         let ltr_text = "hello world";
2129         let rtl_text = "أهلا بكم";
2130         let all_paragraphs = format!("{}\n{}\n{}{}", ltr_text, rtl_text, ltr_text, rtl_text);
2131         let bidi_info = BidiInfo::new(&all_paragraphs, None);
2132         assert_eq!(bidi_info.paragraphs.len(), 3);
2133 
2134         let p_ltr = Paragraph::new(&bidi_info, &bidi_info.paragraphs[0]);
2135         let p_rtl = Paragraph::new(&bidi_info, &bidi_info.paragraphs[1]);
2136         let p_mixed = Paragraph::new(&bidi_info, &bidi_info.paragraphs[2]);
2137 
2138         assert_eq!(p_ltr.level_at(0), LTR_LEVEL);
2139         assert_eq!(p_rtl.level_at(0), RTL_LEVEL);
2140         assert_eq!(p_mixed.level_at(0), LTR_LEVEL);
2141         assert_eq!(p_mixed.info.levels.len(), 54);
2142         assert_eq!(p_mixed.para.range.start, 28);
2143         assert_eq!(p_mixed.level_at(ltr_text.len()), RTL_LEVEL);
2144 
2145         let all_paragraphs = &to_utf16(&all_paragraphs);
2146         let bidi_info = BidiInfoU16::new(&all_paragraphs, None);
2147         assert_eq!(bidi_info.paragraphs.len(), 3);
2148 
2149         let p_ltr = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[0]);
2150         let p_rtl = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[1]);
2151         let p_mixed = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[2]);
2152 
2153         assert_eq!(p_ltr.level_at(0), LTR_LEVEL);
2154         assert_eq!(p_rtl.level_at(0), RTL_LEVEL);
2155         assert_eq!(p_mixed.level_at(0), LTR_LEVEL);
2156         assert_eq!(p_mixed.info.levels.len(), 40);
2157         assert_eq!(p_mixed.para.range.start, 21);
2158         assert_eq!(p_mixed.level_at(ltr_text.len()), RTL_LEVEL);
2159     }
2160 
2161     #[test]
test_get_base_direction()2162     fn test_get_base_direction() {
2163         let tests = vec![
2164             ("", Direction::Mixed), // return Mixed if no strong character found
2165             ("123[]-+\u{2019}\u{2060}\u{00bf}?", Direction::Mixed),
2166             ("3.14\npi", Direction::Mixed), // only first paragraph is considered
2167             ("[123 'abc']", Direction::Ltr),
2168             ("[123 '\u{0628}' abc", Direction::Rtl),
2169             ("[123 '\u{2066}abc\u{2069}'\u{0628}]", Direction::Rtl), // embedded isolate is ignored
2170             ("[123 '\u{2066}abc\u{2068}'\u{0628}]", Direction::Mixed),
2171         ];
2172 
2173         for t in tests {
2174             assert_eq!(get_base_direction(t.0), t.1);
2175             let text = &to_utf16(t.0);
2176             assert_eq!(get_base_direction(text.as_slice()), t.1);
2177         }
2178     }
2179 
2180     #[test]
test_get_base_direction_full()2181     fn test_get_base_direction_full() {
2182         let tests = vec![
2183             ("", Direction::Mixed), // return Mixed if no strong character found
2184             ("123[]-+\u{2019}\u{2060}\u{00bf}?", Direction::Mixed),
2185             ("3.14\npi", Direction::Ltr), // direction taken from the second paragraph
2186             ("3.14\n\u{05D0}", Direction::Rtl), // direction taken from the second paragraph
2187             ("[123 'abc']", Direction::Ltr),
2188             ("[123 '\u{0628}' abc", Direction::Rtl),
2189             ("[123 '\u{2066}abc\u{2069}'\u{0628}]", Direction::Rtl), // embedded isolate is ignored
2190             ("[123 '\u{2066}abc\u{2068}'\u{0628}]", Direction::Mixed),
2191             ("[123 '\u{2066}abc\u{2068}'\n\u{0628}]", Direction::Rtl), // \n resets embedding level
2192         ];
2193 
2194         for t in tests {
2195             assert_eq!(get_base_direction_full(t.0), t.1);
2196             let text = &to_utf16(t.0);
2197             assert_eq!(get_base_direction_full(text.as_slice()), t.1);
2198         }
2199     }
2200 }
2201 
// Serialization tests; only built when both `serde` and `hardcoded-data` are enabled.
#[cfg(all(feature = "serde", feature = "hardcoded-data", test))]
mod serde_tests {
    use super::*;
    use serde_test::{assert_tokens, Token};

    /// The resolved levels of a short mixed-direction string must round-trip through serde
    /// as a sequence of `Level` newtype structs, each wrapping a `u8`.
    #[test]
    fn test_levels() {
        let text = "abc אבג";
        let levels = BidiInfo::new(text, None).levels;
        // One level per UTF-8 byte: 4 ASCII bytes plus 3 two-byte Hebrew letters.
        assert_eq!(text.as_bytes().len(), 10);
        assert_eq!(levels.len(), 10);

        // Build the expected token stream: Seq, then (NewtypeStruct, U8) per level, SeqEnd.
        let mut expected = vec![Token::Seq { len: Some(10) }];
        for &number in &[0u8, 0, 0, 0, 1, 1, 1, 1, 1, 1] {
            expected.push(Token::NewtypeStruct { name: "Level" });
            expected.push(Token::U8(number));
        }
        expected.push(Token::SeqEnd);

        assert_tokens(&levels, &expected);
    }
}
2243