rust/crates/rust-analyzer/src/line_index.rs

//! Enhances `ide::LineIndex` with additional info required to convert offsets
//! into lsp positions.
//!
//! We maintain the invariant that all internal strings use `\n` as line separator.
//! This module does line ending conversion and detection (so that we can
//! convert back to `\r\n` on the way out).

use ide_db::line_index::WideEncoding;
use memchr::memmem;
use triomphe::Arc;

/// Encoding in which positions are expressed: either UTF-8 or one of the
/// wide encodings described by [`WideEncoding`].
// NOTE(review): presumably this mirrors the position encoding negotiated with
// the LSP client; confirm against the callers.
#[derive(Clone, Copy)]
pub enum PositionEncoding {
    /// Positions are expressed in terms of UTF-8.
    Utf8,
    /// Positions are expressed in terms of the carried wide encoding.
    Wide(WideEncoding),
}

/// An `ide::LineIndex` bundled with the additional info required to convert
/// offsets into lsp positions.
pub(crate) struct LineIndex {
    /// The underlying line index, shared behind an `Arc`.
    pub(crate) index: Arc<ide::LineIndex>,
    /// Line-ending style associated with the text (see [`LineEndings`]).
    // NOTE(review): presumably the style detected by `LineEndings::normalize`
    // for the original text, kept so `\r\n` can be restored on the way out;
    // confirm at the construction sites.
    pub(crate) endings: LineEndings,
    /// Encoding in which positions are expressed.
    pub(crate) encoding: PositionEncoding,
}

/// Line-ending style of a piece of text, as detected by
/// [`LineEndings::normalize`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(crate) enum LineEndings {
    /// The text contained no `\r\n` sequence.
    Unix,
    /// The text contained at least one `\r\n` sequence.
    Dos,
}

impl LineEndings {
    /// Replaces `\r\n` with `\n` in-place in `src`.
    ///
    /// Returns the normalized text together with the detected line-ending
    /// style: [`LineEndings::Dos`] if at least one `\r\n` pair was found (and
    /// its `\r` removed), [`LineEndings::Unix`] otherwise. A `\r` that is not
    /// immediately followed by `\n` is left untouched and does not influence
    /// the detected style.
    pub(crate) fn normalize(src: String) -> (String, LineEndings) {
        // We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
        // While we *can* call `as_mut_vec` and do surgery on the live string
        // directly, let's rather steal the contents of `src`. This makes the code
        // safe even if a panic occurs.

        let mut buf = src.into_bytes();
        // Number of `\r` bytes dropped so far. Invariant at the top of the loop:
        // the first `gap_len` bytes of `tail` are stale (already copied towards
        // the front) and everything after them is still unprocessed input.
        let mut gap_len = 0;
        let mut tail = buf.as_mut_slice();
        let mut crlf_seen = false;

        let finder = memmem::Finder::new(b"\r\n");

        loop {
            // `idx` is the position (relative to `tail`) of the next `\r` to
            // drop, or `tail.len()` once no further `\r\n` exists.
            let idx = match finder.find(&tail[gap_len..]) {
                None if crlf_seen => tail.len(),
                // SAFETY: buf is unchanged and therefore still contains utf8 data
                None => return (unsafe { String::from_utf8_unchecked(buf) }, LineEndings::Unix),
                Some(idx) => {
                    crlf_seen = true;
                    idx + gap_len
                }
            };
            // Close the gap: shift the bytes between the gap and the next `\r`
            // (or the end of the input) to the front of `tail`.
            tail.copy_within(gap_len..idx, 0);
            // Re-anchor `tail` right after the bytes just written, so that it
            // again starts with `gap_len` stale bytes.
            tail = &mut tail[idx - gap_len..];
            if tail.len() == gap_len {
                // Only stale bytes remain: the whole input has been processed.
                break;
            }
            // Swallow the `\r` into the gap; the following `\n` is kept.
            gap_len += 1;
        }

        // Account for removed `\r`: the last `gap_len` bytes are stale leftovers.
        let new_len = buf.len() - gap_len;
        buf.truncate(new_len);
        // SAFETY: `buf` started out as valid utf-8 and the only bytes removed
        // from it are `\r`, each of which is a complete one-byte code point, so
        // the remaining `new_len` bytes are valid utf-8 again.
        let src = unsafe { String::from_utf8_unchecked(buf) };
        (src, LineEndings::Dos)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Normalizes `input` and checks the detected line-ending style first,
    /// then the resulting text.
    #[track_caller]
    fn check(input: &str, expected_text: &str, expected_endings: LineEndings) {
        let (normalized, detected) = LineEndings::normalize(input.to_owned());
        assert_eq!(detected, expected_endings);
        assert_eq!(normalized, expected_text);
    }

    /// Text that only uses `\n` comes back unchanged.
    #[test]
    fn unix() {
        let text = "a\nb\nc\n\n\n\n";
        check(text, text, LineEndings::Unix);
    }

    /// Every `\r\n` pair is collapsed to `\n`.
    #[test]
    fn dos() {
        check(
            "\r\na\r\n\r\nb\r\nc\r\n\r\n\r\n\r\n",
            "\na\n\nb\nc\n\n\n\n",
            LineEndings::Dos,
        );
    }

    /// A mix of `\r\n` and bare `\n` is reported as `Dos` and fully normalized.
    #[test]
    fn mixed() {
        check("a\r\nb\r\nc\r\n\n\r\n\n", "a\nb\nc\n\n\n\n", LineEndings::Dos);
    }

    /// Text without any line separator comes back unchanged.
    #[test]
    fn none() {
        let text = "abc";
        check(text, text, LineEndings::Unix);
    }
}
Fix bitrotted module name 2021-02-12 16:28:48 -06:00			//! Enhances `ide::LineIndex` with additional info required to convert offsets
			`//! into lsp positions.`
			`//!`
New VFS 2020-06-11 04:04:09 -05:00			//! We maintain invariant that all internal strings use `\n` as line separator.
			`//! This module does line ending conversion and detection (so that we can`
			//! convert back to `\r\n` on the way out).

Support UTF-32 position encoding Looks like this is a native encoding for Emacs at least! 2023-02-13 18:56:28 -06:00			`use ide_db::line_index::WideEncoding;`
internal: speedup LineEndings calculation using 'memchr' 2024-01-18 03:03:29 -06:00			`use memchr::memmem;`
Use triomphe Arc 2023-05-02 09:12:22 -05:00			`use triomphe::Arc;`
Support UTF-32 position encoding Looks like this is a native encoding for Emacs at least! 2023-02-13 18:56:28 -06:00
			`#[derive(Clone, Copy)]`
Switch to upstream positionEncoding 2022-10-25 06:43:26 -05:00			`pub enum PositionEncoding {`
Implement utf8 offsets 2021-02-12 15:55:27 -06:00			`Utf8,`
Support UTF-32 position encoding Looks like this is a native encoding for Emacs at least! 2023-02-13 18:56:28 -06:00			`Wide(WideEncoding),`
Implement utf8 offsets 2021-02-12 15:55:27 -06:00			`}`

Make it easy to add additional context for offset conversion 2021-02-12 15:44:28 -06:00			`pub(crate) struct LineIndex {`
			`pub(crate) index: Arc<ide::LineIndex>,`
			`pub(crate) endings: LineEndings,`
Switch to upstream positionEncoding 2022-10-25 06:43:26 -05:00			`pub(crate) encoding: PositionEncoding,`
Make it easy to add additional context for offset conversion 2021-02-12 15:44:28 -06:00			`}`

New VFS 2020-06-11 04:04:09 -05:00			`#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]`
			`pub(crate) enum LineEndings {`
			`Unix,`
			`Dos,`
			`}`

			`impl LineEndings {`
			/// Replaces `\r\n` with `\n` in-place in `src`.
			`pub(crate) fn normalize(src: String) -> (String, LineEndings) {`
			// We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
			// While we can call `as_mut_vec` and do surgery on the live string
			// directly, let's rather steal the contents of `src`. This makes the code
			`// safe even if a panic occurs.`

			`let mut buf = src.into_bytes();`
			`let mut gap_len = 0;`
			`let mut tail = buf.as_mut_slice();`
Add tests for LineEndings::normalize 2022-11-05 05:00:17 -05:00			`let mut crlf_seen = false;`

internal: speedup LineEndings calculation using 'memchr' 2024-01-18 03:03:29 -06:00			`let finder = memmem::Finder::new(b"\r\n");`
Add tests for LineEndings::normalize 2022-11-05 05:00:17 -05:00
New VFS 2020-06-11 04:04:09 -05:00			`loop {`
internal: speedup LineEndings calculation using 'memchr' 2024-01-18 03:03:29 -06:00			`let idx = match finder.find(&tail[gap_len..]) {`
Add tests for LineEndings::normalize 2022-11-05 05:00:17 -05:00			`None if crlf_seen => tail.len(),`
Fix typos 2022-11-07 04:53:33 -06:00			`// SAFETY: buf is unchanged and therefore still contains utf8 data`
Add tests for LineEndings::normalize 2022-11-05 05:00:17 -05:00			`None => return (unsafe { String::from_utf8_unchecked(buf) }, LineEndings::Unix),`
			`Some(idx) => {`
			`crlf_seen = true;`
			`idx + gap_len`
			`}`
New VFS 2020-06-11 04:04:09 -05:00			`};`
			`tail.copy_within(gap_len..idx, 0);`
			`tail = &mut tail[idx - gap_len..];`
			`if tail.len() == gap_len {`
			`break;`
			`}`
			`gap_len += 1;`
			`}`

			// Account for removed `\r`.
			// After `set_len`, `buf` is guaranteed to contain utf-8 again.
			`let src = unsafe {`
Add tests for LineEndings::normalize 2022-11-05 05:00:17 -05:00			`let new_len = buf.len() - gap_len;`
New VFS 2020-06-11 04:04:09 -05:00			`buf.set_len(new_len);`
			`String::from_utf8_unchecked(buf)`
			`};`
Add tests for LineEndings::normalize 2022-11-05 05:00:17 -05:00			`(src, LineEndings::Dos)`
			`}`
			`}`
New VFS 2020-06-11 04:04:09 -05:00
Add tests for LineEndings::normalize 2022-11-05 05:00:17 -05:00			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn unix() {`
			`let src = "a\nb\nc\n\n\n\n";`
			`let (res, endings) = LineEndings::normalize(src.into());`
			`assert_eq!(endings, LineEndings::Unix);`
			`assert_eq!(res, src);`
			`}`

			`#[test]`
			`fn dos() {`
			`let src = "\r\na\r\n\r\nb\r\nc\r\n\r\n\r\n\r\n";`
			`let (res, endings) = LineEndings::normalize(src.into());`
			`assert_eq!(endings, LineEndings::Dos);`
			`assert_eq!(res, "\na\n\nb\nc\n\n\n\n");`
			`}`

			`#[test]`
			`fn mixed() {`
			`let src = "a\r\nb\r\nc\r\n\n\r\n\n";`
			`let (res, endings) = LineEndings::normalize(src.into());`
			`assert_eq!(endings, LineEndings::Dos);`
			`assert_eq!(res, "a\nb\nc\n\n\n\n");`
			`}`

			`#[test]`
			`fn none() {`
			`let src = "abc";`
			`let (res, endings) = LineEndings::normalize(src.into());`
			`assert_eq!(endings, LineEndings::Unix);`
			`assert_eq!(res, src);`
New VFS 2020-06-11 04:04:09 -05:00			`}`
			`}`