rapidhash/
rapid_file.rs

1use std::fs::File;
2use std::io::{BufReader, Read};
3use crate::rapid_const::{RAPID_SEED, RAPID_SECRET, rapid_mix, rapid_mum, rapidhash_finish, rapidhash_seed, read_u32_combined, read_u64};
4
5/// Rapidhash a file, matching the C++ implementation.
6///
7/// This method will check the metadata for a file length, and then stream the file with a
8/// [BufReader] to compute the hash. This avoids loading the entire file into memory.
9#[inline]
10pub fn rapidhash_file(data: &mut File) -> std::io::Result<u64> {
11    rapidhash_file_inline(data, RAPID_SEED)
12}
13
14/// Rapidhash a file, matching the C++ implementation, with a custom seed.
15///
16/// This method will check the metadata for a file length, and then stream the file with a
17/// [BufReader] to compute the hash. This avoids loading the entire file into memory.
18#[inline]
19pub fn rapidhash_file_seeded(data: &mut File, seed: u64) -> std::io::Result<u64> {
20    rapidhash_file_inline(data, seed)
21}
22
23/// Rapidhash a file, matching the C++ implementation.
24///
25/// This method will check the metadata for a file length, and then stream the file with a
26/// [BufReader] to compute the hash. This avoids loading the entire file into memory.
27///
28/// We could easily add more ways to read other streams that can be converted to a [BufReader],
29/// but the length must be known at the start of the stream due to how rapidhash is seeded using
30/// the data length. Raise a [GitHub](https://github.com/hoxxep/rapidhash) issue if you have a
31/// use case to support other stream types.
32///
33/// Is marked with `#[inline(always)]` to force the compiler to inline and optimise the method.
34/// Can provide large performance uplifts for inputs where the length is known at compile time.
35#[inline(always)]
36pub fn rapidhash_file_inline(data: &mut File, mut seed: u64) -> std::io::Result<u64> {
37    let len = data.metadata()?.len();
38    let mut reader = BufReader::new(data);
39    seed = rapidhash_seed(seed, len);
40    let (a, b, _) = rapidhash_file_core(0, 0, seed, len as usize, &mut reader)?;
41    Ok(rapidhash_finish(a, b, len))
42}
43
44#[inline(always)]
45fn rapidhash_file_core(mut a: u64, mut b: u64, mut seed: u64, len: usize, iter: &mut BufReader<&mut File>) -> std::io::Result<(u64, u64, u64)> {
46    if len <= 16 {
47        let mut data = [0u8; 16];
48        iter.read_exact(&mut data[0..len])?;
49
50        // deviation from the C++ impl computes delta as follows
51        // let delta = (data.len() & 24) >> (data.len() >> 3);
52        // this is equivalent to "match {..8=>0, 8..=>4}"
53        // and so using the extra if-else statement is equivalent and allows the compiler to skip
54        // some unnecessary bounds checks while still being safe rust.
55        if len >= 8 {
56            // len is 8..=16
57            let plast = len - 4;
58            let delta = 4;
59            a ^= read_u32_combined(&data, 0, plast);
60            b ^= read_u32_combined(&data, delta, plast - delta);
61        } else if len >= 4 {
62            // len is 4..=7
63            let plast = len - 4;
64            let delta = 0;
65            a ^= read_u32_combined(&data, 0, plast);
66            b ^= read_u32_combined(&data, delta, plast - delta);
67        } else if len > 0 {
68            // len is 1..=3
69            a ^= ((data[0] as u64) << 56) | ((data[len >> 1] as u64) << 32) | data[len - 1] as u64;
70            // b = 0;
71        }
72    } else {
73        let mut remaining = len;
74        let mut buf = [0u8; 192];
75
76        // slice is a view on the buffer that we use for reading into, and reading from, depending
77        // on the stage of the loop.
78        let mut slice = &mut buf[..96];
79
80        // because we're using a buffered reader, it might be worth unrolling this loop further
81        let mut see1 = seed;
82        let mut see2 = seed;
83        while remaining >= 96 {
84            // read into and process using the first half of the buffer
85            iter.read_exact(&mut slice)?;
86            seed = rapid_mix(read_u64(slice, 0) ^ RAPID_SECRET[0], read_u64(slice, 8) ^ seed);
87            see1 = rapid_mix(read_u64(slice, 16) ^ RAPID_SECRET[1], read_u64(slice, 24) ^ see1);
88            see2 = rapid_mix(read_u64(slice, 32) ^ RAPID_SECRET[2], read_u64(slice, 40) ^ see2);
89            seed = rapid_mix(read_u64(slice , 48) ^ RAPID_SECRET[0], read_u64(slice, 56) ^ seed);
90            see1 = rapid_mix(read_u64(slice, 64) ^ RAPID_SECRET[1], read_u64(slice, 72) ^ see1);
91            see2 = rapid_mix(read_u64(slice, 80) ^ RAPID_SECRET[2], read_u64(slice, 88) ^ see2);
92            remaining -= 96;
93        }
94
95        // remaining might be up to 95 bytes, so we read into the second half of the buffer,
96        // which allows us to negative index safely in the final a and b xor using `end`.
97        slice = &mut buf[96..96 + remaining];
98        iter.read_exact(&mut slice)?;
99        let end = 96 + remaining;
100
101        if remaining >= 48 {
102            seed = rapid_mix(read_u64(slice, 0) ^ RAPID_SECRET[0], read_u64(slice, 8) ^ seed);
103            see1 = rapid_mix(read_u64(slice, 16) ^ RAPID_SECRET[1], read_u64(slice, 24) ^ see1);
104            see2 = rapid_mix(read_u64(slice, 32) ^ RAPID_SECRET[2], read_u64(slice, 40) ^ see2);
105            slice = &mut buf[96 + 48..96 + remaining];
106            remaining -= 48;
107        }
108
109        seed ^= see1 ^ see2;
110
111        if remaining > 16 {
112            seed = rapid_mix(read_u64(slice, 0) ^ RAPID_SECRET[2], read_u64(slice, 8) ^ seed ^ RAPID_SECRET[1]);
113            if remaining > 32 {
114                seed = rapid_mix(read_u64(slice, 16) ^ RAPID_SECRET[2], read_u64(slice, 24) ^ seed);
115            }
116        }
117
118        a ^= read_u64(&buf, end - 16);
119        b ^= read_u64(&buf, end - 8);
120    }
121
122    a ^= RAPID_SECRET[1];
123    b ^= seed;
124
125    let (a2, b2) = rapid_mum(a, b);
126    a = a2;
127    b = b2;
128    Ok((a, b, seed))
129}
130
#[cfg(test)]
mod tests {
    use std::io::{Seek, SeekFrom, Write};
    use super::*;

    /// Checks that hashing a file yields the same value as hashing the same bytes in memory,
    /// for every input length from 1 to `LENGTH` bytes.
    #[test]
    fn test_compare_rapidhash_file() {
        use rand::RngCore;

        const LENGTH: usize = 1024;
        for len in 1..=LENGTH {
            let mut data = vec![0u8; len];
            rand::rng().fill_bytes(&mut data);

            let mut file = tempfile::tempfile().unwrap();
            // `write` is allowed to write only part of the buffer; `write_all` guarantees the
            // whole input reaches the file before we hash it.
            file.write_all(&data).unwrap();
            // Rewind so the hash reads from the first byte.
            file.seek(SeekFrom::Start(0)).unwrap();

            assert_eq!(
                crate::rapidhash(&data),
                rapidhash_file(&mut file).unwrap(),
                "Mismatch for input len: {}", data.len()
            );
        }
    }
}