scanlex/
lib.rs

1//! `scanlex` implements a simple _lexical scanner_.
2//!
3//! Tokens are returned by repeatedly calling the `get` method,
4//! (which will return `Token::End` if no tokens are left)
5//! or by iterating over the scanner.
6//!
7//! They represent floats (stored as f64), integers (as i64), characters, identifiers,
8//! or single or double quoted strings. There is also `Token::Error` to
9//! indicate a badly formed token.  This lexical scanner makes some
10//! sensible assumptions, such as a number may not be directly followed
11//! by a letter, etc. No attempt is made in this version to decode C-style
12//! escape codes in strings.  All whitespace is ignored.
13//!
14//! ## Examples
15//!
16//! ```
17//! use  scanlex::{Scanner,Token};
18//!
19//! let mut scan = Scanner::new("iden 'string' * 10");
20//! assert_eq!(scan.get(),Token::Iden("iden".into()));
21//! assert_eq!(scan.get(),Token::Str("string".into()));
22//! assert_eq!(scan.get(),Token::Char('*'));
23//! assert_eq!(scan.get(),Token::Int(10));
24//! assert_eq!(scan.get(),Token::End);
25//! ```
26//!
27//! The scanner struct implements iterator, so:
28//!
29//! ```
30//! let v: Vec<_> = scanlex::Scanner::new("bonzo 42 dog (cat)")
31//!     .filter_map(|t| t.to_iden()).collect();
32//! assert_eq!(v,&["bonzo","dog","cat"]);
33//! ```
34
35use std::str::FromStr;
36use std::error::Error;
37use std::io;
38
39mod int;
40use int::Int;
41
42mod error;
43pub use error::ScanError;
44
45mod token;
46pub use token::Token;
47
/// A lexical scanner over a string slice.
///
/// The scanner always holds one character of look-ahead in `ch`; the
/// NUL character (`'\0'`) is used as the "no more input" sentinel.
#[derive(Debug, Clone)]
pub struct Scanner <'a> {
    /// the characters that have not been read yet
    iter: ::std::str::Chars<'a>,
    /// the current (not yet consumed) character, or `'\0'` at the end
    ch: char,
    /// the line number the scanner is currently on
    pub lineno: u32,
    /// when set, floats and signed numbers are not recognized
    no_float: bool,
    /// when set, this char starts a comment running to the end of the line
    line_comment: Option<char>,
}
56
/// Format a list of chars as a comma-separated list of single-quoted
/// items, e.g. `[',', ':']` becomes `',',':'`.
///
/// An empty slice gives an empty string.
fn expecting_chars(chars: &[char]) -> String {
    // Each item is at least three bytes (two quotes and the char) plus a separator.
    let mut res = String::with_capacity(chars.len() * 4);
    for (i, c) in chars.iter().enumerate() {
        if i > 0 {
            res.push(',');
        }
        res.push('\'');
        res.push(*c);
        res.push('\'');
    }
    res
}
66
67impl<'a> Iterator for Scanner<'a> {
68    type Item = Token;
69
70    fn next(&mut self) -> Option<Token> {
71        match self.get() {
72            Token::End => None,
73            t => Some(t)
74        }
75    }
76}
77
78impl<'a> Scanner<'a> {
79    /// create a new scanner from a string slice.
80    ///
81    /// Empty text is not a problem, but `get` will then
82    /// return `Token::End`.
83    pub fn new(s: &'a str) -> Scanner<'a> {
84        Scanner::new_ex(s,1)
85    }
86
87    fn new_ex(s: &'a str, lineno: u32) -> Scanner<'a> {
88        let mut iter = s.chars();
89        let mch = iter.next();
90        Scanner {
91            iter: iter,
92            ch: match mch {Some(c) => c, None => '\0'},
93            lineno: lineno,
94            no_float: false,
95            line_comment: None,
96        }
97    }
98
99    /// this scanner will not recognize floats
100    ///
101    /// "2.5" is tokenized as Int(2),Char('.'),Int(5)
102    pub fn no_float(mut self) -> Scanner<'a> {
103        self.no_float = true;
104        self
105    }
106
107    /// ignore everything in a line after this char
108    pub fn line_comment(mut self, c: char) -> Scanner<'a> {
109        self.line_comment = Some(c);
110        self
111    }
112    
113
114    pub fn scan_error(&self, msg: &str, cause: Option<&dyn Error>) -> ScanError {
115       ScanError{
116           details: format!("{}{}", msg,
117                match cause {
118                    Some(c) => format!(": caused by {}",c),
119                    None => "".into()
120                }
121            ),
122            lineno: self.lineno
123        }
124    }
125
126    fn update_lineno(&self, mut err: ScanError) -> ScanError {
127        err.lineno = self.lineno;
128        err
129    }
130
131    fn token_error(&self, msg: &str, cause: Option<&dyn Error>) -> Token {
132        Token::Error(self.scan_error(msg,cause))
133    }
134
135    fn check_line_comment(&mut self) -> bool {
136        if let Some(lc) = self.line_comment {
137            if self.ch == lc {
138                self.skip_until(|c| c=='\n');
139                return true;
140            }
141        }
142        return false;
143
144    }
145
146    /// skip any whitespace characters - return false if we're at the end.
147    pub fn skip_whitespace(&mut self) -> bool {
148        loop {
149            self.check_line_comment();
150            if self.ch.is_whitespace() {
151                if self.ch == '\n' {
152                     self.lineno += 1;
153                }
154                while let Some(c) = self.iter.next() {
155                    if c == '\n' {
156                        self.lineno += 1;
157                    }
158                    if ! c.is_whitespace() {
159                        self.ch = c;
160                        if self.check_line_comment() {
161                            continue;
162                        } else {
163                            return true;
164                        }
165                    }
166                }
167                // run of chars!
168                self.ch = '\0';
169                break;
170            } else {
171                break;
172            }
173        }
174        if self.ch == '\0' {
175            false
176        } else {
177            true
178        }
179    }
180
181    /// look ahead at the next character
182    pub fn peek(&self) -> char {
183        self.ch
184    }
185
186    /// get the next character
187    pub fn nextch(&mut self) -> char {
188        let old_ch = self.ch;
189        self.ch = match self.iter.next() {
190            Some(c) => c,
191            None => '\0'
192        };
193        old_ch
194    }
195
196    fn either_plus_or_minus(&self) -> Option<char> {
197        if self.ch == '+' || self.ch == '-' {
198            Some(self.ch)
199        } else {
200            None
201        }
202    }
203
204    fn is_digit(&self) -> bool {
205        self.ch.is_digit(10)
206    }
207
208    /// get the next token
209    pub fn get(&mut self) -> Token {
210        use self::Token::*;
211        if ! self.skip_whitespace() {
212            return End;
213        }
214
215        // a number starts with a digit or a sign
216        let plusminus = if ! self.no_float {self.either_plus_or_minus()} else {None};
217        if self.is_digit() || plusminus.is_some() {
218            let mut s = String::new();
219            if plusminus.is_some() {
220                s.push(plusminus.unwrap());
221            }
222            if ! self.no_float {
223                let mut maybe_hex = self.ch == '0';
224                if plusminus.is_some() || maybe_hex {
225                    // look ahead! Might be a number or just a char
226                    self.nextch();
227                    if maybe_hex { // after a '0'?
228                        maybe_hex = self.ch == 'X' || self.ch == 'x';
229                        if ! maybe_hex {
230                            s.push('0');
231                            if ! self.is_digit() && self.ch != '.' { self.ch = '\0'; }
232                        }
233                    } else
234                    if ! self.is_digit() { // false alarm, wuz just a char...
235                        return Char(plusminus.unwrap());
236                    }
237                }
238                // integer part
239                if maybe_hex { // in hex...
240                    self.nextch(); // skip the 'x'
241                    self.take_while_into(&mut s,|c| c.is_digit(16));
242                    return match i64::from_str_radix(&s,16) {
243                        Ok(n) => Int(n),
244                        Err(e) => self.token_error("bad hex constant",Some(&e))
245                    }
246                }
247            }
248
249            if self.ch != '.' { // for 0. case - we already peeked ahead
250                self.take_digits_into(&mut s);
251            }
252
253            // floating point part?
254            if ! self.no_float && (self.ch == '.'  || self.ch == 'e' || self.ch == 'E') {
255                if self.ch == '.' {
256                    self.take_digits_into(&mut s);
257                }
258                if self.ch == 'e' || self.ch == 'E' {
259                    s.push(self.nextch());
260                    if self.is_digit() || self.either_plus_or_minus().is_some() {
261                        self.take_digits_into(&mut s);
262                    }
263                }
264                return if self.ch.is_alphabetic() {
265                    self.token_error("bad floating-point number: letter follows",None)
266                } else {
267                    match f64::from_str(&s) {
268                        Ok(x) => Num(x),
269                        Err(e) => self.token_error(&format!("bad floating-point number {:?}",s),Some(&e))
270                    }
271                }
272            } else {
273                return if ! self.no_float && self.ch.is_alphabetic() {
274                    self.token_error("bad integer: letter follows",None)
275                } else {
276                    match i64::from_str(&s) {
277                        Ok(x) => Int(x),
278                        Err(e) => self.token_error(&format!("bad integer {:?}",s),Some(&e))
279                    }
280                }
281            }
282        } else
283        if self.ch == '\'' || self.ch == '\"' {
284            let endquote = self.ch;
285            self.nextch(); // skip the opening quote
286            let s = self.grab_while(|c| c != endquote);
287            // TODO unfinished quote
288            self.nextch();  // skip end quote
289            Str(s)
290        } else
291        if self.ch.is_alphabetic() || self.ch == '_' {
292            let s = self.grab_while(|c| c.is_alphanumeric() || c == '_');
293            Iden(s)
294        } else {
295            Char(self.nextch())
296        }
297    }
298
299    /// collect chars matching the condition, returning a string
300    /// ```
301    /// let mut scan = scanlex::Scanner::new("hello + goodbye");
302    /// assert_eq!(scan.grab_while(|c| c != '+'), "hello ");
303    /// ```
304    pub fn grab_while<F>(&mut self, pred: F ) -> String
305     where F: Fn(char) -> bool {
306        let mut s = String::new();
307        self.take_while_into(&mut s,pred);
308        s
309    }
310
311    /// collect chars matching the condition into a given string
312    pub fn take_while_into<F>(&mut self, s: &mut String, pred: F )
313     where F: Fn(char) -> bool {
314        if self.ch != '\0' {
315            s.push(self.ch);
316        }
317        while let Some(c) = self.iter.next() {
318            if ! pred(c) { self.ch = c; return; }
319            s.push(c);
320        }
321        self.ch = '\0';
322    }
323
324    fn take_digits_into(&mut self, s: &mut String) {
325        self.take_while_into(s, |c| c.is_digit(10));
326    }
327
328    /// skip chars while the condition is false
329    ///
330    /// ```
331    /// let mut scan = scanlex::Scanner::new("hello and\nwelcome");
332    /// scan.skip_until(|c| c == '\n');
333    /// assert_eq!(scan.get_iden().unwrap(),"welcome");
334    /// ```
335    pub fn skip_until<F>(&mut self, pred: F ) -> bool
336    where F: Fn(char) -> bool {
337        while let Some(c) = self.iter.next() {
338            if pred(c) { self.ch = c; return true; }
339        }
340        self.ch = '\0';
341        false
342    }
343
344    /// collect the rest of the chars
345    ///
346    /// ```
347    /// use scanlex::{Scanner,Token};
348    ///
349    /// let mut scan = Scanner::new("42 the answer");
350    /// assert_eq!(scan.get(),Token::Int(42));
351    /// assert_eq!(scan.take_rest()," the answer");
352    /// ```
353    pub fn take_rest(&mut self) -> String {
354        self.grab_while(|c| c != '\0')
355    }
356
357    /// collect until we match one of the chars
358    pub fn take_until (&mut self, chars: &[char]) -> String {
359        self.grab_while(|c| ! chars.contains(&c))
360    }
361
362    /// get a String token, failing otherwise
363    pub fn get_string(&mut self) -> Result<String,ScanError> {
364        self.get().to_string_result().map_err(|e| self.update_lineno(e))
365    }
366
367    /// get an Identifier token, failing otherwise
368    ///
369    /// ```
370    /// let mut scan = scanlex::Scanner::new("hello dolly");
371    /// assert_eq!(scan.get_iden().unwrap(),"hello");
372    /// ```
373    pub fn get_iden(&mut self) -> Result<String,ScanError> {
374        self.get().to_iden_result().map_err(|e| self.update_lineno(e))
375    }
376
377    /// get a number, failing otherwise
378    ///
379    /// ```
380    /// let mut scan = scanlex::Scanner::new("(42)");
381    /// scan.get(); // skip '('
382    /// assert_eq!(scan.get_number().unwrap(),42.0);
383    /// ```
384    pub fn get_number(&mut self) -> Result<f64,ScanError> {
385        self.get().to_number_result().map_err(|e| self.update_lineno(e))
386    }
387
388    /// get an integer, failing otherwise
389    pub fn get_integer(&mut self) -> Result<i64,ScanError> {
390        self.get().to_integer_result().map_err(|e| self.update_lineno(e))
391    }
392
393    /// get an integer of a particular type, failing otherwise
394    pub fn get_int<I: Int>(&mut self) -> Result<I::Type,ScanError> {
395        self.get().to_int_result::<I>().map_err(|e| self.update_lineno(e))
396    }
397
398    /// get an float, failing otherwise
399    pub fn get_float(&mut self) -> Result<f64,ScanError> {
400        self.get().to_float_result().map_err(|e| self.update_lineno(e))
401    }
402
403    /// get a character, failing otherwise
404    pub fn get_char(&mut self) -> Result<char,ScanError> {
405        self.get().to_char_result().map_err(|e| self.update_lineno(e))
406    }
407
408    /// get a Character token that must be one of the given chars
409    pub fn get_ch_matching(&mut self, chars: &[char]) -> Result<char,ScanError> {
410        let c = self.get_char()?;
411        if chars.contains(&c) {
412            Ok(c)
413        } else {
414            let s = expecting_chars(chars);
415            Err(self.scan_error(&format!("expected one of {}, got {}",s,c),None))
416        }
417    }
418
419    /// skip each character in the string.
420    pub fn skip_chars(&mut self, chars: &str) -> Result<(),ScanError> {
421        for ch in chars.chars() {
422            let c = self.get_char()?;
423            if c != ch {
424                return Err(self.scan_error(&format!("expected '{}' got '{}'",ch,c),None));
425            }
426        }
427        Ok(())
428    }
429
430    /// grab 'balanced' text between some open and close chars
431    pub fn grab_brackets(&mut self, pair: &str) -> Result<String,ScanError> {
432        let mut chars = pair.chars();
433        let open = chars.next().expect("provide open bracket");
434        let close = chars.next().expect("provide close bracket");
435        self.skip_whitespace();
436        let mut s = String::new();
437        if self.ch != '\0' {
438            s.push(self.ch);
439        }        
440        let mut level = 1;
441        while let Some(c) = self.iter.next() {
442            if c == open {
443                level += 1;
444            } else
445            if c == close {
446                level -= 1;
447            }
448            s.push(c);
449            if level == 0 {
450                self.nextch();
451                return Ok(s);
452            }
453        }
454        Err(self.scan_error("expect close bracket",None))
455
456    }
457
458}
459
460use std::io::prelude::*;
461
/// used to generate Scanner structs for each line
#[derive(Debug)]
pub struct ScanLines<R: Read> {
    /// buffered reader over the underlying source
    rdr: io::BufReader<R>,
    /// buffer holding the most recently read line
    line: String,
    /// number of lines read so far
    lineno: u32,
}
468
469impl <'a, R: Read> ScanLines<R> {
470
471    /// create a Scanner 'iterator' over all lines from a readable.
472    /// This cannot be a proper `Iterator` because the lifetime constraint
473    /// on `Scanner` cannot be satisfied. You need to use the explicit form:
474    ///
475    /// ```rust,ignore
476    /// let mut iter = ScanLines::new(File::open("lines.txt")?);
477    /// while let Some(s) = iter.next() {
478    ///     let mut s = s?;
479    ///     // first token of each line
480    ///     println!("{:?}",s.get());
481    /// }
482    /// ```
483    pub fn new(f: R) -> ScanLines<R> {
484        ScanLines {
485            rdr: io::BufReader::new(f),
486            line: String::new(),
487            lineno: 0,
488        }
489    }
490
491
492    /// call this to return a `Scanner` for the next line in the source.
493    pub fn next(&'a mut self) -> Option<io::Result<Scanner<'a>>> {
494        self.line.clear();
495        match self.rdr.read_line(&mut self.line) {
496            Ok(nbytes) =>  if nbytes == 0 {
497                return None;
498            },
499            Err(e) => return Some(Err(e))
500        }
501        self.lineno += 1;
502        Some(Ok(Scanner::new_ex(&self.line,self.lineno)))
503    }
504
505}
506
507
#[cfg(test)]
mod tests {
    use super::*;

    // `skip_until` leaves the scanner on the matching char
    #[test]
    fn skipping() {
        let mut scan = Scanner::new("here we go\nand more *yay*");
        scan.skip_until(|c| c == '\n');
        assert_eq!(scan.get(),Token::Iden("and".to_string()));
        scan.skip_until(|c| c == '*');
        assert_eq!(scan.get(),Token::Char('*'));
        assert_eq!(scan.get(),Token::Iden("yay".to_string()));
    }

    // one token of every kind, including error tokens
    #[test]
    fn getting()  {
        use Token::*;
        let mut scan = Scanner::new("'hello' 42 * / -10 24B 2.0e6 0xFF-\"yay\"");
        assert_eq!(scan.get_string().unwrap(), "hello");
        assert_eq!(scan.get_number().unwrap(), 42.0);
        assert_eq!(scan.get_ch_matching(&['*']).unwrap(),'*');
        assert_eq!(
            scan.get_ch_matching(&[',',':']).err().unwrap(),
            ScanError::new("expected one of ',',':', got /")
        );
        assert_eq!(scan.get(),Int(-10));
        assert_eq!(scan.get(),Error(ScanError::new("bad integer: letter follows")));
        assert_eq!(scan.get(),Iden("B".to_string()));
        assert_eq!(scan.get(),Num(2000000.0));
        assert_eq!(scan.get(),Int(255));
        assert_eq!(scan.get(),Char('-'));
        assert_eq!(scan.get(),Str("yay".to_string()));
    }

    // the typed getters compose with `?`
    fn try_scan_err() -> Result<(),ScanError> {
        let mut scan = Scanner::new("hello: 42");
        let s = scan.get_iden()?;
        let ch = scan.get_char()?;
        let n = scan.get_integer()?;
        assert_eq!(s,"hello");
        assert_eq!(ch,':');
        assert_eq!(n,42);
        Ok(())
    }

    #[test]
    fn try_scan_test() {
        // a scan error must fail the test, not be silently dropped
        try_scan_err().expect("scanning \"hello: 42\" should succeed");
    }

    fn try_skip_chars(test: &str) -> Result<(),ScanError> {
        let mut scan = Scanner::new(test);
        scan.skip_chars("(")?;
        let name = scan.get_iden()?;
        scan.skip_chars(")=")?;
        let num = scan.get_integer()?;
        assert_eq!(name,"hello");
        assert_eq!(num,42);
        Ok(())
    }

    #[test]
    fn skip_chars() {
        // a scan error must fail the test, not be silently dropped
        try_skip_chars("(hello)=42").expect("compact form should scan");
        try_skip_chars(" ( hello ) =  42  ").expect("spaced form should scan");
    }

    #[test]
    fn numbers() {
        let mut scan = Scanner::new("10 0.0 1.0e1 1e1 0 ");
        assert_eq!(scan.get_integer(),Ok(10));
        assert_eq!(scan.get_number(),Ok(0.0));
        assert_eq!(scan.get_number(),Ok(10.0));
        assert_eq!(scan.get_float(),Ok(10.0));
        assert_eq!(scan.get_integer(),Ok(0));
    }

    #[test]
    fn no_float() {
        use Token::*;
        let scan = Scanner::new("0.0 1e4").no_float();
        let c: Vec<_> = scan.collect();
        assert_eq!(c,&[Int(0),Char('.'),Int(0),Int(1),Iden("e4".into())]);
    }

    #[test]
    fn classifying_tokens() {
        let mut s = Scanner::new("10 2.0 'hello' hello?");
        let t = s.get();
        assert!(t.is_integer());
        assert!(t.is_number());
        assert!(s.get().is_float());
        assert!(s.get().is_string());
        assert!(s.get().is_iden());
        assert!(s.get().is_char());
    }

    #[test]
    fn collecting_tokens_of_type() {
        let s = Scanner::new("if let Some(a) = Bonzo::Dog {}");
        let c: Vec<_> = s.filter_map(|t| t.to_iden()).collect();
        assert_eq!(c,&["if","let","Some","a","Bonzo","Dog"]);
    }

    #[test]
    fn collecting_same_tokens_or_error() {
        let s = Scanner::new("10 1.5 20.0 30.1");
        let c: Result<Vec<_>,_> = s.map(|t| t.to_number_result()).collect();
        assert_eq!(c.unwrap(),&[10.0,1.5,20.0,30.1]);
    }

    #[test]
    fn line_comments() {
        let text = "
            one  # some comment
            20
        ";
        let mut scan = Scanner::new(text)
            .line_comment('#');
        assert_eq!(scan.get_iden(),Ok("one".into()));
        assert_eq!(scan.get_number(),Ok(20.0));
    }

}