From 46d1af28b5ce4f626be1eb33cb9751cb9cbb1fe9 Mon Sep 17 00:00:00 2001 From: Corey Richardson Date: Wed, 21 May 2014 16:57:31 -0700 Subject: [PATCH] syntax: methodify the lexer --- src/librustdoc/html/highlight.rs | 2 +- src/libsyntax/attr.rs | 2 +- src/libsyntax/parse/lexer.rs | 1112 ------------------ src/libsyntax/parse/{ => lexer}/comments.rs | 105 +- src/libsyntax/parse/lexer/mod.rs | 1153 +++++++++++++++++++ src/libsyntax/parse/mod.rs | 3 +- src/libsyntax/parse/token.rs | 2 +- src/libsyntax/print/pprust.rs | 3 +- 8 files changed, 1195 insertions(+), 1187 deletions(-) delete mode 100644 src/libsyntax/parse/lexer.rs rename src/libsyntax/parse/{ => lexer}/comments.rs (82%) create mode 100644 src/libsyntax/parse/lexer/mod.rs diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 1f3ed010928..f544e1e0973 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -34,7 +34,7 @@ pub fn highlight(src: &str, class: Option<&str>) -> String { let mut out = io::MemWriter::new(); doit(&sess, - lexer::new_string_reader(&sess.span_diagnostic, fm), + lexer::StringReader::new(&sess.span_diagnostic, fm), class, &mut out).unwrap(); str::from_utf8_lossy(out.unwrap().as_slice()).to_string() diff --git a/src/libsyntax/attr.rs b/src/libsyntax/attr.rs index 4a38835f86b..a514ef65e72 100644 --- a/src/libsyntax/attr.rs +++ b/src/libsyntax/attr.rs @@ -15,7 +15,7 @@ use ast::{AttrId, Attribute, Attribute_, MetaItem, MetaWord, MetaNameValue, Meta use codemap::{Span, Spanned, spanned, dummy_spanned}; use codemap::BytePos; use diagnostic::SpanHandler; -use parse::comments::{doc_comment_style, strip_doc_comment_decoration}; +use parse::lexer::comments::{doc_comment_style, strip_doc_comment_decoration}; use parse::token::InternedString; use parse::token; use crateid::CrateId; diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs deleted file mode 100644 index f5386b43d51..00000000000 --- 
a/src/libsyntax/parse/lexer.rs +++ /dev/null @@ -1,1112 +0,0 @@ -// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use ast; -use codemap::{BytePos, CharPos, CodeMap, Pos, Span}; -use codemap; -use diagnostic::SpanHandler; -use ext::tt::transcribe::tt_next_token; -use parse::token; -use parse::token::{str_to_ident}; - -use std::char; -use std::mem::replace; -use std::num::from_str_radix; -use std::rc::Rc; -use std::str; - -pub use ext::tt::transcribe::{TtReader, new_tt_reader}; - -pub trait Reader { - fn is_eof(&self) -> bool; - fn next_token(&mut self) -> TokenAndSpan; - /// Report a fatal error with the current span. - fn fatal(&self, &str) -> !; - /// Report a non-fatal error with the current span. 
- fn err(&self, &str); - fn peek(&self) -> TokenAndSpan; -} - -#[deriving(Clone, PartialEq, Show)] -pub struct TokenAndSpan { - pub tok: token::Token, - pub sp: Span, -} - -pub struct StringReader<'a> { - pub span_diagnostic: &'a SpanHandler, - // The absolute offset within the codemap of the next character to read - pub pos: BytePos, - // The absolute offset within the codemap of the last character read(curr) - pub last_pos: BytePos, - // The column of the next character to read - pub col: CharPos, - // The last character to be read - pub curr: Option, - pub filemap: Rc, - /* cached: */ - pub peek_tok: token::Token, - pub peek_span: Span, -} - -impl<'a> StringReader<'a> { - pub fn curr_is(&self, c: char) -> bool { - self.curr == Some(c) - } -} - -pub fn new_string_reader<'a>(span_diagnostic: &'a SpanHandler, - filemap: Rc) - -> StringReader<'a> { - let mut r = new_low_level_string_reader(span_diagnostic, filemap); - string_advance_token(&mut r); /* fill in peek_* */ - r -} - -/* For comments.rs, which hackily pokes into 'pos' and 'curr' */ -pub fn new_low_level_string_reader<'a>(span_diagnostic: &'a SpanHandler, - filemap: Rc) - -> StringReader<'a> { - // Force the initial reader bump to start on a fresh line - let initial_char = '\n'; - let mut r = StringReader { - span_diagnostic: span_diagnostic, - pos: filemap.start_pos, - last_pos: filemap.start_pos, - col: CharPos(0), - curr: Some(initial_char), - filemap: filemap, - /* dummy values; not read */ - peek_tok: token::EOF, - peek_span: codemap::DUMMY_SP, - }; - bump(&mut r); - r -} - -impl<'a> Reader for StringReader<'a> { - fn is_eof(&self) -> bool { is_eof(self) } - // return the next token. EFFECT: advances the string_reader. - fn next_token(&mut self) -> TokenAndSpan { - let ret_val = TokenAndSpan { - tok: replace(&mut self.peek_tok, token::UNDERSCORE), - sp: self.peek_span, - }; - string_advance_token(self); - ret_val - } - fn fatal(&self, m: &str) -> ! 
{ - self.span_diagnostic.span_fatal(self.peek_span, m) - } - fn err(&self, m: &str) { - self.span_diagnostic.span_err(self.peek_span, m) - } - fn peek(&self) -> TokenAndSpan { - // FIXME(pcwalton): Bad copy! - TokenAndSpan { - tok: self.peek_tok.clone(), - sp: self.peek_span, - } - } -} - -impl<'a> Reader for TtReader<'a> { - fn is_eof(&self) -> bool { - self.cur_tok == token::EOF - } - fn next_token(&mut self) -> TokenAndSpan { - let r = tt_next_token(self); - debug!("TtReader: r={:?}", r); - r - } - fn fatal(&self, m: &str) -> ! { - self.sp_diag.span_fatal(self.cur_span, m); - } - fn err(&self, m: &str) { - self.sp_diag.span_err(self.cur_span, m); - } - fn peek(&self) -> TokenAndSpan { - TokenAndSpan { - tok: self.cur_tok.clone(), - sp: self.cur_span, - } - } -} - -// report a lexical error spanning [`from_pos`, `to_pos`) -fn fatal_span(rdr: &mut StringReader, from_pos: BytePos, to_pos: BytePos, m: &str) -> ! { - rdr.peek_span = codemap::mk_sp(from_pos, to_pos); - rdr.fatal(m); -} - -fn err_span(rdr: &mut StringReader, from_pos: BytePos, to_pos: BytePos, m: &str) { - rdr.peek_span = codemap::mk_sp(from_pos, to_pos); - rdr.err(m); -} - -// report a lexical error spanning [`from_pos`, `to_pos`), appending an -// escaped character to the error message -fn fatal_span_char(rdr: &mut StringReader, - from_pos: BytePos, to_pos: BytePos, - m: &str, c: char) -> ! { - let mut m = m.to_string(); - m.push_str(": "); - char::escape_default(c, |c| m.push_char(c)); - fatal_span(rdr, from_pos, to_pos, m.as_slice()); -} - -fn err_span_char(rdr: &mut StringReader, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) { - let mut m = m.to_string(); - m.push_str(": "); - char::escape_default(c, |c| m.push_char(c)); - err_span(rdr, from_pos, to_pos, m.as_slice()); -} - -// report a lexical error spanning [`from_pos`, `to_pos`), appending the -// offending string to the error message -fn fatal_span_verbose(rdr: &mut StringReader, from_pos: BytePos, to_pos: BytePos, m: &str) -> ! 
{ - let mut m = m.to_string(); - m.push_str(": "); - let from = byte_offset(rdr, from_pos).to_uint(); - let to = byte_offset(rdr, to_pos).to_uint(); - m.push_str(rdr.filemap.src.as_slice().slice(from, to)); - fatal_span(rdr, from_pos, to_pos, m.as_slice()); -} - -// EFFECT: advance peek_tok and peek_span to refer to the next token. -// EFFECT: update the interner, maybe. -fn string_advance_token(r: &mut StringReader) { - match consume_whitespace_and_comments(r) { - Some(comment) => { - r.peek_span = comment.sp; - r.peek_tok = comment.tok; - }, - None => { - if is_eof(r) { - r.peek_tok = token::EOF; - } else { - let start_bytepos = r.last_pos; - r.peek_tok = next_token_inner(r); - r.peek_span = codemap::mk_sp(start_bytepos, - r.last_pos); - }; - } - } -} - -fn byte_offset(rdr: &StringReader, pos: BytePos) -> BytePos { - (pos - rdr.filemap.start_pos) -} - -/// Calls `f` with a string slice of the source text spanning from `start` -/// up to but excluding `rdr.last_pos`, meaning the slice does not include -/// the character `rdr.curr`. -pub fn with_str_from( - rdr: &StringReader, - start: BytePos, - f: |s: &str| -> T) - -> T { - with_str_from_to(rdr, start, rdr.last_pos, f) -} - -/// Calls `f` with astring slice of the source text spanning from `start` -/// up to but excluding `end`. -fn with_str_from_to( - rdr: &StringReader, - start: BytePos, - end: BytePos, - f: |s: &str| -> T) - -> T { - f(rdr.filemap.src.as_slice().slice( - byte_offset(rdr, start).to_uint(), - byte_offset(rdr, end).to_uint())) -} - -// EFFECT: advance the StringReader by one character. If a newline is -// discovered, add it to the FileMap's list of line start offsets. 
-pub fn bump(rdr: &mut StringReader) { - rdr.last_pos = rdr.pos; - let current_byte_offset = byte_offset(rdr, rdr.pos).to_uint(); - if current_byte_offset < rdr.filemap.src.len() { - assert!(rdr.curr.is_some()); - let last_char = rdr.curr.unwrap(); - let next = rdr.filemap - .src - .as_slice() - .char_range_at(current_byte_offset); - let byte_offset_diff = next.next - current_byte_offset; - rdr.pos = rdr.pos + Pos::from_uint(byte_offset_diff); - rdr.curr = Some(next.ch); - rdr.col = rdr.col + CharPos(1u); - if last_char == '\n' { - rdr.filemap.next_line(rdr.last_pos); - rdr.col = CharPos(0u); - } - - if byte_offset_diff > 1 { - rdr.filemap.record_multibyte_char(rdr.last_pos, byte_offset_diff); - } - } else { - rdr.curr = None; - } -} - -pub fn is_eof(rdr: &StringReader) -> bool { - rdr.curr.is_none() -} - -pub fn nextch(rdr: &StringReader) -> Option { - let offset = byte_offset(rdr, rdr.pos).to_uint(); - if offset < rdr.filemap.src.len() { - Some(rdr.filemap.src.as_slice().char_at(offset)) - } else { - None - } -} -pub fn nextch_is(rdr: &StringReader, c: char) -> bool { - nextch(rdr) == Some(c) -} - -pub fn nextnextch(rdr: &StringReader) -> Option { - let offset = byte_offset(rdr, rdr.pos).to_uint(); - let s = rdr.filemap.deref().src.as_slice(); - if offset >= s.len() { return None } - let str::CharRange { next, .. } = s.char_range_at(offset); - if next < s.len() { - Some(s.char_at(next)) - } else { - None - } -} -pub fn nextnextch_is(rdr: &StringReader, c: char) -> bool { - nextnextch(rdr) == Some(c) -} - -pub fn is_whitespace(c: Option) -> bool { - match c.unwrap_or('\x00') { // None can be null for now... it's not whitespace - ' ' | '\n' | '\t' | '\r' => true, - _ => false - } -} - -// EFFECT: eats whitespace and comments. -// returns a Some(sugared-doc-attr) if one exists, None otherwise. 
-fn consume_whitespace_and_comments(rdr: &mut StringReader) - -> Option { - while is_whitespace(rdr.curr) { bump(rdr); } - return consume_any_line_comment(rdr); -} - -pub fn is_line_non_doc_comment(s: &str) -> bool { - s.starts_with("////") -} - -// PRECONDITION: rdr.curr is not whitespace -// EFFECT: eats any kind of comment. -// returns a Some(sugared-doc-attr) if one exists, None otherwise -fn consume_any_line_comment(rdr: &mut StringReader) - -> Option { - if rdr.curr_is('/') { - match nextch(rdr) { - Some('/') => { - bump(rdr); - bump(rdr); - // line comments starting with "///" or "//!" are doc-comments - if rdr.curr_is('/') || rdr.curr_is('!') { - let start_bpos = rdr.pos - BytePos(3); - while !rdr.curr_is('\n') && !is_eof(rdr) { - bump(rdr); - } - let ret = with_str_from(rdr, start_bpos, |string| { - // but comments with only more "/"s are not - if !is_line_non_doc_comment(string) { - Some(TokenAndSpan{ - tok: token::DOC_COMMENT(str_to_ident(string)), - sp: codemap::mk_sp(start_bpos, rdr.pos) - }) - } else { - None - } - }); - - if ret.is_some() { - return ret; - } - } else { - while !rdr.curr_is('\n') && !is_eof(rdr) { bump(rdr); } - } - // Restart whitespace munch. - consume_whitespace_and_comments(rdr) - } - Some('*') => { bump(rdr); bump(rdr); consume_block_comment(rdr) } - _ => None - } - } else if rdr.curr_is('#') { - if nextch_is(rdr, '!') { - - // Parse an inner attribute. - if nextnextch_is(rdr, '[') { - return None; - } - - // I guess this is the only way to figure out if - // we're at the beginning of the file... 
- let cmap = CodeMap::new(); - cmap.files.borrow_mut().push(rdr.filemap.clone()); - let loc = cmap.lookup_char_pos_adj(rdr.last_pos); - if loc.line == 1u && loc.col == CharPos(0u) { - while !rdr.curr_is('\n') && !is_eof(rdr) { bump(rdr); } - return consume_whitespace_and_comments(rdr); - } - } - None - } else { - None - } -} - -pub fn is_block_non_doc_comment(s: &str) -> bool { - s.starts_with("/***") -} - -// might return a sugared-doc-attr -fn consume_block_comment(rdr: &mut StringReader) -> Option { - // block comments starting with "/**" or "/*!" are doc-comments - let is_doc_comment = rdr.curr_is('*') || rdr.curr_is('!'); - let start_bpos = rdr.pos - BytePos(if is_doc_comment {3} else {2}); - - let mut level: int = 1; - while level > 0 { - if is_eof(rdr) { - let msg = if is_doc_comment { - "unterminated block doc-comment" - } else { - "unterminated block comment" - }; - fatal_span(rdr, start_bpos, rdr.last_pos, msg); - } else if rdr.curr_is('/') && nextch_is(rdr, '*') { - level += 1; - bump(rdr); - bump(rdr); - } else if rdr.curr_is('*') && nextch_is(rdr, '/') { - level -= 1; - bump(rdr); - bump(rdr); - } else { - bump(rdr); - } - } - - let res = if is_doc_comment { - with_str_from(rdr, start_bpos, |string| { - // but comments with only "*"s between two "/"s are not - if !is_block_non_doc_comment(string) { - Some(TokenAndSpan{ - tok: token::DOC_COMMENT(str_to_ident(string)), - sp: codemap::mk_sp(start_bpos, rdr.pos) - }) - } else { - None - } - }) - } else { - None - }; - - // restart whitespace munch. - if res.is_some() { res } else { consume_whitespace_and_comments(rdr) } -} - -fn scan_exponent(rdr: &mut StringReader, start_bpos: BytePos) -> Option { - // \x00 hits the `return None` case immediately, so this is fine. 
- let mut c = rdr.curr.unwrap_or('\x00'); - let mut rslt = String::new(); - if c == 'e' || c == 'E' { - rslt.push_char(c); - bump(rdr); - c = rdr.curr.unwrap_or('\x00'); - if c == '-' || c == '+' { - rslt.push_char(c); - bump(rdr); - } - let exponent = scan_digits(rdr, 10u); - if exponent.len() > 0u { - rslt.push_str(exponent.as_slice()); - } else { - err_span(rdr, start_bpos, rdr.last_pos, "scan_exponent: bad fp literal"); - rslt.push_str("1"); // arbitrary placeholder exponent - } - Some(rslt) - } else { - None - } -} - -fn scan_digits(rdr: &mut StringReader, radix: uint) -> String { - let mut rslt = String::new(); - loop { - let c = rdr.curr; - if c == Some('_') { bump(rdr); continue; } - match c.and_then(|cc| char::to_digit(cc, radix)) { - Some(_) => { - rslt.push_char(c.unwrap()); - bump(rdr); - } - _ => return rslt - } - }; -} - -fn check_float_base(rdr: &mut StringReader, start_bpos: BytePos, last_bpos: BytePos, - base: uint) { - match base { - 16u => err_span(rdr, start_bpos, last_bpos, "hexadecimal float literal is not supported"), - 8u => err_span(rdr, start_bpos, last_bpos, "octal float literal is not supported"), - 2u => err_span(rdr, start_bpos, last_bpos, "binary float literal is not supported"), - _ => () - } -} - -fn scan_number(c: char, rdr: &mut StringReader) -> token::Token { - let mut num_str; - let mut base = 10u; - let mut c = c; - let mut n = nextch(rdr).unwrap_or('\x00'); - let start_bpos = rdr.last_pos; - if c == '0' && n == 'x' { - bump(rdr); - bump(rdr); - base = 16u; - } else if c == '0' && n == 'o' { - bump(rdr); - bump(rdr); - base = 8u; - } else if c == '0' && n == 'b' { - bump(rdr); - bump(rdr); - base = 2u; - } - num_str = scan_digits(rdr, base); - c = rdr.curr.unwrap_or('\x00'); - if c == 'u' || c == 'i' { - enum Result { Signed(ast::IntTy), Unsigned(ast::UintTy) } - let signed = c == 'i'; - let mut tp = { - if signed { Signed(ast::TyI) } - else { Unsigned(ast::TyU) } - }; - bump(rdr); - c = rdr.curr.unwrap_or('\x00'); - if c == 
'8' { - bump(rdr); - tp = if signed { Signed(ast::TyI8) } - else { Unsigned(ast::TyU8) }; - } - n = nextch(rdr).unwrap_or('\x00'); - if c == '1' && n == '6' { - bump(rdr); - bump(rdr); - tp = if signed { Signed(ast::TyI16) } - else { Unsigned(ast::TyU16) }; - } else if c == '3' && n == '2' { - bump(rdr); - bump(rdr); - tp = if signed { Signed(ast::TyI32) } - else { Unsigned(ast::TyU32) }; - } else if c == '6' && n == '4' { - bump(rdr); - bump(rdr); - tp = if signed { Signed(ast::TyI64) } - else { Unsigned(ast::TyU64) }; - } - if num_str.len() == 0u { - err_span(rdr, start_bpos, rdr.last_pos, "no valid digits found for number"); - num_str = "1".to_string(); - } - let parsed = match from_str_radix::(num_str.as_slice(), - base as uint) { - Some(p) => p, - None => { - err_span(rdr, start_bpos, rdr.last_pos, "int literal is too large"); - 1 - } - }; - - match tp { - Signed(t) => return token::LIT_INT(parsed as i64, t), - Unsigned(t) => return token::LIT_UINT(parsed, t) - } - } - let mut is_float = false; - if rdr.curr_is('.') && !(ident_start(nextch(rdr)) || nextch_is(rdr, '.')) { - is_float = true; - bump(rdr); - let dec_part = scan_digits(rdr, 10u); - num_str.push_char('.'); - num_str.push_str(dec_part.as_slice()); - } - match scan_exponent(rdr, start_bpos) { - Some(ref s) => { - is_float = true; - num_str.push_str(s.as_slice()); - } - None => () - } - - if rdr.curr_is('f') { - bump(rdr); - c = rdr.curr.unwrap_or('\x00'); - n = nextch(rdr).unwrap_or('\x00'); - if c == '3' && n == '2' { - bump(rdr); - bump(rdr); - check_float_base(rdr, start_bpos, rdr.last_pos, base); - return token::LIT_FLOAT(str_to_ident(num_str.as_slice()), - ast::TyF32); - } else if c == '6' && n == '4' { - bump(rdr); - bump(rdr); - check_float_base(rdr, start_bpos, rdr.last_pos, base); - return token::LIT_FLOAT(str_to_ident(num_str.as_slice()), - ast::TyF64); - /* FIXME (#2252): if this is out of range for either a - 32-bit or 64-bit float, it won't be noticed till the - back-end. 
*/ - } else if c == '1' && n == '2' && nextnextch(rdr).unwrap_or('\x00') == '8' { - bump(rdr); - bump(rdr); - bump(rdr); - check_float_base(rdr, start_bpos, rdr.last_pos, base); - return token::LIT_FLOAT(str_to_ident(num_str.as_slice()), ast::TyF128); - } - err_span(rdr, start_bpos, rdr.last_pos, "expected `f32`, `f64` or `f128` suffix"); - } - if is_float { - check_float_base(rdr, start_bpos, rdr.last_pos, base); - return token::LIT_FLOAT_UNSUFFIXED(str_to_ident(num_str.as_slice())); - } else { - if num_str.len() == 0u { - err_span(rdr, start_bpos, rdr.last_pos, "no valid digits found for number"); - num_str = "1".to_string(); - } - let parsed = match from_str_radix::(num_str.as_slice(), - base as uint) { - Some(p) => p, - None => { - err_span(rdr, start_bpos, rdr.last_pos, "int literal is too large"); - 1 - } - }; - - debug!("lexing {} as an unsuffixed integer literal", - num_str.as_slice()); - return token::LIT_INT_UNSUFFIXED(parsed as i64); - } -} - -fn scan_numeric_escape(rdr: &mut StringReader, n_hex_digits: uint, delim: char) -> char { - let mut accum_int = 0u32; - let start_bpos = rdr.last_pos; - for _ in range(0, n_hex_digits) { - if is_eof(rdr) { - fatal_span(rdr, start_bpos, rdr.last_pos, "unterminated numeric character escape"); - } - if rdr.curr_is(delim) { - err_span(rdr, start_bpos, rdr.last_pos, "numeric character escape is too short"); - break; - } - let c = rdr.curr.unwrap_or('\x00'); - accum_int *= 16; - accum_int += c.to_digit(16).unwrap_or_else(|| { - err_span_char(rdr, rdr.last_pos, rdr.pos, - "illegal character in numeric character escape", c); - 0 - }) as u32; - bump(rdr); - } - - match char::from_u32(accum_int) { - Some(x) => x, - None => { - err_span(rdr, start_bpos, rdr.last_pos, "illegal numeric character escape"); - '?' 
- } - } -} - -fn ident_start(c: Option) -> bool { - let c = match c { Some(c) => c, None => return false }; - - (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z') - || c == '_' - || (c > '\x7f' && char::is_XID_start(c)) -} - -fn ident_continue(c: Option) -> bool { - let c = match c { Some(c) => c, None => return false }; - - (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z') - || (c >= '0' && c <= '9') - || c == '_' - || (c > '\x7f' && char::is_XID_continue(c)) -} - -// return the next token from the string -// EFFECT: advances the input past that token -// EFFECT: updates the interner -fn next_token_inner(rdr: &mut StringReader) -> token::Token { - let c = rdr.curr; - if ident_start(c) && !nextch_is(rdr, '"') && !nextch_is(rdr, '#') { - // Note: r as in r" or r#" is part of a raw string literal, - // not an identifier, and is handled further down. - - let start = rdr.last_pos; - while ident_continue(rdr.curr) { - bump(rdr); - } - - return with_str_from(rdr, start, |string| { - if string == "_" { - token::UNDERSCORE - } else { - let is_mod_name = rdr.curr_is(':') && nextch_is(rdr, ':'); - - // FIXME: perform NFKC normalization here. (Issue #2253) - token::IDENT(str_to_ident(string), is_mod_name) - } - }) - } - if c.map_or(false, |c| c.is_digit_radix(10)) { - return scan_number(c.unwrap(), rdr); - } - fn binop(rdr: &mut StringReader, op: token::BinOp) -> token::Token { - bump(rdr); - if rdr.curr_is('=') { - bump(rdr); - return token::BINOPEQ(op); - } else { return token::BINOP(op); } - } - match c.expect("next_token_inner called at EOF") { - - - - - - // One-byte tokens. - ';' => { bump(rdr); return token::SEMI; } - ',' => { bump(rdr); return token::COMMA; } - '.' 
=> { - bump(rdr); - return if rdr.curr_is('.') { - bump(rdr); - if rdr.curr_is('.') { - bump(rdr); - token::DOTDOTDOT - } else { - token::DOTDOT - } - } else { - token::DOT - }; - } - '(' => { bump(rdr); return token::LPAREN; } - ')' => { bump(rdr); return token::RPAREN; } - '{' => { bump(rdr); return token::LBRACE; } - '}' => { bump(rdr); return token::RBRACE; } - '[' => { bump(rdr); return token::LBRACKET; } - ']' => { bump(rdr); return token::RBRACKET; } - '@' => { bump(rdr); return token::AT; } - '#' => { bump(rdr); return token::POUND; } - '~' => { bump(rdr); return token::TILDE; } - ':' => { - bump(rdr); - if rdr.curr_is(':') { - bump(rdr); - return token::MOD_SEP; - } else { return token::COLON; } - } - - '$' => { bump(rdr); return token::DOLLAR; } - - - - - - // Multi-byte tokens. - '=' => { - bump(rdr); - if rdr.curr_is('=') { - bump(rdr); - return token::EQEQ; - } else if rdr.curr_is('>') { - bump(rdr); - return token::FAT_ARROW; - } else { - return token::EQ; - } - } - '!' => { - bump(rdr); - if rdr.curr_is('=') { - bump(rdr); - return token::NE; - } else { return token::NOT; } - } - '<' => { - bump(rdr); - match rdr.curr.unwrap_or('\x00') { - '=' => { bump(rdr); return token::LE; } - '<' => { return binop(rdr, token::SHL); } - '-' => { - bump(rdr); - return token::LARROW; - } - _ => { return token::LT; } - } - } - '>' => { - bump(rdr); - match rdr.curr.unwrap_or('\x00') { - '=' => { bump(rdr); return token::GE; } - '>' => { return binop(rdr, token::SHR); } - _ => { return token::GT; } - } - } - '\'' => { - // Either a character constant 'a' OR a lifetime name 'abc - bump(rdr); - let start = rdr.last_pos; - - // the eof will be picked up by the final `'` check below - let mut c2 = rdr.curr.unwrap_or('\x00'); - bump(rdr); - - // If the character is an ident start not followed by another single - // quote, then this is a lifetime name: - if ident_start(Some(c2)) && !rdr.curr_is('\'') { - while ident_continue(rdr.curr) { - bump(rdr); - } - let ident = 
with_str_from(rdr, start, |lifetime_name| { - str_to_ident(lifetime_name) - }); - let tok = &token::IDENT(ident, false); - - if token::is_keyword(token::keywords::Self, tok) { - err_span(rdr, start, rdr.last_pos, - "invalid lifetime name: 'self is no longer a special lifetime"); - } else if token::is_any_keyword(tok) && - !token::is_keyword(token::keywords::Static, tok) { - err_span(rdr, start, rdr.last_pos, "invalid lifetime name"); - } - return token::LIFETIME(ident); - } - - // Otherwise it is a character constant: - match c2 { - '\\' => { - // '\X' for some X must be a character constant: - let escaped = rdr.curr; - let escaped_pos = rdr.last_pos; - bump(rdr); - match escaped { - None => {} - Some(e) => { - c2 = match e { - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '"' => '"', - '0' => '\x00', - 'x' => scan_numeric_escape(rdr, 2u, '\''), - 'u' => scan_numeric_escape(rdr, 4u, '\''), - 'U' => scan_numeric_escape(rdr, 8u, '\''), - c2 => { - err_span_char(rdr, escaped_pos, rdr.last_pos, - "unknown character escape", c2); - c2 - } - } - } - } - } - '\t' | '\n' | '\r' | '\'' => { - err_span_char(rdr, start, rdr.last_pos, "character constant must be escaped", c2); - } - _ => {} - } - if !rdr.curr_is('\'') { - fatal_span_verbose(rdr, - // Byte offsetting here is okay because the - // character before position `start` is an - // ascii single quote. 
- start - BytePos(1), rdr.last_pos, - "unterminated character constant"); - } - bump(rdr); // advance curr past token - return token::LIT_CHAR(c2); - } - '"' => { - let mut accum_str = String::new(); - let start_bpos = rdr.last_pos; - bump(rdr); - while !rdr.curr_is('"') { - if is_eof(rdr) { - fatal_span(rdr, start_bpos, rdr.last_pos, "unterminated double quote string"); - } - - let ch = rdr.curr.unwrap(); - bump(rdr); - match ch { - '\\' => { - if is_eof(rdr) { - fatal_span(rdr, start_bpos, rdr.last_pos, "unterminated double quote string"); - } - - let escaped = rdr.curr.unwrap(); - let escaped_pos = rdr.last_pos; - bump(rdr); - match escaped { - 'n' => accum_str.push_char('\n'), - 'r' => accum_str.push_char('\r'), - 't' => accum_str.push_char('\t'), - '\\' => accum_str.push_char('\\'), - '\'' => accum_str.push_char('\''), - '"' => accum_str.push_char('"'), - '\n' => consume_whitespace(rdr), - '0' => accum_str.push_char('\x00'), - 'x' => { - accum_str.push_char(scan_numeric_escape(rdr, 2u, '"')); - } - 'u' => { - accum_str.push_char(scan_numeric_escape(rdr, 4u, '"')); - } - 'U' => { - accum_str.push_char(scan_numeric_escape(rdr, 8u, '"')); - } - c2 => { - err_span_char(rdr, escaped_pos, rdr.last_pos, "unknown string escape", c2); - } - } - } - _ => accum_str.push_char(ch) - } - } - bump(rdr); - return token::LIT_STR(str_to_ident(accum_str.as_slice())); - } - 'r' => { - let start_bpos = rdr.last_pos; - bump(rdr); - let mut hash_count = 0u; - while rdr.curr_is('#') { - bump(rdr); - hash_count += 1; - } - - if is_eof(rdr) { - fatal_span(rdr, start_bpos, rdr.last_pos, "unterminated raw string"); - } else if !rdr.curr_is('"') { - fatal_span_char(rdr, start_bpos, rdr.last_pos, - "only `#` is allowed in raw string delimitation; \ - found illegal character", - rdr.curr.unwrap()); - } - bump(rdr); - let content_start_bpos = rdr.last_pos; - let mut content_end_bpos; - 'outer: loop { - if is_eof(rdr) { - fatal_span(rdr, start_bpos, rdr.last_pos, "unterminated raw string"); - 
} - if rdr.curr_is('"') { - content_end_bpos = rdr.last_pos; - for _ in range(0, hash_count) { - bump(rdr); - if !rdr.curr_is('#') { - continue 'outer; - } - } - break; - } - bump(rdr); - } - bump(rdr); - let str_content = with_str_from_to(rdr, - content_start_bpos, - content_end_bpos, - str_to_ident); - return token::LIT_STR_RAW(str_content, hash_count); - } - '-' => { - if nextch_is(rdr, '>') { - bump(rdr); - bump(rdr); - return token::RARROW; - } else { return binop(rdr, token::MINUS); } - } - '&' => { - if nextch_is(rdr, '&') { - bump(rdr); - bump(rdr); - return token::ANDAND; - } else { return binop(rdr, token::AND); } - } - '|' => { - match nextch(rdr) { - Some('|') => { bump(rdr); bump(rdr); return token::OROR; } - _ => { return binop(rdr, token::OR); } - } - } - '+' => { return binop(rdr, token::PLUS); } - '*' => { return binop(rdr, token::STAR); } - '/' => { return binop(rdr, token::SLASH); } - '^' => { return binop(rdr, token::CARET); } - '%' => { return binop(rdr, token::PERCENT); } - c => { - fatal_span_char(rdr, rdr.last_pos, rdr.pos, "unknown start of token", c); - } - } -} - -fn consume_whitespace(rdr: &mut StringReader) { - while is_whitespace(rdr.curr) && !is_eof(rdr) { bump(rdr); } -} - -#[cfg(test)] -mod test { - use super::*; - - use codemap::{BytePos, CodeMap, Span}; - use diagnostic; - use parse::token; - use parse::token::{str_to_ident}; - use std::io::util; - - fn mk_sh() -> diagnostic::SpanHandler { - let emitter = diagnostic::EmitterWriter::new(box util::NullWriter); - let handler = diagnostic::mk_handler(box emitter); - diagnostic::mk_span_handler(handler, CodeMap::new()) - } - - // open a string reader for the given string - fn setup<'a>(span_handler: &'a diagnostic::SpanHandler, - teststr: String) -> StringReader<'a> { - let fm = span_handler.cm.new_filemap("zebra.rs".to_string(), teststr); - new_string_reader(span_handler, fm) - } - - #[test] fn t1 () { - let span_handler = mk_sh(); - let mut string_reader = setup(&span_handler, - "/* 
my source file */ \ - fn main() { println!(\"zebra\"); }\n".to_string()); - let id = str_to_ident("fn"); - let tok1 = string_reader.next_token(); - let tok2 = TokenAndSpan{ - tok:token::IDENT(id, false), - sp:Span {lo:BytePos(21),hi:BytePos(23),expn_info: None}}; - assert_eq!(tok1,tok2); - // the 'main' id is already read: - assert_eq!(string_reader.last_pos.clone(), BytePos(28)); - // read another token: - let tok3 = string_reader.next_token(); - let tok4 = TokenAndSpan{ - tok:token::IDENT(str_to_ident("main"), false), - sp:Span {lo:BytePos(24),hi:BytePos(28),expn_info: None}}; - assert_eq!(tok3,tok4); - // the lparen is already read: - assert_eq!(string_reader.last_pos.clone(), BytePos(29)) - } - - // check that the given reader produces the desired stream - // of tokens (stop checking after exhausting the expected vec) - fn check_tokenization (mut string_reader: StringReader, expected: Vec ) { - for expected_tok in expected.iter() { - assert_eq!(&string_reader.next_token().tok, expected_tok); - } - } - - // make the identifier by looking up the string in the interner - fn mk_ident (id: &str, is_mod_name: bool) -> token::Token { - token::IDENT (str_to_ident(id),is_mod_name) - } - - #[test] fn doublecolonparsing () { - check_tokenization(setup(&mk_sh(), "a b".to_string()), - vec!(mk_ident("a",false), - mk_ident("b",false))); - } - - #[test] fn dcparsing_2 () { - check_tokenization(setup(&mk_sh(), "a::b".to_string()), - vec!(mk_ident("a",true), - token::MOD_SEP, - mk_ident("b",false))); - } - - #[test] fn dcparsing_3 () { - check_tokenization(setup(&mk_sh(), "a ::b".to_string()), - vec!(mk_ident("a",false), - token::MOD_SEP, - mk_ident("b",false))); - } - - #[test] fn dcparsing_4 () { - check_tokenization(setup(&mk_sh(), "a:: b".to_string()), - vec!(mk_ident("a",true), - token::MOD_SEP, - mk_ident("b",false))); - } - - #[test] fn character_a() { - assert_eq!(setup(&mk_sh(), "'a'".to_string()).next_token().tok, - token::LIT_CHAR('a')); - } - - #[test] fn 
character_space() { - assert_eq!(setup(&mk_sh(), "' '".to_string()).next_token().tok, - token::LIT_CHAR(' ')); - } - - #[test] fn character_escaped() { - assert_eq!(setup(&mk_sh(), "'\\n'".to_string()).next_token().tok, - token::LIT_CHAR('\n')); - } - - #[test] fn lifetime_name() { - assert_eq!(setup(&mk_sh(), "'abc".to_string()).next_token().tok, - token::LIFETIME(token::str_to_ident("abc"))); - } - - #[test] fn raw_string() { - assert_eq!(setup(&mk_sh(), - "r###\"\"#a\\b\x00c\"\"###".to_string()).next_token() - .tok, - token::LIT_STR_RAW(token::str_to_ident("\"#a\\b\x00c\""), 3)); - } - - #[test] fn line_doc_comments() { - assert!(!is_line_non_doc_comment("///")); - assert!(!is_line_non_doc_comment("/// blah")); - assert!(is_line_non_doc_comment("////")); - } - - #[test] fn nested_block_comments() { - assert_eq!(setup(&mk_sh(), - "/* /* */ */'a'".to_string()).next_token().tok, - token::LIT_CHAR('a')); - } - -} diff --git a/src/libsyntax/parse/comments.rs b/src/libsyntax/parse/lexer/comments.rs similarity index 82% rename from src/libsyntax/parse/comments.rs rename to src/libsyntax/parse/lexer/comments.rs index 622ed6b9801..a009955f91a 100644 --- a/src/libsyntax/parse/comments.rs +++ b/src/libsyntax/parse/lexer/comments.rs @@ -11,8 +11,8 @@ use ast; use codemap::{BytePos, CharPos, CodeMap, Pos}; use diagnostic; -use parse::lexer::{is_whitespace, with_str_from, Reader}; -use parse::lexer::{StringReader, bump, is_eof, nextch_is, TokenAndSpan}; +use parse::lexer::{is_whitespace, Reader}; +use parse::lexer::{StringReader, TokenAndSpan}; use parse::lexer::{is_line_non_doc_comment, is_block_non_doc_comment}; use parse::lexer; use parse::token; @@ -141,31 +141,6 @@ pub fn strip_doc_comment_decoration(comment: &str) -> String { fail!("not a doc-comment: {}", comment); } -fn read_to_eol(rdr: &mut StringReader) -> String { - let mut val = String::new(); - while !rdr.curr_is('\n') && !is_eof(rdr) { - val.push_char(rdr.curr.unwrap()); - bump(rdr); - } - if rdr.curr_is('\n') { 
bump(rdr); } - return val -} - -fn read_one_line_comment(rdr: &mut StringReader) -> String { - let val = read_to_eol(rdr); - assert!((val.as_slice()[0] == '/' as u8 && - val.as_slice()[1] == '/' as u8) || - (val.as_slice()[0] == '#' as u8 && - val.as_slice()[1] == '!' as u8)); - return val; -} - -fn consume_non_eol_whitespace(rdr: &mut StringReader) { - while is_whitespace(rdr.curr) && !rdr.curr_is('\n') && !is_eof(rdr) { - bump(rdr); - } -} - fn push_blank_line_comment(rdr: &StringReader, comments: &mut Vec) { debug!(">>> blank-line comment"); comments.push(Comment { @@ -177,11 +152,11 @@ fn push_blank_line_comment(rdr: &StringReader, comments: &mut Vec) { fn consume_whitespace_counting_blank_lines(rdr: &mut StringReader, comments: &mut Vec) { - while is_whitespace(rdr.curr) && !is_eof(rdr) { + while is_whitespace(rdr.curr) && !rdr.is_eof() { if rdr.col == CharPos(0u) && rdr.curr_is('\n') { push_blank_line_comment(rdr, &mut *comments); } - bump(rdr); + rdr.bump(); } } @@ -193,7 +168,7 @@ fn read_shebang_comment(rdr: &mut StringReader, code_to_the_left: bool, debug!("<<< shebang comment"); comments.push(Comment { style: if code_to_the_left { Trailing } else { Isolated }, - lines: vec!(read_one_line_comment(rdr)), + lines: vec!(rdr.read_one_line_comment()), pos: p }); } @@ -203,15 +178,15 @@ fn read_line_comments(rdr: &mut StringReader, code_to_the_left: bool, debug!(">>> line comments"); let p = rdr.last_pos; let mut lines: Vec = Vec::new(); - while rdr.curr_is('/') && nextch_is(rdr, '/') { - let line = read_one_line_comment(rdr); + while rdr.curr_is('/') && rdr.nextch_is('/') { + let line = rdr.read_one_line_comment(); debug!("{}", line); // Doc comments are not put in comments. 
if is_doc_comment(line.as_slice()) { break; } lines.push(line); - consume_non_eol_whitespace(rdr); + rdr.consume_non_eol_whitespace(); } debug!("<<< line comments"); if !lines.is_empty() { @@ -265,21 +240,21 @@ fn read_block_comment(rdr: &mut StringReader, let p = rdr.last_pos; let mut lines: Vec = Vec::new(); let col = rdr.col; - bump(rdr); - bump(rdr); + rdr.bump(); + rdr.bump(); let mut curr_line = String::from_str("/*"); // doc-comments are not really comments, they are attributes - if (rdr.curr_is('*') && !nextch_is(rdr, '*')) || rdr.curr_is('!') { - while !(rdr.curr_is('*') && nextch_is(rdr, '/')) && !is_eof(rdr) { + if (rdr.curr_is('*') && !rdr.nextch_is('*')) || rdr.curr_is('!') { + while !(rdr.curr_is('*') && rdr.nextch_is('/')) && !rdr.is_eof() { curr_line.push_char(rdr.curr.unwrap()); - bump(rdr); + rdr.bump(); } - if !is_eof(rdr) { + if !rdr.is_eof() { curr_line.push_str("*/"); - bump(rdr); - bump(rdr); + rdr.bump(); + rdr.bump(); } if !is_block_non_doc_comment(curr_line.as_slice()) { return @@ -290,7 +265,7 @@ fn read_block_comment(rdr: &mut StringReader, let mut level: int = 1; while level > 0 { debug!("=== block comment level {}", level); - if is_eof(rdr) { + if rdr.is_eof() { rdr.fatal("unterminated block comment"); } if rdr.curr_is('\n') { @@ -298,21 +273,21 @@ fn read_block_comment(rdr: &mut StringReader, curr_line, col); curr_line = String::new(); - bump(rdr); + rdr.bump(); } else { curr_line.push_char(rdr.curr.unwrap()); - if rdr.curr_is('/') && nextch_is(rdr, '*') { - bump(rdr); - bump(rdr); + if rdr.curr_is('/') && rdr.nextch_is('*') { + rdr.bump(); + rdr.bump(); curr_line.push_char('*'); level += 1; } else { - if rdr.curr_is('*') && nextch_is(rdr, '/') { - bump(rdr); - bump(rdr); + if rdr.curr_is('*') && rdr.nextch_is('/') { + rdr.bump(); + rdr.bump(); curr_line.push_char('/'); level -= 1; - } else { bump(rdr); } + } else { rdr.bump(); } } } } @@ -324,31 +299,24 @@ fn read_block_comment(rdr: &mut StringReader, } let mut style = if 
code_to_the_left { Trailing } else { Isolated }; - consume_non_eol_whitespace(rdr); - if !is_eof(rdr) && !rdr.curr_is('\n') && lines.len() == 1u { + rdr.consume_non_eol_whitespace(); + if !rdr.is_eof() && !rdr.curr_is('\n') && lines.len() == 1u { style = Mixed; } debug!("<<< block comment"); comments.push(Comment {style: style, lines: lines, pos: p}); } -fn peeking_at_comment(rdr: &StringReader) -> bool { - return (rdr.curr_is('/') && nextch_is(rdr, '/')) || - (rdr.curr_is('/') && nextch_is(rdr, '*')) || - // consider shebangs comments, but not inner attributes - (rdr.curr_is('#') && nextch_is(rdr, '!') && - !lexer::nextnextch_is(rdr, '[')); -} fn consume_comment(rdr: &mut StringReader, code_to_the_left: bool, comments: &mut Vec ) { debug!(">>> consume comment"); - if rdr.curr_is('/') && nextch_is(rdr, '/') { + if rdr.curr_is('/') && rdr.nextch_is('/') { read_line_comments(rdr, code_to_the_left, comments); - } else if rdr.curr_is('/') && nextch_is(rdr, '*') { + } else if rdr.curr_is('/') && rdr.nextch_is('*') { read_block_comment(rdr, code_to_the_left, comments); - } else if rdr.curr_is('#') && nextch_is(rdr, '!') { + } else if rdr.curr_is('#') && rdr.nextch_is('!') { read_shebang_comment(rdr, code_to_the_left, comments); } else { fail!(); } debug!("<<< consume comment"); @@ -362,8 +330,7 @@ pub struct Literal { // it appears this function is called only from pprust... that's // probably not a good thing. 
-pub fn gather_comments_and_literals(span_diagnostic: - &diagnostic::SpanHandler, +pub fn gather_comments_and_literals(span_diagnostic: &diagnostic::SpanHandler, path: String, srdr: &mut io::Reader) -> (Vec, Vec) { @@ -371,20 +338,20 @@ pub fn gather_comments_and_literals(span_diagnostic: let src = str::from_utf8(src.as_slice()).unwrap().to_string(); let cm = CodeMap::new(); let filemap = cm.new_filemap(path, src); - let mut rdr = lexer::new_low_level_string_reader(span_diagnostic, filemap); + let mut rdr = lexer::StringReader::new_raw(span_diagnostic, filemap); let mut comments: Vec = Vec::new(); let mut literals: Vec = Vec::new(); let mut first_read: bool = true; - while !is_eof(&rdr) { + while !rdr.is_eof() { loop { let mut code_to_the_left = !first_read; - consume_non_eol_whitespace(&mut rdr); + rdr.consume_non_eol_whitespace(); if rdr.curr_is('\n') { code_to_the_left = false; consume_whitespace_counting_blank_lines(&mut rdr, &mut comments); } - while peeking_at_comment(&rdr) { + while rdr.peeking_at_comment() { consume_comment(&mut rdr, code_to_the_left, &mut comments); consume_whitespace_counting_blank_lines(&mut rdr, &mut comments); } @@ -397,7 +364,7 @@ pub fn gather_comments_and_literals(span_diagnostic: //discard, and look ahead; we're working with internal state let TokenAndSpan {tok: tok, sp: sp} = rdr.peek(); if token::is_lit(&tok) { - with_str_from(&rdr, bstart, |s| { + rdr.with_str_from(bstart, |s| { debug!("tok lit: {}", s); literals.push(Literal {lit: s.to_string(), pos: sp.lo}); }) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs new file mode 100644 index 00000000000..bb23fe50bd9 --- /dev/null +++ b/src/libsyntax/parse/lexer/mod.rs @@ -0,0 +1,1153 @@ +// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use ast; +use codemap::{BytePos, CharPos, CodeMap, Pos, Span}; +use codemap; +use diagnostic::SpanHandler; +use ext::tt::transcribe::tt_next_token; +use parse::token; +use parse::token::{str_to_ident}; + +use std::char; +use std::mem::replace; +use std::num::from_str_radix; +use std::rc::Rc; +use std::str; + +pub use ext::tt::transcribe::{TtReader, new_tt_reader}; + +pub mod comments; + +pub trait Reader { + fn is_eof(&self) -> bool; + fn next_token(&mut self) -> TokenAndSpan; + /// Report a fatal error with the current span. + fn fatal(&self, &str) -> !; + /// Report a non-fatal error with the current span. + fn err(&self, &str); + fn peek(&self) -> TokenAndSpan; +} + +#[deriving(Clone, PartialEq, Eq, Show)] +pub struct TokenAndSpan { + pub tok: token::Token, + pub sp: Span, +} + +pub struct StringReader<'a> { + pub span_diagnostic: &'a SpanHandler, + // The absolute offset within the codemap of the next character to read + pub pos: BytePos, + // The absolute offset within the codemap of the last character read(curr) + pub last_pos: BytePos, + // The column of the next character to read + pub col: CharPos, + // The last character to be read + pub curr: Option, + pub filemap: Rc, + /* cached: */ + pub peek_tok: token::Token, + pub peek_span: Span, +} + +impl<'a> Reader for StringReader<'a> { + fn is_eof(&self) -> bool { self.curr.is_none() } + // return the next token. EFFECT: advances the string_reader. + fn next_token(&mut self) -> TokenAndSpan { + let ret_val = TokenAndSpan { + tok: replace(&mut self.peek_tok, token::UNDERSCORE), + sp: self.peek_span, + }; + self.advance_token(); + ret_val + } + fn fatal(&self, m: &str) -> ! 
{ + self.span_diagnostic.span_fatal(self.peek_span, m) + } + fn err(&self, m: &str) { + self.span_diagnostic.span_err(self.peek_span, m) + } + fn peek(&self) -> TokenAndSpan { + // FIXME(pcwalton): Bad copy! + TokenAndSpan { + tok: self.peek_tok.clone(), + sp: self.peek_span, + } + } +} + +impl<'a> Reader for TtReader<'a> { + fn is_eof(&self) -> bool { + self.cur_tok == token::EOF + } + fn next_token(&mut self) -> TokenAndSpan { + let r = tt_next_token(self); + debug!("TtReader: r={:?}", r); + r + } + fn fatal(&self, m: &str) -> ! { + self.sp_diag.span_fatal(self.cur_span, m); + } + fn err(&self, m: &str) { + self.sp_diag.span_err(self.cur_span, m); + } + fn peek(&self) -> TokenAndSpan { + TokenAndSpan { + tok: self.cur_tok.clone(), + sp: self.cur_span, + } + } +} + +impl<'a> StringReader<'a> { + /// For comments.rs, which hackily pokes into pos and curr + pub fn new_raw<'b>(span_diagnostic: &'b SpanHandler, + filemap: Rc) -> StringReader<'b> { + let mut sr = StringReader { + span_diagnostic: span_diagnostic, + pos: filemap.start_pos, + last_pos: filemap.start_pos, + col: CharPos(0), + curr: Some('\n'), + filemap: filemap, + /* dummy values; not read */ + peek_tok: token::EOF, + peek_span: codemap::DUMMY_SP, + }; + sr.bump(); + sr + } + + pub fn new<'b>(span_diagnostic: &'b SpanHandler, + filemap: Rc) -> StringReader<'b> { + let mut sr = StringReader::new_raw(span_diagnostic, filemap); + sr.advance_token(); + sr + } + + pub fn curr_is(&self, c: char) -> bool { + self.curr == Some(c) + } + + /// Report a lexical error spanning [`from_pos`, `to_pos`) + fn fatal_span(&mut self, from_pos: BytePos, to_pos: BytePos, m: &str) -> ! 
{ + self.peek_span = codemap::mk_sp(from_pos, to_pos); + self.fatal(m); + } + + fn err_span(&mut self, from_pos: BytePos, to_pos: BytePos, m: &str) { + self.peek_span = codemap::mk_sp(from_pos, to_pos); + self.err(m); + } + + /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an + /// escaped character to the error message + fn fatal_span_char(&mut self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> ! { + let mut m = m.to_string(); + m.push_str(": "); + char::escape_default(c, |c| m.push_char(c)); + self.fatal_span(from_pos, to_pos, m.as_slice()); + } + + /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an + /// escaped character to the error message + fn err_span_char(&mut self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) { + let mut m = m.to_string(); + m.push_str(": "); + char::escape_default(c, |c| m.push_char(c)); + self.err_span(from_pos, to_pos, m.as_slice()); + } + + /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the + /// offending string to the error message + fn fatal_span_verbose(&mut self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> ! { + m.push_str(": "); + let from = self.byte_offset(from_pos).to_uint(); + let to = self.byte_offset(to_pos).to_uint(); + m.push_str(self.filemap.src.as_slice().slice(from, to)); + self.fatal_span(from_pos, to_pos, m.as_slice()); + } + + /// Advance peek_tok and peek_span to refer to the next token, and + /// possibly update the interner. 
+ fn advance_token(&mut self) { + match self.consume_whitespace_and_comments() { + Some(comment) => { + self.peek_span = comment.sp; + self.peek_tok = comment.tok; + }, + None => { + if self.is_eof() { + self.peek_tok = token::EOF; + } else { + let start_bytepos = self.last_pos; + self.peek_tok = self.next_token_inner(); + self.peek_span = codemap::mk_sp(start_bytepos, + self.last_pos); + }; + } + } + } + + fn byte_offset(&self, pos: BytePos) -> BytePos { + (pos - self.filemap.start_pos) + } + + /// Calls `f` with a string slice of the source text spanning from `start` + /// up to but excluding `self.last_pos`, meaning the slice does not include + /// the character `self.curr`. + pub fn with_str_from(&self, start: BytePos, f: |s: &str| -> T) -> T { + self.with_str_from_to(start, self.last_pos, f) + } + + /// Calls `f` with a string slice of the source text spanning from `start` + /// up to but excluding `end`. + fn with_str_from_to(&self, start: BytePos, end: BytePos, f: |s: &str| -> T) -> T { + f(self.filemap.src.as_slice().slice( + self.byte_offset(start).to_uint(), + self.byte_offset(end).to_uint())) + } + + /// Advance the StringReader by one character. If a newline is + /// discovered, add it to the FileMap's list of line start offsets. 
+ pub fn bump(&mut self) { + self.last_pos = self.pos; + let current_byte_offset = self.byte_offset(self.pos).to_uint(); + if current_byte_offset < self.filemap.src.len() { + assert!(self.curr.is_some()); + let last_char = self.curr.unwrap(); + let next = self.filemap + .src + .as_slice() + .char_range_at(current_byte_offset); + let byte_offset_diff = next.next - current_byte_offset; + self.pos = self.pos + Pos::from_uint(byte_offset_diff); + self.curr = Some(next.ch); + self.col = self.col + CharPos(1u); + if last_char == '\n' { + self.filemap.next_line(self.last_pos); + self.col = CharPos(0u); + } + + if byte_offset_diff > 1 { + self.filemap.record_multibyte_char(self.last_pos, byte_offset_diff); + } + } else { + self.curr = None; + } + } + + pub fn nextch(&self) -> Option { + let offset = self.byte_offset(self.pos).to_uint(); + if offset < self.filemap.src.len() { + Some(self.filemap.src.as_slice().char_at(offset)) + } else { + None + } + } + + pub fn nextch_is(&self, c: char) -> bool { + self.nextch() == Some(c) + } + + pub fn nextnextch(&self) -> Option { + let offset = self.byte_offset(self.pos).to_uint(); + let s = self.filemap.deref().src.as_slice(); + if offset >= s.len() { return None } + let str::CharRange { next, .. } = s.char_range_at(offset); + if next < s.len() { + Some(s.char_at(next)) + } else { + None + } + } + + pub fn nextnextch_is(&self, c: char) -> bool { + self.nextnextch() == Some(c) + } + + /// PRECONDITION: self.curr is not whitespace + /// Eats any kind of comment. 
+ /// Returns a Some(sugared-doc-attr) if one exists, None otherwise + fn consume_any_line_comment(&mut self) -> Option { + match self.curr { + Some(c) => { + if c.is_whitespace() { + self.span_diagnostic.span_err(codemap::mk_sp(self.last_pos, self.last_pos), + "called consume_any_line_comment, but there was whitespace"); + } + }, + None => { } + } + + if self.curr_is('/') { + match self.nextch() { + Some('/') => { + self.bump(); + self.bump(); + // line comments starting with "///" or "//!" are doc-comments + if self.curr_is('/') || self.curr_is('!') { + let start_bpos = self.pos - BytePos(3); + while !self.curr_is('\n') && !self.is_eof() { + self.bump(); + } + let ret = self.with_str_from(start_bpos, |string| { + // but comments with only more "/"s are not + if !is_line_non_doc_comment(string) { + Some(TokenAndSpan{ + tok: token::DOC_COMMENT(str_to_ident(string)), + sp: codemap::mk_sp(start_bpos, self.pos) + }) + } else { + None + } + }); + + if ret.is_some() { + return ret; + } + } else { + while !self.curr_is('\n') && !self.is_eof() { self.bump(); } + } + // Restart whitespace munch. + self.consume_whitespace_and_comments() + } + Some('*') => { self.bump(); self.bump(); self.consume_block_comment() } + _ => None + } + } else if self.curr_is('#') { + if self.nextch_is('!') { + + // Parse an inner attribute. + if self.nextnextch_is('[') { + return None; + } + + // I guess this is the only way to figure out if + // we're at the beginning of the file... + let cmap = CodeMap::new(); + cmap.files.borrow_mut().push(self.filemap.clone()); + let loc = cmap.lookup_char_pos_adj(self.last_pos); + if loc.line == 1u && loc.col == CharPos(0u) { + while !self.curr_is('\n') && !self.is_eof() { self.bump(); } + return self.consume_whitespace_and_comments(); + } + } + None + } else { + None + } + } + + /// EFFECT: eats whitespace and comments. + /// Returns a Some(sugared-doc-attr) if one exists, None otherwise. 
+ fn consume_whitespace_and_comments(&mut self) -> Option { + while is_whitespace(self.curr) { self.bump(); } + return self.consume_any_line_comment(); + } + + // might return a sugared-doc-attr + fn consume_block_comment(&mut self) -> Option { + // block comments starting with "/**" or "/*!" are doc-comments + let is_doc_comment = self.curr_is('*') || self.curr_is('!'); + let start_bpos = self.pos - BytePos(if is_doc_comment {3} else {2}); + + let mut level: int = 1; + while level > 0 { + if self.is_eof() { + let msg = if is_doc_comment { + "unterminated block doc-comment" + } else { + "unterminated block comment" + }; + self.fatal_span(start_bpos, self.last_pos, msg); + } else if self.curr_is('/') && self.nextch_is('*') { + level += 1; + self.bump(); + self.bump(); + } else if self.curr_is('*') && self.nextch_is('/') { + level -= 1; + self.bump(); + self.bump(); + } else { + self.bump(); + } + } + + let res = if is_doc_comment { + self.with_str_from(start_bpos, |string| { + // but comments with only "*"s between two "/"s are not + if !is_block_non_doc_comment(string) { + Some(TokenAndSpan{ + tok: token::DOC_COMMENT(str_to_ident(string)), + sp: codemap::mk_sp(start_bpos, self.pos) + }) + } else { + None + } + }) + } else { + None + }; + + // restart whitespace munch. + if res.is_some() { res } else { self.consume_whitespace_and_comments() } + } + + fn scan_exponent(&mut self, start_bpos: BytePos) -> Option { + // \x00 hits the `return None` case immediately, so this is fine. 
+ let mut c = self.curr.unwrap_or('\x00'); + let mut rslt = String::new(); + if c == 'e' || c == 'E' { + rslt.push_char(c); + self.bump(); + c = self.curr.unwrap_or('\x00'); + if c == '-' || c == '+' { + rslt.push_char(c); + self.bump(); + } + let exponent = self.scan_digits(10u); + if exponent.len() > 0u { + rslt.push_str(exponent.as_slice()); + return Some(rslt); + } else { + self.err_span(start_bpos, self.last_pos, "scan_exponent: bad fp literal"); + rslt.push_str("1"); // arbitrary placeholder exponent + return Some(rslt); + } + } else { + return None::; + } + } + + fn scan_digits(&mut self, radix: uint) -> String { + let mut rslt = String::new(); + loop { + let c = self.curr; + if c == Some('_') { self.bump(); continue; } + match c.and_then(|cc| char::to_digit(cc, radix)) { + Some(_) => { + rslt.push_char(c.unwrap()); + self.bump(); + } + _ => return rslt + } + }; + } + + fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: uint) { + match base { + 16u => self.err_span(start_bpos, last_bpos, "hexadecimal float literal is not supported"), + 8u => self.err_span(start_bpos, last_bpos, "octal float literal is not supported"), + 2u => self.err_span(start_bpos, last_bpos, "binary float literal is not supported"), + _ => () + } + } + + fn scan_number(&mut self, c: char) -> token::Token { + let mut num_str; + let mut base = 10u; + let mut c = c; + let mut n = self.nextch().unwrap_or('\x00'); + let start_bpos = self.last_pos; + if c == '0' && n == 'x' { + self.bump(); + self.bump(); + base = 16u; + } else if c == '0' && n == 'o' { + self.bump(); + self.bump(); + base = 8u; + } else if c == '0' && n == 'b' { + self.bump(); + self.bump(); + base = 2u; + } + num_str = self.scan_digits(base); + c = self.curr.unwrap_or('\x00'); + self.nextch(); + if c == 'u' || c == 'i' { + enum Result { Signed(ast::IntTy), Unsigned(ast::UintTy) } + let signed = c == 'i'; + let mut tp = { + if signed { Signed(ast::TyI) } + else { Unsigned(ast::TyU) } + }; + 
self.bump(); + c = self.curr.unwrap_or('\x00'); + if c == '8' { + self.bump(); + tp = if signed { Signed(ast::TyI8) } + else { Unsigned(ast::TyU8) }; + } + n = self.nextch().unwrap_or('\x00'); + if c == '1' && n == '6' { + self.bump(); + self.bump(); + tp = if signed { Signed(ast::TyI16) } + else { Unsigned(ast::TyU16) }; + } else if c == '3' && n == '2' { + self.bump(); + self.bump(); + tp = if signed { Signed(ast::TyI32) } + else { Unsigned(ast::TyU32) }; + } else if c == '6' && n == '4' { + self.bump(); + self.bump(); + tp = if signed { Signed(ast::TyI64) } + else { Unsigned(ast::TyU64) }; + } + if num_str.len() == 0u { + self.err_span(start_bpos, self.last_pos, "no valid digits found for number"); + num_str = "1".to_string(); + } + let parsed = match from_str_radix::(num_str.as_slice(), + base as uint) { + Some(p) => p, + None => { + self.err_span(start_bpos, self.last_pos, "int literal is too large"); + 1 + } + }; + + match tp { + Signed(t) => return token::LIT_INT(parsed as i64, t), + Unsigned(t) => return token::LIT_UINT(parsed, t) + } + } + let mut is_float = false; + if self.curr_is('.') && !(ident_start(self.nextch()) || self.nextch_is('.')) { + is_float = true; + self.bump(); + let dec_part = self.scan_digits(10u); + num_str.push_char('.'); + num_str.push_str(dec_part.as_slice()); + } + match self.scan_exponent(start_bpos) { + Some(ref s) => { + is_float = true; + num_str.push_str(s.as_slice()); + } + None => () + } + + if self.curr_is('f') { + self.bump(); + c = self.curr.unwrap_or('\x00'); + n = self.nextch().unwrap_or('\x00'); + if c == '3' && n == '2' { + self.bump(); + self.bump(); + self.check_float_base(start_bpos, self.last_pos, base); + return token::LIT_FLOAT(str_to_ident(num_str.as_slice()), + ast::TyF32); + } else if c == '6' && n == '4' { + self.bump(); + self.bump(); + self.check_float_base(start_bpos, self.last_pos, base); + return token::LIT_FLOAT(str_to_ident(num_str.as_slice()), + ast::TyF64); + /* FIXME (#2252): if this is out of range 
for either a + 32-bit or 64-bit float, it won't be noticed till the + back-end. */ + } else if c == '1' && n == '2' && self.nextnextch().unwrap_or('\x00') == '8' { + self.bump(); + self.bump(); + self.bump(); + self.check_float_base(start_bpos, self.last_pos, base); + return token::LIT_FLOAT(str_to_ident(num_str.as_slice()), ast::TyF128); + } + self.err_span(start_bpos, self.last_pos, "expected `f32`, `f64` or `f128` suffix"); + } + if is_float { + self.check_float_base(start_bpos, self.last_pos, base); + return token::LIT_FLOAT_UNSUFFIXED(str_to_ident( + num_str.as_slice())); + } else { + if num_str.len() == 0u { + self.err_span(start_bpos, self.last_pos, "no valid digits found for number"); + num_str = "1".to_string(); + } + let parsed = match from_str_radix::(num_str.as_slice(), + base as uint) { + Some(p) => p, + None => { + self.err_span(start_bpos, self.last_pos, "int literal is too large"); + 1 + } + }; + + debug!("lexing {} as an unsuffixed integer literal", + num_str.as_slice()); + return token::LIT_INT_UNSUFFIXED(parsed as i64); + } + } + + + fn scan_numeric_escape(&mut self, n_hex_digits: uint, delim: char) -> char { + let mut accum_int = 0u32; + let start_bpos = self.last_pos; + for _ in range(0, n_hex_digits) { + if self.is_eof() { + self.fatal_span(start_bpos, self.last_pos, "unterminated numeric character escape"); + } + if self.curr_is(delim) { + self.err_span(start_bpos, self.last_pos, "numeric character escape is too short"); + break; + } + let c = self.curr.unwrap_or('\x00'); + accum_int *= 16; + accum_int += c.to_digit(16).unwrap_or_else(|| { + self.err_span_char(self.last_pos, self.pos, + "illegal character in numeric character escape", c); + 0 + }) as u32; + self.bump(); + } + + match char::from_u32(accum_int) { + Some(x) => x, + None => { + self.err_span(start_bpos, self.last_pos, "illegal numeric character escape"); + '?' 
+ } + } + } + + fn binop(&mut self, op: token::BinOp) -> token::Token { + self.bump(); + if self.curr_is('=') { + self.bump(); + return token::BINOPEQ(op); + } else { + return token::BINOP(op); + } + } + + /// Return the next token from the string, advances the input past that + /// token, and updates the interner + fn next_token_inner(&mut self) -> token::Token { + let c = self.curr; + if ident_start(c) && !self.nextch_is('"') && !self.nextch_is('#') { + // Note: r as in r" or r#" is part of a raw string literal, + // not an identifier, and is handled further down. + + let start = self.last_pos; + while ident_continue(self.curr) { + self.bump(); + } + + return self.with_str_from(start, |string| { + if string == "_" { + token::UNDERSCORE + } else { + let is_mod_name = self.curr_is(':') && self.nextch_is(':'); + + // FIXME: perform NFKC normalization here. (Issue #2253) + token::IDENT(str_to_ident(string), is_mod_name) + } + }) + } + + if is_dec_digit(c) { + return self.scan_number(c.unwrap()); + } + + match c.expect("next_token_inner called at EOF") { + // One-byte tokens. + ';' => { self.bump(); return token::SEMI; } + ',' => { self.bump(); return token::COMMA; } + '.' 
=> { + self.bump(); + return if self.curr_is('.') { + self.bump(); + if self.curr_is('.') { + self.bump(); + token::DOTDOTDOT + } else { + token::DOTDOT + } + } else { + token::DOT + }; + } + '(' => { self.bump(); return token::LPAREN; } + ')' => { self.bump(); return token::RPAREN; } + '{' => { self.bump(); return token::LBRACE; } + '}' => { self.bump(); return token::RBRACE; } + '[' => { self.bump(); return token::LBRACKET; } + ']' => { self.bump(); return token::RBRACKET; } + '@' => { self.bump(); return token::AT; } + '#' => { self.bump(); return token::POUND; } + '~' => { self.bump(); return token::TILDE; } + ':' => { + self.bump(); + if self.curr_is(':') { + self.bump(); + return token::MOD_SEP; + } else { + return token::COLON; + } + } + + '$' => { self.bump(); return token::DOLLAR; } + + // Multi-byte tokens. + '=' => { + self.bump(); + if self.curr_is('=') { + self.bump(); + return token::EQEQ; + } else if self.curr_is('>') { + self.bump(); + return token::FAT_ARROW; + } else { + return token::EQ; + } + } + '!' 
=> { + self.bump(); + if self.curr_is('=') { + self.bump(); + return token::NE; + } else { return token::NOT; } + } + '<' => { + self.bump(); + match self.curr.unwrap_or('\x00') { + '=' => { self.bump(); return token::LE; } + '<' => { return self.binop(token::SHL); } + '-' => { + self.bump(); + match self.curr.unwrap_or('\x00') { + _ => { return token::LARROW; } + } + } + _ => { return token::LT; } + } + } + '>' => { + self.bump(); + match self.curr.unwrap_or('\x00') { + '=' => { self.bump(); return token::GE; } + '>' => { return self.binop(token::SHR); } + _ => { return token::GT; } + } + } + '\'' => { + // Either a character constant 'a' OR a lifetime name 'abc + self.bump(); + let start = self.last_pos; + + // the eof will be picked up by the final `'` check below + let mut c2 = self.curr.unwrap_or('\x00'); + self.bump(); + + // If the character is an ident start not followed by another single + // quote, then this is a lifetime name: + if ident_start(Some(c2)) && !self.curr_is('\'') { + while ident_continue(self.curr) { + self.bump(); + } + let ident = self.with_str_from(start, |lifetime_name| { + str_to_ident(lifetime_name) + }); + let tok = &token::IDENT(ident, false); + + if token::is_keyword(token::keywords::Self, tok) { + self.err_span(start, self.last_pos, + "invalid lifetime name: 'self \ + is no longer a special lifetime"); + } else if token::is_any_keyword(tok) && + !token::is_keyword(token::keywords::Static, tok) { + self.err_span(start, self.last_pos, + "invalid lifetime name"); + } + return token::LIFETIME(ident); + } + + // Otherwise it is a character constant: + match c2 { + '\\' => { + // '\X' for some X must be a character constant: + let escaped = self.curr; + let escaped_pos = self.last_pos; + self.bump(); + match escaped { + None => {} + Some(e) => { + c2 = match e { + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '"' => '"', + '0' => '\x00', + 'x' => self.scan_numeric_escape(2u, '\''), + 'u' => 
self.scan_numeric_escape(4u, '\''), + 'U' => self.scan_numeric_escape(8u, '\''), + c2 => { + self.err_span_char(escaped_pos, self.last_pos, + "unknown character escape", c2); + c2 + } + } + } + } + } + '\t' | '\n' | '\r' | '\'' => { + self.err_span_char( start, self.last_pos, + "character constant must be escaped", c2); + } + _ => {} + } + if !self.curr_is('\'') { + self.fatal_span_verbose( + // Byte offsetting here is okay because the + // character before position `start` is an + // ascii single quote. + start - BytePos(1), self.last_pos, + "unterminated character constant".to_string()); + } + self.bump(); // advance curr past token + return token::LIT_CHAR(c2); + } + '"' => { + let mut accum_str = String::new(); + let start_bpos = self.last_pos; + self.bump(); + while !self.curr_is('"') { + if self.is_eof() { + self.fatal_span(start_bpos, self.last_pos, "unterminated double quote string"); + } + + let ch = self.curr.unwrap(); + self.bump(); + match ch { + '\\' => { + if self.is_eof() { + self.fatal_span(start_bpos, self.last_pos, + "unterminated double quote string"); + } + + let escaped = self.curr.unwrap(); + let escaped_pos = self.last_pos; + self.bump(); + match escaped { + 'n' => accum_str.push_char('\n'), + 'r' => accum_str.push_char('\r'), + 't' => accum_str.push_char('\t'), + '\\' => accum_str.push_char('\\'), + '\'' => accum_str.push_char('\''), + '"' => accum_str.push_char('"'), + '\n' => self.consume_whitespace(), + '0' => accum_str.push_char('\x00'), + 'x' => { + accum_str.push_char(self.scan_numeric_escape(2u, '"')); + } + 'u' => { + accum_str.push_char(self.scan_numeric_escape(4u, '"')); + } + 'U' => { + accum_str.push_char(self.scan_numeric_escape(8u, '"')); + } + c2 => { + self.err_span_char(escaped_pos, self.last_pos, + "unknown string escape", c2); + } + } + } + _ => accum_str.push_char(ch) + } + } + self.bump(); + return token::LIT_STR(str_to_ident(accum_str.as_slice())); + } + 'r' => { + let start_bpos = self.last_pos; + self.bump(); + let 
mut hash_count = 0u;
+            while self.curr_is('#') {
+                self.bump();
+                hash_count += 1;
+            }
+
+            if self.is_eof() {
+                self.fatal_span(start_bpos, self.last_pos, "unterminated raw string");
+            } else if !self.curr_is('"') {
+                self.fatal_span_char(start_bpos, self.last_pos,
+                                "only `#` is allowed in raw string delimitation; \
+                                 found illegal character",
+                                self.curr.unwrap());
+            }
+            self.bump();
+            let content_start_bpos = self.last_pos;
+            let mut content_end_bpos;
+            'outer: loop {
+                if self.is_eof() {
+                    self.fatal_span(start_bpos, self.last_pos, "unterminated raw string");
+                }
+                if self.curr_is('"') {
+                    content_end_bpos = self.last_pos;
+                    for _ in range(0, hash_count) {
+                        self.bump();
+                        if !self.curr_is('#') {
+                            continue 'outer;
+                        }
+                    }
+                    break;
+                }
+                self.bump();
+            }
+            self.bump();
+            let str_content = self.with_str_from_to(
+                                               content_start_bpos,
+                                               content_end_bpos,
+                                               str_to_ident);
+            return token::LIT_STR_RAW(str_content, hash_count);
+          }
+          '-' => {
+            if self.nextch_is('>') {
+                self.bump();
+                self.bump();
+                return token::RARROW;
+            } else { return self.binop(token::MINUS); }
+          }
+          '&' => {
+            if self.nextch_is('&') {
+                self.bump();
+                self.bump();
+                return token::ANDAND;
+            } else { return self.binop(token::AND); }
+          }
+          '|' => {
+            match self.nextch() {
+                Some('|') => { self.bump(); self.bump(); return token::OROR; }
+                _ => { return self.binop(token::OR); }
+            }
+          }
+          '+' => { return self.binop(token::PLUS); }
+          '*' => { return self.binop(token::STAR); }
+          '/' => { return self.binop(token::SLASH); }
+          '^' => { return self.binop(token::CARET); }
+          '%' => { return self.binop(token::PERCENT); }
+          c => {
+              self.fatal_span_char(self.last_pos, self.pos,
+                                   "unknown start of token", c);
+          }
+        }
+    }
+
+    /// Bumps past consecutive whitespace characters, stopping at the first
+    /// non-whitespace character or at end of input.
+    fn consume_whitespace(&mut self) {
+        while is_whitespace(self.curr) && !self.is_eof() { self.bump(); }
+    }
+
+    /// Collects every character up to (but not including) the next `'\n'`
+    /// or end of input. If the reader stopped on a `'\n'`, that newline is
+    /// stepped over but not added to the returned string.
+    fn read_to_eol(&mut self) -> String {
+        let mut val = String::new();
+        while !self.curr_is('\n') && !self.is_eof() {
+            val.push_char(self.curr.unwrap());
+            self.bump();
+        }
+        if self.curr_is('\n') { self.bump(); }
+        return val
+    }
+
+    /// Reads the remainder of the current line via `read_to_eol` and returns
+    /// it. Fails the assertion unless the line starts with `//` or `#!`.
+    fn read_one_line_comment(&mut self) -> String {
+        let val = self.read_to_eol();
+        assert!(val.as_slice().starts_with("//")
+                || val.as_slice().starts_with("#!"));
+        return val;
+    }
+
+    /// Like `consume_whitespace`, but also stops when the current character
+    /// is `'\n'`, leaving the newline unconsumed.
+    fn consume_non_eol_whitespace(&mut self) {
+        while is_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
+            self.bump();
+        }
+    }
+
+    /// Returns true when the current and following characters are `//` or
+    /// `/*`, or are `#!` not immediately followed by `[`.
+    fn peeking_at_comment(&self) -> bool {
+        (self.curr_is('/') && self.nextch_is('/'))
+        || (self.curr_is('/') && self.nextch_is('*'))
+        // consider shebangs comments, but not inner attributes
+        || (self.curr_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
+    }
+}
+
+/// Returns true if `c` is a space, newline, tab or carriage return.
+/// `None` is mapped to `'\x00'` and therefore yields false.
+pub fn is_whitespace(c: Option<char>) -> bool {
+    match c.unwrap_or('\x00') { // None can be null for now... it's not whitespace
+        ' ' | '\n' | '\t' | '\r' => true,
+        _ => false
+    }
+}
+
+/// Returns true if `c` is `Some` character within the inclusive range
+/// `lo..hi`; `None` yields false.
+fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
+    match c {
+        Some(c) => lo <= c && c <= hi,
+        _ => false
+    }
+}
+
+/// Returns true if `c` is an ASCII decimal digit (`'0'` through `'9'`).
+fn is_dec_digit(c: Option<char>) -> bool { return in_range(c, '0', '9'); }
+
+/// Returns true if `s` starts with four slashes (`////`).
+pub fn is_line_non_doc_comment(s: &str) -> bool {
+    s.starts_with("////")
+}
+
+/// Returns true if `s` starts with `/***`.
+pub fn is_block_non_doc_comment(s: &str) -> bool {
+    s.starts_with("/***")
+}
+
+/// Returns true if `c` may begin an identifier: an ASCII letter, `_`, or a
+/// non-ASCII character accepted by `char::is_XID_start`. `None` yields false.
+fn ident_start(c: Option<char>) -> bool {
+    let c = match c { Some(c) => c, None => return false };
+
+    (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z')
+        || c == '_'
+        || (c > '\x7f' && char::is_XID_start(c))
+}
+
+/// Returns true if `c` may continue an identifier: an ASCII letter or digit,
+/// `_`, or a non-ASCII character accepted by `char::is_XID_continue`.
+/// `None` yields false.
+fn ident_continue(c: Option<char>) -> bool {
+    let c = match c { Some(c) => c, None => return false };
+
+    (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z')
+        || (c >= '0' && c <= '9')
+        || c == '_'
+        || (c > '\x7f' && char::is_XID_continue(c))
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    use codemap::{BytePos, CodeMap, Span};
+    use diagnostic;
+    use parse::token;
+    use parse::token::{str_to_ident};
+    use std::io::util;
+
+    // build a span handler whose emitter writes to a NullWriter
+    fn mk_sh() -> diagnostic::SpanHandler {
+        let emitter = diagnostic::EmitterWriter::new(box util::NullWriter);
+        let handler = diagnostic::mk_handler(box emitter);
+        diagnostic::mk_span_handler(handler, CodeMap::new())
+    }
+
+    // open a string reader for the given string
+    fn setup<'a>(span_handler: &'a diagnostic::SpanHandler,
+                 teststr: String) -> StringReader<'a> {
+        let fm = span_handler.cm.new_filemap("zebra.rs".to_string(), teststr);
+        StringReader::new(span_handler, fm)
+    }
+
+    // the leading block comment is skipped; check the spans of the first
+    // two identifier tokens and the reader position after each
+    #[test] fn t1 () {
+        let span_handler = mk_sh();
+        let mut string_reader = setup(&span_handler,
+            "/* my source file */ \
+             fn main() { println!(\"zebra\"); }\n".to_string());
+        let id = str_to_ident("fn");
+        let tok1 = string_reader.next_token();
+        let tok2 = TokenAndSpan{
+            tok:token::IDENT(id, false),
+            sp:Span {lo:BytePos(21),hi:BytePos(23),expn_info: None}};
+        assert_eq!(tok1,tok2);
+        // the 'main' id is already read:
+        assert_eq!(string_reader.last_pos.clone(), BytePos(28));
+        // read another token:
+        let tok3 = string_reader.next_token();
+        let tok4 = TokenAndSpan{
+            tok:token::IDENT(str_to_ident("main"), false),
+            sp:Span {lo:BytePos(24),hi:BytePos(28),expn_info: None}};
+        assert_eq!(tok3,tok4);
+        // the lparen is already read:
+        assert_eq!(string_reader.last_pos.clone(), BytePos(29))
+    }
+
+    // check that the given reader produces the desired stream
+    // of tokens (stop checking after exhausting the expected vec)
+    fn check_tokenization (mut string_reader: StringReader, expected: Vec<token::Token> ) {
+        for expected_tok in expected.iter() {
+            assert_eq!(&string_reader.next_token().tok, expected_tok);
+        }
+    }
+
+    // make the identifier by looking up the string in the interner
+    fn mk_ident (id: &str, is_mod_name: bool) -> token::Token {
+        token::IDENT (str_to_ident(id),is_mod_name)
+    }
+
+    // two identifiers separated by a space: neither is flagged as a module name
+    #[test] fn doublecolonparsing () {
+        check_tokenization(setup(&mk_sh(), "a b".to_string()),
+                           vec!(mk_ident("a",false),
+                                mk_ident("b",false)));
+    }
+
+    // `a::b`: the identifier directly followed by `::` is flagged as a module name
+    #[test] fn dcparsing_2 () {
+        check_tokenization(setup(&mk_sh(), "a::b".to_string()),
+                           vec!(mk_ident("a",true),
+                                token::MOD_SEP,
+                                mk_ident("b",false)));
+    }
+
+    // `a ::b`: whitespace before `::` means `a` is not flagged as a module name
+    #[test] fn dcparsing_3 () {
+        check_tokenization(setup(&mk_sh(), "a ::b".to_string()),
+                           vec!(mk_ident("a",false),
+                                token::MOD_SEP,
+                                mk_ident("b",false)));
+    }
+
+    // `a:: b`: whitespace after `::` does not affect the flag on `a`
+    #[test] fn dcparsing_4 () {
+        check_tokenization(setup(&mk_sh(), "a:: b".to_string()),
+                           vec!(mk_ident("a",true),
+                                token::MOD_SEP,
+                                mk_ident("b",false)));
+    }
+
+    #[test] fn character_a() {
+        assert_eq!(setup(&mk_sh(), "'a'".to_string()).next_token().tok,
+                   token::LIT_CHAR('a'));
+    }
+
+    #[test] fn character_space() {
+        assert_eq!(setup(&mk_sh(), "' '".to_string()).next_token().tok,
+                   token::LIT_CHAR(' '));
+    }
+
+    #[test] fn character_escaped() {
+        assert_eq!(setup(&mk_sh(), "'\\n'".to_string()).next_token().tok,
+                   token::LIT_CHAR('\n'));
+    }
+
+    #[test] fn lifetime_name() {
+        assert_eq!(setup(&mk_sh(), "'abc".to_string()).next_token().tok,
+                   token::LIFETIME(token::str_to_ident("abc")));
+    }
+
+    // a raw string delimited by three hashes keeps embedded quotes, hashes,
+    // backslashes and NUL unchanged
+    #[test] fn raw_string() {
+        assert_eq!(setup(&mk_sh(),
+                         "r###\"\"#a\\b\x00c\"\"###".to_string()).next_token()
+                                                                 .tok,
+                   token::LIT_STR_RAW(token::str_to_ident("\"#a\\b\x00c\""), 3));
+    }
+
+    #[test] fn line_doc_comments() {
+        assert!(!is_line_non_doc_comment("///"));
+        assert!(!is_line_non_doc_comment("/// blah"));
+        assert!(is_line_non_doc_comment("////"));
+    }
+
+    // the nested block comment is skipped entirely before the char literal
+    #[test] fn nested_block_comments() {
+        assert_eq!(setup(&mk_sh(),
+                         "/* /* */ */'a'".to_string()).next_token().tok,
+                   token::LIT_CHAR('a'));
+    }
+
+}
diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs index 3132f91c09b..2231b7a78e1 100644 --- a/src/libsyntax/parse/mod.rs +++ b/src/libsyntax/parse/mod.rs @@ -25,7 +25,6 @@ use std::str; pub mod lexer; pub mod parser; pub mod token; -pub mod comments; pub mod attr; pub mod common; @@ -255,7 +254,7 @@ pub fn filemap_to_tts(sess: &ParseSess, filemap: Rc) // it appears to me that the cfg doesn't matter here... indeed, // parsing tt's probably shouldn't require a parser at all.
let cfg = Vec::new(); - let srdr = lexer::new_string_reader(&sess.span_diagnostic, filemap); + let srdr = lexer::StringReader::new(&sess.span_diagnostic, filemap); let mut p1 = Parser::new(sess, cfg, box srdr); p1.parse_all_token_trees() } diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs index 17a90ad1325..42319eeb371 100644 --- a/src/libsyntax/parse/token.rs +++ b/src/libsyntax/parse/token.rs @@ -166,7 +166,7 @@ pub fn to_str(t: &Token) -> String { ANDAND => "&&".to_string(), BINOP(op) => binop_to_str(op).to_string(), BINOPEQ(op) => { - let mut s = binop_to_str(op).to_strbuf(); + let mut s = binop_to_str(op).to_string(); s.push_str("="); s } diff --git a/src/libsyntax/print/pprust.rs b/src/libsyntax/print/pprust.rs index 90436faca23..440070e70a6 100644 --- a/src/libsyntax/print/pprust.rs +++ b/src/libsyntax/print/pprust.rs @@ -20,7 +20,8 @@ use codemap; use diagnostic; use parse::classify::expr_is_simple_block; use parse::token::IdentInterner; -use parse::{comments, token}; +use parse::token; +use parse::lexer::comments; use parse; use print::pp::{break_offset, word, space, zerobreak, hardbreak}; use print::pp::{Breaks, Consistent, Inconsistent, eof};