diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs
index 789239c3ab0..7ccf51a4629 100644
--- a/src/librustdoc/html/highlight.rs
+++ b/src/librustdoc/html/highlight.rs
@@ -1,4 +1,4 @@
-// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// Copyright 2014-2016 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
@@ -8,16 +8,26 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-//! Basic html highlighting functionality
+//! Basic syntax highlighting functionality.
 //!
 //! This module uses libsyntax's lexer to provide token-based highlighting for
 //! the HTML documentation generated by rustdoc.
+//!
+//! If you just want syntax highlighting for a Rust program, then you can use
+//! the `render_inner_with_highlighting` or `render_with_highlighting`
+//! functions. For more advanced use cases (if you want to supply your own css
+//! classes or control how the HTML is generated, or even generate something
+//! other than HTML), then you should implement the `Writer` trait and use a
+//! `Classifier`.
 
 use html::escape::Escape;
 
+use std::fmt::Display;
 use std::io;
 use std::io::prelude::*;
-use syntax::parse::lexer::{self, Reader};
+
+use syntax::codemap::{CodeMap, Span};
+use syntax::parse::lexer::{self, Reader, TokenAndSpan};
 use syntax::parse::token;
 use syntax::parse;
 
@@ -29,11 +39,13 @@ pub fn render_with_highlighting(src: &str, class: Option<&str>, id: Option<&str>
 
     let mut out = Vec::new();
     write_header(class, id, &mut out).unwrap();
-    if let Err(_) = write_source(&sess,
-                                 lexer::StringReader::new(&sess.span_diagnostic, fm),
-                                 &mut out) {
-        return format!("<pre>{}</pre>", src)
+
+    let mut classifier = Classifier::new(lexer::StringReader::new(&sess.span_diagnostic, fm),
+                                         sess.codemap());
+    if let Err(_) = classifier.write_source(&mut out) {
+        return format!("<pre>{}</pre>", src);
     }
+
     write_footer(&mut out).unwrap();
     String::from_utf8_lossy(&out[..]).into_owned()
 }
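The public entry point keeps its string-in, string-out contract; only the plumbing behind it changes. As a rough illustration (not part of the patch), a caller elsewhere in rustdoc could use it like this — the snippet and the expected span text are assumptions based on the default `Writer` implementation further down:

```rust
// Hedged sketch of a call site. `Some("rust")` supplies the CSS class for
// the enclosing <pre> block written by write_header; `None` omits the id.
let html = render_with_highlighting("let x = 5;", Some("rust"), None);

// With the default Writer (below), keywords come out as
// <span class='kw'>...</span>; on a lexer error the function instead falls
// back to the raw source wrapped in plain <pre> tags.
assert!(html.contains("<span class='kw'>let</span>"));
```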
", src); } + write_footer(&mut out).unwrap(); String::from_utf8_lossy(&out[..]).into_owned() } @@ -46,84 +58,187 @@ pub fn render_inner_with_highlighting(src: &str) -> io::Result { let fm = sess.codemap().new_filemap("".to_string(), src.to_string()); let mut out = Vec::new(); - write_source(&sess, - lexer::StringReader::new(&sess.span_diagnostic, fm), - &mut out)?; - Ok(String::from_utf8_lossy(&out[..]).into_owned()) + let mut classifier = Classifier::new(lexer::StringReader::new(&sess.span_diagnostic, fm), + sess.codemap()); + classifier.write_source(&mut out)?; + + Ok(String::from_utf8_lossy(&out).into_owned()) } -/// Exhausts the `lexer` writing the output into `out`. +/// Processes a program (nested in the internal `lexer`), classifying strings of +/// text by highlighting category (`Class`). Calls out to a `Writer` to write +/// each span of text in sequence. +pub struct Classifier<'a> { + lexer: lexer::StringReader<'a>, + codemap: &'a CodeMap, + + // State of the classifier. + in_attribute: bool, + in_macro: bool, + in_macro_nonterminal: bool, +} + +/// How a span of text is classified. Mostly corresponds to token kinds. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Class { + None, + Comment, + DocComment, + Attribute, + KeyWord, + // Keywords that do pointer/reference stuff. + RefKeyWord, + Self_, + Op, + Macro, + MacroNonTerminal, + String, + Number, + Bool, + Ident, + Lifetime, + PreludeTy, + PreludeVal, +} + +/// Trait that controls writing the output of syntax highlighting. Users should +/// implement this trait to customise writing output. /// -/// The general structure for this method is to iterate over each token, -/// possibly giving it an HTML span with a class specifying what flavor of token -/// it's used. All source code emission is done as slices from the source map, -/// not from the tokens themselves, in order to stay true to the original -/// source. -fn write_source(sess: &parse::ParseSess, - mut lexer: lexer::StringReader, - out: &mut Write) - -> io::Result<()> { - let mut is_attribute = false; - let mut is_macro = false; - let mut is_macro_nonterminal = false; - loop { - let next = match lexer.try_next_token() { - Ok(tok) => tok, - Err(_) => { - lexer.emit_fatal_errors(); - lexer.span_diagnostic.struct_warn("Backing out of syntax highlighting") - .note("You probably did not intend to render this \ - as a rust code-block") - .emit(); - return Err(io::Error::new(io::ErrorKind::Other, "")) - }, - }; +/// The classifier will call into the `Writer` implementation as it finds spans +/// of text to highlight. Exactly how that text should be highlighted is up to +/// the implemention. +pub trait Writer { + /// Called when we start processing a span of text that should be highlighted. + /// The `Class` argument specifies how it should be highlighted. + fn enter_span(&mut self, Class) -> io::Result<()>; - let snip = |sp| sess.codemap().span_to_snippet(sp).unwrap(); + /// Called at the end of a span of highlighted text. + fn exit_span(&mut self) -> io::Result<()>; - if next.tok == token::Eof { break } + /// Called for a span of text, usually, but not always, a single token. If + /// the string of text (`T`) does correspond to a token, then the token will + /// also be passed. If the text should be highlighted differently from the + /// surrounding text, then the `Class` argument will be a value other than + /// `None`. 
+
+impl<'a> Classifier<'a> {
+    pub fn new(lexer: lexer::StringReader<'a>, codemap: &'a CodeMap) -> Classifier<'a> {
+        Classifier {
+            lexer: lexer,
+            codemap: codemap,
+            in_attribute: false,
+            in_macro: false,
+            in_macro_nonterminal: false,
+        }
+    }
+
+    /// Exhausts the `lexer` writing the output into `out`.
+    ///
+    /// The general structure for this method is to iterate over each token,
+    /// possibly giving it an HTML span with a class specifying what flavor of token
+    /// is used. All source code emission is done as slices from the source map,
+    /// not from the tokens themselves, in order to stay true to the original
+    /// source.
+    pub fn write_source<W: Writer>(&mut self,
+                                   out: &mut W)
+                                   -> io::Result<()> {
+        loop {
+            let next = match self.lexer.try_next_token() {
+                Ok(tas) => tas,
+                Err(_) => {
+                    self.lexer.emit_fatal_errors();
+                    self.lexer.span_diagnostic.struct_warn("Backing out of syntax highlighting")
+                                              .note("You probably did not intend to render this \
+                                                     as a rust code-block")
+                                              .emit();
+                    return Err(io::Error::new(io::ErrorKind::Other, ""));
+                }
+            };
+
+            if next.tok == token::Eof {
+                break;
+            }
+
+            self.write_token(out, next)?;
+        }
+
+        Ok(())
+    }
+
+    // Handles an individual token from the lexer.
+    fn write_token<W: Writer>(&mut self,
+                              out: &mut W,
+                              tas: TokenAndSpan)
+                              -> io::Result<()> {
+        let klass = match tas.tok {
             token::Shebang(s) => {
-                write!(out, "{}", Escape(&s.as_str()))?;
-                continue
+                out.string(Escape(&s.as_str()), Class::None, Some(&tas))?;
+                return Ok(());
             },
+
+            token::Whitespace => Class::None,
+            token::Comment => Class::Comment,
+            token::DocComment(..) => Class::DocComment,
+
             // If this '&' token is directly adjacent to another token, assume
             // that it's the address-of operator instead of the and-operator.
-            // This allows us to give all pointers their own class (`Box` and
-            // `@` are below).
-            token::BinOp(token::And) if lexer.peek().sp.lo == next.sp.hi => "kw-2",
-            token::At | token::Tilde => "kw-2",
+            token::BinOp(token::And) if self.lexer.peek().sp.lo == tas.sp.hi => Class::RefKeyWord,
 
-            // consider this as part of a macro invocation if there was a
-            // leading identifier
-            token::Not if is_macro => { is_macro = false; "macro" }
+            // Consider this as part of a macro invocation if there was a
+            // leading identifier.
+            token::Not if self.in_macro => {
+                self.in_macro = false;
+                Class::Macro
+            }
 
-            // operators
+            // Operators.
             token::Eq | token::Lt | token::Le | token::EqEq | token::Ne | token::Ge | token::Gt |
             token::AndAnd | token::OrOr | token::Not | token::BinOp(..) | token::RArrow |
-            token::BinOpEq(..) | token::FatArrow => "op",
+            token::BinOpEq(..) | token::FatArrow => Class::Op,
 
-            // miscellaneous, no highlighting
+            // Miscellaneous, no highlighting.
             token::Dot | token::DotDot | token::DotDotDot | token::Comma | token::Semi |
             token::Colon | token::ModSep | token::LArrow | token::OpenDelim(_) |
             token::CloseDelim(token::Brace) | token::CloseDelim(token::Paren) |
-            token::Question => "",
+            token::Question => Class::None,
             token::Dollar => {
-                if lexer.peek().tok.is_ident() {
-                    is_macro_nonterminal = true;
-                    "macro-nonterminal"
+                if self.lexer.peek().tok.is_ident() {
+                    self.in_macro_nonterminal = true;
+                    Class::MacroNonTerminal
                 } else {
-                    ""
+                    Class::None
                 }
             }
 
@@ -132,78 +247,103 @@ fn write_source(sess: &parse::ParseSess,
            // seen, so skip out early. Down below we terminate the attribute
            // span when we see the ']'.
             token::Pound => {
-                is_attribute = true;
-                write!(out, r"<span class='attribute'>#")?;
-                continue
+                self.in_attribute = true;
+                out.enter_span(Class::Attribute)?;
+                out.string("#", Class::None, None)?;
+                return Ok(());
             }
             token::CloseDelim(token::Bracket) => {
-                if is_attribute {
-                    is_attribute = false;
-                    write!(out, "]</span>")?;
-                    continue
+                if self.in_attribute {
+                    self.in_attribute = false;
+                    out.string("]", Class::None, None)?;
+                    out.exit_span()?;
+                    return Ok(());
                 } else {
-                    ""
+                    Class::None
                 }
             }
 
             token::Literal(lit, _suf) => {
                 match lit {
-                    // text literals
+                    // Text literals.
                     token::Byte(..) | token::Char(..) |
                     token::ByteStr(..) | token::ByteStrRaw(..) |
-                    token::Str_(..) | token::StrRaw(..) => "string",
+                    token::Str_(..) | token::StrRaw(..) => Class::String,
 
-                    // number literals
-                    token::Integer(..) | token::Float(..) => "number",
+                    // Number literals.
+                    token::Integer(..) | token::Float(..) => Class::Number,
                 }
             }
 
-            // keywords are also included in the identifier set
+            // Keywords are also included in the identifier set.
             token::Ident(ident) => {
                 match &*ident.name.as_str() {
-                    "ref" | "mut" => "kw-2",
+                    "ref" | "mut" => Class::RefKeyWord,
 
-                    "self" => "self",
-                    "false" | "true" => "boolval",
+                    "self" | "Self" => Class::Self_,
+                    "false" | "true" => Class::Bool,
 
-                    "Option" | "Result" => "prelude-ty",
-                    "Some" | "None" | "Ok" | "Err" => "prelude-val",
+                    "Option" | "Result" => Class::PreludeTy,
+                    "Some" | "None" | "Ok" | "Err" => Class::PreludeVal,
 
-                    _ if next.tok.is_any_keyword() => "kw",
+                    _ if tas.tok.is_any_keyword() => Class::KeyWord,
                     _ => {
-                        if is_macro_nonterminal {
-                            is_macro_nonterminal = false;
-                            "macro-nonterminal"
-                        } else if lexer.peek().tok == token::Not {
-                            is_macro = true;
-                            "macro"
+                        if self.in_macro_nonterminal {
+                            self.in_macro_nonterminal = false;
+                            Class::MacroNonTerminal
+                        } else if self.lexer.peek().tok == token::Not {
+                            self.in_macro = true;
+                            Class::Macro
                         } else {
-                            "ident"
+                            Class::Ident
                         }
                     }
                 }
            }
 
-            // Special macro vars are like keywords
-            token::SpecialVarNt(_) => "kw-2",
+            // Special macro vars are like keywords.
+            token::SpecialVarNt(_) => Class::KeyWord,
+
+            token::Lifetime(..) => Class::Lifetime,
 
-            token::Lifetime(..) => "lifetime",
-            token::DocComment(..) => "doccomment",
             token::Underscore | token::Eof | token::Interpolated(..) |
-            token::MatchNt(..) | token::SubstNt(..) => "",
+            token::MatchNt(..) | token::SubstNt(..) |
+            token::Tilde | token::At => Class::None,
         };
 
-        // as mentioned above, use the original source code instead of
-        // stringifying this token
-        let snip = sess.codemap().span_to_snippet(next.sp).unwrap();
-        if klass == "" {
-            write!(out, "{}", Escape(&snip))?;
-        } else {
-            write!(out, "<span class='{}'>{}</span>", klass, Escape(&snip))?;
-        }
+        // Anything that didn't return above is the simple case where the
+        // class just spans a single token, so we can use the `string` method.
+        out.string(Escape(&self.snip(tas.sp)), klass, Some(&tas))
     }
 
-    Ok(())
+    // Helper function to get a snippet from the codemap.
+    fn snip(&self, sp: Span) -> String {
+        self.codemap.span_to_snippet(sp).unwrap()
+    }
+}
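The attribute arms above are the one place the classifier drives a span across multiple tokens: `#` opens an `Attribute` span and the matching `]` closes it. Under that reading, classifying `#[inline]` should produce roughly the following callback sequence — a sketch in the same notation as the `Writer` docs above, not output captured from the patch:

```plain
enter_span(Attribute)              // token::Pound
string("#", None, None)
string("[", None, Some(tas))       // OpenDelim(Bracket) classified as None
string("inline", Ident, Some(tas))
string("]", None, None)            // CloseDelim(Bracket) while in_attribute
exit_span()
```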
 
+impl Class {
+    /// Returns the CSS class expected by rustdoc for each `Class`.
+    pub fn rustdoc_class(self) -> &'static str {
+        match self {
+            Class::None => "",
+            Class::Comment => "comment",
+            Class::DocComment => "doccomment",
+            Class::Attribute => "attribute",
+            Class::KeyWord => "kw",
+            Class::RefKeyWord => "kw-2",
+            Class::Self_ => "self",
+            Class::Op => "op",
+            Class::Macro => "macro",
+            Class::MacroNonTerminal => "macro-nonterminal",
+            Class::String => "string",
+            Class::Number => "number",
+            Class::Bool => "boolvalue",
+            Class::Ident => "ident",
+            Class::Lifetime => "lifetime",
+            Class::PreludeTy => "prelude-ty",
+            Class::PreludeVal => "prelude-val",
+        }
+    }
+}
 
 fn write_header(class: Option<&str>,
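Finally, wiring a custom writer to the classifier mirrors what `render_inner_with_highlighting` does above. A hedged sketch reusing the hypothetical `TokenCounter` from earlier; `count_classes` is likewise an invented helper, not rustdoc API:

```rust
fn count_classes(src: &str) -> io::Result<HashMap<&'static str, usize>> {
    let sess = parse::ParseSess::new();
    let fm = sess.codemap().new_filemap("<stdin>".to_string(), src.to_string());

    let mut counter = TokenCounter { counts: HashMap::new() };
    let mut classifier = Classifier::new(lexer::StringReader::new(&sess.span_diagnostic, fm),
                                         sess.codemap());
    // Any Writer works here; lexer errors propagate exactly as they do in
    // the two render functions.
    classifier.write_source(&mut counter)?;

    Ok(counter.counts)
}
```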