2011-09-16 17:47:21 +02:00
|
|
|
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package csv reads and writes comma-separated values (CSV) files.
//
// A csv file contains zero or more records of one or more fields per record.
// Each record is separated by the newline character. The final record may
// optionally be followed by a newline character.
//
//	field1,field2,field3
//
// White space is considered part of a field.
//
// Carriage returns before newline characters are silently removed.
//
// Blank lines are ignored. A line with only whitespace characters (excluding
// the ending newline character) is not considered a blank line.
//
// Fields which start and stop with the quote character " are called
// quoted-fields. The beginning and ending quote are not part of the
// field.
//
// The source:
//
//	normal string,"quoted-field"
//
// results in the fields
//
//	{`normal string`, `quoted-field`}
//
// Within a quoted-field a quote character followed by a second quote
// character is considered a single quote.
//
//	"the ""word"" is true","a ""quoted-field"""
//
// results in
//
//	{`the "word" is true`, `a "quoted-field"`}
//
// Newlines and commas may be included in a quoted-field:
//
//	"Multi-line
//	field","comma is ,"
//
// results in
//
//	{`Multi-line
//	field`, `comma is ,`}
|
|
|
|
package csv
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"bytes"
|
2011-12-03 03:17:34 +01:00
|
|
|
"errors"
|
2011-09-16 17:47:21 +02:00
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"unicode"
|
|
|
|
)
|
|
|
|
|
|
|
|
// A ParseError is returned for parsing errors.
// The first line is 1. The first column is 0.
type ParseError struct {
	Line   int   // Line where the error occurred
	Column int   // Column (rune index) where the error occurred
	Err    error // The actual error
}

// Error reports the position of the failure followed by the text of the
// underlying error, in the form "line L, column C: cause".
func (e *ParseError) Error() string {
	return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
}

// Unwrap returns the underlying error stored in Err, so that errors.Is
// and errors.As can see through a *ParseError to the cause it carries.
func (e *ParseError) Unwrap() error {
	return e.Err
}
|
|
|
|
|
|
|
|
// These are the errors that can be found in the Err field of a ParseError.
var (
	// ErrTrailingComma reports a delimiter followed directly by the end
	// of the line (or of the input) while TrailingComma is false.
	ErrTrailingComma = errors.New("extra delimiter at end of line")

	// ErrBareQuote reports a quote character inside an unquoted field
	// while LazyQuotes is false.
	ErrBareQuote = errors.New("bare \" in non-quoted-field")

	// ErrQuote reports an unterminated quoted field or a stray,
	// non-doubled quote inside a quoted field while LazyQuotes is false.
	ErrQuote = errors.New("extraneous \" in field")

	// ErrFieldCount reports a record whose number of fields differs from
	// FieldsPerRecord.
	ErrFieldCount = errors.New("wrong number of fields in line")
)
|
|
|
|
|
|
|
|
// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
type Reader struct {
	// Comma is the field delimiter. NewReader sets it to ','.
	Comma rune

	// Comment, if not 0, is the comment character. Lines beginning with
	// the Comment character are ignored.
	Comment rune

	// FieldsPerRecord is the number of expected fields per record.
	// If it is positive, Read requires each record to have the given
	// number of fields. If it is 0, Read sets it to the number of fields
	// in the first record, so that future records must have the same
	// field count. If it is negative, no check is made.
	FieldsPerRecord int

	// LazyQuotes, if true, allows a quote to appear in an unquoted field
	// and a non-doubled quote to appear in a quoted field.
	LazyQuotes bool

	// TrailingComma, if true, allows the last field of a record to be an
	// unquoted empty field.
	TrailingComma bool

	// TrimLeadingSpace, if true, causes leading white space in a field
	// to be ignored.
	TrimLeadingSpace bool

	line   int           // line of the record being parsed; the first line is 1
	column int           // rune index, within the line, of the rune last read
	r      *bufio.Reader // buffered view of the underlying input
	field  bytes.Buffer  // text of the field currently being parsed
}
|
|
|
|
|
|
|
|
// NewReader returns a new Reader that reads from r.
|
|
|
|
func NewReader(r io.Reader) *Reader {
|
|
|
|
return &Reader{
|
|
|
|
Comma: ',',
|
|
|
|
r: bufio.NewReader(r),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// error creates a new ParseError based on err.
|
2011-12-03 03:17:34 +01:00
|
|
|
func (r *Reader) error(err error) error {
|
2011-09-16 17:47:21 +02:00
|
|
|
return &ParseError{
|
|
|
|
Line: r.line,
|
|
|
|
Column: r.column,
|
2011-12-03 03:17:34 +01:00
|
|
|
Err: err,
|
2011-09-16 17:47:21 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Read reads one record from r. The record is a slice of strings with each
|
|
|
|
// string representing one field.
|
2011-12-03 03:17:34 +01:00
|
|
|
func (r *Reader) Read() (record []string, err error) {
|
2011-09-16 17:47:21 +02:00
|
|
|
for {
|
|
|
|
record, err = r.parseRecord()
|
|
|
|
if record != nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if r.FieldsPerRecord > 0 {
|
|
|
|
if len(record) != r.FieldsPerRecord {
|
|
|
|
r.column = 0 // report at start of record
|
|
|
|
return record, r.error(ErrFieldCount)
|
|
|
|
}
|
|
|
|
} else if r.FieldsPerRecord == 0 {
|
|
|
|
r.FieldsPerRecord = len(record)
|
|
|
|
}
|
|
|
|
return record, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// ReadAll reads all the remaining records from r.
|
|
|
|
// Each record is a slice of fields.
|
2011-12-03 03:17:34 +01:00
|
|
|
func (r *Reader) ReadAll() (records [][]string, err error) {
|
2011-09-16 17:47:21 +02:00
|
|
|
for {
|
|
|
|
record, err := r.Read()
|
2011-12-03 03:17:34 +01:00
|
|
|
if err == io.EOF {
|
2011-09-16 17:47:21 +02:00
|
|
|
return records, nil
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
records = append(records, record)
|
|
|
|
}
|
|
|
|
panic("unreachable")
|
|
|
|
}
|
|
|
|
|
|
|
|
// readRune reads one rune from r, folding \r\n to \n and keeping track
|
|
|
|
// of how far into the line we have read. r.column will point to the start
|
|
|
|
// of this rune, not the end of this rune.
|
2011-12-03 03:17:34 +01:00
|
|
|
func (r *Reader) readRune() (rune, error) {
|
2011-12-02 20:34:41 +01:00
|
|
|
r1, _, err := r.r.ReadRune()
|
2011-09-16 17:47:21 +02:00
|
|
|
|
|
|
|
// Handle \r\n here. We make the simplifying assumption that
|
|
|
|
// anytime \r is followed by \n that it can be folded to \n.
|
|
|
|
// We will not detect files which contain both \r\n and bare \n.
|
2011-12-02 20:34:41 +01:00
|
|
|
if r1 == '\r' {
|
|
|
|
r1, _, err = r.r.ReadRune()
|
2011-09-16 17:47:21 +02:00
|
|
|
if err == nil {
|
2011-12-02 20:34:41 +01:00
|
|
|
if r1 != '\n' {
|
2011-09-16 17:47:21 +02:00
|
|
|
r.r.UnreadRune()
|
2011-12-02 20:34:41 +01:00
|
|
|
r1 = '\r'
|
2011-09-16 17:47:21 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
r.column++
|
2011-12-02 20:34:41 +01:00
|
|
|
return r1, err
|
2011-09-16 17:47:21 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// unreadRune puts the last rune read from r back.
|
|
|
|
func (r *Reader) unreadRune() {
|
|
|
|
r.r.UnreadRune()
|
|
|
|
r.column--
|
|
|
|
}
|
|
|
|
|
|
|
|
// skip reads runes up to and including the rune delim or until error.
|
2011-12-03 03:17:34 +01:00
|
|
|
func (r *Reader) skip(delim rune) error {
|
2011-09-16 17:47:21 +02:00
|
|
|
for {
|
2011-12-02 20:34:41 +01:00
|
|
|
r1, err := r.readRune()
|
2011-09-16 17:47:21 +02:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2011-12-02 20:34:41 +01:00
|
|
|
if r1 == delim {
|
2011-09-16 17:47:21 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
panic("unreachable")
|
|
|
|
}
|
|
|
|
|
|
|
|
// parseRecord reads and parses a single csv record from r.
|
2011-12-03 03:17:34 +01:00
|
|
|
func (r *Reader) parseRecord() (fields []string, err error) {
|
2011-09-16 17:47:21 +02:00
|
|
|
// Each record starts on a new line. We increment our line
|
|
|
|
// number (lines start at 1, not 0) and set column to -1
|
|
|
|
// so as we increment in readRune it points to the character we read.
|
|
|
|
r.line++
|
|
|
|
r.column = -1
|
|
|
|
|
|
|
|
// Peek at the first rune. If it is an error we are done.
|
|
|
|
// If we are support comments and it is the comment character
|
|
|
|
// then skip to the end of line.
|
|
|
|
|
2011-12-02 20:34:41 +01:00
|
|
|
r1, _, err := r.r.ReadRune()
|
2011-09-16 17:47:21 +02:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2011-12-02 20:34:41 +01:00
|
|
|
if r.Comment != 0 && r1 == r.Comment {
|
2011-09-16 17:47:21 +02:00
|
|
|
return nil, r.skip('\n')
|
|
|
|
}
|
|
|
|
r.r.UnreadRune()
|
|
|
|
|
|
|
|
// At this point we have at least one field.
|
|
|
|
for {
|
|
|
|
haveField, delim, err := r.parseField()
|
|
|
|
if haveField {
|
|
|
|
fields = append(fields, r.field.String())
|
|
|
|
}
|
2011-12-03 03:17:34 +01:00
|
|
|
if delim == '\n' || err == io.EOF {
|
2011-09-16 17:47:21 +02:00
|
|
|
return fields, err
|
|
|
|
} else if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
panic("unreachable")
|
|
|
|
}
|
|
|
|
|
|
|
|
// parseField parses the next field in the record. The read field is
|
|
|
|
// located in r.field. Delim is the first character not part of the field
|
|
|
|
// (r.Comma or '\n').
|
2011-12-03 03:17:34 +01:00
|
|
|
func (r *Reader) parseField() (haveField bool, delim rune, err error) {
|
2011-09-16 17:47:21 +02:00
|
|
|
r.field.Reset()
|
|
|
|
|
2011-12-02 20:34:41 +01:00
|
|
|
r1, err := r.readRune()
|
2011-09-16 17:47:21 +02:00
|
|
|
if err != nil {
|
|
|
|
// If we have EOF and are not at the start of a line
|
|
|
|
// then we return the empty field. We have already
|
|
|
|
// checked for trailing commas if needed.
|
2011-12-03 03:17:34 +01:00
|
|
|
if err == io.EOF && r.column != 0 {
|
2011-09-16 17:47:21 +02:00
|
|
|
return true, 0, err
|
|
|
|
}
|
|
|
|
return false, 0, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if r.TrimLeadingSpace {
|
2011-12-02 20:34:41 +01:00
|
|
|
for r1 != '\n' && unicode.IsSpace(r1) {
|
|
|
|
r1, err = r.readRune()
|
2011-09-16 17:47:21 +02:00
|
|
|
if err != nil {
|
|
|
|
return false, 0, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-12-02 20:34:41 +01:00
|
|
|
switch r1 {
|
2011-09-16 17:47:21 +02:00
|
|
|
case r.Comma:
|
|
|
|
// will check below
|
|
|
|
|
|
|
|
case '\n':
|
|
|
|
// We are a trailing empty field or a blank line
|
|
|
|
if r.column == 0 {
|
2011-12-02 20:34:41 +01:00
|
|
|
return false, r1, nil
|
2011-09-16 17:47:21 +02:00
|
|
|
}
|
2011-12-02 20:34:41 +01:00
|
|
|
return true, r1, nil
|
2011-09-16 17:47:21 +02:00
|
|
|
|
|
|
|
case '"':
|
|
|
|
// quoted field
|
|
|
|
Quoted:
|
|
|
|
for {
|
2011-12-02 20:34:41 +01:00
|
|
|
r1, err = r.readRune()
|
2011-09-16 17:47:21 +02:00
|
|
|
if err != nil {
|
2011-12-03 03:17:34 +01:00
|
|
|
if err == io.EOF {
|
2011-09-16 17:47:21 +02:00
|
|
|
if r.LazyQuotes {
|
|
|
|
return true, 0, err
|
|
|
|
}
|
|
|
|
return false, 0, r.error(ErrQuote)
|
|
|
|
}
|
|
|
|
return false, 0, err
|
|
|
|
}
|
2011-12-02 20:34:41 +01:00
|
|
|
switch r1 {
|
2011-09-16 17:47:21 +02:00
|
|
|
case '"':
|
2011-12-02 20:34:41 +01:00
|
|
|
r1, err = r.readRune()
|
|
|
|
if err != nil || r1 == r.Comma {
|
2011-09-16 17:47:21 +02:00
|
|
|
break Quoted
|
|
|
|
}
|
2011-12-02 20:34:41 +01:00
|
|
|
if r1 == '\n' {
|
|
|
|
return true, r1, nil
|
2011-09-16 17:47:21 +02:00
|
|
|
}
|
2011-12-02 20:34:41 +01:00
|
|
|
if r1 != '"' {
|
2011-09-16 17:47:21 +02:00
|
|
|
if !r.LazyQuotes {
|
|
|
|
r.column--
|
|
|
|
return false, 0, r.error(ErrQuote)
|
|
|
|
}
|
|
|
|
// accept the bare quote
|
|
|
|
r.field.WriteRune('"')
|
|
|
|
}
|
|
|
|
case '\n':
|
|
|
|
r.line++
|
|
|
|
r.column = -1
|
|
|
|
}
|
2011-12-02 20:34:41 +01:00
|
|
|
r.field.WriteRune(r1)
|
2011-09-16 17:47:21 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
default:
|
|
|
|
// unquoted field
|
|
|
|
for {
|
2011-12-02 20:34:41 +01:00
|
|
|
r.field.WriteRune(r1)
|
|
|
|
r1, err = r.readRune()
|
|
|
|
if err != nil || r1 == r.Comma {
|
2011-09-16 17:47:21 +02:00
|
|
|
break
|
|
|
|
}
|
2011-12-02 20:34:41 +01:00
|
|
|
if r1 == '\n' {
|
|
|
|
return true, r1, nil
|
2011-09-16 17:47:21 +02:00
|
|
|
}
|
2011-12-02 20:34:41 +01:00
|
|
|
if !r.LazyQuotes && r1 == '"' {
|
2011-09-16 17:47:21 +02:00
|
|
|
return false, 0, r.error(ErrBareQuote)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if err != nil {
|
2011-12-03 03:17:34 +01:00
|
|
|
if err == io.EOF {
|
2011-09-16 17:47:21 +02:00
|
|
|
return true, 0, err
|
|
|
|
}
|
|
|
|
return false, 0, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if !r.TrailingComma {
|
|
|
|
// We don't allow trailing commas. See if we
|
|
|
|
// are at the end of the line (being mindful
|
|
|
|
// of trimming spaces).
|
|
|
|
c := r.column
|
2011-12-02 20:34:41 +01:00
|
|
|
r1, err = r.readRune()
|
2011-09-16 17:47:21 +02:00
|
|
|
if r.TrimLeadingSpace {
|
2011-12-02 20:34:41 +01:00
|
|
|
for r1 != '\n' && unicode.IsSpace(r1) {
|
|
|
|
r1, err = r.readRune()
|
2011-09-16 17:47:21 +02:00
|
|
|
if err != nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2011-12-03 03:17:34 +01:00
|
|
|
if err == io.EOF || r1 == '\n' {
|
2011-09-16 17:47:21 +02:00
|
|
|
r.column = c // report the comma
|
|
|
|
return false, 0, r.error(ErrTrailingComma)
|
|
|
|
}
|
|
|
|
r.unreadRune()
|
|
|
|
}
|
2011-12-02 20:34:41 +01:00
|
|
|
return true, r1, nil
|
2011-09-16 17:47:21 +02:00
|
|
|
}
|