transfer.sh/vendor/google.golang.org/api/examples/bigquery.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
	"container/list"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"math/rand"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"

	bigquery "google.golang.org/api/bigquery/v2"
	storage "google.golang.org/api/storage/v1"
)

// GB is the number of bytes in a gigabyte (2^30), used when reporting sizes.
const GB = 1 << 30

// Polling backoff tuning used while monitoring load jobs. The delays are
// interpreted as milliseconds by the monitor loop.
const (
	// BaseBackoff is the initial delay between job status checks.
	BaseBackoff = 250
	// MaxBackoff is the upper bound the delay may grow to.
	MaxBackoff = 30000
	// BackoffGrowthFactor is the multiplier applied each time the delay grows.
	BackoffGrowthFactor = 1.8
	// BackoffGrowthDamper scales the random reduction applied after growth.
	BackoffGrowthDamper = 0.25
)

// String values exchanged with the BigQuery API.
const (
	// JobStatusDone is the job state that marks a job as finished.
	JobStatusDone = "DONE"
	// DatasetAlreadyExists is the error-text fragment that identifies an
	// attempt to insert a dataset that is already present.
	DatasetAlreadyExists = "Already Exists: Dataset"
	// TableWriteEmptyDisposition is the write disposition used for loads.
	TableWriteEmptyDisposition = "WRITE_EMPTY"
)

// init registers the "bigquery" demo, with bqMain as its entry point and a
// space-separated scope string covering BigQuery, read-only Cloud Storage and
// the user's profile.
func init() {
	const profileScope = "https://www.googleapis.com/auth/userinfo.profile"

	scopes := fmt.Sprintf("%s %s %s",
		bigquery.BigqueryScope, storage.DevstorageReadOnlyScope, profileScope)
	registerDemo("bigquery", scopes, bqMain)
}

// bqMain demonstrates loading objects from Google Cloud Storage into
// BigQuery. Objects are specified by their bucket and a name prefix. Each
// object will be loaded into a new table identified by the object name minus
// any file extension. All tables are added to the specified dataset (one will
// be created if necessary). Currently, tables will not be overwritten and an
// attempt to load an object into a dataset that already contains its table
// will emit an error message indicating the table already exists.
// A schema file must be provided and it will be applied to every object/table.
//
// argv must hold exactly six values, in order: project id, bucket, object
// prefix, dataset id, schema file path and the maximum number of bad records.
// Errors are reported on stderr and end the run.
//
// Example usage:
//
//	go-api-demo -clientid="my-clientid" -secret="my-secret" bq myProject \
//		myDataBucket datafile2013070 DataFiles2013 \
//		./datafile_schema.json 100
//
// This will load all objects (e.g. all data files from July 2013) from
// gs://myDataBucket into a (possibly new) BigQuery dataset named DataFiles2013
// using the schema file provided and allowing up to 100 bad records. Assuming
// each object is named like datafileYYYYMMDD.csv.gz and all of July's files are
// stored in the bucket, 9 tables will be created named like datafile201307DD
// where DD ranges from 01 to 09, inclusive.
// When the program completes, it will emit a results line similar to:
//
//	9 files loaded in 3m58s (18m2.708s). Size: 7.18GB Rows: 7130725
//
// The total elapsed time from the start of the first job to the end of the
// last job (effectively wall clock time) is shown. In parentheses is the
// aggregate time taken to load all tables.
func bqMain(client *http.Client, argv []string) {
	if len(argv) != 6 {
		fmt.Fprintln(os.Stderr,
			"Usage: bq project_id bucket prefix dataset schema max_bad_records")
		return
	}

	var (
		project    = argv[0]
		bucket     = argv[1]
		objPrefix  = argv[2]
		datasetId  = argv[3]
		schemaFile = argv[4]
	)
	badRecords, err := strconv.ParseInt(argv[5], 10, 64)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Seed the generator used to jitter the polling backoff in monitor.
	rand.Seed(time.Now().UnixNano())

	service, err := storage.New(client)
	if err != nil {
		log.Fatalf("Unable to create Storage service: %v", err)
	}

	// Get the list of objects in the bucket matching the specified prefix.
	// Listings are returned one page at a time, so keep requesting pages
	// until the response carries no continuation token; otherwise objects
	// beyond the first page would be silently skipped.
	var items []*storage.Object
	listCall := service.Objects.List(bucket)
	listCall.Prefix(objPrefix)
	for {
		page, err := listCall.Do()
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
		items = append(items, page.Items...)
		if page.NextPageToken == "" {
			break
		}
		listCall.PageToken(page.NextPageToken)
	}

	// Create the wrapper and insert the (new) dataset.
	dataset, err := newBQDataset(client, project, datasetId)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	// Passing true tolerates a dataset that already exists.
	if err = dataset.insert(true); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	objectSource := &tableSource{
		maxBadRecords: badRecords,
		disposition:   TableWriteEmptyDisposition,
	}

	// Load the schema from disk.
	f, err := ioutil.ReadFile(schemaFile)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	if err = json.Unmarshal(f, &objectSource.schema); err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// Assumes all objects have .csv, .csv.gz (or no) extension.
	tableIdFromObject := func(name string) string {
		return strings.TrimSuffix(strings.TrimSuffix(name, ".gz"), ".csv")
	}

	// A jobset is a way to group a collection of jobs together for monitoring.
	// For this example, we just use the name of the bucket and object prefix.
	jobset := fmt.Sprintf("%s:%s", bucket, objPrefix)
	fmt.Fprintf(os.Stderr, "\nLoading %d objects.\n", len(items))

	// Load each object into a table of the same name (minus any extension).
	// A successful insert call will inject the job into our queue for
	// monitoring; a failed one is reported and the remaining objects are
	// still attempted.
	for _, o := range items {
		objectSource.id = tableIdFromObject(o.Name)
		objectSource.uri = fmt.Sprintf("gs://%s/%s", o.Bucket, o.Name)
		if err = dataset.load(jobset, objectSource); err != nil {
			fmt.Fprintln(os.Stderr, err)
		}
	}

	dataset.monitor(jobset)
}

// bqDataset bundles a BigQuery service client with one dataset and keeps
// track of the load jobs started against it.
type bqDataset struct {
	// Id of the project the dataset and its jobs belong to.
	project string
	// Id of the dataset.
	id string
	// Client used for every BigQuery API call.
	bq *bigquery.Service
	// Dataset resource sent when the dataset is inserted.
	dataset *bigquery.Dataset
	// Queues of started jobs, keyed by jobset name.
	jobsets map[string]*list.List
}

// newBQDataset creates a BigQuery service from client and wraps it together
// with a dataset reference for project dsProj and dataset dsId. No API call
// is made to create the dataset here; use insert for that.
//
// It returns an error, rather than terminating the process, when the
// BigQuery service cannot be created, so the caller decides how to report it.
func newBQDataset(client *http.Client, dsProj string, dsId string) (*bqDataset,
	error) {

	service, err := bigquery.New(client)
	if err != nil {
		return nil, fmt.Errorf("unable to create BigQuery service: %v", err)
	}

	return &bqDataset{
		project: dsProj,
		id:      dsId,
		bq:      service,
		dataset: &bigquery.Dataset{
			DatasetReference: &bigquery.DatasetReference{
				DatasetId: dsId,
				ProjectId: dsProj,
			},
		},
		// Start with an empty registry; load adds a queue per jobset.
		jobsets: make(map[string]*list.List),
	}, nil
}

// insert asks BigQuery to create the wrapped dataset in the wrapper's
// project. When existsOK is true, an error whose text contains
// DatasetAlreadyExists is treated as success; every other error is returned
// unchanged.
func (ds *bqDataset) insert(existsOK bool) error {
	_, err := ds.bq.Datasets.Insert(ds.project, ds.dataset).Do()
	if err == nil {
		return nil
	}

	// The failure is only recognised by matching the error text.
	if existsOK && strings.Contains(err.Error(), DatasetAlreadyExists) {
		return nil
	}

	return err
}

// tableSource describes one load: where the data comes from and how it is
// written into its destination table.
type tableSource struct {
	// Id of the destination table.
	id string
	// URI of the source object to load.
	uri string
	// Schema sent with the load job.
	schema bigquery.TableSchema
	// Maximum number of bad records passed to the load job.
	maxBadRecords int64
	// Write disposition passed to the load job.
	disposition string
}

// load starts a BigQuery load job that reads source.uri into the table
// source.id of the wrapped dataset. The job returned by the service is
// appended to the queue registered under jobset, which is created on first
// use. An error from the insert call is returned and nothing is queued.
func (ds *bqDataset) load(jobset string, source *tableSource) error {
	destination := &bigquery.TableReference{
		DatasetId: ds.dataset.DatasetReference.DatasetId,
		ProjectId: ds.project,
		TableId:   source.id,
	}
	loadConfig := &bigquery.JobConfigurationLoad{
		DestinationTable: destination,
		MaxBadRecords:    source.maxBadRecords,
		Schema:           &source.schema,
		SourceUris:       []string{source.uri},
		WriteDisposition: source.disposition,
	}
	request := &bigquery.Job{
		Configuration: &bigquery.JobConfiguration{Load: loadConfig},
	}

	started, err := ds.bq.Jobs.Insert(ds.project, request).Do()
	if err != nil {
		return err
	}

	// Queue the job as returned by the service, not the request we sent.
	queue, found := ds.jobsets[jobset]
	if !found {
		queue = list.New()
		ds.jobsets[jobset] = queue
	}
	queue.PushBack(started)

	return nil
}

// getJob fetches the job with the given id from the wrapper's project.
func (ds *bqDataset) getJob(id string) (*bigquery.Job, error) {
	call := ds.bq.Jobs.Get(ds.project, id)
	return call.Do()
}

// monitor polls the jobs queued under jobset until every one of them has
// finished (successfully or with an error result) and then prints the
// aggregated load statistics to stderr. It returns immediately when no queue
// is registered under jobset.
//
// Jobs are checked round-robin. The first job taken while no head is tracked
// becomes the "head"; seeing the head again means a full pass over the queue
// went by without it finishing, so the delay between checks is grown (with a
// random reduction, capped at MaxBackoff milliseconds). Whenever the head
// leaves the queue, the tracker and the delay are reset so that the next job
// taken becomes the new head.
//
// NOTE(review): a job whose status request keeps failing is re-queued without
// any retry limit, so a persistent error keeps this loop running.
func (ds *bqDataset) monitor(jobset string) {
	jobq, ok := ds.jobsets[jobset]
	if !ok {
		return
	}

	var backoff float64 = BaseBackoff
	// pause sleeps for the current delay, first growing it when grow is set.
	pause := func(grow bool) {
		if grow {
			backoff *= BackoffGrowthFactor
			backoff -= (backoff * rand.Float64() * BackoffGrowthDamper)
			backoff = math.Min(backoff, MaxBackoff)
			// The job being checked was already removed from the queue,
			// hence the +1.
			fmt.Fprintf(os.Stderr, "[%s] Checking remaining %d jobs...\n", jobset,
				1+jobq.Len())
		}
		time.Sleep(time.Duration(backoff) * time.Millisecond)
	}
	var stats jobStats

	// Track a 'head' pending job in queue for detecting cycling.
	head := ""
	// releaseHead clears the tracker and restores the base delay when the job
	// leaving the queue is the head. This must happen even on the visit where
	// the job was first picked as head: otherwise head would keep the id of a
	// job that is no longer queued, a full pass would never be detected again
	// and the delay would never grow.
	releaseHead := func(jid string) {
		if jid == head {
			head = ""
			backoff = BaseBackoff
		}
	}

	// Loop until all jobs are done - with either success or error.
	for jobq.Len() > 0 {
		jel := jobq.Front()
		job := jel.Value.(*bigquery.Job)
		jobq.Remove(jel)
		jid := job.JobReference.JobId

		// Check and possibly pick a new head job id. loop is true only when
		// an already tracked head comes around again.
		loop := false
		if len(head) == 0 {
			head = jid
		} else if jid == head {
			loop = true
		}

		// Retrieve the job's current status.
		pause(loop)
		j, err := ds.getJob(jid)
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			if j == nil {
				// Treat this as a transient API error and keep the job.
				jobq.PushBack(job)
			} else {
				// The job is discarded; forget it if it was the head.
				releaseHead(jid)
			}
			continue
		}

		// Continue with the updated job data returned by Get.
		// NOTE(review): assumes Get yields a non-nil job whenever err is
		// nil - confirm against the client library.
		job = j

		if job.Status.State != JobStatusDone {
			jobq.PushBack(job)
			continue
		}

		if res := job.Status.ErrorResult; res != nil {
			fmt.Fprintln(os.Stderr, res.Message)
		} else {
			stat := job.Statistics
			lstat := stat.Load
			stats.files++
			stats.bytesIn += lstat.InputFileBytes
			stats.bytesOut += lstat.OutputBytes
			stats.rows += lstat.OutputRows
			// Job timestamps are handled as milliseconds.
			stats.elapsed +=
				time.Duration(stat.EndTime-stat.StartTime) * time.Millisecond

			// Keep the earliest start and the latest finish across all jobs
			// (at one-second resolution).
			began := time.Unix(stat.StartTime/1000, 0)
			if stats.start.IsZero() || began.Before(stats.start) {
				stats.start = began
			}
			ended := time.Unix(stat.EndTime/1000, 0)
			if stats.finish.IsZero() || ended.After(stats.finish) {
				stats.finish = ended
			}
		}

		// When the head job is processed reset the backoff since the loads
		// run in BQ in parallel.
		releaseHead(jid)
	}

	fmt.Fprintf(os.Stderr, "%#v\n", stats)
}

// jobStats accumulates totals over the load jobs that finished without an
// error result.
type jobStats struct {
	files    int64         // number of files (sources) loaded
	bytesIn  int64         // bytes read from the sources (possibly compressed)
	bytesOut int64         // bytes loaded into BigQuery (uncompressed)
	rows     int64         // rows loaded into BigQuery
	elapsed  time.Duration // summed per-job load time
	start    time.Time     // earliest job start time seen
	finish   time.Time     // latest job end time seen
}

// GoString renders the summary line used when the stats are printed with
// %#v: file count, wall-clock span from start to finish, the summed load
// time, output size in GB and row count.
func (s jobStats) GoString() string {
	wallClock := s.finish.Sub(s.start)
	sizeGB := float64(s.bytesOut) / GB

	return fmt.Sprintf("\n%d files loaded in %v (%v). Size: %.2fGB Rows: %d\n",
		s.files, wallClock, s.elapsed, sizeGB, s.rows)
}
add GDrive provider support (#118) * GDrive provider support * More reliable basedir ownership * Fix mimetype 2018-06-19 15:30:26 +02:00			`// Copyright 2013 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`package main`

			`import (`
			`"container/list"`
			`"encoding/json"`
			`"fmt"`
			`"io/ioutil"`
			`"log"`
			`"math"`
			`"math/rand"`
			`"net/http"`
			`"os"`
			`"strconv"`
			`"strings"`
			`"time"`

			`bigquery "google.golang.org/api/bigquery/v2"`
			`storage "google.golang.org/api/storage/v1"`
			`)`

			`const (`
			`GB = 1 << 30`
			`MaxBackoff = 30000`
			`BaseBackoff = 250`
			`BackoffGrowthFactor = 1.8`
			`BackoffGrowthDamper = 0.25`
			`JobStatusDone = "DONE"`
			`DatasetAlreadyExists = "Already Exists: Dataset"`
			`TableWriteEmptyDisposition = "WRITE_EMPTY"`
			`)`

			`func init() {`
			`scope := fmt.Sprintf("%s %s %s", bigquery.BigqueryScope,`
			`storage.DevstorageReadOnlyScope,`
			`"https://www.googleapis.com/auth/userinfo.profile")`
			`registerDemo("bigquery", scope, bqMain)`
			`}`

			`// This example demonstrates loading objects from Google Cloud Storage into`
			`// BigQuery. Objects are specified by their bucket and a name prefix. Each`
			`// object will be loaded into a new table identified by the object name minus`
			`// any file extension. All tables are added to the specified dataset (one will`
			`// be created if necessary). Currently, tables will not be overwritten and an`
			`// attempt to load an object into a dataset that already contains its table`
			`// will emit an error message indicating the table already exists.`
			`// A schema file must be provided and it will be applied to every object/table.`
			`// Example usage:`
			`// go-api-demo -clientid="my-clientid" -secret="my-secret" bq myProject`
			`// myDataBucket datafile2013070 DataFiles2013`
			`// ./datafile_schema.json 100`
			`//`
			`// This will load all objects (e.g. all data files from July 2013) from`
			`// gs://myDataBucket into a (possibly new) BigQuery dataset named DataFiles2013`
			`// using the schema file provided and allowing up to 100 bad records. Assuming`
			`// each object is named like datafileYYYYMMDD.csv.gz and all of July's files are`
			`// stored in the bucket, 9 tables will be created named like datafile201307DD`
			`// where DD ranges from 01 to 09, inclusive.`
			`// When the program completes, it will emit a results line similar to:`
			`//`
			`// 9 files loaded in 3m58s (18m2.708s). Size: 7.18GB Rows: 7130725`
			`//`
			`// The total elapsed time from the start of first job to the end of the last job`
			`// (effectively wall clock time) is shown. In parenthesis is the aggregate time`
			`// taken to load all tables.`
			`func bqMain(client *http.Client, argv []string) {`
			`if len(argv) != 6 {`
			`fmt.Fprintln(os.Stderr,`
			`"Usage: bq project_id bucket prefix dataset schema max_bad_records")`
			`return`
			`}`

			`var (`
			`project = argv[0]`
			`bucket = argv[1]`
			`objPrefix = argv[2]`
			`datasetId = argv[3]`
			`schemaFile = argv[4]`
			`)`
			`badRecords, err := strconv.ParseInt(argv[5], 10, 64)`
			`if err != nil {`
			`fmt.Fprintln(os.Stderr, err)`
			`return`
			`}`

			`rand.Seed(time.Now().UnixNano())`

			`service, err := storage.New(client)`
			`if err != nil {`
			`log.Fatalf("Unable to create Storage service: %v", err)`
			`}`

			`// Get the list of objects in the bucket matching the specified prefix.`
			`list := service.Objects.List(bucket)`
			`list.Prefix(objPrefix)`
			`objects, err := list.Do()`
			`if err != nil {`
			`fmt.Fprintln(os.Stderr, err)`
			`return`
			`}`

			`// Create the wrapper and insert the (new) dataset.`
			`dataset, err := newBQDataset(client, project, datasetId)`
			`if err != nil {`
			`fmt.Fprintln(os.Stderr, err)`
			`return`
			`}`
			`if err = dataset.insert(true); err != nil {`
			`fmt.Fprintln(os.Stderr, err)`
			`return`
			`}`

			`objectSource := &tableSource{`
			`maxBadRecords: badRecords,`
			`disposition: TableWriteEmptyDisposition,`
			`}`

			`// Load the schema from disk.`
			`f, err := ioutil.ReadFile(schemaFile)`
			`if err != nil {`
			`fmt.Fprintln(os.Stderr, err)`
			`return`
			`}`
			`if err = json.Unmarshal(f, &objectSource.schema); err != nil {`
			`fmt.Fprintln(os.Stderr, err)`
			`return`
			`}`

			`// Assumes all objects have .csv, .csv.gz (or no) extension.`
			`tableIdFromObject := func(name string) string {`
			`return strings.TrimSuffix(strings.TrimSuffix(name, ".gz"), ".csv")`
			`}`

			`// A jobset is way to group a collection of jobs together for monitoring.`
			`// For this example, we just use the name of the bucket and object prefix.`
			`jobset := fmt.Sprintf("%s:%s", bucket, objPrefix)`
			`fmt.Fprintf(os.Stderr, "\nLoading %d objects.\n", len(objects.Items))`

			`// Load each object into a dataset of the same name (minus any extension).`
			`// A successful insert call will inject the job into our queue for monitoring.`
			`for _, o := range objects.Items {`
			`objectSource.id = tableIdFromObject(o.Name)`
			`objectSource.uri = fmt.Sprintf("gs://%s/%s", o.Bucket, o.Name)`
			`if err = dataset.load(jobset, objectSource); err != nil {`
			`fmt.Fprintln(os.Stderr, err)`
			`}`
			`}`

			`dataset.monitor(jobset)`
			`}`

			`// Wraps the BigQuery service and dataset and provides some helper functions.`
			`type bqDataset struct {`
			`project string`
			`id string`
			`bq *bigquery.Service`
			`dataset *bigquery.Dataset`
			`jobsets map[string]*list.List`
			`}`

			`func newBQDataset(client http.Client, dsProj string, dsId string) (bqDataset,`
			`error) {`

			`service, err := bigquery.New(client)`
			`if err != nil {`
			`log.Fatalf("Unable to create BigQuery service: %v", err)`
			`}`

			`return &bqDataset{`
			`project: dsProj,`
			`id: dsId,`
			`bq: service,`
			`dataset: &bigquery.Dataset{`
			`DatasetReference: &bigquery.DatasetReference{`
			`DatasetId: dsId,`
			`ProjectId: dsProj,`
			`},`
			`},`
			`jobsets: make(map[string]*list.List),`
			`}, nil`
			`}`

			`func (ds *bqDataset) insert(existsOK bool) error {`
			`call := ds.bq.Datasets.Insert(ds.project, ds.dataset)`
			`_, err := call.Do()`
			`if err != nil && (!existsOK \|\| !strings.Contains(err.Error(),`
			`DatasetAlreadyExists)) {`
			`return err`
			`}`

			`return nil`
			`}`

			`type tableSource struct {`
			`id string`
			`uri string`
			`schema bigquery.TableSchema`
			`maxBadRecords int64`
			`disposition string`
			`}`

			`func (ds bqDataset) load(jobset string, source tableSource) error {`
			`job := &bigquery.Job{`
			`Configuration: &bigquery.JobConfiguration{`
			`Load: &bigquery.JobConfigurationLoad{`
			`DestinationTable: &bigquery.TableReference{`
			`DatasetId: ds.dataset.DatasetReference.DatasetId,`
			`ProjectId: ds.project,`
			`TableId: source.id,`
			`},`
			`MaxBadRecords: source.maxBadRecords,`
			`Schema: &source.schema,`
			`SourceUris: []string{source.uri},`
			`WriteDisposition: source.disposition,`
			`},`
			`},`
			`}`

			`call := ds.bq.Jobs.Insert(ds.project, job)`
			`job, err := call.Do()`
			`if err != nil {`
			`return err`
			`}`

			`_, ok := ds.jobsets[jobset]`
			`if !ok {`
			`ds.jobsets[jobset] = list.New()`
			`}`
			`ds.jobsets[jobset].PushBack(job)`

			`return nil`
			`}`

			`func (ds bqDataset) getJob(id string) (bigquery.Job, error) {`
			`return ds.bq.Jobs.Get(ds.project, id).Do()`
			`}`

			`func (ds *bqDataset) monitor(jobset string) {`
			`jobq, ok := ds.jobsets[jobset]`
			`if !ok {`
			`return`
			`}`

			`var backoff float64 = BaseBackoff`
			`pause := func(grow bool) {`
			`if grow {`
			`backoff *= BackoffGrowthFactor`
			`backoff -= (backoff * rand.Float64() * BackoffGrowthDamper)`
			`backoff = math.Min(backoff, MaxBackoff)`
			`fmt.Fprintf(os.Stderr, "[%s] Checking remaining %d jobs...\n", jobset,`
			`1+jobq.Len())`
			`}`
			`time.Sleep(time.Duration(backoff) * time.Millisecond)`
			`}`
			`var stats jobStats`

			`// Track a 'head' pending job in queue for detecting cycling.`
			`head := ""`
			`// Loop until all jobs are done - with either success or error.`
			`for jobq.Len() > 0 {`
			`jel := jobq.Front()`
			`job := jel.Value.(*bigquery.Job)`
			`jobq.Remove(jel)`
			`jid := job.JobReference.JobId`
			`loop := false`

			`// Check and possibly pick a new head job id.`
			`if len(head) == 0 {`
			`head = jid`
			`} else {`
			`if jid == head {`
			`loop = true`
			`}`
			`}`

			`// Retrieve the job's current status.`
			`pause(loop)`
			`j, err := ds.getJob(jid)`
			`if err != nil {`
			`fmt.Fprintln(os.Stderr, err)`
			`// In this case of a transient API error, we want keep the job.`
			`if j == nil {`
			`jobq.PushBack(job)`
			`} else {`
			`// Must reset head tracker if job is discarded.`
			`if loop {`
			`head = ""`
			`backoff = BaseBackoff`
			`}`
			`}`
			`continue`
			`}`

			`// Reassign with the updated job data (from Get).`
			`// We don't use j here as Get might return nil for this value.`
			`job = j`

			`if job.Status.State != JobStatusDone {`
			`jobq.PushBack(job)`
			`continue`
			`}`

			`if res := job.Status.ErrorResult; res != nil {`
			`fmt.Fprintln(os.Stderr, res.Message)`
			`} else {`
			`stat := job.Statistics`
			`lstat := stat.Load`
			`stats.files += 1`
			`stats.bytesIn += lstat.InputFileBytes`
			`stats.bytesOut += lstat.OutputBytes`
			`stats.rows += lstat.OutputRows`
			`stats.elapsed +=`
			`time.Duration(stat.EndTime-stat.StartTime) * time.Millisecond`

			`if stats.start.IsZero() {`
			`stats.start = time.Unix(stat.StartTime/1000, 0)`
			`} else {`
			`t := time.Unix(stat.StartTime/1000, 0)`
			`if stats.start.Sub(t) > 0 {`
			`stats.start = t`
			`}`
			`}`

			`if stats.finish.IsZero() {`
			`stats.finish = time.Unix(stat.EndTime/1000, 0)`
			`} else {`
			`t := time.Unix(stat.EndTime/1000, 0)`
			`if t.Sub(stats.finish) > 0 {`
			`stats.finish = t`
			`}`
			`}`
			`}`
			`// When the head job is processed reset the backoff since the loads`
			`// run in BQ in parallel.`
			`if loop {`
			`head = ""`
			`backoff = BaseBackoff`
			`}`
			`}`

			`fmt.Fprintf(os.Stderr, "%#v\n", stats)`
			`}`

			`type jobStats struct {`
			`// Number of files (sources) loaded.`
			`files int64`
			`// Bytes read from source (possibly compressed).`
			`bytesIn int64`
			`// Bytes loaded into BigQuery (uncompressed).`
			`bytesOut int64`
			`// Rows loaded into BigQuery.`
			`rows int64`
			`// Time taken to load source into table.`
			`elapsed time.Duration`
			`// Start time of the job.`
			`start time.Time`
			`// End time of the job.`
			`finish time.Time`
			`}`

			`func (s jobStats) GoString() string {`
			`return fmt.Sprintf("\n%d files loaded in %v (%v). Size: %.2fGB Rows: %d\n",`
			`s.files, s.finish.Sub(s.start), s.elapsed, float64(s.bytesOut)/GB,`
			`s.rows)`
			`}`