From 5229888846b8031c238823253afe7d8f755a3fe6 Mon Sep 17 00:00:00 2001 From: gutmet Date: Sun, 13 Jan 2019 19:57:34 +0100 Subject: [PATCH] initial commit --- .gitignore | 3 + README.md | 61 ++++++++++++++++ ambrowse.go | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++ go.mod | 8 ++ 4 files changed, 277 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 ambrowse.go create mode 100644 go.mod diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..678aba8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.gitdist +go.sum +ambrowse diff --git a/README.md b/README.md new file mode 100644 index 0000000..94986fa --- /dev/null +++ b/README.md @@ -0,0 +1,61 @@ +ambrowse +======== + +ambrowse extracts book metadata from saved amazon.com, amazon.co.uk or amazon.de pages. It watches the directory 'ambrowse' +inside your HOME, digests newly written files and adds the metadata to a list in 'books.yaml'. It then removes those files +after a time delay, leaving only the metadata list. + +You can find releases on [releases.gutmet.org](https://releases.gutmet.org) or build it yourself. + +build +===== + +If you want to build ambrowse yourself, you need Go 1.11+ with module support. Clone this repository and type 'go build ambrowse.go'. + +quick start +=========== + +Download a release from [releases.gutmet.org](https://releases.gutmet.org) and unzip it. Put the correct file for your operating system +somewhere you can run it. Then start a command line: + +``` +> ambrowse +2019/01/13 19:06:34 ambrowse is watching /home/johndoe/ambrowse +``` + +Go to Amazon, visit a book page and save it to the folder ambrowse is watching.
+ +``` +2019/01/13 19:06:40 Reading metadata from /home/johndoe/ambrowse/Metallbearbeitung II: Verbindungstechniken, Gebrauchsgegenstände aus Metall ALS-Werk- und Arbeitsmappen: Amazon.de: Ingrid Kreide, Wolfram Enders: Bücher.html +``` + +If you are on Unix or macOS, you will get a desktop notification about the extracted metadata. You might want to add the watched folder to your favorites to reduce the number of clicks, and add ambrowse to the services that are automatically started. + + +example metadata +================ + +The file 'books.yaml' inside the watched folder might look like this: + +``` +--- + +title: "Essential Bushcraft" +authors: [Ray Mears (Autor)] +publisher: "Hodder & Stoughton General Division; Auflage: Reprint (23. Juni 2003)" +isbn10: 0340829710 +isbn13: 978-0340829714 +amazonPrice: "EUR 24,01" + +--- + +title: "Ray Mears Outdoor Survival Handbook: The Classic Indispensable Guide to +Surviving the Outdoors" +authors: [Ray Mears (Autor)] +publisher: "Ebury Press (30. Mai 2001)" +isbn10: 0091878861 +isbn13: 978-0091878863 +amazonPrice: "EUR 20,35" +``` + +So far it seems like this format is bookseller-parseable.
\ No newline at end of file
diff --git a/ambrowse.go b/ambrowse.go
new file mode 100644
index 0000000..8e2350c
--- /dev/null
+++ b/ambrowse.go
@@ -0,0 +1,268 @@
+package main
+
+import (
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"git.gutmet.org/goutil.git"
+	"github.com/fsnotify/fsnotify"
+	"golang.org/x/net/html"
+)
+
+const (
+	NAME     = "ambrowse"
+	DIR      = "ambrowse" // inside home directory
+	BOOKFILE = "books.yaml"
+)
+
+// yamlEscaper escapes backslashes and double quotes, the two characters that
+// would otherwise corrupt or terminate a YAML double-quoted scalar.
+var yamlEscaper = strings.NewReplacer(`\`, `\\`, `"`, `\"`)
+
+// book holds the metadata extracted from one saved product page.
+type book struct {
+	title       string
+	authors     []string
+	publisher   string
+	isbn10      string
+	isbn13      string
+	amazonPrice string
+}
+
+// String returns the fields of b one per line, with the authors joined by
+// ", ". digest passes this text to goutil.Notify.
+func (b book) String() string {
+	return strings.Join([]string{b.title, strings.Join(b.authors, ", "), b.publisher, b.isbn10, b.isbn13, b.amazonPrice}, "\n")
+}
+
+// yamlQuote renders s as a YAML double-quoted scalar.
+func yamlQuote(s string) string {
+	return `"` + yamlEscaper.Replace(s) + `"`
+}
+
+// yaml renders b as one entry of the book list, introduced by a "---"
+// separator. The quoted fields are escaped so that a value containing '"' or
+// '\' does not break the surrounding quotes.
+func (b book) yaml() string {
+	return "\n---\n\n" +
+		"title: " + yamlQuote(b.title) + "\n" +
+		"authors: [" + strings.Join(b.authors, ", ") + "]\n" +
+		"publisher: " + yamlQuote(b.publisher) + "\n" +
+		"isbn10: " + b.isbn10 + "\n" +
+		"isbn13: " + b.isbn13 + "\n" +
+		"amazonPrice: " + yamlQuote(b.amazonPrice) + "\n"
+}
+
+// optFatal logs err and exits the program if err is non-nil.
+func optFatal(err error) {
+	if err != nil {
+		log.Fatal(err)
+	}
+}
+
+// optLog logs err if it is non-nil and otherwise does nothing.
+func optLog(err error) {
+	if err != nil {
+		log.Println(err)
+	}
+}
+
+// readLine returns the trimmed text after the first ':' of line, or "" if
+// line contains no ':'.
+func readLine(line string) string {
+	i := strings.Index(line, ":")
+	if i < 0 {
+		return ""
+	}
+	return strings.TrimSpace(line[i+1:])
+}
+
+// getDirectory returns the watched directory DIR inside the user's home
+// directory, creating it if necessary. It exits the program if the home
+// directory cannot be determined or the directory cannot be created.
+func getDirectory() string {
+	home, err := goutil.HomeDir()
+	if err != nil {
+		// Fatalf rather than Fatal: Fatal would print the message and the
+		// error without any separator between them.
+		log.Fatalf("Cannot identify home directory: %v", err)
+	}
+	dir := filepath.Join(home, DIR)
+	if err := os.MkdirAll(dir, 0750); err != nil {
+		log.Fatalf("Cannot create %s: %v", dir, err)
+	}
+	return dir
+}
+
+// getAuthors collects the distinct contributor names found in doc, in the
+// order in which they first appear.
+func getAuthors(doc *goutil.HtmlNode) []string {
+	nodes := doc.GetElementsByClass("contributorNameID")
+	nodes = append(nodes, doc.GetElementsByClass("author")...)
+	seen := make(map[string]struct{})
+	result := []string{}
+	for _, n := range nodes {
+		var name string
+		if decl := n.GetElementsByClass("a-declarative"); len(decl) > 0 {
+			name = decl[0].Text()
+		} else {
+			name = strings.TrimSuffix(n.Text(), ",")
+		}
+		// Keep first-seen order; ranging over a map instead would shuffle
+		// the authors on every run.
+		if _, dup := seen[name]; dup {
+			continue
+		}
+		seen[name] = struct{}{}
+		result = append(result, name)
+	}
+	return result
+}
+
+// setDetails fills in publisher, isbn10 and isbn13 of b from the <li> items
+// of the product-details element of doc. Fields without a matching item are
+// left untouched.
+func setDetails(b *book, doc *goutil.HtmlNode) {
+	details := doc.GetElementById("detail-bullets")
+	if details == nil {
+		details = doc.GetElementById("detail_bullets_id")
+	}
+	if details == nil {
+		return
+	}
+	content := details.GetElementsByClass("content")
+	if len(content) == 0 {
+		return
+	}
+	items := content[0].FindAll(func(n *goutil.HtmlNode) bool {
+		return n.Type == html.ElementNode && n.Data == "li"
+	})
+	for _, item := range items {
+		line := item.Text()
+		switch {
+		case strings.Contains(line, "Verlag") || strings.Contains(line, "Publisher"):
+			b.publisher = readLine(line)
+		case strings.Contains(line, "ISBN-10"):
+			b.isbn10 = readLine(line)
+		case strings.Contains(line, "ISBN-13"):
+			b.isbn13 = readLine(line)
+		}
+	}
+}
+
+// getPrice returns the trimmed text of the first element of doc with class
+// "a-color-price", or "" if there is none.
+func getPrice(doc *goutil.HtmlNode) string {
+	price := doc.GetElementsByClass("a-color-price")
+	if len(price) == 0 {
+		return ""
+	}
+	return strings.TrimSpace(price[0].Text())
+}
+
+// readHTML parses the saved page filename and returns the metadata found in
+// it. If the file cannot be opened or parsed, the error is logged; in that
+// case, and when the page has no "productTitle" element, a book with an
+// empty title is returned, which digest skips. A single bad file thus does
+// not terminate the watcher.
+func readHTML(filename string) book {
+	b := book{}
+	log.Println("Reading metadata from " + filename)
+	f, err := os.Open(filename)
+	if err != nil {
+		log.Println(err)
+		return b
+	}
+	defer f.Close()
+	tmpdoc, err := html.Parse(f)
+	if err != nil {
+		log.Println(err)
+		return b
+	}
+	doc := (*goutil.HtmlNode)(tmpdoc)
+	title := doc.GetElementById("productTitle")
+	if title == nil {
+		// No title element: leave b empty.
+		return b
+	}
+	b.title = title.Text()
+	b.authors = getAuthors(doc)
+	setDetails(&b, doc)
+	b.amazonPrice = getPrice(doc)
+	return b
+}
+
+// clean removes files, including directories and their contents, after a
+// delay of 30 seconds. Removal errors are logged.
+func clean(files []string) {
+	time.Sleep(30 * time.Second)
+	for _, file := range files {
+		optLog(os.RemoveAll(file))
+	}
+}
+
+// digest appends the metadata of every existing ".html" file in created to
+// BOOKFILE inside dir and sends a notification for it, then schedules all
+// of created for removal.
+func digest(dir string, created []string) {
+	for _, file := range created {
+		if filepath.Ext(file) != ".html" || !goutil.PathExists(file) {
+			continue
+		}
+		b := readHTML(file)
+		if b.title == "" {
+			continue
+		}
+		if err := goutil.AppendToFile(filepath.Join(dir, BOOKFILE), b.yaml()); err != nil {
+			log.Println(err)
+		}
+		goutil.Notify(NAME, b.String())
+	}
+	go clean(created)
+}
+
+// main watches the ambrowse directory and hands every batch of newly
+// created entries to digest once no file system event has arrived for
+// 100ms.
+func main() {
+	dir := getDirectory()
+	watcher, err := fsnotify.NewWatcher()
+	optFatal(err)
+	defer watcher.Close()
+	done := make(chan struct{})
+	go func() {
+		// Unblock main when the watcher's channels are closed.
+		defer close(done)
+		var created []string
+		for {
+			select {
+			case event, ok := <-watcher.Events:
+				if !ok {
+					return
+				}
+				if event.Op&fsnotify.Create == fsnotify.Create && filepath.Base(event.Name) != BOOKFILE {
+					created = append(created, event.Name)
+				}
+			case err, ok := <-watcher.Errors:
+				if !ok {
+					return
+				}
+				log.Println("error:", err)
+			case <-time.After(100 * time.Millisecond):
+				// 100ms after last file system event: munch data
+				if len(created) > 0 {
+					batch := created
+					created = nil
+					go digest(dir, batch)
+				}
+			}
+		}
+	}()
+	err = watcher.Add(dir)
+	optFatal(err)
+	log.Println(NAME + " is watching " + dir)
+	<-done
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..2b503f0
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,8 @@
+module git.gutmet.org/ambrowse.git
+
+require (
+	git.gutmet.org/goutil.git v0.0.0-20190113180148-bb2d3e26ea6c
+	github.com/fsnotify/fsnotify v1.4.7
+	golang.org/x/net v0.0.0-20190110200230-915654e7eabc
+	golang.org/x/sys v0.0.0-20190109145017-48ac38b7c8cb // indirect
+)