initial commit
This commit is contained in:
commit
dd48aef782
8 changed files with 1010 additions and 0 deletions
359
scraper/scraper.go
Normal file
359
scraper/scraper.go
Normal file
|
|
@ -0,0 +1,359 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"sauce/shared"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
// "sync"
|
||||
"time"
|
||||
|
||||
"gorm.io/driver/sqlite"
|
||||
"gorm.io/gorm"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/mattn/go-sqlite3" // ???
|
||||
)
|
||||
|
||||
var categoryMap = map[string]shared.Category {
|
||||
"doujinshi": shared.Doujinshi,
|
||||
"manga": shared.Doujinshi,
|
||||
}
|
||||
|
||||
var languageMap = map[string]shared.Language {
|
||||
"japanese": shared.Jp,
|
||||
"chinese": shared.Cn,
|
||||
"english": shared.En,
|
||||
}
|
||||
|
||||
// extractor describes a site-specific scraper.
//
// NOTE(review): nhentai.extract in this file takes a single string rather
// than a variadic list, so nhentai does not satisfy this interface as
// written — confirm which signature is intended.
type extractor interface {
	// extract scrapes the resource(s) identified by the given strings.
	extract(...string) error
	// discover scans the page identified by the given string for further
	// resources to scrape.
	discover(string) error
}
|
||||
|
||||
type nhentai struct {
|
||||
db *gorm.DB
|
||||
}
|
||||
|
||||
// Queue is one row of the work queue: a gallery URL that has been discovered
// and is waiting to be extracted.
type Queue struct {
	// ID is the primary key.
	ID uint `gorm:"primarykey"`
	// CreatedAt and UpdatedAt are timestamp columns.
	CreatedAt time.Time
	UpdatedAt time.Time
	// Url is the gallery URL; the unique constraint keeps the same gallery
	// from being queued twice.
	Url string `gorm:"unique"`
}
|
||||
|
||||
// extractGalleryUrl matches an http(s) URL on t.nhentai.net (optionally with
// one digit after the "t") under /galleries/. Capture group 1 is the numeric
// path segment following /galleries/, group 2 the trailing file extension.
// NOTE(review): no live code in this file appears to reference it.
var extractGalleryUrl = regexp.MustCompile(`https?://t\d?\.nhentai\.net/galleries/(\d+)/.+\.(\w+)`)
|
||||
|
||||
func (n nhentai) discover(url string) (error) {
|
||||
|
||||
root := "https://nhentai.net"
|
||||
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Load the HTML document
|
||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
doc.Find(".cover").Each(func(i int, s *goquery.Selection) {
|
||||
href, ok := s.Attr("href")
|
||||
if !ok {
|
||||
panic(ok)
|
||||
}
|
||||
url := root + href
|
||||
|
||||
fmt.Println(url)
|
||||
|
||||
var source shared.Publication
|
||||
err = n.db.Where("source = ?", url).First(&source).Error
|
||||
if err != nil && err != gorm.ErrRecordNotFound {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = n.db.Create(&Queue{ Url: url}).Error
|
||||
if _, ok := err.(sqlite3.Error); ok { // fixme
|
||||
return
|
||||
|
||||
} else if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
})
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
func (n nhentai) download(url, name string) (shared.Page, error) {
|
||||
|
||||
resp1, err := http.Get(url)
|
||||
if err != nil {
|
||||
return shared.Page{}, err
|
||||
}
|
||||
if resp1.StatusCode >= 400 && resp1.StatusCode < 500 {
|
||||
return shared.Page{}, fmt.Errorf("status code: %s", resp1.Status)
|
||||
}
|
||||
defer resp1.Body.Close()
|
||||
|
||||
blob, err := io.ReadAll(resp1.Body)
|
||||
if err != nil {
|
||||
return shared.Page{}, err
|
||||
}
|
||||
|
||||
img, err := shared.LoadImageFromBytes(blob)
|
||||
if err != nil {
|
||||
return shared.Page{}, err
|
||||
}
|
||||
img.Url = url
|
||||
img.Name = name
|
||||
|
||||
// publication.Pages = append(publication.Pages, img)
|
||||
|
||||
return img, nil
|
||||
}
|
||||
|
||||
func (n nhentai) extract(url string) error {
|
||||
|
||||
fmt.Println("extracting:", url)
|
||||
|
||||
// url := root + strings.Join(path,
|
||||
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Load the HTML document
|
||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
|
||||
publication := shared.Publication {
|
||||
Title: doc.Find(".title .pretty").Text(),
|
||||
Source: url,
|
||||
Host: "nhentai",
|
||||
}
|
||||
|
||||
if publication.Title == "" {
|
||||
return fmt.Errorf("missing title")
|
||||
}
|
||||
|
||||
coverUrl, ok := doc.Find("#cover > a > img").Attr("data-src")
|
||||
if !ok {
|
||||
panic(coverUrl)
|
||||
}
|
||||
|
||||
var pagesTotal int
|
||||
// galleryUrl := extractGalleryUrl.FindStringSubmatch(coverUrl)
|
||||
// fmt.Println(galleryUrl, coverUrl)
|
||||
// galleryId := galleryUrl[1]
|
||||
// galleryExt := galleryUrl[2]
|
||||
|
||||
doc.Find(".tag-container").Each(func(i int, s *goquery.Selection) {
|
||||
tag := strings.TrimSpace(s.Contents().First().Text())
|
||||
|
||||
switch tag {
|
||||
case "Artists:":
|
||||
field := s.Find(".name").First().Text()
|
||||
artists := strings.Split(field, " | ")
|
||||
|
||||
var authors []shared.Author
|
||||
for _, name := range artists {
|
||||
var author shared.Author
|
||||
|
||||
err := n.db.Where("name = ?", name).First(&author).Error
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
author = shared.Author{ Name: name, }
|
||||
} else if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
authors = append(authors, author)
|
||||
}
|
||||
|
||||
publication.Authors = append(publication.Authors, authors...)
|
||||
case "Tags:":
|
||||
s.Find("a").Each(func(i int, s *goquery.Selection) {
|
||||
name := s.Children().First().Text()
|
||||
var tag shared.Tag
|
||||
|
||||
err := n.db.Where("name = ?", name).First(&tag).Error
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
tag = shared.Tag{
|
||||
Name: name,
|
||||
}
|
||||
} else if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
publication.Tags = append(publication.Tags, tag)
|
||||
})
|
||||
case "Categories:":
|
||||
s.Find("a").Each(func(i int, s *goquery.Selection) {
|
||||
|
||||
category, ok := categoryMap[s.Children().First().Text()]
|
||||
|
||||
if !ok {
|
||||
panic(category)
|
||||
}
|
||||
publication.Category = &category
|
||||
|
||||
})
|
||||
case "Languages:":
|
||||
s.Find("a").Each(func(i int, s *goquery.Selection) {
|
||||
|
||||
lang, ok := languageMap[s.Children().First().Text()]
|
||||
|
||||
if !ok {
|
||||
lang = shared.None
|
||||
}
|
||||
|
||||
publication.Language = &lang
|
||||
})
|
||||
case "Pages:":
|
||||
s.Find("a .name").Each(func(i int, s *goquery.Selection) {
|
||||
pagesTotal, err = strconv.Atoi(s.Text())
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}) }
|
||||
})
|
||||
|
||||
// var wg sync.WaitGroup
|
||||
// var mut sync.Mutex
|
||||
// errors := make(chan error, pagesTotal)
|
||||
|
||||
for pageNumber := range pagesTotal {
|
||||
|
||||
url := fmt.Sprintf("%s%d", url, pageNumber+1)
|
||||
// fmt.Println("source:", url)
|
||||
resp2, err := http.Get(url)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if resp2.StatusCode != 200 {
|
||||
panic(resp2.Status)
|
||||
}
|
||||
doc2, err := goquery.NewDocumentFromReader(resp2.Body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
src, ok := doc2.Find("#image-container > a > img").Attr("src")
|
||||
if !ok {
|
||||
panic(src)
|
||||
}
|
||||
// url := fmt.Sprintf("https://i.nhentai.net/galleries/%s/%d.%s", galleryId, pageNumber+1, galleryExt)
|
||||
outputName := fmt.Sprintf("%d.jpg", pageNumber+1)
|
||||
|
||||
fmt.Println("requested:", src)
|
||||
page, err := n.download(src, outputName)
|
||||
if err != nil {
|
||||
if errors.Is(err, shared.BlankImage) {
|
||||
fmt.Println("blank image:", src)
|
||||
continue
|
||||
}
|
||||
return err
|
||||
}
|
||||
fmt.Println("finished:", src)
|
||||
page.Order = pageNumber
|
||||
|
||||
publication.Pages = append(publication.Pages, page)
|
||||
|
||||
// time.Sleep(time.Second) // good guying
|
||||
// wg.Add(1)
|
||||
|
||||
// go func() {
|
||||
// defer wg.Done()
|
||||
//
|
||||
// fmt.Println("requested:", url)
|
||||
// page, err := n.download(url, outputName)
|
||||
// if err != nil {
|
||||
// errors <- err
|
||||
// return
|
||||
// }
|
||||
// page.Order = order
|
||||
//
|
||||
// mut.Lock()
|
||||
// publication.Pages = append(publication.Pages, page)
|
||||
// mut.Unlock()
|
||||
//
|
||||
// fmt.Println("finished:", url)
|
||||
// errors <- nil
|
||||
// }()
|
||||
}
|
||||
|
||||
// wg.Wait()
|
||||
// close(errors)
|
||||
|
||||
// for err := range errors {
|
||||
// if err == shared.BlankImage {
|
||||
// continue
|
||||
// }
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// }
|
||||
|
||||
err = n.db.Debug().Create(&publication).Error
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func main() {
|
||||
db, err := gorm.Open(sqlite.Open("test.db"), &gorm.Config{})
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
db.AutoMigrate(&Queue{})
|
||||
db.AutoMigrate(&shared.Author{}, &shared.Page{}, &shared.Tag{}, &shared.Publication{})
|
||||
|
||||
// nhentai{db: db}.discover("https://nhentai.net/artist/mda-starou/")
|
||||
// return
|
||||
|
||||
for {
|
||||
|
||||
var dequeue Queue
|
||||
tx := db.Begin()
|
||||
|
||||
nhentai := nhentai{ db: tx }
|
||||
|
||||
err = tx.First(&dequeue).Error
|
||||
if err == gorm.ErrRecordNotFound {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = nhentai.extract(dequeue.Url)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
err = tx.Delete(&dequeue).Error
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
tx.Commit()
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue