summary | refs | log | tree | commit | diff
path: root/document
diff options
context:
space:
mode:
author: F.O. <scrotadamus@insiberia.net> 2025-02-16 17:56:08 +0100
committer: F.O. <scrotadamus@insiberia.net> 2025-02-16 17:56:57 +0100
commit: 17fb6add26291b31f7020e3551a7c8487130a747 (patch)
tree: d4559a7339ed181393ff921909e6ce05b7c2cf18 /document
genesi
Diffstat (limited to 'document')
-rw-r--r-- document/annot.go 76
-rw-r--r-- document/document.go 236
-rw-r--r-- document/hash.go 104
-rw-r--r-- document/utils.go 30
4 files changed, 446 insertions, 0 deletions
diff --git a/document/annot.go b/document/annot.go
new file mode 100644
index 0000000..e280cd0
--- /dev/null
+++ b/document/annot.go
@@ -0,0 +1,76 @@
+package document
+
+import (
+ "github.com/scrotadamus/ghligh/go-poppler"
+)
+
+// AnnotJSON is the JSON-serializable snapshot of a poppler annotation.
+// annotToJson fills every field from the annotation's getters; jsonToAnnot
+// reads back only Rect, Quads, Color, Contents and Flags.
+//
+// NOTE(review): encoding/json ignores omitempty on struct-typed values; if
+// poppler.Rectangle and poppler.Color are structs the tag has no effect on
+// Rect and Color — confirm against go-poppler.
+type AnnotJSON struct {
+ Type poppler.AnnotType `json:"type,omitempty"`
+ Index int `json:"index,omitempty"`
+ Date string `json:"date,omitempty"`
+ Rect poppler.Rectangle `json:"rect,omitempty"`
+ Color poppler.Color `json:"color,omitempty"`
+ Name string `json:"name,omitempty"`
+ Contents string `json:"contents,omitempty"`
+ Flags poppler.AnnotFlag `json:"flags,omitempty"`
+ Quads []poppler.Quad `json:"quads,omitempty"`
+}
+
+func annotToJson(a poppler.Annot) (AnnotJSON) {
+ var aj AnnotJSON
+ aj.Type = a.Type()
+ aj.Index = a.Index()
+ aj.Date = a.Date()
+ aj.Rect = a.Rect()
+ aj.Color = a.Color()
+ aj.Name = a.Name()
+ aj.Contents = a.Contents()
+ aj.Flags = a.Flags()
+ aj.Quads = a.Quads()
+
+ return aj
+}
+
+// jsonToAnnot creates a new highlight annotation on d's document from aJson.
+// The annotation is always created as poppler.AnnotHighlight with aJson's
+// Rect and Quads; Color, Contents and Flags are then applied. The remaining
+// AnnotJSON fields (Type, Index, Date, Name) are not used.
+//
+// NOTE(review): the error returned by NewAnnot is discarded, so a failed
+// creation is not reported to the caller. Surfacing it would require
+// changing this method's signature.
+func (d *GhlighDoc) jsonToAnnot(aJson AnnotJSON) *poppler.Annot {
+	highlight, _ := d.doc.NewAnnot(poppler.AnnotHighlight, aJson.Rect, aJson.Quads)
+
+	highlight.SetColor(aJson.Color)
+	highlight.SetContents(aJson.Contents)
+	highlight.SetFlags(aJson.Flags)
+
+	return &highlight
+}
+
+// popplerAnnotsMatch reports whether two annotations occupy the same
+// geometry: identical bounding rectangle coordinates, the same number of
+// quads, and identical corner points for every quad at the same position.
+// Coordinates are compared for exact equality; no tolerance is applied.
+func popplerAnnotsMatch(a *poppler.Annot, b *poppler.Annot) bool {
+	rectA, rectB := a.Rect(), b.Rect()
+	quadsA, quadsB := a.Quads(), b.Quads()
+
+	sameRect := rectA.X1 == rectB.X1 &&
+		rectA.Y1 == rectB.Y1 &&
+		rectA.X2 == rectB.X2 &&
+		rectA.Y2 == rectB.Y2
+	if !sameRect || len(quadsA) != len(quadsB) {
+		return false
+	}
+
+	for i, qa := range quadsA {
+		qb := quadsB[i]
+
+		samePoints := qa.P1.X == qb.P1.X && qa.P1.Y == qb.P1.Y &&
+			qa.P2.X == qb.P2.X && qa.P2.Y == qb.P2.Y &&
+			qa.P3.X == qb.P3.X && qa.P3.Y == qb.P3.Y &&
+			qa.P4.X == qb.P4.X && qa.P4.Y == qb.P4.Y
+		if !samePoints {
+			return false
+		}
+	}
+
+	return true
+}
diff --git a/document/document.go b/document/document.go
new file mode 100644
index 0000000..a036eee
--- /dev/null
+++ b/document/document.go
@@ -0,0 +1,236 @@
+package document
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+
+	"github.com/scrotadamus/ghligh/go-poppler"
+)
+
+// ghlighFilter is the prefix that Tag prepends to a tag's text and that
+// GetTags and RemoveTags strip from annotation contents; it is also the
+// argument passed to the document's GetTags to select ghligh tags.
+const ghlighFilter = "ghligh-Y2lhbm5v:"
+
+// AnnotsMap maps a page index to the list of serialized annotations found
+// on that page. It is unrelated to poppler's own annot_mapping structure.
+type AnnotsMap map[int][]AnnotJSON
+
+// GhlighDoc wraps a poppler document together with the fields that are
+// serialized to JSON ("file", "hash", "highlights"). The unexported fields
+// are not part of the JSON output.
+type GhlighDoc struct {
+ // doc is the underlying poppler document, set by Open and released by Close.
+ doc *poppler.Document
+ // mu is held by Import and Save for their whole duration.
+ mu sync.Mutex
+
+ // Path is the filename the document was opened from; Save writes back to it.
+ Path string `json:"file"`
+ // HashBuffer is cleared by Close; presumably filled with HashDoc's result
+ // by callers before serialization — TODO confirm.
+ HashBuffer string `json:"hash"`
+ // AnnotsBuffer is set to the incoming map while Import runs and reset to
+ // nil afterwards and on Close.
+ AnnotsBuffer AnnotsMap `json:"highlights,omitempty"`
+}
+
+// HighlightedText is one highlight as reported by Cat: the zero-based index
+// of the page it sits on, the text returned by the page's AnnotText for the
+// annotation, and the annotation's own contents.
+type HighlightedText struct {
+ Page int `json:"page"`
+ Text string `json:"text"`
+ Contents string `json:"contents,omitempty"`
+}
+
+func Open(filename string) (*GhlighDoc, error) {
+ var err error
+
+ g := &GhlighDoc{}
+
+ g.doc, err = poppler.Open(filename)
+ if err != nil {
+ fmt.Errorf("%s: error opening pdf %v", os.Args[0], err)
+ return nil, err
+ }
+ g.Path = filename
+ // HashDoc??
+
+ return g, nil
+}
+
+// Close clears the buffered hash and annotations and, when a poppler
+// document is attached, closes it. It does not take d.mu.
+func (d *GhlighDoc) Close() {
+	d.AnnotsBuffer, d.HashBuffer = nil, ""
+	if d.doc == nil {
+		return
+	}
+	d.doc.Close()
+}
+
+// Info returns the poppler.DocumentInfo reported by the underlying document.
+// Unlike Close, it does not check whether d.doc is nil.
+func (d *GhlighDoc) Info() poppler.DocumentInfo {
+ return d.doc.Info()
+}
+
+// tagExists reports whether text is exactly equal to one of the tags
+// returned by GetTags (i.e. compared without the ghligh prefix).
+func (d *GhlighDoc) tagExists(text string) bool {
+	existing := d.GetTags()
+	for i := range existing {
+		if existing[i] == text {
+			return true
+		}
+	}
+	return false
+}
+
+// Tag attaches text to the document as a ghligh tag, stored with the
+// ghlighFilter prefix. If the tag is already present nothing is added and a
+// warning is written to standard error.
+func (d *GhlighDoc) Tag(text string) {
+	if d.tagExists(text) {
+		fmt.Fprintf(os.Stderr, "warning: tag %s already exist inside %s, i don't do anything\n", text, d.Path)
+		return
+	}
+	d.doc.Tag(ghlighFilter + text)
+}
+
+// GetTags returns the contents of every annotation the document reports for
+// ghlighFilter, with the ghlighFilter prefix removed. The result is nil when
+// there are no such annotations.
+func (d *GhlighDoc) GetTags() []string {
+	var names []string
+	for _, tagAnnot := range d.doc.GetTags(ghlighFilter) {
+		names = append(names, strings.TrimPrefix(tagAnnot.Contents(), ghlighFilter))
+	}
+	return names
+}
+
+func (d *GhlighDoc) RemoveTags(tags []string) int {
+ zeroPage := d.doc.GetPage(0)
+ var removedTags int
+
+ annots := d.doc.GetTags(ghlighFilter)
+ for _, annot := range annots {
+ contents := strings.TrimPrefix(annot.Contents(), ghlighFilter)
+ for _, tag := range tags {
+ if tag == contents {
+ zeroPage.RemoveAnnot(*annot)
+ removedTags += 1
+ break
+ }
+ }
+ }
+ return removedTags
+}
+
+func (d *GhlighDoc) Import(annotsMap AnnotsMap) (int, error) {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ annots_count := 0
+
+ var err error
+ d.AnnotsBuffer = annotsMap
+
+ for key := range d.AnnotsBuffer {
+ page := d.doc.GetPage(key)
+ for _, annot := range d.AnnotsBuffer[key] {
+ a := d.jsonToAnnot(annot)
+ if !isInPage(a, page) {
+ annots_count += 1
+ page.AddAnnot(*a)
+ }
+
+ }
+ page.Close()
+ }
+
+ d.AnnotsBuffer = nil
+ return annots_count, err
+}
+
+// integrityCheck is an empty placeholder: it accepts two documents and does
+// nothing. Save performs its own hash comparison inline instead of calling it.
+func integrityCheck(tizio *GhlighDoc, caio *GhlighDoc) {
+
+}
+
+// Save writes the document to a temporary file, re-opens that file, checks
+// that its HashDoc equals the in-memory document's HashDoc, and then renames
+// the temporary file over d.Path. It returns true only when the rename
+// succeeded. d.mu is held for the whole call.
+//
+// The temporary file is created in d.Path's own directory (made absolute)
+// rather than the system temp directory, so the final os.Rename stays on one
+// filesystem. The temporary file handle and the verification document are
+// both closed before the rename; previously both were leaked.
+//
+// NOTE(review): os.CreateTemp creates the file with mode 0600, so the saved
+// PDF takes that mode instead of the original file's permissions.
+func (d *GhlighDoc) Save() (bool, error) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	dir, err := filepath.Abs(filepath.Dir(d.Path))
+	if err != nil {
+		return false, err
+	}
+
+	tempFile, err := os.CreateTemp(dir, ".ghligh_*.pdf")
+	if err != nil {
+		return false, err
+	}
+	tempPath := tempFile.Name()
+	// Harmless after a successful rename: the temp path no longer exists.
+	defer os.Remove(tempPath)
+
+	// Only the name is needed; poppler writes the file itself.
+	if err := tempFile.Close(); err != nil {
+		return false, err
+	}
+
+	ok, err := d.doc.Save(tempPath)
+	if !ok {
+		return false, err
+	}
+
+	/* integrity check */
+	newDoc, err := Open(tempPath)
+	if err != nil {
+		return false, err
+	}
+	newHash := newDoc.HashDoc()
+	newDoc.Close()
+
+	if newHash != d.HashDoc() {
+		return false, fmt.Errorf("After saving document %s to %s its hash doesn't correspond to the old one", d.Path, tempPath)
+	}
+
+	if err := os.Rename(tempPath, d.Path); err != nil {
+		return false, err
+	}
+
+	return true, nil
+}
+
+// Cat walks every page in order and returns one HighlightedText per
+// highlight annotation: the zero-based page index, the text the page reports
+// for the annotation, and the annotation's contents. Annotations of other
+// types are ignored. The result is nil when no highlight exists.
+func (d *GhlighDoc) Cat() []HighlightedText {
+	var out []HighlightedText
+
+	for pageIndex, total := 0, d.doc.GetNPages(); pageIndex < total; pageIndex++ {
+		page := d.doc.GetPage(pageIndex)
+		for _, annot := range page.GetAnnots() {
+			if annot.Type() != poppler.AnnotHighlight {
+				continue
+			}
+			out = append(out, HighlightedText{
+				Page:     pageIndex,
+				Text:     page.AnnotText(*annot),
+				Contents: annot.Contents(),
+			})
+		}
+		page.Close()
+	}
+
+	return out
+}
+
+// HasHighlights reports whether the document carries the ghligh tag "ls" or
+// has at least one highlight annotation on any page. Pages are scanned in
+// order and scanning stops at the first page containing a highlight.
+//
+// Each page is closed before the function returns, including on the early
+// "found" path, which previously returned without closing the page.
+func (d *GhlighDoc) HasHighlights() bool {
+	// check if is tagged with ls
+	if d.tagExists("ls") {
+		return true
+	}
+
+	// check if it has highlights
+	nPages := d.doc.GetNPages()
+	for i := 0; i < nPages; i++ {
+		page := d.doc.GetPage(i)
+
+		found := false
+		for _, annot := range page.GetAnnots() {
+			if annot.Type() == poppler.AnnotHighlight {
+				found = true
+				break
+			}
+		}
+
+		page.Close()
+		if found {
+			return true
+		}
+	}
+	return false
+}
+
+// GetAnnotsBuffer returns a freshly built AnnotsMap holding, for every page
+// that has at least one highlight annotation, the serialized form of those
+// highlights keyed by zero-based page index. Pages without highlights have
+// no entry. The map is never nil.
+func (d *GhlighDoc) GetAnnotsBuffer() AnnotsMap {
+	byPage := make(AnnotsMap)
+
+	total := d.doc.GetNPages()
+	for pageIndex := 0; pageIndex < total; pageIndex++ {
+		page := d.doc.GetPage(pageIndex)
+
+		var highlights []AnnotJSON
+		for _, annot := range page.GetAnnots() {
+			if annot.Type() != poppler.AnnotHighlight {
+				continue
+			}
+			highlights = append(highlights, annotToJson(*annot))
+		}
+
+		page.Close()
+
+		if len(highlights) == 0 {
+			continue
+		}
+		byPage[pageIndex] = highlights
+	}
+
+	return byPage
+}
diff --git a/document/hash.go b/document/hash.go
new file mode 100644
index 0000000..fe9520c
--- /dev/null
+++ b/document/hash.go
@@ -0,0 +1,104 @@
+package document
+
+import (
+ "fmt"
+ "os"
+
+ "unsafe"
+ "reflect"
+
+ "math"
+
+ "runtime"
+ "sync"
+
+ "crypto/hmac"
+ "crypto/sha256"
+
+ "github.com/scrotadamus/ghligh/go-poppler"
+)
+
+
+// ghlighKey is the fixed HMAC key HashDoc uses for its SHA-256 HMAC.
+var ghlighKey = []byte("ghligh-pdf-doc")
+
+// bufPool recycles the byte buffers HashDoc uses to hold page text. New
+// buffers start empty with a capacity of one OS memory page.
+var bufPool = sync.Pool{
+ New: func() interface{} {
+ return make([]byte, 0, os.Getpagesize())
+ },
+}
+
+// pageResult is what a HashDoc worker sends back: the page index it
+// processed and a buffer holding that page's text.
+type pageResult struct {
+ index int
+ buf []byte
+}
+
+
+// sqrtInt returns the square root of n, computed in floating point and
+// truncated to an int.
+func sqrtInt(n int) int {
+	root := math.Sqrt(float64(n))
+	return int(root)
+}
+
+// continueAt reports whether index i, out of n pages, falls in the leading
+// window (i < sqrtInt(n)/2) or the trailing window (i > n - sqrtInt(n)/2).
+// Rationale from the original author: it is very unlikely that a PDF gets
+// edited by adding a page in the center.
+//
+// NOTE(review): HashDoc uses this as the condition of its for loops, so
+// iteration ends at the first index for which it returns false and the
+// trailing window is never visited. For 0 <= n < 4, sqrtInt(n)/2 is 0 and
+// the function is already false at i == 0.
+func continueAt(i, n int) bool {
+	half := sqrtInt(n) / 2
+	if i < half {
+		return true
+	}
+	return i > n-half
+}
+
+// HashDoc derives an identifier for the document from its text: a
+// hex-encoded HMAC-SHA256, keyed with ghlighKey, over the text of each page
+// visited (in index order, each followed by its index as a single byte) and
+// finally the page count as a single byte.
+// TODO(original author): use page layout instead of text.
+//
+// Page text is extracted by goroutines, with at most runtime.NumCPU()+1
+// running at once; buffers come from and go back to bufPool.
+//
+// NOTE(review): pages are visited while continueAt(i, nPages) holds, so the
+// walk stops at the first rejected index; with fewer than 4 pages no page
+// text is hashed at all and the result depends only on the page count.
+// NOTE(review): byte(i) and byte(nPages) keep only the low 8 bits.
+// Both are left untouched here because changing them changes every hash.
+// NOTE(review): workers call d.doc.GetPage concurrently — confirm that
+// go-poppler allows concurrent access to one document.
+func (d *GhlighDoc) HashDoc() string {
+	pageCount := d.doc.GetNPages()
+
+	mac := hmac.New(sha256.New, ghlighKey)
+	collected := make(chan pageResult, pageCount)
+
+	var workers sync.WaitGroup
+	slots := make(chan struct{}, runtime.NumCPU()+1)
+
+	// extract reads one page's text into a pooled buffer and reports it.
+	extract := func(idx int) {
+		defer workers.Done()
+		defer func() { <-slots }()
+
+		page := d.doc.GetPage(idx)
+		text := page.Text()
+		page.Close()
+
+		buf := bufPool.Get().([]byte)[:0]
+		buf = append(buf, []byte(text)...)
+
+		collected <- pageResult{index: idx, buf: buf}
+	}
+
+	for idx := 0; continueAt(idx, pageCount); idx++ {
+		workers.Add(1)
+		slots <- struct{}{} // blocks while all worker slots are taken
+		go extract(idx)
+	}
+
+	workers.Wait()
+	close(collected)
+
+	// Re-order the results by page index so the hash is deterministic.
+	texts := make([][]byte, pageCount)
+	for res := range collected {
+		texts[res.index] = res.buf
+	}
+
+	for idx := 0; continueAt(idx, pageCount); idx++ {
+		mac.Write(texts[idx])
+		mac.Write([]byte{byte(idx)})
+		bufPool.Put(texts[idx])
+	}
+	mac.Write([]byte{byte(pageCount)})
+
+	return fmt.Sprintf("%x", mac.Sum(nil))
+}
+
+// rectToBytes returns a byte slice that views the memory of *r directly:
+// its length and capacity are unsafe.Sizeof(*r) and no copy is made, so the
+// slice aliases r.
+//
+// NOTE(review): the slice is built from a locally declared
+// reflect.SliceHeader whose Data field is a uintptr. The unsafe package
+// documentation says programs should not declare variables of this header
+// type, because the uintptr does not keep the referent alive. Replacing it
+// (e.g. with unsafe.Slice) would also require removing the file's reflect
+// import, so it is only flagged here.
+func rectToBytes(r *poppler.Rectangle) []byte {
+ size := int(unsafe.Sizeof(*r))
+
+ var sliceHeader reflect.SliceHeader
+ sliceHeader.Data = uintptr(unsafe.Pointer(r))
+ sliceHeader.Len = size
+ sliceHeader.Cap = size
+
+ // Reinterpret the header as a []byte value.
+ return *(*[]byte)(unsafe.Pointer(&sliceHeader))
+}
diff --git a/document/utils.go b/document/utils.go
new file mode 100644
index 0000000..c6593c4
--- /dev/null
+++ b/document/utils.go
@@ -0,0 +1,30 @@
+package document
+
+import (
+ "github.com/scrotadamus/ghligh/go-poppler"
+
+ "encoding/json"
+)
+
+// unmarshallHighlights decodes the "highlights" member of a JSON object into
+// an AnnotsMap. All other members are ignored. The map decoded so far is
+// returned together with any error from json.Unmarshal.
+func unmarshallHighlights(jsonData string) (AnnotsMap, error) {
+	var highlights AnnotsMap
+
+	// The envelope points at highlights so decoding fills it in place.
+	envelope := struct {
+		Highlights *AnnotsMap `json:"highlights"`
+	}{Highlights: &highlights}
+
+	if err := json.Unmarshal([]byte(jsonData), &envelope); err != nil {
+		return highlights, err
+	}
+	return highlights, nil
+}
+
+// isInPage reports whether page p already holds an annotation with the same
+// geometry as a, as decided by popplerAnnotsMatch.
+func isInPage(a *poppler.Annot, p *poppler.Page) bool {
+	existing := p.GetAnnots()
+	for i := 0; i < len(existing); i++ {
+		if popplerAnnotsMatch(a, existing[i]) {
+			return true
+		}
+	}
+
+	return false
+}