From 17fb6add26291b31f7020e3551a7c8487130a747 Mon Sep 17 00:00:00 2001
From: "F.O."
Date: Sun, 16 Feb 2025 17:56:08 +0100
Subject: genesi

---
 document/annot.go    |  88 +++++++++++++++++
 document/document.go | 279 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 document/hash.go     | 128 ++++++++++++++++++++++++
 document/utils.go    |  34 ++++++
 4 files changed, 529 insertions(+)
 create mode 100644 document/annot.go
 create mode 100644 document/document.go
 create mode 100644 document/hash.go
 create mode 100644 document/utils.go

(limited to 'document')

diff --git a/document/annot.go b/document/annot.go
new file mode 100644
index 0000000..e280cd0
--- /dev/null
+++ b/document/annot.go
@@ -0,0 +1,88 @@
+package document
+
+import (
+	"github.com/scrotadamus/ghligh/go-poppler"
+)
+
+// AnnotJSON is the JSON form of a poppler annotation. Each field mirrors the
+// poppler.Annot accessor of the same name (see annotToJson).
+type AnnotJSON struct {
+	Type     poppler.AnnotType `json:"type,omitempty"`
+	Index    int               `json:"index,omitempty"`
+	Date     string            `json:"date,omitempty"`
+	Rect     poppler.Rectangle `json:"rect,omitempty"`
+	Color    poppler.Color     `json:"color,omitempty"`
+	Name     string            `json:"name,omitempty"`
+	Contents string            `json:"contents,omitempty"`
+	Flags    poppler.AnnotFlag `json:"flags,omitempty"`
+	Quads    []poppler.Quad    `json:"quads,omitempty"`
+}
+
+// annotToJson snapshots the properties of a into an AnnotJSON value.
+func annotToJson(a poppler.Annot) AnnotJSON {
+	return AnnotJSON{
+		Type:     a.Type(),
+		Index:    a.Index(),
+		Date:     a.Date(),
+		Rect:     a.Rect(),
+		Color:    a.Color(),
+		Name:     a.Name(),
+		Contents: a.Contents(),
+		Flags:    a.Flags(),
+		Quads:    a.Quads(),
+	}
+}
+
+// jsonToAnnot creates a new highlight annotation on d from aJson's rectangle
+// and quads, then applies aJson's color, contents and flags. The annotation
+// type is always poppler.AnnotHighlight; aJson.Type, Index, Date and Name are
+// not used.
+func (d *GhlighDoc) jsonToAnnot(aJson AnnotJSON) *poppler.Annot {
+	// NOTE(review): the second result of NewAnnot is discarded. If it reports
+	// a failure, the setters below run on an annotation that was never
+	// created - confirm what NewAnnot returns and handle it.
+	annot, _ := d.doc.NewAnnot(poppler.AnnotHighlight, aJson.Rect, aJson.Quads)
+
+	annot.SetColor(aJson.Color)
+	annot.SetContents(aJson.Contents)
+	annot.SetFlags(aJson.Flags)
+
+	return &annot
+}
+
+// popplerAnnotsMatch reports whether a and b cover the same area: their
+// rectangles must be equal and their quad lists must have the same length and
+// the same corner points, in the same order. Coordinates are compared with
+// exact equality, and no other property (color, contents, ...) is considered.
+func popplerAnnotsMatch(a *poppler.Annot, b *poppler.Annot) bool {
+	aRect := a.Rect()
+	bRect := b.Rect()
+
+	aQuads := a.Quads()
+	bQuads := b.Quads()
+
+	if aRect.X1 != bRect.X1 ||
+		aRect.Y1 != bRect.Y1 ||
+		aRect.X2 != bRect.X2 ||
+		aRect.Y2 != bRect.Y2 {
+		return false
+	}
+
+	if len(aQuads) != len(bQuads) {
+		return false
+	}
+
+	for i := range aQuads {
+		q1 := aQuads[i]
+		q2 := bQuads[i]
+
+		if q1.P1.X != q2.P1.X || q1.P1.Y != q2.P1.Y ||
+			q1.P2.X != q2.P2.X || q1.P2.Y != q2.P2.Y ||
+			q1.P3.X != q2.P3.X || q1.P3.Y != q2.P3.Y ||
+			q1.P4.X != q2.P4.X || q1.P4.Y != q2.P4.Y {
+			return false
+		}
+	}
+
+	return true
+}
diff --git a/document/document.go b/document/document.go
new file mode 100644
index 0000000..a036eee
--- /dev/null
+++ b/document/document.go
@@ -0,0 +1,279 @@
+package document
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+
+	"github.com/scrotadamus/ghligh/go-poppler"
+)
+
+// ghlighFilter is the prefix that marks an annotation's contents as a ghligh
+// tag. Tag prepends it; GetTags and RemoveTags strip it.
+const ghlighFilter = "ghligh-Y2lhbm5v:"
+
+// AnnotsMap is the list of annotations mapped to the page index.
+// This is different from poppler's annot_mapping.
+type AnnotsMap map[int][]AnnotJSON
+
+// GhlighDoc wraps an open poppler document together with the data ghligh
+// serialises for it. doc and mu are unexported, so they are never encoded.
+type GhlighDoc struct {
+	doc *poppler.Document
+	mu  sync.Mutex // held by Import and Save
+
+	Path         string    `json:"file"`
+	HashBuffer   string    `json:"hash"`
+	AnnotsBuffer AnnotsMap `json:"highlights,omitempty"`
+}
+
+// HighlightedText is one highlight as returned by Cat: the index of the page
+// it sits on, the text under it and the annotation's contents.
+type HighlightedText struct {
+	Page     int    `json:"page"`
+	Text     string `json:"text"`
+	Contents string `json:"contents,omitempty"`
+}
+
+// Open loads the PDF at filename and returns a GhlighDoc whose Path is
+// filename; HashBuffer and AnnotsBuffer are left empty. On failure the
+// returned error wraps the one reported by poppler.
+func Open(filename string) (*GhlighDoc, error) {
+	g := &GhlighDoc{Path: filename}
+
+	var err error
+	g.doc, err = poppler.Open(filename)
+	if err != nil {
+		return nil, fmt.Errorf("%s: error opening pdf %w", os.Args[0], err)
+	}
+	// HashDoc??
+
+	return g, nil
+}
+
+// Close drops the buffered annotations and hash and closes the underlying
+// poppler document, if there is one.
+func (d *GhlighDoc) Close() {
+	d.AnnotsBuffer = nil
+	d.HashBuffer = ""
+	if d.doc != nil {
+		d.doc.Close()
+	}
+}
+
+// Info returns the document information reported by poppler.
+func (d *GhlighDoc) Info() poppler.DocumentInfo {
+	return d.doc.Info()
+}
+
+// tagExists reports whether text is one of the document's ghligh tags.
+func (d *GhlighDoc) tagExists(text string) bool {
+	for _, tag := range d.GetTags() {
+		if tag == text {
+			return true
+		}
+	}
+	return false
+}
+
+// Tag adds text to the document as a ghligh tag. If the tag is already
+// present nothing is added and a warning is written to standard error.
+func (d *GhlighDoc) Tag(text string) {
+	if d.tagExists(text) {
+		fmt.Fprintf(os.Stderr, "warning: tag %s already exist inside %s, i don't do anything\n", text, d.Path)
+		return
+	}
+	d.doc.Tag(ghlighFilter + text)
+}
+
+// GetTags returns the document's ghligh tags with the ghlighFilter prefix
+// removed. The result is nil when the document has no tags.
+func (d *GhlighDoc) GetTags() []string {
+	var tags []string
+	for _, annot := range d.doc.GetTags(ghlighFilter) {
+		tags = append(tags, strings.TrimPrefix(annot.Contents(), ghlighFilter))
+	}
+	return tags
+}
+
+// RemoveTags removes from page 0 every ghligh tag annotation whose text is
+// listed in tags and returns how many annotations were removed.
+func (d *GhlighDoc) RemoveTags(tags []string) int {
+	zeroPage := d.doc.GetPage(0)
+	// Release the page when done, as every other GetPage caller in this file does.
+	defer zeroPage.Close()
+
+	removedTags := 0
+	for _, annot := range d.doc.GetTags(ghlighFilter) {
+		contents := strings.TrimPrefix(annot.Contents(), ghlighFilter)
+		for _, tag := range tags {
+			if tag == contents {
+				zeroPage.RemoveAnnot(*annot)
+				removedTags++
+				break
+			}
+		}
+	}
+	return removedTags
+}
+
+// Import adds every annotation in annotsMap that is not already present on
+// its page (see isInPage) and returns the number of annotations added. Keys
+// of annotsMap are page indexes. The returned error is currently always nil.
+func (d *GhlighDoc) Import(annotsMap AnnotsMap) (int, error) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	annotsCount := 0
+	d.AnnotsBuffer = annotsMap
+
+	for key, annots := range d.AnnotsBuffer {
+		page := d.doc.GetPage(key)
+		for _, annot := range annots {
+			a := d.jsonToAnnot(annot)
+			if !isInPage(a, page) {
+				annotsCount++
+				page.AddAnnot(*a)
+			}
+		}
+		page.Close()
+	}
+
+	d.AnnotsBuffer = nil
+	return annotsCount, nil
+}
+
+// integrityCheck is an empty placeholder; it does nothing yet.
+func integrityCheck(tizio *GhlighDoc, caio *GhlighDoc) {
+}
+
+// Save writes the document to a temporary file, re-opens that file and
+// compares its HashDoc with the in-memory document's, and only then renames
+// the temporary file over d.Path. It returns true only when the replacement
+// succeeded.
+func (d *GhlighDoc) Save() (bool, error) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	// Create the temporary file next to the target: os.Rename cannot move a
+	// file across filesystems, which the default temp directory often is.
+	tempFile, err := os.CreateTemp(filepath.Dir(d.Path), ".ghligh_*.pdf")
+	if err != nil {
+		return false, err
+	}
+	tempPath := tempFile.Name()
+	// After a successful rename this removal fails harmlessly.
+	defer os.Remove(tempPath)
+
+	// Only the name is needed: Save below is given the path, not the handle.
+	if err := tempFile.Close(); err != nil {
+		return false, err
+	}
+
+	ok, err := d.doc.Save(tempPath)
+	if !ok {
+		if err == nil {
+			err = fmt.Errorf("saving document %s to %s failed", d.Path, tempPath)
+		}
+		return false, err
+	}
+
+	/* integrity check */
+	newDoc, err := Open(tempPath)
+	if err != nil {
+		return false, err
+	}
+	newHash := newDoc.HashDoc()
+	// Close before renaming so the saved copy is not left open.
+	newDoc.Close()
+
+	if newHash != d.HashDoc() {
+		return false, fmt.Errorf("after saving document %s to %s its hash doesn't correspond to the old one", d.Path, tempPath)
+	}
+
+	if err := os.Rename(tempPath, d.Path); err != nil {
+		return false, err
+	}
+
+	return true, nil
+}
+
+// Cat returns every highlight annotation of the document, in page order,
+// with the text page.AnnotText reports for it and its contents.
+func (d *GhlighDoc) Cat() []HighlightedText {
+	var highlights []HighlightedText
+
+	nPages := d.doc.GetNPages()
+	for i := 0; i < nPages; i++ {
+		page := d.doc.GetPage(i)
+		for _, annot := range page.GetAnnots() {
+			if annot.Type() != poppler.AnnotHighlight {
+				continue
+			}
+			highlights = append(highlights, HighlightedText{
+				Page:     i,
+				Text:     page.AnnotText(*annot),
+				Contents: annot.Contents(),
+			})
+		}
+		page.Close()
+	}
+	return highlights
+}
+
+// HasHighlights reports whether the document is tagged "ls" or has at least
+// one highlight annotation on any page.
+func (d *GhlighDoc) HasHighlights() bool {
+	// check if is tagged with ls
+	if d.tagExists("ls") {
+		return true
+	}
+
+	// check if it has highlights
+	nPages := d.doc.GetNPages()
+	for i := 0; i < nPages; i++ {
+		page := d.doc.GetPage(i)
+		found := false
+		for _, annot := range page.GetAnnots() {
+			if annot.Type() == poppler.AnnotHighlight {
+				found = true
+				break
+			}
+		}
+		// Close the page before returning so a hit does not leak it.
+		page.Close()
+
+		if found {
+			return true
+		}
+	}
+	return false
+}
+
+// GetAnnotsBuffer returns the highlight annotations of every page as an
+// AnnotsMap keyed by page index. Pages without highlights have no entry.
+func (d *GhlighDoc) GetAnnotsBuffer() AnnotsMap {
+	annotsOfPage := make(AnnotsMap)
+
+	n := d.doc.GetNPages()
+	for i := 0; i < n; i++ {
+		var annotsJSON []AnnotJSON
+		page := d.doc.GetPage(i)
+
+		for _, annot := range page.GetAnnots() {
+			if annot.Type() == poppler.AnnotHighlight {
+				annotsJSON = append(annotsJSON, annotToJson(*annot))
+			}
+		}
+
+		page.Close()
+
+		if len(annotsJSON) > 0 {
+			annotsOfPage[i] = annotsJSON
+		}
+	}
+
+	return annotsOfPage
+}
diff --git a/document/hash.go b/document/hash.go
new file mode 100644
index 0000000..fe9520c
--- /dev/null
+++ b/document/hash.go
@@ -0,0 +1,128 @@
+package document
+
+import (
+	"crypto/hmac"
+	"crypto/sha256"
+	"fmt"
+	"math"
+	"os"
+	"reflect"
+	"runtime"
+	"sync"
+	"unsafe"
+
+	"github.com/scrotadamus/ghligh/go-poppler"
+)
+
+// ghlighKey is the HMAC key used by HashDoc.
+var ghlighKey = []byte("ghligh-pdf-doc")
+
+// bufPool recycles the byte buffers HashDoc copies page text into; new
+// buffers start empty with a capacity of one memory page.
+var bufPool = sync.Pool{
+	New: func() interface{} {
+		return make([]byte, 0, os.Getpagesize())
+	},
+}
+
+// pageResult carries the text bytes of the page at index back to HashDoc.
+type pageResult struct {
+	index int
+	buf   []byte
+}
+
+// sqrtInt returns the square root of n converted (truncated) to an int.
+func sqrtInt(n int) int {
+	return int(math.Sqrt(float64(n)))
+}
+
+// continueAt reports whether page index i of an n-page document is below
+// sqrtInt(n)/2 or above n-sqrtInt(n)/2.
+//
+// NOTE(review): HashDoc uses this as a for-loop condition, so iteration
+// stops at the first index for which it is false; the trailing range is
+// never reached, and for n < 4 (sqrtInt(n)/2 == 0) no page is visited.
+// Changing this would change every hash, so it is left as is - confirm
+// whether that is intended.
+func continueAt(i, n int) bool {
+	// Very unlikely to edit a pdf and add a page in the center
+	half := sqrtInt(n) / 2
+	return i < half || i > n-half
+}
+
+// HashDoc returns a hex-encoded HMAC-SHA256, keyed with ghlighKey, over the
+// text of the pages selected by continueAt (each followed by the low byte of
+// its index) and finally the low byte of the page count.
+//
+// Page text is extracted by up to runtime.NumCPU()+1 goroutines at a time;
+// the digest is then fed sequentially in page order, so the result does not
+// depend on scheduling. TODO (from the original): use layout instead of text.
+//
+// NOTE(review): this assumes d.doc.GetPage and page.Text may be called from
+// several goroutines at once - confirm against go-poppler.
+func (d *GhlighDoc) HashDoc() string {
+	nPages := d.doc.GetNPages()
+
+	hmacHash := hmac.New(sha256.New, ghlighKey)
+	// Buffered for every page so a worker never blocks on its send.
+	resultsCh := make(chan pageResult, nPages)
+
+	var wg sync.WaitGroup
+
+	maxWorkers := runtime.NumCPU() + 1
+	sem := make(chan struct{}, maxWorkers)
+
+	for i := 0; continueAt(i, nPages); i++ {
+		wg.Add(1)
+		sem <- struct{}{}
+		go func(i int) {
+			defer wg.Done()
+			defer func() { <-sem }()
+
+			page := d.doc.GetPage(i)
+			text := page.Text()
+			page.Close()
+
+			buf := bufPool.Get().([]byte)
+			buf = buf[:0]
+			buf = append(buf, []byte(text)...)
+
+			resultsCh <- pageResult{index: i, buf: buf}
+		}(i)
+	}
+
+	wg.Wait()
+	close(resultsCh)
+
+	results := make([][]byte, nPages)
+	for res := range resultsCh {
+		results[res.index] = res.buf
+	}
+
+	// byte(i) and byte(nPages) keep only the low 8 bits of the value.
+	for i := 0; continueAt(i, nPages); i++ {
+		hmacHash.Write(results[i])
+		hmacHash.Write([]byte{byte(i)})
+		bufPool.Put(results[i])
+	}
+	hmacHash.Write([]byte{byte(nPages)})
+
+	return fmt.Sprintf("%x", hmacHash.Sum(nil))
+}
+
+// rectToBytes returns a byte slice that aliases the memory of *r (it is not
+// a copy), with length and capacity unsafe.Sizeof(*r).
+func rectToBytes(r *poppler.Rectangle) []byte {
+	size := int(unsafe.Sizeof(*r))
+
+	// Fill in the header of a real slice variable: package unsafe documents
+	// a directly declared reflect.SliceHeader as invalid because its Data
+	// field does not keep the referenced memory alive.
+	var b []byte
+	sliceHeader := (*reflect.SliceHeader)(unsafe.Pointer(&b))
+	sliceHeader.Data = uintptr(unsafe.Pointer(r))
+	sliceHeader.Len = size
+	sliceHeader.Cap = size
+
+	return b
+}
diff --git a/document/utils.go b/document/utils.go
new file mode 100644
index 0000000..c6593c4
--- /dev/null
+++ b/document/utils.go
@@ -0,0 +1,34 @@
+package document
+
+import (
+	"encoding/json"
+
+	"github.com/scrotadamus/ghligh/go-poppler"
+)
+
+// unmarshallHighlights decodes the "highlights" member of the JSON document
+// jsonData into an AnnotsMap. The map is returned together with any error
+// from json.Unmarshal.
+func unmarshallHighlights(jsonData string) (AnnotsMap, error) {
+	var annotsMap AnnotsMap
+
+	err := json.Unmarshal([]byte(jsonData), &struct {
+		Highlights *AnnotsMap `json:"highlights"`
+	}{
+		Highlights: &annotsMap,
+	})
+
+	return annotsMap, err
+}
+
+// isInPage reports whether page p already holds an annotation that matches a
+// according to popplerAnnotsMatch.
+func isInPage(a *poppler.Annot, p *poppler.Page) bool {
+	for _, annot := range p.GetAnnots() {
+		if popplerAnnotsMatch(a, annot) {
+			return true
+		}
+	}
+
+	return false
+}
-- 
cgit v1.2.3