summaryrefslogtreecommitdiff
path: root/document/hash.go
diff options
context:
space:
mode:
authorF.O. <scrotadamus@insiberia.net>2025-02-16 17:56:08 +0100
committerF.O. <scrotadamus@insiberia.net>2025-02-16 17:56:57 +0100
commit17fb6add26291b31f7020e3551a7c8487130a747 (patch)
treed4559a7339ed181393ff921909e6ce05b7c2cf18 /document/hash.go
genesi
Diffstat (limited to 'document/hash.go')
-rw-r--r--document/hash.go104
1 files changed, 104 insertions, 0 deletions
diff --git a/document/hash.go b/document/hash.go
new file mode 100644
index 0000000..fe9520c
--- /dev/null
+++ b/document/hash.go
@@ -0,0 +1,104 @@
+package document
+
+import (
+ "fmt"
+ "os"
+
+ "unsafe"
+ "reflect"
+
+ "math"
+
+ "runtime"
+ "sync"
+
+ "crypto/hmac"
+ "crypto/sha256"
+
+ "github.com/scrotadamus/ghligh/go-poppler"
+)
+
+
+// ghlighKey is the fixed key handed to hmac.New when HashDoc derives a
+// document identifier.
+var ghlighKey = []byte("ghligh-pdf-doc")
+
+// bufPool hands out reusable byte slices for holding extracted page text.
+// A freshly created slice is empty and has the capacity of one OS memory
+// page; callers are expected to reset the length after Get.
+var bufPool = sync.Pool{
+	New: func() interface{} {
+		pageSize := os.Getpagesize()
+		return make([]byte, 0, pageSize)
+	},
+}
+
+// pageResult carries the text of a single page from a worker goroutine back
+// to the code that assembles the hash.
+type pageResult struct {
+ // index is the page number the text was extracted from.
+ index int
+ // buf holds the page text as bytes.
+ buf []byte
+}
+
+
+// sqrtInt returns the integer part of the square root of n.
+//
+// Non-positive n yields 0. Without that guard a negative n would produce NaN
+// from math.Sqrt, and converting NaN to int gives an implementation-dependent
+// value.
+func sqrtInt(n int) int {
+	if n <= 0 {
+		return 0
+	}
+	return int(math.Sqrt(float64(n)))
+}
+
+// continueAt reports whether page i of an n-page document belongs to the
+// sample of pages that gets hashed.
+//
+// Pages in the middle are skipped on the assumption that a PDF is very
+// unlikely to be edited by inserting a page in the center; only a window of
+// pages at the start and an equally wide window at the end are kept. The
+// window is sqrtInt(n)/2 pages wide but never narrower than one page, so
+// short documents still contribute at least their first and last page.
+//
+// Indices outside [0, n) are never selected, which keeps the function safe to
+// use as a loop condition.
+func continueAt(i, n int) bool {
+	if i < 0 || i >= n {
+		return false
+	}
+
+	window := sqrtInt(n) / 2
+	if window < 1 {
+		window = 1
+	}
+
+	return i < window || i >= n-window
+}
+
+// HashDoc returns an identifier for the document: the hex-encoded
+// HMAC-SHA256, keyed with ghlighKey, of the text of a sample of pages, each
+// followed by its page index, and finally the total page count.
+//
+// Every page index in [0, nPages) is tested with continueAt and only the
+// selected pages are read, so pages selected near the end of the document
+// contribute as well as those at the start. Page text is extracted by a
+// bounded set of goroutines and then fed to the HMAC in ascending page order,
+// which keeps the result deterministic.
+//
+// TODO: derive the identifier from the page layout instead of the text.
+func (d *GhlighDoc) HashDoc() string {
+	nPages := d.doc.GetNPages()
+
+	// Pick the pages up front so the channel and the result map can be sized
+	// by the sample rather than by the whole document.
+	var sampled []int
+	for i := 0; i < nPages; i++ {
+		if continueAt(i, nPages) {
+			sampled = append(sampled, i)
+		}
+	}
+
+	// Buffered for every result, so workers never block on send and the
+	// channel can be drained after they have all finished.
+	resultsCh := make(chan pageResult, len(sampled))
+
+	// sem caps the number of pages being extracted at the same time.
+	sem := make(chan struct{}, runtime.NumCPU()+1)
+
+	var wg sync.WaitGroup
+	for _, pageIdx := range sampled {
+		wg.Add(1)
+		sem <- struct{}{}
+		go func(pageIdx int) {
+			defer wg.Done()
+			defer func() { <-sem }()
+
+			// NOTE(review): assumes d.doc tolerates concurrent GetPage/Text
+			// calls — confirm against go-poppler.
+			page := d.doc.GetPage(pageIdx)
+			text := page.Text()
+			page.Close()
+
+			// Appending the string directly avoids an intermediate []byte copy.
+			buf := bufPool.Get().([]byte)
+			buf = append(buf[:0], text...)
+
+			resultsCh <- pageResult{index: pageIdx, buf: buf}
+		}(pageIdx)
+	}
+
+	wg.Wait()
+	close(resultsCh)
+
+	texts := make(map[int][]byte, len(sampled))
+	for res := range resultsCh {
+		texts[res.index] = res.buf
+	}
+
+	hmacHash := hmac.New(sha256.New, ghlighKey)
+
+	// writeInt feeds v to the HMAC as eight big-endian bytes, so values above
+	// 255 are not truncated.
+	writeInt := func(v int) {
+		u := uint64(v)
+		hmacHash.Write([]byte{
+			byte(u >> 56), byte(u >> 48), byte(u >> 40), byte(u >> 32),
+			byte(u >> 24), byte(u >> 16), byte(u >> 8), byte(u),
+		})
+	}
+
+	for _, pageIdx := range sampled {
+		hmacHash.Write(texts[pageIdx])
+		writeInt(pageIdx)
+		bufPool.Put(texts[pageIdx])
+	}
+	writeInt(nPages)
+
+	return fmt.Sprintf("%x", hmacHash.Sum(nil))
+}
+
+// rectToBytes returns a byte slice that views the memory of *r directly. Its
+// length and capacity are unsafe.Sizeof(*r) and nothing is copied, so the
+// slice aliases r: writes through either are visible through the other. A nil
+// r yields a nil slice.
+//
+// The slice header is filled in through a *reflect.SliceHeader pointing at a
+// real []byte variable. A free-standing reflect.SliceHeader value must not be
+// used for this: its Data field is a plain uintptr, so it would not keep the
+// rectangle reachable for the garbage collector.
+func rectToBytes(r *poppler.Rectangle) []byte {
+	if r == nil {
+		return nil
+	}
+
+	size := int(unsafe.Sizeof(*r))
+
+	var out []byte
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&out))
+	hdr.Data = uintptr(unsafe.Pointer(r))
+	hdr.Len = size
+	hdr.Cap = size
+
+	return out
+}