summaryrefslogtreecommitdiff
path: root/document/hash.go
blob: fe9520cf893882edbf89b8a821b11f8e2b557a25 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
package document

import (
	"fmt"
	"os"

	"unsafe"
	"reflect"

	"math"

	"runtime"
	"sync"

	"crypto/hmac"
	"crypto/sha256"

	"github.com/scrotadamus/ghligh/go-poppler"
)


// ghlighKey is the fixed key HashDoc passes to HMAC-SHA256 when deriving a
// document identifier. Changing it changes every hash HashDoc produces.
var ghlighKey = []byte("ghligh-pdf-doc")

// bufPool recycles the byte slices that hold extracted page text while a
// document is being hashed. Freshly created slices are empty and have a
// capacity of one OS memory page (os.Getpagesize()).
var bufPool = sync.Pool{
	New: func() interface{} {
		return make([]byte, 0, os.Getpagesize())
	},
}

// pageResult pairs the text bytes extracted from one page with that page's
// index, so results that arrive out of order can be put back in page order.
type pageResult struct {
	index int    // zero-based page index the text was read from
	buf   []byte // text of the page, held in a slice taken from bufPool
}


// sqrtInt returns the integer square root of n, i.e. the largest r such that
// r*r <= n. It returns 0 for n <= 0.
//
// The floating-point square root is only used as a first estimate: float64
// cannot represent every large int exactly, so the estimate may be off by one.
// The two correction loops below fix that using integer division, which avoids
// overflowing r*r. Negative inputs are rejected up front because math.Sqrt
// would yield NaN, and converting NaN to int is implementation-defined.
func sqrtInt(n int) int {
	if n <= 0 {
		return 0
	}

	r := int(math.Sqrt(float64(n)))

	// r*r > n  <=>  r > n/r (integer division, r > 0): estimate too high.
	for r > 0 && r > n/r {
		r--
	}
	// (r+1)*(r+1) <= n  <=>  r+1 <= n/(r+1): estimate too low.
	for r+1 <= n/(r+1) {
		r++
	}
	return r
}

// continueAt reports whether page index i of an n-page document falls in the
// sampled edge regions: below sqrtInt(n)/2, or above n - sqrtInt(n)/2.
//
// Rationale: a PDF is very unlikely to be edited by inserting a page in its
// middle, so pages near the centre are left out.
func continueAt(i, n int) bool {
	margin := sqrtInt(n) / 2
	if i < margin {
		return true
	}
	return i > n-margin
}

// HashDoc returns an identifier for the document: the hex-encoded HMAC-SHA256
// (keyed with ghlighKey) of the text of the selected pages, each followed by
// one byte holding its page index, and finally one byte holding the page count.
//
// Page text is extracted concurrently, with at most runtime.NumCPU()+1
// extractions in flight. The texts are then fed to the HMAC in ascending page
// order, so the result does not depend on goroutine scheduling.
//
// Because continueAt is used as the loop condition, iteration ends at the
// first index it rejects. Counting up from 0 that index is sqrtInt(nPages)/2,
// so only that many leading pages contribute (none when nPages < 4).
// Index and page count are converted with byte(), so only their low 8 bits
// reach the hash.
//
// NOTE(review): continueAt also accepts indices near the end of the document,
// which suggests the trailing pages were meant to be hashed too. Confirm
// before changing anything here: any change alters the identifiers produced.
//
// TODO (carried over from the original comment): use layout instead of text.
func (d *GhlighDoc) HashDoc() string {
	pageCount := d.doc.GetNPages()

	mac := hmac.New(sha256.New, ghlighKey)
	extracted := make(chan pageResult, pageCount)

	var pending sync.WaitGroup

	// slots bounds how many extractions run at the same time.
	slots := make(chan struct{}, runtime.NumCPU()+1)

	// extract reads the text of one page into a pooled buffer and publishes it.
	extract := func(idx int) {
		defer pending.Done()
		defer func() { <-slots }()

		pg := d.doc.GetPage(idx)
		content := pg.Text()
		pg.Close()

		scratch := bufPool.Get().([]byte)
		extracted <- pageResult{
			index: idx,
			buf:   append(scratch[:0], []byte(content)...),
		}
	}

	for idx := 0; continueAt(idx, pageCount); idx++ {
		pending.Add(1)
		slots <- struct{}{} // blocks while every slot is taken
		go extract(idx)
	}

	// The channel is buffered for pageCount results, so workers never block
	// on send and it is safe to wait for all of them before draining.
	pending.Wait()
	close(extracted)

	ordered := make([][]byte, pageCount)
	for r := range extracted {
		ordered[r.index] = r.buf
	}

	for idx := 0; continueAt(idx, pageCount); idx++ {
		text := ordered[idx]
		mac.Write(text)
		mac.Write([]byte{byte(idx)})
		bufPool.Put(text)
	}
	mac.Write([]byte{byte(pageCount)})

	return fmt.Sprintf("%x", mac.Sum(nil))
}

// rectToBytes returns a byte-slice view of the memory occupied by *r.
//
// The returned slice has length and capacity unsafe.Sizeof(*r) and aliases
// r's storage: nothing is copied, so writes through the slice modify *r and
// later changes to *r are visible through the slice. It returns nil when r is
// nil.
//
// The slice header is filled in through a *reflect.SliceHeader that points at
// a real []byte variable. That is the only use of reflect.SliceHeader the
// unsafe package documents as valid; a free-standing SliceHeader value keeps
// the address in a plain uintptr that the garbage collector does not treat as
// a reference.
func rectToBytes(r *poppler.Rectangle) []byte {
	if r == nil {
		return nil
	}

	size := int(unsafe.Sizeof(*r))

	var view []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&view))
	// The pointer-to-uintptr conversion must stay in the same statement as the
	// store into Data for the pattern to be valid.
	hdr.Data = uintptr(unsafe.Pointer(r))
	hdr.Cap = size
	hdr.Len = size

	return view
}