diff options
author | F.O. <scrotadamus@insiberia.net> | 2025-02-16 17:56:08 +0100 |
---|---|---|
committer | F.O. <scrotadamus@insiberia.net> | 2025-02-16 17:56:57 +0100 |
commit | 17fb6add26291b31f7020e3551a7c8487130a747 (patch) | |
tree | d4559a7339ed181393ff921909e6ce05b7c2cf18 |
genesi
-rw-r--r-- | LICENSE.md | 338 | ||||
-rw-r--r-- | README.md | 25 | ||||
-rw-r--r-- | cmd/cat.go | 94 | ||||
-rw-r--r-- | cmd/export.go | 123 | ||||
-rw-r--r-- | cmd/hash.go | 98 | ||||
-rw-r--r-- | cmd/import.go | 203 | ||||
-rw-r--r-- | cmd/info.go | 94 | ||||
-rw-r--r-- | cmd/ls.go | 168 | ||||
-rw-r--r-- | cmd/root.go | 36 | ||||
-rw-r--r-- | cmd/tag/add.go | 88 | ||||
-rw-r--r-- | cmd/tag/remove.go | 87 | ||||
-rw-r--r-- | cmd/tag/show.go | 82 | ||||
-rw-r--r-- | cmd/tag/tag.go | 63 | ||||
-rw-r--r-- | document/annot.go | 76 | ||||
-rw-r--r-- | document/document.go | 236 | ||||
-rw-r--r-- | document/hash.go | 104 | ||||
-rw-r--r-- | document/utils.go | 30 | ||||
-rw-r--r-- | go-poppler/LICENSE.md | 363 | ||||
-rw-r--r-- | go-poppler/annot.go | 194 | ||||
-rw-r--r-- | go-poppler/document.go | 161 | ||||
-rw-r--r-- | go-poppler/image.go | 31 | ||||
-rw-r--r-- | go-poppler/page.go | 217 | ||||
-rw-r--r-- | go-poppler/poppler.go | 58 | ||||
-rw-r--r-- | go-poppler/tags.go | 57 | ||||
-rw-r--r-- | go-poppler/text.go | 22 | ||||
-rw-r--r-- | go-poppler/utils.go | 85 | ||||
-rw-r--r-- | go.mod | 13 | ||||
-rw-r--r-- | go.sum | 12 | ||||
-rw-r--r-- | main.go | 11 |
29 files changed, 3169 insertions, 0 deletions
diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..87c0f98 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,338 @@ +Copyright (c) 2025 Francesco Orlando + +GNU General Public License +========================== + +_Version 2, June 1991_ +_Copyright © 1989, 1991 Free Software Foundation, Inc.,_ +_51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA_ + +Everyone is permitted to copy and distribute verbatim copies +of this license document, but changing it is not allowed. + +### Preamble + +The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + +When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + +To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + +For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. 
And you must show them these terms so they know their +rights. + +We protect your rights with two steps: **(1)** copyright the software, and +**(2)** offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + +Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + +Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + +The precise terms and conditions for copying, distribution and +modification follow. + +### TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + +**0.** This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The “Program”, below, +refers to any such program or work, and a “work based on the Program” +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term “modification”.) Each licensee is addressed as “you”. + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + +**1.** You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + +**2.** You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + +* **a)** You must cause the modified files to carry prominent notices +stating that you changed the files and the date of any change. +* **b)** You must cause any work that you distribute or publish, that in +whole or in part contains or is derived from the Program or any +part thereof, to be licensed as a whole at no charge to all third +parties under the terms of this License. +* **c)** If the modified program normally reads commands interactively +when run, you must cause it, when started running for such +interactive use in the most ordinary way, to print or display an +announcement including an appropriate copyright notice and a +notice that there is no warranty (or else, saying that you provide +a warranty) and that users may redistribute the program under +these conditions, and telling the user how to view a copy of this +License. 
(Exception: if the Program itself is interactive but +does not normally print such an announcement, your work based on +the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. 
+ +**3.** You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + +* **a)** Accompany it with the complete corresponding machine-readable +source code, which must be distributed under the terms of Sections +1 and 2 above on a medium customarily used for software interchange; or, +* **b)** Accompany it with a written offer, valid for at least three +years, to give any third party, for a charge no more than your +cost of physically performing source distribution, a complete +machine-readable copy of the corresponding source code, to be +distributed under the terms of Sections 1 and 2 above on a medium +customarily used for software interchange; or, +* **c)** Accompany it with the information you received as to the offer +to distribute corresponding source code. (This alternative is +allowed only for noncommercial distribution and only if you +received the program in object code or executable form with such +an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + +**4.** You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + +**5.** You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + +**6.** Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. 
+ +**7.** If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. 
+ +**8.** If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + +**9.** The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and “any +later version”, you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + +**10.** If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + +### NO WARRANTY + +**11.** BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + +**12.** IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + +END OF TERMS AND CONDITIONS + +### How to Apply These Terms to Your New Programs + +If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + +To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the “copyright” line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w` and `show c` should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w` and `show c`; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a “copyright disclaimer” for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. 
+ + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..33c04f3 --- /dev/null +++ b/README.md @@ -0,0 +1,25 @@ +ghligh +====== + +ghligh can be used to manipulate pdf files in various ways. + +### Usage: +- ghligh [flags] +- ghligh [command] + +### Available Commands: +- `cat` cat shows highlights of pdf files +- `completion` Generate the autocompletion script for the specified shell +- `export` export pdf highlights into json +- `hash` display the ghligh hash used to identify a document [json] +- `help` Help about any command +- `import` import highlights from json file +- `info` display info about pdf documents [json] +- `ls` show files with highlights or tagged with 'ls' [unix] +- `tag` manage pdf tags + + +### Flags: + -h, --help help for ghligh + +Use `ghligh [command] --help` for more information about a command. diff --git a/cmd/cat.go b/cmd/cat.go new file mode 100644 index 0000000..688f10f --- /dev/null +++ b/cmd/cat.go @@ -0,0 +1,94 @@ +/* +Copyright © 2025 NAME HERE <EMAIL ADDRESS> +*/ +package cmd + +import ( + "fmt" + "os" + + "encoding/json" + + "github.com/scrotadamus/ghligh/document" + "github.com/spf13/cobra" +) + +// catCmd represents the cat command +var catCmd = &cobra.Command{ + Use: "cat", + Short: "cat prints highlights of pdf files [unix][json]", + Long: ` + ghligh cat file1.pdf file2.pdf ...
[--json] [-i] + + will show every highlights inside pdf files specified + if --json is set the output will be in json format + + if -i is set the json output will be indented +`, + Run: func(cmd *cobra.Command, args []string) { + + useJSON, err := cmd.Flags().GetBool("json") + if err != nil { + cmd.Help() + return + } + + indent, err := cmd.Flags().GetBool("indent") + if err != nil { + cmd.Help() + return + } + + jsonCat := make(map[string][]document.HighlightedText) + + // for every arg + for _, arg := range args { + doc, err := document.Open(arg) + if err != nil { + fmt.Fprintf(os.Stderr, "%v\n", err) + continue + } + + highlights := doc.Cat() + if !useJSON { + for _, highlight := range highlights { + if highlight.Contents != "" { + fmt.Printf("%s {{{%s}}}", highlight.Text, highlight.Contents) + } else { + fmt.Printf("%s", highlight.Text) + } + } + } else { + jsonCat[doc.Path] = highlights + } + + doc.Close() + } + + var jsonBytes []byte + if indent { + jsonBytes, err = json.MarshalIndent(jsonCat, "", " ") + } else { + jsonBytes, err = json.Marshal(jsonCat) + } + if err != nil { + panic(err) + } + fmt.Println(string(jsonBytes)) + }, +} + +func init() { + rootCmd.AddCommand(catCmd) + + // Here you will define your flags and configuration settings. 
+ + // Cobra supports Persistent Flags which will work for this command + // and all subcommands, e.g.: + // catCmd.PersistentFlags().String("foo", "", "A help for foo") + + // Cobra supports local flags which will only run when this command + // is called directly, e.g.: + catCmd.Flags().BoolP("json", "j", false, "print highlights as json") + catCmd.Flags().BoolP("indent", "i", false, "print highlights as json") +} diff --git a/cmd/export.go b/cmd/export.go new file mode 100644 index 0000000..aaf822c --- /dev/null +++ b/cmd/export.go @@ -0,0 +1,123 @@ +/* +Copyright © 2025 NAME HERE <EMAIL ADDRESS> +*/ +package cmd + +import ( + "encoding/json" + "fmt" + "os" + + "github.com/scrotadamus/ghligh/document" + "github.com/spf13/cobra" +) + +var outputFiles []string + +func writeJSONToFile(jsonBytes []byte, path string) error { + file, err := os.Create(path) + if err != nil { + return err + } + defer file.Close() + + // IF FILE EXISTS ASK + _, err = file.Write(jsonBytes) + if err != nil { + return err + } + + return nil +} + +// exportCmd represents the export command +var exportCmd = &cobra.Command{ + Use: "export", + Short: "export pdf highlights into json", + Long: ` + ghligh export foo.pdf bar.pdf ... 
[--to fnord.json] [-1] [-i] + + will create one or more json file (specified with --to) or dump it + to stdout (-1) + + -i will indent the json output +`, + + Run: func(cmd *cobra.Command, args []string) { + if len(args) == 0 { + cmd.Help() + return + } + + indent, err := cmd.Flags().GetBool("indent") + if err != nil { + cmd.Help() + return + } + + stdout, err := cmd.Flags().GetBool("stdout") + if err != nil { + cmd.Help() + return + } + + if !stdout && len(outputFiles) == 0 { + fmt.Fprintf(os.Stderr, "nowhere to put output I am not doing anything\n") + return + } + + var exportedDocs []document.GhlighDoc + for _, file := range args { + doc, err := document.Open(file) + if err != nil { + fmt.Fprintf(os.Stderr, "error loading %s: %v", file, err) + continue + } + + doc.AnnotsBuffer = doc.GetAnnotsBuffer() + doc.HashBuffer = doc.HashDoc() + exportedDocs = append(exportedDocs, *doc) + } + + var jsonBytes []byte + if indent { + jsonBytes, err = json.MarshalIndent(exportedDocs, "", " ") + } else { + jsonBytes, err = json.Marshal(exportedDocs) + } + if err != nil { + panic(err) + } + + for _, file := range outputFiles { + err := writeJSONToFile(jsonBytes, file) + if err != nil { + fmt.Fprintf(os.Stderr, "%v\n", err) + } + } + + if stdout { + fmt.Printf("%s\n", string(jsonBytes)) + } + + }, +} + +func init() { + rootCmd.AddCommand(exportCmd) + + // Here you will define your flags and configuration settings. 
+ + // Cobra supports Persistent Flags which will work for this command + // and all subcommands, e.g.: + // exportCmd.PersistentFlags().String("foo", "", "A help for foo") + + // Cobra supports local flags which will only run when this command + // is called directly, e.g.: + // exportCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle") + // TODO flag toFiles + exportCmd.Flags().BoolP("indent", "i", false, "indent the json data") + exportCmd.Flags().BoolP("stdout", "1", false, "dump to stdout") + + exportCmd.Flags().StringArrayVarP(&outputFiles, "to", "t", []string{}, "files to save exported annots") +} diff --git a/cmd/hash.go b/cmd/hash.go new file mode 100644 index 0000000..5185736 --- /dev/null +++ b/cmd/hash.go @@ -0,0 +1,98 @@ +/* +Copyright © 2025 NAME HERE <EMAIL ADDRESS> +*/ +package cmd + +import ( + "encoding/json" + "fmt" + "os" + + "sync" + + "github.com/scrotadamus/ghligh/document" + "github.com/spf13/cobra" +) + +// hashCmd represents the hash command +var hashCmd = &cobra.Command{ + Use: "hash", + Short: "display the ghligh hash used to identify a documet [json]", + Long: `the ghligh hash is used to identify documents with different filenames / annotations and it is calculated using the text of some pages. 
+ + ghligh hash file1.json file2.json [-i] + + -i will indent the json output + `, + Run: func(cmd *cobra.Command, args []string) { + if len(args) == 0 { + cmd.Help() + return + } + indent, err := cmd.Flags().GetBool("indent") + if err != nil { + cmd.Help() + return + } + + // var hashes []document.GhlighDoc + hashChan := make(chan document.GhlighDoc) + var wg sync.WaitGroup + wg.Add(len(args)) + + for _, arg := range args { + go func(arg string) { + defer wg.Done() + doc, err := document.Open(arg) + if err != nil { + fmt.Fprintf(os.Stderr, "error opening %s: %v\n", arg, err) + return + } + + // A little hacky, set hash after closing the document + //doc.Close() + doc.HashBuffer = doc.HashDoc() + + hashChan <- *doc + }(arg) + } + + go func() { + wg.Wait() + close(hashChan) + }() + + var hashes []document.GhlighDoc + for doc := range hashChan { + hashes = append(hashes, doc) + doc.Close() + } + + var jsonBytes []byte + if indent { + jsonBytes, err = json.MarshalIndent(hashes, "", " ") + } else { + jsonBytes, err = json.Marshal(hashes) + } + if err != nil { + panic(err) + } + fmt.Println(string(jsonBytes)) + + }, +} + +func init() { + rootCmd.AddCommand(hashCmd) + + // Here you will define your flags and configuration settings. 
+ + // Cobra supports Persistent Flags which will work for this command + // and all subcommands, e.g.: + // hashCmd.PersistentFlags().String("foo", "", "A help for foo") + + // Cobra supports local flags which will only run when this command + // is called directly, e.g.: + // hashCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle") + hashCmd.Flags().BoolP("indent", "i", false, "indent the json data") +} diff --git a/cmd/import.go b/cmd/import.go new file mode 100644 index 0000000..2bcb1d1 --- /dev/null +++ b/cmd/import.go @@ -0,0 +1,203 @@ +/* +Copyright © 2025 NAME HERE <EMAIL ADDRESS> +*/ +package cmd + +import ( + "encoding/json" + "fmt" + "io" + "os" + "sync" + + "github.com/scrotadamus/ghligh/document" + "github.com/spf13/cobra" + + "crypto/sha256" +) + +var inputFiles []string + +type importedAnnots struct { + // FIXME just internatl map[string]struct where + // struct contains map[string]bool and document.AnnotsMap + internal map[string]document.AnnotsMap + annotsHashes map[string]map[string]bool + mutex sync.Mutex +} + +func (ia *importedAnnots) get(hash string) document.AnnotsMap { + return ia.internal[hash] +} + +func (ia *importedAnnots) init(hash string) { + ia.mutex.Lock() + defer ia.mutex.Unlock() + if ia.internal[hash] == nil { + ia.internal[hash] = make(document.AnnotsMap) + } + if ia.annotsHashes[hash] == nil { + ia.annotsHashes[hash] = make(map[string]bool) + } +} + +func (ia *importedAnnots) check(docHash string, annotsHash string) bool { + ia.mutex.Lock() + defer ia.mutex.Unlock() + ok := ia.annotsHashes[docHash][annotsHash] + return ok +} + +func (ia *importedAnnots) insert(hash string, am document.AnnotsMap) error { + amHash, err := hashAnnotsMap(am) + if err != nil { + return err + } + + present := ia.check(hash, amHash) + if !present { + ia.mutex.Lock() + for key, value := range am { + ia.internal[hash][key] = append(ia.internal[hash][key], value...) 
+ ia.annotsHashes[hash][amHash] = true + } + ia.mutex.Unlock() + } + return nil +} + +func hashAnnotsMap(am document.AnnotsMap) (string, error) { + jsonBytes, err := json.Marshal(am) + if err != nil { + return "", err + } + + h := sha256.Sum256(jsonBytes) + return fmt.Sprintf("%x", h), nil +} + +func loadImportedAnnots(ia *importedAnnots, reader io.Reader) { + data, err := io.ReadAll(reader) + if err != nil { + fmt.Fprintf(os.Stderr, "could not read data: %v\n", err) + return + } + + var importedDocs []document.GhlighDoc + + err = json.Unmarshal(data, &importedDocs) + if err != nil { + fmt.Fprintf(os.Stderr, "%v\n", err) + return + } + + for _, importedDoc := range importedDocs { + hash := importedDoc.HashBuffer + ia.init(hash) + ia.insert(hash, importedDoc.AnnotsBuffer) + } +} + +// importCmd represents the import command +var importCmd = &cobra.Command{ + Use: "import", + Short: "import highlights from json file", + Long: ` + ghligh import foo.pdf bar.pdf ... [--from fnord.json] [--from kadio.json] [-0] [--save=false] + + will import into foo.pdf bar.pdf etc... 
the highlights from file specified + with the --from flag + + if -0 is set ghligh will read json from stdin + + --save=false will run without saving documents, it will just tells you how + many annotations from the json files specified will be imported +`, + + Run: func(cmd *cobra.Command, args []string) { + if len(args) == 0 { + cmd.Help() + return + } + + stdin, err := cmd.Flags().GetBool("stdin") + if err != nil { + cmd.Help() + return + } + + save, err := cmd.Flags().GetBool("save") + if err != nil { + cmd.Help() + return + } + + if stdin == false && len(inputFiles) == 0 { + fmt.Fprintf(os.Stderr, "nowhere to put output I am not doing anything\n") + return + } + + // Load Annot Maps + ia := importedAnnots{ + internal: make(map[string]document.AnnotsMap), + annotsHashes: make(map[string]map[string]bool), + } + + var wg sync.WaitGroup + + wg.Add(len(inputFiles)) + for _, file := range inputFiles { + //wg.Add(1) + go func(path string) { + defer wg.Done() + + f, err := os.Open(path) + if err != nil { + fmt.Fprintf(os.Stderr, "could not open file %s: %v\n", path, err) + return + } + defer f.Close() + + loadImportedAnnots(&ia, f) + }(file) + } + + wg.Wait() + + if stdin { + loadImportedAnnots(&ia, os.Stdin) + } + + // load from inputFiles + for _, file := range args { + doc, err := document.Open(file) + if err != nil { + fmt.Fprintf(os.Stderr, "error loading %s: %v", file, err) + continue + } + + hash := doc.HashDoc() + + num, err := doc.Import(ia.get(hash)) + if err != nil { + fmt.Fprintf(os.Stderr, "could not import highlights into %s: %v\n", file, err) + } else { + fmt.Fprintf(os.Stderr, "imported %d annots into %s\n", num, file) + if save { + doc.Save() + } + } + doc.Close() + + } + + }, +} + +func init() { + rootCmd.AddCommand(importCmd) + + importCmd.Flags().BoolP("stdin", "0", false, "read json from stdin") + importCmd.Flags().BoolP("save", "", true, "save the file with new annotation importer") + importCmd.Flags().StringArrayVarP(&inputFiles, "from", "f", 
[]string{}, "files to import annots from") +} diff --git a/cmd/info.go b/cmd/info.go new file mode 100644 index 0000000..7a33f62 --- /dev/null +++ b/cmd/info.go @@ -0,0 +1,94 @@ +/* +Copyright © 2025 NAME HERE <EMAIL ADDRESS> +*/ +package cmd + +import ( + "fmt" + "os" + + "sync" + + "encoding/json" + + "github.com/scrotadamus/ghligh/document" + "github.com/scrotadamus/ghligh/go-poppler" + "github.com/spf13/cobra" +) + +// infoCmd represents the info command +var infoCmd = &cobra.Command{ + Use: "info", + Short: "display info about pdf documents [json]", + Long: ` + ghligh info file1.pdf file2.pdf [-i] + + shows information about pdf (author, publisher, modification date, etc...) + -i will indent the json output + `, + Run: func(cmd *cobra.Command, args []string) { + if len(args) == 0 { + cmd.Help() + return + } + indent, err := cmd.Flags().GetBool("indent") + if err != nil { + cmd.Help() + return + } + + infoChan := make(chan poppler.DocumentInfo) + var wg sync.WaitGroup + wg.Add(len(args)) + + for _, arg := range args { + go func(arg string) { + defer wg.Done() + doc, err := document.Open(arg) + if err != nil { + fmt.Fprintf(os.Stderr, "error opening %s: %v\n", arg, err) + return + } + defer doc.Close() + infoChan <- doc.Info() + }(arg) + } + + go func() { + wg.Wait() + close(infoChan) + }() + + var infos []poppler.DocumentInfo + for info := range infoChan { + infos = append(infos, info) + } + + var jsonBytes []byte + if indent { + jsonBytes, err = json.MarshalIndent(infos, "", " ") + } else { + jsonBytes, err = json.Marshal(infos) + } + if err != nil { + panic(err) + } + fmt.Println(string(jsonBytes)) + + }, +} + +func init() { + rootCmd.AddCommand(infoCmd) + + // Here you will define your flags and configuration settings. 
var recursive bool

// resolver expands a list of paths into the files they designate and sends
// every result on ch.
type resolver struct {
	paths   []string
	recurse bool
	ctx     context.Context
	ch      chan<- string
	wg      sync.WaitGroup
}

// resolve starts one goroutine per entry of r.paths and returns immediately.
// A further goroutine closes r.ch once all of them have finished.
func (r *resolver) resolve() {
	for i := range r.paths {
		r.wg.Add(1)
		go r.resolvePath(r.paths[i])
	}

	go func() {
		r.wg.Wait()
		close(r.ch)
	}()
}

// resolvePath sends on r.ch the files designated by path.
//
// When path cannot be listed as a directory its absolute form is sent as is.
// Otherwise every regular file directly inside path is sent and, when
// r.recurse is set, each sub-directory is resolved in a new goroutine.
// Nothing more is started once r.ctx reports an error.
func (r *resolver) resolvePath(path string) {
	defer r.wg.Done()

	if r.ctx.Err() != nil {
		return
	}

	entries, readErr := os.ReadDir(path)
	if readErr != nil {
		// Not listable as a directory: hand the path over as a file.
		if abs, absErr := filepath.Abs(path); absErr == nil {
			r.ch <- abs
		}
		return
	}

	for _, entry := range entries {
		info, err := entry.Info()
		if err != nil {
			fmt.Fprintf(os.Stderr, "Error retrieving info for %s: %v\n", entry.Name(), err)
			continue
		}

		child := filepath.Join(path, entry.Name())

		switch {
		case info.IsDir():
			if !r.recurse {
				continue
			}
			if r.ctx.Err() != nil {
				return
			}
			r.wg.Add(1)
			go r.resolvePath(child)
		case info.Mode().IsRegular():
			r.ch <- child
		}
	}
}

// lsArgs returns args unchanged when it is not empty, and a one-element
// slice holding the current working directory otherwise. The process exits
// with status 1 when the working directory cannot be determined.
func lsArgs(args []string) []string {
	if len(args) > 0 {
		return args
	}

	cwd, err := os.Getwd()
	if err != nil {
		fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
	return []string{cwd}
}
checkFile(path string) bool { + doc, err := document.Open(path) + if err != nil { + return false + } + defer doc.Close() + + return doc.HasHighlights() +} + +// lsCmd represents the ls command +var lsCmd = &cobra.Command{ + Use: "ls", + Short: "show files with highlights or tagged with 'ls' [unix]", + Long: ` + ghligh ls file1.pdf directory [-R] [-c] + + will show every file inside directory that contains highlights or it is marked + ls with the ghligh tag add command + + ghligh ls # show files in current dir + ghligh ls file1.pdf # if it outpus file1.pdf it means that file1.pdf contains highlights + ghligh ls -c file1.pdf # same as ghligh ls file1.pdf but exit status will not be if file1.pdf doesnt + contains highlights + + ghligh ls -R # do it recursively, be careful with symlink dir cycles, as I am to lazy to + address that particular issue +`, + Run: func(cmd *cobra.Command, args []string) { + files := lsArgs(args) + ch := make(chan string) + ctx := context.Background() + + res := resolver{ + paths: files, + recurse: recursive, + ctx: ctx, + ch: ch, + } + + go res.resolve() + + var wg sync.WaitGroup + var found bool + for file := range ch { + wg.Add(1) + go func(f string) { + defer wg.Done() + if checkFile(f) { + found = true + fmt.Printf("%s\n", f) + } + }(file) + } + wg.Wait() + + check, err := cmd.Flags().GetBool("check") + if err != nil { + cmd.Help() + return + } + if check && !found { + os.Exit(1) + } + + }, +} + +func init() { + rootCmd.AddCommand(lsCmd) + + lsCmd.Flags().BoolVarP(&recursive, "recursive", "R", false, "List recursively") + lsCmd.Flags().BoolP("check", "c", false, "exit status is 1 if no file its found") + // order pdf by time of something (modification / creation) ??? 
+ //lsCmd.Flags().BoolP("time", "t", false, "ls by time") +} diff --git a/cmd/root.go b/cmd/root.go new file mode 100644 index 0000000..321253c --- /dev/null +++ b/cmd/root.go @@ -0,0 +1,36 @@ +/* +Copyright © 2025 Francesco Orlando scrotadamus@insiberia.net +*/ +package cmd + +import ( + "os" + + "github.com/scrotadamus/ghligh/cmd/tag" + "github.com/spf13/cobra" +) + +var rootCmd = &cobra.Command{ + Use: "ghligh", + Short: "pdf highlights swiss knife", + Long: `ghligh can be used to manipulate pdf files in various ways.`, + + Run: func(cmd *cobra.Command, args []string) { + cmd.Help() + return + }, +} + +// Execute adds all child commands to the root command and sets flags appropriately. +// This is called by main.main(). It only needs to happen once to the rootCmd. +func Execute() { + err := rootCmd.Execute() + if err != nil { + os.Exit(1) + } +} + +func init() { + //rootCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle") + rootCmd.AddCommand(tag.TagCmd) +} diff --git a/cmd/tag/add.go b/cmd/tag/add.go new file mode 100644 index 0000000..a5dc258 --- /dev/null +++ b/cmd/tag/add.go @@ -0,0 +1,88 @@ +/* +Copyright © 2025 NAME HERE <EMAIL ADDRESS> +*/ +package tag + +import ( + "fmt" + "io" + "os" + + "github.com/scrotadamus/ghligh/document" + "github.com/spf13/cobra" +) + +func readStdin() string { + data, err := io.ReadAll(os.Stdin) + if err != nil { + panic(err) + } + return string(data) +} + +// tagAddCmd represents the tag command +var tagAddCmd = &cobra.Command{ + Use: "add", + Short: "add a ghligh tag to a pdf file", + Long: `A longer description that spans multiple lines and likely contains examples +and usage of using your command. For example: + +Cobra is a CLI library for Go that empowers applications. 
+This application is a tool to generate the needed files +to quickly create a Cobra application.`, + Run: func(cmd *cobra.Command, args []string) { + if len(args) == 0 { + cmd.Help() + return + } + + stdin, err := cmd.Flags().GetBool("stdin") + if err != nil { + cmd.Help() + return + } + if stdin { + tags = append(tags, readStdin()) + } + if len(tags) == 0 { + fmt.Fprintf(os.Stderr, "Either --tag or --stdin is required\n", err) + os.Exit(1) + } + + for _, file := range args { + doc, err := document.Open(file) + if err != nil { + fmt.Fprintf(os.Stderr, "%v\n", err) + continue + } + for _, tag := range tags { + doc.Tag(tag) + fmt.Fprintf(os.Stderr, "added tag {{{%s}}} to %s\n", tag, file) + } + doc.Save() + doc.Close() + } + }, +} + +func init() { + TagCmd.AddCommand(tagAddCmd) + + // Here you will define your flags and configuration settings. + + // Cobra supports Persistent Flags which will work for this command + // and all subcommands, e.g.: + // tagCmd.PersistentFlags().String("foo", "", "A help for foo") + + // Cobra supports local flags which will only run when this command + // is called directly, e.g.: + // tagCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle") + // TODO add to "add" command + tagAddCmd.Flags().StringArrayVarP(&tags, "tag", "t", []string{}, "Tag da associare ai file (può essere usato più volte)") + tagAddCmd.Flags().BoolP("stdin", "0", false, "read tag from stdin") + + //if err := tagAddCmd.MarkFlagRequired("tag"); err != nil { + //panic(err) + //} + +} diff --git a/cmd/tag/remove.go b/cmd/tag/remove.go new file mode 100644 index 0000000..ca0f1b4 --- /dev/null +++ b/cmd/tag/remove.go @@ -0,0 +1,87 @@ +/* +Copyright © 2025 NAME HERE <EMAIL ADDRESS> + +*/ +package tag + +import ( + "fmt" + "os" + + "github.com/spf13/cobra" + "github.com/scrotadamus/ghligh/document" +) + +var tagRemoveCmd = &cobra.Command{ + Use: "remove", + Short: "remove ghligh tags from a pdf files using regex", + Long: `A longer description that spans 
multiple lines and likely contains examples +and usage of using your command. For example: + +Cobra is a CLI library for Go that empowers applications. +This application is a tool to generate the needed files +to quickly create a Cobra application.`, + Run: func(cmd *cobra.Command, args []string) { + if len(args) == 0 { + cmd.Help() + return + } + + if regex == "" && exact == "" { + fmt.Fprintf(os.Stderr, "either regex or exact must be set with --regex or --exact\n") + os.Exit(1) + } + + nosafe, err := cmd.Flags().GetBool("nosafe") + if err != nil { + cmd.Help() + return + } + + // just a little hack -> boundaries = nosafe ? "" : `\b` + boundaries := map[bool]string{true: "", false: `\b`}[nosafe] + regex = formatRegex(regex, boundaries) + + // if exact set overwrite regex + if exact != "" { + regex = `^` + exact + `$` + } + + for _, file := range(args){ + doc, err := document.Open(file) + if err != nil { + fmt.Fprintf(os.Stderr, "%v\n", err) + continue + } + + tags := regexSlice(regex, doc.GetTags()) + removedTags := doc.RemoveTags(tags) + doc.Save() + doc.Close() + + fmt.Printf("removed %d tags from %s\n", removedTags, doc.Path) + } + + }, +} + +func init() { + TagCmd.AddCommand(tagRemoveCmd) + + // Here you will define your flags and configuration settings. 
+ + // Cobra supports Persistent Flags which will work for this command + // and all subcommands, e.g.: + // removetagsCmd.PersistentFlags().String("foo", "", "A help for foo") + + // Cobra supports local flags which will only run when this command + // is called directly, e.g.: + // removetagsCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle") + tagRemoveCmd.Flags().StringVarP(®ex, "regex", "r", "", "regex") + tagRemoveCmd.Flags().StringVarP(&exact, "exact", "e", "", "exact") + tagRemoveCmd.Flags().BoolP("nosafe", "", false, "don't use safe boundaries around regex") + + //if err := tagRemoveCmd.MarkFlagRequired("regex"); err != nil { + //panic(err) + //} +} diff --git a/cmd/tag/show.go b/cmd/tag/show.go new file mode 100644 index 0000000..3d8ae62 --- /dev/null +++ b/cmd/tag/show.go @@ -0,0 +1,82 @@ +/* +Copyright © 2025 NAME HERE <EMAIL ADDRESS> + +*/ +package tag + +import ( + "fmt" + "os" + + "encoding/json" + "github.com/spf13/cobra" + "github.com/scrotadamus/ghligh/document" +) + +var tagShowCmd = &cobra.Command{ + Use: "show", + Short: "show ghligh tags of pdf files [json]", + Long: `A longer description that spans multiple lines and likely contains examples +and usage of using your command. For example: + +Cobra is a CLI library for Go that empowers applications. 
// formatRegex returns r with boundaries placed both before and after it.
func formatRegex(r string, boundaries string) string {
	prefixed := boundaries + r
	return prefixed + boundaries
}

// regexSlice returns the elements of slice that match regex, in their
// original order.
//
// An empty regex returns slice itself. When nothing matches the result is
// nil. A regex that does not compile makes the function panic with the
// compile error.
func regexSlice(regex string, slice []string) []string {
	if len(regex) == 0 {
		return slice
	}

	matcher, err := regexp.Compile(regex)
	if err != nil {
		panic(err)
	}

	var kept []string
	for i := range slice {
		if candidate := slice[i]; matcher.MatchString(candidate) {
			kept = append(kept, candidate)
		}
	}
	return kept
}
nil { + panic(err) + } + for _, s := range(slice){ + if re.MatchString(s){ + newSlice = append(newSlice, s) + } + } + + return newSlice +} + +// tagCmd represents the tag command +var TagCmd = &cobra.Command{ + Use: "tag", + Short: "manage pdf tags", + Long: `a tag is a string you can attach to a pdf +.`, +} + +func init() { + //rootCmd.AddCommand(tagCmd) + + // Here you will define your flags and configuration settings. + + // Cobra supports Persistent Flags which will work for this command + // and all subcommands, e.g.: + // tagCmd.PersistentFlags().String("foo", "", "A help for foo") + + // Cobra supports local flags which will only run when this command + // is called directly, e.g.: + // tagCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle") + // TODO add to "add" command + // tagCmd.Flags().StringArrayVarP(&tags, "tag", "t", []string{}, "Tag da associare ai file (può essere usato più volte)") +} diff --git a/document/annot.go b/document/annot.go new file mode 100644 index 0000000..e280cd0 --- /dev/null +++ b/document/annot.go @@ -0,0 +1,76 @@ +package document + +import ( + "github.com/scrotadamus/ghligh/go-poppler" +) + +type AnnotJSON struct { + Type poppler.AnnotType `json:"type,omitempty"` + Index int `json:"index,omitempty"` + Date string `json:"date,omitempty"` + Rect poppler.Rectangle `json:"rect,omitempty"` + Color poppler.Color `json:"color,omitempty"` + Name string `json:"name,omitempty"` + Contents string `json:"contents,omitempty"` + Flags poppler.AnnotFlag `json:"flags,omitempty"` + Quads []poppler.Quad `json:"quads,omitempty"` +} + +func annotToJson(a poppler.Annot) (AnnotJSON) { + var aj AnnotJSON + aj.Type = a.Type() + aj.Index = a.Index() + aj.Date = a.Date() + aj.Rect = a.Rect() + aj.Color = a.Color() + aj.Name = a.Name() + aj.Contents = a.Contents() + aj.Flags = a.Flags() + aj.Quads = a.Quads() + + return aj +} + +func (d *GhlighDoc) jsonToAnnot(aJson AnnotJSON) *poppler.Annot { + + annot, _ := 
d.doc.NewAnnot(poppler.AnnotHighlight, aJson.Rect, aJson.Quads) + + annot.SetColor(aJson.Color) + annot.SetContents(aJson.Contents) + annot.SetFlags(aJson.Flags) + + return &annot +} + +func popplerAnnotsMatch(a *poppler.Annot, b *poppler.Annot) bool { + aRect := a.Rect() + bRect := b.Rect() + + aQuads := a.Quads() + bQuads := b.Quads() + + if aRect.X1 != bRect.X1 || + aRect.Y1 != bRect.Y1 || + aRect.X2 != bRect.X2 || + aRect.Y2 != bRect.Y2 { + return false + } + + if len(aQuads) != len(bQuads) { + return false + } + + for i := range aQuads { + q1 := aQuads[i] + q2 := bQuads[i] + + if q1.P1.X != q2.P1.X || q1.P1.Y != q2.P1.Y || + q1.P2.X != q2.P2.X || q1.P2.Y != q2.P2.Y || + q1.P3.X != q2.P3.X || q1.P3.Y != q2.P3.Y || + q1.P4.X != q2.P4.X || q1.P4.Y != q2.P4.Y { + return false + } + } + + return true +} diff --git a/document/document.go b/document/document.go new file mode 100644 index 0000000..a036eee --- /dev/null +++ b/document/document.go @@ -0,0 +1,236 @@ +package document + +import ( + "github.com/scrotadamus/ghligh/go-poppler" + + "os" + "sync" + + "strings" + + "fmt" +) + +const ghlighFilter = "ghligh-Y2lhbm5v:" + +// This is different from poppler's annot_mapping +// it is the list of annotations mapped to the page index +type AnnotsMap map[int][]AnnotJSON + +type GhlighDoc struct { + doc *poppler.Document + mu sync.Mutex + + Path string `json:"file"` + HashBuffer string `json:"hash"` + AnnotsBuffer AnnotsMap `json:"highlights,omitempty"` +} + +type HighlightedText struct { + Page int `json:"page"` + Text string `json:"text"` + Contents string `json:"contents,omitempty"` +} + +func Open(filename string) (*GhlighDoc, error) { + var err error + + g := &GhlighDoc{} + + g.doc, err = poppler.Open(filename) + if err != nil { + fmt.Errorf("%s: error opening pdf %v", os.Args[0], err) + return nil, err + } + g.Path = filename + // HashDoc?? 
+ + return g, nil +} + +func (d *GhlighDoc) Close() { + d.AnnotsBuffer = nil + d.HashBuffer = "" + if d.doc != nil { + d.doc.Close() + } +} + +func (d *GhlighDoc) Info() poppler.DocumentInfo { + return d.doc.Info() +} + +func (d *GhlighDoc) tagExists(text string) bool { + for _, tag := range d.GetTags() { + if tag == text { + return true + } + } + return false +} + +func (d *GhlighDoc) Tag(text string) { + if !d.tagExists(text) { + d.doc.Tag(ghlighFilter + text) + } else { + fmt.Fprintf(os.Stderr, "warning: tag %s already exist inside %s, i don't do anything\n", text, d.Path) + } +} + +func (d *GhlighDoc) GetTags() []string { + var tags []string + annots := d.doc.GetTags(ghlighFilter) + for _, annot := range annots { + contents := strings.TrimPrefix(annot.Contents(), ghlighFilter) + tags = append(tags, contents) + } + return tags +} + +func (d *GhlighDoc) RemoveTags(tags []string) int { + zeroPage := d.doc.GetPage(0) + var removedTags int + + annots := d.doc.GetTags(ghlighFilter) + for _, annot := range annots { + contents := strings.TrimPrefix(annot.Contents(), ghlighFilter) + for _, tag := range tags { + if tag == contents { + zeroPage.RemoveAnnot(*annot) + removedTags += 1 + break + } + } + } + return removedTags +} + +func (d *GhlighDoc) Import(annotsMap AnnotsMap) (int, error) { + d.mu.Lock() + defer d.mu.Unlock() + annots_count := 0 + + var err error + d.AnnotsBuffer = annotsMap + + for key := range d.AnnotsBuffer { + page := d.doc.GetPage(key) + for _, annot := range d.AnnotsBuffer[key] { + a := d.jsonToAnnot(annot) + if !isInPage(a, page) { + annots_count += 1 + page.AddAnnot(*a) + } + + } + page.Close() + } + + d.AnnotsBuffer = nil + return annots_count, err +} + +func integrityCheck(tizio *GhlighDoc, caio *GhlighDoc) { + +} + +func (d *GhlighDoc) Save() (bool, error) { + d.mu.Lock() + defer d.mu.Unlock() + tempFile, err := os.CreateTemp("", ".ghligh_*.pdf") + if err != nil { + return false, err + } + defer os.Remove(tempFile.Name()) + + ok, err := 
d.doc.Save(tempFile.Name()) + if !ok { + return false, err + } + + /* integrity check */ + newDoc, err := Open(tempFile.Name()) + if err != nil { + return false, err + } + + if newDoc.HashDoc() != d.HashDoc() { + return false, fmt.Errorf("After saving document %s to %s its hash doesn't correspond the the old one", d.Path, tempFile.Name()) + } + + err = os.Rename(tempFile.Name(), d.Path) + if err != nil { + return false, err + } + + return true, nil +} + +func (d *GhlighDoc) Cat() []HighlightedText { + var highlights []HighlightedText + + n_pages := d.doc.GetNPages() + for i := 0; i < n_pages; i++ { + page := d.doc.GetPage(i) + annots := page.GetAnnots() + for _, annot := range annots { + if annot.Type() == poppler.AnnotHighlight { + annotText := page.AnnotText(*annot) + + highlights = append(highlights, HighlightedText{Page: i, Text: annotText, Contents: annot.Contents()}) + } + } + + page.Close() + } + return highlights +} + +func (d *GhlighDoc) HasHighlights() bool { + // check if is tagged with ls + if d.tagExists("ls") { + return true + } + + // check if it has highlights + n_pages := d.doc.GetNPages() + for i := 0; i < n_pages; i++ { + page := d.doc.GetPage(i) + annots := page.GetAnnots() + for _, annot := range annots { + if annot.Type() == poppler.AnnotHighlight { + return true + } + } + + page.Close() + } + return false +} + +func (d *GhlighDoc) GetAnnotsBuffer() AnnotsMap { + annots_json_of_page := make(AnnotsMap) + + n := d.doc.GetNPages() + var annots_json []AnnotJSON + for i := 0; i < n; i++ { + annots_json = nil + page := d.doc.GetPage(i) + + annots := page.GetAnnots() + for _, annot := range annots { + if annot.Type() == poppler.AnnotHighlight { + annot_json := annotToJson(*annot) + annots_json = append(annots_json, annot_json) + } + } + + page.Close() + + if len(annots_json) > 0 { + annots_json_of_page[i] = annots_json + } + } + + return annots_json_of_page +} diff --git a/document/hash.go b/document/hash.go new file mode 100644 index 
// sqrtInt returns the square root of n truncated to an int.
func sqrtInt(n int) int {
	root := math.Sqrt(float64(n))
	return int(root)
}

// continueAt reports whether index i of an n-page document is either below
// sqrtInt(n)/2 or above n - sqrtInt(n)/2.
func continueAt(i, n int) bool {
	// Very unlikely to edit a pdf and add a page in the center
	margin := sqrtInt(n) / 2
	if i < margin {
		return true
	}
	return i > n-margin
}
// generate identifier from document based on document text (use layout instead)
//
// HashDoc returns the hex-encoded HMAC-SHA256, keyed with ghlighKey, of the
// text of the selected pages. After each page's text one byte holding the
// page index is written, and one byte holding the page count is written at
// the end.
//
// NOTE(review): both loops run only while continueAt(i, nPages) is true, so
// they stop at the first index it rejects and the pages near the end of the
// document are never reached. For fewer than 4 pages sqrtInt(nPages)/2 is 0,
// so no page text is hashed at all. Confirm whether this is intended.
// NOTE(review): byte(i) and byte(nPages) keep only the low 8 bits.
func (d *GhlighDoc) HashDoc() string {
	nPages := d.doc.GetNPages()

	hmacHash := hmac.New(sha256.New, ghlighKey)
	// Buffered for one result per page so workers never block on send.
	resultsCh := make(chan pageResult, nPages)

	var wg sync.WaitGroup

	// sem bounds the number of pages being extracted at the same time.
	maxWorkers := runtime.NumCPU() + 1
	sem := make(chan struct{}, maxWorkers)

	for i := 0; continueAt(i, nPages); i++ {
		wg.Add(1)
		sem <- struct{}{}
		go func(i int) {
			defer wg.Done()
			defer func() { <-sem }()

			page := d.doc.GetPage(i)
			text := page.Text()
			page.Close()

			// Reuse a pooled buffer to hold a copy of the page text.
			buf := bufPool.Get().([]byte)
			buf = buf[:0]
			buf = append(buf, []byte(text)...)

			resultsCh <- pageResult{index: i, buf: buf}
		}(i)
	}

	wg.Wait()
	close(resultsCh)

	// Put the results back in page order: workers finish in any order.
	results := make([][]byte, nPages)
	for res := range resultsCh {
		results[res.index] = res.buf
	}

	for i := 0; continueAt(i, nPages); i++ {
		hmacHash.Write(results[i])
		hmacHash.Write([]byte{byte(i)})
		// The buffer has been consumed, hand it back to the pool.
		bufPool.Put(results[i])
	}
	hmacHash.Write([]byte{byte(nPages)})

	return fmt.Sprintf("%x", hmacHash.Sum(nil))
}

// rectToBytes returns a byte slice that aliases the memory of *r: its length
// and capacity are unsafe.Sizeof(*r) and no data is copied.
//
// NOTE(review): reflect.SliceHeader is deprecated in current Go releases in
// favour of unsafe.Slice; consider migrating.
func rectToBytes(r *poppler.Rectangle) []byte {
	size := int(unsafe.Sizeof(*r))

	var sliceHeader reflect.SliceHeader
	sliceHeader.Data = uintptr(unsafe.Pointer(r))
	sliceHeader.Len = size
	sliceHeader.Cap = size

	return *(*[]byte)(unsafe.Pointer(&sliceHeader))
}
+ +### Preamble + +The licenses for most software are designed to take away your freedom +to share and change it. By contrast, the GNU General Public License is +intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + +When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + +To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if +you distribute copies of the software, or if you modify it. + +For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + +We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + +Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, +we want its recipients to know that what they have is not the +original, so that any problems introduced by others will not reflect +on the original authors' reputations. + +Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at +all. + +The precise terms and conditions for copying, distribution and +modification follow. + +### TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + +**0.** This License applies to any program or other work which +contains a notice placed by the copyright holder saying it may be +distributed under the terms of this General Public License. The +"Program", below, refers to any such program or work, and a "work +based on the Program" means either the Program or any derivative work +under copyright law: that is to say, a work containing the Program or +a portion of it, either verbatim or with modifications and/or +translated into another language. (Hereinafter, translation is +included without limitation in the term "modification".) Each licensee +is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the Program +(independent of having been made by running the Program). Whether that +is true depends on what the Program does. 
+ +**1.** You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a +fee. + +**2.** You may modify your copy or copies of the Program or any +portion of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + +**a)** You must cause the modified files to carry prominent notices +stating that you changed the files and the date of any change. + + +**b)** You must cause any work that you distribute or publish, that in +whole or in part contains or is derived from the Program or any part +thereof, to be licensed as a whole at no charge to all third parties +under the terms of this License. + + +**c)** If the modified program normally reads commands interactively +when run, you must cause it, when started running for such interactive +use in the most ordinary way, to print or display an announcement +including an appropriate copyright notice and a notice that there is +no warranty (or else, saying that you provide a warranty) and that +users may redistribute the program under these conditions, and telling +the user how to view a copy of this License. (Exception: if the +Program itself is interactive but does not normally print such an +announcement, your work based on the Program is not required to print +an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + +**3.** You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + +**a)** Accompany it with the complete corresponding machine-readable +source code, which must be distributed under the terms of Sections 1 +and 2 above on a medium customarily used for software interchange; or, + + +**b)** Accompany it with a written offer, valid for at least three +years, to give any third party, for a charge no more than your cost of +physically performing source distribution, a complete machine-readable +copy of the corresponding source code, to be distributed under the +terms of Sections 1 and 2 above on a medium customarily used for +software interchange; or, + + +**c)** Accompany it with the information you received as to the offer +to distribute corresponding source code. 
(This alternative is allowed +only for noncommercial distribution and only if you received the +program in object code or executable form with such an offer, in +accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + +**4.** You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt otherwise +to copy, modify, sublicense or distribute the Program is void, and +will automatically terminate your rights under this License. However, +parties who have received copies, or rights, from you under this +License will not have their licenses terminated so long as such +parties remain in full compliance. + +**5.** You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + +**6.** Each time you redistribute the Program (or any work based on +the Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + +**7.** If, as a consequence of a court judgment or allegation of +patent infringement or for any other reason (not limited to patent +issues), conditions are imposed on you (whether by court order, +agreement or otherwise) that contradict the conditions of this +License, they do not excuse you from the conditions of this License. +If you cannot distribute so as to satisfy simultaneously your +obligations under this License and any other pertinent obligations, +then as a consequence you may not distribute the Program at all. For +example, if a patent license would not permit royalty-free +redistribution of the Program by all those who receive copies directly +or indirectly through you, then the only way you could satisfy both it +and this License would be to refrain entirely from distribution of the +Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + +**8.** If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + +**9.** The Free Software Foundation may publish revised and/or new +versions of the General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Program does not specify a +version number of this License, you may choose any version ever +published by the Free Software Foundation. 
+ +**10.** If you wish to incorporate parts of the Program into other +free programs whose distribution conditions are different, write to +the author to ask for permission. For software which is copyrighted by +the Free Software Foundation, write to the Free Software Foundation; +we sometimes make exceptions for this. Our decision will be guided by +the two goals of preserving the free status of all derivatives of our +free software and of promoting the sharing and reuse of software +generally. + +**NO WARRANTY** + +**11.** BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +**12.** IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. 
+ +### END OF TERMS AND CONDITIONS + +### How to Apply These Terms to Your New Programs + +If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these +terms. + +To do so, attach the following notices to the program. It is safest to +attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + one line to give the program's name and an idea of what it does. + Copyright (C) yyyy name of author + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +Also add information on how to contact you by electronic and paper +mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details + type `show w'. This is free software, and you are welcome + to redistribute it under certain conditions; type `show c' + for details. + +The hypothetical commands \`show w' and \`show c' should show the +appropriate parts of the General Public License. 
Of course, the +commands you use may be called something other than \`show w' and +\`show c'; they could even be mouse-clicks or menu items--whatever +suits your program. + +You should also get your employer (if you work as a programmer) or +your school, if any, to sign a "copyright disclaimer" for the program, +if necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright + interest in the program `Gnomovision' + (which makes passes at compilers) written + by James Hacker. + + signature of Ty Coon, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, +you may consider it more useful to permit linking proprietary +applications with the library. If this is what you want to do, use the +[GNU Lesser General Public +License](https://www.gnu.org/licenses/lgpl.html) instead of this +License. diff --git a/go-poppler/annot.go b/go-poppler/annot.go new file mode 100644 index 0000000..4527d83 --- /dev/null +++ b/go-poppler/annot.go @@ -0,0 +1,194 @@ +package poppler + +// #cgo pkg-config: poppler-glib +// #include <poppler.h> +// #include <glib.h> +// #include <cairo.h> +// +// /* macro wrappings */ +// gboolean wrap_POPPLER_IS_ANNOT_TEXT_MARKUP(PopplerAnnot *annot){ +// return POPPLER_IS_ANNOT_TEXT_MARKUP(annot); +// } +// PopplerAnnotTextMarkup *wrap_POPPLER_ANNOT_TEXT_MARKUP(PopplerAnnot *annot) { +// return POPPLER_ANNOT_TEXT_MARKUP(annot); +// } +import "C" + +import "unsafe" +//import "github.com/ungerik/go-cairo" + +// DEBUG +//import "fmt" + +type Point struct { + X, Y float64 +} +type Quad struct { + P1, P2, P3, P4 Point +} + +type Annot struct { + am *C.struct__PopplerAnnotMapping +} + +type AnnotType int + +const ( + AnnotUnknown AnnotType = iota + AnnotText + AnnotLink + AnnotFreeText + AnnotLine + AnnotSquare + AnnotCircle + AnnotPolygon + AnnotPolyLine + AnnotHighlight + AnnotUnderline + 
AnnotSquiggly + AnnotStrikeOut + AnnotStamp + AnnotCaret + AnnotInk + AnnotPopup + AnnotFileAttachment + AnnotSound + AnnotMovie + AnnotWidget + AnnotScreen + AnnotPrinterMark + AnnotTrapNet + AnnotWatermark + Annot3D +) + +type AnnotFlag int + +const AnnotFlagUnknown AnnotFlag = 0 +const ( + AnnotFlagInvisible AnnotFlag = 1 << iota + AnnotFlagHidden + AnnotFlagPrint + AnnotFlagNoZoom + AnnotFlagNoRotate + AnnotFlagNoView + AnnotFlagReadOnly + AnnotFlagLocked + AnnotFlagToggleNoView + AnnotFlagLockedContents +) + +func (a *Annot) Type() AnnotType { + t := C.poppler_annot_get_annot_type(a.am.annot) + return AnnotType(t) +} + +func (a *Annot) Index() int { + i := C.poppler_annot_get_page_index(a.am.annot) + return int(i) +} + +func (a *Annot) Date() string { + cText := C.poppler_annot_get_modified(a.am.annot) + return C.GoString(cText) +} + +func (a *Annot) Rect() Rectangle { + var r C.PopplerRectangle + C.poppler_annot_get_rectangle(a.am.annot, &r) + + rect := Rectangle{ + X1: float64(r.x1), + Y1: float64(r.y1), + X2: float64(r.x2), + Y2: float64(r.y2), + } + + return rect + +} + +func (a *Annot) Color() Color { + c := C.poppler_annot_get_color(a.am.annot) + if c == nil { + return Color{} + } + defer C.poppler_color_free(c) + + color := Color{ + R: int(c.red), + G: int(c.green), + B: int(c.blue), + } + + return color +} + +func (a *Annot) Name() string { + cText := C.poppler_annot_get_name(a.am.annot) + return C.GoString(cText) +} + +func (a *Annot) Contents() string { + //if a.am.annot == nil { + //return "" + //} + cText := C.poppler_annot_get_contents(a.am.annot) + //fmt.Printf("DEBUG poppler_annot_get_contents returned pointer: %v", cText) + if cText == nil { + return "" + } + return C.GoString(cText) +} + +func (a *Annot) Flags() AnnotFlag { + f := C.poppler_annot_get_flags(a.am.annot) + return AnnotFlag(f) +} + +func (a *Annot) Quads() []Quad { + if C.wrap_POPPLER_IS_ANNOT_TEXT_MARKUP(a.am.annot) == C.FALSE { + return nil + } + + + textMarkup := 
C.wrap_POPPLER_ANNOT_TEXT_MARKUP(a.am.annot) + + q := C.poppler_annot_text_markup_get_quadrilaterals(textMarkup) + + quads := gArrayToQuads(q) + + C.g_array_free(q, 1) + + return quads +} + +func (a *Annot) Close() { + if a.am != nil { + C.poppler_annot_mapping_free(a.am) + a.am = nil + } +} + +func (a *Annot) SetColor(c Color){ + pColor := C.poppler_color_new() + pColor.red = C.ushort(c.R) + pColor.green = C.ushort(c.G) + pColor.blue = C.ushort(c.B) + defer C.poppler_color_free(pColor) + + C.poppler_annot_set_color(a.am.annot, pColor ) +} + +func (a *Annot) SetContents(c string){ + cStr := C.CString(c) + defer C.free(unsafe.Pointer(cStr)) + + C.poppler_annot_set_contents(a.am.annot, cStr) +} + +func (a *Annot) SetFlags(f AnnotFlag){ + pFlags := C.PopplerAnnotFlag(f) + + C.poppler_annot_set_flags(a.am.annot, pFlags) +} diff --git a/go-poppler/document.go b/go-poppler/document.go new file mode 100644 index 0000000..58365ca --- /dev/null +++ b/go-poppler/document.go @@ -0,0 +1,161 @@ +package poppler + +// #cgo pkg-config: poppler-glib +// #include <poppler.h> +// #include <stdlib.h> +// #include <glib.h> +// #include <unistd.h> +import "C" + +import ( + "errors" + "unsafe" + "path/filepath" +) + +type Document struct { + doc poppDoc + openedPages []*Page +} + +type DocumentInfo struct { + PdfVersion string `json:"pdf_version,omitempty"` + Title string `json:"title,omitempty"` + Author string `json:"author,omitempty"` + Subject string `json:"subject,omitempty"` + KeyWords string `json:"keywords,omitempty"` + Creator string `json:"creator,omitempty"` + Producer string `json:"producer,omitempty"` + Metadata string `json:"metadata,omitempty"` + CreationDate int `json:"creation_date,omitempty"` + ModificationDate int `json:"modification_date,omitempty"` + Pages int `json:"pages_number,omitempty"` + IsLinearized bool `json:"is_linearized,omitempty"` +} + +func (d *Document) Info() DocumentInfo { + return DocumentInfo{ + PdfVersion: 
toString(C.poppler_document_get_pdf_version_string(d.doc)), + Title: toString(C.poppler_document_get_title(d.doc)), + Author: toString(C.poppler_document_get_author(d.doc)), + Subject: toString(C.poppler_document_get_subject(d.doc)), + KeyWords: toString(C.poppler_document_get_keywords(d.doc)), + Creator: toString(C.poppler_document_get_creator(d.doc)), + Producer: toString(C.poppler_document_get_producer(d.doc)), + Metadata: toString(C.poppler_document_get_metadata(d.doc)), + CreationDate: int(C.poppler_document_get_creation_date(d.doc)), + ModificationDate: int(C.poppler_document_get_modification_date(d.doc)), + Pages: int(C.poppler_document_get_n_pages(d.doc)), + IsLinearized: toBool(C.poppler_document_is_linearized(d.doc)), + } +} + +func (d *Document) GetNPages() int { + return int(C.poppler_document_get_n_pages(d.doc)) +} + +func (d *Document) GetPage(i int) (page *Page) { + p := C.poppler_document_get_page(d.doc, C.int(i)) + + page = &Page{ + p: p, + openedAnnots: nil, + } + d.openedPages = append(d.openedPages, page) + + return page +} + +func (d *Document) HasAttachments() bool { + return toBool(C.poppler_document_has_attachments(d.doc)) +} + +func (d *Document) GetNAttachments() int { + return int(C.poppler_document_get_n_attachments(d.doc)) +} + +func (d *Document) Close() { + + for i := 0; i < len(d.openedPages); i++ { + d.openedPages[i].Close() + } + d.openedPages = []*Page{} + + C.g_object_unref(C.gpointer(d.doc)) +} + + +func (d *Document) NewAnnot(t AnnotType, r Rectangle, q []Quad) (Annot, error) { + am := C.poppler_annot_mapping_new(); + + annot := Annot { + am: am, + } + + pRect := rectangleToPopplerRectangle(r) + + pQuad := quadsToGArray(q) + defer C.g_array_free(pQuad, 1) + + + switch (t){ + case AnnotHighlight: + am.annot = C.poppler_annot_text_markup_new_highlight(d.doc, &pRect, pQuad) + case AnnotUnderline: + am.annot = C.poppler_annot_text_markup_new_underline(d.doc, &pRect, pQuad) + case AnnotSquiggly: + am.annot = 
C.poppler_annot_text_markup_new_squiggly(d.doc, &pRect, pQuad) + case AnnotStrikeOut: + am.annot = C.poppler_annot_text_markup_new_strikeout(d.doc, &pRect, pQuad) + default: + C.poppler_annot_mapping_free(am) + return annot, errors.New("invalid type for new annotation") + } + + + if am.annot == nil { + C.poppler_annot_mapping_free(am) + return annot, errors.New("failed to create annotation") + } + + /* Can't get real annot mapping area as done in + * poppler_page_get_annot_mapping() since page is + * needed for page->page->getCropBox() and + * page->page->getRotate() + * + * as a placeholder we just use the annot rect + */ + annot.am.area = pRect + + return annot, nil +} + +func (d *Document) Save(filename string) (saved bool, err error) { + filename, err = filepath.Abs(filename) + if err != nil { + return false, err + } + + var e *C.GError + cFilename := (*C.gchar)(C.CString(filename)) + defer C.free(unsafe.Pointer(cFilename)) + + cUri := C.g_filename_to_uri(cFilename, nil, nil) + cBool := C.poppler_document_save (d.doc, cUri, &e); + if e != nil { + err = errors.New(C.GoString((*C.char)(e.message))) + return false, err + } + + if cBool == C.TRUE { + return true, nil + } + + return false, nil +} + +/* +func (d *Document) GetAttachments() []Attachment { + return +} +*/ diff --git a/go-poppler/image.go b/go-poppler/image.go new file mode 100644 index 0000000..ba2abc8 --- /dev/null +++ b/go-poppler/image.go @@ -0,0 +1,31 @@ +package poppler + +// #cgo pkg-config: poppler-glib +// #include <poppler.h> +// #include <glib.h> +import "C" +import ( + "unsafe" + + "github.com/ungerik/go-cairo" +) + +// Image + +type Image struct { + Id int + Area Rectangle + p *C.struct__PopplerPage +} + +type Rectangle struct { + X1, Y1, X2, Y2 float64 +} + +func (im *Image) GetSurface() (cs *cairo.Surface) { + ci := C.poppler_page_get_image(im.p, C.gint(im.Id)) + ctx := C.cairo_create(ci) + cip := (cairo.Cairo_surface)(unsafe.Pointer(ci)) + ctxp := 
(cairo.Cairo_context)(unsafe.Pointer(ctx)) + return cairo.NewSurfaceFromC(cip, ctxp) +} diff --git a/go-poppler/page.go b/go-poppler/page.go new file mode 100644 index 0000000..efd0706 --- /dev/null +++ b/go-poppler/page.go @@ -0,0 +1,217 @@ +package poppler + +// #cgo pkg-config: poppler-glib +// #include <poppler.h> +// #include <glib.h> +// #include <cairo.h> +import "C" +import "unsafe" +import "github.com/ungerik/go-cairo" + +//import "fmt" + +type Page struct { + p *C.struct__PopplerPage + openedAnnots []*Annot +} + +func (p *Page) Text() string { + return C.GoString(C.poppler_page_get_text(p.p)) +} + +func (p *Page) TextAttributes() (results []TextAttributes) { + a := C.poppler_page_get_text_attributes(p.p) + defer C.poppler_page_free_text_attributes(a) + var attr *C.PopplerTextAttributes + results = make([]TextAttributes, 0) + el := C.g_list_first(a) + for el != nil { + attr = (*C.PopplerTextAttributes)(el.data) + fn := attr.font_name + result := TextAttributes{ + FontName: toString(fn), + FontSize: float64(attr.font_size), + IsUnderlined: toBool(attr.is_underlined), + StartIndex: int(attr.start_index), + EndIndex: int(attr.end_index), + Color: Color{ + R: int(attr.color.red), + G: int(attr.color.green), + B: int(attr.color.blue), + }, + } + results = append(results, result) + el = el.next + } + return +} + +func (p *Page) Size() (width, height float64) { + var w, h C.double + C.poppler_page_get_size(p.p, &w, &h) + return float64(w), float64(h) +} + +func (p *Page) Index() int { + return int(C.poppler_page_get_index(p.p)) +} + +func (p *Page) Label() string { + return toString(C.poppler_page_get_label(p.p)) +} + +func (p *Page) Duration() float64 { + return float64(C.poppler_page_get_duration(p.p)) +} + +func (p *Page) Images() (results []Image) { + l := C.poppler_page_get_image_mapping(p.p) + defer C.poppler_page_free_image_mapping(l) + results = make([]Image, 0) + var im *C.PopplerImageMapping + for el := C.g_list_first(l); el != nil; el = el.next { + im 
= (*C.PopplerImageMapping)(el.data) + result := Image{ + Id: int(im.image_id), + Area: Rectangle{ + X1: float64(im.area.x1), + Y1: float64(im.area.y1), + X2: float64(im.area.x2), + Y2: float64(im.area.y2), + }, + p: p.p, + } + results = append(results, result) + } + return +} + +func (p *Page) TextLayout() (layouts []Rectangle) { + var rect *C.PopplerRectangle + var n C.guint + if toBool(C.poppler_page_get_text_layout(p.p, &rect, &n)) { + defer C.g_free((C.gpointer)(rect)) + layouts = make([]Rectangle, int(n)) + r := (*[1 << 30]C.PopplerRectangle)(unsafe.Pointer(rect))[:n:n] + for i := 0; i < int(n); i++ { + layouts[i] = Rectangle{ + X1: float64(r[i].x1), + Y1: float64(r[i].y1), + X2: float64(r[i].x2), + Y2: float64(r[i].y2), + } + } + } + return +} + +func (p *Page) TextLayoutAndAttrs() (result []TextEl) { + text := p.Text() + attrs := p.TextAttributes() + layout := p.TextLayout() + result = make([]TextEl, len(layout)) + attrsRef := make([]*TextAttributes, len(attrs)) + for i, a := range attrs { + attr := a + attrsRef[i] = &attr + } + i := 0 + for _, t := range text { + var a *TextAttributes + for _, a = range attrsRef { + if i >= a.StartIndex && i <= a.EndIndex { + break + } + } + result[i] = TextEl{ + Text: string(t), + Attrs: a, + Rect: layout[i], + } + i++ + } + return +} + +func (p *Page) Close() { + p.closeAnnotMappings() + + if p.p != nil { + C.g_object_unref(C.gpointer(p.p)) + /* avoid double free */ + p.p = nil + } +} + +// Converts a page into SVG and saves to file. +// Inspired by https://github.com/dawbarton/pdf2svg +func (p *Page) ConvertToSVG(filename string){ + width, height := p.Size() + + // Open the SVG file + surface := cairo.NewSVGSurface( filename, width, height, cairo.SVG_VERSION_1_2 ) + + // TODO Can be improved by using cairo_svg_surface_create_for_stream() instead of + // cairo_svg_surface_create() for stream processing instead of file processing. 
+ // However, this needs to be changed in github.com/ungerik/go-cairo/surface.go + + // Get cairo context pointer + _, drawcontext := surface.Native() + + // Render the PDF file into the SVG file + C.poppler_page_render_for_printing(p.p, (*C.cairo_t)(unsafe.Pointer(drawcontext)) ); + + // Close the SVG file + surface.ShowPage() + surface.Destroy() +} + +func (p *Page) closeAnnotMappings(){ + for i := 0; i < len(p.openedAnnots); i++ { + p.openedAnnots[i].Close() + } + + p.openedAnnots = nil + +} + +func (p *Page) GetAnnots() (Annots []*Annot) { + var annots []*Annot + + annotGlist := C.poppler_page_get_annot_mapping(p.p) + defer C.g_list_free(annotGlist) + + p.closeAnnotMappings() + + for annotGlist != nil { + popplerAnnot := (*C.PopplerAnnotMapping)(annotGlist.data) + + + annot := &Annot{ + am: popplerAnnot, + } + + /* Maybe we can used openedAnnots instead of annots + openedAnnots + */ + + annots = append(annots, annot) + p.openedAnnots = append(p.openedAnnots, annot) + + + annotGlist = annotGlist.next + } + + return annots +} + +func (p *Page) AnnotText(a Annot) string { + cText := C.poppler_page_get_text_for_area(p.p, &a.am.area) + return C.GoString(cText) +} + +func (p *Page) AddAnnot(a Annot) { + C.poppler_page_add_annot(p.p, a.am.annot) +} +func (p *Page) RemoveAnnot(a Annot) { + C.poppler_page_remove_annot(p.p, a.am.annot) +} diff --git a/go-poppler/poppler.go b/go-poppler/poppler.go new file mode 100644 index 0000000..eaa53c4 --- /dev/null +++ b/go-poppler/poppler.go @@ -0,0 +1,58 @@ +package poppler + +// #cgo pkg-config: poppler-glib +// #include <poppler.h> +// #include <stdlib.h> +// #include <glib.h> +// #include <unistd.h> +import "C" + +import ( + "errors" + "path/filepath" + "unsafe" +) + +type poppDoc *C.struct__PopplerDocument + +func Open(filename string) (doc *Document, err error) { + filename, err = filepath.Abs(filename) + if err != nil { + return + } + var e *C.GError + cfilename := (*C.gchar)(C.CString(filename)) + defer 
C.free(unsafe.Pointer(cfilename)) + fn := C.g_filename_to_uri(cfilename, nil, nil) + var d poppDoc + d = C.poppler_document_new_from_file((*C.char)(fn), nil, &e) + if e != nil { + err = errors.New(C.GoString((*C.char)(e.message))) + } + doc = &Document{ + doc: d, + openedPages: []*Page{}, + } + return +} + +func Load(data []byte) (doc *Document, err error) { + var e *C.GError + var d poppDoc + + b := C.g_bytes_new((C.gconstpointer)(unsafe.Pointer(&data[0])), (C.ulong)(len(data))) + defer C.g_bytes_unref(b) + + d = C.poppler_document_new_from_bytes(b, nil, &e) + if e != nil { + err = errors.New(C.GoString((*C.char)(e.message))) + } + doc = &Document{ + doc: d, + } + return +} + +func Version() string { + return C.GoString(C.poppler_get_version()) +} diff --git a/go-poppler/tags.go b/go-poppler/tags.go new file mode 100644 index 0000000..fd24f70 --- /dev/null +++ b/go-poppler/tags.go @@ -0,0 +1,57 @@ +package poppler + +// #cgo pkg-config: poppler-glib +// #include <poppler.h> +// #include <stdlib.h> +// #include <glib.h> +// #include <unistd.h> +import "C" +import "strings" + + + +var zeroRect = Rectangle{X1: 0, X2: 0, Y1: 0, Y2: 0} + +func (d *Document) Tag(text string){ + am := C.poppler_annot_mapping_new(); + + pRect := rectangleToPopplerRectangle(zeroRect) + + annot := Annot { + am: am, + } + defer annot.Close() + + am.annot = C.poppler_annot_text_new(d.doc, &pRect) + annot.SetContents(text) + annot.SetFlags(AnnotFlagHidden | AnnotFlagInvisible) + + zeroPage := d.GetPage(0) + zeroPage.AddAnnot(annot) + defer zeroPage.Close() +} + + +func (d *Document) GetTags(filter string) []*Annot { + page := d.GetPage(0) + //defer page.Close() + + annots := page.GetAnnots() + var tags []*Annot + for _, a := range(annots) { + if a.Type() == AnnotText && + rectEq(a.Rect(), zeroRect) && + a.Flags() & AnnotFlagHidden != 0 && + a.Flags() & AnnotFlagInvisible != 0 && + strings.HasPrefix(a.Contents(), filter){ + tags = append(tags, a) + } + } + + return tags +} + +func (d *Document) 
RemoveTags(filter string){ +// d.GetPage(0).AddAnnot(annot) + // TODO +} diff --git a/go-poppler/text.go b/go-poppler/text.go new file mode 100644 index 0000000..5550b64 --- /dev/null +++ b/go-poppler/text.go @@ -0,0 +1,22 @@ +package poppler + +import () + +type TextEl struct { + Text string + Attrs *TextAttributes + Rect Rectangle +} + +type TextAttributes struct { + FontName string + FontSize float64 + IsUnderlined bool + Color Color + StartIndex, EndIndex int +} + +type Color struct { + R, G, B int +} + diff --git a/go-poppler/utils.go b/go-poppler/utils.go new file mode 100644 index 0000000..4bda53a --- /dev/null +++ b/go-poppler/utils.go @@ -0,0 +1,85 @@ +package poppler + +// #cgo pkg-config: poppler-glib +// #include <poppler.h> +// #include <glib.h> +// #include <unistd.h> +// #include <stdlib.h> +import "C" + +import "unsafe" + +func toString(in *C.gchar) string { + return C.GoString((*C.char)(in)) +} + +func toBool(in C.gboolean) bool { + return int(in) > 0 +} + +/* convert a Quad struct to a GArray */ +func quadsToGArray(quads []Quad) *C.GArray { + garray := C.g_array_new(C.FALSE, C.FALSE, C.sizeof_PopplerQuadrilateral) + + for _, quad := range quads { + item := C.PopplerQuadrilateral{ + p1: C.PopplerPoint{ + x: C.double(quad.P1.X), + y: C.double(quad.P1.Y), + }, + p2: C.PopplerPoint{ + x: C.double(quad.P2.X), + y: C.double(quad.P2.Y), + }, + p3: C.PopplerPoint{ + x: C.double(quad.P3.X), + y: C.double(quad.P3.Y), + }, + p4: C.PopplerPoint{ + x: C.double(quad.P4.X), + y: C.double(quad.P4.Y), + }, + } + + C.g_array_append_vals(garray, C.gconstpointer(&item),1) + } + + return garray +} + +/* convert a GArray to a quad */ +func gArrayToQuads(q *C.GArray) []Quad { + length := int(q.len) + + quads := make([]Quad, length) + + for i := 0; i < length; i++ { + item := (*C.PopplerQuadrilateral)(unsafe.Pointer(uintptr(unsafe.Pointer(q.data)) + uintptr(i)*unsafe.Sizeof(C.PopplerQuadrilateral{}))) + quads[i] = Quad{ + P1: Point{X: float64(item.p1.x), Y: 
float64(item.p1.y)}, + P2: Point{X: float64(item.p2.x), Y: float64(item.p2.y)}, + P3: Point{X: float64(item.p3.x), Y: float64(item.p3.y)}, + P4: Point{X: float64(item.p4.x), Y: float64(item.p4.y)}, + } + } + + return quads +} + +func rectangleToPopplerRectangle (r Rectangle) C.PopplerRectangle { + var pRect C.PopplerRectangle + + pRect.x1 = C.double(r.X1) + pRect.y1 = C.double(r.Y1) + pRect.x2 = C.double(r.X2) + pRect.y2 = C.double(r.Y2) + + return pRect +} + +func rectEq(r1 Rectangle, r2 Rectangle) bool { + return r1.X1 == r2.X1 && + r1.X2 == r2.X2 && + r1.Y1 == r2.Y1 && + r1.Y2 == r2.Y2 +} @@ -0,0 +1,13 @@ +module github.com/scrotadamus/ghligh + +go 1.22.2 + +require ( + github.com/spf13/cobra v1.8.1 + github.com/ungerik/go-cairo v0.0.0-20240304075741-47de8851d267 +) + +require ( + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/spf13/pflag v1.0.5 // indirect +) @@ -0,0 +1,12 @@ +github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= +github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/ungerik/go-cairo v0.0.0-20240304075741-47de8851d267 h1:KA55kgg61iraQP4wSKIFRHwHIgDqim2Tvh8EXn7Udxw= +github.com/ungerik/go-cairo v0.0.0-20240304075741-47de8851d267/go.mod h1:yLTJg56omDJ+JVxZ5whpCrZgQdaSs+OBdFa+X6ViJcI= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 
v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -0,0 +1,11 @@ +/* +Copyright © 2025 Francesco Orlando scrotadamus@insiberia.net + +*/ +package main + +import "github.com/scrotadamus/ghligh/cmd" + +func main() { + cmd.Execute() +} |