Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use json to store cmaps #940

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@ MKDIR=mkdir
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPSRC=cmaprsrc
CMAPDST=pdfminer/cmap
cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.json.gz $(CMAPDST)/to-unicode-Adobe-GB1.json.gz \
$(CMAPDST)/to-unicode-Adobe-Japan1.json.gz $(CMAPDST)/to-unicode-Adobe-Korea1.json.gz
cmap_clean:
-$(RM) -r $(CMAPDST)
$(CMAPDST):
$(MKDIR) $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-CNS1.json.gz: $(CMAPDST)
$(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-GB1.json.gz: $(CMAPDST)
$(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-Japan1.json.gz: $(CMAPDST)
$(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-Korea1.json.gz: $(CMAPDST)
$(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
118 changes: 61 additions & 57 deletions tools/conv_cmap.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#!/usr/bin/env python3

import argparse
import codecs
import pickle as pickle
import gzip
import json
import sys
from pathlib import Path
from typing import List, Any


class CMapConverter:
Expand Down Expand Up @@ -137,68 +140,69 @@ def dump_cmap(self, fp, enc):
IS_VERTICAL=self.is_vertical.get(enc, False),
CODE2CID=self.code2cid.get(enc),
)
fp.write(pickle.dumps(data, 2))
return
json.dump(data, fp)

def dump_unicodemap(self, fp):
data = dict(
CID2UNICHR_H=self.cid2unichr_h,
CID2UNICHR_V=self.cid2unichr_v,
)
fp.write(pickle.dumps(data, 2))
return


def main(argv):
import getopt
import gzip
import os.path

def usage():
print(
"usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]" % argv[0]
)
return 100

try:
(opts, args) = getopt.getopt(argv[1:], "c:")
except getopt.GetoptError:
return usage()
enc2codec = {}
for (k, v) in opts:
if k == "-c":
(enc, _, codec) = v.partition("=")
enc2codec[enc] = codec
if not args:
return usage()
outdir = args.pop(0)
if not args:
return usage()
regname = args.pop(0)

converter = CMapConverter(enc2codec)
for path in args:
print("reading: %r..." % path)
fp = open(path)
converter.load(fp)
fp.close()

json.dump(data, fp)


def create_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the cmap conversion script.

    The resulting namespace exposes:

    * ``encoding_codec`` -- list of raw ``enc=codec`` strings, one per ``-c``
      occurrence (empty list when the option is not given).
    * ``output_dir`` -- :class:`~pathlib.Path` of the output directory.
    * ``regname`` -- registry name as a plain string.
    * ``cid2code`` -- list of :class:`~pathlib.Path` input files (may be empty).

    Returns:
        A configured :class:`argparse.ArgumentParser`.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Convert Adobe cid2code tables into gzip-compressed JSON cmap files."
        ),
    )
    parser.add_argument(
        "--encoding-codec",
        "-c",
        type=str,
        action="append",
        default=[],
        help="Specify the codec of an encoding. Use `enc=codec` as a value.",
    )
    parser.add_argument(
        "output_dir",
        type=Path,
        help="Directory where the compressed cmap's are stored.",
    )
    parser.add_argument(
        "regname",
        type=str,
        # The registry name only ends up in the unicode-map file name.
        help=(
            "Registry name, e.g. Adobe-Japan1. Used to name the unicode map "
            "file `to-unicode-<regname>.json.gz`."
        ),
    )
    parser.add_argument("cid2code", type=Path, nargs="*", help="Input cmaps.")
    return parser


def main(argv: List[Any]) -> None:
    """Convert cid2code tables into gzip-compressed JSON cmap files.

    Args:
        argv: Full argument vector including the program name at index 0
            (i.e. ``sys.argv``); everything after it is parsed with
            :func:`create_parser`.

    For every encoding reported by the converter a file ``<enc>.json.gz`` is
    written to the output directory, followed by a single
    ``to-unicode-<regname>.json.gz`` holding the unicode map.

    Returns:
        None, so ``sys.exit(main(sys.argv))`` exits with status 0 on success.
    """
    parsed_args = create_parser().parse_args(argv[1:])

    encoding_codec: List[str] = parsed_args.encoding_codec
    outdir: Path = parsed_args.output_dir
    regname: str = parsed_args.regname
    cid2codes: List[Path] = parsed_args.cid2code

    # Split on the first "=" only: a value without "=" maps the encoding to an
    # empty codec name and additional "=" characters stay in the codec part,
    # instead of raising while building the mapping.
    enc2codec = {}
    for enc_codec in encoding_codec:
        enc, _, codec = enc_codec.partition("=")
        enc2codec[enc] = codec
    converter = CMapConverter(enc2codec)

    # Input files are only read; no directories are created for them.
    for path in cid2codes:
        print(f"reading: {path}...")
        with path.open() as fp:
            converter.load(fp)

    # parents=True lets a nested output directory be created in one go.
    outdir.mkdir(parents=True, exist_ok=True)
    for enc in converter.get_encs():
        path = outdir / f"{enc}.json.gz"
        print(f"writing: {path}...")
        # Text mode: json.dump writes str, gzip handles the compression.
        with gzip.open(path, "wt") as fp:
            converter.dump_cmap(fp, enc)

    path = outdir / f"to-unicode-{regname}.json.gz"
    print(f"writing: {path}...")
    with gzip.open(path, "wt") as fp:
        converter.dump_unicodemap(fp)


if __name__ == "__main__":
sys.exit(main(sys.argv)) # type: ignore[no-untyped-call]
sys.exit(main(sys.argv))
Loading