git: 6d884b207aab - main - textproc/py-sentencepiece: New port: Unsupervised text tokenizer for Neural Network-based text generation
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Wed, 25 Jan 2023 07:46:31 UTC
The branch main has been updated by yuri:

URL: https://cgit.FreeBSD.org/ports/commit/?id=6d884b207aab2373494bbd713278a80474a58601

commit 6d884b207aab2373494bbd713278a80474a58601
Author:     Yuri Victorovich <yuri@FreeBSD.org>
AuthorDate: 2023-01-25 07:45:57 +0000
Commit:     Yuri Victorovich <yuri@FreeBSD.org>
CommitDate: 2023-01-25 07:45:57 +0000

    textproc/py-sentencepiece: New port: Unsupervised text tokenizer for Neural Network-based text generation
---
 textproc/Makefile                   |  1 +
 textproc/py-sentencepiece/Makefile  | 26 ++++++++++++++++++++++++++
 textproc/py-sentencepiece/distinfo  |  3 +++
 textproc/py-sentencepiece/pkg-descr |  7 +++++++
 4 files changed, 37 insertions(+)

diff --git a/textproc/Makefile b/textproc/Makefile
index e2d0e0ea9521..3d52828e2e12 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -1496,6 +1496,7 @@
     SUBDIR += py-rst2html5
     SUBDIR += py-rstfmt
     SUBDIR += py-scour
+    SUBDIR += py-sentencepiece
     SUBDIR += py-simplebayes
     SUBDIR += py-smartypants
     SUBDIR += py-snowballstemmer
diff --git a/textproc/py-sentencepiece/Makefile b/textproc/py-sentencepiece/Makefile
new file mode 100644
index 000000000000..fe1b9cfd4ba7
--- /dev/null
+++ b/textproc/py-sentencepiece/Makefile
@@ -0,0 +1,26 @@
+PORTNAME= sentencepiece
+DISTVERSIONPREFIX= v
+DISTVERSION= 0.1.97
+CATEGORIES= textproc # machine-learning
+PKGNAMEPREFIX= ${PYTHON_PKGNAMEPREFIX}
+
+MAINTAINER= yuri@FreeBSD.org
+COMMENT= Unsupervised text tokenizer for Neural Network-based text generation
+WWW= https://github.com/google/sentencepiece
+
+LICENSE= APACHE20
+LICENSE_FILE= ${WRKSRC}/../LICENSE
+
+LIB_DEPENDS= libsentencepiece.so:textproc/sentencepiece
+
+USES= compiler:c++17-lang pkgconfig python
+USE_PYTHON= distutils autoplist pytest
+
+USE_GITHUB= yes
+GH_ACCOUNT= google
+
+WRKSRC_SUBDIR= python
+
+TEST_ENV= ${MAKE_ENV} PYTHONPATH=${STAGEDIR}${PYTHONPREFIX_SITELIBDIR}
+
+.include <bsd.port.mk>
diff --git a/textproc/py-sentencepiece/distinfo b/textproc/py-sentencepiece/distinfo
new file mode 100644
index 000000000000..c29dc9430710
--- /dev/null
+++ b/textproc/py-sentencepiece/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1673860778
+SHA256 (google-sentencepiece-v0.1.97_GH0.tar.gz) = 41c3a07f315e3ac87605460c8bb8d739955bc8e7f478caec4017ef9b7d78669b
+SIZE (google-sentencepiece-v0.1.97_GH0.tar.gz) = 11945436
diff --git a/textproc/py-sentencepiece/pkg-descr b/textproc/py-sentencepiece/pkg-descr
new file mode 100644
index 000000000000..62b7de5f4ece
--- /dev/null
+++ b/textproc/py-sentencepiece/pkg-descr
@@ -0,0 +1,7 @@
+SentencePiece is an unsupervised text tokenizer and detokenizer mainly for
+Neural Network-based text generation systems where the vocabulary size is
+predetermined prior to the neural model training. SentencePiece implements
+subword units (e.g., byte-pair-encoding (BPE)) and unigram language model
+with the extension of direct training from raw sentences. SentencePiece
+allows us to make a purely end-to-end system that does not depend on
+language-specific pre/postprocessing.