git: 922291e01926 - main - textproc/sentencepiece: New port: Unsupervised text tokenizer for Neural Network-based text generation
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Mon, 16 Jan 2023 09:41:03 UTC
The branch main has been updated by yuri:

URL: https://cgit.FreeBSD.org/ports/commit/?id=922291e019260419b7bf80e0db65caf4563c2174

commit 922291e019260419b7bf80e0db65caf4563c2174
Author:     Yuri Victorovich <yuri@FreeBSD.org>
AuthorDate: 2023-01-16 09:36:02 +0000
Commit:     Yuri Victorovich <yuri@FreeBSD.org>
CommitDate: 2023-01-16 09:41:00 +0000

    textproc/sentencepiece: New port: Unsupervised text tokenizer for Neural Network-based text generation
---
 textproc/Makefile                |  1 +
 textproc/sentencepiece/Makefile  | 21 +++++++++++++++++++++
 textproc/sentencepiece/distinfo  |  3 +++
 textproc/sentencepiece/pkg-descr |  7 +++++++
 textproc/sentencepiece/pkg-plist | 16 ++++++++++++++++
 5 files changed, 48 insertions(+)

diff --git a/textproc/Makefile b/textproc/Makefile
index a85511af2b50..e2d0e0ea9521 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -1888,6 +1888,7 @@
     SUBDIR += sdocbook-xml
     SUBDIR += sdom
     SUBDIR += senna
+    SUBDIR += sentencepiece
     SUBDIR += sgmlformat
     SUBDIR += sgmls
     SUBDIR += sgrep
diff --git a/textproc/sentencepiece/Makefile b/textproc/sentencepiece/Makefile
new file mode 100644
index 000000000000..84e7ac9ca43e
--- /dev/null
+++ b/textproc/sentencepiece/Makefile
@@ -0,0 +1,21 @@
+PORTNAME=	sentencepiece
+DISTVERSIONPREFIX=	v
+DISTVERSION=	0.1.97
+CATEGORIES=	textproc # machine-learning
+
+MAINTAINER=	yuri@FreeBSD.org
+COMMENT=	Unsupervised text tokenizer for Neural Network-based text generation
+WWW=	https://github.com/google/sentencepiece
+
+LICENSE=	APACHE20
+LICENSE_FILE=	${WRKSRC}/LICENSE
+
+USES=	cmake:testing compiler:c++17-lang
+USE_LDCONFIG=	yes
+
+USE_GITHUB=	yes
+GH_ACCOUNT=	google
+
+CMAKE_TESTING_ON=	SPM_BUILD_TEST
+
+.include <bsd.port.mk>
diff --git a/textproc/sentencepiece/distinfo b/textproc/sentencepiece/distinfo
new file mode 100644
index 000000000000..c29dc9430710
--- /dev/null
+++ b/textproc/sentencepiece/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1673860778
+SHA256 (google-sentencepiece-v0.1.97_GH0.tar.gz) = 41c3a07f315e3ac87605460c8bb8d739955bc8e7f478caec4017ef9b7d78669b
+SIZE (google-sentencepiece-v0.1.97_GH0.tar.gz) = 11945436
diff --git a/textproc/sentencepiece/pkg-descr b/textproc/sentencepiece/pkg-descr
new file mode 100644
index 000000000000..62b7de5f4ece
--- /dev/null
+++ b/textproc/sentencepiece/pkg-descr
@@ -0,0 +1,7 @@
+SentencePiece is an unsupervised text tokenizer and detokenizer mainly for
+Neural Network-based text generation systems where the vocabulary size is
+predetermined prior to the neural model training. SentencePiece implements
+subword units (e.g., byte-pair-encoding (BPE)) and unigram language model
+with the extension of direct training from raw sentences. SentencePiece
+allows us to make a purely end-to-end system that does not depend on
+language-specific pre/postprocessing.
diff --git a/textproc/sentencepiece/pkg-plist b/textproc/sentencepiece/pkg-plist
new file mode 100644
index 000000000000..7640dc4d9c23
--- /dev/null
+++ b/textproc/sentencepiece/pkg-plist
@@ -0,0 +1,16 @@
+bin/spm_decode
+bin/spm_encode
+bin/spm_export_vocab
+bin/spm_normalize
+bin/spm_train
+include/sentencepiece_processor.h
+include/sentencepiece_trainer.h
+lib/libsentencepiece.a
+lib/libsentencepiece.so
+lib/libsentencepiece.so.0
+lib/libsentencepiece.so.0.0.0
+lib/libsentencepiece_train.a
+lib/libsentencepiece_train.so
+lib/libsentencepiece_train.so.0
+lib/libsentencepiece_train.so.0.0.0
+libdata/pkgconfig/sentencepiece.pc