ports/152827: [new port] www/googlebook_dl
Alex Kozlov
spam at rm-rf.kiev.ua
Sat Dec 4 09:30:15 UTC 2010
>Number: 152827
>Category: ports
>Synopsis: [new port] www/googlebook_dl
>Confidential: no
>Severity: non-critical
>Priority: low
>Responsible: freebsd-ports-bugs
>State: open
>Quarter:
>Keywords:
>Date-Required:
>Class: change-request
>Submitter-Id: current-users
>Arrival-Date: Sat Dec 04 09:30:14 UTC 2010
>Closed-Date:
>Last-Modified:
>Originator: Alex Kozlov
>Release: RELENG_8
>Organization:
private
>Environment:
>Description:
Add new port: a command-line utility for downloading books from Google Books.
>How-To-Repeat:
>Fix:
Patch attached with submission follows:
# This is a shell archive. Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file". Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
# googlebook_dl
# googlebook_dl/files
# googlebook_dl/files/googlebook_dl.sh
# googlebook_dl/Makefile
# googlebook_dl/pkg-descr
#
# Create the port's directory tree, announcing each directory shar-style.
# mkdir output is discarded so that re-running the archive stays quiet.
for shar_dir in googlebook_dl googlebook_dl/files; do
  echo "c - ${shar_dir}"
  mkdir -p "${shar_dir}" > /dev/null 2>&1
done
echo x - googlebook_dl/files/googlebook_dl.sh
sed 's/^X//' >googlebook_dl/files/googlebook_dl.sh << '5132a609fdfd6cad1b563a9cf871f42d'
X#!/bin/sh
X
Xparse_options()
X{
X local OPT OPTARG OPTIND
X
X while getopts ap:P:vw: OPT; do
X # escape meta
X OPTARG=${OPTARG%%[;\\\$]*}
X
X case ${OPT} in
X a) all=yes ;;
X p) proxylist="${OPTARG}" ;;
X P) pageprefix="${OPTARG}" ;;
X v) verbose=yes ;;
X w) pagewidth="${OPTARG}" ;;
X *) usage ;;
X esac
X done
X
X OPTC=$((${OPTIND} - 1))
X}
X
X# Print the synopsis and the option summary, then exit with status 1.
X# In the option table an asterisk marks the value used when the option
X# is not given.
Xusage()
X{
X    # the synopsis lists every accepted flag, including -v
X    echo "usage: ${0##*/} [-ahPpvw] totpages bookid"
X    echo ' -h display this help'
X    echo ' -a all mode (try to get sigs from all pages, including already downloaded)'
X    echo ' -P pageprefix (*PA, PP, PR, PT)'
X    echo ' -p proxylist'
X    echo ' -v verbose'
X    echo ' -w pagewidth (800, *1024, 1280, 1440, 1680, ...)'
X    echo
X    exit 1
X}
X
Xget_pages()
X{
X local ua page url _return
X
X # with wrong ua we will get 401 Unauthorized
X # ua='Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Firefox/3.0'
X ua='Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)'
X
X # get cookie
X wget -T5 -t2 -q -U"${ua}" --keep-session-cookies \
X --save-cookies "${DIR}/cookies.txt" -O/dev/null \
X "http://books.google.com/books?id=${bookid}&pg=PA1&jscmd=click3"
X
X # bail if wget returned non zero exit code or cookies.txt is empty
X _return=$?
X cookie="$(grep '^.google.com' "${DIR}/cookies.txt" 2>/dev/null | \
X sed 's/^.*\(ID.*\)$/\1/')"
X [ ${_return} -ne 0 -o -z "${cookie}" ] && \
X { rm "${DIR}/cookies.txt"; return 1; }
X
X # show cookie
X [ -n "${verbose}" ] && echo "cookie: ${cookie}"
X
X # if downloaded less that half of total pages, use all mode
X [ $(ls "${bookid}/" | wc -l) -le $((${totpages} / 2)) ] && all=yes
X
X # pull sigs only from missing pages unless in all mode
X page=1
X while [ ${page} -le ${totpages} ]; do
X [ -f "${bookid}/${pageprefix}${page}" -a -z "${all}" ] || \
X echo "http://books.google.com/books?id=${bookid}&pg=${pageprefix}${page}&jscmd=click3" \
X >> "${DIR}/urls"
X page=$(( ${page} + 1))
X done
X
X # get all sigs at once
X # NB! sigs tied to cookie and ip
X wget -T5 -t2 -q -U"${ua}" --no-cache --load-cookies "${DIR}/cookies.txt" \
X -O- -i "${DIR}/urls" | tr '}' '\n' | grep "{\"pid\":\"P.*\",\"src\":" | \
X sed 's/^.*"src":"\(http:\/\/[^"]*\)".*$/\1/;s/\\u0026/\&/g' | sort -u | \
X while read -r url; do
X page=$(echo "${url}" | sed 's/^.*&pg=\([^&]*\)&.*$/\1/')
X
X [ -n "${verbose}" ] && verbose="${page}: ${url}&w=${pagewidth}"
X
X # skip already downloaded pages
X [ -f "${bookid}/${page}" ] || \
X {
X wget -T5 -t3 -q -U"${ua}" --no-cache \
X --load-cookies "${DIR}/cookies.txt" \
X -O"${bookid}/${page}" "${url}&w=${pagewidth}"
X
X _return=$?
X if [ ${_return} -ne 0 ]; then
X # sometimes google books returns 404
X rm "${bookid}/${page}"
X [ -n "${verbose}" ] && verbose="${verbose} ERROR"
X else
X if [ -n "${verbose}" ]; then
X verbose="${verbose} DOWNLOADED"
X else
X echo -n "${page} "
X fi
X fi
X }
X
X [ -n "${verbose}" ] && echo "${verbose}"
X done
X # clean temp files
X rm "${DIR}/cookies.txt" "${DIR}/urls"
X
X echo
X}
X
X#
X# MAIN
X#
X
X# default page width
Xpagewidth=1024
X
X# PA - books pages ${totpages}
X# PR - preface, contents ~30
X# PP,PT - front, back title ~10
X# default page prefix
Xpageprefix=PA
X
Xparse_options ${1+"$@"}
Xshift ${OPTC}
X
X[ -z $1 ] && usage
Xtotpages=$1
X
X[ -z $2 ] && usage
Xbookid=$2
X
X# if bookid dir already exists, continue from previous try
X[ -d "${bookid}" ] || \
X{
X mkdir "${bookid}" || { echo "cannot create dir ${bookid}"; exit 2; }
X}
X
XDIR=`mktemp -d googlebook_dl.XXXXXXXXXX` || exit 2
Xtrap "rm -rf ${DIR}; exit 1" 1 2 3 10 13 15
X
Xif [ -z "${proxylist}" ]; then
X get_pages
Xelse
X for http_proxy in `cat "${proxylist}"`; do
X echo "using proxy ${http_proxy}"
X get_pages
X done
Xfi
X
Xrmdir "${DIR}"
5132a609fdfd6cad1b563a9cf871f42d
echo x - googlebook_dl/Makefile
sed 's/^X//' >googlebook_dl/Makefile << 'd16a7128e88c8f65bbb5b5b15758848c'
X# New ports collection makefile for: googlebook_dl
X# Date created: 2 Dec 2010
X# Whom: spam at rm-rf.kiev.ua
X#
X# $FreeBSD$
X#
X
XPORTNAME= googlebook_dl
XPORTVERSION= 20100502
XCATEGORIES= www
X# nothing to fetch: the script is installed from the port's own files/ directory
XMASTER_SITES= # none
XDISTFILES= # none
X
XMAINTAINER= spam at rm-rf.kiev.ua
XCOMMENT= A command-line utility for downloading books from Google Books
X
X# the installed script runs wget for every HTTP request
XRUN_DEPENDS= wget:${PORTSDIR}/ftp/wget
X
X# no build step: the port consists of a single shell script
XNO_BUILD= yes
X
XPLIST_FILES= bin/googlebook_dl
X
X# install files/googlebook_dl.sh as bin/googlebook_dl (the .sh suffix is dropped)
Xdo-install:
X ${INSTALL_SCRIPT} ${FILESDIR}/${PORTNAME}.sh ${PREFIX}/bin/${PORTNAME}
X
X.include <bsd.port.mk>
d16a7128e88c8f65bbb5b5b15758848c
echo x - googlebook_dl/pkg-descr
sed 's/^X//' >googlebook_dl/pkg-descr << 'f24b13a9ac0f95fe41a57cfa2244003a'
XA command-line utility for downloading books from Google Books
f24b13a9ac0f95fe41a57cfa2244003a
exit
>Release-Note:
>Audit-Trail:
>Unformatted:
More information about the freebsd-ports-bugs
mailing list