#!/bin/sh
#
# Collects Firefox adblock lists from http://easylist.adblockplus.org/ and
# converts them into an Epiphany adblock format
#
# Author: JM. Philippe <jean-michel.philippe@doudoulinux.org>
#################################
# This file is part of DoudouLinux.
#
# DoudouLinux is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# DoudouLinux is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with DoudouLinux.  If not, see <http://www.gnu.org/licenses/>
#################################

# constants
# Web page that is downloaded and scanned for "abp:subscribe" list links.
PAGELIST='https://adblockplus.org/en/subscriptions'
# Default directory receiving the converted lists (one file per langcode).
OUTPUTDIR='/var/lib/epiphany-adblock'
# Name of the index file written into the output directory.
README='README'
# Dictionary of "description=langcode" lines, looked up relative to the
# script directory.
DICTFILE='langdict.txt'
# Filter program the downloaded lists are piped through.
CONVERTER='./firefox2epiphany.pl'
# Scratch file holding the list currently being processed.
TMPFILE='adblock.temp.list'

# guess work dir and cd into
# SCRIPTDIR is $0 with its last path component removed, so it keeps a
# trailing slash (e.g. "/usr/bin/"). When $0 carries no directory part at
# all (e.g. "sh get-adblock-lists") the result is empty; that case, like
# "./", means the script lives in the current directory.
SCRIPTDIR=${0%"${0##*/}"}
if [ -z "$SCRIPTDIR" ] || [ "$SCRIPTDIR" = './' ]; then
	SCRIPTDIR=$(pwd)
fi
# The dictionary, converter and temp file are given as relative paths, so
# carrying on from the wrong directory would be pointless.
if ! cd -- "$SCRIPTDIR"; then
	echo "ERROR: cannot change directory to '$SCRIPTDIR'" >&2
	exit 1
fi

# check input args
#   no argument  : keep the default $OUTPUTDIR; the script then insists on
#                  being run as root and aborts otherwise
#   one argument : use it as output directory (a relative path is resolved
#                  from the script directory, which is the current directory
#                  at this point)
#   anything else: print the usage text and quit
case $# in
	0)
		if [ "$(id -un)" != "root" ]; then
			echo "ERROR: you must be root to write to '$OUTPUTDIR'" >&2
			# only bail out when the check fails; root carries on with
			# the default output directory
			exit 1
		fi
		;;
	1)
		OUTPUTDIR=$1
		;;
	*)
		cat <<EOF
Fetches Firefox Adblock lists from the Internet and converts
them into Epiphany Adblock compatible lists.

Usage:

  $ get-adblock-lists [OUTPUTDIR]
EOF
		exit
		;;
esac

# initialisations
# An empty OUTPUTDIR would turn the purge below into a removal of "/*",
# so refuse to go any further in that case.
if [ -z "$OUTPUTDIR" ]; then
	echo "ERROR: no output directory given" >&2
	exit 1
fi
if ! [ -d "$OUTPUTDIR" ]; then
	# -p creates missing parent directories as well
	if ! mkdir -p -- "$OUTPUTDIR"; then
		echo "ERROR: cannot create '$OUTPUTDIR'" >&2
		exit 1
	fi
fi
if ! [ -f "$DICTFILE" ]; then
	touch -- "$DICTFILE"
fi
# from here on README is the full path of the index file
README=$OUTPUTDIR/$README

# empty output dir
# rm -f stays quiet when the glob matches nothing, so there is no need to
# inspect the directory content first.
rm -f -- "$OUTPUTDIR"/*
cat > "$README" << EOF
# adblock lists grabbed from $PAGELIST and converted
# into Epiphany format by Andrei Kouznetsov's script
# see http://kouznetsov.awardspace.com/epiphany/adblock.xml
#
# automatically generated for DoudouLinux
# by epiphany-adblock-lists
#
# LANGCODE	LANGUAGE	URL	LICENSE

EOF

# get the page list
#
# The subscription page is read line by line. A line free of markup is kept
# as the description of the next list; a line holding an "abp:subscribe"
# link provides the list URL. Once both are known, the description is looked
# up in $DICTFILE ("description=langcode" lines) and the list is downloaded,
# converted and stored as $OUTPUTDIR/<langcode>.

# extract_list_uri LINE
# Prints the value of the first "location=" parameter found in LINE with its
# %2F and %3A escapes decoded; prints nothing when LINE has no such parameter.
extract_list_uri() {
	printf '%s\n' "$1" |
		grep -oE 'location=[^&]*' |
		head -n 1 |
		sed -e 's|^location=||' -e 's|%2F|/|g' -e 's|%3A|:|g'
}

# split_langcodes CODES
# Prints the comma separated langcodes of CODES as space separated words.
split_langcodes() {
	printf '%s\n' "$1" | tr ',' ' '
}

# literal tab, the field separator of the README index
TAB=$(printf '\t')

DESCRIPTION=
FILEURI=
echo "> retrieving $PAGELIST"
wget --no-check-certificate --quiet -O - "$PAGELIST" | while read -r LINE; do
	# search for the language/description in block
	if printf '%s\n' "$LINE" | grep -E '^[^<>]*$' >/dev/null 2>&1; then
		DESCRIPTION=${LINE## }
	fi

	# search for list file URI
	case $LINE in
		*'href="abp:subscribe'*)
			FILEURI=$(extract_list_uri "$LINE")
			;;
	esac

	# collect data
	if [ -n "$DESCRIPTION" ] && [ -n "$FILEURI" ]; then
		if ! grep "^$DESCRIPTION=" "$DICTFILE" >/dev/null 2>&1; then
			# dictionary file of language description and langcode
			# doesn't contain the desired langcode: add an empty entry
			# to be filled in by hand
			printf '%s=\n' "$DESCRIPTION" >> "$DICTFILE"
			echo "WARNING: no langcode for '$DESCRIPTION'"
		else
			# get langcode and output data
			LANGCODE=$(grep "^$DESCRIPTION=" "$DICTFILE" | grep -oE '[^=]*$')
			if [ -n "$LANGCODE" ]; then
				OUTPUTFILE=$OUTPUTDIR/$LANGCODE

				# generate URL block list, unless a non-empty file for
				# this langcode is already there
				if [ -f "$OUTPUTFILE" ] && [ -s "$OUTPUTFILE" ]; then
					echo "  File exists, not overwriting."
				else
					# fetch and process file
					echo "> ($LANGCODE) retrieving $FILEURI"
					wget --quiet --no-check-certificate -O - "$FILEURI" > "$TMPFILE"
					# always start from an existing, empty output file so
					# the checks below work even if no rule is written
					: > "$OUTPUTFILE"
					ls -l "$TMPFILE"
					# read -r keeps the backslashes of the rules intact
					"$CONVERTER" < "$TMPFILE" | while read -r REGEXP; do
						# test regexp syntax: sed rejects the "s" command
						# (delimiter ') if the expression is malformed
						if echo 'test' | sed "s'$REGEXP''" >/dev/null; then
							# append to file; printf writes the rule
							# verbatim, without splitting or globbing
							printf '%s\n' "$REGEXP" >> "$OUTPUTFILE"
						else
							# show erroneous regexp
							printf '%s\n' "$REGEXP"
						fi
					done
					ls -l "$OUTPUTFILE"

					# get license
					LICENSE=$(grep -i '^!.*licenses' "$TMPFILE" | sed 's/^![ ]*//' | head -n 1)
					if [ -n "$LICENSE" ]; then
						printf '     %s\n' "$LICENSE"
					fi

					# test if empty result
					if ! [ -s "$OUTPUTFILE" ]; then
						echo "ERROR: output file is empty, removing it"
						rm -f -- "$OUTPUTFILE"
					fi

					# record URI into README file, once per langcode
					if ! grep -qs "^${LANGCODE}${TAB}" "$README"; then
						# file may have been removed if empty
						if [ -f "$OUTPUTFILE" ]; then
							printf '%s\t%s\t%s\t%s\n' \
								"$LANGCODE" "$DESCRIPTION" "$FILEURI" "$LICENSE" >> "$README"
						fi
					fi
				fi

				# case of lists common to several languages: give each
				# language its own copy and drop the combined file
				case $LANGCODE in
					*,*)
						# nothing to copy if the list ended up empty
						if [ -f "$OUTPUTFILE" ]; then
							# unquoted on purpose: one word per langcode
							for SUBCODE in $(split_langcodes "$LANGCODE"); do
								cp -- "$OUTPUTFILE" "$OUTPUTDIR/$SUBCODE"
							done
							rm -f -- "$OUTPUTFILE"
						fi
						;;
				esac
			else
				echo "WARNING: empty language code for '$DESCRIPTION'"
			fi
		fi

		# reset variables
		DESCRIPTION=
		FILEURI=
	fi
done

# remove the scratch file left behind by the last download, if any
# (the -n guard keeps an unset or empty TMPFILE from reaching rm)
if [ -n "$TMPFILE" ] && [ -f "$TMPFILE" ]; then
	rm -f -- "$TMPFILE"
fi
