# Print the command-line synopsis followed by a reference table, then abort.
#
# The table lists, for selected values of n (the number of essays to
# download), the number of tokens the resulting pg.txt file will have.
#
# Arguments: none
# Outputs:   usage text and the n/tokens table on stdout
# Returns:   never returns; always terminates the shell with exit status 1
function usage {
    # Script name first, then the positional argument placeholder.
    echo "usage: $0 <n>"
    echo "note: n is the number of essays to download"
    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
    # One table row per argument; '%s\n' prints each row verbatim on its own line.
    printf '%s\n' \
        "n | tokens" \
        "--- | ---" \
        "1 | 6230" \
        "2 | 23619" \
        "5 | 25859" \
        "10 | 36888" \
        "15 | 50188" \
        "20 | 59094" \
        "25 | 88764" \
        "30 | 103121" \
        "32 | 108338" \
        "35 | 113403" \
        "40 | 127699" \
        "45 | 135896"
    exit 1
}
rm pg.txt
fi
+c=1
for url in $urls; do
echo "processing $url"
- curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt
+ cc=$(printf "%03d" $c)
+
+ curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
+ cat pg-$cc-one.txt >> pg.txt
+
+ cp -v pg.txt pg-$cc-all.txt
+ c=$((c+1))
# don't flood the server
sleep 1