git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
scripts : improve get-pg.sh (#4838)
author Georgi Gerganov <redacted>
Tue, 9 Jan 2024 17:20:45 +0000 (19:20 +0200)
committer Georgi Gerganov <redacted>
Tue, 9 Jan 2024 17:21:13 +0000 (19:21 +0200)
scripts/get-pg.sh

index d516db46cf01fef3d497ddf61e441eabac3de634..b027793e19f7a77aaeb6eff3014add4004cc2f27 100755 (executable)
@@ -2,6 +2,22 @@
 
 function usage {
     echo "usage: <n>$0"
+    echo "note: n is the number of essays to download"
+    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
+    echo "n   | tokens"
+    echo "--- | ---"
+    echo "1   | 6230"
+    echo "2   | 23619"
+    echo "5   | 25859"
+    echo "10  | 36888"
+    echo "15  | 50188"
+    echo "20  | 59094"
+    echo "25  | 88764"
+    echo "30  | 103121"
+    echo "32  | 108338"
+    echo "35  | 113403"
+    echo "40  | 127699"
+    echo "45  | 135896"
     exit 1
 }
 
@@ -33,10 +49,17 @@ if [ -f pg.txt ]; then
     rm pg.txt
 fi
 
+c=1
 for url in $urls; do
     echo "processing $url"
 
-    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt
+    cc=$(printf "%03d" $c)
+
+    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
+    cat pg-$cc-one.txt >> pg.txt
+
+    cp -v pg.txt pg-$cc-all.txt
+    c=$((c+1))
 
     # don't flood the server
     sleep 1