scripts : improve get-pg.sh (#4838)

author Georgi Gerganov <redacted>

Tue, 9 Jan 2024 17:20:45 +0000 (19:20 +0200)

committer Georgi Gerganov <redacted>

Tue, 9 Jan 2024 17:21:13 +0000 (19:21 +0200)
author Georgi Gerganov <redacted>
Tue, 9 Jan 2024 17:20:45 +0000 (19:20 +0200)
committer Georgi Gerganov <redacted>
Tue, 9 Jan 2024 17:21:13 +0000 (19:21 +0200)
diff --git a/scripts/get-pg.sh b/scripts/get-pg.sh

index d516db46cf01fef3d497ddf61e441eabac3de634..b027793e19f7a77aaeb6eff3014add4004cc2f27 100755 (executable)
--- a/scripts/get-pg.sh
+++ b/scripts/get-pg.sh
@@ -2,6 +2,22 @@
  
  function usage {
      echo "usage: <n>$0"
+    echo "note: n is the number of essays to download"
+    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
+    echo "n   | tokens"
+    echo "--- | ---"
+    echo "1   | 6230"
+    echo "2   | 23619"
+    echo "5   | 25859"
+    echo "10  | 36888"
+    echo "15  | 50188"
+    echo "20  | 59094"
+    echo "25  | 88764"
+    echo "30  | 103121"
+    echo "32  | 108338"
+    echo "35  | 113403"
+    echo "40  | 127699"
+    echo "45  | 135896"
      exit 1
  }
  
@@ -33,10 +49,17 @@ if [ -f pg.txt ]; then
      rm pg.txt
  fi
  
+c=1
  for url in $urls; do
      echo "processing $url"
  
-    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt
+    cc=$(printf "%03d" $c)
+
+    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
+    cat pg-$cc-one.txt >> pg.txt
+
+    cp -v pg.txt pg-$cc-all.txt
+    c=$((c+1))
  
      # don't flood the server
      sleep 1
author	Georgi Gerganov <redacted>
	Tue, 9 Jan 2024 17:20:45 +0000 (19:20 +0200)
committer	Georgi Gerganov <redacted>
	Tue, 9 Jan 2024 17:21:13 +0000 (19:21 +0200)