git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
scripts : script to get Paul Graham essays in txt format (#4838)
author: Georgi Gerganov <redacted>
Tue, 9 Jan 2024 14:23:05 +0000 (16:23 +0200)
committer: GitHub <redacted>
Tue, 9 Jan 2024 14:23:05 +0000 (16:23 +0200)
scripts/get-pg.sh [new file with mode: 0755]

diff --git a/scripts/get-pg.sh b/scripts/get-pg.sh
new file mode 100755 (executable)
index 0000000..d516db4
--- /dev/null
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+function usage {
+    echo "usage: <n>$0"
+    exit 1
+}
+
+function has_cmd {
+    if ! [ -x "$(command -v $1)" ]; then
+        echo "error: $1 is not available" >&2
+        exit 1
+    fi
+}
+
+# verify that every external tool used by the pipelines below is installed
+has_cmd curl
+has_cmd grep
+has_cmd head
+has_cmd html2text
+has_cmd tail
+has_cmd sed
+has_cmd fmt
+
+if [ $# -ne 1 ]; then
+    usage
+fi
+
+n=$1
+
+# get urls
+urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n $n)"
+
+printf "urls:\n%s\n" "$urls"
+
+if [ -f pg.txt ]; then
+    rm pg.txt
+fi
+
+for url in $urls; do
+    echo "processing $url"
+
+    curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt
+
+    # don't flood the server
+    sleep 1
+done
+
+echo "done. data in pg.txt"
+
+exit 0