Archive links from Personal Wiki
task: Useful tools
Relevant for
Pull all links
PoC: Using org-rw
Working, but giving fewer results than expected 🤔
#!/usr/bin/env python3
import sys
import os
import logging

from org_rw import load as load_org


def load_all(top_dir_relative):
    """Recursively load every .org file found under the given directory."""
    top = os.path.abspath(top_dir_relative)
    docs = []
    for root, dirs, files in os.walk(top):
        for name in files:
            if ".org" not in name:
                continue
            path = os.path.join(root, name)
            try:
                with open(path) as f:
                    doc = load_org(f, extra_cautious=True)
                docs.append(doc)
            except Exception:
                import traceback
                traceback.print_exc()
                print(f"== On {path}")
                sys.exit(1)
    logging.info("Collected {} files".format(len(docs)))
    return docs


def main(src_top):
    # Print every http(s) link found in the collected documents
    orgs = load_all(src_top)
    for org in orgs:
        for link in org.get_links():
            if link.value.startswith("http"):
                print("{}".format(link.value))


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: {} SOURCE_TOP".format(sys.argv[0]))
        exit(0)

    logging.basicConfig(level=logging.INFO, format="%(levelname)-8s %(message)s")
    exit(main(sys.argv[1]))
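For reference, a typical invocation might look like the line below, assuming the script is saved as pull_links.py (a hypothetical name) and the wiki lives under ~/.logs/brain, the same directory used by the grep variant further down:

./pull_links.py ~/.logs/brain | sort | uniq > wiki-links.txt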
Using regex
Even fewer results…
grep -PRIoh 'https?://[^\]\\ ]+ ' ~/.logs/brain | sort | uniq
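One possible reason for the lower count: the pattern ends with a literal space, so only URLs followed by a space are matched and links at the end of a line are skipped (the trailing space in each match also hurts deduplication by uniq). A variant without that requirement, an untested sketch, could be:

grep -PRIoh 'https?://[^\]\\\s]+' ~/.logs/brain | sort | uniq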