From abb2809aa6eedaf081a0bb9aca2ff84e135e564d Mon Sep 17 00:00:00 2001
From: Matt Heffron <heffron@alumni.caltech.edu>
Date: Thu, 18 Jun 2026 11:31:44 -0700
Subject: [PATCH 1/2] Initial attempt at URL checking. Too many "false
 positives" reporting URLs as broken when they are not.

---
 scripts/bibSplit.pl | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/scripts/bibSplit.pl b/scripts/bibSplit.pl
index 8a9b77be..f5b433d4 100755
--- a/scripts/bibSplit.pl
+++ b/scripts/bibSplit.pl
@@ -2,6 +2,7 @@
 use JSON::PP qw(decode_json encode_json);
 use Encode qw(decode encode is_utf8);  
 use Unicode::Normalize qw(NFC);
+# use LWP::Simple;
 use utf8;
 BEGIN 
 { 
@@ -63,6 +64,16 @@ sub sanitize_text {
   my $itemTitle = sanitize_text($obj->{title} // '');
   my $title = $itemTitle eq '' ? "title: ''" : "title: |\n  $itemTitle\n";
 
+  my $urlSource = defined $obj->{url} ? $obj->{url} : '';
+
+  # test if the URL is accessible
+  # head() returns a true value (list of headers) if the page exists
+  if ($urlSource ne '') {
+      if (system(qq{curl --output /dev/null --silent --head --fail --location "$urlSource"}) != 0) {
+          print STDERR qq{URL: $urlSource is broken or unreachable on "$itemTitle".\n};
+      }
+  }
+
   # Abstracts can be multi-line and  contain multiple paragraphs.  Place YAML keyword on
   # one line and follow it with the abstract indented on subsequent lines.
   my $abs = sanitize_text($obj->{abstract} // '');
@@ -101,8 +112,6 @@ sub sanitize_text {
     $itemEditors =~ s/\n$//u;  # strip trailing newline
   }
   
-  my $urlSource = defined $obj->{url} ? $obj->{url} : '';
-
   # Modified date
   my $dateModified = defined $obj->{dateModified} ? $obj->{dateModified} : '';
 
@@ -165,7 +174,7 @@ sub sanitize_text {
   } elsif ($type eq 'entry-encyclopedia') {
     $extraFields = "encyclopedia_title: $encyclopediaTitle\n";
   } else {
-    print STDERR "Warning: unhandled type \"$type\" for key \"$key\"\n";
+    # print STDERR "Warning: unhandled type \"$type\" for key \"$key\"\n";
   }
 
   # Todo: Remove writing the json file once we're happy with the markdown files

From ff0fcd4de3a71e724ae9273a55e52849efffc2d7 Mon Sep 17 00:00:00 2001
From: Matt Heffron <heffron@alumni.caltech.edu>
Date: Sat, 20 Jun 2026 12:09:32 -0700
Subject: [PATCH 2/2] Added output of curl errors and headers from bibSplit.pl,
 to determine why URLs get errors from curl but (mostly) not when entered in
 a browser.

---
 scripts/bibSplit.pl | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/scripts/bibSplit.pl b/scripts/bibSplit.pl
index f5b433d4..99acec48 100755
--- a/scripts/bibSplit.pl
+++ b/scripts/bibSplit.pl
@@ -2,7 +2,6 @@
 use JSON::PP qw(decode_json encode_json);
 use Encode qw(decode encode is_utf8);  
 use Unicode::Normalize qw(NFC);
-# use LWP::Simple;
 use utf8;
 BEGIN 
 { 
@@ -66,11 +65,15 @@ sub sanitize_text {
 
   my $urlSource = defined $obj->{url} ? $obj->{url} : '';
 
+  print STDERR qq{$key --> "$urlSource" on "$itemTitle".\n};
   # test if the URL is accessible
-  # head() returns a true value (list of headers) if the page exists
   if ($urlSource ne '') {
-      if (system(qq{curl --output /dev/null --silent --head --fail --location "$urlSource"}) != 0) {
-          print STDERR qq{URL: $urlSource is broken or unreachable on "$itemTitle".\n};
+      if (system(qq{curl --output /dev/null --silent --show-error --head --fail --dump-header "$key.hdr" --location --referer "https://interlisp.org/history/bibliography;auto" "$urlSource"}) != 0) {
+          print STDERR qq{URL is broken or unreachable.\n};
+      }
+      else
+      {
+          unlink("$key.hdr");
       }
   }