From abb2809aa6eedaf081a0bb9aca2ff84e135e564d Mon Sep 17 00:00:00 2001 From: Matt Heffron Date: Thu, 18 Jun 2026 11:31:44 -0700 Subject: [PATCH 1/2] Initial attempt at URL checking. Too many "false positives": URLs reported as broken when they are not. --- scripts/bibSplit.pl | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/bibSplit.pl b/scripts/bibSplit.pl index 8a9b77be..f5b433d4 100755 --- a/scripts/bibSplit.pl +++ b/scripts/bibSplit.pl @@ -2,6 +2,7 @@ use JSON::PP qw(decode_json encode_json); use Encode qw(decode encode is_utf8); use Unicode::Normalize qw(NFC); +# use LWP::Simple; use utf8; BEGIN { @@ -63,6 +64,16 @@ sub sanitize_text { my $itemTitle = sanitize_text($obj->{title} // ''); my $title = $itemTitle eq '' ? "title: ''" : "title: |\n $itemTitle\n"; + my $urlSource = defined $obj->{url} ? $obj->{url} : ''; + + # test if the URL is accessible + # head() returns a true value (list of headers) if the page exists + if ($urlSource ne '') { + if (system(qq{curl --output /dev/null --silent --head --fail --location "$urlSource"}) != 0) { + print STDERR qq{URL: $urlSource is broken or unreachable on "$itemTitle".\n}; + } + } + # Abstracts can be multi-line and contain multiple paragraphs. Place YAML keyword on # one line and follow it with the abstract indented on subsequent lines. my $abs = sanitize_text($obj->{abstract} // ''); @@ -101,8 +112,6 @@ sub sanitize_text { $itemEditors =~ s/\n$//u; # strip trailing newline } - my $urlSource = defined $obj->{url} ? $obj->{url} : ''; - # Modified date my $dateModified = defined $obj->{dateModified} ?
$obj->{dateModified} : ''; @@ -165,7 +174,7 @@ sub sanitize_text { } elsif ($type eq 'entry-encyclopedia') { $extraFields = "encyclopedia_title: $encyclopediaTitle\n"; } else { - print STDERR "Warning: unhandled type \"$type\" for key \"$key\"\n"; + # print STDERR "Warning: unhandled type \"$type\" for key \"$key\"\n"; } # Todo: Remove writing the json file once we're happy with the markdown files From ff0fcd4de3a71e724ae9273a55e52849efffc2d7 Mon Sep 17 00:00:00 2001 From: Matt Heffron Date: Sat, 20 Jun 2026 12:09:32 -0700 Subject: [PATCH 2/2] Added output of curl errors and headers from bibSplit.pl, for determining why they're getting errors from curl, but (most) not when entered in a browser. Also invoke curl via list-form system() so URLs and keys never pass through a shell. --- scripts/bibSplit.pl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/bibSplit.pl b/scripts/bibSplit.pl index f5b433d4..99acec48 100755 --- a/scripts/bibSplit.pl +++ b/scripts/bibSplit.pl @@ -2,7 +2,6 @@ use JSON::PP qw(decode_json encode_json); use Encode qw(decode encode is_utf8); use Unicode::Normalize qw(NFC); -# use LWP::Simple; use utf8; BEGIN { @@ -66,11 +65,15 @@ sub sanitize_text { my $urlSource = defined $obj->{url} ? $obj->{url} : ''; + print STDERR qq{$key --> "$urlSource" on "$itemTitle".\n}; # test if the URL is accessible - # head() returns a true value (list of headers) if the page exists if ($urlSource ne '') { - if (system(qq{curl --output /dev/null --silent --head --fail --location "$urlSource"}) != 0) { - print STDERR qq{URL: $urlSource is broken or unreachable on "$itemTitle".\n}; + if (system('curl', '--output', '/dev/null', '--silent', '--show-error', '--head', '--fail', '--dump-header', "$key.hdr", '--location', '--referer', 'https://interlisp.org/history/bibliography;auto', '--url', $urlSource) != 0) { # list form: no shell is involved, so metacharacters in the URL or key are passed to curl literally + print STDERR qq{URL is broken or unreachable.\n}; + } + else + { + unlink("$key.hdr"); } }