From 14b13c8b27c5da1302545281f3a155de6102ea83 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Tue, 21 Apr 2026 01:09:17 +0100 Subject: [PATCH 01/68] fix: Add try/catch on hb_store_arweave --- src/core/store/hb_store_arweave.erl | 56 ++++++++++++++++++++------- src/preloaded/arweave/dev_arweave.erl | 45 ++++++++++++++++++++- 2 files changed, 86 insertions(+), 15 deletions(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 722438bee..5b37003de 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -177,22 +177,36 @@ load_item(ExpectedID, StartOffset, Length, Opts) -> fun() -> case read_chunks(StartOffset, Length, Opts) of {ok, SerializedItem} -> - Item = - ar_bundles:deserialize(SerializedItem), - case hb_util:encode(Item#tx.id) of - ExpectedID -> - {ok, hb_message:convert( - Item, - <<"structured@1.0">>, - <<"ans104@1.0">>, - Opts - )}; - ActualID -> - {error, - {id_mismatch, - ExpectedID, ActualID}} + try + Item = + ar_bundles:deserialize(SerializedItem), + case hb_util:encode(Item#tx.id) of + ExpectedID -> + {ok, hb_message:convert( + Item, + <<"structured@1.0">>, + <<"ans104@1.0">>, + Opts + )}; + ActualID -> + ?event(error, {load_item, {id_mismatch}}), + {error, + {id_mismatch, + ExpectedID, ActualID}} + end + catch _:Reason:Stacktrace -> + %% Due to malformed encoding, attempt to deserialize + %% can throw. + ?event(error, + {load_item, + {expected_id, ExpectedID}, + {reason, Reason}, + {stacktrace, Stacktrace} + }), + {error, Reason} end; {error, Reason} -> + ?event(error, {load_item, Reason}), {error, Reason} end end, @@ -402,3 +416,17 @@ write_read_fake_bundle_tx_test() -> {ok, TX} = read(Opts, #{ <<"read">> => ID }, Opts), ?assert(hb_message:verify(TX, all, #{})), ok. + +%% @doc Interior Arweave offset returns bytes that are not a valid ANS-104 item, +%% so ar_bundles:deserialize/1 throws. 
The catch in load_item/4 must convert +%% that throw into {error, _} rather than crashing. +load_item_deserialize_throws_test() -> + Store = [hb_test_utils:test_store()], + Opts = #{<<"index-store">> => Store}, + FakeID = <<"BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB">>, + %% Same interior offset used in dev_arweave bundle_header_garbage_guard test: + %% the bytes at ProbeOffset are mid-TX application data, not an ANS-104 header. + ProbeOffset = 376836336327208, + Size = 4096, + ok = write_offset(Opts, FakeID, <<"ans104@1.0">>, ProbeOffset - 1, Size), + ?assertMatch({error, _}, read(Opts, FakeID)). diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index 3c967d81c..0cd3e2148 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -184,7 +184,16 @@ head_raw(Base, Request, Opts) -> <<"tx@1.0">> -> fun head_raw_tx/4; _ -> throw({invalid_codec_device, CodecDevice}) end, - CodecFun(TXID, StartOffset, Length, Opts); + try CodecFun(TXID, StartOffset, Length, Opts) + catch _:Reason:Stacktrace -> + %% This can be prone to serialization error. + %% Catch and output as an error. + ?event(store_error, {head_raw, + {txid, TXID}, + {reason, Reason}, + {stacktrace, Stacktrace}}), + {error, Reason} + end; not_found -> ?event( arweave, @@ -1531,6 +1540,40 @@ head_raw_ans104_invalid_tags_test() -> do_head_raw_ans104(<<0:256>>, 0, byte_size(DataItem), DataItem, #{}) ). +%% @doc Interior Arweave offset returns bytes that are not a valid ANS-104 +%% header, so head_raw_ans104/4 throws inside do_head_raw_ans104/5. The +%% try-catch added to head_raw/3 must convert that throw into {error, _}. 
+head_raw_ans104_deserialize_throws_test_parallel() -> + TestStore = hb_test_utils:test_store(hb_store_volatile, <<"head-raw-throws">>), + IndexStore = #{ + <<"module">> => hb_store_arweave, + <<"index-store">> => [TestStore] + }, + Opts = #{ + <<"store">> => [TestStore], + <<"arweave-index-ids">> => true, + <<"arweave-index-store">> => IndexStore + }, + FakeID = <<"CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC">>, + %% Same interior offset as bundle_header_garbage_guard_test_parallel. + ProbeOffset = 376836336327208, + Size = 4096, + ok = hb_store_arweave:write_offset( + IndexStore, FakeID, <<"ans104@1.0">>, ProbeOffset - 1, Size, Opts + ), + ?assertMatch( + {error, _}, + hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9">> }, + #{ + <<"path">> => <<"raw">>, + <<"raw">> => FakeID, + <<"method">> => <<"HEAD">> + }, + Opts + ) + ). + get_raw_range_tx_test_parallel() -> DataItemID = <<"ptBC0UwDmrUTBQX3MqZ1lB57ex20ygwzkjjCrQjIx3o">>, Opts = setup_arweave_index_opts([DataItemID]), From 5ba5aaf3b49c30ef48d65164c446ee0016661d7e Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Fri, 6 Mar 2026 12:00:00 +0100 Subject: [PATCH 02/68] feat: Add L1 TX filtering with owner and tag support --- src/preloaded/query/dev_copycat_arweave.erl | 506 +++++++++++++++++++- 1 file changed, 497 insertions(+), 9 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 329219193..98522d768 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -6,10 +6,15 @@ -module(dev_copycat_arweave). -device_libraries([lib_arweave_common]). -export([arweave/3]). +-export([add_owner_alias/3, resolve_owner_alias/2, set_memory_safe_cap/2, get_memory_safe_cap/1, set_depth_recursion_cap/2, get_depth_recursion_cap/1]). -include_lib("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). -define(ARWEAVE_DEVICE, <<"~arweave@2.9">>). +-define(DEPTH_L1_OFFSETS, 1). 
+-define(DEPTH_RECURSION_CAP, 4). +%% 1GB in bytes +-define(MEMORY_SAFE_CAP, 1024 * 1024 * 1024). % GET /~cron@1.0/once&cron-path=~copycat@1.0/arweave @@ -17,18 +22,259 @@ %% latest known block towards the Genesis block. If no range is provided, we %% fetch blocks from the latest known block towards the Genesis block. arweave(_Base, Request, Opts) -> - case parse_range(Request, Opts) of - {error, unavailable} -> - {error, unavailable}; - {ok, {From, To}} -> - case hb_maps:get(<<"mode">>, Request, <<"write">>, Opts) of - <<"write">> -> fetch_blocks(Request, From, To, Opts); - <<"list">> -> list_index(From, To, Opts); - Mode -> - {error, <<"Unsupported mode `", (hb_util:bin(Mode))/binary, "`. Supported modes are: write, list">>} + case hb_maps:get(<<"mode">>, Request, <<"write">>, Opts) of + <<"write">> -> + case hb_maps:find(<<"id">>, Request, Opts) of + {ok, TXID} -> process_l1_request(TXID, Request, Opts); + error -> + case parse_range(Request, Opts) of + {error, unavailable} -> {error, unavailable}; + {ok, {From, To}} -> fetch_blocks(Request, From, To, Opts) + end + end; + <<"list">> -> + case parse_range(Request, Opts) of + {error, unavailable} -> {error, unavailable}; + {ok, {From, To}} -> list_index(From, To, Opts) + end; + Mode -> + {error, <<"Unsupported mode `", (hb_util:bin(Mode))/binary, "`. Supported modes are: write, list">>} + end. +%% @doc Set safe memory resource allocation cap for the in-memory +%% bundle processing. in bytes. +set_memory_safe_cap(Cap, Opts) when is_integer(Cap), Cap > 0 -> + Opts#{copycat_memory_cap => Cap}. +%% @doc Set bundles descendant recursion cap, avoids recursion +%% in very nested bundles (very rare). +set_depth_recursion_cap(Cap, Opts) when is_integer(Cap), Cap > 0 -> + Opts#{copycat_depth_recursion_cap => Cap}. +%% @doc Get the set depth recursion cap. 
if not set, defaults to ?DEPTH_RECURSION_CAP +get_depth_recursion_cap(Opts) -> + case maps:get(copycat_depth_recursion_cap, Opts, not_found) of + not_found -> ?DEPTH_RECURSION_CAP; + Cap -> Cap + end. +%% @doc Get the L1 TX data size that gets handled in-memory +%% defaults to ?MEMORY_SAFE_CAP if not set. +get_memory_safe_cap(Opts) -> + case maps:get(copycat_memory_cap, Opts, not_found) of + not_found -> ?MEMORY_SAFE_CAP; + Cap -> Cap + end. +%% @doc Normalize an owner address into the native ID form used for comparisons. +normalize_owner_id(Addr) -> + hb_util:native_id(hb_util:bin(Addr)). + +%% @doc Adds an address to the owners aliases cache in Opts, mapping +%% Alias -> native address for fast lookup and once per address computation. +add_owner_alias(Addr, Alias, Opts) when is_binary(Alias) -> + ExistingAliases = maps:get(owner_aliases, Opts, #{}), + Opts#{ owner_aliases => ExistingAliases#{ Alias => normalize_owner_id(Addr) }}; +add_owner_alias(_Addr, Alias, _Opts) -> + throw({invalid_owner_alias, Alias}). + +%% @doc Retrieve the address of a given alias. +resolve_owner_alias(Alias, Opts) when is_binary(Alias) -> + Aliases = maps:get(owner_aliases, Opts, #{}), + case maps:find(Alias, Aliases) of + {ok, Addr} -> {ok, Addr}; + error -> {error, {owner_alias_not_found, Alias}} + end; +resolve_owner_alias(Alias, _Opts) -> + {error, {invalid_owner_alias, Alias}}. +%% @doc Parse include/exclude owner filters from the request. +%% Supports direct owner values and owner aliases. +parse_owner_filter(Request, Opts) -> + case resolve_owner_filter_value( + <<"include-owner">>, + <<"include-owner-alias">>, + Request, + Opts + ) of + {error, _} = Error -> + Error; + {ok, IncludeOwner} -> + case resolve_owner_filter_value( + <<"exclude-owner">>, + <<"exclude-owner-alias">>, + Request, + Opts + ) of + {error, _} = Error -> + Error; + {ok, ExcludeOwner} -> + {ok, #{ + include_owner => IncludeOwner, + exclude_owner => ExcludeOwner + }} end end. 
+%% @doc Resolve one owner filter value from either a direct owner param or +%% a comma-separated owner alias param. Alias takes precedence. +resolve_owner_filter_value(OwnerKey, AliasKey, Request, Opts) -> + case hb_maps:find(AliasKey, Request, Opts) of + {ok, Alias} -> + resolve_owner_aliases(Alias, Opts); + error -> + case hb_maps:find(OwnerKey, Request, Opts) of + {ok, Owner} -> + {ok, normalize_owner_id(Owner)}; + error -> + {ok, undefined} + end + end. +%% @doc Resolve one or more comma-separated owner aliases into normalized owner IDs. +resolve_owner_aliases(Alias, Opts) -> + case + lists:filter( + fun(Part) -> byte_size(Part) > 0 end, + binary:split(hb_util:bin(Alias), <<",">>, [global]) + ) + of + [SingleAlias] -> + case resolve_owner_alias(SingleAlias, Opts) of + {ok, Addr} -> {ok, normalize_owner_id(Addr)}; + {error, _} = Error -> Error + end; + Aliases -> + resolve_owner_aliases(Aliases, Opts, []) + end. +%% @doc Resolve a list of owner aliases into normalized owner IDs. +resolve_owner_aliases([], _Opts, Acc) -> + {ok, lists:reverse(Acc)}; +resolve_owner_aliases([Alias | Rest], Opts, Acc) -> + case resolve_owner_alias(Alias, Opts) of + {ok, Addr} -> + resolve_owner_aliases(Rest, Opts, [normalize_owner_id(Addr) | Acc]); + {error, _} = Error -> + Error + end. +%% @doc Parse an L1 tag filter from `Name:Value` form. +parse_tag_filter(Key, Request, Opts) -> + case hb_maps:find(Key, Request, Opts) of + {ok, Tag} -> + case binary:split(hb_util:bin(Tag), <<":">>, [global]) of + [Name, Value] + when byte_size(Name) > 0 andalso byte_size(Value) > 0 -> + {ok, #{name => Name, value => Value}}; + _ -> + {error, invalid_tag_filter} + end; + error -> + {ok, undefined} + end. +%% @doc Process the `id=...` copycat path for an already indexed L1 TX. 
+%% applies L1-level owner/tag filters on the lightweight TX header first, then, +%% if the TX passes and is a bundle, loads the full L1 payload once and indexes +%% descendants in-memory (under the ?MEMORY_SAFE_CAP limit) up to the requested safe depth +%% (defaults to full recursion till the set copycat_depth_recursion_cap). +process_l1_request(TXID, Request, Opts) -> + Depth = request_depth(Request, <<"safe_max">>, Opts), + case parse_owner_filter(Request, Opts) of + {error, _} = Error -> + Error; + {ok, OwnerFilters} -> + case parse_tag_filter(<<"include-tag">>, Request, Opts) of + {error, _} = Error -> + Error; + {ok, IncludeTag} -> + case parse_tag_filter(<<"exclude-tag">>, Request, Opts) of + {error, _} = Error -> + Error; + {ok, ExcludeTag} -> + {ok, + process_l1_candidate( + TXID, + OwnerFilters#{ + include_tag => IncludeTag, + exclude_tag => ExcludeTag + }, + Depth, + Opts + )} + end + end + end. +%% @doc Parse the requested recursion depth and clamp it to the configured safe cap. +%% `safe_max` resolves to the current copycat depth recursion cap. +request_depth(Request, Default, Opts) -> + MaxRecursionCap = get_depth_recursion_cap(Opts), + RequestedDepth = + case hb_maps:get(<<"depth">>, Request, Default, Opts) of + <<"safe_max">> -> MaxRecursionCap; + Value -> hb_util:int(Value) + end, + erlang:min( + MaxRecursionCap, + erlang:max( + ?DEPTH_L1_OFFSETS, + RequestedDepth + ) + ). +%% @doc Return the first matching L1 filter reason for a TX header, or `pass`. 
+l1_filter_reason(TX, Filters) -> + IncludeOwner = maps:get(include_owner, Filters, undefined), + ExcludeOwner = maps:get(exclude_owner, Filters, undefined), + IncludeTag = maps:get(include_tag, Filters, undefined), + ExcludeTag = maps:get(exclude_tag, Filters, undefined), + Owner = ar_tx:get_owner_address(TX), + case owner_matches_filter(Owner, IncludeOwner) of + false when IncludeOwner =/= undefined -> + include_owner_mismatch; + _ -> + case owner_matches_filter(Owner, ExcludeOwner) of + true -> + exclude_owner_match; + false -> + case IncludeTag of + undefined -> + case ExcludeTag of + undefined -> pass; + _ -> + case has_tag_pair(TX, ExcludeTag) of + true -> exclude_tag_match; + false -> pass + end + end; + _ -> + case has_tag_pair(TX, IncludeTag) of + false -> include_tag_mismatch; + true -> + case ExcludeTag of + undefined -> pass; + _ -> + case has_tag_pair(TX, ExcludeTag) of + true -> exclude_tag_match; + false -> pass + end + end + end + end + end + end. +%% @doc Match an owner against an undefined, single-owner, or multi-owner filter. +owner_matches_filter(_Owner, undefined) -> + false; +owner_matches_filter(Owner, Owners) when is_list(Owners) -> + lists:member(Owner, Owners); +owner_matches_filter(Owner, FilterOwner) -> + Owner =:= FilterOwner. +has_tag_pair(#tx{tags = Tags}, #{name := Name, value := Value}) -> + TagValue = dev_arweave_common:tagfind(Name, Tags, not_found), + case TagValue of + not_found -> + false; + _ -> + LowerTagValue = hb_util:to_lower(TagValue), + LowerValue = hb_util:to_lower(Value), + case LowerTagValue of + LowerValue -> true; + _ -> false + end +end; +has_tag_pair(_, _) -> + false. %% @doc Parse the range from the request. parse_range(Request, Opts) -> maybe @@ -401,6 +647,248 @@ process_txs(ValidTXs, BlockStartOffset, Opts) -> Results ). +%% @doc Process a single indexed L1 TX candidate after lightweight filter checks. 
+process_l1_candidate(TXID, Filters, Depth, Opts) -> + Skipped = #{items_count => 0, bundle_count => 0, skipped_count => 1}, + NormalizedTXID = hb_util:native_id(TXID), + EncodedTXID = hb_util:encode(NormalizedTXID), + IndexStore = hb_store_arweave:store_from_opts(Opts), + case hb_store_arweave:read_offset(IndexStore, NormalizedTXID) of + {ok, + #{ + <<"codec-device">> := <<"tx@1.0">>, + <<"start-offset">> := StartOffset, + <<"length">> := Length + }} -> + case resolve_tx_header(EncodedTXID, Opts) of + {ok, TX} -> + case l1_filter_reason(TX, Filters) of + pass -> + case is_bundle_tx(TX, Opts) of + false -> + ?event( + copycat_short, + {arweave_tx_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, not_bundle} + } + ), + Skipped; + true -> + case Length =< get_memory_safe_cap(Opts) of + false -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, memory_safe_cap_exceeded} + } + ), + #{ + items_count => 0, + bundle_count => 1, + skipped_count => 1 + }; + true -> + case hb_store_arweave:read_chunks( + StartOffset, + Length, + Opts + ) of + {ok, BundleData} -> + {TotalTime, IndexRes} = timer:tc( + fun() -> + index_bundle_bytes( + BundleData, + StartOffset, + Depth, + IndexStore, + Opts + ) + end + ), + case IndexRes of + {ok, ItemsCount} -> + record_event_metrics( + <<"item_indexed">>, + ItemsCount, + TotalTime + ), + #{ + items_count => ItemsCount, + bundle_count => 1, + skipped_count => 0 + }; + {error, Reason} -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, Reason} + } + ), + #{ + items_count => 0, + bundle_count => 1, + skipped_count => 1 + } + end; + {error, Reason} -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, Reason} + } + ), + #{ + items_count => 0, + bundle_count => 1, + skipped_count => 1 + }; + not_found -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, 
EncodedTXID}}, + {reason, not_found} + } + ), + #{ + items_count => 0, + bundle_count => 1, + skipped_count => 1 + } + end + end + end; + FilterReason -> + ?event( + copycat_short, + {arweave_tx_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, FilterReason} + } + ), + Skipped + end; + error -> + Skipped + end; + {ok, _OtherOffset} -> + ?event( + copycat_short, + {arweave_tx_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, not_tx} + } + ), + Skipped; + not_found -> + ?event( + copycat_short, + {arweave_tx_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, missing_offset} + } + ), + Skipped + end. + +index_bundle_bytes(_BundleData, _BundleStartOffset, Depth, _Store, _Opts) + when Depth =< 0 -> + {ok, 0}; +index_bundle_bytes(BundleData, BundleStartOffset, Depth, Store, Opts) -> + case ar_bundles:decode_bundle_header(BundleData) of + invalid_bundle_header -> + {error, invalid_bundle_header}; + {ItemsBin, BundleIndex} -> + HeaderSize = byte_size(BundleData) - byte_size(ItemsBin), + index_bundle_items( + BundleIndex, + ItemsBin, + BundleStartOffset + HeaderSize, + Depth, + Store, + Opts, + 0 + ) + end. + +%% @doc Index bundle children from decoded bundle bytes and recurse descendants in-memory. 
+index_bundle_items([], _ItemsBin, _ItemStartOffset, _Depth, _Store, _Opts, Count) -> + {ok, Count}; +index_bundle_items( + [{ItemID, Size} | Rest], + ItemsBin, + ItemStartOffset, + Depth, + Store, + Opts, + Count +) when byte_size(ItemsBin) >= Size -> + ItemBinary = binary:part(ItemsBin, 0, Size), + hb_store_arweave:write_offset( + Store, + hb_util:encode(ItemID), + <<"ans104@1.0">>, + ItemStartOffset, + Size + ), + DescendantCount = + case Depth > 1 of + true -> + index_bundle_descendants( + ItemBinary, + ItemStartOffset, + Depth - 1, + Store, + Opts + ); + false -> + 0 + end, + index_bundle_items( + Rest, + binary:part(ItemsBin, Size, byte_size(ItemsBin) - Size), + ItemStartOffset + Size, + Depth, + Store, + Opts, + Count + 1 + DescendantCount + ); +index_bundle_items(_BundleIndex, _ItemsBin, _ItemStartOffset, _Depth, _Store, _Opts, _Count) -> + {error, invalid_bundle_header}. + +%% @doc Recurse into a nested bundle data item from in-memory bytes. +index_bundle_descendants(_ItemBinary, _ItemStartOffset, Depth, _Store, _Opts) + when Depth =< 0 -> + 0; +index_bundle_descendants(ItemBinary, ItemStartOffset, Depth, Store, Opts) -> + try ar_bundles:deserialize_header(ItemBinary) of + {ok, HeaderSize, HeaderTX} -> + case is_bundle_tx(HeaderTX, Opts) of + true -> + case index_bundle_bytes( + HeaderTX#tx.data, + ItemStartOffset + HeaderSize, + Depth, + Store, + Opts + ) of + {ok, Count} -> Count; + _ -> 0 + end; + false -> + 0 + end; + _ -> + 0 + catch + _:_ -> + 0 + end. + %% @doc Check whether a TX header indicates bundle content. is_bundle_tx(TX, _Opts) -> ar_tx:type(TX) =/= binary. 
From 5a0f08568854dc352f066d0271d2f7564ea25e9d Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Sat, 7 Mar 2026 12:00:00 +0100 Subject: [PATCH 03/68] feat: Add L1 TX offset loading and configuration --- src/preloaded/query/dev_copycat_arweave.erl | 83 +++++++++++++++++++-- 1 file changed, 78 insertions(+), 5 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 98522d768..81585ddec 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -13,8 +13,8 @@ -define(ARWEAVE_DEVICE, <<"~arweave@2.9">>). -define(DEPTH_L1_OFFSETS, 1). -define(DEPTH_RECURSION_CAP, 4). -%% 1GB in bytes --define(MEMORY_SAFE_CAP, 1024 * 1024 * 1024). +%% 6GB in bytes +-define(MEMORY_SAFE_CAP, 6 * 1024 * 1024 * 1024). % GET /~cron@1.0/once&cron-path=~copycat@1.0/arweave @@ -170,6 +170,10 @@ parse_tag_filter(Key, Request, Opts) -> %% (defaults to full recursion till the set copycat_depth_recursion_cap). 
process_l1_request(TXID, Request, Opts) -> Depth = request_depth(Request, <<"safe_max">>, Opts), + LoadL1Offset = + hb_util:bool( + hb_maps:get(<<"load-l1-offset">>, Request, false, Opts) + ), case parse_owner_filter(Request, Opts) of {error, _} = Error -> Error; @@ -186,6 +190,7 @@ process_l1_request(TXID, Request, Opts) -> process_l1_candidate( TXID, OwnerFilters#{ + load_l1_offset => LoadL1Offset, include_tag => IncludeTag, exclude_tag => ExcludeTag }, @@ -653,7 +658,14 @@ process_l1_candidate(TXID, Filters, Depth, Opts) -> NormalizedTXID = hb_util:native_id(TXID), EncodedTXID = hb_util:encode(NormalizedTXID), IndexStore = hb_store_arweave:store_from_opts(Opts), - case hb_store_arweave:read_offset(IndexStore, NormalizedTXID) of + LoadL1Offset = maps:get(load_l1_offset, Filters, false), + case ensure_l1_tx_offset( + NormalizedTXID, + EncodedTXID, + IndexStore, + LoadL1Offset, + Opts + ) of {ok, #{ <<"codec-device">> := <<"tx@1.0">>, @@ -784,16 +796,77 @@ process_l1_candidate(TXID, Filters, Depth, Opts) -> } ), Skipped; - not_found -> + {error, Reason} -> ?event( copycat_short, {arweave_tx_skipped, {tx_id, {explicit, EncodedTXID}}, - {reason, missing_offset} + {reason, Reason} } ), Skipped end. +%% @doc Ensure the root L1 TX offset exists locally before `id=...` indexing. +%% if the offset is missing and `load_l1_offset` is enabled, fetches the TX +%% offset metadata from Arweave, writes it to the local offset store, and +%% retries the local lookup. 
+ensure_l1_tx_offset(_TXID, _EncodedTXID, IndexStore, _LoadL1Offset, _Opts) + when is_map(IndexStore) =:= false -> + {error, missing_offset}; +ensure_l1_tx_offset(TXID, EncodedTXID, IndexStore, LoadL1Offset, Opts) -> + case hb_store_arweave:read_offset(IndexStore, TXID) of + {ok, _} = OffsetRes -> + OffsetRes; + not_found when LoadL1Offset -> + ?event( + copycat_short, + {arweave_tx_offset_loading, + {tx_id, {explicit, EncodedTXID}}, + {source, network} + } + ), + case load_l1_tx_offset(EncodedTXID, IndexStore, Opts) of + ok -> + case hb_store_arweave:read_offset(IndexStore, TXID) of + {ok, _} = OffsetRes -> + OffsetRes; + not_found -> + {error, missing_offset} + end; + {error, Reason} -> + {error, Reason} + end; + not_found -> + {error, missing_offset} + end. + +load_l1_tx_offset(TXID, IndexStore, Opts) -> + case hb_http:request( + #{ + <<"path">> => <<"/arweave/tx/", TXID/binary, "/offset">>, + <<"method">> => <<"GET">> + }, + Opts + ) of + {ok, #{ <<"body">> := OffsetBody }} -> + OffsetMsg = hb_json:decode(OffsetBody), + EndOffset = hb_util:int(maps:get(<<"offset">>, OffsetMsg)), + Size = hb_util:int(maps:get(<<"size">>, OffsetMsg)), + StartOffset = EndOffset - Size, + ok = + hb_store_arweave:write_offset( + IndexStore, + TXID, + <<"tx@1.0">>, + StartOffset, + Size + ), + ok; + {error, Reason} -> + {error, Reason}; + not_found -> + {error, not_found} + end. 
index_bundle_bytes(_BundleData, _BundleStartOffset, Depth, _Store, _Opts) when Depth =< 0 -> From 1b2f8d7e54a6834940fa79c43b77a13ec5cd765d Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Tue, 10 Mar 2026 12:00:00 +0100 Subject: [PATCH 04/68] feat: Add block N depth indexing --- src/core/resolver/hb_opts.erl | 3 ++ src/preloaded/query/dev_copycat_arweave.erl | 34 ++++++++++++--------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/core/resolver/hb_opts.erl b/src/core/resolver/hb_opts.erl index 6d271c518..2b529058c 100644 --- a/src/core/resolver/hb_opts.erl +++ b/src/core/resolver/hb_opts.erl @@ -283,6 +283,9 @@ raw_default_message() -> <<"relay-http-client">> => httpc, % The default codec to use for commitment signatures. <<"commitment-device">> => <<"httpsig@1.0">>, + % Copycat-specific options. + copycat_memory_cap => 6 * 1024 * 1024 * 1024, + copycat_depth_recursion_cap => 4, % Dev options <<"mode">> => debug, <<"profiling">> => true, diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 81585ddec..78ccb2e59 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -12,9 +12,6 @@ -define(ARWEAVE_DEVICE, <<"~arweave@2.9">>). -define(DEPTH_L1_OFFSETS, 1). --define(DEPTH_RECURSION_CAP, 4). -%% 6GB in bytes --define(MEMORY_SAFE_CAP, 6 * 1024 * 1024 * 1024). 
% GET /~cron@1.0/once&cron-path=~copycat@1.0/arweave @@ -27,9 +24,16 @@ arweave(_Base, Request, Opts) -> case hb_maps:find(<<"id">>, Request, Opts) of {ok, TXID} -> process_l1_request(TXID, Request, Opts); error -> + BlockDepth = request_depth(Request, ?DEPTH_L1_OFFSETS, Opts), case parse_range(Request, Opts) of {error, unavailable} -> {error, unavailable}; - {ok, {From, To}} -> fetch_blocks(Request, From, To, Opts) + {ok, {From, To}} -> + fetch_blocks( + Request, + From, + To, + Opts#{copycat_block_depth => BlockDepth} + ) end end; <<"list">> -> @@ -48,19 +52,13 @@ set_memory_safe_cap(Cap, Opts) when is_integer(Cap), Cap > 0 -> %% in very nested bundles (very rare). set_depth_recursion_cap(Cap, Opts) when is_integer(Cap), Cap > 0 -> Opts#{copycat_depth_recursion_cap => Cap}. -%% @doc Get the set depth recursion cap. if not set, defaults to ?DEPTH_RECURSION_CAP +%% @doc Get the set depth recursion cap from hb_opts. get_depth_recursion_cap(Opts) -> - case maps:get(copycat_depth_recursion_cap, Opts, not_found) of - not_found -> ?DEPTH_RECURSION_CAP; - Cap -> Cap - end. + hb_opts:get(copycat_depth_recursion_cap, undefined, Opts). %% @doc Get the L1 TX data size that gets handled in-memory -%% defaults to ?MEMORY_SAFE_CAP if not set. +%% from hb_opts. get_memory_safe_cap(Opts) -> - case maps:get(copycat_memory_cap, Opts, not_found) of - not_found -> ?MEMORY_SAFE_CAP; - Cap -> Cap - end. + hb_opts:get(copycat_memory_cap, undefined, Opts). %% @doc Normalize an owner address into the native ID form used for comparisons. normalize_owner_id(Addr) -> hb_util:native_id(hb_util:bin(Addr)). @@ -166,7 +164,7 @@ parse_tag_filter(Key, Request, Opts) -> %% @doc Process the `id=...` copycat path for an already indexed L1 TX. 
%% applies L1-level owner/tag filters on the lightweight TX header first, then, %% if the TX passes and is a bundle, loads the full L1 payload once and indexes -%% descendants in-memory (under the ?MEMORY_SAFE_CAP limit) up to the requested safe depth +%% descendants in-memory (under the configured copycat_memory_cap) up to the requested safe depth %% (defaults to full recursion till the set copycat_depth_recursion_cap). process_l1_request(TXID, Request, Opts) -> Depth = request_depth(Request, <<"safe_max">>, Opts), @@ -321,6 +319,9 @@ normalize_height(Height, Opts) -> {ok, RequestedHeight} end. +get_block_depth(Opts) -> + maps:get(copycat_block_depth, Opts, ?DEPTH_L1_OFFSETS). + latest_height(Opts) -> case hb_ao:resolve( <>, @@ -557,6 +558,7 @@ parallel_map(Items, Fun, Opts) -> process_tx({{padding, _PaddingRoot}, _EndOffset}, _BlockStartOffset, _Opts) -> #{items_count => 0, bundle_count => 0, skipped_count => 0}; process_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, Opts) -> + Depth = get_block_depth(Opts), IndexStore = hb_store_arweave:store_from_opts(Opts), TXID = hb_util:encode(TX#tx.id), TXEndOffset = BlockStartOffset + EndOffset, @@ -577,6 +579,8 @@ process_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, Opts) -> end), case is_bundle_tx(TX, Opts) of false -> #{items_count => 0, bundle_count => 0, skipped_count => 0}; + true when Depth > ?DEPTH_L1_OFFSETS -> + process_l1_candidate(TX#tx.id, #{}, Depth, Opts); true -> % Lightweight processing of block transactions to depth 2. 
We % can avoid loading the full L1 TX data into memory, and instead From 4f4ed7957eacea5b2c63749c6ad1ec938cec7e65 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Tue, 10 Mar 2026 12:00:00 +0100 Subject: [PATCH 05/68] test: Add tests and refactor copycat internals --- src/core/resolver/hb_opts.erl | 4 +- src/preloaded/query/dev_copycat_arweave.erl | 978 +++++++++++++++++--- 2 files changed, 837 insertions(+), 145 deletions(-) diff --git a/src/core/resolver/hb_opts.erl b/src/core/resolver/hb_opts.erl index 2b529058c..74bca1805 100644 --- a/src/core/resolver/hb_opts.erl +++ b/src/core/resolver/hb_opts.erl @@ -284,8 +284,8 @@ raw_default_message() -> % The default codec to use for commitment signatures. <<"commitment-device">> => <<"httpsig@1.0">>, % Copycat-specific options. - copycat_memory_cap => 6 * 1024 * 1024 * 1024, - copycat_depth_recursion_cap => 4, + <<"copycat-memory-cap">> => 6 * 1024 * 1024 * 1024, + <<"copycat-depth-recursion-cap">> => 6, % 2x the deepest we've seen to date % Dev options <<"mode">> => debug, <<"profiling">> => true, diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 78ccb2e59..9a8b489ef 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -11,7 +11,12 @@ -include_lib("eunit/include/eunit.hrl"). -define(ARWEAVE_DEVICE, <<"~arweave@2.9">>). --define(DEPTH_L1_OFFSETS, 1). +% By default we'll index blocks to depth 2 which is: +% - depth 1: L1 TXs +% - depth 2: L2 bundles and dataitems +% Note: this means that the children of L2 bundles are not indexed at +% depth 2. +-define(DEFAULT_BLOCK_DEPTH, 2). 
% GET /~cron@1.0/once&cron-path=~copycat@1.0/arweave @@ -24,16 +29,11 @@ arweave(_Base, Request, Opts) -> case hb_maps:find(<<"id">>, Request, Opts) of {ok, TXID} -> process_l1_request(TXID, Request, Opts); error -> - BlockDepth = request_depth(Request, ?DEPTH_L1_OFFSETS, Opts), + TargetDepth = request_depth(Request, ?DEFAULT_BLOCK_DEPTH, Opts), case parse_range(Request, Opts) of {error, unavailable} -> {error, unavailable}; {ok, {From, To}} -> - fetch_blocks( - Request, - From, - To, - Opts#{copycat_block_depth => BlockDepth} - ) + fetch_blocks(From, To, TargetDepth, Opts) end end; <<"list">> -> @@ -66,15 +66,15 @@ normalize_owner_id(Addr) -> %% @doc Adds an address to the owners aliases cache in Opts, mapping %% Alias -> native address for fast lookup and once per address computation. add_owner_alias(Addr, Alias, Opts) when is_binary(Alias) -> - ExistingAliases = maps:get(owner_aliases, Opts, #{}), + ExistingAliases = hb_opts:get(owner_aliases, #{}, Opts), Opts#{ owner_aliases => ExistingAliases#{ Alias => normalize_owner_id(Addr) }}; add_owner_alias(_Addr, Alias, _Opts) -> throw({invalid_owner_alias, Alias}). %% @doc Retrieve the address of a given alias. resolve_owner_alias(Alias, Opts) when is_binary(Alias) -> - Aliases = maps:get(owner_aliases, Opts, #{}), - case maps:find(Alias, Aliases) of + Aliases = hb_opts:get(owner_aliases, #{}, Opts), + case hb_maps:find(Alias, Aliases) of {ok, Addr} -> {ok, Addr}; error -> {error, {owner_alias_not_found, Alias}} end; @@ -83,29 +83,28 @@ resolve_owner_alias(Alias, _Opts) -> %% @doc Parse include/exclude owner filters from the request. %% Supports direct owner values and owner aliases. 
parse_owner_filter(Request, Opts) -> - case resolve_owner_filter_value( - <<"include-owner">>, - <<"include-owner-alias">>, - Request, - Opts - ) of - {error, _} = Error -> - Error; - {ok, IncludeOwner} -> - case resolve_owner_filter_value( + maybe + {ok, IncludeOwner} ?= + resolve_owner_filter_value( + <<"include-owner">>, + <<"include-owner-alias">>, + Request, + Opts + ), + {ok, ExcludeOwner} ?= + resolve_owner_filter_value( <<"exclude-owner">>, <<"exclude-owner-alias">>, Request, Opts - ) of - {error, _} = Error -> - Error; - {ok, ExcludeOwner} -> - {ok, #{ - include_owner => IncludeOwner, - exclude_owner => ExcludeOwner - }} - end + ), + {ok, #{ + include_owner => IncludeOwner, + exclude_owner => ExcludeOwner + }} + else + {error, _} = Error -> + Error end. %% @doc Resolve one owner filter value from either a direct owner param or %% a comma-separated owner alias param. Alias takes precedence. @@ -164,41 +163,38 @@ parse_tag_filter(Key, Request, Opts) -> %% @doc Process the `id=...` copycat path for an already indexed L1 TX. %% applies L1-level owner/tag filters on the lightweight TX header first, then, %% if the TX passes and is a bundle, loads the full L1 payload once and indexes -%% descendants in-memory (under the configured copycat_memory_cap) up to the requested safe depth -%% (defaults to full recursion till the set copycat_depth_recursion_cap). +%% descendants in-memory (under the configured copycat_memory_cap) up to the +%% requested safe depth (defaults to full recursion till the set +%% copycat_depth_recursion_cap). 
process_l1_request(TXID, Request, Opts) -> Depth = request_depth(Request, <<"safe_max">>, Opts), - LoadL1Offset = + QueryL1Offset = hb_util:bool( - hb_maps:get(<<"load-l1-offset">>, Request, false, Opts) + hb_maps:get(<<"query-l1-offset">>, Request, false, Opts) ), - case parse_owner_filter(Request, Opts) of + maybe + {ok, OwnerFilters} ?= parse_owner_filter(Request, Opts), + {ok, IncludeTag} ?= parse_tag_filter(<<"include-tag">>, Request, Opts), + {ok, ExcludeTag} ?= parse_tag_filter(<<"exclude-tag">>, Request, Opts), + {ok, + maybe_process_l1_tx( + TXID, + OwnerFilters#{ + include_tag => IncludeTag, + exclude_tag => ExcludeTag + }, + Depth, + QueryL1Offset, + Opts + )} + else {error, _} = Error -> - Error; - {ok, OwnerFilters} -> - case parse_tag_filter(<<"include-tag">>, Request, Opts) of - {error, _} = Error -> - Error; - {ok, IncludeTag} -> - case parse_tag_filter(<<"exclude-tag">>, Request, Opts) of - {error, _} = Error -> - Error; - {ok, ExcludeTag} -> - {ok, - process_l1_candidate( - TXID, - OwnerFilters#{ - load_l1_offset => LoadL1Offset, - include_tag => IncludeTag, - exclude_tag => ExcludeTag - }, - Depth, - Opts - )} - end - end + Error end. -%% @doc Parse the requested recursion depth and clamp it to the configured safe cap. +%% @doc Parse the requested recursion depth and clamp it to the configured +%% safe cap. Depth is relative so depth 1 is always one level below the +%% root specified in the request (either a block or an L1 TX ID). +%% %% `safe_max` resolves to the current copycat depth recursion cap. request_depth(Request, Default, Opts) -> MaxRecursionCap = get_depth_recursion_cap(Opts), @@ -209,10 +205,7 @@ request_depth(Request, Default, Opts) -> end, erlang:min( MaxRecursionCap, - erlang:max( - ?DEPTH_L1_OFFSETS, - RequestedDepth - ) + erlang:max(1, RequestedDepth) ). %% @doc Return the first matching L1 filter reason for a TX header, or `pass`. 
l1_filter_reason(TX, Filters) -> @@ -221,39 +214,14 @@ l1_filter_reason(TX, Filters) -> IncludeTag = maps:get(include_tag, Filters, undefined), ExcludeTag = maps:get(exclude_tag, Filters, undefined), Owner = ar_tx:get_owner_address(TX), - case owner_matches_filter(Owner, IncludeOwner) of - false when IncludeOwner =/= undefined -> - include_owner_mismatch; - _ -> - case owner_matches_filter(Owner, ExcludeOwner) of - true -> - exclude_owner_match; - false -> - case IncludeTag of - undefined -> - case ExcludeTag of - undefined -> pass; - _ -> - case has_tag_pair(TX, ExcludeTag) of - true -> exclude_tag_match; - false -> pass - end - end; - _ -> - case has_tag_pair(TX, IncludeTag) of - false -> include_tag_mismatch; - true -> - case ExcludeTag of - undefined -> pass; - _ -> - case has_tag_pair(TX, ExcludeTag) of - true -> exclude_tag_match; - false -> pass - end - end - end - end - end + maybe + pass ?= maybe_include_owner(Owner, IncludeOwner), + pass ?= maybe_exclude_owner(Owner, ExcludeOwner), + pass ?= maybe_include_tag(TX, IncludeTag), + pass ?= maybe_exclude_tag(TX, ExcludeTag), + pass + else + Reason -> Reason end. %% @doc Match an owner against an undefined, single-owner, or multi-owner filter. owner_matches_filter(_Owner, undefined) -> @@ -263,6 +231,38 @@ owner_matches_filter(Owner, Owners) when is_list(Owners) -> owner_matches_filter(Owner, FilterOwner) -> Owner =:= FilterOwner. +maybe_include_owner(_Owner, undefined) -> + pass; +maybe_include_owner(Owner, IncludeOwner) -> + case owner_matches_filter(Owner, IncludeOwner) of + true -> pass; + false -> include_owner_mismatch + end. + +maybe_exclude_owner(_Owner, undefined) -> + pass; +maybe_exclude_owner(Owner, ExcludeOwner) -> + case owner_matches_filter(Owner, ExcludeOwner) of + true -> exclude_owner_match; + false -> pass + end. + +maybe_include_tag(_TX, undefined) -> + pass; +maybe_include_tag(TX, IncludeTag) -> + case has_tag_pair(TX, IncludeTag) of + true -> pass; + false -> include_tag_mismatch + end. 
+ +maybe_exclude_tag(_TX, undefined) -> + pass; +maybe_exclude_tag(TX, ExcludeTag) -> + case has_tag_pair(TX, ExcludeTag) of + true -> exclude_tag_match; + false -> pass + end. + has_tag_pair(#tx{tags = Tags}, #{name := Name, value := Value}) -> TagValue = dev_arweave_common:tagfind(Name, Tags, not_found), case TagValue of @@ -419,38 +419,38 @@ classify_txs(TXIDs, Opts) -> %% @doc Fetch blocks from an Arweave node while moving downward from `Current'. %% If `To' is provided, every block in [`To', `Current'] is processed. If `To' %% is omitted, stop at the first block where any TX is already indexed. -fetch_blocks(Req, Current, To, _Opts) when is_integer(To), Current < To -> +fetch_blocks(Current, To, TargetDepth, _Opts) + when is_integer(To), Current < To -> ?event(copycat_short, {arweave_block_indexing_completed, {reached_target, To}, - {initial_request, Req} + {target_depth, TargetDepth} } ), {ok, To}; -fetch_blocks(_Req, Current, undefined, _Opts) when Current < 0 -> +fetch_blocks(Current, undefined, _TargetDepth, _Opts) when Current < 0 -> {ok, 0}; -fetch_blocks(Req, Current, undefined, Opts) -> +fetch_blocks(Current, undefined, TargetDepth, Opts) -> BlockRes = fetch_block_header(Current, Opts), case is_already_indexed(BlockRes, Opts) of true -> ?event(copycat_short, {arweave_block_indexing_completed, - {stop_at_indexed_block, Current}, - {initial_request, Req} + {stop_at_indexed_block, Current} } ), {ok, Current}; false -> observe_event(<<"block_indexed">>, fun() -> - process_block(BlockRes, Current, undefined, Opts) + process_block(BlockRes, Current, undefined, TargetDepth, Opts) end), - fetch_blocks(Req, Current - 1, undefined, Opts) + fetch_blocks(Current - 1, undefined, TargetDepth, Opts) end; -fetch_blocks(Req, Current, To, Opts) -> +fetch_blocks(Current, To, TargetDepth, Opts) -> observe_event(<<"block_indexed">>, fun() -> - fetch_and_process_block(Current, To, Opts) + fetch_and_process_block(Current, To, TargetDepth, Opts) end), - fetch_blocks(Req, 
Current - 1, To, Opts). + fetch_blocks(Current - 1, To, TargetDepth, Opts). %% @doc Determine whether a fetched block is considered indexed. %% A block is indexed when any TX from its `txs' list is in the index. @@ -460,17 +460,17 @@ is_already_indexed({ok, Block}, Opts) -> is_already_indexed({error, _}, _Opts) -> false. -fetch_and_process_block(Current, To, Opts) -> +fetch_and_process_block(Current, To, TargetDepth, Opts) -> BlockRes = fetch_block_header(Current, Opts), - process_block(BlockRes, Current, To, Opts). + process_block(BlockRes, Current, To, TargetDepth, Opts). %% @doc Process a block. -process_block(BlockRes, Current, To, Opts) -> +process_block(BlockRes, Current, To, TargetDepth, Opts) -> case BlockRes of {ok, Block} -> ?event(debug_copycat, {{processing_block, Current}, {indep_hash, hb_maps:get(<<"indep_hash">>, Block, <<>>)}}), - case maybe_index_ids(Block, Opts) of + case maybe_index_block(Block, TargetDepth, Opts) of {block_skipped, Results} -> TotalTXs = maps:get(total_txs, Results, 0), ?event( @@ -509,7 +509,7 @@ process_block(BlockRes, Current, To, Opts) -> end. %% @doc Index the IDs of all transactions in the block if configured to do so. -maybe_index_ids(Block, Opts) -> +maybe_index_block(Block, TargetDepth, Opts) -> TotalTXs = length(hb_maps:get(<<"txs">>, Block, [], Opts)), case hb_opts:get(arweave_index_ids, true, Opts) of false -> @@ -540,7 +540,8 @@ maybe_index_ids(Block, Opts) -> fun({{padding, _}, _}) -> false; (_) -> true end, TXsWithData ), - TXResults = process_txs(ValidTXs, BlockStartOffset, Opts), + TXResults = process_block_txs( + ValidTXs, BlockStartOffset, TargetDepth, Opts), {block_cached, TXResults#{total_txs => TotalTXs}} end end. @@ -555,10 +556,9 @@ parallel_map(Items, Fun, Opts) -> %% @doc Process a single transaction and return its contribution to the counters. 
%% Returns a map with keys: items_count, bundle_count, skipped_count -process_tx({{padding, _PaddingRoot}, _EndOffset}, _BlockStartOffset, _Opts) -> +process_block_tx({{padding, _PaddingRoot}, _EndOffset}, _BlockStartOffset, _TargetDepth, _Opts) -> #{items_count => 0, bundle_count => 0, skipped_count => 0}; -process_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, Opts) -> - Depth = get_block_depth(Opts), +process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, Opts) -> IndexStore = hb_store_arweave:store_from_opts(Opts), TXID = hb_util:encode(TX#tx.id), TXEndOffset = BlockStartOffset + EndOffset, @@ -579,8 +579,10 @@ process_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, Opts) -> end), case is_bundle_tx(TX, Opts) of false -> #{items_count => 0, bundle_count => 0, skipped_count => 0}; - true when Depth > ?DEPTH_L1_OFFSETS -> - process_l1_candidate(TX#tx.id, #{}, Depth, Opts); + true when TargetDepth > 2 -> + % Indexing a block to depth 3 or greater means we need to load + % and recurse into each of the L1 TXs in the block. + maybe_process_l1_tx(TX#tx.id, #{}, TargetDepth - 1, false, Opts); true -> % Lightweight processing of block transactions to depth 2. We % can avoid loading the full L1 TX data into memory, and instead @@ -638,10 +640,11 @@ process_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, Opts) -> %% When arweave_index_workers <= 1, processes sequentially (one worker at a time). %% When arweave_index_workers > 1, processes in parallel with the specified concurrency limit. %% Returns a map with keys: items_count, bundle_count, skipped_count. 
-process_txs(ValidTXs, BlockStartOffset, Opts) -> +process_txs(ValidTXs, BlockStartOffset, TargetDepth, Opts) -> Results = parallel_map( ValidTXs, - fun(TXWithData) -> process_tx(TXWithData, BlockStartOffset, Opts) end, + fun(TXWithData) -> process_tx( + TXWithData, BlockStartOffset, TargetDepth, Opts) end, Opts ), lists:foldl( @@ -657,17 +660,16 @@ process_txs(ValidTXs, BlockStartOffset, Opts) -> ). %% @doc Process a single indexed L1 TX candidate after lightweight filter checks. -process_l1_candidate(TXID, Filters, Depth, Opts) -> +process_l1_candidate(TXID, Filters, Depth, ReadL1Offset, Opts) -> Skipped = #{items_count => 0, bundle_count => 0, skipped_count => 1}, NormalizedTXID = hb_util:native_id(TXID), EncodedTXID = hb_util:encode(NormalizedTXID), IndexStore = hb_store_arweave:store_from_opts(Opts), - LoadL1Offset = maps:get(load_l1_offset, Filters, false), case ensure_l1_tx_offset( NormalizedTXID, EncodedTXID, IndexStore, - LoadL1Offset, + ReadL1Offset, Opts ) of {ok, @@ -817,11 +819,11 @@ process_l1_candidate(TXID, Filters, Depth, Opts) -> ensure_l1_tx_offset(_TXID, _EncodedTXID, IndexStore, _LoadL1Offset, _Opts) when is_map(IndexStore) =:= false -> {error, missing_offset}; -ensure_l1_tx_offset(TXID, EncodedTXID, IndexStore, LoadL1Offset, Opts) -> +ensure_l1_tx_offset(TXID, EncodedTXID, IndexStore, ReadL1Offset, Opts) -> case hb_store_arweave:read_offset(IndexStore, TXID) of {ok, _} = OffsetRes -> OffsetRes; - not_found when LoadL1Offset -> + not_found when ReadL1Offset -> ?event( copycat_short, {arweave_tx_offset_loading, @@ -892,10 +894,342 @@ index_bundle_bytes(BundleData, BundleStartOffset, Depth, Store, Opts) -> ) end. +%% @doc Lightweight bundle indexing. This function only loads the bundle header +%% and writes the index based solely on the header. For a more rigorous and +%% deeper indexing, see the index_full_bundle_xxx functions. 
+index_bundle_header(TXID, TXEndOffset, TXDataSize, TXStartOffset, IndexStore, Opts) -> + BundleRes = download_bundle_header(TXEndOffset, TXDataSize, Opts), + case BundleRes of + {ok, {BundleIndex, HeaderSize}} -> + % Batch event tracking: measure total time and count for + % all write_offset calls + {TotalTime, {_, ItemsCount}} = timer:tc(fun() -> + lists:foldl( + fun({ItemID, Size}, {ItemStartOffset, ItemsCountAcc}) -> + hb_store_arweave:write_offset( + IndexStore, + hb_util:encode(ItemID), + <<"ans104@1.0">>, + ItemStartOffset, + Size + ), + {ItemStartOffset + Size, ItemsCountAcc + 1} + end, + {TXStartOffset + HeaderSize, 0}, + BundleIndex + ) + end), + % Single event increment for the batch + record_event_metrics(<<"item_indexed">>, ItemsCount, TotalTime), + #{items_count => ItemsCount, bundle_count => 1, skipped_count => 0}; + {error, Reason} -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, TXID}}, + {reason, Reason} + } + ), + #{items_count => 0, bundle_count => 1, skipped_count => 1} + end. + +download_bundle_header(EndOffset, Size, Opts) -> + observe_event(<<"bundle_header">>, fun() -> + dev_arweave:bundle_header(EndOffset - Size, Opts) + end). + +header_chunk(invalid_bundle_header, _FirstChunk, _StartOffset, _Opts) -> + {error, invalid_bundle_header}; +header_chunk(HeaderSize, FirstChunk, _StartOffset, _Opts) + when HeaderSize =< byte_size(FirstChunk) -> + {ok, FirstChunk}; +header_chunk(HeaderSize, FirstChunk, StartOffset, Opts) -> + Res = + hb_ao:resolve( + << + ?ARWEAVE_DEVICE/binary, + "/chunk&offset=", + (hb_util:bin(StartOffset + byte_size(FirstChunk)))/binary, + "&length=", + (hb_util:bin(HeaderSize - byte_size(FirstChunk)))/binary + >>, + Opts + ), + case Res of + {ok, OtherChunks} -> {ok, <>}; + Other -> Other + end. + +%% @doc Process transactions: spawn workers and manage the worker pool. +%% This function processes transactions in parallel using parallel_map. 
+%% When arweave_index_workers <= 1, processes sequentially (one worker at a time). +%% When arweave_index_workers > 1, processes in parallel with the specified concurrency limit. +%% Returns a map with keys: items_count, bundle_count, skipped_count. +process_block_txs(ValidTXs, BlockStartOffset, TargetDepth, Opts) -> + Results = parallel_map( + ValidTXs, + fun(TXWithData) -> process_block_tx( + TXWithData, BlockStartOffset, TargetDepth, Opts) end, + Opts + ), + lists:foldl( + fun(Result, Acc) -> + #{ + items_count => maps:get(items_count, Result, 0) + maps:get(items_count, Acc, 0), + bundle_count => maps:get(bundle_count, Result, 0) + maps:get(bundle_count, Acc, 0), + skipped_count => maps:get(skipped_count, Result, 0) + maps:get(skipped_count, Acc, 0) + } + end, + #{items_count => 0, bundle_count => 0, skipped_count => 0}, + Results + ). + +%% @doc Process a single indexed L1 TX candidate after lightweight filter checks. +maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> + Skipped = #{items_count => 0, bundle_count => 0, skipped_count => 1}, + NormalizedTXID = hb_util:native_id(TXID), + EncodedTXID = hb_util:encode(NormalizedTXID), + IndexStore = hb_store_arweave:store_from_opts(Opts), + maybe + {ok, + #{ + <<"codec-device">> := <<"tx@1.0">>, + <<"start-offset">> := StartOffset, + <<"length">> := Length + }} ?= + ensure_l1_tx_offset( + NormalizedTXID, + EncodedTXID, + IndexStore, + QueryL1Offset, + Opts + ), + {ok, TX} ?= resolve_tx_header(EncodedTXID, Opts), + pass ?= l1_filter_reason(TX, Filters), + bundle ?= + case is_bundle_tx(TX, Opts) of + true -> bundle; + false -> not_bundle + end, + within_memory_cap ?= + case Length =< get_memory_safe_cap(Opts) of + true -> within_memory_cap; + false -> memory_safe_cap_exceeded + end, + process_l1_tx( + StartOffset, + Length, + Depth, + IndexStore, + EncodedTXID, + Opts + ) + else + {error, Reason} -> + ?event( + copycat_short, + {arweave_tx_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, Reason} 
+ } + ), + Skipped; + error -> + % event already logged in resolve_tx_header + Skipped; + not_bundle -> + ?event( + copycat_short, + {arweave_tx_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, not_bundle} + } + ), + Skipped; + memory_safe_cap_exceeded -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, memory_safe_cap_exceeded} + } + ), + #{ + items_count => 0, + bundle_count => 1, + skipped_count => 1 + }; + FilterReason -> + ?event( + copycat_short, + {arweave_tx_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, FilterReason} + } + ), + Skipped + end. + +%% @doc Load the L1 TX data into memory and index it. +%% +%% TODO: process_l1_tx and process_block_tx are very similar and can/should +%% be merged. +process_l1_tx( + StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) -> + case hb_store_arweave:read_chunks(StartOffset, Length, Opts) of + {ok, BundleData} -> + {TotalTime, IndexRes} = timer:tc( + fun() -> + index_full_bundle_bytes( + BundleData, + StartOffset, + Depth, + IndexStore, + Opts + ) + end + ), + case IndexRes of + {ok, ItemsCount} -> + record_event_metrics( + <<"item_indexed">>, + ItemsCount, + TotalTime + ), + #{ + items_count => ItemsCount, + bundle_count => 1, + skipped_count => 0 + }; + {error, Reason} -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, Reason} + } + ), + #{ + items_count => 0, + bundle_count => 1, + skipped_count => 1 + } + end; + {error, Reason} -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, Reason} + } + ), + #{ + items_count => 0, + bundle_count => 1, + skipped_count => 1 + }; + not_found -> + ?event( + copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, not_found} + } + ), + #{ + items_count => 0, + bundle_count => 1, + skipped_count => 1 + } + end. 
+%% @doc Ensure the root L1 TX offset exists locally before `id=...` indexing. +%% if the offset is missing and `query_l1_offset` is enabled, fetches the TX +%% offset metadata from Arweave, writes it to the local offset store, and +%% retries the local lookup. +ensure_l1_tx_offset(_TXID, _EncodedTXID, IndexStore, _LoadL1Offset, _Opts) + when is_map(IndexStore) =:= false -> + {error, missing_offset}; +ensure_l1_tx_offset(TXID, EncodedTXID, IndexStore, QueryL1Offset, Opts) -> + case hb_store_arweave:read_offset(IndexStore, TXID) of + {ok, _} = OffsetRes -> + OffsetRes; + not_found when QueryL1Offset -> + ?event( + copycat_short, + {arweave_tx_querying_offset, + {tx_id, {explicit, EncodedTXID}}, + {source, network} + } + ), + case query_l1_tx_offset(EncodedTXID, IndexStore, Opts) of + ok -> + case hb_store_arweave:read_offset(IndexStore, TXID) of + {ok, _} = OffsetRes -> + OffsetRes; + not_found -> + {error, missing_offset} + end; + {error, Reason} -> + {error, Reason} + end; + not_found -> + {error, missing_offset} + end. + +query_l1_tx_offset(TXID, IndexStore, Opts) -> + % TODO: move this into dev_arweave - I think? Unless it's possible to + % query this already via one of the existing ~arweave@2.9 paths? + case hb_http:request( + #{ + <<"path">> => <<"/arweave/tx/", TXID/binary, "/offset">>, + <<"method">> => <<"GET">> + }, + Opts + ) of + {ok, #{ <<"body">> := OffsetBody }} -> + OffsetMsg = hb_json:decode(OffsetBody), + EndOffset = hb_util:int(maps:get(<<"offset">>, OffsetMsg)), + Size = hb_util:int(maps:get(<<"size">>, OffsetMsg)), + StartOffset = EndOffset - Size, + ok = + hb_store_arweave:write_offset( + IndexStore, + TXID, + <<"tx@1.0">>, + StartOffset, + Size + ), + ok; + {error, Reason} -> + {error, Reason}; + not_found -> + {error, not_found} + end. 
+ +index_full_bundle_bytes(_BundleData, _BundleStartOffset, Depth, _Store, _Opts) + when Depth =< 0 -> + {ok, 0}; +index_full_bundle_bytes(BundleData, BundleStartOffset, Depth, Store, Opts) -> + case ar_bundles:decode_bundle_header(BundleData) of + invalid_bundle_header -> + {error, invalid_bundle_header}; + {ItemsBin, BundleIndex} -> + HeaderSize = byte_size(BundleData) - byte_size(ItemsBin), + index_full_bundle_items( + BundleIndex, + ItemsBin, + BundleStartOffset + HeaderSize, + Depth, + Store, + Opts, + 0 + ) + end. + %% @doc Index bundle children from decoded bundle bytes and recurse descendants in-memory. -index_bundle_items([], _ItemsBin, _ItemStartOffset, _Depth, _Store, _Opts, Count) -> +index_full_bundle_items([], _ItemsBin, _ItemStartOffset, _Depth, _Store, _Opts, Count) -> {ok, Count}; -index_bundle_items( +index_full_bundle_items( [{ItemID, Size} | Rest], ItemsBin, ItemStartOffset, @@ -915,7 +1249,7 @@ index_bundle_items( DescendantCount = case Depth > 1 of true -> - index_bundle_descendants( + index_full_bundle_descendants( ItemBinary, ItemStartOffset, Depth - 1, @@ -925,7 +1259,7 @@ index_bundle_items( false -> 0 end, - index_bundle_items( + index_full_bundle_items( Rest, binary:part(ItemsBin, Size, byte_size(ItemsBin) - Size), ItemStartOffset + Size, @@ -934,19 +1268,19 @@ index_bundle_items( Opts, Count + 1 + DescendantCount ); -index_bundle_items(_BundleIndex, _ItemsBin, _ItemStartOffset, _Depth, _Store, _Opts, _Count) -> +index_full_bundle_items(_BundleIndex, _ItemsBin, _ItemStartOffset, _Depth, _Store, _Opts, _Count) -> {error, invalid_bundle_header}. %% @doc Recurse into a nested bundle data item from in-memory bytes. 
-index_bundle_descendants(_ItemBinary, _ItemStartOffset, Depth, _Store, _Opts) +index_full_bundle_descendants(_ItemBinary, _ItemStartOffset, Depth, _Store, _Opts) when Depth =< 0 -> 0; -index_bundle_descendants(ItemBinary, ItemStartOffset, Depth, Store, Opts) -> +index_full_bundle_descendants(ItemBinary, ItemStartOffset, Depth, Store, Opts) -> try ar_bundles:deserialize_header(ItemBinary) of {ok, HeaderSize, HeaderTX} -> case is_bundle_tx(HeaderTX, Opts) of true -> - case index_bundle_bytes( + case index_full_bundle_bytes( HeaderTX#tx.data, ItemStartOffset + HeaderSize, Depth, @@ -970,12 +1304,6 @@ index_bundle_descendants(ItemBinary, ItemStartOffset, Depth, Store, Opts) -> is_bundle_tx(TX, _Opts) -> ar_tx:type(TX) =/= binary. -%% @doc Download and decode a bundle header from chunk data. -download_bundle_header(EndOffset, Size, Opts) -> - observe_event(<<"bundle_header">>, fun() -> - lib_arweave_common:bundle_header(EndOffset - Size, Size, Opts) - end). - resolve_tx_headers(TXIDs, Opts) -> Results = parallel_map( TXIDs, @@ -1122,8 +1450,24 @@ index_ids_test_parallel() -> ], Opts ), + % L3 item not read when doing L1 depth=1 + assert_item_not_read(<<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, Opts), ok. +block_depth_3_test() -> + %% Test block: https://viewblock.io/arweave/block/1827942 + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + {ok, 1827942} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=1827942&to=1827942&depth=3">>, + Opts + ), + % L3 item read when doing depth=2 + assert_item_read( + <<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, + Opts), + ok. + %% @doc Test a bundle header that fits in a single chunk. small_bundle_header_test_parallel() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), @@ -1637,6 +1981,345 @@ negative_from_index_test_parallel() -> ?assertNot(has_any_indexed_tx(NextBlock + 1, Opts)), ok. 
+owner_alias_roundtrip_test() -> + Opts1 = + add_owner_alias( + <<"FPjbN7EVwP3XwQJx8qnKqJDYa4TLJ0Y8gu4AaiUuW1c">>, + <<"turbo">>, + #{} + ), + Opts2 = + add_owner_alias( + <<"JNC6vBhU4sAK5T49VL4k79vNer0tZjM8fI1gpqUQK5g">>, + <<"redstone">>, + Opts1 + ), + ?assertEqual( + {ok, normalize_owner_id(<<"FPjbN7EVwP3XwQJx8qnKqJDYa4TLJ0Y8gu4AaiUuW1c">>)}, + resolve_owner_alias(<<"turbo">>, Opts2) + ), + ?assertEqual( + {ok, normalize_owner_id(<<"JNC6vBhU4sAK5T49VL4k79vNer0tZjM8fI1gpqUQK5g">>)}, + resolve_owner_alias(<<"redstone">>, Opts2) + ), + ?assertEqual( + {error, {owner_alias_not_found, <<"unknown">>}}, + resolve_owner_alias(<<"unknown">>, Opts2) + ), + ok. + +parse_tag_filter_test() -> + ?assertEqual( + {ok, #{name => <<"App-Name">>, value => <<"ao">>}}, + parse_tag_filter(<<"include-tag">>, #{<<"include-tag">> => <<"App-Name:ao">>}, #{}) + ), + ?assertEqual( + {ok, undefined}, + parse_tag_filter(<<"include-tag">>, #{}, #{}) + ), + ?assertEqual( + {error, invalid_tag_filter}, + parse_tag_filter(<<"include-tag">>, #{<<"include-tag">> => <<"App-Name">>}, #{}) + ), + ?assertEqual( + {error, invalid_tag_filter}, + parse_tag_filter(<<"include-tag">>, #{<<"include-tag">> => <<":ao">>}, #{}) + ), + ?assertEqual( + {error, invalid_tag_filter}, + parse_tag_filter(<<"include-tag">>, #{<<"include-tag">> => <<"App-Name:">>}, #{}) + ), + ok. 
+ +l1_filter_reason_test() -> + Owner = <<"owner-1">>, + OtherOwner = <<"owner-2">>, + TX = #tx{ + owner = <<"non-default-owner">>, + owner_address = Owner, + tags = [ + {<<"App-Name">>, <<"ao">>}, + {<<"Bundler-App-Name">>, <<"Redstone">>} + ] + }, + IncludeTag = #{name => <<"App-Name">>, value => <<"ao">>}, + ExcludeTag = #{name => <<"Bundler-App-Name">>, value => <<"Redstone">>}, + ?assertEqual(pass, l1_filter_reason(TX, #{})), + ?assertEqual(pass, l1_filter_reason(TX, #{include_owner => Owner})), + ?assertEqual( + include_owner_mismatch, + l1_filter_reason(TX, #{include_owner => OtherOwner}) + ), + ?assertEqual( + exclude_owner_match, + l1_filter_reason(TX, #{exclude_owner => Owner}) + ), + ?assertEqual( + pass, + l1_filter_reason(TX, #{exclude_owner => OtherOwner}) + ), + ?assertEqual(pass, l1_filter_reason(TX, #{include_tag => IncludeTag})), + ?assertEqual( + include_tag_mismatch, + l1_filter_reason( + TX, + #{include_tag => #{name => <<"Content-Type">>, value => <<"text/plain">>}} + ) + ), + ?assertEqual( + exclude_tag_match, + l1_filter_reason(TX, #{exclude_tag => ExcludeTag}) + ), + ?assertEqual( + pass, + l1_filter_reason( + TX, + #{exclude_tag => #{name => <<"Content-Type">>, value => <<"text/plain">>}} + ) + ), + ?assertEqual( + exclude_tag_match, + l1_filter_reason( + TX, + #{include_tag => IncludeTag, exclude_tag => ExcludeTag} + ) + ), + ?assertEqual( + pass, + l1_filter_reason(TX, #{include_owner => [OtherOwner, Owner]}) + ), + ok. 
+ +request_depth_clamping_test() -> + {_TestStore, _StoreOpts, Opts0} = setup_index_opts(), + ?assertEqual(6, request_depth(#{}, <<"safe_max">>, Opts0)), + ?assertEqual( + 2, + request_depth(#{<<"depth">> => <<"2">>}, <<"safe_max">>, Opts0) + ), + ?assertEqual( + 1, + request_depth(#{<<"depth">> => <<"0">>}, <<"safe_max">>, Opts0) + ), + ?assertEqual( + 6, + request_depth(#{<<"depth">> => <<"999">>}, <<"safe_max">>, Opts0) + ), + Opts1 = set_depth_recursion_cap(2, Opts0), + ?assertEqual(2, request_depth(#{}, <<"safe_max">>, Opts1)), + % no recursion cap set, use default from hb_opts + ?assertEqual(6, request_depth(#{}, <<"safe_max">>, #{})), + ok. + +memory_cap_setter_getter_test() -> + {_TestStore, _StoreOpts, Opts0} = setup_index_opts(), + ?assertEqual(6 * 1024 * 1024 * 1024, get_memory_safe_cap(Opts0)), + Opts1 = set_memory_safe_cap(1024, Opts0), + ?assertEqual(1024, get_memory_safe_cap(Opts1)), + ok. + +id_depth_1_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + {Block, TXID} = {1827942, <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>}, + ok = index_l1_offsets(Block, Opts), + {ok, Result} = + hb_ao:resolve( + << + "~copycat@1.0/arweave&" + "id=", TXID/binary, "&" + "mode=write&" + "depth=1" + >>, + Opts + ), + ?assertEqual(26, maps:get(items_count, Result)), + ?assertEqual(1, maps:get(bundle_count, Result)), + ?assertEqual(0, maps:get(skipped_count, Result)), + + assert_bundle_read( + <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, + [ + {<<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, <<"1">>}, + {<<"MgatoEjlO_YtdbxFi9Q7Hxbs0YQVcChddhSS7FsdeIg">>, <<"19">>}, + {<<"z-oKJfhMq5qoVFrljEfiBKgumaJmCWVxNJaavR5aPE8">>, <<"26">>} + ], + Opts + ), + % L3 item not read when doing L1 depth=1 + assert_item_not_read(<<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, Opts), + ok. 
+ +id_depth_2_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + {Block, TXID} = {1827942, <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>}, + ok = index_l1_offsets(Block, Opts), + {ok, Result} = + hb_ao:resolve( + << + "~copycat@1.0/arweave&" + "id=", TXID/binary, "&" + "mode=write&" + "depth=2" + >>, + Opts + ), + ?assertEqual(52, maps:get(items_count, Result)), + ?assertEqual(1, maps:get(bundle_count, Result)), + ?assertEqual(0, maps:get(skipped_count, Result)), + + assert_bundle_read( + <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, + [ + {<<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, <<"1">>}, + {<<"MgatoEjlO_YtdbxFi9Q7Hxbs0YQVcChddhSS7FsdeIg">>, <<"19">>}, + {<<"z-oKJfhMq5qoVFrljEfiBKgumaJmCWVxNJaavR5aPE8">>, <<"26">>} + ], + Opts + ), + % L2 bundle and L3 children should be read when doing L1 with depth=2 + assert_bundle_read( + <<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, + [ + {<<"iS5R3iSKaCdcXG2nlKWsbdT1_uhQe54nMsgYK-ivEcE">>, <<"1">>}, + {<<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, <<"2">>} + ], + Opts + ), + ok. + +id_exclude_tag_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + {Block, TXID} = {1827942, <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>}, + ok = index_l1_offsets(Block, Opts), + {ok, Result} = + hb_ao:resolve( + << + "~copycat@1.0/arweave&" + "id=", TXID/binary, "&" + "mode=write&" + "exclude-tag=App-Name:ArDrive%20Turbo&" + "depth=2" + >>, + Opts + ), + ?assertEqual(0, maps:get(items_count, Result)), + ?assertEqual(0, maps:get(bundle_count, Result)), + ?assertEqual(1, maps:get(skipped_count, Result)), + assert_item_not_read(<<"iS5R3iSKaCdcXG2nlKWsbdT1_uhQe54nMsgYK-ivEcE">>, Opts), + ok. 
+ +id_include_owner_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + {Block, TXID} = {1827942, <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>}, + ok = index_l1_offsets(Block, Opts), + {ok, Included} = + hb_ao:resolve( + << + "~copycat@1.0/arweave&" + "id=", TXID/binary, "&" + "mode=write&" + "include-owner=JNC6vBhjHY1EPwV3pEeNmrsgFMxH5d38_LHsZ7jful8" + >>, + Opts + ), + ?assertEqual(52, maps:get(items_count, Included)), + ?assertEqual(1, maps:get(bundle_count, Included)), + ?assertEqual(0, maps:get(skipped_count, Included)), + {ok, Skipped} = + hb_ao:resolve( + << + "~copycat@1.0/arweave&" + "id=", TXID/binary, "&" + "mode=write&" + "include-owner=FPjbN7EVwP3XwQJx8qnKqJDYa4TLJ0Y8gu4AaiUuW1c" + >>, + Opts + ), + ?assertEqual(0, maps:get(items_count, Skipped)), + ?assertEqual(0, maps:get(bundle_count, Skipped)), + ?assertEqual(1, maps:get(skipped_count, Skipped)). + +id_missing_offset_without_load_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + {_Block, TXID} = {1827942, <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>}, + {ok, Result} = + hb_ao:resolve( + << + "~copycat@1.0/arweave&" + "id=", TXID/binary, "&" + "mode=write" + >>, + Opts + ), + ?assertEqual(0, maps:get(items_count, Result)), + ?assertEqual(0, maps:get(bundle_count, Result)), + ?assertEqual(1, maps:get(skipped_count, Result)), + assert_item_not_read(<<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, Opts), + ok. 
+ +id_missing_offset_with_load_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + {_Block, TXID} = {1827942, <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>}, + {ok, Result} = + hb_ao:resolve( + << + "~copycat@1.0/arweave&" + "id=", TXID/binary, "&" + "mode=write&" + "query-l1-offset=true&" + "depth=2" + >>, + Opts + ), + ?assertEqual(52, maps:get(items_count, Result)), + ?assertEqual(1, maps:get(bundle_count, Result)), + ?assertEqual(0, maps:get(skipped_count, Result)), + + assert_bundle_read( + <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, + [ + {<<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, <<"1">>}, + {<<"MgatoEjlO_YtdbxFi9Q7Hxbs0YQVcChddhSS7FsdeIg">>, <<"19">>}, + {<<"z-oKJfhMq5qoVFrljEfiBKgumaJmCWVxNJaavR5aPE8">>, <<"26">>} + ], + Opts + ), + % L2 bundle and L3 children should be read when doing L1 with depth=2 + assert_bundle_read( + <<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, + [ + {<<"iS5R3iSKaCdcXG2nlKWsbdT1_uhQe54nMsgYK-ivEcE">>, <<"1">>}, + {<<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, <<"2">>} + ], + Opts + ), + ok. + +parse_owner_filter_unknown_alias_test() -> + ?assertEqual( + {error, {owner_alias_not_found, <<"nonexistent">>}}, + parse_owner_filter( + #{<<"include-owner-alias">> => <<"nonexistent">>}, + #{} + ) + ), + ok. + +index_l1_offsets(Block, Opts) -> + BlockBin = hb_util:bin(Block), + {ok, Block} = + hb_ao:resolve( + << + "~copycat@1.0/arweave&" + "from=", BlockBin/binary, "&" + "to=", BlockBin/binary, "&" + "mode=write&" + "depth=1" + >>, + Opts + ), + ok. 
+ setup_index_opts() -> TestStore = hb_test_utils:test_store(), StoreOpts = #{ <<"index-store">> => [TestStore] }, @@ -1686,7 +2369,9 @@ assert_bundle_read(BundleID, ExpectedItems, Opts) -> lists:foreach( fun({{_ItemID, Index}, Item}) -> QueriedItem = hb_ao:get(Index, Bundle, Opts), - ?assertEqual(hb_maps:without(?AO_CORE_KEYS, Item), hb_maps:without(?AO_CORE_KEYS, QueriedItem)) + ?assertEqual( + hb_maps:without(?AO_CORE_KEYS, Item), + hb_maps:without(?AO_CORE_KEYS, QueriedItem)) end, lists:zip(ExpectedItems, ReadItems) ), @@ -1694,14 +2379,21 @@ assert_bundle_read(BundleID, ExpectedItems, Opts) -> assert_item_read(ItemID, Opts) -> ?event(debug_test, {resolving, {explicit, ItemID}}), - Resolved = hb_ao:resolve(ItemID, Opts), - ?assertMatch({ok, _}, Resolved, ItemID), - {ok, Item} = Resolved, + ReadResult = hb_store_arweave:read( + hb_store_arweave:store_from_opts(Opts), ItemID), + ?assertMatch({ok, _}, ReadResult, ItemID), + {ok, Item} = ReadResult, ?event(debug_test, {item, Item}), ?assert(hb_message:verify(Item, all, Opts)), ?assertEqual(ItemID, hb_message:id(Item, signed)), Item. +assert_item_not_read(ItemID, Opts) -> + ReadResult = hb_store_arweave:read( + hb_store_arweave:store_from_opts(Opts), ItemID), + ?assertEqual(not_found, ReadResult), + ok. 
+ has_any_indexed_tx(Height, Opts) -> case fetch_block_header(Height, Opts) of {ok, Block} -> From 390d93b68beeffc7b2d9c5eb7c721ff70631cb8f Mon Sep 17 00:00:00 2001 From: James Piechota Date: Thu, 12 Mar 2026 12:00:00 +0100 Subject: [PATCH 06/68] impr: Add indexer logging and response improvements --- src/preloaded/query/dev_copycat_arweave.erl | 339 ++------------------ 1 file changed, 33 insertions(+), 306 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 9a8b489ef..48c5168f2 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -27,12 +27,37 @@ arweave(_Base, Request, Opts) -> case hb_maps:get(<<"mode">>, Request, <<"write">>, Opts) of <<"write">> -> case hb_maps:find(<<"id">>, Request, Opts) of - {ok, TXID} -> process_l1_request(TXID, Request, Opts); + {ok, TXID} -> + case process_l1_request(TXID, Request, Opts) of + {ok, Stats} when is_map(Stats) -> + ?event( + copycat_short, + {arweave_tx_indexed, + {id, {explicit, TXID}}, + {items_indexed, maps:get(items_count, Stats, 0)}, + {bundle_txs, maps:get(bundle_count, Stats, 0)}, + {skipped_txs, maps:get(skipped_count, Stats, 0)} + } + ), + {ok, Stats#{ + <<"body">> => maps:get(items_count, Stats, 0) + }}; + _ -> + {ok, #{ + items_count => 0, + bundle_count => 0, + skipped_count => 0, + <<"body">> => 0 + }} + end; error -> TargetDepth = request_depth(Request, ?DEFAULT_BLOCK_DEPTH, Opts), case parse_range(Request, Opts) of {error, unavailable} -> {error, unavailable}; {ok, {From, To}} -> + ?event(copycat_short, + {indexing_blocks, {from, From}, {to, To}, {depth, TargetDepth}} + ), fetch_blocks(From, To, TargetDepth, Opts) end end; @@ -319,9 +344,6 @@ normalize_height(Height, Opts) -> {ok, RequestedHeight} end. -get_block_depth(Opts) -> - maps:get(copycat_block_depth, Opts, ?DEPTH_L1_OFFSETS). 
- latest_height(Opts) -> case hb_ao:resolve( <>, @@ -635,304 +657,6 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, end end. -%% @doc Process transactions: spawn workers and manage the worker pool. -%% This function processes transactions in parallel using parallel_map. -%% When arweave_index_workers <= 1, processes sequentially (one worker at a time). -%% When arweave_index_workers > 1, processes in parallel with the specified concurrency limit. -%% Returns a map with keys: items_count, bundle_count, skipped_count. -process_txs(ValidTXs, BlockStartOffset, TargetDepth, Opts) -> - Results = parallel_map( - ValidTXs, - fun(TXWithData) -> process_tx( - TXWithData, BlockStartOffset, TargetDepth, Opts) end, - Opts - ), - lists:foldl( - fun(Result, Acc) -> - #{ - items_count => maps:get(items_count, Result, 0) + maps:get(items_count, Acc, 0), - bundle_count => maps:get(bundle_count, Result, 0) + maps:get(bundle_count, Acc, 0), - skipped_count => maps:get(skipped_count, Result, 0) + maps:get(skipped_count, Acc, 0) - } - end, - #{items_count => 0, bundle_count => 0, skipped_count => 0}, - Results - ). - -%% @doc Process a single indexed L1 TX candidate after lightweight filter checks. 
-process_l1_candidate(TXID, Filters, Depth, ReadL1Offset, Opts) -> - Skipped = #{items_count => 0, bundle_count => 0, skipped_count => 1}, - NormalizedTXID = hb_util:native_id(TXID), - EncodedTXID = hb_util:encode(NormalizedTXID), - IndexStore = hb_store_arweave:store_from_opts(Opts), - case ensure_l1_tx_offset( - NormalizedTXID, - EncodedTXID, - IndexStore, - ReadL1Offset, - Opts - ) of - {ok, - #{ - <<"codec-device">> := <<"tx@1.0">>, - <<"start-offset">> := StartOffset, - <<"length">> := Length - }} -> - case resolve_tx_header(EncodedTXID, Opts) of - {ok, TX} -> - case l1_filter_reason(TX, Filters) of - pass -> - case is_bundle_tx(TX, Opts) of - false -> - ?event( - copycat_short, - {arweave_tx_skipped, - {tx_id, {explicit, EncodedTXID}}, - {reason, not_bundle} - } - ), - Skipped; - true -> - case Length =< get_memory_safe_cap(Opts) of - false -> - ?event( - copycat_short, - {arweave_bundle_skipped, - {tx_id, {explicit, EncodedTXID}}, - {reason, memory_safe_cap_exceeded} - } - ), - #{ - items_count => 0, - bundle_count => 1, - skipped_count => 1 - }; - true -> - case hb_store_arweave:read_chunks( - StartOffset, - Length, - Opts - ) of - {ok, BundleData} -> - {TotalTime, IndexRes} = timer:tc( - fun() -> - index_bundle_bytes( - BundleData, - StartOffset, - Depth, - IndexStore, - Opts - ) - end - ), - case IndexRes of - {ok, ItemsCount} -> - record_event_metrics( - <<"item_indexed">>, - ItemsCount, - TotalTime - ), - #{ - items_count => ItemsCount, - bundle_count => 1, - skipped_count => 0 - }; - {error, Reason} -> - ?event( - copycat_short, - {arweave_bundle_skipped, - {tx_id, {explicit, EncodedTXID}}, - {reason, Reason} - } - ), - #{ - items_count => 0, - bundle_count => 1, - skipped_count => 1 - } - end; - {error, Reason} -> - ?event( - copycat_short, - {arweave_bundle_skipped, - {tx_id, {explicit, EncodedTXID}}, - {reason, Reason} - } - ), - #{ - items_count => 0, - bundle_count => 1, - skipped_count => 1 - }; - not_found -> - ?event( - copycat_short, - 
{arweave_bundle_skipped, - {tx_id, {explicit, EncodedTXID}}, - {reason, not_found} - } - ), - #{ - items_count => 0, - bundle_count => 1, - skipped_count => 1 - } - end - end - end; - FilterReason -> - ?event( - copycat_short, - {arweave_tx_skipped, - {tx_id, {explicit, EncodedTXID}}, - {reason, FilterReason} - } - ), - Skipped - end; - error -> - Skipped - end; - {ok, _OtherOffset} -> - ?event( - copycat_short, - {arweave_tx_skipped, - {tx_id, {explicit, EncodedTXID}}, - {reason, not_tx} - } - ), - Skipped; - {error, Reason} -> - ?event( - copycat_short, - {arweave_tx_skipped, - {tx_id, {explicit, EncodedTXID}}, - {reason, Reason} - } - ), - Skipped - end. -%% @doc Ensure the root L1 TX offset exists locally before `id=...` indexing. -%% if the offset is missing and `load_l1_offset` is enabled, fetches the TX -%% offset metadata from Arweave, writes it to the local offset store, and -%% retries the local lookup. -ensure_l1_tx_offset(_TXID, _EncodedTXID, IndexStore, _LoadL1Offset, _Opts) - when is_map(IndexStore) =:= false -> - {error, missing_offset}; -ensure_l1_tx_offset(TXID, EncodedTXID, IndexStore, ReadL1Offset, Opts) -> - case hb_store_arweave:read_offset(IndexStore, TXID) of - {ok, _} = OffsetRes -> - OffsetRes; - not_found when ReadL1Offset -> - ?event( - copycat_short, - {arweave_tx_offset_loading, - {tx_id, {explicit, EncodedTXID}}, - {source, network} - } - ), - case load_l1_tx_offset(EncodedTXID, IndexStore, Opts) of - ok -> - case hb_store_arweave:read_offset(IndexStore, TXID) of - {ok, _} = OffsetRes -> - OffsetRes; - not_found -> - {error, missing_offset} - end; - {error, Reason} -> - {error, Reason} - end; - not_found -> - {error, missing_offset} - end. 
- -load_l1_tx_offset(TXID, IndexStore, Opts) -> - case hb_http:request( - #{ - <<"path">> => <<"/arweave/tx/", TXID/binary, "/offset">>, - <<"method">> => <<"GET">> - }, - Opts - ) of - {ok, #{ <<"body">> := OffsetBody }} -> - OffsetMsg = hb_json:decode(OffsetBody), - EndOffset = hb_util:int(maps:get(<<"offset">>, OffsetMsg)), - Size = hb_util:int(maps:get(<<"size">>, OffsetMsg)), - StartOffset = EndOffset - Size, - ok = - hb_store_arweave:write_offset( - IndexStore, - TXID, - <<"tx@1.0">>, - StartOffset, - Size - ), - ok; - {error, Reason} -> - {error, Reason}; - not_found -> - {error, not_found} - end. - -index_bundle_bytes(_BundleData, _BundleStartOffset, Depth, _Store, _Opts) - when Depth =< 0 -> - {ok, 0}; -index_bundle_bytes(BundleData, BundleStartOffset, Depth, Store, Opts) -> - case ar_bundles:decode_bundle_header(BundleData) of - invalid_bundle_header -> - {error, invalid_bundle_header}; - {ItemsBin, BundleIndex} -> - HeaderSize = byte_size(BundleData) - byte_size(ItemsBin), - index_bundle_items( - BundleIndex, - ItemsBin, - BundleStartOffset + HeaderSize, - Depth, - Store, - Opts, - 0 - ) - end. - -%% @doc Lightweight bundle indexing. This function only loads the bundle header -%% and writes the index based solely on the header. For a more rigorous and -%% deeper indexing, see the index_full_bundle_xxx functions. 
-index_bundle_header(TXID, TXEndOffset, TXDataSize, TXStartOffset, IndexStore, Opts) -> - BundleRes = download_bundle_header(TXEndOffset, TXDataSize, Opts), - case BundleRes of - {ok, {BundleIndex, HeaderSize}} -> - % Batch event tracking: measure total time and count for - % all write_offset calls - {TotalTime, {_, ItemsCount}} = timer:tc(fun() -> - lists:foldl( - fun({ItemID, Size}, {ItemStartOffset, ItemsCountAcc}) -> - hb_store_arweave:write_offset( - IndexStore, - hb_util:encode(ItemID), - <<"ans104@1.0">>, - ItemStartOffset, - Size - ), - {ItemStartOffset + Size, ItemsCountAcc + 1} - end, - {TXStartOffset + HeaderSize, 0}, - BundleIndex - ) - end), - % Single event increment for the batch - record_event_metrics(<<"item_indexed">>, ItemsCount, TotalTime), - #{items_count => ItemsCount, bundle_count => 1, skipped_count => 0}; - {error, Reason} -> - ?event( - copycat_short, - {arweave_bundle_skipped, - {tx_id, {explicit, TXID}}, - {reason, Reason} - } - ), - #{items_count => 0, bundle_count => 1, skipped_count => 1} - end. - download_bundle_header(EndOffset, Size, Opts) -> observe_event(<<"bundle_header">>, fun() -> dev_arweave:bundle_header(EndOffset - Size, Opts) @@ -990,6 +714,12 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> NormalizedTXID = hb_util:native_id(TXID), EncodedTXID = hb_util:encode(NormalizedTXID), IndexStore = hb_store_arweave:store_from_opts(Opts), + ?event(copycat_short, + {indexing_l1_tx, {tx_id, {explicit, EncodedTXID}}, + {depth, Depth}, + {query_l1_offset, QueryL1Offset}, + {filters, Filters} + }), maybe {ok, #{ @@ -1452,7 +1182,7 @@ index_ids_test_parallel() -> ), % L3 item not read when doing L1 depth=1 assert_item_not_read(<<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, Opts), - ok. + ok. 
block_depth_3_test() -> %% Test block: https://viewblock.io/arweave/block/1827942 @@ -2135,7 +1865,6 @@ id_depth_1_test() -> ?assertEqual(26, maps:get(items_count, Result)), ?assertEqual(1, maps:get(bundle_count, Result)), ?assertEqual(0, maps:get(skipped_count, Result)), - assert_bundle_read( <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, [ @@ -2166,7 +1895,6 @@ id_depth_2_test() -> ?assertEqual(52, maps:get(items_count, Result)), ?assertEqual(1, maps:get(bundle_count, Result)), ?assertEqual(0, maps:get(skipped_count, Result)), - assert_bundle_read( <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, [ @@ -2274,7 +2002,6 @@ id_missing_offset_with_load_test() -> ?assertEqual(52, maps:get(items_count, Result)), ?assertEqual(1, maps:get(bundle_count, Result)), ?assertEqual(0, maps:get(skipped_count, Result)), - assert_bundle_read( <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>, [ From 0c5d03b4fd3633722d935a124e4e0df40ee90353 Mon Sep 17 00:00:00 2001 From: James Piechota Date: Fri, 13 Mar 2026 12:00:00 +0100 Subject: [PATCH 07/68] fix: Operational fixes for copycat indexer --- src/core/monitor/hb_event.erl | 24 +--- src/preloaded/query/dev_copycat_arweave.erl | 133 ++++++++++++-------- test/arbundles.js/upload-items.js | 10 +- 3 files changed, 93 insertions(+), 74 deletions(-) diff --git a/src/core/monitor/hb_event.erl b/src/core/monitor/hb_event.erl index 7f157440a..ae991d980 100644 --- a/src/core/monitor/hb_event.erl +++ b/src/core/monitor/hb_event.erl @@ -405,28 +405,10 @@ check_overload(Last, N) -> case erlang:process_info(self(), message_queue_len) of {message_queue_len, Len} when Len > ?OVERLOAD_QUEUE_LENGTH -> {memory, MemorySize} = erlang:process_info(self(), memory), - case rand:uniform(max(1000, Len - ?OVERLOAD_QUEUE_LENGTH)) of - 1 -> - ?debug_print( - {warning, - prometheus_event_queue_overloading, - {queue, Len}, - {last_event, Last}, - {memory_bytes, MemorySize} - } - ); - _ -> ignored - end, + % If the size of this process is too large, exit such 
that + % we can be restarted by the next caller. case MemorySize of MemorySize when MemorySize > ?MAX_MEMORY -> - ?debug_print( - {error, - prometheus_event_queue_terminating_on_memory_overload, - {queue, Len}, - {memory_bytes, MemorySize}, - {last_event, Last} - } - ), exit(memory_overload); _ -> no_action end; @@ -683,4 +665,4 @@ wait_drain_loop(Pid, Deadline) -> undefined -> error(event_server_dead) end. --endif. +-endif. \ No newline at end of file diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 48c5168f2..df2de0250 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -197,25 +197,30 @@ process_l1_request(TXID, Request, Opts) -> hb_util:bool( hb_maps:get(<<"query-l1-offset">>, Request, false, Opts) ), - maybe - {ok, OwnerFilters} ?= parse_owner_filter(Request, Opts), - {ok, IncludeTag} ?= parse_tag_filter(<<"include-tag">>, Request, Opts), - {ok, ExcludeTag} ?= parse_tag_filter(<<"exclude-tag">>, Request, Opts), - {ok, - maybe_process_l1_tx( - TXID, - OwnerFilters#{ - include_tag => IncludeTag, - exclude_tag => ExcludeTag - }, - Depth, - QueryL1Offset, - Opts - )} - else - {error, _} = Error -> - Error - end. + observe_copycat_l1_stage( + <<"l1_request_total">>, + fun() -> + maybe + {ok, OwnerFilters} ?= parse_owner_filter(Request, Opts), + {ok, IncludeTag} ?= parse_tag_filter(<<"include-tag">>, Request, Opts), + {ok, ExcludeTag} ?= parse_tag_filter(<<"exclude-tag">>, Request, Opts), + {ok, + maybe_process_l1_tx( + TXID, + OwnerFilters#{ + include_tag => IncludeTag, + exclude_tag => ExcludeTag + }, + Depth, + QueryL1Offset, + Opts + )} + else + {error, _} = Error -> + Error + end + end + ). %% @doc Parse the requested recursion depth and clamp it to the configured %% safe cap. Depth is relative so depth 1 is always one level below the %% root specified in the request (either a block or an L1 TX ID). 
@@ -717,8 +722,7 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> ?event(copycat_short, {indexing_l1_tx, {tx_id, {explicit, EncodedTXID}}, {depth, Depth}, - {query_l1_offset, QueryL1Offset}, - {filters, Filters} + {query_l1_offset, QueryL1Offset} }), maybe {ok, @@ -727,12 +731,17 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> <<"start-offset">> := StartOffset, <<"length">> := Length }} ?= - ensure_l1_tx_offset( - NormalizedTXID, - EncodedTXID, - IndexStore, - QueryL1Offset, - Opts + observe_copycat_l1_stage( + <<"l1_offset_lookup">>, + fun() -> + ensure_l1_tx_offset( + NormalizedTXID, + EncodedTXID, + IndexStore, + QueryL1Offset, + Opts + ) + end ), {ok, TX} ?= resolve_tx_header(EncodedTXID, Opts), pass ?= l1_filter_reason(TX, Filters), @@ -806,16 +815,24 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> %% be merged. process_l1_tx( StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) -> - case hb_store_arweave:read_chunks(StartOffset, Length, Opts) of + case observe_copycat_l1_stage( + <<"l1_read_chunks">>, + fun() -> hb_store_arweave:read_chunks(StartOffset, Length, Opts) end + ) of {ok, BundleData} -> {TotalTime, IndexRes} = timer:tc( fun() -> - index_full_bundle_bytes( - BundleData, - StartOffset, - Depth, - IndexStore, - Opts + observe_copycat_l1_stage( + <<"l1_full_bundle_index">>, + fun() -> + index_full_bundle_bytes( + BundleData, + StartOffset, + Depth, + IndexStore, + Opts + ) + end ) end ), @@ -909,26 +926,35 @@ ensure_l1_tx_offset(TXID, EncodedTXID, IndexStore, QueryL1Offset, Opts) -> query_l1_tx_offset(TXID, IndexStore, Opts) -> % TODO: move this into dev_arweave - I think? Unless it's possible to % query this already via one of the existing ~arweave@2.9 paths? 
- case hb_http:request( - #{ - <<"path">> => <<"/arweave/tx/", TXID/binary, "/offset">>, - <<"method">> => <<"GET">> - }, - Opts + case observe_copycat_l1_stage( + <<"l1_offset_query_http">>, + fun() -> + hb_http:request( + #{ + <<"path">> => <<"/arweave/tx/", TXID/binary, "/offset">>, + <<"method">> => <<"GET">> + }, + Opts + ) + end ) of {ok, #{ <<"body">> := OffsetBody }} -> OffsetMsg = hb_json:decode(OffsetBody), EndOffset = hb_util:int(maps:get(<<"offset">>, OffsetMsg)), Size = hb_util:int(maps:get(<<"size">>, OffsetMsg)), StartOffset = EndOffset - Size, - ok = - hb_store_arweave:write_offset( - IndexStore, - TXID, - <<"tx@1.0">>, - StartOffset, - Size - ), + ok = observe_copycat_l1_stage( + <<"l1_offset_query_store_write">>, + fun() -> + hb_store_arweave:write_offset( + IndexStore, + TXID, + <<"tx@1.0">>, + StartOffset, + Size + ) + end + ), ok; {error, Reason} -> {error, Reason}; @@ -1101,6 +1127,10 @@ record_event_metrics(MetricName, Count, Duration) -> hb_event:record(<<"arweave_block_count">>, MetricName, #{}, Count), hb_event:record(<<"arweave_block_duration">>, MetricName, #{}, Duration). +record_copycat_l1_metrics(MetricName, Count, Duration) -> + hb_event:record(copycat_l1_count, MetricName, #{}, Count), + hb_event:record(copycat_l1_duration, MetricName, #{}, Duration). + %% @doc Track an operation's execution time and count using hb_event:record. %% Always tracks both count and duration, regardless of success/failure. observe_event(MetricName, Fun) -> @@ -1108,6 +1138,11 @@ observe_event(MetricName, Fun) -> record_event_metrics(MetricName, 1, Time), Result. +observe_copycat_l1_stage(MetricName, Fun) -> + {Time, Result} = timer:tc(Fun), + record_copycat_l1_metrics(MetricName, 1, Time), + Result. + %%% Tests index_ids_test_parallel() -> @@ -2153,4 +2188,4 @@ assert_indexed_range(From, To, _Opts) when From < To -> ok; assert_indexed_range(From, To, Opts) -> ?assert(has_any_indexed_tx(From, Opts)), - assert_indexed_range(From - 1, To, Opts). 
+ assert_indexed_range(From - 1, To, Opts). \ No newline at end of file diff --git a/test/arbundles.js/upload-items.js b/test/arbundles.js/upload-items.js index 67579202f..43dc216c2 100644 --- a/test/arbundles.js/upload-items.js +++ b/test/arbundles.js/upload-items.js @@ -8,7 +8,7 @@ const ENDPOINT_PATH = process.env.ENDPOINT_PATH || "/~bundler@1.0/item?codec-dev const DEFAULT_WALLET = "../../hyperbeam-key.json"; const CONCURRENT_UPLOADS = 100; // Number of parallel uploads -async function performanceTest(walletPath, itemCount, bytesPerItem = 0) { +async function performanceTest(walletPath, itemCount, bytesPerItem = 0, bundlerUrl = BUNDLER_URL) { const wallet = require(path.resolve(walletPath)); const signer = new ArweaveSigner(wallet); const endpoint = `${BUNDLER_URL}${ENDPOINT_PATH}`; @@ -139,14 +139,16 @@ if (require.main === module) { const walletPath = firstIsNumber ? DEFAULT_WALLET : (process.argv[2] || DEFAULT_WALLET); const itemCount = parseInt(firstIsNumber ? process.argv[2] : process.argv[3], 10); const bytesPerItem = parseInt(firstIsNumber ? process.argv[3] : process.argv[4], 10) || 0; + const bundlerUrl = (firstIsNumber ? 
process.argv[4] : process.argv[5]) || BUNDLER_URL; if (!itemCount || itemCount < 1 || isNaN(itemCount)) { - console.error("Usage: node upload-items.js [wallet_path] [bytes_per_item]"); + console.error("Usage: node upload-items.js [wallet_path] [bytes_per_item] [bundler_url]"); console.error(""); console.error("Arguments:"); console.error(" wallet_path - Path to Arweave wallet JSON (default: ../../hyperbeam-key.json)"); console.error(" number_of_items - Number of data items to create and upload"); console.error(" bytes_per_item - Minimum size of each item in bytes (optional)"); + console.error(" bundler_url - Bundler base URL (default: " + BUNDLER_URL + ")"); console.error(""); console.error("Environment variables:"); console.error(" BUNDLER_URL - Gateway base URL (default: http://localhost:8734)"); @@ -165,7 +167,7 @@ if (require.main === module) { process.exit(1); } - performanceTest(walletPath, itemCount, bytesPerItem) + performanceTest(walletPath, itemCount, bytesPerItem, bundlerUrl) .then(() => { process.exit(0); }) @@ -175,4 +177,4 @@ if (require.main === module) { }); } -module.exports = { performanceTest }; +module.exports = { performanceTest }; \ No newline at end of file From 13e750938ac4fb8fd1ef4aa28a43abb79dd775bb Mon Sep 17 00:00:00 2001 From: Niko Storni Date: Fri, 3 Apr 2026 12:00:00 +0200 Subject: [PATCH 08/68] feat: Add per-block item index with depth tracking --- src/core/store/hb_store_arweave_offset.erl | 6 +- src/preloaded/query/dev_copycat_arweave.erl | 930 ++++++++++++++++++-- 2 files changed, 839 insertions(+), 97 deletions(-) diff --git a/src/core/store/hb_store_arweave_offset.erl b/src/core/store/hb_store_arweave_offset.erl index 7b4d5a914..1913645d5 100644 --- a/src/core/store/hb_store_arweave_offset.erl +++ b/src/core/store/hb_store_arweave_offset.erl @@ -26,7 +26,7 @@ %%% to contract to only the number of bytes actually necessary to represent it. %%% -module(hb_store_arweave_offset). --export([encode/3, decode/1, path/1]). 
+-export([encode/3, decode/1, path/1, mismatch_path/1]). -include("include/hb.hrl"). %% @doc Determine if a value is within a given unsigned bit range. @@ -42,6 +42,10 @@ path(ID) when ?IS_ID(ID) -> hb_util:native_id(ID); path(ID) -> throw({cannot_encode_path, ID}). +mismatch_path(ID) when ?IS_ID(ID) -> + <<"mismatch/", (hb_util:native_id(ID))/binary>>; +mismatch_path(ID) -> throw({cannot_encode_mismatch_path, ID}). + %% @doc Encode the offset of the data if it is valid. Throws `cannot_encode_offset' %% if invalid. encode(Type, StartOffset, Length) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index df2de0250..62c8bea10 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -1,8 +1,9 @@ %%% @doc A `~copycat@1.0' engine that fetches block data from an Arweave node for %%% replication. This engine works in _reverse_ chronological order by default. %%% If `to' is omitted, it keeps moving downward from `from' until it reaches a -%%% block where at least one TX is already indexed, then stops. If `to' is -%%% provided, every block in the range is processed. +%%% block that is already indexed at the requested depth (checked via block +%%% markers first, then legacy per-TX fallback for pre-marker indexes). If `to' +%%% is provided, every block in the range is processed. -module(dev_copycat_arweave). -device_libraries([lib_arweave_common]). -export([arweave/3]). @@ -11,6 +12,9 @@ -include_lib("eunit/include/eunit.hrl"). -define(ARWEAVE_DEVICE, <<"~arweave@2.9">>). +-define(BLOCK_MARKER_PREFIX, <<"block/">>). +-define(CUTOVER_KEY, <<"block/marker-cutover-height">>). +-define(DEPTH_SENTINEL, 99999). 
% By default we'll index blocks to depth 2 which is: % - depth 1: L1 TXs % - depth 2: L2 bundles and dataitems @@ -66,8 +70,14 @@ arweave(_Base, Request, Opts) -> {error, unavailable} -> {error, unavailable}; {ok, {From, To}} -> list_index(From, To, Opts) end; + <<"list-items">> -> + case parse_range(Request, Opts) of + {error, unavailable} -> {error, unavailable}; + {ok, {From, To}} -> list_items_index(From, To, Opts) + end; Mode -> - {error, <<"Unsupported mode `", (hb_util:bin(Mode))/binary, "`. Supported modes are: write, list">>} + {error, <<"Unsupported mode `", (hb_util:bin(Mode))/binary, + "`. Supported modes are: write, list, list-items">>} end. %% @doc Set safe memory resource allocation cap for the in-memory %% bundle processing. in bytes. @@ -84,6 +94,156 @@ get_depth_recursion_cap(Opts) -> %% from hb_opts. get_memory_safe_cap(Opts) -> hb_opts:get(copycat_memory_cap, undefined, Opts). + +%% @doc Return the store path for a block completion marker. +block_indexed_path(Height) -> + <>. + +%% @doc Return the store path for a per-block item index at a given depth. +block_items_path(Height, Depth) -> + <<"block/", (hb_util:bin(Height))/binary, + "/items/", (hb_util:bin(Depth))/binary>>. + +%% @doc Encode a list of 32-byte raw IDs into a single binary. +encode_item_ids(IDs) -> + << <> || ID <- IDs >>. + +%% @doc Decode a binary of concatenated 32-byte IDs into a list. +%% Rejects binaries whose size is not a multiple of 32. +decode_item_ids(<<>>) -> []; +decode_item_ids(Bin) when byte_size(Bin) rem 32 =/= 0 -> + {error, invalid_item_ids_binary}; +decode_item_ids(Bin) -> + decode_item_ids_acc(Bin, []). + +decode_item_ids_acc(<<>>, Acc) -> lists:reverse(Acc); +decode_item_ids_acc(<>, Acc) -> + decode_item_ids_acc(Rest, [ID | Acc]). + +%% @doc Shift all depth keys in an item ID map by Offset. +shift_item_ids(Map, Offset) -> + maps:fold( + fun(Depth, IDs, Acc) -> Acc#{Depth + Offset => IDs} end, + #{}, + Map + ). 
+ +%% @doc Merge a list of depth→ID-list maps in one pass per depth key. +merge_all_item_ids(Maps) -> + AllKeys = lists:usort(lists:flatmap(fun maps:keys/1, Maps)), + maps:from_list([ + {K, lists:append([maps:get(K, M, []) || M <- Maps])} + || K <- AllKeys]). + +%% @doc Merge two depth→ID-list maps by concatenating lists at each depth. +merge_item_ids(A, B) -> + maps:fold( + fun(Depth, IDs, Acc) -> + Existing = maps:get(Depth, Acc, []), + Acc#{Depth => Existing ++ IDs} + end, + A, + B + ). + +%% @doc Read the stored marker depth for a block, or undefined if none. +read_block_marker_depth(Height, Opts) -> + case hb_store_arweave:store_from_opts(Opts) of + no_store -> undefined; + #{ <<"index-store">> := Store } -> + case hb_store:read(Store, block_indexed_path(Height)) of + {ok, Bin} -> + try binary_to_integer(Bin) + catch _:_ -> undefined + end; + not_found -> undefined + end + end. + +%% @doc Check if a block has been indexed at the given depth or deeper. +is_block_indexed(undefined, _TargetDepth, _Opts) -> + false; +is_block_indexed(Height, TargetDepth, Opts) -> + case read_block_marker_depth(Height, Opts) of + undefined -> false; + StoredDepth -> StoredDepth >= TargetDepth + end. + +%% @doc Write per-depth item ID lists for a block. +%% Writes an entry for every depth from 1 through AchievedDepth (empty if +%% no items at that level), plus any partial depths beyond AchievedDepth +%% that were collected during indexing. 
+write_block_item_ids(Height, AchievedDepth, ItemIDs, Opts) -> + case hb_store_arweave:store_from_opts(Opts) of + no_store -> ok; + #{ <<"index-store">> := Store } -> + MaxStoredDepth = case maps:keys(ItemIDs) of + [] -> AchievedDepth; + Keys -> max(AchievedDepth, lists:max(Keys)) + end, + Results = lists:map( + fun(D) -> + IDs = maps:get(D, ItemIDs, []), + Bin = encode_item_ids(IDs), + hb_store:write( + Store, + block_items_path(Height, D), + Bin + ) + end, + lists:seq(1, MaxStoredDepth) + ), + case lists:all(fun(R) -> R =:= ok end, Results) of + true -> ok; + false -> + ?event(copycat_short, + {block_item_ids_write_failed, + {height, Height}}), + {error, item_ids_write_failed} + end + end. + +%% @doc Write a block completion marker with the achieved depth. +mark_block_indexed(Height, Depth, Opts) -> + case hb_store_arweave:store_from_opts(Opts) of + no_store -> ok; + #{ <<"index-store">> := Store } -> + hb_store:write( + Store, + block_indexed_path(Height), + integer_to_binary(Depth) + ) + end. + +%% @doc Read the persisted cutover height from the index store. +read_cutover_height(Opts) -> + case hb_store_arweave:store_from_opts(Opts) of + no_store -> undefined; + #{ <<"index-store">> := Store } -> + case hb_store:read(Store, ?CUTOVER_KEY) of + {ok, Bin} -> hb_util:int(Bin); + not_found -> undefined + end + end. + +%% @doc Write the cutover height if not already set. +ensure_cutover_height(Height, Opts) -> + case read_cutover_height(Opts) of + undefined -> + case hb_store_arweave:store_from_opts(Opts) of + no_store -> ok; + #{ <<"index-store">> := Store } -> + hb_store:write( + Store, ?CUTOVER_KEY, hb_util:bin(Height)), + ?event(copycat_short, + {marker_cutover_initialized, + {height, Height} + } + ) + end; + _ -> ok + end. + %% @doc Normalize an owner address into the native ID form used for comparisons. normalize_owner_id(Addr) -> hb_util:native_id(hb_util:bin(Addr)). 
@@ -404,12 +564,19 @@ list_index_blocks(Current, To, Opts, Acc) -> list_index_blocks(Current - 1, To, Opts, Acc); _ -> BlockKey = hb_util:bin(Current), - NewAcc = Acc#{ - BlockKey => #{ - <<"indexed">> => IndexedTXs, - <<"not-indexed">> => NotIndexedTXs - } - }, + BlockInfo = assemble_block_info( + Current, Block, Opts), + WithItems = case maps:get( + <<"depth">>, BlockInfo, undefined) + of + undefined -> BlockInfo; + _ -> + BlockInfo#{ + <<"items">> => + read_block_item_counts( + Current, Opts)} + end, + NewAcc = Acc#{BlockKey => WithItems}, list_index_blocks(Current - 1, To, Opts, NewAcc) end end; @@ -417,6 +584,79 @@ list_index_blocks(Current, To, Opts, Acc) -> list_index_blocks(Current - 1, To, Opts, Acc) end. +%% @doc Build base block info with indexed/not-indexed TXs and optional depth. +assemble_block_info(Height, Block, Opts) -> + TXIDs = hb_maps:get(<<"txs">>, Block, [], Opts), + {IndexedTXs, NotIndexedTXs} = classify_txs(TXIDs, Opts), + Base = #{ + <<"indexed">> => IndexedTXs, + <<"not-indexed">> => NotIndexedTXs + }, + case read_block_depth(Height, Opts) of + undefined -> Base; + Depth -> Base#{<<"depth">> => Depth} + end. + +%% @doc Read the achieved depth from a block marker. +read_block_depth(Height, Opts) -> + read_block_marker_depth(Height, Opts). + +%% @doc Probe item entries upward from depth 1, applying TransformFun to each. +probe_block_items(Height, Opts, TransformFun) -> + case hb_store_arweave:store_from_opts(Opts) of + no_store -> #{}; + #{ <<"index-store">> := Store } -> + probe_block_items(Height, Store, 1, #{}, TransformFun) + end. + +probe_block_items(Height, Store, Depth, Acc, TransformFun) -> + case hb_store:read(Store, block_items_path(Height, Depth)) of + {ok, Bin} -> + Key = hb_util:bin(Depth), + probe_block_items( + Height, Store, Depth + 1, + Acc#{Key => TransformFun(Bin)}, TransformFun); + not_found -> + Acc + end. + +count_ids(Bin) when byte_size(Bin) rem 32 =:= 0 -> + byte_size(Bin) div 32; +count_ids(_) -> <<"corrupt">>. 
+ +decode_and_encode_ids(Bin) -> + case decode_item_ids(Bin) of + {error, _} -> <<"corrupt">>; + List -> [hb_util:encode(ID) || ID <- List] + end. + +read_block_item_counts(Height, Opts) -> + probe_block_items(Height, Opts, fun count_ids/1). + +read_block_item_ids(Height, Opts) -> + probe_block_items(Height, Opts, fun decode_and_encode_ids/1). + +%% @doc mode=list-items: return full item ID lists for a single block. +list_items_index(From, To, _Opts) when From =/= To -> + {error, <<"mode=list-items requires from=to (single block only)">>}; +list_items_index(From, _To, Opts) -> + BlockKey = hb_util:bin(From), + BlockInfo = case fetch_block_header(From, Opts) of + {ok, Block} -> + Base = assemble_block_info(From, Block, Opts), + case maps:get(<<"depth">>, Base, undefined) of + undefined -> Base; + _ -> Base#{<<"items">> => read_block_item_ids(From, Opts)} + end; + {error, _} -> + #{<<"error">> => <<"block not found">>} + end, + JSON = hb_json:encode(#{BlockKey => BlockInfo}), + {ok, #{ + <<"content-type">> => <<"application/json">>, + <<"body">> => JSON + }}. + fetch_block_header(Height, Opts) -> ?event(debug_copycat, {fetching_block, Height}), observe_event(<<"block_header">>, fun() -> @@ -445,7 +685,8 @@ classify_txs(TXIDs, Opts) -> %% @doc Fetch blocks from an Arweave node while moving downward from `Current'. %% If `To' is provided, every block in [`To', `Current'] is processed. If `To' -%% is omitted, stop at the first block where any TX is already indexed. +%% is omitted, stop at the first block already indexed at the requested depth +%% (via block markers above cutover, or legacy per-TX check below cutover). 
fetch_blocks(Current, To, TargetDepth, _Opts) when is_integer(To), Current < To -> ?event(copycat_short, @@ -459,7 +700,7 @@ fetch_blocks(Current, undefined, _TargetDepth, _Opts) when Current < 0 -> {ok, 0}; fetch_blocks(Current, undefined, TargetDepth, Opts) -> BlockRes = fetch_block_header(Current, Opts), - case is_already_indexed(BlockRes, Opts) of + case is_already_indexed(BlockRes, TargetDepth, Opts) of true -> ?event(copycat_short, {arweave_block_indexing_completed, @@ -479,14 +720,37 @@ fetch_blocks(Current, To, TargetDepth, Opts) -> end), fetch_blocks(Current - 1, To, TargetDepth, Opts). -%% @doc Determine whether a fetched block is considered indexed. -%% A block is indexed when any TX from its `txs' list is in the index. -is_already_indexed({ok, Block}, Opts) -> - TXIDs = hb_maps:get(<<"txs">>, Block, [], Opts), - lists:any(fun(TXID) -> is_tx_indexed(TXID, Opts) end, TXIDs); -is_already_indexed({error, _}, _Opts) -> +%% @doc Determine whether a fetched block is considered indexed at the +%% requested depth. Checks block markers first. For blocks at or above +%% the cutover height, the marker is authoritative. For blocks below +%% the cutover, falls back to legacy per-TX check. +is_already_indexed({ok, Block}, TargetDepth, Opts) -> + Height = hb_maps:get(<<"height">>, Block, undefined, Opts), + case is_block_indexed(Height, TargetDepth, Opts) of + true -> + true; + false -> + case is_post_cutover(Height, Opts) of + true -> + false; + false -> + TXIDs = hb_maps:get(<<"txs">>, Block, [], Opts), + lists:any( + fun(TXID) -> is_tx_indexed(TXID, Opts) end, + TXIDs + ) + end + end; +is_already_indexed({error, _}, _TargetDepth, _Opts) -> false. +is_post_cutover(undefined, _Opts) -> false; +is_post_cutover(Height, Opts) -> + case read_cutover_height(Opts) of + undefined -> false; + Cutover -> Height >= Cutover + end. 
+ fetch_and_process_block(Current, To, TargetDepth, Opts) -> BlockRes = fetch_block_header(Current, Opts), process_block(BlockRes, Current, To, TargetDepth, Opts). @@ -513,17 +777,46 @@ process_block(BlockRes, Current, To, TargetDepth, Opts) -> TotalTXs = maps:get(total_txs, Results, 0), BundleTXs = maps:get(bundle_count, Results, 0), SkippedTXs = maps:get(skipped_count, Results, 0), - ?event( - copycat_short, - {arweave_block_indexed, - {height, Current}, - {items_indexed, ItemsIndexed}, - {total_txs, TotalTXs}, - {bundle_txs, BundleTXs}, - {skipped_txs, SkippedTXs}, - {target, To} - } - ) + AchievedDepth = maps:get( + achieved_depth, Results, + max(2, TargetDepth)), + ItemIDs = maps:get(item_ids, Results, #{}), + maybe + ok ?= write_block_item_ids( + Current, AchievedDepth, ItemIDs, Opts), + ok ?= mark_block_indexed( + Current, AchievedDepth, Opts), + ensure_cutover_height(Current, Opts), + ?event( + copycat_short, + {arweave_block_indexed, + {height, Current}, + {items_indexed, ItemsIndexed}, + {total_txs, TotalTXs}, + {bundle_txs, BundleTXs}, + {skipped_txs, SkippedTXs}, + {achieved_depth, AchievedDepth}, + {target, To} + } + ) + else + {error, item_ids_write_failed} -> + ?event( + copycat_short, + {arweave_block_metadata_failed, + {height, Current}, + {target, To} + } + ); + _ -> + ?event( + copycat_short, + {arweave_block_marker_failed, + {height, Current}, + {target, To} + } + ) + end end; {error, _} = Error -> ?event( @@ -561,15 +854,19 @@ maybe_index_block(Block, TargetDepth, Opts) -> }}; {ok, TXs} -> Height = hb_maps:get(<<"height">>, Block, 0, Opts), + L1IDs = [TX#tx.id || TX <- TXs], TXsWithData = ar_block:generate_size_tagged_list_from_txs(TXs, Height), - % Filter out padding entries before processing ValidTXs = lists:filter( fun({{padding, _}, _}) -> false; (_) -> true end, TXsWithData ), TXResults = process_block_txs( ValidTXs, BlockStartOffset, TargetDepth, Opts), - {block_cached, TXResults#{total_txs => TotalTXs}} + ExistingIDs = maps:get(item_ids, 
TXResults, #{}), + {block_cached, TXResults#{ + total_txs => TotalTXs, + item_ids => ExistingIDs#{1 => L1IDs} + }} end end. @@ -583,8 +880,9 @@ parallel_map(Items, Fun, Opts) -> %% @doc Process a single transaction and return its contribution to the counters. %% Returns a map with keys: items_count, bundle_count, skipped_count -process_block_tx({{padding, _PaddingRoot}, _EndOffset}, _BlockStartOffset, _TargetDepth, _Opts) -> - #{items_count => 0, bundle_count => 0, skipped_count => 0}; +process_block_tx({{padding, _PaddingRoot}, _EndOffset}, _BlockStartOffset, TargetDepth, _Opts) -> + #{items_count => 0, bundle_count => 0, skipped_count => 0, + achieved_depth => max(2, TargetDepth)}; process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, Opts) -> IndexStore = hb_store_arweave:store_from_opts(Opts), TXID = hb_util:encode(TX#tx.id), @@ -605,11 +903,17 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, ) end), case is_bundle_tx(TX, Opts) of - false -> #{items_count => 0, bundle_count => 0, skipped_count => 0}; + false -> + #{items_count => 0, bundle_count => 0, skipped_count => 0, + achieved_depth => max(2, TargetDepth)}; true when TargetDepth > 2 -> - % Indexing a block to depth 3 or greater means we need to load - % and recurse into each of the L1 TXs in the block. - maybe_process_l1_tx(TX#tx.id, #{}, TargetDepth - 1, false, Opts); + L1Result = process_l1_tx_direct( + TXStartOffset, TX#tx.data_size, + TargetDepth - 1, IndexStore, TXID, Opts), + L1Result#{ + achieved_depth => + max(2, maps:get(achieved_depth, L1Result, 0)) + }; true -> % Lightweight processing of block transactions to depth 2. 
We % can avoid loading the full L1 TX data into memory, and instead @@ -625,7 +929,6 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, ), case BundleRes of {ok, HeaderSize, BundleIndex} -> - % Batch event tracking: measure total time and count for all write_offset calls {TotalTime, {_, ItemsCount}} = timer:tc(fun() -> lists:foldl( fun({ItemID, Size}, {ItemStartOffset, ItemsCountAcc}) -> @@ -642,6 +945,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, BundleIndex ) end), + L2IDs = [ItemID || {ItemID, _Size} <- BundleIndex], ?event(debug_copycat, {bundle_items_indexed, {tx_id, {string, TXID}}, @@ -649,7 +953,9 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, }), % Single event record for the batch record_event_metrics(<<"item_indexed">>, ItemsCount, TotalTime), - #{items_count => ItemsCount, bundle_count => 1, skipped_count => 0}; + #{items_count => ItemsCount, bundle_count => 1, + skipped_count => 0, achieved_depth => 2, + item_ids => #{2 => L2IDs}}; {error, Reason} -> ?event( copycat_short, @@ -658,7 +964,8 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, {reason, Reason} } ), - #{items_count => 0, bundle_count => 1, skipped_count => 1} + #{items_count => 0, bundle_count => 1, + skipped_count => 1, achieved_depth => 0} end end. 
@@ -701,21 +1008,43 @@ process_block_txs(ValidTXs, BlockStartOffset, TargetDepth, Opts) -> TXWithData, BlockStartOffset, TargetDepth, Opts) end, Opts ), - lists:foldl( + Folded = lists:foldl( fun(Result, Acc) -> #{ - items_count => maps:get(items_count, Result, 0) + maps:get(items_count, Acc, 0), - bundle_count => maps:get(bundle_count, Result, 0) + maps:get(bundle_count, Acc, 0), - skipped_count => maps:get(skipped_count, Result, 0) + maps:get(skipped_count, Acc, 0) + items_count => + maps:get(items_count, Result, 0) + + maps:get(items_count, Acc, 0), + bundle_count => + maps:get(bundle_count, Result, 0) + + maps:get(bundle_count, Acc, 0), + skipped_count => + maps:get(skipped_count, Result, 0) + + maps:get(skipped_count, Acc, 0), + achieved_depth => + min( + maps:get(achieved_depth, Result, ?DEPTH_SENTINEL), + maps:get(achieved_depth, Acc, ?DEPTH_SENTINEL) + ) } end, - #{items_count => 0, bundle_count => 0, skipped_count => 0}, + #{items_count => 0, bundle_count => 0, skipped_count => 0, + achieved_depth => ?DEPTH_SENTINEL}, Results - ). + ), + MergedIDs = merge_all_item_ids( + [maps:get(item_ids, R, #{}) || R <- Results]), + Folded2 = Folded#{item_ids => MergedIDs}, + case maps:get(achieved_depth, Folded2) of + ?DEPTH_SENTINEL -> + Folded2#{achieved_depth => max(2, TargetDepth)}; + _ -> + Folded2 + end. %% @doc Process a single indexed L1 TX candidate after lightweight filter checks. 
maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> - Skipped = #{items_count => 0, bundle_count => 0, skipped_count => 1}, + Skipped = #{items_count => 0, bundle_count => 0, skipped_count => 1, + achieved_depth => 0}, NormalizedTXID = hb_util:native_id(TXID), EncodedTXID = hb_util:encode(NormalizedTXID), IndexStore = hb_store_arweave:store_from_opts(Opts), @@ -796,7 +1125,8 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> #{ items_count => 0, bundle_count => 1, - skipped_count => 1 + skipped_count => 1, + achieved_depth => 0 }; FilterReason -> ?event( @@ -809,10 +1139,26 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> Skipped end. +%% @doc Fast path for depth>2 block indexing. Skips offset lookup and +%% header re-fetch since the caller already has both. +process_l1_tx_direct(StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) -> + MemoryCap = get_memory_safe_cap(Opts), + case MemoryCap =/= undefined andalso Length > MemoryCap of + true -> + ?event(copycat_short, + {arweave_bundle_skipped, + {tx_id, {explicit, EncodedTXID}}, + {reason, memory_safe_cap_exceeded} + } + ), + #{items_count => 0, bundle_count => 1, + skipped_count => 1, achieved_depth => 0}; + false -> + process_l1_tx( + StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) + end. + %% @doc Load the L1 TX data into memory and index it. -%% -%% TODO: process_l1_tx and process_block_tx are very similar and can/should -%% be merged. 
process_l1_tx( StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) -> case observe_copycat_l1_stage( @@ -837,7 +1183,7 @@ process_l1_tx( end ), case IndexRes of - {ok, ItemsCount} -> + {ok, ItemsCount, AchievedDepth, BundleIDs} -> record_event_metrics( <<"item_indexed">>, ItemsCount, @@ -846,7 +1192,9 @@ process_l1_tx( #{ items_count => ItemsCount, bundle_count => 1, - skipped_count => 0 + skipped_count => 0, + achieved_depth => 1 + AchievedDepth, + item_ids => shift_item_ids(BundleIDs, 1) }; {error, Reason} -> ?event( @@ -859,7 +1207,8 @@ process_l1_tx( #{ items_count => 0, bundle_count => 1, - skipped_count => 1 + skipped_count => 1, + achieved_depth => 0 } end; {error, Reason} -> @@ -873,7 +1222,8 @@ process_l1_tx( #{ items_count => 0, bundle_count => 1, - skipped_count => 1 + skipped_count => 1, + achieved_depth => 0 }; not_found -> ?event( @@ -886,7 +1236,8 @@ process_l1_tx( #{ items_count => 0, bundle_count => 1, - skipped_count => 1 + skipped_count => 1, + achieved_depth => 0 } end. %% @doc Ensure the root L1 TX offset exists locally before `id=...` indexing. @@ -964,7 +1315,7 @@ query_l1_tx_offset(TXID, IndexStore, Opts) -> index_full_bundle_bytes(_BundleData, _BundleStartOffset, Depth, _Store, _Opts) when Depth =< 0 -> - {ok, 0}; + {ok, 0, 0, #{}}; index_full_bundle_bytes(BundleData, BundleStartOffset, Depth, Store, Opts) -> case ar_bundles:decode_bundle_header(BundleData) of invalid_bundle_header -> @@ -978,13 +1329,25 @@ index_full_bundle_bytes(BundleData, BundleStartOffset, Depth, Store, Opts) -> Depth, Store, Opts, - 0 + 0, + ?DEPTH_SENTINEL, + [], + #{} ) end. %% @doc Index bundle children from decoded bundle bytes and recurse descendants in-memory. -index_full_bundle_items([], _ItemsBin, _ItemStartOffset, _Depth, _Store, _Opts, Count) -> - {ok, Count}; +%% Returns {ok, Count, MinAchievedDepth, ItemIDs} or {error, Reason}. +%% ItemIDs is a map of relative-depth => list of raw 32-byte IDs. 
+index_full_bundle_items( + [], _ItemsBin, _ItemStartOffset, Depth, _Store, _Opts, + Count, MinDepth, ThisLevelIDs, DescIDs) -> + FinalDepth = case MinDepth of + ?DEPTH_SENTINEL -> Depth; + _ -> 1 + MinDepth + end, + AllIDs = DescIDs#{1 => lists:reverse(ThisLevelIDs)}, + {ok, Count, FinalDepth, AllIDs}; index_full_bundle_items( [{ItemID, Size} | Rest], ItemsBin, @@ -992,29 +1355,32 @@ index_full_bundle_items( Depth, Store, Opts, - Count + Count, + MinDepth, + ThisLevelIDs, + DescIDs ) when byte_size(ItemsBin) >= Size -> ItemBinary = binary:part(ItemsBin, 0, Size), + EncodedItemID = hb_util:encode(ItemID), + ParseResult = validate_and_flag_item_id( + ItemBinary, ItemID, EncodedItemID, Store), hb_store_arweave:write_offset( Store, - hb_util:encode(ItemID), + EncodedItemID, <<"ans104@1.0">>, ItemStartOffset, Size ), - DescendantCount = - case Depth > 1 of - true -> - index_full_bundle_descendants( - ItemBinary, - ItemStartOffset, - Depth - 1, - Store, - Opts - ); - false -> - 0 + {DescendantCount, ItemAchievedDepth, ChildIDs} = + case {Depth > 1, ParseResult} of + {true, {ok, HeaderSize, ParsedItem}} -> + index_full_bundle_descendants_parsed( + ParsedItem, HeaderSize, + ItemStartOffset, Depth - 1, Store, Opts); + _ -> + {0, Depth - 1, #{}} end, + ShiftedChildIDs = shift_item_ids(ChildIDs, 1), index_full_bundle_items( Rest, binary:part(ItemsBin, Size, byte_size(ItemsBin) - Size), @@ -1022,38 +1388,78 @@ index_full_bundle_items( Depth, Store, Opts, - Count + 1 + DescendantCount + Count + 1 + DescendantCount, + min(MinDepth, ItemAchievedDepth), + [ItemID | ThisLevelIDs], + merge_item_ids(DescIDs, ShiftedChildIDs) ); -index_full_bundle_items(_BundleIndex, _ItemsBin, _ItemStartOffset, _Depth, _Store, _Opts, _Count) -> +index_full_bundle_items( + _BundleIndex, _ItemsBin, _ItemStartOffset, _Depth, + _Store, _Opts, _Count, _MinDepth, _ThisLevelIDs, _DescIDs) -> {error, invalid_bundle_header}. -%% @doc Recurse into a nested bundle data item from in-memory bytes. 
-index_full_bundle_descendants(_ItemBinary, _ItemStartOffset, Depth, _Store, _Opts) +%% @doc Recurse into a nested data item using an already-parsed header. +%% Returns {Count, AchievedDepth, ItemIDs}. +index_full_bundle_descendants_parsed( + _ParsedItem, _HeaderSize, _ItemStartOffset, Depth, _Store, _Opts) when Depth =< 0 -> - 0; -index_full_bundle_descendants(ItemBinary, ItemStartOffset, Depth, Store, Opts) -> + {0, 0, #{}}; +index_full_bundle_descendants_parsed( + ParsedItem, HeaderSize, ItemStartOffset, Depth, Store, Opts) -> + case is_bundle_tx(ParsedItem, Opts) of + true -> + case index_full_bundle_bytes( + ParsedItem#tx.data, + ItemStartOffset + HeaderSize, + Depth, + Store, + Opts + ) of + {ok, Count, ChildDepth, ChildIDs} -> + {Count, ChildDepth, ChildIDs}; + _ -> + {0, 0, #{}} + end; + false -> + {0, Depth, #{}} + end. + +%% @doc Validate an item ID by hashing the signature from the deserialized +%% header. Returns {ok, HeaderSize, ParsedItem} on successful parse, or +%% error if deserialization fails. Mismatch flags are written but don't +%% prevent the item from being indexed. 
+validate_and_flag_item_id(ItemBinary, DeclaredID, EncodedDeclaredID, Store) -> try ar_bundles:deserialize_header(ItemBinary) of - {ok, HeaderSize, HeaderTX} -> - case is_bundle_tx(HeaderTX, Opts) of + {ok, HeaderSize, ParsedItem} -> + ComputedID = crypto:hash(sha256, ParsedItem#tx.signature), + case ComputedID =:= DeclaredID of true -> - case index_full_bundle_bytes( - HeaderTX#tx.data, - ItemStartOffset + HeaderSize, - Depth, - Store, - Opts - ) of - {ok, Count} -> Count; - _ -> 0 - end; + ok; false -> - 0 - end; + case Store of + #{ <<"index-store">> := IndexStore } -> + hb_store:write( + IndexStore, + hb_store_arweave_offset:mismatch_path( + DeclaredID), + ComputedID + ); + _ -> ok + end, + ?event(copycat_short, + {item_id_mismatch, + {declared_id, {explicit, EncodedDeclaredID}}, + {computed_id, + {explicit, hb_util:encode(ComputedID)}} + } + ) + end, + {ok, HeaderSize, ParsedItem}; _ -> - 0 + error catch _:_ -> - 0 + error end. %% @doc Check whether a TX header indicates bundle content. @@ -1545,6 +1951,9 @@ auto_stop_on_indexed_block_test_parallel() -> ?assert(has_any_indexed_tx(Higher1, Opts)), ?assert(has_any_indexed_tx(IndexedBlock, Opts)), ?assertNot(has_any_indexed_tx(IndexedBlock-1, Opts)), + ?assert(is_block_indexed(IndexedBlock, 2, Opts)), + ?assert(is_block_indexed(Higher1, 2, Opts)), + ?assert(is_block_indexed(Higher2, 2, Opts)), ok. explicit_to_reindexes_all_test_parallel() -> @@ -1621,6 +2030,8 @@ auto_stop_partial_index_test_parallel() -> ?assert(has_any_indexed_tx(HigherBlock, Opts)), ?assert(has_any_indexed_tx(Block, Opts)), ?assertNot(has_any_indexed_tx(Block-1, Opts)), + ?assert(is_block_indexed(HigherBlock, 2, Opts)), + ?assertNot(is_block_indexed(Block, 2, Opts)), ok. negative_parse_range_test_parallel() -> @@ -2188,4 +2599,331 @@ assert_indexed_range(From, To, _Opts) when From < To -> ok; assert_indexed_range(From, To, Opts) -> ?assert(has_any_indexed_tx(From, Opts)), - assert_indexed_range(From - 1, To, Opts). 
\ No newline at end of file + assert_indexed_range(From - 1, To, Opts). + +block_marker_depth_2_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + Block = 1827942, + {ok, Block} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary>>, + Opts + ), + ?assert(is_block_indexed(Block, 2, Opts)), + ?assertNot(is_block_indexed(Block, 3, Opts)), + ok. + +depth_1_normalizes_to_2_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + TX1 = #tx{ + format = 2, + id = crypto:strong_rand_bytes(32), + data_size = 100, + tags = [] + }, + TX2 = #tx{ + format = 2, + id = crypto:strong_rand_bytes(32), + data_size = 200, + tags = [] + }, + Tuples = [ + {{TX1, <<>>}, 100}, + {{TX2, <<>>}, 300} + ], + Result = process_block_txs(Tuples, 0, 1, Opts), + ?assertEqual(2, maps:get(achieved_depth, Result)), + Height = 88888888, + mark_block_indexed(Height, maps:get(achieved_depth, Result), Opts), + ?assert(is_block_indexed(Height, 1, Opts)), + ?assert(is_block_indexed(Height, 2, Opts)), + ?assertNot(is_block_indexed(Height, 3, Opts)), + ok. + +block_marker_cutover_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + LowerBlock = 1827941, + UpperBlock = 1827942, + {ok, UpperBlock} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(UpperBlock))/binary, "&to=", + (hb_util:bin(UpperBlock))/binary>>, + Opts + ), + Cutover = read_cutover_height(Opts), + ?assertNotEqual(undefined, Cutover), + ?assert(is_block_indexed(UpperBlock, 2, Opts)), + ?assertNot(is_block_indexed(LowerBlock, 2, Opts)), + ok. + +achieved_depth_block_depth_3_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + Block = 1827942, + {ok, Block} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary, "&depth=3">>, + Opts + ), + ?assert(is_block_indexed(Block, 3, Opts)), + ok. 
+
+%% Bytes that do not decode as a bundle header are reported as
+%% `{error, invalid_bundle_header}'.
+invalid_bundle_bytes_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    IndexStoreOpts = hb_store_arweave:store_from_opts(Opts),
+    Garbage = <<"not a bundle">>,
+    Outcome = index_full_bundle_bytes(Garbage, 0, 2, IndexStoreOpts, Opts),
+    ?assertEqual({error, invalid_bundle_header}, Outcome),
+    ok.
+
+%% Index one block to depth 3 and check the marker, the per-depth ID lists
+%% and that a known item can be read back.
+small_block_depth_3_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    Height = 1889322,
+    HeightBin = hb_util:bin(Height),
+    Request =
+        <<"~copycat@1.0/arweave&from=", HeightBin/binary,
+          "&to=", HeightBin/binary, "&depth=3">>,
+    {ok, Height} = hb_ao:resolve(Request, Opts),
+    ?assert(is_block_indexed(Height, 3, Opts)),
+    #{ <<"index-store">> := IndexStore } =
+        hb_store_arweave:store_from_opts(Opts),
+    {ok, Level1Bin} = hb_store:read(IndexStore, block_items_path(Height, 1)),
+    ?assert(length(decode_item_ids(Level1Bin)) > 0),
+    {ok, Level2Bin} = hb_store:read(IndexStore, block_items_path(Height, 2)),
+    ?assert(length(decode_item_ids(Level2Bin)) > 0),
+    {ok, Level3Bin} = hb_store:read(IndexStore, block_items_path(Height, 3)),
+    Level3IDs = decode_item_ids(Level3Bin),
+    ?assertEqual(3, length(Level3IDs)),
+    assert_item_read(
+        <<"npAzk_BomjWBQQr_xnmlhdxjyl97EJnNv_MAaXffs1s">>,
+        Opts),
+    ok.
+
+%% After a depth-3 run, no mismatch flag is stored for a known item ID.
+no_mismatch_flags_on_valid_bundles_test() ->
+    {_TestStore, StoreOpts, Opts} = setup_index_opts(),
+    Height = 1827942,
+    HeightBin = hb_util:bin(Height),
+    Request =
+        <<"~copycat@1.0/arweave&from=", HeightBin/binary,
+          "&to=", HeightBin/binary, "&depth=3">>,
+    {ok, Height} = hb_ao:resolve(Request, Opts),
+    #{ <<"index-store">> := IndexStore } = StoreOpts,
+    KnownItemID = hb_util:native_id(
+        <<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>),
+    FlagPath = hb_store_arweave_offset:mismatch_path(KnownItemID),
+    ?assertEqual(not_found, hb_store:read(IndexStore, FlagPath)),
+    ok.
+
+%% The mismatch path of any ID contains the `mismatch/' segment.
+mismatch_path_encoding_test() ->
+    RandomID = crypto:strong_rand_bytes(32),
+    FlagPath = hb_store_arweave_offset:mismatch_path(RandomID),
+    ?assertNotEqual(nomatch, binary:match(FlagPath, <<"mismatch/">>)),
+    ok.
+
+%% A depth-3 run stores exactly `3' in the block's completion marker.
+exact_marker_depth_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    Height = 1827942,
+    HeightBin = hb_util:bin(Height),
+    Request =
+        <<"~copycat@1.0/arweave&from=", HeightBin/binary,
+          "&to=", HeightBin/binary, "&depth=3">>,
+    {ok, Height} = hb_ao:resolve(Request, Opts),
+    #{ <<"index-store">> := IndexStore } =
+        hb_store_arweave:store_from_opts(Opts),
+    {ok, MarkerBin} =
+        hb_store:read(IndexStore, block_indexed_path(Height)),
+    ?assertEqual(3, binary_to_integer(MarkerBin)),
+    ok.
+
+%% Validating a correctly signed item under a wrong declared ID stores the
+%% computed ID at the declared ID's mismatch path, and stores nothing under
+%% the real ID's mismatch path.
+fabricated_mismatch_test() ->
+    {_TestStore, StoreOpts, _Opts} = setup_index_opts(),
+    {_, _} = Wallet = ar_wallet:new(),
+    Target = crypto:strong_rand_bytes(32),
+    Anchor = crypto:strong_rand_bytes(32),
+    Unsigned = ar_bundles:new_item(Target, Anchor, [], <<"test data">>),
+    Signed = ar_bundles:sign_item(Unsigned, Wallet),
+    Serialized = ar_bundles:serialize(Signed),
+    ComputedID = crypto:hash(sha256, Signed#tx.signature),
+    DeclaredID = crypto:strong_rand_bytes(32),
+    validate_and_flag_item_id(
+        Serialized, DeclaredID, hb_util:encode(DeclaredID), StoreOpts),
+    #{ <<"index-store">> := IndexStore } = StoreOpts,
+    DeclaredFlagPath = hb_store_arweave_offset:mismatch_path(DeclaredID),
+    {ok, FlaggedID} = hb_store:read(IndexStore, DeclaredFlagPath),
+    ?assertEqual(ComputedID, FlaggedID),
+    ComputedFlagPath = hb_store_arweave_offset:mismatch_path(ComputedID),
+    ?assertEqual(not_found, hb_store:read(IndexStore, ComputedFlagPath)),
+    ok.
+
+%% A default-depth run stores ID lists for depths 1 and 2 only, with the two
+%% known depth-2 items in their expected relative order.
+block_item_ids_depth_2_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    Height = 1827942,
+    {ok, Height} =
+        hb_ao:resolve(
+            <<"~copycat@1.0/arweave&from=1827942&to=1827942">>,
+            Opts
+        ),
+    #{ <<"index-store">> := IndexStore } =
+        hb_store_arweave:store_from_opts(Opts),
+    {ok, Level1Bin} = hb_store:read(IndexStore, block_items_path(Height, 1)),
+    Level1IDs = decode_item_ids(Level1Bin),
+    ?assert(length(Level1IDs) > 0),
+    {ok, Level2Bin} = hb_store:read(IndexStore, block_items_path(Height, 2)),
+    Level2IDs = decode_item_ids(Level2Bin),
+    ?assert(length(Level2IDs) > 0),
+    Level2Encoded = lists:map(fun hb_util:encode/1, Level2IDs),
+    EarlierPos =
+        index_of(
+            <<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>,
+            Level2Encoded),
+    LaterPos =
+        index_of(
+            <<"OBKr-7UrmjxFD-h-qP-XLuvCgtyuO_IDpBMgIytvusA">>,
+            Level2Encoded),
+    ?assert(is_integer(EarlierPos)),
+    ?assert(is_integer(LaterPos)),
+    ?assert(EarlierPos < LaterPos),
+    ?assertEqual(
+        not_found,
+        hb_store:read(IndexStore, block_items_path(Height, 3))),
+    ok.
+
+%% A depth-3 run stores ID lists for depths 1 to 3; depth 1 holds five IDs
+%% and depth 3 contains the known nested item.
+block_item_ids_depth_3_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    Height = 1827942,
+    {ok, Height} =
+        hb_ao:resolve(
+            <<"~copycat@1.0/arweave&from=1827942&to=1827942&depth=3">>,
+            Opts
+        ),
+    #{ <<"index-store">> := IndexStore } =
+        hb_store_arweave:store_from_opts(Opts),
+    {ok, Level1Bin} = hb_store:read(IndexStore, block_items_path(Height, 1)),
+    ?assertEqual(5, length(decode_item_ids(Level1Bin))),
+    {ok, Level2Bin} = hb_store:read(IndexStore, block_items_path(Height, 2)),
+    ?assert(length(decode_item_ids(Level2Bin)) > 0),
+    {ok, Level3Bin} = hb_store:read(IndexStore, block_items_path(Height, 3)),
+    Level3IDs = decode_item_ids(Level3Bin),
+    ?assert(length(Level3IDs) >= 1),
+    Level3Encoded = lists:map(fun hb_util:encode/1, Level3IDs),
+    ?assert(lists:member(
+        <<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, Level3Encoded)),
+    ok.
+
+%% mode=list reports an integer depth and positive per-depth item counts for
+%% an indexed block.
+list_index_with_items_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    {ok, 1827942} =
+        hb_ao:resolve(
+            <<"~copycat@1.0/arweave&from=1827942&to=1827942">>,
+            Opts
+        ),
+    {ok, Listing} =
+        hb_ao:resolve(
+            <<"~copycat@1.0/arweave&from=1827942&to=1827942&mode=list">>,
+            Opts
+        ),
+    Decoded = hb_json:decode(hb_maps:get(<<"body">>, Listing)),
+    BlockEntry = maps:get(<<"1827942">>, Decoded),
+    ?assert(is_integer(maps:get(<<"depth">>, BlockEntry))),
+    ItemCounts = maps:get(<<"items">>, BlockEntry),
+    ?assert(maps:get(<<"1">>, ItemCounts) > 0),
+    ?assert(maps:get(<<"2">>, ItemCounts) > 0),
+    ok.
+
+%% mode=list-items returns the full ID lists of a single block: depth 1
+%% equals the block header's TX list and depth 2 contains a known item.
+list_items_single_block_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    {ok, 1827942} =
+        hb_ao:resolve(
+            <<"~copycat@1.0/arweave&from=1827942&to=1827942">>,
+            Opts
+        ),
+    {ok, Listing} =
+        hb_ao:resolve(
+            <<"~copycat@1.0/arweave&from=1827942&to=1827942&mode=list-items">>,
+            Opts
+        ),
+    Decoded = hb_json:decode(hb_maps:get(<<"body">>, Listing)),
+    BlockEntry = maps:get(<<"1827942">>, Decoded),
+    ItemsByDepth = maps:get(<<"items">>, BlockEntry),
+    Level1Items = maps:get(<<"1">>, ItemsByDepth),
+    ?assert(is_list(Level1Items)),
+    ?assert(length(Level1Items) > 0),
+    Level2Items = maps:get(<<"2">>, ItemsByDepth),
+    ?assert(is_list(Level2Items)),
+    ?assert(length(Level2Items) > 0),
+    {ok, Header} = fetch_block_header(1827942, Opts),
+    HeaderTXIDs = hb_maps:get(<<"txs">>, Header, [], Opts),
+    ?assertEqual(HeaderTXIDs, Level1Items),
+    ?assert(lists:member(
+        <<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, Level2Items)),
+    ok.
+
+%% mode=list-items with from =/= to resolves to an error.
+list_items_rejects_range_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    ?assertMatch(
+        {error, _},
+        hb_ao:resolve(
+            <<"~copycat@1.0/arweave&from=1827942&to=1827940&mode=list-items">>,
+            Opts
+        )
+    ),
+    ok.
+
+%% decode_item_ids/1 on an empty binary, on two 256-bit entries, and on a
+%% 240-bit binary (not a whole number of 32-byte IDs).
+decode_item_ids_validation_test() ->
+    ?assertEqual([], decode_item_ids(<<>>)),
+    TwoIDs = <<0:256, 1:256>>,
+    ?assertEqual(2, length(decode_item_ids(TwoIDs))),
+    Truncated = <<0:240>>,
+    ?assertEqual(
+        {error, invalid_item_ids_binary},
+        decode_item_ids(Truncated)),
+    ok.
+
+%% A well-formed depth-1 entry is counted and decoded normally, while a
+%% 240-bit depth-2 entry is reported as <<"corrupt">> by both readers.
+corrupt_item_ids_read_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    #{ <<"index-store">> := IndexStore } =
+        hb_store_arweave:store_from_opts(Opts),
+    Height = 99999999,
+    Fixtures = [
+        {block_indexed_path(Height), <<"2">>},
+        {block_items_path(Height, 1), <<0:256>>},
+        {block_items_path(Height, 2), <<0:240>>}
+    ],
+    lists:foreach(
+        fun({Path, Payload}) -> hb_store:write(IndexStore, Path, Payload) end,
+        Fixtures
+    ),
+    CountsByDepth = read_block_item_counts(Height, Opts),
+    ?assertEqual(1, maps:get(<<"1">>, CountsByDepth)),
+    ?assertEqual(<<"corrupt">>, maps:get(<<"2">>, CountsByDepth)),
+    IDsByDepth = read_block_item_ids(Height, Opts),
+    ?assertEqual(1, length(maps:get(<<"1">>, IDsByDepth))),
+    ?assertEqual(<<"corrupt">>, maps:get(<<"2">>, IDsByDepth)),
+    ok.
+
+%% With the memory-safe cap set to 1, a depth-3 request leaves the block
+%% marked as indexed at depth 2 but not at depth 3.
+memory_cap_depth3_floors_to_2_test() ->
+    {_TestStore, _StoreOpts, Opts} = setup_index_opts(),
+    CappedOpts = set_memory_safe_cap(1, Opts),
+    Height = 1827942,
+    HeightBin = hb_util:bin(Height),
+    Request =
+        <<"~copycat@1.0/arweave&from=", HeightBin/binary,
+          "&to=", HeightBin/binary, "&depth=3">>,
+    {ok, Height} = hb_ao:resolve(Request, CappedOpts),
+    ?assert(is_block_indexed(Height, 2, CappedOpts)),
+    ?assertNot(is_block_indexed(Height, 3, CappedOpts)),
+    ok.
+
+%% 1-based position of the first occurrence of `Elem' in `List', or
+%% `not_found' when it does not occur.
+index_of(Elem, List) -> index_of(Elem, List, 1).
+
+index_of(Elem, [Elem | _], Position) -> Position;
+index_of(Elem, [_ | Tail], Position) -> index_of(Elem, Tail, Position + 1);
+index_of(_Elem, [], _Position) -> not_found.
From 5f7b98977ce8efe7e172635146106f83d29337fa Mon Sep 17 00:00:00 2001 From: Niko Storni Date: Thu, 9 Apr 2026 12:00:00 +0200 Subject: [PATCH 09/68] feat: Add parallel block processing with shared memory budget --- src/core/resolver/hb_opts.erl | 2 + src/hb_copycat_budget.erl | 192 ++++++++++++++++ src/preloaded/query/dev_copycat_arweave.erl | 234 ++++++++++++++------ 3 files changed, 366 insertions(+), 62 deletions(-) create mode 100644 src/hb_copycat_budget.erl diff --git a/src/core/resolver/hb_opts.erl b/src/core/resolver/hb_opts.erl index 74bca1805..d22d4c576 100644 --- a/src/core/resolver/hb_opts.erl +++ b/src/core/resolver/hb_opts.erl @@ -285,7 +285,9 @@ raw_default_message() -> <<"commitment-device">> => <<"httpsig@1.0">>, % Copycat-specific options. <<"copycat-memory-cap">> => 6 * 1024 * 1024 * 1024, + <<"copycat-memory-budget">> => 6 * 1024 * 1024 * 1024, <<"copycat-depth-recursion-cap">> => 6, % 2x the deepest we've seen to date + <<"arweave-block-workers">> => 3, % Dev options <<"mode">> => debug, <<"profiling">> => true, diff --git a/src/hb_copycat_budget.erl b/src/hb_copycat_budget.erl new file mode 100644 index 000000000..dff471b58 --- /dev/null +++ b/src/hb_copycat_budget.erl @@ -0,0 +1,192 @@ +%%% @doc Atomics-based byte budget pool for copycat memory throttling. +%%% Controls how many bytes of TX data can be held in memory simultaneously +%%% across all copycat workers. Uses persistent_term for constant-time access. +-module(hb_copycat_budget). +-export([ensure_started/1, reset/1, lease/1, release/1, get_budget/0, stats/0]). +-include_lib("eunit/include/eunit.hrl"). + +-define(PERSISTENT_KEY, hb_copycat_budget). +-define(IDX_LEASED, 1). +-define(IDX_PEAK, 2). +-define(IDX_BUDGET, 3). +-define(IDX_RETRIES, 4). +-define(RETRY_SLEEP_MS, 50). + +-define(INIT_LOCK, hb_copycat_budget_init). 
+ +ensure_started(Budget) when is_integer(Budget), Budget > 0 -> + case persistent_term:get(?PERSISTENT_KEY, undefined) of + undefined -> + init_with_lock(Budget); + _Ref -> + ok + end. + +init_with_lock(Budget) -> + try register(?INIT_LOCK, self()) of + true -> + try + case persistent_term:get(?PERSISTENT_KEY, undefined) of + undefined -> + Ref = atomics:new(4, [{signed, true}]), + atomics:put(Ref, ?IDX_BUDGET, Budget), + persistent_term:put(?PERSISTENT_KEY, Ref); + _AlreadySet -> + ok + end + after + unregister(?INIT_LOCK) + end, + ok + catch + error:badarg -> + await_init(Budget) + end. + +await_init(Budget) -> + case persistent_term:get(?PERSISTENT_KEY, undefined) of + undefined -> + case whereis(?INIT_LOCK) of + undefined -> + init_with_lock(Budget); + _Pid -> + timer:sleep(1), + await_init(Budget) + end; + _Ref -> + ok + end. + +reset(Budget) when is_integer(Budget), Budget > 0 -> + Ref = atomics:new(4, [{signed, true}]), + atomics:put(Ref, ?IDX_BUDGET, Budget), + persistent_term:put(?PERSISTENT_KEY, Ref), + ok. + +lease(Size) when is_integer(Size), Size > 0 -> + Ref = persistent_term:get(?PERSISTENT_KEY), + lease_loop(Ref, Size). + +lease_loop(Ref, Size) -> + Current = atomics:get(Ref, ?IDX_LEASED), + Budget = atomics:get(Ref, ?IDX_BUDGET), + case Current + Size > Budget of + true -> + atomics:add(Ref, ?IDX_RETRIES, 1), + timer:sleep(?RETRY_SLEEP_MS), + lease_loop(Ref, Size); + false -> + case atomics:compare_exchange(Ref, ?IDX_LEASED, Current, Current + Size) of + ok -> + update_peak(Ref, Current + Size), + ok; + _Changed -> + lease_loop(Ref, Size) + end + end. + +release(Size) when is_integer(Size), Size > 0 -> + Ref = persistent_term:get(?PERSISTENT_KEY), + atomics:sub(Ref, ?IDX_LEASED, Size), + ok. + +get_budget() -> + case persistent_term:get(?PERSISTENT_KEY, undefined) of + undefined -> undefined; + Ref -> atomics:get(Ref, ?IDX_BUDGET) + end. 
+ +stats() -> + case persistent_term:get(?PERSISTENT_KEY, undefined) of + undefined -> + not_started; + Ref -> + #{ + leased => atomics:get(Ref, ?IDX_LEASED), + peak => atomics:get(Ref, ?IDX_PEAK), + budget => atomics:get(Ref, ?IDX_BUDGET), + retries => atomics:get(Ref, ?IDX_RETRIES) + } + end. + +update_peak(Ref, NewLeased) -> + Peak = atomics:get(Ref, ?IDX_PEAK), + case NewLeased =< Peak of + true -> ok; + false -> + case atomics:compare_exchange(Ref, ?IDX_PEAK, Peak, NewLeased) of + ok -> ok; + _Changed -> update_peak(Ref, NewLeased) + end + end. + +%%% Tests + +lease_release_cycle_test() -> + reset(1000), + ?assertEqual(1000, get_budget()), + ok = lease(400), + #{leased := 400, peak := 400, budget := 1000} = stats(), + ok = lease(300), + #{leased := 700, peak := 700} = stats(), + ok = release(400), + #{leased := 300, peak := 700} = stats(), + ok = release(300), + #{leased := 0, peak := 700} = stats(), + reset_to_default(), + ok. + +blocks_when_over_budget_test() -> + reset(100), + ok = lease(100), + Parent = self(), + Ref = make_ref(), + Pid = spawn(fun() -> + Parent ! {Ref, trying}, + ok = lease(50), + Parent ! {Ref, got_lease} + end), + receive {Ref, trying} -> ok end, + timer:sleep(120), + receive + {Ref, got_lease} -> error(should_have_blocked) + after 0 -> ok + end, + release(60), + receive + {Ref, got_lease} -> ok + after 500 -> + exit(Pid, kill), + error(lease_never_granted) + end, + release(50), + #{leased := 40} = stats(), + release(40), + reset_to_default(), + ok. + +concurrent_leases_test() -> + Budget = 1000, + reset(Budget), + Parent = self(), + NumWorkers = 20, + LeaseSize = 200, + Pids = [spawn(fun() -> + ok = lease(LeaseSize), + timer:sleep(10), + release(LeaseSize), + Parent ! 
{done, self()} + end) || _ <- lists:seq(1, NumWorkers)], + lists:foreach(fun(Pid) -> + receive {done, Pid} -> ok + after 5000 -> error({timeout, Pid}) + end + end, Pids), + #{leased := 0, peak := Peak, budget := Budget} = stats(), + ?assert(Peak =< Budget), + ?assert(Peak > 0), + reset_to_default(), + ok. + +reset_to_default() -> + reset(hb_opts:get(copycat_memory_budget, 6 * 1024 * 1024 * 1024, #{})). diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 62c8bea10..b69625451 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -12,7 +12,6 @@ -include_lib("eunit/include/eunit.hrl"). -define(ARWEAVE_DEVICE, <<"~arweave@2.9">>). --define(BLOCK_MARKER_PREFIX, <<"block/">>). -define(CUTOVER_KEY, <<"block/marker-cutover-height">>). -define(DEPTH_SENTINEL, 99999). % By default we'll index blocks to depth 2 which is: @@ -70,14 +69,14 @@ arweave(_Base, Request, Opts) -> {error, unavailable} -> {error, unavailable}; {ok, {From, To}} -> list_index(From, To, Opts) end; - <<"list-items">> -> + <<"inventory">> -> case parse_range(Request, Opts) of {error, unavailable} -> {error, unavailable}; - {ok, {From, To}} -> list_items_index(From, To, Opts) + {ok, {From, To}} -> inventory_index(From, To, Opts) end; Mode -> {error, <<"Unsupported mode `", (hb_util:bin(Mode))/binary, - "`. Supported modes are: write, list, list-items">>} + "`. Supported modes are: write, list, inventory">>} end. %% @doc Set safe memory resource allocation cap for the in-memory %% bundle processing. in bytes. @@ -95,9 +94,22 @@ get_depth_recursion_cap(Opts) -> get_memory_safe_cap(Opts) -> hb_opts:get(copycat_memory_cap, undefined, Opts). +%% @doc Return the effective per-TX memory cap, clamped to the global budget. +%% Lazily initializes the budget pool on first call. 
+effective_memory_cap(Opts) -> + Budget = hb_opts:get( + copycat_memory_budget, 6 * 1024 * 1024 * 1024, Opts), + hb_copycat_budget:ensure_started(Budget), + PoolSize = hb_copycat_budget:get_budget(), + Cap = get_memory_safe_cap(Opts), + case Cap of + undefined -> PoolSize; + _ -> min(Cap, PoolSize) + end. + %% @doc Return the store path for a block completion marker. block_indexed_path(Height) -> - <>. + <<"block/", (hb_util:bin(Height))/binary, "/depth">>. %% @doc Return the store path for a per-block item index at a given depth. block_items_path(Height, Depth) -> @@ -636,27 +648,37 @@ read_block_item_counts(Height, Opts) -> read_block_item_ids(Height, Opts) -> probe_block_items(Height, Opts, fun decode_and_encode_ids/1). -%% @doc mode=list-items: return full item ID lists for a single block. -list_items_index(From, To, _Opts) when From =/= To -> - {error, <<"mode=list-items requires from=to (single block only)">>}; -list_items_index(From, _To, Opts) -> - BlockKey = hb_util:bin(From), - BlockInfo = case fetch_block_header(From, Opts) of - {ok, Block} -> - Base = assemble_block_info(From, Block, Opts), - case maps:get(<<"depth">>, Base, undefined) of - undefined -> Base; - _ -> Base#{<<"items">> => read_block_item_ids(From, Opts)} - end; - {error, _} -> - #{<<"error">> => <<"block not found">>} - end, - JSON = hb_json:encode(#{BlockKey => BlockInfo}), +%% @doc mode=inventory: return per-depth item ID lists from the local index store. +%% Supports range queries. The inventory read itself is local-only (no network). +%% Note: range parsing may call latest_height/1 if from/to are omitted or negative. 
+inventory_index(From, undefined, Opts) -> + inventory_index(From, 0, Opts); +inventory_index(From, To, _Opts) when From < To -> + {ok, #{ + <<"content-type">> => <<"application/json">>, + <<"body">> => hb_json:encode(#{}) + }}; +inventory_index(From, To, Opts) -> + Result = inventory_local(From, To, Opts, #{}), + JSON = hb_json:encode(Result), {ok, #{ <<"content-type">> => <<"application/json">>, <<"body">> => JSON }}. +inventory_local(Current, To, _Opts, Acc) when Current < To -> Acc; +inventory_local(Current, To, Opts, Acc) -> + case read_block_depth(Current, Opts) of + undefined -> + inventory_local(Current - 1, To, Opts, Acc); + Depth -> + ItemIDs = read_block_item_ids(Current, Opts), + BlockKey = hb_util:bin(Current), + BlockInfo = #{<<"depth">> => Depth, <<"items">> => ItemIDs}, + inventory_local(Current - 1, To, Opts, + Acc#{BlockKey => BlockInfo}) + end. + fetch_block_header(Height, Opts) -> ?event(debug_copycat, {fetching_block, Height}), observe_event(<<"block_header">>, fun() -> @@ -687,7 +709,7 @@ classify_txs(TXIDs, Opts) -> %% If `To' is provided, every block in [`To', `Current'] is processed. If `To' %% is omitted, stop at the first block already indexed at the requested depth %% (via block markers above cutover, or legacy per-TX check below cutover). 
-fetch_blocks(Current, To, TargetDepth, _Opts) +fetch_blocks(Current, To, TargetDepth, _Opts) when is_integer(To), Current < To -> ?event(copycat_short, {arweave_block_indexing_completed, @@ -699,26 +721,97 @@ fetch_blocks(Current, To, TargetDepth, _Opts) fetch_blocks(Current, undefined, _TargetDepth, _Opts) when Current < 0 -> {ok, 0}; fetch_blocks(Current, undefined, TargetDepth, Opts) -> - BlockRes = fetch_block_header(Current, Opts), - case is_already_indexed(BlockRes, TargetDepth, Opts) of - true -> + BlockWorkers = block_workers(Opts), + fetch_blocks_open_ended(Current, TargetDepth, BlockWorkers, Opts); +fetch_blocks(Current, To, TargetDepth, Opts) -> + BlockWorkers = block_workers(Opts), + fetch_blocks_ranged(Current, To, TargetDepth, BlockWorkers, Opts). + +block_workers(Opts) -> + max(1, hb_opts:get(arweave_block_workers, 3, Opts)). + +%% @doc Process a known range of blocks in parallel batches. +fetch_blocks_ranged(Current, To, TargetDepth, _Workers, _Opts) + when Current < To -> + ?event(copycat_short, + {arweave_block_indexing_completed, + {reached_target, To}, + {target_depth, TargetDepth} + } + ), + {ok, To}; +fetch_blocks_ranged(Current, To, TargetDepth, Workers, Opts) -> + BatchEnd = max(To, Current - Workers + 1), + Heights = lists:seq(Current, BatchEnd, -1), + hb_pmap:parallel_map( + Heights, + fun(H) -> + observe_event(<<"block_indexed">>, fun() -> + fetch_and_process_block(H, To, TargetDepth, Opts) + end) + end, + Workers + ), + fetch_blocks_ranged(BatchEnd - 1, To, TargetDepth, Workers, Opts). + +%% @doc Process blocks until an already-indexed block is found. +%% Fetches headers in parallel, stops at the first indexed block, +%% then processes the unindexed prefix in parallel. 
+fetch_blocks_open_ended(Current, _TargetDepth, _Workers, _Opts) + when Current < 0 -> + {ok, 0}; +fetch_blocks_open_ended(Current, TargetDepth, Workers, Opts) -> + BatchEnd = max(0, Current - Workers + 1), + Heights = lists:seq(Current, BatchEnd, -1), + HeaderResults = hb_pmap:parallel_map( + Heights, + fun(H) -> {H, fetch_block_header(H, Opts)} end, + Workers + ), + case find_indexed_prefix(HeaderResults, TargetDepth, Opts) of + {all_unindexed, ToProcess} -> + process_prefetched_blocks( + ToProcess, TargetDepth, Workers, Opts), + fetch_blocks_open_ended( + BatchEnd - 1, TargetDepth, Workers, Opts); + {stop_at, StopHeight, ToProcess} -> + process_prefetched_blocks( + ToProcess, TargetDepth, Workers, Opts), ?event(copycat_short, {arweave_block_indexing_completed, - {stop_at_indexed_block, Current} + {stop_at_indexed_block, StopHeight} } ), - {ok, Current}; + {ok, StopHeight} + end. + +%% @doc Walk header results in order, return the unindexed prefix and +%% either the stop height or all_unindexed. +find_indexed_prefix(HeaderResults, TargetDepth, Opts) -> + find_indexed_prefix(HeaderResults, TargetDepth, Opts, []). + +find_indexed_prefix([], _TargetDepth, _Opts, Acc) -> + {all_unindexed, lists:reverse(Acc)}; +find_indexed_prefix([{H, BlockRes} | Rest], TargetDepth, Opts, Acc) -> + case is_already_indexed(BlockRes, TargetDepth, Opts) of + true -> + {stop_at, H, lists:reverse(Acc)}; false -> + find_indexed_prefix( + Rest, TargetDepth, Opts, [{H, BlockRes} | Acc]) + end. + +%% @doc Process a list of {Height, BlockRes} tuples in parallel. 
+process_prefetched_blocks(Blocks, TargetDepth, Workers, Opts) -> + hb_pmap:parallel_map( + Blocks, + fun({H, BlockRes}) -> observe_event(<<"block_indexed">>, fun() -> - process_block(BlockRes, Current, undefined, TargetDepth, Opts) - end), - fetch_blocks(Current - 1, undefined, TargetDepth, Opts) - end; -fetch_blocks(Current, To, TargetDepth, Opts) -> - observe_event(<<"block_indexed">>, fun() -> - fetch_and_process_block(Current, To, TargetDepth, Opts) - end), - fetch_blocks(Current - 1, To, TargetDepth, Opts). + process_block(BlockRes, H, undefined, TargetDepth, Opts) + end) + end, + Workers + ). %% @doc Determine whether a fetched block is considered indexed at the %% requested depth. Checks block markers first. For blocks at or above @@ -1079,12 +1172,13 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> true -> bundle; false -> not_bundle end, - within_memory_cap ?= - case Length =< get_memory_safe_cap(Opts) of - true -> within_memory_cap; - false -> memory_safe_cap_exceeded + within_effective_cap ?= + case Length =< effective_memory_cap(Opts) of + true -> within_effective_cap; + false -> effective_cap_exceeded end, - process_l1_tx( + ok ?= hb_copycat_budget:lease(Length), + try process_l1_tx( StartOffset, Length, Depth, @@ -1092,6 +1186,9 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> EncodedTXID, Opts ) + after + hb_copycat_budget:release(Length) + end else {error, Reason} -> ?event( @@ -1114,12 +1211,12 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> } ), Skipped; - memory_safe_cap_exceeded -> + effective_cap_exceeded -> ?event( copycat_short, {arweave_bundle_skipped, {tx_id, {explicit, EncodedTXID}}, - {reason, memory_safe_cap_exceeded} + {reason, effective_cap_exceeded} } ), #{ @@ -1142,20 +1239,26 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> %% @doc Fast path for depth>2 block indexing. Skips offset lookup and %% header re-fetch since the caller already has both. 
process_l1_tx_direct(StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) -> - MemoryCap = get_memory_safe_cap(Opts), - case MemoryCap =/= undefined andalso Length > MemoryCap of + EffectiveCap = effective_memory_cap(Opts), + case Length > EffectiveCap of true -> ?event(copycat_short, {arweave_bundle_skipped, {tx_id, {explicit, EncodedTXID}}, - {reason, memory_safe_cap_exceeded} + {reason, effective_cap_exceeded} } ), #{items_count => 0, bundle_count => 1, skipped_count => 1, achieved_depth => 0}; false -> - process_l1_tx( - StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) + ok = hb_copycat_budget:lease(Length), + try + process_l1_tx( + StartOffset, Length, Depth, + IndexStore, EncodedTXID, Opts) + after + hb_copycat_budget:release(Length) + end end. %% @doc Load the L1 TX data into memory and index it. @@ -2847,20 +2950,21 @@ list_index_with_items_test() -> ?assert(maps:get(<<"2">>, Items) > 0), ok. -list_items_single_block_test() -> +inventory_single_block_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), {ok, 1827942} = hb_ao:resolve( <<"~copycat@1.0/arweave&from=1827942&to=1827942">>, Opts ), - {ok, ListResult} = + {ok, InvResult} = hb_ao:resolve( - <<"~copycat@1.0/arweave&from=1827942&to=1827942&mode=list-items">>, + <<"~copycat@1.0/arweave&from=1827942&to=1827942&mode=inventory">>, Opts ), - Body = hb_json:decode(hb_maps:get(<<"body">>, ListResult)), + Body = hb_json:decode(hb_maps:get(<<"body">>, InvResult)), BlockInfo = maps:get(<<"1827942">>, Body), + ?assert(is_integer(maps:get(<<"depth">>, BlockInfo))), Items = maps:get(<<"items">>, BlockInfo), L1Items = maps:get(<<"1">>, Items), ?assert(is_list(L1Items)), @@ -2868,20 +2972,26 @@ list_items_single_block_test() -> L2Items = maps:get(<<"2">>, Items), ?assert(is_list(L2Items)), ?assert(length(L2Items) > 0), - {ok, Block} = fetch_block_header(1827942, Opts), - BlockTXIDs = hb_maps:get(<<"txs">>, Block, [], Opts), - ?assertEqual(BlockTXIDs, L1Items), + ?assertEqual(5, 
length(L1Items)), ?assert(lists:member( <<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, L2Items)), ok. -list_items_rejects_range_test() -> - {_TestStore, _StoreOpts, Opts} = setup_index_opts(), - {error, _} = - hb_ao:resolve( - <<"~copycat@1.0/arweave&from=1827942&to=1827940&mode=list-items">>, - Opts - ), +inventory_range_test() -> + {_TestStore, StoreOpts, Opts} = setup_index_opts(), + #{ <<"index-store">> := Store } = StoreOpts, + hb_store:write(Store, block_indexed_path(77777777), <<"2">>), + hb_store:write(Store, block_items_path(77777777, 1), <<0:256>>), + hb_store:write(Store, block_items_path(77777777, 2), <<>>), + hb_store:write(Store, block_indexed_path(77777778), <<"2">>), + hb_store:write(Store, block_items_path(77777778, 1), <<1:256>>), + hb_store:write(Store, block_items_path(77777778, 2), <<>>), + {ok, InvResult} = inventory_index(77777778, 77777777, Opts), + Body = hb_json:decode(hb_maps:get(<<"body">>, InvResult)), + ?assert(maps:is_key(<<"77777777">>, Body)), + ?assert(maps:is_key(<<"77777778">>, Body)), + ?assertEqual(2, maps:get(<<"depth">>, maps:get(<<"77777777">>, Body))), + ?assertEqual(2, maps:get(<<"depth">>, maps:get(<<"77777778">>, Body))), ok. decode_item_ids_validation_test() -> From 7b8f0d7585b2419233d79dee963d51fe829cca9e Mon Sep 17 00:00:00 2001 From: Niko Storni Date: Thu, 9 Apr 2026 12:00:00 +0200 Subject: [PATCH 10/68] feat: Add parent lookup endpoint at ~arweave@2.9/parent= --- src/core/store/hb_store_arweave.erl | 31 ++- src/preloaded/arweave/dev_arweave.erl | 44 +++- src/preloaded/query/dev_copycat_arweave.erl | 256 ++++++++++++++++++-- 3 files changed, 308 insertions(+), 23 deletions(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 5b37003de..7d94be78a 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -6,7 +6,7 @@ %%% Unused Store API: -export([resolve/3, write/3, link/3, group/3]). 
%%% Indexing API: --export([store_from_opts/1, write_offset/5, read_offset/3, read_chunks/3]). +-export([store_from_opts/1, write_offset/5, read_offset/3, read_parent/2, decode_parent_entries/1, read_chunks/3]). -include("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). @@ -94,6 +94,35 @@ read_offset(StoreOpts = #{ <<"index-store">> := IndexStore }, ID, Opts) -> end; read_offset(_, _, _) -> not_found. +%% @doc Read the parent entries for an item from the index store. +read_parent(#{ <<"index-store">> := IndexStore }, ID) -> + NormalizedID = hb_util:native_id(ID), + ParentPath = <<"parent/", NormalizedID/binary>>, + case hb_store:read(IndexStore, ParentPath) of + {ok, Bin} -> + case decode_parent_entries(Bin) of + {error, _} = Err -> Err; + Entries -> {ok, Entries} + end; + _ -> + not_found + end; +read_parent(_, _) -> not_found. + +decode_parent_entries(<<>>) -> []; +decode_parent_entries(<<0, Height:64/big-unsigned, Rest/binary>>) -> + case decode_parent_entries(Rest) of + {error, _} = Err -> Err; + Tail -> [{Height, block} | Tail] + end; +decode_parent_entries(<<1, ParentID:32/binary, Rest/binary>>) -> + case decode_parent_entries(Rest) of + {error, _} = Err -> Err; + Tail -> [{ParentID, bundle} | Tail] + end; +decode_parent_entries(_Corrupt) -> + {error, corrupt_parent_data}. + %% @doc Read the data at the given key, reading the `local-store' first if %% available. read(StoreOpts, #{ <<"read">> := ID }, _NodeOpts) when ?IS_ID(ID) -> diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index 0cd3e2148..d0738fe5b 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -7,7 +7,7 @@ -implements(<<"arweave@2.9">>). -device_libraries([lib_arweave_common]). -export([info/0]). --export([tx/3, raw/3, chunk/3, block/3, current/3, status/3, price/3, tx_anchor/3]). +-export([tx/3, raw/3, chunk/3, block/3, parent/3, current/3, status/3, price/3, tx_anchor/3]). -export([pending/3]). 
-export([post_tx_header/2, post_tx/3, post_tx/4, post_chunk/2]). %%% Helper functions @@ -752,6 +752,48 @@ only_if_cached(Req, Opts) -> hb_maps:get(<<"cache-control">>, Req, [], Opts) ). +%% @doc Look up the parent (block or bundle) that contains an item. +parent(Base, Request, Opts) -> + case find_key(<<"parent">>, Base, Request, Opts) of + not_found -> + {error, not_found}; + ID -> + StoreOpts = hb_store_arweave:store_from_opts(Opts), + try hb_store_arweave:read_parent(StoreOpts, ID) of + {ok, [{Height, block} | _]} -> + Entry = #{ + <<"type">> => <<"block">>, + <<"height">> => Height + }, + {ok, #{ + <<"content-type">> => <<"application/json">>, + <<"body">> => + hb_json:encode(#{<<"parents">> => [Entry]}) + }}; + {ok, [{ParentID, bundle} | _]} -> + Entry = #{ + <<"type">> => <<"bundle">>, + <<"id">> => hb_util:encode(ParentID) + }, + {ok, #{ + <<"content-type">> => <<"application/json">>, + <<"body">> => + hb_json:encode(#{<<"parents">> => [Entry]}) + }}; + {error, Reason} -> + ?event(warning, + {parent_read_error, {id, ID}, {reason, Reason}}), + {error, not_found}; + not_found -> + {error, not_found} + catch + error:function_clause -> + {error, not_found}; + error:badarg -> + {error, not_found} + end + end. + %% @doc Retrieve the current block information from Arweave. current(_Base, _Request, Opts) -> request(<<"GET">>, <<"/block/current">>, Opts). diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index b69625451..440d8ba5f 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -116,6 +116,21 @@ block_items_path(Height, Depth) -> <<"block/", (hb_util:bin(Height))/binary, "/items/", (hb_util:bin(Depth))/binary>>. +%% @doc Return the store path for a parent index entry. +parent_path(ItemID) when byte_size(ItemID) =:= 32 -> + <<"parent/", ItemID/binary>>. + +%% @doc Encode a parent entry for storage. 
+encode_parent_entry(Height, block) when is_integer(Height) -> + <<0, Height:64/big-unsigned>>; +encode_parent_entry(ParentID, bundle) when byte_size(ParentID) =:= 32 -> + <<1, ParentID:32/binary>>. + +%% @doc Write a parent entry for an item to the index store. +write_parent(ItemID, ParentData, Type, Store) -> + Entry = encode_parent_entry(ParentData, Type), + hb_store:write(Store, parent_path(ItemID), Entry). + %% @doc Encode a list of 32-byte raw IDs into a single binary. encode_item_ids(IDs) -> << <> || ID <- IDs >>. @@ -746,9 +761,13 @@ fetch_blocks_ranged(Current, To, TargetDepth, Workers, Opts) -> hb_pmap:parallel_map( Heights, fun(H) -> - observe_event(<<"block_indexed">>, fun() -> - fetch_and_process_block(H, To, TargetDepth, Opts) - end) + case is_block_indexed(H, TargetDepth, Opts) of + true -> ok; + false -> + observe_event(<<"block_indexed">>, fun() -> + fetch_and_process_block(H, To, TargetDepth, Opts) + end) + end end, Workers ), @@ -954,7 +973,7 @@ maybe_index_block(Block, TargetDepth, Opts) -> TXsWithData ), TXResults = process_block_txs( - ValidTXs, BlockStartOffset, TargetDepth, Opts), + ValidTXs, BlockStartOffset, TargetDepth, Height, Opts), ExistingIDs = maps:get(item_ids, TXResults, #{}), {block_cached, TXResults#{ total_txs => TotalTXs, @@ -973,10 +992,10 @@ parallel_map(Items, Fun, Opts) -> %% @doc Process a single transaction and return its contribution to the counters. 
%% Returns a map with keys: items_count, bundle_count, skipped_count -process_block_tx({{padding, _PaddingRoot}, _EndOffset}, _BlockStartOffset, TargetDepth, _Opts) -> +process_block_tx({{padding, _PaddingRoot}, _EndOffset}, _BlockStartOffset, TargetDepth, _BlockHeight, _Opts) -> #{items_count => 0, bundle_count => 0, skipped_count => 0, achieved_depth => max(2, TargetDepth)}; -process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, Opts) -> +process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, BlockHeight, Opts) -> IndexStore = hb_store_arweave:store_from_opts(Opts), TXID = hb_util:encode(TX#tx.id), TXEndOffset = BlockStartOffset + EndOffset, @@ -995,6 +1014,8 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, TX#tx.data_size ) end), + #{ <<"index-store">> := ParentStore } = IndexStore, + write_parent(TX#tx.id, BlockHeight, block, ParentStore), case is_bundle_tx(TX, Opts) of false -> #{items_count => 0, bundle_count => 0, skipped_count => 0, @@ -1002,7 +1023,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, true when TargetDepth > 2 -> L1Result = process_l1_tx_direct( TXStartOffset, TX#tx.data_size, - TargetDepth - 1, IndexStore, TXID, Opts), + TargetDepth - 1, IndexStore, TXID, TX#tx.id, Opts), L1Result#{ achieved_depth => max(2, maps:get(achieved_depth, L1Result, 0)) @@ -1032,6 +1053,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, ItemStartOffset, Size ), + write_parent(ItemID, TX#tx.id, bundle, ParentStore), {ItemStartOffset + Size, ItemsCountAcc + 1} end, {TXStartOffset + HeaderSize, 0}, @@ -1094,11 +1116,11 @@ header_chunk(HeaderSize, FirstChunk, StartOffset, Opts) -> %% When arweave_index_workers <= 1, processes sequentially (one worker at a time). %% When arweave_index_workers > 1, processes in parallel with the specified concurrency limit. 
%% Returns a map with keys: items_count, bundle_count, skipped_count. -process_block_txs(ValidTXs, BlockStartOffset, TargetDepth, Opts) -> +process_block_txs(ValidTXs, BlockStartOffset, TargetDepth, BlockHeight, Opts) -> Results = parallel_map( ValidTXs, fun(TXWithData) -> process_block_tx( - TXWithData, BlockStartOffset, TargetDepth, Opts) end, + TXWithData, BlockStartOffset, TargetDepth, BlockHeight, Opts) end, Opts ), Folded = lists:foldl( @@ -1184,6 +1206,7 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> Depth, IndexStore, EncodedTXID, + hb_util:decode(EncodedTXID), Opts ) after @@ -1238,7 +1261,7 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> %% @doc Fast path for depth>2 block indexing. Skips offset lookup and %% header re-fetch since the caller already has both. -process_l1_tx_direct(StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) -> +process_l1_tx_direct(StartOffset, Length, Depth, IndexStore, EncodedTXID, ParentID, Opts) -> EffectiveCap = effective_memory_cap(Opts), case Length > EffectiveCap of true -> @@ -1255,7 +1278,7 @@ process_l1_tx_direct(StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) try process_l1_tx( StartOffset, Length, Depth, - IndexStore, EncodedTXID, Opts) + IndexStore, EncodedTXID, ParentID, Opts) after hb_copycat_budget:release(Length) end @@ -1263,7 +1286,7 @@ process_l1_tx_direct(StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) %% @doc Load the L1 TX data into memory and index it. process_l1_tx( - StartOffset, Length, Depth, IndexStore, EncodedTXID, Opts) -> + StartOffset, Length, Depth, IndexStore, EncodedTXID, ParentID, Opts) -> case observe_copycat_l1_stage( <<"l1_read_chunks">>, fun() -> hb_store_arweave:read_chunks(StartOffset, Length, Opts) end @@ -1279,6 +1302,7 @@ process_l1_tx( StartOffset, Depth, IndexStore, + ParentID, Opts ) end @@ -1416,10 +1440,10 @@ query_l1_tx_offset(TXID, IndexStore, Opts) -> {error, not_found} end. 
-index_full_bundle_bytes(_BundleData, _BundleStartOffset, Depth, _Store, _Opts) +index_full_bundle_bytes(_BundleData, _BundleStartOffset, Depth, _Store, _ParentID, _Opts) when Depth =< 0 -> {ok, 0, 0, #{}}; -index_full_bundle_bytes(BundleData, BundleStartOffset, Depth, Store, Opts) -> +index_full_bundle_bytes(BundleData, BundleStartOffset, Depth, Store, ParentID, Opts) -> case ar_bundles:decode_bundle_header(BundleData) of invalid_bundle_header -> {error, invalid_bundle_header}; @@ -1431,6 +1455,7 @@ index_full_bundle_bytes(BundleData, BundleStartOffset, Depth, Store, Opts) -> BundleStartOffset + HeaderSize, Depth, Store, + ParentID, Opts, 0, ?DEPTH_SENTINEL, @@ -1443,7 +1468,7 @@ index_full_bundle_bytes(BundleData, BundleStartOffset, Depth, Store, Opts) -> %% Returns {ok, Count, MinAchievedDepth, ItemIDs} or {error, Reason}. %% ItemIDs is a map of relative-depth => list of raw 32-byte IDs. index_full_bundle_items( - [], _ItemsBin, _ItemStartOffset, Depth, _Store, _Opts, + [], _ItemsBin, _ItemStartOffset, Depth, _Store, _ParentID, _Opts, Count, MinDepth, ThisLevelIDs, DescIDs) -> FinalDepth = case MinDepth of ?DEPTH_SENTINEL -> Depth; @@ -1457,6 +1482,7 @@ index_full_bundle_items( ItemStartOffset, Depth, Store, + ParentID, Opts, Count, MinDepth, @@ -1474,12 +1500,14 @@ index_full_bundle_items( ItemStartOffset, Size ), + #{ <<"index-store">> := IdxStore } = Store, + write_parent(ItemID, ParentID, bundle, IdxStore), {DescendantCount, ItemAchievedDepth, ChildIDs} = case {Depth > 1, ParseResult} of {true, {ok, HeaderSize, ParsedItem}} -> index_full_bundle_descendants_parsed( ParsedItem, HeaderSize, - ItemStartOffset, Depth - 1, Store, Opts); + ItemStartOffset, Depth - 1, Store, ItemID, Opts); _ -> {0, Depth - 1, #{}} end, @@ -1490,6 +1518,7 @@ index_full_bundle_items( ItemStartOffset + Size, Depth, Store, + ParentID, Opts, Count + 1 + DescendantCount, min(MinDepth, ItemAchievedDepth), @@ -1498,17 +1527,17 @@ index_full_bundle_items( ); index_full_bundle_items( 
_BundleIndex, _ItemsBin, _ItemStartOffset, _Depth, - _Store, _Opts, _Count, _MinDepth, _ThisLevelIDs, _DescIDs) -> + _Store, _ParentID, _Opts, _Count, _MinDepth, _ThisLevelIDs, _DescIDs) -> {error, invalid_bundle_header}. %% @doc Recurse into a nested data item using an already-parsed header. %% Returns {Count, AchievedDepth, ItemIDs}. index_full_bundle_descendants_parsed( - _ParsedItem, _HeaderSize, _ItemStartOffset, Depth, _Store, _Opts) + _ParsedItem, _HeaderSize, _ItemStartOffset, Depth, _Store, _ParentID, _Opts) when Depth =< 0 -> {0, 0, #{}}; index_full_bundle_descendants_parsed( - ParsedItem, HeaderSize, ItemStartOffset, Depth, Store, Opts) -> + ParsedItem, HeaderSize, ItemStartOffset, Depth, Store, ParentID, Opts) -> case is_bundle_tx(ParsedItem, Opts) of true -> case index_full_bundle_bytes( @@ -1516,6 +1545,7 @@ index_full_bundle_descendants_parsed( ItemStartOffset + HeaderSize, Depth, Store, + ParentID, Opts ) of {ok, Count, ChildDepth, ChildIDs} -> @@ -2736,7 +2766,7 @@ depth_1_normalizes_to_2_test() -> {{TX1, <<>>}, 100}, {{TX2, <<>>}, 300} ], - Result = process_block_txs(Tuples, 0, 1, Opts), + Result = process_block_txs(Tuples, 0, 1, 88888888, Opts), ?assertEqual(2, maps:get(achieved_depth, Result)), Height = 88888888, mark_block_indexed(Height, maps:get(achieved_depth, Result), Opts), @@ -2780,7 +2810,7 @@ invalid_bundle_bytes_test() -> StoreOpts = hb_store_arweave:store_from_opts(Opts), ?assertEqual( {error, invalid_bundle_header}, - index_full_bundle_bytes(<<"not a bundle">>, 0, 2, StoreOpts, Opts) + index_full_bundle_bytes(<<"not a bundle">>, 0, 2, StoreOpts, <<0:256>>, Opts) ), ok. @@ -3032,6 +3062,190 @@ memory_cap_depth3_floors_to_2_test() -> ?assertNot(is_block_indexed(Block, 3, CappedOpts)), ok. 
+parent_encode_decode_test() -> + BlockEntry = encode_parent_entry(12345, block), + ?assertEqual(<<0, 12345:64/big-unsigned>>, BlockEntry), + BundleID = crypto:strong_rand_bytes(32), + BundleEntry = encode_parent_entry(BundleID, bundle), + ?assertEqual(<<1, BundleID:32/binary>>, BundleEntry), + Combined = <>, + Decoded = hb_store_arweave:decode_parent_entries(Combined), + ?assertEqual([{12345, block}, {BundleID, bundle}], Decoded), + ok. + +parent_not_found_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + StoreOpts2 = hb_store_arweave:store_from_opts(Opts), + UnknownID = crypto:strong_rand_bytes(32), + ?assertEqual(not_found, hb_store_arweave:read_parent(StoreOpts2, UnknownID)), + ok. + +parent_depth_2_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + Block = 1827942, + {ok, Block} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary, "&depth=2">>, + Opts + ), + StoreOpts2 = hb_store_arweave:store_from_opts(Opts), + {ok, InvResult} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary, "&mode=inventory">>, + Opts + ), + Body = hb_json:decode(hb_maps:get(<<"body">>, InvResult)), + BlockInfo = maps:get(hb_util:bin(Block), Body), + L1Items = maps:get(<<"1">>, maps:get(<<"items">>, BlockInfo)), + L1ID = hb_util:decode(hd(L1Items)), + {ok, [{Block, block}]} = hb_store_arweave:read_parent(StoreOpts2, L1ID), + L2Items = maps:get(<<"2">>, maps:get(<<"items">>, BlockInfo)), + case L2Items of + [] -> ok; + [FirstL2 | _] -> + L2ID = hb_util:decode(FirstL2), + {ok, [{L2Parent, bundle}]} = + hb_store_arweave:read_parent(StoreOpts2, L2ID), + ?assert(lists:member( + hb_util:encode(L2Parent), L1Items)) + end, + ok. 
+ +parent_depth_3_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + Block = 1889322, + {ok, Block} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary, "&depth=3">>, + Opts + ), + StoreOpts2 = hb_store_arweave:store_from_opts(Opts), + {ok, InvResult} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary, "&mode=inventory">>, + Opts + ), + Body = hb_json:decode(hb_maps:get(<<"body">>, InvResult)), + BlockInfo = maps:get(hb_util:bin(Block), Body), + L3Items = maps:get(<<"3">>, maps:get(<<"items">>, BlockInfo)), + ?assert(length(L3Items) > 0), + L2Items = maps:get(<<"2">>, maps:get(<<"items">>, BlockInfo)), + L3ID = hb_util:decode(hd(L3Items)), + {ok, [{L3Parent, bundle}]} = + hb_store_arweave:read_parent(StoreOpts2, L3ID), + ?assert(lists:member(hb_util:encode(L3Parent), L2Items)), + ok. + +parent_corrupt_data_test() -> + ?assertEqual([], hb_store_arweave:decode_parent_entries(<<>>)), + ?assertEqual( + {error, corrupt_parent_data}, + hb_store_arweave:decode_parent_entries(<<5, 1, 2, 3>>)), + Truncated = <<0, 1, 2, 3>>, + ?assertEqual( + {error, corrupt_parent_data}, + hb_store_arweave:decode_parent_entries(Truncated)), + ValidThenCorrupt = <<0, 100:64/big-unsigned, 99>>, + ?assertEqual( + {error, corrupt_parent_data}, + hb_store_arweave:decode_parent_entries(ValidThenCorrupt)), + ok. 
+ +parent_endpoint_block_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + Block = 1827942, + {ok, Block} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary, "&depth=2">>, + Opts + ), + {ok, InvResult} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary, "&mode=inventory">>, + Opts + ), + InvBody = hb_json:decode(hb_maps:get(<<"body">>, InvResult)), + BlockInfo = maps:get(hb_util:bin(Block), InvBody), + L1Items = maps:get(<<"1">>, maps:get(<<"items">>, BlockInfo)), + L1EncodedID = hd(L1Items), + {ok, ParentResult} = + hb_ao:resolve( + <<"~arweave@2.9/parent=", L1EncodedID/binary>>, + Opts + ), + ?assertEqual( + <<"application/json">>, + hb_maps:get(<<"content-type">>, ParentResult)), + Body = hb_json:decode(hb_maps:get(<<"body">>, ParentResult)), + Parents = maps:get(<<"parents">>, Body), + ?assertEqual(1, length(Parents)), + [Entry] = Parents, + ?assertEqual(<<"block">>, maps:get(<<"type">>, Entry)), + ?assertEqual(Block, maps:get(<<"height">>, Entry)), + ok. 
+ +parent_endpoint_bundle_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + Block = 1827942, + {ok, Block} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary, "&depth=2">>, + Opts + ), + {ok, InvResult} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary, "&mode=inventory">>, + Opts + ), + InvBody = hb_json:decode(hb_maps:get(<<"body">>, InvResult)), + BlockInfo = maps:get(hb_util:bin(Block), InvBody), + L1Items = maps:get(<<"1">>, maps:get(<<"items">>, BlockInfo)), + L2Items = maps:get(<<"2">>, maps:get(<<"items">>, BlockInfo)), + ?assert(length(L2Items) > 0), + L2EncodedID = hd(L2Items), + {ok, ParentResult} = + hb_ao:resolve( + <<"~arweave@2.9/parent=", L2EncodedID/binary>>, + Opts + ), + ?assertEqual( + <<"application/json">>, + hb_maps:get(<<"content-type">>, ParentResult)), + Body = hb_json:decode(hb_maps:get(<<"body">>, ParentResult)), + [Entry] = maps:get(<<"parents">>, Body), + ?assertEqual(<<"bundle">>, maps:get(<<"type">>, Entry)), + ParentID = maps:get(<<"id">>, Entry), + ?assert(lists:member(ParentID, L1Items)), + ok. + +parent_endpoint_not_found_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + FakeID = <<"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA">>, + ?assertEqual( + {error, not_found}, + hb_ao:resolve( + <<"~arweave@2.9/parent=", FakeID/binary>>, + Opts + ) + ), + ok. + index_of(Elem, List) -> index_of(Elem, List, 1). 
index_of(_Elem, [], _N) -> not_found; From c3007a52795d284a2b6bdf184d6db19a90892b69 Mon Sep 17 00:00:00 2001 From: Niko Storni Date: Tue, 14 Apr 2026 17:05:21 +0200 Subject: [PATCH 11/68] fix: Stop latest_height from silently returning 0 on network errors - Return tagged tuples from latest_height and normalize_height - Propagate errors through parse_range using maybe block - Return {error, unavailable} (HTTP 503) on upstream failures - Validate resolved heights are non-negative in parse_range - Log original upstream error reason before collapsing to unavailable - Add regression tests with mock server for both failure paths --- src/preloaded/query/dev_copycat_arweave.erl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 440d8ba5f..99da3aa4d 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -54,12 +54,16 @@ arweave(_Base, Request, Opts) -> }} end; error -> - TargetDepth = request_depth(Request, ?DEFAULT_BLOCK_DEPTH, Opts), case parse_range(Request, Opts) of - {error, unavailable} -> {error, unavailable}; + {error, unavailable} -> + {error, unavailable}; {ok, {From, To}} -> + TargetDepth = request_depth( + Request, ?DEFAULT_BLOCK_DEPTH, Opts), ?event(copycat_short, - {indexing_blocks, {from, From}, {to, To}, {depth, TargetDepth}} + {indexing_blocks, + {from, From}, {to, To}, + {depth, TargetDepth}} ), fetch_blocks(From, To, TargetDepth, Opts) end From b98280df967970e0808c653aff9fdacefef1cc5f Mon Sep 17 00:00:00 2001 From: speeddragon Date: Tue, 28 Apr 2026 14:42:34 +0100 Subject: [PATCH 12/68] fix: Catch non UTF8 tags error --- src/preloaded/query/dev_copycat_arweave.erl | 56 ++++++++++++++------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 99da3aa4d..f5d4e0031 100644 --- 
a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -391,24 +391,33 @@ process_l1_request(TXID, Request, Opts) -> observe_copycat_l1_stage( <<"l1_request_total">>, fun() -> - maybe - {ok, OwnerFilters} ?= parse_owner_filter(Request, Opts), - {ok, IncludeTag} ?= parse_tag_filter(<<"include-tag">>, Request, Opts), - {ok, ExcludeTag} ?= parse_tag_filter(<<"exclude-tag">>, Request, Opts), - {ok, - maybe_process_l1_tx( - TXID, - OwnerFilters#{ - include_tag => IncludeTag, - exclude_tag => ExcludeTag - }, - Depth, - QueryL1Offset, - Opts - )} - else - {error, _} = Error -> - Error + try + maybe + {ok, OwnerFilters} ?= parse_owner_filter(Request, Opts), + {ok, IncludeTag} ?= parse_tag_filter(<<"include-tag">>, Request, Opts), + {ok, ExcludeTag} ?= parse_tag_filter(<<"exclude-tag">>, Request, Opts), + {ok, + maybe_process_l1_tx( + TXID, + OwnerFilters#{ + include_tag => IncludeTag, + exclude_tag => ExcludeTag + }, + Depth, + QueryL1Offset, + Opts + )} + else + {error, _} = Error -> + Error + end + catch + _:Reason:Stacktrace -> + ?event(copycat_short, + {error, + {reason, Reason}, + {stacktrace, Stacktrace}}), + {error, Reason} end end ). 
@@ -1020,7 +1029,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, end), #{ <<"index-store">> := ParentStore } = IndexStore, write_parent(TX#tx.id, BlockHeight, block, ParentStore), - case is_bundle_tx(TX, Opts) of + try is_bundle_tx(TX, Opts) of false -> #{items_count => 0, bundle_count => 0, skipped_count => 0, achieved_depth => max(2, TargetDepth)}; @@ -1086,6 +1095,14 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, #{items_count => 0, bundle_count => 1, skipped_count => 1, achieved_depth => 0} end + catch + _:Reason:Stacktrace -> + ?event(copycat_short, + {arweave_bundle_skipped, + {tx, {explicit, TX#tx.id}}, + {reason, Reason}, + {stacktrace, Stacktrace}}), + #{items_count => 0, bundle_count => 0, skipped_count => 1, achieved_depth => 0} end. download_bundle_header(EndOffset, Size, Opts) -> @@ -1600,6 +1617,7 @@ validate_and_flag_item_id(ItemBinary, DeclaredID, EncodedDeclaredID, Store) -> end. %% @doc Check whether a TX header indicates bundle content. +%% NOTE: This function can throw if transaction tags aren't properly formated is_bundle_tx(TX, _Opts) -> ar_tx:type(TX) =/= binary. 
From aa182127911ebc43971e3ead68d718afaa9920a4 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Wed, 29 Apr 2026 01:12:54 +0100 Subject: [PATCH 13/68] impr: Missing try catch on non UTF8 tags --- src/preloaded/query/dev_copycat_arweave.erl | 26 +++++++++++++++------ 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index f5d4e0031..af51bd66a 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -1034,13 +1034,25 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, #{items_count => 0, bundle_count => 0, skipped_count => 0, achieved_depth => max(2, TargetDepth)}; true when TargetDepth > 2 -> - L1Result = process_l1_tx_direct( - TXStartOffset, TX#tx.data_size, - TargetDepth - 1, IndexStore, TXID, TX#tx.id, Opts), - L1Result#{ - achieved_depth => - max(2, maps:get(achieved_depth, L1Result, 0)) - }; + %% Retry to perseve bundle count + try + L1Result = process_l1_tx_direct( + TXStartOffset, TX#tx.data_size, + TargetDepth - 1, IndexStore, TXID, TX#tx.id, Opts), + L1Result#{ + achieved_depth => + max(2, maps:get(achieved_depth, L1Result, 0)) + } + catch + _:Reason:Stacktrace -> + ?event(copycat_short, + {arweave_bundle_skipped, + {tx, {explicit, TX#tx.id}}, + {reason, Reason}, + {stacktrace, Stacktrace}}), + #{items_count => 0, bundle_count => 1, + skipped_count => 1, achieved_depth => 0} + end; true -> % Lightweight processing of block transactions to depth 2. 
We % can avoid loading the full L1 TX data into memory, and instead From 2af34c172f07058b67db838360110ffc0a31cb35 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 1 May 2026 10:55:18 +0100 Subject: [PATCH 14/68] impr: signed not needed --- src/hb_copycat_budget.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hb_copycat_budget.erl b/src/hb_copycat_budget.erl index dff471b58..2c98edbb7 100644 --- a/src/hb_copycat_budget.erl +++ b/src/hb_copycat_budget.erl @@ -28,7 +28,7 @@ init_with_lock(Budget) -> try case persistent_term:get(?PERSISTENT_KEY, undefined) of undefined -> - Ref = atomics:new(4, [{signed, true}]), + Ref = atomics:new(4, [{signed, false}]), atomics:put(Ref, ?IDX_BUDGET, Budget), persistent_term:put(?PERSISTENT_KEY, Ref); _AlreadySet -> @@ -58,7 +58,7 @@ await_init(Budget) -> end. reset(Budget) when is_integer(Budget), Budget > 0 -> - Ref = atomics:new(4, [{signed, true}]), + Ref = atomics:new(4, [{signed, false}]), atomics:put(Ref, ?IDX_BUDGET, Budget), persistent_term:put(?PERSISTENT_KEY, Ref), ok. From 36eb361bbccdf1a3071051f70909189bb5d24da2 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 1 May 2026 10:59:42 +0100 Subject: [PATCH 15/68] impr: Add overlay_count metrics from store LMDB to see if server is writting faster than OS is writting in disk --- src/core/monitor/hb_prometheus.erl | 13 ++++++-- src/core/store/hb_store_lmdb.erl | 52 +++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/src/core/monitor/hb_prometheus.erl b/src/core/monitor/hb_prometheus.erl index f4917da77..9f8bb4383 100644 --- a/src/core/monitor/hb_prometheus.erl +++ b/src/core/monitor/hb_prometheus.erl @@ -1,7 +1,7 @@ %%% @doc HyperBEAM wrapper for Prometheus metrics. -module(hb_prometheus). -export([ensure_started/0, declare/2, measure_and_report/2, measure_and_report/3]). --export([observe/2, observe/3, inc/2, inc/3, inc/4, dec/2, dec/3, dec/4]). 
+-export([observe/2, observe/3, inc/2, inc/3, inc/4, dec/2, dec/3, dec/4, set/4]). -define(STARTED_CACHE_KEY, {?MODULE, started}). %% @doc Ensure the Prometheus application has been started. Caches startup @@ -118,4 +118,13 @@ dec(Type, Metrics, Labels, Value) -> end. do_dec(gauge, Name, Labels, Value) -> - prometheus_gauge:dec(Name, Labels, Value). \ No newline at end of file + prometheus_gauge:dec(Name, Labels, Value). + +set(gauge, Name, Labels, Value) -> + case ensure_started() of + ok -> + try prometheus_gauge:set(Name, Labels, Value) + catch error:mfa_already_exists -> ok + end; + _ -> ok + end. \ No newline at end of file diff --git a/src/core/store/hb_store_lmdb.erl b/src/core/store/hb_store_lmdb.erl index a5b649ffc..0d6e67e77 100644 --- a/src/core/store/hb_store_lmdb.erl +++ b/src/core/store/hb_store_lmdb.erl @@ -23,6 +23,7 @@ -export([start/3, stop/3, scope/0, scope/1, reset/3]). -export([read/3, write/3, list/3, match/3]). -export([group/3, link/3, type/3, resolve/3]). +-export([overlay_count/1]). %% Test framework and project includes -include_lib("eunit/include/eunit.hrl"). 
@@ -62,9 +63,12 @@ start(Opts = #{ <<"name">> := DataDir }, _Req, _NodeOpts) -> batch_size, hb_util:int(maps:get(<<"batch-size">>, Opts, ?DEFAULT_BATCH_SIZE)) }, - no_mem_init, - no_sync + no_mem_init ] ++ + case maps:get(<<"sync">>, Opts, false) of + true -> []; + false -> [no_sync] + end ++ case maps:get(<<"read-ahead">>, Opts, true) of true -> []; false -> [no_readahead] @@ -84,7 +88,11 @@ start(Opts = #{ <<"name">> := DataDir }, _Req, _NodeOpts) -> % Create the LMDB environment with specified size limit {ok, Env} = elmdb:env_open(DataDirPath, EnvOpts), {ok, DBInstance} = elmdb:db_open(Env, [create]), - {ok, #{ <<"env">> => Env, <<"db">> => DBInstance }}; + SyncInterval = hb_util:int(maps:get(<<"sync-interval">>, Opts, 0)), + MonitorPid = spawn(fun() -> + overlay_monitor_loop(Env, DBInstance, DataDir, SyncInterval, 0) + end), + {ok, #{ <<"env">> => Env, <<"db">> => DBInstance, <<"monitor">> => MonitorPid }}; start(_Store, _Req, _NodeOpts) -> {error, {badarg, <<"StoreOpts must be a map">>}}. @@ -547,8 +555,19 @@ resolve(Opts, #{ <<"resolve">> := Path }, _NodeOpts) -> %% @doc Retrieve or create the LMDB environment handle for a database. find_env(Opts) -> hb_store:find(Opts). +%% @doc Return the number of writes currently pending in the elmdb overlay. +%% Safe to call on any live database — does not trigger any I/O. +-spec overlay_count(map()) -> non_neg_integer(). +overlay_count(Opts) -> + #{ <<"db">> := DB } = find_env(Opts), + elmdb:overlay_count(DB). + %% Shutdown LMDB environment and cleanup resources -stop(#{ <<"store-module">> := ?MODULE, <<"name">> := DataDir }, _Req, _Opts) -> +stop(#{ <<"store-module">> := ?MODULE, <<"name">> := DataDir } = StoreOpts, _Req, _Opts) -> + case maps:get(<<"monitor">>, StoreOpts, undefined) of + undefined -> ok; + Pid -> exit(Pid, shutdown) + end, % Soft-close by name; refs stay valid and reopen lazily on next access. 
catch elmdb:env_close_by_name(hb_util:list(DataDir)), ok; @@ -593,6 +612,26 @@ sample_metrics(Name, StartTime, Type) -> miss -> ok end. +%% @doc Periodically samples overlay_count and reports it to Prometheus. +%% When sync-interval > 0, also calls env_sync every that many seconds, +%% decoupling durability from the per-commit flush worker path. +overlay_monitor_loop(Env, DBInstance, StoreName, SyncInterval, SecondsSinceSync) -> + receive + stop -> ok + after 1000 -> + Count = elmdb:overlay_count(DBInstance), + hb_prometheus:set(gauge, hb_store_lmdb_overlay_count, [StoreName], Count), + NextSecondsSinceSync = + case SyncInterval > 0 andalso SecondsSinceSync + 1 >= SyncInterval of + true -> + elmdb:env_sync(Env), + 0; + false -> + SecondsSinceSync + 1 + end, + overlay_monitor_loop(Env, DBInstance, StoreName, SyncInterval, NextSecondsSinceSync) + end. + init_prometheus() -> hb_prometheus:declare(histogram, [ {name, hb_store_lmdb_duration_seconds}, @@ -605,6 +644,11 @@ init_prometheus() -> {labels, [name]}, {help, "LMDB name requested"} ]), + hb_prometheus:declare(gauge, [ + {name, hb_store_lmdb_overlay_count}, + {labels, [store_name]}, + {help, "Number of writes pending in the elmdb overlay for each store"} + ]), ok. %% @doc Test suite demonstrating basic store operations. 
From 2cc56d9a9adf5bc70fd6f561a35a04ae4d187032 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 1 May 2026 13:16:19 +0100 Subject: [PATCH 16/68] impr: Check if write fails and throw fast in index write cases --- src/preloaded/query/dev_copycat_arweave.erl | 178 +++++++++----------- 1 file changed, 79 insertions(+), 99 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index af51bd66a..d6648e442 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -205,73 +205,56 @@ is_block_indexed(Height, TargetDepth, Opts) -> %% no items at that level), plus any partial depths beyond AchievedDepth %% that were collected during indexing. write_block_item_ids(Height, AchievedDepth, ItemIDs, Opts) -> - case hb_store_arweave:store_from_opts(Opts) of - no_store -> ok; - #{ <<"index-store">> := Store } -> - MaxStoredDepth = case maps:keys(ItemIDs) of - [] -> AchievedDepth; - Keys -> max(AchievedDepth, lists:max(Keys)) - end, - Results = lists:map( - fun(D) -> - IDs = maps:get(D, ItemIDs, []), - Bin = encode_item_ids(IDs), - hb_store:write( - Store, - block_items_path(Height, D), - Bin - ) - end, - lists:seq(1, MaxStoredDepth) - ), - case lists:all(fun(R) -> R =:= ok end, Results) of - true -> ok; - false -> - ?event(copycat_short, - {block_item_ids_write_failed, - {height, Height}}), - {error, item_ids_write_failed} - end - end. - -%% @doc Write a block completion marker with the achieved depth. 
-mark_block_indexed(Height, Depth, Opts) -> - case hb_store_arweave:store_from_opts(Opts) of - no_store -> ok; - #{ <<"index-store">> := Store } -> + Store = get_index_store(Opts), + MaxStoredDepth = case maps:keys(ItemIDs) of + [] -> AchievedDepth; + Keys -> max(AchievedDepth, lists:max(Keys)) + end, + Results = lists:map( + fun(D) -> + IDs = maps:get(D, ItemIDs, []), + Bin = encode_item_ids(IDs), hb_store:write( Store, - block_indexed_path(Height), - integer_to_binary(Depth) + block_items_path(Height, D), + Bin ) + end, + lists:seq(1, MaxStoredDepth) + ), + case lists:all(fun(R) -> R =:= ok end, Results) of + true -> ok; + false -> + ?event(copycat_short, + {block_item_ids_write_failed, + {height, Height}}), + {error, item_ids_write_failed} end. +%% @doc Write a block completion marker with the achieved depth. +mark_block_indexed(Height, Depth, Opts) -> + Store = get_index_store(Opts), + hb_store:write( + Store, + block_indexed_path(Height), + integer_to_binary(Depth) + ). + %% @doc Read the persisted cutover height from the index store. read_cutover_height(Opts) -> - case hb_store_arweave:store_from_opts(Opts) of - no_store -> undefined; - #{ <<"index-store">> := Store } -> - case hb_store:read(Store, ?CUTOVER_KEY) of - {ok, Bin} -> hb_util:int(Bin); - not_found -> undefined - end + Store = get_index_store(Opts), + case hb_store:read(Store, ?CUTOVER_KEY) of + {ok, Bin} -> hb_util:int(Bin); + not_found -> undefined end. %% @doc Write the cutover height if not already set. 
ensure_cutover_height(Height, Opts) -> case read_cutover_height(Opts) of undefined -> - case hb_store_arweave:store_from_opts(Opts) of - no_store -> ok; - #{ <<"index-store">> := Store } -> - hb_store:write( - Store, ?CUTOVER_KEY, hb_util:bin(Height)), - ?event(copycat_short, - {marker_cutover_initialized, - {height, Height} - } - ) - end; + Store = get_index_store(Opts), + hb_store:write(Store, ?CUTOVER_KEY, hb_util:bin(Height)), + ?event(copycat_short, {marker_cutover_initialized, {height, Height}}); _ -> ok end. @@ -560,13 +543,10 @@ latest_height(Opts) -> %% @doc Check if a transaction ID is indexed in the arweave index store. is_tx_indexed(TXID, Opts) -> - case hb_store_arweave:store_from_opts(Opts) of - no_store -> false; - #{ <<"index-store">> := Store } -> - case hb_store:read(Store, hb_store_arweave_offset:path(TXID), Opts) of - {ok, _} -> true; - {error, not_found} -> false - end + Store = get_index_store(Opts), + case hb_store:read(Store, hb_store_arweave_offset:path(TXID), Opts) of + {ok, _} -> true; + not_found -> false end. %% @doc List indexed blocks and transactions in the given range. @@ -597,7 +577,7 @@ list_index_blocks(Current, To, Opts, Acc) -> [] -> list_index_blocks(Current - 1, To, Opts, Acc); _ -> - {IndexedTXs, NotIndexedTXs} = classify_txs(TXIDs, Opts), + {IndexedTXs, _NotIndexedTXs} = classify_txs(TXIDs, Opts), case IndexedTXs of [] -> % Do not include blocks with no locally indexed TXs. @@ -632,15 +612,11 @@ assemble_block_info(Height, Block, Opts) -> <<"indexed">> => IndexedTXs, <<"not-indexed">> => NotIndexedTXs }, - case read_block_depth(Height, Opts) of + case read_block_marker_depth(Height, Opts) of undefined -> Base; Depth -> Base#{<<"depth">> => Depth} end. -%% @doc Read the achieved depth from a block marker. -read_block_depth(Height, Opts) -> - read_block_marker_depth(Height, Opts). - %% @doc Probe item entries upward from depth 1, applying TransformFun to each. 
probe_block_items(Height, Opts, TransformFun) -> case hb_store_arweave:store_from_opts(Opts) of @@ -696,7 +672,7 @@ inventory_index(From, To, Opts) -> inventory_local(Current, To, _Opts, Acc) when Current < To -> Acc; inventory_local(Current, To, Opts, Acc) -> - case read_block_depth(Current, Opts) of + case read_block_marker_depth(Current, Opts) of undefined -> inventory_local(Current - 1, To, Opts, Acc); Depth -> @@ -932,15 +908,18 @@ process_block(BlockRes, Current, To, TargetDepth, Opts) -> {height, Current}, {target, To} } - ); - _ -> + ), + throw(item_ids_write_failed); + Error -> ?event( copycat_short, {arweave_block_marker_failed, {height, Current}, - {target, To} + {target, To}, + {error, Error} } - ) + ), + throw({writing_to_index_store, Error}) end end; {error, _} = Error -> @@ -1009,7 +988,7 @@ process_block_tx({{padding, _PaddingRoot}, _EndOffset}, _BlockStartOffset, Targe #{items_count => 0, bundle_count => 0, skipped_count => 0, achieved_depth => max(2, TargetDepth)}; process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, BlockHeight, Opts) -> - IndexStore = hb_store_arweave:store_from_opts(Opts), + ArweaveStore = hb_store_arweave:store_from_opts(Opts), TXID = hb_util:encode(TX#tx.id), TXEndOffset = BlockStartOffset + EndOffset, TXStartOffset = TXEndOffset - TX#tx.data_size, @@ -1018,17 +997,17 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, {offset, TXStartOffset}, {size, TX#tx.data_size} }), - observe_event(<<"item_indexed">>, fun() -> + ok = observe_event(<<"item_indexed">>, fun() -> hb_store_arweave:write_offset( - IndexStore, + ArweaveStore, TXID, <<"tx@1.0">>, TXStartOffset, TX#tx.data_size ) end), - #{ <<"index-store">> := ParentStore } = IndexStore, - write_parent(TX#tx.id, BlockHeight, block, ParentStore), + #{ <<"index-store">> := IndexStore } = ArweaveStore, + ok = write_parent(TX#tx.id, BlockHeight, block, IndexStore), try is_bundle_tx(TX, Opts) of false -> #{items_count => 0, 
bundle_count => 0, skipped_count => 0, @@ -1038,7 +1017,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, try L1Result = process_l1_tx_direct( TXStartOffset, TX#tx.data_size, - TargetDepth - 1, IndexStore, TXID, TX#tx.id, Opts), + TargetDepth - 1, ArweaveStore, TXID, TX#tx.id, Opts), L1Result#{ achieved_depth => max(2, maps:get(achieved_depth, L1Result, 0)) @@ -1071,14 +1050,14 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, {TotalTime, {_, ItemsCount}} = timer:tc(fun() -> lists:foldl( fun({ItemID, Size}, {ItemStartOffset, ItemsCountAcc}) -> - hb_store_arweave:write_offset( - IndexStore, + ok = hb_store_arweave:write_offset( + ArweaveStore, hb_util:encode(ItemID), <<"ans104@1.0">>, ItemStartOffset, Size ), - write_parent(ItemID, TX#tx.id, bundle, ParentStore), + ok = write_parent(ItemID, TX#tx.id, bundle, IndexStore), {ItemStartOffset + Size, ItemsCountAcc + 1} end, {TXStartOffset + HeaderSize, 0}, @@ -1514,7 +1493,7 @@ index_full_bundle_items( ItemsBin, ItemStartOffset, Depth, - Store, + #{ <<"index-store">> := IndexStore } = Store, ParentID, Opts, Count, @@ -1525,16 +1504,15 @@ index_full_bundle_items( ItemBinary = binary:part(ItemsBin, 0, Size), EncodedItemID = hb_util:encode(ItemID), ParseResult = validate_and_flag_item_id( - ItemBinary, ItemID, EncodedItemID, Store), - hb_store_arweave:write_offset( + ItemBinary, ItemID, EncodedItemID, IndexStore), + ok = hb_store_arweave:write_offset( Store, EncodedItemID, <<"ans104@1.0">>, ItemStartOffset, Size ), - #{ <<"index-store">> := IdxStore } = Store, - write_parent(ItemID, ParentID, bundle, IdxStore), + ok = write_parent(ItemID, ParentID, bundle, IndexStore), {DescendantCount, ItemAchievedDepth, ChildIDs} = case {Depth > 1, ParseResult} of {true, {ok, HeaderSize, ParsedItem}} -> @@ -1594,7 +1572,7 @@ index_full_bundle_descendants_parsed( %% header. Returns {ok, HeaderSize, ParsedItem} on successful parse, or %% error if deserialization fails. 
Mismatch flags are written but don't %% prevent the item from being indexed. -validate_and_flag_item_id(ItemBinary, DeclaredID, EncodedDeclaredID, Store) -> +validate_and_flag_item_id(ItemBinary, DeclaredID, EncodedDeclaredID, IndexStore) -> try ar_bundles:deserialize_header(ItemBinary) of {ok, HeaderSize, ParsedItem} -> ComputedID = crypto:hash(sha256, ParsedItem#tx.signature), @@ -1602,16 +1580,12 @@ validate_and_flag_item_id(ItemBinary, DeclaredID, EncodedDeclaredID, Store) -> true -> ok; false -> - case Store of - #{ <<"index-store">> := IndexStore } -> - hb_store:write( - IndexStore, - hb_store_arweave_offset:mismatch_path( - DeclaredID), - ComputedID - ); - _ -> ok - end, + ok = hb_store:write( + IndexStore, + hb_store_arweave_offset:mismatch_path( + DeclaredID), + ComputedID + ), ?event(copycat_short, {item_id_mismatch, {declared_id, {explicit, EncodedDeclaredID}}, @@ -1716,6 +1690,12 @@ observe_copycat_l1_stage(MetricName, Fun) -> record_copycat_l1_metrics(MetricName, 1, Time), Result. +get_index_store(Opts) -> + case hb_store_arweave:store_from_opts(Opts) of + #{ <<"index-store">> := Store } -> Store; + _ -> throw(no_index_store_available) + end. + %%% Tests index_ids_test_parallel() -> @@ -2931,8 +2911,8 @@ fabricated_mismatch_test() -> RealID = crypto:hash(sha256, Item#tx.signature), FakeID = crypto:strong_rand_bytes(32), EncodedFakeID = hb_util:encode(FakeID), - validate_and_flag_item_id(ItemBinary, FakeID, EncodedFakeID, StoreOpts), #{ <<"index-store">> := IndexStore } = StoreOpts, + validate_and_flag_item_id(ItemBinary, FakeID, EncodedFakeID, IndexStore), {ok, StoredActualID} = hb_store:read( IndexStore, @@ -3284,4 +3264,4 @@ index_of(Elem, List) -> index_of(Elem, List, 1). index_of(_Elem, [], _N) -> not_found; index_of(Elem, [Elem | _], N) -> N; -index_of(Elem, [_ | Rest], N) -> index_of(Elem, Rest, N + 1). +index_of(Elem, [_ | Rest], N) -> index_of(Elem, Rest, N + 1). 
\ No newline at end of file From a5d40a330ae992000ec750d1b96fcc0441a3c2e2 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 1 May 2026 14:38:14 +0100 Subject: [PATCH 17/68] impr: Add a max retry limit to hb_copycat_budget --- src/hb_copycat_budget.erl | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/hb_copycat_budget.erl b/src/hb_copycat_budget.erl index 2c98edbb7..c449814ac 100644 --- a/src/hb_copycat_budget.erl +++ b/src/hb_copycat_budget.erl @@ -4,6 +4,7 @@ -module(hb_copycat_budget). -export([ensure_started/1, reset/1, lease/1, release/1, get_budget/0, stats/0]). -include_lib("eunit/include/eunit.hrl"). +-include("include/hb.hrl"). -define(PERSISTENT_KEY, hb_copycat_budget). -define(IDX_LEASED, 1). @@ -11,6 +12,7 @@ -define(IDX_BUDGET, 3). -define(IDX_RETRIES, 4). -define(RETRY_SLEEP_MS, 50). +-define(LEASE_LOOP_MAX_RETRIES, 100). -define(INIT_LOCK, hb_copycat_budget_init). @@ -65,23 +67,30 @@ reset(Budget) when is_integer(Budget), Budget > 0 -> lease(Size) when is_integer(Size), Size > 0 -> Ref = persistent_term:get(?PERSISTENT_KEY), - lease_loop(Ref, Size). - -lease_loop(Ref, Size) -> + lease_loop(Ref, Size, 0). + +lease_loop(Ref, Size, ?LEASE_LOOP_MAX_RETRIES) -> + ?event(error, + {lease_loop_max_retries_exhausted, + {ref, Ref}, + {size, Size}, + {max_retries, ?LEASE_LOOP_MAX_RETRIES}}), + throw(exhausted_lease_loop_max_retires); +lease_loop(Ref, Size, Retries) -> Current = atomics:get(Ref, ?IDX_LEASED), Budget = atomics:get(Ref, ?IDX_BUDGET), case Current + Size > Budget of true -> atomics:add(Ref, ?IDX_RETRIES, 1), timer:sleep(?RETRY_SLEEP_MS), - lease_loop(Ref, Size); + lease_loop(Ref, Size, Retries + 1); false -> case atomics:compare_exchange(Ref, ?IDX_LEASED, Current, Current + Size) of ok -> update_peak(Ref, Current + Size), ok; _Changed -> - lease_loop(Ref, Size) + lease_loop(Ref, Size, Retries + 1) end end. 
From 833b6af2aafba1838179f34ee199abf7638c42ad Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 1 May 2026 14:38:29 +0100 Subject: [PATCH 18/68] impr: Remove dead code --- src/preloaded/query/dev_copycat_arweave.erl | 22 --------------------- 1 file changed, 22 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index d6648e442..eb1dd7f6d 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -1101,28 +1101,6 @@ download_bundle_header(EndOffset, Size, Opts) -> dev_arweave:bundle_header(EndOffset - Size, Opts) end). -header_chunk(invalid_bundle_header, _FirstChunk, _StartOffset, _Opts) -> - {error, invalid_bundle_header}; -header_chunk(HeaderSize, FirstChunk, _StartOffset, _Opts) - when HeaderSize =< byte_size(FirstChunk) -> - {ok, FirstChunk}; -header_chunk(HeaderSize, FirstChunk, StartOffset, Opts) -> - Res = - hb_ao:resolve( - << - ?ARWEAVE_DEVICE/binary, - "/chunk&offset=", - (hb_util:bin(StartOffset + byte_size(FirstChunk)))/binary, - "&length=", - (hb_util:bin(HeaderSize - byte_size(FirstChunk)))/binary - >>, - Opts - ), - case Res of - {ok, OtherChunks} -> {ok, <>}; - Other -> Other - end. - %% @doc Process transactions: spawn workers and manage the worker pool. %% This function processes transactions in parallel using parallel_map. %% When arweave_index_workers <= 1, processes sequentially (one worker at a time). 
From a54b6fb701f2a42ecb70d29a03430f0c2f99b772 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 1 May 2026 15:34:19 +0100 Subject: [PATCH 19/68] impr: Remove memory_safe_cap (redundant), to only use copycat_memory_budget --- src/core/resolver/hb_opts.erl | 2 +- src/preloaded/query/dev_copycat_arweave.erl | 47 +++------------------ 2 files changed, 7 insertions(+), 42 deletions(-) diff --git a/src/core/resolver/hb_opts.erl b/src/core/resolver/hb_opts.erl index d22d4c576..6b9ee9633 100644 --- a/src/core/resolver/hb_opts.erl +++ b/src/core/resolver/hb_opts.erl @@ -1263,4 +1263,4 @@ ensure_node_history_test() -> ] }, ?assertEqual({error, invalid_values}, ensure_node_history(InvalidItems, RequiredOpts)). --endif. +-endif. \ No newline at end of file diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index eb1dd7f6d..40e55e504 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -7,7 +7,7 @@ -module(dev_copycat_arweave). -device_libraries([lib_arweave_common]). -export([arweave/3]). --export([add_owner_alias/3, resolve_owner_alias/2, set_memory_safe_cap/2, get_memory_safe_cap/1, set_depth_recursion_cap/2, get_depth_recursion_cap/1]). +-export([add_owner_alias/3, resolve_owner_alias/2, set_depth_recursion_cap/2, get_depth_recursion_cap/1]). -include_lib("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). @@ -20,6 +20,7 @@ % Note: this means that the children of L2 bundles are not indexed at % depth 2. -define(DEFAULT_BLOCK_DEPTH, 2). +-define(DEFAULT_COPYCAT_MEMORY_BUDGET, 6 * 1024 * 1024 * 1024). % GET /~cron@1.0/once&cron-path=~copycat@1.0/arweave @@ -82,10 +83,6 @@ arweave(_Base, Request, Opts) -> {error, <<"Unsupported mode `", (hb_util:bin(Mode))/binary, "`. Supported modes are: write, list, inventory">>} end. -%% @doc Set safe memory resource allocation cap for the in-memory -%% bundle processing. in bytes. 
-set_memory_safe_cap(Cap, Opts) when is_integer(Cap), Cap > 0 -> - Opts#{copycat_memory_cap => Cap}. %% @doc Set bundles descendant recursion cap, avoids recursion %% in very nested bundles (very rare). set_depth_recursion_cap(Cap, Opts) when is_integer(Cap), Cap > 0 -> @@ -93,23 +90,14 @@ set_depth_recursion_cap(Cap, Opts) when is_integer(Cap), Cap > 0 -> %% @doc Get the set depth recursion cap from hb_opts. get_depth_recursion_cap(Opts) -> hb_opts:get(copycat_depth_recursion_cap, undefined, Opts). -%% @doc Get the L1 TX data size that gets handled in-memory -%% from hb_opts. -get_memory_safe_cap(Opts) -> - hb_opts:get(copycat_memory_cap, undefined, Opts). %% @doc Return the effective per-TX memory cap, clamped to the global budget. %% Lazily initializes the budget pool on first call. effective_memory_cap(Opts) -> Budget = hb_opts:get( - copycat_memory_budget, 6 * 1024 * 1024 * 1024, Opts), + copycat_memory_budget, ?DEFAULT_COPYCAT_MEMORY_BUDGET, Opts), hb_copycat_budget:ensure_started(Budget), - PoolSize = hb_copycat_budget:get_budget(), - Cap = get_memory_safe_cap(Opts), - case Cap of - undefined -> PoolSize; - _ -> min(Cap, PoolSize) - end. + hb_copycat_budget:get_budget(). %% @doc Return the store path for a block completion marker. block_indexed_path(Height) -> @@ -362,9 +350,8 @@ parse_tag_filter(Key, Request, Opts) -> %% @doc Process the `id=...` copycat path for an already indexed L1 TX. %% applies L1-level owner/tag filters on the lightweight TX header first, then, %% if the TX passes and is a bundle, loads the full L1 payload once and indexes -%% descendants in-memory (under the configured copycat_memory_cap) up to the -%% requested safe depth (defaults to full recursion till the set -%% copycat_depth_recursion_cap). +%% descendants in-memory up to the requested safe depth (defaults to full recursion +%% till the set copycat_depth_recursion_cap). 
process_l1_request(TXID, Request, Opts) -> Depth = request_depth(Request, <<"safe_max">>, Opts), QueryL1Offset = @@ -2412,13 +2399,6 @@ request_depth_clamping_test() -> ?assertEqual(6, request_depth(#{}, <<"safe_max">>, #{})), ok. -memory_cap_setter_getter_test() -> - {_TestStore, _StoreOpts, Opts0} = setup_index_opts(), - ?assertEqual(6 * 1024 * 1024 * 1024, get_memory_safe_cap(Opts0)), - Opts1 = set_memory_safe_cap(1024, Opts0), - ?assertEqual(1024, get_memory_safe_cap(Opts1)), - ok. - id_depth_1_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), {Block, TXID} = {1827942, <<"T2pluNnaavL7-S2GkO_m3pASLUqMH_XQ9IiIhZKfySs">>}, @@ -3039,21 +3019,6 @@ corrupt_item_ids_read_test() -> ?assertEqual(<<"corrupt">>, maps:get(<<"2">>, IDs)), ok. -memory_cap_depth3_floors_to_2_test() -> - {_TestStore, _StoreOpts, Opts} = setup_index_opts(), - CappedOpts = set_memory_safe_cap(1, Opts), - Block = 1827942, - {ok, Block} = - hb_ao:resolve( - <<"~copycat@1.0/arweave&from=", - (hb_util:bin(Block))/binary, "&to=", - (hb_util:bin(Block))/binary, "&depth=3">>, - CappedOpts - ), - ?assert(is_block_indexed(Block, 2, CappedOpts)), - ?assertNot(is_block_indexed(Block, 3, CappedOpts)), - ok. 
- parent_encode_decode_test() -> BlockEntry = encode_parent_entry(12345, block), ?assertEqual(<<0, 12345:64/big-unsigned>>, BlockEntry), From 9a3544096913977e78782ae3ad5e7ee8ca13a013 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Sun, 3 May 2026 19:09:22 +0100 Subject: [PATCH 20/68] impr: Do not return not_found where there is an error in dev_arweave:parent --- src/preloaded/arweave/dev_arweave.erl | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index d0738fe5b..e40bf32f2 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -787,10 +787,19 @@ parent(Base, Request, Opts) -> not_found -> {error, not_found} catch - error:function_clause -> - {error, not_found}; - error:badarg -> - {error, not_found} + error:Reason:Stacktrace -> + ?event(error, + {parent_read_error, + {id, ID}, + {reason, Reason}, + {stacktrace, Stacktrace} + }), + {failure, + #{ + <<"status">> => 500, + <<"type">> => <<"parent_read_error">> + } + } end end. From 8f31fc87afda56447f57ce7056f0a3ca6b764050 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Sun, 3 May 2026 19:16:10 +0100 Subject: [PATCH 21/68] impr: Re-add warning before exit on hb_event --- src/core/monitor/hb_event.erl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/core/monitor/hb_event.erl b/src/core/monitor/hb_event.erl index ae991d980..36e343eb2 100644 --- a/src/core/monitor/hb_event.erl +++ b/src/core/monitor/hb_event.erl @@ -409,6 +409,14 @@ check_overload(Last, N) -> % we can be restarted by the next caller. 
case MemorySize of MemorySize when MemorySize > ?MAX_MEMORY -> + ?debug_print( + {error, + prometheus_event_queue_terminating_on_memory_overload, + {queue, Len}, + {memory_bytes, MemorySize}, + {last_event, Last} + } + ), exit(memory_overload); _ -> no_action end; From bbe47a18e70fd124184f987f77ecab4d6fcac62b Mon Sep 17 00:00:00 2001 From: speeddragon Date: Mon, 4 May 2026 14:55:10 +0100 Subject: [PATCH 22/68] impr: Minor fixes --- src/preloaded/query/dev_copycat_arweave.erl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 40e55e504..5c50e3a03 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -7,7 +7,7 @@ -module(dev_copycat_arweave). -device_libraries([lib_arweave_common]). -export([arweave/3]). --export([add_owner_alias/3, resolve_owner_alias/2, set_depth_recursion_cap/2, get_depth_recursion_cap/1]). +-export([set_depth_recursion_cap/2, get_depth_recursion_cap/1]). -include_lib("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). @@ -475,7 +475,7 @@ has_tag_pair(#tx{tags = Tags}, #{name := Name, value := Value}) -> LowerValue -> true; _ -> false end -end; + end; has_tag_pair(_, _) -> false. %% @doc Parse the range from the request. 
@@ -1000,7 +1000,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, #{items_count => 0, bundle_count => 0, skipped_count => 0, achieved_depth => max(2, TargetDepth)}; true when TargetDepth > 2 -> - %% Retry to perseve bundle count + %% Retry to preserve bundle count try L1Result = process_l1_tx_direct( TXStartOffset, TX#tx.data_size, @@ -1745,7 +1745,7 @@ block_depth_3_test() -> <<"~copycat@1.0/arweave&from=1827942&to=1827942&depth=3">>, Opts ), - % L3 item read when doing depth=2 + % L3 item read when doing depth=3 assert_item_read( <<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, Opts), From d16ae50064e61ba350e24fc69bb0d690ddaaa7a4 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Mon, 4 May 2026 15:22:06 +0100 Subject: [PATCH 23/68] impr: stop monitor in hb_store_lmdb --- src/core/store/hb_store_lmdb.erl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/core/store/hb_store_lmdb.erl b/src/core/store/hb_store_lmdb.erl index 0d6e67e77..8f3ad4cbd 100644 --- a/src/core/store/hb_store_lmdb.erl +++ b/src/core/store/hb_store_lmdb.erl @@ -566,7 +566,15 @@ overlay_count(Opts) -> stop(#{ <<"store-module">> := ?MODULE, <<"name">> := DataDir } = StoreOpts, _Req, _Opts) -> case maps:get(<<"monitor">>, StoreOpts, undefined) of undefined -> ok; - Pid -> exit(Pid, shutdown) + Pid -> + Ref = erlang:monitor(process, Pid), + exit(Pid, shutdown), + receive + {'DOWN', Ref, process, Pid, _Reason} -> ok + after 5000 -> + erlang:demonitor(Ref, [flush]), + ok + end end, % Soft-close by name; refs stay valid and reopen lazily on next access. 
catch elmdb:env_close_by_name(hb_util:list(DataDir)), From 63703d43dfc1ee854643d2971cc595e450865b21 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Mon, 4 May 2026 20:36:44 +0100 Subject: [PATCH 24/68] impr: Fix hb_store new structure for previous changes --- src/core/store/hb_store.erl | 8 ++-- src/core/store/hb_store_arweave.erl | 6 +-- src/preloaded/query/dev_copycat_arweave.erl | 51 +++++++++++---------- 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/src/core/store/hb_store.erl b/src/core/store/hb_store.erl index 1ad92d85b..cc1a2141e 100644 --- a/src/core/store/hb_store.erl +++ b/src/core/store/hb_store.erl @@ -92,9 +92,9 @@ behavior_info(callbacks) -> %% @doc Store access policies to function names. -define(STORE_ACCESS_POLICIES, #{ - <<"read">> => [read, resolve, list, type, match] ++ ?COMMON_POLICIES, - <<"write">> => [write, link, group, reset] ++ ?COMMON_POLICIES, - <<"admin">> => [reset] ++ ?COMMON_POLICIES + <<"read">> => [read, resolve, list, type, match, scope, start, stop], + <<"write">> => [write, link, group, reset, scope, start, stop], + <<"admin">> => [start, stop, reset, scope] }). %%% Store named terms registry functions. @@ -561,8 +561,6 @@ start_one(Store = #{ <<"store-module">> := Mod }, Req, Opts) -> end. call_store_start(Mod, Store, Req, Opts) -> - %% function_exported doesn't load the module. We need to call ensure_loaded - %% here since is the first time we call a function to load the module. code:ensure_loaded(Mod), case erlang:function_exported(Mod, start, 3) of true -> Mod:start(Store, Req, Opts); diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 7d94be78a..06aa01044 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -31,9 +31,9 @@ first_arweave_store( first_arweave_store([_ | Rest]) -> first_arweave_store(Rest). %% @doc Start the Arweave store, and the downstream associated index store. 
-start(#{<<"index-store">> := IndexStore}, _Req, _Opts) -> +start(#{<<"index-store">> := IndexStore}, Req, Opts) -> init_prometheus(), - hb_store:start(IndexStore). + hb_store:start(IndexStore, Req, Opts). %% @doc Although the index is local, loading an item via the index will make %% requests to a remote node, so we define the scope as remote. @@ -458,4 +458,4 @@ load_item_deserialize_throws_test() -> ProbeOffset = 376836336327208, Size = 4096, ok = write_offset(Opts, FakeID, <<"ans104@1.0">>, ProbeOffset - 1, Size), - ?assertMatch({error, _}, read(Opts, FakeID)). + ?assertMatch({error, _}, read(Opts, FakeID, #{})). diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 5c50e3a03..958a32d29 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -170,12 +170,12 @@ read_block_marker_depth(Height, Opts) -> case hb_store_arweave:store_from_opts(Opts) of no_store -> undefined; #{ <<"index-store">> := Store } -> - case hb_store:read(Store, block_indexed_path(Height)) of + case hb_store:read(Store, block_indexed_path(Height), Opts) of {ok, Bin} -> try binary_to_integer(Bin) catch _:_ -> undefined end; - not_found -> undefined + {error, not_found} -> undefined end end. @@ -231,9 +231,9 @@ mark_block_indexed(Height, Depth, Opts) -> %% @doc Read the persisted cutover height from the index store. read_cutover_height(Opts) -> Store = get_index_store(Opts), - case hb_store:read(Store, ?CUTOVER_KEY) of + case hb_store:read(Store, ?CUTOVER_KEY, Opts) of {ok, Bin} -> hb_util:int(Bin); - not_found -> undefined + {error, not_found} -> undefined end. %% @doc Write the cutover height if not already set. @@ -533,7 +533,7 @@ is_tx_indexed(TXID, Opts) -> Store = get_index_store(Opts), case hb_store:read(Store, hb_store_arweave_offset:path(TXID), Opts) of {ok, _} -> true; - not_found -> false + {error, not_found} -> false end. 
%% @doc List indexed blocks and transactions in the given range. @@ -609,17 +609,17 @@ probe_block_items(Height, Opts, TransformFun) -> case hb_store_arweave:store_from_opts(Opts) of no_store -> #{}; #{ <<"index-store">> := Store } -> - probe_block_items(Height, Store, 1, #{}, TransformFun) + probe_block_items(Height, Store, 1, #{}, TransformFun, Opts) end. -probe_block_items(Height, Store, Depth, Acc, TransformFun) -> - case hb_store:read(Store, block_items_path(Height, Depth)) of +probe_block_items(Height, Store, Depth, Acc, TransformFun, Opts) -> + case hb_store:read(Store, block_items_path(Height, Depth), Opts) of {ok, Bin} -> Key = hb_util:bin(Depth), probe_block_items( Height, Store, Depth + 1, - Acc#{Key => TransformFun(Bin)}, TransformFun); - not_found -> + Acc#{Key => TransformFun(Bin)}, TransformFun, Opts); + {error, not_found} -> Acc end. @@ -2798,11 +2798,11 @@ small_block_depth_3_test() -> ), ?assert(is_block_indexed(Block, 3, Opts)), #{ <<"index-store">> := Store } = hb_store_arweave:store_from_opts(Opts), - {ok, L1Bin} = hb_store:read(Store, block_items_path(Block, 1)), + {ok, L1Bin} = hb_store:read(Store, block_items_path(Block, 1), Opts), ?assert(length(decode_item_ids(L1Bin)) > 0), - {ok, L2Bin} = hb_store:read(Store, block_items_path(Block, 2)), + {ok, L2Bin} = hb_store:read(Store, block_items_path(Block, 2), Opts), ?assert(length(decode_item_ids(L2Bin)) > 0), - {ok, L3Bin} = hb_store:read(Store, block_items_path(Block, 3)), + {ok, L3Bin} = hb_store:read(Store, block_items_path(Block, 3), Opts), L3IDs = decode_item_ids(L3Bin), ?assertEqual(3, length(L3IDs)), assert_item_read( @@ -2827,7 +2827,8 @@ no_mismatch_flags_on_valid_bundles_test() -> not_found, hb_store:read( IndexStore, - hb_store_arweave_offset:mismatch_path(ItemID) + hb_store_arweave_offset:mismatch_path(ItemID), + Opts ) ), ok. 
@@ -2851,13 +2852,13 @@ exact_marker_depth_test() -> #{ <<"index-store">> := Store } = hb_store_arweave:store_from_opts(Opts), {ok, StoredBin} = - hb_store:read(Store, block_indexed_path(Block)), + hb_store:read(Store, block_indexed_path(Block), Opts), StoredDepth = binary_to_integer(StoredBin), ?assertEqual(3, StoredDepth), ok. fabricated_mismatch_test() -> - {_TestStore, StoreOpts, _Opts} = setup_index_opts(), + {_TestStore, StoreOpts, Opts} = setup_index_opts(), {Priv, Pub} = ar_wallet:new(), Target = crypto:strong_rand_bytes(32), Anchor = crypto:strong_rand_bytes(32), @@ -2874,14 +2875,16 @@ fabricated_mismatch_test() -> {ok, StoredActualID} = hb_store:read( IndexStore, - hb_store_arweave_offset:mismatch_path(FakeID) + hb_store_arweave_offset:mismatch_path(FakeID), + Opts ), ?assertEqual(RealID, StoredActualID), ?assertEqual( not_found, hb_store:read( IndexStore, - hb_store_arweave_offset:mismatch_path(RealID) + hb_store_arweave_offset:mismatch_path(RealID), + Opts ) ), ok. @@ -2894,10 +2897,10 @@ block_item_ids_depth_2_test() -> Opts ), #{ <<"index-store">> := Store } = hb_store_arweave:store_from_opts(Opts), - {ok, L1Bin} = hb_store:read(Store, block_items_path(1827942, 1)), + {ok, L1Bin} = hb_store:read(Store, block_items_path(1827942, 1), Opts), L1IDs = decode_item_ids(L1Bin), ?assert(length(L1IDs) > 0), - {ok, L2Bin} = hb_store:read(Store, block_items_path(1827942, 2)), + {ok, L2Bin} = hb_store:read(Store, block_items_path(1827942, 2), Opts), L2IDs = decode_item_ids(L2Bin), ?assert(length(L2IDs) > 0), L2Encoded = [hb_util:encode(ID) || ID <- L2IDs], @@ -2906,7 +2909,7 @@ block_item_ids_depth_2_test() -> ?assert(is_integer(Pos54K)), ?assert(is_integer(PosOBK)), ?assert(Pos54K < PosOBK), - ?assertEqual(not_found, hb_store:read(Store, block_items_path(1827942, 3))), + ?assertEqual(not_found, hb_store:read(Store, block_items_path(1827942, 3), Opts)), ok. 
block_item_ids_depth_3_test() -> @@ -2917,13 +2920,13 @@ block_item_ids_depth_3_test() -> Opts ), #{ <<"index-store">> := Store } = hb_store_arweave:store_from_opts(Opts), - {ok, L1Bin} = hb_store:read(Store, block_items_path(1827942, 1)), + {ok, L1Bin} = hb_store:read(Store, block_items_path(1827942, 1), Opts), L1Count = length(decode_item_ids(L1Bin)), ?assertEqual(5, L1Count), - {ok, L2Bin} = hb_store:read(Store, block_items_path(1827942, 2)), + {ok, L2Bin} = hb_store:read(Store, block_items_path(1827942, 2), Opts), L2Count = length(decode_item_ids(L2Bin)), ?assert(L2Count > 0), - {ok, L3Bin} = hb_store:read(Store, block_items_path(1827942, 3)), + {ok, L3Bin} = hb_store:read(Store, block_items_path(1827942, 3), Opts), L3Count = length(decode_item_ids(L3Bin)), ?assert(L3Count >= 1), L3IDs = decode_item_ids(L3Bin), From e18879a958dd69863cf3e00951867c11689ccf28 Mon Sep 17 00:00:00 2001 From: Sam Williams Date: Sun, 12 Apr 2026 23:51:40 -0400 Subject: [PATCH 25/68] impr: document (subfork) offset index upgrade, allowing `relative` refs --- src/core/store/hb_store_arweave_offset.erl | 34 +++++++++++++++------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/src/core/store/hb_store_arweave_offset.erl b/src/core/store/hb_store_arweave_offset.erl index 1913645d5..8c0a906e3 100644 --- a/src/core/store/hb_store_arweave_offset.erl +++ b/src/core/store/hb_store_arweave_offset.erl @@ -1,22 +1,36 @@ %%% @doc Succinct encoding and decoding for Arweave data offset indexing. %%% Arweave data items are extremely numerous (>25,000,000,000 as of Feb 2026), and %%% as such small optimizations to the encoding of their offsets have a significant -%%% effect. For exampple, a single byte sized in the encoding at time of writing +%%% effect. For example, a single byte sized in the encoding at time of writing %%% saves ~25 GB of storage. 
%%% -%%% The encoding is as follows: -%%% << Version:4, Codec:4, StartOffset:64, Length/binary >> +%%% Version 1 of the encoding is as follows: +%%% Encoded ::= MempoolTX | RelativeRef | ConfirmedMessage +%%% MempoolTX ::= << Version:4, 0:4 >> +%%% RelativeRef ::= << Version:4, Codec:4, RELATIVE:64, ParentID:256, Range >> +%%% ConfirmedMessage ::= << Version:4, Codec:4, Range >> +%%% Range ::= << Offset:64, Length:unsigned-variable-length-integer >> %%% where: %%% - Version: 4-bit unsigned integer. Max: 15. Current: version `1`. -%%% - Codec: 4-bit unsigned integer. Max: 15. -%%% - StartOffset: 64-bit uint. Max: 2^64-1. -%%% - Length: unsigned variable-length integer. +%%% - Codec: 4-bit unsigned integer. Max: 15. Registry included below. +%%% - Offset: 64-bit uint. Max: 2^64-1. +%%% - RELATIVE: An atom, expressing that the offset is relative to the start +%%% of another transaction, rather than the start of the Arweave global +%%% address space. Always expressed as 2^64-1. +%%% - ParentID: The ID of a parent message for a relative offset, 256-bit uint. +%%% - Length: big-endian unsigned variable-length integer. +%%% - MempoolTX: Always << 1:4, 0: 4>>, indicating the version and that the +%%% key refers to an Arweave transaction that is not yet confirmed. +%%% - RelativeRef: A reference to an offset inside an unconfirmed Arweave +%%% transaction, yet to receive a global offset. +%%% - ConfirmedMessage: A message (any codec) that has been confirmed and has +%%% received a global offset. %%% -%%% Codecs: +%%% Codec Registry: %%% - 0: `tx@1.0`: An Arweave transaction. -%%% - 1: [Reserved for ANS-102: The initial JSON data item format.] +%%% - 1: `ans102@1.0`: The initial JSON data item format. %%% - 2: `~ans104@1.0`: Binary data items. -%%% - 3: [Reserved for `~httpsig@1.0`: RFC-9421 compatible HTTP signed messages.] +%%% - 3: `~httpsig@1.0`: RFC-9421 compatible HTTP signed messages. 
%%% %%% Codec indexes should, in general, be sorted by the time of their first write %%% to Arweave: Arweave TXs as 0, ANS-102 as 1, ANS-104 as 2, etc. @@ -24,7 +38,6 @@ %%% All `length` values are read by decoding all of the remaining bytes in the %%% offset encoding as an unsigned big-endian integer. This allows the length %%% to contract to only the number of bytes actually necessary to represent it. -%%% -module(hb_store_arweave_offset). -export([encode/3, decode/1, path/1, mismatch_path/1]). -include("include/hb.hrl"). @@ -33,6 +46,7 @@ -define(IN_BIT_RANGE(X, Bits), (X >= 0 andalso X < (1 bsl Bits))). -define(OFFSET_SZ, (8*8)). % 64-bit uint. Max: 2^64-1. +-define(OFFSET_MAX, ((1 bsl ?OFFSET_SZ) - 1)). -define(FORMAT_VERSION, 1). % 4-bit uint. Max: 15. %% @doc Reserved for future use. At the present time, store containing offsets are From 22ab8623ab4139ff274ecb8f44c8981c9b462c74 Mon Sep 17 00:00:00 2001 From: Sam Williams Date: Mon, 13 Apr 2026 02:00:36 -0400 Subject: [PATCH 26/68] wip: impl improved offset indexing format; tidy Arweave data access routes --- src/core/store/hb_store_arweave.erl | 160 ++++++++++----------- src/core/store/hb_store_arweave_offset.erl | 156 +++++++++++++------- src/preloaded/arweave/dev_arweave.erl | 74 ++++++---- 3 files changed, 232 insertions(+), 158 deletions(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 06aa01044..3b711673b 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -81,12 +81,11 @@ read_offset(StoreOpts = #{ <<"index-store">> := IndexStore }, ID, Opts) -> ), case ReadRes of {ok, OffsetBinary} -> - {Version, CodecName, StartOffset, Length} = + {CodecName, Offset, Length} = hb_store_arweave_offset:decode(OffsetBinary), {ok, #{ - <<"version">> => Version, <<"codec-device">> => CodecName, - <<"start-offset">> => StartOffset, + <<"offset">> => Offset, <<"length">> => Length }}; _ -> @@ -145,16 +144,19 @@ do_read(StoreOpts, ID, 
Opts) -> case read_offset(StoreOpts, ID, Opts) of {ok, #{ - <<"version">> := Version, - <<"codec-device">> := CodecName, - <<"start-offset">> := StartOffset, + <<"codec-device">> := Codec, + <<"offset">> := Offset, <<"length">> := Length - }} -> + } + } -> Loaded = - case CodecName of - <<"ans104@1.0">> -> load_item(ID, StartOffset, Length, Opts); - <<"tx@1.0">> -> load_tx(ID, StartOffset, Length, Opts) - end, + load_message( + Codec, + ID, + root_offset(Offset, StoreOpts), + Length, + StoreOpts + ), case Loaded of {ok, Message} -> hb_store_remote_node:maybe_cache(StoreOpts, Message), @@ -162,55 +164,93 @@ do_read(StoreOpts, ID, Opts) -> arweave_offsets, {read_ok, {id, {string, ID}}, - {format_version, Version}, - {type, CodecName}, - {start_offset, StartOffset}, + {codec, Codec}, + {offset, Offset}, {length, Length} } ), - record_partition_metric(StartOffset, ok, Opts), + record_partition_metric(Offset, ok, StoreOpts), Loaded; {error, Reason} -> ?event( arweave_offsets, {read_chunks_not_found, {id, {string, ID}}, - {format_version, Version}, - {type, CodecName}, - {start_offset, StartOffset}, + {codec, Codec}, + {offset, Offset}, {length, Length}, {reason, Reason} } ), - record_partition_metric(StartOffset, not_found, Opts), + record_partition_metric(Offset, not_found, StoreOpts), if Reason =:= not_found -> not_found; true -> {error, Reason} end end; not_found -> - ?event( - arweave_offsets, - {miss, {id, {explicit, ID}}} - ), + ?event(arweave_offsets, {miss, {id, {explicit, ID}}}), not_found end. +%% @doc Takes a `read_offset/2' result and returns it, normalized to the +%% outer-most root that is known: Either the mempool or a global byte offset. +root_offset(relative, _Store) -> relative; +root_offset(GlobalOffset, _Store) when is_integer(GlobalOffset) -> GlobalOffset; +root_offset(Offset, Store) -> root_offset(Offset, 0, Store). 
+root_offset(#{ <<"relative">> := P, <<"offset">> := Off }, Acc, Store) -> + case read_offset(Store, P) of + {ok, Next = #{ <<"relative">> := _, <<"offset">> := _ }} -> + % We have another relative offset. Continue. + root_offset(Next, Acc + Off, Store); + {ok, relative} -> + % We have reached an unconfirmed TX as the root of the relative offset + % chain, so we return an offset against that. + #{ <<"relative">> => P, <<"offset">> => Acc + Off }; + {ok, GlobalOffset} when is_integer(GlobalOffset) -> + % We have reached a confirmed TX as the root of the relative offset + % chain, so we return a global offset. + GlobalOffset + Acc + Off; + _ -> + % The result was unknown, so we total accumulator and current offset + % and return it with the `relative` key intact. + #{ <<"relative">> => P, <<"offset">> => Acc + Off } + end; +root_offset(Other, _, _) -> Other. + +%% @doc Load a TX from Arweave. Supports either confirmed or pending TXs. +load_message(<<"tx@1.0">>, ID, Type, _Length, Opts) -> + % Determine the correct path to hit to load the TX. Confirmed TXs require + % `tx=ID`, while pending TXs require `pending=ID`. + PathKeys = + if Type =:= relative -> #{ <<"path">> => <<"pending">>, <<"pending">> => ID }; + true -> #{ <<"path">> => <<"tx">>, <<"tx">> => ID } + end, + hb_prometheus:measure_and_report( + fun() -> + hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9">> }, + PathKeys#{ <<"exclude-data">> => false }, + Opts + ) + end, + hb_store_arweave_chunk_fetch_duration_seconds, + [load_tx] + ); %% @doc Load an ANS-104 item from the given start offset and length. -%% Returns an `ok' tuple with the deserialized item, or an `error' tuple with -%% the reason. The `StartOffset` is the precise starting byte of the item _header_, +%% The `StartOffset` is the precise starting byte of the item _header_, %% not the data segment. The `Length` covers the full size of the item, including %% header. 
The `ExpectedID` is verified against the deserialized item's ID to %% guard against stale offsets (e.g. after a reorg). -load_item(ExpectedID, StartOffset, Length, Opts) -> +load_message(<<"ans104@1.0">>, ID, Offset, Length, Opts) -> hb_prometheus:measure_and_report( fun() -> - case read_chunks(StartOffset, Length, Opts) of + case read_chunks(Offset, Length, Opts) of {ok, SerializedItem} -> try Item = ar_bundles:deserialize(SerializedItem), case hb_util:encode(Item#tx.id) of - ExpectedID -> + ID -> {ok, hb_message:convert( Item, <<"structured@1.0">>, @@ -219,16 +259,14 @@ load_item(ExpectedID, StartOffset, Length, Opts) -> )}; ActualID -> ?event(error, {load_item, {id_mismatch}}), - {error, - {id_mismatch, - ExpectedID, ActualID}} + {error, {id_mismatch, ID, ActualID}} end catch _:Reason:Stacktrace -> %% Due to malformed encoding, attempt to deserialize %% can throw. ?event(error, {load_item, - {expected_id, ExpectedID}, + {expected_id, ID}, {reason, Reason}, {stacktrace, Stacktrace} }), @@ -243,61 +281,21 @@ load_item(ExpectedID, StartOffset, Length, Opts) -> [load_item] ). -%% @doc Load a TX from the given start offset and length. The `StartOffset' is -%% the start of the first chunk of the data and runs for the length of the data -%% segment, ignoring header size. 
-load_tx(ID, StartOffset, Length, Opts) -> - hb_prometheus:measure_and_report( - fun() -> - {ok, StructuredTXHeader} = hb_ao:resolve( - #{ <<"device">> => <<"arweave@2.9">> }, - #{ - <<"path">> => <<"tx">>, - <<"tx">> => ID, - <<"exclude-data">> => true - }, - Opts - ), - TXHeader = - hb_message:convert( - StructuredTXHeader, - <<"tx@1.0">>, - <<"structured@1.0">>, - Opts - ), - case Length of - 0 -> - {ok, hb_message:convert( - TXHeader, - <<"structured@1.0">>, - <<"tx@1.0">>, - Opts)}; - _ -> - case read_chunks(StartOffset, Length, Opts) of - {ok, Data} -> - {ok, hb_message:convert( - TXHeader#tx{data = Data}, - <<"structured@1.0">>, - <<"tx@1.0">>, - Opts - )}; - {error, Reason} -> - {error, Reason} - end - end - end, - hb_store_arweave_chunk_fetch_duration_seconds, - [load_tx] - ). - %% @doc Read the chunks from the given start offset and length using the %% `~arweave@2.9` device. -read_chunks(StartOffset, Length, Opts) -> +read_chunks(Offset, Length, Opts) -> hb_ao:resolve( #{ <<"device">> => <<"arweave@2.9">> }, #{ <<"path">> => <<"chunk">>, - <<"offset">> => StartOffset + 1, + <<"offset">> => + % TODO: The rationale for this seems to be that Arweave offsets + % start at the last byte of the previous chunk. It is unclear + % whether it is wise to apply this offset here, or perhaps it + % should be applied in the device key itself. + if is_integer(Offset) -> Offset + 1; + true -> Offset + end, <<"length">> => Length }, Opts diff --git a/src/core/store/hb_store_arweave_offset.erl b/src/core/store/hb_store_arweave_offset.erl index 8c0a906e3..e030e8769 100644 --- a/src/core/store/hb_store_arweave_offset.erl +++ b/src/core/store/hb_store_arweave_offset.erl @@ -3,7 +3,7 @@ %%% as such small optimizations to the encoding of their offsets have a significant %%% effect. For example, a single byte sized in the encoding at time of writing %%% saves ~25 GB of storage. 
-%%% +%%% %%% Version 1 of the encoding is as follows: %%% Encoded ::= MempoolTX | RelativeRef | ConfirmedMessage %%% MempoolTX ::= << Version:4, 0:4 >> @@ -15,7 +15,7 @@ %%% - Codec: 4-bit unsigned integer. Max: 15. Registry included below. %%% - Offset: 64-bit uint. Max: 2^64-1. %%% - RELATIVE: An atom, expressing that the offset is relative to the start -%%% of another transaction, rather than the start of the Arweave global +%%% of another transaction, rather than the start of the Arweave global %%% address space. Always expressed as 2^64-1. %%% - ParentID: The ID of a parent message for a relative offset, 256-bit uint. %%% - Length: big-endian unsigned variable-length integer. @@ -25,34 +25,31 @@ %%% transaction, yet to receive a global offset. %%% - ConfirmedMessage: A message (any codec) that has been confirmed and has %%% received a global offset. -%%% +%%% %%% Codec Registry: %%% - 0: `tx@1.0`: An Arweave transaction. %%% - 1: `ans102@1.0`: The initial JSON data item format. %%% - 2: `~ans104@1.0`: Binary data items. %%% - 3: `~httpsig@1.0`: RFC-9421 compatible HTTP signed messages. -%%% +%%% %%% Codec indexes should, in general, be sorted by the time of their first write %%% to Arweave: Arweave TXs as 0, ANS-102 as 1, ANS-104 as 2, etc. -%%% -%%% All `length` values are read by decoding all of the remaining bytes in the +%%% +%%% All `length` values are read by decoding all of the remaining bytes in the %%% offset encoding as an unsigned big-endian integer. This allows the length %%% to contract to only the number of bytes actually necessary to represent it. -module(hb_store_arweave_offset). -export([encode/3, decode/1, path/1, mismatch_path/1]). -include("include/hb.hrl"). +-include_lib("eunit/include/eunit.hrl"). -%% @doc Determine if a value is within a given unsigned bit range. --define(IN_BIT_RANGE(X, Bits), (X >= 0 andalso X < (1 bsl Bits))). +-define(IN_BIT_RANGE(X, Bits), (is_integer(X) andalso X >= 0 andalso X < (1 bsl Bits))). 
--define(OFFSET_SZ, (8*8)). % 64-bit uint. Max: 2^64-1. +-define(OFFSET_SZ, (8*8)). -define(OFFSET_MAX, ((1 bsl ?OFFSET_SZ) - 1)). --define(FORMAT_VERSION, 1). % 4-bit uint. Max: 15. +-define(FORMAT_VERSION, 1). +-define(MEMPOOL_TX, <>). -%% @doc Reserved for future use. At the present time, store containing offsets are -%% expected to be utilized only as sub-stores to a `hb_store_arweave' store. As -%% as consequence, the path is simply the ID of the data item, with the prefix -%% of `~arweave@2.9/offset/` implied. path(ID) when ?IS_ID(ID) -> hb_util:native_id(ID); path(ID) -> throw({cannot_encode_path, ID}). @@ -60,51 +57,112 @@ mismatch_path(ID) when ?IS_ID(ID) -> <<"mismatch/", (hb_util:native_id(ID))/binary>>; mismatch_path(ID) -> throw({cannot_encode_mismatch_path, ID}). -%% @doc Encode the offset of the data if it is valid. Throws `cannot_encode_offset' -%% if invalid. -encode(Type, StartOffset, Length) - when - (Type == true orelse Type == false orelse is_binary(Type)) - andalso ?IN_BIT_RANGE(StartOffset, ?OFFSET_SZ*8) - andalso is_integer(Length) andalso Length >= 0 - -> +%% @doc Encode an offset entry. +%% MempoolTX: a single byte when the key refers to an unconfirmed TX. +encode(<<"tx@1.0">>, relative, _Length) -> + ?MEMPOOL_TX; +%% RelativeRef: sentinel offset + parent ID + range. +encode(Codec, #{ <<"relative">> := ParentID, <<"offset">> := RelOffset }, Length) + when is_binary(Codec) andalso ?IS_ID(ParentID) + andalso ?IN_BIT_RANGE(RelOffset, ?OFFSET_SZ) + andalso is_integer(Length) andalso Length >= 0 -> << - (encode_format(Type))/binary, + (encode_format(Codec))/binary, + ?OFFSET_MAX:?OFFSET_SZ, + (hb_util:native_id(ParentID))/binary, + RelOffset:?OFFSET_SZ, + (binary:encode_unsigned(Length))/binary + >>; +%% ConfirmedMessage: global offset + length. 
+encode(Codec, StartOffset, Length) + when is_binary(Codec) + andalso is_integer(StartOffset) + andalso ?IN_BIT_RANGE(StartOffset, ?OFFSET_SZ) + andalso is_integer(Length) andalso Length >= 0 -> + << + (encode_format(Codec))/binary, StartOffset:?OFFSET_SZ, (binary:encode_unsigned(Length))/binary >>; -encode(IsTX, StartOffset, Length) -> - throw({cannot_encode_offset, {IsTX, StartOffset, Length}}). +encode(Codec, Offset, Length) -> + throw({cannot_encode_offset, {Codec, Offset, Length}}). -decode(<>) -> - {Version, CodecName} = decode_format(Format), - {Version, CodecName, StartOffset, binary:decode_unsigned(Length)}; +%% @doc Decode an offset entry. +decode(?MEMPOOL_TX) -> + % MempoolTX: exactly one byte, version 1, codec tx@1.0. + {<<"tx@1.0">>, relative, 0}; +decode(<>) -> + % RelativeRef: `RELATIVE` atom in the offset field signals a parent-relative ref. + {_, Codec} = decode_format(Fmt), + { + Codec, + #{ + <<"relative">> => hb_util:encode(ParentID), + <<"offset">> => RelOffset + }, + binary:decode_unsigned(Length) + }; +decode(<>) -> + % ConfirmedMessage: global offset. + {_, Codec} = decode_format(Fmt), + {Codec, Offset, binary:decode_unsigned(Length)}; decode(Binary) -> throw({cannot_decode_offset, Binary}). -%% @doc Encode the type of the data. -encode_type(<<"tx@1.0">>) -> 0; -encode_type(<<"ans102@1.0">>) -> 1; -encode_type(<<"ans104@1.0">>) -> 2; -encode_type(<<"httpsig@1.0">>) -> 3; -encode_type(Type) -> throw({cannot_encode_type, Type}). +encode_codec(<<"tx@1.0">>) -> 0; +encode_codec(<<"ans102@1.0">>) -> 1; +encode_codec(<<"ans104@1.0">>) -> 2; +encode_codec(<<"httpsig@1.0">>) -> 3; +encode_codec(Codec) -> throw({cannot_encode_codec, Codec}). -%% @doc Decode the type of the data to a binary codec name. -decode_type(0) -> <<"tx@1.0">>; -decode_type(1) -> <<"ans102@1.0">>; -decode_type(2) -> <<"ans104@1.0">>; -decode_type(3) -> <<"httpsig@1.0">>; -decode_type(Type) -> throw({cannot_decode_type, Type}). 
+decode_codec(0) -> <<"tx@1.0">>; +decode_codec(1) -> <<"ans102@1.0">>; +decode_codec(2) -> <<"ans104@1.0">>; +decode_codec(3) -> <<"httpsig@1.0">>; +decode_codec(Codec) -> throw({cannot_decode_codec, Codec}). -%% @doc Encode the format of the offset. See the module documentation for the -%% present index of supported codecs. encode_format(CodecName) -> - << ?FORMAT_VERSION:4, (encode_type(CodecName)):4 >>; -encode_format(CodecName) -> - throw({cannot_encode_format, CodecName}). + <>. -%% @doc Decode the format of the offset. -decode_format(<>) -> - {FormatVersion, decode_type(CodecName)}; +decode_format(<<_Version:4, CodecName:4>>) -> + {?FORMAT_VERSION, decode_codec(CodecName)}; decode_format(Binary) -> - throw({cannot_decode_format, Binary}). \ No newline at end of file + throw({cannot_decode_format, Binary}). + +%%% Tests + +confirmed_round_trip_test() -> + Encoded = encode(<<"tx@1.0">>, 12345, 678), + ?assertEqual({<<"tx@1.0">>, 12345, 678}, decode(Encoded)). + +mempool_tx_round_trip_test() -> + Encoded = encode(<<"tx@1.0">>, relative, 0), + ?assertEqual(1, byte_size(Encoded)), + ?assertEqual({<<"tx@1.0">>, relative, 0}, decode(Encoded)). + +relative_ref_round_trip_test() -> + ParentID = hb_util:encode(crypto:strong_rand_bytes(32)), + Encoded = + encode(<<"ans104@1.0">>, + #{ <<"relative">> => ParentID, <<"offset">> => 321 }, + 654 + ), + ?assertEqual( + { + <<"ans104@1.0">>, + #{ <<"relative">> => ParentID, <<"offset">> => 321 }, + 654 + }, + decode(Encoded) + ). + +relative_ref_zero_offset_round_trip_test() -> + ParentID = hb_util:encode(crypto:strong_rand_bytes(32)), + Encoded = + encode( + <<"ans104@1.0">>, + #{ <<"relative">> => ParentID, <<"offset">> => 0 }, + 100 + ), + ?assertMatch({<<"ans104@1.0">>, #{ <<"offset">> := 0 }, 100}, decode(Encoded)). 
\ No newline at end of file diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index e40bf32f2..a826bf4ed 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -175,7 +175,7 @@ head_raw(Base, Request, Opts) -> {ok, #{ <<"codec-device">> := CodecDevice, - <<"start-offset">> := StartOffset, + <<"offset">> := StartOffset, <<"length">> := Length }} -> CodecFun = @@ -404,22 +404,47 @@ post_chunk(Request, Opts) -> %% global Arweave data tree, or relative to the start of a specific pending %% transaction. get_chunk(_Base, Request, Opts) -> - Offset = hb_util:int(hb_maps:get(<<"offset">>, Request, 0, Opts)), - Length = hb_util:int(hb_maps:get(<<"length">>, Request, 1, Opts)), - MaybeRelativeTXID = hb_maps:get(<<"pending">>, Request, undefined, Opts), + {ok, Offset, Length, MaybeRelativeTXID} = extract_chunk_params(Request, Opts), case fetch_chunk_range(Offset, Length, MaybeRelativeTXID, Opts) of {ok, Chunks} -> - Data = iolist_to_binary(Chunks), - case hb_maps:is_key(<<"length">>, Request, Opts) of - true -> - {ok, binary:part(Data, 0, min(Length, byte_size(Data)))}; - false -> - {ok, Data} + Data = hb_util:bin(Chunks), + case Length of + undefined -> + {ok, Data}; + Length -> + { + ok, + binary:part(Data, 0, min(hb_util:int(Length), byte_size(Data))) + } end; {error, Reason} -> {error, Reason} end. +%% @doc Extract the parameters from a chunk request. Supports both global offsets +%% and relative offset+parent ID pairs. 
+extract_chunk_params(Request, Opts) -> + Length = hb_maps:get(<<"length">>, Request, undefined, Opts), + case hb_maps:find(<<"offset">>, Request, Opts) of + {ok, RelativeInfo} when is_map(RelativeInfo) -> + {ok, RelativeOffset} = hb_maps:find(<<"offset">>, RelativeInfo, Opts), + {ok, RelativeTXID} = hb_maps:find(<<"relative">>, RelativeInfo, Opts), + {ok, hb_util:int(RelativeOffset), Length, RelativeTXID}; + {ok, Offset} when is_integer(Offset) orelse is_binary(Offset) -> + { + ok, + hb_util:int(Offset), + Length, + hb_maps:get(<<"pending">>, Request, undefined, Opts) + } + end. + +%% @doc Fetch a range of chunks in parallel. Determines the appropriate algorithm +%% to use to get the chunks based on offset, length, and an optional relative +%% transaction ID. Notably, this function returns the binary for all of the +%% chunks that were fetched, not just the requested length. This allows callers +%% to avoid wasted additional requests in some circumstances, but also requires +%% them to handle truncation themselves. 
fetch_chunk_range(Offset, Length, undefined, Opts) when (Offset >= ?STRICT_DATA_SPLIT_THRESHOLD) andalso ((Offset + Length - 1) >= ?STRICT_DATA_SPLIT_THRESHOLD) -> @@ -1029,25 +1054,18 @@ to_tx_message(Type, ID, Path, {ok, #{ <<"body">> := Body }}, LogExtra, Opts) -> } ), {ok, Data} = - case hb_opts:get(exclude_data, false, Opts) of - true -> {ok, ?DEFAULT_DATA}; + case (TXHeader#tx.data_size == 0) orelse hb_opts:get(exclude_data, false, Opts) of + true -> {ok, <<>>}; false -> - DataRes = - case Type of - tx -> - request(<<"GET">>, <<"/raw/", ID/binary>>, Opts); - pending -> - get_chunk_range_relative( - 0, - TXHeader#tx.data_size, - ID, - Opts - ) - end, - case DataRes of - {ok, RawData} -> {ok, RawData}; - {error, not_found} -> {ok, ?DEFAULT_DATA}; - Error -> Error + case Type of + tx -> request(<<"GET">>, <<"/raw/", ID/binary>>, Opts); + pending -> + get_chunk_range_relative( + 0, + TXHeader#tx.data_size, + ID, + Opts + ) end end, { From cc80fe131a94fdd5d5c2efc74968e6f85b891588 Mon Sep 17 00:00:00 2001 From: Sam Williams Date: Mon, 13 Apr 2026 02:39:38 -0400 Subject: [PATCH 27/68] wip: progress towards `GET /raw` on pending IDs --- src/preloaded/arweave/dev_arweave.erl | 56 ++++++++++++++++++--------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index a826bf4ed..9a0f516be 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -175,9 +175,10 @@ head_raw(Base, Request, Opts) -> {ok, #{ <<"codec-device">> := CodecDevice, - <<"offset">> := StartOffset, + <<"offset">> := RawOffset, <<"length">> := Length }} -> + StartOffset = hb_store_arweave:root_offset(RawOffset, Opts), CodecFun = case CodecDevice of <<"ans104@1.0">> -> fun head_raw_ans104/4; @@ -209,15 +210,14 @@ head_raw(Base, Request, Opts) -> %% @doc Arweave transaction headers are not part of the Arweave data tree, and %% thus we do not add their header bytes to the offset 
in order to read their %% data. -head_raw_tx(TXID, StartOffset, Length, Opts) -> +head_raw_tx(TXID, Offset, Length, Opts) -> + BaseReq = #{ <<"exclude-data">> => true }, {ok, StructuredTXHeader} = - get_tx( - #{ <<"tx">> => TXID }, - #{ <<"exclude-data">> => true }, - Opts - ), + if is_integer(Offset) -> get_tx(#{}, BaseReq#{ <<"tx">> => TXID }, Opts); + true -> pending(#{}, BaseReq#{ <<"pending">> => TXID }, Opts) + end, ContentType = - hb_ao:get( + hb_maps:get( <<"content-type">>, StructuredTXHeader, <<"application/octet-stream">>, @@ -226,21 +226,29 @@ head_raw_tx(TXID, StartOffset, Length, Opts) -> [<<"no-cache">>, <<"no-store">>] } ), - {ok, - #{ - <<"raw-id">> => TXID, - <<"offset">> => StartOffset, - <<"data-offset">> => StartOffset, - <<"content-type">> => ContentType, - <<"header-length">> => 0, - <<"content-length">> => Length, - <<"accept-ranges">> => <<"bytes">> - } - }. + {ok, #{ + <<"raw-id">> => TXID, + <<"offset">> => Offset, + <<"data-offset">> => Offset, + <<"content-type">> => ContentType, + <<"header-length">> => 0, + <<"content-length">> => Length, + <<"accept-ranges">> => <<"bytes">> + }}. %% @doc ANS-104 headers are stored as part of the global Arweave data tree, so %% so to read the data associated with their IDs, we must first read the header %% chunk, deserialize it, and offset our data read from its starting offset. 
+head_raw_ans104(TXID, Offset, Length, _Opts) when not is_integer(Offset) -> + {ok, #{ + <<"raw-id">> => TXID, + <<"offset">> => Offset, + <<"data-offset">> => Offset, + <<"content-type">> => <<"application/octet-stream">>, + <<"header-length">> => 0, + <<"content-length">> => Length, + <<"accept-ranges">> => <<"bytes">> + }}; head_raw_ans104(TXID, ArweaveOffset, Length, Opts) -> ?event(debug_raw, {head_raw_ans104, {txid, TXID}, {arweave_offset, ArweaveOffset}, {length, Length}}), HeaderReq = @@ -298,6 +306,16 @@ get_raw(Base, Request, Opts) -> case head_raw(Base, Request, Opts) of not_found -> {error, not_found}; Err = {error, _} -> Err; + {ok, + Header = #{ + <<"data-offset">> := DataOffset, + <<"content-length">> := ContentLength + } + } when not is_integer(DataOffset) -> + case hb_store_arweave:read_chunks(DataOffset, ContentLength, Opts) of + {ok, Data} -> {ok, Header#{ <<"body">> => Data }}; + Error -> Error + end; {ok, Header = #{ <<"raw-id">> := TXID, From f83e5536604f44db69d4a43d580ceffd858207ca Mon Sep 17 00:00:00 2001 From: Sam Williams Date: Mon, 13 Apr 2026 02:40:18 -0400 Subject: [PATCH 28/68] slop: Claude's draft of copycat on pending --- src/core/store/hb_store_arweave.erl | 3 +- src/preloaded/query/dev_copycat_arweave.erl | 88 +++++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 3b711673b..36e1afd20 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -340,7 +340,8 @@ record_partition_metric(Offset, Result, StoreOpts) when is_integer(Offset) -> end); false -> ok - end. + end; +record_partition_metric(_, _, _) -> ok. %% @doc Initialize the Prometheus metrics for the Arweave store. Executed on %% `start/1' of the store. 
diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 958a32d29..4bce17c2d 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -29,6 +29,8 @@ %% fetch blocks from the latest known block towards the Genesis block. arweave(_Base, Request, Opts) -> case hb_maps:get(<<"mode">>, Request, <<"write">>, Opts) of + <<"mempool">> -> + index_mempool(Request, Opts); <<"write">> -> case hb_maps:find(<<"id">>, Request, Opts) of {ok, TXID} -> @@ -1661,6 +1663,92 @@ get_index_store(Opts) -> _ -> throw(no_index_store_available) end. +%% @doc Scan the mempool and index any accessible unconfirmed TXs. +index_mempool(_Request, Opts) -> + case dev_arweave:pending(#{}, #{}, Opts) of + {ok, TXIDs} when is_list(TXIDs) -> + Results = parallel_map(TXIDs, + fun(TXID) -> index_mempool_tx(TXID, Opts) end, Opts), + Summary = lists:foldl(fun(R, Acc) -> + K = case R of + ok -> indexed; existing -> existing; + missing_data -> missing_data; _ -> failed + end, + Acc#{ K => maps:get(K, Acc) + 1 } + end, #{ indexed => 0, existing => 0, + missing_data => 0, failed => 0 }, Results), + ?event(copycat_short, {mempool_scan_completed, Summary}), + {ok, Summary}; + Error -> Error + end. + +index_mempool_tx(TXID, Opts) -> + case is_tx_indexed(TXID, Opts) of + true -> existing; + false -> + case dev_arweave:pending(#{}, #{ <<"pending">> => TXID }, Opts) of + {ok, StructuredTX} -> + TX = hb_message:convert(StructuredTX, + <<"tx@1.0">>, <<"structured@1.0">>, Opts), + case has_mempool_data(TX) of + true -> write_mempool_offsets(TXID, TX, Opts); + false -> missing_data + end; + _ -> failed + end + end. + +has_mempool_data(#tx{ data_size = 0 }) -> true; +has_mempool_data(#tx{ data = D, data_size = S }) + when is_binary(D) -> byte_size(D) =:= S; +has_mempool_data(_) -> false. 
+ +write_mempool_offsets(TXID, TX, Opts) -> + Store = hb_store_arweave:store_from_opts(Opts), + ok = hb_store_arweave:write_offset( + Store, TXID, <<"tx@1.0">>, relative, TX#tx.data_size), + write_mempool_children(Store, TXID, TX, Opts), + ok. + +write_mempool_children(Store, TXID, TX, Opts) -> + case is_bundle_tx(TX, Opts) of + true -> + try ar_bundles:decode_bundle_header(TX#tx.data) of + {ItemsBin, BundleIndex} -> + HeaderSize = byte_size(TX#tx.data) - byte_size(ItemsBin), + write_mempool_items(Store, TXID, BundleIndex, HeaderSize); + _ -> ok + catch _:_ -> ok + end; + false -> + case standalone_item_id(TX) of + {ok, ItemID} -> + Ref = #{ <<"relative">> => TXID, <<"offset">> => 0 }, + hb_store_arweave:write_offset( + Store, ItemID, <<"ans104@1.0">>, + Ref, TX#tx.data_size); + not_found -> ok + end + end. + +write_mempool_items(_Store, _TXID, [], _Offset) -> ok; +write_mempool_items(Store, TXID, [{ItemID, Size} | Rest], Offset) -> + Ref = #{ <<"relative">> => TXID, <<"offset">> => Offset }, + hb_store_arweave:write_offset( + Store, hb_util:encode(ItemID), <<"ans104@1.0">>, Ref, Size), + write_mempool_items(Store, TXID, Rest, Offset + Size). + +standalone_item_id(#tx{ data = Data }) when is_binary(Data), Data =/= <<>> -> + try + Item = ar_bundles:deserialize(Data), + case ar_bundles:verify_item(Item) of + true -> {ok, hb_util:encode(Item#tx.id)}; + false -> not_found + end + catch _:_ -> not_found + end; +standalone_item_id(_) -> not_found. 
+ %%% Tests index_ids_test_parallel() -> From f04e71b5a37710b8fecd4ed6111ea8e5155250d5 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Mon, 20 Apr 2026 19:49:29 +0200 Subject: [PATCH 29/68] fix: port restore chunk reads without explicit len fix from impr/ --- src/preloaded/arweave/dev_arweave.erl | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index 9a0f516be..77ac5688e 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -422,14 +422,15 @@ post_chunk(Request, Opts) -> %% global Arweave data tree, or relative to the start of a specific pending %% transaction. get_chunk(_Base, Request, Opts) -> + HasExplicitLength = hb_maps:is_key(<<"length">>, Request, Opts), {ok, Offset, Length, MaybeRelativeTXID} = extract_chunk_params(Request, Opts), case fetch_chunk_range(Offset, Length, MaybeRelativeTXID, Opts) of {ok, Chunks} -> Data = hb_util:bin(Chunks), - case Length of - undefined -> + case HasExplicitLength of + false -> {ok, Data}; - Length -> + true -> { ok, binary:part(Data, 0, min(hb_util:int(Length), byte_size(Data))) @@ -442,7 +443,7 @@ get_chunk(_Base, Request, Opts) -> %% @doc Extract the parameters from a chunk request. Supports both global offsets %% and relative offset+parent ID pairs. extract_chunk_params(Request, Opts) -> - Length = hb_maps:get(<<"length">>, Request, undefined, Opts), + Length = hb_maps:get(<<"length">>, Request, 1, Opts), case hb_maps:find(<<"offset">>, Request, Opts) of {ok, RelativeInfo} when is_map(RelativeInfo) -> {ok, RelativeOffset} = hb_maps:find(<<"offset">>, RelativeInfo, Opts), @@ -1998,6 +1999,12 @@ get_mid_chunk_pre_split_test_parallel() -> ), ok. +extract_chunk_params_default_length_test_parallel() -> + ?assertEqual( + {ok, 123, 1, undefined}, + extract_chunk_params(#{ <<"offset">> => 123 }, #{}) + ). 
+ get_pre_split_small_chunks_test_parallel() -> TXID = <<"4FnBmvgWmqXWEEprjVqBsV5aRpAgF6_yJX_GTGsSZjY">>, Opts = setup_arweave_index_opts([TXID]), From 00ba4c672d80f1b72373457c7f90ade36b9709c5 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Thu, 16 Apr 2026 12:41:07 +0200 Subject: [PATCH 30/68] fix: non-numeric pending offsets & tx_to_message hard match --- src/preloaded/arweave/dev_arweave.erl | 160 ++++++++++++++++++++------ 1 file changed, 128 insertions(+), 32 deletions(-) diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index 77ac5688e..9fcdda0ff 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -882,28 +882,43 @@ pending(Base, Request, Opts) -> % Retreive a bare TX header by its TXID request(<<"GET">>, <<"/unconfirmed_tx/", TXID/binary>>, Opts); {ok, RawOffset} -> - Offset = hb_util:int(RawOffset), - % Download an unconfirmed chunk by its offset - request( - <<"GET">>, - << - "/unconfirmed_chunk/", - TXID/binary, - "/", - (hb_util:bin(Offset))/binary - >>, - Opts#{ - <<"exclude-data">> => - hb_util:bool( - find_key( - <<"exclude-data">>, - Base, - Request, - Opts - ) - ) - } - ) + try hb_util:int(RawOffset) of + Offset when Offset < 0 -> + {error, #{ + <<"status">> => 400, + <<"content-type">> => <<"application/json">>, + <<"body">> => <<"{\"error\":\"invalid_offset\"}">> + }}; + Offset -> + % Download an unconfirmed chunk by its offset + request( + <<"GET">>, + << + "/unconfirmed_chunk/", + TXID/binary, + "/", + (hb_util:bin(Offset))/binary + >>, + Opts#{ + <<"exclude-data">> => + hb_util:bool( + find_key( + <<"exclude-data">>, + Base, + Request, + Opts + ) + ) + } + ) + catch + _:_ -> + {error, #{ + <<"status">> => 400, + <<"content-type">> => <<"application/json">>, + <<"body">> => <<"{\"error\":\"invalid_offset\"}">> + }} + end end end. 
@@ -1072,7 +1087,7 @@ to_tx_message(Type, ID, Path, {ok, #{ <<"body">> := Body }}, LogExtra, Opts) -> {tx, TXHeader} } ), - {ok, Data} = + DataRes = case (TXHeader#tx.data_size == 0) orelse hb_opts:get(exclude_data, false, Opts) of true -> {ok, <<>>}; false -> @@ -1087,15 +1102,30 @@ to_tx_message(Type, ID, Path, {ok, #{ <<"body">> := Body }}, LogExtra, Opts) -> ) end end, - { - ok, - hb_message:convert( - TXHeader#tx{ data = Data }, - <<"structured@1.0">>, - <<"tx@1.0">>, - Opts - ) - }. + case DataRes of + {ok, Data} -> + { + ok, + hb_message:convert( + TXHeader#tx{ data = Data }, + <<"structured@1.0">>, + <<"tx@1.0">>, + Opts + ) + }; + {error, not_found} -> + { + ok, + hb_message:convert( + TXHeader#tx{ data = ?DEFAULT_DATA }, + <<"structured@1.0">>, + <<"tx@1.0">>, + Opts + ) + }; + Error -> + Error + end. event_request(Path, Method, Status, Extra) -> BaseList = [{request, {explicit, Path}}, {method, Method}, {status, Status}], @@ -1270,6 +1300,52 @@ best_response_non_map_error_round_trips_test_parallel() -> to_message(<<"/tx">>, <<"GET">>, {error, FailedConnect}, [], #{}) ). +tx_raw_fetch_error_round_trips_test() -> + {ok, MockNode, MockHandle} = hb_mock_server:start([ + {"/raw/:id", tx_raw, {500, <<"boom">>}} + ]), + ClientOpts = post_tx_json_client_opts(), + HeaderBody = post_tx_json_payload(ClientOpts), + TXID = maps:get(<<"id">>, hb_json:decode(HeaderBody)), + Opts = + ClientOpts#{ + routes => [ + #{ + <<"template">> => + #{ + <<"path">> => <<"^/arweave/raw">>, + <<"method">> => <<"GET">> + }, + <<"nodes">> => + [ + #{ + <<"match">> => <<"^/arweave">>, + <<"with">> => MockNode, + <<"opts">> => #{ http_client => httpc } + } + ], + <<"parallel">> => 1, + <<"responses">> => 1, + <<"stop-after">> => true, + <<"admissible-status">> => 200 + } + ] + }, + try + ?assertMatch( + {error, _}, + to_message( + <<"/tx/", TXID/binary>>, + <<"GET">>, + {ok, #{ <<"body">> => HeaderBody }}, + [], + Opts + ) + ) + after + hb_mock_server:stop(MockHandle) + end. 
+ post_tx_json_two_node_test(Node1TxResponse, Node2TxResponse) -> {ok, MockNode1, MockHandle1} = hb_mock_server:start([ {"/tx", tx, Node1TxResponse} @@ -1796,6 +1872,26 @@ get_bad_tx_test_parallel() -> Res = hb_http:get(Node, Path, #{}), ?assertEqual({error, not_found}, Res). +pending_invalid_offset_returns_invalid_offset_test() -> + {error, Error} = + hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9">> }, + #{ + <<"path">> => <<"pending">>, + <<"pending">> => <<"cat">>, + <<"offset">> => <<"dog">> + }, + #{} + ), + ?assertMatch( + #{ + <<"status">> := 400, + <<"content-type">> := <<"application/json">>, + <<"body">> := <<"{\"error\":\"invalid_offset\"}">> + }, + Error + ). + %% @doc: helper test to generate and write a dataitem to disk so that we %% can validate it using 3rd-party js libraries and gateways. serialize_data_item_test_disabled() -> From d57e998046aa26a85ba7dc183fa0b0437c7d6965 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Thu, 16 Apr 2026 20:10:58 +0200 Subject: [PATCH 31/68] chore: add pinned dev_arweave:pending/3 list --- src/preloaded/arweave/dev_arweave.erl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index 9fcdda0ff..cb9d6cfd3 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -875,7 +875,13 @@ tx_anchor(_Base, _Request, Opts) -> %% nodes, or a specific unconfirmed transaction header by its TXID. 
pending(Base, Request, Opts) -> case find_key(<<"pending">>, Base, Request, Opts) of - not_found -> request(<<"GET">>, <<"/tx/pending">>, Opts); + not_found -> + case hb_opts:get(arweave_static_pending_txids, not_found, Opts) of + TXIDs when is_list(TXIDs) -> + {ok, TXIDs}; + _ -> + request(<<"GET">>, <<"/tx/pending">>, Opts) + end; TXID -> case hb_maps:find(<<"offset">>, Request, Opts) of error -> From e943cfd0f59e44693d9debe4c21bab6a2a5968f4 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Fri, 17 Apr 2026 09:59:13 +0200 Subject: [PATCH 32/68] fix: attempt 1 for get_chunk_range_relative/4 chunk relative decoding --- src/preloaded/arweave/dev_arweave.erl | 44 ++++++++++++++++++--------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index cb9d6cfd3..16108b3d4 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -523,27 +523,33 @@ get_chunk_range_relative(Offset, Length, RelativeTXID, Opts) -> ), GETFun = fun(XOffset) -> - pending( - #{}, - #{ <<"offset">> => XOffset, <<"pending">> => RelativeTXID }, - Opts + decode_relative_chunk( + pending( + #{}, + #{ <<"offset">> => XOffset, <<"pending">> => RelativeTXID }, + Opts + ) ) end, case fetch_and_collect(Offsets, GETFun, Opts) of {ok, ChunkInfos} -> - Concatenated = - hb_util:bin( - lists:map( - fun(JSONStruct) -> - hb_util:decode(maps:get(<<"chunk">>, JSONStruct)) - end, - ChunkInfos - ) - ), - {ok, Concatenated}; + assemble_relative_chunks(ChunkInfos, Offset); Error -> Error end. +assemble_relative_chunks(ChunkInfos, Offset) -> + assemble_chunks(ChunkInfos, Offset + 1). 
+ +decode_relative_chunk({ok, JSON}) -> + Chunk = hb_util:decode(maps:get(<<"chunk">>, JSON)), + ChunkEnd = ar_merkle:extract_note( + hb_util:decode(maps:get(<<"data_path">>, JSON)) + ), + ChunkStart = ChunkEnd - byte_size(Chunk) + 1, + {ok, {ChunkStart, ChunkEnd, Chunk}}; +decode_relative_chunk({error, _} = Err) -> + Err. + %% @doc Iteratively detect gaps in coverage and fetch the chunk at the start %% of each gap until the entire range [Offset, EndOffset] is covered. fill_gaps(ChunkInfos, Offset, EndOffset, Opts) -> @@ -2107,6 +2113,16 @@ extract_chunk_params_default_length_test_parallel() -> extract_chunk_params(#{ <<"offset">> => 123 }, #{}) ). +assemble_relative_chunks_zero_offset_test_parallel() -> + {ok, [Chunk]} = + assemble_relative_chunks([{1, 5, <<"abcde">>}], 0), + ?assertEqual(<<"abcde">>, hb_util:bin(Chunk)). + +assemble_relative_chunks_nonzero_offset_test_parallel() -> + {ok, [Chunk]} = + assemble_relative_chunks([{1, 5, <<"abcde">>}], 2), + ?assertEqual(<<"cde">>, hb_util:bin(Chunk)). 
+ get_pre_split_small_chunks_test_parallel() -> TXID = <<"4FnBmvgWmqXWEEprjVqBsV5aRpAgF6_yJX_GTGsSZjY">>, Opts = setup_arweave_index_opts([TXID]), From 742184e6195e86a282de81d18684444e73e4b274 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Fri, 17 Apr 2026 10:33:59 +0200 Subject: [PATCH 33/68] fix: index mempool bundle children from raw pending bytes --- src/preloaded/query/dev_copycat_arweave.erl | 33 +++++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 4bce17c2d..db125ec76 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -1707,21 +1707,26 @@ write_mempool_offsets(TXID, TX, Opts) -> Store = hb_store_arweave:store_from_opts(Opts), ok = hb_store_arweave:write_offset( Store, TXID, <<"tx@1.0">>, relative, TX#tx.data_size), - write_mempool_children(Store, TXID, TX, Opts), + case load_mempool_data(TXID, TX, Opts) of + {ok, Data} -> + write_mempool_children(Store, TXID, TX, Data, Opts); + _ -> + ok + end, ok. -write_mempool_children(Store, TXID, TX, Opts) -> +write_mempool_children(Store, TXID, TX, Data, Opts) -> case is_bundle_tx(TX, Opts) of true -> - try ar_bundles:decode_bundle_header(TX#tx.data) of + try ar_bundles:decode_bundle_header(Data) of {ItemsBin, BundleIndex} -> - HeaderSize = byte_size(TX#tx.data) - byte_size(ItemsBin), + HeaderSize = byte_size(Data) - byte_size(ItemsBin), write_mempool_items(Store, TXID, BundleIndex, HeaderSize); _ -> ok catch _:_ -> ok end; false -> - case standalone_item_id(TX) of + case standalone_item_id(Data) of {ok, ItemID} -> Ref = #{ <<"relative">> => TXID, <<"offset">> => 0 }, hb_store_arweave:write_offset( @@ -1738,7 +1743,23 @@ write_mempool_items(Store, TXID, [{ItemID, Size} | Rest], Offset) -> Store, hb_util:encode(ItemID), <<"ans104@1.0">>, Ref, Size), write_mempool_items(Store, TXID, Rest, Offset + Size). 
-standalone_item_id(#tx{ data = Data }) when is_binary(Data), Data =/= <<>> -> +load_mempool_data(_TXID, #tx{ data_size = 0 }, _Opts) -> + {ok, <<>>}; +load_mempool_data(TXID, #tx{ data_size = Size }, Opts) when Size > 0 -> + hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => #{ + <<"relative">> => TXID, + <<"offset">> => 0 + }, + <<"length">> => Size + }, + Opts + ). + +standalone_item_id(Data) when is_binary(Data), Data =/= <<>> -> try Item = ar_bundles:deserialize(Data), case ar_bundles:verify_item(Item) of From d09bbef833fd9f9c6fbb06a01442e27d2e599344 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Fri, 17 Apr 2026 11:43:30 +0200 Subject: [PATCH 34/68] fix: normalize arweave reads across pending & confirmed bundle offsets --- src/core/store/hb_store_arweave.erl | 19 ++++++++++++--- src/preloaded/arweave/dev_arweave.erl | 33 ++++++++++++++++++--------- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 36e1afd20..15c17ad9f 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -199,14 +199,14 @@ root_offset(GlobalOffset, _Store) when is_integer(GlobalOffset) -> GlobalOffset; root_offset(Offset, Store) -> root_offset(Offset, 0, Store). root_offset(#{ <<"relative">> := P, <<"offset">> := Off }, Acc, Store) -> case read_offset(Store, P) of - {ok, Next = #{ <<"relative">> := _, <<"offset">> := _ }} -> + {ok, #{ <<"offset">> := Next = #{ <<"relative">> := _, <<"offset">> := _ } }} -> % We have another relative offset. Continue. root_offset(Next, Acc + Off, Store); - {ok, relative} -> + {ok, #{ <<"offset">> := relative }} -> % We have reached an unconfirmed TX as the root of the relative offset % chain, so we return an offset against that. 
#{ <<"relative">> => P, <<"offset">> => Acc + Off }; - {ok, GlobalOffset} when is_integer(GlobalOffset) -> + {ok, #{ <<"offset">> := GlobalOffset }} when is_integer(GlobalOffset) -> % We have reached a confirmed TX as the root of the relative offset % chain, so we return a global offset. GlobalOffset + Acc + Off; @@ -458,3 +458,16 @@ load_item_deserialize_throws_test() -> Size = 4096, ok = write_offset(Opts, FakeID, <<"ans104@1.0">>, ProbeOffset - 1, Size), ?assertMatch({error, _}, read(Opts, FakeID, #{})). + +root_offset_confirmed_parent_test() -> + Store = [hb_test_utils:test_store()], + Opts = #{ <<"index-store">> => Store }, + ParentID = <<"bndIwac23-s0K11TLC1N7z472sLGAkiOdhds87ZywoE">>, + ok = write_offset(Opts, ParentID, <<"tx@1.0">>, 12345, 99), + ?assertEqual( + 12352, + root_offset( + #{ <<"relative">> => ParentID, <<"offset">> => 7 }, + Opts + ) + ). \ No newline at end of file diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index 16108b3d4..553ac014a 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -239,16 +239,19 @@ head_raw_tx(TXID, Offset, Length, Opts) -> %% @doc ANS-104 headers are stored as part of the global Arweave data tree, so %% so to read the data associated with their IDs, we must first read the header %% chunk, deserialize it, and offset our data read from its starting offset. 
-head_raw_ans104(TXID, Offset, Length, _Opts) when not is_integer(Offset) -> - {ok, #{ - <<"raw-id">> => TXID, - <<"offset">> => Offset, - <<"data-offset">> => Offset, - <<"content-type">> => <<"application/octet-stream">>, - <<"header-length">> => 0, - <<"content-length">> => Length, - <<"accept-ranges">> => <<"bytes">> - }}; +head_raw_ans104(TXID, Offset, Length, Opts) when not is_integer(Offset) -> + HeaderReq = + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => Offset, + <<"length">> => min(Length, ?DATA_CHUNK_SIZE) + }, + case hb_ao:resolve(#{ <<"device">> => <<"arweave@2.9">> }, HeaderReq, Opts) of + {ok, HeaderChunk} -> + do_head_raw_ans104(TXID, Offset, Length, HeaderChunk, Opts); + {error, Error} -> + {error, Error} + end; head_raw_ans104(TXID, ArweaveOffset, Length, Opts) -> ?event(debug_raw, {head_raw_ans104, {txid, TXID}, {arweave_offset, ArweaveOffset}, {length, Length}}), HeaderReq = @@ -275,7 +278,7 @@ do_head_raw_ans104(TXID, ArweaveOffset, Length, Data, _Opts) -> #{ <<"raw-id">> => TXID, <<"offset">> => ArweaveOffset, - <<"data-offset">> => ArweaveOffset + HeaderSize, + <<"data-offset">> => add_ans104_offset(ArweaveOffset, HeaderSize), <<"content-type">> => ContentType, <<"header-length">> => HeaderSize, <<"content-length">> => Length - HeaderSize, @@ -298,6 +301,14 @@ deserialize_ans104_header(Data) -> } end. +add_ans104_offset(Offset, HeaderSize) when is_integer(Offset) -> + Offset + HeaderSize; +add_ans104_offset(#{ <<"relative">> := ParentID, <<"offset">> := Offset }, HeaderSize) -> + #{ + <<"relative">> => ParentID, + <<"offset">> => Offset + HeaderSize + }. + %% @doc Get raw transaction *data* and `content-type` of an Arweave message. %% Does not deserialize the message, nor return signature information. Included %% only for compatibility with the legacy Arweave gateway `/raw` endpoint. 
From b0e3948b1b6cfd884e5e544d67f827cb480966f5 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Fri, 17 Apr 2026 22:46:41 +0200 Subject: [PATCH 35/68] fix: bundle l2 items discovery --- src/preloaded/query/dev_copycat_arweave.erl | 62 +++++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index db125ec76..210ec5fc9 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -1718,12 +1718,10 @@ write_mempool_offsets(TXID, TX, Opts) -> write_mempool_children(Store, TXID, TX, Data, Opts) -> case is_bundle_tx(TX, Opts) of true -> - try ar_bundles:decode_bundle_header(Data) of - {ItemsBin, BundleIndex} -> - HeaderSize = byte_size(Data) - byte_size(ItemsBin), + case load_mempool_bundle_index(TXID, Data, Opts) of + {ok, HeaderSize, BundleIndex} -> write_mempool_items(Store, TXID, BundleIndex, HeaderSize); _ -> ok - catch _:_ -> ok end; false -> case standalone_item_id(Data) of @@ -1759,6 +1757,62 @@ load_mempool_data(TXID, #tx{ data_size = Size }, Opts) when Size > 0 -> Opts ). 
+load_mempool_bundle_index(_TXID, Data, _Opts) when is_binary(Data), Data =/= <<>> -> + try ar_bundles:decode_bundle_header(Data) of + {ItemsBin, BundleIndex} -> + {ok, byte_size(Data) - byte_size(ItemsBin), BundleIndex}; + invalid_bundle_header -> + {error, invalid_bundle_header} + catch _:_ -> + {error, invalid_bundle_header} + end; +load_mempool_bundle_index(TXID, <<>>, Opts) -> + try + {ok, FirstChunk} = + hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => #{ + <<"relative">> => TXID, + <<"offset">> => 0 + } + }, + Opts + ), + case ar_bundles:bundle_header_size(FirstChunk) of + invalid_bundle_header -> + {error, invalid_bundle_header}; + HeaderSize when HeaderSize =< byte_size(FirstChunk) -> + {_ItemsBin, BundleIndex} = + ar_bundles:decode_bundle_header( + binary:part(FirstChunk, 0, HeaderSize) + ), + {ok, HeaderSize, BundleIndex}; + HeaderSize -> + RemainingSize = HeaderSize - byte_size(FirstChunk), + {ok, RemainingChunk} = + hb_ao:resolve( + #{ <<"device">> => <<"arweave@2.9">> }, + #{ + <<"path">> => <<"chunk">>, + <<"offset">> => #{ + <<"relative">> => TXID, + <<"offset">> => byte_size(FirstChunk) + }, + <<"length">> => RemainingSize + }, + Opts + ), + HeaderBin = <>, + {_ItemsBin, BundleIndex} = + ar_bundles:decode_bundle_header(HeaderBin), + {ok, HeaderSize, BundleIndex} + end + catch _:_ -> + {error, invalid_bundle_header} + end. 
+ standalone_item_id(Data) when is_binary(Data), Data =/= <<>> -> try Item = ar_bundles:deserialize(Data), From 42cf8d09af2669e398bcfb4deeef0f746e8f4fca Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Sun, 19 Apr 2026 17:56:26 +0200 Subject: [PATCH 36/68] feat: stable pending-chunks traversal --- src/preloaded/arweave/dev_arweave.erl | 298 ++++++++++++++++++++------ 1 file changed, 234 insertions(+), 64 deletions(-) diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index 553ac014a..e3584e360 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -519,33 +519,32 @@ get_chunk_range_variable_size(Offset, EndOffset, Opts) -> end. %% @doc Return a chunk or range of bytes relative to a specific, unconfirmed, -%% transaction's data root. +%% transaction's data root. Pending chunk lookups query the only chunk by +%% `data_size` for single-chunk TXs, otherwise start at 256KiB and advance in +%% 256KiB steps with a final cap at `data_size`. 
get_chunk_range_relative(Offset, Length, RelativeTXID, Opts) -> - hb_prometheus:observe( - Length, - arweave_chunk_load_requested_bytes, - [] - ), - Offsets = - generate_offsets( - max(1, Offset + 1), - (Offset + Length), - ?DATA_CHUNK_SIZE - ), - GETFun = - fun(XOffset) -> - decode_relative_chunk( - pending( - #{}, - #{ <<"offset">> => XOffset, <<"pending">> => RelativeTXID }, - Opts - ) - ) - end, - case fetch_and_collect(Offsets, GETFun, Opts) of - {ok, ChunkInfos} -> - assemble_relative_chunks(ChunkInfos, Offset); - Error -> Error + case pending_tx_data_size(RelativeTXID, Opts) of + {ok, DataSize} -> + hb_prometheus:observe( + Length, + arweave_chunk_load_requested_bytes, + [] + ), + Offsets = pending_relative_chunk_offsets(Offset, Length, DataSize), + GETFun = + fun(XOffset) -> + QueryRes = pending_chunk_query(RelativeTXID, XOffset, Opts), + decode_relative_chunk( + QueryRes + ) + end, + case fetch_and_collect(Offsets, GETFun, Opts) of + {ok, ChunkInfos} -> + assemble_relative_chunks(ChunkInfos, Offset); + Error -> Error + end; + Error -> + Error end. assemble_relative_chunks(ChunkInfos, Offset) -> @@ -618,6 +617,179 @@ generate_offsets(Current, End, _Step, Acc) when Current > End -> generate_offsets(Current, End, Step, Acc) -> generate_offsets(Current + Step, End, Step, [Current | Acc]). +pending_chunk_query(TXID, XOffset, Opts) -> + {RetryCount, RetryDelay} = pending_chunk_poll_config(Opts), + pending_chunk_query( + TXID, + XOffset, + Opts, + RetryCount, + RetryDelay, + RetryCount, + erlang:monotonic_time(millisecond) + ). 
+ +pending_chunk_query( + TXID, + XOffset, + Opts, + RetryCount, + RetryDelay, + TotalRetries, + StartTimeMs +) -> + Attempt = TotalRetries - RetryCount + 1, + case pending_chunk_request(TXID, XOffset, Opts) of + {error, not_found} when RetryCount > 0 -> + maybe_log_pending_chunk_retry( + Attempt, + TXID, + XOffset, + RetryDelay, + Opts + ), + timer:sleep(RetryDelay), + pending_chunk_query( + TXID, + XOffset, + Opts, + RetryCount - 1, + RetryDelay, + TotalRetries, + StartTimeMs + ); + {ok, _} = Result when Attempt > 1 -> + pending_chunk_progress( + Opts, + {pending_chunk_recovered, + {tx_id, {explicit, TXID}}, + {offset, XOffset}, + {attempt, Attempt}, + { + elapsed_ms, + erlang:monotonic_time(millisecond) - StartTimeMs + }} + ), + Result; + {error, not_found} = Result when Attempt > 1 -> + pending_chunk_progress( + Opts, + {pending_chunk_gave_up, + {tx_id, {explicit, TXID}}, + {offset, XOffset}, + {attempts, Attempt}, + { + elapsed_ms, + erlang:monotonic_time(millisecond) - StartTimeMs + }} + ), + Result; + Result -> + Result + end. + +maybe_log_pending_chunk_retry(Attempt, TXID, XOffset, RetryDelay, Opts) -> + case Attempt =:= 1 orelse Attempt rem 10 =:= 0 of + true -> + pending_chunk_progress( + Opts, + {pending_chunk_retrying, + {tx_id, {explicit, TXID}}, + {offset, XOffset}, + {attempt, Attempt}, + {retry_in_ms, RetryDelay}} + ); + false -> + ok + end. + +pending_chunk_progress(Opts, Event) -> + case hb_opts:get(arweave_mempool_progress, false, Opts) of + true -> ?event(copycat_short, Event); + false -> ok + end. + +pending_chunk_request(TXID, XOffset, Opts) -> + request( + <<"GET">>, + << + "/unconfirmed_chunk/", + TXID/binary, + "/", + (hb_util:bin(XOffset))/binary + >>, + Opts + ). 
+ +pending_chunk_poll_config(Opts) -> + RawRetryCount = max(0, hb_opts:get(arweave_pending_chunk_poll_attempts, 0, Opts)), + RetryDelay = max(1, hb_opts:get(arweave_pending_chunk_poll_ms, 500, Opts)), + MinRetryWindowMs = max( + 0, + hb_opts:get(arweave_pending_chunk_poll_min_ms, 20000, Opts) + ), + RetryCount = + case RawRetryCount of + 0 -> 0; + _ -> max(RawRetryCount, ceil_div(MinRetryWindowMs, RetryDelay)) + end, + {RetryCount, RetryDelay}. + +ceil_div(0, _Denominator) -> 0; +ceil_div(Numerator, Denominator) -> + (Numerator + Denominator - 1) div Denominator. + +%% @doc Fetch the advertised data size for an unconfirmed transaction. +pending_tx_data_size(TXID, Opts) -> + case pending(#{}, #{ <<"pending">> => TXID }, Opts#{ exclude_data => true }) of + {ok, JSON} -> + {ok, hb_util:int(maps:get(<<"data_size">>, JSON))}; + Error -> + Error + end. + +%% @doc Return the pending chunk end offsets using 256KiB stepping with a final +%% cap at `data_size`. +pending_relative_chunk_offsets(_Offset, Length, _DataSize) when Length =< 0 -> + []; +pending_relative_chunk_offsets(Offset, _Length, DataSize) when Offset >= DataSize -> + []; +pending_relative_chunk_offsets(Offset, Length, DataSize) -> + RangeStart = max(1, Offset + 1), + RangeEnd = min(Offset + Length, DataSize), + ChunkEnds = pending_chunk_end_offsets(DataSize), + pending_relative_chunk_offsets(ChunkEnds, RangeStart, RangeEnd, 0, []). + +pending_relative_chunk_offsets( + [ChunkEnd | Rest], RangeStart, RangeEnd, PrevEnd, Acc +) -> + ChunkStart = PrevEnd + 1, + NewAcc = + case chunk_overlaps_range(ChunkStart, ChunkEnd, RangeStart, RangeEnd) of + true -> [ChunkEnd | Acc]; + false -> Acc + end, + pending_relative_chunk_offsets(Rest, RangeStart, RangeEnd, ChunkEnd, NewAcc); +pending_relative_chunk_offsets([], _RangeStart, _RangeEnd, _PrevEnd, Acc) -> + lists:reverse(Acc). 
+ +pending_chunk_end_offsets(DataSize) when DataSize =< ?DATA_CHUNK_SIZE -> + [DataSize]; +pending_chunk_end_offsets(DataSize) -> + pending_chunk_end_offsets(?DATA_CHUNK_SIZE, DataSize, []). + +pending_chunk_end_offsets(Current, DataSize, Acc) when Current < DataSize -> + pending_chunk_end_offsets( + Current + ?DATA_CHUNK_SIZE, + DataSize, + [Current | Acc] + ); +pending_chunk_end_offsets(_Current, DataSize, Acc) -> + lists:reverse([DataSize | Acc]). + +chunk_overlaps_range(ChunkStart, ChunkEnd, RangeStart, RangeEnd) -> + ChunkEnd >= RangeStart andalso ChunkStart =< RangeEnd. + %% @doc Decode a chunk response into a {Start, End, Binary} tuple. %% Runs inside the pmap worker so raw JSON is GC'd per-worker. decode_chunk({ok, JSON}) -> @@ -900,48 +1072,27 @@ pending(Base, Request, Opts) -> request(<<"GET">>, <<"/tx/pending">>, Opts) end; TXID -> + ExcludeData = + case find_key(<<"exclude-data">>, Base, Request, Opts) of + not_found -> hb_opts:get(exclude_data, false, Opts); + Value -> hb_util:bool(Value) + end, case hb_maps:find(<<"offset">>, Request, Opts) of error -> % Retreive a bare TX header by its TXID - request(<<"GET">>, <<"/unconfirmed_tx/", TXID/binary>>, Opts); - {ok, RawOffset} -> - try hb_util:int(RawOffset) of - Offset when Offset < 0 -> - {error, #{ - <<"status">> => 400, - <<"content-type">> => <<"application/json">>, - <<"body">> => <<"{\"error\":\"invalid_offset\"}">> - }}; - Offset -> - % Download an unconfirmed chunk by its offset - request( - <<"GET">>, - << - "/unconfirmed_chunk/", - TXID/binary, - "/", - (hb_util:bin(Offset))/binary - >>, - Opts#{ - <<"exclude-data">> => - hb_util:bool( - find_key( - <<"exclude-data">>, - Base, - Request, - Opts - ) - ) - } - ) - catch - _:_ -> - {error, #{ - <<"status">> => 400, - <<"content-type">> => <<"application/json">>, + request( + <<"GET">>, + <<"/unconfirmed_tx/", TXID/binary>>, + Opts#{ + exclude_data => ExcludeData + } + ); + {ok, _RawOffset} -> + {error, #{ + <<"status">> => 400, + 
<<"content-type">> => <<"application/json">>, <<"body">> => <<"{\"error\":\"invalid_offset\"}">> - }} - end + }} end end. @@ -2134,6 +2285,25 @@ assemble_relative_chunks_nonzero_offset_test_parallel() -> assemble_relative_chunks([{1, 5, <<"abcde">>}], 2), ?assertEqual(<<"cde">>, hb_util:bin(Chunk)). +pending_relative_chunk_offsets_single_chunk_test_parallel() -> + ?assertEqual([1234], pending_relative_chunk_offsets(0, 1, 1234)), + ?assertEqual([1234], pending_relative_chunk_offsets(0, 1234, 1234)). + +pending_relative_chunk_offsets_standard_multi_chunk_test_parallel() -> + DataSize = 315127, + ?assertEqual( + [?DATA_CHUNK_SIZE], + pending_relative_chunk_offsets(0, 1, DataSize) + ), + ?assertEqual( + [DataSize], + pending_relative_chunk_offsets(?DATA_CHUNK_SIZE, 1, DataSize) + ), + ?assertEqual( + [?DATA_CHUNK_SIZE, DataSize], + pending_relative_chunk_offsets(0, DataSize, DataSize) + ). + get_pre_split_small_chunks_test_parallel() -> TXID = <<"4FnBmvgWmqXWEEprjVqBsV5aRpAgF6_yJX_GTGsSZjY">>, Opts = setup_arweave_index_opts([TXID]), From 3d9caa02836272b8a515800417db44d035d48942 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Sun, 19 Apr 2026 18:05:17 +0200 Subject: [PATCH 37/68] feat: mempool copycat rich logs --- src/preloaded/query/dev_copycat_arweave.erl | 203 ++++++++++++++++---- 1 file changed, 165 insertions(+), 38 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 210ec5fc9..7f1304b70 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -1667,61 +1667,174 @@ get_index_store(Opts) -> index_mempool(_Request, Opts) -> case dev_arweave:pending(#{}, #{}, Opts) of {ok, TXIDs} when is_list(TXIDs) -> + mempool_progress( + Opts, + {mempool_scan_started, {pending_count, length(TXIDs)}} + ), Results = parallel_map(TXIDs, fun(TXID) -> index_mempool_tx(TXID, Opts) end, Opts), - Summary = lists:foldl(fun(R, Acc) -> - K = case R of - ok -> 
indexed; existing -> existing; - missing_data -> missing_data; _ -> failed - end, - Acc#{ K => maps:get(K, Acc) + 1 } - end, #{ indexed => 0, existing => 0, - missing_data => 0, failed => 0 }, Results), - ?event(copycat_short, {mempool_scan_completed, Summary}), + Summary = lists:foldl( + fun mempool_accumulate_result/2, + mempool_empty_summary(), + Results + ), + mempool_progress(Opts, {mempool_scan_completed, Summary}), {ok, Summary}; Error -> Error end. -index_mempool_tx(TXID, Opts) -> - case is_tx_indexed(TXID, Opts) of - true -> existing; - false -> - case dev_arweave:pending(#{}, #{ <<"pending">> => TXID }, Opts) of - {ok, StructuredTX} -> - TX = hb_message:convert(StructuredTX, - <<"tx@1.0">>, <<"structured@1.0">>, Opts), - case has_mempool_data(TX) of - true -> write_mempool_offsets(TXID, TX, Opts); - false -> missing_data - end; - _ -> failed - end +mempool_progress(Opts, Event) -> + case hb_opts:get(arweave_mempool_progress, false, Opts) of + true -> ?event(copycat_short, Event); + false -> ok end. -has_mempool_data(#tx{ data_size = 0 }) -> true; -has_mempool_data(#tx{ data = D, data_size = S }) - when is_binary(D) -> byte_size(D) =:= S; -has_mempool_data(_) -> false. +mempool_empty_summary() -> + #{ + indexed => 0, + existing => 0, + missing_data => 0, + failed => 0, + tx_offsets_written => 0, + bundle_txs => 0, + items_indexed => 0 + }. + +mempool_accumulate_result(Result, Acc) -> + maps:fold( + fun(Key, Value, SummaryAcc) -> + SummaryAcc#{ Key => maps:get(Key, SummaryAcc) + Value } + end, + Acc, + mempool_result_summary(Result) + ). 
+ +mempool_result_summary(existing) -> + (mempool_empty_summary())#{ existing => 1 }; +mempool_result_summary(indexed) -> + (mempool_empty_summary())#{ indexed => 1 }; +mempool_result_summary(missing_data) -> + (mempool_empty_summary())#{ missing_data => 1 }; +mempool_result_summary(ok) -> + (mempool_empty_summary())#{ indexed => 1 }; +mempool_result_summary(failed) -> + (mempool_empty_summary())#{ failed => 1 }; +mempool_result_summary(#{ status := Status } = Result) -> + Base = mempool_result_summary(Status), + lists:foldl( + fun(Key, SummaryAcc) -> + SummaryAcc#{ + Key => maps:get(Key, SummaryAcc) + maps:get(Key, Result, 0) + } + end, + Base, + [tx_offsets_written, bundle_txs, items_indexed] + ); +mempool_result_summary(_) -> + (mempool_empty_summary())#{ failed => 1 }. + +index_mempool_tx(TXID, Opts) -> + mempool_progress(Opts, {mempool_tx_started, {tx_id, {explicit, TXID}}}), + Result = + case is_tx_indexed(TXID, Opts) of + true -> existing; + false -> + mempool_progress( + Opts, + {mempool_tx_header_fetch_started, {tx_id, {explicit, TXID}}} + ), + case dev_arweave:pending( + #{}, + #{ <<"pending">> => TXID }, + Opts#{ exclude_data => true } + ) of + {ok, StructuredTX} -> + mempool_progress( + Opts, + {mempool_tx_header_fetch_finished, + {tx_id, {explicit, TXID}}} + ), + TX = hb_message:convert( + StructuredTX, + <<"tx@1.0">>, + <<"structured@1.0">>, + Opts + ), + mempool_progress( + Opts, + {mempool_tx_convert_finished, + {tx_id, {explicit, TXID}}, + {data_size, TX#tx.data_size}, + {bundle, is_bundle_tx(TX, Opts)}} + ), + write_mempool_offsets(TXID, TX, Opts); + _ -> + failed + end + end, + mempool_progress( + Opts, + {mempool_tx_finished, + {tx_id, {explicit, TXID}}, + mempool_progress_result(Result)} + ), + Result. 
+ +mempool_progress_result(existing) -> + {status, existing}; +mempool_progress_result(indexed) -> + {status, indexed}; +mempool_progress_result(missing_data) -> + {status, missing_data}; +mempool_progress_result(ok) -> + {status, indexed}; +mempool_progress_result(failed) -> + {status, failed}; +mempool_progress_result(#{ status := Status }) -> + {status, Status}; +mempool_progress_result(_) -> + {status, failed}. write_mempool_offsets(TXID, TX, Opts) -> Store = hb_store_arweave:store_from_opts(Opts), - ok = hb_store_arweave:write_offset( - Store, TXID, <<"tx@1.0">>, relative, TX#tx.data_size), + mempool_progress( + Opts, + {mempool_data_load_started, + {tx_id, {explicit, TXID}}, + {length, TX#tx.data_size}} + ), case load_mempool_data(TXID, TX, Opts) of {ok, Data} -> + mempool_progress( + Opts, + {mempool_data_loaded, + {tx_id, {explicit, TXID}}, + {loaded_bytes, byte_size(Data)}} + ), + ok = hb_store_arweave:write_offset( + Store, TXID, <<"tx@1.0">>, relative, TX#tx.data_size), write_mempool_children(Store, TXID, TX, Data, Opts); - _ -> - ok - end, - ok. + _Error -> + #{ status => missing_data } + end. 
write_mempool_children(Store, TXID, TX, Data, Opts) -> case is_bundle_tx(TX, Opts) of true -> case load_mempool_bundle_index(TXID, Data, Opts) of {ok, HeaderSize, BundleIndex} -> - write_mempool_items(Store, TXID, BundleIndex, HeaderSize); - _ -> ok + write_mempool_items(Store, TXID, BundleIndex, HeaderSize), + #{ + status => indexed, + tx_offsets_written => 1, + bundle_txs => 1, + items_indexed => length(BundleIndex) + }; + _Error -> + #{ + status => failed, + tx_offsets_written => 1 + } end; false -> case standalone_item_id(Data) of @@ -1729,8 +1842,17 @@ write_mempool_children(Store, TXID, TX, Data, Opts) -> Ref = #{ <<"relative">> => TXID, <<"offset">> => 0 }, hb_store_arweave:write_offset( Store, ItemID, <<"ans104@1.0">>, - Ref, TX#tx.data_size); - not_found -> ok + Ref, TX#tx.data_size), + #{ + status => indexed, + tx_offsets_written => 1, + items_indexed => 1 + }; + not_found -> + #{ + status => indexed, + tx_offsets_written => 1 + } end end. @@ -1813,7 +1935,11 @@ load_mempool_bundle_index(TXID, <<>>, Opts) -> {error, invalid_bundle_header} end. -standalone_item_id(Data) when is_binary(Data), Data =/= <<>> -> +standalone_item_id(<<SigType:2/binary, _/binary>> = Data) + when is_binary(Data), Data =/= <<>> -> + case lists:member(SigType, [<<1, 0>>, <<2, 0>>, <<3, 0>>, <<4, 0>>, <<7, 0>>]) of + false -> not_found; + true -> try Item = ar_bundles:deserialize(Data), case ar_bundles:verify_item(Item) of @@ -1821,6 +1947,7 @@ standalone_item_id(Data) when is_binary(Data), Data =/= <<>> -> false -> not_found end catch _:_ -> not_found + end end; standalone_item_id(_) -> not_found. 
From d491d00840440355cdb1fb0af767e14681ec9dda Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Mon, 20 Apr 2026 09:58:24 +0200 Subject: [PATCH 38/68] feat: filter TXs by sender --- src/preloaded/query/dev_copycat_arweave.erl | 133 ++++++++++++++------ 1 file changed, 95 insertions(+), 38 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 7f1304b70..41748e9d0 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -1664,7 +1664,8 @@ get_index_store(Opts) -> end. %% @doc Scan the mempool and index any accessible unconfirmed TXs. -index_mempool(_Request, Opts) -> +index_mempool(Request, Opts) -> + SenderFilter = mempool_sender_filter(Request, Opts), case dev_arweave:pending(#{}, #{}, Opts) of {ok, TXIDs} when is_list(TXIDs) -> mempool_progress( @@ -1672,7 +1673,7 @@ index_mempool(_Request, Opts) -> {mempool_scan_started, {pending_count, length(TXIDs)}} ), Results = parallel_map(TXIDs, - fun(TXID) -> index_mempool_tx(TXID, Opts) end, Opts), + fun(TXID) -> index_mempool_tx(TXID, SenderFilter, Opts) end, Opts), Summary = lists:foldl( fun mempool_accumulate_result/2, mempool_empty_summary(), @@ -1689,6 +1690,23 @@ mempool_progress(Opts, Event) -> false -> ok end. +mempool_sender_filter(Request, Opts) -> + case hb_maps:find(<<"sender">>, Request, Opts) of + {ok, Sender} when is_binary(Sender) -> + normalize_sender_filter(Sender); + _ -> + not_found + end. + +normalize_sender_filter(Sender) when is_binary(Sender) -> + case byte_size(Sender) of + 32 -> hb_util:human_id(Sender); + 42 -> Sender; + 43 -> Sender; + 44 -> Sender; + _ -> Sender + end. 
+ mempool_empty_summary() -> #{ indexed => 0, @@ -1711,6 +1729,8 @@ mempool_accumulate_result(Result, Acc) -> mempool_result_summary(existing) -> (mempool_empty_summary())#{ existing => 1 }; +mempool_result_summary(filtered) -> + mempool_empty_summary(); mempool_result_summary(indexed) -> (mempool_empty_summary())#{ indexed => 1 }; mempool_result_summary(missing_data) -> @@ -1733,44 +1753,14 @@ mempool_result_summary(#{ status := Status } = Result) -> mempool_result_summary(_) -> (mempool_empty_summary())#{ failed => 1 }. -index_mempool_tx(TXID, Opts) -> +index_mempool_tx(TXID, SenderFilter, Opts) -> mempool_progress(Opts, {mempool_tx_started, {tx_id, {explicit, TXID}}}), Result = - case is_tx_indexed(TXID, Opts) of - true -> existing; - false -> - mempool_progress( - Opts, - {mempool_tx_header_fetch_started, {tx_id, {explicit, TXID}}} - ), - case dev_arweave:pending( - #{}, - #{ <<"pending">> => TXID }, - Opts#{ exclude_data => true } - ) of - {ok, StructuredTX} -> - mempool_progress( - Opts, - {mempool_tx_header_fetch_finished, - {tx_id, {explicit, TXID}}} - ), - TX = hb_message:convert( - StructuredTX, - <<"tx@1.0">>, - <<"structured@1.0">>, - Opts - ), - mempool_progress( - Opts, - {mempool_tx_convert_finished, - {tx_id, {explicit, TXID}}, - {data_size, TX#tx.data_size}, - {bundle, is_bundle_tx(TX, Opts)}} - ), - write_mempool_offsets(TXID, TX, Opts); - _ -> - failed - end + case SenderFilter of + not_found -> + index_mempool_tx_unfiltered(TXID, Opts); + _ -> + index_mempool_tx_filtered(TXID, SenderFilter, Opts) end, mempool_progress( Opts, @@ -1780,8 +1770,69 @@ index_mempool_tx(TXID, Opts) -> ), Result. +index_mempool_tx_unfiltered(TXID, Opts) -> + case is_tx_indexed(TXID, Opts) of + true -> existing; + false -> + case load_mempool_tx_header(TXID, Opts) of + {ok, TX} -> write_mempool_offsets(TXID, TX, Opts); + error -> failed + end + end. 
+ +index_mempool_tx_filtered(TXID, SenderFilter, Opts) -> + case load_mempool_tx_header(TXID, Opts) of + {ok, TX} -> + case mempool_tx_sender_matches(TX, SenderFilter) of + false -> filtered; + true -> + case is_tx_indexed(TXID, Opts) of + true -> existing; + false -> write_mempool_offsets(TXID, TX, Opts) + end + end; + error -> + failed + end. + +load_mempool_tx_header(TXID, Opts) -> + mempool_progress( + Opts, + {mempool_tx_header_fetch_started, {tx_id, {explicit, TXID}}} + ), + case dev_arweave:pending( + #{}, + #{ <<"pending">> => TXID }, + Opts#{ exclude_data => true } + ) of + {ok, StructuredTX} -> + mempool_progress( + Opts, + {mempool_tx_header_fetch_finished, + {tx_id, {explicit, TXID}}} + ), + TX = hb_message:convert( + StructuredTX, + <<"tx@1.0">>, + <<"structured@1.0">>, + Opts + ), + mempool_progress( + Opts, + {mempool_tx_convert_finished, + {tx_id, {explicit, TXID}}, + {data_size, TX#tx.data_size}, + {bundle, is_bundle_tx(TX, Opts)}} + ), + {ok, TX}; + _ -> + error + end. + mempool_progress_result(existing) -> {status, existing}; +mempool_progress_result(filtered) -> + {status, filtered}; mempool_progress_result(indexed) -> {status, indexed}; mempool_progress_result(missing_data) -> @@ -1795,6 +1846,12 @@ mempool_progress_result(#{ status := Status }) -> mempool_progress_result(_) -> {status, failed}. +mempool_tx_sender_matches(TX, SenderFilter) -> + case ar_tx:get_owner_address(TX) of + not_set -> false; + OwnerAddress -> normalize_sender_filter(OwnerAddress) =:= SenderFilter + end. 
+ write_mempool_offsets(TXID, TX, Opts) -> Store = hb_store_arweave:store_from_opts(Opts), mempool_progress( From e97cb631ee3b5d56ba589e3d24df76603e399841 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Mon, 20 Apr 2026 10:40:57 +0200 Subject: [PATCH 39/68] chore: add tests --- src/preloaded/query/dev_copycat_arweave.erl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 41748e9d0..0e90a51d9 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -2010,6 +2010,19 @@ standalone_item_id(_) -> not_found. %%% Tests +mempool_result_summary_filtered_test_parallel() -> + ?assertEqual(mempool_empty_summary(), mempool_result_summary(filtered)). + +normalize_sender_filter_binary_address_test_parallel() -> + Address = crypto:strong_rand_bytes(32), + ?assertEqual(hb_util:human_id(Address), normalize_sender_filter(Address)). + +mempool_tx_sender_matches_owner_address_test_parallel() -> + Address = crypto:strong_rand_bytes(32), + TX = #tx{ owner = <<1>>, owner_address = Address }, + ?assert(mempool_tx_sender_matches(TX, hb_util:human_id(Address))), + ?assertNot(mempool_tx_sender_matches(TX, hb_util:human_id(crypto:strong_rand_bytes(32)))). + index_ids_test_parallel() -> %% Test block: https://viewblock.io/arweave/block/1827942 %% Note: this block includes a data item with an Ethereum signature. 
This From 4b3076d265d4573ae466a52f1299a7126f2b33a1 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Mon, 20 Apr 2026 11:49:48 +0200 Subject: [PATCH 40/68] feat: add mempool sender filtered determinsitv test --- src/preloaded/query/dev_copycat_arweave.erl | 73 ++++++++++++++++++++- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 0e90a51d9..2125f1951 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -1666,7 +1666,7 @@ get_index_store(Opts) -> %% @doc Scan the mempool and index any accessible unconfirmed TXs. index_mempool(Request, Opts) -> SenderFilter = mempool_sender_filter(Request, Opts), - case dev_arweave:pending(#{}, #{}, Opts) of + case mempool_pending(#{}, #{}, Opts) of {ok, TXIDs} when is_list(TXIDs) -> mempool_progress( Opts, @@ -1690,6 +1690,14 @@ mempool_progress(Opts, Event) -> false -> ok end. +mempool_pending(Base, Request, Opts) -> + case hb_opts:get(arweave_pending_fun, undefined, Opts) of + Fun when is_function(Fun, 3) -> + Fun(Base, Request, Opts); + _ -> + dev_arweave:pending(Base, Request, Opts) + end. 
+ mempool_sender_filter(Request, Opts) -> case hb_maps:find(<<"sender">>, Request, Opts) of {ok, Sender} when is_binary(Sender) -> @@ -1800,11 +1808,25 @@ load_mempool_tx_header(TXID, Opts) -> Opts, {mempool_tx_header_fetch_started, {tx_id, {explicit, TXID}}} ), - case dev_arweave:pending( + case mempool_pending( #{}, #{ <<"pending">> => TXID }, Opts#{ exclude_data => true } ) of + {ok, TX} when is_record(TX, tx) -> + mempool_progress( + Opts, + {mempool_tx_header_fetch_finished, + {tx_id, {explicit, TXID}}} + ), + mempool_progress( + Opts, + {mempool_tx_convert_finished, + {tx_id, {explicit, TXID}}, + {data_size, TX#tx.data_size}, + {bundle, is_bundle_tx(TX, Opts)}} + ), + {ok, TX}; {ok, StructuredTX} -> mempool_progress( Opts, @@ -2023,6 +2045,53 @@ mempool_tx_sender_matches_owner_address_test_parallel() -> ?assert(mempool_tx_sender_matches(TX, hb_util:human_id(Address))), ?assertNot(mempool_tx_sender_matches(TX, hb_util:human_id(crypto:strong_rand_bytes(32)))). +mempool_sender_filter_indexes_matching_tx_test_parallel() -> + TestStore = hb_test_utils:test_store(), + IndexStore = #{ <<"index-store">> => [TestStore] }, + BaseOpts = #{ + store => [TestStore], + arweave_index_ids => true, + arweave_index_store => IndexStore + }, + ok = hb_store:reset([TestStore]), + ok = hb_store:start([TestStore]), + MatchTXID = hb_util:human_id(crypto:strong_rand_bytes(32)), + OtherTXID = hb_util:human_id(crypto:strong_rand_bytes(32)), + Sender = <<"FPjbN_btYKzcf8QASjs30v5C0FPv7XpwKXENBW8dqVw">>, + MatchTX = mempool_test_pending_tx(Sender), + OtherTX = mempool_test_pending_tx( + hb_util:human_id(crypto:strong_rand_bytes(32)) + ), + Opts = BaseOpts#{ + arweave_pending_fun => + fun(_, #{ <<"pending">> := PendingTXID }, _) + when PendingTXID =:= MatchTXID -> + {ok, MatchTX}; + (_, #{ <<"pending">> := PendingTXID }, _) + when PendingTXID =:= OtherTXID -> + {ok, OtherTX}; + (_, Request, _) when map_size(Request) =:= 0 -> + {ok, [MatchTXID, OtherTXID]} + end + }, + ?assertEqual( + {ok, 
(mempool_empty_summary())#{ indexed => 1, tx_offsets_written => 1 }}, + arweave( + #{}, + #{ <<"mode">> => <<"mempool">>, <<"sender">> => Sender }, + Opts + ) + ), + ?assert(is_tx_indexed(MatchTXID, Opts)), + ?assertNot(is_tx_indexed(OtherTXID, Opts)). + +mempool_test_pending_tx(Sender) -> + #tx{ + format = 2, + owner = <<1>>, + owner_address = Sender + }. + index_ids_test_parallel() -> %% Test block: https://viewblock.io/arweave/block/1827942 %% Note: this block includes a data item with an Ethereum signature. This From 9ec16f526a8f8ff6fa90cdaf2038f1680b9d271b Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Wed, 29 Apr 2026 21:17:49 +0200 Subject: [PATCH 41/68] fix: pending header recursion - rebase --- src/preloaded/arweave/dev_arweave.erl | 4 ++-- src/preloaded/query/dev_copycat_arweave.erl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index e3584e360..bdf784a3b 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -741,7 +741,7 @@ ceil_div(Numerator, Denominator) -> %% @doc Fetch the advertised data size for an unconfirmed transaction. 
pending_tx_data_size(TXID, Opts) -> - case pending(#{}, #{ <<"pending">> => TXID }, Opts#{ exclude_data => true }) of + case pending(#{}, #{ <<"pending">> => TXID, <<"exclude-data">> => true }, Opts) of {ok, JSON} -> {ok, hb_util:int(maps:get(<<"data_size">>, JSON))}; Error -> @@ -1084,7 +1084,7 @@ pending(Base, Request, Opts) -> <<"GET">>, <<"/unconfirmed_tx/", TXID/binary>>, Opts#{ - exclude_data => ExcludeData + <<"exclude-data">> => ExcludeData } ); {ok, _RawOffset} -> diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 2125f1951..2fe608e45 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -1810,8 +1810,8 @@ load_mempool_tx_header(TXID, Opts) -> ), case mempool_pending( #{}, - #{ <<"pending">> => TXID }, - Opts#{ exclude_data => true } + #{ <<"pending">> => TXID, <<"exclude-data">> => true }, + Opts ) of {ok, TX} when is_record(TX, tx) -> mempool_progress( From 95b420a71a238c43abaa5d2ee502d6549982d388 Mon Sep 17 00:00:00 2001 From: Rani Elhusseini Date: Wed, 29 Apr 2026 21:29:41 +0200 Subject: [PATCH 42/68] fix: mempool test --- src/preloaded/query/dev_copycat_arweave.erl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 2fe608e45..44a8e2144 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -2049,9 +2049,9 @@ mempool_sender_filter_indexes_matching_tx_test_parallel() -> TestStore = hb_test_utils:test_store(), IndexStore = #{ <<"index-store">> => [TestStore] }, BaseOpts = #{ - store => [TestStore], - arweave_index_ids => true, - arweave_index_store => IndexStore + <<"store">> => [TestStore], + <<"arweave-index-ids">> => true, + <<"arweave-index-store">> => IndexStore }, ok = hb_store:reset([TestStore]), ok = hb_store:start([TestStore]), @@ -2063,7 +2063,7 @@ 
mempool_sender_filter_indexes_matching_tx_test_parallel() -> hb_util:human_id(crypto:strong_rand_bytes(32)) ), Opts = BaseOpts#{ - arweave_pending_fun => + <<"arweave-pending-fun">> => fun(_, #{ <<"pending">> := PendingTXID }, _) when PendingTXID =:= MatchTXID -> {ok, MatchTX}; From 8313f2bcde8e606e35482ecdc2c6af088e4e36d6 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Tue, 5 May 2026 10:41:34 -0400 Subject: [PATCH 43/68] fix: tests in dev_copycat_arweave after edge rebase --- src/core/store/hb_store_arweave.erl | 10 +- src/hb_copycat_budget.erl | 2 +- src/preloaded/arweave/dev_arweave.erl | 4 +- src/preloaded/query/dev_copycat_arweave.erl | 135 +++++++++++--------- 4 files changed, 83 insertions(+), 68 deletions(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 15c17ad9f..9d91751c6 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -6,7 +6,7 @@ %%% Unused Store API: -export([resolve/3, write/3, link/3, group/3]). %%% Indexing API: --export([store_from_opts/1, write_offset/5, read_offset/3, read_parent/2, decode_parent_entries/1, read_chunks/3]). +-export([store_from_opts/1, write_offset/5, read_offset/3, read_parent/3, decode_parent_entries/1, read_chunks/3]). -include("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). @@ -16,7 +16,7 @@ %% for the `arweave_index_store' option, and if not found, searches the main %% `store' list for the first Arweave store with an index. store_from_opts(Opts) -> - case hb_opts:get(arweave_index_store, no_store, Opts) of + case hb_opts:get(<<"arweave-index-store">>, no_store, Opts) of no_store -> first_arweave_store(hb_opts:get(store, [], Opts)); IndexStoreOpts -> IndexStoreOpts end. @@ -94,10 +94,10 @@ read_offset(StoreOpts = #{ <<"index-store">> := IndexStore }, ID, Opts) -> read_offset(_, _, _) -> not_found. %% @doc Read the parent entries for an item from the index store. 
-read_parent(#{ <<"index-store">> := IndexStore }, ID) -> +read_parent(#{ <<"index-store">> := IndexStore }, ID, Opts) -> NormalizedID = hb_util:native_id(ID), ParentPath = <<"parent/", NormalizedID/binary>>, - case hb_store:read(IndexStore, ParentPath) of + case hb_store:read(IndexStore, ParentPath, Opts) of {ok, Bin} -> case decode_parent_entries(Bin) of {error, _} = Err -> Err; @@ -106,7 +106,7 @@ read_parent(#{ <<"index-store">> := IndexStore }, ID) -> _ -> not_found end; -read_parent(_, _) -> not_found. +read_parent(_, _, _) -> not_found. decode_parent_entries(<<>>) -> []; decode_parent_entries(<<0, Height:64/big-unsigned, Rest/binary>>) -> diff --git a/src/hb_copycat_budget.erl b/src/hb_copycat_budget.erl index c449814ac..8099967d8 100644 --- a/src/hb_copycat_budget.erl +++ b/src/hb_copycat_budget.erl @@ -198,4 +198,4 @@ concurrent_leases_test() -> ok. reset_to_default() -> - reset(hb_opts:get(copycat_memory_budget, 6 * 1024 * 1024 * 1024, #{})). + reset(hb_opts:get(<<"copycat_memory_budget">>, 6 * 1024 * 1024 * 1024, #{})). 
diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index bdf784a3b..d214aceca 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -992,7 +992,7 @@ parent(Base, Request, Opts) -> {error, not_found}; ID -> StoreOpts = hb_store_arweave:store_from_opts(Opts), - try hb_store_arweave:read_parent(StoreOpts, ID) of + try hb_store_arweave:read_parent(StoreOpts, ID, Opts) of {ok, [{Height, block} | _]} -> Entry = #{ <<"type">> => <<"block">>, @@ -1202,7 +1202,7 @@ to_message(Path = <<"/block/", _/binary>>, <<"GET">>, {ok, #{ <<"body">> := Body Opts ), CacheRes = - case hb_opts:get(arweave_index_blocks, true, Opts) of + case hb_opts:get(<<"arweave-index-blocks">>, true, Opts) of true -> dev_arweave_block_cache:write(Block, Opts); false -> skipped end, diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 44a8e2144..53a8a3818 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -88,16 +88,20 @@ arweave(_Base, Request, Opts) -> %% @doc Set bundles descendant recursion cap, avoids recursion %% in very nested bundles (very rare). set_depth_recursion_cap(Cap, Opts) when is_integer(Cap), Cap > 0 -> - Opts#{copycat_depth_recursion_cap => Cap}. + Opts#{<<"copycat_depth_recursion_cap">> => Cap}. %% @doc Get the set depth recursion cap from hb_opts. get_depth_recursion_cap(Opts) -> - hb_opts:get(copycat_depth_recursion_cap, undefined, Opts). + hb_opts:get(<<"copycat_depth_recursion_cap">>, undefined, Opts). %% @doc Return the effective per-TX memory cap, clamped to the global budget. %% Lazily initializes the budget pool on first call. 
effective_memory_cap(Opts) -> - Budget = hb_opts:get( - copycat_memory_budget, ?DEFAULT_COPYCAT_MEMORY_BUDGET, Opts), + Budget = + hb_opts:get( + <<"copycat_memory_budget">>, + ?DEFAULT_COPYCAT_MEMORY_BUDGET, + Opts + ), hb_copycat_budget:ensure_started(Budget), hb_copycat_budget:get_budget(). @@ -121,9 +125,9 @@ encode_parent_entry(ParentID, bundle) when byte_size(ParentID) =:= 32 -> <<1, ParentID:32/binary>>. %% @doc Write a parent entry for an item to the index store. -write_parent(ItemID, ParentData, Type, Store) -> +write_parent(ItemID, ParentData, Type, Store, Opts) -> Entry = encode_parent_entry(ParentData, Type), - hb_store:write(Store, parent_path(ItemID), Entry). + hb_store:write(Store, #{parent_path(ItemID) => Entry}, Opts). %% @doc Encode a list of 32-byte raw IDs into a single binary. encode_item_ids(IDs) -> @@ -206,8 +210,8 @@ write_block_item_ids(Height, AchievedDepth, ItemIDs, Opts) -> Bin = encode_item_ids(IDs), hb_store:write( Store, - block_items_path(Height, D), - Bin + #{block_items_path(Height, D) => Bin}, + Opts ) end, lists:seq(1, MaxStoredDepth) @@ -226,8 +230,8 @@ mark_block_indexed(Height, Depth, Opts) -> Store = get_index_store(Opts), hb_store:write( Store, - block_indexed_path(Height), - integer_to_binary(Depth) + #{block_indexed_path(Height) => integer_to_binary(Depth)}, + Opts ). %% @doc Read the persisted cutover height from the index store. @@ -243,7 +247,7 @@ ensure_cutover_height(Height, Opts) -> case read_cutover_height(Opts) of undefined -> Store = get_index_store(Opts), - hb_store:write(Store, ?CUTOVER_KEY, hb_util:bin(Height)), + hb_store:write(Store, #{?CUTOVER_KEY => hb_util:bin(Height)}, Opts), ?event(copycat_short, {marker_cutover_initialized, {height, Height}}); _ -> ok end. @@ -255,14 +259,14 @@ normalize_owner_id(Addr) -> %% @doc Adds an address to the owners aliases cache in Opts, mapping %% Alias -> native address for fast lookup and once per address computation. 
add_owner_alias(Addr, Alias, Opts) when is_binary(Alias) -> - ExistingAliases = hb_opts:get(owner_aliases, #{}, Opts), - Opts#{ owner_aliases => ExistingAliases#{ Alias => normalize_owner_id(Addr) }}; + ExistingAliases = hb_opts:get(<<"owner_aliases">>, #{}, Opts), + Opts#{ <<"owner_aliases">> => ExistingAliases#{ Alias => normalize_owner_id(Addr) }}; add_owner_alias(_Addr, Alias, _Opts) -> throw({invalid_owner_alias, Alias}). %% @doc Retrieve the address of a given alias. resolve_owner_alias(Alias, Opts) when is_binary(Alias) -> - Aliases = hb_opts:get(owner_aliases, #{}, Opts), + Aliases = hb_opts:get(<<"owner_aliases">>, #{}, Opts), case hb_maps:find(Alias, Aliases) of {ok, Addr} -> {ok, Addr}; error -> {error, {owner_alias_not_found, Alias}} @@ -721,7 +725,7 @@ fetch_blocks(Current, To, TargetDepth, Opts) -> fetch_blocks_ranged(Current, To, TargetDepth, BlockWorkers, Opts). block_workers(Opts) -> - max(1, hb_opts:get(arweave_block_workers, 3, Opts)). + max(1, hb_opts:get(<<"arweave_block_workers">>, 3, Opts)). %% @doc Process a known range of blocks in parallel batches. fetch_blocks_ranged(Current, To, TargetDepth, _Workers, _Opts) @@ -924,7 +928,7 @@ process_block(BlockRes, Current, To, TargetDepth, Opts) -> %% @doc Index the IDs of all transactions in the block if configured to do so. 
maybe_index_block(Block, TargetDepth, Opts) -> TotalTXs = length(hb_maps:get(<<"txs">>, Block, [], Opts)), - case hb_opts:get(arweave_index_ids, true, Opts) of + case hb_opts:get(<<"arweave-index-ids">>, true, Opts) of false -> {block_skipped, #{ items_count => 0, @@ -996,7 +1000,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, ) end), #{ <<"index-store">> := IndexStore } = ArweaveStore, - ok = write_parent(TX#tx.id, BlockHeight, block, IndexStore), + ok = write_parent(TX#tx.id, BlockHeight, block, IndexStore, Opts), try is_bundle_tx(TX, Opts) of false -> #{items_count => 0, bundle_count => 0, skipped_count => 0, @@ -1046,7 +1050,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, ItemStartOffset, Size ), - ok = write_parent(ItemID, TX#tx.id, bundle, IndexStore), + ok = write_parent(ItemID, TX#tx.id, bundle, IndexStore, Opts), {ItemStartOffset + Size, ItemsCountAcc + 1} end, {TXStartOffset + HeaderSize, 0}, @@ -1479,7 +1483,7 @@ index_full_bundle_items( ItemStartOffset, Size ), - ok = write_parent(ItemID, ParentID, bundle, IndexStore), + ok = write_parent(ItemID, ParentID, bundle, IndexStore, Opts), {DescendantCount, ItemAchievedDepth, ChildIDs} = case {Depth > 1, ParseResult} of {true, {ok, HeaderSize, ParsedItem}} -> @@ -1549,9 +1553,8 @@ validate_and_flag_item_id(ItemBinary, DeclaredID, EncodedDeclaredID, IndexStore) false -> ok = hb_store:write( IndexStore, - hb_store_arweave_offset:mismatch_path( - DeclaredID), - ComputedID + #{hb_store_arweave_offset:mismatch_path(DeclaredID) => ComputedID}, + #{} ), ?event(copycat_short, {item_id_mismatch, @@ -2240,25 +2243,27 @@ invalid_bundle_header_test_parallel() -> download_bundle_header(EndOffset, Size, Opts)), ok. 
-invalid_bundle_test_parallel() -> - {_TestStore, _StoreOpts, Opts} = setup_index_opts(), - Block = 1307606, - {ok, Block} = - hb_ao:resolve( - <<"~copycat@1.0/arweave&from=", (hb_util:bin(Block))/binary, "&to=", (hb_util:bin(Block))/binary>>, +invalid_bundle_test_parallel_() -> + {timeout, 60, fun() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + Block = 1307606, + {ok, Block} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", (hb_util:bin(Block))/binary, "&to=", (hb_util:bin(Block))/binary>>, + Opts + ), + assert_bundle_read( + <<"8S12ZqO6-_icGkeuH8mFq6x9q7OIoXOqFRGH5k-wshg">>, + [ + {<<"gintz-t6q_kdeP_IBQVGnp9fgFzs-pPGGehXW-V7ZRk">>, <<"1">>} + ], Opts ), - assert_bundle_read( - <<"8S12ZqO6-_icGkeuH8mFq6x9q7OIoXOqFRGH5k-wshg">>, - [ - {<<"gintz-t6q_kdeP_IBQVGnp9fgFzs-pPGGehXW-V7ZRk">>, <<"1">>} - ], - Opts - ), - % L1 TX with bundle tags, but data is not a valid bundle. The L1 TX - % should still be indexed. - assert_item_read(<<"cGNURX2IUt98VKVIeXSfYe6eulNwPEqijaQfvatzd_o">>, Opts), - ok. + % L1 TX with bundle tags, but data is not a valid bundle. The L1 TX + % should still be indexed. + assert_item_read(<<"cGNURX2IUt98VKVIeXSfYe6eulNwPEqijaQfvatzd_o">>, Opts), + ok + end}. block_with_large_integer_test_parallel() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), @@ -2558,7 +2563,7 @@ auto_stop_partial_index_test_parallel() -> TXIDs = hb_maps:get(<<"txs">>, BlockData, [], Opts), ?assert(length(TXIDs) > 0), [OneTXID | _] = TXIDs, - hb_store_arweave:write_offset(StoreOpts, OneTXID, <<"tx@1.0">>, 0, 0), + ok = hb_store_arweave:write_offset(StoreOpts, OneTXID, <<"tx@1.0">>, 0, 0), {ok, Block} = hb_ao:resolve( << @@ -2698,7 +2703,7 @@ negative_from_index_test_parallel() -> ?assertNot(has_any_indexed_tx(NextBlock + 1, Opts)), ok. 
-owner_alias_roundtrip_test() -> +owner_alias_roundtrip_test_parallel() -> Opts1 = add_owner_alias( <<"FPjbN7EVwP3XwQJx8qnKqJDYa4TLJ0Y8gu4AaiUuW1c">>, @@ -3087,7 +3092,10 @@ assert_bundle_read(BundleID, ExpectedItems, Opts) -> assert_item_read(ItemID, Opts) -> ?event(debug_test, {resolving, {explicit, ItemID}}), ReadResult = hb_store_arweave:read( - hb_store_arweave:store_from_opts(Opts), ItemID), + hb_store_arweave:store_from_opts(Opts), + #{<<"read">> => ItemID}, + Opts + ), ?assertMatch({ok, _}, ReadResult, ItemID), {ok, Item} = ReadResult, ?event(debug_test, {item, Item}), @@ -3097,8 +3105,11 @@ assert_item_read(ItemID, Opts) -> assert_item_not_read(ItemID, Opts) -> ReadResult = hb_store_arweave:read( - hb_store_arweave:store_from_opts(Opts), ItemID), - ?assertEqual(not_found, ReadResult), + hb_store_arweave:store_from_opts(Opts), + #{<<"read">> => ItemID}, + Opts + ), + ?assertEqual({error, not_found}, ReadResult), ok. has_any_indexed_tx(Height, Opts) -> @@ -3253,7 +3264,7 @@ no_mismatch_flags_on_valid_bundles_test() -> ItemID = hb_util:native_id( <<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>), ?assertEqual( - not_found, + {error, not_found}, hb_store:read( IndexStore, hb_store_arweave_offset:mismatch_path(ItemID), @@ -3309,7 +3320,7 @@ fabricated_mismatch_test() -> ), ?assertEqual(RealID, StoredActualID), ?assertEqual( - not_found, + {error, not_found}, hb_store:read( IndexStore, hb_store_arweave_offset:mismatch_path(RealID), @@ -3338,7 +3349,7 @@ block_item_ids_depth_2_test() -> ?assert(is_integer(Pos54K)), ?assert(is_integer(PosOBK)), ?assert(Pos54K < PosOBK), - ?assertEqual(not_found, hb_store:read(Store, block_items_path(1827942, 3), Opts)), + ?assertEqual({error, not_found}, hb_store:read(Store, block_items_path(1827942, 3), Opts)), ok. 
block_item_ids_depth_3_test() -> @@ -3414,12 +3425,12 @@ inventory_single_block_test() -> inventory_range_test() -> {_TestStore, StoreOpts, Opts} = setup_index_opts(), #{ <<"index-store">> := Store } = StoreOpts, - hb_store:write(Store, block_indexed_path(77777777), <<"2">>), - hb_store:write(Store, block_items_path(77777777, 1), <<0:256>>), - hb_store:write(Store, block_items_path(77777777, 2), <<>>), - hb_store:write(Store, block_indexed_path(77777778), <<"2">>), - hb_store:write(Store, block_items_path(77777778, 1), <<1:256>>), - hb_store:write(Store, block_items_path(77777778, 2), <<>>), + ok = hb_store:write(Store, #{block_indexed_path(77777777) => <<"2">>}, Opts), + ok = hb_store:write(Store, #{block_items_path(77777777, 1) => <<0:256>>}, Opts), + ok = hb_store:write(Store, #{block_items_path(77777777, 2) => <<>>}, Opts), + ok = hb_store:write(Store, #{block_indexed_path(77777778) => <<"2">>}, Opts), + ok = hb_store:write(Store, #{block_items_path(77777778, 1) => <<1:256>>}, Opts), + ok = hb_store:write(Store, #{block_items_path(77777778, 2) => <<>>}, Opts), {ok, InvResult} = inventory_index(77777778, 77777777, Opts), Body = hb_json:decode(hb_maps:get(<<"body">>, InvResult)), ?assert(maps:is_key(<<"77777777">>, Body)), @@ -3440,9 +3451,9 @@ corrupt_item_ids_read_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), #{ <<"index-store">> := Store } = hb_store_arweave:store_from_opts(Opts), Height = 99999999, - hb_store:write(Store, block_indexed_path(Height), <<"2">>), - hb_store:write(Store, block_items_path(Height, 1), <<0:256>>), - hb_store:write(Store, block_items_path(Height, 2), <<0:240>>), + ok = hb_store:write(Store, #{block_indexed_path(Height) => <<"2">>}, Opts), + ok = hb_store:write(Store, #{block_items_path(Height, 1) => <<0:256>>}, Opts), + ok = hb_store:write(Store, #{block_items_path(Height, 2) => <<0:240>>}, Opts), Counts = read_block_item_counts(Height, Opts), ?assertEqual(1, maps:get(<<"1">>, Counts)), ?assertEqual(<<"corrupt">>, 
maps:get(<<"2">>, Counts)), @@ -3466,7 +3477,11 @@ parent_not_found_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), StoreOpts2 = hb_store_arweave:store_from_opts(Opts), UnknownID = crypto:strong_rand_bytes(32), - ?assertEqual(not_found, hb_store_arweave:read_parent(StoreOpts2, UnknownID)), + ?assertEqual( + not_found, + hb_store_arweave:read_parent(StoreOpts2, UnknownID, Opts), + Opts + ), ok. parent_depth_2_test() -> @@ -3491,14 +3506,14 @@ parent_depth_2_test() -> BlockInfo = maps:get(hb_util:bin(Block), Body), L1Items = maps:get(<<"1">>, maps:get(<<"items">>, BlockInfo)), L1ID = hb_util:decode(hd(L1Items)), - {ok, [{Block, block}]} = hb_store_arweave:read_parent(StoreOpts2, L1ID), + {ok, [{Block, block}]} = hb_store_arweave:read_parent(StoreOpts2, L1ID, Opts), L2Items = maps:get(<<"2">>, maps:get(<<"items">>, BlockInfo)), case L2Items of [] -> ok; [FirstL2 | _] -> L2ID = hb_util:decode(FirstL2), {ok, [{L2Parent, bundle}]} = - hb_store_arweave:read_parent(StoreOpts2, L2ID), + hb_store_arweave:read_parent(StoreOpts2, L2ID, Opts), ?assert(lists:member( hb_util:encode(L2Parent), L1Items)) end, @@ -3529,7 +3544,7 @@ parent_depth_3_test() -> L2Items = maps:get(<<"2">>, maps:get(<<"items">>, BlockInfo)), L3ID = hb_util:decode(hd(L3Items)), {ok, [{L3Parent, bundle}]} = - hb_store_arweave:read_parent(StoreOpts2, L3ID), + hb_store_arweave:read_parent(StoreOpts2, L3ID, Opts), ?assert(lists:member(hb_util:encode(L3Parent), L2Items)), ok. 
From eba5cb23cc98b51a46da8ee5a189ee060b0aa24b Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Tue, 5 May 2026 14:51:32 -0400 Subject: [PATCH 44/68] fix: encode anchor based on arweave spec --- src/preloaded/query/dev_query_arweave.erl | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/preloaded/query/dev_query_arweave.erl b/src/preloaded/query/dev_query_arweave.erl index 03accc358..8525d6f4d 100644 --- a/src/preloaded/query/dev_query_arweave.erl +++ b/src/preloaded/query/dev_query_arweave.erl @@ -158,7 +158,7 @@ query(Msg, <<"recipient">>, _Args, Opts) -> query(Msg, <<"anchor">>, _Args, Opts) -> case find_field_key(<<"field-anchor">>, Msg, Opts) of {ok, null} -> {ok, <<"">>}; - {ok, Anchor} -> {ok, hb_util:human_id(Anchor)} + {ok, Anchor} -> encode_anchor(Anchor) end; query(Msg, <<"data">>, _Args, Opts) -> Data = @@ -184,6 +184,20 @@ query(Obj, Field, Args, _Opts) -> }), {ok, <<"Not implemented.">>}. +%% @doc Encode a transaction anchor (`last_tx`) for the GraphQL response. +%% Per the Arweave spec, an anchor is one of: +%% - empty (first TX from a wallet), +%% - a 32-byte raw TX ID (the wallet's last outgoing TX), or +%% - a 48-byte raw block hash (any of the last 50 blocks). +%% The cached value may already be base64url-encoded (43 / 64 chars). Other +%% sizes are not valid per the spec. +encode_anchor(<<>>) -> {ok, <<>>}; +encode_anchor(Bin) when is_binary(Bin), byte_size(Bin) == 32 -> {ok, hb_util:encode(Bin)}; +encode_anchor(Bin) when is_binary(Bin), byte_size(Bin) == 48 -> {ok, hb_util:encode(Bin)}; +encode_anchor(Bin) when is_binary(Bin), byte_size(Bin) == 43 -> {ok, Bin}; +encode_anchor(Bin) when is_binary(Bin), byte_size(Bin) == 64 -> {ok, Bin}; +encode_anchor(Other) -> {error, <<"invalid_anchor: ", Other/binary>>}. + %% @doc Find and return a value from the fields of a message (from its %% commitments). 
find_field_key(Field, Msg, Opts) -> From 72c2a8c8dc2864751cd8e9c25d218e2103f8b575 Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Tue, 5 May 2026 14:53:06 -0400 Subject: [PATCH 45/68] fix: no variable can return null or not_found rather than empty map --- src/preloaded/query/dev_query_graphql.erl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/preloaded/query/dev_query_graphql.erl b/src/preloaded/query/dev_query_graphql.erl index 25571a82b..7fda6ecdc 100644 --- a/src/preloaded/query/dev_query_graphql.erl +++ b/src/preloaded/query/dev_query_graphql.erl @@ -109,11 +109,11 @@ handle(_Base, RawReq, Opts) -> ?event({request, {processed, Req}}), Query = hb_maps:get(<<"query">>, Req, <<>>, Opts), OpName = hb_maps:get(<<"operationName">>, Req, undefined, Opts), - Vars = - hb_message:uncommitted_deep( - hb_maps:get(<<"variables">>, Req, #{}, Opts), - Opts - ), + Vars = + case hb_maps:get(<<"variables">>, Req, #{}, Opts) of + V when is_map(V) -> hb_message:uncommitted_deep(V, Opts); + _ -> #{} + end, ?event( {graphql_run_called, {query, Query}, From 34e3ff2d866d3ecc4216580ad7675e322a07e67b Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Tue, 5 May 2026 20:22:13 -0400 Subject: [PATCH 46/68] chore: clean the code --- src/preloaded/query/dev_copycat_arweave.erl | 122 ++++++++++---------- 1 file changed, 60 insertions(+), 62 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 53a8a3818..d2bb02e49 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -29,66 +29,34 @@ %% fetch blocks from the latest known block towards the Genesis block. 
arweave(_Base, Request, Opts) -> case hb_maps:get(<<"mode">>, Request, <<"write">>, Opts) of - <<"mempool">> -> - index_mempool(Request, Opts); - <<"write">> -> + <<"mempool">> -> index_mempool(Request, Opts); + <<"write">> -> case hb_maps:find(<<"id">>, Request, Opts) of - {ok, TXID} -> - case process_l1_request(TXID, Request, Opts) of - {ok, Stats} when is_map(Stats) -> - ?event( - copycat_short, - {arweave_tx_indexed, - {id, {explicit, TXID}}, - {items_indexed, maps:get(items_count, Stats, 0)}, - {bundle_txs, maps:get(bundle_count, Stats, 0)}, - {skipped_txs, maps:get(skipped_count, Stats, 0)} - } - ), - {ok, Stats#{ - <<"body">> => maps:get(items_count, Stats, 0) - }}; - _ -> - {ok, #{ - items_count => 0, - bundle_count => 0, - skipped_count => 0, - <<"body">> => 0 - }} - end; + {ok, TXID} -> index_explicit_tx(TXID, Request, Opts); error -> - case parse_range(Request, Opts) of - {error, unavailable} -> - {error, unavailable}; - {ok, {From, To}} -> - TargetDepth = request_depth( - Request, ?DEFAULT_BLOCK_DEPTH, Opts), - ?event(copycat_short, - {indexing_blocks, - {from, From}, {to, To}, - {depth, TargetDepth}} - ), - fetch_blocks(From, To, TargetDepth, Opts) - end - end; - <<"list">> -> - case parse_range(Request, Opts) of - {error, unavailable} -> {error, unavailable}; - {ok, {From, To}} -> list_index(From, To, Opts) - end; - <<"inventory">> -> - case parse_range(Request, Opts) of - {error, unavailable} -> {error, unavailable}; - {ok, {From, To}} -> inventory_index(From, To, Opts) + Depth = request_depth(Request, ?DEFAULT_BLOCK_DEPTH, Opts), + with_range( + Request, + Opts, + fun(F, T, O) -> fetch_blocks(F, T, Depth, O) end + ) end; + <<"list">> -> with_range(Request, Opts, fun list_index/3); + <<"inventory">> -> with_range(Request, Opts, fun inventory_index/3); + <<"headers">> -> with_range(Request, Opts, fun index_headers/3); Mode -> - {error, <<"Unsupported mode `", (hb_util:bin(Mode))/binary, - "`. 
Supported modes are: write, list, inventory">>} + { + error, + <<"Unsupported mode `", (hb_util:bin(Mode))/binary,"`. Supported", + "modes are: write, list, inventory, headers, mempool">> + } end. + %% @doc Set bundles descendant recursion cap, avoids recursion %% in very nested bundles (very rare). set_depth_recursion_cap(Cap, Opts) when is_integer(Cap), Cap > 0 -> Opts#{<<"copycat_depth_recursion_cap">> => Cap}. + %% @doc Get the set depth recursion cap from hb_opts. get_depth_recursion_cap(Opts) -> hb_opts:get(<<"copycat_depth_recursion_cap">>, undefined, Opts). @@ -513,6 +481,12 @@ parse_range(Request, Opts) -> {error, unavailable} end. +with_range(Request, Opts, Fun) -> + case parse_range(Request, Opts) of + {error, unavailable} -> {error, unavailable}; + {ok, {From, To}} -> Fun(From, To, Opts) + end. + normalize_height(Height, Opts) -> RequestedHeight = hb_util:int(Height), case RequestedHeight < 0 of @@ -702,27 +676,51 @@ classify_txs(TXIDs, Opts) -> TXIDs ). +%% @doc Index a single L1 TX by ID. Returns indexing stats (items, bundles, +%% skipped) on success, or zeroed stats on failure. +index_explicit_tx(TXID, Request, Opts) -> + case process_l1_request(TXID, Request, Opts) of + {ok, Stats} when is_map(Stats) -> + ?event(copycat_short, + {arweave_tx_indexed, + {id, {explicit, TXID}}, + {items_indexed, maps:get(items_count, Stats, 0)}, + {bundle_txs, maps:get(bundle_count, Stats, 0)}, + {skipped_txs, maps:get(skipped_count, Stats, 0)} + } + ), + {ok, Stats#{ <<"body">> => maps:get(items_count, Stats, 0) }}; + _ -> + {ok, #{ + items_count => 0, + bundle_count => 0, + skipped_count => 0, + <<"body">> => 0 + }} + end. + %% @doc Fetch blocks from an Arweave node while moving downward from `Current'. %% If `To' is provided, every block in [`To', `Current'] is processed. If `To' %% is omitted, stop at the first block already indexed at the requested depth %% (via block markers above cutover, or legacy per-TX check below cutover). 
-fetch_blocks(Current, To, TargetDepth, _Opts) - when is_integer(To), Current < To -> +fetch_blocks(From, To, Depth, Opts) -> + ?event(copycat_short, + {indexing_blocks, {from, From}, {to, To}, {depth, Depth}}), + do_fetch_blocks(From, To, Depth, Opts). + +do_fetch_blocks(Current, To, Depth, _Opts) when is_integer(To), Current < To -> ?event(copycat_short, {arweave_block_indexing_completed, - {reached_target, To}, - {target_depth, TargetDepth} + {reached_target, To}, {target_depth, Depth} } ), {ok, To}; -fetch_blocks(Current, undefined, _TargetDepth, _Opts) when Current < 0 -> +do_fetch_blocks(Current, undefined, _Depth, _Opts) when Current < 0 -> {ok, 0}; -fetch_blocks(Current, undefined, TargetDepth, Opts) -> - BlockWorkers = block_workers(Opts), - fetch_blocks_open_ended(Current, TargetDepth, BlockWorkers, Opts); -fetch_blocks(Current, To, TargetDepth, Opts) -> - BlockWorkers = block_workers(Opts), - fetch_blocks_ranged(Current, To, TargetDepth, BlockWorkers, Opts). +do_fetch_blocks(Current, undefined, Depth, Opts) -> + fetch_blocks_open_ended(Current, Depth, block_workers(Opts), Opts); +do_fetch_blocks(Current, To, Depth, Opts) -> + fetch_blocks_ranged(Current, To, Depth, block_workers(Opts), Opts). block_workers(Opts) -> max(1, hb_opts:get(<<"arweave_block_workers">>, 3, Opts)). 
From 32d1d93d3553373451f236cff4dbc4877ff349ce Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Wed, 6 May 2026 15:39:21 -0400 Subject: [PATCH 47/68] fix: breaking tests and some cleanup --- src/preloaded/query/dev_copycat_arweave.erl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index d2bb02e49..d4d3844bf 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -43,7 +43,7 @@ arweave(_Base, Request, Opts) -> end; <<"list">> -> with_range(Request, Opts, fun list_index/3); <<"inventory">> -> with_range(Request, Opts, fun inventory_index/3); - <<"headers">> -> with_range(Request, Opts, fun index_headers/3); + <<"headers">> -> with_range(Request, Opts, fun index_tx_headers/3); Mode -> { error, @@ -950,7 +950,8 @@ maybe_index_block(Block, TargetDepth, Opts) -> {ok, TXs} -> Height = hb_maps:get(<<"height">>, Block, 0, Opts), L1IDs = [TX#tx.id || TX <- TXs], - TXsWithData = ar_block:generate_size_tagged_list_from_txs(TXs, Height), + TXsWithData = + ar_block:generate_size_tagged_list_from_txs(TXs, Height), ValidTXs = lists:filter( fun({{padding, _}, _}) -> false; (_) -> true end, TXsWithData @@ -1153,7 +1154,7 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> {ok, #{ <<"codec-device">> := <<"tx@1.0">>, - <<"start-offset">> := StartOffset, + <<"offset">> := StartOffset, <<"length">> := Length }} ?= observe_copycat_l1_stage( From 0fe11dee6a042b88de9f6a8e0b2529966f331f87 Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Wed, 6 May 2026 15:46:24 -0400 Subject: [PATCH 48/68] feat(wip): store tx-headers in local-store to access via graphql --- src/preloaded/query/dev_copycat_arweave.erl | 99 +++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index d4d3844bf..a3c441598 100644 --- 
a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -650,6 +650,77 @@ inventory_local(Current, To, Opts, Acc) -> Acc#{BlockKey => BlockInfo}) end. +%% @doc mode=headers: walk every confirmed item recorded under the indexed +%% blocks in [From..To], read the full message via the normal cache path, +%% and write it into the local store via `hb_cache:write'. Top-level tag +%% fields and commitment fields land path-keyed under `/...' so any +%% future filter (tags, owners, recipients, target, anchor, ...) is naturally +%% answerable by `hb_cache:match'. +%% +%% Items already laid out under local-store are skipped, so reruns are +%% idempotent and serve as the natural retry mechanism for previously failed +%% ids. +index_tx_headers(From, undefined, Opts) -> + index_tx_headers(From, 0, Opts); +index_tx_headers(From, To, _Opts) when From < To -> + {ok, {From, To}}; +index_tx_headers(From, To, Opts) -> + Candidates = collect_header_candidates(From, To, Opts, []), + Workers = hb_opts:get(<<"copycat-headers-workers">>, 8, Opts), + ?event(copycat_short, + {headers_scan_started, + {from, From}, {to, To}, + {candidates, length(Candidates)}, + {workers, Workers} + } + ), + hb_pmap:parallel_map( + Candidates, + fun(ID) -> + case index_headers(ID, Opts) of + skipped -> ?event(copycat_short, {header_skipped, {id, ID}}); + {ok, _} -> ?event(copycat_short, {header_indexed, {id, ID}}); + {error, Reason} -> + ?event(copycat_short, + {header_index_crash, + {id, ID}, {reason, Reason} + } + ) + end + end, + Workers + ), + ?event(copycat_short, {headers_scan_completed, {from, From}, {to, To}}), + {ok, {From, To}}. + +%% @doc Walk indexed blocks from `Current' down to `To', collecting all +%% confirmed item IDs across every depth. 
+collect_header_candidates(Current, To, _Opts, Acc) when Current < To -> Acc; +collect_header_candidates(Current, To, Opts, Acc) -> + case read_block_marker_depth(Current, Opts) of + undefined -> + collect_header_candidates(Current - 1, To, Opts, Acc); + _Depth -> + Candidates = + lists:append(maps:values(read_block_item_ids(Current, Opts))), + collect_header_candidates(Current - 1, To, Opts, Candidates ++ Acc) + end. + +%% @doc If the ID is not already in the local store, read it from the arweave +%% store and write it into the local store. +index_headers(ID, Opts) -> + LocalOpts = hb_store:scope(Opts, local), + try + case hb_cache:read(ID, LocalOpts) of + {ok, _} -> skipped; + _ -> + {ok, Msg} = hb_cache:read(ID, Opts), + hb_cache:write(Msg, LocalOpts) + end + catch _:Reason -> + {error, Reason} + end. + fetch_block_header(Height, Opts) -> ?event(debug_copycat, {fetching_block, Height}), observe_event(<<"block_header">>, fun() -> @@ -3649,6 +3720,34 @@ parent_endpoint_not_found_test() -> ), ok. +strip_preserves_verify_test_parallel() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + {ok, 1827942} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=1827942&to=1827942&mode=write&depth=3">>, + Opts + ), + L1ID = <<"bXEgFm4K2b5VD64skBNAlS3I__4qxlM3Sm4Z5IXj3h8">>, + L2ID = <<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, + L3ID = <<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, + lists:foreach( + fun(ID) -> + index_headers(ID, Opts), + {ok, HeaderMsg} = hb_cache:read(ID, hb_store:scope(Opts, local)), + ?event( + {verify_msg, + {id, ID}, + {unsigned, hb_message:id(HeaderMsg, unsigned, Opts)}, + {signed, hb_message:id(HeaderMsg, signed, Opts)}, + {all, hb_message:id(HeaderMsg, all, Opts)} + } + ), + ?assert(hb_message:verify(HeaderMsg, all, Opts), {verify_failed, ID}), + ?assertEqual(ID, hb_message:id(HeaderMsg, signed, Opts)) + end, + [L1ID, L3ID, L2ID] + ). + index_of(Elem, List) -> index_of(Elem, List, 1). 
index_of(_Elem, [], _N) -> not_found; From 0ba5634661fc55215281f4828f1059361d5d14cb Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Wed, 6 May 2026 16:22:07 -0400 Subject: [PATCH 49/68] chore: fix a test --- src/core/store/hb_store_arweave.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 9d91751c6..15b3a41b4 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -457,7 +457,7 @@ load_item_deserialize_throws_test() -> ProbeOffset = 376836336327208, Size = 4096, ok = write_offset(Opts, FakeID, <<"ans104@1.0">>, ProbeOffset - 1, Size), - ?assertMatch({error, _}, read(Opts, FakeID, #{})). + ?assertMatch({error, _}, read(Opts, #{ <<"read">> => FakeID }, #{})). root_offset_confirmed_parent_test() -> Store = [hb_test_utils:test_store()], From b0fc6bdb2f11c9bdb82db5522b6965fe6ba35cce Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Fri, 8 May 2026 09:47:40 -0400 Subject: [PATCH 50/68] feat: added support for fee in graphql --- src/preloaded/query/dev_query_arweave.erl | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/preloaded/query/dev_query_arweave.erl b/src/preloaded/query/dev_query_arweave.erl index 8525d6f4d..8cac1e7f3 100644 --- a/src/preloaded/query/dev_query_arweave.erl +++ b/src/preloaded/query/dev_query_arweave.erl @@ -145,11 +145,16 @@ query(#{ <<"key">> := Key }, <<"key">>, _Args, _Opts) -> query(#{ <<"address">> := Address }, <<"address">>, _Args, _Opts) -> {ok, Address}; query(Msg, <<"fee">>, _Args, Opts) -> - {ok, hb_maps:get(<<"fee">>, Msg, 0, Opts)}; + case find_field_key(<<"field-reward">>, Msg, Opts) of + {ok, null} -> {ok, 0}; + {ok, Reward} -> hb_util:safe_int(Reward) + end; query(Msg, <<"quantity">>, _Args, Opts) -> {ok, hb_maps:get(<<"quantity">>, Msg, 0, Opts)}; query(Number, <<"winston">>, _Args, _Opts) when is_number(Number) -> {ok, Number}; +query(Number, <<"ar">>, 
_Args, _Opts) when is_number(Number) -> + {ok, winston_to_ar(Number)}; query(Msg, <<"recipient">>, _Args, Opts) -> case find_field_key(<<"field-target">>, Msg, Opts) of {ok, null} -> {ok, <<"">>}; @@ -650,3 +655,13 @@ explicit_ids(Args, Opts) -> _ -> [] end ). + +winston_to_ar(W) when is_integer(W), W >= 0 -> + case {W div 1000000000000, W rem 1000000000000} of + {Whole, 0} -> + hb_util:bin(io_lib:format("~B", [Whole])); + {Whole, Frac} -> + Padded = io_lib:format("~12..0B", [Frac]), + Trimmed = string:trim(Padded, trailing, "0"), + hb_util:bin(io_lib:format("~B.~s", [Whole, Trimmed])) + end. \ No newline at end of file From b2d8cea8d0d8d822a6bdd1283335cd2a04ac296c Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Fri, 8 May 2026 09:49:10 -0400 Subject: [PATCH 51/68] fix: avoid rest of the filter also failing if first filter failed --- src/preloaded/query/dev_query_arweave.erl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/preloaded/query/dev_query_arweave.erl b/src/preloaded/query/dev_query_arweave.erl index 8cac1e7f3..5eb1714f7 100644 --- a/src/preloaded/query/dev_query_arweave.erl +++ b/src/preloaded/query/dev_query_arweave.erl @@ -455,6 +455,7 @@ match_args([{Field, X} | Rest], Acc, Opts) -> ?event({match, {field, Field}, {arg, X}}), case match(Field, X, Opts) of {ok, Result} -> match_args(Rest, [Result | Acc], Opts); + not_found -> match_args(Rest, [[] | Acc], Opts); _Error -> match_args(Rest, Acc, Opts) end. 
From b66df6b3c8f9110a6111736c792c2c04690e7a6f Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 8 May 2026 19:17:25 +0100 Subject: [PATCH 52/68] impr: Moving some functions to hb_store_arweave from dev_copycat_arweave, tests fixed on hb_store_arweave --- src/core/resolver/hb_opts.erl | 1 + src/core/store/hb_store_arweave.erl | 338 ++++++++++++++-- src/hb_copycat_budget.erl | 2 +- src/preloaded/arweave/dev_arweave.erl | 3 +- src/preloaded/query/dev_copycat_arweave.erl | 426 +++++--------------- 5 files changed, 410 insertions(+), 360 deletions(-) diff --git a/src/core/resolver/hb_opts.erl b/src/core/resolver/hb_opts.erl index 6b9ee9633..286355714 100644 --- a/src/core/resolver/hb_opts.erl +++ b/src/core/resolver/hb_opts.erl @@ -288,6 +288,7 @@ raw_default_message() -> <<"copycat-memory-budget">> => 6 * 1024 * 1024 * 1024, <<"copycat-depth-recursion-cap">> => 6, % 2x the deepest we've seen to date <<"arweave-block-workers">> => 3, + <<"copycat-scope">> => ["offset", "parent"], % Dev options <<"mode">> => debug, <<"profiling">> => true, diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 15b3a41b4..17680e858 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -6,11 +6,20 @@ %%% Unused Store API: -export([resolve/3, write/3, link/3, group/3]). %%% Indexing API: --export([store_from_opts/1, write_offset/5, read_offset/3, read_parent/3, decode_parent_entries/1, read_chunks/3]). +-export([store_from_opts/1, write_offset/6, write_parent/5, read_offset/3, read_parent/3, decode_parent_entries/1, read_chunks/3]). +-export([block_indexed_path/1, block_items_path/2]). +-export([read_block_item_counts/2, read_block_item_ids/2]). +-export([ensure_cutover_height/2, read_cutover_height/1, is_tx_indexed/2 ]). +-export([write_block_item_ids/4, read_block_marker_depth/2]). +-export([decode_item_ids/1, is_block_indexed/3, is_post_cutover/2, mark_block_indexed/3 ]). +-export([root_offset/2]). 
-include("include/hb.hrl"). -include_lib("eunit/include/eunit.hrl"). -define(PARTITION_SIZE, 3_600_000_000_000). +-define(SCOPE_PARENT, <<"parent">>). +-define(SCOPE_OFFSET, <<"offset">>). +-define(CUTOVER_KEY, <<"block/marker-cutover-height">>). %% @doc Find the first Arweave store from the given node message. Searches first %% for the `arweave_index_store' option, and if not found, searches the main @@ -122,6 +131,28 @@ decode_parent_entries(<<1, ParentID:32/binary, Rest/binary>>) -> decode_parent_entries(_Corrupt) -> {error, corrupt_parent_data}. + +%% @doc Return the store path for a parent index entry. +parent_path(ItemID) when byte_size(ItemID) =:= 32 -> + <<"parent/", ItemID/binary>>. + +%% @doc Encode a parent entry for storage. +encode_parent_entry(Height, block) when is_integer(Height) -> + <<0, Height:64/big-unsigned>>; +encode_parent_entry(ParentID, bundle) when byte_size(ParentID) =:= 32 -> + <<1, ParentID:32/binary>>. + +%% Block Information Index + +%% @doc Return the store path for a block completion marker. +block_indexed_path(Height) -> + <<"block/", (hb_util:bin(Height))/binary, "/depth">>. + +%% @doc Return the store path for a per-block item index at a given depth. +block_items_path(Height, Depth) -> + <<"block/", (hb_util:bin(Height))/binary, + "/items/", (hb_util:bin(Depth))/binary>>. + %% @doc Read the data at the given key, reading the `local-store' first if %% available. read(StoreOpts, #{ <<"read">> := ID }, _NodeOpts) when ?IS_ID(ID) -> @@ -301,31 +332,210 @@ read_chunks(Offset, Length, Opts) -> Opts ). +%% @doc Write a parent entry for an item to the index store. +write_parent(ItemID, ParentData, Type, Store, Opts) -> + case + lists:member( + ?SCOPE_PARENT, + hb_maps:get(<<"copycat-scope">>, Opts, [?SCOPE_PARENT]) + ) of + true -> + Entry = encode_parent_entry(ParentData, Type), + hb_store:write(Store, #{parent_path(ItemID) => Entry}, Opts); + false -> + ok + end. + %% @doc Write offset information to the index store. 
write_offset( - StoreOpts = #{ <<"index-store">> := IndexStore }, + #{ <<"index-store">> := IndexStore }, ID, CodecName, StartOffset, - Length + Length, + Opts ) -> - Value = hb_store_arweave_offset:encode(CodecName, StartOffset, Length), - ?event( - debug_store_arweave, - {writing_offset, - {id, {explicit, ID}}, - {type, CodecName}, - {start_offset, StartOffset}, - {length, Length}, - {value, {explicit, Value}} - } + case + lists:member( + ?SCOPE_OFFSET, + hb_maps:get(<<"copycat-scope">>, Opts, [?SCOPE_OFFSET]) + ) of + true -> + Value = hb_store_arweave_offset:encode(CodecName, StartOffset, Length), + ?event( + debug_store_arweave, + {writing_offset, + {id, {explicit, ID}}, + {type, CodecName}, + {start_offset, StartOffset}, + {length, Length}, + {value, {explicit, Value}} + } + ), + hb_store:write( + IndexStore, + #{ hb_store_arweave_offset:path(ID) => Value }, + Opts + ); + false -> + ok + end. + +%% @doc Probe item entries upward from depth 1, applying TransformFun to each. +probe_block_items(Height, Opts, TransformFun) -> + case store_from_opts(Opts) of + no_store -> + ?event(debug_store_arweave, {probe_block_items, no_store}), + #{}; + #{ <<"index-store">> := Store } -> + probe_block_items(Height, Store, 1, #{}, TransformFun, Opts) + end. + +probe_block_items(Height, Store, Depth, Acc, TransformFun, Opts) -> + case hb_store:read(Store, block_items_path(Height, Depth), Opts) of + {ok, Bin} -> + Key = hb_util:bin(Depth), + probe_block_items( + Height, Store, Depth + 1, + Acc#{Key => TransformFun(Bin)}, TransformFun, Opts); + {error, not_found} -> + Acc + end. + +count_ids(Bin) when byte_size(Bin) rem 32 =:= 0 -> + byte_size(Bin) div 32; +count_ids(_) -> <<"corrupt">>. + +decode_and_encode_ids(Bin) -> + case decode_item_ids(Bin) of + {error, _} -> <<"corrupt">>; + List -> [hb_util:encode(ID) || ID <- List] + end. + +read_block_item_counts(Height, Opts) -> + probe_block_items(Height, Opts, fun count_ids/1).
+ +read_block_item_ids(Height, Opts) -> + probe_block_items(Height, Opts, fun decode_and_encode_ids/1). + +%% @doc Write per-depth item ID lists for a block. +%% Writes an entry for every depth from 1 through AchievedDepth (empty if +%% no items at that level), plus any partial depths beyond AchievedDepth +%% that were collected during indexing. +write_block_item_ids(Height, AchievedDepth, ItemIDs, Opts) -> + Store = get_index_store(Opts), + MaxStoredDepth = case maps:keys(ItemIDs) of + [] -> AchievedDepth; + Keys -> max(AchievedDepth, lists:max(Keys)) + end, + Results = lists:map( + fun(D) -> + IDs = maps:get(D, ItemIDs, []), + Bin = encode_item_ids(IDs), + hb_store:write( + Store, + #{block_items_path(Height, D) => Bin}, + Opts + ) + end, + lists:seq(1, MaxStoredDepth) ), + case lists:all(fun(R) -> R =:= ok end, Results) of + true -> ok; + false -> + ?event(copycat_short, + {block_item_ids_write_failed, + {height, Height}}), + {error, item_ids_write_failed} + end. + +%% @doc Encode a list of 32-byte raw IDs into a single binary. +encode_item_ids(IDs) -> + << <<ID:32/binary>> || ID <- IDs >>. + +%% @doc Decode a binary of concatenated 32-byte IDs into a list. +%% Rejects binaries whose size is not a multiple of 32. +decode_item_ids(<<>>) -> []; +decode_item_ids(Bin) when byte_size(Bin) rem 32 =/= 0 -> + {error, invalid_item_ids_binary}; +decode_item_ids(Bin) -> + decode_item_ids_acc(Bin, []). + +decode_item_ids_acc(<<>>, Acc) -> lists:reverse(Acc); +decode_item_ids_acc(<<ID:32/binary, Rest/binary>>, Acc) -> + decode_item_ids_acc(Rest, [ID | Acc]). + +%% @doc Read the stored marker depth for a block, or undefined if none. +read_block_marker_depth(Height, Opts) -> + case store_from_opts(Opts) of + no_store -> undefined; + #{ <<"index-store">> := Store } -> + case hb_store:read(Store, block_indexed_path(Height), Opts) of + {ok, Bin} -> + try binary_to_integer(Bin) + catch _:_ -> undefined + end; + {error, not_found} -> undefined + end + end.
+ +%% @doc Check if a block has been indexed at the given depth or deeper. +is_block_indexed(undefined, _TargetDepth, _Opts) -> + false; +is_block_indexed(Height, TargetDepth, Opts) -> + case read_block_marker_depth(Height, Opts) of + undefined -> false; + StoredDepth -> StoredDepth >= TargetDepth + end. + +%% @doc Write a block completion marker with the achieved depth. +mark_block_indexed(Height, Depth, Opts) -> + Store = get_index_store(Opts), hb_store:write( - IndexStore, - #{ hb_store_arweave_offset:path(ID) => Value }, - StoreOpts + Store, + #{block_indexed_path(Height) => integer_to_binary(Depth)}, + Opts ). +%% @doc Read the persisted cutover height from the index store. +read_cutover_height(Opts) -> + Store = get_index_store(Opts), + case hb_store:read(Store, ?CUTOVER_KEY, Opts) of + {ok, Bin} -> hb_util:int(Bin); + {error, not_found} -> undefined + end. + +%% @doc Write the cutover height if not already set. +ensure_cutover_height(Height, Opts) -> + case read_cutover_height(Opts) of + undefined -> + Store = get_index_store(Opts), + hb_store:write(Store, #{?CUTOVER_KEY => hb_util:bin(Height)}, Opts), + ?event(copycat_short, {marker_cutover_initialized, {height, Height}}); + _ -> ok + end. + +%% @doc Check if a transaction ID is indexed in the arweave index store. +is_tx_indexed(TXID, Opts) -> + Store = get_index_store(Opts), + case hb_store:read(Store, hb_store_arweave_offset:path(TXID), Opts) of + {ok, _} -> true; + {error, not_found} -> false + end. + +is_post_cutover(undefined, _Opts) -> false; +is_post_cutover(Height, Opts) -> + case read_cutover_height(Opts) of + undefined -> false; + Cutover -> Height >= Cutover + end. + +get_index_store(Opts) -> + case store_from_opts(Opts) of + #{ <<"index-store">> := Store } -> Store; + _ -> throw(no_index_store_available) + end. + %% @doc Record the partition that data is found in when it is requested. 
record_partition_metric(Offset, Result, StoreOpts) when is_integer(Offset) -> case hb_opts:get(prometheus, not hb_features:test(), StoreOpts) of @@ -377,17 +587,24 @@ init_prometheus() -> %%% Tests +setup_test_store() -> + IndexStore = [hb_test_utils:test_store()], + ArweaveStore = + #{ + <<"store-module">> => hb_store_arweave, + <<"index-store">> => IndexStore + }, + Opts = #{<<"store">> => [ArweaveStore]}, + {IndexStore, ArweaveStore, Opts}. + write_read_tx_test() -> - Store = [hb_test_utils:test_store()], - Opts = #{ - <<"index-store">> => Store - }, + {_, ArweaveStoreOpts, Opts} = setup_test_store(), ID = <<"bndIwac23-s0K11TLC1N7z472sLGAkiOdhds87ZywoE">>, EndOffset = 363524457284025, Size = 8387, StartOffset = EndOffset - Size, - ok = write_offset(Opts, ID, <<"tx@1.0">>, StartOffset, Size), - {ok, Bundle} = read(Opts, #{ <<"read">> => ID }, Opts), + ok = write_offset(ArweaveStoreOpts, ID, <<"tx@1.0">>, StartOffset, Size, Opts), + {ok, Bundle} = read(ArweaveStoreOpts, #{ <<"read">> => ID }, Opts), ?assert(hb_message:verify(Bundle, all, #{})), {ok, Child} = hb_ao:resolve( @@ -421,27 +638,23 @@ write_read_tx_test() -> %% @doc Stale ANS-104 offset: fake ID pointing to a known bundle TX's %% data range. The deserialized item's ID won't match the fake ID. stale_ans104_offset_returns_error_test() -> - Store = [hb_test_utils:test_store()], - Opts = #{<<"index-store">> => Store}, + {_, ArweaveStoreOpts, Opts} = setup_test_store(), FakeID = <<"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA">>, RealEndOffset = 363524457284025, RealSize = 8387, RealStartOffset = RealEndOffset - RealSize, - ok = write_offset(Opts, FakeID, <<"ans104@1.0">>, RealStartOffset, RealSize), - Result = read(Opts, #{ <<"read">> => FakeID }, Opts), + ok = write_offset(ArweaveStoreOpts, FakeID, <<"ans104@1.0">>, RealStartOffset, RealSize, Opts), + Result = read(ArweaveStoreOpts, #{ <<"read">> => FakeID }, Opts), ?assertMatch({error, {id_mismatch, _, _}}, Result). 
%% @doc The L1 TX has bundle tags, but data is not a valid bundle. write_read_fake_bundle_tx_test() -> - Store = [hb_test_utils:test_store()], - Opts = #{ - <<"index-store">> => Store - }, + {_, ArweaveStoreOpts, Opts} = setup_test_store(), ID = <<"cGNURX2IUt98VKVIeXSfYe6eulNwPEqijaQfvatzd_o">>, Size = 2, StartOffset = 155309918167286, - ok = write_offset(Opts, ID, <<"tx@1.0">>, StartOffset, Size), - {ok, TX} = read(Opts, #{ <<"read">> => ID }, Opts), + ok = write_offset(ArweaveStoreOpts, ID, <<"tx@1.0">>, StartOffset, Size, Opts), + {ok, TX} = read(ArweaveStoreOpts, #{ <<"read">> => ID }, Opts), ?assert(hb_message:verify(TX, all, #{})), ok. @@ -449,25 +662,68 @@ write_read_fake_bundle_tx_test() -> %% so ar_bundles:deserialize/1 throws. The catch in load_item/4 must convert %% that throw into {error, _} rather than crashing. load_item_deserialize_throws_test() -> - Store = [hb_test_utils:test_store()], - Opts = #{<<"index-store">> => Store}, + {_, ArweaveStoreOpts, Opts} = setup_test_store(), FakeID = <<"BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB">>, %% Same interior offset used in dev_arweave bundle_header_garbage_guard test: %% the bytes at ProbeOffset are mid-TX application data, not an ANS-104 header. ProbeOffset = 376836336327208, Size = 4096, - ok = write_offset(Opts, FakeID, <<"ans104@1.0">>, ProbeOffset - 1, Size), - ?assertMatch({error, _}, read(Opts, #{ <<"read">> => FakeID }, #{})). + ok = write_offset(ArweaveStoreOpts, FakeID, <<"ans104@1.0">>, ProbeOffset - 1, Size, Opts), + ?assertMatch({error, _}, read(ArweaveStoreOpts, #{ <<"read">> => FakeID }, Opts)).
root_offset_confirmed_parent_test() -> - Store = [hb_test_utils:test_store()], - Opts = #{ <<"index-store">> => Store }, + {_, ArweaveStoreOpts, Opts} = setup_test_store(), ParentID = <<"bndIwac23-s0K11TLC1N7z472sLGAkiOdhds87ZywoE">>, - ok = write_offset(Opts, ParentID, <<"tx@1.0">>, 12345, 99), + ok = write_offset(ArweaveStoreOpts, ParentID, <<"tx@1.0">>, 12345, 99, Opts), ?assertEqual( 12352, root_offset( #{ <<"relative">> => ParentID, <<"offset">> => 7 }, - Opts + ArweaveStoreOpts ) - ). \ No newline at end of file + ). + +corrupt_item_ids_read_test() -> + {IndexStore, _StoreOpts, Opts} = setup_test_store(), + Height = 99999999, + ok = hb_store:write(IndexStore, #{block_indexed_path(Height) => <<"2">>}, Opts), + ok = hb_store:write(IndexStore, #{block_items_path(Height, 1) => <<0:256>>}, Opts), + ok = hb_store:write(IndexStore, #{block_items_path(Height, 2) => <<0:240>>}, Opts), + Counts = read_block_item_counts(Height, Opts), + erlang:display({counts, Counts}), + ?assertEqual(1, maps:get(<<"1">>, Counts)), + ?assertEqual(<<"corrupt">>, maps:get(<<"2">>, Counts)), + IDs = read_block_item_ids(Height, Opts), + ?assertEqual(1, length(maps:get(<<"1">>, IDs))), + ?assertEqual(<<"corrupt">>, maps:get(<<"2">>, IDs)), + ok. + +parent_encode_decode_test() -> + BlockEntry = encode_parent_entry(12345, block), + ?assertEqual(<<0, 12345:64/big-unsigned>>, BlockEntry), + BundleID = crypto:strong_rand_bytes(32), + BundleEntry = encode_parent_entry(BundleID, bundle), + ?assertEqual(<<1, BundleID:32/binary>>, BundleEntry), + Combined = <<BlockEntry/binary, BundleEntry/binary>>, + Decoded = decode_parent_entries(Combined), + ?assertEqual([{12345, block}, {BundleID, bundle}], Decoded), + ok. + +parent_not_found_test() -> + {_IndexStore, ArweaveStoreOpts, Opts} = setup_test_store(), + UnknownID = crypto:strong_rand_bytes(32), + ?assertEqual( + not_found, + hb_store_arweave:read_parent(ArweaveStoreOpts, UnknownID, Opts), + Opts + ), + ok.
+ +decode_item_ids_validation_test() -> + ?assertEqual([], decode_item_ids(<<>>)), + GoodBin = <<0:256, 1:256>>, + ?assertEqual(2, length(decode_item_ids(GoodBin))), + BadBin = <<0:240>>, + ?assertEqual({error, invalid_item_ids_binary}, decode_item_ids(BadBin)), + ok. + diff --git a/src/hb_copycat_budget.erl b/src/hb_copycat_budget.erl index 8099967d8..3aabf0184 100644 --- a/src/hb_copycat_budget.erl +++ b/src/hb_copycat_budget.erl @@ -198,4 +198,4 @@ concurrent_leases_test() -> ok. reset_to_default() -> - reset(hb_opts:get(<<"copycat_memory_budget">>, 6 * 1024 * 1024 * 1024, #{})). + reset(hb_opts:get(<<"copycat-memory-budget">>, 6 * 1024 * 1024 * 1024, #{})). diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index d214aceca..cae0c4884 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -1662,7 +1662,8 @@ index_test_tx(TXID, IndexStore, Opts) -> TXID, <<"tx@1.0">>, StartOffset, - Size + Size, + Opts ), ?assertMatch({ok, _}, hb_store_arweave:read_offset(IndexStore, TXID, Opts)), ok. diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index a3c441598..a5149f8d8 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -12,14 +12,11 @@ -include_lib("eunit/include/eunit.hrl"). -define(ARWEAVE_DEVICE, <<"~arweave@2.9">>). --define(CUTOVER_KEY, <<"block/marker-cutover-height">>). -define(DEPTH_SENTINEL, 99999). -% By default we'll index blocks to depth 2 which is: -% - depth 1: L1 TXs -% - depth 2: L2 bundles and dataitems -% Note: this means that the children of L2 bundles are not indexed at -% depth 2. --define(DEFAULT_BLOCK_DEPTH, 2). +%% `full` uses the copycat-depth-recursion-cap option +%% as a safe depth to go to. This can be changed to an +%% integer value. +-define(DEFAULT_BLOCK_DEPTH, <<"full">>). -define(DEFAULT_COPYCAT_MEMORY_BUDGET, 6 * 1024 * 1024 * 1024). 
% GET /~cron@1.0/once&cron-path=~copycat@1.0/arweave @@ -55,64 +52,24 @@ arweave(_Base, Request, Opts) -> %% @doc Set bundles descendant recursion cap, avoids recursion %% in very nested bundles (very rare). set_depth_recursion_cap(Cap, Opts) when is_integer(Cap), Cap > 0 -> - Opts#{<<"copycat_depth_recursion_cap">> => Cap}. + Opts#{<<"copycat-depth-recursion-cap">> => Cap}. %% @doc Get the set depth recursion cap from hb_opts. get_depth_recursion_cap(Opts) -> - hb_opts:get(<<"copycat_depth_recursion_cap">>, undefined, Opts). + hb_opts:get(<<"copycat-depth-recursion-cap">>, undefined, Opts). %% @doc Return the effective per-TX memory cap, clamped to the global budget. %% Lazily initializes the budget pool on first call. effective_memory_cap(Opts) -> Budget = hb_opts:get( - <<"copycat_memory_budget">>, + <<"copycat-memory-budget">>, ?DEFAULT_COPYCAT_MEMORY_BUDGET, Opts ), hb_copycat_budget:ensure_started(Budget), hb_copycat_budget:get_budget(). -%% @doc Return the store path for a block completion marker. -block_indexed_path(Height) -> - <<"block/", (hb_util:bin(Height))/binary, "/depth">>. - -%% @doc Return the store path for a per-block item index at a given depth. -block_items_path(Height, Depth) -> - <<"block/", (hb_util:bin(Height))/binary, - "/items/", (hb_util:bin(Depth))/binary>>. - -%% @doc Return the store path for a parent index entry. -parent_path(ItemID) when byte_size(ItemID) =:= 32 -> - <<"parent/", ItemID/binary>>. - -%% @doc Encode a parent entry for storage. -encode_parent_entry(Height, block) when is_integer(Height) -> - <<0, Height:64/big-unsigned>>; -encode_parent_entry(ParentID, bundle) when byte_size(ParentID) =:= 32 -> - <<1, ParentID:32/binary>>. - -%% @doc Write a parent entry for an item to the index store. -write_parent(ItemID, ParentData, Type, Store, Opts) -> - Entry = encode_parent_entry(ParentData, Type), - hb_store:write(Store, #{parent_path(ItemID) => Entry}, Opts). - -%% @doc Encode a list of 32-byte raw IDs into a single binary. 
-encode_item_ids(IDs) -> - << <> || ID <- IDs >>. - -%% @doc Decode a binary of concatenated 32-byte IDs into a list. -%% Rejects binaries whose size is not a multiple of 32. -decode_item_ids(<<>>) -> []; -decode_item_ids(Bin) when byte_size(Bin) rem 32 =/= 0 -> - {error, invalid_item_ids_binary}; -decode_item_ids(Bin) -> - decode_item_ids_acc(Bin, []). - -decode_item_ids_acc(<<>>, Acc) -> lists:reverse(Acc); -decode_item_ids_acc(<>, Acc) -> - decode_item_ids_acc(Rest, [ID | Acc]). - %% @doc Shift all depth keys in an item ID map by Offset. shift_item_ids(Map, Offset) -> maps:fold( @@ -139,87 +96,6 @@ merge_item_ids(A, B) -> B ). -%% @doc Read the stored marker depth for a block, or undefined if none. -read_block_marker_depth(Height, Opts) -> - case hb_store_arweave:store_from_opts(Opts) of - no_store -> undefined; - #{ <<"index-store">> := Store } -> - case hb_store:read(Store, block_indexed_path(Height), Opts) of - {ok, Bin} -> - try binary_to_integer(Bin) - catch _:_ -> undefined - end; - {error, not_found} -> undefined - end - end. - -%% @doc Check if a block has been indexed at the given depth or deeper. -is_block_indexed(undefined, _TargetDepth, _Opts) -> - false; -is_block_indexed(Height, TargetDepth, Opts) -> - case read_block_marker_depth(Height, Opts) of - undefined -> false; - StoredDepth -> StoredDepth >= TargetDepth - end. - -%% @doc Write per-depth item ID lists for a block. -%% Writes an entry for every depth from 1 through AchievedDepth (empty if -%% no items at that level), plus any partial depths beyond AchievedDepth -%% that were collected during indexing. 
-write_block_item_ids(Height, AchievedDepth, ItemIDs, Opts) -> - Store = get_index_store(Opts), - MaxStoredDepth = case maps:keys(ItemIDs) of - [] -> AchievedDepth; - Keys -> max(AchievedDepth, lists:max(Keys)) - end, - Results = lists:map( - fun(D) -> - IDs = maps:get(D, ItemIDs, []), - Bin = encode_item_ids(IDs), - hb_store:write( - Store, - #{block_items_path(Height, D) => Bin}, - Opts - ) - end, - lists:seq(1, MaxStoredDepth) - ), - case lists:all(fun(R) -> R =:= ok end, Results) of - true -> ok; - false -> - ?event(copycat_short, - {block_item_ids_write_failed, - {height, Height}}), - {error, item_ids_write_failed} - end. - -%% @doc Write a block completion marker with the achieved depth. -mark_block_indexed(Height, Depth, Opts) -> - Store = get_index_store(Opts), - hb_store:write( - Store, - #{block_indexed_path(Height) => integer_to_binary(Depth)}, - Opts - ). - -%% @doc Read the persisted cutover height from the index store. -read_cutover_height(Opts) -> - Store = get_index_store(Opts), - case hb_store:read(Store, ?CUTOVER_KEY, Opts) of - {ok, Bin} -> hb_util:int(Bin); - {error, not_found} -> undefined - end. - -%% @doc Write the cutover height if not already set. -ensure_cutover_height(Height, Opts) -> - case read_cutover_height(Opts) of - undefined -> - Store = get_index_store(Opts), - hb_store:write(Store, #{?CUTOVER_KEY => hb_util:bin(Height)}, Opts), - ?event(copycat_short, {marker_cutover_initialized, {height, Height}}); - _ -> ok - end. - %% @doc Normalize an owner address into the native ID form used for comparisons. normalize_owner_id(Addr) -> hb_util:native_id(hb_util:bin(Addr)). @@ -325,9 +201,9 @@ parse_tag_filter(Key, Request, Opts) -> %% applies L1-level owner/tag filters on the lightweight TX header first, then, %% if the TX passes and is a bundle, loads the full L1 payload once and indexes %% descendants in-memory up to the requested safe depth (defaults to full recursion -%% till the set copycat_depth_recursion_cap). 
+%% till the set copycat-depth-recursion-cap). process_l1_request(TXID, Request, Opts) -> - Depth = request_depth(Request, <<"safe_max">>, Opts), + Depth = request_depth(Request, <<"full">>, Opts), QueryL1Offset = hb_util:bool( hb_maps:get(<<"query-l1-offset">>, Request, false, Opts) @@ -369,12 +245,12 @@ process_l1_request(TXID, Request, Opts) -> %% safe cap. Depth is relative so depth 1 is always one level below the %% root specified in the request (either a block or an L1 TX ID). %% -%% `safe_max` resolves to the current copycat depth recursion cap. +%% `full` resolves to the current copycat depth recursion cap. request_depth(Request, Default, Opts) -> MaxRecursionCap = get_depth_recursion_cap(Opts), RequestedDepth = case hb_maps:get(<<"depth">>, Request, Default, Opts) of - <<"safe_max">> -> MaxRecursionCap; + <<"full">> -> MaxRecursionCap; Value -> hb_util:int(Value) end, erlang:min( @@ -508,14 +384,6 @@ latest_height(Opts) -> {error, Reason} -> {error, Reason} end. -%% @doc Check if a transaction ID is indexed in the arweave index store. -is_tx_indexed(TXID, Opts) -> - Store = get_index_store(Opts), - case hb_store:read(Store, hb_store_arweave_offset:path(TXID), Opts) of - {ok, _} -> true; - {error, not_found} -> false - end. - %% @doc List indexed blocks and transactions in the given range. %% Returns JSON with block heights as keys, each containing indexed and not-indexed lists. list_index(From, undefined, Opts) -> @@ -560,7 +428,7 @@ list_index_blocks(Current, To, Opts, Acc) -> _ -> BlockInfo#{ <<"items">> => - read_block_item_counts( + hb_store_arweave:read_block_item_counts( Current, Opts)} end, NewAcc = Acc#{BlockKey => WithItems}, @@ -579,46 +447,11 @@ assemble_block_info(Height, Block, Opts) -> <<"indexed">> => IndexedTXs, <<"not-indexed">> => NotIndexedTXs }, - case read_block_marker_depth(Height, Opts) of + case hb_store_arweave:read_block_marker_depth(Height, Opts) of undefined -> Base; Depth -> Base#{<<"depth">> => Depth} end. 
-%% @doc Probe item entries upward from depth 1, applying TransformFun to each. -probe_block_items(Height, Opts, TransformFun) -> - case hb_store_arweave:store_from_opts(Opts) of - no_store -> #{}; - #{ <<"index-store">> := Store } -> - probe_block_items(Height, Store, 1, #{}, TransformFun, Opts) - end. - -probe_block_items(Height, Store, Depth, Acc, TransformFun, Opts) -> - case hb_store:read(Store, block_items_path(Height, Depth), Opts) of - {ok, Bin} -> - Key = hb_util:bin(Depth), - probe_block_items( - Height, Store, Depth + 1, - Acc#{Key => TransformFun(Bin)}, TransformFun, Opts); - {error, not_found} -> - Acc - end. - -count_ids(Bin) when byte_size(Bin) rem 32 =:= 0 -> - byte_size(Bin) div 32; -count_ids(_) -> <<"corrupt">>. - -decode_and_encode_ids(Bin) -> - case decode_item_ids(Bin) of - {error, _} -> <<"corrupt">>; - List -> [hb_util:encode(ID) || ID <- List] - end. - -read_block_item_counts(Height, Opts) -> - probe_block_items(Height, Opts, fun count_ids/1). - -read_block_item_ids(Height, Opts) -> - probe_block_items(Height, Opts, fun decode_and_encode_ids/1). - %% @doc mode=inventory: return per-depth item ID lists from the local index store. %% Supports range queries. The inventory read itself is local-only (no network). %% Note: range parsing may call latest_height/1 if from/to are omitted or negative. 
@@ -639,11 +472,11 @@ inventory_index(From, To, Opts) -> inventory_local(Current, To, _Opts, Acc) when Current < To -> Acc; inventory_local(Current, To, Opts, Acc) -> - case read_block_marker_depth(Current, Opts) of + case hb_store_arweave:read_block_marker_depth(Current, Opts) of undefined -> inventory_local(Current - 1, To, Opts, Acc); Depth -> - ItemIDs = read_block_item_ids(Current, Opts), + ItemIDs = hb_store_arweave:read_block_item_ids(Current, Opts), BlockKey = hb_util:bin(Current), BlockInfo = #{<<"depth">> => Depth, <<"items">> => ItemIDs}, inventory_local(Current - 1, To, Opts, @@ -697,12 +530,12 @@ index_tx_headers(From, To, Opts) -> %% confirmed item IDs across every depth. collect_header_candidates(Current, To, _Opts, Acc) when Current < To -> Acc; collect_header_candidates(Current, To, Opts, Acc) -> - case read_block_marker_depth(Current, Opts) of + case hb_store_arweave:read_block_marker_depth(Current, Opts) of undefined -> collect_header_candidates(Current - 1, To, Opts, Acc); _Depth -> Candidates = - lists:append(maps:values(read_block_item_ids(Current, Opts))), + lists:append(maps:values(hb_store_arweave:read_block_item_ids(Current, Opts))), collect_header_candidates(Current - 1, To, Opts, Candidates ++ Acc) end. @@ -738,7 +571,7 @@ fetch_block_header(Height, Opts) -> classify_txs(TXIDs, Opts) -> lists:foldl( fun(TXID, {IndexedAcc, NotIndexedAcc}) -> - case is_tx_indexed(TXID, Opts) of + case hb_store_arweave:is_tx_indexed(TXID, Opts) of true -> {[TXID | IndexedAcc], NotIndexedAcc}; false -> {IndexedAcc, [TXID | NotIndexedAcc]} end @@ -794,7 +627,7 @@ do_fetch_blocks(Current, To, Depth, Opts) -> fetch_blocks_ranged(Current, To, Depth, block_workers(Opts), Opts). block_workers(Opts) -> - max(1, hb_opts:get(<<"arweave_block_workers">>, 3, Opts)). + max(1, hb_opts:get(<<"arweave-block-workers">>, 3, Opts)). %% @doc Process a known range of blocks in parallel batches. 
fetch_blocks_ranged(Current, To, TargetDepth, _Workers, _Opts) @@ -812,7 +645,7 @@ fetch_blocks_ranged(Current, To, TargetDepth, Workers, Opts) -> hb_pmap:parallel_map( Heights, fun(H) -> - case is_block_indexed(H, TargetDepth, Opts) of + case hb_store_arweave:is_block_indexed(H, TargetDepth, Opts) of true -> ok; false -> observe_event(<<"block_indexed">>, fun() -> @@ -889,17 +722,17 @@ process_prefetched_blocks(Blocks, TargetDepth, Workers, Opts) -> %% the cutover, falls back to legacy per-TX check. is_already_indexed({ok, Block}, TargetDepth, Opts) -> Height = hb_maps:get(<<"height">>, Block, undefined, Opts), - case is_block_indexed(Height, TargetDepth, Opts) of + case hb_store_arweave:is_block_indexed(Height, TargetDepth, Opts) of true -> true; false -> - case is_post_cutover(Height, Opts) of + case hb_store_arweave:is_post_cutover(Height, Opts) of true -> false; false -> TXIDs = hb_maps:get(<<"txs">>, Block, [], Opts), lists:any( - fun(TXID) -> is_tx_indexed(TXID, Opts) end, + fun(TXID) -> hb_store_arweave:is_tx_indexed(TXID, Opts) end, TXIDs ) end @@ -907,13 +740,6 @@ is_already_indexed({ok, Block}, TargetDepth, Opts) -> is_already_indexed({error, _}, _TargetDepth, _Opts) -> false. -is_post_cutover(undefined, _Opts) -> false; -is_post_cutover(Height, Opts) -> - case read_cutover_height(Opts) of - undefined -> false; - Cutover -> Height >= Cutover - end. - fetch_and_process_block(Current, To, TargetDepth, Opts) -> BlockRes = fetch_block_header(Current, Opts), process_block(BlockRes, Current, To, TargetDepth, Opts). 
@@ -945,11 +771,11 @@ process_block(BlockRes, Current, To, TargetDepth, Opts) -> max(2, TargetDepth)), ItemIDs = maps:get(item_ids, Results, #{}), maybe - ok ?= write_block_item_ids( + ok ?= hb_store_arweave:write_block_item_ids( Current, AchievedDepth, ItemIDs, Opts), - ok ?= mark_block_indexed( + ok ?= hb_store_arweave:mark_block_indexed( Current, AchievedDepth, Opts), - ensure_cutover_height(Current, Opts), + hb_store_arweave:ensure_cutover_height(Current, Opts), ?event( copycat_short, {arweave_block_indexed, @@ -1066,11 +892,12 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, TXID, <<"tx@1.0">>, TXStartOffset, - TX#tx.data_size + TX#tx.data_size, + Opts ) end), #{ <<"index-store">> := IndexStore } = ArweaveStore, - ok = write_parent(TX#tx.id, BlockHeight, block, IndexStore, Opts), + ok = hb_store_arweave:write_parent(TX#tx.id, BlockHeight, block, IndexStore, Opts), try is_bundle_tx(TX, Opts) of false -> #{items_count => 0, bundle_count => 0, skipped_count => 0, @@ -1118,9 +945,10 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, hb_util:encode(ItemID), <<"ans104@1.0">>, ItemStartOffset, - Size + Size, + Opts ), - ok = write_parent(ItemID, TX#tx.id, bundle, IndexStore, Opts), + ok = hb_store_arweave:write_parent(ItemID, TX#tx.id, bundle, IndexStore, Opts), {ItemStartOffset + Size, ItemsCountAcc + 1} end, {TXStartOffset + HeaderSize, 0}, @@ -1455,8 +1283,6 @@ ensure_l1_tx_offset(TXID, EncodedTXID, IndexStore, QueryL1Offset, Opts) -> end. query_l1_tx_offset(TXID, IndexStore, Opts) -> - % TODO: move this into dev_arweave - I think? Unless it's possible to - % query this already via one of the existing ~arweave@2.9 paths? 
case observe_copycat_l1_stage( <<"l1_offset_query_http">>, fun() -> @@ -1482,7 +1308,8 @@ query_l1_tx_offset(TXID, IndexStore, Opts) -> TXID, <<"tx@1.0">>, StartOffset, - Size + Size, + Opts ) end ), @@ -1551,9 +1378,10 @@ index_full_bundle_items( EncodedItemID, <<"ans104@1.0">>, ItemStartOffset, - Size + Size, + Opts ), - ok = write_parent(ItemID, ParentID, bundle, IndexStore, Opts), + ok = hb_store_arweave:write_parent(ItemID, ParentID, bundle, IndexStore, Opts), {DescendantCount, ItemAchievedDepth, ChildIDs} = case {Depth > 1, ParseResult} of {true, {ok, HeaderSize, ParsedItem}} -> @@ -1730,12 +1558,6 @@ observe_copycat_l1_stage(MetricName, Fun) -> record_copycat_l1_metrics(MetricName, 1, Time), Result. -get_index_store(Opts) -> - case hb_store_arweave:store_from_opts(Opts) of - #{ <<"index-store">> := Store } -> Store; - _ -> throw(no_index_store_available) - end. - %% @doc Scan the mempool and index any accessible unconfirmed TXs. index_mempool(Request, Opts) -> SenderFilter = mempool_sender_filter(Request, Opts), @@ -1852,7 +1674,7 @@ index_mempool_tx(TXID, SenderFilter, Opts) -> Result. index_mempool_tx_unfiltered(TXID, Opts) -> - case is_tx_indexed(TXID, Opts) of + case hb_store_arweave:is_tx_indexed(TXID, Opts) of true -> existing; false -> case load_mempool_tx_header(TXID, Opts) of @@ -1867,7 +1689,7 @@ index_mempool_tx_filtered(TXID, SenderFilter, Opts) -> case mempool_tx_sender_matches(TX, SenderFilter) of false -> filtered; true -> - case is_tx_indexed(TXID, Opts) of + case hb_store_arweave:is_tx_indexed(TXID, Opts) of true -> existing; false -> write_mempool_offsets(TXID, TX, Opts) end @@ -2155,8 +1977,8 @@ mempool_sender_filter_indexes_matching_tx_test_parallel() -> Opts ) ), - ?assert(is_tx_indexed(MatchTXID, Opts)), - ?assertNot(is_tx_indexed(OtherTXID, Opts)). + ?assert(hb_store_arweave:is_tx_indexed(MatchTXID, Opts)), + ?assertNot(hb_store_arweave:is_tx_indexed(OtherTXID, Opts)). 
mempool_test_pending_tx(Sender) -> #tx{ @@ -2174,7 +1996,7 @@ index_ids_test_parallel() -> {_TestStore, StoreOpts, Opts} = setup_index_opts(), {ok, 1827942} = hb_ao:resolve( - <<"~copycat@1.0/arweave&from=1827942&to=1827942">>, + <<"~copycat@1.0/arweave&from=1827942&to=1827942&depth=2">>, Opts ), ?assertMatch( @@ -2567,9 +2389,9 @@ auto_stop_on_indexed_block_test_parallel() -> ?assert(has_any_indexed_tx(Higher1, Opts)), ?assert(has_any_indexed_tx(IndexedBlock, Opts)), ?assertNot(has_any_indexed_tx(IndexedBlock-1, Opts)), - ?assert(is_block_indexed(IndexedBlock, 2, Opts)), - ?assert(is_block_indexed(Higher1, 2, Opts)), - ?assert(is_block_indexed(Higher2, 2, Opts)), + ?assert(hb_store_arweave:is_block_indexed(IndexedBlock, 2, Opts)), + ?assert(hb_store_arweave:is_block_indexed(Higher1, 2, Opts)), + ?assert(hb_store_arweave:is_block_indexed(Higher2, 2, Opts)), ok. explicit_to_reindexes_all_test_parallel() -> @@ -2633,7 +2455,7 @@ auto_stop_partial_index_test_parallel() -> TXIDs = hb_maps:get(<<"txs">>, BlockData, [], Opts), ?assert(length(TXIDs) > 0), [OneTXID | _] = TXIDs, - ok = hb_store_arweave:write_offset(StoreOpts, OneTXID, <<"tx@1.0">>, 0, 0), + ok = hb_store_arweave:write_offset(StoreOpts, OneTXID, <<"tx@1.0">>, 0, 0, Opts), {ok, Block} = hb_ao:resolve( << @@ -2646,8 +2468,8 @@ auto_stop_partial_index_test_parallel() -> ?assert(has_any_indexed_tx(HigherBlock, Opts)), ?assert(has_any_indexed_tx(Block, Opts)), ?assertNot(has_any_indexed_tx(Block-1, Opts)), - ?assert(is_block_indexed(HigherBlock, 2, Opts)), - ?assertNot(is_block_indexed(Block, 2, Opts)), + ?assert(hb_store_arweave:is_block_indexed(HigherBlock, 2, Opts)), + ?assertNot(hb_store_arweave:is_block_indexed(Block, 2, Opts)), ok. 
negative_parse_range_test_parallel() -> @@ -2884,23 +2706,23 @@ l1_filter_reason_test() -> request_depth_clamping_test() -> {_TestStore, _StoreOpts, Opts0} = setup_index_opts(), - ?assertEqual(6, request_depth(#{}, <<"safe_max">>, Opts0)), + ?assertEqual(6, request_depth(#{}, <<"full">>, Opts0)), ?assertEqual( 2, - request_depth(#{<<"depth">> => <<"2">>}, <<"safe_max">>, Opts0) + request_depth(#{<<"depth">> => <<"2">>}, <<"full">>, Opts0) ), ?assertEqual( 1, - request_depth(#{<<"depth">> => <<"0">>}, <<"safe_max">>, Opts0) + request_depth(#{<<"depth">> => <<"0">>}, <<"full">>, Opts0) ), ?assertEqual( 6, - request_depth(#{<<"depth">> => <<"999">>}, <<"safe_max">>, Opts0) + request_depth(#{<<"depth">> => <<"999">>}, <<"full">>, Opts0) ), Opts1 = set_depth_recursion_cap(2, Opts0), - ?assertEqual(2, request_depth(#{}, <<"safe_max">>, Opts1)), + ?assertEqual(2, request_depth(#{}, <<"full">>, Opts1)), % no recursion cap set, use default from hb_opts - ?assertEqual(6, request_depth(#{}, <<"safe_max">>, #{})), + ?assertEqual(6, request_depth(#{}, <<"full">>, #{})), ok. id_depth_1_test() -> @@ -3186,7 +3008,7 @@ has_any_indexed_tx(Height, Opts) -> case fetch_block_header(Height, Opts) of {ok, Block} -> TXIDs = hb_maps:get(<<"txs">>, Block, [], Opts), - lists:any(fun(TXID) -> is_tx_indexed(TXID, Opts) end, TXIDs); + lists:any(fun(TXID) -> hb_store_arweave:is_tx_indexed(TXID, Opts) end, TXIDs); {error, _} -> false end. @@ -3216,18 +3038,18 @@ assert_indexed_range(From, To, Opts) -> ?assert(has_any_indexed_tx(From, Opts)), assert_indexed_range(From - 1, To, Opts). 
-block_marker_depth_2_test() -> +block_marker_default_depth_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), Block = 1827942, {ok, Block} = hb_ao:resolve( <<"~copycat@1.0/arweave&from=", (hb_util:bin(Block))/binary, "&to=", - (hb_util:bin(Block))/binary>>, + (hb_util:bin(Block))/binary, "&depth=2">>, Opts ), - ?assert(is_block_indexed(Block, 2, Opts)), - ?assertNot(is_block_indexed(Block, 3, Opts)), + ?assert(hb_store_arweave:is_block_indexed(Block, 2, Opts)), + ?assertNot(hb_store_arweave:is_block_indexed(Block, 3, Opts)), ok. depth_1_normalizes_to_2_test() -> @@ -3251,10 +3073,10 @@ depth_1_normalizes_to_2_test() -> Result = process_block_txs(Tuples, 0, 1, 88888888, Opts), ?assertEqual(2, maps:get(achieved_depth, Result)), Height = 88888888, - mark_block_indexed(Height, maps:get(achieved_depth, Result), Opts), - ?assert(is_block_indexed(Height, 1, Opts)), - ?assert(is_block_indexed(Height, 2, Opts)), - ?assertNot(is_block_indexed(Height, 3, Opts)), + hb_store_arweave:mark_block_indexed(Height, maps:get(achieved_depth, Result), Opts), + ?assert(hb_store_arweave:is_block_indexed(Height, 1, Opts)), + ?assert(hb_store_arweave:is_block_indexed(Height, 2, Opts)), + ?assertNot(hb_store_arweave:is_block_indexed(Height, 3, Opts)), ok. block_marker_cutover_test() -> @@ -3268,12 +3090,27 @@ block_marker_cutover_test() -> (hb_util:bin(UpperBlock))/binary>>, Opts ), - Cutover = read_cutover_height(Opts), + Cutover = hb_store_arweave:read_cutover_height(Opts), ?assertNotEqual(undefined, Cutover), - ?assert(is_block_indexed(UpperBlock, 2, Opts)), - ?assertNot(is_block_indexed(LowerBlock, 2, Opts)), + ?assert(hb_store_arweave:is_block_indexed(UpperBlock, 2, Opts)), + ?assertNot(hb_store_arweave:is_block_indexed(LowerBlock, 2, Opts)), ok. 
+achieved_depth_block_depth_2_test() -> + {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + Block = 1827942, + {ok, Block} = + hb_ao:resolve( + <<"~copycat@1.0/arweave&from=", + (hb_util:bin(Block))/binary, "&to=", + (hb_util:bin(Block))/binary, "&depth=2">>, + Opts + ), + ?assert(hb_store_arweave:is_block_indexed(Block, 2, Opts)), + ?assertNot(hb_store_arweave:is_block_indexed(Block, 3, Opts)), + ok. + + achieved_depth_block_depth_3_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), Block = 1827942, @@ -3284,7 +3121,7 @@ achieved_depth_block_depth_3_test() -> (hb_util:bin(Block))/binary, "&depth=3">>, Opts ), - ?assert(is_block_indexed(Block, 3, Opts)), + ?assert(hb_store_arweave:is_block_indexed(Block, 3, Opts)), ok. invalid_bundle_bytes_test() -> @@ -3306,14 +3143,14 @@ small_block_depth_3_test() -> (hb_util:bin(Block))/binary, "&depth=3">>, Opts ), - ?assert(is_block_indexed(Block, 3, Opts)), + ?assert(hb_store_arweave:is_block_indexed(Block, 3, Opts)), #{ <<"index-store">> := Store } = hb_store_arweave:store_from_opts(Opts), - {ok, L1Bin} = hb_store:read(Store, block_items_path(Block, 1), Opts), - ?assert(length(decode_item_ids(L1Bin)) > 0), - {ok, L2Bin} = hb_store:read(Store, block_items_path(Block, 2), Opts), - ?assert(length(decode_item_ids(L2Bin)) > 0), - {ok, L3Bin} = hb_store:read(Store, block_items_path(Block, 3), Opts), - L3IDs = decode_item_ids(L3Bin), + {ok, L1Bin} = hb_store:read(Store, hb_store_arweave:block_items_path(Block, 1), Opts), + ?assert(length(hb_store_arweave:decode_item_ids(L1Bin)) > 0), + {ok, L2Bin} = hb_store:read(Store, hb_store_arweave:block_items_path(Block, 2), Opts), + ?assert(length(hb_store_arweave:decode_item_ids(L2Bin)) > 0), + {ok, L3Bin} = hb_store:read(Store, hb_store_arweave:block_items_path(Block, 3), Opts), + L3IDs = hb_store_arweave:decode_item_ids(L3Bin), ?assertEqual(3, length(L3IDs)), assert_item_read( <<"npAzk_BomjWBQQr_xnmlhdxjyl97EJnNv_MAaXffs1s">>, @@ -3362,7 +3199,7 @@ 
exact_marker_depth_test() -> #{ <<"index-store">> := Store } = hb_store_arweave:store_from_opts(Opts), {ok, StoredBin} = - hb_store:read(Store, block_indexed_path(Block), Opts), + hb_store:read(Store, hb_store_arweave:block_indexed_path(Block), Opts), StoredDepth = binary_to_integer(StoredBin), ?assertEqual(3, StoredDepth), ok. @@ -3403,15 +3240,15 @@ block_item_ids_depth_2_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), {ok, 1827942} = hb_ao:resolve( - <<"~copycat@1.0/arweave&from=1827942&to=1827942">>, + <<"~copycat@1.0/arweave&from=1827942&to=1827942&depth=2">>, Opts ), #{ <<"index-store">> := Store } = hb_store_arweave:store_from_opts(Opts), - {ok, L1Bin} = hb_store:read(Store, block_items_path(1827942, 1), Opts), - L1IDs = decode_item_ids(L1Bin), + {ok, L1Bin} = hb_store:read(Store, hb_store_arweave:block_items_path(1827942, 1), Opts), + L1IDs = hb_store_arweave:decode_item_ids(L1Bin), ?assert(length(L1IDs) > 0), - {ok, L2Bin} = hb_store:read(Store, block_items_path(1827942, 2), Opts), - L2IDs = decode_item_ids(L2Bin), + {ok, L2Bin} = hb_store:read(Store, hb_store_arweave:block_items_path(1827942, 2), Opts), + L2IDs = hb_store_arweave:decode_item_ids(L2Bin), ?assert(length(L2IDs) > 0), L2Encoded = [hb_util:encode(ID) || ID <- L2IDs], Pos54K = index_of(<<"54K1ehEIKZxGSusgZzgbGYaHfllwWQ09-S9-eRUJg5Y">>, L2Encoded), @@ -3419,7 +3256,7 @@ block_item_ids_depth_2_test() -> ?assert(is_integer(Pos54K)), ?assert(is_integer(PosOBK)), ?assert(Pos54K < PosOBK), - ?assertEqual({error, not_found}, hb_store:read(Store, block_items_path(1827942, 3), Opts)), + ?assertEqual({error, not_found}, hb_store:read(Store, hb_store_arweave:block_items_path(1827942, 3), Opts)), ok. 
block_item_ids_depth_3_test() -> @@ -3430,16 +3267,16 @@ block_item_ids_depth_3_test() -> Opts ), #{ <<"index-store">> := Store } = hb_store_arweave:store_from_opts(Opts), - {ok, L1Bin} = hb_store:read(Store, block_items_path(1827942, 1), Opts), - L1Count = length(decode_item_ids(L1Bin)), + {ok, L1Bin} = hb_store:read(Store, hb_store_arweave:block_items_path(1827942, 1), Opts), + L1Count = length(hb_store_arweave:decode_item_ids(L1Bin)), ?assertEqual(5, L1Count), - {ok, L2Bin} = hb_store:read(Store, block_items_path(1827942, 2), Opts), - L2Count = length(decode_item_ids(L2Bin)), + {ok, L2Bin} = hb_store:read(Store, hb_store_arweave:block_items_path(1827942, 2), Opts), + L2Count = length(hb_store_arweave:decode_item_ids(L2Bin)), ?assert(L2Count > 0), - {ok, L3Bin} = hb_store:read(Store, block_items_path(1827942, 3), Opts), - L3Count = length(decode_item_ids(L3Bin)), + {ok, L3Bin} = hb_store:read(Store, hb_store_arweave:block_items_path(1827942, 3), Opts), + L3Count = length(hb_store_arweave:decode_item_ids(L3Bin)), ?assert(L3Count >= 1), - L3IDs = decode_item_ids(L3Bin), + L3IDs = hb_store_arweave:decode_item_ids(L3Bin), L3Encoded = [hb_util:encode(ID) || ID <- L3IDs], ?assert(lists:member( <<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, L3Encoded)), @@ -3495,12 +3332,12 @@ inventory_single_block_test() -> inventory_range_test() -> {_TestStore, StoreOpts, Opts} = setup_index_opts(), #{ <<"index-store">> := Store } = StoreOpts, - ok = hb_store:write(Store, #{block_indexed_path(77777777) => <<"2">>}, Opts), - ok = hb_store:write(Store, #{block_items_path(77777777, 1) => <<0:256>>}, Opts), - ok = hb_store:write(Store, #{block_items_path(77777777, 2) => <<>>}, Opts), - ok = hb_store:write(Store, #{block_indexed_path(77777778) => <<"2">>}, Opts), - ok = hb_store:write(Store, #{block_items_path(77777778, 1) => <<1:256>>}, Opts), - ok = hb_store:write(Store, #{block_items_path(77777778, 2) => <<>>}, Opts), + ok = hb_store:write(Store, 
#{hb_store_arweave:block_indexed_path(77777777) => <<"2">>}, Opts), + ok = hb_store:write(Store, #{hb_store_arweave:block_items_path(77777777, 1) => <<0:256>>}, Opts), + ok = hb_store:write(Store, #{hb_store_arweave:block_items_path(77777777, 2) => <<>>}, Opts), + ok = hb_store:write(Store, #{hb_store_arweave:block_indexed_path(77777778) => <<"2">>}, Opts), + ok = hb_store:write(Store, #{hb_store_arweave:block_items_path(77777778, 1) => <<1:256>>}, Opts), + ok = hb_store:write(Store, #{hb_store_arweave:block_items_path(77777778, 2) => <<>>}, Opts), {ok, InvResult} = inventory_index(77777778, 77777777, Opts), Body = hb_json:decode(hb_maps:get(<<"body">>, InvResult)), ?assert(maps:is_key(<<"77777777">>, Body)), @@ -3509,51 +3346,6 @@ inventory_range_test() -> ?assertEqual(2, maps:get(<<"depth">>, maps:get(<<"77777778">>, Body))), ok. -decode_item_ids_validation_test() -> - ?assertEqual([], decode_item_ids(<<>>)), - GoodBin = <<0:256, 1:256>>, - ?assertEqual(2, length(decode_item_ids(GoodBin))), - BadBin = <<0:240>>, - ?assertEqual({error, invalid_item_ids_binary}, decode_item_ids(BadBin)), - ok. - -corrupt_item_ids_read_test() -> - {_TestStore, _StoreOpts, Opts} = setup_index_opts(), - #{ <<"index-store">> := Store } = hb_store_arweave:store_from_opts(Opts), - Height = 99999999, - ok = hb_store:write(Store, #{block_indexed_path(Height) => <<"2">>}, Opts), - ok = hb_store:write(Store, #{block_items_path(Height, 1) => <<0:256>>}, Opts), - ok = hb_store:write(Store, #{block_items_path(Height, 2) => <<0:240>>}, Opts), - Counts = read_block_item_counts(Height, Opts), - ?assertEqual(1, maps:get(<<"1">>, Counts)), - ?assertEqual(<<"corrupt">>, maps:get(<<"2">>, Counts)), - IDs = read_block_item_ids(Height, Opts), - ?assertEqual(1, length(maps:get(<<"1">>, IDs))), - ?assertEqual(<<"corrupt">>, maps:get(<<"2">>, IDs)), - ok. 
- -parent_encode_decode_test() -> - BlockEntry = encode_parent_entry(12345, block), - ?assertEqual(<<0, 12345:64/big-unsigned>>, BlockEntry), - BundleID = crypto:strong_rand_bytes(32), - BundleEntry = encode_parent_entry(BundleID, bundle), - ?assertEqual(<<1, BundleID:32/binary>>, BundleEntry), - Combined = <>, - Decoded = hb_store_arweave:decode_parent_entries(Combined), - ?assertEqual([{12345, block}, {BundleID, bundle}], Decoded), - ok. - -parent_not_found_test() -> - {_TestStore, _StoreOpts, Opts} = setup_index_opts(), - StoreOpts2 = hb_store_arweave:store_from_opts(Opts), - UnknownID = crypto:strong_rand_bytes(32), - ?assertEqual( - not_found, - hb_store_arweave:read_parent(StoreOpts2, UnknownID, Opts), - Opts - ), - ok. - parent_depth_2_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), Block = 1827942, From 3a27a2fc0fbf295fa9bbded8a3063d7dd13044a0 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 8 May 2026 19:18:37 +0100 Subject: [PATCH 53/68] impr: Fix bug on index scope --- src/core/include/hb_store_arweave.hrl | 3 +++ src/core/resolver/hb_opts.erl | 3 ++- src/core/store/hb_store_arweave.erl | 3 +-- 3 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 src/core/include/hb_store_arweave.hrl diff --git a/src/core/include/hb_store_arweave.hrl b/src/core/include/hb_store_arweave.hrl new file mode 100644 index 000000000..6bd62ae9b --- /dev/null +++ b/src/core/include/hb_store_arweave.hrl @@ -0,0 +1,3 @@ +-define(SCOPE_PARENT, <<"parent">>). +-define(SCOPE_OFFSET, <<"offset">>). + diff --git a/src/core/resolver/hb_opts.erl b/src/core/resolver/hb_opts.erl index 286355714..3504aa9ef 100644 --- a/src/core/resolver/hb_opts.erl +++ b/src/core/resolver/hb_opts.erl @@ -20,6 +20,7 @@ -include("include/hb.hrl"). -include("include/hb_opts.hrl"). -include("include/hb_arweave_nodes.hrl"). +-include("include/hb_store_arweave.hrl"). -include("../../_build/hb_preloaded_index.hrl"). -ifndef(PRELOADED_DEVICES_INDEX_MESSAGE_ID). 
@@ -288,7 +289,7 @@ raw_default_message() -> <<"copycat-memory-budget">> => 6 * 1024 * 1024 * 1024, <<"copycat-depth-recursion-cap">> => 6, % 2x the deepest we've seen to date <<"arweave-block-workers">> => 3, - <<"copycat-scope">> => ["offset", "parent"], + <<"copycat-scope">> => [?SCOPE_OFFSET, ?SCOPE_PARENT], % Dev options <<"mode">> => debug, <<"profiling">> => true, diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 17680e858..626652a05 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -14,11 +14,10 @@ -export([decode_item_ids/1, is_block_indexed/3, is_post_cutover/2, mark_block_indexed/3 ]). -export([root_offset/2]). -include("include/hb.hrl"). +-include("include/hb_store_arweave.hrl"). -include_lib("eunit/include/eunit.hrl"). -define(PARTITION_SIZE, 3_600_000_000_000). --define(SCOPE_PARENT, <<"parent">>). --define(SCOPE_OFFSET, <<"offset">>). -define(CUTOVER_KEY, <<"block/marker-cutover-height">>). %% @doc Find the first Arweave store from the given node message. 
Searches first From 206e1e274cdf4deced4ad8bfd12eec4b57efae6c Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 8 May 2026 00:58:36 +0100 Subject: [PATCH 54/68] impr: Use hb_opts instead of hb_maps --- src/core/store/hb_store_arweave.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 626652a05..33eda8585 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -336,7 +336,7 @@ write_parent(ItemID, ParentData, Type, Store, Opts) -> case lists:member( ?SCOPE_PARENT, - hb_maps:get(<<"copycat-scope">>, Opts, [?SCOPE_PARENT]) + hb_opts:get(<<"copycat-scope">>, [], Opts) ) of true -> Entry = encode_parent_entry(ParentData, Type), @@ -357,7 +357,7 @@ write_offset( case lists:member( ?SCOPE_OFFSET, - hb_maps:get(<<"copycat-scope">>, Opts, [?SCOPE_OFFSET]) + hb_opts:get(<<"copycat-scope">>, [], Opts) ) of true -> Value = hb_store_arweave_offset:encode(CodecName, StartOffset, Length), From b570152a6b343f2abfcc550ebffc161564f6e3c0 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 8 May 2026 19:31:11 +0100 Subject: [PATCH 55/68] impr: Make test work after rebase --- src/core/store/hb_store_arweave.erl | 2 +- src/preloaded/query/dev_copycat_arweave.erl | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 33eda8585..2c5b9c209 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -667,7 +667,7 @@ load_item_deserialize_throws_test() -> %% the bytes at ProbeOffset are mid-TX application data, not an ANS-104 header. 
ProbeOffset = 376836336327208, Size = 4096, - ok = write_offset(Opts, FakeID, <<"ans104@1.0">>, ProbeOffset - 1, Size, Opts), + ok = write_offset(ArweaveStoreOpts, FakeID, <<"ans104@1.0">>, ProbeOffset - 1, Size, Opts), ?assertMatch({error, _}, read(ArweaveStoreOpts, #{ <<"read">> => FakeID }, Opts)). root_offset_confirmed_parent_test() -> diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index a5149f8d8..b18f2086a 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -1786,7 +1786,7 @@ write_mempool_offsets(TXID, TX, Opts) -> {loaded_bytes, byte_size(Data)}} ), ok = hb_store_arweave:write_offset( - Store, TXID, <<"tx@1.0">>, relative, TX#tx.data_size), + Store, TXID, <<"tx@1.0">>, relative, TX#tx.data_size, Opts), write_mempool_children(Store, TXID, TX, Data, Opts); _Error -> #{ status => missing_data } @@ -1797,7 +1797,7 @@ write_mempool_children(Store, TXID, TX, Data, Opts) -> true -> case load_mempool_bundle_index(TXID, Data, Opts) of {ok, HeaderSize, BundleIndex} -> - write_mempool_items(Store, TXID, BundleIndex, HeaderSize), + write_mempool_items(Store, TXID, BundleIndex, HeaderSize, Opts), #{ status => indexed, tx_offsets_written => 1, @@ -1816,7 +1816,7 @@ write_mempool_children(Store, TXID, TX, Data, Opts) -> Ref = #{ <<"relative">> => TXID, <<"offset">> => 0 }, hb_store_arweave:write_offset( Store, ItemID, <<"ans104@1.0">>, - Ref, TX#tx.data_size), + Ref, TX#tx.data_size, Opts), #{ status => indexed, tx_offsets_written => 1, @@ -1830,12 +1830,12 @@ write_mempool_children(Store, TXID, TX, Data, Opts) -> end end. 
-write_mempool_items(_Store, _TXID, [], _Offset) -> ok; -write_mempool_items(Store, TXID, [{ItemID, Size} | Rest], Offset) -> +write_mempool_items(_Store, _TXID, [], _Offset, _Opts) -> ok; +write_mempool_items(Store, TXID, [{ItemID, Size} | Rest], Offset, Opts) -> Ref = #{ <<"relative">> => TXID, <<"offset">> => Offset }, hb_store_arweave:write_offset( - Store, hb_util:encode(ItemID), <<"ans104@1.0">>, Ref, Size), - write_mempool_items(Store, TXID, Rest, Offset + Size). + Store, hb_util:encode(ItemID), <<"ans104@1.0">>, Ref, Size, Opts), + write_mempool_items(Store, TXID, Rest, Offset + Size, Opts). load_mempool_data(_TXID, #tx{ data_size = 0 }, _Opts) -> {ok, <<>>}; From 109a194b86af920870a59b61b06d39d6e49b6d7c Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Tue, 12 May 2026 15:36:21 -0400 Subject: [PATCH 56/68] feat: optimize the header indexing size on disk --- src/preloaded/query/dev_copycat_arweave.erl | 114 +++++++------------- 1 file changed, 40 insertions(+), 74 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index b18f2086a..aa44b1688 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -40,12 +40,11 @@ arweave(_Base, Request, Opts) -> end; <<"list">> -> with_range(Request, Opts, fun list_index/3); <<"inventory">> -> with_range(Request, Opts, fun inventory_index/3); - <<"headers">> -> with_range(Request, Opts, fun index_tx_headers/3); Mode -> { - error, - <<"Unsupported mode `", (hb_util:bin(Mode))/binary,"`. Supported", - "modes are: write, list, inventory, headers, mempool">> + error, + <<"Unsupported mode `", (hb_util:bin(Mode))/binary,"`. Supported", + "modes are: write, list, inventory, mempool">> } end. @@ -482,76 +481,36 @@ inventory_local(Current, To, Opts, Acc) -> inventory_local(Current - 1, To, Opts, Acc#{BlockKey => BlockInfo}) end. 
- -%% @doc mode=headers: walk every confirmed item recorded under the indexed -%% blocks in [From..To], read the full message via the normal cache path, -%% and write it into the local store via `hb_cache:write'. Top-level tag -%% fields and commitment fields land path-keyed under `/...' so any -%% future filter (tags, owners, recipients, target, anchor, ...) is naturally -%% answerable by `hb_cache:match'. -%% -%% Items already laid out under local-store are skipped, so reruns are -%% idempotent and serve as the natural retry mechanism for previously failed -%% ids. -index_tx_headers(From, undefined, Opts) -> - index_tx_headers(From, 0, Opts); -index_tx_headers(From, To, _Opts) when From < To -> - {ok, {From, To}}; -index_tx_headers(From, To, Opts) -> - Candidates = collect_header_candidates(From, To, Opts, []), - Workers = hb_opts:get(<<"copycat-headers-workers">>, 8, Opts), - ?event(copycat_short, - {headers_scan_started, - {from, From}, {to, To}, - {candidates, length(Candidates)}, - {workers, Workers} - } - ), - hb_pmap:parallel_map( - Candidates, - fun(ID) -> - case index_headers(ID, Opts) of - skipped -> ?event(copycat_short, {header_skipped, {id, ID}}); - {ok, _} -> ?event(copycat_short, {header_indexed, {id, ID}}); - {error, Reason} -> - ?event(copycat_short, - {header_index_crash, - {id, ID}, {reason, Reason} + +%% @doc Materialise a parsed `#tx{}' header into local-store so that +%% `hb_cache:match' can answer GraphQL filters. No-op when header indexing is +%% disabled via `<<"index-headers">>'. 
+write_item_header(TX, Codec, Opts) -> + case hb_opts:get(<<"index-headers">>, true, Opts) of + true -> + LocalOpts = hb_store:scope(Opts, local), + maybe + Msg = hb_message:convert(TX, <<"structured@1.0">>, Codec, LocalOpts), + {ok, _Path} ?= hb_cache:write(Msg, LocalOpts), + ?event(copycat_short, + {header_inline_written, + {id, {explicit, hb_util:encode(TX#tx.id)}}, + {codec, Codec} + } + ), + ok + else + {error, R} -> + ?event(copycat_short, + {header_write_failed, + {id, {explicit, hb_util:encode(TX#tx.id)}}, + {codec, Codec}, + {reason, R} } - ) - end - end, - Workers - ), - ?event(copycat_short, {headers_scan_completed, {from, From}, {to, To}}), - {ok, {From, To}}. - -%% @doc Walk indexed blocks from `Current' down to `To', collecting all -%% confirmed item IDs across every depth. -collect_header_candidates(Current, To, _Opts, Acc) when Current < To -> Acc; -collect_header_candidates(Current, To, Opts, Acc) -> - case hb_store_arweave:read_block_marker_depth(Current, Opts) of - undefined -> - collect_header_candidates(Current - 1, To, Opts, Acc); - _Depth -> - Candidates = - lists:append(maps:values(hb_store_arweave:read_block_item_ids(Current, Opts))), - collect_header_candidates(Current - 1, To, Opts, Candidates ++ Acc) - end. - -%% @doc If the ID is not already in the local store, read it from the arweave -%% store and write it into the local store. -index_headers(ID, Opts) -> - LocalOpts = hb_store:scope(Opts, local), - try - case hb_cache:read(ID, LocalOpts) of - {ok, _} -> skipped; - _ -> - {ok, Msg} = hb_cache:read(ID, Opts), - hb_cache:write(Msg, LocalOpts) - end - catch _:Reason -> - {error, Reason} + ), + {error, R} + end; + _ -> ok end. 
fetch_block_header(Height, Opts) -> @@ -898,6 +857,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, end), #{ <<"index-store">> := IndexStore } = ArweaveStore, ok = hb_store_arweave:write_parent(TX#tx.id, BlockHeight, block, IndexStore, Opts), + ok = write_item_header(TX, <<"tx@1.0">>, Opts), try is_bundle_tx(TX, Opts) of false -> #{items_count => 0, bundle_count => 0, skipped_count => 0, @@ -1382,6 +1342,11 @@ index_full_bundle_items( Opts ), ok = hb_store_arweave:write_parent(ItemID, ParentID, bundle, IndexStore, Opts), + ok = + case ParseResult of + {ok, _, Parsed} -> write_item_header(Parsed, <<"ans104@1.0">>, Opts); + _ -> ok + end, {DescendantCount, ItemAchievedDepth, ChildIDs} = case {Depth > 1, ParseResult} of {true, {ok, HeaderSize, ParsedItem}} -> @@ -3524,7 +3489,8 @@ strip_preserves_verify_test_parallel() -> L3ID = <<"8aJrRWtHcJvJ61qsH6agGkemzrtLw3W22xFrpCGAnTM">>, lists:foreach( fun(ID) -> - index_headers(ID, Opts), + %% mode=write now materialises the header inline, so no + %% separate index_headers call is needed. {ok, HeaderMsg} = hb_cache:read(ID, hb_store:scope(Opts, local)), ?event( {verify_msg, From 1a5f102936bdb0240fea18a0a88bac95dc28fb33 Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Tue, 12 May 2026 15:37:10 -0400 Subject: [PATCH 57/68] fix: gql queries matching with lower and uppercase tag names --- src/preloaded/query/dev_query_graphql.erl | 37 ++++++++++++----------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/src/preloaded/query/dev_query_graphql.erl b/src/preloaded/query/dev_query_graphql.erl index 7fda6ecdc..a269d25f2 100644 --- a/src/preloaded/query/dev_query_graphql.erl +++ b/src/preloaded/query/dev_query_graphql.erl @@ -240,23 +240,26 @@ message_query(Msg, <<"cursor">>, _Args, Opts) -> message_query(_Obj, _Field, _, _) -> {ok, <<"Not found.">>}. 
-keys_to_template(Keys) -> - maps:from_list(lists:foldl( - fun(#{<<"name">> := Name, <<"value">> := Value}, Acc) -> - [{Name, Value} | Acc]; - (#{<<"name">> := Name, <<"values">> := [Value]}, Acc) -> - [{Name, Value} | Acc]; - (#{<<"name">> := Name, <<"values">> := Values}, _Acc) -> - throw( - {multivalue_tag_search_not_supported, #{ - <<"name">> => Name, - <<"values">> => Values - }} - ) - end, - [], - Keys - )). +%% @doc Build a tag-match template from a list of GraphQL tag filters. +keys_to_template(Keys) -> + maps:from_list([key_to_pair(K) || K <- Keys]). + +key_to_pair(#{ <<"name">> := Name, <<"value">> := Value }) -> + {normalize_tag_name(Name), Value}; +key_to_pair(#{ <<"name">> := Name, <<"values">> := [Value] }) -> + {normalize_tag_name(Name), Value}; +key_to_pair(#{ <<"name">> := Name, <<"values">> := Values} ) -> + throw( + {multivalue_tag_search_not_supported, #{ + <<"name">> => Name, + <<"values">> => Values + }} + ). + +%% @doc Lowercase a GraphQL tag name to match the storage convention used by +%% `dev_codec_ans104_from'. Without this, query for `Action' never matches `action'. +normalize_tag_name(Name) -> + hb_util:to_lower(hb_ao:normalize_key(Name)). %%% Test helpers. From f18a6ffc599834164d5fcebeef53a0619c26fbe5 Mon Sep 17 00:00:00 2001 From: Niko Storni Date: Tue, 19 May 2026 01:20:59 +0200 Subject: [PATCH 58/68] chore: align rebase resolutions with edge conventions - dev_copycat_arweave: cap bundle_header reads at Size - dev_copycat_arweave: route through lib_arweave_common - hb_store: dedupe start/stop/scope via COMMON_POLICIES --- src/core/store/hb_store.erl | 6 +++--- src/preloaded/query/dev_copycat_arweave.erl | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/core/store/hb_store.erl b/src/core/store/hb_store.erl index cc1a2141e..07897b7aa 100644 --- a/src/core/store/hb_store.erl +++ b/src/core/store/hb_store.erl @@ -92,9 +92,9 @@ behavior_info(callbacks) -> %% @doc Store access policies to function names. 
-define(STORE_ACCESS_POLICIES, #{ - <<"read">> => [read, resolve, list, type, match, scope, start, stop], - <<"write">> => [write, link, group, reset, scope, start, stop], - <<"admin">> => [start, stop, reset, scope] + <<"read">> => [read, resolve, list, type, match] ++ ?COMMON_POLICIES, + <<"write">> => [write, link, group, reset] ++ ?COMMON_POLICIES, + <<"admin">> => [reset] ++ ?COMMON_POLICIES }). %%% Store named terms registry functions. diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index aa44b1688..4992cdfbe 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -947,9 +947,10 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, #{items_count => 0, bundle_count => 0, skipped_count => 1, achieved_depth => 0} end. +%% @doc Download and decode a bundle header from chunk data. download_bundle_header(EndOffset, Size, Opts) -> observe_event(<<"bundle_header">>, fun() -> - dev_arweave:bundle_header(EndOffset - Size, Opts) + lib_arweave_common:bundle_header(EndOffset - Size, Size, Opts) end). %% @doc Process transactions: spawn workers and manage the worker pool. 
From 9b1a971b3450de501b0b1dfb29993bb82228035b Mon Sep 17 00:00:00 2001 From: Niko Storni Date: Tue, 19 May 2026 02:43:15 +0200 Subject: [PATCH 59/68] fix: prevent crash on copycat tag filtering - the previous code called into a module that no longer exists - routes the tag lookup to the existing helper instead - tag include and exclude filtering works again --- src/preloaded/arweave/dev_arweave.erl | 8 ++++---- src/preloaded/query/dev_copycat_arweave.erl | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index cae0c4884..7d4ec7fca 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -1021,15 +1021,15 @@ parent(Base, Request, Opts) -> {error, not_found} catch error:Reason:Stacktrace -> - ?event(error, - {parent_read_error, + ?event(error, + {parent_read_error, {id, ID}, {reason, Reason}, {stacktrace, Stacktrace} }), - {failure, + {failure, #{ - <<"status">> => 500, + <<"status">> => 500, <<"type">> => <<"parent_read_error">> } } diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 4992cdfbe..30e98ffdb 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -313,7 +313,7 @@ maybe_exclude_tag(TX, ExcludeTag) -> end. has_tag_pair(#tx{tags = Tags}, #{name := Name, value := Value}) -> - TagValue = dev_arweave_common:tagfind(Name, Tags, not_found), + TagValue = ar_tx:tagfind(Name, Tags, not_found), case TagValue of not_found -> false; @@ -3511,4 +3511,4 @@ index_of(Elem, List) -> index_of(Elem, List, 1). index_of(_Elem, [], _N) -> not_found; index_of(Elem, [Elem | _], N) -> N; -index_of(Elem, [_ | Rest], N) -> index_of(Elem, Rest, N + 1). \ No newline at end of file +index_of(Elem, [_ | Rest], N) -> index_of(Elem, Rest, N + 1). 
From 6ad9d4530a8896b8f1e9114cb11a79b97de5dde5 Mon Sep 17 00:00:00 2001 From: Niko Storni Date: Wed, 20 May 2026 02:38:20 +0200 Subject: [PATCH 60/68] fix: bound copycat range test indexing depth - keep range/list/auto-stop setup writes at direct bundle-item depth - avoid exercising full recursive chunk indexing in marker-focused tests --- src/preloaded/query/dev_copycat_arweave.erl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 30e98ffdb..4744460f5 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -2221,7 +2221,7 @@ tx_with_no_data_test_parallel() -> "~copycat@1.0/arweave&" "from=", BlockBin/binary, "&" "to=", BlockBin/binary, "&" - "mode=write" + "mode=write&depth=2" >>, Opts ), @@ -2290,7 +2290,7 @@ list_index_test_parallel() -> "~copycat@1.0/arweave&" "from=", BlockBin/binary, "&" "to=", BlockBin/binary, "&" - "mode=write" + "mode=write&depth=2" >>, Opts ), @@ -2338,7 +2338,7 @@ auto_stop_on_indexed_block_test_parallel() -> "~copycat@1.0/arweave&" "from=", (hb_util:bin(IndexedBlock))/binary, "&" "to=", (hb_util:bin(IndexedBlock))/binary, "&" - "mode=write" + "mode=write&depth=2" >>, Opts ), @@ -2347,7 +2347,7 @@ auto_stop_on_indexed_block_test_parallel() -> << "~copycat@1.0/arweave&" "from=", (hb_util:bin(Higher2))/binary, "&" - "mode=write" + "mode=write&depth=2" >>, Opts ), @@ -2370,7 +2370,7 @@ explicit_to_reindexes_all_test_parallel() -> "~copycat@1.0/arweave&" "from=", (hb_util:bin(IndexedBlock))/binary, "&" "to=", (hb_util:bin(IndexedBlock))/binary, "&" - "mode=write" + "mode=write&depth=2" >>, Opts ), @@ -2381,7 +2381,7 @@ explicit_to_reindexes_all_test_parallel() -> "~copycat@1.0/arweave&" "from=", (hb_util:bin(IndexedBlock+1))/binary, "&" "to=", (hb_util:bin(LowerBlock))/binary, "&" - "mode=write" + "mode=write&depth=2" >>, Opts ), @@ -2404,7 +2404,7 @@ 
auto_stop_partial_index_test_parallel() -> "~copycat@1.0/arweave&" "from=", (hb_util:bin(Block))/binary, "&" "to=", (hb_util:bin(Block))/binary, "&" - "mode=write" + "mode=write&depth=2" >>, NoIndexOpts ), @@ -2427,7 +2427,7 @@ auto_stop_partial_index_test_parallel() -> << "~copycat@1.0/arweave&" "from=", (hb_util:bin(HigherBlock))/binary, "&" - "mode=write" + "mode=write&depth=2" >>, Opts ), @@ -2540,7 +2540,7 @@ negative_from_index_test_parallel() -> "~copycat@1.0/arweave&" "from=", (hb_util:bin(StopBlock))/binary, "&" "to=", (hb_util:bin(StopBlock))/binary, "&" - "mode=write" + "mode=write&depth=2" >>, Opts ), @@ -2549,7 +2549,7 @@ negative_from_index_test_parallel() -> << "~copycat@1.0/arweave&" "from=", NegativeFrom/binary, "&" - "mode=write" + "mode=write&depth=2" >>, Opts ), From 2a962bb42495a6dd00a91c0a91849b69912f7d1f Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Tue, 2 Jun 2026 21:15:24 -0400 Subject: [PATCH 61/68] feat(query): self-contained block-range, tx.block, tag and owner queries - enumerate transactions over a block range from the per-block item index - resolve transaction.block by walking the parent index to the L1 block - multi-value tag filters match any (OR), intersect across filters (AND) - scope query reads to local so owners/recipients never hit arweave.net (5s to ~30ms); drop not_found commitments that returned 500 - deterministic pagination via {offset, id} total order - graphql: accept operationName: null from GraphQL IDEs --- src/preloaded/query/dev_query_arweave.erl | 270 +++++++++++++++++----- src/preloaded/query/dev_query_graphql.erl | 6 +- 2 files changed, 212 insertions(+), 64 deletions(-) diff --git a/src/preloaded/query/dev_query_arweave.erl b/src/preloaded/query/dev_query_arweave.erl index 5eb1714f7..4c0544fd0 100644 --- a/src/preloaded/query/dev_query_arweave.erl +++ b/src/preloaded/query/dev_query_arweave.erl @@ -54,7 +54,15 @@ query(Obj, <<"transactions">>, Args, Opts) -> {field, <<"transactions">>}, {args, Args} }), - 
Matches = match_args(Args, Opts), + Matches = + case has_set_filter(Args, Opts) of + true -> match_args(Args, Opts); + false -> + enumerate_block_range( + hb_maps:get(<<"block">>, Args, undefined, Opts), + Opts + ) + end, WithExplicit = case explicit_ids(Args, Opts) of [] -> Matches; @@ -78,9 +86,16 @@ query(Obj, <<"transactions">>, Args, Opts) -> ?event({transactions_matches, Matches}), {ok, connection(Ordered, Args, Opts)}; query(Obj, <<"block">>, Args, Opts) -> - case query(Obj, <<"blocks">>, Args, Opts) of - {ok, []} -> {ok, null}; - {ok, [Msg|_]} -> {ok, Msg} + case hb_maps:get(<<"id">>, Args, undefined, Opts) of + undefined -> + %% `block' field on a transaction node: resolve its containing + %% block by walking the parent index up to the L1 block. + tx_containing_block(Obj, Opts); + _ -> + case query(Obj, <<"blocks">>, Args, Opts) of + {ok, []} -> {ok, null}; + {ok, [Msg|_]} -> {ok, Msg} + end end; query(Obj, <<"blocks">>, Args, Opts) -> ?event({blocks, @@ -93,7 +108,7 @@ query(Obj, <<"blocks">>, Args, Opts) -> Blocks = lists:filtermap( fun(Match) -> - case hb_cache:read(Match, Opts) of + case hb_cache:read(Match, local_opts(Opts)) of {ok, Msg} -> {true, Msg}; _ -> false end @@ -110,35 +125,20 @@ query(Block, <<"height">>, _Args, Opts) -> query(Block, <<"timestamp">>, _Args, Opts) -> {ok, hb_maps:get(<<"timestamp">>, Block, null, Opts)}; query(Msg, <<"signature">>, _Args, Opts) -> - % Return the signature of the transaction. - % Other TX access methods are defined below. 
- case hb_message:commitments(#{ <<"committer">> => '_' }, Msg, Opts) of + case first_commitment(<<"committer">>, Msg, Opts) of not_found -> {ok, null}; - Commitments -> - case hb_maps:keys(Commitments) of - [] -> {ok, null}; - [CommID | _] -> - {ok, Commitment} = hb_maps:find(CommID, Commitments, Opts), - hb_maps:find(<<"signature">>, Commitment, Opts) - end + {ok, Commitment} -> hb_maps:find(<<"signature">>, Commitment, Opts) end; query(Msg, <<"owner">>, _Args, Opts) -> - ?event({query_owner, Msg}), - case hb_message:commitments(#{ <<"committer">> => '_' }, Msg, Opts) of + case first_commitment(<<"committer">>, Msg, Opts) of not_found -> {ok, null}; - Commitments -> - case hb_maps:keys(Commitments) of - [] -> {ok, null}; - [CommID | _] -> - {ok, Commitment} = hb_maps:find(CommID, Commitments, Opts), - {ok, Address} = hb_maps:find(<<"committer">>, Commitment, Opts), - {ok, KeyID} = hb_maps:find(<<"keyid">>, Commitment, Opts), - Key = hb_util:remove_scheme_prefix(KeyID), - {ok, #{ - <<"address">> => Address, - <<"key">> => Key - }} - end + {ok, Commitment} -> + {ok, Address} = hb_maps:find(<<"committer">>, Commitment, Opts), + {ok, KeyID} = hb_maps:find(<<"keyid">>, Commitment, Opts), + {ok, #{ + <<"address">> => Address, + <<"key">> => hb_util:remove_scheme_prefix(KeyID) + }} end; query(#{ <<"key">> := Key }, <<"key">>, _Args, _Opts) -> {ok, Key}; @@ -203,20 +203,26 @@ encode_anchor(Bin) when is_binary(Bin), byte_size(Bin) == 43 -> {ok, Bin}; encode_anchor(Bin) when is_binary(Bin), byte_size(Bin) == 64 -> {ok, Bin}; encode_anchor(Other) -> {error, <<"invalid_anchor: ", Other/binary>>}. -%% @doc Find and return a value from the fields of a message (from its -%% commitments). -find_field_key(Field, Msg, Opts) -> - case hb_message:commitments(#{ Field => '_' }, Msg, Opts) of - not_found -> {ok, null}; +%% @doc Return the first commitment of a message matching `MatchField', or +%% `not_found'. Centralizes the commitments lookup used by the field accessors. 
+first_commitment(MatchField, Msg, Opts) -> + case hb_message:commitments(#{ MatchField => '_' }, Msg, Opts) of + not_found -> not_found; Commitments -> case hb_maps:keys(Commitments) of - [] -> {ok, null}; - [CommID | _] -> - {ok, Commitment} = hb_maps:find(CommID, Commitments, Opts), - case hb_maps:find(Field, Commitment, Opts) of - {ok, Value} -> {ok, Value}; - error -> {ok, null} - end + [] -> not_found; + [CommID | _] -> hb_maps:find(CommID, Commitments, Opts) + end + end. + +%% @doc Find and return a committed field value from a message, or null. +find_field_key(Field, Msg, Opts) -> + case first_commitment(Field, Msg, Opts) of + not_found -> {ok, null}; + {ok, Commitment} -> + case hb_maps:find(Field, Commitment, Opts) of + {ok, Value} -> {ok, Value}; + error -> {ok, null} end end. @@ -226,7 +232,7 @@ connection(Ordered, Args, Opts) -> ResultsCount = length(Ordered), {DroppedCount, Remaining} = drop_to_cursor(Args, Ordered, Opts), CountToReturn = page_size(Args, Opts), - ResultsPage = read_ids(Remaining, CountToReturn, Opts), + ResultsPage = read_ids(Remaining, CountToReturn, local_opts(Opts)), #{ <<"count">> => hb_util:bin(ResultsCount), <<"edges">> => ResultsPage, @@ -293,8 +299,9 @@ sort_offset_annotated(AnnotatedIDs, SortOrder, _Opts) -> ), Ascending = lists:sort( - fun(#{ <<"offset">> := OffsetA }, #{ <<"offset">> := OffsetB }) -> - OffsetA < OffsetB + fun(#{ <<"offset">> := OffsetA, <<"id">> := IdA }, + #{ <<"offset">> := OffsetB, <<"id">> := IdB }) -> + {OffsetA, IdA} =< {OffsetB, IdB} end, WithOffset ), @@ -310,7 +317,12 @@ sort_offset_annotated(AnnotatedIDs, SortOrder, _Opts) -> {without_offset, length(WithoutOffset)} } ), - UserOrderSorted ++ WithoutOffset. + StableWithout = + lists:sort( + fun(#{ <<"id">> := IdA }, #{ <<"id">> := IdB }) -> IdA =< IdB end, + WithoutOffset + ), + UserOrderSorted ++ StableWithout. 
%% @doc Convert a block height range (`#{<<"min">> => Min, <<"max">> => Max}') %% into weave byte offset boundaries `{StartOffset, EndOffset}'. Notably, the @@ -361,18 +373,7 @@ block_range_to_offset_range(Heights, Opts) -> read_block(Height, Opts) -> case read_cached_block(Height, Opts) of {ok, Block} -> {ok, Block}; - {error, not_found} -> - case hb_opts:get(query_arweave_remote_block_ranges, true, Opts) of - true -> - ?event({read_block_remote, {height, Height}}), - hb_ao:resolve( - #{ <<"device">> => <<"arweave@2.9">> }, - #{ <<"path">> => <<"block">>, <<"block">> => Height }, - Opts - ); - _ -> not_found - end; - not_found -> + _NotCached -> case hb_opts:get(query_arweave_remote_block_ranges, true, Opts) of true -> ?event({read_block_remote, {height, Height}}), @@ -487,7 +488,10 @@ match(<<"id">>, ID, _Opts) -> match(<<"ids">>, IDs, _Opts) -> {ok, IDs}; match(<<"tags">>, Tags, Opts) -> - hb_cache:match(dev_query_graphql:keys_to_template(Tags), Opts); + case lists:any(fun(T) -> is_multi_value_tag(T, Opts) end, Tags) of + false -> hb_cache:match(dev_query_graphql:keys_to_template(Tags), Opts); + true -> {ok, intersect_id_sets([tag_filter_ids(T, Opts) || T <- Tags])} + end; match(<<"owners">>, Owners, Opts) -> {ok, matching_commitments(<<"committer">>, Owners, Opts)}; match(<<"owner">>, Owner, Opts) -> @@ -499,6 +503,44 @@ match(<<"recipients">>, Recipients, Opts) -> match(UnsupportedFilter, _, _) -> throw({unsupported_query_filter, UnsupportedFilter}). +%% @doc True if a tag filter supplies more than one value (Arweave OR match). +is_multi_value_tag(Tag, Opts) -> + case hb_maps:get(<<"values">>, Tag, undefined, Opts) of + Values when is_list(Values) -> length(Values) > 1; + _ -> false + end. + +%% @doc The IDs matching a single tag filter: the union over its values. 
+tag_filter_ids(Tag, Opts) -> + Name = hb_maps:get(<<"name">>, Tag, undefined, Opts), + NormName = hb_util:to_lower(hb_ao:normalize_key(Name)), + lists:foldl( + fun(Value, Acc) -> + case hb_cache:match(#{ NormName => Value }, Opts) of + {ok, IDs} -> hb_util:unique(IDs ++ Acc); + _ -> Acc + end + end, + [], + tag_filter_values(Tag, Opts) + ). + +%% @doc The values of a tag filter, accepting either `values' or singular `value'. +tag_filter_values(Tag, Opts) -> + case hb_maps:get(<<"values">>, Tag, undefined, Opts) of + Values when is_list(Values) -> Values; + _ -> + case hb_maps:get(<<"value">>, Tag, undefined, Opts) of + undefined -> []; + Value -> [Value] + end + end. + +%% @doc Intersect a list of ID sets (AND across tag filters). +intersect_id_sets([]) -> []; +intersect_id_sets([First | Rest]) -> + lists:foldl(fun(Set, Acc) -> hb_util:list_with(Set, Acc) end, First, Rest). + %%% Block range post-filter %% @doc Offset-annotate a list of IDs, returning {StartOffset, ID} pairs. @@ -588,13 +630,24 @@ matching_commitments(Field, Value, Opts) when is_binary(Value) -> {ids, IDs} } ), - lists:map(fun(ID) -> commitment_id_to_base_id(ID, Opts) end, IDs); + lists:filtermap( + fun(ID) -> + case commitment_id_to_base_id(ID, Opts) of + not_found -> false; + BaseID -> {true, BaseID} + end + end, + IDs + ); _ -> not_found end. %% @doc Convert a commitment message's ID to a base ID. commitment_id_to_base_id(ID, Opts) -> - Store = hb_opts:get(store, no_store, Opts), + %% Read the matched commitment's signature from the local-scoped store only. + %% Using the full store here cascades to the gateway/arweave stores + %% on a local miss, adding seconds per matched owner/recipient. + Store = scoped_store(Opts), ?event({commitment_id_to_base_id, ID}), case hb_store:read(Store, << ID/binary, "/signature">>, Opts) of {ok, EncSig} -> @@ -637,11 +690,16 @@ all_signed_ids(ID, Store, Opts) -> [ID] end. 
+%% @doc Opts with the store scoped to the local stores (per `query_arweave_scope', +%% default `[local]'), so query result reads never cascade to the gateway/arweave +%% network stores. Keeps the query path self-contained. +local_opts(Opts) -> + hb_store:scope(Opts, hb_opts:get(query_arweave_scope, [local], Opts)). + %% @doc Scope the stores used for block matching. The searched stores can be %% scoped by setting the `query_arweave_scope' option. scoped_store(Opts) -> - Scope = hb_opts:get(query_arweave_scope, [local], Opts), - hb_opts:get(store, no_store, hb_store:scope(Opts, Scope)). + hb_opts:get(store, no_store, local_opts(Opts)). %% @doc Return the explicit IDs from the arguments, if given. Searches for %% both `ids' and `id' keys. @@ -657,6 +715,92 @@ explicit_ids(Args, Opts) -> end ). +%% @doc True if the args contain any filter that produces a candidate ID set. +%% When false, a `transactions' query enumerates from the block range instead +%% of intersecting matchers. +has_set_filter(Args, Opts) -> + lists:any( + fun(Key) -> + case hb_maps:get(Key, Args, undefined, Opts) of + undefined -> false; + null -> false; + _ -> true + end + end, + [<<"ids">>, <<"id">>, <<"owners">>, <<"recipients">>, <<"tags">>] + ). + +%% @doc Enumerate all indexed transaction IDs within a block height range, using +%% the per-block item index written by the copycat at index time. Returns `[]' +%% when no range is given (an unbounded, unfiltered transactions query). 
+enumerate_block_range(undefined, _Opts) -> []; +enumerate_block_range(null, _Opts) -> []; +enumerate_block_range(Heights, Opts) -> + Min = hb_util:int(hb_maps:get(<<"min">>, Heights, 0, Opts)), + Max = + case hb_maps:get(<<"max">>, Heights, undefined, Opts) of + undefined -> hb_util:ok_or(latest_cached_block(Opts), Min); + RawMax -> hb_util:int(RawMax) + end, + HeightRange = + case Max >= Min of + true -> lists:seq(Min, Max); + false -> [] + end, + lists:flatmap( + fun(Height) -> + maps:fold( + fun(_Depth, IDs, Acc) -> IDs ++ Acc end, + [], + hb_store_arweave:read_block_item_ids(Height, Opts) + ) + end, + HeightRange + ). + +%% @doc Resolve the block that contains a transaction node, for the `block' +%% field of a `Transaction'. Walks the parent index from the item up to its L1 +%% block, then loads the block message. Returns `{ok, null}' when unknown. +tx_containing_block(Msg, Opts) -> + try hb_message:id(Msg, all, Opts) of + ID when is_binary(ID) -> + case tx_block_height(ID, Opts) of + {ok, Height} -> + case read_block(Height, Opts) of + {ok, Block} -> {ok, Block}; + _ -> {ok, null} + end; + not_found -> {ok, null} + end; + _ -> {ok, null} + catch _:_ -> {ok, null} + end. + +%% @doc Find the L1 block height containing an item by following the parent +%% index: a `block' entry resolves directly; a `bundle' entry recurses into the +%% parent. Bounded to guard against cyclic/corrupt parent chains. +tx_block_height(ID, Opts) -> + case hb_store_arweave:store_from_opts(Opts) of + no_store -> not_found; + Store -> walk_parent_to_block(ID, Store, Opts, 0) + end. 
+ +walk_parent_to_block(_ID, _Store, _Opts, Depth) when Depth > 8 -> not_found; +walk_parent_to_block(ID, Store, Opts, Depth) -> + case hb_store_arweave:read_parent(Store, ID, Opts) of + {ok, Entries} -> + case lists:keyfind(block, 2, Entries) of + {Height, block} -> {ok, Height}; + false -> + case [Parent || {Parent, bundle} <- Entries] of + [ParentID | _] -> + walk_parent_to_block(ParentID, Store, Opts, Depth + 1); + [] -> not_found + end + end; + _ -> not_found + end. + winston_to_ar(W) when is_integer(W), W >= 0 -> case {W div 1000000000000, W rem 1000000000000} of {Whole, 0} -> diff --git a/src/preloaded/query/dev_query_graphql.erl b/src/preloaded/query/dev_query_graphql.erl index a269d25f2..59f290684 100644 --- a/src/preloaded/query/dev_query_graphql.erl +++ b/src/preloaded/query/dev_query_graphql.erl @@ -108,7 +108,11 @@ handle(_Base, RawReq, Opts) -> end, ?event({request, {processed, Req}}), Query = hb_maps:get(<<"query">>, Req, <<>>, Opts), - OpName = hb_maps:get(<<"operationName">>, Req, undefined, Opts), + OpName = + case hb_maps:get(<<"operationName">>, Req, undefined, Opts) of + Name when is_binary(Name) -> Name; + _ -> undefined + end, Vars = case hb_maps:get(<<"variables">>, Req, #{}, Opts) of V when is_map(V) -> hb_message:uncommitted_deep(V, Opts); From ee0bb1fee60308da3ed4ccfcd9f3c8ccbb380ec6 Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Tue, 2 Jun 2026 21:15:24 -0400 Subject: [PATCH 62/68] refactor(copycat): consolidate result-map and json-response helpers Add counters/4 and json_response/1, simplify has_tag_pair, drop commented-out dead ecdsa tests. No behavior change; copycat suite 65/65. 
--- src/preloaded/query/dev_copycat_arweave.erl | 116 +++++--------------- 1 file changed, 30 insertions(+), 86 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 4744460f5..f64aab23a 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -320,10 +320,7 @@ has_tag_pair(#tx{tags = Tags}, #{name := Name, value := Value}) -> _ -> LowerTagValue = hb_util:to_lower(TagValue), LowerValue = hb_util:to_lower(Value), - case LowerTagValue of - LowerValue -> true; - _ -> false - end + LowerTagValue =:= LowerValue end; has_tag_pair(_, _) -> false. @@ -388,17 +385,10 @@ latest_height(Opts) -> list_index(From, undefined, Opts) -> list_index(From, 0, Opts); list_index(From, To, _Opts) when From < To -> - {ok, #{ - <<"content-type">> => <<"application/json">>, - <<"body">> => hb_json:encode(#{}) - }}; + json_response(#{}); list_index(From, To, Opts) -> Result = list_index_blocks(From, To, Opts, #{}), - JSON = hb_json:encode(Result), - {ok, #{ - <<"content-type">> => <<"application/json">>, - <<"body">> => JSON - }}. + json_response(Result). %% @doc Iterate through blocks and check index status for each transaction. list_index_blocks(Current, To, _Opts, Acc) when Current < To -> @@ -457,16 +447,16 @@ assemble_block_info(Height, Block, Opts) -> inventory_index(From, undefined, Opts) -> inventory_index(From, 0, Opts); inventory_index(From, To, _Opts) when From < To -> - {ok, #{ - <<"content-type">> => <<"application/json">>, - <<"body">> => hb_json:encode(#{}) - }}; + json_response(#{}); inventory_index(From, To, Opts) -> Result = inventory_local(From, To, Opts, #{}), - JSON = hb_json:encode(Result), + json_response(Result). + +%% @doc Wrap a map as an `application/json' HTTP response with encoded body. +json_response(Map) -> {ok, #{ <<"content-type">> => <<"application/json">>, - <<"body">> => JSON + <<"body">> => hb_json:encode(Map) }}. 
inventory_local(Current, To, _Opts, Acc) when Current < To -> Acc; @@ -830,11 +820,19 @@ parallel_map(Items, Fun, Opts) -> MaxWorkers = max(1, hb_opts:get(arweave_index_workers, 1, Opts)), hb_pmap:parallel_map(Items, Fun, MaxWorkers). +%% @doc Build the standard 4-key indexing counters result map. +counters(Items, Bundles, Skipped, Depth) -> + #{ + items_count => Items, + bundle_count => Bundles, + skipped_count => Skipped, + achieved_depth => Depth + }. + %% @doc Process a single transaction and return its contribution to the counters. %% Returns a map with keys: items_count, bundle_count, skipped_count process_block_tx({{padding, _PaddingRoot}, _EndOffset}, _BlockStartOffset, TargetDepth, _BlockHeight, _Opts) -> - #{items_count => 0, bundle_count => 0, skipped_count => 0, - achieved_depth => max(2, TargetDepth)}; + counters(0, 0, 0, max(2, TargetDepth)); process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, BlockHeight, Opts) -> ArweaveStore = hb_store_arweave:store_from_opts(Opts), TXID = hb_util:encode(TX#tx.id), @@ -860,8 +858,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, ok = write_item_header(TX, <<"tx@1.0">>, Opts), try is_bundle_tx(TX, Opts) of false -> - #{items_count => 0, bundle_count => 0, skipped_count => 0, - achieved_depth => max(2, TargetDepth)}; + counters(0, 0, 0, max(2, TargetDepth)); true when TargetDepth > 2 -> %% Retry to preserve bundle count try @@ -879,8 +876,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, {tx, {explicit, TX#tx.id}}, {reason, Reason}, {stacktrace, Stacktrace}}), - #{items_count => 0, bundle_count => 1, - skipped_count => 1, achieved_depth => 0} + counters(0, 1, 1, 0) end; true -> % Lightweight processing of block transactions to depth 2. 
We @@ -934,8 +930,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, {reason, Reason} } ), - #{items_count => 0, bundle_count => 1, - skipped_count => 1, achieved_depth => 0} + counters(0, 1, 1, 0) end catch _:Reason:Stacktrace -> @@ -944,7 +939,7 @@ process_block_tx({{TX, _TXDataRoot}, EndOffset}, BlockStartOffset, TargetDepth, {tx, {explicit, TX#tx.id}}, {reason, Reason}, {stacktrace, Stacktrace}}), - #{items_count => 0, bundle_count => 0, skipped_count => 1, achieved_depth => 0} + counters(0, 0, 1, 0) end. %% @doc Download and decode a bundle header from chunk data. @@ -984,8 +979,7 @@ process_block_txs(ValidTXs, BlockStartOffset, TargetDepth, BlockHeight, Opts) -> ) } end, - #{items_count => 0, bundle_count => 0, skipped_count => 0, - achieved_depth => ?DEPTH_SENTINEL}, + counters(0, 0, 0, ?DEPTH_SENTINEL), Results ), MergedIDs = merge_all_item_ids( @@ -1000,8 +994,7 @@ process_block_txs(ValidTXs, BlockStartOffset, TargetDepth, BlockHeight, Opts) -> %% @doc Process a single indexed L1 TX candidate after lightweight filter checks. 
maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> - Skipped = #{items_count => 0, bundle_count => 0, skipped_count => 1, - achieved_depth => 0}, + Skipped = counters(0, 0, 1, 0), NormalizedTXID = hb_util:native_id(TXID), EncodedTXID = hb_util:encode(NormalizedTXID), IndexStore = hb_store_arweave:store_from_opts(Opts), @@ -1084,12 +1077,7 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> {reason, effective_cap_exceeded} } ), - #{ - items_count => 0, - bundle_count => 1, - skipped_count => 1, - achieved_depth => 0 - }; + counters(0, 1, 1, 0); FilterReason -> ?event( copycat_short, @@ -1113,8 +1101,7 @@ process_l1_tx_direct(StartOffset, Length, Depth, IndexStore, EncodedTXID, Parent {reason, effective_cap_exceeded} } ), - #{items_count => 0, bundle_count => 1, - skipped_count => 1, achieved_depth => 0}; + counters(0, 1, 1, 0); false -> ok = hb_copycat_budget:lease(Length), try @@ -1173,12 +1160,7 @@ process_l1_tx( {reason, Reason} } ), - #{ - items_count => 0, - bundle_count => 1, - skipped_count => 1, - achieved_depth => 0 - } + counters(0, 1, 1, 0) end; {error, Reason} -> ?event( @@ -1188,12 +1170,7 @@ process_l1_tx( {reason, Reason} } ), - #{ - items_count => 0, - bundle_count => 1, - skipped_count => 1, - achieved_depth => 0 - }; + counters(0, 1, 1, 0); not_found -> ?event( copycat_short, @@ -1202,12 +1179,7 @@ process_l1_tx( {reason, not_found} } ), - #{ - items_count => 0, - bundle_count => 1, - skipped_count => 1, - achieved_depth => 0 - } + counters(0, 1, 1, 0) end. %% @doc Ensure the root L1 TX offset exists locally before `id=...` indexing. %% if the offset is missing and `query_l1_offset` is enabled, fetches the TX @@ -2146,34 +2118,6 @@ empty_block_test_parallel() -> ), ok. 
-% ecdsa_no_data_test() -> -% {_TestStore, _StoreOpts, Opts} = setup_index_opts(), -% {ok, 1827904} = -% hb_ao:resolve( -% <<"~copycat@1.0/arweave&from=1827904&to=1827904">>, -% Opts -% ), -% assert_bundle_read( -% Opts, -% <<"VNhX_pSANk_8j0jZBR5bh_5jr-lkfbHDjtHd8FKqx7U">>, -% [ -% {<<"3xDKhrCQcPuBtcm1ipZS5C9gAfFYClgHuHOHAXGfchM">>, <<"1">>}, -% {<<"JantC8f89VE-RidArHnU9589gY5T37NDXnWpI7H_psc">>, <<"7">>} -% ] -% ), -% ok. - -% ecdsa_with_data_test() -> -% {_TestStore, _StoreOpts, Opts} = setup_index_opts(), -% Block = 1720431, -% fetch_and_process_block(Block, Block, Opts), -% {ok, Block} = -% hb_ao:resolve( -% <<"~copycat@1.0/arweave&from=", (hb_util:bin(Block))/binary, "&to=", (hb_util:bin(Block))/binary>>, -% Opts -% ), -% ok. - %% @doc Disabled because the test takes ~30 seconds to run. %% dev_arweave:get_tx_data_tag_exclude_data_test has some test coverage for %% handling an L1 TX with a data tag. From 8ba34ef90907a831401a16f20e08cd168e62a88b Mon Sep 17 00:00:00 2001 From: speeddragon Date: Wed, 3 Jun 2026 18:39:32 +0100 Subject: [PATCH 63/68] fix: Test without binary store key --- src/preloaded/arweave/dev_arweave.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index 7d4ec7fca..811915534 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -1483,7 +1483,7 @@ tx_raw_fetch_error_round_trips_test() -> TXID = maps:get(<<"id">>, hb_json:decode(HeaderBody)), Opts = ClientOpts#{ - routes => [ + <<"routes">> => [ #{ <<"template">> => #{ @@ -1495,7 +1495,7 @@ tx_raw_fetch_error_round_trips_test() -> #{ <<"match">> => <<"^/arweave">>, <<"with">> => MockNode, - <<"opts">> => #{ http_client => httpc } + <<"opts">> => #{ <<"http_client">> => httpc } } ], <<"parallel">> => 1, From e4718cd046e04ce50b1f8ce1760e1c81aa53c1c6 Mon Sep 17 00:00:00 2001 From: Ayush Agrawal Date: Wed, 3 Jun 2026 15:06:00 -0400 Subject: [PATCH 64/68] 
fix(query): declare graphql as hb app dep so it ships in releases ensure_all_started(graphql) + graphql:load_schema in the ~query@1.0/graphql device failed on released nodes (e.g. PermawebOS): graphql was a build-only dep, so relx excluded it from rel/hb/lib. Works under rebar3 shell (all deps on path) but the released node lacked the app, so ensure_started/1 timed out and the endpoint 500'd. Declaring it in hb.app.src applications includes it. --- src/hb.app.src | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hb.app.src b/src/hb.app.src index 21773478a..495f9cc97 100644 --- a/src/hb.app.src +++ b/src/hb.app.src @@ -11,7 +11,8 @@ cowboy, os_mon, gun, - hackney + hackney, + graphql ]}, {env, []}, {modules, []}, From ea49c234a4cdc7954818b4ca627d9c4e67433386 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Thu, 4 Jun 2026 21:41:29 +0100 Subject: [PATCH 65/68] impr: Remove the cut over logic, created to support different versions. Fix tests --- src/core/store/hb_store_arweave.erl | 33 +--------- src/preloaded/query/dev_copycat_arweave.erl | 69 +++++++-------------- 2 files changed, 27 insertions(+), 75 deletions(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 2c5b9c209..8aedb842e 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -9,16 +9,15 @@ -export([store_from_opts/1, write_offset/6, write_parent/5, read_offset/3, read_parent/3, decode_parent_entries/1, read_chunks/3]). -export([block_indexed_path/1, block_items_path/2]). -export([read_block_item_counts/2, read_block_item_ids/2]). --export([ensure_cutover_height/2, read_cutover_height/1, is_tx_indexed/2 ]). +-export([is_tx_indexed/2 ]). -export([write_block_item_ids/4, read_block_marker_depth/2]). --export([decode_item_ids/1, is_block_indexed/3, is_post_cutover/2, mark_block_indexed/3 ]). +-export([decode_item_ids/1, is_block_indexed/3, mark_block_indexed/3 ]). -export([root_offset/2]). 
-include("include/hb.hrl"). --include("include/hb_store_arweave.hrl"). +-include("core/include/hb_store_arweave.hrl"). -include_lib("eunit/include/eunit.hrl"). -define(PARTITION_SIZE, 3_600_000_000_000). --define(CUTOVER_KEY, <<"block/marker-cutover-height">>). %% @doc Find the first Arweave store from the given node message. Searches first %% for the `arweave_index_store' option, and if not found, searches the main @@ -496,24 +495,6 @@ mark_block_indexed(Height, Depth, Opts) -> Opts ). -%% @doc Read the persisted cutover height from the index store. -read_cutover_height(Opts) -> - Store = get_index_store(Opts), - case hb_store:read(Store, ?CUTOVER_KEY, Opts) of - {ok, Bin} -> hb_util:int(Bin); - {error, not_found} -> undefined - end. - -%% @doc Write the cutover height if not already set. -ensure_cutover_height(Height, Opts) -> - case read_cutover_height(Opts) of - undefined -> - Store = get_index_store(Opts), - hb_store:write(Store, #{?CUTOVER_KEY => hb_util:bin(Height)}, Opts), - ?event(copycat_short, {marker_cutover_initialized, {height, Height}}); - _ -> ok - end. - %% @doc Check if a transaction ID is indexed in the arweave index store. is_tx_indexed(TXID, Opts) -> Store = get_index_store(Opts), @@ -522,13 +503,6 @@ is_tx_indexed(TXID, Opts) -> {error, not_found} -> false end. -is_post_cutover(undefined, _Opts) -> false; -is_post_cutover(Height, Opts) -> - case read_cutover_height(Opts) of - undefined -> false; - Cutover -> Height >= Cutover - end. - get_index_store(Opts) -> case store_from_opts(Opts) of #{ <<"index-store">> := Store } -> Store; @@ -725,4 +699,3 @@ decode_item_ids_validation_test() -> BadBin = <<0:240>>, ?assertEqual({error, invalid_item_ids_binary}, decode_item_ids(BadBin)), ok. 
- diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index f64aab23a..b78b07715 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -482,9 +482,10 @@ write_item_header(TX, Codec, Opts) -> maybe Msg = hb_message:convert(TX, <<"structured@1.0">>, Codec, LocalOpts), {ok, _Path} ?= hb_cache:write(Msg, LocalOpts), - ?event(copycat_short, + ?event(debug_copycat, {header_inline_written, {id, {explicit, hb_util:encode(TX#tx.id)}}, + {tx, TX}, {codec, Codec} } ), @@ -621,11 +622,6 @@ fetch_blocks_open_ended(Current, TargetDepth, Workers, Opts) -> Workers ), case find_indexed_prefix(HeaderResults, TargetDepth, Opts) of - {all_unindexed, ToProcess} -> - process_prefetched_blocks( - ToProcess, TargetDepth, Workers, Opts), - fetch_blocks_open_ended( - BatchEnd - 1, TargetDepth, Workers, Opts); {stop_at, StopHeight, ToProcess} -> process_prefetched_blocks( ToProcess, TargetDepth, Workers, Opts), @@ -634,7 +630,12 @@ fetch_blocks_open_ended(Current, TargetDepth, Workers, Opts) -> {stop_at_indexed_block, StopHeight} } ), - {ok, StopHeight} + {ok, StopHeight}; + {all_unindexed, ToProcess} -> + process_prefetched_blocks( + ToProcess, TargetDepth, Workers, Opts), + fetch_blocks_open_ended( + BatchEnd - 1, TargetDepth, Workers, Opts) end. %% @doc Walk header results in order, return the unindexed prefix and @@ -671,21 +672,7 @@ process_prefetched_blocks(Blocks, TargetDepth, Workers, Opts) -> %% the cutover, falls back to legacy per-TX check. 
is_already_indexed({ok, Block}, TargetDepth, Opts) -> Height = hb_maps:get(<<"height">>, Block, undefined, Opts), - case hb_store_arweave:is_block_indexed(Height, TargetDepth, Opts) of - true -> - true; - false -> - case hb_store_arweave:is_post_cutover(Height, Opts) of - true -> - false; - false -> - TXIDs = hb_maps:get(<<"txs">>, Block, [], Opts), - lists:any( - fun(TXID) -> hb_store_arweave:is_tx_indexed(TXID, Opts) end, - TXIDs - ) - end - end; + hb_store_arweave:is_block_indexed(Height, TargetDepth, Opts); is_already_indexed({error, _}, _TargetDepth, _Opts) -> false. @@ -724,7 +711,6 @@ process_block(BlockRes, Current, To, TargetDepth, Opts) -> Current, AchievedDepth, ItemIDs, Opts), ok ?= hb_store_arweave:mark_block_indexed( Current, AchievedDepth, Opts), - hb_store_arweave:ensure_cutover_height(Current, Opts), ?event( copycat_short, {arweave_block_indexed, @@ -2335,12 +2321,13 @@ explicit_to_reindexes_all_test_parallel() -> %% @doc Manually write to the index to simulate a partially indexed block. %% This should also trigger a stop when the `to` option is omitted. auto_stop_partial_index_test_parallel() -> - {_TestStore, StoreOpts, Opts} = setup_index_opts(), + {IndexStore, StoreOpts, Opts} = setup_index_opts(), Block = 1826700, HigherBlock = Block + 1, NoIndexOpts = Opts#{ <<"arweave-index-ids">> => false, - <<"arweave-index-blocks">> => true + <<"arweave-index-blocks">> => true, + <<"index-headers">> => false }, {ok, Block} = hb_ao:resolve( @@ -2366,6 +2353,12 @@ auto_stop_partial_index_test_parallel() -> ?assert(length(TXIDs) > 0), [OneTXID | _] = TXIDs, ok = hb_store_arweave:write_offset(StoreOpts, OneTXID, <<"tx@1.0">>, 0, 0, Opts), + %% Write block depth marker, to indicate the block was previously indexed. 
+ hb_store:write( + IndexStore, + #{hb_store_arweave:block_indexed_path(Block) => integer_to_binary(2)}, + Opts + ), {ok, Block} = hb_ao:resolve( << @@ -2379,7 +2372,7 @@ auto_stop_partial_index_test_parallel() -> ?assert(has_any_indexed_tx(Block, Opts)), ?assertNot(has_any_indexed_tx(Block-1, Opts)), ?assert(hb_store_arweave:is_block_indexed(HigherBlock, 2, Opts)), - ?assertNot(hb_store_arweave:is_block_indexed(Block, 2, Opts)), + ?assert(hb_store_arweave:is_block_indexed(Block, 2, Opts)), ok. negative_parse_range_test_parallel() -> @@ -2867,7 +2860,8 @@ setup_index_opts() -> Opts = #{ <<"store">> => Store, <<"arweave-index-ids">> => true, - <<"arweave-index-store">> => StoreOpts + <<"arweave-index-store">> => StoreOpts, + <<"index-headers">> => false }, {TestStore, StoreOpts, Opts}. @@ -2989,23 +2983,6 @@ depth_1_normalizes_to_2_test() -> ?assertNot(hb_store_arweave:is_block_indexed(Height, 3, Opts)), ok. -block_marker_cutover_test() -> - {_TestStore, _StoreOpts, Opts} = setup_index_opts(), - LowerBlock = 1827941, - UpperBlock = 1827942, - {ok, UpperBlock} = - hb_ao:resolve( - <<"~copycat@1.0/arweave&from=", - (hb_util:bin(UpperBlock))/binary, "&to=", - (hb_util:bin(UpperBlock))/binary>>, - Opts - ), - Cutover = hb_store_arweave:read_cutover_height(Opts), - ?assertNotEqual(undefined, Cutover), - ?assert(hb_store_arweave:is_block_indexed(UpperBlock, 2, Opts)), - ?assertNot(hb_store_arweave:is_block_indexed(LowerBlock, 2, Opts)), - ok. - achieved_depth_block_depth_2_test() -> {_TestStore, _StoreOpts, Opts} = setup_index_opts(), Block = 1827942, @@ -3423,7 +3400,9 @@ parent_endpoint_not_found_test() -> ok. 
strip_preserves_verify_test_parallel() -> - {_TestStore, _StoreOpts, Opts} = setup_index_opts(), + {_TestStore, _StoreOpts, DefaultOpts} = setup_index_opts(), + Opts = DefaultOpts#{<<"index-headers">> => true}, + {ok, 1827942} = hb_ao:resolve( <<"~copycat@1.0/arweave&from=1827942&to=1827942&mode=write&depth=3">>, From 660c1cd365d68cb05f46629407e3ef7321e9b16e Mon Sep 17 00:00:00 2001 From: speeddragon Date: Thu, 4 Jun 2026 23:10:39 +0100 Subject: [PATCH 66/68] fix: Merge conflicts regarding offset renaming --- src/core/store/hb_store_arweave.erl | 35 +++++++++++---------- src/preloaded/arweave/dev_arweave.erl | 8 ++--- src/preloaded/query/dev_copycat_arweave.erl | 6 ++-- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/core/store/hb_store_arweave.erl b/src/core/store/hb_store_arweave.erl index 8aedb842e..ae246eafe 100644 --- a/src/core/store/hb_store_arweave.erl +++ b/src/core/store/hb_store_arweave.erl @@ -12,7 +12,7 @@ -export([is_tx_indexed/2 ]). -export([write_block_item_ids/4, read_block_marker_depth/2]). -export([decode_item_ids/1, is_block_indexed/3, mark_block_indexed/3 ]). --export([root_offset/2]). +-export([root_offset/3]). -include("include/hb.hrl"). -include("core/include/hb_store_arweave.hrl"). -include_lib("eunit/include/eunit.hrl"). @@ -78,11 +78,11 @@ type(_Store, #{ <<"type">> := _ID }, _NodeOpts) -> {error, not_found}. %% @doc Read the offset of the data at the given key. 
-read_offset(StoreOpts = #{ <<"index-store">> := IndexStore }, ID, Opts) -> +read_offset(#{ <<"index-store">> := IndexStore }, ID, Opts) -> ReadRes = hb_prometheus:measure_and_report( fun() -> - hb_store:read(IndexStore, hb_store_arweave_offset:path(ID), StoreOpts) + hb_store:read(IndexStore, hb_store_arweave_offset:path(ID), Opts) end, hb_store_arweave_index_check_duration_seconds ), @@ -92,7 +92,7 @@ read_offset(StoreOpts = #{ <<"index-store">> := IndexStore }, ID, Opts) -> hb_store_arweave_offset:decode(OffsetBinary), {ok, #{ <<"codec-device">> => CodecName, - <<"offset">> => Offset, + <<"start-offset">> => Offset, <<"length">> => Length }}; _ -> @@ -174,7 +174,7 @@ do_read(StoreOpts, ID, Opts) -> {ok, #{ <<"codec-device">> := Codec, - <<"offset">> := Offset, + <<"start-offset">> := Offset, <<"length">> := Length } } -> @@ -182,7 +182,7 @@ do_read(StoreOpts, ID, Opts) -> load_message( Codec, ID, - root_offset(Offset, StoreOpts), + root_offset(Offset, StoreOpts, Opts), Length, StoreOpts ), @@ -223,19 +223,19 @@ do_read(StoreOpts, ID, Opts) -> %% @doc Takes a `read_offset/2' result and returns it, normalized to the %% outer-most root that is known: Either the mempool or a global byte offset. -root_offset(relative, _Store) -> relative; -root_offset(GlobalOffset, _Store) when is_integer(GlobalOffset) -> GlobalOffset; -root_offset(Offset, Store) -> root_offset(Offset, 0, Store). -root_offset(#{ <<"relative">> := P, <<"offset">> := Off }, Acc, Store) -> - case read_offset(Store, P) of - {ok, #{ <<"offset">> := Next = #{ <<"relative">> := _, <<"offset">> := _ } }} -> +root_offset(relative, _Store, _Opts) -> relative; +root_offset(GlobalOffset, _Store, _Opts) when is_integer(GlobalOffset) -> GlobalOffset; +root_offset(Offset, Store, Opts) -> root_offset(Offset, 0, Store, Opts). 
+root_offset(#{ <<"relative">> := P, <<"offset">> := Off }, Acc, Store, Opts) -> + case read_offset(Store, P, Opts) of + {ok, #{ <<"start-offset">> := Next = #{ <<"relative">> := _, <<"offset">> := _ } }} -> % We have another relative offset. Continue. - root_offset(Next, Acc + Off, Store); - {ok, #{ <<"offset">> := relative }} -> + root_offset(Next, Acc + Off, Store, Opts); + {ok, #{ <<"start-offset">> := relative }} -> % We have reached an unconfirmed TX as the root of the relative offset % chain, so we return an offset against that. #{ <<"relative">> => P, <<"offset">> => Acc + Off }; - {ok, #{ <<"offset">> := GlobalOffset }} when is_integer(GlobalOffset) -> + {ok, #{ <<"start-offset">> := GlobalOffset }} when is_integer(GlobalOffset) -> % We have reached a confirmed TX as the root of the relative offset % chain, so we return a global offset. GlobalOffset + Acc + Off; @@ -244,7 +244,7 @@ root_offset(#{ <<"relative">> := P, <<"offset">> := Off }, Acc, Store) -> % and return it with the `relative` key intact. #{ <<"relative">> => P, <<"offset">> => Acc + Off } end; -root_offset(Other, _, _) -> Other. +root_offset(Other, _, _, _) -> Other. %% @doc Load a TX from Arweave. Supports either confirmed or pending TXs. load_message(<<"tx@1.0">>, ID, Type, _Length, Opts) -> @@ -652,7 +652,8 @@ root_offset_confirmed_parent_test() -> 12352, root_offset( #{ <<"relative">> => ParentID, <<"offset">> => 7 }, - ArweaveStoreOpts + ArweaveStoreOpts, + Opts ) ). diff --git a/src/preloaded/arweave/dev_arweave.erl b/src/preloaded/arweave/dev_arweave.erl index 811915534..58ff87b9a 100644 --- a/src/preloaded/arweave/dev_arweave.erl +++ b/src/preloaded/arweave/dev_arweave.erl @@ -170,15 +170,15 @@ head_raw(Base, Request, Opts) -> case find_key(<<"raw">>, Base, Request, Opts) of TXID when ?IS_ID(TXID) -> % Read the data from the local cache. 
- IndexStore = hb_store_arweave:store_from_opts(Opts), - case hb_store_arweave:read_offset(IndexStore, TXID, Opts) of + ArweaveStore = hb_store_arweave:store_from_opts(Opts), + case hb_store_arweave:read_offset(ArweaveStore, TXID, Opts) of {ok, #{ <<"codec-device">> := CodecDevice, - <<"offset">> := RawOffset, + <<"start-offset">> := RawOffset, <<"length">> := Length }} -> - StartOffset = hb_store_arweave:root_offset(RawOffset, Opts), + StartOffset = hb_store_arweave:root_offset(RawOffset, ArweaveStore, Opts), CodecFun = case CodecDevice of <<"ans104@1.0">> -> fun head_raw_ans104/4; diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index b78b07715..4e2239b77 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -993,7 +993,7 @@ maybe_process_l1_tx(TXID, Filters, Depth, QueryL1Offset, Opts) -> {ok, #{ <<"codec-device">> := <<"tx@1.0">>, - <<"offset">> := StartOffset, + <<"start-offset">> := StartOffset, <<"length">> := Length }} ?= observe_copycat_l1_stage( @@ -1175,7 +1175,7 @@ ensure_l1_tx_offset(_TXID, _EncodedTXID, IndexStore, _LoadL1Offset, _Opts) when is_map(IndexStore) =:= false -> {error, missing_offset}; ensure_l1_tx_offset(TXID, EncodedTXID, IndexStore, QueryL1Offset, Opts) -> - case hb_store_arweave:read_offset(IndexStore, TXID) of + case hb_store_arweave:read_offset(IndexStore, TXID, Opts) of {ok, _} = OffsetRes -> OffsetRes; not_found when QueryL1Offset -> @@ -1188,7 +1188,7 @@ ensure_l1_tx_offset(TXID, EncodedTXID, IndexStore, QueryL1Offset, Opts) -> ), case query_l1_tx_offset(EncodedTXID, IndexStore, Opts) of ok -> - case hb_store_arweave:read_offset(IndexStore, TXID) of + case hb_store_arweave:read_offset(IndexStore, TXID, Opts) of {ok, _} = OffsetRes -> OffsetRes; not_found -> From dc0ba05a6c99b44e11c84964d5c46fc1786a6c6d Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 5 Jun 2026 15:02:15 +0100 Subject: [PATCH 67/68] impr: Always 
reprocess blocks in ranged copycat arweave requests --- src/preloaded/query/dev_copycat_arweave.erl | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/preloaded/query/dev_copycat_arweave.erl b/src/preloaded/query/dev_copycat_arweave.erl index 4e2239b77..f5749aa6b 100644 --- a/src/preloaded/query/dev_copycat_arweave.erl +++ b/src/preloaded/query/dev_copycat_arweave.erl @@ -595,13 +595,9 @@ fetch_blocks_ranged(Current, To, TargetDepth, Workers, Opts) -> hb_pmap:parallel_map( Heights, fun(H) -> - case hb_store_arweave:is_block_indexed(H, TargetDepth, Opts) of - true -> ok; - false -> - observe_event(<<"block_indexed">>, fun() -> - fetch_and_process_block(H, To, TargetDepth, Opts) - end) - end + observe_event(<<"block_indexed">>, fun() -> + fetch_and_process_block(H, To, TargetDepth, Opts) + end) end, Workers ), From 4c98147595e2c057aee641c09f050e62e6b91951 Mon Sep 17 00:00:00 2001 From: speeddragon Date: Fri, 5 Jun 2026 19:51:24 +0100 Subject: [PATCH 68/68] impr: Update lmdb-rs to a version that doesn't cause OOM when using more than 1 worker for indexing --- rebar.config | 2 +- rebar.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rebar.config b/rebar.config index 83c8910f5..d60449bb1 100644 --- a/rebar.config +++ b/rebar.config @@ -176,7 +176,7 @@ ]}. 
{deps, [ - {elmdb, {git, "https://github.com/permaweb/elmdb-rs.git", {ref, "06ccf937abc250cb22c782d568efcaa39f5452ff"}}}, + {elmdb, {git, "https://github.com/permaweb/elmdb-rs.git", {ref, "68316484a5bd45bcc4b180a60883db7272c5dbde"}}}, {b64rs, {git, "https://github.com/permaweb/b64rs.git", {ref, "94b7d8e51d9a44f3bd12b7d138dd0d2cb74c169f"}}}, {base32, "1.0.0"}, {cowlib, "2.16.0"}, diff --git a/rebar.lock b/rebar.lock index 479c45556..0125bfea9 100644 --- a/rebar.lock +++ b/rebar.lock @@ -11,7 +11,7 @@ {<<"ddskerl">>,{pkg,<<"ddskerl">>,<<"0.4.2">>},1}, {<<"elmdb">>, {git,"https://github.com/permaweb/elmdb-rs.git", - {ref,"06ccf937abc250cb22c782d568efcaa39f5452ff"}}, + {ref,"68316484a5bd45bcc4b180a60883db7272c5dbde"}}, 0}, {<<"eqwalizer_support">>, {git_subdir,"https://github.com/whatsapp/eqwalizer.git",