Skip to content

Commit fa8ee9d

Browse files
committed
Add TagCache
1 parent 40cc950 commit fa8ee9d

11 files changed

Lines changed: 596 additions & 107 deletions

File tree

changelog/current.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ This release focuses in compliance with the YAML standard, mostly by ensuring pa
1313
- `NodeType`: rename TAGD->TAGH and TAGV->TAGP
1414
- Internal changes to improve the design of event handlers, moving relocation and error checking logic to `ParseEngine`, where it is most suited.
1515
- Fix warnings with clang-tidy 22
16+
- Add `TagCache` accelerator structure (in `c4/yml/tag.hpp`), used by `Tree::resolve_tags()` and while parsing. This significantly reduces the arena requirements for heavily-tagged YAML by allowing reuse of resolved tags.
1617

1718

1819
## YAML fixes: valid cases

samples/quickstart.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5150,14 +5150,25 @@ void sample_tag_directives()
51505150
%TAG !m! !meta-
51515151
--- !m!light green
51525152
)");
5153-
// tags are not resolved by default. Use .resolve_tags() to
5154-
// accomplish this:
5155-
tree.resolve_tags();
5153+
// tags are not resolved by default. Use Tree::resolve_tags()
5154+
// to accomplish this in an existing tree:
5155+
ryml::TagCache tag_cache; // reduces memory requirements by reusing resolved tags
5156+
tree.resolve_tags(tag_cache);
51565157
CHECK(ryml::emitrs_yaml<std::string>(tree) == R"(%TAG !m! !my-
51575158
--- !<!my-light> fluorescent
51585159
...
51595160
%TAG !m! !meta-
51605161
--- !<!meta-light> green
5162+
)");
5163+
// You can also use ParserOptions to force resolution of tags
5164+
// while parsing:
5165+
ryml::ParserOptions opts = ryml::ParserOptions{}.resolve_tags(true);
5166+
ryml::Tree resolved_tree = ryml::parse_in_arena(ryml::to_csubstr(yaml), opts);
5167+
CHECK(ryml::emitrs_yaml<std::string>(resolved_tree) == R"(%TAG !m! !my-
5168+
--- !<!my-light> fluorescent
5169+
...
5170+
%TAG !m! !meta-
5171+
--- !<!meta-light> green
51615172
)");
51625173
// see also tree.normalize_tags()
51635174
// see also tree.normalize_tags_long()

src/c4/yml/event_handler_tree.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ struct EventHandlerTree : public EventHandlerStack<EventHandlerTree, EventHandle
3737
* @{ */
3838

3939
using state = EventHandlerTreeState;
40+
enum { requires_strings_on_buffers = false };
4041

4142
/** @} */
4243

@@ -45,6 +46,7 @@ struct EventHandlerTree : public EventHandlerStack<EventHandlerTree, EventHandle
4546
/** @cond dev */
4647
Tree *C4_RESTRICT m_tree;
4748
id_type m_curr_doc;
49+
TagCache m_tag_cache;
4850

4951
#ifdef RYML_DBG
5052
#define _enable_(bits) _enable__<bits>(); _c4dbgpf("node[{}]: enable {}", m_curr->node_id, #bits)
@@ -91,11 +93,13 @@ struct EventHandlerTree : public EventHandlerStack<EventHandlerTree, EventHandle
9193
_reset_parser_state(m_curr, id, id);
9294
}
9395
m_curr_doc = m_tree->ancestor_doc(id);
96+
m_tag_cache.clear();
9497
}
9598

9699
Callbacks const& callbacks() const { return m_stack.m_callbacks; }
97100

98101
C4_ALWAYS_INLINE TagDirectives& tag_directives() { return m_tree->m_tag_directives; } // NOLINT(readability-make-member-function-const)
102+
C4_ALWAYS_INLINE TagCache &tag_cache() { return m_tag_cache; }
99103

100104
/** @} */
101105

src/c4/yml/parse_engine.def.hpp

Lines changed: 73 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -476,14 +476,30 @@ void ParseEngine<EventHandler>::_relocate_arena(csubstr prev_arena, substr next_
476476
}
477477
_ryml_relocate(m_evt_handler->m_src);
478478
for(size_t i = 0; i < m_pending_tags.num_entries; ++i)
479-
_ryml_relocate(m_pending_tags.annotations[i].str); // LCOV_EXCL_LINE
479+
{
480+
_ryml_relocate(m_pending_tags.annotations[i].str); // LCOV_EXCL_LINE
481+
_ryml_relocate(m_pending_tags.annotations[i].orig); // LCOV_EXCL_LINE
482+
}
480483
for(size_t i = 0; i < m_pending_anchors.num_entries; ++i)
481-
_ryml_relocate(m_pending_anchors.annotations[i].str); // LCOV_EXCL_LINE
482-
TagDirectives &tds = m_evt_handler->tag_directives();
483-
for(size_t i = 0, sz = tds.size(); i < sz; ++i)
484484
{
485-
_ryml_relocate(tds.m_directives[i].handle); // LCOV_EXCL_LINE
486-
_ryml_relocate(tds.m_directives[i].prefix); // LCOV_EXCL_LINE
485+
_ryml_relocate(m_pending_anchors.annotations[i].str); // LCOV_EXCL_LINE
486+
_ryml_relocate(m_pending_anchors.annotations[i].orig); // LCOV_EXCL_LINE
487+
}
488+
{
489+
TagDirectives &tds = m_evt_handler->tag_directives();
490+
for(size_t i = 0, sz = tds.size(); i < sz; ++i)
491+
{
492+
_ryml_relocate(tds.m_directives[i].handle); // LCOV_EXCL_LINE
493+
_ryml_relocate(tds.m_directives[i].prefix); // LCOV_EXCL_LINE
494+
}
495+
}
496+
{
497+
TagCache &tch = m_evt_handler->tag_cache();
498+
for(id_type i = 0, sz = tch.m_entries.size(); i < sz; ++i)
499+
{
500+
_ryml_relocate(tch.m_entries[i].tag); // LCOV_EXCL_LINE
501+
_ryml_relocate(tch.m_entries[i].resolved); // LCOV_EXCL_LINE
502+
}
487503
}
488504
if(other)
489505
_ryml_relocate(*other); // LCOV_EXCL_LINE
@@ -4608,27 +4624,63 @@ template<class EventHandler>
46084624
csubstr ParseEngine<EventHandler>::_resolve_tag(csubstr tag)
46094625
{
46104626
_c4dbgpf("resolving tag: {} curr_doc={}", _prs(tag), m_evt_handler->m_curr_doc);
4627+
_c4assert(tag.is_sub(_buf()));
4628+
TagCache::LookupResult ret = m_evt_handler->tag_cache().find(tag, m_evt_handler->m_curr_doc);
4629+
if(ret)
4630+
{
4631+
_c4dbgpf("resolving tag: found in cache[{}]: {}", ret.pos, _prs(ret.resolved));
4632+
return ret.resolved;
4633+
}
4634+
_c4dbgpf("resolving tag: not in cache: {} curr_doc={}", _prs(tag), m_evt_handler->m_curr_doc);
46114635
size_t bufsz = 0;
46124636
substr buf = m_evt_handler->arena_rem();
46134637
TagDirectives const& C4_RESTRICT tds = m_evt_handler->tag_directives();
46144638
csubstr ttag = tds.resolve(buf, &bufsz, tag, m_evt_handler->m_curr_doc,
46154639
m_evt_handler->m_curr->pos,
46164640
m_evt_handler->m_stack.m_callbacks);
4617-
_c4dbgpf("resolving tag: bufsz={}", bufsz);
4618-
if(bufsz)
4641+
_c4dbgpf("resolving tag: bufsz={} ttag.len={} !!ttag.str={}", bufsz, ttag.len, !!ttag.str);
4642+
_c4assert((bufsz > buf.len) == (!ttag.str));
4643+
_c4assert(!!bufsz == (ttag.len == bufsz));
4644+
// try again if the arena size was not enough
4645+
if(!ttag.str)
4646+
{
4647+
_c4dbgpf("tag requires arena, but it was small. arena.len={} arena.slack={} tag.required={}", m_evt_handler->arena_rem().len, m_evt_handler->arena().len, ttag.len);
4648+
_c4assert(ttag.len == bufsz);
4649+
buf = _alloc_arena(bufsz, &tag);
4650+
if(buf.str) // the alloc may fail eg with the ints handler
4651+
{
4652+
ttag = tds.resolve(buf, &bufsz, tag, m_evt_handler->m_curr_doc,
4653+
m_evt_handler->m_curr->pos,
4654+
m_evt_handler->m_stack.m_callbacks);
4655+
}
4656+
_c4assert(ttag.len == bufsz);
4657+
_c4assert(!ttag.str || ttag.is_sub(m_evt_handler->arena()));
4658+
}
4659+
else if(bufsz) // if we succeeded writing into the arena, grow it as needed
46194660
{
4620-
substr bufretry = _alloc_arena(bufsz, &tag);
4621-
if(C4_UNLIKELY(bufsz > buf.len))
4661+
_c4dbgp("tag required arena. update size");
4662+
_c4assert(ttag.len == bufsz);
4663+
_c4assert(ttag.is_sub(buf));
4664+
(void)_alloc_arena(bufsz);
4665+
}
4666+
if C4_IF_CONSTEXPR (EventHandler::requires_strings_on_buffers) // NOLINT
4667+
{
4668+
_c4dbgpf("handler requires tags in buffers. !!ttag.str={} in_arena={} in_src={}", !!ttag.str, ttag.is_sub(m_evt_handler->arena()), ttag.is_sub(_buf()));
4669+
// is the resolved tag not in any of those buffers?
4670+
if(ttag.str && !ttag.is_sub(m_evt_handler->arena()) && !ttag.is_sub(_buf()))
46224671
{
4623-
if(bufretry.str) // some handlers may be just counting the required arena size
4624-
{
4625-
ttag = tds.resolve(bufretry, &bufsz, tag, m_evt_handler->m_curr_doc,
4626-
m_evt_handler->m_curr->pos,
4627-
m_evt_handler->m_stack.m_callbacks);
4628-
}
4672+
_c4dbgpf("copying resolved tag to arena: slack={} required={}", m_evt_handler->arena_rem().len, ttag.len);
4673+
buf = _alloc_arena(ttag.len, &tag);
4674+
if(buf.str) // the alloc may fail eg with the ints handler
4675+
memcpy(buf.str, ttag.str, ttag.len);
4676+
ttag.str = buf.str; // keep the current len!
4677+
_c4assert(!ttag.str || ttag.is_sub(m_evt_handler->arena()));
46294678
}
46304679
}
4631-
_c4dbgpf("resolving tag: {} --> {}", _prs(tag), _prs(ttag.str ? ttag : csubstr("(arena full)")));
4680+
_c4dbgpf("resolved tag: {} --> [{}]~~~{}~~~", _prs(tag), ttag.len, ttag.str ? ttag : csubstr("(out of size)"));
4681+
_c4assert(ttag.len > 0);
4682+
// cache the hard-earned result!
4683+
m_evt_handler->tag_cache().add(tag, ttag, m_evt_handler->m_curr_doc, ret.pos);
46324684
return ttag;
46334685
}
46344686

@@ -8248,18 +8300,19 @@ uint32_t ParseEngine<EventHandler>::_get_annotations_same_line(csubstr token_sou
82488300
_c4dbgpf("second annotation: {} indent={} line={}", second->str, second->indentation, second->line);
82498301
}
82508302
auto extract_string = [&](EntryPtr e){
8251-
if(e->str.begins_with_any("!<"))
8303+
// tags can be null when the arena ran out of space
8304+
if(!e->str.str || e->str.begins_with_any("!<"))
82528305
{
82538306
csubstr tag = e->orig;
82548307
_c4assert(tag.str);
82558308
_c4assert(tag.len);
82568309
_c4assert(tag.is_sub(token_soup));
8257-
_c4dbgpf("tag: {} -> {}", e->str, tag);
8310+
_c4dbgpf("tag: {} -> {}", e->str.str ? e->str : csubstr("(out of size)"), tag);
82588311
return tag;
82598312
}
82608313
csubstr anchor = e->str;
8261-
_c4assert(anchor.str);
82628314
_c4assert(anchor.len);
8315+
_c4assert(anchor.str);
82638316
_c4assert(anchor.is_sub(token_soup));
82648317
_c4assert(!anchor.begins_with('&'));
82658318
_c4assert(anchor.str - token_soup.str > 0);

src/c4/yml/tag.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,5 +532,79 @@ csubstr TagDirectives::resolve(substr buf, size_t *bufsz, csubstr tag, id_type i
532532
}
533533
}
534534

535+
536+
//-----------------------------------------------------------------------------
537+
TagCache::LookupResult TagCache::find(csubstr tag, id_type doc_id, id_type linear_threshold) const noexcept
538+
{
539+
LookupResult ret = {};
540+
id_type sz = m_entries.size();
541+
if(sz < linear_threshold) // do a linear search on small size
542+
{
543+
for(size_t i = 0; i < sz; ++i)
544+
{
545+
Entry const& C4_RESTRICT e = m_entries[i];
546+
if(e.tag == tag && e.doc_id == doc_id)
547+
{
548+
ret.resolved = e.resolved;
549+
ret.pos = i;
550+
return ret;
551+
}
552+
else if(e.tag > tag || ((e.tag == tag) && e.doc_id > doc_id))
553+
{
554+
ret.pos = i;
555+
return ret;
556+
}
557+
}
558+
ret.pos = sz;
559+
}
560+
else // do a binary search on larger size
561+
{
562+
id_type first = 0;
563+
id_type count = sz;
564+
while(count)
565+
{
566+
id_type halfsz = count / id_type(2);
567+
id_type mid = first + halfsz;
568+
_RYML_ASSERT_BASIC_(m_entries.m_callbacks, mid < sz);
569+
Entry const& C4_RESTRICT e = m_entries[mid];
570+
if(e.tag < tag || (e.tag == tag && e.doc_id < doc_id))
571+
{
572+
first = mid + 1;
573+
_RYML_ASSERT_BASIC_(m_entries.m_callbacks, count >= halfsz + 1);
574+
count -= halfsz + 1;
575+
}
576+
else
577+
{
578+
count = halfsz;
579+
}
580+
}
581+
ret.pos = first;
582+
if(first < sz)
583+
{
584+
Entry const& C4_RESTRICT e = m_entries[first];
585+
if(e.tag == tag && e.doc_id == doc_id)
586+
{
587+
ret.resolved = m_entries[first].resolved;
588+
}
589+
}
590+
}
591+
return ret;
592+
}
593+
594+
void TagCache::add(csubstr tag, csubstr resolved, id_type doc_id, const_iterator pos)
595+
{
596+
id_type sz = m_entries.size();
597+
_RYML_ASSERT_BASIC_(m_entries.m_callbacks, pos <= sz);
598+
_RYML_ASSERT_BASIC_(m_entries.m_callbacks, pos == sz || tag < m_entries[pos].tag || (tag == m_entries[pos].tag && doc_id < m_entries[pos].doc_id));
599+
m_entries.resize(sz + 1);
600+
Entry *C4_RESTRICT ptr = m_entries.m_stack;
601+
if(pos < sz)
602+
memmove(ptr + pos + 1, ptr + pos, (sz - pos) * sizeof(Entry));
603+
ptr[pos].tag = tag;
604+
ptr[pos].resolved = resolved;
605+
ptr[pos].doc_id = doc_id;
606+
_c4dbgpf("tagcache: add entry @pos={}: docid={} {}->{}", pos, doc_id, tag, (resolved.str ? resolved : csubstr("(out of size)")));
607+
}
608+
535609
} // namespace yml
536610
} // namespace c4

src/c4/yml/tag.hpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
#ifndef _C4_YML_TAG_HPP_
22
#define _C4_YML_TAG_HPP_
33

4+
#ifndef _C4_YML_COMMON_HPP_
45
#include <c4/yml/common.hpp>
6+
#endif
7+
#ifndef _C4_YML_DETAIL_STACK_HPP_
8+
#include <c4/yml/detail/stack.hpp>
9+
#endif
510

611
namespace c4 {
712
namespace yml {
@@ -55,6 +60,47 @@ RYML_EXPORT csubstr normalize_tag_long(csubstr tag, substr output);
5560
RYML_EXPORT bool is_custom_tag(csubstr tag);
5661
RYML_EXPORT bool is_valid_tag_handle(csubstr handle);
5762

63+
64+
//-----------------------------------------------------------------------------
65+
66+
/** Accelerator structure to reduce memory requirements by enabling
67+
* reuse of resolved tags. */
68+
struct RYML_EXPORT TagCache
69+
{
70+
struct Entry
71+
{
72+
csubstr tag;
73+
csubstr resolved;
74+
id_type doc_id;
75+
};
76+
using Entries = detail::stack<Entry>;
77+
using const_iterator = id_type;
78+
struct LookupResult
79+
{
80+
csubstr resolved;
81+
const_iterator pos;
82+
operator bool() const noexcept { return resolved.len > 0; }
83+
};
84+
85+
public:
86+
87+
TagCache() noexcept : m_entries() {}
88+
LookupResult find(csubstr tag, id_type doc_id, id_type linear_threshold=Entries::sso_size) const noexcept;
89+
void add(csubstr tag, csubstr resolved, id_type doc_id, const_iterator pos);
90+
91+
void clear() { m_entries.clear(); }
92+
93+
public:
94+
95+
/** @cond dev */
96+
Entries m_entries;
97+
/** @endcond */
98+
99+
};
100+
101+
102+
//-----------------------------------------------------------------------------
103+
58104
struct RYML_EXPORT TagDirective
59105
{
60106
/** Eg <pre>!e!</pre> in <pre>%TAG !e! tag:example.com,2000:app/</pre> */

0 commit comments

Comments
 (0)