{"columns":[{"alerts":[{"code":"near_unique","level":"info","message":"100.0% of rows are unique strings"},{"code":"one_word","level":"warn","message":"97.8% rows are a single word"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"gloss","extras":{"language_counts":{},"language_sample_size":2000,"length_histogram":{"counts":[21,0,13,0,0,139,0,0,377,0,376,0,0,337,0,0,285,0,189,0,0,127,0,0,66,0,35,0,0,19,0,0,10,0,3,0,0,2,0,1],"edges":[1.0,1.375,1.75,2.125,2.5,2.875,3.25,3.625,4.0,4.375,4.75,5.125,5.5,5.875,6.25,6.625,7.0,7.375,7.75,8.125,8.5,8.875,9.25,9.625,10.0,10.375,10.75,11.125,11.5,11.875,12.25,12.625,13.0,13.375,13.75,14.125,14.5,14.875,15.25,15.625,16.0]},"near_unique":true,"sample":["computer","reputation","eraser","camel","community","stadium","exact","laundry","caterpillar","birthday","rush","card","love","black","serious","cup","children","cooperate","valley","almost","university","pull","thank you","game","forever","skunk","flexible","fast","name","spread","beginning","cabinet","woman","upset","negative","favorite","prince","vocabulary","interpret","chemistry","boil","pause","present","painter","roar","sunshine","guilty","zero","when","heap"],"top_values":[],"top_words":[["up",7],["hearing",3],["last",3],["take",3],["every",3],["year",2],["all",2],["hot",2],["now",2],["dog",2],["language",2],["give",2],["school",2],["want",2],["paper",2],["room",2],["week",2],["you",2],["day",2],["here",2],["new",2],["president",2],["sign",2],["south",2],["hard",2]],"vocab_skipped":null,"word_histogram":{"counts":[1955,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42,0,0,0,0,0,0,0,0,0,0,0,0,0,3],"edges":[1.0,1.0666666666666667,1.1333333333333333,1.2,1.2666666666666666,1.3333333333333333,1.4,1.4666666666666668,1.5333333333333332,1.6,1.6666666666666665,1.7333333333333334,1.8,1.8666666666666667,1.9333333333333333,2.0,2.0666666666666664,2.1333333333333333,2.2,2.2666666666666666,2.333333333333333,2.4,2.466666666666667,2.533333333333333,2.6,2.666666666666667,2.7333333333333334,2.8,2.8666666666666667,2.9333333333333336,3.0]}},"kind":"text","n":2000,"n_null":0,"n_unique":2000,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.0,"emoji_rate":0.0,"len_max":16,"len_mean":6.0075,"len_median":6.0,"len_min":1,"len_p95":10.0,"n_duplicates":0,"n_empty":0,"one_word_rate":0.9775,"readability_flesch_mean":54.57720000000003,"url_rate":0.0,"vocab_size":1984,"word_mean":1.024,"word_median":1.0}},{"alerts":[{"code":"skipped","level":"info","message":"no profiler for kind=unknown"}],"column":"instances","extras":{},"kind":"unknown","n":2000,"n_null":0,"n_unique":null,"null_rate":0.0,"stats":{}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["row_count","column_count","columns[gloss].n_unique","columns[gloss].stats.one_word_rate","columns[gloss].stats.len_mean","columns[gloss].stats.len_max","columns[gloss].stats.word_mean","columns[gloss].stats.vocab_size","columns[gloss].top_words","columns[instances].alerts"],"featured_charts":[{"caption":"Character-length distribution of glosses \u2014 expect a tight cluster around 6 with a max of 16.","column":"gloss","kind":"length"},{"caption":"Top recurring tokens within glosses; note that even the most common word ('up') appears only 7 times, confirming near-unique labels.","column":"gloss","kind":"bar"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset is a 2000-row index from a WLASL (Word-Level American Sign Language) source, with two columns: 'gloss' (text labels) and 'instances' (an unparsed/unknown field, likely nested data). The 'gloss' column is essentially a vocabulary list \u2014 every one of the 2000 rows is unique, 97.75% are single words, and the mean length is just 6 characters. The 'instances' column was skipped by the profiler and warrants manual inspection, since it likely contains the actual sign-language sample records keyed to each gloss. Start by looking at the gloss length distribution to confirm the single-word pattern, then dig into the structure of 'instances' separately.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.one_word_rate","stats.len_mean","stats.len_max","stats.word_mean","stats.vocab_size","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds short glosses\u20142000 rows, all unique, with 97.75% being a single word and a mean length of 6 characters (max 16). The vocabulary is 1984 distinct tokens across 2000 rows, so almost every entry is its own term, with only minor repeats like 'up' (7) or 'hearing' (3). It reads as a lexicon-style label field rather than free text.","role":"label","scope":"column","target":"gloss","treatment":"Treat as a high-cardinality categorical key; embed or hash rather than one-hot."},{"confidence":"low","critiques":[],"evidence_keys":["alerts","kind","n","n_unique","null_rate","stats"],"model":"anthropic:claude-opus-4-7","narrative":"The column is named \"instances\" but saturn skipped detailed profiling, so its kind is unknown and no descriptive statistics were computed. We can only confirm 2000 rows with a 0.0 null rate; uniqueness, distribution, and dtype are all unreported. Without a sample value or type signal, the semantic role cannot be inferred from the evidence.","role":"other","scope":"column","target":"instances","treatment":"Re-profile or inspect raw values manually before deciding how to use this column."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":1018,"prompt_tokens":2540,"total_tokens":3558}},"language_counts":{},"meta":{"generated_at":"2026-05-01T23:08:03+00:00","mode":"full","row_count":2000,"sampled_rows":2000,"seed":42,"source":"/home/coolhand/html/datavis/data_trove/cache/wlasl_index.json"},"notes":[],"saturn_version":"0.2.0","schema":{"gloss":"text","instances":"unknown"}}
