{"columns":[{"alerts":[{"code":"near_unique","level":"info","message":"100.0% of rows are unique strings"},{"code":"one_word","level":"warn","message":"100.0% rows are a single word"}],"column":"image","extras":{"language_counts":{},"language_sample_size":4319,"length_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4319,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[22.5,22.525,22.55,22.575,22.6,22.625,22.65,22.675,22.7,22.725,22.75,22.775,22.8,22.825,22.85,22.875,22.9,22.925,22.95,22.975,23.0,23.025,23.05,23.075,23.1,23.125,23.15,23.175,23.2,23.225,23.25,23.275,23.3,23.325,23.35,23.375,23.4,23.425,23.45,23.475,23.5]},"near_unique":true,"sample":["VizWiz_val_00000005.jpg","VizWiz_val_00003270.jpg","VizWiz_val_00003745.jpg","VizWiz_val_00003570.jpg","VizWiz_val_00001823.jpg","VizWiz_val_00004205.jpg","VizWiz_val_00001890.jpg","VizWiz_val_00003887.jpg","VizWiz_val_00003578.jpg","VizWiz_val_00000160.jpg","VizWiz_val_00002619.jpg","VizWiz_val_00002291.jpg","VizWiz_val_00001201.jpg","VizWiz_val_00000027.jpg","VizWiz_val_00001261.jpg","VizWiz_val_00000795.jpg","VizWiz_val_00000398.jpg","VizWiz_val_00003631.jpg","VizWiz_val_00004289.jpg","VizWiz_val_00001735.jpg","VizWiz_val_00001323.jpg","VizWiz_val_00000179.jpg","VizWiz_val_00000699.jpg","VizWiz_val_00000425.jpg","VizWiz_val_00003022.jpg","VizWiz_val_00003315.jpg","VizWiz_val_00002390.jpg","VizWiz_val_00000821.jpg","VizWiz_val_00000325.jpg","VizWiz_val_00004198.jpg","VizWiz_val_00002834.jpg","VizWiz_val_00002863.jpg","VizWiz_val_00000031.jpg","VizWiz_val_00001692.jpg","VizWiz_val_00001564.jpg","VizWiz_val_00001881.jpg","VizWiz_val_00004051.jpg","VizWiz_val_00002200.jpg","VizWiz_val_00001930.jpg","VizWiz_val_00002284.jpg","VizWiz_val_00003529.jpg","VizWiz_val_00004001.jpg","VizWiz_val_00001209.jpg","VizWiz_val_00003997.jpg","VizWiz_val_00004128.jpg","VizWiz_val_00004219.jpg","VizWiz_val_00001493.jpg","VizWiz_val_00001700.jpg","VizWiz_val_00000701.jpg","VizWiz_val_00003817.jpg"],"top_values":[],"top_words":[["vizwiz_val_00000000.jpg",1],["vizwiz_val_00000001.jpg",1],["vizwiz_val_00000002.jpg",1],["vizwiz_val_00000003.jpg",1],["vizwiz_val_00000004.jpg",1],["vizwiz_val_00000005.jpg",1],["vizwiz_val_00000006.jpg",1],["vizwiz_val_00000007.jpg",1],["vizwiz_val_00000008.jpg",1],["vizwiz_val_00000009.jpg",1],["vizwiz_val_00000010.jpg",1],["vizwiz_val_00000011.jpg",1],["vizwiz_val_00000012.jpg",1],["vizwiz_val_00000013.jpg",1],["vizwiz_val_00000014.jpg",1],["vizwiz_val_00000015.jpg",1],["vizwiz_val_00000016.jpg",1],["vizwiz_val_00000017.jpg",1],["vizwiz_val_00000018.jpg",1],["vizwiz_val_00000019.jpg",1],["vizwiz_val_00000020.jpg",1],["vizwiz_val_00000021.jpg",1],["vizwiz_val_00000022.jpg",1],["vizwiz_val_00000023.jpg",1],["vizwiz_val_00000024.jpg",1]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4319,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":4319,"n_null":0,"n_unique":4319,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.0,"emoji_rate":0.0,"len_max":23,"len_mean":23.0,"len_median":23.0,"len_min":23,"len_p95":23.0,"n_duplicates":0,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":-47.97999999999996,"url_rate":0.0,"vocab_size":4319,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"multilingual","level":"info","message":"9 languages detected in sample"},{"code":"duplicates","level":"warn","message":"35.2% duplicate strings"}],"column":"question","extras":{"language_counts":{"__engine":"fasttext:4,317","ast":1,"en":4308,"es":2,"fy":1,"hu":1,"ia":1,"it":1,"la":2},"language_sample_size":4319,"length_histogram":{"counts":[759,550,931,609,368,250,143,143,96,84,68,42,35,35,30,23,19,12,23,14,8,14,8,6,10,8,8,3,5,3,3,2,1,2,1,1,0,1,0,1],"edges":[7.0,13.425,19.85,26.275,32.7,39.125,45.55,51.975,58.4,64.82499999999999,71.25,77.675,84.1,90.52499999999999,96.95,103.375,109.8,116.225,122.64999999999999,129.075,135.5,141.92499999999998,148.35,154.775,161.2,167.625,174.04999999999998,180.475,186.9,193.325,199.75,206.17499999999998,212.6,219.025,225.45,231.875,238.29999999999998,244.725,251.15,257.575,264.0]},"near_unique":false,"sample":["What the screen says? Thank you.","What color is this please? ","What is that a bottle of?","What kind of dinner bowl is this?","What's this?","What flavor is this?","Please describe this picture.","What is this? ","what color is this shirt?","Can you give me an idea of what's on the screen? Thank you.","What flavor are these?","What color is this shirt?","What is my blood pressure reading?","What type of pills are these?","what is this?","Can you read me the gift code on this card? ","What's on the screen","What product is this please, and thank you?","What does this say?","What's in this box?","What is this?","Is this person attractive?","What is this?","What's on the screen?","Excuse me, tell me what you can see exactly in this picture.","WHAT COLOR IS THIS?","Can you please tell me what's in that container?","How much is this monitor?","Can you tell me what this warning is?","Is that milk?","What is this?","What is this?","Can anyone tell me what's actually on the screen of the laptop?","What is this?","what kinda soda is this?","What is the color of this fabric?","Can you tell how many pounds this meat is?","What is the label on this spice?","What's in this bottle?","What can is this?","Piece of meal. ","What are the dials on this washer.","What is this?","What is this?","I wanna know exactly what I'm seeing, but I'm not real sure right now because I just started.","What kind of chips are these?","What is this medication?","What is this?","What is it?","What is the name and manufacturer serial number of this CCTV?"],"top_values":[["What is this?",523],["What does this say?",61],["What color is this?",57],["What is it?",56],["What's this?",40],["What is this item?",37],["What is in this box?",34],["What color is this shirt?",27],["What's on the screen?",25],["What is in this can?",23],["What flavor is this?",23],["what is this?",22],["What does the screen say?",19],["What is that?",19],["What is this product?",19],["What is this? ",18],["What kind of coffee is this?",16],["What is the expiration date?",14],["What does this label say?",12],["What is in this bottle?",12]],"top_words":[["what",3172],["is",2827],["this",1916],["the",1383],["this?",1138],["of",697],["you",677],["on",560],["can",525],["in",502],["tell",444],["me",435],["i",407],["does",401],["color",360],["to",319],["a",317],["what's",315],["it",283],["thank",280],["and",261],["kind",258],["are",238],["say?",235],["you.",224]],"vocab_skipped":null,"word_histogram":{"counts":[840,1428,494,542,306,110,159,110,37,56,31,38,28,14,25,25,7,16,16,4,8,6,7,2,2,2,3,0,2,1],"edges":[2.0,3.6333333333333333,5.266666666666667,6.9,8.533333333333333,10.166666666666666,11.8,13.433333333333334,15.066666666666666,16.7,18.333333333333332,19.966666666666665,21.6,23.233333333333334,24.866666666666667,26.5,28.133333333333333,29.766666666666666,31.4,33.03333333333333,34.666666666666664,36.3,37.93333333333333,39.56666666666666,41.2,42.833333333333336,44.46666666666667,46.1,47.733333333333334,49.36666666666667,51.0]}},"kind":"text","n":4319,"n_null":0,"n_unique":2798,"null_rate":0.0,"stats":{"allcaps_rate":0.002546885853206761,"boilerplate_rate":0.003473026163463765,"duplicate_rate":0.35216485297522576,"emoji_rate":0.0,"len_max":264,"len_mean":35.10141236397314,"len_median":26.0,"len_min":7,"len_p95":95.0,"n_duplicates":1521,"n_empty":0,"one_word_rate":0.0,"readability_flesch_mean":101.69042404667057,"url_rate":0.0,"vocab_size":2779,"word_mean":7.258856216716833,"word_median":5.0}},{"alerts":[{"code":"skipped","level":"info","message":"no profiler for kind=unknown"}],"column":"answers","extras":{},"kind":"unknown","n":4319,"n_null":0,"n_unique":null,"null_rate":0.0,"stats":{}},{"alerts":[],"column":"answer_type","extras":{"singletons":0,"top_values":[["other",2691],["unanswerable",1385],["yes/no",195],["number",48]]},"kind":"categorical","n":4319,"n_null":0,"n_unique":4,"null_rate":0.0,"stats":{"cardinality":4,"entropy":1.225366156251876,"entropy_ratio":0.612683078125938,"top_rate":0.6230608937253994,"top_value":"other"}},{"alerts":[],"column":"answerable","extras":{"histogram":{"counts":[1385,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2934],"edges":[0.0,0.025,0.05,0.07500000000000001,0.1,0.125,0.15000000000000002,0.17500000000000002,0.2,0.225,0.25,0.275,0.30000000000000004,0.325,0.35000000000000003,0.375,0.4,0.42500000000000004,0.45,0.47500000000000003,0.5,0.525,0.55,0.5750000000000001,0.6000000000000001,0.625,0.65,0.675,0.7000000000000001,0.7250000000000001,0.75,0.775,0.8,0.8250000000000001,0.8500000000000001,0.875,0.9,0.925,0.9500000000000001,0.9750000000000001,1.0]},"sample":[1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0]},"kind":"numeric","n":4319,"n_null":0,"n_unique":2,"null_rate":0.0,"stats":{"iqr":1.0,"kurtosis":-1.4095366412457953,"max":1.0,"mean":0.6793239175735124,"median":1.0,"min":0.0,"n_outliers":0,"outlier_rate":0.0,"q1":0.0,"q3":1.0,"skew":-0.7684161364483475,"std":0.4667905124864011,"zero_rate":0.3206760824264876}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["row_count","columns.question.stats.duplicate_rate","columns.question.top_values","columns.answer_type.top_values","columns.answer_type.stats.top_rate","columns.answerable.stats.mean","columns.answerable.stats.zero_rate","columns.question.language_counts"],"featured_charts":[{"caption":"Shows the strong skew toward 'other' and 'unanswerable' categories that together cover ~94% of rows.","column":"answer_type","kind":"bar"},{"caption":"Highlights that about 32% of questions are flagged as not answerable.","column":"answerable","kind":"donut"},{"caption":"Top question values reveal how often generic prompts like 'What is this?' dominate the dataset.","column":"question","kind":"bar"},{"caption":"Question length distribution shows most are short (median 26 chars) with a long tail up to 264 chars.","column":"question","kind":"length"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset contains 4,319 rows from the VizWiz validation annotations, structured around image filenames, the questions asked about each image, the answers, an answer_type label, and an answerable flag. The questions column is the most interesting: about 35% are duplicates, with 'What is this?' alone appearing 523 times, suggesting a heavy concentration of generic identification queries. Answer_type is dominated by 'other' (62%) and 'unanswerable' (32%), and the answerable flag confirms that roughly 32% of items are flagged as not answerable \u2014 a key signal for any downstream modeling. The image column is uniquely identifying per row and not worth deeper analysis, while the answers column was skipped by the profiler.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.len_min","stats.len_max","stats.one_word_rate","stats.vocab_size","stats.duplicate_rate","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds image filenames following the pattern `vizwiz_val_########.jpg`, with all 4319 values being unique single tokens of exactly 23 characters. There are no nulls, duplicates, or vocabulary variation \u2014 every row maps one-to-one to a distinct image in what appears to be the VizWiz validation split. The negative Flesch score is an artifact of scoring filenames as prose and can be ignored.","role":"identifier","scope":"column","target":"image","treatment":"Use as a foreign key to load the corresponding image file; do not feed as text to a model."},{"confidence":"high","critiques":[],"evidence_keys":["language_counts","n","n_unique","stats.duplicate_rate","stats.n_duplicates","stats.len_mean","stats.word_mean","stats.readability_flesch_mean","stats.vocab_size","top_values","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"Short natural-language questions, mostly English (4308/4319) and overwhelmingly identification prompts \u2014 \"What is this?\" alone appears 523 times and the top 10 values are all generic \"what is/color/says\" queries. Heavy duplication (35.2%, 1521 rows) and a small vocab (2779 unique words across 4319 rows) suggest a VQA-style prompt set rather than diverse free text. Mean length is 35 chars / 7.3 words with very high Flesch readability (101.7), and a handful of non-English rows (es, la, it, fy, hu, ia, ast) introduce minor language drift.","role":"free_text","scope":"column","target":"question","treatment":"Tokenize and embed for modelling; deduplicate or weight by frequency given the 35% duplicate rate."},{"confidence":"low","critiques":[],"evidence_keys":["alerts","kind","n","null_rate","stats"],"model":"anthropic:claude-opus-4-7","narrative":"The column 'answers' was skipped by the profiler, so its kind is unknown and no descriptive statistics were computed. All 4319 rows are non-null, but uniqueness, type, and value distribution are unavailable. The name suggests it holds response content, likely structured (e.g., nested objects or arrays) which is why automatic profiling bailed out.","role":"other","scope":"column","target":"answers","treatment":"Inspect raw values manually and parse into a typed structure before further profiling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_rate","stats.top_value","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Categorical label with only 4 distinct values across 4319 rows and no nulls, classifying answers into 'other', 'unanswerable', 'yes/no', and 'number'. The distribution is heavily imbalanced: 'other' covers 62.3% and 'unanswerable' another 1385 rows, while 'number' appears only 48 times. Entropy ratio of 0.61 confirms the skew toward the top two classes.","role":"label","scope":"column","target":"answer_type","treatment":"One-hot or integer-encode; consider class-weighting or stratified sampling given the imbalance toward 'other'."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.mean","stats.zero_rate","stats.min","stats.max"],"model":"anthropic:claude-opus-4-7","narrative":"Binary 0/1 flag indicating whether a question is answerable, with 4319 rows and no nulls. Roughly 68% are marked answerable (mean 0.6793) and 32% are zeros, giving a moderate class imbalance toward the positive class. Only two unique values confirm this is a clean indicator rather than a probability score.","role":"label","scope":"column","target":"answerable","treatment":"Use directly as a binary target; account for the ~68/32 class imbalance during training or evaluation."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":1929,"prompt_tokens":6711,"total_tokens":8640}},"language_counts":{"ast":1,"en":4308,"es":2,"fy":1,"hu":1,"ia":1,"it":1,"la":2},"meta":{"generated_at":"2026-05-01T18:36:21+00:00","mode":"full","row_count":4319,"sampled_rows":4319,"seed":42,"source":"/home/coolhand/html/datavis/data_trove/cache/vizwiz_val_annotations.json"},"notes":[],"saturn_version":"0.2.0","schema":{"answer_type":"categorical","answerable":"numeric","answers":"unknown","image":"text","question":"text"}}
