{"columns":[{"alerts":[{"code":"near_unique","level":"info","message":"100.0% of rows are unique strings"},{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"allcaps","level":"info","message":"100.0% rows are all-caps"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"Letter reference","extras":{"language_counts":{},"language_sample_size":4970,"length_histogram":{"counts":[22,0,0,0,0,0,0,0,0,0,207,0,0,0,0,0,0,0,0,0,1000,0,0,0,0,0,0,0,0,0,1805,0,0,0,0,0,0,0,0,1936],"edges":[7.0,7.1,7.2,7.3,7.4,7.5,7.6,7.7,7.8,7.9,8.0,8.1,8.2,8.3,8.4,8.5,8.6,8.7,8.8,8.9,9.0,9.1,9.2,9.3,9.4,9.5,9.6,9.7,9.8,9.9,10.0,10.1,10.2,10.3,10.4,10.5,10.6,10.7,10.8,10.9,11.0]},"near_unique":true,"sample":["ARUNDEL_003","PASTON_337","RERUM_019","PEPYS_078","HAMILTO_005","WENTWOR_001","HARLEY_074","SMYTH_020","PETTY_010","BACON_092","ORIGIN3_031","JONSON_006","CORNWAL_149","ARUNDEL_014","COSIN_025","CELY_110","BACON_357","PLUMPTO_031","WHARTON_006","FLEMING_127","CROMWEL_008","BACON_101","BRYSKET_010","BARRING_002","PASTON_062","PASTON_399","LEYCEST_059","CELY_130","BACON_261","WENTWOR_016","OXINDE_154","WYATT_012","PAGET_013","FLEMING_073","ESSEX_034","HARLEY_058","STONOR_113","HUTTON_046","HASTING_030","JONES_060","PEPYS_054","STONOR_060","CORNWAL_141","STONOR_056","TIXALL_009","WENTWOR_055","DUPPA_056","FLEMING_079","BROWNE_055","SIGNET_064"],"top_values":[],"top_words":[["allen_001",1],["allen_002",1],["allen_003",1],["allen_004",1],["arundel_001",1],["arundel_002",1],["arundel_003",1],["arundel_004",1],["arundel_005",1],["arundel_006",1],["arundel_007",1],["arundel_008",1],["arundel_009",1],["arundel_010",1],["arundel_011",1],["arundel_012",1],["arundel_013",1],["arundel_014",1],["arundel_015",1],["arundel_016",1],["arundel_017",1],["arundel_018",1],["arundel_019",1],["arundel_020",1],["arundel_021",1]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4970,0,0,0,0,0,0,0,0,0,0,0,0,0
,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":4970,"n_null":0,"n_unique":4970,"null_rate":0.0,"stats":{"allcaps_rate":1.0,"boilerplate_rate":0.0,"duplicate_rate":0.0,"emoji_rate":0.0,"len_max":11,"len_mean":10.091750503018108,"len_median":10.0,"len_min":7,"len_p95":11.0,"n_duplicates":0,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":49.31000000000003,"url_rate":0.0,"vocab_size":4970,"word_mean":1.0,"word_median":1.0}},{"alerts":[],"column":"Author name","extras":{"singletons":250,"top_values":[["JOHN_HOLLES_SR",136],["THOMAS_CROMWELL",93],["DOROTHY_OSBORNE/TEMPLE",85],["NATHANIEL_BACON_I",77],["JOHN_CHAMBERLAIN",71],["THOMAS_WENTWORTH",67],["MARGARET_PASTON[N.MAUTBY]",66],["ARABELLA_STUART",65],["BRILLIANA_HARLEY[N.CONWAY]",61],["STEPHEN_GARDINER",58],["SAMUEL_PEPYS",58],["JOHN_PARKHURST",55],["JOHN_JONES",53],["ANTHONY_ANTONIE",51],["ROBERT_DUDLEY",47],["KATHERINE_PASTON[N.KNYVETT]",47],["RICHARD_CELY_JR",46],["WILLIAM_CECIL",45],["THOMAS_KNYVETT",45],["THOMAS_HOWARD_III",44]]},"kind":"categorical","n":4970,"n_null":0,"n_unique":695,"null_rate":0.0,"stats":{"cardinality":695,"entropy":8.191508717550976,"entropy_ratio":0.8676646791858825,"top_rate":0.027364185110663984,"top_value":"JOHN_HOLLES_SR"}},{"alerts":[{"code":"null_rate","level":"warn","message":"25.0% null"}],"column":"Author 
API","extras":{"singletons":80,"top_values":[["SIR",560],["LADY",278],["MERCHANT",149],["KING_OF_ENGLAND",140],["1ST_EARL_OF_CLARE/POLITICIAN(DNB)",136],["BISHOP_OF_WINCHESTER",111],["CLERK",103],["EARL_OF_ESSEX/ROYAL_MINISTER(DNB)",93],["SIR/LOCAL_POLITICIAN(DNB)",79],["BISHOP_OF_NORWICH",77],["1ST_EARL_OF_STRAFFORD/LORD_LIEUTENANT_OF_IRELAND",67],["PUBLIC_SERVANT",58],["COLONEL",55],["1ST_LORD_BURGHLEY/ROYAL_MINISTER(DNB)",48],["EARL_OF_LEICESTER/COURTIER/MAGNATE(DNB)",47],["2ND_EARL_OF_ARUNDEL_AND_SURREY/POLITICIAN(DNB)",44],["3RD_EARL_OF_DERBY",40],["SIR/NATURAL_PHILOSOPHER/ADMINISTRATOR(DNB)",40],["SIR/2ND_BART/SCHOLAR/POLITICIAN(DNB)",38],["SIR/LORD_CHANCELLOR(DNB)",38]]},"kind":"categorical","n":4970,"n_null":1244,"n_unique":252,"null_rate":0.25030181086519115,"stats":{"cardinality":252,"entropy":6.059938250719454,"entropy_ratio":0.7596496937342953,"top_rate":0.15029522275899088,"top_value":"SIR"}},{"alerts":[],"column":"Author gender","extras":{"singletons":0,"top_values":[["MALE",4130],["FEMALE",840]]},"kind":"categorical","n":4970,"n_null":0,"n_unique":2,"null_rate":0.0,"stats":{"cardinality":2,"entropy":0.6554444445609847,"entropy_ratio":0.6554444445609847,"top_rate":0.8309859154929577,"top_value":"MALE"}},{"alerts":[{"code":"null_rate","level":"warn","message":"31.2% null"}],"column":"Author DOB","extras":{"singletons":32,"top_values":[["1565?",136],["1546?",120],["1633",99],["1485?",97],["1627",85],["1593",79],["1533",79],["1585",73],["1554",71],["1575",66],["1600?",61],["1623",60],["1497?",58],["1631",55],["1511",55],["1596",53],["1597?",53],["1442",52],["1614",50],["1608",48]]},"kind":"categorical","n":4970,"n_null":1549,"n_unique":217,"null_rate":0.31167002012072437,"stats":{"cardinality":217,"entropy":6.746030720090486,"entropy_ratio":0.8691601096300233,"top_rate":0.03975445776088863,"top_value":"1565?"}},{"alerts":[{"code":"null_rate","level":"warn","message":"44.4% null"}],"column":"Relation to 
recipient","extras":{"singletons":9,"top_values":[["FRIEND",387],["BROTHER",353],["SON",250],["KIN",187],["BROTHER-IN-LAW",169],["MOTHER",162],["HUSBAND",160],["FATHER",149],["FAMILY_SERVANT",140],["WIFE",126],["COUSIN",115],["SON-IN-LAW",93],["FUTURE_WIFE",78],["DAUGHTER",47],["NEPHEW",47],["SISTER-IN-LAW",42],["SISTER",38],["NIECE",37],["FATHER-IN-LAW",29],["UNCLE",26]]},"kind":"categorical","n":4970,"n_null":2205,"n_unique":45,"null_rate":0.44366197183098594,"stats":{"cardinality":45,"entropy":4.2114013736338975,"entropy_ratio":0.7668452341612103,"top_rate":0.1399638336347197,"top_value":"FRIEND"}},{"alerts":[{"code":"long_tail","level":"info","message":"317 singleton categories"}],"column":"Recipient","extras":{"singletons":317,"top_values":[["JOHN_PASTON_I",262],["NATHANIEL_BACON_I",251],["JOAN_BARRINGTON",182],["JANE_CORNWALLIS/BACON[N.MEAUTYS]",180],["GEORGE_CELY",119],["HENRY_OXINDEN[BARHAM]",107],["DANIEL_FLEMING",100],["ROBERT_PLUMPTON_I",87],["WILLIAM_TEMPLE",85],["JOHN_PASTON_III",84],["WILLIAM_STONOR",81],["THOMAS_LANGLEY",75],["THOMAS_STOCKWELL",73],["THOMAS_WOLSEY",65],["EDWARD_HARLEY",64],["HENRY_CLIFFORD_II",63],["CHRISTOPHER_HATTON_III",63],["HENRY_TUDOR_VIII",57],["JOHN_PASTON_II",56],["MARGARET_PASTON[N.MAUTBY]",50]]},"kind":"categorical","n":4970,"n_null":0,"n_unique":623,"null_rate":0.0,"stats":{"cardinality":623,"entropy":7.35063173999037,"entropy_ratio":0.7918304189786014,"top_rate":0.05271629778672032,"top_value":"JOHN_PASTON_I"}},{"alerts":[],"column":"Recipient 
API","extras":{"singletons":134,"top_values":[["SIR",542],["LADY",444],["SIR/LOCAL_POLITICIAN(DNB)",253],["MERCHANT",154],["SIR/ANTIQUARY",100],["BARONET/DIPLOMAT/AUTHOR(DNB)",85],["SIR/MERCHANT",82],["BISHOP_OF_DURHAM/LORD_CHANCELLOR",75],["ROYAL_MINISTER/ARCHBISHOP_OF_YORK/CARDINAL(DNB)",71],["KING_OF_ENGLAND",68],["BISHOP_OF_WINCHESTER",68],["1ST_EARL_OF_CUMBERLAND",64],["VISCOUNT",63],["BISHOP_OF_DURHAM",59],["CAPTAIN",55],["EARL_OF_LEICESTER/COURTIER/MAGNATE(DNB)",49],["SIR/PRINCIPLE_SECRETARY(DNB)",45],["VISCOUNTESS",45],["SIR/LORD_KEEPER_OF_THE_GREAT_SEAL",43],["VISCOUNT_DORCHESTER/DIPLOMNAT(DNB)",43]]},"kind":"categorical","n":4970,"n_null":940,"n_unique":326,"null_rate":0.1891348088531187,"stats":{"cardinality":326,"entropy":6.127651962756565,"entropy_ratio":0.7339623292981595,"top_rate":0.1344913151364764,"top_value":"SIR"}},{"alerts":[],"column":"Recipient gender","extras":{"singletons":0,"top_values":[["MALE",4074],["FEMALE",892],["MALE/FEMALE",4]]},"kind":"categorical","n":4970,"n_null":0,"n_unique":3,"null_rate":0.0,"stats":{"cardinality":3,"entropy":0.6881350036126386,"entropy_ratio":0.43416484825321605,"top_rate":0.819718309859155,"top_value":"MALE"}},{"alerts":[],"column":"Recipient DOB","extras":{"singletons":43,"top_values":[["1421",264],["1546?",256],["1581",185],["1558?",182],["1633",140],["1608",108],["1628",90],["1453",88],["1444",86],["1449?",82],["1360?",75],["1595",71],["1473?",65],["1632?",64],["1624",64],["1631",62],["1644",61],["1493?",61],["1533",57],["1491",57]]},"kind":"categorical","n":4970,"n_null":975,"n_unique":210,"null_rate":0.19617706237424548,"stats":{"cardinality":210,"entropy":6.309687028506575,"entropy_ratio":0.8179266545324468,"top_rate":0.06608260325406759,"top_value":"1421"}},{"alerts":[{"code":"null_rate","level":"warn","message":"46.2% null"}],"column":"Relation to 
author","extras":{"singletons":8,"top_values":[["FRIEND",387],["BROTHER",324],["SON",295],["BROTHER-IN-LAW",192],["KIN",188],["WIFE",160],["MOTHER",160],["FATHER",138],["HUSBAND",126],["COUSIN",115],["MOTHER-IN-LAW",81],["FUTURE_HUSBAND",78],["SISTER",67],["UNCLE",59],["FAMILY_SERVANT",53],["FATHER-IN-LAW",43],["NEPHEW",29],["SON-IN-LAW",27],["SISTER-IN-LAW",26],["AUNT",25]]},"kind":"categorical","n":4970,"n_null":2297,"n_unique":43,"null_rate":0.4621730382293763,"stats":{"cardinality":43,"entropy":4.127188674927957,"entropy_ratio":0.760594785087028,"top_rate":0.1447811447811448,"top_value":"FRIEND"}},{"alerts":[{"code":"imbalance","level":"warn","message":"top value is 95.5% of rows"}],"column":"Change from 2006?","extras":{"singletons":0,"top_values":[["ok",4746],["corrected",175],["corrected in spreadsheet",46],["ok sic",3]]},"kind":"categorical","n":4970,"n_null":0,"n_unique":4,"null_rate":0.0,"stats":{"cardinality":4,"entropy":0.30250930623195826,"entropy_ratio":0.15125465311597913,"top_rate":0.9549295774647887,"top_value":"ok"}},{"alerts":[{"code":"null_rate","level":"warn","message":"98.8% null"}],"column":"Order of Gardiner letters in 
file","extras":{"histogram":{"counts":[9,8,8,8,8,8,9],"edges":[1.0,9.142857142857142,17.285714285714285,25.428571428571427,33.57142857142857,41.71428571428571,49.857142857142854,58.0]},"sample":[39.0,29.0,30.0,31.0,32.0,33.0,40.0,41.0,42.0,43.0,44.0,45.0,46.0,47.0,48.0,49.0,50.0,3.0,4.0,5.0,34.0,21.0,22.0,58.0,23.0,24.0,25.0,26.0,27.0,54.0,55.0,56.0,28.0,57.0,35.0,36.0,1.0,2.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,37.0,17.0,18.0,19.0,38.0,20.0,51.0,52.0,53.0]},"kind":"numeric","n":4970,"n_null":4912,"n_unique":58,"null_rate":0.9883299798792756,"stats":{"iqr":28.5,"kurtosis":-1.2007136485280998,"max":58.0,"mean":29.5,"median":29.5,"min":1.0,"n_outliers":0,"outlier_rate":0.0,"q1":15.25,"q3":43.75,"skew":0.0,"std":16.886878535320456,"zero_rate":0.0}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["row_count","column_count","Author gender","Recipient gender","Author API","Recipient API","Relation to author","Relation to recipient","Order of Gardiner letters in file","Change from 2006?"],"featured_charts":[{"caption":"Shows the strong male skew among letter authors (83% male, 17% female).","column":"Author gender","kind":"donut"},{"caption":"Recipients are similarly male-dominated; check the small 'MALE/FEMALE' slice for joint-addressed letters.","column":"Recipient gender","kind":"donut"},{"caption":"FRIEND, BROTHER, and SON lead \u2014 useful for seeing what kinship ties drove correspondence.","column":"Relation to author","kind":"bar"},{"caption":"Top social roles of authors (SIR, LADY, MERCHANT) reveal the elite tilt of the corpus.","column":"Author API","kind":"bar"},{"caption":"Compare against author roles to see whether letters mostly travelled within the same social strata.","column":"Recipient API","kind":"bar"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset catalogues 4,970 historical letters (the PCEEC corpus metadata), with 13 columns describing each letter's reference code, author, 
recipient, their genders, dates of birth, social roles (API), and kinship relations. The social skew is striking: authors are 83% male versus 17% female, and recipients are 82% male versus 18% female, so any analysis of women's correspondence will work from a much smaller base. Roles and relations are heavily concentrated too \u2014 'SIR' tops both author and recipient API fields, and 'FRIEND', 'BROTHER', and 'SON' dominate the kinship columns \u2014 though both API fields have long tails of 250+ distinct values worth scanning. Note also that 'Order of Gardiner letters in file' is 98.8% null (only relevant to a 58-letter subset) and 'Change from 2006?' is 95% 'ok', so neither carries much analytic signal.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.allcaps_rate","stats.one_word_rate","stats.duplicate_rate","stats.len_min","stats.len_max","stats.vocab_size","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds a unique letter reference code, formatted as an all-caps single token combining a name and a zero-padded sequence number (e.g. ALLEN_001, ARUNDEL_006). Every one of the 4970 rows is distinct with no nulls or duplicates, and lengths cluster tightly between 7 and 11 characters. The vocabulary equals the row count, confirming this is a primary identifier rather than a feature.","role":"identifier","scope":"column","target":"Letter reference","treatment":"Use as a row key for joins; exclude from modelling features."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Categorical column naming letter authors, with 695 distinct individuals across 4,970 rows and zero nulls. 
Distribution is moderately concentrated: JOHN_HOLLES_SR tops the list at 136 occurrences (2.7%), followed by THOMAS_CROMWELL (93) and DOROTHY_OSBORNE/TEMPLE (85), and entropy ratio of 0.87 indicates a fairly even spread across the long tail. Naming convention uses uppercase tokens with underscores, and some entries carry annotations like [N.MAUTBY] or compound surnames (OSBORNE/TEMPLE) that an analyst should normalise before grouping.","role":"feature","scope":"column","target":"Author name","treatment":"Normalise bracketed annotations and treat as a high-cardinality categorical (target-encode or group rare authors)."},{"confidence":"medium","critiques":[],"evidence_keys":["n_unique","null_rate","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Appears to be an authority/role tag for the author of each record, with 252 distinct titles or office descriptors (e.g. SIR, LADY, MERCHANT, KING_OF_ENGLAND, BISHOP_OF_WINCHESTER) suggesting historical/prosopographical data. Distribution is moderately diffuse (entropy ratio 0.76, top value SIR only 15.0%), but a quarter of rows are null (null_rate 0.2503) and several values mix role with DNB-style annotations like '1ST_EARL_OF_CLARE/POLITICIAN(DNB)', indicating inconsistent encoding.","role":"metadata","scope":"column","target":"Author API","treatment":"Normalise the compound 'ROLE/SUBROLE(DNB)' strings and treat missingness explicitly before using as a categorical feature."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.top_rate","stats.top_value","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Binary gender label for the author of each record, fully populated across all 4970 rows. The split is heavily skewed: MALE accounts for 83.1% (4130) versus FEMALE at 840, giving an entropy ratio of 0.655. 
No nulls or unexpected categories appear.","role":"feature","scope":"column","target":"Author gender","treatment":"One-hot or binary-encode; consider class-imbalance handling if used as a stratifier or predictor."},{"confidence":"high","critiques":[],"evidence_keys":["null_rate","n_unique","stats.top_value","stats.top_rate","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This appears to be the author's year of birth, stored as a categorical string rather than a number \u2014 many values carry a trailing '?' (e.g. '1565?', '1546?') indicating uncertain dates from historical records. 31.17% of rows are null and the top value covers only 3.98% of entries, with 217 distinct values and high entropy (ratio 0.87) suggesting a wide spread across early modern centuries. The mix of clean years ('1633', '1593') and questioned years signals inconsistent provenance that will break naive numeric parsing.","role":"metadata","scope":"column","target":"Author DOB","treatment":"Strip '?' markers and cast to integer year (with an 'uncertain' flag column) before any temporal analysis."},{"confidence":"high","critiques":[],"evidence_keys":["null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Categorical describing the relationship between two parties (likely a donor/sender and a recipient), with 45 distinct kinship or social labels such as FRIEND, BROTHER, SON, KIN, and FAMILY_SERVANT. The distribution is fairly flat \u2014 top value FRIEND covers only 13.99% and entropy ratio is 0.77 \u2014 so no single relation dominates. 
Most striking is the 44.37% null rate, meaning nearly half of records have no recorded relation.","role":"feature","scope":"column","target":"Relation to recipient","treatment":"Impute nulls as an explicit 'UNKNOWN' category and consider grouping the 45 levels into broader buckets (immediate family, extended kin, non-kin) before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Recipient is a categorical field naming the addressee of each record (likely letters in a historical correspondence corpus), with 623 distinct names across 4970 rows and no nulls. The distribution has a long tail but is concentrated on a few major figures: JOHN_PASTON_I tops at 5.27% (262), followed by NATHANIEL_BACON_I (251), JOAN_BARRINGTON (182), and JANE_CORNWALLIS/BACON[N.MEAUTYS] (180). High entropy ratio (0.79) confirms breadth, while the naming convention (uppercase with underscores, roman numerals, bracketed aliases) suggests a curated historical-letters dataset.","role":"label","scope":"column","target":"Recipient","treatment":"Group rare recipients into an 'other' bucket before encoding, given the long tail across 623 categories."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This column appears to capture the recipient's title or role/API descriptor in correspondence records, with 326 distinct values across 4970 rows. The distribution is moderately diverse (entropy_ratio 0.73) \u2014 the most common value 'SIR' covers only 13.4% of records, followed by 'LADY' and compound role strings like 'SIR/LOCAL_POLITICIAN(DNB)'. 
Notable surprises: 18.9% nulls, and many values are concatenated multi-role strings rather than atomic titles, suggesting inconsistent encoding.","role":"feature","scope":"column","target":"Recipient API","treatment":"Split compound '/'-delimited roles and one-hot or target-encode atomic titles before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_value","stats.top_rate","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Categorical recipient gender field with three values and no nulls across 4970 rows. Heavily skewed toward MALE at 81.97% (4074), with FEMALE at 892 and a rare MALE/FEMALE combo at just 4 records. Low entropy ratio of 0.43 confirms the imbalance, and the mixed category is small enough to need a handling decision.","role":"feature","scope":"column","target":"Recipient gender","treatment":"One-hot encode and decide whether to merge or drop the 4 MALE/FEMALE rows before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"This is the recipient's date of birth recorded as a year only, with values clustered between the 15th and 17th centuries (e.g. 1421, 1546?, 1581) \u2014 consistent with a historical correspondence or archival dataset. Roughly 19.6% of rows are null and many entries carry trailing '?' marks indicating archivist uncertainty, which inflates cardinality (210 distinct values for 4970 rows) and means '1546?' and '1546' would be treated as different categories. Entropy ratio of 0.82 shows the distribution is fairly spread across years rather than dominated by one cohort, with the top year '1421' covering only 6.6%.","role":"timestamp","scope":"column","target":"Recipient DOB","treatment":"Strip '?' 
uncertainty markers, cast to integer year, and bucket into centuries or decades before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n_unique","null_rate","stats.top_value","stats.top_rate","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Categorical relationship label between an author and another person, with 43 distinct values dominated by family/social ties (FRIEND 14.5%, BROTHER, SON, BROTHER-IN-LAW, KIN). Nearly half the rows (46.2%) are null, which is the main concern. Entropy ratio of 0.76 indicates the non-null values are spread fairly evenly across the top categories rather than collapsing onto one.","role":"feature","scope":"column","target":"Relation to author","treatment":"Impute or add an explicit 'unknown' category for the 46% nulls, then group rare levels before one-hot encoding."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_rate","stats.top_value","stats.entropy_ratio","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"A data-quality flag tracking whether each row changed since 2006, with four categories dominated by 'ok' at 95.5% of 4,970 rows. The remaining values ('corrected', 'corrected in spreadsheet', 'ok sic') indicate manual review notes, and the inconsistent labels suggest free-form curator entries rather than a controlled vocabulary. 
Entropy ratio of 0.15 confirms severe imbalance.","role":"metadata","scope":"column","target":"Change from 2006?","treatment":"Drop or collapse to a binary corrected/ok flag; too imbalanced to be a useful feature."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.min","stats.max","stats.mean","stats.median","stats.skew","stats.n_outliers"],"model":"anthropic:claude-opus-4-7","narrative":"This column appears to be an ordinal index assigning each Gardiner letter (presumably a letter written by STEPHEN_GARDINER, who has 58 entries in the author column) its position within the file, running from 1 to 58 with a perfectly symmetric distribution (mean and median both 29.5, skew 0). The striking signal is a 98.83% null rate: only 58 of 4970 rows carry a value, exactly matching n_unique, so this is effectively a one-row-per-letter lookup sparsely attached to a much larger table. No outliers and uniform spread confirm it is a sequence, not a measurement.","role":"metadata","scope":"column","target":"Order of Gardiner letters in file","treatment":"Drop from modelling; retain only if you need to preserve the original Gardiner ordering via a join on letter."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":4643,"prompt_tokens":15915,"total_tokens":20558}},"language_counts":{},"meta":{"generated_at":"2026-05-01T17:52:28+00:00","mode":"full","row_count":4970,"sampled_rows":4970,"seed":42,"source":"/home/coolhand/servers/diachronica/corpus/historical-corpora/pceec/data/aif_2022.csv"},"notes":[],"saturn_version":"0.2.0","schema":{"Author API":"categorical","Author DOB":"categorical","Author gender":"categorical","Author name":"categorical","Change from 2006?":"categorical","Letter reference":"text","Order of Gardiner letters in file":"numeric","Recipient":"categorical","Recipient API":"categorical","Recipient DOB":"categorical","Recipient gender":"categorical","Relation to author":"categorical","Relation to recipient":"categorical"}}
