{"columns":[{"alerts":[{"code":"near_unique","level":"info","message":"100.0% of rows are unique strings"},{"code":"one_word","level":"warn","message":"100.0% rows are a single word"},{"code":"allcaps","level":"info","message":"100.0% rows are all-caps"},{"code":"short_text","level":"info","message":"95th-percentile length under 20 chars"}],"column":"fips","extras":{"language_counts":{},"language_sample_size":3222,"length_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3222,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[4.5,4.525,4.55,4.575,4.6,4.625,4.65,4.675,4.7,4.725,4.75,4.775,4.8,4.825,4.85,4.875,4.9,4.925,4.95,4.975,5.0,5.025,5.05,5.075,5.1,5.125,5.15,5.175,5.2,5.225,5.25,5.275,5.3,5.325,5.35,5.375,5.4,5.425,5.45,5.475,5.5]},"near_unique":true,"sample":["01007","47021","49031","48279","27091","56033","28017","51165","48291","05019","37125","31109","20017","01049","20107","16087","08109","48371","72109","26129","20201","05055","13279","12001","42021","47089","34019","17043","08009","56023","39145","40011","01063","26069","22115","28011","54057","30093","28085","31107","48221","53063","20039","53055","55063","72007","22013","26087","13293","51057"],"top_values":[],"top_words":[["01001",1],["01003",1],["01005",1],["01007",1],["01009",1],["01011",1],["01013",1],["01015",1],["01017",1],["01019",1],["01021",1],["01023",1],["01025",1],["01027",1],["01029",1],["01031",1],["01033",1],["01035",1],["01037",1],["01039",1],["01041",1],["01043",1],["01045",1],["01047",1],["01049",1]],"vocab_skipped":null,"word_histogram":{"counts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3222,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"edges":[0.5,0.5333333333333333,0.5666666666666667,0.6,0.6333333333333333,0.6666666666666666,0.7,0.7333333333333334,0.7666666666666666,0.8,0.8333333333333333,0.8666666666666667,0.9,0.9333333333333333,0.9666666666666667,1.0,1.0333333333333332,1.0666666666666667,1.1,1.1333333333333333,1.1666666666666665,1.2,1.2333333333333334,1.2666666666666666,1.3,1.3333333333333335,1.3666666666666667,1.4,1.4333333333333333,1.4666666666666668,1.5]}},"kind":"text","n":3222,"n_null":0,"n_unique":3222,"null_rate":0.0,"stats":{"allcaps_rate":1.0,"boilerplate_rate":0.0,"duplicate_rate":0.0,"emoji_rate":0.0,"len_max":5,"len_mean":5.0,"len_median":5.0,"len_min":5,"len_p95":5.0,"n_duplicates":0,"n_empty":0,"one_word_rate":1.0,"readability_flesch_mean":121.22000000000004,"url_rate":0.0,"vocab_size":3222,"word_mean":1.0,"word_median":1.0}},{"alerts":[{"code":"near_unique","level":"info","message":"100.0% of rows are unique strings"}],"column":"county_name","extras":{"language_counts":{},"language_sample_size":3222,"length_histogram":{"counts":[26,72,121,190,264,407,420,363,320,240,231,152,139,165,41,28,16,10,5,0,1,1,0,1,1,0,2,0,1,1,0,0,0,0,2,1,0,0,0,1],"edges":[16.0,17.075,18.15,19.225,20.3,21.375,22.45,23.525,24.6,25.674999999999997,26.75,27.825,28.9,29.975,31.049999999999997,32.125,33.2,34.275,35.349999999999994,36.425,37.5,38.575,39.65,40.724999999999994,41.8,42.875,43.95,45.025,46.099999999999994,47.175,48.25,49.324999999999996,50.4,51.475,52.55,53.625,54.699999999999996,55.775,56.85,57.925,59.0]},"near_unique":true,"sample":["Bibb County, Alabama","Cheatham County, Tennessee","Piute County, Utah","Lamb County, Texas","Martin County, Minnesota","Sheridan County, Wyoming","Chickasaw County, Mississippi","Rockingham County, Virginia","Liberty County, Texas","Clark County, Arkansas","Moore County, North Carolina","Lancaster County, Nebraska","Chase County, Kansas","DeKalb County, Alabama","Linn County, Kansas","Washington County, Idaho","Saguache County, Colorado","Pecos County, Texas","Patillas Municipio, Puerto Rico","Ogemaw County, Michigan","Washington County, Kansas","Greene County, Arkansas","Toombs County, Georgia","Alachua County, Florida","Cambria County, Pennsylvania","Jefferson County, Tennessee","Hunterdon County, New Jersey","DuPage County, Illinois","Baca County, Colorado","Lincoln County, Wyoming","Scioto County, Ohio","Blaine County, Oklahoma","Greene County, Alabama","Iosco County, Michigan","Vernon Parish, Louisiana","Bolivar County, Mississippi","Mineral County, West Virginia","Silver Bow County, Montana","Lincoln County, Mississippi","Knox County, Nebraska","Hood County, Texas","Spokane County, Washington","Decatur County, Kansas","San Juan County, Washington","La Crosse County, Wisconsin","Aguas Buenas Municipio, Puerto Rico","Bienville Parish, Louisiana","Lapeer County, Michigan","Upson County, Georgia","Essex County, Virginia"],"top_values":[],"top_words":[["county,",2999],["texas",256],["virginia",189],["georgia",159],["north",155],["carolina",147],["new",131],["dakota",121],["kentucky",120],["missouri",115],["south",113],["kansas",105],["illinois",102],["iowa",101],["tennessee",95],["indiana",93],["nebraska",93],["ohio",91],["minnesota",87],["mississippi",84],["michigan",83],["oklahoma",78],["municipio,",78],["puerto",78],["rico",78]],"vocab_skipped":null,"word_histogram":{"counts":[2492,0,0,0,0,0,0,667,0,0,0,0,0,0,0,57,0,0,0,0,0,0,5,0,0,0,0,0,0,1],"edges":[3.0,3.1333333333333333,3.2666666666666666,3.4,3.533333333333333,3.6666666666666665,3.8,3.9333333333333336,4.066666666666666,4.2,4.333333333333333,4.466666666666667,4.6,4.733333333333333,4.866666666666667,5.0,5.133333333333333,5.266666666666667,5.4,5.533333333333333,5.666666666666666,5.8,5.933333333333334,6.066666666666666,6.2,6.333333333333334,6.466666666666667,6.6,6.733333333333333,6.866666666666667,7.0]}},"kind":"text","n":3222,"n_null":0,"n_unique":3222,"null_rate":0.0,"stats":{"allcaps_rate":0.0,"boilerplate_rate":0.0,"duplicate_rate":0.0,"emoji_rate":0.0,"len_max":59,"len_mean":24.324022346368714,"len_median":24.0,"len_min":16,"len_p95":31.0,"n_duplicates":0,"n_empty":0,"one_word_rate":0.0,"readability_flesch_mean":10.283900000000026,"url_rate":0.0,"vocab_size":1990,"word_mean":3.2482929857231535,"word_median":3.0}},{"alerts":[],"column":"rural","extras":{"singletons":0,"top_values":[["True",2212],["False",1010]]},"kind":"categorical","n":3222,"n_null":0,"n_unique":2,"null_rate":0.0,"stats":{"cardinality":2,"entropy":0.8971383342699701,"entropy_ratio":0.8971383342699701,"top_rate":0.686530105524519,"top_value":"True"}},{"alerts":[],"column":"rural_category","extras":{"singletons":0,"top_values":[["Rural",2212],["Urban/Suburban",1010]]},"kind":"categorical","n":3222,"n_null":0,"n_unique":2,"null_rate":0.0,"stats":{"cardinality":2,"entropy":0.8971383342699701,"entropy_ratio":0.8971383342699701,"top_rate":0.686530105524519,"top_value":"Rural"}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["row_count","column_count","columns[2].top_values","columns[2].stats.top_rate","columns[3].top_values","columns[1].top_words","columns[0].n_unique"],"featured_charts":[{"caption":"Roughly 69% of counties are classified Rural versus 31% Urban/Suburban.","column":"rural_category","kind":"donut"},{"caption":"Confirms the same 2,212 vs 1,010 split as rural_category \u2014 these two columns are redundant.","column":"rural","kind":"bar"},{"caption":"County name lengths cluster tightly around 24 characters, reflecting the consistent 'X County, State' format.","column":"county_name","kind":"length"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset catalogs 3,222 U.S. counties, each identified by a unique 5-character FIPS code and county name, and classified as either rural or urban/suburban. The two classification columns (`rural` and `rural_category`) are perfectly redundant, both showing 2,212 counties (about 68.7%) flagged as Rural versus 1,010 as Urban/Suburban. The most useful angle here is the rural/urban split, since FIPS and county_name are unique identifiers with no aggregate signal. Top words in `county_name` hint at geographic concentration, with Texas (256), Virginia (189), and Georgia (159) contributing the most counties.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.len_min","stats.len_max","stats.len_mean","stats.one_word_rate","stats.duplicate_rate","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"This column holds 5-character FIPS codes, one per row across all 3222 records with zero nulls and zero duplicates. Every value is exactly 5 characters, single-word, and the sample tokens (01001, 01003, 01005...) match the standard 2-digit state + 3-digit county FIPS format. With n_unique equal to n, this is a row-level identifier rather than a feature.","role":"identifier","scope":"column","target":"fips","treatment":"Treat as a county key and left-join to geographic reference tables; do not use as a model feature."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.len_min","stats.len_max","stats.len_mean","stats.duplicate_rate","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"Full county identifiers, almost certainly formatted like 'X County, <state>' \u2014 2,999 of 3,222 rows contain the token 'county,' and the remaining top tokens (texas, virginia, georgia, north carolina) are US state names. Every one of the 3,222 values is unique with zero nulls, duplicates, or empty strings, and lengths cluster tightly between 16 and 31 characters. Texas (256) and Virginia (189) lead the state distribution, consistent with the known US county counts.","role":"identifier","scope":"column","target":"county_name","treatment":"Use as a geographic key; left-join to state/FIPS lookups rather than feeding to a model directly."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.top_value","stats.top_rate","stats.entropy","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"Boolean flag indicating rural status, stored as the strings \"True\"/\"False\" with no nulls across 3222 rows. The split is imbalanced toward rural: 2212 True (68.7%) versus 1010 False, giving entropy 0.897 of the maximum 1.0.","role":"feature","scope":"column","target":"rural","treatment":"Cast string \"True\"/\"False\" to boolean or 0/1 before modelling."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.cardinality","stats.entropy_ratio","stats.top_rate","stats.top_value","top_values"],"model":"anthropic:claude-opus-4-7","narrative":"A binary geographic classifier splitting records into 'Rural' (2212) and 'Urban/Suburban' (1010) with no nulls across 3222 rows. The split is uneven at roughly 68.7% rural, but entropy ratio of 0.897 indicates the minority class is still well represented. Cardinality of 2 makes this a clean categorical feature with no dirty variants.","role":"feature","scope":"column","target":"rural_category","treatment":"Encode as a binary indicator (e.g., is_rural) before modelling."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":1504,"prompt_tokens":5127,"total_tokens":6631}},"language_counts":{},"meta":{"generated_at":"2026-05-01T16:51:11+00:00","mode":"full","row_count":3222,"sampled_rows":3222,"seed":42,"source":"/home/coolhand/html/datavis/data_trove/cache/healthcare_data/rural_urban_classification_20260121.parquet"},"notes":[],"saturn_version":"0.2.0","schema":{"county_name":"text","fips":"text","rural":"boolean","rural_category":"categorical"}}
