{"columns":[{"alerts":[],"column":"movieId","extras":{"histogram":{"counts":[7196,1111,0,1114,796,435,754,816,789,1132,1107,1398,1556,1541,1535,1847,2673,2763,3280,3282,3146,3258,3464,3492,3484,3486,3401,3369,3050,2907,2325,1839,1672,1531,1566,1679,1911,2168,2580,2132],"edges":[1.0,7319.9,14638.8,21957.699999999997,29276.6,36595.5,43914.399999999994,51233.299999999996,58552.2,65871.09999999999,73190.0,80508.9,87827.79999999999,95146.7,102465.59999999999,109784.5,117103.4,124422.29999999999,131741.19999999998,139060.1,146379.0,153697.9,161016.8,168335.69999999998,175654.59999999998,182973.5,190292.4,197611.3,204930.19999999998,212249.09999999998,219568.0,226886.9,234205.8,241524.69999999998,248843.59999999998,256162.5,263481.39999999997,270800.3,278119.2,285438.1,292757.0]},"sample":[469.0,577.0,652.0,886.0,1252.0,1620.0,1630.0,1781.0,1977.0,2022.0,2070.0,2081.0,2267.0,2287.0,2780.0,2883.0,3119.0,3342.0,3363.0,3734.0,3919.0,4020.0,4029.0,5192.0,5669.0,5785.0,6027.0,6028.0,6775.0,7030.0,7042.0,7202.0,7351.0,7587.0,7623.0,7831.0,8143.0,8400.0,8537.0,8610.0,8662.0,8738.0,8915.0,8975.0,8982.0,25874.0,25947.0,26871.0,27595.0,27624.0,27912.0,31053.0,31437.0,33677.0,34542.0,38798.0,40478.0,40819.0,43934.0,44779.0,45668.0,47571.0,48584.0,48715.0,50514.0,51894.0,52460.0,52804.0,58293.0,58306.0,58760.0,58847.0,59031.0,59485.0,61110.0,61352.0,65601.0,67980.0,68157.0,68462.0,69275.0,69475.0,72571.0,72852.0,72872.0,74754.0,74791.0,76217.0,76720.0,77177.0,78957.0,80498.0,80748.0,81355.0,82449.0,82469.0,83275.0,83796.0,83835.0,86860.0,87181.0,89219.0,90254.0,91331.0,91564.0,91789.0,92222.0,92479.0,97779.0,101505.0,102716.0,102819.0,104064.0,104233.0,105377.0,105519.0,106854.0,107675.0,107901.0,108932.0,109925.0,110284.0,112807.0,114601.0,116199.0,116576.0,116748.0,117374.0,117656.0,119141.0,120492.0,120536.0,120586.0,120755.0,121399.0,121659.0,121689.0,121757.0,122411.0,122851.0,122982.0,123151.0,123526.0,123639.0,124021.0,124318.0,124737.0,125101.0,125257.0,125485.0,125541.0,125795.0,126046.0,126785.0,127579.0,128105.0,128227.0,128437.0,128983.0,129378.0,129466.0,129779.0,130516.0,130522.0,130528.0,131365.0,132210.0,132278.0,132632.0,133065.0,133285.0,133443.0,133953.0,134653.0,135593.0,135613.0,136888.0,138440.0,138740.0,138912.0,139319.0,139361.0,139701.0,140076.0,140104.0,140373.0,140922.0,141126.0,142168.0,142202.0,143373.0,145640.0,146230.0,147136.0,147597.0,147752.0,147998.0,148162.0,148781.0,149240.0,149568.0,149624.0,149700.0,150064.0,151621.0,152513.0,152559.0,152790.0,152910.0,153018.0,153028.0,153076.0,153228.0,153324.0,153350.0,153596.0,153830.0,153865.0,154307.0,154480.0,154917.0,155151.0,155411.0,155469.0,155854.0,155858.0,155884.0,157168.0,157480.0,157685.0,158107.0,158294.0,158388.0,158609.0,158813.0,159083.0,159387.0,159678.0,159680.0,160022.0,160068.0,160139.0,160692.0,160884.0,162330.0,163124.0,163629.0,163843.0,164157.0,164393.0,164552.0,164635.0,164887.0,164937.0,165313.0,165515.0,165541.0,165597.0,165779.0,166373.0,166437.0,166900.0,167306.0,167730.0,168412.0,169148.0,169394.0,169624.0,170193.0,171483.0,171761.0,173307.0,173435.0,173591.0,173807.0,173821.0,174079.0,174177.0,175237.0,175305.0,175513.0,175583.0,175689.0,175881.0,175891.0,176235.0,177127.0,177315.0,177347.0,177931.0,178045.0,178775.0,179105.0,180053.0,180435.0,180643.0,180983.0,181663.0,182247.0,183429.0,183573.0,183947.0,184255.0,184755.0,185115.0,185277.0,186189.0,186643.0,187967.0,188833.0,189091.0,189115.0,189333.0,189383.0,190081.0,190383.0,190413.0,190485.0,191753.0,193179.0,193701.0,193839.0,195011.0,195141.0,195245.0,195489.0,195605.0,195687.0,195713.0,195823.0,195993.0,196203.0,196385.0,196575.0,197127.0,197385.0,198057.0,198309.0,198543.0,198977.0,199021.0,199325.0,199706.0,201646.0,201680.0,201712.0,201787.0,202319.0,202333.0,202411.0,203262.0,203300.0,205575.0,205749.0,205771.0,205843.0,206327.0,206739.0,206791.0,206849.0,207501.0,207996.0,208695.0,209351.0,210041.0,210265.0,210829.0,210891.0,211145.0,211430.0,212339.0,212349.0,212715.0,212919.0,213942.0,214088.0,214362.0,214494.0,214544.0,214696.0,215643.0,215655.0,216181.0,216352.0,216442.0,216751.0,217091.0,217294.0,217631.0,218505.0,218941.0,219037.0,219139.0,219278.0,219290.0,220230.0,220276.0,220518.0,220922.0,221222.0,221248.0,221856.0,223240.0,223300.0,223340.0,223742.0,224753.0,225794.0,225866.0,226482.0,226636.0,227096.0,227258.0,227502.0,228777.0,231237.0,231257.0,233203.0,233637.0,233939.0,234999.0,235041.0,235165.0,235265.0,235351.0,235635.0,237746.0,238590.0,239602.0,241176.0,242112.0,242488.0,243600.0,244282.0,244582.0,245262.0,245314.0,249134.0,250824.0,253154.0,257551.0,258173.0,259673.0,259993.0,260289.0,260507.0,260969.0,261569.0,261635.0,262619.0,263503.0,264089.0,265138.0,265572.0,268366.0,268436.0,269726.0,270916.0,271258.0,271891.0,272573.0,272613.0,272663.0,274581.0,274671.0,274781.0,275147.0,275193.0,275491.0,275495.0,275559.0,275857.0,278166.0,279268.0,279360.0,280186.0,280428.0,280498.0,280944.0,281156.0,281506.0,281860.0,282489.0,282527.0,282555.0,282921.0,283097.0,283487.0,283597.0,284093.0,284423.0,286377.0,288691.0,289077.0,289351.0,289747.0,290135.0,290868.0,291280.0,292389.0]},"kind":"numeric","n":87585,"n_null":0,"n_unique":87585,"null_rate":0.0,"stats":{"iqr":100546.0,"kurtosis":-0.5774915128949369,"max":292757.0,"mean":157651.3655192099,"median":165741.0,"min":1.0,"n_outliers":0,"outlier_rate":0.0,"q1":112657.0,"q3":213203.0,"skew":-0.39139923474865146,"std":79013.40209904632,"zero_rate":0.0}},{"alerts":[{"code":"near_unique","level":"info","message":"99.8% of rows are unique strings"}],"column":"title","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[44,1905,14749,17609,20947,12610,7127,3670,3134,2086,1102,932,548,381,213,180,123,81,41,33,23,13,11,8,3,1,2,4,0,2,1,0,0,1,0,0,0,0,0,1],"edges":[2.0,6.725,11.45,16.174999999999997,20.9,25.625,30.349999999999998,35.074999999999996,39.8,44.525,49.25,53.974999999999994,58.699999999999996,63.425,68.14999999999999,72.875,77.6,82.32499999999999,87.05,91.77499999999999,96.5,101.225,105.94999999999999,110.675,115.39999999999999,120.12499999999999,124.85,129.575,134.29999999999998,139.02499999999998,143.75,148.475,153.2,157.92499999999998,162.64999999999998,167.375,172.1,176.825,181.54999999999998,186.27499999999998,191.0]},"near_unique":true,"sample":["Catwalk (1996)","Disco (2019)","Alien: Alone (2019)","Shock Wave 2 (2020)","Marx Reloaded (2011)","Belchior - Apenas um Cora\u00e7\u00e3o Selvagem (2022)","Current (1963)","The Lead (2020)","How It Feels to Be Run Over (1900)","Easy Rider (1969)","Gregorio (1982)","Mythica: The Godslayer (2016)","Patton Oswalt: Finest Hour (2011)","Three Colors: White (Trzy kolory: Bialy) (1994)","Honky Tonk (1941)","Don't Think About It (Non Pensarci) (2007)","Helen of Troy (1956)","Looking for Jimmy (2002)","Bad Things (2023)","High Rollers (1976)","The Great Bear (2011)","Waiting Game, The (2000)","Capitalism: A Love Story (2009)","Rose Tattoo, The (1955)","Little Bee (2007)","Assassin 33 A.D. (2020)","The Exhibitionists (2012)","Essential Killing (2010)","Countess from Hong Kong, A (1967)","Kaguya-sama: Love Is War -The First Kiss That Never Ends- (2022)","High Resolution (2018)","The Network of Freedom (2017)","Our Deal (2011)","Alex L'ariete (2000)","The Immortalists (2014)","Ip Man 3 (2015)","Forced Landing (1941)","Perry Mason: The Case of the Notorious Nun (1986)","The Bells Toll for the Barefooted (1965)","Going South (2009)","Hollow Scream (2018)","The West Side Waltz (1995)","Final (2001)","Mo Gilligan: There's Mo to Life (2022)","Spirited (2022)","Never Get Tired: The Bomb the Music Industry! Story (2015)","Maniac Killer (1987)","Freedom (2001)","Wing and the Thigh, The (L'aile ou la cuisse) (1976)","Chernobyl: The Invisible Enemy (2021)"],"top_values":[],"top_words":[["the",6207],["of",2077],["a",1038],["in",770],["(2018)",756],["(2016)",733],["(2017)",714],["(2019)",714],["and",680],["(2014)",678],["(2015)",665],["(2020)",636],["(2013)",572],["(2012)",553],["(2021)",535],["(2011)",506],["to",500],["(2009)",483],["(2010)",478],["(2022)",470],["(2008)",458],["(2007)",443],["(2006)",397],["(2005)",354],["love",320]],"vocab_skipped":null,"word_histogram":{"counts":[112,14845,23252,19215,12559,7398,4305,2282,1468,0,822,518,307,199,112,72,38,42,19,0,6,5,1,3,0,3,1,0,0,1],"edges":[1.0,1.9,2.8,3.7,4.6,5.5,6.4,7.3,8.2,9.1,10.0,10.9,11.8,12.700000000000001,13.6,14.5,15.4,16.3,17.2,18.1,19.0,19.900000000000002,20.8,21.7,22.6,23.5,24.400000000000002,25.3,26.2,27.1,28.0]}},"kind":"text","n":87585,"n_null":0,"n_unique":87382,"null_rate":0.0,"stats":{"allcaps_rate":0.0055260603984700575,"boilerplate_rate":9.133984129702575e-05,"duplicate_rate":0.0023177484729120282,"emoji_rate":3.4252440486384655e-05,"len_max":191,"len_mean":25.281349546155162,"len_median":23.0,"len_min":2,"len_p95":48.0,"n_duplicates":203,"n_empty":0,"one_word_rate":0.0012787577781583604,"readability_flesch_mean":81.20267045454548,"url_rate":0.0,"vocab_size":19981,"word_mean":4.22596334988868,"word_median":4.0}},{"alerts":[{"code":"one_word","level":"warn","message":"91.9% rows are a single word"},{"code":"duplicates","level":"warn","message":"97.9% duplicate strings"}],"column":"genres","extras":{"language_counts":{},"language_sample_size":5000,"length_histogram":{"counts":[125,24325,3474,2417,14431,11262,3313,2692,9626,5612,3690,1617,1066,758,1008,640,366,430,193,76,135,159,42,45,25,30,5,9,6,3,3,0,0,1,0,0,0,0,0,1],"edges":[3.0,4.85,6.7,8.55,10.4,12.25,14.100000000000001,15.950000000000001,17.8,19.650000000000002,21.5,23.35,25.200000000000003,27.05,28.900000000000002,30.75,32.6,34.45,36.300000000000004,38.15,40.0,41.85,43.7,45.550000000000004,47.400000000000006,49.25,51.1,52.95,54.800000000000004,56.650000000000006,58.5,60.35,62.2,64.05000000000001,65.9,67.75,69.60000000000001,71.45,73.3,75.15,77.0]},"near_unique":false,"sample":["Documentary","Drama","Horror|Sci-Fi","Action|Crime","(no genres listed)","Documentary","(no genres listed)","Thriller","Horror","Adventure|Drama","Drama","Fantasy","Comedy","Comedy|Drama","Comedy|Crime|Drama|Romance|Western","Comedy|Drama","Action|Adventure|Drama|Romance|War","(no genres listed)","Horror|Thriller","Comedy|Crime","Adventure|Animation|Children|Fantasy","Comedy","Documentary","Drama|Romance","Animation|Children|Comedy","Sci-Fi","Drama","Thriller|War","Comedy","Animation|Comedy|Romance","Drama|Romance","Drama","Drama|Romance","Action|Adventure|Crime","Documentary","Action","Action|Adventure","(no genres listed)","(no genres listed)","Drama","Horror|Sci-Fi","Drama","Drama|Sci-Fi|Thriller","Comedy","Comedy","Documentary","(no genres listed)","Drama","Comedy","Documentary"],"top_values":[["Drama",12443],["Documentary",8132],["Comedy",7761],["(no genres listed)",7080],["Comedy|Drama",3245],["Drama|Romance",2825],["Horror",2487],["Comedy|Romance",2229],["Thriller",1410],["Comedy|Drama|Romance",1335],["Drama|Thriller",1281],["Horror|Thriller",1242],["Crime|Drama",1150],["Animation",1120],["Drama|War",818],["Action",789],["Action|Drama",696],["Western",686],["Crime|Drama|Thriller",655],["Romance",645]],"top_words":[["drama",2804],["documentary",1864],["comedy",1771],["(no",1609],["genres",1609],["listed)",1609],["comedy|drama",745],["drama|romance",658],["horror",582],["comedy|romance",517],["thriller",317],["drama|thriller",302],["comedy|drama|romance",285],["horror|thriller",280],["crime|drama",259],["animation",257],["action",182],["western",173],["drama|war",167],["crime|drama|thriller",160],["romance",154],["action|drama",148],["comedy|horror",136],["action|thriller",134],["sci-fi",123]],"vocab_skipped":null,"word_histogram":{"counts":[80505,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7080],"edges":[1.0,1.0666666666666667,1.1333333333333333,1.2,1.2666666666666666,1.3333333333333333,1.4,1.4666666666666668,1.5333333333333332,1.6,1.6666666666666665,1.7333333333333334,1.8,1.8666666666666667,1.9333333333333333,2.0,2.0666666666666664,2.1333333333333333,2.2,2.2666666666666666,2.333333333333333,2.4,2.466666666666667,2.533333333333333,2.6,2.666666666666667,2.7333333333333334,2.8,2.8666666666666667,2.9333333333333336,3.0]}},"kind":"text","n":87585,"n_null":0,"n_unique":1798,"null_rate":0.0,"stats":{"allcaps_rate":1.1417480162128219e-05,"boilerplate_rate":0.0,"duplicate_rate":0.9794713706684934,"emoji_rate":0.0,"len_max":77,"len_mean":13.240326539932637,"len_median":12.0,"len_min":3,"len_p95":27.0,"n_duplicates":85787,"n_empty":0,"one_word_rate":0.9191642404521322,"readability_flesch_mean":-117.66554999999997,"url_rate":0.0,"vocab_size":909,"word_mean":1.1616715190957356,"word_median":1.0}}],"insights":{"errors":[],"insights":[{"confidence":"high","critiques":[],"evidence_keys":["row_count","columns.genres.n_unique","columns.genres.top_values","columns.genres.stats.one_word_rate","columns.title.n_unique","columns.title.top_words","columns.title.stats.len_mean","columns.movieId.stats.min","columns.movieId.stats.max","columns.movieId.stats.median"],"featured_charts":[{"caption":"See how Drama, Documentary, and Comedy dominate, and note the large '(no genres listed)' bucket.","column":"genres","kind":"bar"},{"caption":"Share of the top genre combinations versus the long tail of 1,798 unique strings.","column":"genres","kind":"donut"},{"caption":"Title character-length distribution centres near 23 with a long tail out to 191.","column":"title","kind":"length"},{"caption":"Check how movieIds spread from 1 to 292,757 to gauge sparsity in the identifier range.","column":"movieId","kind":"histogram"}],"model":"anthropic:claude-opus-4-7","narrative":"This dataset is a movie catalogue of 87,585 rows with three columns: a unique movieId, a title, and a pipe-delimited genres string. The genres column is the most analytically interesting: only 1,798 unique combinations exist, and Drama, Documentary, and Comedy dominate, while 7,080 rows are tagged '(no genres listed)' \u2014 a sizeable gap worth flagging. Titles are nearly unique (87,382 distinct of 87,585), and the frequent '(2014)'\u2013'(2019)' tokens in titles suggest the catalogue skews toward recent years. movieId spans 1 to 292,757 with no outliers, indicating a sparse identifier range rather than a clean sequence. Start with the genre distribution and the missing-genre share before any deeper modelling.","scope":"dataset","target":"__global__"},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","null_rate","stats.min","stats.max","stats.mean","stats.median"],"model":"anthropic:claude-opus-4-7","narrative":"movieId is fully unique across all 87585 rows with zero nulls, spanning 1 to 292757 \u2014 classic surrogate key behaviour rather than a measurable quantity. The wide range with a mean of 157651 and median of 165741 suggests non-contiguous IDs (gaps in the sequence), but there is no statistical signal to extract from it.","role":"identifier","scope":"column","target":"movieId","treatment":"use as a join key; exclude from modelling features."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","stats.len_mean","stats.word_mean","stats.duplicate_rate","stats.n_duplicates","stats.readability_flesch_mean","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"Short free-text titles, averaging 4.2 words and 25 characters, with 87,382 unique values across 87,585 rows. The frequent year tokens like (2018), (2016), (2017), (2019) suggest these are titled works (likely films or publications) with release years appended. Near-unique with only 203 duplicates and a high Flesch readability of 81.2; emoji and URL rates are effectively zero.","role":"free_text","scope":"column","target":"title","treatment":"Tokenize and embed (or strip the trailing year into a separate feature) before modelling; do not use as a key."},{"confidence":"high","critiques":[],"evidence_keys":["n","n_unique","duplicate_rate","one_word_rate","word_mean","top_values","top_words"],"model":"anthropic:claude-opus-4-7","narrative":"Pipe-delimited movie genre tags, with 1,798 distinct combinations across 87,585 rows and a 97.9% duplicate rate. Drama (12,443), Documentary (8,132), and Comedy (7,761) dominate, while 7,080 rows carry the literal placeholder '(no genres listed)' that should be treated as missing. 91.9% of values are single tokens (word_mean 1.16), so multi-genre entries like 'Comedy|Drama' are the minority.","role":"feature","scope":"column","target":"genres","treatment":"split on '|' and one-hot or multi-hot encode; recode '(no genres listed)' as null."}],"providers":["anthropic:claude-opus-4-7"],"total_usage":{"completion_tokens":1451,"prompt_tokens":4935,"total_tokens":6386}},"language_counts":{},"meta":{"generated_at":"2026-05-01T18:36:17+00:00","mode":"full","row_count":87585,"sampled_rows":87585,"seed":42,"source":"/home/coolhand/html/datavis/data_trove/entertainment/movies/ml-32m/movies.csv"},"notes":[],"saturn_version":"0.2.0","schema":{"genres":"text","movieId":"numeric","title":"text"}}
