<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>pandas Archives - Vijay Gokarn</title>
	<atom:link href="https://vijay-gokarn.com/tag/pandas/feed/" rel="self" type="application/rss+xml" />
	<link>https://vijay-gokarn.com/tag/pandas/</link>
	<description>&#34;Ignite Curiosity. Fuel the Future.&#34;</description>
	<lastBuildDate>Sun, 19 Apr 2026 03:33:59 +0000</lastBuildDate>
	<language>en-US</language>
	<sy:updatePeriod>
	hourly	</sy:updatePeriod>
	<sy:updateFrequency>
	1	</sy:updateFrequency>
	<generator>https://wordpress.org/?v=6.9.4</generator>

<image>
	<url>https://i0.wp.com/vijay-gokarn.com/wp-content/uploads/2023/09/cropped-ideogram.jpeg?fit=32%2C32&#038;ssl=1</url>
	<title>pandas Archives - Vijay Gokarn</title>
	<link>https://vijay-gokarn.com/tag/pandas/</link>
	<width>32</width>
	<height>32</height>
</image> 
<site xmlns="com-wordpress:feed-additions:1">230943525</site>	<item>
		<title>From Amazon Reviews to Numbers: A Hands-On Tour of One-Hot, Bag of Words, and TF-IDF</title>
		<link>https://vijay-gokarn.com/from-amazon-reviews-to-numbers-a-hands-on-tour-of-one-hot-bag-of-words-and-tf-idf/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=from-amazon-reviews-to-numbers-a-hands-on-tour-of-one-hot-bag-of-words-and-tf-idf</link>
		
		<dc:creator><![CDATA[Vijay Gokarn]]></dc:creator>
		<pubDate>Sat, 11 Apr 2026 15:02:33 +0000</pubDate>
				<category><![CDATA[generative-ai]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[data-analysis]]></category>
		<category><![CDATA[pandas]]></category>
		<guid isPermaLink="false">https://vijay-gokarn.com/?p=263</guid>

					<description><![CDATA[<p>NLP · Machine Learning · Text Feature Engineering — From Amazon Reviews to Numbers: A Hands-On Tour of One-Hot, Bag of Words, and TF-IDF. Corpus: 128 real reviews · Techniques: OHE · BoW · TF-IDF · Stack: Python · sklearn · BeautifulSoup · Source: GitHub ↗. How I took 128 real Amazon product reviews and turned them into features a machine-learning model can [&#8230;]</p>
<p>The post <a href="https://vijay-gokarn.com/from-amazon-reviews-to-numbers-a-hands-on-tour-of-one-hot-bag-of-words-and-tf-idf/">From Amazon Reviews to Numbers: A Hands-On Tour of One-Hot, Bag of Words, and TF-IDF</a> appeared first on <a href="https://vijay-gokarn.com">Vijay Gokarn</a>.</p>
]]></description>
										<content:encoded><![CDATA[
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:ital,wght@0,300;0,400;0,600;1,300;1,400&#038;family=DM+Sans:wght@300;400;500&#038;family=DM+Mono:wght@400&#038;display=swap" rel="stylesheet">

<style>
  .vg-blog-wrap {
    --ink: #0e0e0e;
    --paper: #f7f4ef;
    --paper-dark: #ede9e1;
    --teal: #0f6e56;
    --teal-light: #1d9e75;
    --teal-muted: #e1f5ee;
    --amber: #ba7517;
    --amber-light: #fac775;
    --amber-muted: #faeeda;
    --charcoal: #2c2c2a;
    --muted: #888780;
    --border: rgba(14,14,14,0.12);
    --border-strong: rgba(14,14,14,0.25);
    font-family: 'DM Sans', sans-serif;
    font-weight: 300;
    color: var(--ink);
    background: var(--paper);
    line-height: 1.75;
    font-size: 16px;
    overflow-x: hidden;
  }
  .vg-blog-wrap *, .vg-blog-wrap *::before, .vg-blog-wrap *::after {
    box-sizing: border-box; margin: 0; padding: 0;
  }

  /* ── HERO ── */
  .vg-post-hero {
    background: var(--ink);
    padding: 5rem 4rem 4rem;
    position: relative;
    overflow: hidden;
  }
  .vg-post-hero::after {
    content: '';
    position: absolute;
    bottom: 0; right: 0;
    width: 40%;
    height: 100%;
    background: rgba(15,110,86,0.12);
    clip-path: polygon(20% 0%, 100% 0%, 100% 100%, 0% 100%);
  }
  .vg-post-hero-inner { position: relative; z-index: 1; max-width: 860px; }
  .vg-post-eyebrow {
    font-size: 0.7rem;
    letter-spacing: 0.22em;
    text-transform: uppercase;
    color: var(--teal-light);
    font-weight: 500;
    margin-bottom: 1.25rem;
    display: flex;
    align-items: center;
    gap: 0.75rem;
  }
  .vg-post-eyebrow::before {
    content: '';
    display: inline-block;
    width: 1.5rem; height: 1px;
    background: var(--teal-light);
  }
  .vg-post-title {
    font-family: 'Cormorant Garamond', serif;
    font-size: clamp(2.2rem, 5vw, 3.8rem);
    font-weight: 300;
    line-height: 1.1;
    color: var(--paper);
    letter-spacing: -0.02em;
    margin-bottom: 1.5rem;
    max-width: 22ch;
  }
  .vg-post-title em { font-style: italic; color: var(--amber-light); }
  .vg-post-meta {
    display: flex;
    gap: 2rem;
    flex-wrap: wrap;
  }
  .vg-meta-item {
    font-size: 0.72rem;
    letter-spacing: 0.1em;
    text-transform: uppercase;
    color: rgba(247,244,239,0.4);
  }
  .vg-meta-item span { color: rgba(247,244,239,0.75); margin-left: 0.4rem; }

  /* ── INTRO BAND ── */
  .vg-intro-band {
    background: var(--teal-muted);
    padding: 2.5rem 4rem;
    border-left: 4px solid var(--teal);
  }
  .vg-intro-band p {
    font-size: 1.05rem;
    line-height: 1.85;
    color: var(--charcoal);
    font-weight: 300;
    max-width: 80ch;
  }
  .vg-intro-band strong { color: var(--teal); font-weight: 500; }

  /* ── BODY LAYOUT ── */
  .vg-post-body {
    max-width: 860px;
    margin: 0 auto;
    padding: 4rem 4rem;
  }

  /* ── SECTION HEADERS ── */
  .vg-step {
    margin-bottom: 3.5rem;
  }
  .vg-step-label {
    font-size: 0.65rem;
    letter-spacing: 0.22em;
    text-transform: uppercase;
    color: var(--teal);
    font-weight: 500;
    margin-bottom: 0.5rem;
    display: flex;
    align-items: center;
    gap: 0.6rem;
  }
  .vg-step-label::before {
    content: '';
    display: inline-block;
    width: 1.25rem; height: 1px;
    background: var(--teal);
  }
  .vg-step h2 {
    font-family: 'Cormorant Garamond', serif;
    font-size: clamp(1.5rem, 3vw, 2.1rem);
    font-weight: 300;
    line-height: 1.2;
    color: var(--ink);
    margin-bottom: 1.25rem;
  }
  .vg-step h2 em { font-style: italic; color: var(--teal); }
  .vg-step p {
    font-size: 0.94rem;
    line-height: 1.9;
    color: var(--charcoal);
    font-weight: 300;
    margin-bottom: 1rem;
  }
  .vg-step p strong { color: var(--ink); font-weight: 500; }

  /* ── CALLOUT / TIP BOXES ── */
  .vg-callout {
    background: var(--paper-dark);
    border-left: 3px solid var(--amber);
    padding: 1.25rem 1.5rem;
    margin: 1.5rem 0;
    font-size: 0.88rem;
    line-height: 1.8;
    color: var(--charcoal);
  }
  .vg-callout strong { color: var(--amber); font-weight: 500; }
  .vg-callout code {
    font-family: 'DM Mono', monospace;
    font-size: 0.82rem;
    background: rgba(14,14,14,0.06);
    padding: 0.1rem 0.4rem;
    color: var(--ink);
  }

  /* ── TECHNIQUE CARDS ── */
  .vg-technique-grid {
    display: grid;
    grid-template-columns: repeat(3, 1fr);
    gap: 1.25rem;
    margin: 2rem 0;
  }
  .vg-technique-card {
    background: var(--paper);
    border: 0.5px solid var(--border-strong);
    padding: 1.5rem;
    position: relative;
  }
  .vg-technique-card::before {
    content: '';
    position: absolute;
    top: 0; left: 0;
    width: 100%; height: 3px;
  }
  .vg-technique-card.ohe::before { background: var(--muted); }
  .vg-technique-card.bow::before { background: var(--amber); }
  .vg-technique-card.tfidf::before { background: var(--teal); }
  .vg-technique-card h3 {
    font-family: 'Cormorant Garamond', serif;
    font-size: 1.2rem;
    font-weight: 400;
    color: var(--ink);
    margin-bottom: 0.4rem;
  }
  .vg-technique-card .vg-abbr {
    font-family: 'DM Mono', monospace;
    font-size: 0.68rem;
    color: var(--muted);
    letter-spacing: 0.1em;
    margin-bottom: 0.75rem;
    display: block;
  }
  .vg-technique-card p {
    font-size: 0.82rem;
    line-height: 1.7;
    color: var(--charcoal);
    margin-bottom: 0.75rem !important;
  }
  .vg-technique-card .vg-weakness {
    font-size: 0.75rem;
    color: var(--muted);
    border-top: 0.5px solid var(--border);
    padding-top: 0.6rem;
    margin-top: 0.5rem;
    font-style: italic;
  }

  /* ── FORMULA BLOCK ── */
  .vg-formula {
    background: var(--ink);
    padding: 1.5rem 2rem;
    margin: 1.5rem 0;
    font-family: 'DM Mono', monospace;
    font-size: 0.9rem;
    color: var(--amber-light);
    letter-spacing: 0.04em;
    overflow-x: auto;
    white-space: nowrap;
  }
  .vg-formula .vg-formula-label {
    font-family: 'DM Sans', sans-serif;
    font-size: 0.65rem;
    letter-spacing: 0.18em;
    text-transform: uppercase;
    color: rgba(247,244,239,0.3);
    margin-bottom: 0.5rem;
    display: block;
    white-space: normal;
  }

  /* ── STAT ROW ── */
  .vg-stat-row {
    display: grid;
    grid-template-columns: repeat(3, 1fr);
    gap: 1rem;
    margin: 2rem 0;
  }
  .vg-stat-box {
    background: var(--paper-dark);
    border: 0.5px solid var(--border);
    padding: 1.25rem;
    text-align: center;
  }
  .vg-stat-box .vg-stat-n {
    font-family: 'Cormorant Garamond', serif;
    font-size: 2.2rem;
    font-weight: 300;
    line-height: 1;
    color: var(--teal);
    letter-spacing: -0.02em;
  }
  .vg-stat-box .vg-stat-l {
    font-size: 0.68rem;
    letter-spacing: 0.12em;
    text-transform: uppercase;
    color: var(--muted);
    margin-top: 0.35rem;
  }

  /* ── COMPARISON TABLE ── */
  .vg-table-wrap { overflow-x: auto; margin: 1.5rem 0; }
  .vg-table {
    width: 100%;
    border-collapse: collapse;
    font-size: 0.83rem;
  }
  .vg-table th {
    background: var(--ink);
    color: var(--paper);
    font-family: 'DM Sans', sans-serif;
    font-weight: 400;
    font-size: 0.68rem;
    letter-spacing: 0.14em;
    text-transform: uppercase;
    padding: 0.75rem 1rem;
    text-align: left;
  }
  .vg-table td {
    padding: 0.7rem 1rem;
    border-bottom: 0.5px solid var(--border);
    color: var(--charcoal);
    vertical-align: top;
    line-height: 1.55;
  }
  .vg-table tr:nth-child(even) td { background: var(--paper-dark); }
  .vg-table .vg-chip {
    display: inline-block;
    font-size: 0.65rem;
    letter-spacing: 0.08em;
    padding: 0.2rem 0.55rem;
    border-radius: 2px;
    font-weight: 400;
  }
  .vg-chip-green { background: var(--teal-muted); color: var(--teal); }
  .vg-chip-amber { background: var(--amber-muted); color: var(--amber); }
  .vg-chip-gray  { background: var(--paper-dark); color: var(--muted); border: 0.5px solid var(--border); }

  /* ── DIVIDER ── */
  .vg-divider {
    border: none;
    border-top: 0.5px solid var(--border);
    margin: 3rem 0;
  }

  /* ── KEY TAKEAWAYS ── */
  .vg-takeaways-section {
    background: var(--ink);
    padding: 4rem;
  }
  .vg-takeaways-section .vg-section-eyebrow {
    font-size: 0.68rem;
    letter-spacing: 0.22em;
    text-transform: uppercase;
    color: var(--amber-light);
    font-weight: 500;
    margin-bottom: 0.5rem;
    display: flex;
    align-items: center;
    gap: 0.6rem;
  }
  .vg-takeaways-section .vg-section-eyebrow::before {
    content: '';
    display: inline-block;
    width: 1.25rem; height: 1px;
    background: var(--amber-light);
  }
  .vg-takeaways-section h2 {
    font-family: 'Cormorant Garamond', serif;
    font-size: clamp(1.6rem, 3vw, 2.4rem);
    font-weight: 300;
    color: var(--paper);
    margin-bottom: 2.5rem;
  }
  .vg-takeaways-section h2 em { font-style: italic; color: var(--amber-light); }
  .vg-takeaways-grid {
    display: grid;
    grid-template-columns: 1fr 1fr;
    gap: 1.25rem;
  }
  .vg-takeaway-card {
    border: 0.5px solid rgba(247,244,239,0.12);
    padding: 1.5rem;
    position: relative;
  }
  .vg-takeaway-card::before {
    content: attr(data-num);
    font-family: 'Cormorant Garamond', serif;
    font-size: 3rem;
    font-weight: 300;
    color: rgba(250,199,117,0.15);
    position: absolute;
    top: 0.5rem; right: 1rem;
    line-height: 1;
  }
  .vg-takeaway-card h4 {
    font-family: 'Cormorant Garamond', serif;
    font-size: 1.1rem;
    font-weight: 400;
    color: var(--amber-light);
    margin-bottom: 0.5rem;
  }
  .vg-takeaway-card p {
    font-size: 0.82rem;
    line-height: 1.75;
    color: rgba(247,244,239,0.65);
    font-weight: 300;
  }

  /* ── INTERVIEW CHEAT SHEET ── */
  .vg-interview-section {
    background: var(--teal-muted);
    padding: 4rem;
  }
  .vg-interview-section .vg-section-eyebrow {
    font-size: 0.68rem;
    letter-spacing: 0.22em;
    text-transform: uppercase;
    color: var(--teal);
    font-weight: 500;
    margin-bottom: 0.5rem;
    display: flex;
    align-items: center;
    gap: 0.6rem;
  }
  .vg-interview-section .vg-section-eyebrow::before {
    content: '';
    display: inline-block;
    width: 1.25rem; height: 1px;
    background: var(--teal);
  }
  .vg-interview-section h2 {
    font-family: 'Cormorant Garamond', serif;
    font-size: clamp(1.6rem, 3vw, 2.4rem);
    font-weight: 300;
    color: var(--ink);
    margin-bottom: 2.5rem;
  }
  .vg-interview-section h2 em { font-style: italic; color: var(--teal); }
  .vg-qa-list { display: flex; flex-direction: column; gap: 0; }
  .vg-qa-item {
    border-top: 0.5px solid rgba(14,14,14,0.12);
    padding: 1.5rem 0;
    display: grid;
    grid-template-columns: 1fr 1.4fr;
    gap: 2rem;
    align-items: start;
  }
  .vg-qa-item:last-child { border-bottom: 0.5px solid rgba(14,14,14,0.12); }
  .vg-qa-q {
    font-family: 'Cormorant Garamond', serif;
    font-size: 1.05rem;
    font-weight: 400;
    color: var(--ink);
    line-height: 1.4;
  }
  .vg-qa-q .vg-q-badge {
    font-family: 'DM Mono', monospace;
    font-size: 0.6rem;
    letter-spacing: 0.1em;
    text-transform: uppercase;
    background: var(--teal);
    color: var(--paper);
    padding: 0.15rem 0.5rem;
    margin-bottom: 0.5rem;
    display: inline-block;
  }
  .vg-qa-a {
    font-size: 0.85rem;
    line-height: 1.75;
    color: var(--charcoal);
    font-weight: 300;
  }
  .vg-qa-a strong { color: var(--teal); font-weight: 500; }
  .vg-qa-a code {
    font-family: 'DM Mono', monospace;
    font-size: 0.78rem;
    background: rgba(14,14,14,0.07);
    padding: 0.1rem 0.35rem;
    color: var(--ink);
  }

  /* ── MEMORY PILLS ── */
  .vg-memory-row {
    display: flex;
    flex-wrap: wrap;
    gap: 0.6rem;
    margin-top: 0.75rem;
  }
  .vg-memory-pill {
    font-size: 0.7rem;
    letter-spacing: 0.06em;
    padding: 0.3rem 0.85rem;
    background: var(--paper);
    border: 0.5px solid var(--border-strong);
    color: var(--charcoal);
    font-weight: 400;
  }
  .vg-memory-pill.teal { border-color: var(--teal); color: var(--teal); background: var(--teal-muted); }
  .vg-memory-pill.amber { border-color: var(--amber); color: var(--amber); background: var(--amber-muted); }

  /* ── FOOTER CTA ── */
  .vg-post-footer {
    background: var(--paper-dark);
    padding: 3rem 4rem;
    display: flex;
    justify-content: space-between;
    align-items: center;
    flex-wrap: wrap;
    gap: 1.5rem;
    border-top: 0.5px solid var(--border);
  }
  .vg-post-footer p {
    font-size: 0.85rem;
    color: var(--muted);
    font-weight: 300;
  }
  .vg-post-footer p strong { color: var(--ink); font-weight: 400; }
  .vg-source-link {
    display: inline-block;
    padding: 0.65rem 1.75rem;
    background: var(--ink);
    color: var(--paper);
    font-size: 0.72rem;
    letter-spacing: 0.12em;
    text-transform: uppercase;
    text-decoration: none;
    font-weight: 400;
    transition: background 0.2s;
  }
  .vg-source-link:hover { background: var(--teal); }

  /* ── SCROLL REVEAL ── */
  .vg-reveal {
    opacity: 0;
    transform: translateY(20px);
    transition: opacity 0.55s ease, transform 0.55s ease;
  }
  .vg-reveal.vg-visible { opacity: 1; transform: translateY(0); }
  .vg-d1 { transition-delay: 0.1s; }
  .vg-d2 { transition-delay: 0.2s; }
  .vg-d3 { transition-delay: 0.3s; }
</style>

<div class="vg-blog-wrap">

  <!-- HERO -->
  <div class="vg-post-hero">
    <div class="vg-post-hero-inner">
      <p class="vg-post-eyebrow">NLP · Machine Learning · Text Feature Engineering</p>
      <h1 class="vg-post-title">From Amazon Reviews to Numbers: A Hands-On Tour of <em>One-Hot, Bag of Words, and TF-IDF</em></h1>
      <div class="vg-post-meta">
        <p class="vg-meta-item">Corpus<span>128 real reviews</span></p>
        <p class="vg-meta-item">Techniques<span>OHE · BoW · TF-IDF</span></p>
        <p class="vg-meta-item">Stack<span>Python · sklearn · BeautifulSoup</span></p>
        <p class="vg-meta-item">Source<span>GitHub ↗</span></p>
      </div>
    </div>
  </div>

  <!-- INTRO BAND -->
  <div class="vg-intro-band">
    <p>How I took <strong>128 real Amazon product reviews</strong> and turned them into features a machine-learning model can actually chew on — and what I learned about where these classical techniques still shine in 2026.</p>
  </div>

  <!-- BODY -->
  <div class="vg-post-body">

    <!-- WHY CLASSICAL -->
    <div class="vg-step vg-reveal">
      <p class="vg-step-label">Context</p>
      <h2>Why bother with &#8220;classical&#8221; text features <em>at all?</em></h2>
      <p>If you have been anywhere near an LLM in the last two years, you have probably heard that &#8220;embeddings solved text.&#8221; They did — for a lot of problems. But if you are building a spam filter with 100k labelled examples, a BM25-powered search box, a cold-start classifier for a brand-new product line, or a compliance-audited system where a human needs to understand why the model fired — then Bag of Words and TF-IDF are still in the toolbox.</p>
      <p>They are <strong>fast, deterministic, interpretable,</strong> and an honest baseline you should always beat before reaching for a neural model.</p>
    </div>

    <hr class="vg-divider">

    <!-- DATA -->
    <div class="vg-step vg-reveal">
      <p class="vg-step-label">Step 1</p>
      <h2>Get real data — <em>not toy sentences</em></h2>
      <p>Every blog post on TF-IDF uses the same three cooked-up sentences about cats and dogs. I wanted the messiness of real user-generated content, so I wrote a BeautifulSoup scraper across ~20 popular ASINs — Echo Dots, AirPods Pro, Kindles, an Apple Watch, a Ninja blender, a PS5 controller, a Nespresso machine, and so on.</p>
      <div class="vg-stat-row">
        <div class="vg-stat-box vg-reveal vg-d1">
          <div class="vg-stat-n">128</div>
          <div class="vg-stat-l">Real Reviews</div>
        </div>
        <div class="vg-stat-box vg-reveal vg-d2">
          <div class="vg-stat-n">14</div>
          <div class="vg-stat-l">Products</div>
        </div>
        <div class="vg-stat-box vg-reveal vg-d3">
          <div class="vg-stat-n">3,461</div>
          <div class="vg-stat-l">Unique Tokens</div>
        </div>
      </div>
      <div class="vg-callout">
        <strong>Scraper gotchas:</strong> Set a real <code>User-Agent</code> header or Amazon returns a stripped page. Anchor on <code>[data-hook="review-body"]</code> inside <code>celwidget</code> blocks — not the <code>div[data-hook="review"]</code> wrapper on the dedicated reviews page. A few reviews came back in Spanish and Arabic — a lovely reminder that real data never matches the shape your slides promised.
      </div>
    </div>

    <hr class="vg-divider">

    <!-- CLEANING -->
    <div class="vg-step vg-reveal">
      <p class="vg-step-label">Step 2</p>
      <h2>Clean the text — <em>the boring part that matters most</em></h2>
      <p>A review like &#8220;I LOVE it!!! Sound is 🔥. Read more&#8221; is not something a counting-based model can work with. Each cleaning step kills a specific kind of noise:</p>
      <div class="vg-table-wrap">
        <table class="vg-table">
          <thead><tr><th>Step</th><th>What it kills</th><th>Why it matters</th></tr></thead>
          <tbody>
            <tr><td>Lowercase</td><td>LOVE vs love</td><td>Avoids vocabulary duplicates</td></tr>
            <tr><td>Drop &#8220;Read more&#8221;</td><td>Amazon truncation marker</td><td>Otherwise becomes one of the most frequent tokens</td></tr>
            <tr><td>Strip punctuation / digits</td><td>!!!, $199</td><td>They rarely help classical models</td></tr>
            <tr><td>Tokenize</td><td>—</td><td>Gives you units to count</td></tr>
            <tr><td>Remove stopwords</td><td>the, and, is</td><td>Appear in every document → no signal</td></tr>
            <tr><td>Lemmatize</td><td>speakers → speaker</td><td>Tightens the vocabulary</td></tr>
          </tbody>
        </table>
      </div>
      <p>After processing: <strong>11,138 tokens</strong> spanning a <strong>3,461-word vocabulary</strong>. Top words were exactly the product-review clichés you would expect — use, one, like, great, noise, sound, quality — a perfect sanity check.</p>
    </div>

    <hr class="vg-divider">

    <!-- THREE ENCODINGS -->
    <div class="vg-step vg-reveal">
      <p class="vg-step-label">Step 3</p>
      <h2>Three ways to turn text into <em>numbers</em></h2>
      <div class="vg-technique-grid">
        <div class="vg-technique-card ohe vg-reveal vg-d1">
          <h3>One-Hot Encoding</h3>
          <span class="vg-abbr">OHE · Binary presence</span>
          <p>For each review, build a binary vector over the whole vocabulary: 1 if the word appears, 0 otherwise. Simplest thing that works, easiest to explain to a non-technical stakeholder.</p>
          <p class="vg-weakness">⚠ Throws away frequency — &#8220;amazing&#8221; once and ten times look identical.</p>
        </div>
        <div class="vg-technique-card bow vg-reveal vg-d2">
          <h3>Bag of Words</h3>
          <span class="vg-abbr">BoW · CountVectorizer</span>
          <p>Same vector shape, but store actual counts. A review that hammers on &#8220;sound&#8221; three times ranks differently from one that drops the word once. Frequency-aware.</p>
          <p class="vg-weakness">⚠ Still order-blind — &#8220;not good, very bad&#8221; ≈ &#8220;good, not very bad&#8221;.</p>
        </div>
        <div class="vg-technique-card tfidf vg-reveal vg-d3">
          <h3>TF-IDF</h3>
          <span class="vg-abbr">TfidfVectorizer · The trick</span>
          <p>Take the BoW count and divide by how common the word is across the whole corpus. Generic words like &#8220;good&#8221; get pushed toward zero. Rare, distinctive words like &#8220;cancellation&#8221; stay loud.</p>
          <p class="vg-weakness">✓ Best signal for downstream classifiers.</p>
        </div>
      </div>
      <div class="vg-formula">
        <span class="vg-formula-label">TF-IDF Formula</span>
        tfidf(t, d) = tf(t, d) · log( N / (1 + df(t)) )
      </div>
      <p>In my corpus, the highest-IDF words were exactly the long-tail product features that appeared in just one review. The lowest-IDF words were the generic review vocabulary. That is the <strong>whole story of TF-IDF in one experiment.</strong></p>
    </div>

    <hr class="vg-divider">

    <!-- AHA MOMENT -->
    <div class="vg-step vg-reveal">
      <p class="vg-step-label">Step 4</p>
      <h2>The &#8220;aha&#8221; moment — <em>one review, three lenses</em></h2>
      <p>Encode the same review three times and print the top-weighted tokens:</p>
      <div class="vg-callout">
        <strong>OHE</strong> just lists every unique word in the review. No ranking.<br><br>
        <strong>BoW</strong> surfaces the most repeated words — almost always filler like <code>one</code>, <code>like</code>, <code>use</code>.<br><br>
        <strong>TF-IDF</strong> surfaces the words <em>this</em> review says that few others do. That is exactly what a downstream classifier wants to see.<br><br>
        Once you have seen this side-by-side even once, you stop reaching for plain BoW unless you have a very specific reason. (Naive Bayes is one — its underlying math prefers raw counts.)
      </div>
    </div>

    <hr class="vg-divider">

    <!-- SPARSITY -->
    <div class="vg-step vg-reveal">
      <p class="vg-step-label">Step 5</p>
      <h2>Sparsity — <em>the thing nobody warns you about</em></h2>
      <p>Every one of my three matrices came out <strong>~98.15% zeros.</strong> That is normal — reviews are short, vocabularies are long, and most words do not appear in most documents. Two huge practical implications:</p>
      <div class="vg-callout">
        <strong>Never store these dense.</strong> A 1-million-document × 200k-vocab corpus is a 200-billion-cell matrix. It must live in CSR or equivalent compressed form.<br><br>
        <strong>Classical pipelines do not scale forever.</strong> Once you are in the tens-of-millions-of-documents range, even sparse storage becomes painful — which is one reason industry moved to dense embedding pipelines for web-scale retrieval.
      </div>
    </div>

    <hr class="vg-divider">

    <!-- CLASSIFIER -->
    <div class="vg-step vg-reveal">
      <p class="vg-step-label">Step 6</p>
      <h2>A mini sentiment classifier — <em>and a class imbalance lesson</em></h2>
      <p>4–5 star = positive, 1–2 star = negative, 3-star dropped. Two models per feature set: Logistic Regression with <code>class_weight="balanced"</code> and Multinomial Naive Bayes.</p>
      <p>Headline accuracy looks great — <strong>~97% on the test split.</strong> But the test split has 31 positives and 1 negative. The interesting metric is recall on the negative class, and with only five one-star reviews in the whole corpus, no model is going to learn that cleanly. Amazon surfaces highly-rated reviews first, so any pipeline that scrapes top-of-page reviews inherits the same lopsided distribution.</p>
      <div class="vg-callout">
        <strong>TF-IDF</strong> gives Logistic Regression a small, consistent edge by silencing filler words.<br><br>
        <strong>Naive Bayes</strong> prefers raw BoW counts — rescaling with IDF can actually hurt it.<br><br>
        <strong>Never trust a single accuracy number on imbalanced data.</strong> Always print per-class precision/recall.
      </div>
    </div>

    <hr class="vg-divider">

    <!-- WHERE IT BREAKS -->
    <div class="vg-step vg-reveal">
      <p class="vg-step-label">Step 7</p>
      <h2>Where these techniques break — <em>and where they still win</em></h2>
      <div class="vg-table-wrap">
        <table class="vg-table">
          <thead><tr><th>Scenario</th><th>BoW / TF-IDF</th><th>Embeddings</th></tr></thead>
          <tbody>
            <tr>
              <td>Semantic similarity<br><em style="font-size:0.78rem;color:var(--muted)">&#8220;audio excellent&#8221; vs &#8220;sound great&#8221;</em></td>
              <td><span class="vg-chip vg-chip-gray">Zero shared tokens → fails</span></td>
              <td><span class="vg-chip vg-chip-green">Maps synonyms close ✓</span></td>
            </tr>
            <tr>
              <td>Negation<br><em style="font-size:0.78rem;color:var(--muted)">&#8220;battery lasts&#8221; vs &#8220;battery dies&#8221;</em></td>
              <td><span class="vg-chip vg-chip-gray">Near-identical vectors → fails</span></td>
              <td><span class="vg-chip vg-chip-green">Directional context ✓</span></td>
            </tr>
            <tr>
              <td>Interpretability</td>
              <td><span class="vg-chip vg-chip-green">Each feature is a word ✓</span></td>
              <td><span class="vg-chip vg-chip-amber">1024-dim black box</span></td>
            </tr>
            <tr>
              <td>Training speed</td>
              <td><span class="vg-chip vg-chip-green">Millions of docs, minutes, laptop ✓</span></td>
              <td><span class="vg-chip vg-chip-amber">GPU required at scale</span></td>
            </tr>
            <tr>
              <td>Exact keyword / ID retrieval</td>
              <td><span class="vg-chip vg-chip-green">BM25 still wins ✓</span></td>
              <td><span class="vg-chip vg-chip-amber">Can miss rare tokens</span></td>
            </tr>
            <tr>
              <td>Cold start (zero labels)</td>
              <td><span class="vg-chip vg-chip-green">Cosine sim on day one ✓</span></td>
              <td><span class="vg-chip vg-chip-amber">Needs fine-tuning data</span></td>
            </tr>
          </tbody>
        </table>
      </div>
    </div>

  </div><!-- /vg-post-body -->

  <!-- KEY TAKEAWAYS -->
  <div class="vg-takeaways-section">
    <p class="vg-section-eyebrow">Summary</p>
    <h2>Key <em>takeaways</em></h2>
    <div class="vg-takeaways-grid">
      <div class="vg-takeaway-card vg-reveal" data-num="01">
        <h4>Preprocessing is 80% of the game</h4>
        <p>Before you touch any encoder, understand exactly what &#8220;a token&#8221; means in your corpus. Lowercase, stopwords, lemmatization — each step has a specific purpose.</p>
      </div>
      <div class="vg-takeaway-card vg-reveal vg-d1" data-num="02">
        <h4>Always inspect a single document&#8217;s top features</h4>
        <p>It is the fastest way to develop intuition about what your encoding is actually rewarding. Print OHE vs BoW vs TF-IDF side-by-side at least once.</p>
      </div>
      <div class="vg-takeaway-card vg-reveal vg-d2" data-num="03">
        <h4>Watch sparsity and class imbalance</h4>
        <p>Both will bite you long before modelling choices do. Use CSR storage. Never trust a single accuracy number on skewed data — always check per-class recall.</p>
      </div>
      <div class="vg-takeaway-card vg-reveal vg-d3" data-num="04">
        <h4>Know why you would pick the classical tool</h4>
        <p>If your answer is only &#8220;because it is in every tutorial&#8221;, reach for an embedding model. If your answer is &#8220;interpretability and speed&#8221; — BoW/TF-IDF are still excellent choices.</p>
      </div>
    </div>
  </div>

  <!-- INTERVIEW CHEAT SHEET -->
  <div class="vg-interview-section">
    <p class="vg-section-eyebrow">Interview Prep</p>
    <h2>Cheat sheet — <em>quick definitions to remember</em></h2>
    <div class="vg-qa-list">

      <div class="vg-qa-item vg-reveal">
        <div class="vg-qa-q">
          <span class="vg-q-badge">Define</span><br>
          What is One-Hot Encoding in NLP?
        </div>
        <div class="vg-qa-a">
          <strong>Binary presence vector</strong> over the vocabulary. 1 if the word appears in the document, 0 otherwise. No frequency, no order. Size = vocabulary length.
          <div class="vg-memory-row">
            <span class="vg-memory-pill">Binary: 0 or 1</span>
            <span class="vg-memory-pill">Ignores frequency</span>
            <span class="vg-memory-pill amber">Simplest encoder</span>
          </div>
        </div>
      </div>

      <div class="vg-qa-item vg-reveal vg-d1">
        <div class="vg-qa-q">
          <span class="vg-q-badge">Define</span><br>
          What is Bag of Words?
        </div>
        <div class="vg-qa-a">
          <strong>Word count vector</strong> over the vocabulary. Stores how many times each word appears. Frequency-aware but order-blind — treats a document as an unordered bag of tokens.
          <div class="vg-memory-row">
            <span class="vg-memory-pill">Counts, not binary</span>
            <span class="vg-memory-pill">Order-blind</span>
            <span class="vg-memory-pill amber">CountVectorizer in sklearn</span>
          </div>
        </div>
      </div>

      <div class="vg-qa-item vg-reveal vg-d2">
        <div class="vg-qa-q">
          <span class="vg-q-badge">Define</span><br>
          What is TF-IDF and why does it outperform BoW?
        </div>
        <div class="vg-qa-a">
          <strong>Term Frequency × Inverse Document Frequency.</strong> Scales BoW counts down for words that appear in many documents. Words like &#8220;good&#8221; that are everywhere get suppressed; rare words that are distinctive get amplified. Formula: <code>tf(t,d) · log(N / (1 + df(t)))</code>
          <div class="vg-memory-row">
            <span class="vg-memory-pill teal">Rewards rarity</span>
            <span class="vg-memory-pill teal">Penalises ubiquity</span>
            <span class="vg-memory-pill">TfidfVectorizer</span>
          </div>
        </div>
      </div>

      <div class="vg-qa-item vg-reveal">
        <div class="vg-qa-q">
          <span class="vg-q-badge">Compare</span><br>
          When would you use BoW over TF-IDF?
        </div>
        <div class="vg-qa-a">
          Use raw BoW counts with <strong>Naive Bayes</strong> — its probability estimates are count-based; IDF rescaling can hurt it. Otherwise, TF-IDF almost always gives a better signal for classifiers.
          <div class="vg-memory-row">
            <span class="vg-memory-pill amber">Naive Bayes → BoW</span>
            <span class="vg-memory-pill teal">Logistic Regression → TF-IDF</span>
          </div>
        </div>
      </div>

      <div class="vg-qa-item vg-reveal vg-d1">
        <div class="vg-qa-q">
          <span class="vg-q-badge">Gotcha</span><br>
          What is sparsity and why does it matter?
        </div>
        <div class="vg-qa-a">
          A BoW/TF-IDF matrix is typically <strong>95–99% zeros</strong> because documents are short and vocabularies are large. Always store in <strong>sparse format (CSR)</strong> — a dense matrix of 1M docs × 200k vocab = 200B cells, which won&#8217;t fit in RAM.
          <div class="vg-memory-row">
            <span class="vg-memory-pill">98% zeros = normal</span>
            <span class="vg-memory-pill amber">Always use CSR format</span>
          </div>
        </div>
      </div>

      <div class="vg-qa-item vg-reveal vg-d2">
        <div class="vg-qa-q">
          <span class="vg-q-badge">Weakness</span><br>
          What can&#8217;t BoW/TF-IDF do that embeddings can?
        </div>
        <div class="vg-qa-a">
          They are <strong>lexical, not semantic.</strong> &#8220;Audio is excellent&#8221; and &#8220;sound is great&#8221; share zero tokens → zero similarity. &#8220;Battery lasts&#8221; and &#8220;battery dies&#8221; share most tokens → high similarity. Embeddings fix both by mapping meaning, not just words.
          <div class="vg-memory-row">
            <span class="vg-memory-pill">No synonyms</span>
            <span class="vg-memory-pill">No negation</span>
            <span class="vg-memory-pill teal">Use embeddings for semantics</span>
          </div>
        </div>
      </div>

      <div class="vg-qa-item vg-reveal">
        <div class="vg-qa-q">
          <span class="vg-q-badge">Use Case</span><br>
          When do classical methods still win in 2026?
        </div>
        <div class="vg-qa-a">
          <strong>4 scenarios where BoW/TF-IDF beat neural alternatives:</strong> (1) exact-match / keyword search — BM25 still outperforms embeddings for identifier queries; (2) interpretability requirements; (3) training speed at millions of documents on a laptop; (4) cold-start with zero labelled data.
          <div class="vg-memory-row">
            <span class="vg-memory-pill teal">BM25 search</span>
            <span class="vg-memory-pill teal">Interpretability</span>
            <span class="vg-memory-pill teal">Cold start</span>
            <span class="vg-memory-pill teal">Speed</span>
          </div>
        </div>
      </div>

    </div>
  </div>

  <!-- FOOTER CTA -->
  <div class="vg-post-footer">
    <p>Full pipeline — scraper to classifier — in the <strong>GenAI Mastery Series</strong> source repo.</p>
    <a href="https://github.com/vijaygokarn130/ml-classic-concepts" class="vg-source-link" target="_blank" rel="noopener noreferrer">View Source on GitHub ↗</a>
  </div>

</div><!-- /vg-blog-wrap -->

<script>
(function(){
  // Scroll-reveal: elements carrying .vg-reveal start at opacity:0 and gain
  // .vg-visible when they enter the viewport (threshold 8% visible).
  var targets = document.querySelectorAll('.vg-reveal');
  // Graceful degradation: without IntersectionObserver the observer code would
  // throw and every .vg-reveal element would stay permanently invisible, so
  // reveal everything immediately instead.
  if (!('IntersectionObserver' in window)) {
    targets.forEach(function(el){ el.classList.add('vg-visible'); });
    return;
  }
  var obs = new IntersectionObserver(function(entries){
    entries.forEach(function(entry){
      if (entry.isIntersecting) {
        entry.target.classList.add('vg-visible');
        obs.unobserve(entry.target); // one-shot reveal; no need to keep observing
      }
    });
  }, {threshold: 0.08});
  targets.forEach(function(el){ obs.observe(el); });
})();
</script>
<p>The post <a href="https://vijay-gokarn.com/from-amazon-reviews-to-numbers-a-hands-on-tour-of-one-hot-bag-of-words-and-tf-idf/">From Amazon Reviews to Numbers: A Hands-On Tour of One-Hot, Bag of Words, and TF-IDF</a> appeared first on <a href="https://vijay-gokarn.com">Vijay Gokarn</a>.</p>
]]></content:encoded>
					
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">263</post-id>	</item>
		<item>
		<title>Analyzing Wikipedia Articles with Langchain and OpenAI in Databricks</title>
		<link>https://vijay-gokarn.com/analyzing-wikipedia-articles-with-langchain-and-openai-in-databricks/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=analyzing-wikipedia-articles-with-langchain-and-openai-in-databricks</link>
		
		<dc:creator><![CDATA[Vijay Gokarn]]></dc:creator>
		<pubDate>Tue, 16 Jul 2024 10:59:28 +0000</pubDate>
				<category><![CDATA[ai-agents]]></category>
		<category><![CDATA[databricks]]></category>
		<category><![CDATA[food]]></category>
		<category><![CDATA[generative-ai]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[pandas]]></category>
		<guid isPermaLink="false">https://vijay-gokarn.com/?p=140</guid>

					<description><![CDATA[<p>GenAI Mastery Series · NLP · Databricks · LangChain Categorizing Wikipedia at Scale with OpenAI, LangChain &#038; Databricks Datasetwikimedia/wikipedia · 10,000 articles ModelChatOpenAI (GPT-4) Output50-category JSON classifier Stack Databricks Notebook LangChain Core langchain_openai HuggingFace Datasets ChatPromptTemplate Batch Inference JSON Parsing A complete walkthrough of a large-scale text classification pipeline built inside a Databricks notebook — [&#8230;]</p>
<p>The post <a href="https://vijay-gokarn.com/analyzing-wikipedia-articles-with-langchain-and-openai-in-databricks/">Analyzing Wikipedia Articles with Langchain and OpenAI in Databricks</a> appeared first on <a href="https://vijay-gokarn.com">Vijay Gokarn</a>.</p>
]]></description>
										<content:encoded><![CDATA[
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:ital,wght@0,300;0,400;0,600;1,300;1,400&#038;family=DM+Sans:wght@300;400;500&#038;family=DM+Mono:wght@400&#038;display=swap" rel="stylesheet">

<style>
.vg6 {
  --ink: #0e0e0e; --paper: #f7f4ef; --paper-dark: #ede9e1;
  --teal: #0f6e56; --teal-light: #1d9e75; --teal-muted: #e1f5ee;
  --amber: #ba7517; --amber-light: #fac775; --amber-muted: #faeeda;
  --charcoal: #2c2c2a; --muted: #888780;
  --border: rgba(14,14,14,0.12); --border-strong: rgba(14,14,14,0.25);
  --db-red: #e8353a; --db-dark: #1b1f23;
  font-family: 'DM Sans', sans-serif; font-weight: 300;
  color: var(--ink); background: var(--paper); line-height: 1.75; font-size: 16px; overflow-x: hidden;
}
.vg6 *, .vg6 *::before, .vg6 *::after { box-sizing: border-box; margin: 0; padding: 0; }

/* HERO */
.vg6-hero { background: var(--db-dark); padding: 5rem 4rem 4rem; position: relative; overflow: hidden; }
.vg6-hero::before {
  content: '{ }'; font-family: 'Cormorant Garamond', serif; font-size: 18rem;
  font-weight: 300; color: rgba(255,255,255,0.025); position: absolute;
  right: 0rem; bottom: -4rem; line-height: 1; pointer-events: none; letter-spacing: -0.05em;
}
.vg6-hero-inner { position: relative; z-index: 1; max-width: 900px; }
.vg6-eyebrow { font-size: 0.68rem; letter-spacing: 0.22em; text-transform: uppercase; color: var(--teal-light); font-weight: 500; margin-bottom: 1.25rem; display: flex; align-items: center; gap: 0.75rem; }
.vg6-eyebrow::before { content: ''; display: inline-block; width: 1.5rem; height: 1px; background: var(--teal-light); }
.vg6-hero h1 { font-family: 'Cormorant Garamond', serif; font-size: clamp(2.2rem, 5vw, 3.8rem); font-weight: 300; line-height: 1.1; color: var(--paper); letter-spacing: -0.02em; margin-bottom: 1.5rem; max-width: 28ch; }
.vg6-hero h1 em { font-style: italic; color: var(--amber-light); }
.vg6-meta-row { display: flex; gap: 2rem; flex-wrap: wrap; }
.vg6-meta { font-size: 0.7rem; letter-spacing: 0.1em; text-transform: uppercase; color: rgba(247,244,239,0.35); }
.vg6-meta span { color: rgba(247,244,239,0.7); margin-left: 0.4rem; }

/* STACK BAND */
.vg6-stack-band { background: var(--db-red); padding: 1.1rem 4rem; display: flex; gap: 0.75rem; flex-wrap: wrap; align-items: center; }
.vg6-stack-label { font-size: 0.63rem; letter-spacing: 0.18em; text-transform: uppercase; color: rgba(255,255,255,0.6); font-weight: 400; margin-right: 0.4rem; }
.vg6-stack-pill { font-size: 0.7rem; letter-spacing: 0.05em; padding: 0.28rem 0.85rem; background: rgba(255,255,255,0.12); color: #fff; border: 0.5px solid rgba(255,255,255,0.2); }

/* INTRO */
.vg6-intro { background: var(--teal-muted); padding: 2.5rem 4rem; border-left: 4px solid var(--teal); }
.vg6-intro p { font-size: 1.05rem; line-height: 1.85; color: var(--charcoal); font-weight: 300; max-width: 80ch; }
.vg6-intro strong { color: var(--teal); font-weight: 500; }

/* PREREQS */
.vg6-prereqs { background: var(--paper-dark); padding: 2rem 4rem; display: flex; gap: 2rem; flex-wrap: wrap; align-items: center; border-bottom: 0.5px solid var(--border); }
.vg6-prereq-label { font-size: 0.63rem; letter-spacing: 0.18em; text-transform: uppercase; color: var(--muted); font-weight: 500; flex-shrink: 0; }
.vg6-prereq-chips { display: flex; gap: 0.6rem; flex-wrap: wrap; }
.vg6-prereq-chip { font-size: 0.72rem; padding: 0.3rem 0.9rem; border: 0.5px solid var(--border-strong); color: var(--charcoal); background: var(--paper); display: flex; align-items: center; gap: 0.4rem; }
.vg6-prereq-chip::before { content: '✓'; color: var(--teal); font-size: 0.65rem; font-weight: 600; }

/* BODY */
.vg6-body { max-width: 900px; margin: 0 auto; padding: 4rem; }
.vg6-step { margin-bottom: 3.5rem; }
.vg6-step-label { font-size: 0.63rem; letter-spacing: 0.22em; text-transform: uppercase; color: var(--teal); font-weight: 500; margin-bottom: 0.5rem; display: flex; align-items: center; gap: 0.6rem; }
.vg6-step-label::before { content: ''; display: inline-block; width: 1.25rem; height: 1px; background: var(--teal); }
.vg6-step h2 { font-family: 'Cormorant Garamond', serif; font-size: clamp(1.4rem, 3vw, 2rem); font-weight: 300; line-height: 1.2; color: var(--ink); margin-bottom: 1rem; }
.vg6-step h2 em { font-style: italic; color: var(--teal); }
.vg6-step p { font-size: 0.93rem; line-height: 1.9; color: var(--charcoal); font-weight: 300; margin-bottom: 1rem; }
.vg6-step p strong { color: var(--ink); font-weight: 500; }
.vg6-divider { border: none; border-top: 0.5px solid var(--border); margin: 3rem 0; }

/* CALLOUT */
.vg6-callout { background: var(--paper-dark); border-left: 3px solid var(--amber); padding: 1.25rem 1.5rem; margin: 1.25rem 0; font-size: 0.87rem; line-height: 1.8; color: var(--charcoal); }
.vg6-callout strong { color: var(--amber); font-weight: 500; }
.vg6-callout.teal { border-color: var(--teal); }
.vg6-callout.teal strong { color: var(--teal); }

/* ── CODE BLOCKS ── */
.vg6-code-wrap { margin: 1.25rem 0; border: 0.5px solid rgba(255,255,255,0.06); overflow: hidden; }
.vg6-code-header { background: #2d333b; padding: 0.6rem 1.25rem; display: flex; justify-content: space-between; align-items: center; border-bottom: 0.5px solid rgba(255,255,255,0.06); }
.vg6-code-filename { font-family: 'DM Mono', monospace; font-size: 0.68rem; color: rgba(247,244,239,0.45); letter-spacing: 0.04em; }
.vg6-code-lang { font-size: 0.6rem; letter-spacing: 0.14em; text-transform: uppercase; color: var(--teal-light); font-weight: 500; }
.vg6-code-body { background: var(--db-dark); padding: 1.5rem; overflow-x: auto; }
.vg6-code-body pre { margin: 0; }
.vg6-code-body code { font-family: 'DM Mono', monospace; font-size: 0.82rem; line-height: 1.85; color: #e6edf3; white-space: pre; display: block; }

/* Syntax token colours */
.vg6-k  { color: #ff7b72; }   /* keyword: import, def, for, if */
.vg6-s  { color: #a5d6ff; }   /* string */
.vg6-c  { color: #8b949e; font-style: italic; } /* comment */
.vg6-f  { color: #d2a8ff; }   /* function / class call */
.vg6-n  { color: var(--amber-light); } /* number / constant */
.vg6-v  { color: #79c0ff; }   /* variable name */
.vg6-p  { color: #e6edf3; }   /* punctuation */
.vg6-m  { color: var(--teal-light); } /* magic / decorator */

/* ARCHITECTURE DIAGRAM */
.vg6-arch { display: flex; align-items: center; gap: 0; margin: 1.5rem 0; flex-wrap: wrap; }
.vg6-arch-box { background: var(--paper); border: 0.5px solid var(--border-strong); padding: 0.75rem 1.1rem; text-align: center; flex: 1; min-width: 100px; }
.vg6-arch-box .vg6-arch-icon { font-size: 1.25rem; margin-bottom: 0.25rem; }
.vg6-arch-box h5 { font-family: 'Cormorant Garamond', serif; font-size: 0.95rem; font-weight: 400; color: var(--ink); margin-bottom: 0.15rem; }
.vg6-arch-box p { font-size: 0.68rem; color: var(--muted); line-height: 1.4; font-weight: 300; }
.vg6-arch-box.highlight { background: var(--teal); border-color: var(--teal); }
.vg6-arch-box.highlight h5 { color: var(--paper); }
.vg6-arch-box.highlight p { color: rgba(247,244,239,0.6); }
.vg6-arch-arrow { font-size: 1rem; color: var(--muted); padding: 0 0.3rem; flex-shrink: 0; }

/* OUTPUT CARD */
.vg6-output-section { background: var(--db-dark); padding: 4rem; }
.vg6-output-eyebrow { font-size: 0.65rem; letter-spacing: 0.22em; text-transform: uppercase; color: var(--amber-light); font-weight: 500; margin-bottom: 0.5rem; display: flex; align-items: center; gap: 0.6rem; }
.vg6-output-eyebrow::before { content: ''; display: inline-block; width: 1.25rem; height: 1px; background: var(--amber-light); }
.vg6-output-section > h2 { font-family: 'Cormorant Garamond', serif; font-size: clamp(1.6rem, 3vw, 2.2rem); font-weight: 300; color: var(--paper); margin-bottom: 2rem; }
.vg6-output-section > h2 em { font-style: italic; color: var(--amber-light); }
.vg6-output-grid { display: grid; grid-template-columns: repeat(4, 1fr); gap: 1px; background: rgba(247,244,239,0.06); border: 0.5px solid rgba(247,244,239,0.06); margin-bottom: 2rem; }
.vg6-output-stat { background: var(--db-dark); padding: 1.5rem; }
.vg6-output-stat-n { font-family: 'Cormorant Garamond', serif; font-size: 2.2rem; font-weight: 300; color: var(--teal-light); line-height: 1; margin-bottom: 0.3rem; letter-spacing: -0.02em; }
.vg6-output-stat-l { font-size: 0.65rem; letter-spacing: 0.12em; text-transform: uppercase; color: rgba(247,244,239,0.35); }
.vg6-json-card { background: #161b22; border: 0.5px solid rgba(247,244,239,0.08); overflow: hidden; }
.vg6-json-header { background: #2d333b; padding: 0.55rem 1.25rem; display: flex; justify-content: space-between; }
.vg6-json-header span { font-family: 'DM Mono', monospace; font-size: 0.65rem; color: rgba(247,244,239,0.4); }
.vg6-json-header .vg6-json-tag { color: var(--teal-light); }
.vg6-json-body { padding: 1.5rem; font-family: 'DM Mono', monospace; font-size: 0.83rem; line-height: 1.9; color: #e6edf3; }
.vg6-json-key { color: var(--amber-light); }
.vg6-json-val-str { color: #a5d6ff; }
.vg6-json-val-num { color: #79c0ff; }
.vg6-json-punct { color: rgba(247,244,239,0.4); }

/* CATEGORIES CLOUD */
.vg6-categories { display: flex; flex-wrap: wrap; gap: 0.5rem; margin: 1.5rem 0; }
.vg6-cat-pill { font-size: 0.7rem; letter-spacing: 0.05em; padding: 0.3rem 0.85rem; border: 0.5px solid rgba(247,244,239,0.12); color: rgba(247,244,239,0.55); }
.vg6-cat-pill.active { border-color: var(--teal-light); color: var(--teal-light); background: rgba(29,158,117,0.1); }

/* INTERVIEW */
.vg6-interview-section { background: var(--teal-muted); padding: 4rem; }
.vg6-interview-eyebrow { font-size: 0.65rem; letter-spacing: 0.22em; text-transform: uppercase; color: var(--teal); font-weight: 500; margin-bottom: 0.5rem; display: flex; align-items: center; gap: 0.6rem; }
.vg6-interview-eyebrow::before { content: ''; display: inline-block; width: 1.25rem; height: 1px; background: var(--teal); }
.vg6-interview-section > h2 { font-family: 'Cormorant Garamond', serif; font-size: clamp(1.6rem, 3vw, 2.4rem); font-weight: 300; color: var(--ink); margin-bottom: 2.5rem; }
.vg6-interview-section > h2 em { font-style: italic; color: var(--teal); }
.vg6-qa-list { display: flex; flex-direction: column; }
.vg6-qa-item { display: grid; grid-template-columns: 1fr 1.4fr; gap: 2rem; padding: 1.5rem 0; border-top: 0.5px solid rgba(14,14,14,0.1); align-items: start; }
.vg6-qa-item:last-child { border-bottom: 0.5px solid rgba(14,14,14,0.1); }
.vg6-qa-q { font-family: 'Cormorant Garamond', serif; font-size: 1.05rem; font-weight: 400; color: var(--ink); line-height: 1.4; }
.vg6-q-badge { font-family: 'DM Mono', monospace; font-size: 0.58rem; letter-spacing: 0.1em; text-transform: uppercase; background: var(--teal); color: var(--paper); padding: 0.15rem 0.5rem; margin-bottom: 0.5rem; display: inline-block; }
.vg6-qa-a { font-size: 0.83rem; line-height: 1.8; color: var(--charcoal); font-weight: 300; }
.vg6-qa-a strong { color: var(--teal); font-weight: 500; }
.vg6-qa-a code { font-family: 'DM Mono', monospace; font-size: 0.77rem; background: rgba(14,14,14,0.07); padding: 0.1rem 0.35rem; color: var(--ink); }
.vg6-pills { display: flex; flex-wrap: wrap; gap: 0.5rem; margin-top: 0.75rem; }
.vg6-pill { font-size: 0.67rem; letter-spacing: 0.06em; padding: 0.25rem 0.75rem; border: 0.5px solid var(--border-strong); color: var(--charcoal); }
.vg6-pill.t { border-color: var(--teal); color: var(--teal); background: var(--teal-muted); }
.vg6-pill.a { border-color: var(--amber); color: var(--amber); background: var(--amber-muted); }

/* FOOTER */
.vg6-footer { background: var(--db-dark); padding: 3rem 4rem; display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 1.5rem; border-top: 0.5px solid rgba(247,244,239,0.06); }
.vg6-footer p { font-size: 0.82rem; color: rgba(247,244,239,0.35); font-weight: 300; }
.vg6-footer p strong { color: rgba(247,244,239,0.65); font-weight: 400; }
.vg6-footer-links { display: flex; gap: 1rem; }
.vg6-footer-btn { display: inline-block; padding: 0.65rem 1.75rem; font-size: 0.7rem; letter-spacing: 0.12em; text-transform: uppercase; text-decoration: none; font-weight: 400; }
.vg6-footer-btn.primary { background: var(--teal); color: var(--paper); }
.vg6-footer-btn.ghost { background: transparent; color: rgba(247,244,239,0.55); border: 0.5px solid rgba(247,244,239,0.2); }

/* REVEAL */
.vg6-reveal { opacity: 0; transform: translateY(20px); transition: opacity 0.55s ease, transform 0.55s ease; }
.vg6-reveal.vg6-vis { opacity: 1; transform: translateY(0); }
.vg6-d1 { transition-delay: 0.1s; } .vg6-d2 { transition-delay: 0.2s; }
</style>

<div class="vg6">

<!-- HERO -->
<div class="vg6-hero">
  <div class="vg6-hero-inner">
    <p class="vg6-eyebrow">GenAI Mastery Series · NLP · Databricks · LangChain</p>
    <h1>Categorizing Wikipedia at Scale with <em>OpenAI, LangChain &#038; Databricks</em></h1>
    <div class="vg6-meta-row">
      <p class="vg6-meta">Dataset<span>wikimedia/wikipedia · 10,000 articles</span></p>
      <p class="vg6-meta">Model<span>ChatOpenAI (GPT-4)</span></p>
      <p class="vg6-meta">Output<span>50-category JSON classifier</span></p>
    </div>
  </div>
</div>

<!-- STACK BAND -->
<div class="vg6-stack-band">
  <span class="vg6-stack-label">Stack</span>
  <span class="vg6-stack-pill">Databricks Notebook</span>
  <span class="vg6-stack-pill">LangChain Core</span>
  <span class="vg6-stack-pill">langchain_openai</span>
  <span class="vg6-stack-pill">HuggingFace Datasets</span>
  <span class="vg6-stack-pill">ChatPromptTemplate</span>
  <span class="vg6-stack-pill">Batch Inference</span>
  <span class="vg6-stack-pill">JSON Parsing</span>
</div>

<!-- INTRO -->
<div class="vg6-intro">
  <p>A complete walkthrough of a <strong>large-scale text classification pipeline</strong> built inside a Databricks notebook — from loading 10,000 Wikipedia articles to batch-classifying a 1,000-article subset into 50 categories using OpenAI&#8217;s language model via LangChain. Every step includes the real working code.</p>
</div>

<!-- PREREQS -->
<div class="vg6-prereqs">
  <span class="vg6-prereq-label">Prerequisites</span>
  <div class="vg6-prereq-chips">
    <span class="vg6-prereq-chip">Databricks Account</span>
    <span class="vg6-prereq-chip">Python (basic)</span>
    <span class="vg6-prereq-chip">OpenAI API Key</span>
    <span class="vg6-prereq-chip">HuggingFace Access</span>
  </div>
</div>

<!-- BODY -->
<div class="vg6-body">

  <!-- ARCHITECTURE -->
  <div class="vg6-step vg6-reveal">
    <p class="vg6-step-label">Overview</p>
    <h2>Pipeline <em>architecture</em></h2>
    <p>The full pipeline runs end-to-end inside a single Databricks notebook. Wikipedia articles are loaded from HuggingFace, cleaned to first-line summaries, batched, and sent to an OpenAI chat model (GPT-4, or the <code>ChatOpenAI</code> default) via LangChain&#8217;s chain interface. Responses are parsed from JSON into a DataFrame.</p>
    <div class="vg6-arch">
      <div class="vg6-arch-box"><div class="vg6-arch-icon">📦</div><h5>HuggingFace</h5><p>wikimedia/wikipedia dataset</p></div>
      <div class="vg6-arch-arrow">→</div>
      <div class="vg6-arch-box"><div class="vg6-arch-icon">✂️</div><h5>Clean</h5><p>First-line extraction</p></div>
      <div class="vg6-arch-arrow">→</div>
      <div class="vg6-arch-box highlight"><div class="vg6-arch-icon" style="color:var(--paper)">⛓</div><h5>LangChain</h5><p>Prompt + ChatOpenAI</p></div>
      <div class="vg6-arch-arrow">→</div>
      <div class="vg6-arch-box"><div class="vg6-arch-icon">🔄</div><h5>Batch (8)</h5><p>Rate-limit safe</p></div>
      <div class="vg6-arch-arrow">→</div>
      <div class="vg6-arch-box"><div class="vg6-arch-icon">📊</div><h5>DataFrame</h5><p>id + category</p></div>
    </div>
  </div>

  <hr class="vg6-divider">

  <!-- STEP 1 -->
  <div class="vg6-step vg6-reveal">
    <p class="vg6-step-label">Step 1</p>
    <h2>Install <em>required packages</em></h2>
    <p>In a Databricks notebook, use <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">%pip</code> magic commands to install packages into the cluster. The <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">%restart_python</code> command refreshes the interpreter to pick up the new packages without restarting the whole cluster.</p>
    <div class="vg6-code-wrap">
      <div class="vg6-code-header"><span class="vg6-code-filename">Databricks Notebook — Cell 1</span><span class="vg6-code-lang">Python / Magic</span></div>
      <div class="vg6-code-body"><code><span class="vg6-m">%pip install</span> langchain_openai
<span class="vg6-m">%pip install</span> <span class="vg6-v">--upgrade</span> langchain_core langchain_openai

<span class="vg6-m">%restart_python</span></code></div>
    </div>
  </div>

  <hr class="vg6-divider">

  <!-- STEP 2 -->
  <div class="vg6-step vg6-reveal">
    <p class="vg6-step-label">Step 2</p>
    <h2>Import <em>libraries</em></h2>
    <p>Standard Python utilities (<code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">json</code>, <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">time</code>, <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">os</code>) combined with <strong>LangChain</strong> for the LLM interface, <strong>HuggingFace Datasets</strong> for Wikipedia data loading, and <strong>tqdm</strong> for progress visibility during batch processing.</p>
    <div class="vg6-code-wrap">
      <div class="vg6-code-header"><span class="vg6-code-filename">Databricks Notebook — Cell 2</span><span class="vg6-code-lang">Python</span></div>
      <div class="vg6-code-body"><code><span class="vg6-k">import</span> json
<span class="vg6-k">import</span> time
<span class="vg6-k">import</span> os
<span class="vg6-k">import</span> getpass
<span class="vg6-k">import</span> pandas <span class="vg6-k">as</span> pd

<span class="vg6-k">from</span> datasets <span class="vg6-k">import</span> Dataset, load_dataset
<span class="vg6-k">from</span> tqdm <span class="vg6-k">import</span> tqdm
<span class="vg6-k">from</span> langchain_core.prompts <span class="vg6-k">import</span> ChatPromptTemplate
<span class="vg6-k">from</span> langchain_openai <span class="vg6-k">import</span> ChatOpenAI</code></div>
    </div>
  </div>

  <hr class="vg6-divider">

  <!-- STEP 3 -->
  <div class="vg6-step vg6-reveal">
    <p class="vg6-step-label">Step 3</p>
    <h2>Load &#038; clean <em>the dataset</em></h2>
    <p>The HuggingFace <strong>wikimedia/wikipedia</strong> dataset is massive — we take a 10,000 article slice from the English November 2023 snapshot. The cleaning step extracts only the first line of each article (the summary sentence), which is sufficient for category classification and drastically reduces token usage.</p>
    <div class="vg6-code-wrap">
      <div class="vg6-code-header"><span class="vg6-code-filename">Databricks Notebook — Cell 3</span><span class="vg6-code-lang">Python</span></div>
      <div class="vg6-code-body"><code><span class="vg6-c"># Load the Wikipedia English dataset (Nov 2023 snapshot)</span>
dataset = <span class="vg6-f">load_dataset</span>(<span class="vg6-s">"wikimedia/wikipedia"</span>, <span class="vg6-s">"20231101.en"</span>)

<span class="vg6-c"># Take a 10k article sample</span>
<span class="vg6-v">NUM_SAMPLES</span> = <span class="vg6-n">10000</span>
articles = dataset[<span class="vg6-s">"train"</span>][:<span class="vg6-v">NUM_SAMPLES</span>][<span class="vg6-s">"text"</span>]
ids      = dataset[<span class="vg6-s">"train"</span>][:<span class="vg6-v">NUM_SAMPLES</span>][<span class="vg6-s">"id"</span>]

<span class="vg6-c"># Clean: keep only the first line (article summary) to reduce tokens</span>
articles = [x.<span class="vg6-f">split</span>(<span class="vg6-s">"\n"</span>)[<span class="vg6-n">0</span>] <span class="vg6-k">for</span> x <span class="vg6-k">in</span> articles]

<span class="vg6-c"># Sanity check</span>
<span class="vg6-f">print</span>(<span class="vg6-f">len</span>(articles))   <span class="vg6-c"># → 10000</span>
<span class="vg6-f">print</span>(articles[<span class="vg6-n">99</span>])    <span class="vg6-c"># inspect a sample article</span></code></div>
    </div>
    <div class="vg6-callout teal">
      <strong>Why first line only?</strong> Wikipedia article summaries are dense and self-contained. Using the full article would cost ~10–50x more tokens per classification with minimal accuracy gain. At 10k articles × avg 150 tokens = ~1.5M input tokens — already significant. First-line only brings that to ~200k tokens.
    </div>
  </div>

  <hr class="vg6-divider">

  <!-- STEP 4 -->
  <div class="vg6-step vg6-reveal">
    <p class="vg6-step-label">Step 4</p>
    <h2>Configure <em>OpenAI + LangChain</em></h2>
    <p>Use <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">getpass</code> to securely prompt for the API key without echoing it to the notebook output. Then initialize <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">ChatOpenAI</code> — LangChain&#8217;s wrapper around the OpenAI Chat Completions API.</p>
    <div class="vg6-code-wrap">
      <div class="vg6-code-header"><span class="vg6-code-filename">Databricks Notebook — Cell 4 &#038; 5</span><span class="vg6-code-lang">Python</span></div>
      <div class="vg6-code-body"><code><span class="vg6-c"># Securely enter API key (won't echo to notebook output)</span>
os.environ[<span class="vg6-s">"OPENAI_API_KEY"</span>] = getpass.<span class="vg6-f">getpass</span>(<span class="vg6-s">"Enter your OpenAI API key: "</span>)

<span class="vg6-c"># Initialize the LangChain ChatOpenAI wrapper</span>
llm = <span class="vg6-f">ChatOpenAI</span>()
<span class="vg6-f">print</span>(llm.model_name)  <span class="vg6-c"># → "gpt-3.5-turbo" (default) or your configured model</span></code></div>
    </div>
  </div>

  <hr class="vg6-divider">

  <!-- STEP 5 -->
  <div class="vg6-step vg6-reveal">
    <p class="vg6-step-label">Step 5 — Core Logic</p>
    <h2>Define the <em>prompt template</em></h2>
    <p>The <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">ChatPromptTemplate</code> structures the conversation: a system message sets the classification task with all 50 categories, and the human message carries the article payload. The <strong>double curly braces</strong> <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">&#123;&#123; &#125;&#125;</code> in the JSON schema escape the literal braces so LangChain doesn&#8217;t treat them as template variables.</p>
    <div class="vg6-code-wrap">
      <div class="vg6-code-header"><span class="vg6-code-filename">Databricks Notebook — Cell 6</span><span class="vg6-code-lang">Python</span></div>
      <div class="vg6-code-body"><code>prompt = ChatPromptTemplate.<span class="vg6-f">from_messages</span>([
    (<span class="vg6-s">"system"</span>, <span class="vg6-s">"""Your task is to assess the article and categorize it
into one of the following predefined categories:

'History', 'Geography', 'Science', 'Technology', 'Mathematics',
'Literature', 'Art', 'Music', 'Film', 'Television', 'Sports',
'Politics', 'Philosophy', 'Religion', 'Sociology', 'Psychology',
'Economics', 'Business', 'Medicine', 'Biology', 'Chemistry',
'Physics', 'Astronomy', 'Environmental Science', 'Engineering',
'Computer Science', 'Linguistics', 'Anthropology', 'Archaeology',
'Education', 'Law', 'Military', 'Architecture', 'Fashion',
'Cuisine', 'Travel', 'Mythology', 'Folklore', 'Biography',
'Social Issues', 'Human Rights', 'Technology Ethics',
'Climate Change', 'Conservation', 'Urban Studies', 'Demographics',
'Journalism', 'Cryptocurrency', 'Artificial Intelligence'

Output ONLY a JSON object — no extra text:
{{
    "id": string,
    "category": string
}}"""</span>),
    (<span class="vg6-s">"human"</span>, <span class="vg6-s">"{input}"</span>)
])</code></div>
    </div>
    <div class="vg6-callout">
      <strong>Prompt engineering note:</strong> Listing all valid categories explicitly in the system prompt constrains the model to valid outputs — reducing hallucinated or free-form category names. The strict JSON output instruction combined with downstream <code>json.loads()</code> parsing creates a simple but robust structured output pipeline.
    </div>
  </div>

  <hr class="vg6-divider">

  <!-- STEP 6 -->
  <div class="vg6-step vg6-reveal">
    <p class="vg6-step-label">Step 6</p>
    <h2>Build the chain &#038; <em>test it</em></h2>
    <p>LangChain&#8217;s pipe operator <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">|</code> composes the prompt template and the LLM into a reusable chain. One call to <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">.invoke()</code> with a single article validates the whole setup before committing to batch processing.</p>
    <div class="vg6-code-wrap">
      <div class="vg6-code-header"><span class="vg6-code-filename">Databricks Notebook — Cell 7</span><span class="vg6-code-lang">Python</span></div>
      <div class="vg6-code-body"><code><span class="vg6-c"># Compose prompt → llm into a reusable chain</span>
chain = prompt | llm

<span class="vg6-c"># Test with article[0] before running the full batch</span>
content  = json.<span class="vg6-f">dumps</span>({<span class="vg6-s">"id"</span>: ids[<span class="vg6-n">0</span>], <span class="vg6-s">"article"</span>: articles[<span class="vg6-n">0</span>]})
response = chain.<span class="vg6-f">invoke</span>(content)
<span class="vg6-f">print</span>(response.content)
<span class="vg6-c"># → {"id": "1", "category": "History"}</span></code></div>
    </div>
  </div>

  <hr class="vg6-divider">

  <!-- STEP 7 -->
  <div class="vg6-step vg6-reveal">
    <p class="vg6-step-label">Step 7 — Core Loop</p>
    <h2>Batch processing <em>with rate-limit handling</em></h2>
    <p>Processing 1,000 articles one-by-one would quickly hit OpenAI&#8217;s requests-per-minute limit. The solution: accumulate inputs into batches of 8 and call <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">.batch()</code> with a <strong>1.5-second sleep</strong> between each batch. <code style="font-family:'DM Mono',monospace;font-size:0.82rem;background:rgba(14,14,14,0.07);padding:0.1rem 0.35rem">tqdm</code> wraps the loop to give live progress in the notebook.</p>
    <div class="vg6-code-wrap">
      <div class="vg6-code-header"><span class="vg6-code-filename">Databricks Notebook — Cell 8</span><span class="vg6-code-lang">Python</span></div>
      <div class="vg6-code-body"><code>results    = []
<span class="vg6-v">BATCH_SIZE</span> = <span class="vg6-n">8</span>
inputs     = []

<span class="vg6-k">for</span> index, article <span class="vg6-k">in</span> <span class="vg6-f">tqdm</span>(<span class="vg6-f">enumerate</span>(articles[:<span class="vg6-n">1000</span>])):

    inputs.<span class="vg6-f">append</span>(
        json.<span class="vg6-f">dumps</span>({<span class="vg6-s">"id"</span>: ids[index], <span class="vg6-s">"article"</span>: articles[index]})
    )

    <span class="vg6-k">if</span> <span class="vg6-f">len</span>(inputs) == <span class="vg6-v">BATCH_SIZE</span>:
        time.<span class="vg6-f">sleep</span>(<span class="vg6-n">1.5</span>)            <span class="vg6-c"># respect rate limits</span>
        response = chain.<span class="vg6-f">batch</span>(inputs)
        results += response
        inputs   = []               <span class="vg6-c"># reset buffer</span>

<span class="vg6-c"># Flush any remaining articles in the last partial batch</span>
<span class="vg6-k">if</span> inputs:
    response = chain.<span class="vg6-f">batch</span>(inputs)
    results += response</code></div>
    </div>
    <div class="vg6-callout">
      <strong>Rate limit strategy:</strong> Batch size 8 with a 1.5s sleep = one batch of 8 requests every ~1.5 seconds ≈ 5 requests/sec (~320 requests/min). For the free OpenAI tier (3 RPM), reduce batch size to 1 and increase sleep to 20s. For production use, implement exponential backoff with <code>tenacity</code>.
    </div>
  </div>

  <hr class="vg6-divider">

  <!-- STEP 8 -->
  <div class="vg6-step vg6-reveal">
    <p class="vg6-step-label">Step 8</p>
    <h2>Parse results into <em>a DataFrame</em></h2>
    <p>Not every LLM response will be valid JSON — network hiccups, model refusals, and malformed outputs all happen at scale. The pattern below separates successful parses from failures so you can inspect and retry the failures without losing the successful results.</p>
    <div class="vg6-code-wrap">
      <div class="vg6-code-header"><span class="vg6-code-filename">Databricks Notebook — Cell 9</span><span class="vg6-code-lang">Python</span></div>
      <div class="vg6-code-body"><code>success = []
failure = []

<span class="vg6-k">for</span> output <span class="vg6-k">in</span> results:
    content = output.content
    <span class="vg6-k">try</span>:
        content = json.<span class="vg6-f">loads</span>(content)
        success.<span class="vg6-f">append</span>(content)
    <span class="vg6-k">except</span> ValueError <span class="vg6-k">as</span> e:
        failure.<span class="vg6-f">append</span>(content)  <span class="vg6-c"># keep for retry / inspection</span>

<span class="vg6-f">print</span>(<span class="vg6-f">f</span><span class="vg6-s">"Success: {len(success)} | Failure: {len(failure)}"</span>)

<span class="vg6-c"># Convert to DataFrame for analysis / export</span>
df = pd.<span class="vg6-f">DataFrame</span>(success)
df.<span class="vg6-f">head</span>(<span class="vg6-n">10</span>)</code></div>
    </div>
  </div>

</div><!-- /vg6-body -->

<!-- OUTPUT SECTION -->
<div class="vg6-output-section">
  <p class="vg6-output-eyebrow">Sample Output</p>
  <h2>What the <em>pipeline produces</em></h2>
  <div class="vg6-output-grid vg6-reveal">
    <div class="vg6-output-stat"><div class="vg6-output-stat-n">10k</div><div class="vg6-output-stat-l">Articles Loaded</div></div>
    <div class="vg6-output-stat"><div class="vg6-output-stat-n">1k</div><div class="vg6-output-stat-l">Articles Classified</div></div>
    <div class="vg6-output-stat"><div class="vg6-output-stat-n">50</div><div class="vg6-output-stat-l">Categories</div></div>
    <div class="vg6-output-stat"><div class="vg6-output-stat-n">8</div><div class="vg6-output-stat-l">Batch Size</div></div>
  </div>
  <div class="vg6-json-card vg6-reveal vg6-d1">
    <div class="vg6-json-header">
      <span>LLM Response — Single Article</span>
      <span class="vg6-json-tag">JSON output</span>
    </div>
    <div class="vg6-json-body">
<span class="vg6-json-punct">[</span>
  <span class="vg6-json-punct">{</span>
    <span class="vg6-json-key">&#8220;id&#8221;</span><span class="vg6-json-punct">:</span> <span class="vg6-json-val-str">&#8220;1&#8221;</span><span class="vg6-json-punct">,</span>
    <span class="vg6-json-key">&#8220;category&#8221;</span><span class="vg6-json-punct">:</span> <span class="vg6-json-val-str">&#8220;History&#8221;</span>
  <span class="vg6-json-punct">},</span>
  <span class="vg6-json-punct">{</span>
    <span class="vg6-json-key">&#8220;id&#8221;</span><span class="vg6-json-punct">:</span> <span class="vg6-json-val-str">&#8220;4&#8221;</span><span class="vg6-json-punct">,</span>
    <span class="vg6-json-key">&#8220;category&#8221;</span><span class="vg6-json-punct">:</span> <span class="vg6-json-val-str">&#8220;Computer Science&#8221;</span>
  <span class="vg6-json-punct">},</span>
  <span class="vg6-json-punct">{</span>
    <span class="vg6-json-key">&#8220;id&#8221;</span><span class="vg6-json-punct">:</span> <span class="vg6-json-val-str">&#8220;7&#8221;</span><span class="vg6-json-punct">,</span>
    <span class="vg6-json-key">&#8220;category&#8221;</span><span class="vg6-json-punct">:</span> <span class="vg6-json-val-str">&#8220;Biology&#8221;</span>
  <span class="vg6-json-punct">}</span>
<span class="vg6-json-punct">]</span>
    </div>
  </div>
  <p style="font-size:0.82rem;color:rgba(247,244,239,0.45);margin-top:1.5rem;font-weight:300;">All 50 available classification categories:</p>
  <div class="vg6-categories vg6-reveal vg6-d2">
    <span class="vg6-cat-pill active">History</span><span class="vg6-cat-pill">Geography</span><span class="vg6-cat-pill active">Science</span><span class="vg6-cat-pill">Technology</span><span class="vg6-cat-pill">Mathematics</span><span class="vg6-cat-pill">Literature</span><span class="vg6-cat-pill">Art</span><span class="vg6-cat-pill">Music</span><span class="vg6-cat-pill">Film</span><span class="vg6-cat-pill">Television</span><span class="vg6-cat-pill">Sports</span><span class="vg6-cat-pill active">Politics</span><span class="vg6-cat-pill">Philosophy</span><span class="vg6-cat-pill">Religion</span><span class="vg6-cat-pill">Sociology</span><span class="vg6-cat-pill">Psychology</span><span class="vg6-cat-pill">Economics</span><span class="vg6-cat-pill">Business</span><span class="vg6-cat-pill">Medicine</span><span class="vg6-cat-pill active">Biology</span><span class="vg6-cat-pill">Chemistry</span><span class="vg6-cat-pill">Physics</span><span class="vg6-cat-pill">Astronomy</span><span class="vg6-cat-pill">Environmental Science</span><span class="vg6-cat-pill">Engineering</span><span class="vg6-cat-pill active">Computer Science</span><span class="vg6-cat-pill">Linguistics</span><span class="vg6-cat-pill">Anthropology</span><span class="vg6-cat-pill">Archaeology</span><span class="vg6-cat-pill">Education</span><span class="vg6-cat-pill">Law</span><span class="vg6-cat-pill">Military</span><span class="vg6-cat-pill">Architecture</span><span class="vg6-cat-pill">Fashion</span><span class="vg6-cat-pill">Cuisine</span><span class="vg6-cat-pill">Travel</span><span class="vg6-cat-pill">Mythology</span><span class="vg6-cat-pill">Folklore</span><span class="vg6-cat-pill">Biography</span><span class="vg6-cat-pill">Social Issues</span><span class="vg6-cat-pill">Human Rights</span><span class="vg6-cat-pill active">Artificial Intelligence</span><span class="vg6-cat-pill">Cryptocurrency</span><span class="vg6-cat-pill">Climate Change</span><span 
class="vg6-cat-pill">Conservation</span><span class="vg6-cat-pill">Urban Studies</span><span class="vg6-cat-pill">Journalism</span><span class="vg6-cat-pill">Technology Ethics</span><span class="vg6-cat-pill">Demographics</span>
  </div>
</div>

<!-- INTERVIEW CHEAT SHEET -->
<div class="vg6-interview-section">
  <p class="vg6-interview-eyebrow">Interview Prep</p>
  <h2>Cheat sheet — <em>quick definitions to remember</em></h2>
  <div class="vg6-qa-list">

    <div class="vg6-qa-item vg6-reveal">
      <div class="vg6-qa-q"><span class="vg6-q-badge">Define</span><br>What is LangChain and what problem does it solve?</div>
      <div class="vg6-qa-a"><strong>A framework for composing LLM-powered applications</strong> from modular building blocks — prompts, models, chains, memory, tools, and agents. It solves the orchestration problem: how do you connect a prompt template to an LLM, parse the output, and chain multiple steps together cleanly?
        <div class="vg6-pills"><span class="vg6-pill t">Prompt + LLM + Output</span><span class="vg6-pill t">Composable chains</span><span class="vg6-pill">Pipe operator |</span></div>
      </div>
    </div>

    <div class="vg6-qa-item vg6-reveal vg6-d1">
      <div class="vg6-qa-q"><span class="vg6-q-badge">Explain</span><br>What is a ChatPromptTemplate?</div>
      <div class="vg6-qa-a">A <strong>reusable message template</strong> that structures the conversation for a chat model. Defines the system role (task instructions) and the human turn (variable input). The <code>{input}</code> placeholder gets filled at runtime. Separating instructions from data is a core prompt engineering best practice.
        <div class="vg6-pills"><span class="vg6-pill t">System = instructions</span><span class="vg6-pill t">Human = data</span><span class="vg6-pill">{input} placeholder</span></div>
      </div>
    </div>

    <div class="vg6-qa-item vg6-reveal">
      <div class="vg6-qa-q"><span class="vg6-q-badge">Explain</span><br>Why use <code>.batch()</code> instead of looping <code>.invoke()</code>?</div>
      <div class="vg6-qa-a"><code>.batch()</code> sends multiple requests <strong>concurrently</strong> using a thread pool under the hood (the async variant, <code>.abatch()</code>, uses asyncio), while <code>.invoke()</code> is sequential. For 8 articles, batch is roughly 8x faster. The sleep between batches manages rate limits — you get concurrency within a batch, pacing across batches.
        <div class="vg6-pills"><span class="vg6-pill t">Concurrent within batch</span><span class="vg6-pill a">Sleep between batches</span><span class="vg6-pill">8x throughput gain</span></div>
      </div>
    </div>

    <div class="vg6-qa-item vg6-reveal vg6-d1">
      <div class="vg6-qa-q"><span class="vg6-q-badge">Gotcha</span><br>Why separate success and failure lists instead of crashing on parse error?</div>
      <div class="vg6-qa-a">At 1,000+ LLM calls, <strong>some will fail</strong> — network timeouts, content policy refusals, or models that occasionally output extra text before the JSON. A try/except pattern collects failures without losing the successful results. Failures can be inspected and retried separately.
        <div class="vg6-pills"><span class="vg6-pill a">Never crash on parse error</span><span class="vg6-pill">Inspect failures separately</span><span class="vg6-pill t">Retry pattern</span></div>
      </div>
    </div>

    <div class="vg6-qa-item vg6-reveal">
      <div class="vg6-qa-q"><span class="vg6-q-badge">Best Practice</span><br>How do you get reliable structured JSON from an LLM?</div>
      <div class="vg6-qa-a">Three layers: <strong>(1) Constrain in the prompt</strong> — list valid values, specify exact schema, say &#8220;output ONLY JSON&#8221;. <strong>(2) Use LangChain&#8217;s output parsers</strong> (<code>JsonOutputParser</code>) for automatic parsing and retry. <strong>(3) Validate with Pydantic</strong> — define a model and parse the JSON through it to catch type errors.
        <div class="vg6-pills"><span class="vg6-pill t">Constrain schema in prompt</span><span class="vg6-pill t">JsonOutputParser</span><span class="vg6-pill a">Pydantic validation</span></div>
      </div>
    </div>

    <div class="vg6-qa-item vg6-reveal vg6-d1">
      <div class="vg6-qa-q"><span class="vg6-q-badge">Explain</span><br>Why use Databricks for this pipeline?</div>
      <div class="vg6-qa-a">Databricks provides a <strong>managed Spark + Python environment</strong> that scales horizontally. For 10k–10M articles, you can parallelize across a cluster using Spark UDFs or <code>pandas_udf</code>. It also integrates with Delta Lake for storing results, MLflow for experiment tracking, and Unity Catalog for data governance.
        <div class="vg6-pills"><span class="vg6-pill t">Horizontal scale</span><span class="vg6-pill t">Delta Lake storage</span><span class="vg6-pill">MLflow tracking</span></div>
      </div>
    </div>

    <div class="vg6-qa-item vg6-reveal">
      <div class="vg6-qa-q"><span class="vg6-q-badge">Improve</span><br>How would you scale this to 10 million articles?</div>
      <div class="vg6-qa-a"><strong>Three upgrades:</strong> (1) Wrap the chain call in a <strong>Spark pandas_udf</strong> so it runs in parallel across the cluster. (2) Replace <code>time.sleep()</code> with <strong>exponential backoff</strong> via <code>tenacity</code>. (3) Use <strong>LangChain&#8217;s async batch</strong> with <code>chain.abatch()</code> and asyncio for maximum concurrency per node.
        <div class="vg6-pills"><span class="vg6-pill t">Spark pandas_udf</span><span class="vg6-pill t">chain.abatch()</span><span class="vg6-pill a">tenacity backoff</span></div>
      </div>
    </div>

  </div>
</div>

<!-- FOOTER -->
<div class="vg6-footer">
  <p><strong>GenAI Mastery Series</strong> — vijay-gokarn.com · Vijay Gokarn</p>
  <div class="vg6-footer-links">
    <a href="https://github.com/vijaygokarn130" class="vg6-footer-btn ghost">GitHub ↗</a>
    <a href="https://vijay-gokarn.com" class="vg6-footer-btn primary">Back to Blog ↗</a>
  </div>
</div>

</div><!-- /vg6 -->

<script>
// Reveal-on-scroll: tag each .vg6-reveal element with .vg6-vis once it
// scrolls into view (8% visibility threshold), so the CSS transition fires.
(function () {
  var revealObserver = new IntersectionObserver(function (entries) {
    for (var i = 0; i < entries.length; i++) {
      if (entries[i].isIntersecting) {
        entries[i].target.classList.add('vg6-vis');
      }
    }
  }, { threshold: 0.08 });

  var targets = document.querySelectorAll('.vg6-reveal');
  for (var j = 0; j < targets.length; j++) {
    revealObserver.observe(targets[j]);
  }
})();
</script>
<p>The post <a href="https://vijay-gokarn.com/analyzing-wikipedia-articles-with-langchain-and-openai-in-databricks/">Analyzing Wikipedia Articles with Langchain and OpenAI in Databricks</a> appeared first on <a href="https://vijay-gokarn.com">Vijay Gokarn</a>.</p>
]]></content:encoded>
					
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">140</post-id>	</item>
		<item>
		<title>Pandas Remove Duplicates</title>
		<link>https://vijay-gokarn.com/pandas-remove-duplicates/?utm_source=rss&#038;utm_medium=rss&#038;utm_campaign=pandas-remove-duplicates</link>
		
		<dc:creator><![CDATA[Vijay Gokarn]]></dc:creator>
		<pubDate>Tue, 09 Jul 2024 11:12:55 +0000</pubDate>
				<category><![CDATA[ai-agents]]></category>
		<category><![CDATA[databricks]]></category>
		<category><![CDATA[food]]></category>
		<category><![CDATA[generative-ai]]></category>
		<category><![CDATA[python]]></category>
		<category><![CDATA[data-analysis]]></category>
		<category><![CDATA[gen-ai]]></category>
		<category><![CDATA[pandas]]></category>
		<guid isPermaLink="false">https://vijay-gokarn.com/?p=119</guid>

					<description><![CDATA[<p>Data Engineering · Python · Pandas · Data Cleaning Handling Duplicate Rows in Pandas — Identify, Remove &#038; Export Clean Data Librarypandas Methodsduplicated() · drop_duplicates() · reset_index() OutputCleaned CSV Stack Python pandas df.duplicated() drop_duplicates() reset_index() to_csv() Duplicate rows are one of the most common data quality issues — and one of the most damaging to [&#8230;]</p>
<p>The post <a href="https://vijay-gokarn.com/pandas-remove-duplicates/">Pandas Remove Duplicates</a> appeared first on <a href="https://vijay-gokarn.com">Vijay Gokarn</a>.</p>
]]></description>
										<content:encoded><![CDATA[
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:ital,wght@0,300;0,400;0,600;1,300;1,400&#038;family=DM+Sans:wght@300;400;500&#038;family=DM+Mono:wght@400&#038;display=swap" rel="stylesheet">

<style>
.vg8 {
  --ink: #0e0e0e; --paper: #f7f4ef; --paper-dark: #ede9e1;
  --teal: #0f6e56; --teal-light: #1d9e75; --teal-muted: #e1f5ee;
  --amber: #ba7517; --amber-light: #fac775; --amber-muted: #faeeda;
  --charcoal: #2c2c2a; --muted: #888780;
  --border: rgba(14,14,14,0.12); --border-strong: rgba(14,14,14,0.25);
  --code-bg: #161b22; --code-header: #2d333b; --code-border: rgba(255,255,255,0.06);
  font-family: 'DM Sans', sans-serif; font-weight: 300;
  color: var(--ink); background: var(--paper); line-height: 1.75; font-size: 16px; overflow-x: hidden;
}
.vg8 *, .vg8 *::before, .vg8 *::after { box-sizing: border-box; margin: 0; padding: 0; }

/* HERO */
.vg8-hero { background: #0d1117; padding: 5rem 4rem 4rem; position: relative; overflow: hidden; }
.vg8-hero::before {
  content: '⊕'; font-family: 'Cormorant Garamond', serif; font-size: 22rem;
  font-weight: 300; color: rgba(255,255,255,0.025); position: absolute;
  right: 1rem; bottom: -5rem; line-height: 1; pointer-events: none;
}
.vg8-hero-inner { position: relative; z-index: 1; max-width: 900px; }
.vg8-eyebrow { font-size: 0.68rem; letter-spacing: 0.22em; text-transform: uppercase; color: var(--teal-light); font-weight: 500; margin-bottom: 1.25rem; display: flex; align-items: center; gap: 0.75rem; }
.vg8-eyebrow::before { content: ''; display: inline-block; width: 1.5rem; height: 1px; background: var(--teal-light); }
.vg8-hero h1 { font-family: 'Cormorant Garamond', serif; font-size: clamp(2.2rem, 5vw, 3.8rem); font-weight: 300; line-height: 1.1; color: var(--paper); letter-spacing: -0.02em; margin-bottom: 1.5rem; max-width: 28ch; }
.vg8-hero h1 em { font-style: italic; color: var(--amber-light); }
.vg8-meta-row { display: flex; gap: 2rem; flex-wrap: wrap; }
.vg8-meta { font-size: 0.7rem; letter-spacing: 0.1em; text-transform: uppercase; color: rgba(247,244,239,0.35); }
.vg8-meta span { color: rgba(247,244,239,0.7); margin-left: 0.4rem; }

/* STACK BAND */
.vg8-stack-band { background: var(--teal); padding: 1.1rem 4rem; display: flex; gap: 0.75rem; flex-wrap: wrap; align-items: center; }
.vg8-stack-label { font-size: 0.63rem; letter-spacing: 0.18em; text-transform: uppercase; color: rgba(255,255,255,0.6); font-weight: 400; margin-right: 0.4rem; }
.vg8-stack-pill { font-size: 0.7rem; letter-spacing: 0.05em; padding: 0.28rem 0.85rem; background: rgba(255,255,255,0.12); color: #fff; border: 0.5px solid rgba(255,255,255,0.2); }

/* INTRO */
.vg8-intro { background: var(--teal-muted); padding: 2.5rem 4rem; border-left: 4px solid var(--teal); }
.vg8-intro p { font-size: 1.05rem; line-height: 1.85; color: var(--charcoal); font-weight: 300; max-width: 80ch; }
.vg8-intro strong { color: var(--teal); font-weight: 500; }

/* BODY */
.vg8-body { max-width: 900px; margin: 0 auto; padding: 4rem; }
.vg8-step { margin-bottom: 3.5rem; }
.vg8-step-label { font-size: 0.63rem; letter-spacing: 0.22em; text-transform: uppercase; color: var(--teal); font-weight: 500; margin-bottom: 0.5rem; display: flex; align-items: center; gap: 0.6rem; }
.vg8-step-label::before { content: ''; display: inline-block; width: 1.25rem; height: 1px; background: var(--teal); }
.vg8-step h2 { font-family: 'Cormorant Garamond', serif; font-size: clamp(1.4rem, 3vw, 2rem); font-weight: 300; line-height: 1.2; color: var(--ink); margin-bottom: 1rem; }
.vg8-step h2 em { font-style: italic; color: var(--teal); }
.vg8-step p { font-size: 0.93rem; line-height: 1.9; color: var(--charcoal); font-weight: 300; margin-bottom: 1rem; }
.vg8-step p strong { color: var(--ink); font-weight: 500; }
.vg8-divider { border: none; border-top: 0.5px solid var(--border); margin: 3rem 0; }
.vg8-ic { font-family: 'DM Mono', monospace; font-size: 0.82rem; background: rgba(14,14,14,0.07); padding: 0.1rem 0.4rem; color: var(--ink); }

/* CALLOUT */
.vg8-callout { background: var(--paper-dark); border-left: 3px solid var(--amber); padding: 1.25rem 1.5rem; margin: 1.25rem 0; font-size: 0.87rem; line-height: 1.8; color: var(--charcoal); }
.vg8-callout strong { color: var(--amber); font-weight: 500; }
.vg8-callout.teal { border-color: var(--teal); }
.vg8-callout.teal strong { color: var(--teal); }

/* STRATEGY CARDS */
.vg8-strategy-grid { display: grid; grid-template-columns: repeat(3, 1fr); gap: 1.25rem; margin: 1.5rem 0; }
.vg8-strategy-card { background: var(--paper); border: 0.5px solid var(--border-strong); padding: 1.5rem; position: relative; }
.vg8-strategy-card::before { content: ''; position: absolute; top: 0; left: 0; width: 100%; height: 4px; }
.vg8-strategy-card:nth-child(1)::before { background: var(--muted); }
.vg8-strategy-card:nth-child(2)::before { background: var(--amber); }
.vg8-strategy-card:nth-child(3)::before { background: var(--teal); }
.vg8-strategy-card .vg8-strat-tag { font-family: 'DM Mono', monospace; font-size: 0.65rem; letter-spacing: 0.1em; text-transform: uppercase; color: var(--muted); margin-bottom: 0.5rem; display: block; }
.vg8-strategy-card:nth-child(2) .vg8-strat-tag { color: var(--amber); }
.vg8-strategy-card:nth-child(3) .vg8-strat-tag { color: var(--teal); }
.vg8-strategy-card h3 { font-family: 'Cormorant Garamond', serif; font-size: 1.15rem; font-weight: 400; color: var(--ink); margin-bottom: 0.4rem; }
.vg8-strategy-card p { font-size: 0.82rem; line-height: 1.7; color: var(--charcoal); font-weight: 300; }

/* PIPELINE */
.vg8-pipeline { display: flex; flex-direction: column; gap: 0; margin: 1.5rem 0; }
.vg8-pipeline-step { display: grid; grid-template-columns: 52px 1fr; gap: 1.5rem; padding: 1.25rem 0; border-top: 0.5px solid var(--border); align-items: start; }
.vg8-pipeline-step:last-child { border-bottom: 0.5px solid var(--border); }
.vg8-pipeline-num { width: 36px; height: 36px; background: var(--teal); display: flex; align-items: center; justify-content: center; font-family: 'Cormorant Garamond', serif; font-size: 1.1rem; font-weight: 300; color: var(--paper); flex-shrink: 0; }
.vg8-pipeline-body h4 { font-family: 'Cormorant Garamond', serif; font-size: 1.1rem; font-weight: 400; color: var(--ink); margin-bottom: 0.3rem; }
.vg8-pipeline-body p { font-size: 0.83rem; line-height: 1.7; color: var(--charcoal); font-weight: 300; }

/* CODE BLOCKS */
.vg8-code-wrap { margin: 1.25rem 0; border: 0.5px solid var(--code-border); overflow: hidden; }
.vg8-code-header { background: var(--code-header); padding: 0.6rem 1.25rem; display: flex; justify-content: space-between; align-items: center; border-bottom: 0.5px solid var(--code-border); }
.vg8-code-filename { font-family: 'DM Mono', monospace; font-size: 0.68rem; color: rgba(247,244,239,0.45); letter-spacing: 0.04em; }
.vg8-code-lang { font-size: 0.6rem; letter-spacing: 0.14em; text-transform: uppercase; color: var(--teal-light); font-weight: 500; }
.vg8-code-body { background: var(--code-bg); padding: 1.5rem; overflow-x: auto; }
.vg8-code-body pre { margin: 0; }
.vg8-code-body code { font-family: 'DM Mono', monospace; font-size: 0.82rem; line-height: 1.85; color: #e6edf3; white-space: pre; display: block; }
/* tokens */
.t8-k { color: #ff7b72; }
.t8-s { color: #a5d6ff; }
.t8-c { color: #8b949e; font-style: italic; }
.t8-f { color: #d2a8ff; }
.t8-n { color: #79c0ff; }
.t8-v { color: #ffa657; }
.t8-b { color: var(--amber-light); }

/* FULL SCRIPT SECTION */
.vg8-full-section { background: var(--paper-dark); padding: 4rem; }
.vg8-full-eyebrow { font-size: 0.65rem; letter-spacing: 0.22em; text-transform: uppercase; color: var(--teal); font-weight: 500; margin-bottom: 0.5rem; display: flex; align-items: center; gap: 0.6rem; }
.vg8-full-eyebrow::before { content: ''; display: inline-block; width: 1.25rem; height: 1px; background: var(--teal); }
.vg8-full-section > h2 { font-family: 'Cormorant Garamond', serif; font-size: clamp(1.6rem, 3vw, 2.4rem); font-weight: 300; color: var(--ink); margin-bottom: 0.75rem; }
.vg8-full-section > h2 em { font-style: italic; color: var(--teal); }
.vg8-full-section > p { font-size: 0.9rem; color: var(--charcoal); font-weight: 300; line-height: 1.8; margin-bottom: 2rem; max-width: 70ch; }

/* INTERVIEW */
.vg8-interview-section { background: var(--ink); padding: 4rem; }
.vg8-interview-eyebrow { font-size: 0.65rem; letter-spacing: 0.22em; text-transform: uppercase; color: var(--amber-light); font-weight: 500; margin-bottom: 0.5rem; display: flex; align-items: center; gap: 0.6rem; }
.vg8-interview-eyebrow::before { content: ''; display: inline-block; width: 1.25rem; height: 1px; background: var(--amber-light); }
.vg8-interview-section > h2 { font-family: 'Cormorant Garamond', serif; font-size: clamp(1.6rem, 3vw, 2.4rem); font-weight: 300; color: var(--paper); margin-bottom: 2.5rem; }
.vg8-interview-section > h2 em { font-style: italic; color: var(--amber-light); }
.vg8-qa-list { display: flex; flex-direction: column; }
.vg8-qa-item { display: grid; grid-template-columns: 1fr 1.4fr; gap: 2rem; padding: 1.5rem 0; border-top: 0.5px solid rgba(247,244,239,0.1); align-items: start; }
.vg8-qa-item:last-child { border-bottom: 0.5px solid rgba(247,244,239,0.1); }
.vg8-qa-q { font-family: 'Cormorant Garamond', serif; font-size: 1.05rem; font-weight: 400; color: var(--paper); line-height: 1.4; }
.vg8-q-badge { font-family: 'DM Mono', monospace; font-size: 0.58rem; letter-spacing: 0.1em; text-transform: uppercase; background: var(--teal); color: var(--paper); padding: 0.15rem 0.5rem; margin-bottom: 0.5rem; display: inline-block; }
.vg8-qa-a { font-size: 0.83rem; line-height: 1.8; color: rgba(247,244,239,0.65); font-weight: 300; }
.vg8-qa-a strong { color: var(--amber-light); font-weight: 400; }
.vg8-qa-a code { font-family: 'DM Mono', monospace; font-size: 0.77rem; background: rgba(247,244,239,0.08); padding: 0.1rem 0.35rem; color: var(--paper); }
.vg8-pills { display: flex; flex-wrap: wrap; gap: 0.5rem; margin-top: 0.75rem; }
.vg8-pill { font-size: 0.67rem; letter-spacing: 0.06em; padding: 0.25rem 0.75rem; border: 0.5px solid rgba(247,244,239,0.15); color: rgba(247,244,239,0.5); }
.vg8-pill.t { border-color: var(--teal-light); color: var(--teal-light); }
.vg8-pill.a { border-color: var(--amber-light); color: var(--amber-light); }

/* FOOTER */
.vg8-footer { background: #0d1117; padding: 3rem 4rem; display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 1.5rem; border-top: 0.5px solid rgba(247,244,239,0.06); }
.vg8-footer p { font-size: 0.82rem; color: rgba(247,244,239,0.35); font-weight: 300; }
.vg8-footer p strong { color: rgba(247,244,239,0.65); font-weight: 400; }
.vg8-footer-links { display: flex; gap: 1rem; }
.vg8-btn { display: inline-block; padding: 0.65rem 1.75rem; font-size: 0.7rem; letter-spacing: 0.12em; text-transform: uppercase; text-decoration: none; font-weight: 400; }
.vg8-btn.primary { background: var(--teal); color: var(--paper); }
.vg8-btn.ghost { background: transparent; color: rgba(247,244,239,0.55); border: 0.5px solid rgba(247,244,239,0.2); }

/* REVEAL */
.vg8-reveal { opacity: 0; transform: translateY(20px); transition: opacity 0.55s ease, transform 0.55s ease; }
.vg8-reveal.vg8-vis { opacity: 1; transform: translateY(0); }
.vg8-d1 { transition-delay: 0.1s; } .vg8-d2 { transition-delay: 0.2s; } .vg8-d3 { transition-delay: 0.3s; }
</style>

<div class="vg8">

<!-- HERO -->
<div class="vg8-hero">
  <div class="vg8-hero-inner">
    <p class="vg8-eyebrow">Data Engineering · Python · Pandas · Data Cleaning</p>
    <h1>Handling Duplicate Rows in Pandas — <em>Identify, Remove &#038; Export Clean Data</em></h1>
    <div class="vg8-meta-row">
      <p class="vg8-meta">Library<span>pandas</span></p>
      <p class="vg8-meta">Methods<span>duplicated() · drop_duplicates() · reset_index()</span></p>
      <p class="vg8-meta">Output<span>Cleaned CSV</span></p>
    </div>
  </div>
</div>

<!-- STACK BAND -->
<div class="vg8-stack-band">
  <span class="vg8-stack-label">Stack</span>
  <span class="vg8-stack-pill">Python</span>
  <span class="vg8-stack-pill">pandas</span>
  <span class="vg8-stack-pill">df.duplicated()</span>
  <span class="vg8-stack-pill">drop_duplicates()</span>
  <span class="vg8-stack-pill">reset_index()</span>
  <span class="vg8-stack-pill">to_csv()</span>
</div>

<!-- INTRO -->
<div class="vg8-intro">
  <p>Duplicate rows are one of the most common data quality issues — and one of the most damaging to model accuracy and analysis reliability. <strong>Pandas</strong> gives you precise tools to detect, inspect, and remove duplicates with a single line of code. This guide walks through the full pipeline: load, detect, choose a strategy, clean, and export.</p>
</div>

<!-- BODY -->
<div class="vg8-body">

  <!-- WHY IT MATTERS -->
  <div class="vg8-step vg8-reveal">
    <p class="vg8-step-label">Context</p>
    <h2>Why duplicates <em>matter</em></h2>
    <p>Duplicate rows skew aggregations, inflate record counts, bias ML model training, and produce misleading visualizations. A sales total that counts the same transaction twice, a classifier trained on repeated samples — both produce results that look correct but aren&#8217;t. <strong>Clean data is the foundation everything else is built on.</strong></p>
    <div class="vg8-strategy-grid">
      <div class="vg8-strategy-card vg8-reveal vg8-d1">
        <span class="vg8-strat-tag">keep=&#8217;first&#8217;</span>
        <h3>Keep First</h3>
        <p>Drop all duplicates <em>except</em> the first occurrence. The original record is preserved. Most common default choice.</p>
      </div>
      <div class="vg8-strategy-card vg8-reveal vg8-d2">
        <span class="vg8-strat-tag">keep=&#8217;last&#8217;</span>
        <h3>Keep Last</h3>
        <p>Drop all duplicates <em>except</em> the last occurrence. Useful when later records represent updated values.</p>
      </div>
      <div class="vg8-strategy-card vg8-reveal vg8-d3">
        <span class="vg8-strat-tag">keep=False</span>
        <h3>Drop All</h3>
        <p>Remove every instance of a duplicated row — including the first. Use when any duplicated record is invalid.</p>
      </div>
    </div>
  </div>

  <hr class="vg8-divider">

  <!-- PIPELINE OVERVIEW -->
  <div class="vg8-step vg8-reveal">
    <p class="vg8-step-label">Pipeline</p>
    <h2>The four-step <em>deduplication pipeline</em></h2>
    <div class="vg8-pipeline">
      <div class="vg8-pipeline-step vg8-reveal">
        <div class="vg8-pipeline-num">1</div>
        <div class="vg8-pipeline-body"><h4>Load</h4><p>Read the raw CSV into a DataFrame with <code class="vg8-ic">pd.read_csv()</code>.</p></div>
      </div>
      <div class="vg8-pipeline-step vg8-reveal vg8-d1">
        <div class="vg8-pipeline-num">2</div>
        <div class="vg8-pipeline-body"><h4>Detect</h4><p>Use <code class="vg8-ic">df.duplicated()</code> to identify and inspect all duplicate rows before touching the data.</p></div>
      </div>
      <div class="vg8-pipeline-step vg8-reveal vg8-d2">
        <div class="vg8-pipeline-num">3</div>
        <div class="vg8-pipeline-body"><h4>Remove</h4><p>Call <code class="vg8-ic">drop_duplicates(keep=...)</code> with your chosen strategy. Reset the index for a clean sequential result.</p></div>
      </div>
      <div class="vg8-pipeline-step vg8-reveal vg8-d3">
        <div class="vg8-pipeline-num">4</div>
        <div class="vg8-pipeline-body"><h4>Export</h4><p>Write the cleaned DataFrame back to CSV with <code class="vg8-ic">to_csv()</code> for downstream use.</p></div>
      </div>
    </div>
  </div>

  <hr class="vg8-divider">

  <!-- STEP 1 — LOAD -->
  <div class="vg8-step vg8-reveal">
    <p class="vg8-step-label">Step 1</p>
    <h2>Load <em>your dataset</em></h2>
    <p>Start by reading your data into a pandas DataFrame. <code class="vg8-ic">pd.read_csv()</code> is the standard entry point for flat files. From here, all deduplication operations work on the in-memory DataFrame — your source file is never modified.</p>
    <div class="vg8-code-wrap">
      <div class="vg8-code-header"><span class="vg8-code-filename">load_data.py</span><span class="vg8-code-lang">Python</span></div>
      <div class="vg8-code-body"><pre><code><span class="t8-k">import</span> pandas <span class="t8-k">as</span> pd

<span class="t8-c"># Read the raw dataset into a DataFrame</span>
df = pd.<span class="t8-f">read_csv</span>(<span class="t8-s">'your_data_file.csv'</span>)

<span class="t8-c"># Quick shape check before cleaning</span>
<span class="t8-f">print</span>(<span class="t8-f">f</span><span class="t8-s">"Rows: {df.shape[0]:,}  |  Columns: {df.shape[1]}"</span>)</code></pre></div>
    </div>
    <div class="vg8-callout teal">
      <strong>Other sources:</strong> The same deduplication logic applies regardless of how you load your data. Use <code class="vg8-ic">pd.read_excel()</code> for XLSX, <code class="vg8-ic">pd.read_parquet()</code> for Parquet, or query a database with <code class="vg8-ic">pd.read_sql()</code> — all return a DataFrame you can clean the same way.
    </div>
  </div>

  <hr class="vg8-divider">

  <!-- STEP 2 — DETECT -->
  <div class="vg8-step vg8-reveal">
    <p class="vg8-step-label">Step 2</p>
    <h2>Detect <em>&#038; inspect duplicates</em></h2>
    <p><code class="vg8-ic">df.duplicated()</code> returns a boolean Series — <code class="vg8-ic">True</code> for every row that is a duplicate of an earlier row. Always <strong>inspect before you remove</strong> — understanding what the duplicates look like helps you choose the right strategy.</p>
    <div class="vg8-code-wrap">
      <div class="vg8-code-header"><span class="vg8-code-filename">detect_duplicates.py</span><span class="vg8-code-lang">Python</span></div>
      <div class="vg8-code-body"><pre><code><span class="t8-c"># Boolean mask: True for every row that is a duplicate</span>
duplicate_mask = df.<span class="t8-f">duplicated</span>()

<span class="t8-c"># How many duplicates exist?</span>
<span class="t8-f">print</span>(<span class="t8-f">f</span><span class="t8-s">"Duplicate rows found: {duplicate_mask.sum():,}"</span>)

<span class="t8-c"># Inspect the duplicate rows themselves</span>
duplicates = df[df.<span class="t8-f">duplicated</span>()]
<span class="t8-f">print</span>(duplicates)

<span class="t8-c"># See ALL occurrences of duplicated rows (including originals)</span>
all_dupes = df[df.<span class="t8-f">duplicated</span>(keep=<span class="t8-b">False</span>)]
<span class="t8-f">print</span>(all_dupes.<span class="t8-f">sort_values</span>(by=df.columns.<span class="t8-f">tolist</span>()))</code></pre></div>
    </div>
    <div class="vg8-callout">
      <strong>Subset duplicates:</strong> By default <code class="vg8-ic">duplicated()</code> checks all columns. To flag rows that are duplicates only on specific columns (e.g. same customer_id): <code class="vg8-ic">df.duplicated(subset=['customer_id'])</code>. This is useful for finding logical duplicates even when other columns differ.
    </div>
  </div>

  <hr class="vg8-divider">

  <!-- STEP 3 — REMOVE -->
  <div class="vg8-step vg8-reveal">
    <p class="vg8-step-label">Step 3</p>
    <h2>Remove duplicates — <em>three strategies</em></h2>
    <p><code class="vg8-ic">drop_duplicates()</code> returns a new DataFrame by default — the original is untouched. The <code class="vg8-ic">keep</code> parameter controls which occurrence survives. After removing, <code class="vg8-ic">reset_index(drop=True)</code> gives you a clean sequential index starting from 0.</p>
    <div class="vg8-code-wrap">
      <div class="vg8-code-header"><span class="vg8-code-filename">remove_duplicates.py</span><span class="vg8-code-lang">Python</span></div>
      <div class="vg8-code-body"><pre><code><span class="t8-c"># ── Strategy 1: keep the FIRST occurrence (default) ──</span>
df_keep_first = df.<span class="t8-f">drop_duplicates</span>(keep=<span class="t8-s">'first'</span>)

<span class="t8-c"># ── Strategy 2: keep the LAST occurrence ──</span>
<span class="t8-c">#    useful when later rows represent updated/corrected records</span>
df_keep_last = df.<span class="t8-f">drop_duplicates</span>(keep=<span class="t8-s">'last'</span>)

<span class="t8-c"># ── Strategy 3: drop ALL occurrences of any duplicated row ──</span>
<span class="t8-c">#    use when any repeated row is invalid data</span>
df_drop_all = df.<span class="t8-f">drop_duplicates</span>(keep=<span class="t8-b">False</span>)

<span class="t8-c"># ── Subset: deduplicate only on specific columns ──</span>
df_subset = df.<span class="t8-f">drop_duplicates</span>(subset=[<span class="t8-s">'customer_id'</span>, <span class="t8-s">'order_date'</span>], keep=<span class="t8-s">'first'</span>)

<span class="t8-c"># ── Reset the index after removal (clean 0-based index) ──</span>
df_cleaned = df_keep_first.<span class="t8-f">reset_index</span>(drop=<span class="t8-b">True</span>, inplace=<span class="t8-b">False</span>)

<span class="t8-c"># Confirm rows removed</span>
<span class="t8-f">print</span>(<span class="t8-f">f</span><span class="t8-s">"Before: {len(df):,}  |  After: {len(df_cleaned):,}  |  Removed: {len(df) - len(df_cleaned):,}"</span>)</code></pre></div>
    </div>
    <div class="vg8-callout teal">
      <strong>inplace vs assignment:</strong> <code class="vg8-ic">drop_duplicates(inplace=True)</code> modifies the DataFrame in place and returns <code class="vg8-ic">None</code>. Prefer the assignment pattern (<code class="vg8-ic">df_cleaned = df.drop_duplicates()</code>) — it preserves the original for comparison and makes your code easier to debug.
    </div>
  </div>

  <hr class="vg8-divider">

  <!-- STEP 4 — EXPORT -->
  <div class="vg8-step vg8-reveal">
    <p class="vg8-step-label">Step 4</p>
    <h2>Export <em>the clean data</em></h2>
    <p>Write the deduplicated DataFrame back to a CSV. Setting <code class="vg8-ic">index=False</code> prevents pandas from writing the row index as an extra column — your downstream consumers will thank you.</p>
    <div class="vg8-code-wrap">
      <div class="vg8-code-header"><span class="vg8-code-filename">export.py</span><span class="vg8-code-lang">Python</span></div>
      <div class="vg8-code-body"><pre><code><span class="t8-c"># Export to CSV — index=False keeps the file clean</span>
df_cleaned.<span class="t8-f">to_csv</span>(<span class="t8-s">'cleaned_data.csv'</span>, index=<span class="t8-b">False</span>)

<span class="t8-f">print</span>(<span class="t8-s">"Cleaned data exported to cleaned_data.csv"</span>)

<span class="t8-c"># Optional: also export to Parquet for better performance at scale</span>
df_cleaned.<span class="t8-f">to_parquet</span>(<span class="t8-s">'cleaned_data.parquet'</span>, index=<span class="t8-b">False</span>)</code></pre></div>
    </div>
  </div>

</div><!-- /vg8-body -->

<!-- FULL SCRIPT -->
<div class="vg8-full-section">
  <p class="vg8-full-eyebrow">Complete Reference</p>
  <h2>Full deduplication <em>script</em></h2>
  <p>Everything in one place — load, detect, remove (keep last), reset index, and export.</p>
  <div class="vg8-code-wrap vg8-reveal">
    <div class="vg8-code-header"><span class="vg8-code-filename">deduplicate.py — full script</span><span class="vg8-code-lang">Python</span></div>
    <div class="vg8-code-body"><pre><code><span class="t8-k">import</span> pandas <span class="t8-k">as</span> pd

<span class="t8-c"># ── 1. Load ─────────────────────────────────────────────</span>
df = pd.<span class="t8-f">read_csv</span>(<span class="t8-s">'your_data_file.csv'</span>)
<span class="t8-f">print</span>(<span class="t8-f">f</span><span class="t8-s">"Loaded {len(df):,} rows"</span>)

<span class="t8-c"># ── 2. Detect ────────────────────────────────────────────</span>
duplicates = df[df.<span class="t8-f">duplicated</span>()]
<span class="t8-f">print</span>(<span class="t8-f">f</span><span class="t8-s">"Duplicate rows found: {len(duplicates):,}"</span>)
<span class="t8-f">print</span>(duplicates)

<span class="t8-c"># ── 3a. Keep last occurrence of each duplicate row ───────</span>
df_cleaned = df.<span class="t8-f">drop_duplicates</span>(keep=<span class="t8-s">'last'</span>)

<span class="t8-c"># ── 3b. Keep first occurrence (swap in if preferred) ─────</span>
<span class="t8-c"># df_cleaned = df.drop_duplicates(keep='first')</span>

<span class="t8-c"># ── 3c. Reset the index to a clean 0-based sequence ──────</span>
df_cleaned.<span class="t8-f">reset_index</span>(drop=<span class="t8-b">True</span>, inplace=<span class="t8-b">True</span>)

<span class="t8-f">print</span>(<span class="t8-f">f</span><span class="t8-s">"Rows after cleaning: {len(df_cleaned):,}"</span>)

<span class="t8-c"># ── 4. Export ─────────────────────────────────────────────</span>
df_cleaned.<span class="t8-f">to_csv</span>(<span class="t8-s">'cleaned_data.csv'</span>, index=<span class="t8-b">False</span>)
<span class="t8-f">print</span>(<span class="t8-s">"Exported to cleaned_data.csv"</span>)</code></pre></div>
  </div>
</div>

<!-- INTERVIEW CHEAT SHEET -->
<div class="vg8-interview-section">
  <p class="vg8-interview-eyebrow">Interview Prep</p>
  <h2>Cheat sheet — <em>quick definitions to remember</em></h2>
  <div class="vg8-qa-list">

    <div class="vg8-qa-item vg8-reveal">
      <div class="vg8-qa-q"><span class="vg8-q-badge">Define</span><br>What does <code>df.duplicated()</code> return?</div>
      <div class="vg8-qa-a">A <strong>boolean Series</strong> the same length as the DataFrame — <code>True</code> for every row that is a duplicate of a previously seen row, <code>False</code> otherwise. The first occurrence is marked <code>False</code> by default.
        <div class="vg8-pills"><span class="vg8-pill t">Boolean Series</span><span class="vg8-pill">True = duplicate</span><span class="vg8-pill a">First = False by default</span></div>
      </div>
    </div>

    <div class="vg8-qa-item vg8-reveal vg8-d1">
      <div class="vg8-qa-q"><span class="vg8-q-badge">Compare</span><br>keep=&#8217;first&#8217; vs keep=&#8217;last&#8217; vs keep=False</div>
      <div class="vg8-qa-a"><strong>first</strong> — keeps the first occurrence, drops all subsequent duplicates. <strong>last</strong> — keeps the final occurrence, useful for updated records. <strong>False</strong> — drops every occurrence of any duplicated row, leaving only rows that were unique to begin with.
        <div class="vg8-pills"><span class="vg8-pill t">first = keep original</span><span class="vg8-pill a">last = keep latest</span><span class="vg8-pill">False = drop all copies</span></div>
      </div>
    </div>

    <div class="vg8-qa-item vg8-reveal">
      <div class="vg8-qa-q"><span class="vg8-q-badge">Explain</span><br>What does the <code>subset</code> parameter do?</div>
      <div class="vg8-qa-a">By default, <code>duplicated()</code> and <code>drop_duplicates()</code> compare <strong>all columns</strong>. The <code>subset</code> parameter restricts the comparison to specific columns — for example <code>subset=['customer_id']</code> finds rows with the same customer ID even if other columns differ.
        <div class="vg8-pills"><span class="vg8-pill t">Default = all columns</span><span class="vg8-pill">subset = logical dedup</span></div>
      </div>
    </div>

    <div class="vg8-qa-item vg8-reveal vg8-d1">
      <div class="vg8-qa-q"><span class="vg8-q-badge">Gotcha</span><br>Why call <code>reset_index(drop=True)</code> after deduplication?</div>
      <div class="vg8-qa-a">After dropping rows, the DataFrame retains the <strong>original row indices</strong> — you&#8217;d have gaps like 0, 1, 4, 7 instead of 0, 1, 2, 3. <code>reset_index(drop=True)</code> renumbers from 0 continuously. <code>drop=True</code> prevents the old index from being added as a column.
        <div class="vg8-pills"><span class="vg8-pill a">Index gaps after drop</span><span class="vg8-pill t">reset_index fixes gaps</span><span class="vg8-pill">drop=True prevents extra col</span></div>
      </div>
    </div>

    <div class="vg8-qa-item vg8-reveal">
      <div class="vg8-qa-q"><span class="vg8-q-badge">Gotcha</span><br>inplace=True vs reassignment — which is preferred?</div>
      <div class="vg8-qa-a">Prefer <strong>reassignment</strong> (<code>df_cleaned = df.drop_duplicates()</code>) — it preserves the original DataFrame for comparison and makes pipelines easier to debug. <code>inplace=True</code> modifies the object and returns <code>None</code>, which can cause confusion when chaining operations. Many pandas best-practice guides now recommend avoiding inplace.
        <div class="vg8-pills"><span class="vg8-pill t">Reassignment = safer</span><span class="vg8-pill a">inplace returns None</span></div>
      </div>
    </div>

    <div class="vg8-qa-item vg8-reveal vg8-d1">
      <div class="vg8-qa-q"><span class="vg8-q-badge">Best Practice</span><br>How do you handle duplicates in a production data pipeline?</div>
      <div class="vg8-qa-a"><strong>Three layers:</strong> (1) <strong>Detect and log</strong> before removing — store duplicate counts as data quality metrics. (2) <strong>Deduplicate at ingestion</strong>, not at query time — clean once, use many times. (3) Add a <strong>unique constraint</strong> in your database or Delta Lake table to prevent duplicates from re-entering at source.
        <div class="vg8-pills"><span class="vg8-pill t">Log before removing</span><span class="vg8-pill t">Clean at ingestion</span><span class="vg8-pill a">DB unique constraints</span></div>
      </div>
    </div>

    <div class="vg8-qa-item vg8-reveal">
      <div class="vg8-qa-q"><span class="vg8-q-badge">Use Case</span><br>When should you NOT remove duplicates?</div>
      <div class="vg8-qa-a">When the repeated rows represent <strong>legitimate repeated events</strong> — a customer placing the same order twice on different days, a sensor reading the same value consecutively, or audit log entries. Always validate with domain knowledge before dropping. Use <code>subset</code> to deduplicate on business keys, not entire rows.
        <div class="vg8-pills"><span class="vg8-pill a">Repeated events = valid</span><span class="vg8-pill t">Use subset= for business keys</span></div>
      </div>
    </div>

  </div>
</div>

<!-- FOOTER -->
<div class="vg8-footer">
  <p><strong>GenAI Mastery Series</strong> — vijay-gokarn.com · Vijay Gokarn</p>
  <div class="vg8-footer-links">
    <a href="https://github.com/vijaygokarn130" class="vg8-btn ghost">GitHub ↗</a>
    <a href="https://vijay-gokarn.com" class="vg8-btn primary">Back to Blog ↗</a>
  </div>
</div>

</div><!-- /vg8 -->

<script>
(function(){
  // Reveal-on-scroll: add the 'vg8-vis' class to each '.vg8-reveal'
  // element once roughly 8% of it enters the viewport.
  var revealObserver = new IntersectionObserver(function(entries){
    entries.forEach(function(entry){
      if(entry.isIntersecting){
        entry.target.classList.add('vg8-vis');
      }
    });
  }, {threshold: 0.08});
  var targets = document.querySelectorAll('.vg8-reveal');
  targets.forEach(function(target){ revealObserver.observe(target); });
})();
</script>
<p>The post <a href="https://vijay-gokarn.com/pandas-remove-duplicates/">Pandas Remove Duplicates</a> appeared first on <a href="https://vijay-gokarn.com">Vijay Gokarn</a>.</p>
]]></content:encoded>
					
		
		
		<post-id xmlns="com-wordpress:feed-additions:1">119</post-id>	</item>
	</channel>
</rss>
