Building a Code Dataset Pipeline from NVIDIA Nemotron-Pretraining-Code-v3 Metadata with Streaming, Pandas, and tiktoken


# Four-panel overview of the sampled metadata: languages, file extensions,
# directory depth, and repositories.
fig, ax = plt.subplots(2, 2, figsize=(14, 9))
lang_ax, ext_ax = ax[0, 0], ax[0, 1]
depth_ax, repo_ax = ax[1, 0], ax[1, 1]

# Top-left: first 12 entries of lang_counts. The slice is reversed because
# barh draws the first row at the bottom of the axis.
top_langs = lang_counts.head(12)
top_langs.iloc[::-1].plot.barh(ax=lang_ax, color="#76b900")
lang_ax.set_title("Top 12 languages (sample)")
lang_ax.set_xlabel("files")

# Top-right: the 12 most frequent values of the "ext" column, most frequent
# bar on top after the reversal.
top_exts = df["ext"].value_counts().head(12)
top_exts.iloc[::-1].plot.barh(ax=ext_ax, color="#5b8def")
ext_ax.set_title("Top 12 file extensions (sample)")
ext_ax.set_xlabel("files")

# Bottom-left: histogram of the "depth" column. Values above 12 are clipped
# to 12 so they all fall into the last bin instead of stretching the x-axis.
clipped_depth = df["depth"].clip(upper=12)
clipped_depth.plot.hist(
    bins=range(0, 14),
    ax=depth_ax,
    color="#f4a261",
    edgecolor="white",
)
depth_ax.set_title("Directory nesting depth")
depth_ax.set_xlabel("'/' count in path")

# Bottom-right: the 10 most frequent values of the "repo" column.
top_repos = df["repo"].value_counts().head(10)
top_repos.iloc[::-1].plot.barh(ax=repo_ax, color="#9b5de5")
repo_ax.set_title("Most common repos (sample)")
repo_ax.set_xlabel("files")

plt.tight_layout()
plt.show()