from datasets import list_datasets
Understanding Huggingface datasets
nlp
    huggingface
  
    This blog will dive into huggingface datasets
  
Get the list of all available datasets from Hugging Face
# Fetch the identifier of every dataset hosted on the Hugging Face Hub
# (returns a list of dataset-name strings).
all_datasets = list_datasets()
len(all_datasets), all_datasets[:10](14199,
 ['acronym_identification',
  'ade_corpus_v2',
  'adversarial_qa',
  'aeslc',
  'afrikaans_ner_corpus',
  'ag_news',
  'ai2_arc',
  'air_dialogue',
  'ajgt_twitter_ar',
  'allegro_reviews'])Load a dataset from huggingface
from datasets import load_dataset
Load the whole dataset
# Download (or reuse the local cache of) the full 'emotion' dataset;
# returns a DatasetDict with 'train', 'validation' and 'test' splits.
emotions = load_dataset('emotion')
emotionsUsing custom data configuration default
Reusing dataset emotion (/Users/youfeng/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})Load part of the dataset
# Load only the training split; returns a single Dataset (not a DatasetDict).
emotions = load_dataset('emotion', split='train')
emotionsUsing custom data configuration default
Reusing dataset emotion (/Users/youfeng/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})emotions = load_dataset('emotion', split=['validation', 'test'])
emotionsUsing custom data configuration default
Reusing dataset emotion (/Users/youfeng/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)[Dataset({
     features: ['text', 'label'],
     num_rows: 2000
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 2000
 })]Load a csv file from remote or local
import pandas as pd
import numpy as np

# Remote locations of the train/test CSV files.
trn_csv = 'https://raw.githubusercontent.com/youfenglab/nlp-with-huggingface/master/data/train.csv'
# BUG FIX: the test URL previously pointed at train.csv (copy-paste error),
# so the "test" data silently duplicated the training data.
tst_csv = 'https://raw.githubusercontent.com/youfenglab/nlp-with-huggingface/master/data/test.csv'

# Read the training CSV into a DataFrame for a first look at the columns.
trn_df = pd.read_csv(trn_csv)
# The first two columns are 'text_id' and 'full_text'; everything after
# them is a score/label column.
label_cols = trn_df.columns[2:]
label_colsIndex(['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar',
       'conventions'],
      dtype='object')trn_ds = load_dataset("csv", data_files=trn_csv) # or several files []
trn_dsUsing custom data configuration default-8f5d5f2de1b27b24
Reusing dataset csv (/Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)DatasetDict({
    train: Dataset({
        features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
        num_rows: 3911
    })
})
Tip: Load several files
data_files= [file1, file2 ...], but the columns of the files should be the same, otherwise you will get an error.
If you want to split the training set into train and validation parts, you can do it as below
# Randomly carve a hold-out set from the training split; the result is a
# DatasetDict with 'train' (80%) and 'test' (20%) keys.
ds = trn_ds['train'].train_test_split(test_size=0.2) # Here we use 20% samples as validation part
dsDatasetDict({
    train: Dataset({
        features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
        num_rows: 3128
    })
    test: Dataset({
        features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
        num_rows: 783
    })
})If you want to use param “stratify” like scikit-learn, you can do it as below
Warning
Note: we need to convert the column into a ClassLabel column, since in this case the label columns are currently Value columns. Otherwise, we will get an error like this:
ValueError: Stratifying by column is only supported for ClassLabel column, and column cohesion is Value.
Let's say we want to use the cohesion column.
trn_dsDatasetDict({
    train: Dataset({
        features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
        num_rows: 3911
    })
})ds = trn_ds.class_encode_column('cohesion')
ds['train'].featuresLoading cached processed dataset at /Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-8be23562eeb71b18.arrow
Loading cached processed dataset at /Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-b4dba551c432230d.arrow
Loading cached processed dataset at /Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-3e6fbdaa550cb114.arrow{'text_id': Value(dtype='string', id=None),
 'full_text': Value(dtype='string', id=None),
 'cohesion': ClassLabel(num_classes=9, names=['1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0'], id=None),
 'syntax': Value(dtype='float64', id=None),
 'vocabulary': Value(dtype='float64', id=None),
 'phraseology': Value(dtype='float64', id=None),
 'grammar': Value(dtype='float64', id=None),
 'conventions': Value(dtype='float64', id=None)}ds = ds['train'].train_test_split(test_size=0.2, stratify_by_column="cohesion")
dsDatasetDict({
    train: Dataset({
        features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
        num_rows: 3128
    })
    test: Dataset({
        features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
        num_rows: 783
    })
})