from datasets import list_datasets
Understanding Huggingface datasets
nlp
huggingface
This blog will dive into huggingface datasets
Get the list of all available datasets from huggingface
all_datasets = list_datasets()
len(all_datasets), all_datasets[:10]
(14199,
['acronym_identification',
'ade_corpus_v2',
'adversarial_qa',
'aeslc',
'afrikaans_ner_corpus',
'ag_news',
'ai2_arc',
'air_dialogue',
'ajgt_twitter_ar',
'allegro_reviews'])
Load a dataset from huggingface
from datasets import load_dataset
Load the whole dataset
emotions = load_dataset('emotion')
emotions
Using custom data configuration default
Reusing dataset emotion (/Users/youfeng/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)
DatasetDict({
train: Dataset({
features: ['text', 'label'],
num_rows: 16000
})
validation: Dataset({
features: ['text', 'label'],
num_rows: 2000
})
test: Dataset({
features: ['text', 'label'],
num_rows: 2000
})
})
Load part of the dataset
emotions = load_dataset('emotion', split='train')
emotions
Using custom data configuration default
Reusing dataset emotion (/Users/youfeng/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)
Dataset({
features: ['text', 'label'],
num_rows: 16000
})
emotions = load_dataset('emotion', split=['validation', 'test'])
emotions
Using custom data configuration default
Reusing dataset emotion (/Users/youfeng/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)
[Dataset({
features: ['text', 'label'],
num_rows: 2000
}),
Dataset({
features: ['text', 'label'],
num_rows: 2000
})]
Load a csv file from a remote or local source
import pandas as pd
import numpy as np
trn_csv = 'https://raw.githubusercontent.com/youfenglab/nlp-with-huggingface/master/data/train.csv'
tst_csv = 'https://raw.githubusercontent.com/youfenglab/nlp-with-huggingface/master/data/train.csv'
trn_df = pd.read_csv(trn_csv)
label_cols = trn_df.columns[2:]
label_cols
Index(['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar',
'conventions'],
dtype='object')
= load_dataset("csv", data_files=trn_csv) # or several files []
trn_ds trn_ds
Using custom data configuration default-8f5d5f2de1b27b24
Reusing dataset csv (/Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)
DatasetDict({
train: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 3911
})
})
Tip: To load several files, pass
data_files=[file1, file2, ...]
, but the columns of the files should be the same, otherwise you will get an error.
If you want to split the training set into train and validation parts, you can do it as below
ds = trn_ds['train'].train_test_split(test_size=0.2)  # Here we use 20% of the samples as the validation part
ds
DatasetDict({
train: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 3128
})
test: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 783
})
})
If you want to use a “stratify”
parameter like in scikit-learn, you can do it as below
Warning
Note: we need to convert the column into a ClassLabel
column, since in this case the label columns are currently Value
columns. Otherwise, we will get an error like this:
ValueError: Stratifying by column is only supported for ClassLabel column, and column cohesion is Value.
Let's say we want to use the cohesion
column.
trn_ds
DatasetDict({
train: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 3911
})
})
ds = trn_ds.class_encode_column('cohesion')
ds['train'].features
Loading cached processed dataset at /Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-8be23562eeb71b18.arrow
Loading cached processed dataset at /Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-b4dba551c432230d.arrow
Loading cached processed dataset at /Users/youfeng/.cache/huggingface/datasets/csv/default-8f5d5f2de1b27b24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-3e6fbdaa550cb114.arrow
{'text_id': Value(dtype='string', id=None),
'full_text': Value(dtype='string', id=None),
'cohesion': ClassLabel(num_classes=9, names=['1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0'], id=None),
'syntax': Value(dtype='float64', id=None),
'vocabulary': Value(dtype='float64', id=None),
'phraseology': Value(dtype='float64', id=None),
'grammar': Value(dtype='float64', id=None),
'conventions': Value(dtype='float64', id=None)}
ds = ds['train'].train_test_split(test_size=0.2, stratify_by_column="cohesion")
ds
DatasetDict({
train: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 3128
})
test: Dataset({
features: ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
num_rows: 783
})
})