Skip to content

Readers

MemoryReader

Bases: Reader

Reader for a list of dicts already in memory.

Source code in ckanext/versioned_datastore/lib/importing/readers.py
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
class MemoryReader(Reader):
    """
    Reader for a list of dicts already in memory.
    """

    def __init__(self, source: List[dict]):
        """
        :param source: a list of dicts, one dict per record
        """
        self.source = source

    def get_name(self) -> str:
        """
        :returns: a fixed name for this reader (for logging)
        """
        return 'Memory reader'

    def get_fields(self) -> List[str]:
        """
        Return the fields present in the source dicts. The fields will be returned in
        the order they are found in the source from first dict to last. This ensures the
        ordering is representative of the source.

        :returns: a list of field names
        """
        # dict keys keep first-insertion order and ignore repeats, so this dedupes
        # with a hash lookup per field instead of scanning a growing list each time
        return list(
            dict.fromkeys(
                field for record_data in self.source for field in record_data
            )
        )

    def read(self) -> Iterable[dict]:
        """
        Yields the source dicts unchanged, in source order.

        :returns: yields dicts
        """
        yield from self.source

    def get_count(self) -> int:
        """
        :returns: the number of dicts in the source list
        """
        return len(self.source)

__init__(source)

Parameters:

Name Type Description Default
source List[dict]

a list of dicts

required
Source code in ckanext/versioned_datastore/lib/importing/readers.py
360
361
362
363
364
def __init__(self, source: List[dict]):
    """
    Keep a reference to the in-memory records this reader will serve.

    :param source: a list of dicts, one dict per record
    """
    self.source = source

get_fields()

Return the fields present in the source dicts. The fields will be returned in the order they are found in the source from first dict to last. This ensures the ordering is representative of the source.

Returns:

Type Description
List[str]

a list of field names

Source code in ckanext/versioned_datastore/lib/importing/readers.py
369
370
371
372
373
374
375
376
377
378
379
380
381
382
def get_fields(self) -> List[str]:
    """
    Return the fields present in the source dicts. The fields will be returned in
    the order they are found in the source from first dict to last. This ensures the
    ordering is representative of the source.

    :returns: a list of field names
    """
    # dict keys keep first-insertion order and ignore repeats, so this dedupes with
    # a hash lookup per field instead of scanning a growing list each time
    return list(
        dict.fromkeys(field for record_data in self.source for field in record_data)
    )

NoCandidateFileFoundInZip

Bases: Exception

Raised when there is no file found in the zip which we can ingest.

Source code in ckanext/versioned_datastore/lib/importing/readers.py
391
392
393
394
395
396
397
class NoCandidateFileFoundInZip(Exception):
    """
    Error raised when a zip contains no file that we are able to ingest.
    """

    def __init__(self):
        # the message is fixed, callers don't provide any detail
        message = 'No candidate file was found in the zip file'
        super().__init__(message)

Reader

Bases: ABC

Abstract base class for reader implementations.

Source code in ckanext/versioned_datastore/lib/importing/readers.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
class Reader(abc.ABC):
    """
    The interface every reader implementation must provide.
    """

    @abc.abstractmethod
    def get_name(self) -> str:
        """
        Provides a name for this Reader instance, used for logging.

        :returns: the name
        """

    @abc.abstractmethod
    def get_fields(self) -> List[str]:
        """
        Provides the field names found in the source this Reader is reading.

        :returns: a list of field names
        """

    @abc.abstractmethod
    def read(self) -> Iterable[dict]:
        """
        Reads the data from the source, yielding one dict for each row found.

        :returns: yields dicts
        """

    @abc.abstractmethod
    def get_count(self) -> int:
        """
        Provides the number of rows in the source.

        :returns: the row count as an integer
        """

get_count() abstractmethod

Returns the number of rows in the source.

Returns:

Type Description
int

an integer representing the number of rows

Source code in ckanext/versioned_datastore/lib/importing/readers.py
176
177
178
179
180
181
182
183
@abc.abstractmethod
def get_count(self) -> int:
    """
    Provides the number of rows in the source.

    :returns: the row count as an integer
    """

get_fields() abstractmethod

Returns the list of the field names found in the source.

Returns:

Type Description
List[str]

the fields in the source this Reader is reading

Source code in ckanext/versioned_datastore/lib/importing/readers.py
158
159
160
161
162
163
164
165
@abc.abstractmethod
def get_fields(self) -> List[str]:
    """
    Provides the field names found in the source this Reader is reading.

    :returns: a list of field names
    """

get_name() abstractmethod

Returns:

Type Description
str

a name for the Reader instance (for logging)

Source code in ckanext/versioned_datastore/lib/importing/readers.py
151
152
153
154
155
156
@abc.abstractmethod
def get_name(self) -> str:
    """
    Provides a name for this Reader instance, used for logging.

    :returns: the name
    """

read() abstractmethod

Actually reads the data from the source and yields dicts for each row found.

Returns:

Type Description
Iterable[dict]

yields dicts

Source code in ckanext/versioned_datastore/lib/importing/readers.py
167
168
169
170
171
172
173
174
@abc.abstractmethod
def read(self) -> Iterable[dict]:
    """
    Reads the data from the source, yielding one dict for each row found.

    :returns: yields dicts
    """

ReaderNotFound

Bases: Exception

Exception indicating that no reader could be found for the provided file format.

Source code in ckanext/versioned_datastore/lib/importing/readers.py
21
22
23
24
25
26
27
28
class ReaderNotFound(Exception):
    """
    Error raised when none of the readers can handle the provided file format.
    """

    def __init__(self, fmt: str):
        """
        :param fmt: the format that no reader matched
        """
        # keep the format on the exception so callers can inspect it
        self.fmt = fmt
        message = f"No reader matched the format '{fmt}'"
        super().__init__(message)

SVReader

Bases: Reader

Class for reading csv and tsv files.

Source code in ckanext/versioned_datastore/lib/importing/readers.py
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
class SVReader(Reader):
    """
    Class for reading csv and tsv files.
    """

    def __init__(self, source: Path):
        """
        :param source: the source file path
        """
        self.source = source
        self.encoding = detect_encoding(self.source)
        with self.source.open(encoding=self.encoding, newline='') as f:
            # instead of relying on people to correctly declare the dialect they are
            # using (from experience people are awful at this), sniff it for ourselves
            self.dialect = csv.Sniffer().sniff(f.readline())
        # the row count is worked out lazily, the first time get_count is called
        self._count = None

    def get_name(self) -> str:
        """
        Creates a name for this SVReader instance which represents the dialect and
        character encoding found (useful for debugging why someone's data wasn't
        ingested as expected).

        :returns: the name
        """
        dialect_info = ', '.join(
            f'{attr}: {str(getattr(self.dialect, attr))}'
            for attr in [
                'lineterminator',
                'quoting',
                'doublequote',
                'delimiter',
                'quotechar',
                'skipinitialspace',
            ]
        )
        # escape control characters (e.g. the line terminator) so that the name stays
        # on a single, readable line
        dialect_info = dialect_info.encode('unicode_escape').decode('utf-8')
        return f'SV reader, encoding: {self.encoding}, dialect: [{dialect_info}]'

    def get_fields(self) -> List[str]:
        """
        Reads the first line of the source file and returns it as a list.

        :returns: the field names
        """
        with self.source.open(encoding=self.encoding, newline='') as f:
            return next(csv.reader(f, dialect=self.dialect))

    def read(self) -> Iterable[dict]:
        """
        Reads each line of the source file and yields each row as a dict.

        :returns: yields each row as a dict
        """
        with self.source.open(encoding=self.encoding, newline='') as f:
            yield from csv.DictReader(f, dialect=self.dialect)

    def get_count(self) -> int:
        """
        Counts the data rows in the source file. The rows are counted with the same
        csv.DictReader setup that read uses, so the header row is excluded, a quoted
        value containing line breaks counts as one row, and blank lines are not
        counted. The result is cached after the first call.

        :returns: the number of rows read will yield
        """
        if self._count is None:
            with self.source.open(encoding=self.encoding, newline='') as f:
                # counting raw lines would overcount multi-line quoted values and
                # blank lines, so count parsed records instead
                self._count = sum(
                    1 for _ in csv.DictReader(f, dialect=self.dialect)
                )
        return self._count

__init__(source)

Parameters:

Name Type Description Default
source Path

the source file path

required
Source code in ckanext/versioned_datastore/lib/importing/readers.py
191
192
193
194
195
196
197
198
199
200
201
def __init__(self, source: Path):
    """
    Works out the character encoding and the csv dialect of the source file.

    :param source: the source file path
    """
    # the row count is worked out later, on demand
    self._count = None
    self.source = source
    self.encoding = detect_encoding(self.source)
    with self.source.open(encoding=self.encoding, newline='') as f:
        # people are, from experience, awful at declaring the dialect they are
        # using correctly, so sniff it from the first line of the file ourselves
        first_line = f.readline()
        self.dialect = csv.Sniffer().sniff(first_line)

get_fields()

Reads the first line of the source file and returns it as a list.

Returns:

Type Description
List[str]

the field names

Source code in ckanext/versioned_datastore/lib/importing/readers.py
225
226
227
228
229
230
231
232
def get_fields(self) -> List[str]:
    """
    Parses the first row of the source file using the sniffed dialect and returns
    its values as the field names.

    :returns: the field names
    """
    with self.source.open(encoding=self.encoding, newline='') as f:
        rows = csv.reader(f, dialect=self.dialect)
        header = next(rows)
        return header

get_name()

Creates a name for this SVReader instance which represents the dialect and character encoding found (useful for debugging why someone's data wasn't ingested as expected).

Returns:

Type Description
str

the name

Source code in ckanext/versioned_datastore/lib/importing/readers.py
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
def get_name(self) -> str:
    """
    Builds a name for this SVReader instance describing the character encoding and
    the dialect that were detected. This is handy when debugging why someone's data
    wasn't ingested as expected.

    :returns: the name
    """
    reported_attrs = (
        'lineterminator',
        'quoting',
        'doublequote',
        'delimiter',
        'quotechar',
        'skipinitialspace',
    )
    parts = []
    for attr in reported_attrs:
        value = getattr(self.dialect, attr)
        parts.append(attr + ': ' + str(value))
    # escape control characters (e.g. the line terminator) so that the name stays on
    # a single, readable line
    escaped = ', '.join(parts).encode('unicode_escape').decode('utf-8')
    return f'SV reader, encoding: {self.encoding}, dialect: [{escaped}]'

read()

Reads each line of the source file and yields each row as a dict.

Returns:

Type Description
Iterable[dict]

yields each row as a dict

Source code in ckanext/versioned_datastore/lib/importing/readers.py
234
235
236
237
238
239
240
241
def read(self) -> Iterable[dict]:
    """
    Parses the source file using the sniffed dialect, treating the first row as the
    header, and yields every following row as a dict.

    :returns: yields each row as a dict
    """
    with self.source.open(encoding=self.encoding, newline='') as f:
        rows = csv.DictReader(f, dialect=self.dialect)
        for row in rows:
            yield row

UnidentifiedEncoding

Bases: Exception

Exception indicating that the encoding could not be identified.

Source code in ckanext/versioned_datastore/lib/importing/readers.py
132
133
134
135
136
137
138
139
140
141
142
143
class UnidentifiedEncoding(Exception):
    """
    Error raised when no working character encoding could be found for a file.
    """

    def __init__(self, detected, tested):
        """
        :param detected: the encoding that was detected for the file
        :param tested: the encodings that were tried
        """
        # keep both values on the exception so callers can inspect them
        self.detected = detected
        self.tested = tested
        message = (
            f'File encoding could not be identified. Detected encoding: {detected}. '
            f'Also tested: {tested}'
        )
        super().__init__(message)

XLSReader

Bases: Reader

Reader for XLS files (i.e. old Excel spreadsheets).

Source code in ckanext/versioned_datastore/lib/importing/readers.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
class XLSReader(Reader):
    """
    Reader for XLS files (i.e. old Excel spreadsheets).
    """

    def __init__(self, source: Path):
        """
        :param source: the file path to the source XLS file
        """
        self.source = source
        with self.source.open('rb') as f:
            book = xlrd.open_workbook(file_contents=f.read())
            # todo: currently we don't deal with multisheeted spreadsheets, we just
            #       choose the first sheet and roll with it
            sheet = book.sheet_by_index(0)
            self.header = [str(cell.value) for cell in sheet.row(0)]
            # the data rows only, i.e. every row after the header row
            self.rows = list(sheet.get_rows())[1:]

    def get_name(self) -> str:
        """
        :returns: a fixed name for this reader (for logging)
        """
        return 'XLS reader'

    def get_fields(self) -> List[str]:
        """
        :returns: the values from the first row of the sheet, as strings
        """
        return self.header

    def read(self) -> Iterable[dict]:
        """
        Yields a dict for each data row in the file's first sheet. We do some basic
        handling of types: text and number cell values are used as they are, boolean
        cell values are converted to bools, and date cell values are passed through
        in the raw form xlrd provides them. No date conversion is attempted because
        the way dates are handled in XLS files is extremely complicated, and it's
        easier just to tell people not to do it, or use a string representation that
        Splitgill can parse.

        Empty field names and cells of any other type (e.g. empty cells) are ignored.

        :returns: yields a dict per row
        """
        converters = {
            # value should be a str, just use it
            xlrd.XL_CELL_TEXT: lambda x: x,
            # value should be a float, just use it
            xlrd.XL_CELL_NUMBER: lambda x: x,
            # value should be a float and dates in xls files are crackers so just use it
            xlrd.XL_CELL_DATE: lambda x: x,
            # value should be an int, convert it to a bool
            xlrd.XL_CELL_BOOLEAN: bool,
        }
        # self.rows already excludes the header row (see __init__), so every row in
        # it is data. Slicing again here would drop the first data row and disagree
        # with get_count
        for row in self.rows:
            yield {
                field: converters[cell.ctype](cell.value)
                for field, cell in zip(self.header, row)
                # ignore empty field names and cell types we don't want to handle
                if field and cell.ctype in converters
            }

    def get_count(self) -> int:
        """
        :returns: the number of data rows (the header row is not counted)
        """
        return len(self.rows)

__init__(source)

Parameters:

Name Type Description Default
source Path

the file path to the source XLS file

required
Source code in ckanext/versioned_datastore/lib/importing/readers.py
258
259
260
261
262
263
264
265
266
267
268
269
def __init__(self, source: Path):
    """
    Loads the header and the data rows from the first sheet of the workbook.

    :param source: the file path to the source XLS file
    """
    self.source = source
    with self.source.open('rb') as f:
        contents = f.read()
        book = xlrd.open_workbook(file_contents=contents)
        # todo: currently we don't deal with multisheeted spreadsheets, we just
        #       choose the first sheet and roll with it
        sheet = book.sheet_by_index(0)
        header_row = sheet.row(0)
        self.header = [str(cell.value) for cell in header_row]
        # keep everything after the header row as the data rows
        all_rows = list(sheet.get_rows())
        self.rows = all_rows[1:]

read()

Yields a dict for each row in the file's first sheet. We do some basic handling of types to ensure text, numbers, and booleans are converted correctly. Dates are not handled because the way dates are handled in XLS files is extremely complicated, and it's easier just to tell people not to do it, or use a string representation that Splitgill can parse.

Empty field names, empty values, and values we can't convert are ignored.

Returns:

Type Description
Iterable[dict]

yields a dict per row

Source code in ckanext/versioned_datastore/lib/importing/readers.py
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
def read(self) -> Iterable[dict]:
    """
    Yields a dict for each data row in the file's first sheet. We do some basic
    handling of types: text and number cell values are used as they are, boolean cell
    values are converted to bools, and date cell values are passed through in the raw
    form xlrd provides them. No date conversion is attempted because the way dates
    are handled in XLS files is extremely complicated, and it's easier just to tell
    people not to do it, or use a string representation that Splitgill can parse.

    Empty field names and cells of any other type (e.g. empty cells) are ignored.

    :returns: yields a dict per row
    """
    converters = {
        # value should be a str, just use it
        xlrd.XL_CELL_TEXT: lambda x: x,
        # value should be a float, just use it
        xlrd.XL_CELL_NUMBER: lambda x: x,
        # value should be a float and dates in xls files are crackers so just use it
        xlrd.XL_CELL_DATE: lambda x: x,
        # value should be an int, convert it to a bool
        xlrd.XL_CELL_BOOLEAN: bool,
    }
    # self.rows already excludes the header row (the constructor slices it off), so
    # every row in it is data. Slicing again here would drop the first data row and
    # disagree with get_count
    for row in self.rows:
        yield {
            field: converters[cell.ctype](cell.value)
            for field, cell in zip(self.header, row)
            # ignore empty field names and cell types we don't want to handle
            if field and cell.ctype in converters
        }

XLSXReader

Bases: Reader

Reader for new style Excel spreadsheets.

Source code in ckanext/versioned_datastore/lib/importing/readers.py
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
class XLSXReader(Reader):
    """
    Reader for new style Excel spreadsheets.
    """

    def __init__(self, source: Path):
        """
        :param source: the path to the XLSX file
        """
        self.source = source
        with self.source.open('rb') as f:
            workbook = openpyxl.load_workbook(f, read_only=True)
            # todo: currently we don't deal with multisheeted spreadsheets, we just
            #       choose the first sheet and roll with it
            all_rows = list(workbook.worksheets[0].rows)
            self.header = [str(cell.value) for cell in all_rows[0]]
            # the data rows only, i.e. every row after the header row
            self.rows = all_rows[1:]

    def get_name(self) -> str:
        """
        :returns: a fixed name for this reader (for logging)
        """
        return 'XLSX reader'

    def get_fields(self) -> List[str]:
        """
        :returns: the values from the first row of the sheet, as strings
        """
        return self.header

    def read(self) -> Iterable[dict]:
        """
        Yields the data rows from the spreadsheet's first sheet as dicts. All type
        conversions are handled by openpyxl. Empty field names or empty cell values are
        ignored.

        :returns: a dict per row
        """
        # self.rows already excludes the header row (see __init__), so every row in
        # it is data. Slicing again here would drop the first data row and disagree
        # with get_count
        for row in self.rows:
            yield {
                field: cell.value
                for field, cell in zip(self.header, row)
                # ignore empty field names and empty cells
                if field and cell.value is not None
            }

    def get_count(self) -> int:
        """
        :returns: the number of data rows (the header row is not counted)
        """
        return len(self.rows)

__init__(source)

Parameters:

Name Type Description Default
source Path

the path to the XLSX file

required
Source code in ckanext/versioned_datastore/lib/importing/readers.py
316
317
318
319
320
321
322
323
324
325
326
327
def __init__(self, source: Path):
    """
    Loads the header and the data rows from the first sheet of the workbook.

    :param source: the path to the XLSX file
    """
    self.source = source
    with self.source.open('rb') as f:
        workbook = openpyxl.load_workbook(f, read_only=True)
        # todo: currently we don't deal with multisheeted spreadsheets, we just
        #       choose the first sheet and roll with it
        first_sheet = workbook.worksheets[0]
        sheet_rows = list(first_sheet.rows)
        header_row = sheet_rows[0]
        self.header = [str(cell.value) for cell in header_row]
        # keep everything after the header row as the data rows
        self.rows = sheet_rows[1:]

read()

Yields the rows from the spreadsheet's first sheet as dicts. All type conversions are handled by openpyxl. Empty field names or empty cell values are ignored.

Returns:

Type Description
Iterable[dict]

a dict per row

Source code in ckanext/versioned_datastore/lib/importing/readers.py
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
def read(self) -> Iterable[dict]:
    """
    Yields the data rows from the spreadsheet's first sheet as dicts. All type
    conversions are handled by openpyxl. Empty field names or empty cell values are
    ignored.

    :returns: a dict per row
    """
    # self.rows already excludes the header row (the constructor slices it off), so
    # every row in it is data. Slicing again here would drop the first data row and
    # disagree with get_count
    for row in self.rows:
        yield {
            field: cell.value
            for field, cell in zip(self.header, row)
            # ignore empty field names and empty cells
            if field and cell.value is not None
        }

ZipReader

Bases: Reader

A reader for zip files.

This reader looks in the zip and inspects the files inside in alphabetical order until it finds a file it can read, at which point it creates a reader for the file and this reader is used to fulfill the abstract base class's requirements.

Source code in ckanext/versioned_datastore/lib/importing/readers.py
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
class ZipReader(Reader):
    """
    A reader for zip files.

    This reader looks in the zip and inspects the files inside in alphabetical order
    until it finds a file it can read, at which point it creates a reader for the file
    and this reader is used to fulfill the abstract base class's requirements.
    """

    def __init__(self, source: Path):
        """
        :param source: the path to the zip file
        :raises NoCandidateFileFoundInZip: if no file in the zip could be given a
                                           reader
        """
        with zipfile.ZipFile(source) as temp_zip:
            # sort the list of files so that we maintain a consistency when reading zips
            for name in sorted(temp_zip.namelist()):
                # lowercase the extension so that e.g. DATA.CSV is matched too, just
                # as choose_reader_for_resource lowercases formats before matching
                extension = name.rsplit('.', 1)[-1].lower()
                if extension in ALL_FORMATS:
                    # NOTE(review): the extraction path is always the same, so if the
                    # extracted file is itself a zip (assuming ALL_FORMATS includes
                    # zip formats - confirm) the nested ZipReader would extract over
                    # the file it is reading
                    extracted_source = source.parent / 'zipped_source'
                    with extracted_source.open('wb') as s:
                        with temp_zip.open(name, 'r') as f:
                            shutil.copyfileobj(f, s)
                    try:
                        self.reader = choose_reader(extension, extracted_source)
                        break
                    except ReaderNotFound:
                        continue
            else:
                # the loop ended without a break, so nothing in the zip was usable
                raise NoCandidateFileFoundInZip()

    def get_name(self) -> str:
        """
        :returns: a name which includes the name of the reader chosen for the file
                  inside the zip
        """
        return f'Zip reader, using {self.reader.get_name()}'

    def get_fields(self) -> List[str]:
        """
        :returns: the fields reported by the reader chosen for the file inside the zip
        """
        return self.reader.get_fields()

    def read(self) -> Iterable[dict]:
        """
        :returns: yields the dicts produced by the reader chosen for the file inside
                  the zip
        """
        yield from self.reader.read()

    def get_count(self) -> int:
        """
        :returns: the count reported by the reader chosen for the file inside the zip
        """
        return self.reader.get_count()

__init__(source)

Parameters:

Name Type Description Default
source Path

the path to the zip file

required
Source code in ckanext/versioned_datastore/lib/importing/readers.py
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
def __init__(self, source: Path):
    """
    Goes through the files in the zip in sorted name order, extracts the first one
    with a recognised extension, and creates a reader for it.

    :param source: the path to the zip file
    :raises NoCandidateFileFoundInZip: if no file in the zip could be given a reader
    """
    with zipfile.ZipFile(source) as temp_zip:
        # sort the list of files so that we maintain a consistency when reading zips
        for name in sorted(temp_zip.namelist()):
            # lowercase the extension so that e.g. DATA.CSV is matched too, just as
            # choose_reader_for_resource lowercases formats before matching
            extension = name.rsplit('.', 1)[-1].lower()
            if extension in ALL_FORMATS:
                # NOTE(review): the extraction path is always the same, so if the
                # extracted file is itself a zip (assuming ALL_FORMATS includes zip
                # formats - confirm) the nested ZipReader would extract over the
                # file it is reading
                extracted_source = source.parent / 'zipped_source'
                with extracted_source.open('wb') as s:
                    with temp_zip.open(name, 'r') as f:
                        shutil.copyfileobj(f, s)
                try:
                    self.reader = choose_reader(extension, extracted_source)
                    break
                except ReaderNotFound:
                    continue
        else:
            # the loop ended without a break, so nothing in the zip was usable
            raise NoCandidateFileFoundInZip()

choose_reader(resource_format, source)

Chooses an appropriate reader for the provided source. If the source is a file, the resource_format is used to choose the reader; if the source is a list of dicts, the MemoryReader is used.

Parameters:

Name Type Description Default
resource_format str

the resource format

required
source Union[Path, List[dict]]

the source data, either a path to a file, or a list of dicts

required

Returns:

Type Description
Reader

a Reader instance

Source code in ckanext/versioned_datastore/lib/importing/readers.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def choose_reader(resource_format: str, source: Union[Path, List[dict]]) -> 'Reader':
    """
    Picks the reader to use for the given source. A source which isn't a file path is
    always read with the MemoryReader, otherwise the resource_format decides which
    file reader is used.

    :param resource_format: the resource format
    :param source: the source data, either a path to a file, or a list of dicts
    :returns: a Reader instance
    :raises ReaderNotFound: if the source is a file path and no reader handles the
                            given format
    """
    if not isinstance(source, Path):
        return MemoryReader(source)

    # checked in order, the first format group containing the format wins
    candidates = (
        (SV_FORMATS, SVReader),
        (XLS_FORMATS, XLSReader),
        (XLSX_FORMATS, XLSXReader),
        (ZIP_FORMATS, ZipReader),
    )
    for formats, reader_class in candidates:
        if resource_format in formats:
            return reader_class(source)
    raise ReaderNotFound(resource_format)

choose_reader_for_resource(resource, source)

Selects a reader for the given resource using the format primarily, and resource url secondarily.

Parameters:

Name Type Description Default
resource dict

the resource dict

required
source Union[Path, List[dict]]

the source data, either a path to a file, or a list of dicts

required

Returns:

Type Description
Reader

a Reader instance

Source code in ckanext/versioned_datastore/lib/importing/readers.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def choose_reader_for_resource(
    resource: dict, source: Union[Path, List[dict]]
) -> 'Reader':
    """
    Selects a reader for the given resource using the format primarily, and resource url
    secondarily. If neither provides a format, an empty format is passed on to
    choose_reader.

    :param resource: the resource dict
    :param source: the source data, either a path to a file, or a list of dicts
    :returns: a Reader instance
    """
    # start off trying to use the format they have set/CKAN has inferred. The value
    # may be missing or None, so normalise both to an empty string
    resource_format = resource.get('format') or ''
    if not resource_format:
        # if that isn't available, try and get it from the resource url (which may
        # also be missing or empty, in which case the format stays empty rather than
        # raising an error here)
        url = resource.get('url') or ''
        if url:
            resource_format = url.rsplit('.', 1)[-1]
        # todo: could try detecting based on the file? e.g. using python-magic etc
    return choose_reader(resource_format.lower(), source)

detect_encoding(source, test_encoding=True)

Given a file, attempt to detect the character encoding it uses.

Parameters:

Name Type Description Default
source Path

the path to the file

required
test_encoding bool

whether to test the detected encoding - and some common alternatives if necessary (default True)

True

Returns:

Type Description
str

the character encoding

Source code in ckanext/versioned_datastore/lib/importing/readers.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def detect_encoding(source: Path, test_encoding: bool = True) -> str:
    """
    Given a file, attempt to detect the character encoding it uses.

    :param source: the path to the file
    :param test_encoding: whether to test the detected encoding - and some common
                          alternatives if necessary (default True)
    :returns: the character encoding
    :raises UnidentifiedEncoding: if test_encoding is True and none of the encodings
                                  tried could decode the whole file
    """
    with source.open('rb') as f:
        detector = UniversalDetector()
        # feed the universal detector the entire file
        for chunk in iter(lambda: f.read(8192), b''):
            detector.feed(chunk)
        detector.close()

    encoding = detector.result['encoding']
    # if the detector failed to work out the encoding (unlikely) or if the encoding it
    # comes up with is ASCII, just default to UTF-8 (UTF-8 is a superset of ASCII).
    # The comparison ignores case so that both 'ASCII' and 'ascii' are caught
    if encoding is None or encoding.lower() == 'ascii':
        encoding = 'utf-8'

    if not test_encoding:
        return encoding

    # try these (in order) if the detected encoding doesn't work
    # NOTE(review): latin-1 can decode any byte sequence, so cp1252 is only reached
    # if opening the file with latin-1 fails for some other reason
    fallback_encodings = ['utf-8', 'latin-1', 'cp1252']
    encodings_to_test = [encoding] + [
        e for e in fallback_encodings if e != encoding.lower()
    ]
    working_encoding = None

    # reading the file multiple times is not ideal, but if the detected encoding is
    # correct this will only loop once
    for enc in encodings_to_test:
        try:
            with source.open('r', encoding=enc) as f:
                while f.read(8192):
                    pass
        except (UnicodeError, LookupError):
            # UnicodeError (which includes UnicodeDecodeError) means the file isn't
            # valid in this encoding, LookupError means Python has no codec with this
            # name. Either way, move on to the next candidate
            continue
        else:
            # we got all the way through the file without an error
            working_encoding = enc
            break

    if working_encoding is None:
        raise UnidentifiedEncoding(encoding, encodings_to_test)

    return working_encoding