[ramdisk] add cvitek pre-built ramdisk

Change-Id: Ic7d2046a23358129eaf621b5558984a64fa7361d
This commit is contained in:
sam.xiang
2023-02-23 09:56:47 +08:00
parent 4f810186ab
commit 1cf39ecdd5
12481 changed files with 1478086 additions and 0 deletions

View File

@ -0,0 +1,71 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from tabledata import (
SQLiteTableDataSanitizer,
TableData,
TableDataSanitizer,
InvalidTableNameError,
InvalidHeaderNameError,
InvalidDataError,
EmptyDataError,
)
from ._constant import PatternMatch
from ._logger import (
logger,
set_logger,
set_log_level,
)
from .csv.core import (
CsvTableFileLoader,
CsvTableTextLoader,
)
from .error import (
ValidationError,
InvalidPathError,
InvalidFilePathError,
InvalidUrlError,
OpenError,
LoaderNotFoundError,
HTTPError,
ProxyError,
PypandocImportError,
)
from .html.core import (
HtmlTableFileLoader,
HtmlTableTextLoader,
)
from .json.core import (
JsonTableFileLoader,
JsonTableTextLoader,
)
from .loadermanager import (
TableFileLoader,
TableUrlLoader,
)
from .ltsv.core import (
LtsvTableFileLoader,
LtsvTableTextLoader,
)
from .markdown.core import (
MarkdownTableFileLoader,
MarkdownTableTextLoader,
)
from .mediawiki.core import (
MediaWikiTableFileLoader,
MediaWikiTableTextLoader,
)
from .spreadsheet.excelloader import ExcelTableFileLoader
from .spreadsheet.gsloader import GoogleSheetsTableLoader
from .sqlite.core import SqliteFileLoader
from .tsv.core import (
TsvTableFileLoader,
TsvTableTextLoader,
)

View File

@ -0,0 +1,34 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
import abc
import six
@six.add_metaclass(abc.ABCMeta)
class LoaderAcceptorInterface(object):
    """
    Interface that every table loader acceptor has to implement.
    """

    @abc.abstractmethod
    def accept(self, loader):  # pragma: no cover
        """
        Receive a loader object.

        :param loader: Loader handed over to the acceptor.
        """
class LoaderAcceptor(LoaderAcceptorInterface):
    """
    Base acceptor that remembers the loader passed to :py:meth:`accept`.
    """

    def __init__(self):
        # No loader is attached until accept() is called.
        self._loader = None

    def accept(self, loader):
        """
        Keep a reference to ``loader`` in ``self._loader``.

        :param loader: Loader to remember.
        """
        self._loader = loader

View File

@ -0,0 +1,57 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import os.path
import posixpath
import pathvalidate
import typepy
from six.moves.urllib.parse import urlparse
from .error import InvalidFilePathError
def get_extension(file_path):
    """
    Return the extension of ``file_path`` without leading periods.

    :param str file_path: Path to extract the extension from.
    :return: Extension string (empty if the path has no extension).
    :rtype: str
    :raises InvalidFilePathError: If ``file_path`` is a null string.
    """
    if typepy.is_null_string(file_path):
        raise InvalidFilePathError("file path is empty")

    _root, extension = os.path.splitext(file_path)

    return extension.lstrip(".")
def convert_idx_to_alphabet(column_idx):
    """
    Convert a zero-based column index to a spreadsheet-style column name.

    ``0`` maps to ``"A"``, ``25`` to ``"Z"``, ``26`` to ``"AA"``,
    ``702`` to ``"AAA"`` and so on.

    :param int column_idx: Zero-based column index.
    :return: Column name made of upper-case ASCII letters.
    :rtype: str
    """
    if column_idx < 26:
        # 65 is the code point of "A".
        return chr(65 + column_idx)

    # Stay in integer arithmetic: ``/`` is true division on Python 3, so
    # the former ``int(column_idx / 26 - 1)`` went through float and lost
    # precision for very large indices.
    leading_part, last_digit = divmod(column_idx, 26)

    return (
        convert_idx_to_alphabet(leading_part - 1) +
        convert_idx_to_alphabet(last_digit))
def make_temp_file_path_from_url(temp_dir_path, url):
    """
    Build a file path under ``temp_dir_path`` named after the last
    path component of ``url``.

    :param str temp_dir_path: Directory the resulting path points into.
    :param str url: URL whose path component provides the file name.
    :return: ``temp_dir_path`` joined (POSIX style) with the file name.
    :rtype: str
    :raises InvalidFilePathError:
        If ``url`` is not a string, the URL has no usable path/file name,
        or ``temp_dir_path`` is not a string.
    """
    try:
        parsed_path = urlparse(url).path
    except AttributeError:
        raise InvalidFilePathError("url must be a string")

    if typepy.is_null_string(parsed_path):
        raise InvalidFilePathError("invalid URL path: {}".format(parsed_path))

    file_name = os.path.basename(parsed_path.rstrip("/"))

    # NOTE(review): symbols are replaced only when the name is already a
    # null string -- confirm whether a different input was intended here.
    if typepy.is_null_string(file_name):
        file_name = pathvalidate.replace_symbol(
            file_name, replacement_text="_")

    if typepy.is_null_string(file_name):
        raise InvalidFilePathError("invalid URL: {}".format(url))

    try:
        return posixpath.join(temp_dir_path, file_name)
    except (TypeError, AttributeError):
        raise InvalidFilePathError("temp_dir_path must be a string")

View File

@ -0,0 +1,38 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import enum
class Default(object):
    """
    Default values shared across the package.
    """

    # Encoding name used when no explicit encoding is given.
    ENCODING = "utf-8"
class SourceType(object):
    """
    Identifiers for the kinds of data source.
    """

    TEXT = "text"
    FILE = "file"
    URL = "url"
class TableNameTemplate(object):
    """
    ``%(name)s``-style format specifiers usable in table name templates.
    """

    # Wraps a key as a ``%(key)s`` mapping-format specifier.
    __FORMAT = "%({:s})s"

    (
        DEFAULT,
        FILENAME,
        FORMAT_NAME,
        FORMAT_ID,
        GLOBAL_ID,
        KEY,
        TITLE,
        SHEET,
    ) = map(__FORMAT.format, (
        "default",
        "filename",
        "format_name",
        "format_id",
        "global_id",
        "key",
        "title",
        "sheet",
    ))
@enum.unique
class PatternMatch(enum.Enum):
    """
    Enumeration with the two members ``OR`` and ``AND``.
    """

    OR = 0
    AND = 1

View File

@ -0,0 +1,119 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import abc
import dataproperty
import logbook
import six
# Package-wide logbook logger. It is disabled right after creation;
# set_logger()/set_log_level() below are the functions that enable it.
logger = logbook.Logger("pytablereader")
logger.disable()
def set_logger(is_enable):
    """
    Enable or disable logging of this module.

    The setting is forwarded to ``dataproperty`` and, when it can be
    imported, to ``simplesqlite``. Nothing happens if the logger is
    already in the requested state.

    :param bool is_enable: ``True`` to enable logging, ``False`` to disable.
    """
    # ``logger.disabled`` equals ``is_enable`` only when a switch is needed.
    if is_enable != logger.disabled:
        return

    switch = logger.enable if is_enable else logger.disable
    switch()

    dataproperty.set_logger(is_enable)

    # simplesqlite is an optional dependency.
    try:
        import simplesqlite

        simplesqlite.set_logger(is_enable)
    except ImportError:
        pass
def set_log_level(log_level):
    """
    Set the logging level of this module. Logging is done with the
    `logbook <http://logbook.readthedocs.io/en/stable/>`__ module.

    :param int log_level:
        One of the log levels of
        `logbook <http://logbook.readthedocs.io/en/stable/api/base.html>`__.
        Logging is disabled if ``log_level`` is ``logbook.NOTSET``.
    :raises LookupError: If ``log_level`` is an invalid value.
    """
    # Called only for validation: raises for an unknown level.
    logbook.get_level_name(log_level)

    if log_level == logger.level:
        return

    set_logger(is_enable=not (log_level == logbook.NOTSET))
    logger.level = log_level

    dataproperty.set_log_level(log_level)

    # simplesqlite is an optional dependency.
    try:
        import simplesqlite

        simplesqlite.set_log_level(log_level)
    except ImportError:
        pass
@six.add_metaclass(abc.ABCMeta)
class LoggerInterface(object):
    """
    Interface of the helper classes that log a loading operation.
    """

    @abc.abstractmethod
    def logging_load(self):  # pragma: no cover
        """Write a log record describing the load about to happen."""
class BaseLogger(LoggerInterface):
    """
    Common base of the loading loggers; keeps the loader to report on.
    """

    def __init__(self, loader):
        """
        :param loader: Loader whose attributes are written to the log.
        """
        self._loader = loader
class FileSourceLogger(BaseLogger):
    """
    Logs the loading of a file data source at debug level.
    """

    def logging_load(self):
        """
        Log source type, format name and path of the loader, plus its
        encoding when the loader has an ``encoding`` attribute.
        """
        loader = self._loader
        pieces = [
            "loading {:s}: format={:s}, path={}".format(
                loader.source_type, loader.format_name, loader.source),
        ]

        # Not every loader defines an encoding.
        try:
            pieces.append(", encoding={}".format(loader.encoding))
        except AttributeError:
            pass

        logger.debug("".join(pieces))
class TextSourceLogger(BaseLogger):
    """
    Logs the loading of a text data source at debug level.
    """

    def logging_load(self):
        """
        Log format name and source type of the loader, plus the length
        of the source and the encoding when they are available.
        """
        loader = self._loader
        pieces = [
            "loading {:s} {:s}".format(
                loader.format_name, loader.source_type),
        ]

        # The source may be an object without a length.
        try:
            pieces.append(", len={}".format(len(loader.source)))
        except TypeError:
            pass

        # Not every loader defines an encoding.
        try:
            pieces.append(", encoding={}".format(loader.encoding))
        except AttributeError:
            pass

        logger.debug("".join(pieces))

View File

@ -0,0 +1,107 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import abc
import os.path
from pytablereader import EmptyDataError
import six
import typepy
import pathvalidate as pv
from six.moves.urllib.parse import urlparse
from ._constant import SourceType
from .error import (
InvalidFilePathError,
InvalidUrlError
)
@six.add_metaclass(abc.ABCMeta)
class ValidatorInterface(object):
    """
    Interface of data source validators.
    """

    @abc.abstractproperty
    def source_type(self):
        """Kind of data source the validator is responsible for."""

    @abc.abstractmethod
    def validate(self):
        """Check the data source."""
class BaseValidator(ValidatorInterface):
    """
    Abstract base of the data source validators; stores the source.
    """

    def __init__(self, source):
        """
        :param source: Data source to validate.
        """
        self.__source = source

    @property
    def source(self):
        """Data source passed to the constructor (read-only)."""
        return self.__source
class FileValidator(BaseValidator):
    """
    Validator for file data sources.
    """

    @property
    def source_type(self):
        return SourceType.FILE

    def validate(self):
        """
        Check that the source is a valid path to an existing file.

        :raises InvalidFilePathError:
            If the path is empty or rejected by ``pathvalidate``.
        :raises IOError: If the path does not point to a file.
        """
        file_path = self.source

        try:
            pv.validate_file_path(file_path)
        except pv.NullNameError:
            raise InvalidFilePathError("file path is empty")
        except (ValueError, pv.InvalidCharError, pv.InvalidLengthError) as e:
            raise InvalidFilePathError(e)

        if os.path.isfile(file_path):
            return

        raise IOError("file not found")
class TextValidator(BaseValidator):
    """
    Validator for text data sources.
    """

    @property
    def source_type(self):
        return SourceType.TEXT

    def validate(self):
        """
        Check that the source text is not a null string.

        :raises EmptyDataError: If the source is a null string.
        """
        text = self.source

        if typepy.is_null_string(text):
            raise EmptyDataError("data source is empty")
class UrlValidator(BaseValidator):
    """
    Validator for URL data sources.
    """

    @property
    def source_type(self):
        return SourceType.URL

    def validate(self):
        """
        Check that the source is a non-empty http/https URL.

        :raises InvalidUrlError:
            If the URL is a null string or its scheme is neither
            ``http`` nor ``https``.
        """
        url = self.source

        if typepy.is_null_string(url):
            raise InvalidUrlError("url is empty")

        scheme = urlparse(url).scheme
        if scheme in ("http", "https"):
            return

        raise InvalidUrlError(
            "invalid scheme: expected=http/https, actual={}".format(scheme))

View File

@ -0,0 +1,244 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import csv
import io
import platform
from mbstrdecoder import MultiByteStrDecoder
from pytablereader import InvalidDataError
import six
import typepy
from .._constant import (
Default,
TableNameTemplate as tnt,
)
from .._logger import (
FileSourceLogger,
TextSourceLogger,
)
from .._validator import (
FileValidator,
TextValidator
)
from ..interface import TableLoader
from .formatter import CsvTableFormatter
class CsvTableLoader(TableLoader):
    """
    Abstract base class of the CSV table loaders.

    .. py:attribute:: header_list

        Attribute names of the table. The first line of the CSV data
        is used as the attribute list if ``header_list`` is empty.

    .. py:attribute:: delimiter

        A one-character string used to separate fields.
        Defaults to ``","``.

    .. py:attribute:: quotechar

        A one-character string used to quote fields containing
        special characters, such as the ``delimiter`` or ``quotechar``,
        or which contain new-line characters.
        Defaults to ``'"'``.

    .. py:attribute:: encoding

        Encoding of the CSV data.
    """

    def __init__(self, source):
        super(CsvTableLoader, self).__init__(source)

        self._csv_reader = None

        self.header_list = ()
        self.delimiter = ","
        self.quotechar = '"'
        self.encoding = Default.ENCODING

    @property
    def format_name(self):
        return "csv"

    @property
    def delimiter(self):
        # Returned as ``str`` (not unicode) on purpose.
        decoder = MultiByteStrDecoder(self.__delimiter)
        return str(decoder.unicode_str)

    @delimiter.setter
    def delimiter(self, value):
        self.__delimiter = value

    @property
    def quotechar(self):
        # Returned as ``str`` (not unicode) on purpose.
        decoder = MultiByteStrDecoder(self.__quotechar)
        return str(decoder.unicode_str)

    @quotechar.setter
    def quotechar(self, value):
        self.__quotechar = value

    def _to_data_matrix(self):
        """
        Read every non-empty row from ``self._csv_reader`` and convert
        each item with ``__modify_item``.

        :return: List of converted rows.
        :rtype: list
        :raises InvalidDataError: If the ``csv`` module reports an error.
        """
        matrix = []

        try:
            for row in self._csv_reader:
                if not typepy.is_not_empty_sequence(row):
                    continue
                matrix.append([self.__modify_item(item) for item in row])
        except csv.Error as e:
            raise InvalidDataError(e)

        return matrix

    @staticmethod
    def __modify_item(data):
        """
        Convert ``data`` to an integer if possible, otherwise to a real
        number, otherwise to a unicode string.
        """
        for number_type in (typepy.type.Integer, typepy.type.RealNumber):
            try:
                return number_type(data).convert()
            except typepy.TypeConversionError:
                continue

        return MultiByteStrDecoder(data).unicode_str
class CsvTableFileLoader(CsvTableLoader):
    """
    A file loader class to extract tabular data from CSV files.

    :param str file_path: Path to the loading CSV file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s``.

    :Examples:
        :ref:`example-csv-table-loader`
    """

    def __init__(self, file_path):
        super(CsvTableFileLoader, self).__init__(file_path)

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a CSV file.
        |load_source_desc_file|

        :return:
            Loaded table data.
            |load_table_name_desc|

            ===================  ========================================
            Format specifier     Value after the replacement
            ===================  ========================================
            ``%(filename)s``     |filename_desc|
            ``%(format_name)s``  ``"csv"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the CSV data is invalid.

        .. seealso::
            :py:func:`csv.reader`
        """
        self._validate()
        self._logger.logging_load()

        # NOTE(review): ``encoding`` is only honoured on Windows/Python 3;
        # every other platform uses the built-in ``open`` defaults.
        # Kept as is -- confirm whether that is intended.
        if all([platform.system() == "Windows", six.PY3]):
            fp = io.open(self.source, "r", encoding=self.encoding)
        else:
            fp = open(self.source, "r")

        # The rows are read eagerly, so the file can (and must) be closed
        # before the formatter is created; previously the handle was
        # never closed.
        with fp:
            self._csv_reader = csv.reader(
                fp,
                delimiter=self.delimiter, quotechar=self.quotechar,
                strict=True, skipinitialspace=True)
            data_matrix = self._to_data_matrix()

        formatter = CsvTableFormatter(data_matrix)
        formatter.accept(self)

        return formatter.to_table_data()

    def _get_default_table_name_template(self):
        return tnt.FILENAME
class CsvTableTextLoader(CsvTableLoader):
    """
    A text loader class to extract tabular data from CSV text data.

    :param str text: CSV text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.

    :Examples:
        :ref:`example-csv-table-loader`
    """

    def __init__(self, text):
        super(CsvTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a CSV text
        object.
        |load_source_desc_text|

        :return:
            Loaded table data.
            |load_table_name_desc|

            ===================  ========================================
            Format specifier     Value after the replacement
            ===================  ========================================
            ``%(filename)s``     ``""``
            ``%(format_name)s``  ``"csv"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the CSV data is invalid.

        .. seealso::
            :py:func:`csv.reader`
        """
        self._validate()
        self._logger.logging_load()

        # Surrounding whitespace of the text is dropped before parsing.
        text_stream = six.StringIO(self.source.strip())
        self._csv_reader = csv.reader(
            text_stream,
            delimiter=self.delimiter, quotechar=self.quotechar,
            strict=True, skipinitialspace=True)

        table_formatter = CsvTableFormatter(self._to_data_matrix())
        table_formatter.accept(self)

        return table_formatter.to_table_data()

    def _get_default_table_name_template(self):
        return "".join((tnt.FORMAT_NAME, tnt.FORMAT_ID))

View File

@ -0,0 +1,44 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
from pytablereader import InvalidDataError
from tabledata import TableData
import typepy
from ..formatter import TableFormatter
class CsvTableFormatter(TableFormatter):
    """
    Formatter that turns a CSV data matrix into |TableData|.
    """

    def to_table_data(self):
        """
        Generate a single |TableData| from the source data matrix.

        The first row is used as the header if the loader has no
        ``header_list``; otherwise every row is treated as data.

        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If a header taken from the first row is a null string, or
            if no data rows remain.
        """
        loader = self._loader
        use_first_row_as_header = typepy.is_empty_sequence(loader.header_list)

        if use_first_row_as_header:
            header_list = self._source_data[0]

            if any(typepy.is_null_string(name) for name in header_list):
                raise InvalidDataError(
                    "the first line includes empty string item."
                    "all of the items should contain header name."
                    "actual={}".format(header_list))

            data_matrix = self._source_data[1:]
        else:
            header_list = loader.header_list
            data_matrix = self._source_data

        if not data_matrix:
            raise InvalidDataError(
                "data row must be greater or equal than one")

        loader.inc_table_count()

        yield TableData(
            loader.make_table_name(), header_list, data_matrix,
            quoting_flags=loader.quoting_flags)

View File

@ -0,0 +1,71 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
import requests
class ValidationError(Exception):
    """
    Raised when data does not have the expected format.
    """
class InvalidPathError(Exception):
    """
    Base class of the path-related exceptions.
    """
class InvalidFilePathError(InvalidPathError):
    """
    Raised when an invalid file path is used.
    """
class InvalidUrlError(InvalidPathError):
    """
    Raised when an invalid URL is used.
    """
class OpenError(IOError):
    """
    Raised when a file could not be opened.
    """
class LoaderNotFoundError(Exception):
    """
    Raised when no appropriate loader can be found.
    """
class HTTPError(requests.RequestException):
    """
    Raised when an HTTP error occurred.

    .. seealso::

        http://docs.python-requests.org/en/master/api/#exceptions
    """
class ProxyError(requests.exceptions.ProxyError):
    """
    Raised when a proxy error occurred.

    .. seealso::

        http://docs.python-requests.org/en/master/_modules/requests/exceptions/
    """
class PypandocImportError(ImportError):
    """
    Raised when importing the pypandoc package failed.
    """

View File

@ -0,0 +1,10 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from ._file import TableFileLoaderFactory
from ._url import TableUrlLoaderFactory

View File

@ -0,0 +1,113 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import abc
import six
from .._constant import Default
from ..error import LoaderNotFoundError
@six.add_metaclass(abc.ABCMeta)
class BaseTableLoaderFactory(object):
    """
    Abstract base of the factories that create table loaders.

    :param source: Data source handed to the created loader.
    :param str encoding:
        Encoding assigned to the created loader. A falsy value is
        replaced by the default encoding.
    """

    def __init__(self, source, encoding=Default.ENCODING):
        self._source = source
        self._encoding = encoding if encoding else Default.ENCODING

    @property
    def source(self):
        """
        :return: Data source to load.
        :rtype: str
        """
        return self._source

    @abc.abstractmethod
    def create_from_path(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def create_from_format_name(self, format_name):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_extension_loader_mapping(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_format_name_loader_mapping(self):  # pragma: no cover
        pass

    def get_format_name_list(self):
        """
        :return: Available format names.
        :rtype: list
        """
        format_name_mapping = self._get_format_name_loader_mapping()

        return sorted(format_name_mapping)

    def get_extension_list(self):
        """
        :return: Available format file extensions.
        :rtype: list
        """
        extension_mapping = self._get_extension_loader_mapping()

        return sorted(extension_mapping)

    def _get_loader_class(self, loader_mapping, format_name):
        """
        Look up the loader class registered for ``format_name``
        (case insensitive) in ``loader_mapping``.

        :raises TypeError: If ``format_name`` is not a string.
        :raises LoaderNotFoundError: If no loader is registered.
        """
        try:
            lookup_key = format_name.lower()
        except AttributeError:
            raise TypeError("format name must be a string")

        try:
            return loader_mapping[lookup_key]
        except KeyError:
            message_parts = [
                "loader not found: format='{}'".format(lookup_key),
                "source='{}'".format(self.source),
            ]
            raise LoaderNotFoundError(", ".join(message_parts))

    def _create_from_extension(self, extension):
        """
        Create a loader for ``extension`` and assign the encoding to it.

        :raises LoaderNotFoundError: If the extension is unknown.
        """
        try:
            loader_class = self._get_loader_class(
                self._get_extension_loader_mapping(), extension)
            loader = loader_class(self.source)
            loader.encoding = self._encoding

            return loader
        except LoaderNotFoundError as e:
            # Re-raise with the list of supported extensions appended.
            message_lines = [
                "{:s} (unknown extension).".format(e.args[0]),
                "",
                "acceptable extensions are: {}.".format(
                    ", ".join(self.get_extension_list())),
                "actual: '{}'".format(extension),
            ]
            raise LoaderNotFoundError("\n".join(message_lines))

    def _create_from_format_name(self, format_name):
        """
        Create a loader for ``format_name`` and assign the encoding to it.

        :raises LoaderNotFoundError: If the format name is unknown.
        """
        try:
            loader_class = self._get_loader_class(
                self._get_format_name_loader_mapping(), format_name)
            loader = loader_class(self.source)
            loader.encoding = self._encoding

            return loader
        except LoaderNotFoundError as e:
            # Re-raise with the list of supported format names appended.
            message_lines = [
                "{:s} (unknown format name).".format(e.args[0]),
                "acceptable format names are: {}.".format(
                    ", ".join(self.get_format_name_list())),
            ]
            raise LoaderNotFoundError("\n".join(message_lines))

View File

@ -0,0 +1,143 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
from .._common import get_extension
from .._logger import logger
from ..csv.core import CsvTableFileLoader
from ..html.core import HtmlTableFileLoader
from ..json.core import JsonTableFileLoader
from ..ltsv.core import LtsvTableFileLoader
from ..markdown.core import MarkdownTableFileLoader
from ..mediawiki.core import MediaWikiTableFileLoader
from ..spreadsheet.excelloader import ExcelTableFileLoader
from ..sqlite.core import SqliteFileLoader
from ..tsv.core import TsvTableFileLoader
from ._base import BaseTableLoaderFactory
class TableFileLoaderFactory(BaseTableLoaderFactory):
    """
    Factory that creates a file loader for a file path.

    :param str file_path: Path to the loading file.
    :raises pytablereader.InvalidFilePathError:
        If the ``file_path`` is an empty path.
    """

    @property
    def file_extension(self):
        """
        :return: File extension of the :py:attr:`.source` (without period).
        :rtype: str
        """
        return get_extension(self.source)

    def create_from_path(self):
        """
        Create a file loader from the file extension to loading file.
        Supported file extensions are as follows:

            ==========================  =====================================
            Format name                 Loader
            ==========================  =====================================
            ``"csv"``                   :py:class:`~.CsvTableFileLoader`
            ``"xls"``/``"xlsx"``        :py:class:`~.ExcelTableFileLoader`
            ``"htm"``/``"html"``        :py:class:`~.HtmlTableFileLoader`
            ``"json"``                  :py:class:`~.JsonTableFileLoader`
            ``"ltsv"``                  :py:class:`~.LtsvTableFileLoader`
            ``"md"``                    :py:class:`~.MarkdownTableFileLoader`
            ``"sqlite"``/``"sqlite3"``  :py:class:`~.SqliteFileLoader`
            ``"tsv"``                   :py:class:`~.TsvTableFileLoader`
            ==========================  =====================================

        :return:
            Loader that coincides with the file extension of the
            :py:attr:`.file_extension`.
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| loading the file.
        """
        extension = self.file_extension

        logger.debug(
            "TableFileLoaderFactory.create_from_path: extension={}".format(
                extension))

        return self._create_from_extension(extension)

    def create_from_format_name(self, format_name):
        """
        Create a file loader from a format name.
        Supported file formats are as follows:

            ===============  ======================================
            Format name      Loader
            ===============  ======================================
            ``"csv"``        :py:class:`~.CsvTableFileLoader`
            ``"excel"``      :py:class:`~.ExcelTableFileLoader`
            ``"html"``       :py:class:`~.HtmlTableFileLoader`
            ``"json"``       :py:class:`~.JsonTableFileLoader`
            ``"ltsv"``       :py:class:`~.LtsvTableFileLoader`
            ``"markdown"``   :py:class:`~.MarkdownTableFileLoader`
            ``"mediawiki"``  :py:class:`~.MediaWikiTableFileLoader`
            ``"sqlite"``     :py:class:`~.SqliteFileLoader`
            ``"tsv"``        :py:class:`~.TsvTableFileLoader`
            ===============  ======================================

        :param str format_name: Format name string (case insensitive).
        :return: Loader that coincides with the ``format_name``:
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| the format.
        """
        # Log the requested format name. The file extension was logged
        # here before, which reported the wrong value and could raise
        # from the log statement for an empty source.
        logger.debug(
            "TableFileLoaderFactory.create_from_format_name: name={}".format(
                format_name))

        return self._create_from_format_name(format_name)

    @staticmethod
    def _get_common_loader_mapping():
        # Keys valid both as a file extension and as a format name.
        return {
            "csv": CsvTableFileLoader,
            "html": HtmlTableFileLoader,
            "json": JsonTableFileLoader,
            "ltsv": LtsvTableFileLoader,
            "sqlite": SqliteFileLoader,
            "tsv": TsvTableFileLoader,
        }

    def _get_extension_loader_mapping(self):
        """
        :return: Mappings of format extension and loader class.
        :rtype: dict
        """
        loader_table = self._get_common_loader_mapping()
        loader_table.update({
            "htm": HtmlTableFileLoader,
            "md": MarkdownTableFileLoader,
            "sqlite3": SqliteFileLoader,
            "xlsx": ExcelTableFileLoader,
            "xls": ExcelTableFileLoader,
        })

        return loader_table

    def _get_format_name_loader_mapping(self):
        """
        :return: Mappings of format name and loader class.
        :rtype: dict
        """
        loader_table = self._get_common_loader_mapping()
        loader_table.update({
            "excel": ExcelTableFileLoader,
            "markdown": MarkdownTableFileLoader,
            "mediawiki": MediaWikiTableFileLoader,
        })

        return loader_table

View File

@ -0,0 +1,224 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import os
import tempfile
import requests
import typepy
from six.moves.urllib.parse import urlparse
from .._common import (
get_extension,
make_temp_file_path_from_url,
)
from .._constant import SourceType
from .._logger import logger
from .._validator import UrlValidator
from ..csv.core import CsvTableTextLoader
from ..error import (
InvalidFilePathError,
InvalidUrlError,
HTTPError,
ProxyError,
)
from ..html.core import HtmlTableTextLoader
from ..json.core import JsonTableTextLoader
from ..ltsv.core import LtsvTableTextLoader
from ..markdown.core import MarkdownTableTextLoader
from ..mediawiki.core import MediaWikiTableTextLoader
from ..spreadsheet.excelloader import ExcelTableFileLoader
from ..sqlite.core import SqliteFileLoader
from ..tsv.core import TsvTableTextLoader
from ._base import BaseTableLoaderFactory
class TableUrlLoaderFactory(BaseTableLoaderFactory):
    """
    Factory that fetches table data from a URL and creates a loader
    for the fetched data.

    :param str url: URL of the data to load (http/https).
    :param str encoding:
        Encoding of the data. The encoding reported by the HTTP response
        is used if this is a null string.
    :param dict proxies: Passed to ``requests.get`` as ``proxies``.
    :raises pytablereader.InvalidUrlError:
        If the ``url`` is empty or its scheme is not http/https.
    """

    def __init__(self, url, encoding=None, proxies=None):
        super(TableUrlLoaderFactory, self).__init__(None)

        self.__url = url
        self.__proxies = proxies
        self.__temp_dir_path = None

        # Overrides the default set by the base class so that an unset
        # encoding can be detected in _fetch_source().
        self._encoding = encoding

        UrlValidator(url).validate()

    def __del__(self):
        if typepy.is_null_string(self.__temp_dir_path):
            return

        # NOTE(review): os.removedirs only removes empty directories,
        # while _fetch_source() writes a file into this directory --
        # confirm whether this cleanup can actually succeed, and whether
        # created loaders still need the file at this point.
        os.removedirs(self.__temp_dir_path)
        self.__temp_dir_path = None

    def create_from_path(self):
        """
        Create a file loader from the file extension to loading file.
        Supported file extensions are as follows:

            =========================================  =====================================
            Format name                                Loader
            =========================================  =====================================
            ``"csv"``                                  :py:class:`~.CsvTableTextLoader`
            ``"xls"``/``"xlsx"``                       :py:class:`~.ExcelTableFileLoader`
            ``"htm"``/``"html"``/``"asp"``/``"aspx"``  :py:class:`~.HtmlTableTextLoader`
            ``"json"``                                 :py:class:`~.JsonTableTextLoader`
            ``"ltsv"``                                 :py:class:`~.LtsvTableTextLoader`
            ``"md"``                                   :py:class:`~.MarkdownTableTextLoader`
            ``"sqlite"``/``"sqlite3"``                 :py:class:`~.SqliteFileLoader`
            ``"tsv"``                                  :py:class:`~.TsvTableTextLoader`
            =========================================  =====================================

        :return:
            Loader that coincides with the file extension of the URL.
        :raises pytablereader.InvalidUrlError: If unacceptable URL format.
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| loading the URL.
        """
        url_path = urlparse(self.__url).path
        try:
            url_extension = get_extension(url_path.rstrip("/"))
        except InvalidFilePathError:
            raise InvalidUrlError("url must include path")

        logger.debug(
            "TableUrlLoaderFactory.create_from_path: extension={}".format(
                url_extension))

        loader_class = self._get_loader_class(
            self._get_extension_loader_mapping(), url_extension)

        try:
            self._fetch_source(loader_class)
        except requests.exceptions.ProxyError as e:
            raise ProxyError(e)

        return self._create_from_extension(url_extension)

    def create_from_format_name(self, format_name):
        """
        Create a file loader from a format name.
        Supported file formats are as follows:

            ==========================  ======================================
            Format name                 Loader
            ==========================  ======================================
            ``"csv"``                   :py:class:`~.CsvTableTextLoader`
            ``"excel"``                 :py:class:`~.ExcelTableFileLoader`
            ``"html"``                  :py:class:`~.HtmlTableTextLoader`
            ``"json"``                  :py:class:`~.JsonTableTextLoader`
            ``"ltsv"``                  :py:class:`~.LtsvTableTextLoader`
            ``"markdown"``              :py:class:`~.MarkdownTableTextLoader`
            ``"mediawiki"``             :py:class:`~.MediaWikiTableTextLoader`
            ``"sqlite"``                :py:class:`~.SqliteFileLoader`
            ``"tsv"``                   :py:class:`~.TsvTableTextLoader`
            ==========================  ======================================

        :param str format_name: Format name string (case insensitive).
        :return: Loader that coincide with the ``format_name``:
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| the format.
        :raises TypeError: If ``format_name`` is not a string.
        """
        logger.debug(
            "TableUrlLoaderFactory.create_from_format_name: name={}".format(
                format_name))

        loader_class = self._get_loader_class(
            self._get_format_name_loader_mapping(), format_name)

        try:
            self._fetch_source(loader_class)
        except requests.exceptions.ProxyError as e:
            raise ProxyError(e)

        return self._create_from_format_name(format_name)

    def _fetch_source(self, loader_class):
        """
        Download the URL and store the result in ``self._source``.

        For loaders with a text source the response text is stored; for
        loaders with a file source the response body is written to a
        file in a new temporary directory and the file path is stored.

        :param loader_class: Loader class the source is fetched for.
        :raises ValueError: If the loader has an unknown source type.
        :raises pytablereader.HTTPError: If the HTTP request failed.
        """
        # A throw-away instance is created only to query the source type.
        loader_source_type = loader_class("").source_type

        if loader_source_type not in [SourceType.TEXT, SourceType.FILE]:
            raise ValueError(
                "unknown loader source: type={}".format(loader_source_type))

        r = requests.get(self.__url, proxies=self.__proxies)

        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            raise HTTPError(e)

        if typepy.is_null_string(self._encoding):
            self._encoding = r.encoding

        # ``.get`` instead of indexing: a response without a Content-Type
        # header must not abort the load from within a debug log.
        logger.debug("\n".join([
            "_fetch_source: ",
            "  source-type={}".format(loader_source_type),
            "  content-type={}".format(r.headers.get("Content-Type")),
            "  encoding={}".format(self._encoding),
            "  status-code={}".format(r.status_code),
        ]))

        if loader_source_type == SourceType.TEXT:
            self._source = r.text
        elif loader_source_type == SourceType.FILE:
            self.__temp_dir_path = tempfile.mkdtemp()
            # The ".xlsx" suffix is appended for every file-source loader.
            self._source = "{:s}.xlsx".format(
                make_temp_file_path_from_url(
                    self.__temp_dir_path, self.__url))
            with open(self._source, "wb") as f:
                f.write(r.content)

    def _get_common_loader_mapping(self):
        # Keys valid both as a URL extension and as a format name.
        return {
            "csv": CsvTableTextLoader,
            "html": HtmlTableTextLoader,
            "json": JsonTableTextLoader,
            "ltsv": LtsvTableTextLoader,
            "sqlite": SqliteFileLoader,
            "tsv": TsvTableTextLoader,
        }

    def _get_extension_loader_mapping(self):
        """
        :return: Mappings of format-extension and loader class.
        :rtype: dict
        """
        loader_table = self._get_common_loader_mapping()
        loader_table.update({
            "asp": HtmlTableTextLoader,
            "aspx": HtmlTableTextLoader,
            "htm": HtmlTableTextLoader,
            "md": MarkdownTableTextLoader,
            "sqlite3": SqliteFileLoader,
            "xls": ExcelTableFileLoader,
            "xlsx": ExcelTableFileLoader,
        })

        return loader_table

    def _get_format_name_loader_mapping(self):
        """
        :return: Mappings of format-name and loader class.
        :rtype: dict
        """
        loader_table = self._get_common_loader_mapping()
        loader_table.update({
            "excel": ExcelTableFileLoader,
            "markdown": MarkdownTableTextLoader,
            "mediawiki": MediaWikiTableTextLoader,
        })

        return loader_table

View File

@ -0,0 +1,40 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
import abc
from pytablereader import InvalidDataError
import six
from ._acceptor import LoaderAcceptor
@six.add_metaclass(abc.ABCMeta)
class TableFormatterInterface(object):
    """
    Interface of the table formatters.
    """

    @abc.abstractmethod
    def to_table_data(self):  # pragma: no cover
        """Convert the source data to table data."""
class TableFormatter(LoaderAcceptor, TableFormatterInterface):
    """
    Abstract base class of the |TableData| formatters.

    :param source_data: Data to be formatted; must not be empty.
    :raises pytablereader.InvalidDataError: If ``source_data`` is empty.
    """

    def __init__(self, source_data):
        # NOTE(review): LoaderAcceptor.__init__ is not invoked here, so
        # ``_loader`` does not exist until accept() is called -- confirm
        # this is intended.
        self._source_data = source_data
        self._validate_source_data()

    def _validate_source_data(self):
        if self._source_data:
            return

        raise InvalidDataError("source data is empty")

View File

@ -0,0 +1,159 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import io
from .._constant import (
Default,
TableNameTemplate as tnt,
)
from .._logger import (
FileSourceLogger,
TextSourceLogger,
)
from .._validator import (
FileValidator,
TextValidator
)
from ..interface import TableLoader
from .formatter import HtmlTableFormatter
class HtmlTableLoader(TableLoader):
    """
    Abstract base class of the HTML table loaders.
    """

    @property
    def format_name(self):
        return "html"

    def _get_default_table_name_template(self):
        # Yields ``%(title)s_%(key)s``.
        return "_".join((tnt.TITLE, tnt.KEY))
class HtmlTableFileLoader(HtmlTableLoader):
    """
    A file loader class to extract tabular data from HTML files.

    :param str file_path: Path to the loading HTML file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(title)s_%(key)s``.

    .. py:attribute:: encoding

        HTML file encoding. Defaults to ``"utf-8"``.
    """

    def __init__(self, file_path=None):
        super(HtmlTableFileLoader, self).__init__(file_path)

        self.encoding = Default.ENCODING

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from HTML table
        tags in a HTML file.
        |load_source_desc_file|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     |filename_desc|
            ``%(title)s``        ``<title>`` tag value of the HTML.
            ``%(key)s``          | This replaced to:
                                 | **(1)** ``id`` attribute of the table tag
                                 | **(2)** ``%(format_name)s%(format_id)s``
                                 | if ``id`` attribute not present in the
                                 | table tag.
            ``%(format_name)s``  ``"html"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the HTML data is invalid or empty.

        .. note::

            Table tag attributes ignored with loaded |TableData|.
        """
        self._validate()
        self._logger.logging_load()

        # Read the whole file up front; the formatter works on the text.
        with io.open(self.source, "r", encoding=self.encoding) as fp:
            html_text = fp.read()

        table_formatter = HtmlTableFormatter(html_text)
        table_formatter.accept(self)

        return table_formatter.to_table_data()
class HtmlTableTextLoader(HtmlTableLoader):
    """
    A text loader class to extract tabular data from HTML text data.

    :param str text: HTML text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(title)s_%(key)s``.
    """

    def __init__(self, text):
        super(HtmlTableTextLoader, self).__init__(text)

        # The source is the HTML text itself, so validate and log it as text.
        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from HTML table tags in
        a HTML text object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     ``""``
            ``%(title)s``        ``<title>`` tag value of the HTML.
            ``%(key)s``          | This is replaced with:
                                 | **(1)** ``id`` attribute of the table tag
                                 | **(2)** ``%(format_name)s%(format_id)s``
                                 | if ``id`` attribute is not included
                                 | in the table tag.
            ``%(format_name)s``  ``"html"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the HTML data is invalid or empty.
        """

        self._validate()
        self._logger.logging_load()

        html_formatter = HtmlTableFormatter(self.source)
        html_formatter.accept(self)

        return html_formatter.to_table_data()

View File

@ -0,0 +1,115 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import re
import bs4
from pytablereader import InvalidDataError
from tabledata import TableData
import typepy
from .._constant import TableNameTemplate as tnt
from ..formatter import TableFormatter
class HtmlTableFormatter(TableFormatter):
    """
    Formatter that converts each ``<table>`` tag of an HTML text into a
    |TableData| instance.

    :param str source_data: HTML text to parse.
    :raises pytablereader.InvalidDataError:
        If ``source_data`` is a null string.
    """

    @property
    def table_id(self):
        """
        Identifier of the most recently parsed table: the ``id`` attribute
        of the table tag, otherwise its non-empty caption text,
        otherwise |None|.
        """

        return self.__table_id

    def __init__(self, source_data):
        super(HtmlTableFormatter, self).__init__(source_data)

        self.__table_id = None

        if typepy.is_null_string(source_data):
            raise InvalidDataError

        # Prefer the lxml parser; fall back to the standard library parser
        # when lxml is not available to BeautifulSoup.
        try:
            self.__soup = bs4.BeautifulSoup(self._source_data, "lxml")
        except bs4.FeatureNotFound:
            self.__soup = bs4.BeautifulSoup(self._source_data, "html.parser")

    def to_table_data(self):
        """
        Iterate over the tables found in the HTML.

        Tables without any data row, and tables whose |TableData| reports
        an empty record, are skipped.

        :return: Table data iterator.
        :rtype: |TableData| iterator
        """

        for table in self.__soup.find_all("table"):
            try:
                table_data = self.__parse_html(table)
            except ValueError:
                # raised by __parse_html when the table has no data rows
                continue

            if table_data.is_empty_record():
                continue

            yield table_data

    def _make_table_name(self):
        """
        Build the table name for the most recently parsed table.

        The key is the table id when available, otherwise the format key
        of the loader. The title is the text of the ``<title>`` tag, or an
        empty string when the document has no title.
        """

        key = self.table_id
        if typepy.is_null_string(key):
            key = self._loader.get_format_key()

        try:
            title = self.__soup.title.text
        except AttributeError:
            # no <title> tag: self.__soup.title is None
            title = ""

        kv_mapping = self._loader._get_basic_tablename_keyvalue_mapping()
        kv_mapping[tnt.KEY] = key
        kv_mapping[tnt.TITLE] = title

        return self._loader._expand_table_name_format(kv_mapping)

    def __parse_tag_id(self, table):
        """
        Store the identifier of ``table``: its ``id`` attribute, or the
        stripped caption text when there is no ``id`` and the caption is
        not a null string.
        """

        self.__table_id = table.get("id")
        if self.__table_id is not None:
            return

        caption = table.find("caption")
        if caption is None:
            return

        caption_text = caption.text.strip()
        if typepy.is_not_null_string(caption_text):
            self.__table_id = caption_text

    def __parse_html(self, table):
        """
        Convert a single ``<table>`` tag to |TableData|.

        The first row that has ``<th>`` cells and no ``<td>`` cell becomes
        the header. Every row that contains at least one ``<td>`` cell
        becomes a data row made of the stripped text of its ``<td>`` and
        ``<th>`` cells.

        :raises ValueError: If the table has no data row.
        """

        header_list = []
        data_matrix = []

        self.__parse_tag_id(table)

        # Exact tag names. A compiled "td|th" pattern is applied by
        # BeautifulSoup with search(), so it would also match unrelated
        # tags whose names merely contain "td" or "th" (e.g. <math>,
        # <path>) and turn them into bogus cells.
        cell_tag_names = ["td", "th"]

        for row in table.find_all("tr"):
            td_list = row.find_all("td")
            if typepy.is_empty_sequence(td_list):
                # Row without data cells: only usable as the header, and
                # only the first such row is used.
                if typepy.is_not_empty_sequence(header_list):
                    continue

                th_list = row.find_all("th")
                if typepy.is_empty_sequence(th_list):
                    continue

                header_list = [th.text.strip() for th in th_list]
                continue

            data_matrix.append([
                cell.get_text().strip()
                for cell in row.find_all(cell_tag_names)
            ])

        if typepy.is_empty_sequence(data_matrix):
            raise ValueError("data matrix is empty")

        # The counter must be incremented before the name is built because
        # the format key/id in the table name are derived from it.
        self._loader.inc_table_count()

        return TableData(
            self._make_table_name(), header_list, data_matrix,
            quoting_flags=self._loader.quoting_flags)

View File

@ -0,0 +1,158 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import abc
import threading
import path
from pytablereader import InvalidTableNameError
import six
import typepy
from ._constant import (
SourceType,
TableNameTemplate as tnt
)
@six.add_metaclass(abc.ABCMeta)
class TableLoaderInterface(object):
    """
    Interface that every table loader class has to implement.
    """

    @abc.abstractproperty
    def format_name(self):  # pragma: no cover
        """Name of the data format handled by the loader."""

    @abc.abstractproperty
    def source_type(self):  # pragma: no cover
        """Type of the source the loader reads from."""

    @abc.abstractmethod
    def load(self):  # pragma: no cover
        """Load table data from the source."""

    @abc.abstractmethod
    def inc_table_count(self):  # pragma: no cover
        """Increment the loaded table counters."""
class TableLoader(TableLoaderInterface):
    """
    The abstract class of table data file loader.

    The table counters (one overall counter and one counter per format
    name) are stored on :py:class:`TableLoader` itself, so they are shared
    by all loader instances and all subclasses. Updates are serialized by a
    class-level lock.

    .. py:attribute:: table_name

        Table name string.

    .. py:attribute:: source

        Table data source to load.
    """

    __table_count_lock = threading.Lock()
    __global_table_count = 0
    __format_table_count = {}

    @property
    def source_type(self):
        """Source type reported by the validator of this loader."""

        return self._validator.source_type

    def __init__(self, source):
        self.table_name = tnt.DEFAULT
        self.source = source
        self.quoting_flags = None

        # Concrete loaders are expected to replace these.
        self._validator = None
        self._logger = None

    def get_format_key(self):
        """
        :return: Format name followed by the current per-format table count.
        :rtype: str
        """

        return "{:s}{:d}".format(
            self.format_name,
            self.__get_format_table_count())

    def make_table_name(self):
        """
        :return: Table name expanded from the table name template.
        :rtype: str
        """

        return self._make_table_name()

    def inc_table_count(self):
        """
        Increment the overall table counter and the counter of this
        loader's format.
        """

        # Always go through TableLoader: ``self.__global_table_count += 1``
        # would create an instance attribute shadowing the class counter,
        # which makes the "global" count per-instance.
        with TableLoader.__table_count_lock:
            TableLoader.__global_table_count += 1
            TableLoader.__format_table_count[self.format_name] = (
                self.__get_format_table_count() + 1)

    @abc.abstractmethod
    def _get_default_table_name_template(self):  # pragma: no cover
        pass

    def _validate(self):
        """Validate the table name first, then the source."""

        self._validate_table_name()
        self._validate_source()

    def _validate_table_name(self):
        """
        :raises ValueError: If the table name is a null string.
        :raises TypeError:
            If checking the table name failed with a
            ``TypeError``/``AttributeError``.
        """

        try:
            if typepy.is_null_string(self.table_name):
                raise ValueError("table name is empty")
        except (TypeError, AttributeError):
            raise TypeError("table_name must be a string")

    def _validate_source(self):
        self._validator.validate()

    def __get_format_table_count(self):
        """Return the table count of this loader's format (0 if unseen)."""

        return TableLoader.__format_table_count.get(self.format_name, 0)

    def _get_filename_tablename_mapping(self):
        """
        :return:
            ``(filename template, value)`` pair. The value is the
            ``namebase`` of the source path for a non-null file source,
            otherwise an empty string.
        :rtype: tuple
        """

        filename = ""
        if all([
            self.source_type == SourceType.FILE,
            typepy.is_not_null_string(self.source),
        ]):
            filename = path.Path(self.source).namebase

        return (tnt.FILENAME, filename)

    def _get_basic_tablename_keyvalue_mapping(self):
        """
        :return:
            Ordered template-to-value mapping. The default template comes
            first so that the templates it expands to are replaced by the
            entries that follow.
        :rtype: collections.OrderedDict
        """

        from collections import OrderedDict

        return OrderedDict([
            (tnt.DEFAULT, self._get_default_table_name_template()),
            (tnt.FORMAT_NAME, self.format_name),
            (tnt.FORMAT_ID, str(self.__get_format_table_count())),
            (tnt.GLOBAL_ID, str(TableLoader.__global_table_count)),
            self._get_filename_tablename_mapping(),
        ])

    def _expand_table_name_format(self, table_name_kv_mapping):
        """
        Replace each template found in ``table_name`` with its value, in
        the iteration order of ``table_name_kv_mapping``.

        :raises pytablereader.InvalidTableNameError:
            If the table name is a null string after the replacement.
        """

        self._validate_table_name()

        table_name = self.table_name
        for template, value in six.iteritems(table_name_kv_mapping):
            table_name = table_name.replace(template, value)

        return self._sanitize_table_name(table_name)

    def _make_table_name(self):
        self._validate_table_name()

        return self._expand_table_name_format(
            self._get_basic_tablename_keyvalue_mapping())

    @staticmethod
    def _sanitize_table_name(table_name):
        """
        :return: ``table_name`` without leading/trailing underscores.
        :raises pytablereader.InvalidTableNameError:
            If ``table_name`` is a null string.
        """

        if typepy.is_null_string(table_name):
            raise InvalidTableNameError(
                "table name is empty after the template replacement")

        return table_name.strip("_")

    @classmethod
    def clear_table_count(cls):
        """
        Reset the overall counter and all per-format counters.
        """

        # Reset on TableLoader regardless of ``cls``: assigning through a
        # subclass would only create subclass-local attributes and leave
        # the shared counters untouched.
        with TableLoader.__table_count_lock:
            TableLoader.__global_table_count = 0
            TableLoader.__format_table_count = {}

View File

@ -0,0 +1,301 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import io
import json
from .._constant import (
Default,
SourceType,
TableNameTemplate as tnt
)
from .._logger import (
FileSourceLogger,
TextSourceLogger,
)
from .._validator import (
FileValidator,
TextValidator
)
from ..error import ValidationError
from ..interface import TableLoader
from .formatter import JsonTableFormatter
class JsonTableLoader(TableLoader):
    """
    Common base class of the JSON table loaders.
    """

    @property
    def format_name(self):
        """Format name of this loader family (``"json"``)."""

        return "json"
class JsonTableFileLoader(JsonTableLoader):
    """
    A file loader class to extract tabular data from JSON files.

    :param str file_path: Path to the loading JSON file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s_%(key)s``.
    """

    def __init__(self, file_path=None):
        super(JsonTableFileLoader, self).__init__(file_path)

        self.encoding = Default.ENCODING

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def _get_default_table_name_template(self):
        return "{filename:s}_{key:s}".format(
            filename=tnt.FILENAME, key=tnt.KEY)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a JSON file.
        |load_source_desc_file|

        This method can load four types of JSON formats:

        **(1)** Single table data in a file,
        acceptable JSON Schema is as follows:

        .. code-block:: json
            :caption: JSON Schema (1): accept single table

            {
                "type": "array",
                "items": {
                    "type": "object",
                    "additionalProperties": {
                        "anyOf": [
                            {"type": "string"},
                            {"type": "number"},
                            {"type": "null"}
                        ]
                    }
                }
            }

        .. code-block:: json
            :caption: JSON example for the JSON schema (1)

            [
                {"attr_b": 4, "attr_c": "a", "attr_a": 1},
                {"attr_b": 2.1, "attr_c": "bb", "attr_a": 2},
                {"attr_b": 120.9, "attr_c": "ccc", "attr_a": 3}
            ]

        **(2)** Single table data in a file,
        acceptable JSON Schema is as follows:

        .. code-block:: json
            :caption: JSON Schema (2): accept single table

            {
                "type": "object",
                "additionalProperties": {
                    "type": "array",
                    "items": {
                        "anyOf": [
                            {"type": "string"},
                            {"type": "number"},
                            {"type": "null"}
                        ]
                    }
                }
            }

        .. code-block:: json
            :caption: JSON example for the JSON schema (2)

            {
                "attr_a": [1, 2, 3],
                "attr_b": [4, 2.1, 120.9],
                "attr_c": ["a", "bb", "ccc"]
            }

        **(3)** Multiple table data in a file,
        acceptable JSON Schema is as follows:

        .. code-block:: json
            :caption: JSON Schema (3): accept multiple table

            {
                "type": "object",
                "additionalProperties": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "additionalProperties": {
                            "anyOf": [
                                {"type": "string"},
                                {"type": "number"},
                                {"type": "null"}
                            ]
                        }
                    }
                }
            }

        .. code-block:: json
            :caption: JSON example for the JSON schema (3)

            {
                "table_a" : [
                    {"attr_b": 4, "attr_c": "a", "attr_a": 1},
                    {"attr_b": 2.1, "attr_c": "bb", "attr_a": 2},
                    {"attr_b": 120.9, "attr_c": "ccc", "attr_a": 3}
                ],
                "table_b" : [
                    {"a": 1, "b": 4},
                    {"a": 2 },
                    {"a": 3, "b": 120.9}
                ]
            }

        **(4)** Multiple table data in a file,
        acceptable JSON Schema is as follows:

        .. code-block:: json
            :caption: JSON Schema (4): accept multiple table

            {
                "type": "object",
                "additionalProperties": {
                    "type": "object",
                    "additionalProperties": {
                        "type": "array",
                        "items": {
                            "anyOf": [
                                {"type": "string"},
                                {"type": "number"},
                                {"type": "null"}
                            ]
                        }
                    }
                }
            }

        .. code-block:: json
            :caption: JSON example for the JSON schema (4)

            {
                "table_a" : {
                    "attr_a": [1, 2, 3],
                    "attr_b": [4, 2.1, 120.9],
                    "attr_c": ["a", "bb", "ccc"]
                },
                "table_b" : {
                    "a": [1, 3],
                    "b": [4, 120.9]
                }
            }

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     |filename_desc|
            ``%(key)s``          | This is replaced with a different value
                                 | for each single/multiple JSON tables:
                                 | [single JSON table]
                                 | ``%(format_name)s%(format_id)s``
                                 | [multiple JSON table] Table data key.
            ``%(format_name)s``  ``"json"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.error.ValidationError:
            If the data is invalid JSON, or
            if the data is not acceptable JSON format.
        """

        self._validate()
        self._logger.logging_load()

        with io.open(self.source, "r", encoding=self.encoding) as json_file:
            try:
                loaded_json = json.load(json_file)
            except ValueError as error:
                # Any ValueError raised while reading/parsing is reported
                # as a validation failure.
                raise ValidationError(error)

        json_formatter = JsonTableFormatter(loaded_json)
        json_formatter.accept(self)

        return json_formatter.to_table_data()
class JsonTableTextLoader(JsonTableLoader):
    """
    A text loader class to extract tabular data from JSON text data.

    :param str text: JSON text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(key)s``.
    """

    @property
    def source_type(self):
        """Always the text source type."""

        return SourceType.TEXT

    def __init__(self, text):
        super(JsonTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def _get_default_table_name_template(self):
        return "{key:s}".format(key=tnt.KEY)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a JSON text object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     ``""``
            ``%(key)s``          | This is replaced with a different value
                                 | for each single/multiple JSON tables:
                                 | [single JSON table]
                                 | ``%(format_name)s%(format_id)s``
                                 | [multiple JSON table] Table data key.
            ``%(format_name)s``  ``"json"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator

        .. seealso::

            :py:meth:`.JsonTableFileLoader.load()`
        """

        self._validate()
        self._logger.logging_load()

        # NOTE: unlike the file loader, errors raised by json.loads are not
        # converted here; they propagate to the caller unchanged.
        loaded_json = json.loads(self.source)

        json_formatter = JsonTableFormatter(loaded_json)
        json_formatter.accept(self)

        return json_formatter.to_table_data()

View File

@ -0,0 +1,260 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import abc
import jsonschema
import six
from tabledata import TableData
from six.moves import zip
from .._constant import (
SourceType,
TableNameTemplate as tnt,
)
from ..error import ValidationError
from ..formatter import TableFormatter
class JsonConverter(TableFormatter):
    """
    The abstract class of JSON data converter.

    Sub classes provide the JSON Schema that the buffer must satisfy
    via ``_schema``.
    """

    # JSON Schema fragment of a single cell value: string, number, or null.
    _VALUE_TYPE_SCHEMA = {
        "anyOf": [
            {"type": "string"},
            {"type": "number"},
            {"type": "null"},
        ],
    }

    def __init__(self, json_buffer):
        # Decoded JSON data to convert.
        self._buffer = json_buffer

    @abc.abstractproperty
    def _schema(self):  # pragma: no cover
        """JSON Schema that ``self._buffer`` is validated against."""

    def _validate_source_data(self):
        """
        Validate the buffer against ``_schema``.

        :raises ValidationError:
            If the buffer does not conform to the schema.
        """

        try:
            jsonschema.validate(self._buffer, self._schema)
        except jsonschema.ValidationError as error:
            # translate the jsonschema error into this package's error type
            raise ValidationError(error)
class SingleJsonTableConverterBase(JsonConverter):
    """
    Base class of the converters for JSON data that holds a single table.
    """

    def _make_table_name(self):
        """
        Build the table name: the key is the loader's format key, and the
        default template depends on the loader's source type (file name
        template for a file source, key template for a text source).
        """

        loader = self._loader
        kv_mapping = loader._get_basic_tablename_keyvalue_mapping()
        kv_mapping[tnt.KEY] = loader.get_format_key()

        source_type = loader.source_type
        if source_type == SourceType.FILE:
            kv_mapping[tnt.DEFAULT] = tnt.FILENAME
        elif source_type == SourceType.TEXT:
            kv_mapping[tnt.DEFAULT] = tnt.KEY

        return loader._expand_table_name_format(kv_mapping)
class SingleJsonTableConverterA(SingleJsonTableConverterBase):
    """
    Converter for a single table given as an array of record objects
    (one object per row, attribute name -> value).
    """

    @property
    def _schema(self):
        record_schema = {
            "type": "object",
            "additionalProperties": self._VALUE_TYPE_SCHEMA,
        }

        return {
            "type": "array",
            "items": record_schema,
        }

    def to_table_data(self):
        """
        Yield one |TableData| whose headers are the sorted union of the
        attribute names of all records.

        :raises ValueError:
        :raises pytablereader.error.ValidationError:
        """

        self._validate_source_data()

        # Records may have different attributes; collect every name.
        attr_names = set()
        for record in self._buffer:
            attr_names.update(six.viewkeys(record))

        self._loader.inc_table_count()

        yield TableData(
            table_name=self._make_table_name(),
            header_list=sorted(attr_names),
            record_list=self._buffer,
            quoting_flags=self._loader.quoting_flags)
class SingleJsonTableConverterB(SingleJsonTableConverterBase):
    """
    Converter for a single table given as an object that maps each
    attribute name to an array of column values.
    """

    @property
    def _schema(self):
        column_schema = {
            "type": "array",
            "items": self._VALUE_TYPE_SCHEMA,
        }

        return {
            "type": "object",
            "additionalProperties": column_schema,
        }

    def to_table_data(self):
        """
        Yield one |TableData| whose headers are the sorted attribute names
        and whose records are built by zipping the columns together.

        :raises ValueError:
        :raises pytablereader.error.ValidationError:
        """

        self._validate_source_data()
        self._loader.inc_table_count()

        header_list = sorted(six.viewkeys(self._buffer))
        # columns in header order; zip() transposes them into rows
        column_list = [self._buffer.get(header) for header in header_list]

        yield TableData(
            table_name=self._make_table_name(),
            header_list=header_list,
            record_list=zip(*column_list),
            quoting_flags=self._loader.quoting_flags)
class MultipleJsonTableConverterBase(JsonConverter):
    """
    Base class of the converters for JSON data that holds multiple tables,
    each stored under its own key.
    """

    def __init__(self, json_buffer):
        super(MultipleJsonTableConverterBase, self).__init__(json_buffer)

        # Key of the table currently being converted; set by sub classes
        # before the table name is built.
        self._table_key = None

    def _make_table_name(self):
        """
        Build the table name using the key template as the default
        template and the current table key as the key.
        """

        loader = self._loader
        kv_mapping = loader._get_basic_tablename_keyvalue_mapping()
        kv_mapping[tnt.DEFAULT] = tnt.KEY
        kv_mapping[tnt.KEY] = self._table_key

        return loader._expand_table_name_format(kv_mapping)
class MultipleJsonTableConverterA(MultipleJsonTableConverterBase):
    """
    Converter for multiple tables given as an object that maps each table
    key to an array of record objects.
    """

    @property
    def _schema(self):
        record_schema = {
            "type": "object",
            "additionalProperties": self._VALUE_TYPE_SCHEMA,
        }

        return {
            "type": "object",
            "additionalProperties": {
                "type": "array",
                "items": record_schema,
            },
        }

    def to_table_data(self):
        """
        Yield one |TableData| per table key. Headers are the sorted union
        of the attribute names of the table's records.

        :raises ValueError:
        :raises pytablereader.error.ValidationError:
        """

        self._validate_source_data()

        for table_key, record_list in six.iteritems(self._buffer):
            attr_names = set()
            for record in record_list:
                attr_names.update(six.viewkeys(record))

            self._loader.inc_table_count()
            # must be set before _make_table_name() which reads it
            self._table_key = table_key

            yield TableData(
                table_name=self._make_table_name(),
                header_list=sorted(attr_names),
                record_list=record_list,
                quoting_flags=self._loader.quoting_flags)
class MultipleJsonTableConverterB(MultipleJsonTableConverterBase):
    """
    Converter for multiple tables given as an object that maps each table
    key to an object of attribute name -> array of column values.
    """

    @property
    def _schema(self):
        column_schema = {
            "type": "array",
            "items": self._VALUE_TYPE_SCHEMA,
        }

        return {
            "type": "object",
            "additionalProperties": {
                "type": "object",
                "additionalProperties": column_schema,
            },
        }

    def to_table_data(self):
        """
        Yield one |TableData| per table key. Headers are the sorted
        attribute names; records are built by zipping the columns together.

        :raises ValueError:
        :raises pytablereader.error.ValidationError:
        """

        self._validate_source_data()

        for table_key, column_mapping in six.iteritems(self._buffer):
            header_list = sorted(six.viewkeys(column_mapping))

            self._loader.inc_table_count()
            # must be set before _make_table_name() which reads it
            self._table_key = table_key

            # columns in header order; zip() transposes them into rows
            column_list = [
                column_mapping.get(header) for header in header_list
            ]

            yield TableData(
                table_name=self._make_table_name(),
                header_list=header_list,
                record_list=zip(*column_list),
                quoting_flags=self._loader.quoting_flags)
class JsonTableFormatter(TableFormatter):
    """
    Formatter that converts decoded JSON data to |TableData| by trying
    each known JSON table layout in turn.
    """

    def to_table_data(self):
        """
        Yield the tables produced by the first converter whose schema
        accepts the source data.

        :return: Table data iterator.
        :rtype: |TableData| iterator
        :raises pytablereader.error.ValidationError:
            If none of the converters accepts the source data.
        """

        # Tried in this order; the first converter that validates wins.
        converter_class_list = [
            MultipleJsonTableConverterA,
            MultipleJsonTableConverterB,
            SingleJsonTableConverterA,
            SingleJsonTableConverterB,
        ]

        for converter_class in converter_class_list:
            converter = converter_class(self._source_data)
            converter.accept(self._loader)

            try:
                for table_data in converter.to_table_data():
                    yield table_data

                # This layout matched and all of its tables were emitted.
                return
            except ValidationError:
                # schema mismatch: fall through to the next layout
                continue

        raise ValidationError(
            "inconvertible JSON schema: json={}".format(self._source_data))

View File

@ -0,0 +1,10 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from ._file import TableFileLoader
from ._url import TableUrlLoader

View File

@ -0,0 +1,40 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from ..interface import TableLoaderInterface
class TableLoaderManager(TableLoaderInterface):
    """
    Loader wrapper that forwards the loader interface to the concrete
    loader instance passed to the constructor.
    """

    def __init__(self, loader):
        self.__loader = loader

    @property
    def format_name(self):
        """Format name of the wrapped loader."""

        return self.__loader.format_name

    @property
    def source_type(self):
        """Source type of the wrapped loader."""

        return self.__loader.source_type

    @property
    def encoding(self):
        """
        Encoding of the wrapped loader, or |None| when the wrapped loader
        has no ``encoding`` attribute.
        """

        return getattr(self.__loader, "encoding", None)

    @encoding.setter
    def encoding(self, codec_name):
        self.__loader.encoding = codec_name

    def load(self):
        """Delegate to the wrapped loader's ``load()`` and return its result."""

        return self.__loader.load()

    def inc_table_count(self):
        """Delegate to the wrapped loader's ``inc_table_count()``."""

        self.__loader.inc_table_count()

View File

@ -0,0 +1,69 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
import typepy
from .._constant import Default
from ..factory import TableFileLoaderFactory
from ._base import TableLoaderManager
class TableFileLoader(TableLoaderManager):
    """
    Loader class to load tables from a file.

    :param str file_path: Path to the file to load.
    :param str format_name: Data format name to load.
        Supported formats are as follows:
        ``"csv"``, ``"excel"``, ``"html"``, ``"json"``, ``"ltsv"``,
        ``"markdown"``, ``"mediawiki"``, ``"sqlite"``, ``"tsv"``.
        If the value is |None|, automatically detect file format from
        the ``file_path``.
    :raise pytablereader.InvalidFilePathError:
        If ``file_path`` is an invalid file path.
    :raises pytablereader.LoaderNotFoundError:
        |LoaderNotFoundError_desc| loading the file.

    .. py:method:: load

        Load table data from a file as ``format_name`` format.
        Automatically detect file format if ``format_name`` is |None|.

        :return: Loaded table data iterator.
        :rtype: |TableData| iterator

        .. seealso::

            * :py:meth:`pytablereader.factory.TableFileLoaderFactory.create_from_format_name`
            * :py:meth:`pytablereader.factory.TableFileLoaderFactory.create_from_path`
    """

    def __init__(self, file_path, format_name=None, encoding=Default.ENCODING):
        factory = TableFileLoaderFactory(file_path, encoding=encoding)

        # An explicit format name wins; otherwise the factory picks the
        # loader from the file path.
        if typepy.is_not_null_string(format_name):
            concrete_loader = factory.create_from_format_name(format_name)
        else:
            concrete_loader = factory.create_from_path()

        super(TableFileLoader, self).__init__(concrete_loader)

    @classmethod
    def get_format_name_list(cls):
        """
        :return:
            Available format names. These names can be used by the
            :py:class:`.TableFileLoader` class constructor.
        :rtype: list

        :Example:
            .. code:: python

                >>> pytablereader.TableFileLoader.get_format_name_list()
                ['csv', 'excel', 'html', 'json', 'ltsv', 'markdown', 'mediawiki', 'sqlite', 'tsv']
        """

        # The factory needs a path argument; a placeholder is enough to
        # query the format names.
        placeholder_factory = TableFileLoaderFactory("dummy")

        return placeholder_factory.get_format_name_list()

View File

@ -0,0 +1,76 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
import typepy
from ..factory import TableUrlLoaderFactory
from ._base import TableLoaderManager
class TableUrlLoader(TableLoaderManager):
    """
    Loader class to load tables from URL.

    :param str url: URL to load.
    :param str format_name: Data format name to load.
        Supported formats are:
        ``"csv"``, ``"excel"``, ``"html"``, ``"json"``, ``"ltsv"``,
        ``"markdown"``, ``"mediawiki"``, ``"sqlite"``, ``"tsv"``.
        If the value is |None|, automatically detect file format from
        the ``url``.
    :param dict proxies: http/https proxy information.

        .. seealso::

            - `requests proxies <http://requests-docs-ja.readthedocs.io/en/latest/user/advanced/#proxies>`__

    :raises pytablereader.LoaderNotFoundError:
        |LoaderNotFoundError_desc| loading the URL.
    :raises pytablereader.HTTPError:
        If loader received an HTTP error when accessing the URL.

    :Example:
        :ref:`example-url-table-loader`

    .. py:method:: load

        Load tables from URL as ``format_name`` format.

        :return: Loaded table data iterator.
        :rtype: |TableData| iterator

        .. seealso::

            * :py:meth:`pytablereader.factory.TableUrlLoaderFactory.create_from_format_name`
            * :py:meth:`pytablereader.factory.TableUrlLoaderFactory.create_from_path`
    """

    def __init__(self, url, format_name=None, encoding=None, proxies=None):
        factory = TableUrlLoaderFactory(url, encoding, proxies)

        # An explicit format name wins; otherwise the factory picks the
        # loader from the URL.
        if typepy.is_not_null_string(format_name):
            concrete_loader = factory.create_from_format_name(format_name)
        else:
            concrete_loader = factory.create_from_path()

        super(TableUrlLoader, self).__init__(concrete_loader)

    @classmethod
    def get_format_name_list(cls):
        """
        :return:
            Available format names. These names can be used by the
            :py:class:`.TableUrlLoader` class constructor.
        :rtype: list

        :Example:
            .. code:: python

                >>> pytablereader.TableUrlLoader.get_format_name_list()
                ['csv', 'excel', 'html', 'json', 'ltsv', 'markdown', 'mediawiki', 'sqlite', 'tsv']
        """

        # The factory needs a URL argument; a placeholder is enough to
        # query the format names.
        placeholder_factory = TableUrlLoaderFactory("http://dummy.com/")

        return placeholder_factory.get_format_name_list()

View File

@ -0,0 +1,209 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import io
from pytablereader import (
InvalidHeaderNameError,
InvalidDataError
)
import typepy
import pathvalidate as pv
from .._constant import (
Default,
TableNameTemplate as tnt,
)
from .._logger import (
FileSourceLogger,
TextSourceLogger,
)
from .._validator import (
FileValidator,
TextValidator
)
from ..interface import TableLoader
from ..json.formatter import SingleJsonTableConverterA
class LtsvTableLoader(TableLoader):
    """
    Abstract class of
    `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
    format table loaders.

    .. py:attribute:: encoding

        Encoding of the LTSV data.
    """

    @property
    def format_name(self):
        """Format name of this loader family (``"ltsv"``)."""

        return "ltsv"

    def __init__(self, source):
        super(LtsvTableLoader, self).__init__(source)

        # Iterable of LTSV lines; set by the concrete loader in load().
        self._ltsv_input_stream = None

    def _to_data_matrix(self):
        """
        Parse every line of ``self._ltsv_input_stream`` into an ordered
        ``label -> value`` record.

        Each tab-separated item is split at its first ``:`` so that values
        may themselves contain colons. Double quotes around a label are
        removed before the label is validated.

        :return: Generator that yields the list of all records once.
        :raises pytablereader.InvalidDataError:
            If an item does not contain a ``:`` separator.
        :raises pytablereader.InvalidHeaderNameError:
            If a label is not a valid LTSV label.
        """

        from collections import OrderedDict

        data_matrix = []

        for row_idx, row in enumerate(self._ltsv_input_stream):
            if typepy.is_empty_sequence(row):
                continue

            ltsv_record = OrderedDict()
            for col_idx, ltsv_item in enumerate(row.strip().split("\t")):
                try:
                    # maxsplit=1: only the first colon separates the label
                    # from the value (values such as times or URLs contain
                    # further colons).
                    label, value = ltsv_item.split(":", 1)
                except ValueError:
                    raise InvalidDataError(
                        "invalid lstv item found: line={}, col={}, item='{}'".format(
                            row_idx, col_idx, ltsv_item))

                label = label.strip('"')

                try:
                    pv.validate_ltsv_label(label)
                except (pv.NullNameError, pv.InvalidCharError):
                    raise InvalidHeaderNameError(
                        "invalid label found (acceptable chars are [0-9A-Za-z_.-]): "
                        "line={}, col={}, label='{}'".format(
                            row_idx, col_idx, label))

                ltsv_record[label] = value

            data_matrix.append(ltsv_record)

        # using generator to prepare for future enhancement to support
        # iterative load.
        yield data_matrix
class LtsvTableFileLoader(LtsvTableLoader):
    """
    `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
    format file loader class.

    :param str file_path: Path to the loading LTSV file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s``.
    """

    def __init__(self, file_path):
        super(LtsvTableFileLoader, self).__init__(file_path)

        self.encoding = Default.ENCODING

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

        self.__file = None

    def load(self):
        """
        Extract tabular data as |TableData| instances from a LTSV file.
        |load_source_desc_file|

        :return:
            Loaded table data.
            |load_table_name_desc|

            ===================  ========================================
            Format specifier     Value after the replacement
            ===================  ========================================
            ``%(filename)s``     |filename_desc|
            ``%(format_name)s``  ``"ltsv"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidHeaderNameError:
            If an invalid label name is included in the LTSV file.
        :raises pytablereader.InvalidDataError:
            If the LTSV data is invalid.
        """

        self._validate()
        self._logger.logging_load()

        # Open with a context manager so that the file is closed on both
        # the normal and the error path. _to_data_matrix() reads every
        # line before it yields, so the records no longer depend on the
        # file once the loop body runs.
        with io.open(self.source, "r", encoding=self.encoding) as fp:
            self._ltsv_input_stream = fp

            for data_matrix in self._to_data_matrix():
                formatter = SingleJsonTableConverterA(data_matrix)
                formatter.accept(self)

                return formatter.to_table_data()

    def _get_default_table_name_template(self):
        return tnt.FILENAME
class LtsvTableTextLoader(LtsvTableLoader):
    """
    `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
    format text loader class.

    :param str text: LTSV text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.
    """

    def __init__(self, text):
        super(LtsvTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def _get_default_table_name_template(self):
        return "{name:s}{id:s}".format(name=tnt.FORMAT_NAME, id=tnt.FORMAT_ID)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a LTSV text object.
        |load_source_desc_text|

        :return:
            Loaded table data.
            |load_table_name_desc|

            ===================  ========================================
            Format specifier     Value after the replacement
            ===================  ========================================
            ``%(filename)s``     ``""``
            ``%(format_name)s``  ``"ltsv"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidHeaderNameError:
            If an invalid label name is included in the LTSV text.
        :raises pytablereader.InvalidDataError:
            If the LTSV data is invalid.
        """

        self._validate()
        self._logger.logging_load()

        # The parser iterates line by line, so hand it the text as lines.
        self._ltsv_input_stream = self.source.splitlines()

        for record_list in self._to_data_matrix():
            # LTSV records are label -> value mappings, the same shape the
            # single-table JSON converter consumes.
            converter = SingleJsonTableConverterA(record_list)
            converter.accept(self)

            return converter.to_table_data()

View File

@ -0,0 +1,148 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import io
from .._constant import (
Default,
SourceType,
TableNameTemplate as tnt
)
from .._logger import (
FileSourceLogger,
TextSourceLogger,
)
from .._validator import (
FileValidator,
TextValidator
)
from ..interface import TableLoader
from .formatter import MarkdownTableFormatter
class MarkdownTableLoader(TableLoader):
    """
    Common base class of the Markdown table loaders.
    """

    @property
    def format_name(self):
        """Format name of this loader family (``"markdown"``)."""

        return "markdown"
class MarkdownTableFileLoader(MarkdownTableLoader):
    """
    A file loader class to extract tabular data from Markdown files.

    :param str file_path: Path to the loading Markdown file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s_%(key)s``.
    """

    def __init__(self, file_path=None):
        super(MarkdownTableFileLoader, self).__init__(file_path)

        self.encoding = Default.ENCODING

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def _get_default_table_name_template(self):
        return "{filename:s}_{key:s}".format(
            filename=tnt.FILENAME, key=tnt.KEY)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a Markdown file.
        |load_source_desc_file|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     |filename_desc|
            ``%(key)s``          ``%(format_name)s%(format_id)s``
            ``%(format_name)s``  ``"markdown"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the Markdown data is invalid or empty.
        """

        self._validate()
        self._logger.logging_load()

        # Read the whole document first; the formatter works on text.
        with io.open(self.source, "r", encoding=self.encoding) as md_file:
            markdown_text = md_file.read()

        md_formatter = MarkdownTableFormatter(markdown_text)
        md_formatter.accept(self)

        return md_formatter.to_table_data()
class MarkdownTableTextLoader(MarkdownTableLoader):
    """
    A text loader class to extract tabular data from Markdown text data.

    :param str text: Markdown text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(key)s``.
    """

    @property
    def source_type(self):
        """Always the text source type."""

        return SourceType.TEXT

    def __init__(self, text):
        super(MarkdownTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def _get_default_table_name_template(self):
        return "{key:s}".format(key=tnt.KEY)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a Markdown text
        object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     ``""``
            ``%(key)s``          ``%(format_name)s%(format_id)s``
            ``%(format_name)s``  ``"markdown"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the Markdown data is invalid or empty.
        """

        self._validate()
        self._logger.logging_load()

        md_formatter = MarkdownTableFormatter(self.source)
        md_formatter.accept(self)

        return md_formatter.to_table_data()

View File

@ -0,0 +1,25 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
from pytablereader import InvalidDataError
import typepy
from ..html.formatter import HtmlTableFormatter
class MarkdownTableFormatter(HtmlTableFormatter):
    """
    Formatter that renders Markdown to HTML with ``markdown2`` (``tables``
    extra enabled) and hands the result to :py:class:`HtmlTableFormatter`.
    """

    def __init__(self, source_data):
        """
        :param str source_data: Markdown text to convert.
        :raises pytablereader.InvalidDataError:
            If ``source_data`` is a null string.
        """
        # Imported lazily so that markdown2 is only required when Markdown
        # data is actually loaded.
        import markdown2

        if typepy.is_null_string(source_data):
            raise InvalidDataError

        html_text = markdown2.markdown(source_data, extras=["tables"])

        super(MarkdownTableFormatter, self).__init__(html_text)

View File

@ -0,0 +1,156 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import io
from .._constant import (
Default,
SourceType,
TableNameTemplate as tnt
)
from .._logger import (
FileSourceLogger,
TextSourceLogger,
)
from .._validator import (
FileValidator,
TextValidator
)
from ..interface import TableLoader
from .formatter import MediaWikiTableFormatter
class MediaWikiTableLoader(TableLoader):
    """
    Common base class of the MediaWiki table loaders.
    """

    @property
    def format_name(self):
        """Format identifier string of this loader family."""
        return "mediawiki"
class MediaWikiTableFileLoader(MediaWikiTableLoader):
    """
    Loader that extracts tabular data from a MediaWiki file.

    :param str file_path: Path to the loading file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s_%(key)s``.
    """

    def __init__(self, file_path=None):
        super(MediaWikiTableFileLoader, self).__init__(file_path)

        self.encoding = Default.ENCODING

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a MediaWiki file.
        |load_source_desc_file|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            =================== ==============================================
            Format specifier    Value after the replacement
            =================== ==============================================
            ``%(filename)s``    |filename_desc|
            ``%(key)s``         | This is replaced with:
                                | **(1)** ``caption`` mark of the table
                                | **(2)** ``%(format_name)s%(format_id)s``
                                | if ``caption`` mark is not included
                                | in the table.
            ``%(format_name)s`` ``"mediawiki"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the MediaWiki data is invalid or empty.
        """
        self._validate()
        self._logger.logging_load()

        # Read the whole file using the configured encoding, then convert it.
        with io.open(self.source, "r", encoding=self.encoding) as wiki_file:
            wiki_formatter = MediaWikiTableFormatter(wiki_file.read())

        wiki_formatter.accept(self)

        return wiki_formatter.to_table_data()

    def _get_default_table_name_template(self):
        """Return the default table name template: file name, ``_``, key."""
        template_parts = (tnt.FILENAME, tnt.KEY)

        return "{:s}_{:s}".format(*template_parts)
class MediaWikiTableTextLoader(MediaWikiTableLoader):
    """
    Loader that extracts tabular data from MediaWiki text held in memory.

    :param str text: MediaWiki text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(key)s``.
    """

    @property
    def source_type(self):
        return SourceType.TEXT

    def __init__(self, text):
        super(MediaWikiTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a MediaWiki text
        object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            =================== ==============================================
            Format specifier    Value after the replacement
            =================== ==============================================
            ``%(filename)s``    ``""``
            ``%(key)s``         | This is replaced with:
                                | **(1)** ``caption`` mark of the table
                                | **(2)** ``%(format_name)s%(format_id)s``
                                | if ``caption`` mark is not included
                                | in the table.
            ``%(format_name)s`` ``"mediawiki"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the MediaWiki data is invalid or empty.
        """
        self._validate()
        self._logger.logging_load()

        # The formatter needs a back-reference to this loader before it can
        # produce table data.
        wiki_formatter = MediaWikiTableFormatter(self.source)
        wiki_formatter.accept(self)

        return wiki_formatter.to_table_data()

    def _get_default_table_name_template(self):
        """Return the default table name template (the key specifier only)."""
        return "{:s}".format(tnt.KEY)

View File

@ -0,0 +1,23 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
from ..error import PypandocImportError
from ..html.formatter import HtmlTableFormatter
class MediaWikiTableFormatter(HtmlTableFormatter):
    """
    Formatter that converts MediaWiki markup to HTML with ``pypandoc`` and
    hands the result to :py:class:`HtmlTableFormatter`.
    """

    def __init__(self, source_data):
        """
        :param str source_data: MediaWiki text to convert.
        :raises pytablereader.error.PypandocImportError:
            If ``pypandoc`` cannot be imported.
        """
        # pypandoc is optional; surface a package-specific error when absent.
        try:
            import pypandoc
        except ImportError as e:
            raise PypandocImportError(e)

        html_text = pypandoc.convert_text(
            source_data, "html", format="mediawiki")

        super(MediaWikiTableFormatter, self).__init__(html_text)

View File

@ -0,0 +1,69 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
import abc
from .._constant import TableNameTemplate as tnt
from ..interface import TableLoader
class SpreadSheetLoader(TableLoader):
    """
    Abstract base class of loaders for spreadsheet-like sources, i.e.
    workbooks made of sheets that each consist of multiple rows.

    .. py:attribute:: start_row

        The first row to search header row.
    """

    def __init__(self, source):
        super(SpreadSheetLoader, self).__init__(source)

        self.start_row = 0

        # Populated by concrete loaders while they iterate over sheets.
        self._worksheet = None
        self._start_col_idx = None
        self._end_col_idx = None

    @abc.abstractproperty
    def _sheet_name(self):  # pragma: no cover
        pass

    @abc.abstractproperty
    def _row_count(self):  # pragma: no cover
        pass

    @abc.abstractproperty
    def _col_count(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _is_empty_sheet(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_start_row_idx(self):  # pragma: no cover
        pass

    @property
    def format_name(self):
        return "spreadsheet"

    def _make_table_name(self):
        """
        Expand the table name template, substituting the sheet specifier
        with the current sheet name (or ``""`` when it is unavailable).
        """
        name_mapping = self._get_basic_tablename_keyvalue_mapping()

        # No worksheet selected yet means there is no sheet name to read.
        try:
            sheet_name = self._sheet_name
        except AttributeError:
            sheet_name = ""

        name_mapping[tnt.SHEET] = sheet_name

        return self._expand_table_name_format(name_mapping)

    def _get_default_table_name_template(self):
        """Return the default table name template (the sheet specifier)."""
        return "{:s}".format(tnt.SHEET)

View File

@ -0,0 +1,160 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
from pytablereader import InvalidDataError
from tabledata import TableData
import xlrd
from six.moves import range
from .._logger import FileSourceLogger
from .._validator import FileValidator
from ..error import OpenError
from .core import SpreadSheetLoader
class ExcelTableFileLoader(SpreadSheetLoader):
    """
    A file loader class to extract tabular data from Microsoft Excel |TM|
    files.

    :param str file_path: Path to the loading Excel workbook file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(sheet)s``.

    .. py:attribute:: start_row

        The first row to search header row.
    """

    @property
    def format_name(self):
        return "excel"

    @property
    def _sheet_name(self):
        return self._worksheet.name

    @property
    def _row_count(self):
        return self._worksheet.nrows

    @property
    def _col_count(self):
        return self._worksheet.ncols

    def __init__(self, file_path=None):
        super(ExcelTableFileLoader, self).__init__(file_path)

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from an Excel file.
        |spreadsheet_load_desc|

        Sheets that are empty, that contain only empty columns, or in which
        no header row is found are skipped.

        :return:
            Loaded |TableData| iterator.
            |TableData| created for each sheet in the workbook.
            |load_table_name_desc|

            =================== ====================================
            Format specifier    Value after the replacement
            =================== ====================================
            ``%(filename)s``    Filename of the workbook
            ``%(sheet)s``       Name of the sheet
            ``%(format_name)s`` ``"excel"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ====================================
        :rtype: |TableData| iterator
        :raises pytablereader.error.OpenError:
            If failed to open the source file.
        """
        self._validate()
        self._logger.logging_load()

        try:
            workbook = xlrd.open_workbook(self.source)
        except xlrd.biffh.XLRDError as e:
            raise OpenError(e)

        for worksheet in workbook.sheets():
            self._worksheet = worksheet

            if self._is_empty_sheet():
                continue

            # A sheet may report rows/columns yet hold only empty cells;
            # treat it like an empty sheet instead of failing the whole load.
            try:
                self.__extract_not_empty_col_idx()
            except ValueError:
                continue

            try:
                start_row_idx = self._get_start_row_idx()
            except InvalidDataError:
                # No header row in this sheet: skip it.
                continue

            header_list = self.__get_row_values(start_row_idx)
            record_list = [
                self.__get_row_values(row_idx)
                for row_idx in range(start_row_idx + 1, self._row_count)
            ]

            self.inc_table_count()

            yield TableData(
                self._make_table_name(), header_list, record_list,
                is_strip_quote=True, quoting_flags=self.quoting_flags)

    def _is_empty_sheet(self):
        """Return True if the current sheet cannot contain any record."""
        # nrows == 1 means exists header row only
        return self._col_count == 0 or self._row_count <= 1

    def _get_start_row_idx(self):
        """
        Return the index of the first header row at or after ``start_row``.

        :raises pytablereader.InvalidDataError:
            If no header row is found.
        """
        for row_idx in range(self.start_row, self._row_count):
            if self.__is_header_row(row_idx):
                return row_idx

        raise InvalidDataError("header row not found")

    def __is_header_row(self, row_idx):
        """
        Return True if no cell of the row within the detected column range
        has the empty cell type.
        """
        cell_type_list = self._worksheet.row_types(
            row_idx, self._start_col_idx, self._end_col_idx + 1)

        return xlrd.XL_CELL_EMPTY not in cell_type_list

    @staticmethod
    def __is_empty_cell_type_list(cell_type_list):
        """Return True if every cell type in the list is the empty type."""
        return all(
            cell_type == xlrd.XL_CELL_EMPTY
            for cell_type in cell_type_list
        )

    def __extract_not_empty_col_idx(self):
        """
        Store the first and last indexes of columns that hold at least one
        non-empty cell.

        :raises ValueError: If every column of the sheet is empty.
        """
        col_idx_list = [
            col_idx
            for col_idx in range(self._col_count)
            if not self.__is_empty_cell_type_list(
                self._worksheet.col_types(col_idx))
        ]

        # min()/max() on an empty list would raise an uninformative
        # ValueError; raise an explicit one the caller can handle.
        if not col_idx_list:
            raise ValueError("all columns of the sheet are empty")

        self._start_col_idx = min(col_idx_list)
        self._end_col_idx = max(col_idx_list)

    def __get_row_values(self, row_idx):
        """Return the row's values restricted to the detected column range."""
        return self._worksheet.row_values(
            row_idx, self._start_col_idx, self._end_col_idx + 1)

View File

@ -0,0 +1,184 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from tabledata import TableData
import typepy
from .._constant import TableNameTemplate as tnt
from .._validator import TextValidator
from ..error import OpenError
from .core import SpreadSheetLoader
class GoogleSheetsTableLoader(SpreadSheetLoader):
    """
    Concrete class of Google Spreadsheet loader.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(sheet)s``.

    :param str file_path: Path to the Google Sheets credential JSON file.

    :Dependency Packages:
        - `gspread <https://github.com/burnash/gspread>`_
        - `SimpleSQLite <https://github.com/thombashi/SimpleSQLite>`_
        - `oauth2client <https://pypi.python.org/pypi/oauth2client>`_
        - `pyOpenSSL <https://pypi.python.org/pypi/pyOpenSSL>`_

    :Examples:
        :ref:`example-gs-table-loader`
    """

    @property
    def _sheet_name(self):
        return self._worksheet.title

    @property
    def _row_count(self):
        return self._worksheet.row_count

    @property
    def _col_count(self):
        return self._worksheet.col_count

    def __init__(self, file_path=None):
        super(GoogleSheetsTableLoader, self).__init__(file_path)

        # Title of the spreadsheet to open; must be set before load().
        self.title = None
        self.start_row = 0

        self._validator = TextValidator(file_path)

        # Cell values of the sheet currently being processed.
        self.__all_values = None

    def load(self):
        """
        Load table data from a Google Spreadsheet.

        This method considers :py:attr:`.source` as a path to the
        credential JSON file to access Google Sheets API.

        The method automatically searches for the header row starting from
        :py:attr:`.start_row`. The condition of the header row is that
        all of the columns have value (except empty columns).
        Sheets that are empty or have no header row are skipped.

        :return:
            Loaded table data. Return one |TableData| for each sheet in
            the workbook. The table name for data will be determined by
            :py:meth:`~.GoogleSheetsTableLoader.make_table_name`.
        :rtype: iterator of |TableData|
        :raises ValueError:
            If the spreadsheet title is empty.
        :raises pytablereader.OpenError:
            If the spreadsheet is not found.
        """
        import gspread
        from oauth2client.service_account import ServiceAccountCredentials

        self._validate_table_name()
        self._validate_title()

        scope = ['https://spreadsheets.google.com/feeds']
        credentials = ServiceAccountCredentials.from_json_keyfile_name(
            self.source, scope)

        gc = gspread.authorize(credentials)
        try:
            for worksheet in gc.open(self.title).worksheets():
                self._worksheet = worksheet
                self.__all_values = list(worksheet.get_all_values())

                if self._is_empty_sheet():
                    continue

                try:
                    self.__strip_empty_col()
                except ValueError:
                    continue

                value_matrix = self.__all_values[self._get_start_row_idx():]
                try:
                    header_list = value_matrix[0]
                    record_list = value_matrix[1:]
                except IndexError:
                    # No header row was found in this sheet: skip it.
                    continue

                self.inc_table_count()

                yield TableData(
                    self.make_table_name(), header_list, record_list,
                    quoting_flags=self.quoting_flags)
        except gspread.exceptions.SpreadsheetNotFound:
            raise OpenError("spreadsheet '{}' not found".format(self.title))

    def _is_empty_sheet(self):
        """Return True if the sheet has at most one row of values."""
        return len(self.__all_values) <= 1

    def _get_start_row_idx(self):
        """
        Return the index of the first row, at or after ``start_row``, whose
        values are all non-null strings.

        If there is no such row, the number of rows is returned so that
        slicing from the result yields an empty matrix.
        """
        # Search from start_row itself; scanning from row 0 and then adding
        # start_row would point past the actual header row.
        candidate_rows = self.__all_values[self.start_row:]

        for offset, row_value_list in enumerate(candidate_rows):
            if all(
                typepy.is_not_null_string(value)
                for value in row_value_list
            ):
                return self.start_row + offset

        return len(self.__all_values)

    def _validate_title(self):
        """
        :raises ValueError: If the spreadsheet title is a null string.
        """
        if typepy.is_null_string(self.title):
            raise ValueError("spreadsheet title is empty")

    def _make_table_name(self):
        """
        Expand the table name template with the spreadsheet title and the
        current sheet name (``""`` when the sheet name is unavailable).
        """
        self._validate_title()

        kv_mapping = self._get_basic_tablename_keyvalue_mapping()
        kv_mapping[tnt.TITLE] = self.title
        try:
            kv_mapping[tnt.SHEET] = self._sheet_name
        except AttributeError:
            kv_mapping[tnt.SHEET] = ""

        return self._expand_table_name_format(kv_mapping)

    def __strip_empty_col(self):
        """
        Drop the leading columns that hold no value in any row.

        The values are loaded into a temporary in-memory SQLite table and
        read back from the first column that has a non-null value.

        :raises ValueError: If no column remains after stripping.
        """
        from simplesqlite import connect_sqlite_memdb
        from simplesqlite.sqlquery import SqlQuery

        con = connect_sqlite_memdb()

        tmp_table_name = "tmp"
        header_list = [
            "a{:d}".format(i)
            for i in range(len(self.__all_values[0]))
        ]
        con.create_table_from_data_matrix(
            table_name=tmp_table_name,
            attr_name_list=header_list,
            data_matrix=self.__all_values)

        # Find the first column that contains at least one value.
        for col_idx, header in enumerate(header_list):
            result = con.select(
                select=SqlQuery.to_attr_str(header), table_name=tmp_table_name)
            if any(
                typepy.is_not_null_string(record[0])
                for record in result.fetchall()
            ):
                break

        strip_header_list = header_list[col_idx:]
        if typepy.is_empty_sequence(strip_header_list):
            raise ValueError()

        result = con.select(
            select=",".join(SqlQuery.to_attr_str_list(strip_header_list)),
            table_name=tmp_table_name)
        self.__all_values = result.fetchall()

View File

@ -0,0 +1,70 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
from .._constant import TableNameTemplate as tnt
from .._validator import FileValidator
from ..interface import TableLoader
from .formatter import SqliteTableFormatter
class SqliteFileLoader(TableLoader):
    """
    Loader that extracts tabular data from an SQLite database file.

    :param str file_path: Path to the loading SQLite database file.

    .. py:attribute:: table_name

        Table name string. The default template is built from the format
        name and format ID specifiers
        (presumably ``%(format_name)s%(format_id)s``).

    :Dependency Packages:
        - `SimpleSQLite <https://github.com/thombashi/SimpleSQLite>`__
    """

    @property
    def format_name(self):
        return "sqlite"

    def __init__(self, file_path=None):
        super(SqliteFileLoader, self).__init__(file_path)

        self._validator = FileValidator(file_path)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a SQLite database
        file. |load_source_desc_file|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            =================== ==============================================
            Format specifier    Value after the replacement
            =================== ==============================================
            ``%(filename)s``    |filename_desc|
            ``%(key)s``         ``%(format_name)s%(format_id)s``
            ``%(format_name)s`` ``"sqlite"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the SQLite database file data is invalid or empty.
        """
        self._validate()

        # The formatter needs a back-reference to this loader before it can
        # produce table data.
        sqlite_formatter = SqliteTableFormatter(self.source)
        sqlite_formatter.accept(self)

        return sqlite_formatter.to_table_data()

    def _get_default_table_name_template(self):
        """Return the default template: format name followed by format ID."""
        template_parts = (tnt.FORMAT_NAME, tnt.FORMAT_ID)

        return "{:s}{:s}".format(*template_parts)

View File

@ -0,0 +1,50 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
from pytablereader import InvalidDataError
from tabledata import TableData
import typepy
from .._constant import TableNameTemplate as tnt
from ..formatter import TableFormatter
class SqliteTableFormatter(TableFormatter):
    """
    Formatter that reads every table of an SQLite database file and
    converts each of them to a |TableData| instance.
    """

    def __init__(self, source_data):
        """
        :param str source_data: Path to the SQLite database file.
        :raises pytablereader.InvalidDataError:
            If ``source_data`` is a null string.
        """
        super(SqliteTableFormatter, self).__init__(source_data)

        # Name of the table currently being converted.
        self.__table_name = None

        if typepy.is_null_string(source_data):
            raise InvalidDataError

    def to_table_data(self):
        """
        Yield one |TableData| per table in the database. Each one is named
        after its table and built from the table's attribute names and the
        result of selecting all of those attributes.

        :rtype: |TableData| iterator
        """
        from simplesqlite import SimpleSQLite
        from simplesqlite.sqlquery import SqlQuery

        con = SimpleSQLite(self._source_data, "r")

        for table in con.get_table_name_list():
            self.__table_name = table

            attr_name_list = con.get_attr_name_list(table)
            data_matrix = con.select(
                select=",".join(SqlQuery.to_attr_str_list(attr_name_list)),
                table_name=table)

            yield TableData(
                table, attr_name_list, data_matrix,
                quoting_flags=self._loader.quoting_flags)

    def _make_table_name(self):
        """
        Expand the loader's table name template, substituting the key
        specifier with the name of the table currently being converted.
        """
        # The spreadsheet loaders in this package update the value returned
        # by this helper with item assignment, so do the same here rather
        # than concatenating a list to it.
        kv_mapping = self._loader._get_basic_tablename_keyvalue_mapping()
        kv_mapping[tnt.KEY] = self.__table_name

        return self._loader._expand_table_name_format(kv_mapping)

View File

@ -0,0 +1,63 @@
# encoding: utf-8
"""
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
"""
from __future__ import absolute_import
from __future__ import unicode_literals
from .._validator import (
FileValidator,
TextValidator
)
from ..csv.core import (
CsvTableFileLoader,
CsvTableTextLoader
)
class TsvTableFileLoader(CsvTableFileLoader):
    """
    File loader for the tab separated values (TSV) format.
    It is the CSV file loader with the delimiter set to a tab character.

    :param str file_path: Path to the loading TSV file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s``.
    """

    @property
    def format_name(self):
        return "tsv"

    def __init__(self, file_path):
        super(TsvTableFileLoader, self).__init__(file_path)

        # TSV differs from CSV only in the field delimiter.
        self.delimiter = "\t"

        self._validator = FileValidator(file_path)
class TsvTableTextLoader(CsvTableTextLoader):
    """
    Text loader for the tab separated values (TSV) format.
    It is the CSV text loader with the delimiter set to a tab character.

    :param str text: TSV text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.
    """

    @property
    def format_name(self):
        return "tsv"

    def __init__(self, text):
        super(TsvTableTextLoader, self).__init__(text)

        # TSV differs from CSV only in the field delimiter.
        self.delimiter = "\t"

        self._validator = TextValidator(text)