# Copyright (c) 2010-2011 OpenStack, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import with_statement
import gzip
import hashlib
import os
from paste.deploy import appconfig
from paste.deploy import loadfilter
import re
from slogging.internal_proxy import InternalProxy
from swift.common.daemon import Daemon
from swift.common import utils
import sys
import time


class LogUploader(Daemon):
    """LogUploader class.

    Given a local directory, a swift account, and a container name,
    LogUploader will upload all files in the local directory to the given
    account/container. All but the newest files will be uploaded, and each
    file's md5 sum will be computed. The hash is used to prevent duplicate
    data from being uploaded multiple times in different files (ex: log
    lines). Since the hash is computed, it is also used as the uploaded
    object's etag to ensure data integrity.

    Note that after the file is successfully uploaded, it will be unlinked.

    The given proxy server config is used to instantiate a proxy server for
    the object uploads.

    The default log file format is: plugin_name-%Y%m%d%H* . Any other format
    of log file names must supply a regular expression that defines groups
    for year, month, day, and hour. The regular expression will be evaluated
    with re.VERBOSE. A common example may be:
        source_filename_pattern = ^cdn_logger-
            (?P<year>[0-9]{4})
            (?P<month>[0-1][0-9])
            (?P<day>[0-3][0-9])
            (?P<hour>[0-2][0-9])
            .*$
    """
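
    # For illustration (hypothetical filename): with plugin_name='access',
    # the default pattern matches a name like 'access-2011063012.log' and
    # yields {'year': '2011', 'month': '06', 'day': '30', 'hour': '12'},
    # which upload_all_logs() passes to upload_one_log() as keyword args.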

    def __init__(self, uploader_conf, plugin_name, regex=None, cutoff=None):
        super(LogUploader, self).__init__(uploader_conf)
        log_name = '%s-log-uploader' % plugin_name
        self.logger = utils.get_logger(uploader_conf, log_name,
                                       log_route=plugin_name)
        self.log_dir = uploader_conf.get('log_dir', '/var/log/swift/')
        self.swift_account = uploader_conf['swift_account']
        self.container_name = uploader_conf['container_name']
        proxy_server_conf_loc = uploader_conf.get(
            'proxy_server_conf', '/etc/swift/proxy-server.conf')
        proxy_server_conf = appconfig('config:%s' % proxy_server_conf_loc,
                                      name='proxy-server')
        # Load the proxy server's cache filter so InternalProxy can reuse
        # the proxy's memcache client.
        memcache = loadfilter('config:%s' % proxy_server_conf_loc,
                              name='cache')(object).memcache
        self.internal_proxy = InternalProxy(proxy_server_conf,
                                            memcache=memcache)
        self.new_log_cutoff = int(cutoff or
                                  uploader_conf.get('new_log_cutoff', '7200'))
        self.unlink_log = \
            uploader_conf.get('unlink_log',
                              'true').lower() in utils.TRUE_VALUES
        self.filename_pattern = regex or \
            uploader_conf.get(
                'source_filename_pattern',
                '''
                ^%s-
                (?P<year>[0-9]{4})
                (?P<month>[0-1][0-9])
                (?P<day>[0-3][0-9])
                (?P<hour>[0-2][0-9])
                .*$''' % plugin_name)
        self.content_type = uploader_conf.get('content_type')

    def run_once(self, *args, **kwargs):
        self.logger.info(_("Uploading logs"))
        start = time.time()
        self.upload_all_logs()
        self.logger.info(_("Uploading logs complete (%0.2f minutes)") %
                         ((time.time() - start) / 60))

    def get_relpath_to_files_under_log_dir(self):
        """Look under log_dir recursively and return all filenames as relpaths

        :returns: list of strs, the relpath to all filenames under log_dir
        """
        all_files = []
        for path, dirs, files in os.walk(self.log_dir):
            all_files.extend(os.path.join(path, f) for f in files)
        return [os.path.relpath(f, start=self.log_dir) for f in all_files]
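
    # Illustration (hypothetical paths): with log_dir='/var/log/swift/',
    # '/var/log/swift/access-2011063012' is returned as 'access-2011063012'
    # and a nested '/var/log/swift/hourly/access-2011063012' as
    # 'hourly/access-2011063012'; filter_files() matches these relpaths
    # against source_filename_pattern.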

    def filter_files(self, all_files):
        """Filter files based on the source filename pattern.

        :param all_files: list of strs, relpath of the filenames under log_dir
        :returns: dict mapping the full path of each matching file to its
                  regex match group dict
        """
        filename2match = {}
        for filename in all_files:
            match = re.match(self.filename_pattern, filename, re.VERBOSE)
            if match:
                full_path = os.path.join(self.log_dir, filename)
                filename2match[full_path] = match.groupdict()
            else:
                self.logger.debug(_('%(filename)s does not match '
                                    '%(pattern)s') %
                                  {'filename': filename,
                                   'pattern': self.filename_pattern})
        return filename2match

    def upload_all_logs(self):
        """Match files under log_dir to source_filename_pattern and upload
        them to swift.
        """
        all_files = self.get_relpath_to_files_under_log_dir()
        filename2match = self.filter_files(all_files)
        if not filename2match:
            self.logger.error(_('No files in %(log_dir)s match %(pattern)s') %
                              {'log_dir': self.log_dir,
                               'pattern': self.filename_pattern})
            sys.exit(1)
        if not self.internal_proxy.create_container(self.swift_account,
                                                    self.container_name):
            self.logger.error(_('Unable to create container for '
                                '%(account)s/%(container)s') % {
                                    'account': self.swift_account,
                                    'container': self.container_name})
            return
        for filename, match in filename2match.items():
            # don't process very new logs
            try:
                seconds_since_mtime = time.time() - os.stat(filename).st_mtime
            except OSError:
                # filename wasn't found, skip it
                continue
            if seconds_since_mtime < self.new_log_cutoff:
                self.logger.debug(_("Skipping log: %(file)s "
                                    "(< %(cutoff)d seconds old)") % {
                                        'file': filename,
                                        'cutoff': self.new_log_cutoff})
                continue
            try:
                self.upload_one_log(filename, **match)
            except Exception:
                self.logger.exception(
                    _('ERROR: could not upload %s') % filename)

    def upload_one_log(self, filename, year, month, day, hour):
        """Upload one file to swift"""
        if os.path.getsize(filename) == 0:
            self.logger.debug(_("Log %s is 0 length, skipping") % filename)
            return
        self.logger.debug(_("Processing log: %s") % filename)
        filehash = hashlib.md5()
        already_compressed = filename.endswith('.gz')
        opener = gzip.open if already_compressed else open
        f = opener(filename, 'rb')
        try:
            for line in f:
                # filter out bad lines here?
                filehash.update(line)
        finally:
            f.close()
        filehash = filehash.hexdigest()
        # By adding a hash to the filename, we ensure that uploaded files
        # have unique filenames and protect against uploading one file
        # more than one time. By using md5, we get an etag for free.
        target_filename = '/'.join([year, month, day, hour, filehash + '.gz'])
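        # For illustration (hypothetical values): year='2011', month='06',
        # day='30', hour='12' and a hexdigest of 'abc123...' produce the
        # object name '2011/06/30/12/abc123....gz'.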
        metadata = {'x-object-meta-original-name': filename}
        if self.content_type:
            metadata['Content-Type'] = self.content_type
        if self.internal_proxy.upload_file(filename,
                                           self.swift_account,
                                           self.container_name,
                                           target_filename,
                                           compress=(not already_compressed),
                                           headers=metadata):
            self.logger.debug(_("Uploaded log %(file)s to %(target)s") %
                              {'file': filename, 'target': target_filename})
            if self.unlink_log:
                os.unlink(filename)
        else:
            self.logger.error(_("ERROR: Upload of log %s failed!") % filename)
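
# A minimal usage sketch (kept as comments; the conf values here are
# hypothetical, but the keys mirror those read in __init__ above):
#
#     conf = {'swift_account': 'AUTH_test',
#             'container_name': 'log_data',
#             'log_dir': '/var/log/swift/'}
#     uploader = LogUploader(conf, 'access')
#     uploader.run_once()
#
# run_once() scans log_dir, skips files newer than new_log_cutoff seconds,
# and uploads the rest through InternalProxy.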