Wednesday, April 23, 2014

Project Flume: Collecting Webserver Logs into HDFS


1) Download the latest Flume binaries from https://flume.apache.org/download.html

$cd $HOME/Downloads
$tar -xvzf apache-flume-1.4.0-bin.tar.gz
$ls -lrt apache-flume-1.4.0-bin

2) Copy the binaries into a local folder

$sudo mkdir /usr/local/flume
$sudo cp -r $HOME/Downloads/apache-flume-1.4.0-bin/* /usr/local/flume
$sudo chown -R butik /usr/local/flume

3) Set the Flume home and path
  
$cd $HOME
$vi .bashrc
$exec bash    # reload the shell so the changes take effect
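The lines added to .bashrc should look roughly like this (a minimal sketch, assuming Flume was copied to /usr/local/flume in step 2; FLUME_PREFIX is the variable used throughout the rest of this post):

export FLUME_PREFIX=/usr/local/flume
export PATH=$PATH:$FLUME_PREFIX/bin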



4) Say hello world to Flume

$cd $FLUME_PREFIX/conf
$vi hw.conf

agent.sources=s1
agent.channels=c1
agent.sinks=k1
agent.sources.s1.type=netcat
agent.sources.s1.channels=c1
agent.sources.s1.bind=0.0.0.0
agent.sources.s1.port=12345
agent.channels.c1.type=memory
agent.sinks.k1.type=logger
agent.sinks.k1.channel=c1


Start the Flume agent

$cd $FLUME_PREFIX
$./bin/flume-ng agent -n agent -c conf -f conf/hw.conf -Dflume.root.logger=INFO,console

From another terminal, send a few lines of input over the same port and check the agent logs.

$telnet localhost 12345
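The session should look roughly like the sketch below (exact telnet output varies by system). Each line you type is acknowledged with OK, and the agent's console should log a corresponding event:

Trying 127.0.0.1...
Connected to localhost.
Escape character is '^]'.
hello flume
OK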
 
5) Now let's collect webserver logs through Flume

Install Apache

$sudo apt-get update
$sudo apt-get install apache2
$sudo vi /var/www/html/index.html




<!DOCTYPE html>
<html>
<body>
<h1>Welcome to the world of bigdata</h1>
<p>Let us flume web server logs into HDFS</p>
</body>
</html>



Check http://localhost or http://127.0.0.1 in a browser for the webpage just created.
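You can also fetch the page from the command line, which is a handy way to generate access-log entries later (assuming curl is installed):

$curl http://localhost/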


$cd $FLUME_PREFIX/conf
 
flume-env.sh


JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-amd64
HADOOP_PREFIX=/usr/local/hadoop
# Note that the Flume conf directory is always included in the classpath.
FLUME_CLASSPATH=/usr/local/flume/lib/flume-sources-1.0-SNAPSHOT.jar

flume-conf.properties

tail1.sources = src1
tail1.channels = ch1
tail1.sinks = sink1
tail1.sources.src1.type = exec
tail1.sources.src1.command = tail -F /var/log/apache2/access.log
tail1.sources.src1.channels = ch1
tail1.channels.ch1.type = memory
tail1.channels.ch1.capacity = 1000
tail1.sinks.sink1.type = avro
tail1.sinks.sink1.hostname = localhost
tail1.sinks.sink1.port = 6000
tail1.sinks.sink1.batch-size = 1
tail1.sinks.sink1.channel = ch1
## collector1: receives avro events from the tail1 agent and writes them to HDFS
collector1.sources = src1
collector1.channels = ch1
collector1.sinks = sink1
collector1.sources.src1.type = avro
collector1.sources.src1.bind = localhost
collector1.sources.src1.port = 6000
collector1.sources.src1.channels = ch1
collector1.channels.ch1.type = memory
collector1.channels.ch1.capacity = 500
collector1.sinks.sink1.type = hdfs
collector1.sinks.sink1.hdfs.path = /user/butik/flume/collector1
collector1.sinks.sink1.hdfs.filePrefix = access_log
collector1.sinks.sink1.hdfs.writeFormat = Writable
collector1.sinks.sink1.hdfs.fileType = DataStream
collector1.sinks.sink1.channel = ch1
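Note: hdfs.path above is resolved against the default filesystem from the Hadoop configuration that the agent picks up on its classpath (which is why HADOOP_PREFIX is set in flume-env.sh). If the collector cannot resolve it, spell the path out fully; the NameNode host and port below are an assumption, so substitute your own:

collector1.sinks.sink1.hdfs.path = hdfs://localhost:9000/user/butik/flume/collector1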



log4j.properties

flume.root.logger=DEBUG,A1
#flume.root.logger=INFO,LOGFILE
#flume.log.dir=./logs
flume.log.dir=/usr/local/flume/logs
flume.log.file=flume.log

#log4j.logger.org.apache.flume.lifecycle = INFO
log4j.logger.org.jboss = WARN
log4j.logger.org.mortbay = INFO
log4j.logger.org.apache.avro.ipc.NettyTransceiver = WARN
log4j.logger.org.apache.hadoop = INFO

# Define the root logger to the system property "flume.root.logger".
log4j.rootLogger=${flume.root.logger}

# Stock log4j rolling file appender
# Default log rotation configuration
log4j.appender.LOGFILE=org.apache.log4j.RollingFileAppender
log4j.appender.LOGFILE.MaxFileSize=100MB
log4j.appender.LOGFILE.MaxBackupIndex=10
log4j.appender.LOGFILE.File=${flume.log.dir}/${flume.log.file}
log4j.appender.LOGFILE.layout=org.apache.log4j.PatternLayout
log4j.appender.LOGFILE.layout.ConversionPattern=%d{dd MMM yyyy HH:mm:ss,SSS} %-5p [%t] (%C.%M:%L) %x - %m%n

# Warning: If you enable the following appender it will fill up your disk if you don't have a cleanup job!
# This uses the updated rolling file appender from log4j-extras that supports a reliable time-based rolling policy.
# See http://logging.apache.org/log4j/companions/extras/apidocs/org/apache/log4j/rolling/TimeBasedRollingPolicy.html
# Add "DAILY" to flume.root.logger above if you want to use this
log4j.appender.DAILY=org.apache.log4j.rolling.RollingFileAppender
log4j.appender.DAILY.rollingPolicy=org.apache.log4j.rolling.TimeBasedRollingPolicy
log4j.appender.DAILY.rollingPolicy.ActiveFileName=${flume.log.dir}/${flume.log.file}
log4j.appender.DAILY.rollingPolicy.FileNamePattern=${flume.log.dir}/${flume.log.file}.%d{yyyy-MM-dd}
log4j.appender.DAILY.layout=org.apache.log4j.PatternLayout
log4j.appender.DAILY.layout.ConversionPattern=%d{dd MMM yyyy HH:mm:ss,SSS} %-5p [%t] (%C.%M:%L) %x - %m%n

# console
# Add "console" to flume.root.logger above if you want to use this
log4j.appender.A1=org.apache.log4j.ConsoleAppender
log4j.appender.A1.target=System.err
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%d (%t) [%p - %l] %m%n


Start the agents

$cd $FLUME_PREFIX/bin
$./flume-ng agent --conf /usr/local/flume/conf/ --conf-file /usr/local/flume/conf/flume-conf.properties --name collector1
$./flume-ng agent --conf /usr/local/flume/conf/ --conf-file /usr/local/flume/conf/flume-conf.properties --name tail1

Run each agent in its own terminal, and start collector1 first so that tail1's avro sink has a running avro source to connect to.
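To confirm that collector1's avro source came up and is listening on port 6000 before starting tail1 (assuming netstat is available):

$netstat -an | grep 6000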

Keep refreshing your webpage so Apache keeps writing new entries to /var/log/apache2/access.log for tail1 to pick up.
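Or generate traffic from the command line in a loop (a simple sketch using curl; stop it with Ctrl-C):

$while true; do curl -s http://localhost/ > /dev/null; sleep 1; done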
 
We are done. Check the data in HDFS.
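For example (the exact file names will vary, since the HDFS sink appends a counter to the access_log prefix):

$hadoop fs -ls /user/butik/flume/collector1
$hadoop fs -cat /user/butik/flume/collector1/access_log.* | head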

 
